@Override
/** {@inheritDoc} */
public Instance buildInstance(SurfaceFormOccurrence surfaceFormOccurrence, Instance instance) {
TaggedText text = (TaggedText) surfaceFormOccurrence.context();
List<TaggedToken> candidateTokens = text.taggedTokenProvider().getTaggedTokens(surfaceFormOccurrence);
int termSize = candidateTokens.size();
TaggedToken firstTaggedToken = candidateTokens.get(0);
CandidateData firstTaggedTokenData = null;
try {
firstTaggedTokenData = dataProvider.getCandidateData(firstTaggedToken.getToken());
} catch (ItemNotFoundException e) {
//No information about the token!
}
CandidateData secondTaggedTokenData = null;
if(candidateTokens.size() > 1){
TaggedToken secondTaggedToken = candidateTokens.get(1);
try {
secondTaggedTokenData = dataProvider.getCandidateData(secondTaggedToken.getToken());
} catch (ItemNotFoundException e) {
//No information about the token!
}
}
TaggedToken lastTaggedToken = candidateTokens.get(candidateTokens.size()-1);
CandidateData lastTaggedTokenData = null;
try {
lastTaggedTokenData = dataProvider.getCandidateData(lastTaggedToken.getToken());
} catch (ItemNotFoundException e) {
//No information about the token!
}
CandidateData lastBut1TaggedTokenData = null;
if(candidateTokens.size() > 1) {
TaggedToken lastBut1TaggedToken = candidateTokens.get(candidateTokens.size()-2);
try {
lastBut1TaggedTokenData = dataProvider.getCandidateData(lastBut1TaggedToken.getToken());
} catch (ItemNotFoundException e) {
//No information about the token!
}
}
/**
* Left context
*/
List<TaggedToken> leftContext = null;
try {
leftContext = text.taggedTokenProvider().getLeftContext(surfaceFormOccurrence, 2);
} catch (ItemNotFoundException ignored) {}
CandidateData left1 = null;
if(leftContext != null && leftContext.size() > 0) {
try {
String token;
if(leftContext.size() == 1) {
/**
* There are no more tokens to the left, the token is sentence initial.
*/
token = leftContext.get(0).getToken().toLowerCase();
}else{
token = leftContext.get(0).getToken();
}
left1 = dataProvider.getCandidateData(token);
} catch (ItemNotFoundException e) {
//No information about the token
}
}
/**
* Right context
*/
List<TaggedToken> rightContext = null;
try {
rightContext = text.taggedTokenProvider().getRightContext(surfaceFormOccurrence, 2);
} catch (ItemNotFoundException ignored) {}
CandidateData right1 = null;
if(rightContext != null && rightContext.size() > 0) {
try {
right1 = dataProvider.getCandidateData(rightContext.get(0).getToken());
} catch (ItemNotFoundException e) {
//No information about the token
}
}
/**
* Features:
*/
if(termSize == 2) {
try {
if(firstTaggedTokenData != null && secondTaggedTokenData != null) {
CoOccurrenceData bigramData = dataProvider.getBigramData(firstTaggedTokenData, secondTaggedTokenData);
//if (bigramData.getUnitCountWeb() > bigramLeftWebMin)
instance.setValue(i(count_web, buildAttributeList()), bigramData.getUnitCountWeb());
}
} catch (ItemNotFoundException ignored) {}
catch (ArrayIndexOutOfBoundsException ignored) {}
}
List<String> verbs = new LinkedList<String>();
boolean allLowercase = surfaceFormOccurrence.surfaceForm().name().toLowerCase().equals(surfaceFormOccurrence.surfaceForm().name());
boolean allUppercase = surfaceFormOccurrence.surfaceForm().name().toUpperCase().equals(surfaceFormOccurrence.surfaceForm().name());
int capitalizedWords = 0;
for(TaggedToken candidateToken : candidateTokens) {
if(candidateToken.getPOSTag().startsWith("v") || candidateToken.getPOSTag().equals("be")) {
verbs.add(candidateToken.getPOSTag());
}
if(Character.isUpperCase(candidateToken.getToken().charAt(0)))
capitalizedWords++;
}
try{
if(verbs.size() > 1)
instance.setValue(i(contains_verb, buildAttributeList()), 5);
else if(verbs.size()==0)
instance.setValue(i(contains_verb, buildAttributeList()), 0);
else if(verbs.get(0).equals("vb"))
instance.setValue(i(contains_verb, buildAttributeList()), 1);
else if(verbs.get(0).equals("vbd"))
instance.setValue(i(contains_verb, buildAttributeList()), 2);
else if(verbs.get(0).equals("vbg"))
instance.setValue(i(contains_verb, buildAttributeList()), 3);
else if(verbs.get(0).equals("vbn"))
instance.setValue(i(contains_verb, buildAttributeList()), 4);
else if(verbs.get(0).equals("be"))
instance.setValue(i(contains_verb, buildAttributeList()), 5);
} catch (ArrayIndexOutOfBoundsException ignored) {}
try{
if(allLowercase)
instance.setValue(i(term_case, buildAttributeList()), 0);
else if(allUppercase)
instance.setValue(i(term_case, buildAttributeList()), 3);
else if(capitalizedWords == candidateTokens.size())
instance.setValue(i(term_case, buildAttributeList()), 2);
else if(capitalizedWords == 1 && Character.isUpperCase(candidateTokens.get(0).getToken().charAt(0)))
instance.setValue(i(term_case, buildAttributeList()), 4);
else
instance.setValue(i(term_case, buildAttributeList()), 1);
} catch (ArrayIndexOutOfBoundsException ignored) {}
try{
instance.setValue(i(candidate_size, buildAttributeList()), termSize);
} catch (ArrayIndexOutOfBoundsException ignored) {}
try {
TaggedToken leftNeighbourToken = text.taggedTokenProvider().getLeftNeighbourToken(surfaceFormOccurrence);
if(leftNeighbourToken.getPOSTag().equals("to")) {
instance.setValue(i(pre_pos, buildAttributeList()), 0);
}
else if(leftNeighbourToken.getPOSTag().matches("[mnf].*")) {