// count_web: for candidates made of exactly two tokens, store the web count of
// the bigram formed by the candidate's first and second token. Nothing is
// stored unless co-occurrence entries for both tokens are available.
if (termSize == 2 && firstTaggedTokenData != null && secondTaggedTokenData != null) {
    try {
        CoOccurrenceData candidateBigram =
                dataProvider.getBigramData(firstTaggedTokenData, secondTaggedTokenData);
        // A threshold check against bigramLeftWebMin is commented out at this
        // point in the original code, so the count is stored unconditionally.
        instance.setValue(i(count_web, buildAttributeList()), candidateBigram.getUnitCountWeb());
    } catch (ItemNotFoundException ignored) {
        // No data for this bigram: the feature stays unset.
    } catch (ArrayIndexOutOfBoundsException ignored) {
        // Best effort: presumably raised when the attribute index cannot be
        // resolved (TODO confirm); the feature stays unset.
    }
}
// Features computed from the candidate itself: the verbs it contains, its
// capitalisation pattern and its length in tokens.
List<String> verbs = new LinkedList<String>();
String occurrenceSurfaceName = surfaceFormOccurrence.surfaceForm().name();
boolean allLowercase = occurrenceSurfaceName.toLowerCase().equals(occurrenceSurfaceName);
boolean allUppercase = occurrenceSurfaceName.toUpperCase().equals(occurrenceSurfaceName);
int capitalizedWords = 0;

// One pass over the candidate tokens: remember every verb-like POS tag
// (tags starting with "v", or exactly "be") and count the tokens whose first
// character is upper case.
for (TaggedToken candidateToken : candidateTokens) {
    String candidateTag = candidateToken.getPOSTag();
    if (candidateTag.startsWith("v") || candidateTag.equals("be")) {
        verbs.add(candidateTag);
    }
    if (Character.isUpperCase(candidateToken.getToken().charAt(0))) {
        capitalizedWords++;
    }
}

// contains_verb:
//   0 = no verb, 1 = "vb", 2 = "vbd", 3 = "vbg", 4 = "vbn",
//   5 = "be" or more than one verb.
// A single verb with any other tag leaves the feature unset.
try {
    int containsVerbCode = -1; // -1 means "do not store anything"
    if (verbs.size() > 1) {
        containsVerbCode = 5;
    } else if (verbs.size() == 0) {
        containsVerbCode = 0;
    } else {
        String singleVerbTag = verbs.get(0);
        if (singleVerbTag.equals("vb")) {
            containsVerbCode = 1;
        } else if (singleVerbTag.equals("vbd")) {
            containsVerbCode = 2;
        } else if (singleVerbTag.equals("vbg")) {
            containsVerbCode = 3;
        } else if (singleVerbTag.equals("vbn")) {
            containsVerbCode = 4;
        } else if (singleVerbTag.equals("be")) {
            containsVerbCode = 5;
        }
    }
    if (containsVerbCode != -1) {
        instance.setValue(i(contains_verb, buildAttributeList()), containsVerbCode);
    }
} catch (ArrayIndexOutOfBoundsException ignored) {
    // Best effort: the feature stays unset.
}

// term_case (first matching rule wins):
//   0 = surface form equals its lower-cased version,
//   3 = surface form equals its upper-cased version,
//   2 = every token starts with an upper-case character,
//   4 = exactly one capitalised token and it is the first one,
//   1 = anything else.
try {
    int termCaseCode;
    if (allLowercase) {
        termCaseCode = 0;
    } else if (allUppercase) {
        termCaseCode = 3;
    } else if (capitalizedWords == candidateTokens.size()) {
        termCaseCode = 2;
    } else if (capitalizedWords == 1
            && Character.isUpperCase(candidateTokens.get(0).getToken().charAt(0))) {
        termCaseCode = 4;
    } else {
        termCaseCode = 1;
    }
    instance.setValue(i(term_case, buildAttributeList()), termCaseCode);
} catch (ArrayIndexOutOfBoundsException ignored) {
    // Best effort: the feature stays unset.
}

// candidate_size: the term size as computed before this point.
try {
    instance.setValue(i(candidate_size, buildAttributeList()), termSize);
} catch (ArrayIndexOutOfBoundsException ignored) {
    // Best effort: the feature stays unset.
}
try {
TaggedToken leftNeighbourToken = text.taggedTokenProvider().getLeftNeighbourToken(surfaceFormOccurrence);
if(leftNeighbourToken.getPOSTag().equals("to")) {
instance.setValue(i(pre_pos, buildAttributeList()), 0);
}
else if(leftNeighbourToken.getPOSTag().matches("[mnf].*")) {
instance.setValue(i(pre_pos, buildAttributeList()), 1);
}else if(leftNeighbourToken.getToken().matches("[aA][nN]?")) {
instance.setValue(i(pre_pos, buildAttributeList()), 2);
}
} catch (ItemNotFoundException ignored) {
} catch (ArrayIndexOutOfBoundsException ignored) {}
try {
if(leftContext.size() > 0) {
if(leftContext.get(0).getPOSTag().equals("to")) {
instance.setValue(i(pre_pos, buildAttributeList()), 0);
}
else if(leftContext.get(0).getPOSTag().matches("[mnf].*")) {
instance.setValue(i(pre_pos, buildAttributeList()), 1);
}else if(leftContext.get(0).getToken().matches("[aA][nN]?")) {
instance.setValue(i(pre_pos, buildAttributeList()), 2);
}
}
} catch (ArrayIndexOutOfBoundsException ignored) {}
try{
if (CandidateFeatures.quoted(surfaceFormOccurrence) == 1)
instance.setValue(i(quoted, buildAttributeList()), 0);
} catch (ArrayIndexOutOfBoundsException ignored) {}
try {
if(rightContext.size() > 0) {
if(rightContext.get(0).getToken().equals("of")) {
instance.setValue(i(next_pos, buildAttributeList()), 0);
}else if(rightContext.get(0).getToken().equals("to")) {
instance.setValue(i(next_pos, buildAttributeList()), 1);
}else if(rightContext.get(0).getPOSTag().startsWith("be")) {
instance.setValue(i(next_pos, buildAttributeList()), 2);
}else if(rightContext.get(0).getPOSTag().startsWith("v")) {
instance.setValue(i(next_pos, buildAttributeList()), 3);
}
}
} catch (ArrayIndexOutOfBoundsException ignored) {}
try {
TaggedToken lastToken = candidateTokens.get(candidateTokens.size() - 1);
if(lastToken.getPOSTag().equals("in")) {
instance.setValue(i(ends_with, buildAttributeList()), 0);
}
} catch (ArrayIndexOutOfBoundsException ignored) {}
/*
 * Co-occurrence data of the left neighbour token.
 *
 * The bigram (left neighbour, first candidate token) is only looked up when
 * data for both tokens is present, the left context is not empty, and the
 * neighbour's POS tag neither matches FUNCTION_WORD_PATTERN nor equals "in".
 * Its web significance is stored only when it exceeds bigramLeftWebMin.
 */
if (left1 != null && firstTaggedTokenData != null && leftContext.size() > 0) {
    String leftNeighbourTag = leftContext.get(0).getPOSTag();
    if (!(leftNeighbourTag.matches(FUNCTION_WORD_PATTERN) || leftNeighbourTag.equals("in"))) {
        CoOccurrenceData leftBigramData;
        try {
            leftBigramData = dataProvider.getBigramData(left1, firstTaggedTokenData);
        } catch (ItemNotFoundException ignored) {
            // No data for this bigram: treated the same as a null result.
            leftBigramData = null;
        }
        if (leftBigramData != null && leftBigramData.getUnitSignificanceWeb() > bigramLeftWebMin) {
            try {
                instance.setValue(i(bigram_left_significance_web, buildAttributeList()),
                        leftBigramData.getUnitSignificanceWeb());
            } catch (ArrayIndexOutOfBoundsException ignored) {
                // Best effort: the feature stays unset.
            }
        }
    }
}
/*
 * Co-occurrence data for the left trigram:
 * (left neighbour, first candidate token, second candidate token).
 * The web count is stored only when it exceeds trigramLeftWebMin.
 */
if (left1 != null && firstTaggedTokenData != null && secondTaggedTokenData != null) {
    CoOccurrenceData leftTrigramData;
    try {
        leftTrigramData = dataProvider.getTrigramData(left1, firstTaggedTokenData, secondTaggedTokenData);
    } catch (ItemNotFoundException ignored) {
        // No data for this trigram: treated the same as a null result.
        leftTrigramData = null;
    }
    if (leftTrigramData != null && leftTrigramData.getUnitCountWeb() > trigramLeftWebMin) {
        try {
            instance.setValue(i(trigram_left, buildAttributeList()), leftTrigramData.getUnitCountWeb());
        } catch (ArrayIndexOutOfBoundsException ignored) {
            // Best effort: the feature stays unset.
        }
    }
}

/*
 * Co-occurrence data for the right trigram:
 * (second-to-last candidate token, last candidate token, right neighbour).
 * The web count is stored only when it exceeds trigramRightWebMin.
 */
if (right1 != null && lastTaggedTokenData != null && lastBut1TaggedTokenData != null) {
    CoOccurrenceData rightTrigramData;
    try {
        rightTrigramData = dataProvider.getTrigramData(lastBut1TaggedTokenData, lastTaggedTokenData, right1);
    } catch (ItemNotFoundException ignored) {
        // No data for this trigram: treated the same as a null result.
        rightTrigramData = null;
    } catch (NullPointerException ignored) {
        // A NullPointerException raised by the lookup is also treated as "no data".
        rightTrigramData = null;
    }
    if (rightTrigramData != null && rightTrigramData.getUnitCountWeb() > trigramRightWebMin) {
        try {
            instance.setValue(i(trigram_right, buildAttributeList()), rightTrigramData.getUnitCountWeb());
        } catch (ArrayIndexOutOfBoundsException ignored) {
            // Best effort: the feature stays unset.
        }
    }
}
/*
 * Co-occurrence data of the right neighbour token.
 *
 * The bigram (last candidate token, right neighbour) is only looked up when
 * data for both tokens is present, the right context is not empty, and the
 * neighbour's POS tag neither matches FUNCTION_WORD_PATTERN nor equals "in".
 * Its web significance is stored only when it exceeds bigramRightWebMin.
 *
 * The rightContext.size() check guards the get(0) calls: without it an empty
 * right context combined with a non-null right1 would raise an
 * IndexOutOfBoundsException that nothing here catches. The left-neighbour
 * bigram check performs the equivalent test on leftContext.
 */
if(lastTaggedTokenData != null && right1 != null && rightContext.size() > 0 && !rightContext.get(0).getPOSTag().matches(FUNCTION_WORD_PATTERN) && !rightContext.get(0).getPOSTag().equals("in")) {
    CoOccurrenceData bigramRight = null;
    try {
        bigramRight = dataProvider.getBigramData(lastTaggedTokenData, right1);
    } catch (ItemNotFoundException ignored) {
        // No data for this bigram: bigramRight stays null and nothing is stored.
    }
    if(bigramRight != null && bigramRight.getUnitSignificanceWeb() > bigramRightWebMin) {
        try {
            instance.setValue(i(bigram_right_significance_web, buildAttributeList()), bigramRight.getUnitSignificanceWeb());
        } catch (ArrayIndexOutOfBoundsException ignored) {
            // Best effort: the feature stays unset.
        }
    }
}