FilterPOS filterPOS = new FilterPOS();
FilterTermsize unigramFilter = new FilterTermsize(FilterTermsize.Termsize.unigram);
FilterPattern filterPattern = new FilterPattern();
SpotClassifier unigramClassifier = ClassifierFactory.getClassifierInstanceUnigram();
SpotClassifier ngramClassifier = ClassifierFactory.getClassifierInstanceNGram();
assert unigramClassifier != null;
assert ngramClassifier != null;
//ngramClassifier.setVerboseMode(true);
//unigramClassifier.setVerboseMode(true);
List<String> decisions = new LinkedList<String>();
for(SurfaceFormOccurrence surfaceFormOccurrence : surfaceFormOccurrences) {
if (surfaceFormOccurrence.surfaceForm().name().trim().length()==0) {
LOG.warn("I have an occurrence with empty surface form. :-O Ignoring.");
LOG.error(surfaceFormOccurrence);
continue;
}
if (! (surfaceFormOccurrence.context() instanceof TaggedText)) { //FIXME added this to avoid breaking, but code below will never run if we don't pass the taggedtext
LOG.error(String.format("SurfaceFormOccurrence did not contain TaggedText. Cannot apply %s",this.getClass()));
selectedOccurrences.add(surfaceFormOccurrence);
continue;
}
if(unigramFilter.applies(surfaceFormOccurrence)) {
/**
* Unigram (n = 1)
*/
if(!filterPOS.applies(surfaceFormOccurrence)) {
/**
* The Surface Form is on the POS blacklist, i.e. a single adjective,
* verb, etc.
*/
if(Character.isUpperCase(surfaceFormOccurrence.surfaceForm().name().charAt(0))){
TaggedToken taggedToken = ((TaggedText) surfaceFormOccurrence.context()).taggedTokenProvider().getTaggedTokens(surfaceFormOccurrence).get(0);
/**
* Keep capitalized adjectives (e.g. "Canadian" in "Canadian TV star")
*/
if(taggedToken.getPOSTag() != null && taggedToken.getPOSTag().startsWith("j"))
selectedOccurrences.add(surfaceFormOccurrence);
}else{
decisions.add("Dropped by POS filter: " + surfaceFormOccurrence);
}
}else if(!filterPattern.applies(surfaceFormOccurrence)){
decisions.add("Dropped by Pattern filter: " + surfaceFormOccurrence);
}else{
SpotClassification spotClassification;
try {
spotClassification = unigramClassifier.classify(surfaceFormOccurrence);
if(spotClassification.getCandidateClass() == SpotClass.valid) {
selectedOccurrences.add(surfaceFormOccurrence);
//LOG.info("Kept by UnigramClassifier (Confidence: " + spotClassification.getConfidence() + "): " + surfaceFormOccurrence);
}else{
decisions.add("Dropped by UnigramClassifier (Confidence: " + spotClassification.getConfidence() + "): " + surfaceFormOccurrence);
}
} catch (Exception e) {
LOG.error("Exception when classifying unigram candidate: " + e);
}
}
}else{
/**
* n > 1
*/
SpotClassification spotClassification;
try{
spotClassification = ngramClassifier.classify(surfaceFormOccurrence);
}catch (Exception e) {
LOG.error("Exception when classifying ngram candidate: " + e);
continue;
}