package edu.stanford.nlp.ie;
import java.io.FileNotFoundException;
import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.RuntimeInterruptedException;
import edu.stanford.nlp.util.StringUtils;
/**
 * Subclass of ClassifierCombiner that behaves like an NER system, by copying
 * the AnswerAnnotation labels to NamedEntityTagAnnotation. Optionally, it also
 * runs an additional classifier (NumberSequenceClassifier) to recognize numeric
 * entities, and then QuantifiableEntityNormalizer to normalize them.
 *
 * @author Mihai Surdeanu
 */
public class NERClassifierCombiner extends ClassifierCombiner<CoreLabel> {
private final boolean applyNumericClassifiers;
public static final boolean APPLY_NUMERIC_CLASSIFIERS_DEFAULT = true;
public static final String APPLY_NUMERIC_CLASSIFIERS_PROPERTY = "ner.applyNumericClassifiers";
private final boolean useSUTime;
private final AbstractSequenceClassifier<CoreLabel> nsc;
public NERClassifierCombiner(Properties props)
throws FileNotFoundException
{
super(props);
applyNumericClassifiers = PropertiesUtils.getBool(props, APPLY_NUMERIC_CLASSIFIERS_PROPERTY, APPLY_NUMERIC_CLASSIFIERS_DEFAULT);
useSUTime = PropertiesUtils.getBool(props, NumberSequenceClassifier.USE_SUTIME_PROPERTY, NumberSequenceClassifier.USE_SUTIME_DEFAULT);
nsc = new NumberSequenceClassifier(new Properties(), useSUTime, props);
}
public NERClassifierCombiner(String... loadPaths)
throws FileNotFoundException
{
this(APPLY_NUMERIC_CLASSIFIERS_DEFAULT, NumberSequenceClassifier.USE_SUTIME_DEFAULT, loadPaths);
}
public NERClassifierCombiner(boolean applyNumericClassifiers,
boolean useSUTime,
String... loadPaths)
throws FileNotFoundException
{
super(loadPaths);
this.applyNumericClassifiers = applyNumericClassifiers;
this.useSUTime = useSUTime;
this.nsc = new NumberSequenceClassifier(useSUTime);
}
public NERClassifierCombiner(boolean applyNumericClassifiers,
boolean useSUTime,
Properties nscProps,
String... loadPaths)
throws FileNotFoundException
{
super(ClassifierCombiner.extractCombinationModeSafe(nscProps), loadPaths);
this.applyNumericClassifiers = applyNumericClassifiers;
this.useSUTime = useSUTime;
this.nsc = new NumberSequenceClassifier(new Properties(), useSUTime, nscProps);
}
public NERClassifierCombiner(AbstractSequenceClassifier<CoreLabel>... classifiers)
throws FileNotFoundException
{
this(APPLY_NUMERIC_CLASSIFIERS_DEFAULT, NumberSequenceClassifier.USE_SUTIME_DEFAULT, classifiers);
}
public NERClassifierCombiner(boolean applyNumericClassifiers,
boolean useSUTime,
AbstractSequenceClassifier<CoreLabel>... classifiers)
throws FileNotFoundException
{
super(classifiers);
this.applyNumericClassifiers = applyNumericClassifiers;
this.useSUTime = useSUTime;
this.nsc = new NumberSequenceClassifier(useSUTime);
}
public boolean appliesNumericClassifiers() {
return applyNumericClassifiers;
}
public boolean usesSUTime() {
return useSUTime;
}
private static <INN extends CoreMap> void copyAnswerFieldsToNERField(List<INN> l) {
for (INN m: l) {
m.set(CoreAnnotations.NamedEntityTagAnnotation.class, m.get(CoreAnnotations.AnswerAnnotation.class));
}
}
@Override
public List<CoreLabel> classify(List<CoreLabel> tokens) {
return classifyWithGlobalInformation(tokens, null, null);
}
@Override
public List<CoreLabel> classifyWithGlobalInformation(List<CoreLabel> tokens, final CoreMap document, final CoreMap sentence) {
List<CoreLabel> output = super.classify(tokens);
if (applyNumericClassifiers) {
try {
// recognizes additional MONEY, TIME, DATE, and NUMBER using a set of deterministic rules
// note: some DATE and TIME entities are recognized by our statistical NER based on MUC
// note: this includes SUTime
// note: requires TextAnnotation, PartOfSpeechTagAnnotation, and AnswerAnnotation
// note: this sets AnswerAnnotation!
recognizeNumberSequences(output, document, sentence);
} catch (RuntimeInterruptedException e) {
throw e;
} catch (Exception e) {
System.err.println("Ignored an exception in NumberSequenceClassifier: (result is that some numbers were not classified)");
System.err.println("Tokens: " + StringUtils.joinWords(tokens, " "));
e.printStackTrace(System.err);
}
// AnswerAnnotation -> NERAnnotation
copyAnswerFieldsToNERField(output);
try {
// normalizes numeric entities such as MONEY, TIME, DATE, or PERCENT
// note: this uses and sets NamedEntityTagAnnotation!
QuantifiableEntityNormalizer.addNormalizedQuantitiesToEntities(output, false, useSUTime);
} catch (Exception e) {
System.err.println("Ignored an exception in QuantifiableEntityNormalizer: (result is that entities were not normalized)");
System.err.println("Tokens: " + StringUtils.joinWords(tokens, " "));
e.printStackTrace(System.err);
} catch(AssertionError e) {
System.err.println("Ignored an assertion in QuantifiableEntityNormalizer: (result is that entities were not normalized)");
System.err.println("Tokens: " + StringUtils.joinWords(tokens, " "));
e.printStackTrace(System.err);
}
} else {
// AnswerAnnotation -> NERAnnotation
copyAnswerFieldsToNERField(output);
}
return output;
}
private void recognizeNumberSequences(List<CoreLabel> words, final CoreMap document, final CoreMap sentence) {
// we need to copy here because NumberSequenceClassifier overwrites the AnswerAnnotation
List<CoreLabel> newWords = NumberSequenceClassifier.copyTokens(words, sentence);
nsc.classifyWithGlobalInformation(newWords, document, sentence);
// copy AnswerAnnotation back. Do not overwrite!
// also, copy all the additional annotations generated by SUTime and NumberNormalizer
for (int i = 0, sz = words.size(); i < sz; i++){
CoreLabel origWord = words.get(i);
CoreLabel newWord = newWords.get(i);
// System.err.println(newWord.word() + " => " + newWord.get(CoreAnnotations.AnswerAnnotation.class) + " " + origWord.ner());
String before = origWord.get(CoreAnnotations.AnswerAnnotation.class);
String newGuess = newWord.get(CoreAnnotations.AnswerAnnotation.class);
if ((before == null || before.equals(nsc.flags.backgroundSymbol) || before.equals("MISC")) && !newGuess.equals(nsc.flags.backgroundSymbol)) {
origWord.set(CoreAnnotations.AnswerAnnotation.class, newGuess);
}
// transfer other annotations generated by SUTime or NumberNormalizer
NumberSequenceClassifier.transferAnnotations(newWord, origWord);
}
}
public void finalizeAnnotation(Annotation annotation) {
nsc.finalizeClassification(annotation);
}
}