package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Set;
/**
 * This calls NumberSequenceClassifier, which is a rule-based classifier, which
 * adds a NUMBER entity tag to numbers not already given another entity tag, and
 * also has additional rules for marking MONEY, TIME, and DATE. It assumes that
 * tokens already have a (POS) TagAnnotation, and an original round of NER that
 * covers MONEY and American DATE/TIME formats, such as MUC NER in
 * AnswerAnnotation, to which we add.
 * <p>
 * A token's existing NER label is only replaced when it is missing, equal to
 * the configured background symbol, or {@code "MISC"}, and the classifier
 * proposes a non-background label for it.
 *
 * @author Jenny Finkel
 */
public class NumberAnnotator implements Annotator {

  /** Background symbol used when none is configured. */
  private static final String DEFAULT_BACKGROUND_SYMBOL = "O";

  /**
   * Suffix of the property holding the background symbol; the full key is
   * {@code name + "." + BACKGROUND_SYMBOL_PROPERTY}.
   */
  public static final String BACKGROUND_SYMBOL_PROPERTY = "background";

  /** Classifier that produces the number/date/time/money guesses. */
  private final AbstractSequenceClassifier<CoreLabel> nsc;

  /** Whether progress and per-token debugging output is written to stderr. */
  private final boolean verbose;

  /** Label meaning "not an entity"; such labels may be overwritten. */
  private final String backgroundSymbol;

  /**
   * Creates a verbose annotator with the default background symbol and the
   * default SUTime setting.
   */
  public NumberAnnotator() {
    this(DEFAULT_BACKGROUND_SYMBOL, true, NumberSequenceClassifier.USE_SUTIME_DEFAULT);
  }

  /**
   * Creates an annotator with the default background symbol and the default
   * SUTime setting.
   *
   * @param verbose whether to write progress output to stderr
   */
  public NumberAnnotator(boolean verbose) {
    this(DEFAULT_BACKGROUND_SYMBOL, verbose, NumberSequenceClassifier.USE_SUTIME_DEFAULT);
  }

  /**
   * Creates an annotator with the default background symbol.
   *
   * @param verbose whether to write progress output to stderr
   * @param useSUTime passed through to the {@link NumberSequenceClassifier}
   */
  public NumberAnnotator(boolean verbose, boolean useSUTime) {
    this(DEFAULT_BACKGROUND_SYMBOL, verbose, useSUTime);
  }

  /**
   * Creates a fully specified annotator.
   *
   * @param backgroundSymbol label treated as "no entity"
   * @param verbose whether to write progress output to stderr
   * @param useSUTime passed through to the {@link NumberSequenceClassifier}
   */
  public NumberAnnotator(String backgroundSymbol, boolean verbose, boolean useSUTime) {
    this.backgroundSymbol = backgroundSymbol;
    this.verbose = verbose;
    this.nsc = new NumberSequenceClassifier(useSUTime);
  }

  /**
   * Creates a non-verbose annotator configured from properties. The background
   * symbol is read from {@code name + ".background"} (default {@code "O"});
   * the SUTime flag is read from
   * {@link NumberSequenceClassifier#USE_SUTIME_PROPERTY}, which is not
   * prefixed with {@code name}.
   *
   * @param name prefix of the background-symbol property
   * @param props properties to read the configuration from
   */
  public NumberAnnotator(String name, Properties props) {
    this(props.getProperty(name + "." + BACKGROUND_SYMBOL_PROPERTY, DEFAULT_BACKGROUND_SYMBOL),
         false,
         PropertiesUtils.getBool(props,
                                 NumberSequenceClassifier.USE_SUTIME_PROPERTY,
                                 NumberSequenceClassifier.USE_SUTIME_DEFAULT));
  }

  /**
   * Runs the number classifier over the annotation. If the annotation has
   * sentences, each sentence's tokens are processed separately; otherwise the
   * annotation's own token list is processed as a single unit.
   *
   * @param annotation the document to annotate in place
   * @throws RuntimeException if the annotation has neither sentences nor tokens
   */
  @Override
  public void annotate(Annotation annotation) {
    if (verbose) {
      System.err.print("Adding number annotation ... ");
    }

    if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
      // classify tokens for each sentence
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        doOneSentenceNew(tokens, annotation, sentence);
      }
      if (verbose) {
        System.err.println("done. Output: " + annotation.get(CoreAnnotations.SentencesAnnotation.class));
      }
    } else if (annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
      List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
      doOneSentenceNew(tokens, annotation, null);
    } else {
      throw new RuntimeException("unable to find sentences in: " + annotation);
    }
  }

  /**
   * Classifies a copy of the given tokens and merges the results back into
   * the originals.
   *
   * @param words the original tokens, updated in place
   * @param doc the enclosing document, handed to the classifier
   * @param sentence the sentence the tokens belong to, or {@code null} when
   *        the tokens come straight from the document
   */
  private void doOneSentenceNew(List<CoreLabel> words, Annotation doc, CoreMap sentence) {
    // The classifier works on copies so the original labels stay intact until
    // we decide, token by token, what to carry over.
    List<CoreLabel> newWords = NumberSequenceClassifier.copyTokens(words, sentence);
    nsc.classifyWithGlobalInformation(newWords, doc, sentence);

    Iterator<? extends CoreLabel> newFLIter = newWords.iterator();
    for (CoreLabel origWord : words) {
      CoreLabel newWord = newFLIter.next();
      String before = origWord.ner();
      String newGuess = newWord.get(CoreAnnotations.AnswerAnnotation.class);

      if (verbose) {
        System.err.println(newWord);
      }

      if (shouldOverrideNer(before, newGuess)) {
        origWord.setNER(newGuess);
      }

      // transfer other annotations generated by SUTime or NumberNormalizer
      NumberSequenceClassifier.transferAnnotations(newWord, origWord);
    }
  }

  /**
   * Decides whether the classifier's guess should replace the existing label.
   *
   * @param before the token's current NER label, possibly {@code null}
   * @param newGuess the classifier's label, possibly {@code null}
   * @return {@code true} if {@code newGuess} is a real (non-null,
   *         non-background) label and {@code before} is missing, background,
   *         or {@code "MISC"}
   */
  private boolean shouldOverrideNer(String before, String newGuess) {
    // A missing guess carries no information; never overwrite with it
    // (previously this case raised a NullPointerException).
    if (newGuess == null || newGuess.equals(backgroundSymbol)) {
      return false;
    }
    return before == null || before.equals(backgroundSymbol) || before.equals("MISC");
  }

  /**
   * @return the single requirement of this annotator: tokenization
   */
  @Override
  public Set<Requirement> requires() {
    return Collections.singleton(TOKENIZE_REQUIREMENT);
  }

  /**
   * @return the single requirement this annotator satisfies: number annotation
   */
  @Override
  public Set<Requirement> requirementsSatisfied() {
    // technically it adds some NER, but someone who wants full NER
    // labels will be very disappointed, so we do not claim to produce NER
    return Collections.singleton(NUMBER_REQUIREMENT);
  }
}