Package edu.stanford.nlp.pipeline

Source Code of edu.stanford.nlp.pipeline.NumberAnnotator

package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;

import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Set;

/**
* This calls NumberSequenceClassifier, which is a rule-based classifier, which
* adds a NUMBER entity tag to numbers not already given another entity tag, and
* also has additional rules for marking MONEY, TIME, and DATE. It assumes that
* tokens already have a (POS) TagAnnotation, and an original round of NER that
* covers MONEY and American DATE/TIME formats, such as MUC NER in
* AnswerAnnotation, to which we add.
*
* @author Jenny Finkel
*/

public class NumberAnnotator implements Annotator {

  private final AbstractSequenceClassifier<CoreLabel> nsc;

  private boolean VERBOSE = true;
  private static final String DEFAULT_BACKGROUND_SYMBOL = "O";
  private String BACKGROUND_SYMBOL;

  public static final String BACKGROUND_SYMBOL_PROPERTY = "background";

  public NumberAnnotator() {
    this(DEFAULT_BACKGROUND_SYMBOL, true, NumberSequenceClassifier.USE_SUTIME_DEFAULT);
  }

  public NumberAnnotator(boolean verbose) {
    this(DEFAULT_BACKGROUND_SYMBOL, verbose, NumberSequenceClassifier.USE_SUTIME_DEFAULT);
  }

  public NumberAnnotator(boolean verbose, boolean useSUTime) {
    this(DEFAULT_BACKGROUND_SYMBOL, verbose, useSUTime);
  }

  public NumberAnnotator(String backgroundSymbol, boolean verbose, boolean useSUTime) {
    BACKGROUND_SYMBOL = backgroundSymbol;
    VERBOSE = verbose;
    nsc = new NumberSequenceClassifier(useSUTime);
  }

  public NumberAnnotator(String name, Properties props) {
    String property = name + "." + BACKGROUND_SYMBOL_PROPERTY;
    BACKGROUND_SYMBOL = props.getProperty(property, DEFAULT_BACKGROUND_SYMBOL);
    boolean useSUTime = PropertiesUtils.getBool(props,
        NumberSequenceClassifier.USE_SUTIME_PROPERTY,
        NumberSequenceClassifier.USE_SUTIME_DEFAULT);
    VERBOSE = false;
    nsc = new NumberSequenceClassifier(useSUTime);
  }

  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      System.err.print("Adding number annotation ... ");
    }

    if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
      // classify tokens for each sentence
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        doOneSentenceNew(tokens, annotation, sentence);
      }
      if (VERBOSE) {
        System.err.println("done. Output: " + annotation.get(CoreAnnotations.SentencesAnnotation.class));
      }
    } else if (annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
      List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
      doOneSentenceNew(tokens, annotation, null);
    } else {
      throw new RuntimeException("unable to find sentences in: " + annotation);
    }
  }

  private void doOneSentenceNew(List<CoreLabel> words, Annotation doc, CoreMap sentence) {
    List<CoreLabel> newWords = NumberSequenceClassifier.copyTokens(words, sentence);

    nsc.classifyWithGlobalInformation(newWords, doc, sentence);

    Iterator<? extends CoreLabel> newFLIter = newWords.iterator();
    for (CoreLabel origWord : words) {
      CoreLabel newWord = newFLIter.next();
      String before = origWord.ner();
      String newGuess = newWord.get(CoreAnnotations.AnswerAnnotation.class);
      // System.err.println(origWord.word());
      // System.err.println(origWord.ner());
      if (VERBOSE)
        System.err.println(newWord);
      // System.err.println("-------------------------------------");
      if ((before == null || before.equals(BACKGROUND_SYMBOL) || before.equals("MISC"))
          && !newGuess.equals(BACKGROUND_SYMBOL)) {
        origWord.setNER(newGuess);
      }

      // transfer other annotations generated by SUTime or NumberNormalizer
      NumberSequenceClassifier.transferAnnotations(newWord, origWord);
    }
  }


  @Override
  public Set<Requirement> requires() {
    return Collections.singleton(TOKENIZE_REQUIREMENT);
  }

  @Override
  public Set<Requirement> requirementsSatisfied() {
    // technically it adds some NER, but someone who wants full NER
    // labels will be very disappointed, so we do not claim to produce NER
    return Collections.singleton(NUMBER_REQUIREMENT);
  }
}
TOP

Related Classes of edu.stanford.nlp.pipeline.NumberAnnotator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.