// Source code of edu.stanford.nlp.pipeline.ParserAnnotator

package edu.stanford.nlp.pipeline;


import java.util.List;
import java.util.Properties;
import java.util.Set;


import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.parser.common.NoSuchParseException;
import edu.stanford.nlp.parser.common.ParserAnnotations;
import edu.stanford.nlp.parser.common.ParserConstraint;
import edu.stanford.nlp.parser.common.ParserGrammar;
import edu.stanford.nlp.parser.common.ParserQuery;
import edu.stanford.nlp.parser.common.ParserUtils;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.TreeBinarizer;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Trees;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.util.CoreMap;
import java.util.function.Function;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.ReflectionLoading;
import edu.stanford.nlp.util.RuntimeInterruptedException;
import edu.stanford.nlp.util.StringUtils;


/**
 * This class will add parse information to an Annotation.
 * It assumes that the Annotation already contains the tokenized words
 * as a {@code List<CoreLabel>} in the TokensAnnotation under each
 * particular CoreMap in the SentencesAnnotation.
 * If the words have POS tags, they will be used.
 * <br>
 * Parse trees are added to each sentence's CoreMap (get with
 * {@code CoreAnnotations.SentencesAnnotation}) under
 * {@code CoreAnnotations.TreeAnnotation}.
 *
 * @author Jenny Finkel
 */
public class ParserAnnotator extends SentenceAnnotator {


  private final boolean VERBOSE;
  private final boolean BUILD_GRAPHS;
  private final ParserGrammar parser;


  private final Function<Tree, Tree> treeMap;


  /** Do not parse sentences larger than this sentence length */
  private final int maxSentenceLength;


  /**
   * Stop parsing if we exceed this time limit, in milliseconds.
   * Use 0 for no limit.
   */
  private final long maxParseTime;


  private final GrammaticalStructureFactory gsf;


  private final int nThreads;


  private final boolean saveBinaryTrees;


  public ParserAnnotator(boolean verbose, int maxSent) {
    this(System.getProperty("parse.model", LexicalizedParser.DEFAULT_PARSER_LOC), verbose, maxSent, StringUtils.EMPTY_STRING_ARRAY);
  }


  public ParserAnnotator(String parserLoc,
                         boolean verbose,
                         int maxSent,
                         String[] flags) {
    this(loadModel(parserLoc, verbose, flags), verbose, maxSent);
  }


  public ParserAnnotator(ParserGrammar parser, boolean verbose, int maxSent) {
    this(parser, verbose, maxSent, null);
  }


  public ParserAnnotator(ParserGrammar parser, boolean verbose, int maxSent, Function<Tree, Tree> treeMap) {
    VERBOSE = verbose;
    this.BUILD_GRAPHS = parser.getTLPParams().supportsBasicDependencies();
    this.parser = parser;
    this.maxSentenceLength = maxSent;
    this.treeMap = treeMap;
    this.maxParseTime = 0;
    if (this.BUILD_GRAPHS) {
      TreebankLanguagePack tlp = parser.getTLPParams().treebankLanguagePack();
      this.gsf = tlp.grammaticalStructureFactory(tlp.punctuationWordRejectFilter(), parser.getTLPParams().typedDependencyHeadFinder());
    } else {
      this.gsf = null;
    }
    this.nThreads = 1;
    this.saveBinaryTrees = false;
  }




  public ParserAnnotator(String annotatorName, Properties props) {
    String model = props.getProperty(annotatorName + ".model", LexicalizedParser.DEFAULT_PARSER_LOC);
    if (model == null) {
      throw new IllegalArgumentException("No model specified for Parser annotator " + annotatorName);
    }
    this.VERBOSE = PropertiesUtils.getBool(props, annotatorName + ".debug", false);


    String[] flags = convertFlagsToArray(props.getProperty(annotatorName + ".flags"));
    this.parser = loadModel(model, VERBOSE, flags);
    this.maxSentenceLength = PropertiesUtils.getInt(props, annotatorName + ".maxlen", -1);


    String treeMapClass = props.getProperty(annotatorName + ".treemap");
    if (treeMapClass == null) {
      this.treeMap = null;
    } else {
      this.treeMap = ReflectionLoading.loadByReflection(treeMapClass, props);
    }


    this.maxParseTime = PropertiesUtils.getLong(props, annotatorName + ".maxtime", -1);


    String buildGraphsProperty = annotatorName + ".buildgraphs";
    if (!this.parser.getTLPParams().supportsBasicDependencies()) {
      if (props.getProperty(buildGraphsProperty) != null && PropertiesUtils.getBool(props, buildGraphsProperty)) {
        System.err.println("WARNING: " + buildGraphsProperty + " set to true, but " + this.parser.getTLPParams().getClass() + " does not support dependencies");
      }
      this.BUILD_GRAPHS = false;
    } else {
      this.BUILD_GRAPHS = PropertiesUtils.getBool(props, buildGraphsProperty, true);
    }


    if (this.BUILD_GRAPHS) {
      TreebankLanguagePack tlp = parser.getTLPParams().treebankLanguagePack();
      // TODO: expose keeping punctuation as an option to the user?
      this.gsf = tlp.grammaticalStructureFactory(tlp.punctuationWordRejectFilter(), parser.getTLPParams().typedDependencyHeadFinder());
    } else {
      this.gsf = null;
    }


    this.nThreads = PropertiesUtils.getInt(props, annotatorName + ".nthreads", PropertiesUtils.getInt(props, "nthreads", 1));
    boolean usesBinary = StanfordCoreNLP.usesBinaryTrees(props);
    this.saveBinaryTrees = PropertiesUtils.getBool(props, annotatorName + ".binaryTrees", usesBinary);
  }


  public static String signature(String annotatorName, Properties props) {
    StringBuilder os = new StringBuilder();
    os.append(annotatorName + ".model:" +
            props.getProperty(annotatorName + ".model",
                    LexicalizedParser.DEFAULT_PARSER_LOC));
    os.append(annotatorName + ".debug:" +
            props.getProperty(annotatorName + ".debug", "false"));
    os.append(annotatorName + ".flags:" +
            props.getProperty(annotatorName + ".flags", ""));
    os.append(annotatorName + ".maxlen:" +
            props.getProperty(annotatorName + ".maxlen", "-1"));
    os.append(annotatorName + ".treemap:" +
            props.getProperty(annotatorName + ".treemap", ""));
    os.append(annotatorName + ".maxtime:" +
            props.getProperty(annotatorName + ".maxtime", "-1"));
    os.append(annotatorName + ".buildgraphs:" +
            props.getProperty(annotatorName + ".buildgraphs", "true"));
    os.append(annotatorName + ".nthreads:" +
              props.getProperty(annotatorName + ".nthreads", props.getProperty("nthreads", "")));
    boolean usesBinary = StanfordCoreNLP.usesBinaryTrees(props);
    boolean saveBinaryTrees = PropertiesUtils.getBool(props, annotatorName + ".binaryTrees", usesBinary);
    os.append(annotatorName + ".binaryTrees:" + saveBinaryTrees);


    return os.toString();
  }


  public static String[] convertFlagsToArray(String parserFlags) {
    if (parserFlags == null || parserFlags.trim().equals("")) {
      return StringUtils.EMPTY_STRING_ARRAY;
    } else {
      return parserFlags.trim().split("\\s+");
    }
  }


  private static ParserGrammar loadModel(String parserLoc,
                                         boolean verbose,
                                         String[] flags) {
    if (verbose) {
      System.err.println("Loading Parser Model [" + parserLoc + "] ...");
      System.err.print("  Flags:");
      for (String flag : flags) {
        System.err.print("  " + flag);
      }
      System.err.println();
    }
    ParserGrammar result = ParserGrammar.loadModel(parserLoc);
    result.setOptionFlags(result.defaultCoreNLPFlags());
    result.setOptionFlags(flags);


    return result;
  }


  @Override
  protected int nThreads() {
    return nThreads;
  }


  @Override
  protected long maxTime() {
    return maxParseTime;
  };  


  @Override
  protected void doOneSentence(Annotation annotation, CoreMap sentence) {
    final List<CoreLabel> words = sentence.get(CoreAnnotations.TokensAnnotation.class);
    if (VERBOSE) {
      System.err.println("Parsing: " + words);
    }
    Tree tree = null;
    // generate the constituent tree
    if (maxSentenceLength <= 0 || words.size() <= maxSentenceLength) {
      try {
        final List<ParserConstraint> constraints = sentence.get(ParserAnnotations.ConstraintAnnotation.class);
        tree = doOneSentence(constraints, words);
      } catch (RuntimeInterruptedException e) {
        if (VERBOSE) {
          System.err.println("Took too long parsing: " + words);
        }
        tree = null;
      }
    }
    // tree == null may happen if the parser takes too long or if
    // the sentence is longer than the max length
    if (tree == null) {
      doOneFailedSentence(annotation, sentence);
    } else {
      finishSentence(sentence, tree);
    }
  }


  @Override
  public void doOneFailedSentence(Annotation annotation, CoreMap sentence) {
    final List<CoreLabel> words = sentence.get(CoreAnnotations.TokensAnnotation.class);
    Tree tree = ParserUtils.xTree(words);
    for (CoreLabel word : words) {
      if (word.tag() == null) {
        word.setTag("XX");
      }
    }
    finishSentence(sentence, tree);
  }


  private void finishSentence(CoreMap sentence, Tree tree) {
    if (treeMap != null) {
      tree = treeMap.apply(tree);
    }


    ParserAnnotatorUtils.fillInParseAnnotations(VERBOSE, BUILD_GRAPHS, gsf, sentence, tree);


    if (saveBinaryTrees) {
      TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
      Tree binarized = binarizer.transformTree(tree);
      Trees.convertToCoreLabels(binarized);
      sentence.set(TreeCoreAnnotations.BinarizedTreeAnnotation.class, binarized);
    }
  }


  private Tree doOneSentence(List<ParserConstraint> constraints,
                             List<CoreLabel> words) {
    ParserQuery pq = parser.parserQuery();
    pq.setConstraints(constraints);
    pq.parse(words);
    Tree tree = null;
    try {
      tree = pq.getBestParse();
      if (tree == null) {
        System.err.println("WARNING: Parsing of sentence failed.  " +
                         "Will ignore and continue: " +
                         Sentence.listToString(words));
      } else {
        // -10000 denotes unknown words
        tree.setScore(pq.getPCFGScore() % -10000.0);
      }
    } catch (OutOfMemoryError e) {
      System.err.println("WARNING: Parsing of sentence ran out of memory.  " +
                         "Will ignore and continue: " +
                         Sentence.listToString(words));
    } catch (NoSuchParseException e) {
      System.err.println("WARNING: Parsing of sentence failed, possibly because of out of memory.  " +
                         "Will ignore and continue: " +
                         Sentence.listToString(words));
    }
    return tree;
  }


  @Override
  public Set<Requirement> requires() {
    return parser.requiresTags() ? TOKENIZE_SSPLIT_POS : TOKENIZE_AND_SSPLIT;
  }


  @Override
  public Set<Requirement> requirementsSatisfied() {
    if (this.saveBinaryTrees) {
      return PARSE_TAG_BINARIZED_TREES;
    } else {
      return PARSE_AND_TAG;
    }
  }
}
// Source code of edu.stanford.nlp.pipeline.ParserAnnotator
//
// Related classes of edu.stanford.nlp.pipeline.ParserAnnotator