package edu.stanford.nlp.parser.lexparser;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.process.SerializableFunction;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexParseException;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.TregexPatternCompiler;
import edu.stanford.nlp.util.CollectionUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import java.util.Collection;
import java.util.Locale;
import java.util.Map;
import java.util.function.Function;
/**
 * An extension of
 * {@link edu.stanford.nlp.parser.lexparser.AbstractTreebankParserParams}
 * which provides support for Tregex-powered annotations.
 *
 * <p>Subclasses of this class provide collections of <em>features</em>
 * which are associated with annotation behaviors that seek out
 * and label matching trees in some way. For example, a <em>coord</em>
 * feature might have an annotation behavior which searches for
 * coordinating noun phrases and labels the associated constituent
 * with a suffix <tt>-coordinating</tt>.
 *
 * <p>The "search" in this process is conducted via Tregex, and the
 * actual annotation is done through execution of an arbitrary
 * {@link java.util.function.Function} provided by the user.
 * This class carries several useful common annotation functions
 * as inner classes.
 *
 * @see #annotations
 * @see SimpleStringFunction
 *
 * @author Jon Gauthier
 * @author Spence Green
 */
public abstract class TregexPoweredTreebankParserParams extends AbstractTreebankParserParams {
private static final long serialVersionUID = -1985603901694682420L;
/**
 * This data structure dictates how an arbitrary tree should be
 * annotated. It is private, so subclasses do not fill it out
 * directly; they should fill out the related member
 * {@link #annotations}, from which
 * {@link #compileAnnotations(HeadFinder)} builds this map.
 *
 * <p>It is a collection of <em>features:</em> a map from feature name
 * to behavior, where each behavior is a tuple <tt>(t, f)</tt>.
 * <tt>t</tt> is a Tregex pattern which matches subtrees
 * corresponding to the feature, and <tt>f</tt> is a function which
 * accepts such matches and generates an annotation which the matched
 * subtree should be given.
 *
 * @see #annotations
 */
private final Map<String, Pair<TregexPattern, Function<TregexMatcher, String>>> annotationPatterns
= Generics.newHashMap();
/**
 * This data structure dictates how an arbitrary tree should be
 * annotated.
 *
 * <p>It is a collection of <em>features:</em> a map from feature name
 * to behavior, where each behavior is a tuple <tt>(t, f)</tt>.
 * <tt>t</tt> is a string form of a TregexPattern which matches
 * subtrees corresponding to the feature, and <tt>f</tt> is a
 * function which accepts such matches and generates an annotation
 * which the matched subtree should be given.
 *
 * @see #annotationPatterns
 * @see SimpleStringFunction
 */
protected final Map<String, Pair<String, Function<TregexMatcher, String>>> annotations
= Generics.newHashMap();
/**
 * Features which should be enabled by default.
 */
protected abstract String[] baselineAnnotationFeatures();
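/**
 * The features which are currently enabled. This collection is
 * initialized in the constructor from
 * {@link #baselineAnnotationFeatures()}; extra features which have
 * been requested are held here as well. Use
 * {@link #addFeature(String)} to add features.
 */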
private final Collection<String> features;
/**
 * Create a parameter set for the given treebank language pack, with the
 * baseline annotation features enabled.
 *
 * <p>Note that {@link #baselineAnnotationFeatures()} is invoked from this
 * constructor, i.e. before any subclass field initializer or constructor
 * body has run. Implementations of that method must therefore not depend
 * on subclass instance state.
 *
 * @param tlp The treebank language pack, forwarded to the superclass
 *     constructor
 */
public TregexPoweredTreebankParserParams(TreebankLanguagePack tlp) {
  // Forward the language pack; it was previously accepted but never used.
  super(tlp);

  features = CollectionUtils.asSet(baselineAnnotationFeatures());
}
* Compile the {@link #annotations} collection given a
* particular head finder. Subclasses should call this method at
* least once before the class is used, and whenever the head finder
* is changed.
protected void compileAnnotations(HeadFinder hf) {
TregexPatternCompiler compiler = new TregexPatternCompiler(hf);
for (Map.Entry<String, Pair<String, Function<TregexMatcher, String>>> annotation : annotations.entrySet()) {
TregexPattern compiled;
try {
compiled = compiler.compile(annotation.getValue().first());
} catch (TregexParseException e) {
int nth = annotationPatterns.size() + 1;
System.err.println("Parse exception on annotation pattern #" + nth + " initialization: " + e);
Pair<TregexPattern, Function<TregexMatcher, String>> behavior =
new Pair<TregexPattern, Function<TregexMatcher, String>>(compiled, annotation.getValue().second());
annotationPatterns.put(annotation.getKey(), behavior);
* Enable an annotation feature. If the provided feature has already
* been enabled, this method does nothing.
* @param featureName
* @throws java.lang.IllegalArgumentException If the provided feature
* name is unknown (i.e., if there is no entry in the
* {@link #annotations} collection with the same name)
protected void addFeature(String featureName) {
if (!annotations.containsKey(featureName))
throw new IllegalArgumentException("Invalid feature name '" + featureName + "'");
if (!annotationPatterns.containsKey(featureName))
throw new RuntimeException("Compiled patterns out of sync with annotations data structure;" +
"did you call compileAnnotations?");
* Disable a feature. If the feature was never enabled, this method
* returns without error.
* @param featureName
protected void removeFeature(String featureName) {
* This method does language-specific tree transformations such as annotating particular nodes with language-relevant
* features. Such parameterizations should be inside the specific TreebankLangParserParams class. This method is
* recursively applied to each node in the tree (depth first, left-to-right), so you shouldn't write this method to
* apply recursively to tree members. This method is allowed to (and in some cases does) destructively change the
* input tree <code>t</code>. It changes both labels and the tree shape.
* @param t The input tree (with non-language specific annotation already done, so you need to strip back to basic
* categories)
* @param root The root of the current tree (can be null for words)
* @return The fully annotated tree node (with daughters still as you want them in the final result)
public Tree transformTree(Tree t, Tree root) {
String newCat = t.value() + getAnnotationString(t, root);
if (t.isPreTerminal() && t.label() instanceof HasTag)
((HasTag) t.label()).setTag(newCat);
return t;
* Build a string of annotations for the given tree.
* @param t The input tree (with non-language specific annotation
* already done, so you need to strip back to basic categories)
* @param root The root of the current tree (can be null for words)
* @return A (possibly empty) string of annotations to add to the
* given tree
protected String getAnnotationString(Tree t, Tree root) {
// Accumulate all annotations in this string
StringBuilder annotationStr = new StringBuilder();
for (String featureName : features) {
Pair<TregexPattern, Function<TregexMatcher, String>> behavior = annotationPatterns.get(featureName);
TregexMatcher m = behavior.first().matcher(root);
if (m.matchesAt(t))
return annotationStr.toString();
* Output a description of the current annotation configuration to
* standard error.
public void display() {
for (String feature : features)
System.err.printf("%s ", feature);
* Annotates all nodes that match the tregex query with some string.
protected static class SimpleStringFunction implements SerializableFunction<TregexMatcher,
String> {
private static final long serialVersionUID = 6958776731059724396L;
private String annotationMark;
public SimpleStringFunction(String annotationMark) {
this.annotationMark = annotationMark;
public String apply(TregexMatcher matcher) {
return annotationMark;
public String toString() {
return "SimpleStringFunction[" + annotationMark + ']';
* Annotate a tree constituent with its lexical head.
protected static class AnnotateHeadFunction implements SerializableFunction<TregexMatcher,
String> {
private static final long serialVersionUID = -4213299755069618322L;
private final HeadFinder headFinder;
private boolean lowerCase;
public AnnotateHeadFunction(HeadFinder hf) {
this(hf, true);
public AnnotateHeadFunction(HeadFinder hf, boolean lowerCase) {
headFinder = hf;
this.lowerCase = lowerCase;
public String apply(TregexMatcher matcher) {
Tree matchedTree = matcher.getMatch();
Tree head = headFinder.determineHead(matchedTree);
if (!head.isPrePreTerminal())
return "";
Tree lexicalHead = head.firstChild().firstChild();
String headValue = lexicalHead.value();
if (headValue != null) {
if (lowerCase) headValue = headValue.toLowerCase();
return '[' + headValue + ']';
} else {
return "";
public String toString() {
return "AnnotateHeadFunction[" + headFinder.getClass().getName() + ']';