package edu.stanford.nlp.parser.lexparser;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.ling.HasCategory;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
/**
 * Performs non-language specific annotation of Trees.
 *
 * <p>The annotations (parent/grandparent annotation, sister annotation,
 * unary marking, recursion marking, Strahler numbers, ...) are switched on
 * and off through the {@code TrainOptions} taken from the {@code Options}
 * object given to the constructor. After the language-independent
 * annotation of a node is done, the node is handed to
 * {@code TreebankLangParserParams.transformTree} for language-specific
 * processing.
 *
 * @author Dan Klein
 * @author Christopher Manning
 */
public class TreeAnnotator implements TreeTransformer {

  /** Factory used to build the defensive copy of each incoming tree. */
  private final TreeFactory tf;
  /** Language-specific parameters; supplies basic categories and the final per-node transform. */
  private final TreebankLangParserParams tlpParams;
  /** Head finder used to percolate head word and tag up to phrasal nodes. */
  private final HeadFinder hf;
  /** Flags that select which annotations are applied. */
  private final TrainOptions trainOptions;

  /**
   * Creates an annotator.
   *
   * @param hf   Head finder used to determine the head child of phrasal nodes
   * @param tlpp Language-specific parser parameters
   * @param op   Options whose {@code trainOptions} control the annotations
   */
  public TreeAnnotator(HeadFinder hf, TreebankLangParserParams tlpp,
                       Options op) {
    this.tlpParams = tlpp;
    this.hf = hf;
    this.tf = new LabeledScoredTreeFactory();
    this.trainOptions = op.trainOptions;
  }

  /**
   * Do the category splitting of the tree passed in.
   * This method defensively copies its argument, which is not changed.
   *
   * @param t The tree to be annotated. This can be any tree with a
   *          {@code value()} stored in Labels. The tree is assumed to have
   *          preterminals that are parts of speech.
   * @return The annotated version of the Tree (which is a completely
   *         separate Tree with new tree structure and new labels). New
   *         labels are created with the label factory of the copied node's
   *         own label; head word and tag are stored on them when the label
   *         type supports it.
   */
  @Override
  public Tree transformTree(Tree t) {
    // make a defensive copy which the helper method can then mangle
    Tree copy = t.deepCopy(tf);
    if (trainOptions.markStrahler) {
      markStrahler(copy);
    }
    return transformTreeHelper(copy, copy);
  }

  /**
   * Do the category splitting of the tree passed in.
   * This is initially called on the root node of a tree, and it recursively
   * calls itself on children. A depth first left-to-right traversal is
   * done whereby a tree node's children are first transformed and then
   * the parent is transformed. At the time of calling, the original root
   * always sits above the current node. This routine can be assumed to,
   * and does, change the tree passed in: it destructively modifies tree
   * nodes, and makes new tree structure when it needs to.
   *
   * @param t    The tree node to subcategorize.
   * @param root The root of the tree. It must contain {@code t} or
   *             this code will throw a NullPointerException. May be
   *             {@code null}, in which case {@code t} is treated as having
   *             no parent (this is how leaves are visited).
   * @return The annotated tree, or {@code null} if {@code t} is {@code null}.
   * @throws RuntimeException if no head child can be found for a phrasal
   *         node, or if the head child's label carries no word or no tag
   */
  private Tree transformTreeHelper(Tree t, Tree root) {
    if (t == null) {
      return null;
    }
    if (t.isLeaf()) {
      // No need to change the label of a word
      return t;
    }

    String cat = t.label().value();

    // Work out parent and grandparent categories ("" when there is none).
    Tree parent;
    String parentStr;
    String grandParentStr;
    if (root == null || t.equals(root)) {
      parent = null;
      parentStr = "";
    } else {
      parent = t.parent(root);
      parentStr = parent.label().value();
    }
    if (parent == null || parent.equals(root)) {
      grandParentStr = "";
    } else {
      grandParentStr = parent.parent(root).label().value();
    }
    String baseParentStr = basicCategory(parentStr);
    String baseGrandParentStr = basicCategory(grandParentStr);

    if (t.isPreTerminal()) {
      // handle tags
      Tree childResult = transformTreeHelper(t.children()[0], null); // recurse
      String word = childResult.value(); // would be nicer if Word/CWT ??

      if (!trainOptions.noTagSplit) {
        if (trainOptions.tagPA) {
          String test = cat + "^" + baseParentStr;
          if (!trainOptions.tagSelectiveSplit || trainOptions.splitters.contains(test)) {
            cat = test;
          }
        }
        // parent is null when the preterminal is itself the root; guard so
        // that case is simply not marked instead of failing with an NPE.
        if (trainOptions.markUnaryTags && parent != null && parent.numChildren() == 1) {
          cat = cat + "^U";
        }
      } // otherwise, leave the tags alone!

      // For a preterminal the (possibly split) category doubles as the tag.
      relabel(t, cat, word, cat);
      t.setChild(0, childResult); // just in case word is changed
      if (trainOptions.noTagSplit) {
        return t;
      }
      // language-specific transforms
      return tlpParams.transformTree(t, root);
    } // end isPreTerminal()

    // handle phrasal categories: children first, then this node
    Tree[] kids = t.children();
    for (int childNum = 0; childNum < kids.length; childNum++) {
      Tree childResult = transformTreeHelper(kids[childNum], root); // recursive call
      t.setChild(childNum, childResult);
    }

    Tree headChild = hf.determineHead(t);
    if (headChild == null || headChild.label() == null) {
      throw new RuntimeException("TreeAnnotator: null head found for tree [suggesting incomplete/wrong HeadFinder]:\n" + t);
    }
    Label headLabel = headChild.label();
    if (!(headLabel instanceof HasWord)) {
      throw new RuntimeException("TreeAnnotator: Head label lacks a Word annotation!");
    }
    if (!(headLabel instanceof HasTag)) {
      throw new RuntimeException("TreeAnnotator: Head label lacks a Tag annotation!");
    }
    String word = ((HasWord) headLabel).word();
    String tag = ((HasTag) headLabel).tag();

    String baseCat = basicCategory(cat);

    if (trainOptions.sisterAnnotate && !trainOptions.smoothing && baseParentStr.length() > 0) {
      cat = cat + sisterAnnotation(t, parent, baseCat);
    }

    // parent annotation
    if (trainOptions.PA && !trainOptions.smoothing && baseParentStr.length() > 0) {
      String cat2 = baseCat + "^" + baseParentStr;
      if (!trainOptions.selectiveSplit || trainOptions.splitters.contains(cat2)) {
        cat = cat + "^" + baseParentStr;
      }
    }

    // grandparent annotation
    if (trainOptions.gPA && !trainOptions.smoothing && grandParentStr.length() > 0) {
      if (trainOptions.selectiveSplit) {
        String cat2 = baseCat + "^" + baseParentStr + "~" + baseGrandParentStr;
        // only add a grandparent split on top of an existing "^" split
        if (cat.contains("^") && trainOptions.splitters.contains(cat2)) {
          cat = cat + "~" + baseGrandParentStr;
        }
      } else {
        cat = cat + "~" + baseGrandParentStr;
      }
    }

    // unary marking: mode 1 marks the unary parent, mode 2 marks the only child
    if (trainOptions.markUnary > 0) {
      if (trainOptions.markUnary == 1 && kids.length == 1 && kids[0].depth() >= 2) {
        cat = cat + "-U";
      } else if (trainOptions.markUnary == 2 && parent != null && parent.numChildren() == 1 && t.depth() >= 2) {
        cat = cat + "-u";
      }
    }

    if (trainOptions.rightRec && rightRec(t, baseCat)) {
      cat = cat + "-R";
    }
    if (trainOptions.leftRec && leftRec(t, baseCat)) {
      cat = cat + "-L";
    }
    if (trainOptions.splitPrePreT && t.isPrePreTerminal()) {
      cat = cat + "-PPT";
    }

    relabel(t, cat, word, tag);
    return tlpParams.transformTree(t, root);
  }

  /**
   * Computes the sister-annotation suffix for a phrasal node.
   *
   * <p>Potential problem: if multiple sisters are strong indicators for a
   * single category's expansions. This happens concretely in the Chinese
   * Treebank when NP (object) has left sisters VV and AS. Could lead to too
   * much sparseness. The ideal solution would be to give the splitting list
   * an ordering, and take only the highest (~most informative/reliable)
   * sister annotation. As it stands, the first entry of
   * {@code trainOptions.sisterSplitters} (in its iteration order) that
   * matches one of the node's sisters wins.
   *
   * @param t       The node being annotated
   * @param parent  The parent of {@code t}
   * @param baseCat The basic category of {@code t}
   * @return The suffix to append to the node's category (of the form
   *         {@code "=l=X"} or {@code "=r=X"}), or the empty string if no
   *         sister splitter applies
   */
  private String sisterAnnotation(Tree t, Tree parent, String baseCat) {
    List<String> leftSis = listBasicCategories(SisterAnnotationStats.leftSisterLabels(t, parent));
    List<String> rightSis = listBasicCategories(SisterAnnotationStats.rightSisterLabels(t, parent));

    List<String> leftAnn = new ArrayList<String>();
    for (String s : leftSis) {
      leftAnn.add(baseCat + "=l=" + basicCategory(s));
    }
    List<String> rightAnn = new ArrayList<String>();
    for (String s : rightSis) {
      rightAnn.add(baseCat + "=r=" + basicCategory(s));
    }

    for (String annCat : trainOptions.sisterSplitters) {
      if (leftAnn.contains(annCat) || rightAnn.contains(annCat)) {
        // Every candidate was built as baseCat + "=l=/=r=" + sister, so the
        // matching splitter starts with baseCat. Strip that prefix literally;
        // a regex here would misbehave on categories with metacharacters.
        return annCat.substring(baseCat.length());
      }
    }
    return "";
  }

  /**
   * Replaces the label of {@code t} with a fresh label made by the old
   * label's factory, carrying the given category and, where the label type
   * supports them, the given word and tag.
   *
   * @param t    The node to relabel (modified in place)
   * @param cat  The new value / category
   * @param word The head word to store if the label implements {@code HasWord}
   * @param tag  The head tag to store if the label implements {@code HasTag}
   */
  private static void relabel(Tree t, String cat, String word, String tag) {
    Label label = t.label().labelFactory().newLabel(t.label());
    label.setValue(cat);
    if (label instanceof HasCategory) {
      ((HasCategory) label).setCategory(cat);
    }
    if (label instanceof HasWord) {
      ((HasWord) label).setWord(word);
    }
    if (label instanceof HasTag) {
      ((HasTag) label).setTag(tag);
    }
    t.setLabel(label);
  }

  /**
   * Returns the basic category of {@code category} according to the
   * treebank language pack of the language-specific parameters.
   */
  private String basicCategory(String category) {
    return tlpParams.treebankLanguagePack().basicCategory(category);
  }

  /**
   * Maps every category in the list to its basic category.
   *
   * @param l A list of category strings
   * @return A new list of the same length holding the basic categories
   */
  private List<String> listBasicCategories(List<String> l) {
    List<String> l1 = new ArrayList<String>();
    for (String str : l) {
      l1.add(basicCategory(str));
    }
    return l1;
  }

  /**
   * Tests for right recursion: whether some node on the chain of last
   * children below {@code t} has a label value starting with
   * {@code baseCat}. Only checked when {@code baseCat} is {@code "NP"};
   * any other category yields {@code false}.
   */
  private static boolean rightRec(Tree t, String baseCat) {
    if (//! baseCat.equals("S") &&
        !baseCat.equals("NP")) {
      return false;
    }
    while (!t.isLeaf()) {
      t = t.lastChild();
      String str = t.label().value();
      if (str.startsWith(baseCat)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Tests for left recursion: whether some node on the chain of first
   * children below {@code t} has a label value starting with
   * {@code baseCat}.
   */
  private static boolean leftRec(Tree t, String baseCat) {
    while (!t.isLeaf()) {
      t = t.firstChild();
      String str = t.label().value();
      if (str.startsWith(baseCat)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Appends {@code '~'} and the node's Strahler number to the label value
   * of every non-leaf node of {@code t}, working bottom-up. A leaf counts
   * as 1; an internal node gets the maximum number among its children,
   * incremented by one when that maximum is reached by more than one child.
   *
   * @param t The tree to annotate (modified in place)
   * @return The Strahler number of {@code t}
   */
  private static int markStrahler(Tree t) {
    if (t.isLeaf()) {
      // don't annotate the words at leaves!
      return 1;
    }
    String cat = t.label().value();
    int maxStrahler = -1;
    int maxMultiplicity = 0;
    for (int i = 0; i < t.numChildren(); i++) {
      int strahler = markStrahler(t.getChild(i));
      if (strahler > maxStrahler) {
        maxStrahler = strahler;
        maxMultiplicity = 1;
      } else if (strahler == maxStrahler) {
        maxMultiplicity++;
      }
    }
    if (maxMultiplicity > 1) {
      maxStrahler++; // this is the one case where it grows
    }
    cat = cat + '~' + maxStrahler;
    Label label = t.label().labelFactory().newLabel(t.label());
    label.setValue(cat);
    t.setLabel(label);
    return maxStrahler;
  }

}