import java.util.regex.Pattern;
import edu.stanford.nlp.trees.AbstractCollinsHeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Generics;
/**
 * A HeadFinder for TueBa-D/Z. First version.
 * <p>
 * <i>Notes:</i> EN_ADD seems to be replaced by ENADD in 2008 ACL German.
 * ENADD was added as an alternant by CDM.
 *
 * @author Roger Levy
 */
public class TueBaDZHeadFinder extends AbstractCollinsHeadFinder {
// Serialization version identifier for this class.
private static final long serialVersionUID = 1L;
// Compile-time switch guarding the diagnostic output written to System.err.
private static final boolean DEBUG = false;
// Direction keywords placed at the front of every head rule. Both are assigned
// exactly once in the constructor, so both are final.
private final String left;
private final String right;
// When true, the constructor exchanges the values stored in left and right.
private boolean coordSwitch = false;
/**
 * Creates the head finder: compiles the two patterns that recognise daughters
 * carrying an explicit head mark and fills {@code nonTerminalInfo} with one
 * head-rule list per mother category. Each rule is a direction keyword
 * followed by the daughter categories to look for in that direction.
 */
public TueBaDZHeadFinder() {
  super(new TueBaDZLanguagePack());

  // Characters that may not occur in the category part of a label. The hyphen
  // is always included and is moved to the very front of the character class
  // (original author's note: '-' can only appear at the beginning of a regex
  // character class).
  String annotationChars = String.valueOf(tlp.labelAnnotationIntroducingCharacters());
  String classBody = "-" + annotationChars.replace("-", "");
  String categoryPrefix = "^[^" + classBody + "]*";
  headMarkedPattern = Pattern.compile(categoryPrefix + ":HD");
  headMarkedPattern2 = Pattern.compile(categoryPrefix + "-HD");

  // coordSwitch exchanges the two search directions.
  if (coordSwitch) {
    left = "right";
    right = "left";
  } else {
    left = "left";
    right = "right";
  }

  nonTerminalInfo = Generics.newHashMap();

  // Root categories: we arbitrarily choose the leftmost daughter as a last
  // resort. TOP is now used for ROOT. Every key gets its own rule array.
  for (String rootLabel : new String[] {"VROOT", "ROOT", "TOP"}) {
    nonTerminalInfo.put(rootLabel, new String[][] {
        {left, "SIMPX"},
        {left, "NX"},
        {left, "P"},
        {left, "PX", "ADVX"},
        {left, "EN", "EN_ADD", "ENADD"},
        {left}});
  }

  // Phrasal categories.
  nonTerminalInfo.put("PX", new String[][] {{left, "APPR", "APPRART", "PX"}});
  nonTerminalInfo.put("NX", new String[][] {
      {right, "NX"},
      {right, "NE", "NN"},
      {right, "EN", "EN_ADD", "ENADD", "FX"},
      {right, "ADJX", "PIS", "ADVX"},
      {right, "CARD", "TRUNC"},
      {right}});
  // Junk rule for a junk category.
  nonTerminalInfo.put("FX", new String[][] {{right, "FM", "FX"}});
  nonTerminalInfo.put("ADJX", new String[][] {{right, "ADJX", "ADJA", "ADJD"}, {right}});
  nonTerminalInfo.put("ADVX", new String[][] {{right, "ADVX", "ADV"}});
  // Not really needed.
  nonTerminalInfo.put("DP", new String[][] {{left}});

  // Verbal categories: the author was unsure about left vs. right for both.
  nonTerminalInfo.put("VXFIN", new String[][] {{left, "VXFIN"}, {right, "VVFIN"}});
  nonTerminalInfo.put("VXINF", new String[][] {{right, "VXINF"}, {right, "VVPP", "VVINF"}});

  // Not really needed.
  nonTerminalInfo.put("LV", new String[][] {{right}});
  // The author *thinks* right makes more sense here.
  nonTerminalInfo.put("C", new String[][] {{right, "KOUS"}, {right, "NX"}});
  // Very tough right/left decision because this category conjoins all sorts
  // of fields together; the author was not sure about the right solution.
  nonTerminalInfo.put("FKOORD", new String[][] {{left, "LK", "C"}, {right, "FKONJ", "MF", "VC"}});
  // Not really needed.
  nonTerminalInfo.put("KOORD", new String[][] {{left}});
  nonTerminalInfo.put("LK", new String[][] {{left}});

  // The rule for MF is super-bad: MF does not designate a category
  // corresponding to headship. Really, something totally different ought to
  // be done for dependency.
  nonTerminalInfo.put("MF", new String[][] {{left}});
  // Not really needed.
  nonTerminalInfo.put("MFE", new String[][] {{left}});
  // NF is pretty bad too, like MF, but not nearly so horrible.
  nonTerminalInfo.put("NF", new String[][] {{left}});
  // Not really needed.
  nonTerminalInfo.put("PARORD", new String[][] {{left}});

  // Not sure what is right here, but it is rare not to have a head marked.
  nonTerminalInfo.put("VC", new String[][] {{left, "VXINF"}});
  // Second daughters are always punctuation.
  nonTerminalInfo.put("VF", new String[][] {{left, "NX", "ADJX", "PX", "ADVX", "EN", "SIMPX"}});
  // These are basically like clauses themselves; the problem is when there is
  // no LK or VC.
  nonTerminalInfo.put("FKONJ", new String[][] {
      {left, "LK"},
      {right, "VC"},
      {left, "MF", "NF", "VF"}});
  nonTerminalInfo.put("DM", new String[][] {
      {left, "PTKANT"},
      {left, "ITJ"},
      {left, "KON", "FM"},
      {left}});

  // NOTE: this is really the P-SIMPX category, but the '-' makes the label
  // get stripped down to P. PSIMPX is registered with the same rule.
  for (String pLabel : new String[] {"P", "PSIMPX"}) {
    nonTerminalInfo.put(pLabel, new String[][] {{left, "SIMPX"}, {left}});
  }

  // NOTE: this is really R-SIMPX. Syntactic head here, except for the rare
  // ones that have neither C nor R-SIMPX daughters.
  nonTerminalInfo.put("R", new String[][] {{left, "C"}, {left, "R"}, {right, "VC"}});
  nonTerminalInfo.put("RSIMPX", new String[][] {{left, "C"}, {left, "RSIMPX"}, {right, "VC"}});

  // Syntactic (finite verb) head here. When there is no LK or VC, the
  // interesting predication tends to be annotated as inside the MF.
  nonTerminalInfo.put("SIMPX", new String[][] {
      {left, "LK"},
      {right, "VC"},
      {left, "SIMPX"},
      {left, "C"},
      {right, "FKOORD"},
      {right, "MF"},
      {right}});

  // This node label starts as EN-ADD, but the -ADD gets stripped off.
  nonTerminalInfo.put("EN", new String[][] {{left, "NX"}});
  // Just in case EN-ADD has been changed to EN_ADD (or ENADD).
  for (String enLabel : new String[] {"EN_ADD", "ENADD"}) {
    nonTerminalInfo.put(enLabel, new String[][] {{left, "NX"}, {left, "VXINF"}});
  }
}
// Finds labels whose leading run of non-annotation characters is directly
// followed by ":HD"; compiled in the constructor.
private final Pattern headMarkedPattern;
// Same shape as headMarkedPattern, but for the "-HD" spelling of the mark.
private final Pattern headMarkedPattern2;
/* Many TueBaDZ local trees have an explicitly marked head, as :HD or -HD. (Almost!) all the time, there is only one :HD per local tree. Use it if possible. */
protected Tree findMarkedHead(Tree t) {
Tree[] kids = t.children();
for (int i = 0, n = kids.length; i < n; i++) {
if (headMarkedPattern.matcher(kids[i].label().value()).find() || headMarkedPattern2.matcher(kids[i].label().value()).find()) {
//System.err.println("found manually-labeled head " + kids[i] + " for tree " + t);
return kids[i];
return null;
//Taken from AbstractTreebankLanguage pack b/c we have a slightly different definition of
//basic category for head finding - we strip grammatical function tags.
public String basicCategory(String category) {
if (category == null) {
return null;
return category.substring(0, postBasicCategoryIndex(category));
private int postBasicCategoryIndex(String category) {
boolean sawAtZero = false;
char seenAtZero = '\u0000';
int i = 0;
for (int leng = category.length(); i < leng; i++) {
char ch = category.charAt(i);
if (isLabelAnnotationIntroducingCharacter(ch)) {
if (i == 0) {
sawAtZero = true;
seenAtZero = ch;
} else if (sawAtZero && ch == seenAtZero) {
sawAtZero = false;
} else {
return i;
* Say whether this character is an annotation introducing
* character.
* @param ch The character to check
* @return Whether it is an annotation introducing character
public boolean isLabelAnnotationIntroducingCharacter(char ch) {
if (tlp.isLabelAnnotationIntroducingCharacter(ch)) {
return true;
//for heads, there's one more char we want to check because we don't care about grammatical fns
if (ch == '-') {
return true;
return false;
/** Called by determineHead and may be overridden in subclasses
* if special treatment is necessary for particular categories.
protected Tree determineNonTrivialHead(Tree t, Tree parent) {
Tree theHead = null;
String motherCat = basicCategory(t.label().value());
if (DEBUG) {
System.err.println("Looking for head of " + t.label() +
"; value is |" + t.label().value() + "|, " +
" baseCat is |" + motherCat + "|");
// We know we have nonterminals underneath
// (a bit of a Penn Treebank assumption, but).
// Look at label.
String[][] how = nonTerminalInfo.get(motherCat);
if (how == null) {
if (DEBUG) {
System.err.println("Warning: No rule found for " + motherCat +
" (first char: " + motherCat.charAt(0) + ")");
System.err.println("Known nonterms are: " + nonTerminalInfo.keySet());
if (defaultRule != null) {
if (DEBUG) {
System.err.println(" Using defaultRule");
return traverseLocate(t.children(), defaultRule, true);
} else {
return null;
for (int i = 0; i < how.length; i++) {
boolean deflt = (i == how.length - 1);
theHead = traverseLocate(t.children(), how[i], deflt);
if (theHead != null) {
if (DEBUG) {
System.err.println(" Chose " + theHead.label());
return theHead;