cat = cat + "^U";
}
} // otherwise, leave the tags alone!
// Label label = new CategoryWordTag(cat, word, cat);
Label label = t.label().labelFactory().newLabel(t.label());
label.setValue(cat);
if(label instanceof HasCategory)
((HasCategory) label).setCategory(cat);
if(label instanceof HasWord)
((HasWord) label).setWord(word);
if(label instanceof HasTag)
((HasTag) label).setTag(cat);
t.setLabel(label);
t.setChild(0, childResult); // just in case word is changed
if (trainOptions.noTagSplit) {
return t;
} else {
// language-specific transforms
return tlpParams.transformTree(t, root);
}
} // end isPreTerminal()
// handle phrasal categories
Tree[] kids = t.children();
for (int childNum = 0; childNum < kids.length; childNum++) {
Tree child = kids[childNum];
Tree childResult = transformTreeHelper(child, root); // recursive call
t.setChild(childNum, childResult);
}
Tree headChild = hf.determineHead(t);
if(headChild == null || headChild.label() == null) {
throw new RuntimeException("TreeAnnotator: null head found for tree [suggesting incomplete/wrong HeadFinder]:\n" + t);
}
Label headLabel = headChild.label();
if( ! (headLabel instanceof HasWord))
throw new RuntimeException("TreeAnnotator: Head label lacks a Word annotation!");
if( ! (headLabel instanceof HasTag))
throw new RuntimeException("TreeAnnotator: Head label lacks a Tag annotation!");
String word = ((HasWord) headLabel).word();
String tag = ((HasTag) headLabel).tag();
// String baseTag = tlpParams.treebankLanguagePack().basicCategory(tag);
String baseCat = tlpParams.treebankLanguagePack().basicCategory(cat);
/* Sister annotation. Potential problem: if multiple sisters are
* strong indicators for a single category's expansions. This
* happens concretely in the Chinese Treebank when NP (object)
* has left sisters VV and AS. Could lead to too much
* sparseness. The ideal solution would be to give the
* splitting list an ordering, and take only the highest (~most
* informative/reliable) sister annotation.
*/
if (trainOptions.sisterAnnotate && !trainOptions.smoothing && baseParentStr.length() > 0) {
List<String> leftSis = listBasicCategories(SisterAnnotationStats.leftSisterLabels(t, parent));
List<String> rightSis = listBasicCategories(SisterAnnotationStats.rightSisterLabels(t, parent));
List<String> leftAnn = new ArrayList<String>();
List<String> rightAnn = new ArrayList<String>();
for (String s : leftSis) {
//s = baseCat+"=l="+tlpParams.treebankLanguagePack().basicCategory(s);
leftAnn.add(baseCat + "=l=" + tlpParams.treebankLanguagePack().basicCategory(s));
//System.out.println("left-annotated test string " + s);
}
for (String s : rightSis) {
//s = baseCat+"=r="+tlpParams.treebankLanguagePack().basicCategory(s);
rightAnn.add(baseCat + "=r=" + tlpParams.treebankLanguagePack().basicCategory(s));
}
for (Iterator<String> j = rightAnn.iterator(); j.hasNext();) {
//System.out.println("new rightsis " + (String)j.next()); //debugging
}
for (String annCat : trainOptions.sisterSplitters) {
//System.out.println("annotated test string " + annCat);
if (leftAnn.contains(annCat) || rightAnn.contains(annCat)) {
cat = cat + annCat.replaceAll("^" + baseCat, "");
break;
}
}
}
if (trainOptions.PA && !trainOptions.smoothing && baseParentStr.length() > 0) {
String cat2 = baseCat + "^" + baseParentStr;
if (!trainOptions.selectiveSplit || trainOptions.splitters.contains(cat2)) {
cat = cat + "^" + baseParentStr;
}
}
if (trainOptions.gPA && !trainOptions.smoothing && grandParentStr.length() > 0) {
if (trainOptions.selectiveSplit) {
String cat2 = baseCat + "^" + baseParentStr + "~" + baseGrandParentStr;
if (cat.contains("^") && trainOptions.splitters.contains(cat2)) {
cat = cat + "~" + baseGrandParentStr;
}
} else {
cat = cat + "~" + baseGrandParentStr;
}
}
if (trainOptions.markUnary > 0) {
if (trainOptions.markUnary == 1 && kids.length == 1 && kids[0].depth() >= 2) {
cat = cat + "-U";
} else if (trainOptions.markUnary == 2 && parent != null && parent.numChildren() == 1 && t.depth() >= 2) {
cat = cat + "-u";
}
}
if (trainOptions.rightRec && rightRec(t, baseCat)) {
cat = cat + "-R";
}
if (trainOptions.leftRec && leftRec(t, baseCat)) {
cat = cat + "-L";
}
if (trainOptions.splitPrePreT && t.isPrePreTerminal()) {
cat = cat + "-PPT";
}
// Label label = new CategoryWordTag(cat, word, tag);
Label label = t.label().labelFactory().newLabel(t.label());
label.setValue(cat);
if(label instanceof HasCategory)
((HasCategory) label).setCategory(cat);
if(label instanceof HasWord)
((HasWord) label).setWord(word);
if(label instanceof HasTag)