tag += "^=lVV";
}
// end Chinese-specific tag splits
Label label = new CategoryWordTag(tag, word, tag);
t.setLabel(label);
} else {
// it's a phrasal category
Tree[] kids = t.children();
// Chinese-specific category splits
List<String> leftSis = listBasicCategories(SisterAnnotationStats.leftSisterLabels(t, parent));
List<String> rightSis = listBasicCategories(SisterAnnotationStats.rightSisterLabels(t, parent));
if (paRootDtr && baseParentStr.equals("ROOT")) {
category += "^ROOT";
}
if (markIPsisterBA && baseCategory.equals("IP")) {
if (leftSis.contains("BA")) {
category += "=BA";
//System.out.println("Found IP sister of BA");
}
}
if (dominatesV && hasV(t.preTerminalYield())) {
// mark categories containing a verb
category += "-v";
}
if (markIPsisterVVorP && baseCategory.equals("IP")) {
// TODO (cdm): is looking for just "P" here selective enough?
if (leftSis.contains("VV") || leftSis.contains("P")) {
category += "=VVP";
}
}
if (markIPsisDEC && baseCategory.equals("IP")) {
if (rightSis.contains("DEC")) {
category += "=DEC";
//System.out.println("Found prenominal IP");
}
}
if (baseCategory.equals("VP")) {
// cdm 2008: this used to just check that it startsWith("VP"), but
// I think that was bad because it also matched VPT verb compounds
if (chineseSplitVP == 3) {
boolean hasCC = false;
boolean hasPU = false;
boolean hasLexV = false;
for (Tree kid : kids) {
if (kid.label().value().startsWith("CC")) {
hasCC = true;
} else if (kid.label().value().startsWith("PU")) {
hasPU = true;
} else if (StringUtils.lookingAt(kid.label().value(), "(V[ACEV]|VCD|VCP|VNV|VPT|VRD|VSB)")) {
hasLexV = true;
}
}
if (hasCC || (hasPU && ! hasLexV)) {
category += "-CRD";
//System.out.println("Found coordinate VP"); // testing
} else if (hasLexV) {
category += "-COMP";
//System.out.println("Found complementing VP"); // testing
} else {
category += "-ADJT";
//System.out.println("Found adjoining VP"); // testing
}
} else if (chineseSplitVP >= 1) {
boolean hasBA = false;
for (Tree kid : kids) {
if (kid.label().value().startsWith("BA")) {
hasBA = true;
} else if (chineseSplitVP == 2 && tlp.basicCategory(kid.label().value()).equals("VP")) {
for (Tree kidkid : kid.children()) {
if (kidkid.label().value().startsWith("BA")) {
hasBA = true;
}
}
}
}
if (hasBA) {
category += "-BA";
}
}
}
if (markVPadjunct && baseParentStr.equals("VP")) {
// cdm 2008: This used to use startsWith("VP") but changed to baseCat
Tree[] sisters = parent.children();
boolean hasVPsister = false;
boolean hasCC = false;
boolean hasPU = false;
boolean hasLexV = false;
for (Tree sister : sisters) {
if (tlp.basicCategory(sister.label().value()).equals("VP")) {
hasVPsister = true;
}
if (sister.label().value().startsWith("CC")) {
hasCC = true;
}
if (sister.label().value().startsWith("PU")) {
hasPU = true;
}
if (StringUtils.lookingAt(sister.label().value(), "(V[ACEV]|VCD|VCP|VNV|VPT|VRD|VSB)")) {
hasLexV = true;
}
}
if (hasVPsister && !(hasCC || hasPU || hasLexV)) {
category += "-VPADJ";
//System.out.println("Found adjunct of VP"); // testing
}
}
if (markNPmodNP && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
if (rightSis.contains("NP")) {
category += "=MODIFIERNP";
//System.out.println("Found NP modifier of NP"); // testing
}
}
if (markModifiedNP && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
if (rightSis.isEmpty() && (leftSis.contains("ADJP") || leftSis.contains("NP") || leftSis.contains("DNP") || leftSis.contains("QP") || leftSis.contains("CP") || leftSis.contains("PP"))) {
category += "=MODIFIEDNP";
//System.out.println("Found modified NP"); // testing
}
}
if (markNPconj && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
if (rightSis.contains("CC") || rightSis.contains("PU") || leftSis.contains("CC") || leftSis.contains("PU")) {
category += "=CONJ";
//System.out.println("Found NP conjunct"); // testing
}
}
if (markIPconj && baseCategory.equals("IP") && baseParentStr.equals("IP")) {
Tree[] sisters = parent.children();
boolean hasCommaSis = false;
boolean hasIPSis = false;
for (Tree sister : sisters) {
if (ctlp.basicCategory(sister.label().value()).equals("PU") && ChineseTreebankLanguagePack.chineseCommaAcceptFilter().test(sister.children()[0].label().toString())) {
hasCommaSis = true;
//System.out.println("Found CommaSis"); // testing
}
if (ctlp.basicCategory(sister.label().value()).equals("IP") && sister != t) {
hasIPSis = true;
}
}
if (hasCommaSis && hasIPSis) {
category += "-CONJ";
//System.out.println("Found IP conjunct"); // testing
}
}
if (unaryIP && baseCategory.equals("IP") && t.numChildren() == 1) {
category += "-U";
//System.out.println("Found unary IP"); //testing
}
if (unaryCP && baseCategory.equals("CP") && t.numChildren() == 1) {
category += "-U";
//System.out.println("Found unary CP"); //testing
}
if (splitBaseNP && baseCategory.equals("NP")) {
if (t.isPrePreTerminal()) {
category = category + "-B";
}
}
//if (Test.verbose) printlnErr(baseCategory + " " + leftSis.toString()); //debugging
if (markPostverbalPP && leftSis.contains("VV") && baseCategory.equals("PP")) {
//System.out.println("Found post-verbal PP");
category += "=lVV";
}
if ((markADgrandchildOfIP || gpaAD) && listBasicCategories(SisterAnnotationStats.kidLabels(t)).contains("AD")) {
category += "^ADVP";
}
if (markCC) {
// Previously: for (int i = 0; i < kids.length; i++) {
// The current version takes an idea from Collins: skip the first and last
// children, so that marginal conjunctions which don't conjoin two things
// are not counted.
for (int i = 1; i < kids.length - 1; i++) {
String cat2 = kids[i].label().value();
if (cat2.startsWith("CC")) {
category += "-CC";
}
}
}
Label label = new CategoryWordTag(category, word, tag);
t.setLabel(label);
}
return t;
}