tlpp.setOptionFlag(options, 0);
}
// Ask tlpp for a treebank object and load the trees found at the path given
// in args[1].
Treebank tb = tlpp.diskTreebank();
tb.loadPath(args[1]);
// Choose the morphological feature specification: the Arabic one when
// language equals Language.Arabic, the French one for every other language.
MorphoFeatureSpecification morphoSpec = language.equals(Language.Arabic) ?
new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();
// args[2] is split on commas and each piece is looked up as a
// MorphoFeatureType name. Only the argument as a whole is trimmed, so
// whitespace around an individual name is handed to valueOf() unchanged.
// NOTE(review): if MorphoFeatureType is an enum, valueOf() throws
// IllegalArgumentException for an unrecognized name -- confirm.
String[] features = args[2].trim().split(",");
for (String feature : features) {
// Switch on this feature in the chosen specification.
morphoSpec.activate(MorphoFeatureType.valueOf(feature));
}
// Counters
// Frequency tables populated by the loop over the treebank that follows.
// NOTE(review): the integer constructor arguments are presumably initial
// capacities sized to the expected number of distinct keys -- confirm against
// ClassicCounter / TwoDimensionalIntCounter.
// For the paired counters whose updates are visible below (wordTagCounter,
// lemmaTagCounter, reducedTagLemmaCounter) the key is the two strings
// concatenated with no separator.
Counter<String> wordTagCounter = new ClassicCounter<String>(30000);
Counter<String> morphTagCounter = new ClassicCounter<String>(500);
// Counter<String> signatureTagCounter = new ClassicCounter<String>();
Counter<String> morphCounter = new ClassicCounter<String>(500);
Counter<String> wordCounter = new ClassicCounter<String>(30000);
Counter<String> tagCounter = new ClassicCounter<String>(300);
// Keyed by lemma; the loop below appends "-MWE" to the lemma when the
// preterminal tag contains "MW" before counting it.
Counter<String> lemmaCounter = new ClassicCounter<String>(25000);
Counter<String> lemmaTagCounter = new ClassicCounter<String>(25000);
// Keyed by the second component returned by
// MorphoFeatureSpecification.splitMorphString(word, morph).
Counter<String> richTagCounter = new ClassicCounter<String>(1000);
// Keyed by the string form of morphoSpec.strToFeatures(richTag).
Counter<String> reducedTagCounter = new ClassicCounter<String>(500);
Counter<String> reducedTagLemmaCounter = new ClassicCounter<String>(500);
// NOTE(review): presumably maps each word to the set of lemmas seen for it;
// the code that fills it is not visible here -- confirm.
Map<String,Set<String>> wordLemmaMap = Generics.newHashMap();
// Two-key integer counters; their updates are not visible in this part of
// the method.
TwoDimensionalIntCounter<String,String> lemmaReducedTagCounter = new TwoDimensionalIntCounter<String,String>(30000);
TwoDimensionalIntCounter<String,String> reducedTagTagCounter = new TwoDimensionalIntCounter<String,String>(500);
TwoDimensionalIntCounter<String,String> tagReducedTagCounter = new TwoDimensionalIntCounter<String,String>(300);
// NOTE(review): presumably a running count of trees processed; where it is
// incremented is not visible here -- confirm.
int numTrees = 0;
for (Tree tree : tb) {
for (Tree subTree : tree) {
if (!subTree.isLeaf()) {
tlpp.transformTree(subTree, tree);
}
}
List<Label> pretermList = tree.preTerminalYield();
List<Label> yield = tree.yield();
assert yield.size() == pretermList.size();
int yieldLen = yield.size();
for (int i = 0; i < yieldLen; ++i) {
String tag = pretermList.get(i).value();
String word = yield.get(i).value();
String morph = ((CoreLabel) yield.get(i)).originalText();
// Note: if there is no lemma, then we use the surface form.
Pair<String,String> lemmaTag = MorphoFeatureSpecification.splitMorphString(word, morph);
String lemma = lemmaTag.first();
String richTag = lemmaTag.second();
// WSGDEBUG
if (tag.contains("MW")) lemma += "-MWE";
lemmaCounter.incrementCount(lemma);
lemmaTagCounter.incrementCount(lemma + tag);
richTagCounter.incrementCount(richTag);
String reducedTag = morphoSpec.strToFeatures(richTag).toString();
reducedTagCounter.incrementCount(reducedTag);
reducedTagLemmaCounter.incrementCount(reducedTag + lemma);
wordTagCounter.incrementCount(word + tag);