// One tree node per node index, 0..maxNode inclusive.
Tree[] subtrees = new Tree[maxNode + 1];
// The first sentence.size() indices correspond to the words of the
// sentence: each gets a blank-labeled node whose single child is a leaf
// carrying the word itself.
for (int wordIndex = 0; wordIndex < sentence.size(); ++wordIndex) {
  CoreLabel tokenLabel = new CoreLabel();
  tokenLabel.setValue(sentence.get(wordIndex));
  Tree wordNode = new LabeledScoredTreeNode(tokenLabel);
  Tree wordParent = new LabeledScoredTreeNode(new CoreLabel());
  subtrees[wordIndex] = wordParent;
  wordParent.addChild(wordNode);
}
// The remaining indices start out as blank-labeled nodes with no
// children; they are presumably attached to their children by the
// connect(...) calls made later in this method.
for (int phraseIndex = sentence.size(); phraseIndex < subtrees.length; ++phraseIndex) {
  subtrees[phraseIndex] = new LabeledScoredTreeNode(new CoreLabel());
}
// connected[...] is bookkeeping handed to connect(...); per the note
// below it records which nodes have already been attached.
boolean[] connected = new boolean[maxNode + 1];
Tree root = null;
for (int nodeIndex = 0; nodeIndex < parentPointers.size(); ++nodeIndex) {
  if (parentPointers.get(nodeIndex) != -1) {
    // Non-root node: attach it via connect(...), which walks up the
    // tree structure so that leftmost phrases are added first.
    // Without that, inverted numbering could attach the right phrase
    // to a parent before the left one, giving "case zero in this",
    // for example, instead of "in this case zero".
    // Because already-connected nodes are tracked, each parent is
    // processed at most once, so overall construction stays efficient.
    connect(parentPointers, subtrees, connected, nodeIndex);
    continue;
  }
  // A parent pointer of -1 marks the root; there may be only one.
  if (root != null) {
    throw new RuntimeException("Found two roots for sentence " + sentence);
  }
  root = subtrees[nodeIndex];
}
// Label every node with the sentiment class of the phrase it spans.
// Throws RuntimeException if the phrase has no id or the id has no score.
for (int i = 0; i <= maxNode; ++i) {
  List<Tree> leaves = subtrees[i].getLeaves();
  List<String> words = CollectionUtils.transformAsList(leaves, TRANSFORM_TREE_TO_WORD);
  // First we look for a copy of the phrase with -LRB- -RRB-
  // instead of (). The sentiment trees sometimes have both, and
  // the escaped versions seem to have more reasonable scores.
  // If a particular phrase doesn't have -LRB- -RRB- we fall back
  // to the unescaped versions.
  Integer phraseId = phraseIds.get(CollectionUtils.transformAsList(words, TRANSFORM_PARENS));
  if (phraseId == null) {
    phraseId = phraseIds.get(words);
  }
  if (phraseId == null) {
    // Report the phrase whose lookup failed as well as the enclosing
    // sentence; the sentence alone does not say which phrase is missing.
    throw new RuntimeException("Could not find phrase id for phrase " + words + " in sentence " + sentence);
  }
  // TODO: should we make this an option? Perhaps we want cases
  // where the trees have the phrase id and not their class
  Double score = sentimentScores.get(phraseId);
  if (score == null) {
    throw new RuntimeException("Could not find sentiment score for phrase id " + phraseId);
  }
  // TODO: make this a numClasses option
  // Scale the score by 5 and take the floor to get a class index; the
  // cap at 4 keeps a score of exactly 1.0 (which would floor to 5) in
  // the top class.
  int classLabel = Math.min(4, (int) Math.floor(score * 5.0));
  subtrees[i].label().setValue(Integer.toString(classLabel));
}
for (int i = 0; i < sentence.size(); ++i) {
Tree leaf = subtrees[i].children()[0];
leaf.label().setValue(escaper.escapeString(leaf.label().value()));
}
// Apply each Tregex/Tsurgeon pattern pair in order; every step works on
// the tree returned by the previous one, and root is rebound to the result.
for (int patternIndex = 0; patternIndex < tregexPatterns.length; ++patternIndex) {
  root = Tsurgeon.processPattern(
      tregexPatterns[patternIndex], tsurgeonPatterns[patternIndex], root);
}