// Cache the parse-tree leaves and the sentence tokens up front; several feature
// groups below index into these by syntactic-head token position.
List<Tree> leaves = tree.getLeaves();
List<CoreLabel> tokens = rel.getSentence().get(TokensAnnotation.class);
// this assumes that both args are in the same sentence as the relation object
// let's check for this to be safe
CoreMap relSentence = rel.getSentence();
CoreMap arg0Sentence = arg0.getSentence();
CoreMap arg1Sentence = arg1.getSentence();
// NOTE: reference (==) comparison is intentional — the args must share the exact
// same sentence object as the relation, otherwise their token offsets would be
// meaningless against this sentence's tokens/tree.
if(arg0Sentence != relSentence){
System.err.println("WARNING: Found relation with arg0 in a different sentence: " + rel);
System.err.println("Relation sentence: " + relSentence.get(TextAnnotation.class));
System.err.println("Arg0 sentence: " + arg0Sentence.get(TextAnnotation.class));
// Bail out; no features can be safely extracted for this relation.
return false;
}
if(arg1Sentence != relSentence){
System.err.println("WARNING: Found relation with arg1 in a different sentence: " + rel);
System.err.println("Relation sentence: " + relSentence.get(TextAnnotation.class));
System.err.println("Arg1 sentence: " + arg1Sentence.get(TextAnnotation.class));
return false;
}
// Checklist keeps track of which features have been handled by an if clause
// Should be empty after all the clauses have been gone through.
// (usingFeature presumably removes the handled type from checklist — confirm in its definition.)
List<String> checklist = new ArrayList<String>(types);
// arg_type: concatenation of the entity types of the args, e.g.
// "arg1type=Loc_and_arg2type=Org"
// arg_subtype: similar, for entity subtypes
if (usingFeature(types, checklist, "arg_type")) {
features.setCount("arg1type=" + arg0.getType() + "_and_arg2type=" + arg1.getType(), 1.0);
}
if (usingFeature(types,checklist,"arg_subtype")) {
features.setCount("arg1subtype="+arg0.getSubType()+"_and_arg2subtype="+arg1.getSubType(),1.0);
}
// arg_order: which arg comes first in the sentence.
// Only fires when arg0's head precedes arg1's head; the reverse order is encoded
// by the absence of the feature.
if (usingFeature(types, checklist, "arg_order")) {
if (arg0.getSyntacticHeadTokenPosition() < arg1.getSyntacticHeadTokenPosition())
features.setCount("arg1BeforeArg2", 1.0);
}
// same_head: whether the two args share the same syntactic head token
if (usingFeature(types, checklist, "same_head")) {
if (arg0.getSyntacticHeadTokenPosition() == arg1.getSyntacticHeadTokenPosition())
features.setCount("arguments_have_same_head",1.0);
}
// full_tree_path: Path from one arg to the other in the phrase structure tree,
// e.g., NNP -> PP -> NN <- NNP
if (usingFeature(types, checklist, "full_tree_path")) {
//System.err.println("ARG0: " + arg0);
//System.err.println("ARG0 HEAD: " + arg0.getSyntacticHeadTokenPosition());
//System.err.println("TREE: " + tree);
//System.err.println("SENTENCE: " + sentToString(arg0.getSentence()));
// Guard: head token positions must be valid leaf indices into this tree.
if(arg0.getSyntacticHeadTokenPosition() < leaves.size() && arg1.getSyntacticHeadTokenPosition() < leaves.size()){
// Preterminals (POS nodes) directly above each arg's head leaf.
Tree arg0preterm = leaves.get(arg0.getSyntacticHeadTokenPosition()).parent(tree);
Tree arg1preterm = leaves.get(arg1.getSyntacticHeadTokenPosition()).parent(tree);
// Lowest common ancestor of the two preterminals.
Tree join = tree.joinNode(arg0preterm, arg1preterm);
StringBuilder pathStringBuilder = new StringBuilder();
// Upward half of the path: arg0preterm up to (but excluding) the join node,
// rendered bottom-up with "<-" arrows.
List<Tree> pathUp = join.dominationPath(arg0preterm);
Collections.reverse(pathUp);
for (Tree node : pathUp) {
if (node != join) {
pathStringBuilder.append(node.label().value() + " <- ");
}
}
// Downward half: join (printed without an arrow) down to arg1preterm with "->".
for (Tree node : join.dominationPath(arg1preterm)) {
pathStringBuilder.append(((node == join) ? "" : " -> ") + node.label().value());
}
String pathString = pathStringBuilder.toString();
if(logger != null && ! rel.getType().equals(RelationMention.UNRELATED)) logger.info("full_tree_path: " + pathString);
features.setCount("treepath:"+pathString, 1.0);
} else {
// Diagnostic dump; the feature is simply skipped for this relation.
System.err.println("WARNING: found weird argument offsets. Most likely because arguments appear in different sentences than the relation:");
System.err.println("ARG0: " + arg0);
System.err.println("ARG0 HEAD: " + arg0.getSyntacticHeadTokenPosition());
System.err.println("ARG0 SENTENCE: " + sentToString(arg0.getSentence()));
System.err.println("ARG1: " + arg1);
System.err.println("ARG1 HEAD: " + arg1.getSyntacticHeadTokenPosition());
System.err.println("ARG1 SENTENCE: " + sentToString(arg1.getSentence()));
System.err.println("RELATION TREE: " + tree);
}
}
// path_length / path_length_binary: length of the path between the two arg head
// leaves in the phrase-structure parse tree.
// FIX: previously pathLength was computed unconditionally and without a bounds
// check, so (a) the tree walk ran even when neither feature was requested, and
// (b) an out-of-range head position threw IndexOutOfBoundsException — the very
// case the full_tree_path block above guards against and merely warns about.
// Both usingFeature calls still run unconditionally so checklist bookkeeping is
// unchanged.
boolean usePathLength = usingFeature(types, checklist, "path_length");
boolean usePathLengthBinary = usingFeature(types, checklist, "path_length_binary");
if (usePathLength || usePathLengthBinary) {
  if (arg0.getSyntacticHeadTokenPosition() < leaves.size() && arg1.getSyntacticHeadTokenPosition() < leaves.size()) {
    int pathLength = tree.pathNodeToNode(leaves.get(arg0.getSyntacticHeadTokenPosition()),
        leaves.get(arg1.getSyntacticHeadTokenPosition())).size();
    // path_length: integer-valued feature (the count IS the length)
    if (usePathLength) {
      features.setCount("path_length", pathLength);
    }
    // path_length_binary: one indicator feature per observed length
    if (usePathLengthBinary) {
      features.setCount("path_length_" + pathLength, 1.0);
    }
  } else {
    System.err.println("WARNING: found weird argument offsets. Most likely because arguments appear in different sentences than the relation:");
    System.err.println("Skipping path_length features for relation: " + rel);
  }
}
/* entity_order
 * This tells you for each of the two args
 * whether there are other entities before or after that arg.
 * In particular, it can tell whether an arg is the first entity of its type in the sentence
 * (which can be useful for example for telling the gameWinner and gameLoser in NFL).
 * TODO: restrict this feature so that it only looks for
 * entities of the same type?
 * */
if (usingFeature(types, checklist, "entity_order")) {
for (int i = 0; i < rel.getArgs().size(); i++) {
// We already checked the class of the args at the beginning of the method
EntityMention arg = (EntityMention) rel.getArgs().get(i);
if(rel.getSentence().get(MachineReadingAnnotations.EntityMentionsAnnotation.class) != null) { // may be null due to annotation error
// Compare this arg's head position against every entity mention in the sentence
// (including, incidentally, the other relation arg).
for (EntityMention otherArg : rel.getSentence().get(MachineReadingAnnotations.EntityMentionsAnnotation.class)) {
String feature;
// "argN_before_T": some entity of type T occurs after argN in the sentence.
if (otherArg.getSyntacticHeadTokenPosition() > arg.getSyntacticHeadTokenPosition()) {
feature = "arg" + i + "_before_" + otherArg.getType();
features.setCount(feature, 1.0);
}
// "argN_after_T": some entity of type T occurs before argN in the sentence.
if (otherArg.getSyntacticHeadTokenPosition() < arg.getSyntacticHeadTokenPosition()) {
feature = "arg" + i + "_after_" + otherArg.getType();
features.setCount(feature, 1.0);
}
}
}
}
}
// surface_distance: Number of tokens in the sentence between the two words, integer-valued feature
int surfaceDistance = Math.abs(arg0.getSyntacticHeadTokenPosition() - arg1.getSyntacticHeadTokenPosition());
if (usingFeature(types, checklist, "surface_distance")) {
features.setCount("surface_distance", surfaceDistance);
}
// surface_distance_binary: Number of tokens in the sentence between the two words, binary features
if (usingFeature(types, checklist, "surface_distance_binary")) {
features.setCount("surface_distance_" + surfaceDistance, 1.0);
}
// surface_distance_bins: number of tokens between the two args, binned to several intervals
// Bins: exact distance for 0-3, then [4,6), [6,10), and >= 10.
if(usingFeature(types, checklist, "surface_distance_bins")) {
if(surfaceDistance < 4){
// Small distances keep their exact value as the bin name.
features.setCount("surface_distance_bin" + surfaceDistance, 1.0);
} else if(surfaceDistance < 6){
features.setCount("surface_distance_bin_lt6", 1.0);
} else if(surfaceDistance < 10) {
features.setCount("surface_distance_bin_lt10", 1.0);
} else {
features.setCount("surface_distance_bin_ge10", 1.0);
}
}
// separate_surface_windows: windows of 1,2,3 tokens before and after args, for each arg separately
// Separate features are generated for windows to the left and to the right of the args.
// Features are concatenations of words in the window (or NULL for sentence boundary).
//
// conjunction_surface_windows: concatenation of the windows of the two args
//
// separate_surface_windows_POS: windows of POS tags of size 1,2,3 for each arg
//
// conjunction_surface_windows_POS: concatenation of windows of the args
//
// NOTE(review): the window strings accumulate onto an initially-null array slot,
// so every feature string contains a literal "null" at the far end (e.g.
// "the_null"). This matches the original behavior and is kept so feature names
// stay compatible with previously trained models.
List<EntityMention> args = new ArrayList<EntityMention>();
args.add(arg0); args.add(arg1);
for (int windowSize = 1; windowSize <= 3; windowSize++) {
  String[] leftWindow, rightWindow, leftWindowPOS, rightWindowPOS;
  leftWindow = new String[2];
  rightWindow = new String[2];
  leftWindowPOS = new String[2];
  rightWindowPOS = new String[2];
  for (int argn = 0; argn <= 1; argn++) {
    int ind = args.get(argn).getSyntacticHeadTokenPosition();
    for (int winnum = 1; winnum <= windowSize; winnum++) {
      int windex = ind - winnum;
      // FIX: was (windex > 0), which wrongly treated the sentence's first token
      // (index 0) as out of bounds; >= 0 mirrors the right-side boundary check
      // (windex < leaves.size()) below.
      if (windex >= 0) {
        leftWindow[argn] = leaves.get(windex).label().value() + "_" + leftWindow[argn];
        leftWindowPOS[argn] = leaves.get(windex).parent(tree).label().value() + "_" + leftWindowPOS[argn];
      } else {
        leftWindow[argn] = "NULL_" + leftWindow[argn];
        leftWindowPOS[argn] = "NULL_" + leftWindowPOS[argn];
      }
      windex = ind + winnum;
      if (windex < leaves.size()) {
        rightWindow[argn] = rightWindow[argn] + "_" + leaves.get(windex).label().value();
        rightWindowPOS[argn] = rightWindowPOS[argn] + "_" + leaves.get(windex).parent(tree).label().value();
      } else {
        rightWindow[argn] = rightWindow[argn] + "_NULL";
        rightWindowPOS[argn] = rightWindowPOS[argn] + "_NULL";
      }
    }
    // FIX: word and POS features were previously swapped across these two flags
    // (the word flag emitted left word + left POS; the POS flag emitted right
    // word + right POS), contradicting the documentation above. Now the word
    // flag emits both word windows and the POS flag emits both POS windows.
    if (usingFeature(types, checklist, "separate_surface_windows")) {
      features.setCount("left_window_"+windowSize+"_arg_" + argn + ": " + leftWindow[argn], 1.0);
      features.setCount("right_window_"+windowSize+"_arg_" + argn + ": " + rightWindow[argn], 1.0);
    }
    if (usingFeature(types, checklist, "separate_surface_windows_POS")) {
      features.setCount("left_window_"+windowSize+"_POS_arg_" + argn + ": " + leftWindowPOS[argn], 1.0);
      features.setCount("right_window_"+windowSize+"_POS_arg_" + argn + ": " + rightWindowPOS[argn], 1.0);
    }
  }
  if (usingFeature(types, checklist, "conjunction_surface_windows")) {
    features.setCount("left_windows_"+windowSize+": " + leftWindow[0] + "__" + leftWindow[1], 1.0);
    features.setCount("right_windows_"+windowSize+": " + rightWindow[0] + "__" + rightWindow[1], 1.0);
  }
  if (usingFeature(types, checklist, "conjunction_surface_windows_POS")) {
    features.setCount("left_windows_"+windowSize+"_POS: " + leftWindowPOS[0] + "__" + leftWindowPOS[1], 1.0);
    features.setCount("right_windows_"+windowSize+"_POS: " + rightWindowPOS[0] + "__" + rightWindowPOS[1], 1.0);
  }
}
// arg_words: the head word of each arg as separate features, and concatenated.
// When doNotLexicalizeFirstArg is set, features involving arg0's word are
// withheld (only the arg1 word feature is emitted).
String word0 = leaves.get(arg0.getSyntacticHeadTokenPosition()).label().value();
String word1 = leaves.get(arg1.getSyntacticHeadTokenPosition()).label().value();
if (usingFeature(types, checklist, "arg_words")) {
  // Idiom fix: "!flag" instead of "flag == false"; behavior unchanged.
  if (!doNotLexicalizeFirstArg) {
    features.setCount("word_arg0: " + word0, 1.0);
  }
  features.setCount("word_arg1: " + word1, 1.0);
  if (!doNotLexicalizeFirstArg) {
    features.setCount("words: " + word0 + "__" + word1, 1.0);
  }
}
// arg_POS: POS tags of the arg head words (the preterminal labels above the
// head leaves), as separate features and concatenated.
String pos0 = leaves.get(arg0.getSyntacticHeadTokenPosition()).parent(tree).label().value();
String pos1 = leaves.get(arg1.getSyntacticHeadTokenPosition()).parent(tree).label().value();
if (usingFeature(types, checklist, "arg_POS")) {
  features.setCount("POS_arg0: " + pos0, 1.0);
  features.setCount("POS_arg1: " + pos1, 1.0);
  features.setCount("POSs: " + pos0 + "__" + pos1, 1.0);
}
// adjacent_words: words immediately to the left and right of the args
// Uses the head Span of each arg (token character of the mention head), so a
// multi-token head contributes the token before its start and after its end.
if(usingFeature(types, checklist, "adjacent_words")){
for(int i = 0; i < rel.getArgs().size(); i ++){
Span s = ((EntityMention) rel.getArg(i)).getHead();
// Token immediately before the head span, if the span does not start the sentence.
if(s.start() > 0){
String v = tokens.get(s.start() - 1).word();
features.setCount("leftarg" + i + "-" + v, 1.0);
}
// Token immediately after the head span (Span.end() is exclusive —
// presumably, given the direct use as a token index; confirm in Span's docs).
if(s.end() < tokens.size()){
String v = tokens.get(s.end()).word();
features.setCount("rightarg" + i + "-" + v, 1.0);
}
}
}
// entities_between_args: binary feature for each type specifying whether there is an entity of that type in the sentence
// between the two args.
// e.g. "entity_between_args: Loc" means there is at least one entity of type Loc between the two args
// (NOTE: this block continues beyond the visible region of the file.)
if (usingFeature(types, checklist, "entities_between_args")) {
CoreMap sent = rel.getSentence();
if(sent == null) throw new RuntimeException("NULL sentence for relation " + rel);
List<EntityMention> relArgs = sent.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
if(relArgs != null) { // may be null due to annotation errors!
for (EntityMention arg : relArgs) {
// Strictly between the two arg heads, in either order; the args themselves
// are excluded by the strict inequalities.
if ((arg.getSyntacticHeadTokenPosition() > arg0.getSyntacticHeadTokenPosition() && arg.getSyntacticHeadTokenPosition() < arg1.getSyntacticHeadTokenPosition())
|| (arg.getSyntacticHeadTokenPosition() > arg1.getSyntacticHeadTokenPosition() && arg.getSyntacticHeadTokenPosition() < arg0.getSyntacticHeadTokenPosition())) {
features.setCount("entity_between_args: " + arg.getType(), 1.0);