package edu.stanford.nlp.semgraph;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.ling.LabeledWord;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.trees.EnglishGrammaticalRelations;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.CollectionUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.MapList;
import java.io.StringWriter;
import java.util.*;
/**
* Generic utilities for dealing with Dependency graphs and other structures, useful for
* text simplification and rewriting.
*
* TODO: Migrate some of the functions (that make sense) into SemanticGraph proper.
* BUT BEWARE: This class has methods that use jgraph (as opposed to jgrapht).
* We don't want our core code to become dependent on jgraph, so methods in
* SemanticGraph shouldn't call methods in this class, and methods that use
* jgraph shouldn't be moved into SemanticGraph.
*
* @author Eric Yeh
*
*/
public class SemanticGraphUtils {
private SemanticGraphUtils() {}
/**
* Given a collection of nodes from srcGraph, generates a new
* SemanticGraph based off the subset represented by those nodes.
* This uses the same vertices as in the original graph, which
* allows for equality and comparisons between the two graphs.
*/
public static SemanticGraph makeGraphFromNodes(Collection<IndexedWord> nodes, SemanticGraph srcGraph) {
if (nodes.size() == 1) {
SemanticGraph retSg = new SemanticGraph();
for (IndexedWord node :nodes)
retSg.addVertex(node);
return retSg;
}
if (nodes.isEmpty()) {
return null;
}
// TODO: if any nodes are not connected to edges in the original
// graph, this will leave them out
List<SemanticGraphEdge> edges = new ArrayList<SemanticGraphEdge>();
for (IndexedWord nodeG : nodes) {
for (IndexedWord nodeD: nodes) {
Collection<SemanticGraphEdge> existingEdges =
srcGraph.getAllEdges(nodeG, nodeD);
if (existingEdges != null) {
edges.addAll(existingEdges);
}
}
}
return SemanticGraphFactory.makeFromEdges(edges);
}
//----------------------------------------------------------------------------------------
//Query routines (obtaining sets of edges/vertices over predicates, etc)
//----------------------------------------------------------------------------------------
/**
* Finds the vertex in the given SemanticGraph that corresponds to the given node.
* Returns null if cannot find. Uses first match on index, sentIndex, and word values.
*/
public static IndexedWord findMatchingNode(IndexedWord node,
SemanticGraph sg) {
for (IndexedWord tgt : sg.vertexSet()) {
if ((tgt.index() == node.index()) &&
(tgt.sentIndex() == node.sentIndex()) &&
(tgt.word().equals(node.word())) )
return tgt;
}
return null;
}
/**
* Given a starting vertice, grabs the subtree encapsulated by portion of the semantic graph, excluding
* a given edge. A tabu list is maintained, in order to deal with cyclical relations (such as between a
* rcmod (relative clause) and its nsubj).
*
*/
public static Set<SemanticGraphEdge> getSubTreeEdges(IndexedWord vertice, SemanticGraph sg, SemanticGraphEdge excludedEdge) {
Set<SemanticGraphEdge> tabu = Generics.newHashSet();
tabu.add(excludedEdge);
getSubTreeEdgesHelper(vertice, sg, tabu);
tabu.remove(excludedEdge); // Do not want this in the returned edges
return tabu;
}
public static void getSubTreeEdgesHelper(IndexedWord vertice, SemanticGraph sg, Set<SemanticGraphEdge> tabuEdges) {
for (SemanticGraphEdge edge : sg.outgoingEdgeIterable(vertice)) {
if (!tabuEdges.contains(edge)) {
IndexedWord dep = edge.getDependent();
tabuEdges.add(edge);
getSubTreeEdgesHelper(dep, sg, tabuEdges);
}
}
}
/**
* Given a set of nodes from a SemanticGraph, returns the set of
* edges that are spanned between these nodes.
*/
public static Collection<SemanticGraphEdge> getEdgesSpannedByVertices(Collection<IndexedWord> nodes, SemanticGraph sg) {
Collection<SemanticGraphEdge> ret = Generics.newHashSet();
for (IndexedWord n1 : nodes)
for (IndexedWord n2: nodes) {
if (n1 != n2) {
Collection<SemanticGraphEdge> edges = sg.getAllEdges(n1, n2);
if (edges != null) ret.addAll(edges);
}
}
return ret;
}
/**
* Returns a list of all children bearing a grammatical relation starting with the given string, relnPrefix
*/
public static List<IndexedWord> getChildrenWithRelnPrefix(SemanticGraph graph, IndexedWord vertex, String relnPrefix) {
if (vertex.equals(IndexedWord.NO_WORD))
return new ArrayList<IndexedWord>();
if (!graph.containsVertex(vertex)) {
throw new IllegalArgumentException();
}
List<IndexedWord> childList = new ArrayList<IndexedWord>();
for (SemanticGraphEdge edge : graph.outgoingEdgeIterable(vertex)) {
if (edge.getRelation().toString().startsWith(relnPrefix)) {
childList.add(edge.getTarget());
}
}
return childList;
}
/**
* Returns a list of all children bearing a grammatical relation starting with the given set of relation prefixes
*/
public static List<IndexedWord> getChildrenWithRelnPrefix(SemanticGraph graph, IndexedWord vertex, Collection<String> relnPrefixes) {
if (vertex.equals(IndexedWord.NO_WORD))
return new ArrayList<IndexedWord>();
if (!graph.containsVertex(vertex)) {
throw new IllegalArgumentException();
}
List<IndexedWord> childList = new ArrayList<IndexedWord>();
for (SemanticGraphEdge edge : graph.outgoingEdgeIterable(vertex)) {
String edgeString = edge.getRelation().toString();
for (String relnPrefix : relnPrefixes) {
if (edgeString.startsWith(relnPrefix)) {
childList.add(edge.getTarget());
break;
}
}
}
return childList;
}
/**
* Since graphs can be have preps collapsed, finds all the immediate children of this node
* that are linked by a collapsed preposition edge.
*/
public static List<IndexedWord> getChildrenWithPrepC(SemanticGraph sg, IndexedWord vertex) {
List<IndexedWord> ret =new ArrayList<IndexedWord>();
// Collection<GrammaticalRelation> prepCs = EnglishGrammaticalRelations.getPrepsC();
// for (SemanticGraphEdge edge : sg.outgoingEdgesOf(vertex)) {
// if (prepCs.contains(edge.getRelation()))
for (SemanticGraphEdge edge : sg.outgoingEdgeIterable(vertex)) {
if (edge.getRelation().toString().startsWith("prep"))
ret.add(edge.getDependent());
}
return ret;
}
/**
* Returns the set of incoming edges for the given node that have the given
* relation.
*
* Because certain edges may remain in string form (prepcs), check for both
* string and object form of relations.
*/
public static List<SemanticGraphEdge> incomingEdgesWithReln(IndexedWord node, SemanticGraph sg, GrammaticalRelation reln) {
return edgesWithReln(sg.incomingEdgeIterable(node), reln);
}
/**
* Checks for outgoing edges of the node, in the given graph, which contain
* the given relation. Relations are matched on if they are GrammaticalRelation
* objects or strings.
*/
public static List<SemanticGraphEdge> outgoingEdgesWithReln(IndexedWord node, SemanticGraph sg, GrammaticalRelation reln) {
return edgesWithReln(sg.outgoingEdgeIterable(node), reln);
}
/**
* Given a list of edges, returns those which match the given relation (can be string or
* GrammaticalRelation object).
*/
public static List<SemanticGraphEdge> edgesWithReln(Iterable<SemanticGraphEdge> edges,
GrammaticalRelation reln) {
List<SemanticGraphEdge> found = Generics.newArrayList();
for (SemanticGraphEdge edge : edges) {
GrammaticalRelation tgtReln = edge.getRelation();
if (tgtReln.equals(reln)) {
found.add(edge);
}
}
return found;
}
/**
* Given a semantic graph, and a relation prefix, returns a list of all relations (edges)
* that start with the given prefix (e.g., prefix "prep" gives you all the prep relations: prep_by, pref_in,etc.)
*
*/
public static List<SemanticGraphEdge> findAllRelnsWithPrefix(SemanticGraph sg, String prefix) {
ArrayList<SemanticGraphEdge> relns = new ArrayList<SemanticGraphEdge>();
for (SemanticGraphEdge edge : sg.edgeIterable()) {
GrammaticalRelation edgeRelation = edge.getRelation();
if (edgeRelation.toString().startsWith(prefix)) {
relns.add(edge);
}
}
return relns;
}
/**
* Finds the descendents of the given node in graph, avoiding the given set of nodes
*/
public static Set<IndexedWord> tabuDescendants(SemanticGraph sg, IndexedWord vertex, Collection<IndexedWord> tabu) {
if (!sg.containsVertex(vertex)) {
throw new IllegalArgumentException();
}
// Do a depth first search
Set<IndexedWord> descendantSet = Generics.newHashSet();
tabuDescendantsHelper(sg, vertex, descendantSet, tabu, null, null);
return descendantSet;
}
/**
* Finds the set of descendants for a node in the graph, avoiding the set of nodes and the
* set of edge relations. NOTE: these edges are encountered from the downward cull,
* from governor to dependent.
*/
public static Set<IndexedWord> tabuDescendants(SemanticGraph sg, IndexedWord vertex, Collection<IndexedWord> tabu,
Collection<GrammaticalRelation> tabuRelns) {
if (!sg.containsVertex(vertex)) {
throw new IllegalArgumentException();
}
// Do a depth first search
Set<IndexedWord> descendantSet = Generics.newHashSet();
tabuDescendantsHelper(sg, vertex, descendantSet, tabu, tabuRelns, null);
return descendantSet;
}
public static Set<IndexedWord> descendantsTabuRelns(SemanticGraph sg, IndexedWord vertex,
Collection<GrammaticalRelation> tabuRelns) {
if (!sg.containsVertex(vertex)) {
throw new IllegalArgumentException();
}
// Do a depth first search
Set<IndexedWord> descendantSet = Generics.newHashSet();
tabuDescendantsHelper(sg, vertex, descendantSet, Generics.<IndexedWord>newHashSet(), tabuRelns, null);
return descendantSet;
}
public static Set<IndexedWord> descendantsTabuTestAndRelns(SemanticGraph sg, IndexedWord vertex,
Collection<GrammaticalRelation> tabuRelns, IndexedWordUnaryPred tabuTest) {
if (!sg.containsVertex(vertex)) {
throw new IllegalArgumentException();
}
// Do a depth first search
Set<IndexedWord> descendantSet = Generics.newHashSet();
tabuDescendantsHelper(sg, vertex, descendantSet, Generics.<IndexedWord>newHashSet(), tabuRelns, tabuTest);
return descendantSet;
}
public static Set<IndexedWord> descendantsTabuTestAndRelns(SemanticGraph sg, IndexedWord vertex,
Collection<IndexedWord> tabuNodes, Collection<GrammaticalRelation> tabuRelns, IndexedWordUnaryPred tabuTest) {
if (!sg.containsVertex(vertex)) {
throw new IllegalArgumentException();
}
// Do a depth first search
Set<IndexedWord> descendantSet = Generics.newHashSet();
tabuDescendantsHelper(sg, vertex, descendantSet, tabuNodes, tabuRelns, tabuTest);
return descendantSet;
}
/**
* Performs a cull for the descendents of the given node in the
* graph, subject to the tabu nodes to avoid, relations to avoid
* crawling over, and child nodes to avoid traversing to based upon
* a predicate test.
*/
private static void tabuDescendantsHelper(SemanticGraph sg, IndexedWord curr, Set<IndexedWord> descendantSet, Collection<IndexedWord> tabu,
Collection<GrammaticalRelation> relnsToAvoid, IndexedWordUnaryPred tabuTest) {
if (tabu.contains(curr))
return;
if (descendantSet.contains(curr)) {
return;
}
descendantSet.add(curr);
for (IndexedWord child : sg.getChildren(curr)) {
for (SemanticGraphEdge edge : sg.getAllEdges(curr, child)) {
if (relnsToAvoid != null && relnsToAvoid.contains(edge.getRelation()))
continue;
if (tabuTest != null && tabuTest.test(edge.getDependent(), sg))
continue;
tabuDescendantsHelper(sg, child, descendantSet, tabu, relnsToAvoid,
tabuTest);
}
}
}
//------------------------------------------------------------------------------------
//"Constituent" extraction and manipulation
//------------------------------------------------------------------------------------
/**
* Returns the vertice that is "leftmost." Note this requires that the IndexedFeatureLabels present actually have
* ordering information.
* TODO: can be done more efficiently?
*/
public static IndexedWord leftMostChildVertice(IndexedWord startNode, SemanticGraph sg) {
TreeSet<IndexedWord> vertices = new TreeSet<IndexedWord>();
for (IndexedWord vertex : sg.descendants(startNode)) {
vertices.add(vertex);
}
return vertices.first();
}
/**
* Given a SemanticGraph, and a set of nodes, finds the "blanket" of nodes that are one
* edge away from the set of nodes passed in. This is similar to the idea of a Markov
* Blanket, except in the context of a SemanticGraph.
* TODO: optimize
*/
public static Collection<IndexedWord> getDependencyBlanket(SemanticGraph sg, Collection<IndexedWord> assertedNodes) {
Set<IndexedWord> retSet = Generics.newHashSet();
for (IndexedWord curr : sg.vertexSet()) {
if (!assertedNodes.contains(curr) && !retSet.contains(curr)) {
for (IndexedWord assertedNode : assertedNodes) {
if (sg.containsEdge(assertedNode, curr) ||
sg.containsEdge(curr, assertedNode)) {
retSet.add(curr);
}
}
}
}
return retSet;
}
/**
* Resets the indices for the vertices in the graph, using the current
* ordering returned by vertexList (presumably in order). This is to ensure
* accesses to the InfoFile word table do not fall off after a SemanticGraph has
* been edited.
* <br>
* NOTE: the vertices will be replaced, as JGraphT does not permit
* in-place modification of the nodes. (TODO: we no longer use
* JGraphT, so this should be fixed)
*/
public static SemanticGraph resetVerticeOrdering(SemanticGraph sg) {
SemanticGraph nsg = new SemanticGraph();
List<IndexedWord> vertices = sg.vertexListSorted();
int index = 1;
Map<IndexedWord, IndexedWord> oldToNewVertices = Generics.newHashMap();
List<IndexedWord> newVertices = new ArrayList<IndexedWord>();
for (IndexedWord vertex : vertices) {
IndexedWord newVertex = new IndexedWord(vertex);
newVertex.setIndex(index++);
oldToNewVertices.put(vertex, newVertex);
///sg.removeVertex(vertex);
newVertices.add(newVertex);
}
for (IndexedWord nv : newVertices) {
nsg.addVertex(nv);
}
List<IndexedWord> newRoots = new ArrayList<IndexedWord>();
for (IndexedWord or : sg.getRoots()) {
newRoots.add(oldToNewVertices.get(or));
}
nsg.setRoots(newRoots);
for (SemanticGraphEdge edge : sg.edgeIterable()) {
IndexedWord newGov = oldToNewVertices.get(edge.getGovernor());
IndexedWord newDep = oldToNewVertices.get(edge.getDependent());
nsg.addEdge(newGov, newDep, edge.getRelation(), edge.getWeight(), edge.isExtra());
}
return nsg;
}
/**
* Given a graph, ensures all edges are EnglishGrammaticalRelations
* NOTE: this is English specific
* NOTE: currently EnglishGrammaticalRelations does not link collapsed prep string forms
* back to their object forms, for its valueOf relation. This may need to be repaired if
* generated edges indeed do have collapsed preps as strings.
*/
public static void enRepairEdges(SemanticGraph sg, boolean verbose) {
for (SemanticGraphEdge edge : sg.edgeIterable()) {
if (edge.getRelation().isFromString()) {
GrammaticalRelation newReln =
EnglishGrammaticalRelations.valueOf(edge.getRelation().toString());
if (newReln != null) {
IndexedWord gov = edge.getGovernor();
IndexedWord dep = edge.getDependent();
double weight = edge.getWeight();
boolean isExtra = edge.isExtra();
sg.removeEdge(edge);
sg.addEdge(gov, dep, newReln, weight, isExtra);
} else {
if (verbose)
System.err.println("Warning, could not find matching GrammaticalRelation for reln="+edge.getRelation());
}
}
}
}
public static void enRepairEdges(SemanticGraph sg) {
enRepairEdges(sg, false);
}
/**
* Deletes all nodes that are not rooted (such as dangling vertices after a series of
* edges have been chopped).
*/
public static void killNonRooted(SemanticGraph sg) {
List<IndexedWord> nodes = new ArrayList<IndexedWord>(sg.vertexSet());
// Hack: store all of the nodes we know are in the rootset
Set<IndexedWord> guaranteed = Generics.newHashSet();
for (IndexedWord root : sg.getRoots()) {
guaranteed.add(root);
guaranteed.addAll(sg.descendants(root));
}
for (IndexedWord node : nodes) {
if (!guaranteed.contains(node)) {
sg.removeVertex(node);
}
}
}
/**
* Replaces a node in the given SemanticGraph with the new node,
* replacing its position in the node edges.
*/
public static void replaceNode(IndexedWord newNode, IndexedWord oldNode, SemanticGraph sg) {
// Obtain the edges where the old node was the governor and the dependent.
// Remove the old node, insert the new, and re-insert the edges.
// Save the edges in a list so that remove operations don't affect
// the iterator or our ability to find the edges in the first place
List<SemanticGraphEdge> govEdges = sg.outgoingEdgeList(oldNode);
List<SemanticGraphEdge> depEdges = sg.incomingEdgeList(oldNode);
boolean oldNodeRemoved = sg.removeVertex(oldNode);
if (oldNodeRemoved) {
// If the new node is not present, be sure to add it in.
if (!sg.containsVertex(newNode)) {
sg.addVertex(newNode);
}
for (SemanticGraphEdge govEdge : govEdges) {
sg.removeEdge(govEdge);
sg.addEdge(newNode, govEdge.getDependent(), govEdge.getRelation(), govEdge.getWeight(), govEdge.isExtra());
}
for (SemanticGraphEdge depEdge : depEdges) {
sg.removeEdge(depEdge);
sg.addEdge(depEdge.getGovernor(), newNode, depEdge.getRelation(), depEdge.getWeight(), depEdge.isExtra());
}
} else {
System.err.println("SemanticGraphUtils.replaceNode: previous node does not exist");
}
}
public static final String WILDCARD_VERTICE_TOKEN = "WILDCARD";
public static final IndexedWord WILDCARD_VERTICE = new IndexedWord();
static {
WILDCARD_VERTICE.setWord("*");
WILDCARD_VERTICE.setValue("*");
WILDCARD_VERTICE.setOriginalText("*");
}
/**
* GIven an iterable set of distinct vertices, creates a new mapping that maps the
* original vertices to a set of "generic" versions. Used for generalizing tokens in discovered rules.
* @param verts Vertices to anonymize
* @param prefix Prefix to assign to this anonymization
*/
public static Map<IndexedWord, IndexedWord> anonymyizeNodes(Iterable<IndexedWord> verts, String prefix) {
Map<IndexedWord, IndexedWord> retMap = Generics.newHashMap();
int index = 1;
for (IndexedWord orig: verts) {
IndexedWord genericVert = new IndexedWord(orig);
genericVert.set(CoreAnnotations.LemmaAnnotation.class, "");
String genericValue = prefix+index;
genericVert.setValue(genericValue);
genericVert.setWord(genericValue);
genericVert.setOriginalText(genericValue);
index++;
retMap.put(orig, genericVert);
}
return retMap;
}
public static final String SHARED_NODE_ANON_PREFIX ="A";
public static final String BLANKET_NODE_ANON_PREFIX ="B";
/**
* Used to make a mapping that lets you create "anonymous" versions of shared nodes between two
* graphs (given in the arg) using the shared prefix.
*/
public static Map<IndexedWord, IndexedWord> makeGenericVertices(Iterable<IndexedWord> verts) {
return anonymyizeNodes(verts, SHARED_NODE_ANON_PREFIX);
}
/**
* Used to assign generic labels to the nodes in the "blanket" for a set of vertices in a graph. Here, a "blanket" node is
* similar to nodes in a Markov Blanket, i.e. nodes that are one edge away from a set of asserted vertices in a
* SemanticGraph.
*/
public static Map<IndexedWord, IndexedWord> makeBlanketVertices(Iterable<IndexedWord> verts) {
return anonymyizeNodes(verts, BLANKET_NODE_ANON_PREFIX);
}
/**
* Given a set of edges, and a mapping between the replacement and target vertices that comprise the
* vertices of the edges, returns a new set of edges with the replacement vertices. If a replacement
* is not present, the WILDCARD_VERTICE is used in its place (i.e. can be anything).
*
* Currently used to generate "generic" versions of Semantic Graphs, when given a list of generic
* vertices to replace with, but can conceivably be used for other purposes where vertices must
* be replaced.
*/
public static List<SemanticGraphEdge> makeReplacedEdges(Iterable<SemanticGraphEdge> edges, Map<IndexedWord, IndexedWord> vertReplacementMap,
boolean useGenericReplacement) {
List<SemanticGraphEdge> retList = new ArrayList<SemanticGraphEdge>();
for (SemanticGraphEdge edge : edges) {
IndexedWord gov = edge.getGovernor();
IndexedWord dep = edge.getDependent();
IndexedWord newGov = vertReplacementMap.get(gov);
IndexedWord newDep = vertReplacementMap.get(dep);
if (useGenericReplacement) {
if (newGov == null) {
newGov = new IndexedWord(gov);
newGov.set(CoreAnnotations.TextAnnotation.class, WILDCARD_VERTICE_TOKEN);
newGov.set(CoreAnnotations.OriginalTextAnnotation.class, WILDCARD_VERTICE_TOKEN);
newGov.set(CoreAnnotations.LemmaAnnotation.class, WILDCARD_VERTICE_TOKEN);
}
if (newDep == null) {
newDep = new IndexedWord(dep);
newDep.set(CoreAnnotations.TextAnnotation.class, WILDCARD_VERTICE_TOKEN);
newDep.set(CoreAnnotations.OriginalTextAnnotation.class, WILDCARD_VERTICE_TOKEN);
newDep.set(CoreAnnotations.LemmaAnnotation.class,WILDCARD_VERTICE_TOKEN);
}
} else {
if (newGov == null)
newGov = edge.getGovernor();
if (newDep == null)
newDep = edge.getDependent();
}
SemanticGraphEdge newEdge = new SemanticGraphEdge(newGov, newDep, edge.getRelation(), edge.getWeight(), edge.isExtra());
retList.add(newEdge);
}
return retList;
}
/**
* Given a set of vertices from the same graph, returns the set of all edges between these
* vertices.
*/
public static Set<SemanticGraphEdge> allEdgesInSet(Iterable<IndexedWord> vertices, SemanticGraph sg) {
Set<SemanticGraphEdge> edges = Generics.newHashSet();
for (IndexedWord v1 : vertices) {
for (SemanticGraphEdge edge : sg.outgoingEdgeIterable(v1)) {
edges.add(edge);
}
for (SemanticGraphEdge edge : sg.incomingEdgeIterable(v1)) {
edges.add(edge);
}
}
return edges;
}
/**
* Given two iterable sequences of edges, returns a pair containing the set of
* edges in the first graph not in the second, and edges in the second not in the first.
* Edge equality is determined using an object that implements ISemanticGraphEdgeEql.
*
*/
public static EdgeDiffResult diffEdges(Collection<SemanticGraphEdge> edges1, Collection<SemanticGraphEdge> edges2,
SemanticGraph sg1, SemanticGraph sg2,
ISemanticGraphEdgeEql compareObj) {
Set<SemanticGraphEdge> remainingEdges1 = Generics.newHashSet();
Set<SemanticGraphEdge> remainingEdges2 = Generics.newHashSet();
Set<SemanticGraphEdge> sameEdges = Generics.newHashSet();
ArrayList<SemanticGraphEdge> edges2Cache = new ArrayList<SemanticGraphEdge>(edges2);
edge1Loop:
for (SemanticGraphEdge edge1 : edges1) {
for (SemanticGraphEdge edge2 : edges2Cache) {
if (compareObj.equals(edge1, edge2, sg1, sg2)) {
sameEdges.add(edge1);
edges2Cache.remove(edge2);
continue edge1Loop;
}
}
remainingEdges1.add(edge1);
}
ArrayList<SemanticGraphEdge> edges1Cache = new ArrayList<SemanticGraphEdge>(edges1);
edge2Loop:
for (SemanticGraphEdge edge2 : edges2) {
for (SemanticGraphEdge edge1 : edges1) {
if (compareObj.equals(edge1, edge2, sg1, sg2)) {
edges1Cache.remove(edge1);
continue edge2Loop;
}
}
remainingEdges2.add(edge2);
}
return new EdgeDiffResult(sameEdges, remainingEdges1, remainingEdges2);
}
public static class EdgeDiffResult {
Set<SemanticGraphEdge> sameEdges;
Set<SemanticGraphEdge> remaining1;
Set<SemanticGraphEdge> remaining2;
public EdgeDiffResult(Set<SemanticGraphEdge> sameEdges,
Set<SemanticGraphEdge> remaining1,
Set<SemanticGraphEdge> remaining2) {
this.sameEdges = sameEdges;
this.remaining1 = remaining1;
this.remaining2 = remaining2;
}
public Set<SemanticGraphEdge> getRemaining1() {
return remaining1;
}
public Set<SemanticGraphEdge> getRemaining2() {
return remaining2;
}
public Set<SemanticGraphEdge> getSameEdges() {
return sameEdges;
}
}
/**
* Pretty printers
*/
public static String printEdges(Iterable<SemanticGraphEdge> edges) {
StringWriter buf = new StringWriter();
for (SemanticGraphEdge edge : edges) {
buf.append("\t");
buf.append(edge.getRelation().toString());
buf.append("(");
buf.append(edge.getGovernor().toString());
buf.append(", ");
buf.append(edge.getDependent().toString());
buf.append(")\n");
}
return buf.toString();
}
public static class PrintVerticeParams {
public boolean showWord = true;
public boolean showIndex = true;
public boolean showSentIndex = false;
public boolean showPOS = false;
public int wrapAt = 8;
}
public static String printVertices(SemanticGraph sg) {
return printVertices(sg, new PrintVerticeParams());
}
public static String printVertices(SemanticGraph sg, PrintVerticeParams params) {
StringWriter buf = new StringWriter();
int count = 0;
for (IndexedWord word : sg.vertexListSorted()) {
count++;
if (count % params.wrapAt == 0) { buf.write("\n\t"); }
if (params.showIndex) {
buf.write(String.valueOf(word.index()));
buf.write(":");
}
if (params.showSentIndex) {
buf.write("s");
buf.write(String.valueOf(word.sentIndex()));
buf.write("/");
}
if (params.showPOS) {
buf.write(word.tag());
buf.write("/");
}
if (params.showWord) {
buf.write(word.word());
}
buf.write(" ");
}
return buf.toString();
}
/**
* Given a SemanticGraph, creates a SemgrexPattern string based off of this graph.
* NOTE: the word() value of the vertice is the name to reference
* NOTE: currently presumes there is only one root in this graph.
* TODO: see if Semgrex can allow multiroot patterns
* @param sg SemanticGraph to base this pattern on.
*/
public static String semgrexFromGraph(SemanticGraph sg, boolean matchTag, boolean matchWord,
Map<IndexedWord, String> nodeNameMap) throws Exception {
return semgrexFromGraph(sg, null, matchTag, matchWord, nodeNameMap);
}
public static String semgrexFromGraph(SemanticGraph sg, Collection<IndexedWord> wildcardNodes,
boolean useTag, boolean useWord, Map<IndexedWord, String> nodeNameMap) throws Exception {
IndexedWord patternRoot = sg.getFirstRoot();
StringWriter buf = new StringWriter();
Set<IndexedWord> tabu = Generics.newHashSet();
Set<SemanticGraphEdge> seenEdges = Generics.newHashSet();
buf.append(semgrexFromGraphHelper(patternRoot, sg, tabu, seenEdges, true, true, wildcardNodes,
useTag, useWord, nodeNameMap, false));
String patternString = buf.toString();
return patternString;
}
/**
* Given a set of edges that form a rooted and connected graph, returns a Semgrex pattern
* corresponding to it.
* @throws Exception
*/
public static String semgrexFromGraph(Iterable<SemanticGraphEdge> edges, boolean matchTag,
boolean matchWord, Map<IndexedWord, String> nodeNameMap) throws Exception {
SemanticGraph sg = SemanticGraphFactory.makeFromEdges(edges);
return semgrexFromGraph(sg, matchTag, matchWord, nodeNameMap);
}
/**
* Recursive call to generate the Semgrex pattern based off of this SemanticGraph.
* Currently presumes the only elements to match on are the tags.
* TODO: consider tag generalization, and matching off of other features?
*/
protected static String semgrexFromGraphHelper(IndexedWord vertice, SemanticGraph sg,
Set<IndexedWord> tabu, Set<SemanticGraphEdge> seenEdges, boolean useWordAsLabel, boolean nameEdges, Collection<IndexedWord> wildcardNodes,
boolean useTag, boolean useWord, Map<IndexedWord, String> nodeNameMap, boolean orderedNodes) {
StringWriter buf = new StringWriter();
// If the node is a wildcarded one, treat it as a {}, meaning any match. Currently these will not
// be labeled, but this may change later.
if (wildcardNodes != null && wildcardNodes.contains(vertice)) {
buf.append("{}");
} else {
buf.append("{");
if (useTag) {
buf.append("tag:"); buf.append(vertice.tag());
if (useWord)
buf.append(";");
}
if (useWord) {
buf.append("word:"); buf.append(vertice.word());
}
buf.append("}");
}
if (nodeNameMap != null) {
buf.append("=");
buf.append(nodeNameMap.get(vertice));
buf.append(" ");
} else if (useWordAsLabel) {
buf.append("=");
buf.append(sanitizeForSemgrexName(vertice.word()));
buf.append(" ");
}
tabu.add(vertice);
Iterable<SemanticGraphEdge> edgeIter = null;
if(!orderedNodes){
edgeIter = sg.outgoingEdgeIterable(vertice);
} else{
edgeIter = CollectionUtils.sorted(sg.outgoingEdgeList(vertice), (arg0, arg1) ->
(arg0.getRelation().toString().compareTo(arg1.getRelation().toString())));
}
// For each edge, record the edge, but do not traverse to the vertice if it is already in the
// tabu list. If it already is, we emit the edge and the target vertice, as
// we will not be continuing in that vertex, but we wish to record the relation.
// If we will proceed down that node, add parens if it will continue recursing down.
for (SemanticGraphEdge edge : edgeIter) {
seenEdges.add(edge);
IndexedWord tgtVert = edge.getDependent();
boolean applyParens =
sg.outDegree(tgtVert) > 0 && !tabu.contains(tgtVert);
buf.append(" >");
buf.append(edge.getRelation().toString());
if (nameEdges) {
buf.append("=E");
buf.write(String.valueOf(seenEdges.size()));
}
buf.append(" ");
if (applyParens)
buf.append("(");
if (tabu.contains(tgtVert)) {
buf.append("{tag:"); buf.append(tgtVert.tag()); buf.append("}");
if (useWordAsLabel) {
buf.append("=");
buf.append(tgtVert.word());
buf.append(" ");
}
} else {
buf.append(semgrexFromGraphHelper(tgtVert, sg, tabu, seenEdges, useWordAsLabel, nameEdges,
wildcardNodes, useTag, useWord, nodeNameMap, orderedNodes));
if (applyParens)
buf.append(")");
}
}
return buf.toString();
}
public static String semgrexFromGraphOrderedNodes(SemanticGraph sg, Collection<IndexedWord> wildcardNodes,
boolean useTag, boolean useWord, Map<IndexedWord, String> nodeNameMap) throws Exception {
IndexedWord patternRoot = sg.getFirstRoot();
StringWriter buf = new StringWriter();
Set<IndexedWord> tabu = Generics.newHashSet();
Set<SemanticGraphEdge> seenEdges = Generics.newHashSet();
buf.append(semgrexFromGraphHelper(patternRoot, sg, tabu, seenEdges, true, true, wildcardNodes,
useTag, useWord, nodeNameMap, true));
String patternString = buf.toString();
return patternString;
}
/**
* Sanitizes the given string into a Semgrex friendly name
*/
public static String sanitizeForSemgrexName(String text) {
text = text.replaceAll("\\.", "_DOT_");
text = text.replaceAll("\\,", "_COMMA_");
text = text.replaceAll("\\\\", "_BSLASH_");
text = text.replaceAll("\\/", "_BSLASH_");
text = text.replaceAll("\\?", "_QUES_");
text = text.replaceAll("\\!", "_BANG_");
text = text.replaceAll("\\$", "_DOL_");
text = text.replaceAll("\\!", "_BANG_");
text = text.replaceAll("\\&", "_AMP_");
text = text.replaceAll("\\:", "_COL_");
text = text.replaceAll("\\;", "_SCOL_");
text = text.replaceAll("\\#", "_PND_");
text = text.replaceAll("\\@", "_AND_");
text = text.replaceAll("\\%", "_PER_");
text = text.replaceAll("\\(","_LRB_");
text = text.replaceAll("\\)", "_RRB_");
return text;
}
/**
* Given a <code>SemanticGraph</code>, sets the lemmas on its label
* objects based on their word and tag.
*/
public static void lemmatize(SemanticGraph sg) {
for (IndexedWord node : sg.vertexSet()) {
String lemma = Morphology.stemStatic(node.word(), node.tag()).word();
node.setLemma(lemma);
}
}
/**
* GIven a graph, returns a new graph with the the new sentence index enforced.
* NOTE: new vertices are inserted.
* TODO: is this ok? rewrite this?
*/
public static SemanticGraph setSentIndex(SemanticGraph sg, int newSentIndex) {
SemanticGraph newGraph = new SemanticGraph(sg);
List<IndexedWord> prevRoots = new ArrayList<IndexedWord>(newGraph.getRoots());
List<IndexedWord> newRoots = new ArrayList<IndexedWord>();
// TODO: we are using vertexListSorted here because we're changing
// vertices while iterating. Perhaps there is a better way to do it.
for (IndexedWord node : newGraph.vertexListSorted()) {
IndexedWord newWord = new IndexedWord(node);
newWord.setSentIndex(newSentIndex);
SemanticGraphUtils.replaceNode(newWord, node, newGraph);
if (prevRoots.contains(node))
newRoots.add(newWord);
}
newGraph.setRoots(newRoots);
return newGraph;
}
//-----------------------------------------------------------------------------------------------
// Graph redundancy checks
//-----------------------------------------------------------------------------------------------
/**
* Removes duplicate graphs from the set, using the string form of the graph
* as the key (obviating issues with object equality).
*/
public static Collection<SemanticGraph> removeDuplicates(Collection<SemanticGraph> graphs) {
Map<String, SemanticGraph> map = Generics.newHashMap();
for (SemanticGraph sg : graphs) {
String keyVal = sg.toString().intern();
map.put(keyVal, sg);
}
return map.values();
}
/**
* Given the set of graphs to remove duplicates from, also removes those on the tabu graphs
* (and does not include them in the return set).
*/
public static Collection<SemanticGraph> removeDuplicates(Collection<SemanticGraph> graphs,
Collection<SemanticGraph> tabuGraphs) {
Map<String, SemanticGraph> tabuMap = Generics.newHashMap();
for (SemanticGraph tabuSg : tabuGraphs) {
String keyVal = tabuSg.toString().intern();
tabuMap.put(keyVal, tabuSg);
}
Map<String, SemanticGraph> map = Generics.newHashMap();
for (SemanticGraph sg : graphs) {
String keyVal = sg.toString().intern();
if (tabuMap.containsKey(keyVal))
continue;
map.put(keyVal, sg);
}
return map.values();
}
public static Collection<SemanticGraph> removeDuplicates(Collection<SemanticGraph> graphs,
SemanticGraph tabuGraph) {
Collection<SemanticGraph> tabuSet = Generics.newHashSet();
tabuSet.add(tabuGraph);
return removeDuplicates(graphs, tabuSet);
}
// -----------------------------------------------------------------------------------------------
// Tree matching code
// -----------------------------------------------------------------------------------------------
/**
* Given a CFG Tree parse, and the equivalent SemanticGraph derived from that Tree, generates a mapping
* from each of the tree terminals to the best-guess SemanticGraph node(s).
* This is performed using lexical matching, finding the nth match.
* NOTE: not all tree nodes may match a Semgraph node, esp. for tokens removed in a collapsed Semgraph,
* such as prepositions.
*/
public static Map<PositionedTree, IndexedWord> mapTreeToSg(Tree tree, SemanticGraph sg) {
// In order to keep track of positions, we store lists, in order encountered, of lex terms.
// e.g. lexToTreeNode.get("the").get(2) should point to the same word as lexToSemNode.get("the").get(2)
// Because IndexedWords may be collapsed together "A B" -> "A_B", we check the value of current(), and
// split on whitespace if present.
MapList<String, TreeNodeProxy> lexToTreeNode = new MapList<String, TreeNodeProxy>();
MapList<String, IndexedWordProxy> lexToSemNode = new MapList<String, IndexedWordProxy>();
for (Tree child : tree.getLeaves()) {
List<TreeNodeProxy> leafProxies = TreeNodeProxy.create(child, tree);
for (TreeNodeProxy proxy : leafProxies)
lexToTreeNode.add(proxy.lex, proxy);
}
Map<IndexedWord, Integer> depthMap = Generics.newHashMap();
for (IndexedWord node : sg.vertexSet()) {
List<IndexedWord> path = sg.getPathToRoot(node);
if (path != null)
depthMap.put(node, path.size());
else
depthMap.put(node, 99999); // Use an arbitrarily deep depth value, to trick it into never being used.
List<IndexedWordProxy> nodeProxies = IndexedWordProxy.create(node);
for (IndexedWordProxy proxy : nodeProxies)
lexToSemNode.add(proxy.lex, proxy);
}
// Now the map-lists (string->position encountered indices) are populated,
// simply go through, finding matches.
// NOTE: we use TreeNodeProxy instead of keying off of Tree, as
// hash codes for Tree nodes do not consider position of the tree
// within a tree: two subtrees with the same layout and child
// labels will be equal.
Map<PositionedTree, IndexedWord> map = Generics.newHashMap();
for (String lex : lexToTreeNode.keySet()) {
for (int i=0;i<lexToTreeNode.size(lex) && i<lexToSemNode.size(lex);i++) {
map.put(new PositionedTree(lexToTreeNode.get(lex, i).treeNode, tree), lexToSemNode.get(lex,i).node);
}
}
// Now that a terminals to terminals map has been generated, account for the
// tree non-terminals.
for (Tree nonTerm : tree) {
if (!nonTerm.isLeaf()) {
IndexedWord bestNode = null;
int bestScore = 99999;
for (Tree curr : nonTerm) {
IndexedWord equivNode = map.get(new PositionedTree(curr, tree));
if ((equivNode == null) || !depthMap.containsKey(equivNode)) continue;
int currScore = depthMap.get(equivNode);
if (currScore < bestScore) {
bestScore = currScore;
bestNode = equivNode;
}
}
if (bestNode != null) {
map.put(new PositionedTree(nonTerm, tree), bestNode);
}
}
}
return map;
}
/**
* Private helper class for <code>mapTreeToSg</code>. Acts to
* map between a Tree node and a lexical value.
* @author Eric Yeh
*
*/
private static class TreeNodeProxy {
Tree treeNode;
String lex;
Tree root;
public String toString() {
return lex+" -> "+treeNode.toString()+", #="+treeNode.nodeNumber(root);
}
private TreeNodeProxy(Tree intree, String lex, Tree root) {
this.treeNode = intree;
this.lex = lex;
this.root = root;
}
public static List<TreeNodeProxy> create(Tree intree, Tree root) {
List<TreeNodeProxy> ret = new ArrayList<TreeNodeProxy>();
if (intree.isLeaf()) {
ret.add(new TreeNodeProxy(intree, intree.label().value(), root));
} else
for (LabeledWord lword : intree.labeledYield()) {
ret.add(new TreeNodeProxy(intree, lword.word(), root));
}
return ret;
}
}
/**
* This is used to uniquely index trees within a
* Tree, maintaining the position of this subtree
* within the context of the root.
* @author Eric Yeh
*
*/
public static class PositionedTree {
Tree tree;
Tree root;
int nodeNumber;
public String toString() {
return tree+"."+nodeNumber;
}
public PositionedTree(Tree tree, Tree root) {
this.tree = tree;
this.root = root;
this.nodeNumber = tree.nodeNumber(root);
}
public boolean equals(Object obj) {
if (obj instanceof PositionedTree) {
PositionedTree tgt = (PositionedTree) obj;
return tree.equals(tgt.tree) && root.equals(tgt.root) && tgt.nodeNumber == nodeNumber;
}
return false;
}
/**
* TODO: verify this is correct
*/
@Override
public int hashCode() {
int hc = tree.hashCode() ^ (root.hashCode() << 8);
hc ^= (2 ^ nodeNumber);
return hc;
}
}
/**
* Private helper class for <code>mapTreeToSg</code>. Acts to
* map between an IndexedWord (in a SemanticGraph) and a lexical value.
* @author lumberjack
*
*/
private static final class IndexedWordProxy {
IndexedWord node;
String lex;
public String toString() {
return lex+" -> "+node.word()+":"+node.sentIndex()+"."+node.index();
}
private IndexedWordProxy(IndexedWord node, String lex) {
this.node = node; this.lex = lex;
}
/**
* Generates a set of IndexedWordProxy objects. If the current() field is present, splits the tokens by
* a space, and for each, creates a new IndexedWordProxy, in order encountered, referencing this current
* node, but using the lexical value of the current split token. Otherwise just use the value of word().
* This is used to retain attribution to the originating node.
*/
public static List<IndexedWordProxy> create(IndexedWord node) {
List<IndexedWordProxy> ret = new ArrayList<IndexedWordProxy>();
if (node.originalText().length() > 0) {
for (String token : node.originalText().split(" ")) {
ret.add(new IndexedWordProxy(node, token));
}
} else {
ret.add(new IndexedWordProxy(node, node.word()));
}
return ret;
}
}
}