package edu.stanford.nlp.pipeline;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.TwoDimensionalMap;
public abstract class AnnotationSerializer {
/**
 * Write a single document to the given stream. Subsequent calls to write on the same stream must
 * supply the output stream returned by the previous call; implementations must therefore be prepared
 * to be handed back the same output stream that they returned on the previous write.
 *
 * @param corpus The document to serialize to the stream.
 * @param os The output stream to serialize to.
 * @return The output stream which should be closed when done writing, and which should be passed into
 *         subsequent calls to write() on this serializer.
 * @throws IOException Thrown if the underlying output stream throws the exception.
 */
public abstract OutputStream write(Annotation corpus, OutputStream os) throws IOException;
/**
 * Read a single document from the given stream. Subsequent calls to read on the same input stream
 * must supply the input stream returned by the previous call; implementations must therefore be
 * prepared to be handed back the same input stream that they returned on the previous read.
 *
 * @param is The input stream to read a document from.
 * @return A pair of the read document and the implementation-specific input stream it was actually
 *         read from. This stream should be passed to subsequent calls to read on the same stream,
 *         and should be closed when reading completes.
 * @throws IOException Thrown if the underlying stream throws the exception.
 * @throws ClassNotFoundException Thrown if an object was read that does not exist in the classpath.
 * @throws ClassCastException Thrown if the signature of a class changed in a way that was
 *         incompatible with the serialized document.
 */
public abstract Pair<Annotation, InputStream> read(InputStream is) throws IOException, ClassNotFoundException, ClassCastException;
/**
 * Plain value holder describing one vertex of a dependency graph before it has been
 * bound to the actual tokens of a sentence.
 */
public static class IntermediateNode {
  /** Identifier of the document the node belongs to; may be {@code null}. */
  String docId;
  /** Index of the sentence the node belongs to. */
  int sentIndex;
  /** Index of the token this node refers to. */
  int index;
  /** Copy count of the node; stored from the {@code copy} constructor argument. */
  int copyAnnotation;
  /** Whether the node is marked as a root of the graph. */
  boolean isRoot;

  /**
   * Stores every argument in the field of the corresponding name.
   *
   * @param docId value for {@link #docId}
   * @param sentIndex value for {@link #sentIndex}
   * @param index value for {@link #index}
   * @param copy value for {@link #copyAnnotation}
   * @param isRoot value for {@link #isRoot}
   */
  public IntermediateNode(String docId, int sentIndex, int index, int copy, boolean isRoot) {
    this.isRoot = isRoot;
    this.copyAnnotation = copy;
    this.index = index;
    this.sentIndex = sentIndex;
    this.docId = docId;
  }
}
/**
 * Plain value holder describing one labelled edge of a dependency graph. Both endpoints
 * are identified by an (index, copy) pair rather than by a reference to a node object.
 */
public static class IntermediateEdge {
  /** Name of the relation labelling this edge. */
  String dep;
  /** Index of the edge's source node. */
  int source;
  /** Copy count of the edge's source node. */
  int sourceCopy;
  /** Index of the edge's target node. */
  int target;
  /** Copy count of the edge's target node. */
  int targetCopy;
  /** Value of the "extra" flag carried by this edge. */
  boolean isExtra;

  /**
   * Stores every argument in the field of the same name.
   *
   * @param dep value for {@link #dep}
   * @param source value for {@link #source}
   * @param sourceCopy value for {@link #sourceCopy}
   * @param target value for {@link #target}
   * @param targetCopy value for {@link #targetCopy}
   * @param isExtra value for {@link #isExtra}
   */
  public IntermediateEdge(String dep, int source, int sourceCopy, int target, int targetCopy, boolean isExtra) {
    this.isExtra = isExtra;
    this.targetCopy = targetCopy;
    this.target = target;
    this.sourceCopy = sourceCopy;
    this.source = source;
    this.dep = dep;
  }
}
/**
 * A dependency graph held as flat lists of {@link IntermediateNode}s and
 * {@link IntermediateEdge}s, which {@link #convertIntermediateGraph(List)} turns into a
 * {@link SemanticGraph} once the tokens of the sentence are available.
 */
public static class IntermediateSemanticGraph {

  /** Monitor held while a relation name is resolved and the corresponding edge is added. */
  private static final Object LOCK = new Object();

  public List<IntermediateNode> nodes;
  public List<IntermediateEdge> edges;

  /** Creates a graph with empty node and edge lists. */
  public IntermediateSemanticGraph() {
    this.nodes = new ArrayList<IntermediateNode>();
    this.edges = new ArrayList<IntermediateEdge>();
  }

  /**
   * Creates a graph holding copies of the given lists.
   *
   * @param nodes the nodes to copy into this graph
   * @param edges the edges to copy into this graph
   */
  public IntermediateSemanticGraph(List<IntermediateNode> nodes, List<IntermediateEdge> edges) {
    this.nodes = new ArrayList<IntermediateNode>(nodes);
    this.edges = new ArrayList<IntermediateEdge>(edges);
  }

  /**
   * Builds a {@link SemanticGraph} from the stored nodes and edges.
   *
   * @param sentence the tokens of the sentence; a node with index {@code i} is backed by
   *                 {@code sentence.get(i - 1)}
   * @return the assembled graph
   * @throws RuntimeIOException if an edge refers to an (index, copy) pair for which no node
   *                            was created
   */
  public SemanticGraph convertIntermediateGraph(List<CoreLabel> sentence) {
    SemanticGraph result = new SemanticGraph();

    // Step 1: materialise the vertices, remembering each one under its (index, copy count) key.
    // Sentences such as "I went over the river and through the woods" have
    // copies for "went" in the collapsed dependencies.
    TwoDimensionalMap<Integer, Integer, IndexedWord> wordsByIndexAndCopy = TwoDimensionalMap.hashMap();
    for (IntermediateNode node : nodes) {
      IndexedWord vertex = toIndexedWord(node, sentence);
      wordsByIndexAndCopy.put(vertex.index(), vertex.copyCount(), vertex);
      result.addVertex(vertex);
      if (node.isRoot) {
        result.addRoot(vertex);
      }
    }

    // Step 2: connect the vertices.
    for (IntermediateEdge edge : edges) {
      IndexedWord from = requireNode(wordsByIndexAndCopy, edge.source, edge.sourceCopy);
      IndexedWord to = requireNode(wordsByIndexAndCopy, edge.target, edge.targetCopy);
      synchronized (LOCK) {
        // this is not thread-safe: there are static fields in GrammaticalRelation
        GrammaticalRelation relation = GrammaticalRelation.valueOf(edge.dep);
        result.addEdge(from, to, relation, 1.0, edge.isExtra);
      }
    }

    // Step 3: compute root nodes if they weren't stored in the graph.
    if (!result.isEmpty() && result.getRoots().isEmpty()) {
      result.resetRoots();
    }
    return result;
  }

  /** Creates the {@link IndexedWord} for one node, backed by the matching token of the sentence. */
  private static IndexedWord toIndexedWord(IntermediateNode node, List<CoreLabel> sentence) {
    // Node indices start at 1, list positions at 0.
    CoreLabel token = sentence.get(node.index - 1);

    IndexedWord word;
    if (node.copyAnnotation <= 0) {
      word = new IndexedWord(token);
    } else {
      // TODO: if we make a copy wrapper CoreLabel, use it here instead
      word = new IndexedWord(new CoreLabel(token));
      word.setCopyCount(node.copyAnnotation);
    }

    // For backwards compatibility - new annotations should have these fields set,
    // but annotations older than August 2014 might not.
    if (word.docID() == null && node.docId != null) {
      word.setDocID(node.docId);
    }
    if (word.sentIndex() < 0 && node.sentIndex >= 0) {
      word.setSentIndex(node.sentIndex);
    }
    if (word.index() < 0 && node.index >= 0) {
      word.setIndex(node.index);
    }
    return word;
  }

  /** Returns the node stored under (index, copy), or throws if there is none. */
  private static IndexedWord requireNode(TwoDimensionalMap<Integer, Integer, IndexedWord> wordsByIndexAndCopy,
                                         int index, int copy) {
    IndexedWord found = wordsByIndexAndCopy.get(index, copy);
    if (found == null) {
      throw new RuntimeIOException("Failed to find node " + index + "-" + copy);
    }
    return found;
  }
}
}