package edu.stanford.nlp.ie.machinereading.structure;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.Set;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;
/**
* Utilities to manipulate Annotations storing datasets or sentences with Machine Reading info
* @author Mihai
*
*/
public class AnnotationUtils {
/** Private constructor: prevents code outside this class from creating instances. */
private AnnotationUtils() {} // only static methods
/**
 * Builds a new {@link Annotation} (created with empty text) and stores the given
 * sentences in it.
 *
 * @param sentences the sentences (as CoreMaps) to put into the new dataset
 * @return the newly created Annotation holding the given sentences
 */
public static Annotation createDataset(List<CoreMap> sentences) {
  final Annotation wrapper = new Annotation("");
  addSentences(wrapper, sentences);
  return wrapper;
}
/**
 * Randomized shuffle of all sentences in this dataset.
 * The sentence list is shuffled in place with a fixed seed, so repeated runs on the
 * same input produce the same order. Does nothing if the dataset has no sentence list.
 *
 * @param dataset the dataset whose SentencesAnnotation list is shuffled
 */
public static void shuffleSentences(CoreMap dataset) {
  List<CoreMap> sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);
  if (sentences == null) {
    // nothing to shuffle; Collections.shuffle would throw a NullPointerException here
    return;
  }
  // we use a constant seed for replicability of experiments
  Collections.shuffle(sentences, new Random(0));
  dataset.set(CoreAnnotations.SentencesAnnotation.class, sentences);
}
/**
 * Converts the labels of all entity mentions in this dataset to sequences of CoreLabels,
 * producing one labeled token sequence per sentence.
 *
 * @param dataset the dataset whose sentences are converted
 * @param annotationsToSkip entity types to ignore; may be null
 * @param useSubTypes if true, entity subtypes (when present) are appended to the tag
 * @param useBIO if true, tags are prefixed with "B-" / "I-"
 * @return one list of CoreLabels per sentence, in sentence order
 */
public static List<List<CoreLabel>> entityMentionsToCoreLabels(CoreMap dataset, Set<String> annotationsToSkip, boolean useSubTypes, boolean useBIO) {
  List<List<CoreLabel>> labeledDataset = new ArrayList<List<CoreLabel>>();
  List<CoreMap> allSentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);
  for (CoreMap current : allSentences) {
    // answer annotations are always requested here; mention types are not restricted (null)
    List<CoreLabel> tagged =
        sentenceEntityMentionsToCoreLabels(current, true, annotationsToSkip, null, useSubTypes, useBIO);
    assert tagged != null;
    labeledDataset.add(tagged);
  }
  return labeledDataset;
}
/**
 * Converts the labels of all entity mentions in this sentence to a sequence of CoreLabels.
 * Every token of the sentence is copied into a new CoreLabel. When answer annotations are
 * requested, each copy first receives the background symbol, and the tokens covered by the
 * head span of each accepted entity mention then receive a tag built from the mention's type.
 *
 * @param sentence the sentence to convert; its TokensAnnotation is read
 * @param addAnswerAnnotation if false, the token copies are returned without any AnswerAnnotation
 * @param annotationsToSkip entity types to ignore; null means skip nothing
 * @param mentionTypesToUse mention types to accept; null means accept all
 * @param useSubTypes if true and the mention has a subtype, the tag is "type-subtype"
 * @param useBIO if true, the first head token is tagged "B-tag" and the following ones "I-tag"
 * @return the new list of CoreLabels, one per token of the sentence
 */
public static List<CoreLabel> sentenceEntityMentionsToCoreLabels(
    CoreMap sentence,
    boolean addAnswerAnnotation,
    Set<String> annotationsToSkip,
    Set<String> mentionTypesToUse,
    boolean useSubTypes,
    boolean useBIO) {
  // use the token CoreLabels not the parser data => more robust
  List<CoreLabel> labels = new ArrayList<CoreLabel>();
  for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
    CoreLabel copy = new CoreLabel(token);
    if (addAnswerAnnotation) {
      copy.set(CoreAnnotations.AnswerAnnotation.class, SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL);
    }
    labels.add(copy);
  }

  if (!addAnswerAnnotation) {
    return labels;
  }

  // Add AnswerAnnotation from the types of the entity mentions
  List<EntityMention> entities = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
  if (entities == null) {
    return labels;
  }

  for (EntityMention entity : entities) {
    // is this a type that we should skip?
    if (annotationsToSkip != null && annotationsToSkip.contains(entity.getType())) continue;
    // is this a valid mention type?
    if (mentionTypesToUse != null && !mentionTypesToUse.contains(entity.getMentionType())) continue;
    // ignore entities without head span
    if (entity.getHead() == null) continue;

    // the type/subtype part of the tag is the same for every head token: build it once
    String baseTag = entity.getType();
    if (useSubTypes && entity.getSubType() != null) baseTag += "-" + entity.getSubType();

    int headStart = entity.getHeadTokenStart();
    int headEnd = entity.getHeadTokenEnd();
    for (int i = headStart; i < headEnd; i++) {
      String tag = baseTag;
      if (useBIO) {
        tag = (i == headStart ? "B-" : "I-") + baseTag;
      }
      labels.get(i).set(CoreAnnotations.AnswerAnnotation.class, tag);
    }
  }
  return labels;
}
/**
 * Returns the sentence at position {@code i} of the dataset's sentence list.
 *
 * @param dataset the dataset to read from
 * @param i index into the SentencesAnnotation list
 * @return the sentence stored at that index
 */
public static CoreMap getSentence(CoreMap dataset, int i) {
  List<CoreMap> sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);
  return sentences.get(i);
}
/**
 * Counts the sentences stored in the dataset.
 *
 * @param dataset the dataset to inspect
 * @return the size of the SentencesAnnotation list, or 0 if no such list is set
 */
public static int sentenceCount(CoreMap dataset) {
  List<CoreMap> stored = dataset.get(CoreAnnotations.SentencesAnnotation.class);
  return (stored == null) ? 0 : stored.size();
}
/**
 * Appends one sentence to the dataset's sentence list, creating the list if it is missing.
 *
 * @param dataset the dataset to modify
 * @param sentence the sentence to append
 */
public static void addSentence(CoreMap dataset, CoreMap sentence) {
  List<CoreMap> target = dataset.get(CoreAnnotations.SentencesAnnotation.class);
  if (target != null) {
    target.add(sentence);
    return;
  }
  // no list yet: register a fresh one on the dataset, then fill it
  List<CoreMap> created = new ArrayList<CoreMap>();
  dataset.set(CoreAnnotations.SentencesAnnotation.class, created);
  created.add(sentence);
}
/**
 * Appends all given sentences to the dataset's sentence list, creating the list if it is
 * missing. The sentences are added to the dataset's own list; the list passed in is not
 * stored.
 *
 * @param dataset the dataset to modify
 * @param sentences the sentences to append, in order
 */
public static void addSentences(CoreMap dataset, List<CoreMap> sentences) {
  List<CoreMap> sents = dataset.get(CoreAnnotations.SentencesAnnotation.class);
  if (sents == null) {
    sents = new ArrayList<CoreMap>();
    dataset.set(CoreAnnotations.SentencesAnnotation.class, sents);
  }
  // bulk add, matching how the other add*Mentions helpers in this class append collections
  sents.addAll(sentences);
}
/**
 * Creates a copy of the given dataset in which every sentence has new lists for all
 * mentions (entity, relation, event). Each sentence is copied with
 * {@link #sentenceDeepMentionCopy(Annotation)}.
 *
 * @param dataset the dataset to copy; every sentence in it must be an Annotation
 * @return a new Annotation holding the copied sentences
 * @throws RuntimeException if a sentence is not an instance of Annotation
 */
public static Annotation deepMentionCopy(CoreMap dataset) {
  Annotation copiedDataset = new Annotation("");
  List<CoreMap> copies = new ArrayList<CoreMap>();
  List<CoreMap> originals = dataset.get(CoreAnnotations.SentencesAnnotation.class);
  if (originals != null) {
    for (CoreMap original : originals) {
      if (original instanceof Annotation) {
        copies.add(sentenceDeepMentionCopy((Annotation) original));
      } else {
        throw new RuntimeException("ERROR: Sentences must instantiate Annotation!");
      }
    }
  }
  addSentences(copiedDataset, copies);
  return copiedDataset;
}
/**
 * Copies a sentence so that the copy owns new entity/relation/event mention lists.
 * Tokens, tree, dependency graphs and document id are carried over by reference, and the
 * ExtractionObjects inside the mention lists are NOT copied themselves: the new lists
 * contain the same mention objects.
 *
 * @param sentence the sentence to copy
 * @return a new Annotation built from the sentence's text and the annotations listed above
 */
public static Annotation sentenceDeepMentionCopy(Annotation sentence) {
  Annotation copy = new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class));

  // annotations shared by reference between the original and the copy
  copy.set(CoreAnnotations.TokensAnnotation.class,
      sentence.get(CoreAnnotations.TokensAnnotation.class));
  copy.set(TreeCoreAnnotations.TreeAnnotation.class,
      sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
  copy.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class,
      sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class));
  copy.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class,
      sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class));
  copy.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class,
      sentence.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class));
  copy.set(CoreAnnotations.DocIDAnnotation.class,
      sentence.get(CoreAnnotations.DocIDAnnotation.class));

  // mention lists: a new list per kind, only when the original has one
  List<EntityMention> entityMentions = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
  if (entityMentions != null) {
    copy.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, new ArrayList<EntityMention>(entityMentions));
  }
  List<RelationMention> relationMentions = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
  if (relationMentions != null) {
    copy.set(MachineReadingAnnotations.RelationMentionsAnnotation.class, new ArrayList<RelationMention>(relationMentions));
  }
  List<EventMention> eventMentions = sentence.get(MachineReadingAnnotations.EventMentionsAnnotation.class);
  if (eventMentions != null) {
    copy.set(MachineReadingAnnotations.EventMentionsAnnotation.class, new ArrayList<EventMention>(eventMentions));
  }
  return copy;
}
/**
 * Returns the first relation that holds between the given entities, as reported by
 * {@link #getRelations(RelationMentionFactory, CoreMap, ExtractionObject...)}.
 * If the sentence contains no relation between the entities, that is the unrelated
 * relation created by the factory-based fallback of getRelations.
 *
 * @param factory factory used to create the unrelated relation when nothing matches
 * @param sentence the sentence whose relation mentions are searched
 * @param args the arguments the relation must match
 * @return the first element of the list returned by getRelations
 */
public static RelationMention getRelation(RelationMentionFactory factory, CoreMap sentence, ExtractionObject ... args) {
  List<RelationMention> candidates = getRelations(factory, sentence, args);
  return candidates.get(0);
}
/**
 * Returns all relation mentions of the sentence whose arguments match the given ones
 * (as decided by {@code RelationMention.argsMatch}).
 * If none matches, the returned list contains a single relation created with
 * {@code RelationMention.createUnrelatedRelation}.
 *
 * @param factory factory passed to createUnrelatedRelation for the fallback
 * @param sentence the sentence whose RelationMentionsAnnotation is searched
 * @param args the arguments to match
 * @return a new, never empty list of relation mentions
 */
public static List<RelationMention> getRelations(RelationMentionFactory factory, CoreMap sentence, ExtractionObject... args) {
  List<RelationMention> matches = new ArrayList<RelationMention>();
  List<RelationMention> candidates = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
  if (candidates != null) {
    for (RelationMention candidate : candidates) {
      if (!candidate.argsMatch(args)) {
        continue;
      }
      matches.add(candidate);
    }
  }
  if (matches.isEmpty()) {
    // nothing matched: fall back to an explicit unrelated relation
    matches.add(RelationMention.createUnrelatedRelation(factory, args));
  }
  return matches;
}
/**
 * Gets the list of all relations and, optionally, all non-relations between the
 * EntityMentions in this sentence.
 * Use with care: with {@code createUnrelatedRelations} set this is an expensive call,
 * because getAllUnrelatedRelations creates a relation for every ordered pair of entity
 * mentions that has no existing relation.
 *
 * @param factory factory used to create the unrelated relations
 * @param sentence the sentence to read relation and entity mentions from
 * @param createUnrelatedRelations whether to append the unrelated relations
 * @return a new list with the existing relations followed by the unrelated ones, if requested
 */
public static List<RelationMention> getAllRelations(RelationMentionFactory factory, CoreMap sentence, boolean createUnrelatedRelations) {
  List<RelationMention> combined = new ArrayList<RelationMention>();
  List<RelationMention> existing = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
  if (existing != null) {
    combined.addAll(existing);
  }
  if (createUnrelatedRelations) {
    List<RelationMention> unrelated = getAllUnrelatedRelations(factory, sentence, true);
    combined.addAll(unrelated);
  }
  return combined;
}
/**
 * Creates an unrelated relation for every ordered pair of distinct entity mentions in
 * the sentence. When {@code checkExisting} is true, pairs whose arguments match an
 * existing relation mention of the sentence are left out.
 *
 * @param factory factory passed to {@code RelationMention.createUnrelatedRelation}
 * @param sentence the sentence to read entity (and optionally relation) mentions from
 * @param checkExisting whether existing relation mentions should suppress pairs
 * @return a new list of unrelated relations; empty if the sentence has no entity mentions
 */
public static List<RelationMention> getAllUnrelatedRelations(RelationMentionFactory factory, CoreMap sentence, boolean checkExisting) {
  List<RelationMention> known = null;
  if (checkExisting) {
    known = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
  }
  List<EntityMention> mentions = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
  List<RelationMention> unrelated = new ArrayList<RelationMention>();
  if (mentions == null) {
    return unrelated;
  }

  // scan all possible (ordered) argument pairs
  for (int first = 0; first < mentions.size(); first++) {
    for (int second = 0; second < mentions.size(); second++) {
      if (first == second) {
        continue;
      }
      EntityMention arg1 = mentions.get(first);
      EntityMention arg2 = mentions.get(second);

      boolean alreadyRelated = false;
      if (known != null) {
        for (RelationMention existing : known) {
          if (existing.argsMatch(arg1, arg2)) {
            alreadyRelated = true;
            break;
          }
        }
      }
      if (!alreadyRelated) {
        unrelated.add(RelationMention.createUnrelatedRelation(factory, arg1, arg2));
      }
    }
  }
  return unrelated;
}
/**
 * Appends one entity mention to the sentence, creating the mention list if it is missing.
 *
 * @param sentence the sentence to modify
 * @param arg the entity mention to append
 */
public static void addEntityMention(CoreMap sentence, EntityMention arg) {
  List<EntityMention> mentions = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
  if (mentions != null) {
    mentions.add(arg);
    return;
  }
  // no list yet: register a fresh one on the sentence, then fill it
  List<EntityMention> created = new ArrayList<EntityMention>();
  sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, created);
  created.add(arg);
}
/**
 * Appends all given entity mentions to the sentence, creating the mention list if it is
 * missing.
 *
 * @param sentence the sentence to modify
 * @param args the entity mentions to append
 */
public static void addEntityMentions(CoreMap sentence, Collection<EntityMention> args) {
  List<EntityMention> mentions = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
  if (mentions != null) {
    mentions.addAll(args);
    return;
  }
  // no list yet: register a fresh one on the sentence, then fill it
  List<EntityMention> created = new ArrayList<EntityMention>();
  sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, created);
  created.addAll(args);
}
/**
 * Returns a read-only view of the entity mentions stored in the sentence.
 * Declared static because this class has only a private constructor, so an instance
 * method could never be invoked from outside the class.
 *
 * @param sent the sentence to read from
 * @return an unmodifiable view of the sentence's entity mention list, or an empty
 *         list if the sentence has no EntityMentionsAnnotation
 */
public static List<EntityMention> getEntityMentions(CoreMap sent) {
  List<EntityMention> mentions = sent.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
  if (mentions == null) {
    // Collections.unmodifiableList rejects null, so report "no mentions" as an empty list
    return Collections.<EntityMention>emptyList();
  }
  return Collections.unmodifiableList(mentions);
}
/**
 * Appends one relation mention to the sentence, creating the mention list if it is missing.
 *
 * @param sentence the sentence to modify
 * @param arg the relation mention to append
 */
public static void addRelationMention(CoreMap sentence, RelationMention arg) {
  List<RelationMention> mentions = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
  if (mentions != null) {
    mentions.add(arg);
    return;
  }
  // no list yet: register a fresh one on the sentence, then fill it
  List<RelationMention> created = new ArrayList<RelationMention>();
  sentence.set(MachineReadingAnnotations.RelationMentionsAnnotation.class, created);
  created.add(arg);
}
/**
 * Appends all given relation mentions to the sentence, creating the mention list if it is
 * missing.
 *
 * @param sentence the sentence to modify
 * @param args the relation mentions to append
 */
public static void addRelationMentions(CoreMap sentence, Collection<RelationMention> args) {
  List<RelationMention> mentions = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
  if (mentions != null) {
    mentions.addAll(args);
    return;
  }
  // no list yet: register a fresh one on the sentence, then fill it
  List<RelationMention> created = new ArrayList<RelationMention>();
  sentence.set(MachineReadingAnnotations.RelationMentionsAnnotation.class, created);
  created.addAll(args);
}
/**
 * Returns a read-only view of the relation mentions stored in the sentence.
 * Declared static because this class has only a private constructor, so an instance
 * method could never be invoked from outside the class.
 *
 * @param sent the sentence to read from
 * @return an unmodifiable view of the sentence's relation mention list, or an empty
 *         list if the sentence has no RelationMentionsAnnotation
 */
public static List<RelationMention> getRelationMentions(CoreMap sent) {
  List<RelationMention> mentions = sent.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
  if (mentions == null) {
    // Collections.unmodifiableList rejects null, so report "no mentions" as an empty list
    return Collections.<RelationMention>emptyList();
  }
  return Collections.unmodifiableList(mentions);
}
/**
 * Appends one event mention to the sentence, creating the mention list if it is missing.
 *
 * @param sentence the sentence to modify
 * @param arg the event mention to append
 */
public static void addEventMention(CoreMap sentence, EventMention arg) {
  List<EventMention> mentions = sentence.get(MachineReadingAnnotations.EventMentionsAnnotation.class);
  if (mentions != null) {
    mentions.add(arg);
    return;
  }
  // no list yet: register a fresh one on the sentence, then fill it
  List<EventMention> created = new ArrayList<EventMention>();
  sentence.set(MachineReadingAnnotations.EventMentionsAnnotation.class, created);
  created.add(arg);
}
/**
 * Appends all given event mentions to the sentence, creating the mention list if it is
 * missing.
 *
 * @param sentence the sentence to modify
 * @param args the event mentions to append
 */
public static void addEventMentions(CoreMap sentence, Collection<EventMention> args) {
  List<EventMention> mentions = sentence.get(MachineReadingAnnotations.EventMentionsAnnotation.class);
  if (mentions != null) {
    mentions.addAll(args);
    return;
  }
  // no list yet: register a fresh one on the sentence, then fill it
  List<EventMention> created = new ArrayList<EventMention>();
  sentence.set(MachineReadingAnnotations.EventMentionsAnnotation.class, created);
  created.addAll(args);
}
/**
 * Returns a read-only view of the event mentions stored in the sentence.
 * Declared static because this class has only a private constructor, so an instance
 * method could never be invoked from outside the class.
 *
 * @param sent the sentence to read from
 * @return an unmodifiable view of the sentence's event mention list, or an empty
 *         list if the sentence has no EventMentionsAnnotation
 */
public static List<EventMention> getEventMentions(CoreMap sent) {
  List<EventMention> mentions = sent.get(MachineReadingAnnotations.EventMentionsAnnotation.class);
  if (mentions == null) {
    // Collections.unmodifiableList rejects null, so report "no mentions" as an empty list
    return Collections.<EventMention>emptyList();
  }
  return Collections.unmodifiableList(mentions);
}
/**
 * Prepare a string for printing in a spreadsheet for Mechanical Turk input.
 * Undoes tokenizer spacing and escapes for readability: removes the space before
 * punctuation and inside brackets, maps {@code -LRB-}/{@code -RRB-} to parentheses,
 * maps split quote pairs ({@code ` `} and {@code ' '}) to a double quote, and
 * maps {@code COMMA} to a comma. The replacements are applied in the order written.
 * This method does not add enclosing quotes or escape anything; see
 * {@link #excelify(String)} for that.
 *
 * @param s String to be formatted; may be null
 * @return the reformatted string, or the empty string if {@code s} is null
 */
public static String prettify(String s) {
  if (s == null) return "";
  return s
      .replace(" ,", ",")
      .replace(" .", ".")
      .replace(" :", ":")
      .replace("( ", "(")
      .replace("[ ", "[")
      .replace(" )", ")")
      .replace(" ]", "]")
      .replace(" - ", "-")
      // The quote-pair rules must run before the single-apostrophe rule below:
      // otherwise " '" -> "'" rewrites " ' '" into "''" and the closing quote is never produced.
      .replace("` ` ", "\"")
      .replace(" ' '", "\"")
      .replace(" '", "'")
      .replace("-LRB- ", "(")
      .replace(" -RRB-", ")")
      .replace(" COMMA", ",");
}
/**
 * Fetches the sentence text in a given token span: the words of the tokens from
 * {@code span.start()} (inclusive) to {@code span.end()} (exclusive), joined by single spaces.
 *
 * @param sent the sentence whose TokensAnnotation is read
 * @param span the token span to extract; must not be null
 * @return the space-separated words of the tokens in the span
 */
public static String getTextContent(CoreMap sent, Span span) {
  List<CoreLabel> sentenceTokens = sent.get(CoreAnnotations.TokensAnnotation.class);
  StringBuilder text = new StringBuilder();
  assert span != null;
  for (int pos = span.start(); pos < span.end(); pos++) {
    if (pos != span.start()) {
      text.append(' ');
    }
    text.append(sentenceTokens.get(pos).word());
  }
  return text.toString();
}
/**
 * Renders a sentence as text: the tokens joined by spaces and wrapped in double quotes
 * on the first line, followed by each relation mention of the sentence on its own line.
 *
 * @param sent the sentence to render
 * @return the textual form of the sentence and its relation mentions
 */
public static String sentenceToString(CoreMap sent) {
  StringBuilder out = new StringBuilder(512);
  List<CoreLabel> tokens = sent.get(CoreAnnotations.TokensAnnotation.class);
  out.append('"').append(StringUtils.join(tokens, " ")).append('"');
  out.append('\n');

  List<RelationMention> relations = sent.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
  if (relations != null) {
    for (RelationMention relation : relations) {
      out.append('\n').append(relation);
    }
  }

  // TODO: add entity and event mentions
  return out.toString();
}
/**
 * Renders the tokens of a sentence separated by spaces, appending "/NER" to every token
 * whose NER label is set and different from "O".
 *
 * @param sentence the sentence whose TokensAnnotation is read
 * @return the rendered tokens, or the empty string if the sentence has no tokens list
 */
public static String tokensAndNELabelsToString(CoreMap sentence) {
  StringBuilder out = new StringBuilder();
  List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
  if (tokens == null) {
    return out.toString();
  }
  String separator = "";
  for (CoreLabel token : tokens) {
    out.append(separator).append(token.word());
    String ner = token.ner();
    if (ner != null && !ner.equals("O")) {
      out.append('/').append(ner);
    }
    // every token after the first one is preceded by a single space
    separator = " ";
  }
  return out.toString();
}
/**
 * Concatenates the {@link #sentenceToString(CoreMap)} rendering of every sentence in the
 * dataset, in order.
 *
 * @param dataset the dataset to render
 * @return the concatenated renderings, or the empty string if there is no sentence list
 */
public static String datasetToString(CoreMap dataset){
  StringBuilder out = new StringBuilder();
  List<CoreMap> sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);
  if (sentences == null) {
    return out.toString();
  }
  for (CoreMap sentence : sentences) {
    out.append(sentenceToString(sentence));
  }
  return out.toString();
}
/*
public static List<CoreLabel> wordsToCoreLabels(List<Word> words) {
List<CoreLabel> labels = new ArrayList<CoreLabel>();
for(Word word: words){
CoreLabel l = new CoreLabel();
l.setWord(word.word());
l.set(CoreAnnotations.TextAnnotation.class, word.word());
l.setBeginPosition(word.beginPosition());
l.setEndPosition(word.endPosition());
labels.add(l);
}
return labels;
}
*/
/**
 * Renders tokens as {@code word{begin, end}} entries separated by single spaces, where
 * begin and end are the values of the token's beginPosition() and endPosition().
 *
 * @param tokens the tokens to render
 * @return the rendered token list
 */
public static String tokensToString(List<CoreLabel> tokens) {
  StringBuilder out = new StringBuilder();
  String separator = "";
  for (CoreLabel token : tokens) {
    out.append(separator);
    out.append(token.word())
        .append("{")
        .append(token.beginPosition())
        .append(", ")
        .append(token.endPosition())
        .append("}");
    // every token after the first one is preceded by a single space
    separator = " ";
  }
  return out.toString();
}
/*
public static boolean sentenceContainsSpan(CoreMap sentence, Span span) {
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
int sentenceStart = tokens.get(0).beginPosition();
int sentenceEnd = tokens.get(tokens.size() - 1).endPosition();
return sentenceStart <= span.start() && sentenceEnd >= span.end();
}
*/
/**
 * Shift the character offsets of all tokens by offset.
 *
 * @param tokens the tokens whose begin and end positions are updated in place
 * @param offset the amount added to every begin and end position
 */
public static void updateOffsets(List<Word> tokens, int offset) {
  for (Word token : tokens) {
    int shiftedBegin = token.beginPosition() + offset;
    token.setBeginPosition(shiftedBegin);
    int shiftedEnd = token.endPosition() + offset;
    token.setEndPosition(shiftedEnd);
  }
}
/**
 * Shift the character offsets of all tokens by offset.
 * Updates the CharacterOffsetBeginAnnotation and CharacterOffsetEndAnnotation of each token.
 *
 * @param tokens the tokens whose character offset annotations are updated in place
 * @param offset the amount added to every begin and end offset
 */
public static void updateOffsetsInCoreLabels(List<CoreLabel> tokens, int offset) {
  for (CoreLabel token : tokens) {
    int shiftedBegin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) + offset;
    token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, shiftedBegin);
    int shiftedEnd = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class) + offset;
    token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, shiftedEnd);
  }
}
/**
 * Process string to be a cell in Excel file.
 * Every double quote in the string is doubled and the whole result is wrapped in
 * double quotes.
 *
 * @param s the cell content
 * @return the quoted and escaped cell text
 */
public static String excelify(String s) {
  String escaped = s.replace("\"", "\"\"");
  StringBuilder cell = new StringBuilder(escaped.length() + 2);
  cell.append('"').append(escaped).append('"');
  return cell.toString();
}
/**
 * Reads an object from the given file with {@code IOUtils.readObjectFromFile}, treats it
 * as an {@link Annotation} and returns its sentences.
 * NOTE(review): presumably this relies on Java object deserialization (the method declares
 * ClassNotFoundException) — confirm, and only use it on trusted files.
 *
 * @param path path of the file to read
 * @return the SentencesAnnotation list of the stored Annotation
 * @throws IOException declared by this method; not thrown directly in its body
 * @throws ClassNotFoundException declared by this method; not thrown directly in its body
 */
public static List<CoreMap> readSentencesFromFile(String path) throws IOException, ClassNotFoundException {
  Annotation storedDocument = (Annotation) IOUtils.readObjectFromFile(path);
  List<CoreMap> storedSentences = storedDocument.get(CoreAnnotations.SentencesAnnotation.class);
  return storedSentences;
}
}