package edu.stanford.nlp.time;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.time.TimeAnnotations;
import edu.stanford.nlp.util.ArrayCoreMap;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.DataFilePaths;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.SystemUtils;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.*;
import java.util.regex.Pattern;
/**
* Annotates text with temporal expressions by running HeidelTime as an external process.
*
* HeidelTime's output is TimeML (TIMEX3) markup. GUTIME/TimeML specifications can be found at:
* <a href="http://www.timeml.org/site/tarsqi/modules/gutime/index.html">
* http://www.timeml.org/site/tarsqi/modules/gutime/index.html</a>.
*
* TODO: HeidelTime doesn't actually run on the NLP machines :( (TreeTagger doesn't run)
* @author Gabor Angeli
*/
public class HeidelTimeAnnotator implements Annotator {
private static final String BASE_PATH = "$NLP_DATA_HOME/packages/heideltime/";
private static final String DEFAULT_PATH = DataFilePaths.convert(BASE_PATH);
private final File heideltimePath;
private final boolean outputResults;
// if used in a pipeline or constructed with a Properties object,
// this property tells the annotator where to find the script
public static final String HEIDELTIME_PATH_PROPERTY = "heideltime.path";
public static final String HEIDELTIME_OUTPUT_RESULTS = "heideltime.outputResults";
public HeidelTimeAnnotator() {
this(new File(System.getProperty("heideltime", DEFAULT_PATH)));
}
public HeidelTimeAnnotator(File heideltimePath) {
this.heideltimePath = heideltimePath;
this.outputResults = false;
}
public HeidelTimeAnnotator(String name, Properties props) {
String path = props.getProperty(HEIDELTIME_PATH_PROPERTY,
System.getProperty("heideltime",
DEFAULT_PATH));
this.heideltimePath = new File(path);
this.outputResults =
Boolean.valueOf(props.getProperty(HEIDELTIME_OUTPUT_RESULTS, "false"));
}
public void annotate(Annotation annotation) {
try {
this.annotate((CoreMap)annotation);
} catch (IOException e) {
throw new RuntimeIOException(e);
}
}
public void annotate(CoreMap document) throws IOException {
//--Create Input File
//(create file)
File inputFile = File.createTempFile("heideltime", ".input");
//(write to file)
PrintWriter inputWriter = new PrintWriter(inputFile);
inputWriter.println(document.get(CoreAnnotations.TextAnnotation.class));
inputWriter.close();
//--Get Date
//(error checks)
if(!document.has(CoreAnnotations.CalendarAnnotation.class) && !document.has(CoreAnnotations.DocDateAnnotation.class)){
throw new IllegalArgumentException("CoreMap must have either a Calendar or DocDate annotation"); //not strictly necessary, technically...
}
//(variables)
Calendar dateCalendar = document.get(CoreAnnotations.CalendarAnnotation.class);
String pubDate = null;
if (dateCalendar != null) {
//(case: calendar annotation)
pubDate = String.format("%TF", dateCalendar);
} else {
//(case: docdateannotation)
String s = document.get(CoreAnnotations.DocDateAnnotation.class);
if (s != null) {
pubDate = s;
}
}
//--Build Command
ArrayList<String> args = new ArrayList<String>();
args.add("java");
args.add("-jar"); args.add(this.heideltimePath.getPath() + "/heideltime.jar");
args.add("-t"); args.add("NEWS");
if(pubDate != null){
args.add("-dct"); args.add(pubDate);
}
args.add(inputFile.getPath());
// run HeidelTime on the input file
ProcessBuilder process = new ProcessBuilder(args);
StringWriter outputWriter = new StringWriter();
SystemUtils.run(process, outputWriter, null);
String output = outputWriter.getBuffer().toString();
Pattern docClose = Pattern.compile("</DOC>.*", Pattern.DOTALL);
output = docClose.matcher(output).replaceAll("</DOC>").replaceAll("<!DOCTYPE TimeML SYSTEM \"TimeML.dtd\">",""); //TODO TimeML.dtd? FileNotFoundException if we leave it in
Pattern badNestedTimex = Pattern.compile(Pattern.quote("<T</TIMEX3>IMEX3"));
output = badNestedTimex.matcher(output).replaceAll("</TIMEX3><TIMEX3");
Pattern badNestedTimex2 = Pattern.compile(Pattern.quote("<TI</TIMEX3>MEX3"));
output = badNestedTimex2.matcher(output).replaceAll("</TIMEX3><TIMEX3");
output = output.replaceAll("\\n\\n<TimeML>\\n\\n","<TimeML>");
// parse the HeidelTime output
Element outputXML;
try {
outputXML = XMLUtils.parseElement(output);
} catch (Exception ex) {
throw new RuntimeException(String.format("error:\n%s\ninput:\n%s\noutput:\n%s",
ex, IOUtils.slurpFile(inputFile), output), ex);
}
inputFile.delete();
// get Timex annotations
List<CoreMap> timexAnns = toTimexCoreMaps(outputXML, document);
document.set(TimeAnnotations.TimexAnnotations.class, timexAnns);
if (outputResults) {
System.out.println(timexAnns);
}
// align Timex annotations to sentences
int timexIndex = 0;
for (CoreMap sentence: document.get(CoreAnnotations.SentencesAnnotation.class)) {
int sentBegin = beginOffset(sentence);
int sentEnd = endOffset(sentence);
// skip times before the sentence
while (timexIndex < timexAnns.size() && beginOffset(timexAnns.get(timexIndex)) < sentBegin) {
++timexIndex;
}
// determine times within the sentence
int sublistBegin = timexIndex;
int sublistEnd = timexIndex;
while (timexIndex < timexAnns.size() &&
sentBegin <= beginOffset(timexAnns.get(timexIndex)) &&
endOffset(timexAnns.get(timexIndex)) <= sentEnd) {
++sublistEnd;
++timexIndex;
}
// set the sentence timexes
sentence.set(TimeAnnotations.TimexAnnotations.class, timexAnns.subList(sublistBegin, sublistEnd));
}
}
private static int beginOffset(CoreMap ann) {
return ann.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
}
private static int endOffset(CoreMap ann) {
return ann.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
}
private static List<CoreMap> toTimexCoreMaps(Element docElem, CoreMap originalDocument) {
//--Collect Token Offsets
Map<Integer,Integer> beginMap = Generics.newHashMap();
Map<Integer,Integer> endMap = Generics.newHashMap();
boolean haveTokenOffsets = true;
for(CoreMap sent : originalDocument.get(CoreAnnotations.SentencesAnnotation.class)){
for(CoreLabel token : sent.get(CoreAnnotations.TokensAnnotation.class)){
Integer tokBegin = token.get(CoreAnnotations.TokenBeginAnnotation.class);
Integer tokEnd = token.get(CoreAnnotations.TokenEndAnnotation.class);
if(tokBegin == null || tokEnd == null){ haveTokenOffsets = false; }
int charBegin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
int charEnd = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
beginMap.put(charBegin,tokBegin);
endMap.put(charEnd,tokEnd);
}
}
List<CoreMap> timexMaps = new ArrayList<CoreMap>();
int offset = 0;
NodeList docNodes = docElem.getChildNodes();
for (int i = 0; i < docNodes.getLength(); i++) {
Node content = docNodes.item(i);
if (content instanceof Text) {
Text text = (Text)content;
offset += text.getWholeText().length();
} else if (content instanceof Element) {
Element child = (Element)content;
if (child.getNodeName().equals("TIMEX3")) {
Timex timex = new Timex(child);
if (child.getChildNodes().getLength() != 1) {
throw new RuntimeException("TIMEX3 should only contain text " + child);
}
String timexText = child.getTextContent();
CoreMap timexMap = new ArrayCoreMap();
timexMap.set(TimeAnnotations.TimexAnnotation.class, timex);
timexMap.set(CoreAnnotations.TextAnnotation.class, timexText);
int charBegin = offset;
timexMap.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
offset += timexText.length();
timexMap.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset);
int charEnd = offset;
//(tokens)
if(haveTokenOffsets){
Integer tokBegin = beginMap.get(charBegin);
int searchStep = 1; //if no exact match, search around the character offset
while(tokBegin == null){
tokBegin = beginMap.get(charBegin - searchStep);
if(tokBegin == null){
tokBegin = beginMap.get(charBegin + searchStep);
}
searchStep += 1;
}
searchStep = 1;
Integer tokEnd = endMap.get(charEnd);
while(tokEnd == null){
tokEnd = endMap.get(charEnd - searchStep);
if(tokEnd == null){
tokEnd = endMap.get(charEnd + searchStep);
}
searchStep += 1;
}
timexMap.set(CoreAnnotations.TokenBeginAnnotation.class, tokBegin);
timexMap.set(CoreAnnotations.TokenEndAnnotation.class, tokEnd);
}
timexMaps.add(timexMap);
} else {
throw new RuntimeException("unexpected element " + child);
}
} else {
throw new RuntimeException("unexpected content " + content);
}
}
return timexMaps;
}
@Override
public Set<Requirement> requires() {
return TOKENIZE_AND_SSPLIT;
}
@Override
public Set<Requirement> requirementsSatisfied() {
return Collections.singleton(HEIDELTIME_REQUIREMENT);
}
}