package edu.stanford.nlp.time;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.ReaderInputStream;
import edu.stanford.nlp.io.TeeStream;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.MatchedExpression;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.stats.PrecisionRecallStats;
import edu.stanford.nlp.util.*;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import java.io.*;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.function.Function;
import java.util.logging.LogManager;
import java.util.regex.Pattern;
/**
* Main program for testing SUTime
* <br>
* Processing a text string:
* <pre>
* -in.type TEXT
* -date YYYY-MM-dd
* -i <text>
* -o <output file>
* </pre>
*
* Processing a text file:
* <pre>
* -in.type TEXTFILE
* -date YYYY-MM-dd
* -i input.txt
* -o <output file>
* </pre>
*
* Running on Timebank
* <pre>
* -in.type TIMEBANK_CSV
* -i timebank.csv
* -tempeval2.dct dct.txt
* -o <output directory>
* -eval <evaluation script>
* </pre>
*
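 * Processing TimeML files (TempEval-3 style input; paths are placeholders):
 * <pre>
 * -in.type TEMPEVAL3
 * -i <input TimeML file or directory>
 * -o <output file or directory>
 * </pre>
 *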
* Evaluating on Tempeval2
* <pre>
* -in.type TEMPEVAL2
* -i <directory with english data>
* -o <output directory>
* -eval <evaluation script>
* -tempeval2.dct dct file (with document creation times)
*
* TEMPEVAL2 (download from http://timeml.org/site/timebank/timebank.html)
 * Evaluation is token-based.
*
* TRAINING (english):
*
* GUTIME:
* precision 0.88
* recall 0.71
* f1-measure 0.79
* accuracy 0.98
* attribute type 0.92
 * attribute value 0.31   // low score here is due to a difference in value format (no "-" or ":" in dates)
*
* After fixing some formats for GUTIME:
 * (GUTIME syntax is inconsistent at times, e.g. 1991W 8WE, 19980212EV)
* attribute value 0.67
*
* SUTIME:
* Default: sutime.teRelHeurLevel=NONE, restrictToTimex3=false
* precision 0.873
* recall 0.897
* f1-measure 0.885
* accuracy 0.991
*
* P R F1
* attribute type 0.918 | 0.751 0.802 0.776
* attribute value 0.762 | 0.623 0.665 0.644
*
* P R F1
* mention attribute type 0.900 | 0.780 0.833 0.805
* mention attribute value 0.742 | 0.643 0.687 0.664
*
* sutime.teRelHeurLevel=MORE, restrictToTimex3=true
* precision 0.876
* recall 0.889
* f1-measure 0.882
* accuracy 0.991
* P R F1
* attribute type 0.918 | 0.744 0.798 0.770
* attribute value 0.776 | 0.629 0.675 0.651
*
* P R F1
* mention attribute type 0.901 | 0.780 0.836 0.807
* mention attribute value 0.750 | 0.649 0.696 0.672
*
* ------------------------------------------------------------------------------
* TEST (english):
*
* GUTIME:
* precision 0.89
* recall 0.79
* f1-measure 0.84
* accuracy 0.99
*
* attribute type 0.95
* attribute value 0.68
*
* SUTIME:
* Default: sutime.teRelHeurLevel=NONE, restrictToTimex3=false
* precision 0.878
* recall 0.963
* f1-measure 0.918
* accuracy 0.996
*
* P R F1
* attribute type 0.953 | 0.820 0.904 0.860
* attribute value 0.791 | 0.680 0.750 0.713
*
* P R F1
* mention attribute type 0.954 | 0.837 0.923 0.878
* mention attribute value 0.781 | 0.686 0.756 0.720
*
* sutime.teRelHeurLevel=MORE, restrictToTimex3=true
* precision 0.881
* recall 0.963
* f1-measure 0.920
* accuracy 0.995
* P R F1
* attribute type 0.959 | 0.821 0.910 0.863
* attribute value 0.818 | 0.699 0.776 0.736
*
* P R F1
* mention attribute type 0.961 | 0.844 0.936 0.888
* mention attribute value 0.803 | 0.705 0.782 0.742
*
* </pre>
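 * <br>
 * The pipeline can also be driven programmatically; a minimal sketch using this
 * class's own helpers (the text and date below are just example values):
 * <pre>
 *   Properties props = new Properties();
 *   AnnotationPipeline pipeline = SUTimeMain.getPipeline(props, true);
 *   // out == null prints the annotated XML to System.out
 *   SUTimeMain.processText(pipeline, "We will meet next Tuesday.", null, "2013-02-04");
 * </pre>
 *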
* @author Angel Chang
*/
public class SUTimeMain {
protected static String PYTHON = null;
private SUTimeMain() {} // static class
/*
* Other Time corpora: (see also http://timeml.org/site/timebank/timebank.html)
* LDC2006T08 TimeBank 1.2 (Uses TIMEX3)
* LDC2005T07 ACE Time Normalization (TERN) 2004 English Training Data v 1.0 (Uses TIMEX2)
* GUTime achieved .85, .78, and .82 F-measure for timex2, text, and val fields
* LDC2010T18 ACE Time Normalization (TERN) 2004 English Evaluation Data V1.0
*/
////////////////////////////////////////////////////////////////////////////////////////
private static class EvalStats {
PrecisionRecallStats prStats = new PrecisionRecallStats();
// PrecisionRecallStats tokenPrStats = new PrecisionRecallStats();
PrecisionRecallStats valPrStats = new PrecisionRecallStats();
PrecisionRecallStats estPrStats = new PrecisionRecallStats();
}
private static class TimebankTimex {
String timexId;
String timexVal;
String timexOrigVal;
String timexStr;
int tid;
private TimebankTimex(String timexId, String timexVal, String timexOrigVal, String timexStr) {
this.timexId = timexId;
this.timexVal = timexVal;
this.timexOrigVal = timexOrigVal;
this.timexStr = timexStr;
if (timexId != null && timexId.length() > 0) {
tid = Integer.parseInt(timexId);
}
}
}
private static class TimebankSent {
boolean initialized = false;
String docId;
@SuppressWarnings("unused")
String docFilename;
String docPubDate;
String sentId;
String text;
List<TimebankTimex> timexes = new ArrayList<TimebankTimex>();
List<String> origItems = new ArrayList<String>();
public boolean add(String item) {
String[] fields = item.split("\\s*\\|\\s*", 9);
String docId = fields[0];
String docFilename = fields[1];
String docPubDate = fields[2];
String sentId = fields[3];
String sent = fields[8];
if (initialized) {
// check compatibility;
if (!docId.equals(this.docId) || !sentId.equals(this.sentId)) {
return false;
}
} else {
this.docId = docId;
this.docFilename = docFilename;
this.docPubDate = docPubDate;
this.sentId = sentId;
this.text = sent;
initialized = true;
}
origItems.add(item);
String timexId = fields[4];
String timexVal = fields[5];
String timexOrigVal = fields[6];
String timexStr = fields[7];
if (timexId != null && timexId.length() > 0) {
timexes.add(new TimebankTimex(timexId, timexVal, timexOrigVal, timexStr));
}
return true;
}
}
//Overall: PrecisionRecallStats[tp=877,fp=199,fn=386,p=0.82 (877/1076),r=0.69 (877/1263),f1=0.75]
//Value: PrecisionRecallStats[tp=229,fp=199,fn=1034,p=0.54 (229/428),r=0.18 (229/1263),f1=0.27]
// Process one item from timebank CSV file
private static void processTimebankCsvSent(AnnotationPipeline pipeline, TimebankSent sent, PrintWriter pw, EvalStats evalStats)
{
if (sent != null) {
Collections.sort(sent.timexes, (o1, o2) -> {
if (o1.tid == o2.tid) { return 0; }
else return (o1.tid < o2.tid)? -1:1;
});
pw.println();
for (String item:sent.origItems) {
pw.println("PROC |" + item);
}
Annotation annotation = new Annotation(sent.text);
annotation.set(CoreAnnotations.DocDateAnnotation.class, sent.docPubDate);
pipeline.annotate(annotation);
List<CoreMap> timexes = annotation.get(TimeAnnotations.TimexAnnotations.class);
int i = 0;
for (CoreMap t:timexes) {
String[] newFields;
if (sent.timexes.size() > i) {
String res;
TimebankTimex goldTimex = sent.timexes.get(i);
Timex guessTimex = t.get(TimeAnnotations.TimexAnnotation.class);
String s1 = goldTimex.timexStr.replaceAll("\\s+", "");
String s2 = guessTimex.text().replaceAll("\\s+", "");
if (s1.equals(s2)) {
evalStats.estPrStats.incrementTP();
res = "OK";
} else {
evalStats.estPrStats.incrementFP();
evalStats.estPrStats.incrementFN();
res = "BAD";
}
newFields = new String[] { res, goldTimex.timexId, goldTimex.timexVal, goldTimex.timexOrigVal, goldTimex.timexStr,
t.get(TimeAnnotations.TimexAnnotation.class).toString() };
i++;
} else {
newFields = new String[] { "NONE" , t.get(TimeAnnotations.TimexAnnotation.class).toString()};
evalStats.estPrStats.incrementFP();
}
pw.println("GOT | "+ StringUtils.join(newFields, "|"));
}
for (; i < sent.timexes.size(); i++) {
evalStats.estPrStats.incrementFN();
}
i = 0;
int lastIndex = 0;
for (TimebankTimex goldTimex:sent.timexes) {
int index = sent.text.indexOf(goldTimex.timexStr, lastIndex);
int endIndex = index + goldTimex.timexStr.length();
boolean found = false;
for (; i < timexes.size(); i++) {
CoreMap t = timexes.get(i);
if (t.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) >= endIndex) {
break;
} else {
if (t.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) >= index) {
found = true;
evalStats.prStats.incrementTP();
if (goldTimex.timexOrigVal.equals(t.get(TimeAnnotations.TimexAnnotation.class).value())) {
evalStats.valPrStats.incrementTP();
} else {
evalStats.valPrStats.incrementFN();
}
} else {
evalStats.prStats.incrementFP();
evalStats.valPrStats.incrementFP();
}
}
}
if (!found) {
evalStats.prStats.incrementFN();
evalStats.valPrStats.incrementFN();
}
lastIndex = endIndex;
}
for (; i < timexes.size(); i++) {
evalStats.prStats.incrementFP();
evalStats.valPrStats.incrementFP();
}
}
}
// Process CSV file with just timebank sentences with time expressions
public static void processTimebankCsv(AnnotationPipeline pipeline, String in, String out, String eval) throws IOException
{
BufferedReader br = IOUtils.getBufferedFileReader(in);
PrintWriter pw = (out != null)? IOUtils.getPrintWriter(out):new PrintWriter(System.out);
String line;
// boolean dataStarted = false;
boolean dataStarted = true;
TimebankSent sent = new TimebankSent();
String item = null;
EvalStats evalStats = new EvalStats();
line = br.readLine(); // Skip first line
while ((line = br.readLine()) != null) {
if (line.trim().length() == 0) continue;
if (dataStarted) {
if (line.contains("|")) {
if (item != null) {
boolean addOld = sent.add(item);
if (!addOld) {
processTimebankCsvSent(pipeline, sent, pw, evalStats);
sent = new TimebankSent();
sent.add(item);
}
}
item = line;
} else {
item += " " + line;
}
} else {
if (line.matches("#+ BEGIN DATA #+")) {
dataStarted = true;
}
}
}
if (item != null) {
boolean addOld = sent.add(item);
if (!addOld) {
processTimebankCsvSent(pipeline, sent, pw, evalStats);
sent = new TimebankSent();
sent.add(item);
}
processTimebankCsvSent(pipeline, sent, pw, evalStats);
}
br.close();
if (out != null) { pw.close(); }
System.out.println("Estimate: " + evalStats.estPrStats.toString(2));
System.out.println("Overall: " + evalStats.prStats.toString(2));
System.out.println("Value: " + evalStats.valPrStats.toString(2));
}
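  // Joins the tokens l[start..end) as word/POS-tag pairs separated by glue
  // (used to build the context strings written to the attribute debug files)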
public static String joinWordTags(List<? extends CoreMap> l, String glue, int start, int end) {
return StringUtils.join(l, glue, in -> in.get(CoreAnnotations.TextAnnotation.class) + "/" + in.get(CoreAnnotations.PartOfSpeechAnnotation.class), start, end);
}
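  // Annotates one TempEval-2 document and writes the results:
  //   extPw/attrPw   - token-based timex extent/attribute lines in the TempEval-2 .tab format
  //   debugPw        - one line per system timex (for debugging)
  //   attrDebugPwGold/attrDebugPw - tab-separated gold vs. system timex attributes with context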
private static void processTempEval2Doc(AnnotationPipeline pipeline, Annotation docAnnotation,
Map<String, List<TimexAttributes>> timexMap,
PrintWriter extPw, PrintWriter attrPw, PrintWriter debugPw,
PrintWriter attrDebugPwGold, PrintWriter attrDebugPw)
{
pipeline.annotate(docAnnotation);
String docId = docAnnotation.get(CoreAnnotations.DocIDAnnotation.class);
String docDate = docAnnotation.get(CoreAnnotations.DocDateAnnotation.class);
List<CoreMap> sents = docAnnotation.get(CoreAnnotations.SentencesAnnotation.class);
if (timexMap != null) {
List<TimexAttributes> golds = updateTimexText(timexMap, docAnnotation);
if (attrDebugPwGold != null && golds != null) {
for (TimexAttributes g:golds) {
String[] newFields = { docId, docDate,
String.valueOf(g.sentIndex),
String.valueOf(g.tokenStart),
String.valueOf(g.tokenEnd),
/*g.tid, */ g.type, g.value, g.text, g.context };
attrDebugPwGold.println(StringUtils.join(newFields, "\t"));
}
}
}
if (attrDebugPw != null) {
for (CoreMap sent:sents) {
List<CoreMap> timexes = sent.get(TimeAnnotations.TimexAnnotations.class);
if (timexes != null) {
for (CoreMap t:timexes) {
Timex timex = t.get(TimeAnnotations.TimexAnnotation.class);
int sentIndex = sent.get(CoreAnnotations.SentenceIndexAnnotation.class);
int sentTokenStart = sent.get(CoreAnnotations.TokenBeginAnnotation.class);
int tokenStart;
int tokenEnd;
if (t.containsKey(CoreAnnotations.TokenBeginAnnotation.class)) {
tokenStart = t.get(CoreAnnotations.TokenBeginAnnotation.class) - sentTokenStart;
tokenEnd = t.get(CoreAnnotations.TokenEndAnnotation.class) - sentTokenStart;
} else {
CoreMap cm = ChunkAnnotationUtils.getAnnotatedChunkUsingCharOffsets(docAnnotation,
t.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
t.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
tokenStart = cm.get(CoreAnnotations.TokenBeginAnnotation.class) - sentTokenStart;
tokenEnd = cm.get(CoreAnnotations.TokenEndAnnotation.class) - sentTokenStart;
}
String context = joinWordTags(sent.get(CoreAnnotations.TokensAnnotation.class), " ", tokenStart-3, tokenEnd+3);
String[] newFields = { docId, docDate,
String.valueOf(sentIndex),
String.valueOf(tokenStart), String.valueOf(tokenEnd),
/*timex.tid(), */ timex.timexType(), timex.value(), timex.text(), context};
attrDebugPw.println(StringUtils.join(newFields, "\t"));
}
}
}
}
if (debugPw != null) {
List<CoreMap> timexes = docAnnotation.get(TimeAnnotations.TimexAnnotations.class);
for (CoreMap t:timexes) {
String[] newFields = { docId, docDate, t.get(TimeAnnotations.TimexAnnotation.class).toString() };
debugPw.println("GOT | "+ StringUtils.join(newFields, "|"));
}
}
if (extPw != null || attrPw != null) {
for (CoreMap sent:sents) {
int sentTokenBegin = sent.get(CoreAnnotations.TokenBeginAnnotation.class);
for (CoreMap t:sent.get(TimeAnnotations.TimexAnnotations.class)) {
Timex tmx = t.get(TimeAnnotations.TimexAnnotation.class);
List<CoreLabel> tokens = t.get(CoreAnnotations.TokensAnnotation.class);
int tokenIndex = 0;
if (tokens == null) {
CoreMap cm = ChunkAnnotationUtils.getAnnotatedChunkUsingCharOffsets(docAnnotation,
t.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
t.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
tokens = cm.get(CoreAnnotations.TokensAnnotation.class);
tokenIndex = cm.get(CoreAnnotations.TokenBeginAnnotation.class);
} else {
tokenIndex = t.get(CoreAnnotations.TokenBeginAnnotation.class);
}
tokenIndex = tokenIndex - sentTokenBegin;
String sentenceIndex = String.valueOf(sent.get(CoreAnnotations.SentenceIndexAnnotation.class));
int tokenCount = 0;
for (@SuppressWarnings("unused") CoreLabel token:tokens) {
String[] extFields = {
docId,
sentenceIndex,
String.valueOf(tokenIndex),
"timex3",
tmx.tid(),
"1"};
String extString = StringUtils.join(extFields, "\t");
if (extPw != null) extPw.println(extString);
if (attrPw != null /* && tokenCount == 0 */) {
String[] attrFields = {
"type",
tmx.timexType(),
};
attrPw.println(extString + "\t" + StringUtils.join(attrFields, "\t"));
if (tmx.value() != null) {
String val = tmx.value();
// Fix up expression values (needed for GUTime)
if (useGUTime) {
if ("TIME".equals(tmx.timexType())) {
if (val.matches("T\\d{4}")) {
val = "T" + val.substring(1,3) + ":" + val.substring(3,5);
}
} else if ("DATE".equals(tmx.timexType())) {
if (val.matches("\\d{8}T.*")) {
val = val.substring(0,4) + "-" + val.substring(4,6) + "-" + val.substring(6);
} else if (val.matches("\\d{8}")) {
val = val.substring(0,4) + "-" + val.substring(4,6) + "-" + val.substring(6,8);
} else if (val.matches("\\d\\d\\d\\d..")) {
val = val.substring(0,4) + "-" + val.substring(4,6);
} else if (val.matches("[0-9X]{4}W[0-9X]{2}.*")) {
if (val.length() > 7) {
val = val.substring(0,4) + "-" + val.substring(4,7) + "-" + val.substring(7);
} else {
val = val.substring(0,4) + "-" + val.substring(4,7);
}
}
}
} /*else {
// SUTIME
if ("DATE".equals(tmx.timexType())) {
if (val.matches("\\d\\d\\dX")) {
val = val.substring(0,3); // Convert 199X to 199
}
}
} */
attrFields[0] = "value";
attrFields[1] = val;
attrPw.println(extString + "\t" + StringUtils.join(attrFields, "\t"));
}
}
tokenIndex++;
tokenCount++;
}
}
}
}
}
private static CoreLabelTokenFactory tokenFactory = new CoreLabelTokenFactory();
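  // Creates a sentence CoreMap from a list of words: the sentence text is the words
  // joined by spaces and each word becomes a CoreLabel token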
private static CoreMap wordsToSentence(List<String> sentWords)
{
String sentText = StringUtils.join(sentWords, " ");
Annotation sentence = new Annotation(sentText);
List<CoreLabel> tokens = new ArrayList<CoreLabel>(sentWords.size());
for (String text:sentWords) {
CoreLabel token = tokenFactory.makeToken();
token.set(CoreAnnotations.TextAnnotation.class, text);
tokens.add(token);
}
sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
return sentence;
}
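  // Assembles a document Annotation from pre-tokenized sentences: sets the doc id and date,
  // accumulates the document tokens, numbers the sentences, and assigns token and sentence
  // character offsets assuming a single space between tokens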
public static Annotation sentencesToDocument(String documentID, String docDate, List<CoreMap> sentences)
{
String docText = ChunkAnnotationUtils.getTokenText(sentences, CoreAnnotations.TextAnnotation.class);
Annotation document = new Annotation(docText);
document.set(CoreAnnotations.DocIDAnnotation.class, documentID);
document.set(CoreAnnotations.DocDateAnnotation.class, docDate);
document.set(CoreAnnotations.SentencesAnnotation.class, sentences);
// Accumulate docTokens and label sentence with overall token begin/end, and sentence index annotations
List<CoreLabel> docTokens = new ArrayList<CoreLabel>();
int sentenceIndex = 0;
int tokenBegin = 0;
for (CoreMap sentenceAnnotation:sentences) {
List<CoreLabel> sentenceTokens = sentenceAnnotation.get(CoreAnnotations.TokensAnnotation.class);
docTokens.addAll(sentenceTokens);
int tokenEnd = tokenBegin + sentenceTokens.size();
sentenceAnnotation.set(CoreAnnotations.TokenBeginAnnotation.class, tokenBegin);
sentenceAnnotation.set(CoreAnnotations.TokenEndAnnotation.class, tokenEnd);
sentenceAnnotation.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex);
sentenceIndex++;
tokenBegin = tokenEnd;
}
document.set(CoreAnnotations.TokensAnnotation.class, docTokens);
// Put in character offsets
int i = 0;
for (CoreLabel token:docTokens) {
String tokenText = token.get(CoreAnnotations.TextAnnotation.class);
token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, i);
i+=tokenText.length();
token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, i);
i++; // Skip space
}
for (CoreMap sentenceAnnotation:sentences) {
List<CoreLabel> sentenceTokens = sentenceAnnotation.get(CoreAnnotations.TokensAnnotation.class);
sentenceAnnotation.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class,
sentenceTokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
sentenceAnnotation.set(CoreAnnotations.CharacterOffsetEndAnnotation.class,
sentenceTokens.get(sentenceTokens.size()-1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
}
return document;
}
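  // Gold timex annotation (extent, type, value) read from the TempEval-2 .tab files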
private static class TimexAttributes {
public String tid;
public int sentIndex;
public int tokenStart;
public int tokenEnd;
public String text;
public String type;
public String value;
public String context;
public TimexAttributes(String tid, int sentIndex, int tokenIndex) {
this.tid = tid;
this.sentIndex = sentIndex;
this.tokenStart = tokenIndex;
this.tokenEnd = tokenIndex + 1;
}
}
private static TimexAttributes findTimex(Map<String,List<TimexAttributes>> timexMap, String docId, String tid)
{
// Find entry
List<TimexAttributes> list = timexMap.get(docId);
for (TimexAttributes timex:list) {
if (timex.tid.equals(tid)) {
return timex;
}
}
return null;
}
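  // Fills in the text and context fields of the gold timexes for this document
  // from the document's tokens, and returns the document's gold timex list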
private static List<TimexAttributes> updateTimexText(Map<String,List<TimexAttributes>> timexMap, Annotation docAnnotation)
{
// Find entry
String docId = docAnnotation.get(CoreAnnotations.DocIDAnnotation.class);
List<CoreMap> sents = docAnnotation.get(CoreAnnotations.SentencesAnnotation.class);
List<TimexAttributes> list = timexMap.get(docId);
if (list != null) {
for (TimexAttributes timex:list) {
CoreMap sent = sents.get(timex.sentIndex);
List<CoreLabel> tokens = sent.get(CoreAnnotations.TokensAnnotation.class);
timex.text = StringUtils.joinWords(tokens, " ", timex.tokenStart, timex.tokenEnd);
timex.context = joinWordTags(tokens, " ", timex.tokenStart - 3, timex.tokenEnd + 3);
/* StringBuilder sb = new StringBuilder("");
for (int i = timex.tokenStart; i < timex.tokenEnd; i++) {
if (sb.length() > 0) { sb.append(" "); }
sb.append(tokens.get(i).word());
}
timex.text = sb.toString();
// Get context
sb.setLength(0);
int c1 = Math.max(0, timex.tokenStart - 3);
int c2 = Math.min(tokens.size(), timex.tokenEnd + 3);
for (int i = c1; i < c2; i++) {
if (sb.length() > 0) { sb.append(" "); }
sb.append(tokens.get(i).word());
}
timex.context = sb.toString(); */
}
return list;
}
return null;
}
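  // Reads the gold TempEval-2 timex extents and attributes files into a map
  // from document id to the timexes in that document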
private static Map<String,List<TimexAttributes>> readTimexAttrExts(String extentsFile, String attrsFile) throws IOException
{
Map<String,List<TimexAttributes>> timexMap = Generics.newHashMap();
BufferedReader extBr = IOUtils.getBufferedFileReader(extentsFile);
String line;
String lastDocId = null;
TimexAttributes lastTimex = null;
while ((line = extBr.readLine()) != null) {
if (line.trim().length() == 0) continue;
// Simple tab delimited file
String[] fields = line.split("\t");
String docName = fields[0];
int sentNo = Integer.parseInt(fields[1]);
int tokenNo = Integer.parseInt(fields[2]);
String tid = fields[4];
if (lastDocId != null && lastDocId.equals(docName) && lastTimex != null && lastTimex.tid.equals(tid)) {
// Expand previous
assert(lastTimex.sentIndex == sentNo);
lastTimex.tokenEnd = tokenNo + 1;
} else {
lastDocId = docName;
lastTimex = new TimexAttributes(tid, sentNo, tokenNo);
List<TimexAttributes> list = timexMap.get(docName);
if (list == null) {
timexMap.put(docName, list = new ArrayList<TimexAttributes>());
}
list.add(lastTimex);
}
}
extBr.close();
BufferedReader attrBr = IOUtils.getBufferedFileReader(attrsFile);
while ((line = attrBr.readLine()) != null) {
if (line.trim().length() == 0) continue;
// Simple tab delimited file
String[] fields = line.split("\t");
String docName = fields[0];
int sentNo = Integer.parseInt(fields[1]);
int tokenNo = Integer.parseInt(fields[2]);
String tid = fields[4];
String attrname = fields[6];
String attrvalue = fields[7];
// Find entry
TimexAttributes timex = findTimex(timexMap, docName, tid);
assert(timex.sentIndex == sentNo);
assert(timex.tokenStart <= tokenNo && timex.tokenEnd > tokenNo);
switch (attrname) {
case "type":
assert (timex.type == null || timex.type.equals(attrvalue));
timex.type = attrvalue;
break;
case "value":
assert (timex.value == null || timex.value.equals(attrvalue));
timex.value = attrvalue;
break;
default:
throw new RuntimeException("Error processing " + attrsFile + ":" +
"Unknown attribute " + attrname + ": from line " + line);
}
}
attrBr.close();
return timexMap;
}
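  // Reads the TempEval-2 base-segmentation.tab file, rebuilds each document from its
  // tokenized sentences, runs the pipeline on it, and writes the result .tab files to out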
public static void processTempEval2Tab(AnnotationPipeline pipeline, String in, String out, Map<String,String> docDates) throws IOException
{
Map<String,List<TimexAttributes>> timexMap = readTimexAttrExts(in + "/timex-extents.tab", in + "/timex-attributes.tab");
BufferedReader br = IOUtils.getBufferedFileReader(in + "/base-segmentation.tab");
PrintWriter debugPw = IOUtils.getPrintWriter(out + "/timex-debug.out");
PrintWriter attrPw = IOUtils.getPrintWriter(out + "/timex-attrs.res.tab");
PrintWriter extPw = IOUtils.getPrintWriter(out + "/timex-extents.res.tab");
PrintWriter attrDebugPwGold = IOUtils.getPrintWriter(out + "/timex-attrs.debug.gold.tab");
PrintWriter attrDebugPw = IOUtils.getPrintWriter(out + "/timex-attrs.debug.res.tab");
String line;
String curDocName = null;
int curSentNo = -1;
List<String> tokens = null;
List<CoreMap> sentences = null;
while ((line = br.readLine()) != null) {
if (line.trim().length() == 0) continue;
// Simple tab delimited file
String[] fields = line.split("\t");
String docName = fields[0];
int sentNo = Integer.parseInt(fields[1]);
//int tokenNo = Integer.parseInt(fields[2]);
String tokenText = fields[3];
// Create little annotation with sentences and tokens
if (!docName.equals(curDocName)) {
if (curDocName != null) {
// Process document
CoreMap lastSentence = wordsToSentence(tokens);
sentences.add(lastSentence);
Annotation docAnnotation = sentencesToDocument(curDocName, docDates.get(curDocName), sentences);
processTempEval2Doc(pipeline, docAnnotation, timexMap, extPw, attrPw, debugPw, attrDebugPwGold, attrDebugPw);
curDocName = null;
}
// New doc
tokens = new ArrayList<String>();
sentences = new ArrayList<CoreMap>();
} else if (curSentNo != sentNo) {
CoreMap lastSentence = wordsToSentence(tokens);
sentences.add(lastSentence);
tokens = new ArrayList<String>();
}
tokens.add(tokenText);
curDocName = docName;
curSentNo = sentNo;
}
if (curDocName != null) {
// Process document
CoreMap lastSentence = wordsToSentence(tokens);
sentences.add(lastSentence);
Annotation docAnnotation = sentencesToDocument(curDocName, docDates.get(curDocName), sentences);
processTempEval2Doc(pipeline, docAnnotation, timexMap, extPw, attrPw, debugPw, attrDebugPwGold, attrDebugPw);
curDocName = null;
}
br.close();
extPw.close();
attrPw.close();
debugPw.close();
attrDebugPwGold.close();
attrDebugPw.close();
}
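  // Runs the pipeline over a TempEval-2 data directory: reads the document creation times
  // (reformatting them if the time annotator requires it), processes the .tab files, and
  // optionally runs the TempEval-2 scorer, teeing its output to out/scores.txt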
public static void processTempEval2(AnnotationPipeline pipeline, String in, String out, String eval, String dct) throws IOException, ParseException
{
Map<String,String> docDates = (dct != null)? IOUtils.readMap(dct):IOUtils.readMap(in + "/dct.txt");
if (requiredDocDateFormat != null) {
// convert from yyyyMMdd to requiredDocDateFormat
DateFormat defaultFormatter = new SimpleDateFormat("yyyyMMdd");
DateFormat requiredFormatter = new SimpleDateFormat(requiredDocDateFormat);
for (String docId:docDates.keySet()) {
Date date = defaultFormatter.parse(docDates.get(docId));
docDates.put(docId, requiredFormatter.format(date));
}
}
processTempEval2Tab(pipeline, in, out, docDates);
if (eval != null) {
List<String> command = new ArrayList<String>();
if (PYTHON != null) {
command.add(PYTHON);
}
command.add(eval);
command.add(in + "/base-segmentation.tab");
command.add(in + "/timex-extents.tab");
command.add(out + "/timex-extents.res.tab");
command.add(in + "/timex-attributes.tab");
command.add(out + "/timex-attrs.res.tab");
ProcessBuilder pb = new ProcessBuilder(command);
FileOutputStream evalFileOutput = new FileOutputStream(out + "/scores.txt");
Writer output = new OutputStreamWriter(
new TeeStream(System.out, evalFileOutput));
SystemUtils.run(pb, output, null);
evalFileOutput.close();
}
}
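  // Runs the pipeline over a single TempEval-3 TimeML file, or over all .tml/.TE3input
  // files in a directory, writing annotated TimeML to the corresponding paths under out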
public static void processTempEval3(AnnotationPipeline pipeline, String in, String out, String evalCmd) throws Exception
{
// Process files
File inFile = new File(in);
if (inFile.isDirectory()) {
// input is a directory - process files in directory
Pattern teinputPattern = Pattern.compile("\\.(TE3input|tml)$");
Iterable<File> files = IOUtils.iterFilesRecursive(inFile, teinputPattern);
File outDir = new File(out);
outDir.mkdirs();
for (File file: files) {
String inputFilename = file.getAbsolutePath();
String outputFilename = inputFilename.replace(in, out).replace(".TE3input", "");
if (!outputFilename.equalsIgnoreCase(inputFilename)) {
//System.out.println(inputFilename + " => " + outputFilename);
processTempEval3File(pipeline, inputFilename, outputFilename);
} else {
        System.err.println("ABORTING: Input file and output file are the same - " + inputFilename);
System.exit(-1);
}
}
} else {
// input is a file - process file
processTempEval3File(pipeline, in, out);
}
// Evaluate
if (evalCmd != null) {
// TODO: apply eval command
}
}
public static void processTempEval3File(AnnotationPipeline pipeline, String in, String out) throws Exception
{
// Process one tempeval file
Document doc = edu.stanford.nlp.util.XMLUtils.readDocumentFromFile(in);
Node timemlNode = XMLUtils.getNode(doc, "TimeML");
Node docIdNode = XMLUtils.getNode(timemlNode, "DOCID");
Node dctNode = XMLUtils.getNode(timemlNode, "DCT");
Node dctTimexNode = XMLUtils.getNode(dctNode, "TIMEX3");
Node titleNode = XMLUtils.getNode(timemlNode, "TITLE");
Node extraInfoNode = XMLUtils.getNode(timemlNode, "EXTRA_INFO");
Node textNode = XMLUtils.getNode(timemlNode, "TEXT");
String date = XMLUtils.getAttributeValue(dctTimexNode, "value");
String text = textNode.getTextContent();
Annotation annotation = textToAnnotation(pipeline, text, date);
Element annotatedTextElem = annotationToTmlTextElement(annotation);
Document annotatedDoc = XMLUtils.createDocument();
Node newTimemlNode = annotatedDoc.importNode(timemlNode, false);
newTimemlNode.appendChild(annotatedDoc.importNode(docIdNode, true));
newTimemlNode.appendChild(annotatedDoc.importNode(dctNode, true));
if (titleNode != null) {
newTimemlNode.appendChild(annotatedDoc.importNode(titleNode, true));
}
if (extraInfoNode != null) {
newTimemlNode.appendChild(annotatedDoc.importNode(extraInfoNode, true));
}
newTimemlNode.appendChild(annotatedDoc.adoptNode(annotatedTextElem));
annotatedDoc.appendChild(newTimemlNode);
PrintWriter pw = (out != null)? IOUtils.getPrintWriter(out):new PrintWriter(System.out);
String string = XMLUtils.documentToString(annotatedDoc);
pw.println(string);
pw.flush();
if (out != null) pw.close();
}
private static String requiredDocDateFormat;
private static boolean useGUTime = false;
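  // Builds the annotation pipeline: tokenizer and sentence splitter (if tokenize is true),
  // POS tagger, and the time annotator selected by the "timeAnnotator" property
  // (sutime by default, or gutime / heideltime)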
public static AnnotationPipeline getPipeline(Properties props, boolean tokenize) throws Exception
{
// useGUTime = Boolean.parseBoolean(props.getProperty("gutime", "false"));
AnnotationPipeline pipeline = new AnnotationPipeline();
if (tokenize) {
pipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
}
pipeline.addAnnotator(new POSTaggerAnnotator(false));
// pipeline.addAnnotator(new NumberAnnotator(false));
// pipeline.addAnnotator(new QuantifiableEntityNormalizingAnnotator(false, false));
String timeAnnotator = props.getProperty("timeAnnotator", "sutime");
switch (timeAnnotator) {
case "gutime":
useGUTime = true;
pipeline.addAnnotator(new GUTimeAnnotator());
break;
case "heideltime":
requiredDocDateFormat = "yyyy-MM-dd";
pipeline.addAnnotator(new HeidelTimeAnnotator("heideltime", props));
break;
case "sutime":
pipeline.addAnnotator(new TimeAnnotator("sutime", props));
break;
default:
throw new IllegalArgumentException("Unknown timeAnnotator: " + timeAnnotator);
}
return pipeline;
}
enum InputType { TEXTFILE, TEXT, TIMEBANK_CSV, TEMPEVAL2, TEMPEVAL3 }
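  // Creates the output directory if needed and configures java.util.logging:
  // SEVERE and above goes to the console, INFO and above from edu.stanford.nlp goes to out/err.log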
public static void configLogger(String out) throws IOException {
File outDir = new File(out);
if (!outDir.exists()) {
outDir.mkdirs();
}
StringBuilder sb = new StringBuilder();
sb.append("handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler\n");
sb.append(".level=SEVERE\n");
sb.append("edu.stanford.nlp.level=INFO\n");
sb.append("java.util.logging.ConsoleHandler.level=SEVERE\n");
sb.append("java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter\n");
sb.append("java.util.logging.FileHandler.level=INFO\n");
sb.append("java.util.logging.FileHandler.pattern=" + out + "/err.log" + "\n");
LogManager.getLogManager().readConfiguration(new ReaderInputStream(new StringReader(sb.toString())));
}
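  // Converts the timex annotations over str into a list of DOM nodes that interleave
  // plain text nodes with TIMEX3 elements; timexes nested inside another timex are
  // recursively attached to their containing element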
private static List<Node> createTimexNodes(String str, Integer charBeginOffset, List<CoreMap> timexAnns) {
List<ValuedInterval<CoreMap,Integer>> timexList = new ArrayList<ValuedInterval<CoreMap,Integer>>(timexAnns.size());
for (CoreMap timexAnn:timexAnns) {
timexList.add(new ValuedInterval<CoreMap, Integer>(timexAnn,
MatchedExpression.COREMAP_TO_CHAR_OFFSETS_INTERVAL_FUNC.apply(timexAnn)));
}
Collections.sort(timexList, HasInterval.CONTAINS_FIRST_ENDPOINTS_COMPARATOR );
return createTimexNodesPresorted(str, charBeginOffset, timexList);
}
private static List<Node> createTimexNodesPresorted(String str, Integer charBeginOffset, List<ValuedInterval<CoreMap,Integer>> timexList) {
if (charBeginOffset == null) charBeginOffset = 0;
List<Node> nodes = new ArrayList<Node>();
int previousEnd = 0;
List<Element> timexElems = new ArrayList<Element>();
List<ValuedInterval<CoreMap,Integer>> processed = new ArrayList<ValuedInterval<CoreMap,Integer>>();
CollectionValuedMap<Integer, ValuedInterval<CoreMap,Integer>> unprocessed =
new CollectionValuedMap<Integer, ValuedInterval<CoreMap,Integer>>(CollectionFactory.<ValuedInterval<CoreMap,Integer>>arrayListFactory());
for (ValuedInterval<CoreMap,Integer> v:timexList) {
CoreMap timexAnn = v.getValue();
int begin = timexAnn.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) - charBeginOffset;
int end = timexAnn.get(CoreAnnotations.CharacterOffsetEndAnnotation.class) - charBeginOffset;
if (begin >= previousEnd) {
// Add text
nodes.add(XMLUtils.createTextNode(str.substring(previousEnd, begin)));
// Add timex
Timex timex = timexAnn.get(TimeAnnotations.TimexAnnotation.class);
Element timexElem = timex.toXmlElement();
nodes.add(timexElem);
previousEnd = end;
// For handling nested timexes
processed.add(v);
timexElems.add(timexElem);
} else {
unprocessed.add(processed.size()-1, v);
}
}
if (previousEnd < str.length()) {
nodes.add(XMLUtils.createTextNode(str.substring(previousEnd)));
}
for (Integer i:unprocessed.keySet()) {
ValuedInterval<CoreMap, Integer> v = processed.get(i);
String elemStr = v.getValue().get(CoreAnnotations.TextAnnotation.class);
int charStart = v.getValue().get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
List<Node> innerElems = createTimexNodesPresorted(elemStr, charStart, (List<ValuedInterval<CoreMap, Integer>>) unprocessed.get(i));
Element timexElem = timexElems.get(i);
XMLUtils.removeChildren(timexElem);
for (Node n:innerElems) {
timexElem.appendChild(n);
}
}
return nodes;
}
public static void processTextFile(AnnotationPipeline pipeline, String in, String out, String date) throws IOException
{
String text = IOUtils.slurpFile(in);
PrintWriter pw = (out != null)? IOUtils.getPrintWriter(out):new PrintWriter(System.out);
String string = textToAnnotatedXml(pipeline, text, date);
pw.println(string);
pw.flush();
if (out != null) pw.close();
}
public static void processText(AnnotationPipeline pipeline, String text, String out, String date) throws IOException
{
PrintWriter pw = (out != null)? IOUtils.getPrintWriter(out):new PrintWriter(System.out);
String string = textToAnnotatedXml(pipeline, text, date);
pw.println(string);
pw.flush();
if (out != null) pw.close();
}
public static String textToAnnotatedXml(AnnotationPipeline pipeline, String text, String date) {
Annotation annotation = textToAnnotation(pipeline, text, date);
Document xmlDoc = annotationToXmlDocument(annotation);
return XMLUtils.documentToString(xmlDoc);
}
public static Element annotationToTmlTextElement(Annotation annotation) {
List<CoreMap> timexAnnsAll = annotation.get(TimeAnnotations.TimexAnnotations.class);
Element textElem = XMLUtils.createElement("TEXT");
List<Node> timexNodes = createTimexNodes(
annotation.get(CoreAnnotations.TextAnnotation.class),
annotation.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
timexAnnsAll);
for (Node node:timexNodes) {
textElem.appendChild(node);
}
return textElem;
}
public static Document annotationToXmlDocument(Annotation annotation)
{
Element dateElem = XMLUtils.createElement("DATE");
dateElem.setTextContent(annotation.get(CoreAnnotations.DocDateAnnotation.class));
Element textElem = annotationToTmlTextElement(annotation);
Element docElem = XMLUtils.createElement("DOC");
docElem.appendChild(dateElem);
docElem.appendChild(textElem);
// Create document and import elements into this document....
Document doc = XMLUtils.createDocument();
doc.appendChild(doc.importNode(docElem, true));
return doc;
}
public static Annotation textToAnnotation(AnnotationPipeline pipeline, String text, String date)
{
Annotation annotation = new Annotation(text);
annotation.set(CoreAnnotations.DocDateAnnotation.class, date);
pipeline.annotate(annotation);
return annotation;
}
public static void main(String[] args) throws Exception {
// Process arguments
Properties props = StringUtils.argsToProperties(args);
String in = props.getProperty("i");
String date = props.getProperty("date");
String dct = props.getProperty("tempeval2.dct");
String out = props.getProperty("o");
String inputTypeStr = props.getProperty("in.type", InputType.TEXT.name());
String eval = props.getProperty("eval");
PYTHON = props.getProperty("python", PYTHON);
InputType inputType = InputType.valueOf(inputTypeStr);
AnnotationPipeline pipeline;
switch (inputType) {
case TEXT:
pipeline = getPipeline(props, true);
processText(pipeline, in, out, date);
break;
case TEXTFILE:
pipeline = getPipeline(props, true);
processTextFile(pipeline, in, out, date);
break;
case TIMEBANK_CSV:
configLogger(out);
pipeline = getPipeline(props, true);
processTimebankCsv(pipeline, in, out, eval);
break;
case TEMPEVAL2:
configLogger(out);
pipeline = getPipeline(props, false);
processTempEval2(pipeline, in, out, eval, dct);
break;
case TEMPEVAL3:
pipeline = getPipeline(props, true);
processTempEval3(pipeline, in, out, eval);
break;
}
}
}