package edu.stanford.nlp.time;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import nu.xom.Builder;
import nu.xom.Element;
import nu.xom.Elements;
import nu.xom.ParsingException;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.ArrayCoreMap;
import edu.stanford.nlp.util.CoreMap;
import java.util.function.Function;
import edu.stanford.nlp.util.Iterables;
/**
 * Iterates over the documents stored in the files below a directory, turning
 * each chunk of lines that ends with a {@code </DOC>} line into an
 * {@link Annotation} carrying the document text, id, date and sentences
 * (tokens with part-of-speech tags plus the parse tree).
 *
 * <p>The input is not required to be well-formed XML: unquoted {@code num=}
 * and {@code sid=} attributes, bare ampersands and a missing {@code </TEXT>}
 * end tag are repaired before the chunk is handed to the XML parser.
 *
 * @author Karthik Raghunathan
 */
public class ParsedGigawordReader implements Iterable<Annotation> {

  /** Matches an unquoted numeric {@code num=} attribute followed by the rest of the line. */
  private static final Pattern UNQUOTED_NUM = Pattern.compile("num=([0-9]+) (.*)");

  /** Matches an unquoted {@code sid=} attribute that runs up to the closing {@code >}. */
  private static final Pattern UNQUOTED_SID = Pattern.compile("sid=(.*)>");

  /** Extracts the digit run (the document date) from ids shaped like {@code word_word_digits.rest}. */
  private static final Pattern datePattern = Pattern.compile("^\\w+_\\w+_(\\d+)\\.");

  /** All files found recursively below the directory given to the constructor. */
  private final Iterable<File> files;

  /**
   * Creates a reader over every file found recursively below {@code directory}.
   *
   * @param directory root directory of the corpus
   */
  public ParsedGigawordReader(File directory) {
    this.files = IOUtils.iterFilesRecursive(directory);
  }

  /**
   * Returns an iterator that reads the files one after another and yields one
   * {@link Annotation} per document. The next document is always read ahead,
   * so parsing errors surface from {@code iterator()} or {@code next()}.
   * A file's reader is closed only once that file has been read to its end.
   *
   * @return a read-only iterator over the documents; {@code remove()} is unsupported
   */
  @Override
  public Iterator<Annotation> iterator() {
    return new Iterator<Annotation>() {

      // Initialization order matters: each field initializer uses the previous one.
      private Iterator<BufferedReader> readers = Iterables.transform(files,
          file -> IOUtils.readerFromFile(file)).iterator();
      private BufferedReader reader = findReader();
      private Annotation annotation = findAnnotation();

      @Override
      public boolean hasNext() {
        return this.annotation != null;
      }

      @Override
      public Annotation next() {
        if (this.annotation == null) {
          throw new NoSuchElementException();
        }
        Annotation toReturn = this.annotation;
        this.annotation = this.findAnnotation();
        return toReturn;
      }

      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }

      /** Returns the reader for the next file, or {@code null} when no files are left. */
      private BufferedReader findReader() {
        return this.readers.hasNext() ? this.readers.next() : null;
      }

      /**
       * Reads lines up to and including the next {@code </DOC>} line and
       * converts them into an annotation. Empty chunks (for example the end
       * of a file right after its last document) are skipped.
       *
       * @return the next document, or {@code null} when all files are exhausted
       * @throws RuntimeException if a line contains {@code </DOC>} together with other text
       * @throws RuntimeIOException if reading or closing a file fails
       */
      private Annotation findAnnotation() {
        try {
          // Loop instead of recursing so a long run of empty chunks or files
          // cannot grow the call stack.
          while (this.reader != null) {
            StringBuilder doc = new StringBuilder();
            String line;
            while ((line = this.reader.readLine()) != null) {
              doc.append(line);
              doc.append('\n');
              if (line.equals("</DOC>")) {
                break;
              }
              if (line.contains("</DOC>")) {
                throw new RuntimeException(String.format("invalid line '%s'", line));
              }
            }
            if (line == null) {
              // End of the current file: release it and move on to the next one.
              this.reader.close();
              this.reader = findReader();
            }
            if (doc.length() == 0) {
              continue;
            }

            // Escape bare ampersands so the XML parser does not reject them.
            String xml = doc.toString().replace("&", "&amp;");
            // Quote attribute values that appear unquoted in the input.
            xml = UNQUOTED_NUM.matcher(xml).replaceAll("num=\"$1\" $2");
            xml = UNQUOTED_SID.matcher(xml).replaceAll("sid=\"$1\">");
            // Insert the </TEXT> end tag when the document closes right after a sentence.
            xml = xml.replace("</SENT>\n</DOC>", "</SENT>\n</TEXT>\n</DOC>");
            // NOTE(review): encodes with the platform default charset and decodes as
            // UTF-8; this is a no-op only when the default charset is UTF-8. Kept
            // unchanged because the charset used by IOUtils.readerFromFile is not
            // visible here -- confirm whether this re-decoding is still wanted.
            xml = new String(xml.getBytes(), "UTF8");
            return toAnnotation(xml);
          }
          return null;
        } catch (IOException e) {
          throw new RuntimeIOException(e);
        }
      }
    };
  }

  /**
   * Parses one repaired {@code DOC} element into an {@link Annotation}.
   *
   * <p>The document text is rebuilt from the leaves of each sentence's parse
   * tree: words are joined by single spaces and every non-empty sentence ends
   * with a newline. Character offsets on tokens and sentences refer to that
   * rebuilt text. The document date is taken from the digits in the document id.
   *
   * <p>This XOM-based version replaces an older JDOM-based implementation that
   * was dropped because of JDOM licensing issues.
   *
   * @param xml the XML source of a single document
   * @return the annotation with doc id, calendar date and sentences set
   * @throws RuntimeException if the XML cannot be parsed or lacks a
   *     {@code TEXT} element or an {@code id} attribute
   * @throws IllegalStateException if the document id does not contain a date
   * @throws IOException never thrown by this implementation; kept in the signature
   */
  private static Annotation toAnnotation(String xml) throws IOException {
    Element docElem;
    try {
      docElem = new Builder().build(new StringReader(xml)).getRootElement();
    } catch (ParsingException | IOException e) {
      // Keep the original message format, but preserve the cause for debugging.
      throw new RuntimeException(String.format("error:\n%s\ninput:\n%s", e, xml), e);
    }

    Element textElem = docElem.getFirstChildElement("TEXT");
    if (textElem == null) {
      throw new RuntimeException(String.format("no TEXT element in document:\n%s", xml));
    }

    StringBuilder text = new StringBuilder();
    int offset = 0;
    List<CoreMap> sentences = new ArrayList<>();
    Elements sentenceElements = textElem.getChildElements("SENT");
    for (int i = 0; i < sentenceElements.size(); i++) {
      Element sentElem = sentenceElements.get(i);
      CoreMap sentence = new ArrayCoreMap();
      sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);

      // NOTE(review): reads only the first child node of the SENT element; an
      // earlier author asked whether this matches JDOM's sentElem.getText().
      Tree tree = Tree.valueOf(sentElem.getChild(0).getValue());
      List<CoreLabel> tokens = new ArrayList<>();
      List<Tree> preTerminals = preTerminals(tree);
      for (Tree preTerminal : preTerminals) {
        String posTag = preTerminal.value();
        for (Tree wordTree : preTerminal.children()) {
          String word = wordTree.value();
          CoreLabel token = new CoreLabel();
          token.set(CoreAnnotations.TextAnnotation.class, word);
          token.set(CoreAnnotations.PartOfSpeechAnnotation.class, posTag);
          token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
          offset += word.length();
          token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset);
          text.append(word);
          text.append(' ');
          offset += 1; // account for the separator appended after the word
          tokens.add(token);
        }
      }

      boolean hasTokens = !preTerminals.isEmpty();
      if (hasTokens) {
        // Turn the separator after the last word into the sentence terminator.
        text.setCharAt(text.length() - 1, '\n');
      }
      // The sentence ends before its trailing newline; a sentence without
      // tokens gets an empty span instead of an end that precedes its begin.
      sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class,
          hasTokens ? offset - 1 : offset);
      sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
      sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
      sentences.add(sentence);
    }

    String docID = docElem.getAttributeValue("id");
    if (docID == null) {
      throw new RuntimeException(String.format("no id attribute on document:\n%s", xml));
    }
    Matcher matcher = datePattern.matcher(docID);
    if (!matcher.find()) {
      throw new IllegalStateException(
          String.format("cannot extract a date from document id '%s'", docID));
    }
    Calendar docDate = new Timex("DATE", matcher.group(1)).getDate();

    Annotation document = new Annotation(text.toString());
    document.set(CoreAnnotations.DocIDAnnotation.class, docID);
    document.set(CoreAnnotations.CalendarAnnotation.class, docDate);
    document.set(CoreAnnotations.SentencesAnnotation.class, sentences);
    return document;
  }

  // todo [cdm 2013]: replace the methods below with ones in Tree?
  // It depends on whether the code is somehow using preterminals with multiple children.

  /**
   * Collects every node of {@code tree} (in the tree's own iteration order)
   * that is a preterminal as defined by {@link #isPreterminal(Tree)}.
   *
   * @param tree the parse tree to scan
   * @return the preterminal nodes, possibly empty
   */
  private static List<Tree> preTerminals(Tree tree) {
    List<Tree> preTerminals = new ArrayList<>();
    for (Tree descendant : tree) {
      if (isPreterminal(descendant)) {
        preTerminals.add(descendant);
      }
    }
    return preTerminals;
  }

  /**
   * Tells whether {@code tree} is a non-leaf node all of whose children are
   * leaves. Unlike a strict preterminal, such a node may have several children.
   *
   * @param tree the node to test
   * @return {@code true} if the node is not a leaf and has only leaf children
   */
  private static boolean isPreterminal(Tree tree) {
    if (tree.isLeaf()) {
      return false;
    }
    for (Tree child : tree.children()) {
      if (!child.isLeaf()) {
        return false;
      }
    }
    return true;
  }
}