// Source code of edu.stanford.nlp.time.ParsedGigawordReader

package edu.stanford.nlp.time;


import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import nu.xom.Builder;
import nu.xom.Element;
import nu.xom.Elements;
import nu.xom.ParsingException;


import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.ArrayCoreMap;
import edu.stanford.nlp.util.CoreMap;
import java.util.function.Function;
import edu.stanford.nlp.util.Iterables;




/**
 * @author Karthik Raghunathan
 */
public class ParsedGigawordReader implements Iterable<Annotation> {


  private Iterable<File> files;


  public ParsedGigawordReader(File directory) {
    this.files = IOUtils.iterFilesRecursive(directory);
  }


  @Override
  public Iterator<Annotation> iterator() {
    return new Iterator<Annotation>() {
      private Iterator<BufferedReader> readers = Iterables.transform(files,
          file -> IOUtils.readerFromFile(file)).iterator();


      private BufferedReader reader = findReader();
      private Annotation annotation = findAnnotation();


      @Override
      public boolean hasNext() {
        return this.annotation != null;
      }


      @Override
      public Annotation next() {
        if (this.annotation == null) {
          throw new NoSuchElementException();
        }
        Annotation toReturn = this.annotation;
        this.annotation = this.findAnnotation();
        return toReturn;
      }


      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }


      private BufferedReader findReader() {
        return this.readers.hasNext() ? this.readers.next() : null;
      }


      private Annotation findAnnotation() {
        if (this.reader == null) {
          return null;
        }
        try {
          String line;
          StringBuilder doc = new StringBuilder();
          while ((line = this.reader.readLine()) != null) {
            doc.append(line);
            doc.append('\n');
//            if(line.contains("<DOC id")){
//              System.err.println(line);
//            }
            if (line.equals("</DOC>")) {
              break;
            }
            if (line.contains("</DOC>")) {
              throw new RuntimeException(String.format("invalid line '%s'", line));
            }
          }
          if (line == null) {
            this.reader.close();
            this.reader = findReader();
          }
          String xml = doc.toString().replaceAll("&", "&amp;");
          if(xml == null || xml.equals("")) {
            return findAnnotation();
          }


          xml = xml.replaceAll("num=([0-9]+) (.*)", "num=\"$1\" $2");
          xml = xml.replaceAll("sid=(.*)>", "sid=\"$1\">");
          xml = xml.replaceAll("</SENT>\n</DOC>", "</SENT>\n</TEXT>\n</DOC>");
          xml = new String(xml.getBytes(), "UTF8");
          //System.err.println("This is what goes in:\n" + xml);
          return toAnnotation(xml);
        } catch (IOException e) {
          throw new RuntimeIOException(e);
        }
      }
    };
  }


  private static final Pattern datePattern = Pattern.compile("^\\w+_\\w+_(\\d+)\\.");


  /*
   * Old implementation based on JDOM.
   * No longer maintained due to JDOM licensing issues.
  private static Annotation toAnnotation(String xml) throws IOException {
    Element docElem;
    try {
      docElem = new SAXBuilder().build(new StringReader(xml)).getRootElement();
    } catch (JDOMException e) {
      throw new RuntimeException(String.format("error:\n%s\ninput:\n%s", e, xml));
    }
    Element textElem = docElem.getChild("TEXT");
    StringBuilder text = new StringBuilder();
    int offset = 0;
    List<CoreMap> sentences = new ArrayList<CoreMap>();
    for (Object sentObj: textElem.getChildren("SENT")) {
      CoreMap sentence = new ArrayCoreMap();
      sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
      Element sentElem = (Element)sentObj;
      Tree tree = Tree.valueOf(sentElem.getText());
      List<CoreLabel> tokens = new ArrayList<CoreLabel>();
      List<Tree> preTerminals = preTerminals(tree);
      for (Tree preTerminal: preTerminals) {
        String posTag = preTerminal.value();
        for (Tree wordTree: preTerminal.children()) {
          String word = wordTree.value();
          CoreLabel token = new CoreLabel();
          token.set(CoreAnnotations.TextAnnotation.class, word);
          token.set(CoreAnnotations.TextAnnotation.class, word);
          token.set(CoreAnnotations.PartOfSpeechAnnotation.class, posTag);
          token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
          offset += word.length();
          token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset);
          text.append(word);
          text.append(' ');
          offset += 1;
          tokens.add(token);
        }
      }
      if (preTerminals.size() > 0) {
        text.setCharAt(text.length() - 1, '\n');
      }
      sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset - 1);
      sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
      sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
      sentences.add(sentence);
    }


    String docID = docElem.getAttributeValue("id");
    Matcher matcher = datePattern.matcher(docID);
    matcher.find();
    Calendar docDate = new Timex(matcher.group(1)).getDate();


    Annotation document = new Annotation(text.toString());
    document.set(CoreAnnotations.DocIDAnnotation.class, docID);
    document.set(CoreAnnotations.CalendarAnnotation.class, docDate);
    document.set(CoreAnnotations.SentencesAnnotation.class, sentences);
    return document;
  }
  */


  private static Annotation toAnnotation(String xml) throws IOException {
    Element docElem;
    try {
      Builder parser = new Builder();
      StringReader in = new StringReader(xml);
      docElem = parser.build(in).getRootElement();
    } catch (ParsingException e) {
      throw new RuntimeException(String.format("error:\n%s\ninput:\n%s", e, xml));
    } catch(IOException e) {
      throw new RuntimeException(String.format("error:\n%s\ninput:\n%s", e, xml));
    }


    Element textElem = docElem.getFirstChildElement("TEXT");
    StringBuilder text = new StringBuilder();
    int offset = 0;
    List<CoreMap> sentences = new ArrayList<CoreMap>();
    Elements sentenceElements = textElem.getChildElements("SENT");
    for (int crtsent = 0; crtsent < sentenceElements.size(); crtsent ++){
      Element sentElem = sentenceElements.get(crtsent);
      CoreMap sentence = new ArrayCoreMap();
      sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
      Tree tree = Tree.valueOf(sentElem.getChild(0).getValue()); // XXX ms: is this the same as sentElem.getText() in JDOM?
      List<CoreLabel> tokens = new ArrayList<CoreLabel>();
      List<Tree> preTerminals = preTerminals(tree);
      for (Tree preTerminal: preTerminals) {
        String posTag = preTerminal.value();
        for (Tree wordTree: preTerminal.children()) {
          String word = wordTree.value();
          CoreLabel token = new CoreLabel();
          token.set(CoreAnnotations.TextAnnotation.class, word);
          token.set(CoreAnnotations.TextAnnotation.class, word);
          token.set(CoreAnnotations.PartOfSpeechAnnotation.class, posTag);
          token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
          offset += word.length();
          token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset);
          text.append(word);
          text.append(' ');
          offset += 1;
          tokens.add(token);
        }
      }
      if (preTerminals.size() > 0) {
        text.setCharAt(text.length() - 1, '\n');
      }
      sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset - 1);
      sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
      sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
      sentences.add(sentence);
    }


    String docID = docElem.getAttributeValue("id");
    Matcher matcher = datePattern.matcher(docID);
    matcher.find();
    Calendar docDate = new Timex("DATE", matcher.group(1)).getDate();


    Annotation document = new Annotation(text.toString());
    document.set(CoreAnnotations.DocIDAnnotation.class, docID);
    document.set(CoreAnnotations.CalendarAnnotation.class, docDate);
    document.set(CoreAnnotations.SentencesAnnotation.class, sentences);
    return document;
  }


  // todo [cdm 2013]: replace the methods below with ones in Tree?
  // It depends on whether the code is somehow using preterminals with multiple children.


  private static List<Tree> preTerminals(Tree tree) {
    List<Tree> preTerminals = new ArrayList<Tree>();
    for (Tree descendant: tree) {
      if (isPreterminal(descendant)) {
        preTerminals.add(descendant);
      }
    }
    return preTerminals;
  }


  private static boolean isPreterminal(Tree tree) {
    if (tree.isLeaf()) {
      return false;
    }
    for (Tree child: tree.children()) {
      if (!child.isLeaf()) {
        return false;
      }
    }
    return true;
  }


}
// Source code of edu.stanford.nlp.time.ParsedGigawordReader

// Related classes of edu.stanford.nlp.time.ParsedGigawordReader