// Source Code of edu.stanford.nlp.ie.machinereading.domains.roth.RothCONLL04Reader

package edu.stanford.nlp.ie.machinereading.domains.roth;


import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;


import edu.stanford.nlp.ie.machinereading.GenericDataSetReader;
import edu.stanford.nlp.ie.machinereading.structure.AnnotationUtils;
import edu.stanford.nlp.ie.machinereading.structure.EntityMention;
import edu.stanford.nlp.ie.machinereading.structure.ExtractionObject;
import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
import edu.stanford.nlp.ie.machinereading.structure.RelationMention;
import edu.stanford.nlp.ie.machinereading.structure.Span;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.StringUtils;


/**
 *
 * @author Mihai, David McClosky, and agusev
 * @author Sonal Gupta (sonalg@stanford.edu)
 *
 */
public class RothCONLL04Reader extends GenericDataSetReader {
  public RothCONLL04Reader() {
    super(null, true, true, true);


    // change the logger to one from our namespace
    logger = Logger.getLogger(RothCONLL04Reader.class.getName());
    // run quietly by default
    logger.setLevel(Level.SEVERE);
  }


  @Override
  public Annotation read(String path) throws IOException {
    Annotation doc = new Annotation("");


    logger.info("Reading file: " + path);
    Iterator<String> lineIterator = IOUtils.readLines(path).iterator();


    // Each iteration through this loop processes a single sentence
    // along with any relations in it
    while (lineIterator.hasNext()) {
      Annotation sentence = readSentence(doc, path, lineIterator);
      AnnotationUtils.addSentence(doc, sentence);
    }


    return doc;
  }




  private static String getNormalizedNERTag(String ner){
    if(ner.equalsIgnoreCase("O"))
      return "O";
    else if(ner.equalsIgnoreCase("Peop"))
      return "PERSON";
    else if(ner.equalsIgnoreCase("Loc"))
      return "LOCATION";
    else if(ner.equalsIgnoreCase("Org"))
      return "ORGANIZATION";
    else if(ner.equalsIgnoreCase("Other"))
      return "OTHER";
    throw new RuntimeException("Cannot normalize ner tag " + ner);
  }


  private static Annotation readSentence(Annotation doc, String docId, Iterator<String> lineIterator) {
    Annotation sentence = new Annotation("");
    sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
    sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, new ArrayList<EntityMention>());
    // we'll need to set things like the tokens and textContent after we've
    // fully read the sentence


    // contains the full text that we've read so far
    StringBuilder textContent = new StringBuilder();
    int tokenCount = 0; // how many tokens we've seen so far
    List<CoreLabel> tokens = new ArrayList<CoreLabel>();


    // when we've seen two blank lines in a row, this sentence is over (one
    // blank line separates the sentence and the relations
    int numBlankLinesSeen = 0;
    String sentenceID = null;


    // keeps tracks of entities we've seen so far for use by relations
    Map<String, EntityMention> indexToEntityMention = new HashMap<String, EntityMention>();


    while (lineIterator.hasNext() && numBlankLinesSeen < 2) {
      String currentLine = lineIterator.next();
      currentLine = currentLine.replace("COMMA", ",");


      List<String> pieces = StringUtils.split(currentLine);
      String identifier;


      int size = pieces.size();
      switch (size) {
      case 1: // blank line between sentences or relations
        numBlankLinesSeen++;
        break;
      case 3: // relation
        String type = pieces.get(2);
        List<ExtractionObject> args = new ArrayList<ExtractionObject>();
        EntityMention entity1 = indexToEntityMention.get(pieces.get(0));
        EntityMention entity2 = indexToEntityMention.get(pieces.get(1));
        args.add(entity1);
        args.add(entity2);
        Span span = new Span(entity1.getExtentTokenStart(), entity2
            .getExtentTokenEnd());
        // identifier = "relation" + sentenceID + "-" + sentence.getAllRelations().size();
        identifier = RelationMention.makeUniqueId();
        RelationMention relationMention = new RelationMention(identifier,
            sentence, span, type, null, args);
        AnnotationUtils.addRelationMention(sentence, relationMention);
        break;
      case 9: // token
        /*
         * Roth token lines look like this:
         *
         * 19 Peop 9 O NNP/NNP Jamal/Ghosheh O O O
         */


        // Entities may be multiple words joined by '/'; we split these up
        List<String> words = StringUtils.split(pieces.get(5), "/");
        //List<String> postags = StringUtils.split(pieces.get(4),"/");


        String text = StringUtils.join(words, " ");
        identifier = "entity" + pieces.get(0) + "-" + pieces.get(2);
        String nerTag = getNormalizedNERTag(pieces.get(1)); // entity type of the word/expression


        if (sentenceID == null)
          sentenceID = pieces.get(0);


        if (!nerTag.equals("O")) {
          Span extentSpan = new Span(tokenCount, tokenCount + words.size());
          // Temporarily sets the head span to equal the extent span.
          // This is so the entity has a head (in particular, getValue() works) even if preprocessSentences isn't called.
          // The head span is later modified if preprocessSentences is called.
          EntityMention entity = new EntityMention(identifier, sentence,
              extentSpan, extentSpan, nerTag, null, null);
          AnnotationUtils.addEntityMention(sentence, entity);


          // we can get by using these indices as strings since we only use them
          // as a hash key
          String index = pieces.get(2);
          indexToEntityMention.put(index, entity);
        }


        // int i =0;
        for (String word : words) {
          CoreLabel label = new CoreLabel();
          label.setWord(word);
          //label.setTag(postags.get(i));
          label.set(CoreAnnotations.TextAnnotation.class, word);
          label.set(CoreAnnotations.ValueAnnotation.class, word);
          // we don't set TokenBeginAnnotation or TokenEndAnnotation since we're
          // not keeping track of character offsets
          tokens.add(label);
          // i++;
        }


        textContent.append(text);
        textContent.append(" ");
        tokenCount += words.size();
        break;
      }
    }


    sentence.set(CoreAnnotations.TextAnnotation.class, textContent.toString());
    sentence.set(CoreAnnotations.ValueAnnotation.class, textContent.toString());
    sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
    sentence.set(CoreAnnotations.SentenceIDAnnotation.class, sentenceID);


    return sentence;
  }


  /*
   * Gets the index of an object in a list using == to test (List.indexOf uses
   * equals() which could be problematic here)
   */
  private static <X> int getIndexByObjectEquality(List<X> list, X obj) {
    for (int i = 0, sz = list.size(); i < sz; i++) {
      if (list.get(i) == obj) {
        return i;
      }
    }
    return -1;
  }


  /*
   * Sets the head word and the index for an entity, given the parse tree for
   * the sentence containing the entity.
   *
   * This code is no longer used, but I've kept it around (at least for now) as
   * reference when we modify preProcessSentences().
   */
  @SuppressWarnings("unused")
  private void setHeadWord(EntityMention entity, Tree tree) {
    List<Tree> leaves = tree.getLeaves();
    Tree argRoot = tree.joinNode(leaves.get(entity.getExtentTokenStart()),
        leaves.get(entity.getExtentTokenEnd()));
    Tree headWordNode = argRoot.headTerminal(headFinder);


    int headWordIndex = getIndexByObjectEquality(leaves, headWordNode);


    if (StringUtils.isPunct(leaves.get(entity.getExtentTokenEnd()).label().value().trim())
        && (headWordIndex >= entity.getExtentTokenEnd()
            || headWordIndex < entity.getExtentTokenStart())) {


      argRoot = tree.joinNode(leaves.get(entity.getExtentTokenStart()), leaves
          .get(entity.getExtentTokenEnd() - 1));
      headWordNode = argRoot.headTerminal(headFinder);
      headWordIndex = getIndexByObjectEquality(leaves, headWordNode);


      if (headWordIndex >= entity.getExtentTokenStart()
          && headWordIndex <= entity.getExtentTokenEnd() - 1) {
        entity.setHeadTokenPosition(headWordIndex);
        entity.setHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
      }
    }


    if (headWordIndex >= entity.getExtentTokenStart()
        && headWordIndex <= entity.getExtentTokenEnd()) {
      entity.setHeadTokenPosition(headWordIndex);
      entity.setHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
    } else {
      // Re-parse the argument words by themselves
      // Get the list of words in the arg by looking at the leaves between
      // arg.getExtentTokenStart() and arg.getExtentTokenEnd() inclusive
      List<String> argWords = new ArrayList<String>();
      for (int i = entity.getExtentTokenStart(); i <= entity.getExtentTokenEnd(); i++) {
        argWords.add(leaves.get(i).label().value());
      }
      if (StringUtils.isPunct(argWords.get(argWords.size() - 1))) {
        argWords.remove(argWords.size() - 1);
      }
      Tree argTree = parseStrings(argWords);
      headWordNode = argTree.headTerminal(headFinder);
      headWordIndex = getIndexByObjectEquality(argTree.getLeaves(),
          headWordNode)
          + entity.getExtentTokenStart();
      entity.setHeadTokenPosition(headWordIndex);
      entity.setHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
    }
  }


  public static void main(String[] args) throws Exception {
    // just a simple test, to make sure stuff works
    Properties props = StringUtils.argsToProperties(args);
    RothCONLL04Reader reader = new RothCONLL04Reader();
    reader.setLoggerLevel(Level.INFO);
    reader.setProcessor(new StanfordCoreNLP(props));
    Annotation doc = reader.parse("/u/nlp/data/RothCONLL04/conll04.corp");
    System.out.println(AnnotationUtils.datasetToString(doc));
  }
}
// Source Code of edu.stanford.nlp.ie.machinereading.domains.roth.RothCONLL04Reader

// Related Classes of edu.stanford.nlp.ie.machinereading.domains.roth.RothCONLL04Reader