Package: edu.stanford.nlp.pipeline

Usage examples of edu.stanford.nlp.pipeline.Annotation


    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    List<List<Mention>> allPredictedMentions;
    List<Tree> allTrees = new ArrayList<Tree>();

    Annotation anno;

    try {
      String filename="";
      while(files.length > fileIndex){
        if(files[fileIndex].contains("apf.xml")) {
          filename = files[fileIndex];
          fileIndex++;
          break;
        }
        else {
          fileIndex++;
          filename="";
        }
      }
      if(files.length <= fileIndex && filename.equals("")) return null;

      anno = aceReader.parse(corpusPath+filename);
      stanfordProcessor.annotate(anno);


      List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);

      for (CoreMap s : sentences){
        int i = 1;
        for(CoreLabel w : s.get(CoreAnnotations.TokensAnnotation.class)){
          w.set(CoreAnnotations.IndexAnnotation.class, i++);
View Full Code Here


    logger.setLevel(Level.SEVERE);
  }

  @Override
  public Annotation read(String path) throws IOException {
    Annotation doc = new Annotation("");

    logger.info("Reading file: " + path);
    Iterator<String> lineIterator = IOUtils.readLines(path).iterator();

    // Each iteration through this loop processes a single sentence
    // along with any relations in it
    while (lineIterator.hasNext()) {
      Annotation sentence = readSentence(doc, path, lineIterator);
      AnnotationUtils.addSentence(doc, sentence);
    }

    return doc;
  }
View Full Code Here

      return "OTHER";
    throw new RuntimeException("Cannot normalize ner tag " + ner);
  }

  /**
   * Reads one sentence in the Roth CoNLL04 corpus format from {@code lineIterator}
   * and converts it into an {@link Annotation} carrying the sentence text, tokens,
   * entity mentions, and relation mentions.
   *
   * @param doc the enclosing document annotation (not referenced in this method body)
   * @param docId identifier stored as the sentence's DocIDAnnotation
   * @param lineIterator iterator over the corpus lines; consumed up to and including
   *        the blank line that terminates this sentence's relation section
   * @return the populated sentence annotation
   */
  private static Annotation readSentence(Annotation doc, String docId, Iterator<String> lineIterator) {
    Annotation sentence = new Annotation("");
    sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
    sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, new ArrayList<EntityMention>());
    // we'll need to set things like the tokens and textContent after we've
    // fully read the sentence

    // contains the full text that we've read so far
    StringBuilder textContent = new StringBuilder();
    int tokenCount = 0; // how many tokens we've seen so far
    List<CoreLabel> tokens = new ArrayList<CoreLabel>();

    // The sentence ends after the second blank line. Note the counter is a
    // running total (it is never reset on non-blank lines), which matches the
    // corpus layout: one blank line separates the token section from the
    // relation section, and a second blank line ends the sentence.
    int numBlankLinesSeen = 0;
    String sentenceID = null;

    // keeps tracks of entities we've seen so far for use by relations
    Map<String, EntityMention> indexToEntityMention = new HashMap<String, EntityMention>();

    while (lineIterator.hasNext() && numBlankLinesSeen < 2) {
      String currentLine = lineIterator.next();
      // restore literal commas, which the corpus writes as the placeholder
      // token "COMMA"
      currentLine = currentLine.replace("COMMA", ",");

      List<String> pieces = StringUtils.split(currentLine);
      String identifier;

      // the number of whitespace-separated fields determines the line type;
      // lines of any other width are silently ignored (no default case)
      int size = pieces.size();
      switch (size) {
      case 1: // blank line between sentences or relations
        // NOTE(review): assumes StringUtils.split yields a single piece for a
        // blank/whitespace-only line — confirm against StringUtils semantics
        numBlankLinesSeen++;
        break;
      case 3: // relation: <arg1 token index> <arg2 token index> <relation type>
        String type = pieces.get(2);
        List<ExtractionObject> args = new ArrayList<ExtractionObject>();
        // NOTE(review): these lookups return null if the referenced entity was
        // skipped because its tag normalized to "O"; entity1.getExtentTokenStart()
        // below would then NPE. Assumes relation arguments always carry non-O
        // tags in this corpus — confirm.
        EntityMention entity1 = indexToEntityMention.get(pieces.get(0));
        EntityMention entity2 = indexToEntityMention.get(pieces.get(1));
        args.add(entity1);
        args.add(entity2);
        // span covers from the start of arg1's extent to the end of arg2's
        Span span = new Span(entity1.getExtentTokenStart(), entity2
            .getExtentTokenEnd());
        // identifier = "relation" + sentenceID + "-" + sentence.getAllRelations().size();
        identifier = RelationMention.makeUniqueId();
        RelationMention relationMention = new RelationMention(identifier,
            sentence, span, type, null, args);
        AnnotationUtils.addRelationMention(sentence, relationMention);
        break;
      case 9: // token
        /*
         * Roth token lines look like this:
         *
         * 19 Peop 9 O NNP/NNP Jamal/Ghosheh O O O
         *
         * i.e. field 0 = sentence id, field 1 = entity tag, field 2 = token
         * index, field 4 = POS tag(s), field 5 = word(s).
         */

        // Entities may be multiple words joined by '/'; we split these up
        List<String> words = StringUtils.split(pieces.get(5), "/");
        //List<String> postags = StringUtils.split(pieces.get(4),"/");

        String text = StringUtils.join(words, " ");
        identifier = "entity" + pieces.get(0) + "-" + pieces.get(2);
        String nerTag = getNormalizedNERTag(pieces.get(1)); // entity type of the word/expression

        // the sentence id is taken from the first token line seen
        if (sentenceID == null)
          sentenceID = pieces.get(0);

        if (!nerTag.equals("O")) {
          Span extentSpan = new Span(tokenCount, tokenCount + words.size());
          // Temporarily sets the head span to equal the extent span.
          // This is so the entity has a head (in particular, getValue() works) even if preprocessSentences isn't called.
          // The head span is later modified if preprocessSentences is called.
          EntityMention entity = new EntityMention(identifier, sentence,
              extentSpan, extentSpan, nerTag, null, null);
          AnnotationUtils.addEntityMention(sentence, entity);

          // we can get by using these indices as strings since we only use them
          // as a hash key
          String index = pieces.get(2);
          indexToEntityMention.put(index, entity);
        }

        // int i =0;
        // one CoreLabel per constituent word, even for multi-word entities
        for (String word : words) {
          CoreLabel label = new CoreLabel();
          label.setWord(word);
          //label.setTag(postags.get(i));
          label.set(CoreAnnotations.TextAnnotation.class, word);
          label.set(CoreAnnotations.ValueAnnotation.class, word);
          // we don't set TokenBeginAnnotation or TokenEndAnnotation since we're
          // not keeping track of character offsets
          tokens.add(label);
          // i++;
        }

        textContent.append(text);
        textContent.append(" ");
        tokenCount += words.size();
        break;
      }
    }

    // now that the whole sentence has been read, attach the accumulated
    // text, tokens, and id
    sentence.set(CoreAnnotations.TextAnnotation.class, textContent.toString());
    sentence.set(CoreAnnotations.ValueAnnotation.class, textContent.toString());
    sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
    sentence.set(CoreAnnotations.SentenceIDAnnotation.class, sentenceID);

    return sentence;
  }
View Full Code Here

    // just a simple test, to make sure stuff works
    Properties props = StringUtils.argsToProperties(args);
    RothCONLL04Reader reader = new RothCONLL04Reader();
    reader.setLoggerLevel(Level.INFO);
    reader.setProcessor(new StanfordCoreNLP(props));
    Annotation doc = reader.parse("/u/nlp/data/RothCONLL04/conll04.corp");
    System.out.println(AnnotationUtils.datasetToString(doc));
  }
View Full Code Here

  @Override
  public Annotation read(String path) throws IOException, SAXException, ParserConfigurationException {
    List<CoreMap> allSentences = new ArrayList<CoreMap>();
    File basePath = new File(path);
    assert basePath.exists();
    Annotation corpus = new Annotation("");

    if (basePath.isDirectory()) {
      for (File aceFile : IOUtils.iterFilesRecursive(basePath, ".apf.xml")) {
        if (aceFile.getName().endsWith(".UPC1.apf.xml")) {
          continue;
View Full Code Here

          tokenOffset += tokens.size();
          continue;
        }
      }

      CoreMap sentence = new Annotation(textContent.toString());
      sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
      sentence.set(CoreAnnotations.TokensAnnotation.class, words);
      logger.info("Reading sentence: \"" + textContent + "\"");

      List<AceEntityMention> entityMentions = aceDocument.getEntityMentions(sentenceIndex);
      List<AceRelationMention> relationMentions = aceDocument.getRelationMentions(sentenceIndex);
      List<AceEventMention> eventMentions = aceDocument.getEventMentions(sentenceIndex);
View Full Code Here

  public String[] splitText(String phraseText)
  {
    String[] words;
    if (tokenizer != null) {
      Annotation annotation = new Annotation(phraseText);
      tokenizer.annotate(annotation);
      List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
      words = new String[tokens.size()];
      for (int i = 0; i < tokens.size(); i++) {
        words[i] = tokens.get(i).word();
      }
    } else {
View Full Code Here

    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<Tree> allTrees = new ArrayList<Tree>();
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    List<List<Mention>> allPredictedMentions;
    List<CoreMap> allSentences = new ArrayList<CoreMap>();
    Annotation docAnno = new Annotation("");

    Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL+Pattern.CASE_INSENSITIVE);
    Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)", Pattern.DOTALL+Pattern.CASE_INSENSITIVE);
    Matcher docMatcher = docPattern.matcher(fileContents);
    if (! docMatcher.find(currentOffset)) return null;

    currentOffset = docMatcher.end();
    String doc = docMatcher.group(1);
    Matcher sentenceMatcher = sentencePattern.matcher(doc);
    String ner = null;

    //Maintain current document ID.
    Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL+Pattern.CASE_INSENSITIVE);
    Matcher docIDMatcher = docIDPattern.matcher(doc);
    if(docIDMatcher.find()) currentDocumentID = docIDMatcher.group(1);
    else currentDocumentID = "documentAfter " + currentDocumentID;

    while (sentenceMatcher.find()) {
      String sentenceString = sentenceMatcher.group(2);
      List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString)).tokenize();

      // FIXING TOKENIZATION PROBLEMS
      for (int i = 0; i < words.size(); i++) {
        CoreLabel w = words.get(i);
        if (i > 0 && w.word().equals("$")) {
          if(!words.get(i-1).word().endsWith("PRP") && !words.get(i-1).word().endsWith("WP"))
            continue;
          words.get(i-1).set(CoreAnnotations.TextAnnotation.class, words.get(i-1).word()+"$");
          words.remove(i);
          i--;
        } else if (w.word().equals("\\/")) {
          if(words.get(i-1).word().equals("</COREF>"))
            continue;
          w.set(CoreAnnotations.TextAnnotation.class, words.get(i-1).word()+"\\/"+words.get(i+1).word());
          words.remove(i+1);
          words.remove(i-1);
        }
      }
      // END FIXING TOKENIZATION PROBLEMS

      List<CoreLabel> sentence = new ArrayList<CoreLabel>();
      // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open
      Stack<Mention> stack = new Stack<Mention>();
      List<Mention> mentions = new ArrayList<Mention>();

      allWords.add(sentence);
      allGoldMentions.add(mentions);

      for (CoreLabel word : words) {
        String w = word.get(CoreAnnotations.TextAnnotation.class);
        // found regular token: WORD/POS
        if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length()-2) {
          int i = w.lastIndexOf("\\/");
          String w1 = w.substring(0, i);
          // we do NOT set POS info here. We take the POS tags from the parser!
          word.set(CoreAnnotations.TextAnnotation.class, w1);
          word.remove(CoreAnnotations.OriginalTextAnnotation.class);
          if(Constants.USE_GOLD_NE) {
            if (ner != null) {
              word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
            } else {
              word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
            }
          }
          sentence.add(word);
        }
        // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
        else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
          Pattern nerPattern = Pattern.compile("<(.*?)>");
          Matcher m = nerPattern.matcher(w);
          m.find();
          ner = m.group(1);
        }
        // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
        else if (w.startsWith("</") && !w.startsWith("</COREF")) {
          Pattern nerPattern = Pattern.compile("</(.*?)>");
          Matcher m = nerPattern.matcher(w);
          m.find();
          String ner1 = m.group(1);
          if (ner != null && !ner.equals(ner1)) throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
          ner = null;
        }
        // found the start SGML tag for a coref mention
        else if (w.startsWith("<COREF")) {
          Mention mention = new Mention();
          // position of this mention in the sentence
          mention.startIndex = sentence.size();

          // extract GOLD info about this coref chain. needed for eval
          Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
          Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");

          Matcher m = idPattern.matcher(w);
          m.find();
          mention.mentionID = Integer.parseInt(m.group(1));

          m = refPattern.matcher(w);
          if (m.find()) {
            mention.originalRef = Integer.parseInt(m.group(1));
          }

          // open mention. keep track of all open mentions using the stack
          stack.push(mention);
        }
        // found the end SGML tag for a coref mention
        else if (w.equals("</COREF>")) {
          Mention mention = stack.pop();
          mention.endIndex = sentence.size();

          // this is a closed mention. add it to the final list of mentions
          // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef);
          mentions.add(mention);
        } else {
          word.remove(CoreAnnotations.OriginalTextAnnotation.class);
          if(Constants.USE_GOLD_NE){
            if (ner != null) {
              word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
            } else {
              word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
            }
          }
          sentence.add(word);
        }
      }
      StringBuilder textContent = new StringBuilder();
      for (int i=0 ; i<sentence.size(); i++){
        CoreLabel w = sentence.get(i);
        w.set(CoreAnnotations.IndexAnnotation.class, i+1);
        w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
        if(i>0) textContent.append(" ");
        textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
      }
      CoreMap sentCoreMap = new Annotation(textContent.toString());
      allSentences.add(sentCoreMap);
      sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
    }

    // assign goldCorefClusterID
    Map<Integer, Mention> idMention = Generics.newHashMap();    // temporary use
    for (List<Mention> goldMentions : allGoldMentions) {
View Full Code Here

    int sentCnt = 0;
    int tokenCnt = 0;
    for (Document doc; (doc = reader.getNextDocument()) != null; ) {
      corpusStats.process(doc);
      docCnt++;
      Annotation anno = doc.getAnnotation();
      if (debug) System.out.println("Document " + docCnt + ": " + anno.get(CoreAnnotations.DocIDAnnotation.class));
      for (CoreMap sentence:anno.get(CoreAnnotations.SentencesAnnotation.class)) {
        if (debug) System.out.println("Parse: " + sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
        if (debug) System.out.println("Sentence Tokens: " + StringUtils.join(sentence.get(CoreAnnotations.TokensAnnotation.class), ","));
        writeTabSep(fout,sentence,doc.corefChainMap);
        sentCnt++;
        tokenCnt += sentence.get(CoreAnnotations.TokensAnnotation.class).size();
View Full Code Here

    }

    private CoreMap wordsToSentence(List<String[]> sentWords)
    {
      String sentText = concatField(sentWords, FIELD_WORD);
      Annotation sentence = new Annotation(sentText);
      Tree tree = wordsToParse(sentWords);
      sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
      List<Tree> leaves = tree.getLeaves();
      // Check leaves == number of words
      assert(leaves.size() == sentWords.size());
      List<CoreLabel> tokens = new ArrayList<CoreLabel>(leaves.size());
      sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
      for (int i = 0; i < sentWords.size(); i++) {
        String[] fields = sentWords.get(i);
        int wordPos = Integer.parseInt(fields[FIELD_WORD_NO]);
        assert(wordPos == i);
        Tree leaf = leaves.get(i);
View Full Code Here

TOP

Related Classes of edu.stanford.nlp.pipeline.Annotation

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.