Package edu.stanford.nlp.ling

Examples of edu.stanford.nlp.ling.CoreLabel


   * @param tokens The list of tokens representing this sentence.
   * @return The original text of the sentence.
   */
  protected String recoverOriginalText(List<CoreLabel> tokens, CoreNLPProtos.Sentence sentence) {
    StringBuilder text = new StringBuilder();
    CoreLabel last = null;
    if (tokens.size() > 0) {
      CoreLabel token = tokens.get(0);
      if (token.originalText() != null) { text.append(token.originalText()); } else { text.append(token.word()); }
      last = tokens.get(0);
    }
    for (int i = 1; i < tokens.size(); ++i) {
      CoreLabel token = tokens.get(i);
      if (token.before() != null) {
        text.append(token.before());
        assert last != null;
        int missingWhitespace = (token.beginPosition() - last.endPosition()) - token.before().length();
        while (missingWhitespace > 0) {
          text.append(' ');
          missingWhitespace -= 1;
        }
      }
      if (token.originalText() != null) { text.append(token.originalText()); } else { text.append(token.word()); }
      last = token;
    }
    return text.toString();
  }
View Full Code Here


        assertEquals("Wrong number of sentences", 1, doc.size());
        List<CoreLabel> tokens = doc.get(0);
        assertEquals("Wrong number of tokens", offsets.length, tokens.size());

        for (int j = 0, sz = tokens.size(); j < sz; j++) {
          CoreLabel token = tokens.get(j);
          assertEquals("Wrong begin offset", offsets[j][0], (int) token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
          assertEquals("Wrong end offset", offsets[j][1], (int) token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
        }
      }
    }
  }
View Full Code Here

  public static State initialStateFromTaggedSentence(List<? extends HasWord> words) {
    List<Tree> preterminals = Generics.newArrayList();
    for (int index = 0; index < words.size(); ++index) {
      HasWord hw = words.get(index);

      CoreLabel wordLabel;
      String tag;
      if (hw instanceof CoreLabel) {
        wordLabel = (CoreLabel) hw;
        tag = wordLabel.tag();
        CoreLabel cl = (CoreLabel) hw;
      } else {
        wordLabel = new CoreLabel();
        wordLabel.setValue(hw.word());
        wordLabel.setWord(hw.word());
        if (!(hw instanceof HasTag)) {
          throw new IllegalArgumentException("Expected tagged words");
        }
        tag = ((HasTag) hw).tag();
        wordLabel.setTag(tag);
      }
      if (tag == null) {
        throw new IllegalArgumentException("Input word not tagged");
      }
      CoreLabel tagLabel = new CoreLabel();
      tagLabel.setValue(tag);

      // Index from 1.  Tools downstream from the parser expect that
      // Internally this parser uses the index, so we have to
      // overwrite incorrect indices if the label is already indexed
      wordLabel.setIndex(index + 1);
      tagLabel.setIndex(index + 1);

      LabeledScoredTreeNode wordNode = new LabeledScoredTreeNode(wordLabel);
      LabeledScoredTreeNode tagNode = new LabeledScoredTreeNode(tagLabel);
      tagNode.addChild(wordNode);

      wordLabel.set(TreeCoreAnnotations.HeadWordAnnotation.class, wordNode);
      wordLabel.set(TreeCoreAnnotations.HeadTagAnnotation.class, tagNode);
      tagLabel.set(TreeCoreAnnotations.HeadWordAnnotation.class, wordNode);
      tagLabel.set(TreeCoreAnnotations.HeadTagAnnotation.class, tagNode);

      preterminals.add(tagNode);
    }
    return new State(preterminals);
  }
View Full Code Here

    return featuresCpC;
  }

  protected Collection<String> featuresCp2C(PaddedList<IN> cInfo, int loc) {
    CoreLabel c = cInfo.get(loc);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel p2 = cInfo.get(loc - 2);

    String cWord = getWord(c);
    String pWord = getWord(p);
    String p2Word = getWord(p2);
    Collection<String> featuresCp2C = new ArrayList<String>();

    if (flags.useMoreAbbr) {
      featuresCp2C.add(p2.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-P2ABBRANS");
    }

    if (flags.useMinimalAbbr) {
      featuresCp2C.add(p2.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-P2AP2CABB");
    }

    if (flags.useMinimalAbbr1) {
      if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
        featuresCp2C.add(p2.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-P2AP2CABB");
      }
    }

    if (flags.useParenMatching) {
      if (flags.useReverse) {
View Full Code Here

    return featuresCp2C;
  }

  protected Collection<String> featuresCp3C(PaddedList<IN> cInfo, int loc) {
    CoreLabel c = cInfo.get(loc);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel p2 = cInfo.get(loc - 2);
    CoreLabel p3 = cInfo.get(loc - 3);

    String cWord = getWord(c);
    String pWord = getWord(p);
    String p2Word = getWord(p2);
    String p3Word = getWord(p3);
View Full Code Here

    return featuresCp3C;
  }

  protected Collection<String> featuresCp4C(PaddedList<IN> cInfo, int loc) {
    CoreLabel c = cInfo.get(loc);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel p2 = cInfo.get(loc - 2);
    CoreLabel p3 = cInfo.get(loc - 3);
    CoreLabel p4 = cInfo.get(loc - 4);

    String cWord = getWord(c);
    String pWord = getWord(p);
    String p2Word = getWord(p2);
    String p3Word = getWord(p3);
View Full Code Here

    return featuresCp4C;
  }

  protected Collection<String> featuresCp5C(PaddedList<IN> cInfo, int loc) {
    CoreLabel c = cInfo.get(loc);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel p2 = cInfo.get(loc - 2);
    CoreLabel p3 = cInfo.get(loc - 3);
    CoreLabel p4 = cInfo.get(loc - 4);
    CoreLabel p5 = cInfo.get(loc - 5);

    String cWord = getWord(c);
    String pWord = getWord(p);
    String p2Word = getWord(p2);
    String p3Word = getWord(p3);
View Full Code Here

    return featuresCp5C;
  }


  /**
   * Collects string features that combine the token at {@code loc} with the
   * tokens at {@code loc - 1} and {@code loc - 2}.
   * <p>
   * Which features are produced is controlled entirely by {@code flags}.  There
   * are three mutually exclusive configurations: both {@code useInternal} and
   * {@code useExternal} set, only {@code useInternal} set, or only
   * {@code useExternal} set.  If neither is set, the returned collection is empty.
   *
   * @param cInfo the padded token list to read from (elements are assigned to
   *              {@code CoreLabel} here, so presumably {@code IN} is a CoreLabel
   *              subtype; confirm against the class declaration)
   * @param loc   the position of the current token in {@code cInfo}
   * @return a newly created collection of feature strings, in the order added
   */
  protected Collection<String> featuresCpCp2C(PaddedList<IN> cInfo, int loc) {
    // c = current token, p = previous token, p2 = token two positions back.
    CoreLabel c = cInfo.get(loc);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel p2 = cInfo.get(loc - 2);

    String pWord = getWord(p);
    // String p2Word = getWord(p2);

    Collection<String> featuresCpCp2C = new ArrayList<String>();

    if (flags.useInternal && flags.useExternal) {

      if (flags.strictGoodCoNLL && ! flags.removeStrictGoodCoNLLDuplicates && flags.useTypeySequences && flags.maxLeft >= 2) {
        // this feature duplicates -TYPETYPES below, so probably don't include it, but it was in original tests of CMM goodCoNLL
        featuresCpCp2C.add(p2.get(CoreAnnotations.ShapeAnnotation.class) + '-' + p.get(CoreAnnotations.ShapeAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TTPS");
      }

      // Trigram of the Abbr annotation values over (p2, p, c).
      if (flags.useAbbr) {
        featuresCpCp2C.add(p2.get(CoreAnnotations.AbbrAnnotation.class) + '-' + p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-2PABBRANS");
      }

      // Trigram of the Chunk annotation values over (p2, p, c).
      if (flags.useChunks) {
        featuresCpCp2C.add(p2.get(CoreAnnotations.ChunkAnnotation.class) + '-' + p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + "-2PCHUNKS");
      }

      // Constant feature; its string does not depend on any token.
      if (flags.useLongSequences) {
        featuresCpCp2C.add("PPSEQ");
      }
      // Constant feature added only when the previous word equals the CoNLL boundary marker.
      if (flags.useBoundarySequences && pWord.equals(CoNLLDocumentReaderAndWriter.BOUNDARY)) {
        featuresCpCp2C.add("BNDRY-SPAN-PPSEQ");
      }
      // This more complex consistency checker didn't help!
      // if (flags.useBoundarySequences) {
      //   // try enforce consistency over "and" and "," as well as boundary
      //   if (pWord.equals(CoNLLDocumentIteratorFactory.BOUNDARY) ||
      //       pWord.equalsIgnoreCase("and") || pWord.equalsIgnoreCase("or") ||
      //       pWord.equals(",")) {
      //   }
      // }

      if (flags.useTaggySequences) {
        // Part-of-speech trigram, optionally crossed with the current token's shape.
        if (flags.useTags) {
          featuresCpCp2C.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTS");
          if (flags.useTaggySequencesShapeInteraction) {
            featuresCpCp2C.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TTS-CS");
          }
        }
        // DistSim trigram, optionally crossed with the current token's shape.
        if (flags.useDistSim) {
          featuresCpCp2C.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTS1");
          if (flags.useTaggySequencesShapeInteraction) {
            featuresCpCp2C.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-DISTSIM_TTS1-CS");
          }
        }
      }

      // Shape trigram over (p2, p, c); requires some form of word shape to be enabled.
      if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) ||
           flags.useShapeStrings)
          && flags.useTypeSeqs && flags.useTypeSeqs2 && flags.maxLeft >= 2) {
        String cShape = c.get(CoreAnnotations.ShapeAnnotation.class);
        String pShape = p.get(CoreAnnotations.ShapeAnnotation.class);
        String p2Shape = p2.get(CoreAnnotations.ShapeAnnotation.class);
        featuresCpCp2C.add(p2Shape + '-' + pShape + '-' + cShape + "-TYPETYPES");
      }
    } else if (flags.useInternal) {

      // Internal-only configuration: only the constant PPSEQ feature is produced.
      if (flags.useLongSequences) {
        featuresCpCp2C.add("PPSEQ");
      }
    } else if (flags.useExternal) {

      // External-only configuration: PPSEQ plus the same TYPETYPES shape trigram
      // that the combined configuration produces.
      if (flags.useLongSequences) {
        featuresCpCp2C.add("PPSEQ");
      }

      if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) ||
           flags.useShapeStrings)
          && flags.useTypeSeqs && flags.useTypeSeqs2 && flags.maxLeft >= 2) {
        String cShape = c.get(CoreAnnotations.ShapeAnnotation.class);
        String pShape = p.get(CoreAnnotations.ShapeAnnotation.class);
        String p2Shape = p2.get(CoreAnnotations.ShapeAnnotation.class);
        featuresCpCp2C.add(p2Shape + '-' + pShape + '-' + cShape + "-TYPETYPES");
      }
    }

    return featuresCpCp2C;
View Full Code Here

          } else if (endMatcher.matches()) {
            //System.out.println("matched end");
            label = backgroundSymbol;
          } else {

            CoreLabel c = new CoreLabel();

            List<String> toks = new ArrayList<String>();

            toks.add(tok);

            for (String toksplit : toks) {

              sentStr += " " + toksplit;

              c.setWord(toksplit);
              c.setLemma(toksplit);
              c.setValue(toksplit);
              c.set(CoreAnnotations.TextAnnotation.class, toksplit);
              c.set(CoreAnnotations.OriginalTextAnnotation.class, tok);

              if (setGoldClass){
                
                c.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
              }
             
              if (setClassForTheseLabels != null
                  && setClassForTheseLabels.containsKey(label))
                c.set(setClassForTheseLabels.get(label), label);

              sent.add(c);
            }
          }
        }
View Full Code Here

    if (flags.lowerNewgeneThreshold) {
      // Used to raise recall for task 1B
      System.err.println("Using NEWGENE threshold: " + flags.newgeneThreshold);
      for (int i = 0, docSize = document.size(); i < docSize; i++) {
        CoreLabel wordInfo = document.get(i);
        Datum<String, String> d = makeDatum(document, i, featureFactories);
        Counter<String> scores = classifier.scoresOf(d);
        //String answer = BACKGROUND;
        String answer = flags.backgroundSymbol;
        // HN: The evaluation of scoresOf seems to result in some
        // kind of side effect.  Specifically, the symptom is that
        // if scoresOf is not evaluated at every position, the
        // answers are different
        if ("NEWGENE".equals(wordInfo.get(CoreAnnotations.GazAnnotation.class))) {
          for (String label : scores.keySet()) {
            if ("G".equals(label)) {
              System.err.println(wordInfo.word() + ':' + scores.getCount(label));
              if (scores.getCount(label) > flags.newgeneThreshold) {
                answer = label;
              }
            }
          }
        }
        wordInfo.set(CoreAnnotations.AnswerAnnotation.class, answer);
      }
    } else {
      for (int i = 0, listSize = document.size(); i < listSize; i++) {
        String answer = classOf(document, i);
        CoreLabel wordInfo = document.get(i);
        //System.err.println("XXX answer for " +
        //        wordInfo.word() + " is " + answer);
        wordInfo.set(CoreAnnotations.AnswerAnnotation.class, answer);
      }
      if (flags.justify && (classifier instanceof LinearClassifier)) {
        LinearClassifier<String, String> lc = (LinearClassifier<String, String>) classifier;
        for (int i = 0, lsize = document.size(); i < lsize; i++) {
          CoreLabel lineInfo = document.get(i);
          System.err.print("@@ Position " + i + ": ");
          System.err.println(lineInfo.word() + " chose " + lineInfo.get(CoreAnnotations.AnswerAnnotation.class));
          lc.justificationOf(makeDatum(document, i, featureFactories));
        }
      }
    }
    if (flags.useReverse) {
View Full Code Here

TOP

Related Classes of edu.stanford.nlp.ling.CoreLabel

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.