Package edu.stanford.nlp.ling

Examples of edu.stanford.nlp.ling.CoreLabel


        continue;
      }

      // - or +
      if(i < tokens.size() - 1){
        CoreLabel crt = tokens.get(i);
        CoreLabel nxt = tokens.get(i + 1);

        // trailing +
        if(crt.endPosition() == nxt.beginPosition() &&
          ! isParen(crt.word()) &&
          nxt.word().equals("+")){
          String word = crt.word() + nxt.word();
          output.add(tokenFactory.makeToken(word, crt.beginPosition(), word.length()));
          i ++;
          continue;
        }

        // trailing -
        if(crt.endPosition() == nxt.beginPosition() &&
          (i + 2 >= tokens.size() || nxt.endPosition() != tokens.get(i + 2).beginPosition()) &&
          ! isParen(crt.word()) &&
          nxt.word().equals("-")){
          String word = crt.word() + nxt.word();
          output.add(tokenFactory.makeToken(word, crt.beginPosition(), word.length()));
          i ++;
          continue;
        }

        // preceding -
        if(crt.endPosition() == nxt.beginPosition() &&
          (i == 0 || crt.beginPosition() != tokens.get(i - 1).endPosition()) &&
          ! isParen(nxt.word()) &&
          crt.word().equals("-")){
          String word = crt.word() + nxt.word();
          output.add(tokenFactory.makeToken(word, crt.beginPosition(), word.length()));
          i ++;
          continue;
        }
      }
View Full Code Here


  */

  private static List<CoreLabel> breakOnPattern(List<CoreLabel> tokens, Pattern pattern) {
    List<CoreLabel> output = new ArrayList<CoreLabel>();
    for (int i = 0; i < tokens.size(); i++) {
      CoreLabel token = tokens.get(i);
      Matcher matcher = pattern.matcher(token.word());
      if (matcher.find()) {
        int sepPos = matcher.start(2);
        String s1 = token.word().substring(0, sepPos);
        if(! DISCARD_STANDALONE_DASHES || ! s1.equals("-")){
          output.add(tokenFactory.makeToken(s1, token.beginPosition(), sepPos));
        }
        String sep = matcher.group(2);
        if(! DISCARD_STANDALONE_DASHES || ! sep.equals("-")){
          output.add(tokenFactory.makeToken(sep, token.beginPosition() + sepPos, 1));
        }
        String s3 = token.word().substring(sepPos + 1);
        if(! DISCARD_STANDALONE_DASHES || ! s3.equals("-")){
          output.add(tokenFactory.makeToken(s3, token.beginPosition() + sepPos + 1,
            token.endPosition() - token.beginPosition() - sepPos - 1));
        }

      } else {
        output.add(token);
      }
View Full Code Here

  private String tokensToString(Annotation annotation, List<CoreLabel> tokens) {
    if (tokens.isEmpty()) return "";
    // Try to get original text back?
    String annotationText = (annotation != null)? annotation.get(CoreAnnotations.TextAnnotation.class) : null;
    if (annotationText != null) {
      CoreLabel firstToken = tokens.get(0);
      CoreLabel lastToken = tokens.get(tokens.size() - 1);
      int firstCharOffset = firstToken.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      int lastCharOffset = lastToken.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
      return annotationText.substring(firstCharOffset, lastCharOffset);
    } else {
      return StringUtils.joinWords(tokens, " ");
    }
  }
View Full Code Here

    CoreMap tokenAnnotations = (tokenAnnotationPatterns != null && !tokenAnnotationPatterns.isEmpty())? new ArrayCoreMap():null;
    Map<Class, Stack<Pair<String, String>>> savedTokenAnnotations = new ArrayMap<Class, Stack<Pair<String, String>>>();

    // Local variable for annotating sections
    XMLUtils.XMLTag sectionStartTag = null;
    CoreLabel sectionStartToken = null;
    CoreMap sectionAnnotations = null;
    Map<Class, List<CoreLabel>> savedTokensForSection = new HashMap<Class, List<CoreLabel>>();

    boolean markSingleSentence = false;
    for (CoreLabel token : tokens) {
      String word = token.word().trim();
      XMLUtils.XMLTag tag = XMLUtils.parseTag(word);

      // If it's not a tag, we do manipulations such as unescaping
      if (tag == null) {
        // TODO: put this into the lexer instead of here
        token.setWord(XMLUtils.unescapeStringForXML(token.word()));
        // TODO: was there another annotation that also represents the word?
        if (matchDepth > 0 ||
            xmlTagMatcher == null ||
            xmlTagMatcher.matcher("").matches()) {
          newTokens.add(token);
          if (inUtterance) {
            token.set(CoreAnnotations.UtteranceAnnotation.class, utteranceIndex);
            if (currentSpeaker != null) token.set(CoreAnnotations.SpeakerAnnotation.class, currentSpeaker);
          }
          if (markSingleSentence) {
            token.set(CoreAnnotations.ForcedSentenceUntilEndAnnotation.class, true);
            markSingleSentence = false;
          }
          if (tokenAnnotations != null) {
            ChunkAnnotationUtils.copyUnsetAnnotations(tokenAnnotations, token);
          }
        }
        // if we removed any text, and the tokens are "invertible" and
        // therefore keep track of their before/after text, append
        // what we removed to the appropriate tokens
        if (removedText.length() > 0) {
          boolean added = false;
          String before = token.get(CoreAnnotations.BeforeAnnotation.class);
          if (before != null) {
            token.set(CoreAnnotations.BeforeAnnotation.class, removedText + before);
            added = true;
          }
          if (added && newTokens.size() > 1) {
            CoreLabel previous = newTokens.get(newTokens.size() - 2);
            String after = previous.get(CoreAnnotations.AfterAnnotation.class);
            if (after != null)
              previous.set(CoreAnnotations.AfterAnnotation.class, after + removedText);
            else
              previous.set(CoreAnnotations.AfterAnnotation.class, removedText.toString());
          }
          removedText = new StringBuilder();
        }
        if (currentTagSet == null) {
          // We wrap the list in an unmodifiable list because we reuse
          // the same list object many times.  We don't want to
          // let someone modify one list and screw up all the others.
          currentTagSet =
            Collections.unmodifiableList(new ArrayList<String>(enclosingTags));
        }
        token.set(CoreAnnotations.XmlContextAnnotation.class, currentTagSet);

        // is this token part of the doc date sequence?
        if (dateTagMatcher != null &&
            currentTagSet.size() > 0 &&
            dateTagMatcher.matcher(currentTagSet.get(currentTagSet.size() - 1)).matches()) {
          docDateTokens.add(token);
        }

        if (docIdTagMatcher != null &&
                currentTagSet.size() > 0 &&
                docIdTagMatcher.matcher(currentTagSet.get(currentTagSet.size() - 1)).matches()) {
          docIdTokens.add(token);
        }

        if (docTypeTagMatcher != null &&
                currentTagSet.size() > 0 &&
                docTypeTagMatcher.matcher(currentTagSet.get(currentTagSet.size() - 1)).matches()) {
          docTypeTokens.add(token);
        }

        if (inSpeakerTag) {
          speakerTokens.add(token);
        }

        if (sectionStartTag != null) {
          boolean okay = true;
          if (ssplitDiscardTokensMatcher != null) {
            okay = !ssplitDiscardTokensMatcher.matcher(token.word()).matches();
          }
          if (okay) {
            if (sectionStartToken == null) {
              sectionStartToken = token;
            }
            // Add tokens to saved section tokens
            for (List<CoreLabel> saved:savedTokensForSection.values()) {
              saved.add(token);
            }
          }
        }

        continue;
      }

      // At this point, we know we have a tag

      // we are removing a token and its associated text...
      // keep track of that
      String currentRemoval = token.get(CoreAnnotations.BeforeAnnotation.class);
      if (currentRemoval != null)
        removedText.append(currentRemoval);
      currentRemoval = token.get(CoreAnnotations.OriginalTextAnnotation.class);
      if (currentRemoval != null)
        removedText.append(currentRemoval);
      if (token == tokens.get(tokens.size() - 1)) {
        currentRemoval = token.get(CoreAnnotations.AfterAnnotation.class);
        if (currentRemoval != null)
          removedText.append(currentRemoval);
      }

      // Process tag

      // Check if we want to annotate anything using the tags's attributes
      if (!toAnnotate.isEmpty() && tag.attributes != null) {
        Set<Class> foundAnnotations = annotateWithTag(annotation, annotation, tag, docAnnotationPatterns, null, toAnnotate, null);
        toAnnotate.removeAll(foundAnnotations);
      }

      // Check if the tag matches a section
      if (sectionTagMatcher != null && sectionTagMatcher.matcher(tag.name).matches()) {
        if (tag.isEndTag) {
          annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null);
          if (sectionStartToken != null) {
            sectionStartToken.set(CoreAnnotations.SectionStartAnnotation.class, sectionAnnotations);
          }
          // Mark previous token as forcing sentence and section end
          if (newTokens.size() > 0) {
            CoreLabel previous = newTokens.get(newTokens.size() - 1);
            previous.set(CoreAnnotations.ForcedSentenceEndAnnotation.class, true);
            previous.set(CoreAnnotations.SectionEndAnnotation.class, sectionStartTag.name);
          }
          savedTokensForSection.clear();
          sectionStartTag = null;
          sectionStartToken = null;
          sectionAnnotations = null;
        } else if (!tag.isSingleTag) {
          // Prepare to mark first token with section information
          sectionStartTag = tag;
          sectionAnnotations = new ArrayCoreMap();
          sectionAnnotations.set(CoreAnnotations.SectionAnnotation.class, sectionStartTag.name);
        }
      }
      if (sectionStartTag != null) {
        // store away annotations for section
        annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null);
      }
      if (tokenAnnotations != null) {
        annotateWithTag(annotation, tokenAnnotations, tag, tokenAnnotationPatterns, null, null, savedTokenAnnotations);
      }

      // If the tag matches the sentence ending tags, and we have some
      // existing words, mark that word as being somewhere we want
      // to end the sentence.
      if (sentenceEndingTagMatcher != null &&
          sentenceEndingTagMatcher.matcher(tag.name).matches() &&
          newTokens.size() > 0) {
        CoreLabel previous = newTokens.get(newTokens.size() - 1);
        previous.set(CoreAnnotations.ForcedSentenceEndAnnotation.class, true);
      }

      if (utteranceTurnTagMatcher != null && utteranceTurnTagMatcher.matcher(tag.name).matches()) {
        if (newTokens.size() > 0) {
          // Utterance turn is also sentence ending
          CoreLabel previous = newTokens.get(newTokens.size() - 1);
          previous.set(CoreAnnotations.ForcedSentenceEndAnnotation.class, true);
        }
        inUtterance = !(tag.isEndTag || tag.isSingleTag);
        if (inUtterance) {
          utteranceIndex++;
        }
        if (!inUtterance) {
          currentSpeaker = null;
        }
      }

      if (speakerTagMatcher != null && speakerTagMatcher.matcher(tag.name).matches()) {
        if (newTokens.size() > 0) {
          // Speaker is not really part of sentence
          CoreLabel previous = newTokens.get(newTokens.size() - 1);
          previous.set(CoreAnnotations.ForcedSentenceEndAnnotation.class, true);
        }
        inSpeakerTag = !(tag.isEndTag || tag.isSingleTag);
        if (tag.isEndTag) {
          currentSpeaker = tokensToString(annotation, speakerTokens);
          MultiTokenTag.Tag mentionTag = new MultiTokenTag.Tag(currentSpeaker, "Speaker", speakerTokens.size());
          int i = 0;
          for (CoreLabel t:speakerTokens) {
            t.set(CoreAnnotations.SpeakerAnnotation.class, currentSpeaker);
            t.set(CoreAnnotations.MentionTokenAnnotation.class, new MultiTokenTag(mentionTag, i));
            i++;
          }
        } else {
          currentSpeaker = null;
        }
        speakerTokens.clear();
      }

      if (singleSentenceTagMatcher != null && singleSentenceTagMatcher.matcher(tag.name).matches()) {
        if (tag.isEndTag) {
          // Mark previous token as forcing sentence end
          if (newTokens.size() > 0) {
            CoreLabel previous = newTokens.get(newTokens.size() - 1);
            previous.set(CoreAnnotations.ForcedSentenceEndAnnotation.class, true);
          }
          markSingleSentence = false;
        } else if (!tag.isSingleTag) {
          // Enforce rest of the tokens to be single token until ForceSentenceEnd is seen
          markSingleSentence = true;
        }
      }

      if (xmlTagMatcher == null)
        continue;

      if (tag.isSingleTag) {
        continue;
      }
      // at this point, we can't reuse the "currentTagSet" vector
      // any more, since the current tag set has changed
      currentTagSet = null;
      if (tag.isEndTag) {
        while (true) {
          if (enclosingTags.isEmpty()) {
            throw new IllegalArgumentException("Got a close tag " + tag.name +
                                               " which does not match" +
                                               " any open tag");
          }
          String lastTag = enclosingTags.pop();
          if (xmlTagMatcher.matcher(lastTag).matches()){
            --matchDepth;
          }
          if (lastTag.equals(tag.name))
            break;
          if (!allowFlawedXml)
            throw new IllegalArgumentException("Mismatched tags... " +
                                               tag.name + " closed a " +
                                               lastTag + " tag.");
        }
        if (matchDepth < 0) {
          // this should be impossible, since we already assert that
          // the tags match up correctly
          throw new AssertionError("Programming error?  We think there " +
                                   "have been more close tags than open tags");
        }
      } else {
        // open tag, since all other cases are exhausted
        enclosingTags.push(tag.name);
        if (xmlTagMatcher.matcher(tag.name).matches())
          matchDepth++;
      }
    }

    if (enclosingTags.size() > 0 && !allowFlawedXml) {
      throw new IllegalArgumentException("Unclosed tags, starting with " +
                                         enclosingTags.pop());
    }

    // If we ended with a string of xml tokens, that text needs to be
    // appended to the "AfterAnnotation" of one of the tokens...
    // Note that we clear removedText when we see a real token, so
    // if removedText is not empty, that must be because we just
    // dropped an xml tag.  Therefore we ignore that old After
    // annotation, since that text was already absorbed in the Before
    // annotation of the xml tag we threw away
    if (newTokens.size() > 0 && removedText.length() > 0) {
      CoreLabel lastToken = newTokens.get(newTokens.size() - 1);
      // sometimes AfterAnnotation seems to be null even when we are
      // collecting before & after annotations, but OriginalTextAnnotation
      // is only non-null if we are invertible.  Hopefully.
      if (lastToken.get(CoreAnnotations.OriginalTextAnnotation.class) != null) {
        lastToken.set(CoreAnnotations.AfterAnnotation.class, removedText.toString());
      }
    }

    // Populate docid, docdate, doctype
    if (annotation != null) {
View Full Code Here

  public static void setMissingTags(CoreMap sentence, Tree tree) {
    List<TaggedWord> taggedWords = null;
    List<Label> leaves = null;
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    for (int i = 0; i < tokens.size(); ++i) {
      CoreLabel token = tokens.get(i);
      if (token.tag() == null) {
        if (taggedWords == null) {
          taggedWords = tree.taggedYield();
        }
        if (leaves == null) {
          leaves = tree.yield();
        }
        token.setTag(taggedWords.get(i).tag());
        Label leaf = leaves.get(i);
        if (leaf instanceof HasTag) {
          ((HasTag) leaf).setTag(taggedWords.get(i).tag());
        }
      }
View Full Code Here

  private <TOKEN extends CoreLabel> void annotateTokens(List<TOKEN> tokens) {
    // Make a copy of the tokens before annotating because QuantifiableEntityNormalizer may change the POS too
    List<CoreLabel> words = new ArrayList<CoreLabel>();
    for (CoreLabel token : tokens) {
      CoreLabel word = new CoreLabel();
      word.setWord(token.word());
      word.setNER(token.ner());
      word.setTag(token.tag());

      // copy fields potentially set by SUTime
      NumberSequenceClassifier.transferAnnotations(token, word);

      words.add(word);
View Full Code Here

  public String toString(String format) { return toString(); }


  private static CoreLabel makeStartLabel(String label) {
    CoreLabel root = new CoreLabel();
    root.set(CoreAnnotations.ValueAnnotation.class, label);
    root.set(CoreAnnotations.IndexAnnotation.class, 0);
    return root;
  }
View Full Code Here

        System.err.println("WARNING: CollinsDependency.extractFromTree() could not find root for:\n" + node.pennString());

      } else { //Make dependencies
        if(mustProcessRoot) {
          mustProcessRoot = false;
          final CoreLabel startLabel = makeStartLabel(startSymbol);
          deps.add(new CollinsDependency(new CoreLabel(head.label()), startLabel, new CollinsRelation(startSymbol, startSymbol, node.value(), Direction.Right)));
        }

        Direction dir = Direction.Left;
        for(final Tree daughter : node.children()) {

          if(daughter.equals(headDaughter)) {
            dir = Direction.Right;

          } else {
            final Tree headOfDaughter = daughter.headTerminal(hf);

            final String relParent = (normPOS && node.isPreTerminal()) ? normPOSLabel : node.value();
            final String relHead = (normPOS && headDaughter.isPreTerminal()) ? normPOSLabel : headDaughter.value();
            final String relModifier = (normPOS && daughter.isPreTerminal()) ? normPOSLabel : daughter.value();

            final CollinsDependency newDep =
              new CollinsDependency(new CoreLabel(headOfDaughter.label()), new CoreLabel(head.label()), new CollinsRelation(relParent, relHead, relModifier, dir));

            deps.add(newDep);
          }
        }
      }
View Full Code Here

    BufferedReader reader = null;
    try {
      reader = IOUtils.readerFromString(inFile);

      CoreMap sentence = new CoreLabel();
      List<CoreLabel> sentenceTokens = new ArrayList<>();

      DependencyTree tree = new DependencyTree();

      for (String line : IOUtils.getLineIterable(reader, false)) {
        String[] splits = line.split("\t");
        if (splits.length < 10) {
          trees.add(tree);
          sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
          sents.add(sentence);

          tree = new DependencyTree();
          sentence = new CoreLabel();
          sentenceTokens = new ArrayList<>();
        } else {
          String word = splits[1],
                  pos = splits[4],
                  depType = splits[7];
          int head = Integer.parseInt(splits[6]);

          CoreLabel token = tf.makeToken(word, 0, 0);
          token.setTag(pos);
          token.set(CoreAnnotations.CoNLLDepParentIndexAnnotation.class, head);
          token.set(CoreAnnotations.CoNLLDepTypeAnnotation.class, depType);
          sentenceTokens.add(token);

          if (labeled)
            tree.add(head, depType);
          else
View Full Code Here

      {
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);

        for (int j = 1, size = tokens.size(); j <= size; ++j)
        {
          CoreLabel token = tokens.get(j - 1);
          output.printf("%d\t%s\t_\t%s\t%s\t_\t%d\t%s\t_\t_%n",
                  j, token.word(), token.tag(), token.tag(),
                  token.get(CoreAnnotations.CoNLLDepParentIndexAnnotation.class),
                  token.get(CoreAnnotations.CoNLLDepTypeAnnotation.class));
        }
        output.println();
      }
      output.close();
    }
View Full Code Here

TOP

Related Classes of edu.stanford.nlp.ling.CoreLabel

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.