package edu.stanford.nlp.pipeline;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.util.ArrayUtils;
import edu.stanford.nlp.util.CoreMap;
/**
* This class assumes that there is a {@code List<? extends CoreLabel>}
* under the {@code TokensAnnotation} field, and runs it
* through {@link edu.stanford.nlp.process.WordToSentenceProcessor}
* and puts the new {@code List<List<? extends CoreLabel>>}
* under the {@code SentencesAnnotation} field.
*
* @author Jenny Finkel
* @author Christopher Manning
*/
public class WordsToSentencesAnnotator implements Annotator {
// The splitter that does the actual work of grouping a flat token list into sentences.
private final WordToSentenceProcessor<CoreLabel> wts;
// Whether to print a progress message to stderr during annotate().
private final boolean VERBOSE;
// When true (set only via the newlineSplitter() factory), the underlying splitter
// returns empty token lists for blank lines; those are counted toward
// LineNumberAnnotation but are NOT added to the document as sentences.
private final boolean countLineNumbers;
/** Creates a non-verbose annotator with default sentence-splitting behavior. */
public WordsToSentencesAnnotator() {
this(false);
}
/** Creates an annotator with default sentence-splitting behavior.
 *
 *  @param verbose Whether to print progress to stderr
 */
public WordsToSentencesAnnotator(boolean verbose) {
this(verbose, false, new WordToSentenceProcessor<CoreLabel>());
}
/** Creates an annotator with a customized {@link WordToSentenceProcessor}.
 *
 *  @param verbose Whether to print progress to stderr
 *  @param boundaryTokenRegex Regex matching tokens that end a sentence
 *  @param boundaryToDiscard Tokens that mark a boundary but are dropped from the output
 *  @param htmlElementsToDiscard HTML element names treated as discarded boundaries
 *  @param newlineIsSentenceBreak String form of a
 *         {@link WordToSentenceProcessor.NewlineIsSentenceBreak} policy (e.g. "two", "always")
 */
public WordsToSentencesAnnotator(boolean verbose, String boundaryTokenRegex,
Set<String> boundaryToDiscard, Set<String> htmlElementsToDiscard,
String newlineIsSentenceBreak) {
this(verbose, false,
new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex,
boundaryToDiscard, htmlElementsToDiscard,
WordToSentenceProcessor.stringToNewlineIsSentenceBreak(newlineIsSentenceBreak)));
}
/** Creates a fully customized annotator, additionally allowing multi-token
 *  sentence boundaries specified as a TokensRegex pattern.
 *
 *  @param verbose Whether to print progress to stderr
 *  @param boundaryTokenRegex Regex matching tokens that end a sentence
 *  @param boundaryToDiscard Tokens that mark a boundary but are dropped from the output
 *  @param htmlElementsToDiscard HTML element names treated as discarded boundaries
 *  @param newlineIsSentenceBreak String form of the newline-break policy
 *  @param boundaryMultiTokenRegex TokensRegex pattern for multi-token boundaries, or null for none
 *  @param tokenRegexesToDiscard Token patterns to discard from the output
 */
public WordsToSentencesAnnotator(boolean verbose, String boundaryTokenRegex,
Set<String> boundaryToDiscard, Set<String> htmlElementsToDiscard,
String newlineIsSentenceBreak, String boundaryMultiTokenRegex,
Set<String> tokenRegexesToDiscard) {
this(verbose, false,
new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex,
boundaryToDiscard, htmlElementsToDiscard,
WordToSentenceProcessor.stringToNewlineIsSentenceBreak(newlineIsSentenceBreak),
(boundaryMultiTokenRegex != null)? TokenSequencePattern.compile(boundaryMultiTokenRegex):null, tokenRegexesToDiscard));
}
// All public constructors and factories funnel through here; this is the only
// place the three fields are assigned.
private WordsToSentencesAnnotator(boolean verbose, boolean countLineNumbers,
WordToSentenceProcessor<CoreLabel> wts) {
VERBOSE = verbose;
this.countLineNumbers = countLineNumbers;
this.wts = wts;
}
/** Return a WordsToSentencesAnnotator that splits on newlines (only), which are then deleted.
 * This factory counts the lines by putting in empty token lists for empty lines.
 * It tells the underlying splitter to return empty lists of tokens
 * and then treats those empty lists as empty lines. We don't
 * actually include empty sentences in the annotation, though. But they
 * are used in numbering the sentence. Only this factory leads to
 * empty sentences.
 *
 * @param verbose Whether it is verbose.
 * @param nlToken Zero or more new line tokens, which might be a {@literal \n} or the fake
 * newline tokens returned from the tokenizer.
 * @return A WordsToSentencesAnnotator.
 */
public static WordsToSentencesAnnotator newlineSplitter(boolean verbose, String ... nlToken) {
// this constructor will keep empty lines as empty sentences
WordToSentenceProcessor<CoreLabel> wts =
new WordToSentenceProcessor<CoreLabel>(ArrayUtils.asImmutableSet(nlToken));
return new WordsToSentencesAnnotator(verbose, true, wts);
}
/** Return a WordsToSentencesAnnotator that never splits the token stream. You just get one sentence.
 *
 * @param verbose Whether it is verbose.
 * @return A WordsToSentencesAnnotator.
 */
public static WordsToSentencesAnnotator nonSplitter(boolean verbose) {
WordToSentenceProcessor<CoreLabel> wts = new WordToSentenceProcessor<CoreLabel>(true);
return new WordsToSentencesAnnotator(verbose, false, wts);
}
/**
 * Splits the document's TokensAnnotation into sentences and stores them
 * under SentencesAnnotation. Each sentence gets character/token offsets,
 * its index, and (when this annotator was built by {@link #newlineSplitter})
 * a LineNumberAnnotation: in that mode the underlying splitter returns empty
 * token lists for blank lines, which are counted as lines but not included
 * as sentences. Tokens are mutated in place with their 1-based index within
 * the sentence, their sentence index, and the doc ID if one is present.
 *
 * @param annotation The document; must contain a TokensAnnotation
 * @throws IllegalArgumentException if the annotation has no TokensAnnotation
 * @throws IllegalStateException if the splitter yields an empty sentence while
 *         not in line-counting mode (should not happen)
 **/
@Override
public void annotate(Annotation annotation) {
if (VERBOSE) {
System.err.print("Sentence splitting ...");
}
if ( ! annotation.has(CoreAnnotations.TokensAnnotation.class)) {
throw new IllegalArgumentException("WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
}
// get text and tokens from the document
String text = annotation.get(CoreAnnotations.TextAnnotation.class);
List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
// System.err.println("Tokens are: " + tokens);
// assemble the sentence annotations
int tokenOffset = 0;
int lineNumber = 0;
// section annotations to mark sentences with; carries over across loop
// iterations until a token with SectionEndAnnotation closes the section
CoreMap sectionAnnotations = null;
List<CoreMap> sentences = new ArrayList<CoreMap>();
for (List<CoreLabel> sentenceTokens: this.wts.process(tokens)) {
// in line-counting mode, every split (including blank lines) advances the line number
if (countLineNumbers) {
++lineNumber;
}
if (sentenceTokens.isEmpty()) {
if (!countLineNumbers) {
// empty sentences are only expected from the newlineSplitter configuration
throw new IllegalStateException("unexpected empty sentence: " + sentenceTokens);
} else {
// blank line: counted above, but not added as a sentence
continue;
}
}
// get the sentence text from the first and last character offsets
int begin = sentenceTokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
int last = sentenceTokens.size() - 1;
int end = sentenceTokens.get(last).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
String sentenceText = text.substring(begin, end);
// create a sentence annotation with text and token offsets
Annotation sentence = new Annotation(sentenceText);
sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
sentence.set(CoreAnnotations.TokenBeginAnnotation.class, tokenOffset);
tokenOffset += sentenceTokens.size();
sentence.set(CoreAnnotations.TokenEndAnnotation.class, tokenOffset);
// sentences.size() is this sentence's index, since it is added at the end of the loop
sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentences.size());
if (countLineNumbers) {
sentence.set(CoreAnnotations.LineNumberAnnotation.class, lineNumber);
}
// Annotate sentence with section information.
// Assume section start and end appear as first and last tokens of sentence
CoreLabel sentenceStartToken = sentenceTokens.get(0);
CoreLabel sentenceEndToken = sentenceTokens.get(sentenceTokens.size()-1);
CoreMap sectionStart = sentenceStartToken.get(CoreAnnotations.SectionStartAnnotation.class);
if (sectionStart != null) {
// Section is started
sectionAnnotations = sectionStart;
}
if (sectionAnnotations != null) {
// transfer annotations over to sentence
ChunkAnnotationUtils.copyUnsetAnnotations(sectionAnnotations, sentence);
}
String sectionEnd = sentenceEndToken.get(CoreAnnotations.SectionEndAnnotation.class);
if (sectionEnd != null) {
// section closed: stop propagating its annotations to later sentences
sectionAnnotations = null;
}
if (docID != null) {
sentence.set(CoreAnnotations.DocIDAnnotation.class, docID);
}
// mutate tokens in place: 1-based index within the sentence, sentence index
// (sentences.size() is still this sentence's index here), and doc ID
int index = 1;
for (CoreLabel token : sentenceTokens) {
token.setIndex(index++);
token.setSentIndex(sentences.size());
if (docID != null) {
token.setDocID(docID);
}
}
// add the sentence to the list
sentences.add(sentence);
}
// the condition below is possible if sentenceBoundaryToDiscard is initialized!
/*
if (tokenOffset != tokens.size()) {
throw new RuntimeException(String.format(
"expected %d tokens, found %d", tokens.size(), tokenOffset));
}
*/
// add the sentences annotations to the document
annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
}
/** This annotator requires tokenization to have been run first. */
@Override
public Set<Requirement> requires() {
return Collections.singleton(TOKENIZE_REQUIREMENT);
}
/** This annotator provides sentence splitting (ssplit). */
@Override
public Set<Requirement> requirementsSatisfied() {
return Collections.singleton(SSPLIT_REQUIREMENT);
}
}