Examples of edu.stanford.nlp.process.DocumentPreprocessor

edu.stanford.nlp.process.DocumentPreprocessor
Produces a list of sentences from either a plain text or XML document.
Tokenization: The default tokenizer is PTBTokenizer. If null is passed to setTokenizerFactory, whitespace tokenization is used instead.
Adding a new document type requires two steps:
1. Add a new DocType.
2. Create an iterator for the new DocType and modify the iterator() function to return the new iterator.
NOTE: This implementation should not use external libraries, since it is used in the parser. Author: Spence Green

    TreebankLanguagePack tlp = lp.getOp().langpack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();


    Iterable<List<? extends HasWord>> sentences;
    if (args.length > 1) {
      DocumentPreprocessor dp = new DocumentPreprocessor(args[1]);
      List<List<? extends HasWord>> tmp =
        new ArrayList<List<? extends HasWord>>();
      for (List<HasWord> sentence : dp) {
        tmp.add(sentence);
      }

View Full Code Here

    if (tlp.supportsGrammaticalStructures()) {
      gsf = tlp.grammaticalStructureFactory();
    }
    // You could also create a tokenizer here (as below) and pass it
    // to DocumentPreprocessor
    for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {
      Tree parse = lp.apply(sentence);
      parse.pennPrint();
      System.out.println();


      if (gsf != null) {

View Full Code Here

    }
    return lasNoPunc;
  }


  private void parseTextFile(BufferedReader input, PrintWriter output) {
    DocumentPreprocessor preprocessor = new DocumentPreprocessor(input);
    preprocessor.setSentenceFinalPuncWords(config.tlp.sentenceFinalPunctuationWords());
    preprocessor.setEscaper(config.escaper);
    preprocessor.setSentenceDelimiter(config.sentenceDelimiter);
    preprocessor.setTokenizerFactory(config.tlp.getTokenizerFactory());


    Timing timer = new Timing();


    MaxentTagger tagger = new MaxentTagger(config.tagger);
    List<List<TaggedWord>> tagged = new ArrayList<>();

View Full Code Here

   *     the default English tokenizer (PTBTokenizerFactory) is used.
   * @return List of tokenized sentences
   */
  public static List<List<HasWord>> tokenizeText(Reader r,
                 TokenizerFactory<? extends HasWord> tokenizerFactory) {
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    if (tokenizerFactory != null) {
      documentPreprocessor.setTokenizerFactory(tokenizerFactory);
    }
    List<List<HasWord>> out = Generics.newArrayList();
    for (List<HasWord> item : documentPreprocessor) {
      out.add(item);
    }

View Full Code Here

      sentenceDelimiter = "\n";
    }


    while (true) {
      //Now we do everything through the doc preprocessor
      final DocumentPreprocessor docProcessor;
      String line = reader.readLine();
      // this happens when we reach end of file
      if (line == null)
        break;
      docProcessor = new DocumentPreprocessor(new StringReader(line));
      docProcessor.setTokenizerFactory(tokenizerFactory);
      docProcessor.setSentenceDelimiter(sentenceDelimiter);
      if (config.keepEmptySentences()) {
        docProcessor.setKeepEmptySentences(true);
      }


      for (List<HasWord> sentence : docProcessor) {
        numWords += sentence.size();

View Full Code Here

      sentenceDelimiter = "\n";
    }
    final TokenizerFactory<? extends HasWord> tokenizerFactory = chooseTokenizerFactory();


    //Now we do everything through the doc preprocessor
    final DocumentPreprocessor docProcessor;
    if (tagInside.length() > 0) {
      docProcessor = new DocumentPreprocessor(reader, DocumentPreprocessor.DocType.XML);
      docProcessor.setElementDelimiter(tagInside);
      if (config.keepEmptySentences()) {
        docProcessor.setKeepEmptySentences(true);
      }
    } else {
      docProcessor = new DocumentPreprocessor(reader);
      docProcessor.setSentenceDelimiter(sentenceDelimiter);
      if (config.keepEmptySentences()) {
        docProcessor.setKeepEmptySentences(true);
      }
    }
    docProcessor.setTokenizerFactory(tokenizerFactory);


    runTagger(docProcessor, writer, outputStyle);
  }

View Full Code Here

    String text = "I can almost always tell when movies use fake dinosaurs.";


    MaxentTagger tagger = new MaxentTagger(taggerPath);
    DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);


    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    for (List<HasWord> sentence : tokenizer) {
      List<TaggedWord> tagged = tagger.tagSentence(sentence);
      GrammaticalStructure gs = parser.predict(tagged);


      // Print typed dependencies

View Full Code Here

    // Excerpt: tag and parse a sample text one sentence at a time.
    // taggerPath and modelPath are defined outside this excerpt.
    String text = "My dog likes to shake his stuffed chickadee toy.";


    // Build the tagger from taggerPath and load the shift-reduce parser model from modelPath.
    MaxentTagger tagger = new MaxentTagger(taggerPath);
    ShiftReduceParser model = ShiftReduceParser.loadModel(modelPath);


    // Wrap the text in a DocumentPreprocessor; iterating it yields one List<HasWord> per sentence.
    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    for (List<HasWord> sentence : tokenizer) {
      // The parser is applied to the tagger's output rather than to the raw tokens.
      List<TaggedWord> tagged = tagger.tagSentence(sentence);
      Tree tree = model.apply(tagged);
      // The resulting tree is printed to standard error, not standard output.
      System.err.println(tree);
    }

View Full Code Here

    FileSystem.mkdirOrFail(outputFile);


    int count = 0;
    if (inputPath != null) {
      Reader input = new BufferedReader(new FileReader(inputPath));
      DocumentPreprocessor processor = new DocumentPreprocessor(input);
      for (List<HasWord> sentence : processor) {
        count++; // index from 1
        ParserQuery pq = parser.parserQuery();
        if (!(pq instanceof RerankingParserQuery)) {
          throw new IllegalArgumentException("Expected a RerankingParserQuery");

View Full Code Here

      } else if (t.length == 1) {
        text = t[0];
        id = String.valueOf(lineNum);
      }
      id = sentIDprefix + id;
      DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
      PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizerFactory
          .newCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
      dp.setTokenizerFactory(tokenizerFactory);


      String label = backgroundSymbol;
      int sentNum = -1;


      for (List<HasWord> sentence : dp) {

View Full Code Here

0 1

TOP

Related Classes of edu.stanford.nlp.process.DocumentPreprocessor

edu.stanford.nlp.ling.CoreLabel

edu.stanford.nlp.ling.HasWord

edu.stanford.nlp.parser.dvparser.ParseAndPrintMatrices

edu.stanford.nlp.parser.lexparser.demo.ParserDemo

edu.stanford.nlp.parser.lexparser.demo.ParserDemo2

edu.stanford.nlp.parser.lexparser.ParseFiles

edu.stanford.nlp.parser.nndep.demo.DependencyParserDemo

edu.stanford.nlp.parser.nndep.DependencyParser

edu.stanford.nlp.parser.shiftreduce.demo.ShiftReduceDemo

edu.stanford.nlp.patterns.surface.AnnotatedTextReader

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.