Package edu.stanford.nlp.process

Examples of edu.stanford.nlp.process.DocumentPreprocessor


    TreebankLanguagePack tlp = lp.getOp().langpack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();

    Iterable<List<? extends HasWord>> sentences;
    if (args.length > 1) {
      DocumentPreprocessor dp = new DocumentPreprocessor(args[1]);
      List<List<? extends HasWord>> tmp =
        new ArrayList<List<? extends HasWord>>();
      for (List<HasWord> sentence : dp) {
        tmp.add(sentence);
      }
View Full Code Here


    if (tlp.supportsGrammaticalStructures()) {
      gsf = tlp.grammaticalStructureFactory();
    }
    // You could also create a tokenizer here (as below) and pass it
    // to DocumentPreprocessor
    for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {
      Tree parse = lp.apply(sentence);
      parse.pennPrint();
      System.out.println();

      if (gsf != null) {
View Full Code Here

    }
    return lasNoPunc;
  }

  private void parseTextFile(BufferedReader input, PrintWriter output) {
    DocumentPreprocessor preprocessor = new DocumentPreprocessor(input);
    preprocessor.setSentenceFinalPuncWords(config.tlp.sentenceFinalPunctuationWords());
    preprocessor.setEscaper(config.escaper);
    preprocessor.setSentenceDelimiter(config.sentenceDelimiter);
    preprocessor.setTokenizerFactory(config.tlp.getTokenizerFactory());

    Timing timer = new Timing();

    MaxentTagger tagger = new MaxentTagger(config.tagger);
    List<List<TaggedWord>> tagged = new ArrayList<>();
View Full Code Here

   *     the default English tokenizer (PTBTokenizerFactory) is used.
   * @return List of tokenized sentences
   */
  public static List<List<HasWord>> tokenizeText(Reader r,
                 TokenizerFactory<? extends HasWord> tokenizerFactory) {
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    if (tokenizerFactory != null) {
      documentPreprocessor.setTokenizerFactory(tokenizerFactory);
    }
    List<List<HasWord>> out = Generics.newArrayList();
    for (List<HasWord> item : documentPreprocessor) {
      out.add(item);
    }
View Full Code Here

      sentenceDelimiter = "\n";
    }

    while (true) {
      //Now we do everything through the doc preprocessor
      final DocumentPreprocessor docProcessor;
      String line = reader.readLine();
      // this happens when we reach end of file
      if (line == null)
        break;
      docProcessor = new DocumentPreprocessor(new StringReader(line));
      docProcessor.setTokenizerFactory(tokenizerFactory);
      docProcessor.setSentenceDelimiter(sentenceDelimiter);
      if (config.keepEmptySentences()) {
        docProcessor.setKeepEmptySentences(true);
      }

      for (List<HasWord> sentence : docProcessor) {
        numWords += sentence.size();
View Full Code Here

      sentenceDelimiter = "\n";
    }
    final TokenizerFactory<? extends HasWord> tokenizerFactory = chooseTokenizerFactory();

    //Now we do everything through the doc preprocessor
    final DocumentPreprocessor docProcessor;
    if (tagInside.length() > 0) {
      docProcessor = new DocumentPreprocessor(reader, DocumentPreprocessor.DocType.XML);
      docProcessor.setElementDelimiter(tagInside);
      if (config.keepEmptySentences()) {
        docProcessor.setKeepEmptySentences(true);
      }
    } else {
      docProcessor = new DocumentPreprocessor(reader);
      docProcessor.setSentenceDelimiter(sentenceDelimiter);
      if (config.keepEmptySentences()) {
        docProcessor.setKeepEmptySentences(true);
      }
    }
    docProcessor.setTokenizerFactory(tokenizerFactory);

    runTagger(docProcessor, writer, outputStyle);
  }
View Full Code Here

    String text = "I can almost always tell when movies use fake dinosaurs.";

    MaxentTagger tagger = new MaxentTagger(taggerPath);
    DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    for (List<HasWord> sentence : tokenizer) {
      List<TaggedWord> tagged = tagger.tagSentence(sentence);
      GrammaticalStructure gs = parser.predict(tagged);

      // Print typed dependencies
View Full Code Here

    String text = "My dog likes to shake his stuffed chickadee toy.";

    MaxentTagger tagger = new MaxentTagger(taggerPath);
    ShiftReduceParser model = ShiftReduceParser.loadModel(modelPath);

    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    for (List<HasWord> sentence : tokenizer) {
      List<TaggedWord> tagged = tagger.tagSentence(sentence);
      Tree tree = model.apply(tagged);
      System.err.println(tree);
    }
View Full Code Here

    FileSystem.mkdirOrFail(outputFile);

    int count = 0;
    if (inputPath != null) {
      Reader input = new BufferedReader(new FileReader(inputPath));
      DocumentPreprocessor processor = new DocumentPreprocessor(input);
      for (List<HasWord> sentence : processor) {
        count++; // index from 1
        ParserQuery pq = parser.parserQuery();
        if (!(pq instanceof RerankingParserQuery)) {
          throw new IllegalArgumentException("Expected a RerankingParserQuery");
View Full Code Here

      } else if (t.length == 1) {
        text = t[0];
        id = String.valueOf(lineNum);
      }
      id = sentIDprefix + id;
      DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
      PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizerFactory
          .newCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
      dp.setTokenizerFactory(tokenizerFactory);

      String label = backgroundSymbol;
      int sentNum = -1;

      for (List<HasWord> sentence : dp) {
View Full Code Here

TOP

Related Classes of edu.stanford.nlp.process.DocumentPreprocessor

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.