Package edu.stanford.nlp.process

Examples of edu.stanford.nlp.process.DocumentPreprocessor$ListEscaper
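None of the excerpts below constructs a ListEscaper directly; they hand an escaper to DocumentPreprocessor via setEscaper (see the second excerpt) and then iterate the preprocessor sentence by sentence. As a point of reference, here is a minimal sketch of that pattern with a home-made escaper. It assumes a recent CoreNLP release where the escaper is a plain java.util.function.Function over token lists (older releases used Stanford's own Function interface of the same shape); the lower-casing escaper and the class name are made up for illustration.

import java.io.StringReader;
import java.util.List;
import java.util.function.Function;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.process.DocumentPreprocessor;

// Hypothetical demo: every sentence is passed through an "escaper" before being returned.
public class EscaperSketch {
  public static void main(String[] args) {
    // Made-up escaper that lower-cases each token in place and returns the same list.
    Function<List<HasWord>, List<HasWord>> lowerCaser = sentence -> {
      for (HasWord w : sentence) {
        w.setWord(w.word().toLowerCase());
      }
      return sentence;
    };

    DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader("Hello World. Goodbye World."));
    dp.setEscaper(lowerCaser);
    for (List<HasWord> sentence : dp) {
      System.out.println(Sentence.listToString(sentence));
    }
  }
}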


    // Load a tagger model and a PTB tokenizer that keeps untokenizable characters.
    MaxentTagger tagger = new MaxentTagger(args[0]);
    TokenizerFactory<CoreLabel> ptbTokenizerFactory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
    BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
    // DocumentPreprocessor splits the input into sentences; each sentence is tagged and printed.
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
    for (List<HasWord> sentence : documentPreprocessor) {
      List<TaggedWord> tSentence = tagger.tagSentence(sentence);
      pw.println(Sentence.listToString(tSentence, false));
    }


    // Loop over the files
    for (int i = argIndex; i < args.length; i++) {
      final String filename = args[i];

      final DocumentPreprocessor documentPreprocessor;
      if (filename.equals("-")) {
        try {
          documentPreprocessor = new DocumentPreprocessor(new BufferedReader(new InputStreamReader(System.in, op.tlpParams.getInputEncoding())), docType);
        } catch (IOException e) {
          throw new RuntimeIOException(e);
        }
      } else {
        documentPreprocessor = new DocumentPreprocessor(filename, docType, op.tlpParams.getInputEncoding());
      }

      // Unused values are null per the main() method invocation below;
      // null is the default for these properties.
      documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
      documentPreprocessor.setEscaper(escaper);
      documentPreprocessor.setSentenceDelimiter(sentenceDelimiter);
      documentPreprocessor.setTagDelimiter(tagDelimiter);
      documentPreprocessor.setElementDelimiter(elementDelimiter);
      if (tokenizerFactory == null) {
        documentPreprocessor.setTokenizerFactory((tokenized) ? null : tlp.getTokenizerFactory());
      } else {
        documentPreprocessor.setTokenizerFactory(tokenizerFactory);
      }

      // Set up the output
      PrintWriter pwo = pwOut;
      if (op.testOptions.writeOutputFiles) {
        String normalizedName = filename;
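The docType and setElementDelimiter(...) calls in the excerpt above correspond to DocumentPreprocessor's XML mode, in which only the text inside the named element is sentence-split. A minimal sketch of that mode, using a made-up document and the element name "p" purely for illustration:

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.process.DocumentPreprocessor;

// Hypothetical example: pull sentences only from the text inside <p> elements.
public class XmlPreprocessorSketch {
  public static void main(String[] args) {
    String xml = "<doc><p>First paragraph. It has two sentences.</p><h1>Heading text.</h1></doc>";
    DocumentPreprocessor dp =
        new DocumentPreprocessor(new StringReader(xml), DocumentPreprocessor.DocType.XML);
    dp.setElementDelimiter("p");  // sentence-split only the text found inside <p>...</p>
    for (List<HasWord> sentence : dp) {
      System.out.println(Sentence.listToString(sentence));
    }
  }
}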

      // Here we take the first line and tokenize it as one sentence.
      String[] lines = chunk.trim().split("\\n");
      String sentence = lines[0];
      StringReader sin = new StringReader(sentence);
      DocumentPreprocessor document = new DocumentPreprocessor(sin);
      // Treat the newline as the sentence-final token so the whole line comes back as one sentence.
      document.setSentenceFinalPuncWords(new String[] {"\n"});
      List<HasWord> tokens = document.iterator().next();
      // The first token on the line is the sentence's numeric label.
      Integer mainLabel = Integer.valueOf(tokens.get(0).word());
      //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
      tokens = tokens.subList(1, tokens.size());
      //System.err.println(tokens);

        private final Iterator<List<HasWord>> sentenceIterator;
        private Iterator<HasWord> tokenIterator;
        boolean eos = false;

        public StandfordTokenStream(Reader reader) {
            DocumentPreprocessor dp = new DocumentPreprocessor(reader);
            sentenceIterator = dp.iterator();
        }
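The constructor above only primes sentenceIterator; the token-by-token advancement is not shown in this excerpt. Under the assumption that the class drains the current sentence and then pulls the next one from DocumentPreprocessor, the usual flattening logic looks roughly like this (the method name and the way eos is signalled are guesses for illustration, not the original code):

        // Hypothetical continuation of the fragment above: return the next token,
        // advancing to the next sentence from DocumentPreprocessor when needed.
        private HasWord nextToken() {
            while (tokenIterator == null || !tokenIterator.hasNext()) {
                if (!sentenceIterator.hasNext()) {
                    eos = true;      // no more sentences: mark end of stream
                    return null;
                }
                tokenIterator = sentenceIterator.next().iterator();
            }
            return tokenIterator.next();
        }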
