Package opennlp.tools.util

Examples of opennlp.tools.util.PlainTextByLineStream


   *          the charset of the Arvores Deitadas Corpus
   */
  public ADChunkSampleStream(InputStream in, String charsetName) {

    try {
      this.adSentenceStream = new ADSentenceStream(new PlainTextByLineStream(
          in, charsetName));
    } catch (UnsupportedEncodingException e) {
      // UTF-8 is available on all JVMs, will never happen
      throw new IllegalStateException(e);
    }
View Full Code Here


      TokenNameFinderModel model = new TokenNameFinderModelLoader().load(new File(args[i]));
      nameFinders[i] = new NameFinderME(model);
    }
   
    ObjectStream<String> untokenizedLineStream =
        new PlainTextByLineStream(new InputStreamReader(System.in));
   
    PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "sent");
    perfMon.start();
   
    try {
      String line;
      while((line = untokenizedLineStream.read()) != null) {
        String whitespaceTokenizerLine[] = WhitespaceTokenizer.INSTANCE.tokenize(line);
       
        // A new line indicates a new document,
        // adaptive data must be cleared for a new document
       
View Full Code Here

      File sampleDataFile, Charset encoding) {
    CmdLineUtil.checkInputFile(sampleDataName + " Data", sampleDataFile);

    FileInputStream sampleDataIn = CmdLineUtil.openInFile(sampleDataFile);

    ObjectStream<String> lineStream = new PlainTextByLineStream(sampleDataIn
        .getChannel(), encoding);

    return new TokenSampleStream(lineStream);
  }
View Full Code Here

      File sampleDataFile, Charset encoding) {
    CmdLineUtil.checkInputFile(sampleDataName + " Data", sampleDataFile);

    FileInputStream sampleDataIn = CmdLineUtil.openInFile(sampleDataFile);

    ObjectStream<String> lineStream = new PlainTextByLineStream(sampleDataIn
        .getChannel(), encoding);

    return new NameSampleDataStream(lineStream);
  }
View Full Code Here

   
    Detokenizer detokenizer = new DictionaryDetokenizer(
        new DetokenizationDictionaryLoader().load(new File(args[0])));
   
    ObjectStream<String> tokenizedLineStream =
      new PlainTextByLineStream(new InputStreamReader(System.in));
   
    PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "sent");
    perfMon.start();
   
    try {
      String tokenizedLine;
      while ((tokenizedLine = tokenizedLineStream.read()) != null) {
       
        // white space tokenize line
        String tokens[] = WhitespaceTokenizer.INSTANCE.tokenize(tokenizedLine);
       
        DetokenizationOperation operations[] = detokenizer.detokenize(tokens);
View Full Code Here

    InputStream in = getClass().getResourceAsStream(
        "/opennlp/tools/sentdetect/Sentences.txt");

    SentenceModel sentdetectModel = SentenceDetectorME.train(
        "en", new SentenceSampleStream(new PlainTextByLineStream(new InputStreamReader(in))), true, null, 100, 0);
   
    assertEquals("en", sentdetectModel.getLanguage());
   
    SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
View Full Code Here

   
    InputStream trainDataIn = TokenizerTestUtil.class.getResourceAsStream(
        "/opennlp/tools/tokenize/token.train");
   
    ObjectStream<TokenSample> samples = new TokenSampleStream(
        new PlainTextByLineStream(new InputStreamReader(trainDataIn, "UTF-8")));
   
    return TokenizerME.train("en", samples, true, 5, 100);
  }
View Full Code Here

        }
       
        additionalTrainingDataIn = new FileInputStream(additionalTrainingDataFile);
       
        ObjectStream<TokenSample> additionalSamples = new TokenSampleStream(
            new PlainTextByLineStream(new InputStreamReader(additionalTrainingDataIn, additionalTrainingDataEncoding)));
       
        samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples);
      }
     
      tokenModel = TokenizerME.train(language, samples, isSkipAlphaNumerics);
View Full Code Here

       
        additionalTrainingDataIn = new FileInputStream(additionalTrainingDataFile);
       
        // TODO: Make encoding configurable, otherwise use UTF-8 as default!
        ObjectStream<NameSample> additionalSamples = new NameSampleDataStream(
            new PlainTextByLineStream(new InputStreamReader(additionalTrainingDataIn, additionalTrainingDataEncoding)));
       
        samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples);
      }
     
      // TODO: Make sure its possible to pass custom feature generator
View Full Code Here

      File sampleDataFile, Charset encoding) {
    CmdLineUtil.checkInputFile(sampleDataName + " Data", sampleDataFile);

    FileInputStream sampleDataIn = CmdLineUtil.openInFile(sampleDataFile);

    ObjectStream<String> lineStream = new PlainTextByLineStream(sampleDataIn
        .getChannel(), encoding);

    return new ChunkSampleStream(lineStream);
  }
View Full Code Here

TOP

Related Classes of opennlp.tools.util.PlainTextByLineStream

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Inc. Contact coftware#gmail.com.