Package opennlp.tools.tokenize

Examples of opennlp.tools.tokenize.TokenizerME
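All of the snippets below follow the same basic pattern: load a TokenizerModel, wrap it in a TokenizerME, then call tokenize(String) or tokenizePos(String). A minimal, self-contained sketch of that pattern (the model file name "en-token.bin" is an assumption; substitute whichever pre-trained model matches your language):

// Minimal sketch: load a model, tokenize, and get character offsets.
// The model file name "en-token.bin" is an assumption for illustration.
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;

public class TokenizerMEBasics {
    public static void main(String[] args) throws IOException {
        try (InputStream modelIn = new FileInputStream("en-token.bin")) {
            TokenizerModel model = new TokenizerModel(modelIn);
            Tokenizer tokenizer = new TokenizerME(model);

            String sentence = "TokenizerME splits a sentence into its tokens.";

            // Token strings only.
            String[] tokens = tokenizer.tokenize(sentence);

            // Character offsets into the input, useful for highlighting.
            Span[] spans = tokenizer.tokenizePos(sentence);

            for (int i = 0; i < tokens.length; i++) {
                System.out.println(tokens[i] + " [" + spans[i].getStart() + "," + spans[i].getEnd() + ")");
            }
        }
    }
}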


        Tokenizer tokenizer = null;
        if(language != null){
            try {
                TokenizerModel model = getTokenizerModel(language);
                if(model != null){
                    tokenizer = new TokenizerME(model);
                }
            } catch (InvalidFormatException e) {
                log.warn("Unable to load Tokenizer Model for "+language+": " +
                    "Will use Simple Tokenizer instead",e);
            } catch (IOException e) {
View Full Code Here


            if(model == null){
                throw new EngineException("The configured OpenNLP TokenizerModel '"
                        + modelName + "' is not available (" + getClass().getSimpleName()
                        + " | name=" + getName() + ")!");
            }
            return new TokenizerME(model);
        }
    }
View Full Code Here

        getDefaultPosTagger();
        getDefaultChunker();
    }

    public static Tokenizer getDefaultTokenizer() throws IOException {
        return new TokenizerME(new TokenizerModel(
                getResourceAsStream(tokenizerModelFile)));
    }
View Full Code Here
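The getDefaultTokenizer() snippet above hands the raw resource stream straight to TokenizerModel and never closes it. A hedged variant that releases the stream once the model has been read (the class name and resource path are assumptions):

import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public class DefaultTokenizerFactory {

    // Assumed classpath location of the packaged model.
    private static final String TOKENIZER_MODEL = "/models/en-token.bin";

    public static Tokenizer getDefaultTokenizer() throws IOException {
        try (InputStream in = DefaultTokenizerFactory.class.getResourceAsStream(TOKENIZER_MODEL)) {
            if (in == null) {
                throw new IOException("Tokenizer model not found on classpath: " + TOKENIZER_MODEL);
            }
            // TokenizerModel reads everything it needs in the constructor,
            // so the stream can be closed as soon as it returns.
            return new TokenizerME(new TokenizerModel(in));
        }
    }
}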

    protected List<SurfaceFormOccurrence> extractNPNGrams(Text text) {
        String intext = text.text();
        //System.out.println("\n\nRR- nextractNPNGrams(...) method called! with text: " + intext + "\n\n");
        List<SurfaceFormOccurrence> npNgramSFLst = new ArrayList<SurfaceFormOccurrence>();
        SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel);
        TokenizerME tokenizer = new TokenizerME((TokenizerModel) tokenModel);
        POSTaggerME posTagger = new POSTaggerME((POSModel) posModel);
        ChunkerME chunker = new ChunkerME((ChunkerModel) chunkModel);

        Span[] sentSpans = sentenceDetector.sentPosDetect(intext);
        for (Span sentSpan : sentSpans) {
            String sentence = sentSpan.getCoveredText(intext).toString();
            int start = sentSpan.getStart();
            Span[] tokSpans = tokenizer.tokenizePos(sentence);
            String[] tokens = new String[tokSpans.length];
            // System.out.println("\n\nTokens:");
            for (int i = 0; i < tokens.length; i++) {
                tokens[i] = tokSpans[i].getCoveredText(sentence).toString();
                // System.out.println(tokens[i]);
View Full Code Here
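In the chunking snippet above, tokenizePos(sentence) returns spans relative to the current sentence, which is why the code keeps the sentence's start offset. A short sketch of that offset arithmetic, assuming the sentence detector and tokenizer have already been built from loaded models:

// Sketch: convert sentence-relative token spans back to document offsets.
// sentenceDetector and tokenizer are assumed to be constructed elsewhere
// (see the snippets on this page); Span is opennlp.tools.util.Span.
static void printDocumentOffsets(String document,
                                 SentenceDetectorME sentenceDetector,
                                 TokenizerME tokenizer) {
    for (Span sentSpan : sentenceDetector.sentPosDetect(document)) {
        String sentence = sentSpan.getCoveredText(document).toString();
        int sentenceStart = sentSpan.getStart();

        for (Span tokSpan : tokenizer.tokenizePos(sentence)) {
            // tokenizePos offsets are relative to the sentence; add the
            // sentence start to get positions in the whole document.
            int docStart = sentenceStart + tokSpan.getStart();
            int docEnd = sentenceStart + tokSpan.getEnd();
            System.out.println(document.substring(docStart, docEnd)
                + " [" + docStart + "," + docEnd + ")");
        }
    }
}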

     *
     * @throws IOException
     */
    public ApacheExtractor() throws IOException {
        nameFinder = new NameFinderME(new TokenNameFinderModel(ApacheExtractor.class.getResourceAsStream(pathToNERModel)));
        tokenizer = new TokenizerME(new TokenizerModel(ApacheExtractor.class.getResourceAsStream(pathToTokenizerModel)));
        sentenceDetector = new SentenceDetectorME(new SentenceModel(ApacheExtractor.class.getResourceAsStream(pathToSentenceDetectorModel)));
    }
View Full Code Here

    return getTokensWithTokenReadings(tokenReadings, tokens, chunkTags);
  }

  // non-private for test cases
  String[] tokenize(String sentence) {
    TokenizerME tokenizer = new TokenizerME(tokenModel);
    String cleanString = sentence.replace('’', '\''); // this is the type of apostrophe that OpenNLP expects
    return tokenizer.tokenize(cleanString);
  }
View Full Code Here
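Besides the token strings, TokenizerME exposes per-token confidence scores for the most recent call via getTokenProbabilities(). A small sketch building on the helper above; tokenModel is assumed to be a TokenizerModel loaded elsewhere:

// Sketch: tokenize and inspect the probability of each token decision.
// tokenModel is assumed to be a TokenizerModel loaded elsewhere.
TokenizerME tokenizer = new TokenizerME(tokenModel);
String[] tokens = tokenizer.tokenize("Don’t forget the apostrophes.".replace('’', '\''));

// getTokenProbabilities() refers to the most recent tokenize()/tokenizePos() call.
double[] probs = tokenizer.getTokenProbabilities();
for (int i = 0; i < tokens.length; i++) {
    System.out.printf("%s\t%.3f%n", tokens[i], probs[i]);
}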


  public ObjectStream<NameSample> create(String[] args) {

    Parameters params = ArgumentParser.parse(args, Parameters.class);

    TokenizerModel tokenizerModel = new TokenizerModelLoader().load(params.getTokenizerModel());
    Tokenizer tokenizer = new TokenizerME(tokenizerModel);

    ObjectStream<String> mucDocStream = new FileToStringSampleStream(
        new DirectorySampleStream(params.getData(), new FileFilter() {

          public boolean accept(File file) {
View Full Code Here

    Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE;

    if (params.getTokenizerModel() != null) {
      try {
        tokenizer = new TokenizerME(new TokenizerModel(params.getTokenizerModel()));
      } catch (IOException e) {
        throw new TerminateToolException(-1, "Failed to load tokenizer model!", e);
      }
    }
    else if (params.getRuleBasedTokenizer() != null) {
View Full Code Here

      ObjectStream<BratDocument> samples) {
    super(samples);

    // TODO: We can pass in custom validators here ...
    this.sentDetector = new SentenceDetectorME(sentModel);
    this.tokenizer = new TokenizerME(tokenModel);
  }
View Full Code Here
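Most of the snippets above create a fresh TokenizerME per constructor call or request. One reason for that pattern: the model objects (TokenizerModel, SentenceModel, ...) can be loaded once and shared, while the *ME tools keep per-call state and are not thread-safe. A hedged sketch of sharing one model while giving each thread its own TokenizerME:

import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public class SharedModelTokenizer {

    // One TokenizerME per thread, all built from the same shared model.
    private final ThreadLocal<TokenizerME> tokenizer;

    public SharedModelTokenizer(final TokenizerModel model) {
        this.tokenizer = ThreadLocal.withInitial(() -> new TokenizerME(model));
    }

    public String[] tokenize(String sentence) {
        return tokenizer.get().tokenize(sentence);
    }
}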
