Examples of opennlp.tools.tokenize.TokenizerME

opennlp.tools.tokenize.TokenizerME
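A Tokenizer for converting raw text into separated tokens. It uses Maximum Entropy to make its decisions. The features are loosely based off of Jeff Reynar's UPenn thesis "Topic Segmentation: Algorithms and Applications.", which is available from his homepage: <http://www.cis.upenn.edu/~jcreynar>.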
This tokenizer needs a statistical model to tokenize a text; it reproduces the tokenization observed in the training data used to create the model. The {@link TokenizerModel} class encapsulates the model and provides methods to create it from the binary representation.
A tokenizer instance is not thread safe. One tokenizer must be instantiated for each thread; these instances can share a single TokenizerModel instance to save memory.
To train a new model, the {@link #train(String,ObjectStream,boolean,TrainingParameters)} method can be used.
Sample usage:
InputStream modelIn; ... TokenizerModel model = new TokenizerModel(modelIn); Tokenizer tokenizer = new TokenizerME(model); String tokens[] = tokenizer.tokenize("A sentence to be tokenized."); @see Tokenizer @see TokenizerModel @see TokenSample

      model = modelResource.getModel();
    } catch (ResourceAccessException e) {
      throw new ResourceInitializationException(e);
    }


    tokenizer = new TokenizerME(model);
  }

View Full Code Here

  public void setTokenizer(FileSystem fs, Path p){
    try {
      FSDataInputStream in = fs.open(p);
      TokenizerModel model;
      model = new TokenizerModel(in);
      tokenizer = new TokenizerME(model);
    }
    catch (IOException e) {
      e.printStackTrace();
    }
  }

View Full Code Here

  public void setTokenizer(FileSystem fs, Path p){
    try {
      FSDataInputStream in = fs.open(p);
      TokenizerModel model;
      model = new TokenizerModel(in);
      tokenizer = new TokenizerME(model);
    } catch (IOException e) {
      e.printStackTrace();
      throw new RuntimeException("OpenNLPTokenizer model not available at " + p); 
    }
  }

View Full Code Here

  public void setTokenizer(FileSystem fs, Path p){
    try {
      FSDataInputStream in = fs.open(p);
      TokenizerModel model;
      model = new TokenizerModel(in);
      tokenizer = new TokenizerME(model);
    }
    catch (IOException e) {
      e.printStackTrace();
    }
  }

View Full Code Here

        Tokenizer tokenizer = null;
        if(language != null){
            try {
                TokenizerModel model = getTokenizerModel(language);
                if(model != null){
                    tokenizer = new TokenizerME(getTokenizerModel(language));
                }
            } catch (InvalidFormatException e) {
                log.warn("Unable to load Tokenizer Model for "+language+": " +
                    "Will use Simple Tokenizer instead",e);
            } catch (IOException e) {

View Full Code Here

    // the sentence detector and tokenizer constructors
    // take paths to their respective models
    SentenceDetectorME sdetector = new SentenceDetectorME(
        new SentenceModel(new FileInputStream(
            "models/en-sent.bin")));
    Tokenizer tokenizer = new TokenizerME(new TokenizerModel(
        new FileInputStream("models/en-token.bin")));


    // the parser takes the path to the parser models
    // directory and a few other options
    /*
     * boolean useTagDict = true; boolean useCaseInsensitiveTagDict = false;
     * int beamSize = opennlp.tools.parser.chunking.Parser.defaultBeamSize;
     * double advancePercentage =
     * opennlp.tools.parser.chunking.Parser.defaultAdvancePercentage;
     * opennlp.tools.parser.Parser parser = TreebankParser.getParser(
     * "models/parser", useTagDict, useCaseInsensitiveTagDict, beamSize,
     * advancePercentage);
     */Parser parser = ParserFactory.create(new ParserModel(
        new FileInputStream("models/en-parser-chunking.bin")),
        AbstractBottomUpParser.defaultBeamSize,
        AbstractBottomUpParser.defaultAdvancePercentage);


    // break a paragraph into sentences
    String[] sents = sdetector.sentDetect(paragraph.toString());


    // TODO handle paragraph (multiple sentences)
    String sent = sents[0];


    // tokenize brackets and parentheses by putting a space on either side.
    // this makes sure it doesn't get confused with output from the parser
    sent = untokenizedParenPattern1.matcher(sent).replaceAll("$1 $2");
    sent = untokenizedParenPattern2.matcher(sent).replaceAll("$1 $2");


    // get the tokenizer to break apart the sentence
    String[] tokens = tokenizer.tokenize(sent);


    // build a string to parse as well as a list of tokens
    StringBuffer sb = new StringBuffer();
    List<String> tokenList = new ArrayList<String>();
    for (int j = 0; j < tokens.length; j++) {

View Full Code Here

   * 
   * @throws IOException 
   */
  public ApacheExtractor() throws IOException {
    nameFinder = new NameFinderME(new TokenNameFinderModel(ApacheExtractor.class.getResourceAsStream(pathToNERModel)));
    tokenizer = new TokenizerME(new TokenizerModel(ApacheExtractor.class.getResourceAsStream(pathToTokenizerModel)));
        sentenceDetector = new SentenceDetectorME(new SentenceModel(ApacheExtractor.class.getResourceAsStream(pathToSentenceDetectorModel)));
  }

View Full Code Here

  public void setTokenizer(FileSystem fs, Path p){
    try {
      FSDataInputStream in = fs.open(p);
      TokenizerModel model;
      model = new TokenizerModel(in);
      tokenizer = new TokenizerME(model);
    } catch (IOException e) {
      e.printStackTrace();
      throw new RuntimeException("OpenNLPTokenizer model not available at " + p); 
    }
  }

View Full Code Here

        if(languageConfig.isLanguage(language)){
            String modelName = languageConfig.getParameter(language, PARAM_MODEL);
            if(modelName != null){
                try {
                    TokenizerModel model = openNlp.getModel(TokenizerModel.class, modelName, null);
                    return new TokenizerME(model).tokenize(label);
                } catch (Exception e) {
                    log.warn("Unable to load configured TokenizerModel '"+modelName
                        + "' for language '"+language
                        + "! Fallback to default Tokenizers",e);
                }

View Full Code Here

0 1 2

TOP

Related Classes of opennlp.tools.tokenize.TokenizerME

com.bericotech.clavin.extractor.ApacheExtractor

edu.washington.cs.knowitall.util.DefaultObjects

functionality.SentencesToTree

gate.opennlp.OpenNlpTokenizer

io.lumify.opennlpDictionary.OpenNLPDictionaryExtractorGraphPropertyWorker

io.lumify.opennlpme.OpenNLPMaximumEntropyExtractorGraphPropertyWorker

ivory.core.tokenize.OpenNLPTokenizer

net.sf.nlpshell.Main

opennlp.maxent.EventCollectorAsStream

opennlp.maxent.EventStream

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.