Package opennlp.tools.tokenize

Examples of opennlp.tools.tokenize.TokenizerME


      model = modelResource.getModel();
    } catch (ResourceAccessException e) {
      throw new ResourceInitializationException(e);
    }

    tokenizer = new TokenizerME(model);
  }
View Full Code Here


  public void setTokenizer(FileSystem fs, Path p){
    try {
      FSDataInputStream in = fs.open(p);
      TokenizerModel model;
      model = new TokenizerModel(in);
      tokenizer = new TokenizerME(model);
    }
    catch (IOException e) {
      e.printStackTrace();
    }
  }
View Full Code Here

  public void setTokenizer(FileSystem fs, Path p){
    try {
      FSDataInputStream in = fs.open(p);
      TokenizerModel model;
      model = new TokenizerModel(in);
      tokenizer = new TokenizerME(model);
    } catch (IOException e) {
      e.printStackTrace();
      throw new RuntimeException("OpenNLPTokenizer model not available at " + p);
    }
  }
View Full Code Here

  public void setTokenizer(FileSystem fs, Path p){
    try {
      FSDataInputStream in = fs.open(p);
      TokenizerModel model;
      model = new TokenizerModel(in);
      tokenizer = new TokenizerME(model);
    }
    catch (IOException e) {
      e.printStackTrace();
    }
  }
View Full Code Here

        Tokenizer tokenizer = null;
        if(language != null){
            try {
                TokenizerModel model = getTokenizerModel(language);
                if(model != null){
                    tokenizer = new TokenizerME(getTokenizerModel(language));
                }
            } catch (InvalidFormatException e) {
                log.warn("Unable to load Tokenizer Model for "+language+": " +
                    "Will use Simple Tokenizer instead",e);
            } catch (IOException e) {
View Full Code Here

    // the sentence detector and tokenizer constructors
    // take paths to their respective models
    SentenceDetectorME sdetector = new SentenceDetectorME(
        new SentenceModel(new FileInputStream(
            "models/en-sent.bin")));
    Tokenizer tokenizer = new TokenizerME(new TokenizerModel(
        new FileInputStream("models/en-token.bin")));

    // the parser takes the path to the parser models
    // directory and a few other options
    /*
     * boolean useTagDict = true; boolean useCaseInsensitiveTagDict = false;
     * int beamSize = opennlp.tools.parser.chunking.Parser.defaultBeamSize;
     * double advancePercentage =
     * opennlp.tools.parser.chunking.Parser.defaultAdvancePercentage;
     * opennlp.tools.parser.Parser parser = TreebankParser.getParser(
     * "models/parser", useTagDict, useCaseInsensitiveTagDict, beamSize,
     * advancePercentage);
     */Parser parser = ParserFactory.create(new ParserModel(
        new FileInputStream("models/en-parser-chunking.bin")),
        AbstractBottomUpParser.defaultBeamSize,
        AbstractBottomUpParser.defaultAdvancePercentage);

    // break a paragraph into sentences
    String[] sents = sdetector.sentDetect(paragraph.toString());

    // TODO handle paragraph (multiple sentences)
    String sent = sents[0];

    // tokenize brackets and parentheses by putting a space on either side.
    // this makes sure it doesn't get confused with output from the parser
    sent = untokenizedParenPattern1.matcher(sent).replaceAll("$1 $2");
    sent = untokenizedParenPattern2.matcher(sent).replaceAll("$1 $2");

    // get the tokenizer to break apart the sentence
    String[] tokens = tokenizer.tokenize(sent);

    // build a string to parse as well as a list of tokens
    StringBuffer sb = new StringBuffer();
    List<String> tokenList = new ArrayList<String>();
    for (int j = 0; j < tokens.length; j++) {
View Full Code Here

   *
   * @throws IOException
   */
  public ApacheExtractor() throws IOException {
    nameFinder = new NameFinderME(new TokenNameFinderModel(ApacheExtractor.class.getResourceAsStream(pathToNERModel)));
    tokenizer = new TokenizerME(new TokenizerModel(ApacheExtractor.class.getResourceAsStream(pathToTokenizerModel)));
        sentenceDetector = new SentenceDetectorME(new SentenceModel(ApacheExtractor.class.getResourceAsStream(pathToSentenceDetectorModel)));
  }
View Full Code Here

  public void setTokenizer(FileSystem fs, Path p){
    try {
      FSDataInputStream in = fs.open(p);
      TokenizerModel model;
      model = new TokenizerModel(in);
      tokenizer = new TokenizerME(model);
    } catch (IOException e) {
      e.printStackTrace();
      throw new RuntimeException("OpenNLPTokenizer model not available at " + p);
    }
  }
View Full Code Here

        if(languageConfig.isLanguage(language)){
            String modelName = languageConfig.getParameter(language, PARAM_MODEL);
            if(modelName != null){
                try {
                    TokenizerModel model = openNlp.getModel(TokenizerModel.class, modelName, null);
                    return new TokenizerME(model).tokenize(label);
                } catch (Exception e) {
                    log.warn("Unable to load configured TokenizerModel '"+modelName
                        + "' for language '"+language
                        + "! Fallback to default Tokenizers",e);
                }
View Full Code Here

TOP

Related Classes of opennlp.tools.tokenize.TokenizerME

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.