Package opennlp.tools.tokenize

Examples of opennlp.tools.tokenize.Tokenizer


  public ObjectStream<NameSample> create(String[] args) {

    Parameters params = ArgumentParser.parse(args, Parameters.class);

    TokenizerModel tokenizerModel = new TokenizerModelLoader().load(params.getTokenizerModel());
    Tokenizer tokenizer = new TokenizerME(tokenizerModel);

    ObjectStream<String> mucDocStream = new FileToStringSampleStream(
        new DirectorySampleStream(params.getData(), new FileFilter() {

          public boolean accept(File file) {
            // ...
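For orientation, a minimal self-contained sketch of the same pattern follows: load a TokenizerModel from disk and run a TokenizerME over a sentence. The model path "en-token.bin" is an assumption; any pre-trained OpenNLP tokenizer model works the same way.

  import java.io.FileInputStream;
  import java.io.IOException;
  import java.io.InputStream;
  import java.util.Arrays;

  import opennlp.tools.tokenize.Tokenizer;
  import opennlp.tools.tokenize.TokenizerME;
  import opennlp.tools.tokenize.TokenizerModel;
  import opennlp.tools.util.Span;

  public class TokenizeDemo {

    public static void main(String[] args) throws IOException {
      // assumed local path to a pre-trained tokenizer model
      InputStream modelIn = new FileInputStream("en-token.bin");
      try {
        Tokenizer tokenizer = new TokenizerME(new TokenizerModel(modelIn));
        String sentence = "A maximum-entropy tokenizer splits punctuation, too.";
        // tokenize() returns the token strings ...
        String[] tokens = tokenizer.tokenize(sentence);
        System.out.println(Arrays.toString(tokens));
        // ... while tokenizePos() returns their character offsets in the input
        for (Span span : tokenizer.tokenizePos(sentence)) {
          System.out.println(span.getCoveredText(sentence) + " " + span);
        }
      } finally {
        modelIn.close();
      }
    }
  }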


    }
    else {
      sentDetector = new NewlineSentenceDetector();
    }

    Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE;

    if (params.getTokenizerModel() != null) {
      try {
        tokenizer = new TokenizerME(new TokenizerModel(params.getTokenizerModel()));
      } catch (IOException e) {
        // ...
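When no model is configured, the code falls back to the rule-based WhitespaceTokenizer. The practical difference between the two model-free tokenizers is punctuation handling; the expected results in the comments of this sketch follow from their documented behavior (WhitespaceTokenizer splits on whitespace only, SimpleTokenizer also splits where the character class changes):

  String s = "Hello, world!";
  // yields ["Hello,", "world!"]
  String[] byWhitespace = WhitespaceTokenizer.INSTANCE.tokenize(s);
  // yields ["Hello", ",", "world", "!"]
  String[] byCharClass = SimpleTokenizer.INSTANCE.tokenize(s);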

        log.debug("computeEnhancements for ContentPart {} of ContentItem {} language {} text={}",
            new Object [] { contentPart.getKey(),ci.getUri().getUnicodeString(),
                            language, StringUtils.abbreviate(text, 100) });
       
        //first get the models
        Tokenizer tokenizer = initTokenizer(language);
        SentenceDetector sentenceDetector = initSentence(language);
        POSTaggerME posTagger;
        if(sentenceDetector != null){ //sentence detection is a requirement for POS tagging
            posTagger = initTagger(language);
        } else {
            posTagger = null;
        }
        ChunkerME chunker;
        if(posTagger != null && useChunker){ //POS tags are a requirement for chunking
            chunker = initChunker(language);
        } else {
            chunker = null;
        }
        Map<String,Suggestion> suggestionCache = new TreeMap<String,Suggestion>();
        if(sentenceDetector != null){
            //replace double line breaks with ".\n" (same length, so span
            //offsets stay valid) to force sentence breaks at empty lines
            text = text.replaceAll("\\n\\n", ".\n");
            Span[] sentenceSpans = sentenceDetector.sentPosDetect(text);
            for (int i = 0; i < sentenceSpans.length; i++) {
                String sentence = sentenceSpans[i].getCoveredText(text).toString();
                Span[] tokenSpans = tokenizer.tokenizePos(sentence);
                String[] tokens = getTokensForSpans(sentence, tokenSpans);
                String[] pos;
                double[] posProbs;
                if(posTagger != null){
                    pos = posTagger.tag(tokens);
                    posProbs = posTagger.probs();
                } else {
                    pos = null;
                    posProbs = null;
                }
                Span[] chunkSpans;
                double[] chunkProbs;
                if(chunker != null){
                    chunkSpans = chunker.chunkAsSpans(tokens, pos);
                    chunkProbs = chunker.probs();
                } else {
                    chunkSpans = null;
                    chunkProbs = null;
                }
                enhance(suggestionCache,site,ci,language, //the site, metadata and language
                    sentenceSpans[i].getStart(),sentence, //offset and sentence
                    tokenSpans,tokens, //the tokens
                    pos,posProbs, //the POS tags (might be null)
                    chunkSpans,chunkProbs); //the chunks (might be null)
            }
        } else {
            Span[] tokenSpans = tokenizer.tokenizePos(text);
            String[] tokens = getTokensForSpans(text, tokenSpans);
            enhance(suggestionCache,site,ci,language,0,text,tokenSpans,tokens,
                null,null,null,null);
        }
        //finally write the entity enhancements
        // ...
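Two details of this pipeline are easy to miss. First, the spans returned by tokenizePos(sentence) are relative to the sentence, which is why enhance(...) is also passed sentenceSpans[i].getStart(): the caller needs it to translate token spans into document offsets. Second, the getTokensForSpans(...) helper is not shown, but the stock Span.spansToStrings(spans, text) performs the same mapping. A minimal sketch of the offset arithmetic, assuming an already initialized detector and tokenizer:

  // translate sentence-relative token spans into document-absolute offsets
  static void printAbsoluteOffsets(SentenceDetector detector, Tokenizer tokenizer, String text) {
    for (Span sentSpan : detector.sentPosDetect(text)) {
      String sentence = sentSpan.getCoveredText(text).toString();
      for (Span tokenSpan : tokenizer.tokenizePos(sentence)) {
        int start = sentSpan.getStart() + tokenSpan.getStart();
        int end = sentSpan.getStart() + tokenSpan.getEnd();
        System.out.println(text.substring(start, end) + " [" + start + "," + end + ")");
      }
    }
  }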

    /**
     * @param language the language to create the {@link Tokenizer} for
     * @return the tokenizer for that language
     */
    private Tokenizer initTokenizer(String language) {
        Tokenizer tokenizer;
        if(useSimpleTokenizer){
            tokenizer = SimpleTokenizer.INSTANCE;
        } else {
            tokenizer = openNLP.getTokenizer(language);
        }
        return tokenizer;
    }

        SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));

        Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);

        NameFinderME finder = new NameFinderME(nameFinderModel);
        Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
        Map<String,List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String,List<NameOccurrence>>();
        for (int i = 0; i < sentenceSpans.length; i++) {
            String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();

            // build a context by concatenating three sentences, used for
            // similarity ranking / disambiguation and as the contextual
            // snippet in the extraction structure
            List<String> contextElements = new ArrayList<String>();
            if (i > 0) {
                CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
                contextElements.add(previousSentence.toString().trim());
            }
            contextElements.add(sentence);
            if (i + 1 < sentenceSpans.length) {
                CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
                contextElements.add(nextSentence.toString().trim());
            }
            String context = StringUtils.join(contextElements, " ");

            // extract the names in the current sentence and
            // store them together with the current context
            Span[] tokenSpans = tokenizer.tokenizePos(sentence);
            String[] tokens = Span.spansToStrings(tokenSpans, sentence);
            Span[] nameSpans = finder.find(tokens);
            double[] probs = finder.probs();
            String[] names = Span.spansToStrings(nameSpans, tokens);
            //int lastStartPosition = 0;
            // ...
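Note that NameFinderME.find(tokens) returns spans over token indices, not characters. To locate a name in the sentence itself, the token-index span has to be mapped through the character-level spans from tokenizePos(); a sketch of that mapping (assuming at least one name was found):

  // nameSpan indexes tokens; tokenSpans holds their character offsets
  Span nameSpan = nameSpans[0];
  int charStart = tokenSpans[nameSpan.getStart()].getStart();
  int charEnd = tokenSpans[nameSpan.getEnd() - 1].getEnd();
  String surfaceForm = sentence.substring(charStart, charEnd);

The finder also accumulates adaptive data across calls, so finder.clearAdaptiveData() should be invoked after each document to keep one document's names from influencing the next.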

    /**
     * @param language the language or <code>null</code> to build a
     * {@link SimpleTokenizer}
     * @return the {@link Tokenizer} for the parsed language.
     */
    public Tokenizer getTokenizer(String language) {
        Tokenizer tokenizer = null;
        if(language != null){
            try {
                TokenizerModel model = getTokenizerModel(language);
                if(model != null){
                    tokenizer = new TokenizerME(model);
                    // ...
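Per its javadoc, the method yields the model-free SimpleTokenizer when the language is null or no model is available. The tail is not shown above; a likely completion, offered only as a sketch:

        if(tokenizer == null){
            //no model for this language: fall back to SimpleTokenizer
            tokenizer = SimpleTokenizer.INSTANCE;
        }
        return tokenizer;
    }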

  public double[] categorize(String[] text) {
    return model.eval(mContextGenerator.getContext(text));
  }

  public double[] categorize(String documentText) {
    Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
    return categorize(tokenizer.tokenize(documentText));
  }
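This overload pairs the model-based categorizer with the model-free SimpleTokenizer, so callers can pass raw text directly. A minimal usage sketch for the document categorizer; the model path "en-doccat.bin" is an assumption:

  // load a trained document-categorizer model (path is an assumption)
  DoccatModel model = new DoccatModel(new FileInputStream("en-doccat.bin"));
  DocumentCategorizerME categorizer = new DocumentCategorizerME(model);
  double[] outcomes = categorizer.categorize("Some document text.");
  String best = categorizer.getBestCategory(outcomes);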

    // the sentence detector and tokenizer constructors
    // take streams over their respective model files
    SentenceDetectorME sdetector = new SentenceDetectorME(
        new SentenceModel(new FileInputStream(
            "models/en-sent.bin")));
    Tokenizer tokenizer = new TokenizerME(new TokenizerModel(
        new FileInputStream("models/en-token.bin")));

    // the parser is created from its model file; the commented-out
    // block below shows the older TreebankParser API with explicit options
    /*
     * boolean useTagDict = true; boolean useCaseInsensitiveTagDict = false;
     * int beamSize = opennlp.tools.parser.chunking.Parser.defaultBeamSize;
     * double advancePercentage =
     * opennlp.tools.parser.chunking.Parser.defaultAdvancePercentage;
     * opennlp.tools.parser.Parser parser = TreebankParser.getParser(
     * "models/parser", useTagDict, useCaseInsensitiveTagDict, beamSize,
     * advancePercentage);
     */
    Parser parser = ParserFactory.create(new ParserModel(
        new FileInputStream("models/en-parser-chunking.bin")),
        AbstractBottomUpParser.defaultBeamSize,
        AbstractBottomUpParser.defaultAdvancePercentage);

    // break a paragraph into sentences
    String[] sents = sdetector.sentDetect(paragraph.toString());

    // TODO handle paragraph (multiple sentences)
    String sent = sents[0];

    // tokenize brackets and parentheses by putting a space on either side
    // so they are not confused with the bracketed output of the parser
    sent = untokenizedParenPattern1.matcher(sent).replaceAll("$1 $2");
    sent = untokenizedParenPattern2.matcher(sent).replaceAll("$1 $2");

    // get the tokenizer to break apart the sentence
    String[] tokens = tokenizer.tokenize(sent);

    // build a string to parse as well as a list of tokens
    StringBuffer sb = new StringBuffer();
    List<String> tokenList = new ArrayList<String>();
    for (int j = 0; j < tokens.length; j++) {
      // ...
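The remainder of the example assembles the string to parse token by token. For reference, the cmdline helper ParserTool.parseLine (shipped in opennlp-tools) bundles the same steps; a sketch reusing the parser and tokens from above:

  import opennlp.tools.cmdline.parser.ParserTool;
  import opennlp.tools.parser.Parse;

  // join the tokens with single spaces, parse, and print the best tree
  StringBuilder joined = new StringBuilder();
  for (String token : tokens) {
    if (joined.length() > 0) joined.append(' ');
    joined.append(token);
  }
  Parse[] parses = ParserTool.parseLine(joined.toString(), parser, 1);
  parses[0].show();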

        log.info("Encountered unmapped POS tag '{}' for language '{}'", tag, language);
        return posTag;
    }

    private List<Token> tokenize(Section section, String language) {
        Tokenizer tokenizer = getTokenizer(language);
        String text = section.getSpan();
        List<Token> tokens = new ArrayList<Token>(text.length()/5); //assume an average token length of 5 chars
        //tokenizePos returns spans relative to the section text, which is
        //exactly what section.addToken expects
        opennlp.tools.util.Span[] tokenSpans = tokenizer.tokenizePos(text);
        for(int i = 0; i < tokenSpans.length; i++){
            Token token = section.addToken(tokenSpans[i].getStart(), tokenSpans[i].getEnd());
            log.trace(" > add {}", token);
            tokens.add(token);
        }
        return tokens;
    }
