Package com.lulu.WordCloud

Source Code of com.lulu.WordCloud.StanfordLemmatizer

package com.lulu.WordCloud;

import java.util.List;
import java.util.Properties;

import org.apache.commons.lang.StringUtils;

import cue.lang.Counter;
import cue.lang.stop.StopWords;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

class StanfordLemmatizer {

    protected StanfordCoreNLP pipeline;

    public StanfordLemmatizer() {
        // Create StanfordCoreNLP object properties, with POS tagging
        // (required for lemmatization), and lemmatization
        Properties props;
        props = new Properties();
        props.put("annotators", "tokenize, ssplit, pos, lemma");

        // StanfordCoreNLP loads a lot of models, so you probably
        // only want to do this once per execution
        this.pipeline = new StanfordCoreNLP(props);
    }

    public Counter<String> lemmatize(String documentText, StopWords stopWords) {
        Counter<String> lemmas = new Counter<String>();

        // create an empty Annotation just with the given text
        Annotation document = new Annotation(documentText);

        // run all Annotators on this text
        this.pipeline.annotate(document);

        // Iterate over all of the sentences found
        List<CoreMap> sentences = document.get(SentencesAnnotation.class);
        for (CoreMap sentence : sentences) {
            // Iterate over all tokens in a sentence
            for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                // Retrieve and add the lemma for each word into the
                // list of lemmas
                String word = token.originalText();
                if (stopWords != null && !stopWords.isStopWord(word)) {
                    String lemma = token.get(LemmaAnnotation.class);
                    if (StringUtils.isAlphanumericSpace(lemma)) {
                        lemmas.note(lemma);
                    }
                }
            }
        }

        return lemmas;
    }
}
TOP

Related Classes of com.lulu.WordCloud.StanfordLemmatizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.