package com.lulu.WordCloud;

import java.util.List;
import java.util.Properties;

import org.apache.commons.lang.StringUtils;

import cue.lang.Counter;
import cue.lang.stop.StopWords;

import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
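
/**
 * Wraps a Stanford CoreNLP pipeline to produce a frequency count of lemmas
 * for a document, skipping stop words and tokens that contain anything other
 * than letters, digits, and spaces.
 */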
class StanfordLemmatizer {

    protected StanfordCoreNLP pipeline;

    public StanfordLemmatizer() {
        // Create StanfordCoreNLP object properties, with POS tagging
        // (required for lemmatization) and lemmatization
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma");

        // StanfordCoreNLP loads a lot of models, so you probably
        // only want to do this once per execution
        this.pipeline = new StanfordCoreNLP(props);
    }
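
    /**
     * Annotates {@code documentText} and counts the lemma of every token
     * that survives stop-word filtering. A null {@code stopWords} disables
     * the filtering entirely.
     */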
    public Counter<String> lemmatize(String documentText, StopWords stopWords) {
        Counter<String> lemmas = new Counter<String>();

        // Create an empty Annotation with just the given text
        Annotation document = new Annotation(documentText);

        // Run all annotators on this text
        this.pipeline.annotate(document);

        // Iterate over all of the sentences found
        List<CoreMap> sentences = document.get(SentencesAnnotation.class);
        for (CoreMap sentence : sentences) {
            // Iterate over all tokens in the sentence
            for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                String word = token.originalText();
                // Treat a null StopWords as "no filtering"; the original
                // condition silently produced an empty counter in that case
                if (stopWords == null || !stopWords.isStopWord(word)) {
                    // Note the lemma in the counter, ignoring tokens that
                    // contain punctuation or other non-alphanumeric characters
                    String lemma = token.get(LemmaAnnotation.class);
                    if (StringUtils.isAlphanumericSpace(lemma)) {
                        lemmas.note(lemma);
                    }
                }
            }
        }
        return lemmas;
    }
}
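
// A minimal usage sketch, not part of the original class: it builds the
// (expensive) pipeline once and prints the most frequent lemmas. The demo
// class name and sample sentence are illustrative; StopWords.English,
// Counter.getMostFrequent(int), and Counter.getCount(T) are assumed from
// the cue.language library.
class StanfordLemmatizerDemo {
    public static void main(String[] args) {
        StanfordLemmatizer lemmatizer = new StanfordLemmatizer();
        Counter<String> lemmas = lemmatizer.lemmatize(
                "The cats were chasing mice, and the dogs chased cats.",
                StopWords.English);
        // Print up to ten of the most frequent lemmas with their counts
        for (String lemma : lemmas.getMostFrequent(10)) {
            System.out.println(lemma + "\t" + lemmas.getCount(lemma));
        }
    }
}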