log.debug("computeEnhancements for ContentPart {} of ContentItem {} language {} text={}",
    new Object[]{ contentPart.getKey(), ci.getUri().getUnicodeString(),
        language, StringUtils.abbreviate(text, 100) });
//first get the language specific models
Tokenizer tokenizer = initTokenizer(language);
SentenceDetector sentenceDetector = initSentence(language);
POSTaggerME posTagger;
if(sentenceDetector != null){ //sentence detection is a requirement for POS tagging
    posTagger = initTagger(language);
} else {
    posTagger = null;
}
ChunkerME chunker;
if(posTagger != null && useChunker){ //POS tags are a requirement for chunking
    chunker = initChunker(language);
} else {
    chunker = null;
}
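//cache suggestions, so that a lookup for the same name is only
//performed once while processing this text (assumption based on
//the Map signature: entries are keyed by the searched name)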
Map<String,Suggestion> suggestionCache = new TreeMap<String,Suggestion>();
if(sentenceDetector != null){
    //replace double line breaks with a dot, so that empty lines are
    //treated as sentence borders. ".\n" has the same length as "\n\n",
    //therefore character offsets into the parsed text stay valid.
    text = text.replaceAll("\\n\\n", ".\n");
    Span[] sentenceSpans = sentenceDetector.sentPosDetect(text);
    for (int i = 0; i < sentenceSpans.length; i++) {
        String sentence = sentenceSpans[i].getCoveredText(text).toString();
        Span[] tokenSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = getTokensForSpans(sentence, tokenSpans);
        String[] pos;
        double[] posProbs;
        if(posTagger != null){
            pos = posTagger.tag(tokens);
            posProbs = posTagger.probs();
        } else {
            pos = null;
            posProbs = null;
        }
        Span[] chunkSpans;
        double[] chunkProbs;
        if(chunker != null){
            chunkSpans = chunker.chunkAsSpans(tokens, pos);
            chunkProbs = chunker.probs();
        } else {
            chunkSpans = null;
            chunkProbs = null;
        }
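        //process this sentence: look up candidate entities for the
        //tokens (and chunks, if available) and collect the suggestions
        //(assumed contract of enhance(..); the actual writing of the
        //enhancements happens after the loop)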
        enhance(suggestionCache, site, ci, language, //the site, metadata and language
            sentenceSpans[i].getStart(), sentence,   //offset and sentence
            tokenSpans, tokens,                      //the tokens
            pos, posProbs,                           //the POS tags (might be null)
            chunkSpans, chunkProbs);                 //the chunks (might be null)
    }
} else {
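    //no sentence detector available for this language:
    //process the whole text as a single block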
    Span[] tokenSpans = tokenizer.tokenizePos(text);
    String[] tokens = getTokensForSpans(text, tokenSpans);
    enhance(suggestionCache, site, ci, language, 0, text, tokenSpans, tokens,
        null, null, null, null);
}
//finally write the entity enhancements