SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));
Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);
NameFinderME finder = new NameFinderME(nameFinderModel);
Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
Map<String,List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String,List<NameOccurrence>>();
for (int i = 0; i < sentenceSpans.length; i++) {
String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();
// build a context by concatenating three sentences to be used for
// similarity ranking / disambiguation + contextual snippet in the
// extraction structure
List<String> contextElements = new ArrayList<String>();
if (i > 0) {
CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
contextElements.add(previousSentence.toString().trim());
}
contextElements.add(sentence.toString().trim());
if (i + 1 < sentenceSpans.length) {
CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
contextElements.add(nextSentence.toString().trim());
}
String context = StringUtils.join(contextElements, " ");
// extract the names in the current sentence and
// store them together with the current context
Span[] tokenSpans = tokenizer.tokenizePos(sentence);
String[] tokens = Span.spansToStrings(tokenSpans, sentence);
Span[] nameSpans = finder.find(tokens);
double[] probs = finder.probs();
String[] names = Span.spansToStrings(nameSpans, tokens);
//int lastStartPosition = 0;
for (int j = 0; j < names.length; j++) {
String name = names[j];
Double confidence = 1.0;
for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
confidence *= probs[k];
}
int start = tokenSpans[nameSpans[j].getStart()].getStart();
int absoluteStart = sentenceSpans[i].getStart() + start;
int absoluteEnd = absoluteStart + name.length();
NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, context,
confidence);
List<NameOccurrence> occurrences = nameOccurrences.get(name);
if (occurrences == null) {
occurrences = new ArrayList<NameOccurrence>();
}
occurrences.add(occurrence);
nameOccurrences.put(name, occurrences);
}
}
finder.clearAdaptiveData();
log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
return nameOccurrences;
}