/**
 * Runs an OpenNLP {@link NameFinderME} over the analysed text, sentence by
 * sentence, and collects every detected name together with its occurrences.
 * <p>
 * This is the variant that relies on explicit sentence boundaries so that the
 * heading / paragraph structure of an HTML or PDF document converted to text
 * is reflected. If the analysed text carries no sentence annotations, the
 * whole text is processed as a single section.
 * <p>
 * For every detected name this method
 * <ul>
 * <li>records a {@link NameOccurrence} holding the name, its character
 *     offsets, the type of the mapped {@link NerTag}, a context snippet
 *     (previous + current + next sentence) and a confidence value, and</li>
 * <li>adds a {@link Chunk} covering the same offsets to the analysed text,
 *     annotated with {@code NER_ANNOTATION}.</li>
 * </ul>
 * The confidence is the product of the probabilities the finder reports for
 * the tokens that make up the name.
 *
 * @param nameFinderModel the model used to create the name finder
 * @param at the analysed text providing sentences and tokens; chunks with
 *        NER annotations are added to it as a side effect
 * @param language the language of the text (not read by this method)
 * @return the occurrences grouped by the surface form of the name, in the
 *         order in which each distinct name was first detected; never
 *         {@code null}
 */
protected Map<String,List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
        AnalysedText at, String language) {
    NameFinderME finder = new NameFinderME(nameFinderModel);
    Map<String,List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String,List<NameOccurrence>>();

    // copy the sentence sections so that they can be accessed by index
    List<Section> sentences = new ArrayList<Section>();
    AnalysedTextUtils.appandToList(at.getSentences(), sentences);
    if (sentences.isEmpty()) { //no sentence annotations
        sentences.add(at); //process as a single section
    }

    // all offsets used below are relative to the full text
    final String text = at.getSpan();
    final int lastSentence = sentences.size() - 1;

    for (int i = 0; i <= lastSentence; i++) {
        Section current = sentences.get(i);
        // Context used for similarity ranking / disambiguation and as the
        // contextual snippet of the occurrence: from the start of the
        // previous sentence to the end of the next one (clamped at the
        // first / last sentence).
        String context = text.substring(
            sentences.get(Math.max(0, i - 1)).getStart(),
            sentences.get(Math.min(lastSentence, i + 1)).getEnd());

        // collect the tokens of the current sentence and their surface forms
        List<Token> tokens = new ArrayList<Token>(32);
        List<String> words = new ArrayList<String>(32);
        Iterator<Token> tokenIt = current.getTokens();
        while (tokenIt.hasNext()) {
            Token token = tokenIt.next();
            tokens.add(token);
            words.add(token.getSpan());
        }

        Span[] nameSpans = finder.find(words.toArray(new String[words.size()]));
        // probabilities for the tokens of the sentence just processed
        double[] probs = finder.probs();

        for (int j = 0; j < nameSpans.length; j++) {
            Span nameSpan = nameSpans[j];
            // Span indexes are token positions with an exclusive end; map
            // them to character offsets within the full text.
            int start = tokens.get(nameSpan.getStart()).getStart();
            int end = tokens.get(nameSpan.getEnd() - 1).getEnd();
            String name = text.substring(start, end);

            // primitive accumulator: avoids boxing on every multiplication
            double confidence = 1.0;
            for (int k = nameSpan.getStart(); k < nameSpan.getEnd(); k++) {
                confidence *= probs[k];
            }

            // NOTE(review): nerTag is dereferenced without a null check;
            // confirm that config.getNerTag(..) never returns null for the
            // types emitted by the model.
            NerTag nerTag = config.getNerTag(nameSpan.getType());

            //create the occurrence for writing fise:TextAnnotations
            NameOccurrence occurrence = new NameOccurrence(name, start, end, nerTag.getType(),
                context, confidence);
            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                // first occurrence of this name: register its list once
                occurrences = new ArrayList<NameOccurrence>();
                nameOccurrences.put(name, occurrences);
            }
            occurrences.add(occurrence);

            //add also the NerAnnotation to the AnalysedText
            Chunk chunk = at.addChunk(start, end);
            //TODO: build AnnotationModel based on the configured Mappings
            chunk.addAnnotation(NER_ANNOTATION, Value.value(nerTag, confidence));
        }
    }
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}