* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
AnalysedText at = getAnalysedText(this, ci, true);
String language = getLanguage(this, ci, true);
isLangaugeConfigured(this, languageConfiguration, language, true);
ChunkerME chunker = initChunker(language);
if(chunker == null){
return;
}
//init the Phrase TagSet
TagSet<PhraseTag> tagSet = tagSetRegistry.getTagSet(language);
if(tagSet == null){
}
if(tagSet == null){
log.warn("No Phrase TagSet registered for Language '{}'. Will build an "
+ "adhoc set based on encountered Tags!",language);
//for now only created to avoid checks for tagSet == null
//TODO: in future we might want to automatically create posModels based
//on tagged texts. However this makes no sense as long as we cannot
//persist TagSets.
tagSet = new TagSet<PhraseTag>("dummy", language);
}
//holds PhraseTags created for tags that were not part of the tagSet
//(will hold all PhraseTags in case no TagSet is registered for the language)
Map<String,PhraseTag> adhocTags = languageAdhocTags.get(language);
if(adhocTags == null){
adhocTags = new HashMap<String,PhraseTag>();
languageAdhocTags.put(language, adhocTags);
}
ci.getLock().writeLock().lock();
try {
Iterator<? extends Section> sentences = at.getSentences();
if(!sentences.hasNext()){ //no sentences ... iterate over the whole text
sentences = Collections.singleton(at).iterator();
}
List<String> tokenTextList = new ArrayList<String>(64);
List<String> posList = new ArrayList<String>(64);
List<Token> tokenList = new ArrayList<Token>(64);
//process each sentence separately
while(sentences.hasNext()){
// (1) get Tokens and POS information for the sentence
Section sentence = sentences.next();
Iterator<Token> tokens = sentence.getTokens();
while(tokens.hasNext()){
Token token = tokens.next();
tokenList.add(token);
tokenTextList.add(token.getSpan());
Value<PosTag> posValue = token.getAnnotation(POS_ANNOTATION);
if(posValue == null){
throw new EngineException("Missing POS value for Token '"
+ token.getSpan()+" of ContentItem "+ci.getUri()
+ "(Sentence: '"+sentence.getSpan()+"'). This may "
+ "indicate that a POS tagging Engine is missing in "
+ "the EnhancementChain or that the used POS tagging "
+ "does not provide POS tags for each token!");
} else {
posList.add(posValue.value().getTag());
}
}
String[] tokenStrings = tokenTextList.toArray(new String[tokenTextList.size()]);
String[] tokenPos = posList.toArray(new String[tokenTextList.size()]);
if(log.isTraceEnabled()){
log.trace("Tokens: {}"+Arrays.toString(tokenStrings));
}
tokenTextList.clear(); //free memory
posList.clear(); //free memory
// (2) Chunk the sentence
String[] chunkTags = chunker.chunk(tokenStrings, tokenPos);
double[] chunkProb = chunker.probs();
if(log.isTraceEnabled()){
log.trace("Chunks: {}"+Arrays.toString(chunkTags));
}
tokenStrings = null; //free memory
tokenPos = null; //free memory
// (3) Process the results and write the Annotations
double chunkProps = 0;
int chunkTokenCount = 0;
PhraseTag tag = null;
int i;
/*
* This assumes:
* - 'B-{tag}' ... for start of a new chunk
* - '???' ... anything other for continuing the current chunk
* - 'O' ... no chunk (ends current chunk)
*/
for(i=0;i<tokenList.size();i++){
boolean start = chunkTags[i].charAt(0) == 'B';
boolean end = tag != null && (start || chunkTags[i].charAt(0) == 'O');
if(end){ //add the current phrase
//add at AnalysedText level, because offsets are absolute
//NOTE we are already at the next token when we detect the end
Chunk chunk = at.addChunk(
tokenList.get(i-chunkTokenCount).getStart(),
tokenList.get(i-1).getEnd());
chunk.addAnnotation(PHRASE_ANNOTATION,
new Value<PhraseTag>(tag,
chunkProps/(double)chunkTokenCount));
//reset the state
tag = null;
chunkTokenCount = 0;
chunkProps = 0;
}
if(start){ //create the new tag
tag = getPhraseTag(tagSet,adhocTags,
chunkTags[i].substring(2), language); //skip 'B-'
}
if(tag != null){ //count this token for the current chunk
chunkProps = chunkProps + chunkProb[i];
chunkTokenCount++;
}
}
if(tag != null){
Chunk chunk = at.addChunk(
tokenList.get(i-chunkTokenCount).getStart(),
tokenList.get(i-1).getEnd());
chunk.addAnnotation(PHRASE_ANNOTATION,
new Value<PhraseTag>(tag,
chunkProps/(double)chunkTokenCount));