* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    // Obtain the AnalysedText content part for the parsed ContentItem
    final AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
    String language = getLanguage(this,ci,false);
    // Guard: this engine only handles Japanese ("ja" or a "ja-*" variant).
    // canEnhance(..) is expected to have filtered other languages already,
    // so reaching this branch indicates a bug in the EnhancementJobManager.
    if(!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
        throw new IllegalStateException("The detected language is NOT 'ja'! "
            + "As this is also checked within the #canEnhance(..) method this "
            + "indicates an Bug in the used EnhancementJobManager implementation. "
            + "Please report this on the dev@apache.stanbol.org or create an "
            + "JIRA issue about this.");
    }
    //start with the Tokenizer
    TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
    //build the analyzing chain by adding all TokenFilters
    for(TokenFilterFactory filterFactory : filterFactories){
        tokenStream = filterFactory.create(tokenStream);
    }
    //Try to extract sentences based on POS tags ...
    int sentStartOffset = -1; // char offset of the current sentence start; -1 = no open sentence
    //NER data
    List<NerData> nerList = new ArrayList<NerData>();
    int nerSentIndex = 0; //the next index where the NerData.context need to be set
    NerData ner = null; // the currently open named entity (null when none is open)
    OffsetAttribute offset = null; // kept after the loop to close the last sentence
    try {
        tokenStream.reset(); //required with Solr 4
        while (tokenStream.incrementToken()){
            // NOTE(review): addAttribute(..) is used as a getter here — Lucene
            // returns the already-registered attribute instance on repeat calls,
            // so calling it inside the loop is redundant but harmless; confirm.
            offset = tokenStream.addAttribute(OffsetAttribute.class);
            Token token = at.addToken(offset.startOffset(), offset.endOffset());
            //Get the POS attribute and init the PosTag
            PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
            PosTag posTag = POS_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if(posTag == null){
                // unmapped POS string: fall back to (and cache) an ad-hoc PosTag
                posTag = adhocTags.get(posAttr.getPartOfSpeech());
                if(posTag == null){
                    posTag = new PosTag(posAttr.getPartOfSpeech());
                    adhocTags.put(posAttr.getPartOfSpeech(), posTag);
                    log.warn(" ... missing PosTag mapping for {}",posAttr.getPartOfSpeech());
                }
            }
            //Sentence detection by POS tag
            if(sentStartOffset < 0){ //the last token was a sentence ending
                sentStartOffset = offset.startOffset();
            }
            // a token tagged Pos.Point (sentence-final punctuation) closes the sentence
            if(posTag.hasPos(Pos.Point)) {
                Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
                //add the sentence as context to the NerData instances
                while(nerSentIndex < nerList.size()){
                    nerList.get(nerSentIndex).context = sent.getSpan();
                    nerSentIndex++;
                }
                sentStartOffset = -1;
            }
            //POS
            token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
            //NER
            NerTag nerTag = NER_TAG_SET.getTag(posAttr.getPartOfSpeech());
            // close the currently open entity when the NER tag ends or its type changes
            if(ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))){
                //write NER annotation
                Chunk chunk = at.addChunk(ner.start, ner.end);
                chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
                //NOTE that the fise:TextAnnotation are written later based on the nerList
                //clean up
                ner = null;
            }
            if(nerTag != null){
                if(ner == null){ // start a new named entity at the current token
                    ner = new NerData(nerTag, offset.startOffset());
                    nerList.add(ner);
                }
                ner.end = offset.endOffset(); // extend the open entity over this token
            }
            // morphological data (base form / lemma) if provided by the token stream
            BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
            MorphoFeatures morpho = null;
            if(baseFormAttr != null && baseFormAttr.getBaseForm() != null){
                morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
                morpho.addPos(posTag); //and add the posTag
            }
            InflectionAttribute inflectionAttr = tokenStream.addAttribute(InflectionAttribute.class);
            // NOTE(review): the inflection form/type are read but the return values
            // are discarded — looks like dead code or an unfinished feature; confirm.
            inflectionAttr.getInflectionForm();
            inflectionAttr.getInflectionType();
            if(morpho != null){ //if present add the morpho
                token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
            }
        }
        //we still need to write the last sentence
        Sentence lastSent = null;
        if(offset != null && sentStartOffset >= 0 && offset.endOffset() > sentStartOffset){
            lastSent = at.addSentence(sentStartOffset, offset.endOffset());
        }
        //and set the context off remaining named entities
        while(nerSentIndex < nerList.size()){
            if(lastSent != null){
                nerList.get(nerSentIndex).context = lastSent.getSpan();
            } else { //no sentence detected
                nerList.get(nerSentIndex).context = at.getSpan();
            }
            nerSentIndex++;
        }
    } catch (IOException e) {
        throw new EngineException(this, ci, "Exception while reading from "
            + "AnalyzedText contentpart",e);
    } finally {
        // always release the underlying TokenStream, even on failure
        try {
            tokenStream.close();
        } catch (IOException e) {/* ignore */}
    }
    //finally write the NER annotations to the metadata of the ContentItem
    final MGraph metadata = ci.getMetadata();
    // write-lock the ContentItem while adding triples to its metadata graph
    ci.getLock().writeLock().lock();
    try {
        Language lang = new Language("ja");
        for(NerData nerData : nerList){
            // one fise:TextAnnotation per detected named entity
            UriRef ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(
                at.getSpan().substring(nerData.start, nerData.end),lang)));
            metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT,
                new PlainLiteralImpl(nerData.context, lang)));