// Source code of org.apache.stanbol.enhancer.engines.opennlp.pos.services.OpenNlpPosTaggingEngine

/*
 * Copyright (c) 2012 Sebastian Schaffert
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */


package org.apache.stanbol.enhancer.engines.opennlp.pos.services;


import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText;


import java.util.ArrayList;
import java.util.Collections;
import java.util.Dictionary;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;


import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTagger;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.Sequence;


import org.apache.clerezza.rdf.core.UriRef;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.opennlp.OpenNLP;
import org.apache.stanbol.enhancer.engines.opennlp.pos.model.PosTagSetRegistry;
import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
import org.apache.stanbol.enhancer.nlp.model.Section;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.Span;
import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.model.tag.TagSet;
import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.osgi.framework.Constants;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


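/**
 * A part-of-speech (POS) tagging engine based on OpenNLP. Requires that the content item has a
 * text/plain part and a language that is enabled by the {@link #CONFIG_LANGUAGES} configuration
 * and for which an OpenNLP POS model is available. Adds POS annotations to the tokens of the
 * {@link AnalysedText} of the content item so that they can be used for further processing by
 * other engines.
 * 
 * @author Sebastian Schaffert
 */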


@Component(immediate = true, metatype = true, 
    configurationFactory = true, //allow multiple instances
    policy = ConfigurationPolicy.OPTIONAL) //create a default instance with the default configuration
@Service
@Properties(value={
        @Property(name= EnhancementEngine.PROPERTY_NAME,value="opennlp-pos"),
        @Property(name=OpenNlpPosTaggingEngine.CONFIG_LANGUAGES, value = {"*"},cardinality=Integer.MAX_VALUE),
        @Property(name=Constants.SERVICE_RANKING,intValue=-100) //give the default instance a ranking < 0
})
public class OpenNlpPosTaggingEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties {


    private static final Map<String,Object> SERVICE_PROPERTIES;
    static {
        Map<String,Object> props = new HashMap<String,Object>();
        props.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING, 
            ServiceProperties.ORDERING_NLP_POS);
        props.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE, 
            NlpProcessingRole.PartOfSpeachTagging);
        SERVICE_PROPERTIES = Collections.unmodifiableMap(props);
    }


    /**
     * Language configuration. Takes a list of ISO language codes of supported languages. Currently supported
     * are the languages given as default value.
     */
    public static final String CONFIG_LANGUAGES = "org.apache.stanbol.enhancer.pos.languages";


    /**
     * The parameter name used to configure the name of the OpenNLP model used for pos tagging
     */
    private static final String MODEL_NAME_PARAM = "model";




    private static Logger log = LoggerFactory.getLogger(OpenNlpPosTaggingEngine.class);


    //Langauge configuration
    private LanguageConfiguration languageConfig = new LanguageConfiguration(CONFIG_LANGUAGES,new String[]{"*"});
//    private Set<String> configuredLanguages;
//    private Set<String> excludedLanguages;
//    private boolean allowAll;


    @Reference
    private OpenNLP openNLP;
    /**
     * Provides known {@link TagSet}s used by OpenNLP
     */
    private PosTagSetRegistry tagSetRegistry = PosTagSetRegistry.getInstance();
    
    @Reference
    private AnalysedTextFactory analysedTextFactory;


    /**
     * Holds as key the languages and as values the ad-hoc (unmapped) phrase tags
     * for that languages.<p>
     * NOTE: Not synchronised as concurrent execution caused multiple adds will
     * only create some additional {@link PhraseTag} instances and not actual
     * problems.
     */
    private Map<String,Map<String,PosTag>> languageAdhocTags = new HashMap<String,Map<String,PosTag>>();
     
    /**
     * Indicate if this engine can enhance supplied ContentItem, and if it
     * suggests enhancing it synchronously or asynchronously. The
     * {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager} can force sync/async mode if desired, it is
     * just a suggestion from the engine.
     * <p/>
     * Returns ENHANCE_ASYNC in case there is a text/plain content part and a tagger for the language identified for
     * the content item, CANNOT_ENHANCE otherwise.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the introspecting process of the content item
     *          fails
     */
    @Override
    public int canEnhance(ContentItem ci) throws EngineException {
        // check if content is present
        Map.Entry<UriRef,Blob> entry = NlpEngineHelper.getPlainText(this, ci, false);
        if(entry == null || entry.getValue() == null) {
            return CANNOT_ENHANCE;
        }


        String language = getLanguage(this,ci,false);
        if(language == null) {
            return CANNOT_ENHANCE;
        }
        if(!languageConfig.isLanguage(language)){
            log.trace(" > can NOT enhance ContentItem {} because language {} is "
                + "not enabled by this engines configuration",ci,language);
            return CANNOT_ENHANCE;
        }


        if(getPOSTagger(language) == null) {
            log.trace(" > can NOT enhance ContentItem {} because no POSTagger is"
                    + "is present for language {}",ci,language);
            return CANNOT_ENHANCE;
        }


        log.trace(" > can enhance ContentItem {} with language {}",ci,language);
        return ENHANCE_ASYNC;
    }


    /**
     * Compute enhancements for supplied ContentItem. The results of the process
     * are expected to be stored in the metadata of the content item.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     * <p/>
     * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
     * stores it as a new part in the content item. The metadata is not changed.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the underlying process failed to work as
     *          expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
        String language = getLanguage(this, ci, true);
        
        POSTagger posTagger = getPOSTagger(language);
        if(posTagger == null){
            //this means that the POS tagger became unavailable in-between
            //the call to canEnhance and computeEnhancement
            throw new EngineException("PosTagger for langauge '"+language
                + "is not available."); 
        }
        TagSet<PosTag> tagSet = tagSetRegistry.getTagSet(language);
        if(tagSet == null){
            log.warn("No POS TagSet registered for Language '{}'. Will build an "
                    + "adhoc set based on encountered Tags!",language);
            //for now only created to avoid checks for tagSet == null
            //TODO: in future we might want to automatically create posModels based
            //on tagged texts. However this makes no sense as long we can not
            //persist TagSets.
            tagSet = new TagSet<PosTag>("dummy", language);
        }
        //holds PosTags created for POS tags that where not part of the posModel
        //(will hold all PosTags in case tagSet is NULL
        Map<String,PosTag> adhocTags = languageAdhocTags.get(language);
        if(adhocTags == null){
                adhocTags =  new HashMap<String,PosTag>();
                languageAdhocTags.put(language, adhocTags);
        }
        //(1) Sentence detection
        
        //Try to read existing Sentence Annotations
        Iterator<Sentence> sentences = at.getSentences();
        List<Section> sentenceList;
        if(!sentences.hasNext()){
            //if non try to detect sentences
            log.trace(" > detect sentences for {}",at);
            sentenceList = detectSentences(at,language);
        }
        if(sentences.hasNext()){ //check if we have detected sentences
            log.trace(" > use existing Sentence annotations for {}",at);
            sentenceList = new ArrayList<Section>();
            AnalysedTextUtils.appandToList(sentences, sentenceList);
        } else {
            //no sentence detected ... treat the whole text as a single sentence
            //TODO: maybe apply here a limit to the text size!
            log.trace(" > unable to detect Sentences for {} (langauge: {})",at,language);
            sentenceList = Collections.singletonList((Section)at);
        }
        
        //for all sentences (or the whole Text - if no sentences available)
        for(Section sentence : sentenceList){


            //(2) Tokenize Sentences
            
            List<Token> tokenList;
            //check if there are already tokens
            Iterator<Token> tokens = sentence.getTokens();
            if(!tokens.hasNext()){ //no tokens present -> tokenize
                log.trace(" > tokenize {}",sentence);
                tokenList = tokenize(sentence,language);
            } else { //use existing
                log.trace(" > use existing Tokens for {}",sentence);
                tokenList = new ArrayList<Token>(); //ensure an ArrayList is used
                AnalysedTextUtils.appandToList(tokens, tokenList);
            }
            
            //(3) POS Tagging
            posTag(tokenList, posTagger,tagSet,adhocTags,language);
            
        }
        if(log.isTraceEnabled()){
            logAnnotations(at);
        }
    }
    
    @Override
    public Map<String,Object> getServiceProperties() {
        return SERVICE_PROPERTIES;
    }
    
    
    private void logAnnotations(AnalysedText at){
        Iterator<Span> it = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Token));
        while(it.hasNext()){
            Span span = it.next();
            log.trace(" > {}",span);
            for(Value<PosTag> value : span.getAnnotations(POS_ANNOTATION)){
                log.trace("   - {}",value);
            }
        }
    }
    /**
     * POS tags the parsed tokens by using the pos tagger. Annotations are
     * added based on the posModel and already created adhoc tags.
     * @param tokenList
     * @param posTagger
     * @param posModel
     * @param adhocTags
     * @param language
     */
    private void posTag(List<Token> tokenList,
                        POSTagger posTagger,
                        TagSet<PosTag> posModel,
                        Map<String,PosTag> adhocTags, 
                        String language) {
        String[] tokenTexts = new String[tokenList.size()];
        for(int i=0;i<tokenList.size(); i++){
            tokenTexts[i] = tokenList.get(i).getSpan();
        }
        //get the topK POS tags and props and copy it over to the 2dim Arrays
        Sequence[] posSequences = posTagger.topKSequences(tokenTexts);
        //extract the POS tags and props for the current token from the
        //posSequences.
        //NOTE: Sequence includes always POS tags for all Tokens. If
        //      less then posSequences.length are available it adds the
        //      best match for all followings.
        //      We do not want such copies.
        PosTag[] actPos = new PosTag[posSequences.length];
        double[] actProp = new double[posSequences.length];
        for(int i=0;i<tokenTexts.length;i++){
            Token token = tokenList.get(i);
            boolean done = false;
            int j = 0;
            while( j < posSequences.length && !done){
                String p = posSequences[j].getOutcomes().get(i);
                done = j > 0 && p.equals(actPos[0].getTag());
                if(!done){
                    actPos[j] = getPosTag(posModel,adhocTags,p,language);
                    actProp[j] = posSequences[j].getProbs()[i];
                    j++;
                }
            }
            //create the POS values
            token.addAnnotations(POS_ANNOTATION, Value.values(actPos, actProp,j));
        }


    }


    private PosTag getPosTag(TagSet<PosTag> model, Map<String,PosTag> adhocTags, String tag, String language) {
        PosTag posTag = model.getTag(tag);
        if(posTag != null){
            return posTag;
        }
        posTag = adhocTags.get(tag);
        if(posTag != null){
            return posTag;
        }
        posTag = new PosTag(tag);
        adhocTags.put(tag, posTag);
        log.info("Encountered umapped POS tag '{}' for langauge '{}'",tag,language);
        return posTag;
    }


    private List<Token> tokenize(Section section,String langauge) {
        Tokenizer tokenizer = getTokenizer(langauge);
        String text = section.getSpan();
        List<Token> tokens = new ArrayList<Token>(text.length()/5); //assume avr. token length is 5
        opennlp.tools.util.Span[] tokenSpans = tokenizer.tokenizePos(section.getSpan());
        for(int i=0;i<tokenSpans.length;i++){
            Token token = section.addToken(tokenSpans[i].getStart(), tokenSpans[i].getEnd());
            log.trace(" > add {}",token);
            tokens.add(token);
        }
        return tokens;
    }


    private List<Section> detectSentences(AnalysedText at, String language) {
        SentenceDetector sentenceDetector = getSentenceDetector(language);
        List<Section> sentences;
        if(sentenceDetector != null){
            sentences = new ArrayList<Section>();
            for(opennlp.tools.util.Span sentSpan : sentenceDetector.sentPosDetect(at.getSpan())) {
                Sentence sentence = at.addSentence(sentSpan.getStart(), sentSpan.getEnd());
                log.trace(" > add {}",sentence);
                sentences.add(sentence);
            }
        } else {
            sentences = null;
        }
        return sentences;
    }


    /**
     * Activate and read the properties. Configures and initialises a POSTagger for each language configured in
     * CONFIG_LANGUAGES.
     *
     * @param ce the {@link org.osgi.service.component.ComponentContext}
     */
    @Activate
    protected void activate(ComponentContext ce) throws ConfigurationException {
        log.info("activating POS tagging engine");
        super.activate(ce);
        @SuppressWarnings("unchecked")
        Dictionary<String, Object> properties = ce.getProperties();


        languageConfig.setConfiguration(properties);
    }
    
    @Deactivate
    protected void deactivate(ComponentContext context) {
        languageConfig.setDefault();
        super.deactivate(context);
    }
    
    
    private SentenceDetector getSentenceDetector(String language) {
        try {
            SentenceModel model = openNLP.getSentenceModel(language);
            if(model != null) {
                log.debug("Sentence Detection Model {} for lanugage '{}' version: {}",
                    new Object[]{model.getClass().getSimpleName(), 
                                 model.getLanguage(), 
                                 model.getVersion() != null ? model.getVersion() : "undefined"});
                return new SentenceDetectorME(model);
            }
        } catch (Exception e) {
        }
        log.debug("Sentence Detection Model for Language '{}' not available.", language);
        return null;
    }
    private POSTagger getPOSTagger(String language) {
        String modelName = languageConfig.getParameter(language,MODEL_NAME_PARAM);
        try {
            POSModel model;
            if(modelName == null){ //use the default
                model = openNLP.getPartOfSpeachModel(language);
            } else {
                model = openNLP.getModel(POSModel.class, modelName, null);
            }
            if(model != null) {
                log.debug("POS Tagger Model {} for lanugage '{}' version: {}",
                    new Object[]{model.getClass().getSimpleName(), 
                                 model.getLanguage(), 
                                 model.getVersion() != null ? model.getVersion() : "undefined"});
                return new POSTaggerME(model);
            }
        } catch (Exception e) {
            log.warn("Unable to load POS model for language '"+language+"'!",e);
        }
        log.debug("POS tagging Model for Language '{}' not available.", language);
        return null;
    }
    
    private Tokenizer getTokenizer(String language){
        return openNLP.getTokenizer(language);
    }
    
}
// Source code of org.apache.stanbol.enhancer.engines.opennlp.pos.services.OpenNlpPosTaggingEngine

// Related classes of org.apache.stanbol.enhancer.engines.opennlp.pos.services.OpenNlpPosTaggingEngine