Package org.apache.stanbol.enhancer.engines.opennlp.sentence.impl

Source Code of org.apache.stanbol.enhancer.engines.opennlp.sentence.impl.OpenNlpSentenceDetectionEngine

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.opennlp.sentence.impl;

import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText;

import java.util.Collections;
import java.util.Dictionary;
import java.util.HashMap;
import java.util.Map;

import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;

import org.apache.clerezza.rdf.core.UriRef;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.opennlp.OpenNLP;
import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.osgi.framework.Constants;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* EnhancementEngine that uses the OpenNLP {@link SentenceDetector} to
* add {@link Sentence} annotations to the {@link AnalysedText}
* content part of the parsed {@link ContentItem}.<p>
* While the opennlp-pos engine does also support adding of {@link Sentence}
* annotations this engine can be used in cases where no POS tagging is
* needed. In addition this engine also allows to configure custom
* {@link SentenceModel}s with by using the {@link #MODEL_NAME_PARAM}
* with the language configuration
* <code><pre>
*     {lang};model={model-name}
* </pre></code>
*/
@Component(immediate = true, metatype = true,
    policy = ConfigurationPolicy.OPTIONAL) //create a default instance with the default configuration
@Service
@Properties(value={
        @Property(name= EnhancementEngine.PROPERTY_NAME,value="opennlp-sentence"),
        @Property(name=OpenNlpSentenceDetectionEngine.CONFIG_LANGUAGES, value = {"*"},cardinality=Integer.MAX_VALUE),
        @Property(name=Constants.SERVICE_RANKING,intValue=-100) //give the default instance a ranking < 0
})
public class OpenNlpSentenceDetectionEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties {


    private static final Map<String,Object> SERVICE_PROPERTIES;
    static {
        Map<String,Object> props = new HashMap<String,Object>();
        props.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING,
            ServiceProperties.ORDERING_NLP_SENTENCE_DETECTION);
        props.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE,
            NlpProcessingRole.SentenceDetection);
        SERVICE_PROPERTIES = Collections.unmodifiableMap(props);
    }

    /**
     * Language configuration. Takes a list of ISO language codes of supported languages. Currently supported
     * are the languages given as default value.
     */
    public static final String CONFIG_LANGUAGES = "org.apache.stanbol.enhancer.sentence.languages";

    /**
     * The parameter name used to configure the name of the OpenNLP model used for pos tagging
     */
    private static final String MODEL_NAME_PARAM = "model";


    private static Logger log = LoggerFactory.getLogger(OpenNlpSentenceDetectionEngine.class);

    //Langauge configuration
    private LanguageConfiguration languageConfig = new LanguageConfiguration(CONFIG_LANGUAGES,new String[]{"*"});

    @Reference
    private OpenNLP openNLP;
   
    @Reference
    private AnalysedTextFactory analysedTextFactory;
   
    /**
     * Indicate if this engine can enhance supplied ContentItem, and if it
     * suggests enhancing it synchronously or asynchronously. The
     * {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager} can force sync/async mode if desired, it is
     * just a suggestion from the engine.
     * <p/>
     * Returns ENHANCE_ASYNC in case there is a text/plain content part and a tagger for the language identified for
     * the content item, CANNOT_ENHANCE otherwise.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the introspecting process of the content item
     *          fails
     */
    @Override
    public int canEnhance(ContentItem ci) throws EngineException {
        // check if content is present
        Map.Entry<UriRef,Blob> entry = NlpEngineHelper.getPlainText(this, ci, false);
        if(entry == null || entry.getValue() == null) {
            return CANNOT_ENHANCE;
        }

        String language = getLanguage(this,ci,false);
        if(language == null) {
            return CANNOT_ENHANCE;
        }
        if(!languageConfig.isLanguage(language)){
            log.trace(" > can NOT enhance ContentItem {} because language {} is "
                + "not enabled by this engines configuration",ci,language);
            return CANNOT_ENHANCE;
        }
        if(getSentenceDetector(language) == null){
            log.trace(" > can NOT enhance ContentItem {} because no sentence "
                    + "deteciton model for language {} is available.",ci,language);
                return CANNOT_ENHANCE;
        }
       
        log.trace(" > can enhance ContentItem {} with language {}",ci,language);
        return ENHANCE_ASYNC;
    }

    /**
     * Compute enhancements for supplied ContentItem. The results of the process
     * are expected to be stored in the metadata of the content item.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     * <p/>
     * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
     * stores it as a new part in the content item. The metadata is not changed.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the underlying process failed to work as
     *          expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
        String language = getLanguage(this, ci, true);
        SentenceDetector sentenceDetector = getSentenceDetector(language);
        if(sentenceDetector != null){
            for(opennlp.tools.util.Span sentSpan : sentenceDetector.sentPosDetect(at.getSpan())) {
                //detect sentences and add it to the AnalyzedText.
                Sentence sentence = at.addSentence(sentSpan.getStart(), sentSpan.getEnd());
                log.trace(" > add {}",sentence);
            }
        } else {
            log.warn("SentenceDetector model for language {} is no longer available. "
                + "This might happen if the model becomes unavailable during enhancement. "
                + "If this happens more often it might also indicate an bug in the used "
                + "EnhancementJobManager implementation as the availability is also checked "
                + "in the canEnhance(..) method of this Enhancement Engine.");
        }
    }
   
    @Override
    public Map<String,Object> getServiceProperties() {
        return SERVICE_PROPERTIES;
    }
   
    /**
     * Activate and read the properties. Configures and initialises a POSTagger for each language configured in
     * CONFIG_LANGUAGES.
     *
     * @param ce the {@link org.osgi.service.component.ComponentContext}
     */
    @Activate
    protected void activate(ComponentContext ce) throws ConfigurationException {
        log.info("activating POS tagging engine");
        super.activate(ce);
        @SuppressWarnings("unchecked")
        Dictionary<String, Object> properties = ce.getProperties();

        languageConfig.setConfiguration(properties);
    }
   
    @Deactivate
    protected void deactivate(ComponentContext context) {
        languageConfig.setDefault();
        super.deactivate(context);
    }
   
    /**
     * Obtains the {@link SentenceDetectorME} model for the given
     * language form the {@link #openNLP} service. If a custom
     * model is configured for the parsed language than it is
     * loaded by using {@link OpenNLP#getModel(Class, String, Map)}
     * otherwise the default model {@link OpenNLP#getSentenceDetector(String)}
     * is retrieved
     * @param language the language
     * @return the model of <code>null</code> if non is available or
     * an exception was encountered while loading
     */
    private SentenceDetector getSentenceDetector(String language) {
        SentenceModel model;
        String modelName = languageConfig.getParameter(language, MODEL_NAME_PARAM);
        if(modelName == null){
            try {
                model = openNLP.getSentenceModel(language);
            } catch (Exception e) {
                log.warn("Unable to load default Sentence Detection model for language '"+language+"'!",e);
                return null;
            }
        } else {
            try {
                model = openNLP.getModel(SentenceModel.class, modelName, null);
            } catch (Exception e) {
                log.warn("Unable to load Sentence Detection model for language '"
                        +language+"' from the configured model '"+modelName+"'!",e);
                return null;
            }
        }
        if(model != null) {
            log.debug("Sentence Detection Model {} for lanugage '{}' version: {}",
                new Object[]{model.getClass().getSimpleName(),
                             model.getLanguage(),
                             model.getVersion() != null ? model.getVersion() : "undefined"});
            return new SentenceDetectorME(model);
        }
        log.debug("Sentence Detection Model for Language '{}' not available.", language);
        return null;
    }
       
}
TOP

Related Classes of org.apache.stanbol.enhancer.engines.opennlp.sentence.impl.OpenNlpSentenceDetectionEngine

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.