Source Code of org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.keywordextraction.engine;


import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;


import org.apache.clerezza.rdf.core.Language;
import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.commons.lang.StringUtils;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.PropertyOption;
import org.apache.felix.scr.annotations.ReferenceCardinality;
import org.apache.felix.scr.annotations.ReferencePolicy;
import org.apache.felix.scr.annotations.ReferenceStrategy;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.namespaceprefix.NamespaceMappingUtils;
import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixService;
import org.apache.stanbol.commons.opennlp.OpenNLP;
import org.apache.stanbol.commons.opennlp.TextAnalyzer;
import org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig;
import org.apache.stanbol.commons.stanboltools.offline.OfflineMode;
import org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinker;
import org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig;
import org.apache.stanbol.enhancer.engines.keywordextraction.impl.LinkedEntity;
import org.apache.stanbol.enhancer.engines.keywordextraction.impl.Suggestion;
import org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig.RedirectProcessingMode;
import org.apache.stanbol.enhancer.engines.keywordextraction.impl.LinkedEntity.Occurrence;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.AnalysedContent;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntitySearcher;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.EntityhubSearcher;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.OpenNlpAnalysedContentFactory;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.ReferencedSiteSearcher;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.TrackingEntitySearcher;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
import org.apache.stanbol.entityhub.servicesapi.Entityhub;
import org.apache.stanbol.entityhub.servicesapi.model.Reference;
import org.apache.stanbol.entityhub.servicesapi.model.Text;
import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
import org.osgi.framework.Constants;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


@Component(
    configurationFactory = true, 
    policy = ConfigurationPolicy.REQUIRE, // the baseUri is required!
    specVersion = "1.1", 
    metatype = true, 
    immediate = true,
    inherit = true)
@Service
@org.apache.felix.scr.annotations.Properties(value={
    @Property(name=EnhancementEngine.PROPERTY_NAME),
    @Property(name=KeywordLinkingEngine.REFERENCED_SITE_ID),
    @Property(name=KeywordLinkingEngine.NAME_FIELD,value="rdfs:label"),
    @Property(name=KeywordLinkingEngine.CASE_SENSITIVE,boolValue=EntityLinkerConfig.DEFAULT_CASE_SENSITIVE_MATCHING_STATE),
    @Property(name=KeywordLinkingEngine.TYPE_FIELD,value="rdf:type"),
    @Property(name=KeywordLinkingEngine.REDIRECT_FIELD,value="rdfs:seeAlso"),
    @Property(name=KeywordLinkingEngine.REDIRECT_PROCESSING_MODE,options={
        @PropertyOption(
            value='%'+KeywordLinkingEngine.REDIRECT_PROCESSING_MODE+".option.ignore",
            name="IGNORE"),
        @PropertyOption(
            value='%'+KeywordLinkingEngine.REDIRECT_PROCESSING_MODE+".option.addValues",
            name="ADD_VALUES"),
        @PropertyOption(
                value='%'+KeywordLinkingEngine.REDIRECT_PROCESSING_MODE+".option.follow",
                name="FOLLOW")
        },value="IGNORE"),
    @Property(name=KeywordLinkingEngine.MIN_SEARCH_TOKEN_LENGTH,
        intValue=EntityLinkerConfig.DEFAULT_MIN_SEARCH_TOKEN_LENGTH),
    @Property(name=KeywordLinkingEngine.MIN_TOKEN_MATCH_FACTOR,floatValue=
            EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR),
    @Property(name=KeywordLinkingEngine.KEYWORD_TOKENIZER,boolValue=false),
    @Property(name=KeywordLinkingEngine.MAX_SUGGESTIONS,
        intValue=EntityLinkerConfig.DEFAULT_SUGGESTIONS),
    @Property(name=KeywordLinkingEngine.PROCESSED_LANGUAGES,value=""),
    @Property(name=KeywordLinkingEngine.DEFAULT_MATCHING_LANGUAGE,value=""),
    @Property(name=KeywordLinkingEngine.TYPE_MAPPINGS,cardinality=1000),
    @Property(name=KeywordLinkingEngine.DEREFERENCE_ENTITIES,
        boolValue=KeywordLinkingEngine.DEFAULT_DEREFERENCE_ENTITIES_STATE),
    @Property(name=Constants.SERVICE_RANKING,intValue=0)
})
public class KeywordLinkingEngine 
        extends AbstractEnhancementEngine<RuntimeException,RuntimeException> 
        implements EnhancementEngine, ServiceProperties {


    /**
     * Logger of this engine.
     */
    private final Logger log = LoggerFactory.getLogger(KeywordLinkingEngine.class);
    /**
     * The mime type of plain text. Used to check the content type of parsed
     * {@link ContentItem}s.
     */
    protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
    /**
     * The set of supported mime types. It contains only
     * {@link #TEXT_PLAIN_MIMETYPE}.
     */
    protected static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);
    /**
     * The default value for the execution order of this Engine, as returned
     * by {@link #getServiceProperties()}.
     * This Engine creates TextAnnotations that should not be processed by other Engines.
     * Therefore it uses a lower rank ({@link ServiceProperties#ORDERING_DEFAULT}
     * minus 10) to ensure that other engines do not get confused.
     */
    public static final Integer DEFAULT_ORDER = ServiceProperties.ORDERING_DEFAULT - 10;


    
    /*
     * Keys of the configuration properties of this engine. Unless noted
     * otherwise each key is declared as a @Property on the class; the
     * defaults mentioned below are the ones used by those declarations.
     */
    /** Key of the referenced site id; declared without a default value. */
    public static final String REFERENCED_SITE_ID = "org.apache.stanbol.enhancer.engines.keywordextraction.referencedSiteId";
    /** Key of the name field; the declared default is "rdfs:label". */
    public static final String NAME_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.nameField";
    /** Key of the type field; the declared default is "rdf:type". */
    public static final String TYPE_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.typeField";
    /**
     * Key of the case sensitive state; the declared default is
     * {@link EntityLinkerConfig#DEFAULT_CASE_SENSITIVE_MATCHING_STATE}.
     */
    public static final String CASE_SENSITIVE = "org.apache.stanbol.enhancer.engines.keywordextraction.caseSensitive";
    /** Key of the redirect field; the declared default is "rdfs:seeAlso". */
    public static final String REDIRECT_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectField";
    /**
     * Key of the redirect processing mode. The declared options are
     * "IGNORE", "ADD_VALUES" and "FOLLOW"; the declared default is "IGNORE".
     */
    public static final String REDIRECT_PROCESSING_MODE = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectMode";
    /**
     * Key of the minimum search token length; the declared default is
     * {@link EntityLinkerConfig#DEFAULT_MIN_SEARCH_TOKEN_LENGTH}.
     */
    public static final String MIN_SEARCH_TOKEN_LENGTH = "org.apache.stanbol.enhancer.engines.keywordextraction.minSearchTokenLength";
    /**
     * Key of the maximum number of suggestions; the declared default is
     * {@link EntityLinkerConfig#DEFAULT_SUGGESTIONS}.
     */
    public static final String MAX_SUGGESTIONS = "org.apache.stanbol.enhancer.engines.keywordextraction.maxSuggestions";
    /** Key of the processed languages; the declared default is the empty string. */
    public static final String PROCESSED_LANGUAGES = "org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages";
    /** Key of the minimum found tokens. NOTE: not declared as a @Property on the class. */
    public static final String MIN_FOUND_TOKENS= "org.apache.stanbol.enhancer.engines.keywordextraction.minFoundTokens";
    /** Key of the default matching language; the declared default is the empty string. */
    public static final String DEFAULT_MATCHING_LANGUAGE = "org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage";
    /** Key of the minimum POS tag probability. NOTE: not declared as a @Property on the class. */
    public static final String MIN_POS_TAG_PROBABILITY = "org.apache.stanbol.enhancer.engines.keywordextraction.minPosTagProbability";
    /** Key of the type mappings; declared as a multi valued property (cardinality 1000). */
    public static final String TYPE_MAPPINGS = "org.apache.stanbol.enhancer.engines.keywordextraction.typeMappings";
    /** Key of the keyword tokenizer state; the declared default is <code>false</code>. */
    public static final String KEYWORD_TOKENIZER = "org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer";
    /**
     * Key of the minimum token match factor; the declared default is
     * {@link EntityLinkerConfig#DEFAULT_MIN_TOKEN_MATCH_FACTOR}.
     */
    public static final String MIN_TOKEN_MATCH_FACTOR = "org.apache.stanbol.enhancer.engines.keywordextraction.minTokenMatchFactor";
    // Disabled configuration key, kept for reference:
//  public static final String ENABLE_CHUNKER = "org.apache.stanbol.enhancer.engines.keywordextraction.enableChunker";
    /**
     * Key of the property that enables the dereference feature (STANBOL-333)
     * for this engine. The default declared on the class is
     * {@link #DEFAULT_DEREFERENCE_ENTITIES_STATE}.
     * This will be replaced by STANBOL-336.
     */
    public static final String DEREFERENCE_ENTITIES = "org.apache.stanbol.enhancer.engines.keywordextraction.dereference";
    /**
     * The default state to dereference entities: <code>true</code>.
     */
    public static final boolean DEFAULT_DEREFERENCE_ENTITIES_STATE = true;
    /**
     * Key of the property that allows to add a list of fields that are
     * included when dereferencing Entities.
     */
    public static final String DEREFERENCE_ENTITIES_FIELDS = "org.apache.stanbol.enhancer.engines.keywordextraction.dereferenceFields";
    /**
     * Additional fields added for dereferenced entities: rdfs:comment,
     * geo:lat, geo:long, foaf:depiction and dbpedia-owl:thumbnail (as full
     * URIs).
     */
    private static final Collection<String> DEREFERENCE_FIELDS = Arrays.asList(
        "http://www.w3.org/2000/01/rdf-schema#comment",
        "http://www.w3.org/2003/01/geo/wgs84_pos#lat",
        "http://www.w3.org/2003/01/geo/wgs84_pos#long",
        "http://xmlns.com/foaf/0.1/depiction",
        "http://dbpedia.org/ontology/thumbnail");
    /**
     * The dereferenceEntitiesState as set in {@link #activateEntityDereference(Dictionary)}.
     * If <code>true</code> the RDF data of suggested entities are added to
     * the metadata of the enhanced ContentItem.
     */
    private boolean dereferenceEntitiesState;
    /**
     * Default set of languages. This is an empty set indicating that texts in any
     * language are processed. 
     */
    public static final Set<String> DEFAULT_LANGUAGES = Collections.emptySet();
    /**
     * The default value for the minimum POS tag probability.
     */
    public static final double DEFAULT_MIN_POS_TAG_PROBABILITY = 0.6667;
    /**
     * The languages this engine is configured to enhance. An empty Set is
     * considered as active for any language (see
     * {@link #isProcessableLanguages(String)}).
     */
    private Set<String> languages = DEFAULT_LANGUAGES;
    /**
     * The literal representing the LangIDEngine as creator. Within the
     * visible code it is only used for the warning logged if no language
     * could be extracted for a ContentItem.
     */
    public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.langid.LangIdEnhancementEngine");
    
    /**
     * The searcher used to lookup entities. Also asked if it supports the
     * offline mode before content is enhanced.
     */
    private EntitySearcher entitySearcher;
    /**
     * The configuration parsed to the {@link EntityLinker} created for each
     * enhancement request.
     */
    private EntityLinkerConfig linkerConfig;
    /**
     * The configuration for the natural language processing of texts.
     */
    private TextAnalyzerConfig nlpConfig;
    
    /**
     * The reference to the OpenNLP component (mandatory service reference)
     */
    @org.apache.felix.scr.annotations.Reference
    private OpenNLP openNLP;
    
    /**
     * Optional reference to the {@link NamespacePrefixService}. May be
     * <code>null</code> as the reference is declared as optional.
     */
    @org.apache.felix.scr.annotations.Reference(cardinality=ReferenceCardinality.OPTIONAL_UNARY)
    protected NamespacePrefixService nsPrefixService;


    //NOTE: The TextAnalyzer was changed to have the scope of a single request
    //(a call to #computeEnhancements). Because of that this engine no longer
    //holds a TextAnalyzer field.
    /**
     * Used to create {@link AnalysedContent} instances for parsed content items
     */
    private OpenNlpAnalysedContentFactory analysedContentFactory;
    /**
     * The literalFactory used to create typed literals
     */
    private LiteralFactory literalFactory = LiteralFactory.getInstance();
    
    /**
     * The {@link OfflineMode} is used by Stanbol to indicate that no external service should be referenced.
     * For this engine that means it is necessary to check if the used {@link EntitySearcher} can operate
     * offline or not. The field is <code>null</code> as long as no
     * {@link OfflineMode} service is bound.
     * 
     * @see #enableOfflineMode(OfflineMode)
     * @see #disableOfflineMode(OfflineMode)
     * @see #isOfflineMode()
     */
    @org.apache.felix.scr.annotations.Reference(
        cardinality = ReferenceCardinality.OPTIONAL_UNARY, 
        policy = ReferencePolicy.DYNAMIC, 
        bind = "enableOfflineMode", 
        unbind = "disableOfflineMode", 
        strategy = ReferenceStrategy.EVENT)
    private OfflineMode offlineMode;
    /**
     * The name of the reference site ('local' or 'entityhub') if the
     * Entityhub is used for enhancing. The value is written as plain literal
     * to every EntityAnnotation created by this engine.
     */
    protected String referencedSiteName;


    /**
     * Called by the ConfigurationAdmin to bind the {@link #offlineMode} if the service becomes available
     * 
     * @param mode
     */
    protected final void enableOfflineMode(OfflineMode mode) {
        this.offlineMode = mode;
    }


    /**
     * Called by the ConfigurationAdmin to unbind the {@link #offlineMode} if the service becomes unavailable
     * 
     * @param mode
     */
    protected final void disableOfflineMode(OfflineMode mode) {
        this.offlineMode = null;
    }


    /**
     * Returns <code>true</code> only if Stanbol operates in {@link OfflineMode}.
     * 
     * @return the offline state
     */
    protected final boolean isOfflineMode() {
        return offlineMode != null;
    }
    
    /**
     * Default constructor as used by OSGI. The constructor itself does not
     * initialise any field. It is expected that
     * {@link #activate(ComponentContext)} is called before the engine is used.
     */
    public KeywordLinkingEngine() {
    }
    /**
     * Internal Constructor used by {@link #createInstance(OpenNLP, EntitySearcher, EntityLinkerConfig)}
     * @param openNLP
     * @param entitySearcher
     * @param config
     */
    protected KeywordLinkingEngine(OpenNLP openNLP,EntitySearcher entitySearcher,
                                   TextAnalyzerConfig nlpConfig,EntityLinkerConfig linkingConfig){
        this.openNLP = openNLP;
        this.linkerConfig = linkingConfig != null ? linkingConfig : new EntityLinkerConfig();
        this.nlpConfig = nlpConfig != null ? nlpConfig : new TextAnalyzerConfig();
        this.analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(openNLP,nlpConfig);
        this.entitySearcher = entitySearcher;
    }
    /**
     * Allows to create an instance that can be used outside of an OSGI
     * environment. This is mainly intended for unit tests.
     * @param openNLP The {@link OpenNLP} instance used for natural language processing
     * @param entitySearcher the searcher used to lookup terms
     * @param config the configuration or <code>null</code> to use the defaults
     * @return the created engine instance
     */
    public static KeywordLinkingEngine createInstance(OpenNLP openNLP,
                                                      EntitySearcher entitySearcher,
                                                      TextAnalyzerConfig nlpConfig,
                                                      EntityLinkerConfig linkingConfig){
        return new KeywordLinkingEngine(openNLP,entitySearcher,nlpConfig,linkingConfig);
    }




    /**
     * Checks if the parsed language is enabled for processing.
     * @param language The language to process
     * @return the processing state for the parsed language.
     */
    protected boolean isProcessableLanguages(String language) {
        return languages.isEmpty() || languages.contains(language);
    }
    
    @Override
    public Map<String,Object> getServiceProperties() {
        return Collections.unmodifiableMap(Collections.singletonMap(
            ENHANCEMENT_ENGINE_ORDERING,
            (Object) DEFAULT_ORDER));
    }


    @Override
    public int canEnhance(ContentItem ci) throws EngineException {
        if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null){
            return ENHANCE_ASYNC; //KeywordLinking now supports async processing
        } else {
            return CANNOT_ENHANCE;
        }
    }


    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        if(isOfflineMode() && !entitySearcher.supportsOfflineMode()){
            throw new EngineException("Offline mode is not supported by the Component used to lookup Entities");
        }
        Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
        if(contentPart == null){
            throw new IllegalStateException("No ContentPart with a supported Mime Type"
                    + "found for ContentItem "+ci.getUri()+"(supported: '"
                    + SUPPORTED_MIMETYPES+"') -> this indicates that canEnhance was" 
                    + "NOT called and indicates a bug in the used EnhancementJobManager!");
        }
        String text;
        try {
            text = ContentItemHelper.getText(contentPart.getValue());
        } catch (IOException e) {
            throw new InvalidContentException(String.format("Unable to extract "
                +" text from ContentPart %s of ContentItem %s!",
                contentPart.getKey(),ci.getUri()),e);
        }
        if (text.trim().length() == 0) {
            // TODO: make the length of the data a field of the ContentItem
            // interface to be able to filter out empty items in the canEnhance
            // method
            log.warn("ContentPart {} of ContentItem does not contain any Text to extract knowledge from",
                contentPart.getKey(), ci);
            return;
        }
        //Determine the language
        String language;
        ci.getLock().readLock().lock();
        try {
         language = extractLanguage(ci);
        } finally {
            ci.getLock().readLock().unlock();
        }
        if(isProcessableLanguages(language)){
            log.debug("computeEnhancements for ContentItem {} language {} text={}", 
                new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(text, 100)});
            
            EntityLinker entityLinker = new EntityLinker(
                analysedContentFactory.create(text, language),
                entitySearcher, linkerConfig);
            //process
            entityLinker.process();
            //write results (requires a write lock)
            ci.getLock().writeLock().lock();
            try {
                writeEnhancements(ci, entityLinker.getLinkedEntities().values(), language);
            } finally {
                ci.getLock().writeLock().unlock();
            }
        } else {
            log.debug("ignore ContentItem {} because language '{}' is not configured to" +
                "be processed by this engine.",ci.getUri().getUnicodeString(),language);
        }
        
    }


    /**
     * Writes the Enhancements for the {@link LinkedEntity LinkedEntities}
     * extracted from the parsed ContentItem
     * @param ci
     * @param linkedEntities
     * @param language
     */
    private void writeEnhancements(ContentItem ci, Collection<LinkedEntity> linkedEntities, String language) {
        Language languageObject = null;
        if(language != null && !language.isEmpty()){
            languageObject = new Language(language);
        }
        MGraph metadata = ci.getMetadata();
        for(LinkedEntity linkedEntity : linkedEntities){
            Collection<UriRef> textAnnotations = new ArrayList<UriRef>(linkedEntity.getOccurrences().size());
            //first create the TextAnnotations for the Occurrences
            for(Occurrence occurrence : linkedEntity.getOccurrences()){
                UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                textAnnotations.add(textAnnotation);
                metadata.add(new TripleImpl(textAnnotation, 
                    Properties.ENHANCER_START, 
                    literalFactory.createTypedLiteral(occurrence.getStart())));
                metadata.add(new TripleImpl(textAnnotation, 
                    Properties.ENHANCER_END, 
                    literalFactory.createTypedLiteral(occurrence.getEnd())));
                metadata.add(new TripleImpl(textAnnotation, 
                    Properties.ENHANCER_SELECTION_CONTEXT, 
                    new PlainLiteralImpl(occurrence.getContext(),languageObject)));
                metadata.add(new TripleImpl(textAnnotation, 
                    Properties.ENHANCER_SELECTED_TEXT, 
                    new PlainLiteralImpl(occurrence.getSelectedText(),languageObject)));
                metadata.add(new TripleImpl(textAnnotation, 
                    Properties.ENHANCER_CONFIDENCE, 
                    literalFactory.createTypedLiteral(linkedEntity.getScore())));
                for(UriRef dcType : linkedEntity.getTypes()){
                    metadata.add(new TripleImpl(
                        textAnnotation, Properties.DC_TYPE, dcType));
                }
            }
            //now the EntityAnnotations for the Suggestions
            for(Suggestion suggestion : linkedEntity.getSuggestions()){
                UriRef entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
                //should we use the label used for the match, or search the
                //representation for the best label ... currently its the matched one
                Text label = suggestion.getBestLabel(linkerConfig.getNameField(),language);
                metadata.add(new TripleImpl(entityAnnotation, 
                    Properties.ENHANCER_ENTITY_LABEL, 
                    label.getLanguage() == null ?
                            new PlainLiteralImpl(label.getText()) :
                                new PlainLiteralImpl(label.getText(),
                                    new Language(label.getLanguage()))));
                metadata.add(new TripleImpl(entityAnnotation, 
                    Properties.ENHANCER_ENTITY_REFERENCE, 
                    new UriRef(suggestion.getRepresentation().getId())));
                Iterator<Reference> suggestionTypes = suggestion.getRepresentation().getReferences(linkerConfig.getTypeField());
                while(suggestionTypes.hasNext()){
                    metadata.add(new TripleImpl(entityAnnotation, 
                        Properties.ENHANCER_ENTITY_TYPE, new UriRef(suggestionTypes.next().getReference())));
                }
                metadata.add(new TripleImpl(entityAnnotation,
                    Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getScore())));
                for(UriRef textAnnotation : textAnnotations){
                    metadata.add(new TripleImpl(entityAnnotation, 
                        Properties.DC_RELATION, textAnnotation));
                }
                //add the name of the ReferencedSite providing this suggestion
                metadata.add(new TripleImpl(entityAnnotation, 
                    new UriRef(RdfResourceEnum.site.getUri()), 
                    new PlainLiteralImpl(referencedSiteName)));
                //in case dereferencing of Entities is enabled we need also to
                //add the RDF data for entities
                if(dereferenceEntitiesState){
                    metadata.addAll(
                        RdfValueFactory.getInstance().toRdfRepresentation(
                            suggestion.getRepresentation()).getRdfGraph());
                }
            }
        }
    }
    /**
     * Extracts the language of the parsed ContentItem by using
     * {@link EnhancementEngineHelper#getLanguage(ContentItem)} and "en" as
     * default.
     * @param ci the content item
     * @return the language
     */
    private String extractLanguage(ContentItem ci) {
        String lang = EnhancementEngineHelper.getLanguage(ci);
//        if(lang != null){
//        MGraph metadata = ci.getMetadata();
//        Iterator<Triple> langaugeEnhancementCreatorTriples = 
//            metadata.filter(null, Properties.DC_CREATOR, LANG_ID_ENGINE_NAME);
//        if(langaugeEnhancementCreatorTriples.hasNext()){
//            String lang = EnhancementEngineHelper.getString(metadata, 
//                langaugeEnhancementCreatorTriples.next().getSubject(), 
//                Properties.DC_LANGUAGE);
        if(lang != null){
            return lang;
        } else {
            log.warn("Unable to extract language for ContentItem %s! The Enhancement of the %s is missing the %s property",
                new Object[]{ci.getUri().getUnicodeString(),LANG_ID_ENGINE_NAME.getLexicalForm(),Properties.DC_LANGUAGE});
            log.warn(" ... return 'en' as default");
            return "en";
        }
//        } else {
//            log.warn("Unable to extract language for ContentItem %s! Is the %s active?",
//                ci.getUri().getUnicodeString(),LANG_ID_ENGINE_NAME.getLexicalForm());
//            log.warn(" ... return 'en' as default");
//            return "en";
//        }
    }


    
    /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     * Methods used by activate() and deactivate() to initialise and reset the
     * properties configurable via OSGI.
     * 
     * NOTEs:
     * Directly calling super.activate and super.deactivate
     * is possible but might not be applicable in all cases.
     * The activate**(...) and deactivate**() Methods are intended to be
     * called by subclasses that need more control over the initialisation
     * process.
     * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     */
    /**
     * Activates this Engine. Subclasses should not call this method but rather
     * call<ul>
     * <li> {@link #activateEntitySearcher(ComponentContext, Dictionary)}
     * <li> {@link #initEntityLinkerConfig(Dictionary, EntityLinkerConfig)} and
     * <li> {@link #activateTextAnalyzerConfig(Dictionary)}
     * <li> {@link #dereferenceEntitiesState} (needs to be called after 
     * {@link #initEntityLinkerConfig(Dictionary, EntityLinkerConfig)})
     * </ul>
     * if applicable.
     * @param context the Component context
     * @throws ConfigurationException if the required {@link #REFERENCED_SITE_ID}
     * configuration is missing or any of the other properties has an illegal value
     */
    @Activate
    @SuppressWarnings("unchecked")
    protected void activate(ComponentContext context) throws ConfigurationException {
        super.activate(context);
        Dictionary<String,Object> properties = context.getProperties();
        activateTextAnalyzerConfig(properties);
        activateEntitySearcher(context, properties);
        activateEntityLinkerConfig(properties);
        activateEntityDereference(properties);
    }


    /**
     * Inits the {@link #dereferenceEntitiesState} based on the
     * {@link #DEREFERENCE_ENTITIES} configuration.
     * @param properties the configuration
     */
    protected final void activateEntityDereference(Dictionary<String,Object> properties) throws ConfigurationException {
        Object value = properties.get(DEREFERENCE_ENTITIES);
        if(value instanceof Boolean){
            dereferenceEntitiesState = ((Boolean)value).booleanValue();
        } else if(value != null && !value.toString().isEmpty()){
            dereferenceEntitiesState = Boolean.parseBoolean(value.toString());
        } else {
            dereferenceEntitiesState = DEFAULT_DEREFERENCE_ENTITIES_STATE;
        }
        if(dereferenceEntitiesState){
            value = properties.get(DEREFERENCE_ENTITIES_FIELDS);
            if(value instanceof String[]){
                for(String field : (String[])value){
                    if(field != null && !field.isEmpty()){
                        linkerConfig.getSelectedFields().add(field);
                    }
                }
            } else if(value instanceof Collection<?>){
                for(Object field : (Collection<?>)value){
                    if(field != null && !field.toString().isEmpty()){
                        linkerConfig.getSelectedFields().add(field.toString());
                    }
                }
            } else if(value instanceof String){
                if(!value.toString().isEmpty()){
                    linkerConfig.getSelectedFields().add(value.toString());
                }
            } else if(value != null){
                throw new ConfigurationException(DEREFERENCE_ENTITIES_FIELDS, 
                    "Dereference Entities_Fields MUST BE parsed as String[], Collection<String> or "
                    + "String (single value). The actual value '"+value+"'(type: '"+value.getClass() 
                    + "') is NOT supported");
            } else { //value == null -> add the default fields
                linkerConfig.getSelectedFields().addAll(DEREFERENCE_FIELDS);
            }
        }    }


    /**
     * Initialise the {@link TextAnalyzer} component.<p>
     * Currently this includes the following configurations: <ul>
     * <li>{@link #PROCESSED_LANGUAGES}: If no configuration is present the
     * default (process all languages) is used.
     * <li> {@value #MIN_POS_TAG_PROBABILITY}: If no configuration is
     * present the #DEFAULT_MIN_POS_TAG_PROBABILITY is used
     * languages based on the value of the
     * 
     * @param configuration the OSGI component configuration
     */
    protected final void activateTextAnalyzerConfig(Dictionary<String,Object> configuration) throws ConfigurationException {
        nlpConfig = new TextAnalyzerConfig();
        Object value;
        value = configuration.get(PROCESSED_LANGUAGES);
        if(value == null){
            this.languages = DEFAULT_LANGUAGES;
        } else if (value.toString().trim().isEmpty()){
            this.languages = Collections.emptySet();
        } else {
            String[] languageArray = value.toString().split(",");
            languages = new HashSet<String>();
            for(String language : languageArray){
                if(language != null){
                    language = language.trim();
                    if(!language.isEmpty()){
                        languages.add(language);
                    }
                }
            }
        }
        value = configuration.get(MIN_POS_TAG_PROBABILITY);
        double minPosTagProb;
        if(value instanceof Number){
            minPosTagProb = ((Number)value).doubleValue();
        } else if(value != null && !value.toString().isEmpty()){
            try {
                minPosTagProb = Double.valueOf(value.toString());
            } catch (NumberFormatException e) {
                throw new ConfigurationException(MIN_POS_TAG_PROBABILITY, 
                    "Unable to parse the min POS tag probability from the parsed value "+value,e);
            }
        } else {
            minPosTagProb = DEFAULT_MIN_POS_TAG_PROBABILITY;
        }
        if(minPosTagProb > 1){
            throw new ConfigurationException(MIN_POS_TAG_PROBABILITY, 
                "The configured min POS tag probability MUST BE in the range [0..1] " +
                "or < 0 to deactivate this feature (parsed value "+value+")!");
        }
        nlpConfig.setMinPosTagProbability(minPosTagProb);
        value = configuration.get(KEYWORD_TOKENIZER);
        //the keyword tokenizer config
        if(value instanceof Boolean){
            nlpConfig.forceKeywordTokenizer((Boolean)value);
        } else if(value != null && !value.toString().isEmpty()){
            nlpConfig.forceKeywordTokenizer(Boolean.valueOf(value.toString()));
        }
        //nlpConfig.enablePosTypeChunker(false);
        //nlpConfig.enableChunker(false);
        analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(openNLP,nlpConfig);
    }


    /**
     * Configures the parsed {@link EntityLinkerConfig} with the values of the
     * following properties:<ul>
     * <li>{@link #NAME_FIELD}
     * <li>{@link #TYPE_FIELD}
     * <li>{@link #REDIRECT_FIELD}
     * <li>{@link #REDIRECT_PROCESSING_MODE}
     * <li>{@link #MAX_SUGGESTIONS}
     * <li>{@link #MIN_SEARCH_TOKEN_LENGTH}
     * <li>{@link #MIN_FOUND_TOKENS}
     * <li> {@link #MIN_TOKEN_MATCH_FACTOR}
     * </ul>
     * This Method create an new {@link EntityLinkerConfig} instance only if
     * <code>{@link #linkerConfig} == null</code>. If the instance is already initialised
     * that all current values for keys missing in the parsed configuration are
     * preserved.
     * @param configuration the configuration
     * @throws ConfigurationException In case of an illegal value in the parsed configuration.
     * Note that all configuration are assumed as optional, therefore missing values will not
     * case a ConfigurationException.
     */
    protected void activateEntityLinkerConfig(Dictionary<String,Object> configuration) throws ConfigurationException {
        if(linkerConfig == null){
            this.linkerConfig = new EntityLinkerConfig();
        }
        Object value;
        value = configuration.get(NAME_FIELD);
        if(value != null){
            if(value.toString().isEmpty()){
                throw new ConfigurationException(NAME_FIELD,"The configured name field MUST NOT be empty");
            }
            
            linkerConfig.setNameField(NamespaceMappingUtils.getConfiguredUri(
                nsPrefixService, NAME_FIELD, value.toString()));
        }
        //init case sensitivity
        value = configuration.get(CASE_SENSITIVE);
        if(value instanceof Boolean){
            linkerConfig.setCaseSensitiveMatchingState((Boolean)value);
        } else if(value != null && !value.toString().isEmpty()){
            linkerConfig.setCaseSensitiveMatchingState(Boolean.valueOf(value.toString()));
        } //if NULL or empty use default
        //init TYPE_FIELD
        value = configuration.get(TYPE_FIELD);
        if(value != null){
            if(value.toString().isEmpty()){
                throw new ConfigurationException(TYPE_FIELD,"The configured name field MUST NOT be empty");
            }
            linkerConfig.setTypeField(NamespaceMappingUtils.getConfiguredUri(
                nsPrefixService, TYPE_FIELD, value.toString()));
        }
        //init REDIRECT_FIELD
        value = configuration.get(REDIRECT_FIELD);
        if(value != null){
            if(value.toString().isEmpty()){
                throw new ConfigurationException(NAME_FIELD,"The configured name field MUST NOT be empty");
            }
            linkerConfig.setRedirectField(NamespaceMappingUtils.getConfiguredUri(
                nsPrefixService, REDIRECT_FIELD, value.toString()));
        }
        //init MAX_SUGGESTIONS
        value = configuration.get(MAX_SUGGESTIONS);
        Integer maxSuggestions;
        if(value instanceof Integer){
            maxSuggestions = (Integer)value;
        } else if (value != null){
            try {
                maxSuggestions = Integer.valueOf(value.toString());
            } catch(NumberFormatException e){
                throw new ConfigurationException(MAX_SUGGESTIONS, "Values MUST be valid Integer values > 0",e);
            }
        } else {
            maxSuggestions = null;
        }
        if(maxSuggestions != null){
            if(maxSuggestions < 1){
                throw new ConfigurationException(MAX_SUGGESTIONS, "Values MUST be valid Integer values > 0");
            }
            linkerConfig.setMaxSuggestions(maxSuggestions);
        }
        //init MIN_FOUND_TOKENS
        value = configuration.get(MIN_FOUND_TOKENS);
        Integer minFoundTokens;
        if(value instanceof Integer){
            minFoundTokens = (Integer)value;
        } else if(value != null){
            try {
                minFoundTokens = Integer.valueOf(value.toString());
            } catch(NumberFormatException e){
                throw new ConfigurationException(MIN_FOUND_TOKENS, "Values MUST be valid Integer values > 0",e);
            }
        } else {
            minFoundTokens = null;
        }
        if(minFoundTokens != null){
            if(minFoundTokens < 1){
                throw new ConfigurationException(MIN_FOUND_TOKENS, "Values MUST be valid Integer values > 0");
            }
            linkerConfig.setMinFoundTokens(minFoundTokens);
        }
        // init MIN_SEARCH_TOKEN_LENGTH
        value = configuration.get(MIN_SEARCH_TOKEN_LENGTH);
        Integer minSearchTokenLength;
        if(value instanceof Integer){
            minSearchTokenLength = (Integer)value;
        } else if (value != null){
            try {
                minSearchTokenLength = Integer.valueOf(value.toString());
            } catch(NumberFormatException e){
                throw new ConfigurationException(MIN_SEARCH_TOKEN_LENGTH, "Values MUST be valid Integer values > 0",e);
            }
        } else {
            minSearchTokenLength = null;
        }
        if(minSearchTokenLength != null){
            if(minSearchTokenLength < 1){
                throw new ConfigurationException(MIN_SEARCH_TOKEN_LENGTH, "Values MUST be valid Integer values > 0");
            }
            linkerConfig.setMinSearchTokenLength(minSearchTokenLength);
        }
        //init the REDIRECT_PROCESSING_MODE
        value = configuration.get(REDIRECT_PROCESSING_MODE);
        if(value != null){
            try {
                linkerConfig.setRedirectProcessingMode(RedirectProcessingMode.valueOf(value.toString()));
            } catch (IllegalArgumentException e) {
                throw new ConfigurationException(REDIRECT_PROCESSING_MODE, "Values MUST be one of "+
                    Arrays.toString(RedirectProcessingMode.values()));
            }
        }
        //init the DEFAULT_LANGUAGE
        value = configuration.get(DEFAULT_MATCHING_LANGUAGE);
        if(value != null){
            String defaultLang = value.toString().trim();
            if(defaultLang.isEmpty()){
                linkerConfig.setDefaultLanguage(null);
            } else if(defaultLang.length() == 1){
                throw new ConfigurationException(DEFAULT_MATCHING_LANGUAGE, "Illegal language code '"+
                    defaultLang+"'! Language Codes MUST BE at least 2 chars long.");
            } else {
                linkerConfig.setDefaultLanguage(defaultLang);
            }
        }
        // init MIN_TOKEN_MATCH_FACTOR
        value=configuration.get(MIN_TOKEN_MATCH_FACTOR);
        float minTokenMatchFactor;
        if(value instanceof Number){
            minTokenMatchFactor = ((Number)value).floatValue();
        } else if(value != null){
            try {
                minTokenMatchFactor = Float.valueOf(value.toString());
            } catch (NumberFormatException e) {
                throw new ConfigurationException(MIN_TOKEN_MATCH_FACTOR, 
                    "Unable to parse the minimum token match factor from the parsed value "+value,e);
            }
            if(minTokenMatchFactor < 0){
                minTokenMatchFactor = EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR;
            }
        } else {
            minTokenMatchFactor = EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR;
        }
        if(minTokenMatchFactor == 0 || minTokenMatchFactor > 1){
            throw new ConfigurationException(MIN_TOKEN_MATCH_FACTOR, 
                "The minimum token match factor MUST be > 0 and <= 1 (negative values for the default)");
        }
        linkerConfig.setMinTokenMatchFactor(minTokenMatchFactor);


        //init type mappings
        value = configuration.get(TYPE_MAPPINGS);
        if(value instanceof String[]){ //support array
            value = Arrays.asList((String[])value);
        } else if(value instanceof String) { //single value
            value = Collections.singleton(value);
        }
        if(value instanceof Collection<?>){ //and collection
            log.info("Init Type Mappings");
            configs :
            for(Object o : (Iterable<?>)value){
                if(o != null){
                    StringBuilder usage = new StringBuilder("useages: ");
                    usage.append("a: '{uri}' short for {uri} > {uri} | ");
                    usage.append("b: '{source1};{source2};..;{sourceN} > {target}'");
                    String[] config = o.toString().split(">");
                    if(config[0].isEmpty()){
                        log.warn("Invalid Type Mapping Config '{}': Missing Source Type ({}) -> ignore this config",
                            o,usage);
                        continue configs;
                    }
                    String[] sourceTypes = config[0].split(";");
                    if(sourceTypes.length > 1 && (config.length < 2 || config[1].isEmpty())){
                        log.warn("Invalid Type Mapping Config '{}': Missing Target Type '{}' ({}) -> ignore this config",
                            o,usage);
                        continue configs;
                    }
                    String targetType = config.length < 2 ? sourceTypes[0] : config[1];
                    targetType = NamespaceMappingUtils.getConfiguredUri(
                        nsPrefixService,TYPE_MAPPINGS,targetType.trim()); //support for ns:localName
                    try { //validate
                        new URI(targetType);
                    } catch (URISyntaxException e) {
                        log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this config",
                            sourceTypes[0],o);
                        continue configs;
                    }
                    UriRef targetUri = new UriRef(targetType);
                    for(String sourceType : sourceTypes){
                        if(!sourceType.isEmpty()){
                            sourceType = NamespaceMappingUtils.getConfiguredUri(
                                nsPrefixService,TYPE_MAPPINGS,sourceType.trim()); //support for ns:localName
                            try { //validate
                                new URI(sourceType);
                                UriRef old = linkerConfig.setTypeMapping(sourceType, targetUri);
                                if(old == null){
                                    log.info(" > add type mapping {} > {}", sourceType,targetType);
                                } else {
                                    log.info(" > set type mapping {} > {} (old: {})", 
                                        new Object[]{sourceType,targetType,old.getUnicodeString()});
                                }
                            } catch (URISyntaxException e) {
                                log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this source type",
                                    sourceTypes[0],o);
                            }
                        }
                    }
                }
            }
        } else {
            log.debug("No Type mappings configured");
        }
    }


    /**
     * Initialise the {@link #entitySearcher} based on the value of the
     * {@link #REFERENCED_SITE_ID} property in the parsed configuration
     * @param context
     * @param configuration
     * @throws ConfigurationException
     */
    protected void activateEntitySearcher(ComponentContext context, Dictionary<String,Object> configuration) throws ConfigurationException {
        Object value = configuration.get(REFERENCED_SITE_ID);
        //init the EntitySource
        if (value == null) {
            throw new ConfigurationException(REFERENCED_SITE_ID,
                    "The ID of the Referenced Site is a required Parameter and MUST NOT be NULL!");
        }
        referencedSiteName = value.toString();
        if (referencedSiteName.isEmpty()) {
            throw new ConfigurationException(REFERENCED_SITE_ID,
                    "The ID of the Referenced Site is a required Parameter and MUST NOT be an empty String!");
        }
        //TODO: make limit configurable!
        if(Entityhub.ENTITYHUB_IDS.contains(referencedSiteName.toLowerCase())){
            entitySearcher = new EntityhubSearcher(context.getBundleContext(),10);
        } else {
            entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),referencedSiteName,10);
        }
    }
    /**
     * Deactivates this Engine. Subclasses should not call this method but rather
     * call<ul>
     * <li> {@link #deactivateEntitySearcher()}
     * <li> {@link #deactivateTextAnalyzerConfig()}
     * <li> {@link #deactivateEntityLinkerConfig()} and
     * <li> {@link #deactivateEntityDereference()}
     * </ul>
     * @param context the context (only passed on to <code>super.deactivate</code>)
     */
    @Deactivate
    protected void deactivate(ComponentContext context) {
        super.deactivate(context);
        deactivateEntitySearcher();
        deactivateTextAnalyzerConfig();
        deactivateEntityLinkerConfig();
        deactivateEntityDereference();
    }
    /**
     * Resets the {@link #dereferenceEntitiesState} to 
     * {@link #DEFAULT_DEREFERENCE_ENTITIES_STATE}
     */
    protected final void deactivateEntityDereference() {
        dereferenceEntitiesState = DEFAULT_DEREFERENCE_ENTITIES_STATE;
    }


    /**
     * Deactivates the {@link TextAnalyzer} as well as resets the set of languages
     * to process to {@link #DEFAULT_LANGUAGES}
     */
    protected void deactivateTextAnalyzerConfig() {
        this.nlpConfig = null;
        this.analysedContentFactory = null;
        languages = DEFAULT_LANGUAGES;
    }


    /**
     * sets the {@link EntityLinkerConfig} to <code>null</code>
     */
    protected void deactivateEntityLinkerConfig() {
        linkerConfig = null;
    }


    /**
     * Closes and resets the EntitySearcher. Also calls
     * {@link TrackingEntitySearcher#close()} if applicable. 
     */
    protected void deactivateEntitySearcher() {
        if(entitySearcher instanceof TrackingEntitySearcher<?>){
            //close tracking EntitySearcher
            ((TrackingEntitySearcher<?>)entitySearcher).close();
        }
        entitySearcher = null;
        referencedSiteName = null;
    }
}
Source Code of org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine

Related Classes of org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine