Package io.lumify.clavin

Source Code of io.lumify.clavin.ClavinTermMentionFilter

package io.lumify.clavin;

import com.bericotech.clavin.extractor.LocationOccurrence;
import com.bericotech.clavin.gazetteer.FeatureClass;
import com.bericotech.clavin.gazetteer.FeatureCode;
import com.bericotech.clavin.gazetteer.GeoName;
import com.bericotech.clavin.resolver.LuceneLocationResolver;
import com.bericotech.clavin.resolver.ResolvedLocation;
import com.google.inject.Inject;
import io.lumify.core.config.Configuration;
import io.lumify.core.exception.LumifyException;
import io.lumify.core.ingest.graphProperty.TermMentionFilter;
import io.lumify.core.ingest.graphProperty.TermMentionFilterPrepareData;
import io.lumify.core.model.audit.AuditAction;
import io.lumify.core.model.audit.AuditRepository;
import io.lumify.core.model.ontology.Concept;
import io.lumify.core.model.ontology.OntologyProperty;
import io.lumify.core.model.ontology.OntologyRepository;
import io.lumify.core.model.properties.LumifyProperties;
import io.lumify.core.model.termMention.TermMentionBuilder;
import io.lumify.core.model.workspace.WorkspaceRepository;
import io.lumify.core.user.User;
import io.lumify.core.util.LumifyLogger;
import io.lumify.core.util.LumifyLoggerFactory;
import io.lumify.web.clientapi.model.PropertyType;
import io.lumify.web.clientapi.model.VisibilityJson;
import org.apache.lucene.queryparser.classic.ParseException;
import org.securegraph.Authorizations;
import org.securegraph.Edge;
import org.securegraph.ElementBuilder;
import org.securegraph.Vertex;
import org.securegraph.type.GeoPoint;

import java.io.File;
import java.io.IOException;
import java.util.*;

import static org.securegraph.util.IterableUtils.count;

/**
* This TermResolutionWorker uses the CLAVIN processor to refine
* identification of location entities.
*/
public class ClavinTermMentionFilter extends TermMentionFilter {
    private static final LumifyLogger LOGGER = LumifyLoggerFactory.getLogger(ClavinTermMentionFilter.class);

    public static final String MULTI_VALUE_PROPERTY_KEY = ClavinTermMentionFilter.class.getName();

    /**
     * The CLAVIN index directory configuration key.
     */
    public static final String CLAVIN_INDEX_DIRECTORY = "clavin.indexDirectory";

    /**
     * The CLAVIN max hit depth configuration key.
     */
    public static final String CLAVIN_MAX_HIT_DEPTH = "clavin.maxHitDepth";

    /**
     * The CLAVIN max context window configuration key.
     */
    public static final String CLAVIN_MAX_CONTEXT_WINDOW = "clavin.maxContextWindow";

    /**
     * The CLAVIN use fuzzy matching configuration key.
     */
    public static final String CLAVIN_USE_FUZZY_MATCHING = "clavin.useFuzzyMatching";

    /**
     * The default max hit depth.
     */
    public static final int DEFAULT_MAX_HIT_DEPTH = 5;

    /**
     * The default max context window.
     */
    public static final int DEFAULT_MAX_CONTENT_WINDOW = 5;

    /**
     * The default fuzzy matching.
     */
    public static final boolean DEFAULT_FUZZY_MATCHING = false;

    private static final String CONFIG_STATE_IRI = "ontology.iri.state";
    private static final String CONFIG_COUNTRY_IRI = "ontology.iri.country";
    private static final String CONFIG_CITY_IRI = "ontology.iri.city";
    private static final String CONFIG_GEO_LOCATION_IRI = "ontology.iri.geoLocation";
    private static final String CONFIG_EXCLUDED_IRI_PREFIX = "clavin.excludeIri";

    private LuceneLocationResolver resolver;
    private boolean fuzzy;
    private Set<String> targetConcepts;
    private OntologyRepository ontologyRepository;
    private String stateIri;
    private String countryIri;
    private String cityIri;
    private String geoLocationIri;
    private AuditRepository auditRepository;
    private User user;
    private String artifactHasEntityIri;
    private WorkspaceRepository workspaceRepository;

    @Override
    public void prepare(TermMentionFilterPrepareData termMentionFilterPrepareData) throws Exception {
        super.prepare(termMentionFilterPrepareData);

        LOGGER.info("Configuring CLAVIN Location Resolution.");
        prepareIris(termMentionFilterPrepareData);
        prepareClavinLuceneIndex(getConfiguration());
        prepareFuzzy(getConfiguration());
        prepareTargetConcepts(getConfiguration());
        user = termMentionFilterPrepareData.getUser();
    }

    public void prepareTargetConcepts(Configuration config) {
        Set<String> excludedIris = getExcludedIris(config);

        Set<String> conceptsWithGeoLocationProperty = new HashSet<String>();
        for (Concept concept : ontologyRepository.getConceptsWithProperties()) {
            for (OntologyProperty property : concept.getProperties()) {
                String iri = concept.getTitle();
                if (property.getDataType() == PropertyType.GEO_LOCATION && !excludedIris.contains(iri)) {
                    conceptsWithGeoLocationProperty.add(iri);
                    break;
                }
            }
        }
        targetConcepts = Collections.unmodifiableSet(conceptsWithGeoLocationProperty);
    }

    private Set<String> getExcludedIris(Configuration config) {
        Set<String> excludedIris = new HashSet<String>();
        excludedIris.addAll(config.getSubset(CONFIG_EXCLUDED_IRI_PREFIX).values());
        return excludedIris;
    }

    public void prepareFuzzy(Configuration config) {
        String fuzzyStr = config.get(CLAVIN_USE_FUZZY_MATCHING, null);
        if (fuzzyStr != null) {
            fuzzyStr = fuzzyStr.trim();
        }
        if (fuzzyStr != null && Boolean.TRUE.toString().equalsIgnoreCase(fuzzyStr) ||
                Boolean.FALSE.toString().equalsIgnoreCase(fuzzyStr)) {
            fuzzy = Boolean.parseBoolean(fuzzyStr);
            LOGGER.debug("Found %s: %s. fuzzy=%s", CLAVIN_USE_FUZZY_MATCHING, fuzzyStr, fuzzy);
        } else {
            LOGGER.debug("%s not configured. Using default: %s", CLAVIN_USE_FUZZY_MATCHING, DEFAULT_FUZZY_MATCHING);
            fuzzy = DEFAULT_FUZZY_MATCHING;
        }
    }

    public void prepareClavinLuceneIndex(Configuration config) throws IOException, ParseException {
        String idxDirPath = config.get(CLAVIN_INDEX_DIRECTORY, null);
        if (idxDirPath == null || idxDirPath.trim().isEmpty()) {
            throw new IllegalArgumentException(String.format("%s must be configured.", CLAVIN_INDEX_DIRECTORY));
        }
        LOGGER.debug("Configuring CLAVIN index [%s]: %s", CLAVIN_INDEX_DIRECTORY, idxDirPath);
        File indexDirectory = new File(idxDirPath);
        if (!indexDirectory.exists() || !indexDirectory.isDirectory()) {
            throw new IllegalArgumentException(String.format("CLAVIN index cannot be found at configured (%s) location: %s",
                    CLAVIN_INDEX_DIRECTORY, idxDirPath));
        }

        int maxHitDepth = config.getInt(CLAVIN_MAX_HIT_DEPTH);
        if (maxHitDepth < 1) {
            LOGGER.debug("Found %s of %d. Using default: %d", CLAVIN_MAX_HIT_DEPTH, maxHitDepth, DEFAULT_MAX_HIT_DEPTH);
            maxHitDepth = DEFAULT_MAX_HIT_DEPTH;
        }
        int maxContextWindow = config.getInt(CLAVIN_MAX_CONTEXT_WINDOW);
        if (maxContextWindow < 1) {
            LOGGER.debug("Found %s of %d. Using default: %d", CLAVIN_MAX_CONTEXT_WINDOW, maxContextWindow, DEFAULT_MAX_CONTENT_WINDOW);
            maxContextWindow = DEFAULT_MAX_CONTENT_WINDOW;
        }

        resolver = new LuceneLocationResolver(indexDirectory, maxHitDepth, maxContextWindow);
    }

    public void prepareIris(TermMentionFilterPrepareData termMentionFilterPrepareData) {
        this.artifactHasEntityIri = getConfiguration().get(Configuration.ONTOLOGY_IRI_ARTIFACT_HAS_ENTITY);
        if (this.artifactHasEntityIri == null) {
            throw new LumifyException("Could not find configuration for " + Configuration.ONTOLOGY_IRI_ARTIFACT_HAS_ENTITY);
        }

        stateIri = (String) termMentionFilterPrepareData.getStormConf().get(CONFIG_STATE_IRI);
        if (stateIri == null || stateIri.length() == 0) {
            throw new LumifyException("Could not find config: " + CONFIG_STATE_IRI);
        }

        countryIri = (String) termMentionFilterPrepareData.getStormConf().get(CONFIG_COUNTRY_IRI);
        if (countryIri == null || countryIri.length() == 0) {
            throw new LumifyException("Could not find config: " + CONFIG_COUNTRY_IRI);
        }

        cityIri = (String) termMentionFilterPrepareData.getStormConf().get(CONFIG_CITY_IRI);
        if (cityIri == null || cityIri.length() == 0) {
            throw new LumifyException("Could not find config: " + CONFIG_CITY_IRI);
        }

        geoLocationIri = (String) termMentionFilterPrepareData.getStormConf().get(CONFIG_GEO_LOCATION_IRI);
        if (geoLocationIri == null || geoLocationIri.length() == 0) {
            throw new LumifyException("Could not find config: " + CONFIG_GEO_LOCATION_IRI);
        }
    }

    @Override
    public void apply(Vertex sourceVertex, Iterable<Vertex> termMentions, Authorizations authorizations) throws IOException, ParseException {
        List<LocationOccurrence> locationOccurrences = getLocationOccurrencesFromTermMentions(termMentions);
        LOGGER.info("Found %d Locations in %d terms.", locationOccurrences.size(), count(termMentions));
        List<ResolvedLocation> resolvedLocationNames = resolver.resolveLocations(locationOccurrences, fuzzy);
        LOGGER.info("Resolved %d Locations", resolvedLocationNames.size());

        if (resolvedLocationNames.isEmpty()) {
            return;
        }

        Map<Integer, ResolvedLocation> resolvedLocationOffsetMap = new HashMap<Integer, ResolvedLocation>();
        for (ResolvedLocation resolvedLocation : resolvedLocationNames) {
            // assumes start/end positions are real, i.e., unique start positions for each extracted term
            resolvedLocationOffsetMap.put(resolvedLocation.getLocation().getPosition(), resolvedLocation);
        }

        ResolvedLocation loc;
        String processId = getClass().getName();
        for (Vertex termMention : termMentions) {
            loc = resolvedLocationOffsetMap.get((int) LumifyProperties.TERM_MENTION_START_OFFSET.getPropertyValue(termMention, 0));
            if (isLocation(termMention) && loc != null) {
                String id = String.format("CLAVIN-%d", loc.getGeoname().getGeonameID());
                GeoPoint geoPoint = new GeoPoint(loc.getGeoname().getLatitude(), loc.getGeoname().getLongitude(), LumifyProperties.TERM_MENTION_TITLE.getPropertyValue(termMention));
                String title = toSign(loc);
                String termMentionConceptType = LumifyProperties.TERM_MENTION_CONCEPT_TYPE.getPropertyValue(termMention);
                String conceptType = getOntologyClassUri(loc, termMentionConceptType);

                VisibilityJson sourceVertexVisibilityJson = LumifyProperties.VISIBILITY_JSON.getPropertyValue(sourceVertex);
                Map<String, Object> metadata = new HashMap<String, Object>();
                LumifyProperties.VISIBILITY_JSON.setMetadata(metadata, sourceVertexVisibilityJson);
                ElementBuilder<Vertex> resolvedToVertexBuilder = getGraph().prepareVertex(id, sourceVertex.getVisibility())
                        .addPropertyValue(MULTI_VALUE_PROPERTY_KEY, geoLocationIri, geoPoint, metadata, sourceVertex.getVisibility());
                LumifyProperties.CONCEPT_TYPE.addPropertyValue(resolvedToVertexBuilder, MULTI_VALUE_PROPERTY_KEY, conceptType, metadata, sourceVertex.getVisibility());
                LumifyProperties.SOURCE.addPropertyValue(resolvedToVertexBuilder, MULTI_VALUE_PROPERTY_KEY, "CLAVIN", metadata, sourceVertex.getVisibility());
                LumifyProperties.TITLE.addPropertyValue(resolvedToVertexBuilder, MULTI_VALUE_PROPERTY_KEY, title, metadata, sourceVertex.getVisibility());
                LumifyProperties.VISIBILITY_JSON.addPropertyValue(resolvedToVertexBuilder, MULTI_VALUE_PROPERTY_KEY, sourceVertexVisibilityJson, metadata, sourceVertex.getVisibility());
                Vertex resolvedToVertex = resolvedToVertexBuilder.save(authorizations);
                getGraph().flush();

                String edgeId = sourceVertex.getId() + "-" + artifactHasEntityIri + "-" + resolvedToVertex.getId();
                Edge resolvedEdge = getGraph().prepareEdge(edgeId, sourceVertex, resolvedToVertex, artifactHasEntityIri, sourceVertex.getVisibility()).save(authorizations);
                LumifyProperties.VISIBILITY_JSON.addPropertyValue(resolvedEdge, MULTI_VALUE_PROPERTY_KEY, sourceVertexVisibilityJson, metadata, sourceVertex.getVisibility(), authorizations);
                VisibilityJson visibilityJson = LumifyProperties.TERM_MENTION_VISIBILITY_JSON.getPropertyValue(termMention);
                if (visibilityJson != null && visibilityJson.getWorkspaces().size() > 0) {
                    Set<String> workspaceIds = visibilityJson.getWorkspaces();
                    for (String workspaceId : workspaceIds) {
                        workspaceRepository.updateEntityOnWorkspace(workspaceRepository.findById(workspaceId, user), id, false, null, user);
                    }
                }

                Vertex resolvedMention = new TermMentionBuilder(termMention, sourceVertex)
                        .resolvedTo(resolvedToVertex, resolvedEdge)
                        .title(title)
                        .conceptIri(conceptType)
                        .process(processId)
                        .visibilityJson(LumifyProperties.TERM_MENTION_VISIBILITY_JSON.getPropertyValue(termMention))
                        .save(getGraph(), getVisibilityTranslator(), authorizations);

                LOGGER.debug("Replacing original location [%s] with resolved location [%s]", termMention.getId(), resolvedMention.getId());
            }
        }
        auditRepository.auditAnalyzedBy(AuditAction.ANALYZED_BY, sourceVertex, getClass().getSimpleName(), user, sourceVertex.getVisibility());
    }

    private String toSign(final ResolvedLocation location) {
        GeoName geoname = location.getGeoname();
        return String.format("%s (%s, %s)", geoname.getName(), geoname.getPrimaryCountryCode(), geoname.getAdmin1Code());
    }

    private boolean isLocation(final Vertex mention) {
        return targetConcepts.contains(LumifyProperties.TERM_MENTION_CONCEPT_TYPE.getPropertyValue(mention));
    }

    private List<LocationOccurrence> getLocationOccurrencesFromTermMentions(final Iterable<Vertex> termMentions) {
        List<LocationOccurrence> locationOccurrences = new ArrayList<LocationOccurrence>();

        for (Vertex termMention : termMentions) {
            if (isLocation(termMention)) {
                locationOccurrences.add(new LocationOccurrence(LumifyProperties.TERM_MENTION_TITLE.getPropertyValue(termMention), (int) LumifyProperties.TERM_MENTION_START_OFFSET.getPropertyValue(termMention, 0)));
            }
        }
        return locationOccurrences;
    }

    public String getOntologyClassUri(final ResolvedLocation location, final String defaultValue) {
        String uri = defaultValue;
        FeatureClass featureClass = location.getGeoname().getFeatureClass();
        FeatureCode featureCode = location.getGeoname().getFeatureCode();
        if (featureClass == null) {
            featureClass = FeatureClass.NULL;
        }
        if (featureCode == null) {
            featureCode = FeatureCode.NULL;
        }
        switch (featureClass) {
            case A:
                switch (featureCode) {
                    case ADM1:
                        uri = stateIri;
                        break;
                    case PCLI:
                        uri = countryIri;
                        break;
                }
                break;
            case P:
                uri = cityIri;
                break;
        }
        return uri;
    }

    @Inject
    public void setOntologyRepository(OntologyRepository ontologyRepository) {
        this.ontologyRepository = ontologyRepository;
    }

    @Inject
    public void setAuditRepository(AuditRepository auditRepository) {
        this.auditRepository = auditRepository;
    }

    @Inject
    public void setWorkspaceRepository(WorkspaceRepository workspaceRepository) {
        this.workspaceRepository = workspaceRepository;
    }
}
TOP

Related Classes of io.lumify.clavin.ClavinTermMentionFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.