Source Code of com.bericotech.clavin.gazetteer.query.LuceneGazetteer$QueryPart

/*#####################################################################
 *
 * CLAVIN (Cartographic Location And Vicinity INdexer)
 * ---------------------------------------------------
 *
 * Copyright (C) 2012-2013 Berico Technologies
 * http://clavin.bericotechnologies.com
 *
 * ====================================================================
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 * ====================================================================
 *
 * LuceneGazetteer.java
 *
 *###################################################################*/


package com.bericotech.clavin.gazetteer.query;


import static com.bericotech.clavin.index.IndexField.*;
import static org.apache.lucene.queryparser.classic.QueryParserBase.escape;


import com.bericotech.clavin.ClavinException;
import com.bericotech.clavin.extractor.LocationOccurrence;
import com.bericotech.clavin.gazetteer.FeatureCode;
import com.bericotech.clavin.gazetteer.GeoName;
import com.bericotech.clavin.index.BinarySimilarity;
import com.bericotech.clavin.index.IndexField;
import com.bericotech.clavin.index.WhitespaceLowerCaseAnalyzer;
import com.bericotech.clavin.resolver.ResolvedLocation;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.analyzing.AnalyzingQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * An implementation of Gazetteer that uses Lucene to rapidly search
 * known locations.
 */
public class LuceneGazetteer implements Gazetteer {
    /**
     * The SLF4J logger shared by all instances of this class.
     */
    private static final Logger LOG = LoggerFactory.getLogger(LuceneGazetteer.class);


    /**
     * Analyzer handed to the query parser for every query built by this
     * class; the index employs simple lower-casing and tokenizing on
     * whitespace.
     */
    private static final Analyzer INDEX_ANALYZER = new WhitespaceLowerCaseAnalyzer();


    /**
     * Custom Lucene sorting applied to searches: primarily by Lucene
     * match score, then by the long-valued {@code SORT_POP} field with
     * the reverse flag set.
     */
    private static final Sort POPULATION_SORT = new Sort(new SortField[] {
        SortField.FIELD_SCORE,
        // NOTE(review): the POPULATION-based sort below is disabled in favor of SORT_POP;
        // presumably SORT_POP is a dedicated sortable copy of the population -- confirm against the indexer
        // new SortField(POPULATION.key(), SortField.Type.LONG, true)
        new SortField(SORT_POP.key(), SortField.Type.LONG, true)
    });


    /**
     * The number of results returned when a query does not request a
     * positive maximum; also used as the size of the priming query.
     */
    private static final int DEFAULT_MAX_RESULTS = 5;


    /**
     * The unmodifiable set of all FeatureCodes; a query that asks for
     * exactly this set needs no feature code restriction.
     */
    private static final Set<FeatureCode> ALL_CODES = Collections.unmodifiableSet(EnumSet.allOf(FeatureCode.class));


    /**
     * The format string for exact match queries: wraps the sanitized
     * name in double quotes.
     */
    private static final String EXACT_MATCH_FMT = "\"%s\"";


    /**
     * The format string for fuzzy queries: appends the {@code ~}
     * operator to the sanitized name.
     */
    private static final String FUZZY_FMT = "%s~";


    // Lucene index directory opened by the constructor, and the searcher that runs every query against it
    private final FSDirectory index;
    private final IndexSearcher indexSearcher;


    /**
     * Builds a {@link LuceneGazetteer} by loading a pre-built Lucene
     * index from disk and setting configuration parameters for
     * resolving location names to GeoName objects.
     *
     * @param indexDir              Lucene index directory to be loaded
     * @throws ClavinException      if an error occurs opening the index
     */
    public LuceneGazetteer(final File indexDir) throws ClavinException {
        try {
        // load the Lucene index directory from disk
        index = FSDirectory.open(indexDir);


        indexSearcher = new IndexSearcher(DirectoryReader.open(index));


        // override default TF/IDF score to ignore multiple appearances
        indexSearcher.setSimilarity(new BinarySimilarity());


        // run an initial throw-away query just to "prime the pump" for
        // the cache, so we can accurately measure performance speed
        // per: http://wiki.apache.org/lucene-java/ImproveSearchingSpeed
        indexSearcher.search(new AnalyzingQueryParser(Version.LUCENE_4_9, INDEX_NAME.key(),
                INDEX_ANALYZER).parse("Reston"), null, DEFAULT_MAX_RESULTS, POPULATION_SORT);
        } catch (ParseException pe) {
            throw new ClavinException("Error executing priming query.", pe);
        } catch (IOException ioe) {
            throw new ClavinException("Error opening gazetteer index.", ioe);
        }
    }


    /**
     * Execute a query against the Lucene gazetteer index using the provided configuration,
     * returning the top matches as {@link ResolvedLocation}s.
     *
     * @param query              the configuration parameters for the query
     * @return                   the list of ResolvedLocations as potential matches
     * @throws ClavinException   if an error occurs
     */
    @Override
    @SuppressWarnings("unchecked")
    public List<ResolvedLocation> getClosestLocations(final GazetteerQuery query) throws ClavinException {
        // sanitize the query input
        String sanitizedLocationName = sanitizeQueryText(query);


        // if there is no location to query, return no results
        if ("".equals(sanitizedLocationName)) {
            return Collections.EMPTY_LIST;
        }


        LocationOccurrence location = query.getOccurrence();
        int maxResults = query.getMaxResults() > 0 ? query.getMaxResults() : DEFAULT_MAX_RESULTS;
        Filter filter = buildFilter(query);
        List<ResolvedLocation> matches;
        try {
            // attempt to find an exact match for the query
            matches = executeQuery(location, sanitizedLocationName, filter, maxResults, false, query.isFilterDupes(), null);
            if (LOG.isDebugEnabled()) {
                for (ResolvedLocation loc : matches) {
                    LOG.debug("{}", loc);
                }
            }
            // check to see if we should run a fuzzy query based on the configured FuzzyMode
            if (query.getFuzzyMode().useFuzzyMatching(maxResults, matches.size())) {
                // provide any exact matches if we are running a fuzzy query so they can be considered for deduplication
                // and result count
                matches = executeQuery(location, sanitizedLocationName, filter, maxResults, true, query.isFilterDupes(), matches);
                if (LOG.isDebugEnabled()) {
                    for (ResolvedLocation loc : matches) {
                        LOG.debug("{}[fuzzy]", loc);
                    }
                }
            }
            if (matches.isEmpty()) {
                LOG.debug("No match found for: '{}'", location.getText());
            }
        } catch (ParseException pe) {
            throw new ClavinException(String.format("Error parsing query for: '%s'}", location.getText()), pe);
        } catch (IOException ioe) {
            throw new ClavinException(String.format("Error executing query for: '%s'}", location.getText()), ioe);
        }
        return matches;
    }


    /**
     * Executes a query against the Lucene index, processing the results and returning
     * at most maxResults ResolvedLocations with ancestry resolved.
     * @param location the location occurrence
     * @param sanitizedName the sanitized name of the search location
     * @param filter the filter used to restrict the search results
     * @param maxResults the maximum number of results
     * @param fuzzy is this a fuzzy query
     * @param dedupe should duplicate locations be filtered from the results
     * @param previousResults the results of a previous query that should be used for duplicate filtering and appended to until
     *                        no additional matches are found or maxResults has been reached; the input list will not be modified
     *                        and may be <code>null</code>
     * @return the ResolvedLocations with ancestry resolved matching the query
     * @throws ParseException if an error occurs generating the query
     * @throws IOException if an error occurs executing the query
     */
    private List<ResolvedLocation> executeQuery(final LocationOccurrence location, final String sanitizedName, final Filter filter,
            final int maxResults, final boolean fuzzy, final boolean dedupe, final List<ResolvedLocation> previousResults)
            throws ParseException, IOException {
        Query query = new AnalyzingQueryParser(Version.LUCENE_4_9, INDEX_NAME.key(), INDEX_ANALYZER)
                .parse(String.format(fuzzy ? FUZZY_FMT : EXACT_MATCH_FMT, sanitizedName));


        List<ResolvedLocation> matches = new ArrayList<ResolvedLocation>(maxResults);


        Map<Integer, Set<GeoName>> parentMap = new HashMap<Integer, Set<GeoName>>();


        // reuse GeoName instances so all ancestry is correctly resolved if multiple names for
        // the same GeoName match the query
        Map<Integer, GeoName> geonameMap = new HashMap<Integer, GeoName>();
        // if we are filling previous results, add them to the match list and the geoname map
        // so they can be used for deduplication or re-used if additional matches are found
        if (previousResults != null) {
            matches.addAll(previousResults);
            for (ResolvedLocation loc : previousResults) {
                geonameMap.put(loc.getGeoname().getGeonameID(), loc.getGeoname());
            }
        }


        // short circuit if we were provided enough previous results to satisfy maxResults;
        // we do this here because the query loop condition is evaluated after the query
        // is executed and results are processed to support de-duplication
        if (matches.size() >= maxResults) {
            return matches;
        }


        // track the last discovered hit so we can re-execute the query if we are
        // deduping and need to fill results
        ScoreDoc lastDoc = null;
        do {
            // collect all the hits up to maxResults, and sort them based
            // on Lucene match score and population for the associated
            // GeoNames record
            TopDocs results = indexSearcher.searchAfter(lastDoc, query, filter, maxResults, POPULATION_SORT);
            // set lastDoc to null so we don't infinite loop if results is empty
            lastDoc = null;
            // populate results if matches were discovered
            for (ScoreDoc scoreDoc : results.scoreDocs) {
                lastDoc = scoreDoc;
                Document doc = indexSearcher.doc(scoreDoc.doc);
                // reuse GeoName instances so all ancestry is correctly resolved if multiple names for
                // the same GeoName match the query
                int geonameID = GEONAME_ID.getValue(doc);
                GeoName geoname = geonameMap.get(geonameID);
                if (geoname == null) {
                    geoname = GeoName.parseFromGeoNamesRecord((String) GEONAME.getValue(doc), (String) PREFERRED_NAME.getValue(doc));
                    geonameMap.put(geonameID, geoname);
                } else if (dedupe) {
                    // if we have already seen this GeoName and we are removing duplicates, skip to the next doc
                    continue;
                }
                String matchedName = INDEX_NAME.getValue(doc);
                if (!geoname.isAncestryResolved()) {
                    IndexableField parentIdField = doc.getField(IndexField.PARENT_ID.key());
                    Integer parentId = parentIdField != null && parentIdField.numericValue() != null ?
                            parentIdField.numericValue().intValue() : null;
                    if (parentId != null) {
                        Set<GeoName> geos = parentMap.get(parentId);
                        if (geos == null) {
                            geos = new HashSet<GeoName>();
                            parentMap.put(parentId, geos);
                        }
                        geos.add(geoname);
                    }
                }
                matches.add(new ResolvedLocation(location, geoname, matchedName, fuzzy));
                // stop processing results if we have reached maxResults matches
                if (matches.size() >= maxResults) {
                    break;
                }
            }
        } while (dedupe && lastDoc != null && matches.size() < maxResults);
        // if any results need ancestry resolution, resolve parents
        if (!parentMap.isEmpty()) {
            resolveParents(parentMap);
        }


        return matches;
    }


    /**
     * Sanitizes the text of the LocationOccurrence in the query parameters for
     * use in a Lucene query, returning an empty string if no text is found.
     * @param query the query configuration
     * @return the santitized query text or the empty string if there is no query text
     */
    private String sanitizeQueryText(final GazetteerQuery query) {
        String sanitized = "";
        if (query != null && query.getOccurrence() != null) {
            String text = query.getOccurrence().getText();
            if (text != null) {
                sanitized = escape(text.trim().toLowerCase());
            }
        }
        return sanitized;
    }


    /**
     * Builds a Lucene search filter based on the provided parameters.
     * @param params the query configuration parameters
     * @return a Lucene search filter that will restrict the returned documents to the criteria provided or <code>null</code>
     *         if no filtering is necessary
     */
    private Filter buildFilter(final GazetteerQuery params) {
        List<Query> queryParts = new ArrayList<Query>();


        // create the historical locations restriction if we are not including historical locations
        if (!params.isIncludeHistorical()) {
            int val = IndexField.getBooleanIndexValue(false);
            queryParts.add(NumericRangeQuery.newIntRange(HISTORICAL.key(), val, val, true, true));
        }


        // create the parent ID restrictions if we were provided at least one parent ID
        Set<Integer> parentIds = params.getParentIds();
        if (!parentIds.isEmpty()) {
            BooleanQuery parentQuery = new BooleanQuery();
            // locations must descend from at least one of the specified parents (OR)
            for (Integer id : parentIds) {
                parentQuery.add(NumericRangeQuery.newIntRange(ANCESTOR_IDS.key(), id, id, true, true), Occur.SHOULD);
            }
            queryParts.add(parentQuery);
        }


        // create the feature code restrictions if we were provided some, but not all, feature codes
        Set<FeatureCode> codes = params.getFeatureCodes();
        if (!(codes.isEmpty() || ALL_CODES.equals(codes))) {
            BooleanQuery codeQuery = new BooleanQuery();
            // locations must be one of the specified feature codes (OR)
            for (FeatureCode code : codes) {
                codeQuery.add(new TermQuery(new Term(FEATURE_CODE.key(), code.name())), Occur.SHOULD);
            }
            queryParts.add(codeQuery);
        }


        Filter filter = null;
        if (!queryParts.isEmpty()) {
            BooleanQuery bq = new BooleanQuery();
            for (Query part : queryParts) {
                bq.add(part, Occur.MUST);
            }
            filter = new QueryWrapperFilter(bq);
        }
        return filter;
    }


    /**
     * Retrieves and sets the parents of the provided children.
     * @param childMap the map of parent geonameID to the set of children that belong to it
     * @throws IOException if an error occurs during parent resolution
     */
    private void resolveParents(final Map<Integer, Set<GeoName>> childMap) throws IOException {
        Map<Integer, GeoName> parentMap = new HashMap<Integer, GeoName>();
        Map<Integer, Set<GeoName>> grandParentMap = new HashMap<Integer, Set<GeoName>>();
        for (Integer parentId : childMap.keySet()) {
            // Lucene query used to look for exact match on the "geonameID" field
            Query q = NumericRangeQuery.newIntRange(GEONAME_ID.key(), parentId, parentId, true, true);
            TopDocs results = indexSearcher.search(q, null, 1, POPULATION_SORT);
            if (results.scoreDocs.length > 0) {
                Document doc = indexSearcher.doc(results.scoreDocs[0].doc);
                GeoName parent = GeoName.parseFromGeoNamesRecord(doc.get(GEONAME.key()), doc.get(PREFERRED_NAME.key()));
                parentMap.put(parent.getGeonameID(), parent);
                if (!parent.isAncestryResolved()) {
                    Integer grandParentId = PARENT_ID.getValue(doc);
                    if (grandParentId != null) {
                        Set<GeoName> geos = grandParentMap.get(grandParentId);
                        if (geos == null) {
                            geos = new HashSet<GeoName>();
                            grandParentMap.put(grandParentId, geos);
                        }
                        geos.add(parent);
                    }
                }
            } else {
                LOG.error("Unable to find parent GeoName [{}]", parentId);
            }
        }


        // find all parents of the parents
        if (!grandParentMap.isEmpty()) {
            resolveParents(grandParentMap);
        }


        // set parents of children
        for (Integer parentId : childMap.keySet()) {
            GeoName parent = parentMap.get(parentId);
            if (parent == null) {
                LOG.info("Unable to find parent with ID [{}]", parentId);
                continue;
            }
            for (GeoName child : childMap.get(parentId)) {
                child.setParent(parent);
            }
        }
    }


    /**
     * Retrieves the GeoName with the provided ID.
     * @param geonameId          the ID of the requested GeoName
     * @return                   the requested GeoName or <code>null</code> if not found
     * @throws ClavinException   if an error occurs
     */
    @Override
    public GeoName getGeoName(final int geonameId) throws ClavinException {
        try {
            GeoName geoName = null;
            // Lucene query used to look for exact match on the "geonameID" field
            Query q = NumericRangeQuery.newIntRange(GEONAME_ID.key(), geonameId, geonameId, true, true);
            // retrieve only one matching document
            TopDocs results = indexSearcher.search(q, 1);
            if (results.scoreDocs.length > 0) {
                Document doc = indexSearcher.doc(results.scoreDocs[0].doc);
                geoName = GeoName.parseFromGeoNamesRecord(doc.get(GEONAME.key()), doc.get(PREFERRED_NAME.key()));
                if (!geoName.isAncestryResolved()) {
                    Integer parentId = PARENT_ID.getValue(doc);
                    if (parentId != null) {
                        Map<Integer, Set<GeoName>> childMap = new HashMap<Integer, Set<GeoName>>();
                        childMap.put(parentId, Collections.singleton(geoName));
                        resolveParents(childMap);
                    }
                }
            } else {
                LOG.debug("No geoname found for ID: {}", geonameId);
            }
            return geoName;
        } catch (IOException e) {
            String msg = String.format("Error retrieving geoname with ID : %d", geonameId);
            LOG.error(msg, e);
            throw new ClavinException(msg, e);
        }
    }


    private static class QueryPart {
        public final Query query;
        public final Occur occur;


        public QueryPart(Query query, Occur occur) {
            this.query = query;
            this.occur = occur;
        }
    }
}
Source Code of com.bericotech.clavin.gazetteer.query.LuceneGazetteer$QueryPart

Related Classes of com.bericotech.clavin.gazetteer.query.LuceneGazetteer$QueryPart