Package com.bericotech.clavin.resolver.multipart

Source Code of com.bericotech.clavin.resolver.multipart.MultipartLocationResolver

/*#####################################################################
*
* CLAVIN (Cartographic Location And Vicinity INdexer)
* ---------------------------------------------------
*
* Copyright (C) 2012-2013 Berico Technologies
* http://clavin.bericotechnologies.com
*
* ====================================================================
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* ====================================================================
*
* MultipartLocationResolver.java
*
*###################################################################*/

package com.bericotech.clavin.resolver.multipart;

import com.bericotech.clavin.ClavinException;
import com.bericotech.clavin.gazetteer.CountryCode;
import com.bericotech.clavin.gazetteer.query.FuzzyMode;
import com.bericotech.clavin.gazetteer.query.Gazetteer;
import com.bericotech.clavin.gazetteer.GeoName;
import com.bericotech.clavin.gazetteer.query.QueryBuilder;
import com.bericotech.clavin.resolver.ResolvedLocation;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Deque;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Resolves multipart location names from structured data into GeoName objects.
*
* Takes multipart location names, such as what's often found in structured data
* like a spreadsheet or database table (e.g., [Reston][Virginia][United States]),
* and resolves them into the appropriate geographic entities by identifying the
* most logical match in a gazetteer, trying to enforce some kind of notional
* hierarchy of place names (e.g., city --> state/province/etc. --> country).
*/
public class MultipartLocationResolver {
    /**
     * The logger.
     */
    private final static Logger LOG = LoggerFactory.getLogger(MultipartLocationResolver.class);

    /**
     * The hit depth used during searches.
     */
    private static final int MAX_RESULTS = 200;

    /**
     * The gazetteer for searches.
     */
    private final Gazetteer gazetteer;

    /**
     * The scorer for multi-value searches.
     */
    private final Scorer scorer;

    public MultipartLocationResolver(final Gazetteer gaz) {
        this.gazetteer = gaz;
        scorer = new DefaultScorer();
    }

    /**
     * Resolves a multipart location name, such as what's often found
     * in structured data like a spreadsheet or database table (e.g.,
     * [Reston][Virginia][United States]), into a {@link ResolvedMultipartLocation}
     * containing {@link com.bericotech.clavin.gazetteer.GeoName} objects.
     *
     * @param location           multipart location name to be resolved
     * @param fuzzy              switch for turning on/off fuzzy matching
     * @return                   resolved multipart location name
     * @throws ClavinException   if an error occurs while resolving locations
     */
    public ResolvedMultipartLocation resolveMultipartLocation(MultipartLocationName location, boolean fuzzy)
            throws ClavinException {
        // find all component locations in the gazetteer
        QueryBuilder queryBuilder = new QueryBuilder()
                // translate CLAVIN 1.x 'fuzzy' parameter into NO_EXACT or OFF; it isn't
                // necessary, or desirable to support FILL for the multi-part resolution algorithm
                .fuzzyMode(fuzzy ? FuzzyMode.NO_EXACT : FuzzyMode.OFF)
                .includeHistorical(true)
                .maxResults(MAX_RESULTS);

        // country query should only include country-like feature codes
        queryBuilder.location(location.getCountry()).addCountryCodes();
        List<ResolvedLocation> countries = gazetteer.getClosestLocations(queryBuilder.build());
        // remove all "countries" that are not considered top-level administrative divisions; this
        // filters out territories that do not contain descendant GeoNames
        Iterator<ResolvedLocation> iter = countries.iterator();
        while (iter.hasNext()) {
            if (!iter.next().getGeoname().isTopLevelAdminDivision()) {
                iter.remove();
            }
        }

        Set<CountryCode> foundCountries = EnumSet.noneOf(CountryCode.class);
        // state query should only include admin-level feature codes with ancestors
        // in the list of located countries
        queryBuilder.location(location.getState()).clearFeatureCodes().addAdminCodes();
        for (ResolvedLocation country : countries) {
            queryBuilder.addParentIds(country.getGeoname().getGeonameID());
            foundCountries.add(country.getGeoname().getPrimaryCountryCode());
        }
        List<ResolvedLocation> states = gazetteer.getClosestLocations(queryBuilder.build());

        // city query should only include city-level feature codes; ancestry is restricted
        // to the discovered states or, if no states were found, the discovered countries or,
        // if neither states nor countries were found, no ancestry restrictions are added and
        // the most populated city will be selected
        queryBuilder.location(location.getCity()).clearFeatureCodes().addCityCodes();
        if (!states.isEmpty()) {
            Set<CountryCode> stateCodes = EnumSet.noneOf(CountryCode.class);
            // only clear the parent ID restrictions if states were found; otherwise
            // we will continue our search based on the existing country restrictions, if any
            queryBuilder.clearParentIds();
            for (ResolvedLocation state : states) {
                // only include the first administrative division found for each target
                // country
                if (!stateCodes.contains(state.getGeoname().getPrimaryCountryCode())) {
                    queryBuilder.addParentIds(state.getGeoname().getGeonameID());
                    stateCodes.add(state.getGeoname().getPrimaryCountryCode());
                }
                // since we are only including one "state" per country, short-circuit
                // the loop if we have added one for each unique country code returned
                // by the countries search
                if (!foundCountries.isEmpty() && foundCountries.equals(stateCodes)) {
                    break;
                }
            }
        }
        List<ResolvedLocation> cities = gazetteer.getClosestLocations(queryBuilder.build());

        // initialize return objects components
        ResolvedLocation finalCity = null;
        ResolvedLocation finalState = null;
        ResolvedLocation finalCountry = null;

        // assume the most populous valid city is the correct one return
        // note: this should be a reasonably safe assumption since we've attempted to enforce the
        // notional hierarchy of given place names (e.g., city --> state/province/etc. --> country)
        // and have therefore weeded out all other matches that don't fit this hierarchy
        if (!cities.isEmpty()) {
            finalCity = cities.get(0);
        }

        if (!states.isEmpty()) {
            // if we couldn't find a valid city, just take the most populous valid state/province/etc.
            if (finalCity == null) {
                finalState = states.get(0);
            } else {
                for (ResolvedLocation state : states) {
                    // select the first state that is an ancestor of the selected city
                    if (finalCity.getGeoname().isDescendantOf(state.getGeoname())) {
                        finalState = state;
                        break;
                    }
                }
            }
        }

        if (!countries.isEmpty()) {
            // use the selected city if available and the selected state if not to identify the selected country
            ResolvedLocation best = finalCity != null ? finalCity : finalState;
            // if neither city nor state was resolved, take the most populous valid country
            if (best == null) {
                finalCountry = countries.get(0);
            } else {
                for (ResolvedLocation country : countries) {
                    // select the first country that is an ancestor of the selected city or state
                    if (best.getGeoname().isDescendantOf(country.getGeoname())) {
                        finalCountry = country;
                        break;
                    }
                }
            }
        }

        return new ResolvedMultipartLocation(finalCity, finalState, finalCountry);
    }

    /**
     * Attempts to resolve a location provided as a comma-separated string of political divisions from
     * narrowest to broadest. The gazetteer current supports ancestry from the country level through four
     * administrative divisions so any more-specific divisions will be ignored once a city (lowest available
     * level of resolution) is found. Results will only be returned if all unignored location components are
     * matched.
     * @param loc the comma-separated location name (e.g. "City, County, State, Country")
     * @param fuzzy <code>true</code> to use fuzzy matching if an exact match for any location could not be found
     * @return the resolved location
     * @throws ClavinException if an error occurs while searching
     */
    public ResolvedLocation resolveLocation(final String loc, final boolean fuzzy) throws ClavinException {
        return resolveLocation(fuzzy, loc.split(","));
    }

    /**
     * Resolves a location provided as a series of political divisions from narrowest to broadest. The gazetteer
     * current supports ancestry from the country level through four administrative divisions so any more-specific
     * divisions will be ignored once a city (lowest available level of resolution) is found. Results will only
     * be returned if all unignored location components are matched.
     * @param fuzzy <code>true</code> to use fuzzy matching if an exact match for any location could not be found
     * @param locationParts the names of the locations to match, ordered from most to least specific
     *                      (e.g. [ "City", "County", "State", "Country" ])
     * @return the resolved location
     * @throws ClavinException if an error occurs while searching
     */
    @SuppressWarnings("unchecked")
    public ResolvedLocation resolveLocation(final boolean fuzzy, final String... locationParts)
            throws ClavinException {
        final List<String> terms = new ArrayList<String>(locationParts.length+1);
        // terms will be a list of broadest to narrowest; e.g. United States, Virginia, Fairfax County, Reston
        for (String part : locationParts) {
            if (part != null && !part.trim().equals("")) {
                terms.add(0, part);
            }
        }
        // short circuit if no input was provided
        if (terms.isEmpty()) {
            return null;
        }

        Set<MatchedLocation> candidates = new HashSet<MatchedLocation>();
        Deque<SearchResult> matches = new LinkedList<SearchResult>();
        QueryBuilder query = new QueryBuilder()
                .maxResults(MAX_RESULTS)
                // translate CLAVIN 1.x 'fuzzy' parameter into NO_EXACT or OFF; it isn't
                // necessary, or desirable to support FILL for the multi-part resolution algorithm
                .fuzzyMode(fuzzy ? FuzzyMode.NO_EXACT : FuzzyMode.OFF)
                .includeHistorical(true);
        findCandidates(candidates, terms, SearchLevel.COUNTRY, matches, query);

        // Using post-processing sort instead of SortedSet implementation (TreeSet) because
        // TreeSet uses compareTo instead of equals/hashCode to eliminate duplicates and
        // incorrectly excludes elements that evaluate to the same sort score
        List<MatchedLocation> candidateList = new ArrayList<MatchedLocation>(candidates);
        Collections.sort(candidateList, new Comparator<MatchedLocation>() {
            @Override
            public int compare(final MatchedLocation loc1, final MatchedLocation loc2) {
                double score1 = scorer.score(terms, loc1);
                double score2 = scorer.score(terms, loc2);
                // sort candidates in descending order by score
                return Double.compare(score2, score1);
            }
        });
        if (LOG.isDebugEnabled()) {
            LOG.debug("Found {} candidates", candidateList.size());
            for (MatchedLocation candidate : candidateList) {
                LOG.debug(String.format("[%.3f] %s", scorer.score(terms, candidate), candidate.toString()));
            }
        }
        MatchedLocation bestMatch = candidateList.isEmpty() ? null : candidateList.get(0);
        ResolvedLocation location = null;
        if (bestMatch != null && (bestMatch.isFullySpecified() || bestMatch.getMatchCount() == terms.size())) {
            location = bestMatch.getMostSpecificMatch().getLocation();
        }
        return location;
    }

    @SuppressWarnings("unchecked")
    private void findCandidates(final Set<MatchedLocation> candidates, final List<String> terms, final SearchLevel level,
            final Deque<SearchResult> matches, final QueryBuilder query) throws ClavinException {
        // if there are no more terms or level is null, add a candidate to the list
        // if there are any prior matches
        if (terms.isEmpty() || level == null) {
            if (!matches.isEmpty()) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Adding candidate for matches:");
                    for (SearchResult res : matches) {
                        LOG.debug(res.toString());
                    }
                }
                candidates.add(new MatchedLocation(matches));
            }
            return;
        }

        String term = terms.get(0);
        List<String> nextTerms = terms.size() > 1 ? terms.subList(1, terms.size()) : Collections.EMPTY_LIST;
        SearchResult lastMatch = matches.peek();
        level.apply(query).location(term).clearParentIds();
        if (lastMatch != null) {
            query.parentIds(lastMatch.parentIds);
        }
        List<ResolvedLocation> results = gazetteer.getClosestLocations(query.build());
        // no results for this term at this level; search for this term at the
        // next level, then search for subsequent terms at this level
        if (results.isEmpty()) {
            findCandidates(candidates, terms, level.narrow(), matches, query);
            findCandidates(candidates, nextTerms, level, matches, query);
        } else {
            // we found results, process them to configure the filters for the next
            // level of the search and add them to the matches stack
            Set<Integer> parentIds = new HashSet<Integer>();
            Set<String> parentCodes = new HashSet<String>();
            Set<String> foundParents = new HashSet<String>();
            // only include the first (best) result for each distinct parent in the filter set
            for (ResolvedLocation loc : results) {
                GeoName geo = loc.getGeoname();
                String pCode = lastMatch != null ? lastMatch.level.getCode(geo) : null;
                // if there were no parent filters or we have not found a child for this parent
                // code, add this location to the filter set
                if (lastMatch == null || !foundParents.contains(pCode)) {
                    parentIds.add(geo.getGeonameID());
                    parentCodes.add(level.getCode(geo));
                    foundParents.add(pCode);
                }
                // if there was a previous filter set, short-circuit once we have
                // a child from each parent
                if (lastMatch != null && foundParents.equals(lastMatch.parentCodes)) {
                    break;
                }
            }
            matches.push(new SearchResult(level, results, parentIds, parentCodes));
            // continue search for additional terms after adding these results to the
            // match stack
            findCandidates(candidates, nextTerms, level.narrow(), matches, query);
            // pop this match off the stack, then search for this term at the next level
            matches.pop();
            findCandidates(candidates, terms, level.narrow(), matches, query);
        }
    }
}
TOP

Related Classes of com.bericotech.clavin.resolver.multipart.MultipartLocationResolver

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.