/*#####################################################################
*
* CLAVIN (Cartographic Location And Vicinity INdexer)
* ---------------------------------------------------
*
* Copyright (C) 2012-2013 Berico Technologies
* http://clavin.bericotechnologies.com
*
* ====================================================================
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* ====================================================================
*
* ClavinLocationResolver.java
*
*###################################################################*/
package com.bericotech.clavin.resolver;
import com.bericotech.clavin.ClavinException;
import com.bericotech.clavin.extractor.LocationOccurrence;
import com.bericotech.clavin.gazetteer.CountryCode;
import com.bericotech.clavin.gazetteer.query.FuzzyMode;
import com.bericotech.clavin.gazetteer.query.Gazetteer;
import com.bericotech.clavin.gazetteer.query.QueryBuilder;
import com.bericotech.clavin.util.ListUtils;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Resolves location names into GeoName objects.
 *
 * Takes location names extracted from unstructured text documents by
 * {@link com.bericotech.clavin.extractor.LocationExtractor} and resolves them into the appropriate
 * geographic entities (as intended by the document's author based on
 * context) by finding the best match in a gazetteer.
 *
 * The set of demonyms used for filtering is loaded lazily from the
 * classpath resource {@code Demonyms.txt} on the first call to
 * {@link #isDemonym} and is shared by all instances of this class.
 */
public class ClavinLocationResolver {
    /**
     * The default number of candidate matches to consider.
     */
    public static final int DEFAULT_MAX_HIT_DEPTH = 5;

    /**
     * The default context window to consider when resolving matches.
     */
    public static final int DEFAULT_MAX_CONTEXT_WINDOW = 5;

    /**
     * Name of the classpath resource listing one demonym per line.
     */
    private static final String DEMONYMS_RESOURCE = "Demonyms.txt";

    /**
     * Guards the one-time lazy initialization of {@link #demonyms}.
     */
    private static final Object DEMONYMS_LOCK = new Object();

    /**
     * Set of demonyms to filter out from extracted location names.
     *
     * Remains {@code null} until the first call to {@link #isDemonym};
     * it is only ever assigned a completely populated set, and is
     * volatile so that the assignment is visible to other threads.
     */
    private static volatile Set<String> demonyms;

    /**
     * The Gazetteer.
     */
    private final Gazetteer gazetteer;

    /**
     * Create a new ClavinLocationResolver.
     * @param gazetteer the Gazetteer to query
     */
    public ClavinLocationResolver(final Gazetteer gazetteer) {
        this.gazetteer = gazetteer;
    }

    /**
     * Get the Gazetteer used by this resolver.
     * @return the configured gazetteer
     */
    public Gazetteer getGazetteer() {
        return gazetteer;
    }

    /**
     * Resolves the supplied list of location names into
     * {@link ResolvedLocation}s containing {@link com.bericotech.clavin.gazetteer.GeoName} objects
     * using the defaults for maxHitDepth and maxContextWindow.
     *
     * Calls {@link Gazetteer#getClosestLocations} on
     * each location name to find all possible matches, then uses
     * heuristics to select the best match for each by calling
     * {@link ClavinLocationResolver#pickBestCandidates}.
     *
     * @param locations         list of location names to be resolved
     * @param fuzzy             switch for turning on/off fuzzy matching
     * @return                  list of {@link ResolvedLocation} objects
     * @throws ClavinException  if an error occurs parsing the search terms
     **/
    public List<ResolvedLocation> resolveLocations(final List<LocationOccurrence> locations, final boolean fuzzy)
            throws ClavinException {
        return resolveLocations(locations, DEFAULT_MAX_HIT_DEPTH, DEFAULT_MAX_CONTEXT_WINDOW, fuzzy);
    }

    /**
     * Resolves the supplied list of location names into
     * {@link ResolvedLocation}s containing {@link com.bericotech.clavin.gazetteer.GeoName} objects.
     *
     * Calls {@link Gazetteer#getClosestLocations} on
     * each location name to find all possible matches, then uses
     * heuristics to select the best match for each by calling
     * {@link ClavinLocationResolver#pickBestCandidates}.
     *
     * When {@code maxHitDepth} is greater than 1 the context-based
     * heuristics are applied; otherwise the first candidate returned
     * by the gazetteer is used for each location name. Location names
     * that are demonyms, or for which the gazetteer returns no
     * candidates, do not appear in the result.
     *
     * @param locations          list of location names to be resolved
     * @param maxHitDepth        number of candidate matches to consider
     * @param maxContextWindow   how much context to consider when resolving
     * @param fuzzy              switch for turning on/off fuzzy matching
     * @return                   list of {@link ResolvedLocation} objects;
     *                           an immutable empty list if there was nothing to resolve
     * @throws ClavinException   if an error occurs parsing the search terms
     **/
    public List<ResolvedLocation> resolveLocations(final List<LocationOccurrence> locations, final int maxHitDepth,
            final int maxContextWindow, final boolean fuzzy) throws ClavinException {
        // are you forgetting something? -- short-circuit if no locations were provided
        if (locations == null || locations.isEmpty()) {
            return Collections.<ResolvedLocation>emptyList();
        }

        /* Various named entity recognizers tend to mistakenly extract demonyms
         * (i.e., names for residents of localities (e.g., American, British))
         * as place names, which tends to gum up the works, so we make sure to
         * filter them out from the list of {@link LocationOccurrence}s passed
         * to the resolver.
         */
        List<LocationOccurrence> filteredLocations = new ArrayList<LocationOccurrence>();
        for (LocationOccurrence location : locations) {
            if (!isDemonym(location)) {
                filteredLocations.add(location);
            }
        }

        // did we filter *everything* out?
        if (filteredLocations.isEmpty()) {
            return Collections.<ResolvedLocation>emptyList();
        }

        QueryBuilder builder = new QueryBuilder()
                .maxResults(maxHitDepth)
                // translate CLAVIN 1.x 'fuzzy' parameter into NO_EXACT or OFF; it isn't
                // necessary, or desirable to support FILL for the CLAVIN resolution algorithm
                .fuzzyMode(fuzzy ? FuzzyMode.NO_EXACT : FuzzyMode.OFF)
                .includeHistorical(true);

        if (maxHitDepth > 1) { // perform context-based heuristic matching
            // stores all possible matches for each location name
            List<List<ResolvedLocation>> allCandidates = new ArrayList<List<ResolvedLocation>>();

            // loop through all the location names
            for (LocationOccurrence location : filteredLocations) {
                // get all possible matches
                List<ResolvedLocation> candidates = gazetteer.getClosestLocations(builder.location(location).build());

                // if we found some possible matches, save them
                if (!candidates.isEmpty()) {
                    allCandidates.add(candidates);
                }
            }

            // initialize return object
            List<ResolvedLocation> bestCandidates = new ArrayList<ResolvedLocation>();

            // split-up allCandidates into reasonably-sized chunks to
            // limit computational load when heuristically selecting
            // the best matches
            for (List<List<ResolvedLocation>> theseCandidates : ListUtils.chunkifyList(allCandidates, maxContextWindow)) {
                // select the best match for each location name
                // based on heuristics
                bestCandidates.addAll(pickBestCandidates(theseCandidates));
            }

            return bestCandidates;
        } else { // use no heuristics, simply take the gazetteer's top-sorted match
            // initialize return object
            List<ResolvedLocation> resolvedLocations = new ArrayList<ResolvedLocation>();

            // loop through all the location names
            for (LocationOccurrence location : filteredLocations) {
                // choose the top-sorted candidate for each individual
                // location name
                List<ResolvedLocation> candidateLocations =
                        gazetteer.getClosestLocations(builder.location(location).build());

                // if a match was found, add it to the return list
                if (!candidateLocations.isEmpty()) {
                    resolvedLocations.add(candidateLocations.get(0));
                }
            }

            return resolvedLocations;
        }
    }

    /**
     * Uses heuristics to select the best match for each location name
     * extracted from a document, choosing from among a list of lists
     * of candidate matches.
     *
     * Although not guaranteeing an optimal solution (enumerating &
     * evaluating each possible combination is too costly), it does a
     * decent job of cracking the "Springfield Problem" by selecting
     * candidates that would make sense to appear together based on
     * common country and admin1 codes (i.e., states or provinces).
     *
     * For example, if we also see "Boston" mentioned in a document
     * that contains "Springfield," we'd use this as a clue that we
     * ought to choose Springfield, MA over Springfield, IL or
     * Springfield, MO.
     *
     * TODO: consider lat/lon distance in addition to shared
     * CountryCodes and Admin1Codes.
     *
     * @param allCandidates list of lists of candidate matches for locations names
     * @return              list of best matches for each location name
     */
    private List<ResolvedLocation> pickBestCandidates(final List<List<ResolvedLocation>> allCandidates) {
        // initialize return object
        List<ResolvedLocation> bestCandidates = new ArrayList<ResolvedLocation>();

        // initial values for variables controlling the search loop
        float newMaxScore = 0;
        float oldMaxScore;

        // controls window of Lucene hits for each location considered
        // context-based heuristic matching, initialized as a "magic
        // number" of *3* based on tests of the "Springfield Problem"
        int candidateDepth = 3;

        // keep searching deeper & deeper for better combinations of
        // candidate matches, as long as the scores are improving
        do {
            // reset the threshold for continuing the search
            oldMaxScore = newMaxScore;

            // loop through all combinations up to the specified depth.
            // first recursive call for each depth starts at index 0
            for (List<ResolvedLocation> combo : generateAllCombos(allCandidates, 0, candidateDepth)) {
                // these sets store the distinct country codes & admin1 codes
                // (prefixed by their country code) seen in this combination
                Set<CountryCode> countries = EnumSet.noneOf(CountryCode.class);
                Set<String> states = new HashSet<String>();
                for (ResolvedLocation location : combo) {
                    countries.add(location.getGeoname().getPrimaryCountryCode());
                    states.add(location.getGeoname().getPrimaryCountryCode() + location.getGeoname().getAdmin1Code());
                }

                // calculate a score for this particular combination based on commonality
                // of country codes & admin1 codes, and the cost of searching this deep.
                // For an empty combination this is 0f/0 == NaN, which never compares
                // greater than newMaxScore, so the (empty) default result is kept.
                // TODO: tune this score calculation!
                float score = ((float) allCandidates.size() / (countries.size() + states.size())) / candidateDepth;

                /* ***********************************************************
                 * "So, at last we meet for the first time for the last time."
                 *
                 * The fact that you're interested enough in CLAVIN to be
                 * reading this means we're interested in talking with you.
                 *
                 * Are you looking for a job, or are you in need of a
                 * customized solution built around CLAVIN?
                 *
                 * Drop us a line at clavin@bericotechnologies.com
                 *
                 * "What's the matter, Colonel Sandurz? CHICKEN?"
                 * **********************************************************/

                // if this is the best we've seen so far, update the return value
                if (score > newMaxScore) {
                    newMaxScore = score;
                    bestCandidates = combo;
                }
            }

            // search one level deeper in the next loop
            candidateDepth++;

        // keep searching while the scores are monotonically increasing
        } while (newMaxScore > oldMaxScore);

        return bestCandidates;
    }

    /**
     * Recursive helper function for
     * {@link #pickBestCandidates}.
     *
     * Generates all combinations of candidate matches from each
     * location, down to the specified depth through the lists.
     *
     * Adapted from:
     * http://www.daniweb.com/software-development/java/threads/177956/generating-all-possible-combinations-from-list-of-sublists#post882553
     *
     * @param allCandidates list of lists of candidate matches for all location names
     * @param index         keeps track of which location we're working on for recursive calls
     * @param depth         max depth into list we're searching during this recursion
     * @return              all combinations of candidate matches for each location, down to the specified depth
     */
    private List<List<ResolvedLocation>> generateAllCombos(final List<List<ResolvedLocation>> allCandidates,
            final int index, final int depth) {
        // initialize return object
        List<List<ResolvedLocation>> result = new ArrayList<List<ResolvedLocation>>();

        // stopping condition
        if (index == allCandidates.size()) {
            // return a list with an empty list
            result.add(new ArrayList<ResolvedLocation>());
            return result;
        }

        // recursive call: every combination for the remaining locations
        List<List<ResolvedLocation>> recursive = generateAllCombos(allCandidates, index + 1, depth);

        // for each candidate of the current location, up to depth or list size
        List<ResolvedLocation> currentCandidates = allCandidates.get(index);
        int limit = Math.min(currentCandidates.size(), depth);
        for (int j = 0; j < limit; j++) {
            // add the element to all combinations obtained for the rest of the lists
            for (List<ResolvedLocation> recList : recursive) {
                List<ResolvedLocation> newList = new ArrayList<ResolvedLocation>(recList.size() + 1);

                // add element of the current list
                newList.add(currentCandidates.get(j));

                // copy a combination from recursive
                newList.addAll(recList);

                // add new combination to result
                result.add(newList);
            }
        }

        return result;
    }

    /**
     * Various named entity recognizers tend to mistakenly extract
     * demonyms (i.e., names for residents of localities (e.g.,
     * American, British)) as place names, which tends to gum up the
     * works for the resolver, so this method filters them out from
     * the list of {@link LocationOccurrence}s passed to the resolver.
     *
     * The demonym set is loaded on first use. The load happens at most
     * once, under a lock, and the set is published only after it has
     * been filled, so concurrent callers never observe a partially
     * loaded set.
     *
     * @param extractedLocation extracted location name to filter
     * @return                  true if input is a demonym, false otherwise
     * @throws IllegalStateException if the demonyms resource is not on the classpath
     */
    public static boolean isDemonym(LocationOccurrence extractedLocation) {
        // lazy load set of demonyms
        Set<String> knownDemonyms = demonyms;
        if (knownDemonyms == null) {
            synchronized (DEMONYMS_LOCK) {
                // re-check now that we hold the lock; another thread may have loaded it
                knownDemonyms = demonyms;
                if (knownDemonyms == null) {
                    knownDemonyms = loadDemonyms();
                    demonyms = knownDemonyms;
                }
            }
        }

        return knownDemonyms.contains(extractedLocation.getText());
    }

    /**
     * Reads the demonyms resource from the classpath, one entry per
     * line, source:
     * http://en.wikipedia.org/wiki/List_of_adjectival_and_demonymic_forms_for_countries_and_nations
     *
     * A read error is reported on standard error and whatever was read
     * up to that point is returned (best effort, no exception).
     *
     * @return the demonyms that could be read
     * @throws IllegalStateException if the resource is not on the classpath
     */
    private static Set<String> loadDemonyms() {
        final InputStream in = ClavinLocationResolver.class.getClassLoader().getResourceAsStream(DEMONYMS_RESOURCE);
        if (in == null) {
            // fail with a descriptive error instead of an anonymous NullPointerException
            throw new IllegalStateException("Demonyms resource '" + DEMONYMS_RESOURCE
                    + "' not found on the classpath");
        }

        final Set<String> loaded = new HashSet<String>();
        try {
            // NOTE(review): assumes Demonyms.txt is UTF-8 encoded -- confirm. Pinning the
            // charset keeps the result independent of the platform default encoding.
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                loaded.add(line);
            }
        } catch (IOException e) {
            // best effort: keep the entries read so far
            e.printStackTrace();
        } finally {
            try {
                // closing the underlying stream releases the resource on every path
                in.close();
            } catch (IOException ignored) {
                // nothing useful can be done if closing a classpath resource fails
            }
        }

        return loaded;
    }
}