// 130] Sitting tenant with none of the above (ie default)
// 140] Anything else!
for (Map.Entry<String, Candidate> pair: dubiousLocations.entrySet()) {
EntityPojo ent = pair.getValue().entity;
Candidate candidate = pair.getValue();
// 2.1] Let's analyse the "sitting tenant"
int nPrio = 130;
GeoFeaturePojo currLeader = null;
int nCase = 0; // (just for debugging, 0=st, 1=large city, 2=region, 3=other)
if (otherRegions.contains(candidate.state)) { // Strong direct ref, winner!
nPrio = 10; // winner!
}//TESTED: "san antonio, texas/city" vs "texas"
else if (otherCountriesOrRegionsReferenced.contains(candidate.state)) {
// Indirect ref
nPrio = 40; // good, but beatable...
}//TESTED: "philadelphia (village), new york/city"
else if (otherCountries.contains("united states")) { // Weak direct ref
nPrio = 80; // better than nothing...
}//TESTED: "apache, oklahoma/city"
else if (otherCountriesOrRegionsReferenced.contains("united states")) { // Weak indirect ref
nPrio = 80; // better than nothing...
}//TESTED: "washington, d.c." have DC as stateorcounty, but US in countries list
// Special case: we don't like "village":
if ((80 != nPrio) && ent.getDisambiguatedName().contains("village") && !ent.getActual_name().contains("village"))
{
nPrio = 80;
}//TESTED: "Downvoted: Philadelphia (village), New York from Philadelphia"
// Debug
if (_nDebugLevel >= 2) {
System.out.println(pair.getKey() + " SittingTenantScore=" + nPrio);
}
// Alternatives
if (nPrio > 10) {
LinkedList<GeoFeaturePojo> geos = pair.getValue().candidates;
for (GeoFeaturePojo geo: geos) {
int nAltPrio = 140;
int nAltCase = -1;
String city = (null != geo.getCity()) ? geo.getCity().toLowerCase() : null;
String region = (null != geo.getRegion()) ? geo.getRegion().toLowerCase() : null;
String country = (null != geo.getCountry()) ? geo.getCountry().toLowerCase() : null;
// 2.2] CASE 1: I'm a large city, ie pop >= 400K (best score 15)
// 15] Large city with strong direct
// 30] Large city with strong indirect
// 70] Large city with weak direct
// 72] Large city with weak indirect
// 75] Large city with no reference
if ((null != city) && (geo.getPopulation() >= 400000) && (nPrio > 15)) {
nAltCase = 1;
if ((null != region) && (otherRegions.contains(region))) {
nAltPrio = 15; // strong direct
}//TESTED: "dallas / Texas / United States = 15"
else if ((null != region) && (otherCountriesOrRegionsReferenced.contains(region))) {
nAltPrio = 30; // strong indirect
}//TESTED: "sacramento / California / United State"
else if ((null != country) && (otherCountries.contains(country))) {
nAltPrio = 70; // weak direct
}//TESTED: "berlin, germany", with "germany" directly mentioned
else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) {
nAltPrio = 72; // weak indirect
}//TESTED: "los angeles / California / United States = 72"
else {
nAltPrio = 75; // just for being big!
}//TESTED: "barcelona, spain"
}
// 2.3] CASE 2: I'm a region (best score=20, can beat current score)
// 20] Region with direct
// 50] Region with indirect
// 120] Region with no reference, if there is only 1
else if ((null == city) && (nPrio > 20)) {
nAltCase = 2;
if ((null != country) && (otherCountries.contains(country))) {
nAltPrio = 20; // strong direct
}//TESTED: (region) "Berlin, Germany" with "Germany" mentioned
else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) {
nAltPrio = 50; // strong indirect
}//(haven't seen, but we'll live)
else {
nAltPrio = 120; // (just for being there)
}//TESTED: "null / Portland / Jamaica = 120", also "Shanghai / China"
}
// 2.4] CASE 3: I'm any foreign possibility (best score=60)
// 60] Another foreign possibility with strong direct
// 78] Another foreign possibility with strong indirect (>=100K population - ie not insignificant)
// 90] Another foreign possibility with strong indirect
// 100] Another foreign possibility with weak direct
// 110] Another foreign possibility with weak indirect
else if (nPrio > 60) {
nAltCase = 3;
if ((null != region) && (otherRegions.contains(region))) {
nAltPrio = 60; // strong direct
// Double check we're not falling into the trap below:
if (!geo.getCountry_code().equals("US")) {
Matcher m = this._statesRegex.matcher(geo.getRegion());
if (m.matches()) { // non US state matching against (probably) US state, disregard)
nAltPrio = 140;
}
}//TESTED (same clause as below)
}//TESTED: lol "philadelphia / Maryland / Liberia = 60" (before above extra clause)
if (nAltPrio > 60) { // (may need to re-run test)
if ((null != country) && (otherCountries.contains(country))) {
if (geo.getPopulation() < 100000) {
nAltPrio = 90; // strong indirect
} //TESTED: "washington / Villa Clara / Cuba"
else {
nAltPrio = 78; // strong indirect, with boost!
} //TESTED: "geneva, Geneve, Switzerland", pop 180K
}
else if ((null != region) && (otherCountriesOrRegionsReferenced.contains(region))) {
nAltPrio = 100; // weak direct
}//TESTED: "lincoln / Lincolnshire / United Kingdom = 100"
else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) {
nAltPrio = 110; // weak indirect
}//(haven't seen, but we'll live)
}
}
// Debug:
if ((_nDebugLevel >= 2) && (nAltPrio < 140)) {
System.out.println("----Alternative: " + geo.getCity() + " / " + geo.getRegion() + " / " + geo.getCountry() + " score=" + nAltPrio);
}
// Outcome of results:
if (nAltPrio < nPrio) {
currLeader = geo;
nPrio = nAltPrio;
nCase = nAltCase;
}
} // end loop over alternatives
if (null != currLeader) { // Need to change
if (1 == nCase) {
this._nMovedToLargeCity++;
//(Cities are lower case in georef DB for some reason)
String city = WordUtils.capitalize(currLeader.getCity());
if (currLeader.getCountry_code().equals("US")) { // Special case: is this just the original?
String region = currLeader.getRegion();
if (region.equals("District of Columbia")) { // Special special case
region = "D.C.";
}
String sCandidate = city + ", " + region;
if (!sCandidate.equals(ent.getDisambiguatedName())) {
ent.setDisambiguatedName(sCandidate);
ent.setIndex(ent.getDisambiguatedName() + "/city");
ent.setSemanticLinks(null);
bChangedAnything = true;
}//TESTED (lots, eg "Philadelphia (village), New York" -> "Philadelphia, PA"; Wash, Ill. -> Wash DC)
else {
this._nMovedToLargeCity--;
_nStayedWithOriginal++;
}//TESTED ("Washington DC", "San Juan, Puerto Rico")
}//TESTED (see above)
else {
ent.setDisambiguatedName(city + ", " + currLeader.getCountry());
ent.setIndex(ent.getDisambiguatedName() + "/city");
ent.setSemanticLinks(null);
bChangedAnything = true;
}//TESTED: "london, california/city to London, United Kingdom"
}
else if (2 == nCase) {
this._nMovedToRegion++;
ent.setDisambiguatedName(currLeader.getRegion() + ", " + currLeader.getCountry());
ent.setIndex(ent.getDisambiguatedName() + "/region");
ent.setSemanticLinks(null);
bChangedAnything = true;
}//TESTED: "Moved madrid, new york/city to Madrid, Spain" (treats Madrid as region, like Berlin see above)
else {
//(Cities are lower case in georef DB for some reason)
String city = WordUtils.capitalize(currLeader.getCity());
this._nMovedToForeignCity++;
ent.setDisambiguatedName(city + ", " + currLeader.getCountry());
ent.setIndex(ent.getDisambiguatedName() + "/city");
ent.setSemanticLinks(null);
bChangedAnything = true;
}//TESTED: "Moved geneva, new york/city to Geneva, Switzerland"
if ((_nDebugLevel >= 1) && (null == ent.getSemanticLinks())) {
System.out.println("++++ Moved " + pair.getKey() + " to " + ent.getDisambiguatedName());
}
}
else {
_nStayedWithOriginal++;
}