public FileVisitResult visitFile(Path aFile, BasicFileAttributes aAttrs)
throws IOException {
logger.info("--------------------------------------------------------------------------------");
logger.info("Visiting file "+aFile);
if( aFile.getFileName().toString().endsWith(".xml") ) {
NYTCorpusDocument doc = parser.parseNYTCorpusDocumentFromFile(new File(aFile.toString()), false);
logger.info(" "+doc.getHeadline());
if(doc.getLocations().size()>0){
articlesWithLocations++;
// geolocate the places NYT tagged on this document (after applying any custom name substitutions)
List<ResolvedLocation> rawResolvedLocations = new ArrayList<ResolvedLocation>();
List<LocationOccurrence> locationOccurrences = new ArrayList<LocationOccurrence>();
try {
for (String locationName: doc.getLocations()){
if(customSubstitutions.contains(locationName)){
locationName = customSubstitutions.getSubstitution(locationName);
}
locationOccurrences.add( new LocationOccurrence(locationName,0) );
rawResolvedLocations.addAll( ParseManager.extractAndResolve(locationName).getResolvedLocations() );
}
List<ResolvedLocation> resolvedLocations;
resolvedLocations = ParseManager.getResolver().resolveLocations(locationOccurrences,false);
resolvedLocations.addAll(rawResolvedLocations);
List<GeoName> countriesTheyCoded = ExtractedEntities.getUniqueCountryGeoNames(resolvedLocations);
// now geoparse the headline and body ourselves and compare the countries we find against theirs
List<CountryCode> countriesWeFound = ParseManager.extractAndResolve(doc.getHeadline() + " " + doc.getBody()).getUniqueCountries();
if(countriesWeFound.size()>0){
boolean allMatched = true;
for(GeoName countryTheyCoded:countriesTheyCoded){
if(!countriesWeFound.contains(countryTheyCoded)){
allMatched = false;
}
}
if(allMatched){
articlesWeGotRight++;
} else {
logger.warn("We found "+countriesWeFound+" they found "+countriesTheyCoded+" from ("+doc.getLocations()+")");
//logger.info("TC:" + doc.getTaxonomicClassifiers());
}
}
// also measure whether every country the focus strategy selects as the main "about" country appears in their list of countries
FocusStrategy focus = ParseManager.getFocusStrategy();
List<FocusLocation> ourAboutnessCountries = focus.selectCountries(resolvedLocations);
List<GeoName> ourAboutnessGeoNames = new ArrayList<GeoName>();
for(FocusLocation aboutLocation: ourAboutnessCountries){
ourAboutnessGeoNames.add(aboutLocation.getGeoName());
}
if(ourAboutnessCountries.size()>0){
boolean allMatched = true;
for(GeoName focusGeoName:ourAboutnessGeoNames){
if(!countriesTheyCoded.contains(focusGeoName)){
allMatched = false;
}
}
if(allMatched){
focusArticlesWeGotRight++;
} else {
logger.warn("We found "+ourAboutnessCountries+" they found "+countriesTheyCoded+" from ("+doc.getLocations()+")");
//logger.info("TC:" + doc.getTaxonomicClassifiers());
}
}
} catch (Exception e) {