Examples of NYTCorpusDocument


Examples of com.nytlabs.corpus.NYTCorpusDocument

        public FileVisitResult visitFile(Path aFile, BasicFileAttributes aAttrs)
                throws IOException {
            logger.info("--------------------------------------------------------------------------------");
            logger.info("Visiting file "+aFile);
            if( aFile.getFileName().toString().endsWith(".xml") ) {
                NYTCorpusDocument doc = parser.parseNYTCorpusDocumentFromFile(new File(aFile.toString()), false);
                logger.info("  "+doc.getHeadline());
                if(doc.getLocations().size()>0){
                    articlesWithLocations++;
                    // load the document and geolocate the places NYT tagged
                    List<ResolvedLocation> rawResolvedLocations = new ArrayList<ResolvedLocation>();
                    List<LocationOccurrence> locationOccurrences = new ArrayList<LocationOccurrence>();
                    try {
                        for (String locationName: doc.getLocations()){
                            if(customSubstitutions.contains(locationName)){
                                locationName = customSubstitutions.getSubstitution(locationName);
                            }
                            locationOccurrences.add( new LocationOccurrence(locationName,0) );
                            rawResolvedLocations.addAll( ParseManager.extractAndResolve(locationName).getResolvedLocations() );
                        }
                        List<ResolvedLocation> resolvedLocations;
                        resolvedLocations = ParseManager.getResolver().resolveLocations(locationOccurrences,false);
                        resolvedLocations.addAll(rawResolvedLocations);
                        List<GeoName> countriesTheyCoded = ExtractedEntities.getUniqueCountryGeoNames(resolvedLocations);
                  
                        // now geoparse it ourselves and see
                        List<CountryCode> countriesWeFound = ParseManager.extractAndResolve(doc.getHeadline() + " " + doc.getBody()).getUniqueCountries();
                        if(countriesWeFound.size()>0){
                            boolean allMatched = true;
                            for(GeoName countryTheyCoded:countriesTheyCoded){
                                if(!countriesWeFound.contains(countryTheyCoded)){
                                    allMatched = false;
                                }
                            }
                            if(allMatched){
                                articlesWeGotRight++;
                            } else {
                                logger.warn("We found "+countriesWeFound+" they found "+countriesTheyCoded+" from ("+doc.getLocations()+")");
                                //logger.info("TC:" + doc.getTaxonomicClassifiers());
                            }
                        }
                       
                        //also have a measure for making sure the main "about" country is included in their list of countries
                        FocusStrategy focus = ParseManager.getFocusStrategy();
                        List<FocusLocation> ourAboutnessCountries = focus.selectCountries(resolvedLocations);
                        List<GeoName> ourAboutnessGeoNames = new ArrayList<GeoName>();
                        for(FocusLocation aboutLocation: ourAboutnessCountries){
                            ourAboutnessGeoNames.add(aboutLocation.getGeoName());
                        }
                        if(ourAboutnessCountries.size()>0){
                            boolean allMatched = true;
                            for(GeoName focusGeoName:ourAboutnessGeoNames){
                                if(!countriesTheyCoded.contains(focusGeoName)){
                                    allMatched = false;
                                }
                            }
                            if(allMatched){
                                focusArticlesWeGotRight++;
                            } else {
                                logger.warn("We found "+ourAboutnessCountries+" they found "+countriesTheyCoded+" from ("+doc.getLocations()+")");
                                //logger.info("TC:" + doc.getTaxonomicClassifiers());
                            }
                        }
                       
                    } catch (Exception e) {
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.