/**
 * Creates a new {@code GdeltFocusChecker}.
 *
 * <p>The constructor takes no arguments and its body is empty, so it performs
 * no set-up work of its own.
 */
public GdeltFocusChecker(){
}
public void check() throws Exception {
FileSystemCache cache = new FileSystemCache("gdelt-articles");
ArrayList<GdeltEvent> events = GdeltCsv.allEvents(BASE_DIR);
//TODO: run through events grabbing source text, running that through CLIFF, and checking results
int mentionedSuccesses = 0;
int mentionedFailures = 0;
for(GdeltEvent event:events){
logger.debug("-------------------------------------------------------------------------------------------");
logger.debug("Checking event "+event);
try{
URL url = event.getSourceUrl();
String text;
if(cache.contains(url.toString())){
text = cache.get(url.toString());
logger.debug(" Fetched from cache:"+url.toString());
} else {
HTMLDocument htmlDoc = HTMLFetcher.fetch(url);
TextDocument doc = new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
text = ArticleExtractor.INSTANCE.getText(doc);
cache.put(url.toString(), text);
logger.debug("Fetched from web:"+url.toString());
}
if(text.length()<100){
logger.debug(" Skipping because it is too short");
continue; //assume we didn't fetch/extract it right