package edu.gslis.ttg.searchers;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import cc.twittertools.search.api.TrecSearchThriftClient;
import cc.twittertools.thrift.gen.TResult;
import edu.gslis.queries.GQuery;
import edu.gslis.textrepresentation.FeatureVector;
/**
 * Issues a query through a {@link TrecSearchThriftClient} and returns the
 * highest-scoring hits, de-duplicated by document id.
 */
public class SimpleSearcher {
    private final TrecSearchThriftClient client;
    private final int maxResults;

    /**
     * @param client     search client used to run queries
     * @param maxResults number of results requested from the client, and the
     *                   cap on the number of distinct hits returned
     */
    public SimpleSearcher(TrecSearchThriftClient client, int maxResults) {
        this.client = client;
        this.maxResults = maxResults;
    }

    /**
     * Runs {@code query} and collects the hits whose score is at least half of
     * the top hit's score.
     *
     * <p>Side effects: the query's feature vector is replaced with a
     * lowercased, normalized copy; the query title and cleaned text are
     * written to stderr; if the search call (or parsing of the
     * {@code "querytweettime"} metadata) throws, an error is written to stderr
     * and the JVM exits with status -1.
     *
     * @param query query providing the text, the feature vector and the
     *              {@code "querytweettime"} metadata value
     * @return map from document id to hit, holding at most {@code maxResults}
     *         entries (at least one when any hit passes the cutoff); empty
     *         when the search returns no hits
     */
    public Map<Long, TResult> search(GQuery query) {
        // clean up query: punctuation becomes a space, then runs of spaces collapse
        String queryText = query.getText();
        queryText = queryText.replaceAll("[,'\\.\\?]", " ");
        queryText = queryText.replaceAll(" +", " ").trim();

        // need to lowercase the query vector; Locale.ROOT keeps the result
        // independent of the JVM's default locale
        FeatureVector temp = new FeatureVector(null);
        Iterator<String> qTerms = query.getFeatureVector().iterator();
        while (qTerms.hasNext()) {
            String term = qTerms.next();
            temp.addTerm(term.toLowerCase(java.util.Locale.ROOT),
                    query.getFeatureVector().getFeatureWeight(term));
        }
        temp.normalize();
        query.setFeatureVector(temp);

        System.err.println(query.getTitle() + ": " + queryText);

        // perform search
        List<TResult> results = null;
        try {
            results = client.search(queryText,
                    Long.parseLong(query.getMetadata("querytweettime")), maxResults);
        } catch (Exception e) {
            System.err.println("Error searching.");
            // report the cause so the failure can be diagnosed before exiting
            System.err.println("Cause: " + e);
            System.exit(-1);
        }

        Map<Long, TResult> seenMap = new HashMap<Long, TResult>();

        // nothing to score against: avoid results.get(0) on an empty list
        if (results == null || results.isEmpty()) {
            return seenMap;
        }

        // set cutoff score heuristically: half of the first hit's score
        // NOTE(review): assumes the first hit is the top-scoring one and that
        // scores are non-negative; with a negative top score every hit falls
        // below the cutoff -- confirm against the search service.
        double topScore = results.get(0).getRsv();
        double cutOffScore = topScore / 2;

        // record hits, removing duplicates
        for (TResult hit : results) {
            if (hit.getRsv() < cutOffScore) {
                // stop at the first hit below the cutoff
                break;
            }
            long docId = hit.id;
            if (seenMap.containsKey(docId)) {
                continue;
            }
            seenMap.put(docId, hit);
            // checked after the insert, so at least one hit is always kept
            if (seenMap.size() >= maxResults) {
                break;
            }
        }
        return seenMap;
    }
}