// instantiate search client (training index)
TrecSearchThriftClient client = new TrecSearchThriftClient(params.getParamValue(HOST_OPTION),
        trainingPort, group, token);
SimpleSearcher searcher = new SimpleSearcher(client, numResults);

// Training pass: for every training query, sweep the Jaccard threshold from 1.0 down to 0.0,
// score the de-duplicated result set at each step with F1, and remember the best threshold.
err.println("=== Train Queries ===");
List<Double> thresholds = new ArrayList<Double>();
double averageThreshold = 0;
Iterator<GQuery> queryIterator = trainingQueries.iterator();
while (queryIterator.hasNext()) {
    GQuery query = queryIterator.next();

    // A non-positive step can never reach the end of the sweep, so fail fast instead of spinning.
    if (stepSize <= 0) {
        throw new IllegalArgumentException("stepSize must be positive: " + stepSize);
    }
    // Drive the sweep with an integer counter. Repeatedly subtracting stepSize from a double
    // accumulates rounding error, so thresholds drift away from the intended grid and the
    // final step may be skipped or land slightly off 0.0.
    int jaccardSweepSteps = (int) Math.floor(1.0 / stepSize + 1e-9);

    Map<Long, TResult> seenResults = searcher.search(query);
    SimpleJaccardClusterer clusterer = new SimpleJaccardClusterer(new ArrayList<TResult>(seenResults.values()));

    // Reference clusters for this query; the lookup does not depend on the threshold.
    // NOTE(review): a query with no entry in clusterMembership yields null here and would
    // throw a NullPointerException below - confirm every training query has an entry.
    Clusters trueClusters = clusterMembership.get(query.getTitle());

    // sweep through jaccard steps, calculating F1
    double maxF1 = 0;
    double maxF1Threshold = 1;
    for (int sweepIndex = 0; sweepIndex <= jaccardSweepSteps; sweepIndex++) { // for each jaccard threshold step
        // Clamp so rounding in the last step cannot produce a tiny negative threshold.
        double j = Math.max(0.0, 1.0 - sweepIndex * stepSize);
        Clusters clusters = clusterer.cluster(j);

        // all clusters are created now, get a finalized set of results:
        // the unclustered results plus one representative from each cluster
        Set<Long> allResults = new HashSet<Long>(seenResults.keySet());
        allResults.removeAll(clusters.getAllClusteredResults());
        for (Cluster c : clusters) {
            allResults.add(c.getFirstMember());
        }

        // collect the true clusters that the finalized results fall into
        Clusters seenClusters = new Clusters();
        for (long result : allResults) {
            Cluster trueCluster = trueClusters.findCluster(result);
            if (trueCluster != null) { // if it is relevant, it will have a true cluster; if this is null, it's non-relevant
                seenClusters.add(trueCluster);
            }
        }
        int numRetrievedClusters = seenClusters.size();
        int numResultsReturned = allResults.size();
        int numTrueClusters = trueClusters.size();

        // Precision is cluster precision in both modes: true clusters retrieved per result returned.
        // Only recall is weighted; a "results weight" denominator that is never incremented
        // makes precision x/0 and F1 NaN, so no threshold could ever be selected.
        // NOTE(review): this matches the TREC 2014 TTG weighted-F1 definition as far as I can
        // tell - confirm this is the measure intended here.
        double precision = numRetrievedClusters / (double) numResultsReturned;
        double recall;
        if (evalType.equals("unweighted")) {
            recall = numRetrievedClusters / (double) numTrueClusters;
        } else {
            // for weighted recall, we need the weight of each cluster
            int retrievedWeight = 0;
            for (Cluster cluster : seenClusters) {
                retrievedWeight += cluster.getWeight(query, qrels);
            }
            int trueWeight = 0;
            for (Cluster cluster : trueClusters) {
                trueWeight += cluster.getWeight(query, qrels);
            }
            recall = retrievedWeight / (double) trueWeight;
        }

        // When precision + recall is 0 (or NaN because nothing was returned or nothing is
        // relevant) F1 is undefined; score it as 0 so it never beats a real value.
        double precisionPlusRecall = precision + recall;
        double f1 = precisionPlusRecall > 0 ? 2 * precision * recall / precisionPlusRecall : 0;

        if (f1 > maxF1) {
            maxF1 = f1;
            maxF1Threshold = j;
        }
    }
    thresholds.add(maxF1Threshold);
    err.println("F1: "+df.format(maxF1)+"; Jaccard: "+df.format(maxF1Threshold));
}

// get the average threshold
// NOTE(review): with no training queries thresholds is empty and this divides 0.0 by 0,
// leaving averageThreshold as NaN - confirm whether a fallback value is wanted.
for (double threshold : thresholds) {
    averageThreshold += threshold;
}
averageThreshold /= thresholds.size();
err.println("Average Jaccard: "+averageThreshold);
err.println("=== Test Queries ===");
// now cluster the test queries and output
queryIterator = queries.iterator();
while(queryIterator.hasNext()) {
GQuery query = queryIterator.next();
err.println(query.getTitle());
client = new TrecSearchThriftClient(params.getParamValue(HOST_OPTION), testingPort, group, token);
searcher = new SimpleSearcher(client, numResults);
Map<Long, TResult> seenResults = searcher.search(query);
SimpleJaccardClusterer clusterer = new SimpleJaccardClusterer(new ArrayList<TResult>(seenResults.values()));
Clusters clusters = clusterer.cluster(averageThreshold);
// all clusters are created now, get a finalized set of results