if (reader == null) {
app.showStatus("No index loaded");
cleanChart();
return;
}
SlowThread st = new SlowThread(app) {
/**
 * Computes the document-frequency (DF) distribution of the terms in {@code field}
 * and displays it in the chart.
 *
 * <p>Every term of the field is enumerated once. Terms whose DF is at most 1000 are
 * only counted per DF value; terms with a higher DF are kept individually and sorted.
 * All terms are then distributed over buckets of {@code termsPerBucket} terms, the
 * average DF of each bucket becomes one chart score, and the "maxdf", "maxterm" and
 * "midterm" widgets are updated with the largest bucket average, the number of unique
 * terms and half that number respectively.
 *
 * <p>Any exception is reported through {@code app.showStatus} instead of being
 * propagated.
 */
@Override
public void execute() {
  try {
    // MultiFields.getTerms may return null when the field has no terms; report that
    // explicitly instead of failing with a NullPointerException ("ERROR: null").
    org.apache.lucene.index.Terms fieldTerms = MultiFields.getTerms(ir, field);
    if (fieldTerms == null) {
      app.showStatus("No terms found for field '" + field + "'");
      return;
    }
    TermsEnum te = fieldTerms.iterator(null);

    final int numBuckets = 100;

    // Most terms occur very infrequently, so for these "long tail" terms we only keep
    // a count of terms per DF value instead of the terms themselves.
    int longTailDfStart = 1;
    int longTailDfEnd = 1000;
    int[] longTailTermDfCounts = new int[(longTailDfEnd - longTailDfStart) + 1];

    // "Short tail" terms are fewer but spread over many different DF values (typically
    // in the thousands), so there is no predictable set of DF slots to count them in.
    // They are kept individually; this should not occupy too much RAM.
    ArrayList<TermCount> terms = new ArrayList<TermCount>();

    int numUniqueTerms = 0;
    while (te.next() != null) {
      numUniqueTerms++;
      int df = te.docFreq();
      if (df <= longTailDfEnd) {
        longTailTermDfCounts[df - longTailDfStart]++;
      } else {
        terms.add(new TermCount(new Term(field, te.term().utf8ToString()), df));
      }
    }

    // Ordering is defined by TermCount's natural order.
    // NOTE(review): presumably highest DF first, to match the long-tail pass below;
    // confirm against TermCount.compareTo.
    TermCount[] sortedTerms = terms.toArray(new TermCount[terms.size()]);
    Arrays.sort(sortedTerms);

    // With fewer terms than buckets every term gets its own bucket. Otherwise integer
    // division is used, so the final bucket count can exceed numBuckets when the term
    // count is not an exact multiple of it.
    int termsPerBucket = numUniqueTerms < numBuckets ? 1 : numUniqueTerms / numBuckets;

    ArrayList<Bucket> buckets = new ArrayList<Bucket>();
    Bucket currentBucket = new Bucket();
    buckets.add(currentBucket);

    // First the individually tracked high-DF terms, in sorted order.
    for (TermCount termCount : sortedTerms) {
      currentBucket.addTermDf(termCount.df);
      if (currentBucket.numTermsInThisBucket >= termsPerBucket) {
        // start a new bucket
        currentBucket = new Bucket();
        buckets.add(currentBucket);
      }
    }

    // Then the aggregated long-tail terms, from the most common DF down to the least
    // common DF.
    for (int i = longTailTermDfCounts.length - 1; i >= 0; i--) {
      int df = i + longTailDfStart;
      int numTerms = longTailTermDfCounts[i];
      for (int t = 0; t < numTerms; t++) {
        currentBucket.addTermDf(df);
        if (currentBucket.numTermsInThisBucket >= termsPerBucket) {
          // start a new bucket
          currentBucket = new Bucket();
          buckets.add(currentBucket);
        }
      }
    }

    // currentBucket is always the last element of the list; drop it if it was opened
    // but never filled.
    if (currentBucket.numTermsInThisBucket == 0) {
      buckets.remove(buckets.size() - 1);
    }

    float[] termBucketTotals = new float[buckets.size()];
    int maxDf = 0;
    for (int i = 0; i < termBucketTotals.length; i++) {
      termBucketTotals[i] = buckets.get(i).getAverageDf();
      maxDf = (int) Math.max(maxDf, termBucketTotals[i]);
    }

    // update the GUI
    Object maxdf = app.find(myUi, "maxdf");
    app.setString(maxdf, "text", "" + maxDf);
    Object maxterm = app.find(myUi, "maxterm");
    Object midterm = app.find(myUi, "midterm");
    app.setString(maxterm, "text", numUniqueTerms + "");
    app.setString(midterm, "text", (numUniqueTerms / 2) + "");
    chart.setScores(termBucketTotals);
    chart.invalidate();
    app.repaint();
  } catch (Exception e) {
    // Some exceptions carry no message; fall back to toString() so the status line
    // still names the failure instead of showing "ERROR: null".
    String message = e.getMessage();
    app.showStatus("ERROR: " + (message != null ? message : e.toString()));
  }
}
};
if (app.isSlowAccess()) {
st.start();
} else {
st.execute();
}
}