package proj.zoie.perf.client;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import proj.zoie.api.impl.util.PriorityQueue;
public class TermFileBuilder {
private static class TermWithFreq {
final String term;
final int count;
TermWithFreq(String term, int count) {
this.term = term;
this.count = count;
}
}
public static List<String> loadFile(File infile) throws Exception {
LinkedList<String> terms = new LinkedList<String>();
if (infile.exists()) {
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(infile),
Charset.forName("UTF-8")));
while (true) {
String line = reader.readLine();
if (line == null) break;
terms.add(line);
}
}
return terms;
}
/**
* @param args
*/
public static void main(String[] args) throws Exception {
File idxDir = new File(args[0]);
File outFile = new File("terms.txt");
String field = "contents";
int capacity = 1000;
PriorityQueue<TermWithFreq> pq = new PriorityQueue<TermWithFreq>(capacity,
new Comparator<TermWithFreq>() {
@Override
public int compare(TermWithFreq o1, TermWithFreq o2) {
if (o1.count == o2.count) {
return o1.term.compareTo(o2.term);
}
return o2.count - o1.count;
}
});
DirectoryReader reader = DirectoryReader.open(FSDirectory.open(idxDir));
TermsEnum te = MultiFields.getTerms(reader, field).iterator(null);
BytesRef termBytes = null;
while ((termBytes = te.next()) != null) {
String text = termBytes.utf8ToString();
if (text != null && text.length() > 0) {
int freq = te.docFreq();
if (freq > 0) {
pq.offer(new TermWithFreq(text, freq));
}
}
}
reader.close();
OutputStreamWriter owriter = new OutputStreamWriter(new FileOutputStream(outFile),
Charset.forName("UTF-8"));
PrintWriter pwriter = new PrintWriter(owriter);
Iterator<TermWithFreq> iter = pq.iterator();
while (iter.hasNext()) {
TermWithFreq t = iter.next();
pwriter.println(t.term);
}
pwriter.flush();
pwriter.close();
}
}