// Open the project's Lucene index and grab both search and raw-reader handles.
// The boolean flag presumably means "do not (re)build the index" — TODO confirm
// against LuceneIndexerSearcher's constructor.
LuceneIndexerSearcher lis = new LuceneIndexerSearcher(false);
IndexSearcher is = lis.getIndexSearcher();
// Raw reader is needed below for per-document term vectors (getTermFreqVector).
IndexReader ir = lis.getIndexReader();
// Build a corpus-wide term-frequency bag over the "txt" field, skipping:
// the literal token "In", closed-class (stop) words, and tokens containing
// no Latin letter (pure numbers/punctuation).
Bag<String> tfBag = new Bag<String>();
for (int i = 0; i < ir.maxDoc(); i++) {
    TermFreqVector tv = ir.getTermFreqVector(i, "txt");
    // getTermFreqVector returns null when the document has no stored term
    // vector for this field — the original code would NPE on tv.getTerms().
    if (tv == null) continue;
    String[] terms = tv.getTerms();
    int[] freqs = tv.getTermFrequencies();
    for (int k = 0; k < tv.size(); k++) {
        String term = terms[k];
        if ("In".equals(term)) continue; // corpus-specific artifact token — TODO confirm why excluded
        if (TermSets.getClosedClass().contains(term)) continue; // stop-word filter
        if (!term.matches(".*[A-Za-z].*")) continue; // require at least one Latin letter
        tfBag.add(term, freqs[k]);
    }
}
// Per-token entropy of the whole corpus distribution (bits per token,
// presumably — depends on Bag.entropy()'s log base; TODO confirm).
double overallEntropy = tfBag.entropy();
// Total description length: entropy rate times number of tokens.
double totalEntropy = overallEntropy * tfBag.totalCount();
System.out.println(totalEntropy);
// Candidate split terms: the first 2000 entries of the bag's list
// (presumably frequency-ordered — verify Bag.getList()'s ordering).
// NOTE(review): subList(0, 2000) throws IndexOutOfBoundsException when the
// bag holds fewer than 2000 distinct terms — consider Math.min(2000, size).
List<String> termList = tfBag.getList().subList(0, 2000);
for(String splitTerm : termList) {
// Find every document containing splitTerm in the "txt" field.
Query q = new TermQuery(new Term("txt", splitTerm));
VectorCollector vc = new VectorCollector();
is.search(q, vc);
// Term frequencies for documents that do / don't contain splitTerm.
Bag<String> inBag = new Bag<String>();
Bag<String> outBag = new Bag<String>();
for(int i=0;i<ir.maxDoc();i++) {
Bag<String> bag = inBag;
// NOTE(review): with this continue, only matching documents are tallied and
// outBag stays empty (dead code). The commented-out lines below look like the
// intended in/out partition — confirm which behavior is wanted.
if(!vc.getResultsVector().containsKey(i)) continue;
//Bag<String> bag = outBag;
//if(vc.getResultsVector().containsKey(i)) bag = inBag;
// NOTE(review): tv may be null if the document stores no term vector for
// "txt" — tv.getTerms() would NPE; same issue as the corpus loop above.
TermFreqVector tv = ir.getTermFreqVector(i, "txt");
String [] terms = tv.getTerms();
int [] freqs = tv.getTermFrequencies();
for(int k=0;k<tv.size();k++) {
String term = terms[k];
if("In".equals(term)) continue;
if(TermSets.getClosedClass().contains(term)) continue;
if(!term.matches(".*[A-Za-z].*")) continue;
bag.add(term, freqs[k]);