onts.add(termArray[j]);
}
}
}
}
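// Compute term and document frequencies for the n-grams found in the
// cluster's files; discardInfrequent(2) appears to drop n-grams seen in
// fewer than two documents.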
NGramTfDf ngtd = NGramTfDf.analyseFiles(clusterFiles);
ngtd.calculateNGrams();
Bag<String> df = ngtd.getDfBag(1);
df.discardInfrequent(2);
Map<String,Double> scores = new HashMap<String,Double>();
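// scaleFactor is the fraction of corpus documents in this cluster;
// multiplying a term's corpus-wide document frequency by it gives the
// in-cluster document frequency expected by chance.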
int numDocs = ir.numDocs();
int clusterSize = cluster.size();
double scaleFactor = clusterSize * 1.0 / numDocs;
IndexSearcher is = new IndexSearcher(ir);
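// First pass: score each candidate n-gram by how much more often it occurs
// in the cluster than its corpus-wide document frequency predicts.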
for(String s : df.getSet()) {
Query q;
if(s.matches("\\S+")) {
// Single token: a plain TermQuery suffices.
q = new TermQuery(new Term("txt", s));
} else {
// Multi-word n-gram: match it as an exact phrase.
PhraseQuery pq = new PhraseQuery();
for(String ss : s.split("\\s+")) pq.add(new Term("txt", ss));
q = pq;
}
// The results vector holds every matching document, so its size is the
// term's corpus-wide document frequency.
VectorCollector vc = new VectorCollector();
is.search(q, vc);
int docFreq = vc.getResultsVector().size();
// Score the term by how far its in-cluster document frequency exceeds
// the expected value, normalised by the cluster size.
double expected = scaleFactor * docFreq;
double excess = df.getCount(s) - expected;
double score = excess / clusterSize;
if(score > threshold) scores.put(s, score);
}
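// Second pass: group single words by stem and score each group of
// morphological variants together, so related forms can reinforce
// one another.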
Stemmer st = new Stemmer(new EnglishStemmer());
Map<String,List<String>> stems = st.wordsToStems(df.getSet());
for(String stem : stems.keySet()) {
List<String> words = stems.get(stem);
if(words.size() > 1) {
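// OR together every surface form of the stem; the 'true' flag disables
// coord scoring, which is irrelevant here since only the set of matching
// documents is used.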
BooleanQuery bq = new BooleanQuery(true);
for(String word : words) {
bq.add(new BooleanClause(new TermQuery(new Term("txt", word)), Occur.SHOULD));
}
VectorCollector vc = new VectorCollector();
is.search(bq, vc);
double expected = scaleFactor * vc.getResultsVector().size();
// Count how many of the matching documents fall inside the cluster.
int overlap = overlapDocs(vc.getResultsVector(), cluster);
double excess = overlap - expected;
double score = excess / clusterSize;
if(score > threshold) {
// Record the group of variants, keyed by the word list's string form.
df.add(words.toString(), overlap);
scores.put(words.toString(), score);
}
}
}
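// Third pass: repeat the stem-grouping for multi-word n-grams.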
Map<String,List<String>> termStems = ngtd.ngramsByStem();
for(String stem : termStems.keySet()) {
List<String> multiWords = termStems.get(stem);
if(multiWords.size() > 1) {
BooleanQuery bq = new BooleanQuery(true);
for(String multiWord : multiWords) {