// NOTE(review): chunk starts mid-loop — the enclosing for-loop header and the
// matching if-branch (presumably the single-word-term case) are outside this
// view; confirm against the full file.
} else {
// Multi-word term: build a phrase query with one Term per whitespace-separated token.
PhraseQuery pq = new PhraseQuery();
for(String ss : StringTools.arrayToList(s.split("\\s+"))) pq.add(new Term("txt", ss));
q = pq;
}
// Run the query over the whole index to get corpus-wide document frequency.
VectorCollector vc = new VectorCollector();
is.search(q, vc);
docFreq = vc.getResultsVector().size();
double score;
// expected = corpus-wide doc frequency scaled to the cluster's share of the
// index (scaleFactor is presumably clusterSize / totalDocs — TODO confirm).
double expected = scaleFactor * docFreq;
// excess = how many more cluster documents contain the term than chance predicts.
double excess = df.getCount(s) - expected;
// Normalise the excess by cluster size so scores are comparable across clusters.
score = excess / clusterSize;
// Keep only terms over-represented in the cluster beyond the threshold.
if(score > threshold) scores.put(s, score);
}
// Merge morphological variants: group the distinctive words by their English
// (Snowball) stem and score each multi-word group as a single disjunctive feature.
Stemmer st = new Stemmer(new EnglishStemmer());
Map<String,List<String>> stems = st.wordsToStems(df.getSet());
// entrySet() iteration avoids the keySet()+get() double lookup of the original.
for(Map.Entry<String,List<String>> stemEntry : stems.entrySet()) {
    List<String> words = stemEntry.getValue();
    // Only stems covering more than one surface form add information beyond
    // the per-word scoring already done above.
    if(words.size() > 1) {
        // Disjunction ("any variant matches") of the grouped words; the
        // boolean ctor arg disables Lucene coord scoring.
        BooleanQuery bq = new BooleanQuery(true);
        for(String word : words) {
            bq.add(new BooleanClause(new TermQuery(new Term("txt", word)), Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        // expected = corpus-wide hit count scaled to the cluster's share of the index.
        double expected = scaleFactor * vc.getResultsVector().size();
        // overlap = how many of those hits actually fall inside this cluster.
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        double excess = overlap - expected;
        double score = excess / clusterSize;
        if(score > threshold) {
            // Feature key is the list's toString() (e.g. "[walk, walks]"),
            // exactly as before; hoisted so df and scores use the same string
            // without re-querying the map (original re-did stems.get(stem)).
            String key = words.toString();
            df.add(key, overlap);
            scores.put(key, score);
        }
    }
}
// Same excess-frequency scoring for multi-word (n-gram) terms grouped by stem.
Map<String,List<String>> termStems = ngtd.ngramsByStem();
// entrySet() iteration avoids the keySet()+get() double lookup of the original.
for(Map.Entry<String,List<String>> stemEntry : termStems.entrySet()) {
    List<String> multiWords = stemEntry.getValue();
    // Only groups with more than one n-gram variant are worth merging.
    if(multiWords.size() > 1) {
        // Disjunction of phrase queries; boolean ctor arg disables coord scoring.
        BooleanQuery bq = new BooleanQuery(true);
        for(String multiWord : multiWords) {
            // Each n-gram variant becomes a phrase query over its tokens.
            PhraseQuery pq = new PhraseQuery();
            for(String ss : StringTools.arrayToList(multiWord.split("\\s+"))) pq.add(new Term("txt", ss));
            bq.add(new BooleanClause(pq, Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        // expected = corpus-wide hit count scaled to the cluster's share of the index.
        double expected = scaleFactor * vc.getResultsVector().size();
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        double excess = overlap - expected;
        double score = excess / clusterSize;
        if(score > threshold) {
            // Hoisted key: original recomputed termStems.get(stem).toString()
            // twice; multiWords is the same list, so behaviour is identical.
            String key = multiWords.toString();
            df.add(key, overlap);
            scores.put(key, score);
        }
    }
}
// Enriched mode: additionally score chemistry-specific features — InChI
// identifiers and ontology terms — with the same excess-frequency measure.
// NOTE(review): this block runs past the end of the visible chunk (the inner
// if at the bottom and the closing braces are not shown).
if(enriched) {
for(String inchi : inchis) {
// Find every document in the index containing this InChI.
Term luceneTerm = new Term("InChI", inchi);
Query q = new TermQuery(luceneTerm);
VectorCollector vc = new VectorCollector();
is.search(q, vc);
// expected = corpus-wide hit count scaled to the cluster's share of the index.
double expected = scaleFactor * vc.getResultsVector().size();
int overlap = overlapDocs(vc.getResultsVector(), cluster);
// Require at least two cluster documents — a singleton match is not evidence.
if(overlap < 2) continue;
double excess = overlap - expected;
double score = excess / clusterSize;
if(score > threshold) {
// Display label resolved via lci; presumably a human-readable compound
// name for the InChI — TODO confirm lci.getName semantics.
String s = "InChi: " + lci.getName(lci.hitsByInChI(inchi));
scores.put(s, score);
df.add(s, overlap);
}
}
// Ontology terms: expand each ontology ID to its query term set, then score.
Map<String,Set<String>> ontQs = OBOOntology.getInstance().queriesForIds(onts);
for(String ontQ : ontQs.keySet()) {
// Original direct-query implementation, kept for reference; superseded by
// the shared OntologyQueryCache lookup below to avoid re-running identical
// boolean queries per cluster.
/*BooleanQuery bq = new BooleanQuery(true);
if(ontQs.get(ontQ).size() > BooleanQuery.getMaxClauseCount()) continue;
for(String ont : ontQs.get(ontQ)) {
bq.add(new BooleanClause(new TermQuery(new Term("Ontology", ont)), Occur.SHOULD));
}
VectorCollector vc = new VectorCollector();
is.search(bq, vc);*/
VectorCollector vc = OntologyQueryCache.getResultsStatic(ontQ, ontQs.get(ontQ), is);
Map<Integer,Float> results = vc.getResultsVector();
// Same excess-frequency scoring as the InChI loop above.
double expected = scaleFactor * results.size();
int overlap = overlapDocs(vc.getResultsVector(), cluster);
// Require at least two cluster documents, as for InChIs.
if(overlap < 2) continue;
double excess = overlap - expected;
double score = excess / clusterSize;
if(score > threshold) {
// Feature label: ontology ID plus its human-readable name.
String s = ontQ + " " + OBOOntology.getInstance().getNameForID(ontQ);