return scores;
}
public static void analyseCluster(Map<Integer,Float> cluster, IndexReader ir, DocVectorSimilarity similarity, double threshold) throws Exception {
LuceneChemicalIndex lci = new LuceneIndexerSearcher(false).getLci();
List<File> clusterFiles = new ArrayList<File>();
Bag<String> dfs = new Bag<String>();
Set<String> inchis = new HashSet<String>();
Set<String> onts = new HashSet<String>();
for(Integer i : cluster.keySet()) {
cluster.put(i, 1.0f);
TermFreqVector tvf = ir.getTermFreqVector(i, "txt");
String [] termArray = tvf.getTerms();
for(int j=0;j<termArray.length;j++) {
dfs.add(termArray[j]);
}
if(false) {
tvf = ir.getTermFreqVector(i, "InChI");
if(tvf != null) {
termArray = tvf.getTerms();
for(int j=0;j<termArray.length;j++) {
inchis.add(termArray[j]);
}
}
tvf = ir.getTermFreqVector(i, "Ontology");
if(tvf != null) {
termArray = tvf.getTerms();
for(int j=0;j<termArray.length;j++) {
onts.add(termArray[j]);
}
}
}
clusterFiles.add(new File(ir.document(i).getField("filename").stringValue().replaceAll("markedup", "source")));
}
Stemmer st = new Stemmer(new EnglishStemmer());
Map<String,List<String>> stems = st.wordsToStems(dfs.getSet());
dfs.discardInfrequent(2);
NGramTfDf ngtd = NGramTfDf.analyseFiles(clusterFiles);
ngtd.calculateNGrams();
Bag<String> bs = ngtd.getDfBag(2);
bs.discardInfrequent(2);
Map<String,List<String>> termStems = ngtd.ngramsByStem();
Map<String,Double> scores = new HashMap<String,Double>();
Map<String,Integer> overlaps = new HashMap<String,Integer>();
IndexSearcher is = new IndexSearcher(ir);
int docTotal = ir.numDocs();
for(String term : dfs.getSet()) {
if(TermSets.getClosedClass().contains(term) || term.matches("[^A-Za-z]+")) continue;
Term luceneTerm = new Term("txt", term);
Query q = new TermQuery(luceneTerm);
VectorCollector vc = new VectorCollector();
is.search(q, vc);
double score = similarity.similarity(cluster, vc.getResultsVector());
if(score > threshold) {
int overlap = overlapDocs(vc.getResultsVector(), cluster);
if(overlap > 1) {
scores.put(term, score);
overlaps.put(term, overlap);
}
}
}
for(String stem : stems.keySet()) {
List<String> words = stems.get(stem);
if(words.size() > 1) {
BooleanQuery bq = new BooleanQuery(true);
for(String word : words) {
bq.add(new BooleanClause(new TermQuery(new Term("txt", word)), Occur.SHOULD));
}
VectorCollector vc = new VectorCollector();
is.search(bq, vc);
double score = similarity.similarity(cluster, vc.getResultsVector());
if(score > threshold) {
String s = words.toString();
int overlap = overlapDocs(vc.getResultsVector(), cluster);
if(overlap > 1) {
scores.put(s, score);
overlaps.put(s, overlap);
}
}
}
}
for(String stem : termStems.keySet()) {
List<String> multiWords = termStems.get(stem);
if(multiWords.size() > 1) {
BooleanQuery bq = new BooleanQuery(true);
for(String multiWord : multiWords) {
PhraseQuery pq = new PhraseQuery();
for(String ss : StringTools.arrayToList(multiWord.split("\\s+"))) pq.add(new Term("txt", ss));
bq.add(new BooleanClause(pq, Occur.SHOULD));
}
VectorCollector vc = new VectorCollector();
is.search(bq, vc);
double score = similarity.similarity(cluster, vc.getResultsVector());
if(score > threshold) {
String s = multiWords.toString();
int overlap = overlapDocs(vc.getResultsVector(), cluster);
if(overlap > 1) {
scores.put(s, score);
overlaps.put(s, overlap);
}
}
}
}
for(String s : bs.getList()) {
if(!s.matches(".*\\s+.*")) continue;
PhraseQuery pq = new PhraseQuery();
for(String ss : StringTools.arrayToList(s.split("\\s+"))) pq.add(new Term("txt", ss));
VectorCollector vc = new VectorCollector();
is.search(pq, vc);
double score = similarity.similarity(cluster, vc.getResultsVector());
if(score > threshold) {
scores.put(s, score);
overlaps.put(s, overlapDocs(vc.getResultsVector(), cluster));
}
}
if(false) {
for(String inchi : inchis) {
Term luceneTerm = new Term("InChI", inchi);
Query q = new TermQuery(luceneTerm);
VectorCollector vc = new VectorCollector();
is.search(q, vc);
double score = similarity.similarity(cluster, vc.getResultsVector());
if(score > threshold) {
int overlap = overlapDocs(vc.getResultsVector(), cluster);
if(overlap > 1) {
String s = "InChi: " + lci.getName(lci.hitsByInChI(inchi));
scores.put(s, score);
overlaps.put(s, overlap);
}
}
}