/*
* Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* -------------------
* To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
* http://www.manning.com/ingersoll
*/
package com.tamingtext.classifier.mlt;
import java.io.File;
import java.io.FileReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similar.MoreLikeThis;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Before;
import org.junit.Test;
public class MoreLikeThisQueryTest {
int nGramSize;
String inputPath;
String modelPath;
int maxResults;
String categoryFieldName;
@Before
public void setup() {
nGramSize = 1;
inputPath = "src/test/resources/classifier/mlt/sample-input.txt";
modelPath = "src/test/resources/classifier/mlt/sample-model";
maxResults = 100;
categoryFieldName = "category";
}
@Test
public void testMoreLikeThisQuery() throws Exception {
//<start id="lucene.examples.mlt.setup"/>
Directory directory = FSDirectory.open(new File(modelPath));
IndexReader indexReader = IndexReader.open(directory); //<co id="mlt.indexsetup"/>
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
Analyzer analyzer //<co id="mlt.analyzersetup"/>
= new EnglishAnalyzer(Version.LUCENE_36);
if (nGramSize > 1) { //<co id="mlt.ngramsetup"/>
analyzer = new ShingleAnalyzerWrapper(analyzer, nGramSize,
nGramSize);
}
MoreLikeThis moreLikeThis = new MoreLikeThis(indexReader); //<co id="mlt.configure"/>
moreLikeThis.setAnalyzer(analyzer);
moreLikeThis.setFieldNames(new String[] {
"content"
});
/*<calloutlist>
<callout arearefs="mlt.indexsetup">Open Index</callout>
<callout arearefs="mlt.analyzersetup">Setup Analyzer</callout>
<callout arearefs="mlt.ngramsetup">Setup NGrams</callout>
<callout arearefs="mlt.configure">Create <classname>MoreLikeThis</classname></callout>
</calloutlist>*/
//<end id="lucene.examples.mlt.setup"/>
// for testing against the same corpus
moreLikeThis.setMinTermFreq(1);
moreLikeThis.setMinDocFreq(1);
//<start id="lucene.examples.mlt.query"/>
Reader reader = new FileReader(inputPath); //<co id="mlt.query"/>
Query query = moreLikeThis.like(reader);
TopDocs results
= indexSearcher.search(query, maxResults); //<co id="mlt.search"/>
HashMap<String, CategoryHits> categoryHash
= new HashMap<String, CategoryHits>();
for (ScoreDoc sd: results.scoreDocs) { //<co id="mlt.collect"/>
Document d = indexReader.document(sd.doc);
Fieldable f = d.getFieldable(categoryFieldName);
String cat = f.stringValue();
CategoryHits ch = categoryHash.get(cat);
if (ch == null) {
ch = new CategoryHits();
ch.setLabel(cat);
categoryHash.put(cat, ch);
}
ch.incrementScore(sd.score);
}
SortedSet<CategoryHits> sortedCats //<co id="mlt.rank"/>
= new TreeSet<CategoryHits>(CategoryHits.byScoreComparator());
sortedCats.addAll(categoryHash.values());
for (CategoryHits c: sortedCats) { //<co id="mlt.display"/>
System.out.println(
c.getLabel() + "\t" + c.getScore());
}
/*<calloutlist>
<callout arearefs="mlt.query">Create Query</callout>
<callout arearefs="mlt.search">Perform Search</callout>
<callout arearefs="mlt.collect">Collect Results</callout>
<callout arearefs="mlt.rank">Rank Categories</callout>
<callout arearefs="mlt.display">Display Categories</callout>
</calloutlist>*/
//<end id="lucene.examples.mlt.query"/>
}
}