IndexReader ir;
try
{
ir = IndexReader.open(FSDirectory.open(nsdl_index_dir), true);
//IndexSearcher searcher = new IndexSearcher(ir);
SnowballAnalyzer analyzer = new SnowballAnalyzer(SRM.VERSION , "Porter" , SRM.stopWords);
TokenStream ts ;
TermAttribute termAtt;
// Query data: load the test document (testDocId) and run its "title",
// "content" and "desc" field values through the analyzer, adding every
// emitted term to rTitle, rContent and rDesc respectively. No minimum term
// length is applied here.
// NOTE(review): Document.get() returns null for a field that is not stored,
// and new StringReader(null) would then throw — confirm all three fields are
// always present.
// NOTE(review): the streams are consumed without reset()/end()/close();
// newer Lucene releases require that workflow — confirm against the Lucene
// version in use.
doc = (Document)ir.document(testDocId);
// Title terms of the test document.
ts = analyzer.tokenStream("title", new StringReader(doc.get("title")));
termAtt = ts.addAttribute(TermAttribute.class);
while (ts.incrementToken())
{
rTitle.add(termAtt.term());
}
// Content terms of the test document.
ts = analyzer.tokenStream("content", new StringReader(doc.get("content")));
termAtt = ts.addAttribute(TermAttribute.class);
while (ts.incrementToken())
{
rContent.add(termAtt.term());
}
// Description terms of the test document.
ts = analyzer.tokenStream("desc", new StringReader(doc.get("desc")));
termAtt = ts.addAttribute(TermAttribute.class);
while (ts.incrementToken())
{
rDesc.add(termAtt.term());
}
// Iterate over all training records to score each (train, test) document pair
Iterator <Integer> trainIterator = trainDocIds.iterator();
while (trainIterator.hasNext())
{
int docId = (Integer)trainIterator.next();
doc = (Document)ir.document(docId);
//********************** Title Similarity Score ***************
ts = analyzer.tokenStream("title", new StringReader(doc.get("title")));
termAtt = ts.addAttribute(TermAttribute.class);
// Construct a HashMap of Train record title
titleMap = new HashMap<String , Integer>();
titleNI = 0;
while(ts.incrementToken())
{
tempToken = termAtt.term();
if (tempToken.length() > 2)
{
titleNI++;
if (titleMap.containsKey(tempToken))
{
titleMap.put(tempToken, titleMap.get(tempToken) + 1);
}
else
titleMap.put(tempToken, 1);
}
}
// Iterate over query title set to find similarity score
iterator = rTitle.iterator();
titleSimScore = 0.0;
while(iterator.hasNext())
{
tempToken = iterator.next();
if (titleMap.containsKey(tempToken))
{
titleSimScore += ((double)titleMap.get(tempToken) + (100 * titleVocabMap.get(tempToken)/titleLength))/(titleNI + 100);
}
}
//********************** Description Similarity Score ****************
ts = analyzer.tokenStream("desc", new StringReader(doc.get("desc")));
termAtt = ts.addAttribute(TermAttribute.class);
// Construct a HashMap of Train record description
descMap = new HashMap<String , Integer>();
descNI = 0;
while(ts.incrementToken())
{
tempToken = termAtt.term();
if (tempToken.length() > 2)
{
descNI++;
if (descMap.containsKey(tempToken))
{
descMap.put(tempToken, descMap.get(tempToken) + 1);
}
else
descMap.put(tempToken, 1);
}
}
/// Iterate over query description set to find similarity score
iterator = rDesc.iterator();
descSimScore = 0.0;
while(iterator.hasNext())
{
tempToken = iterator.next();
if (descMap.containsKey(tempToken))
{
descSimScore += ((double)descMap.get(tempToken) + (100 * descVocabMap.get(tempToken))/descLength)/(descNI + 100);
}
}
//********************** Content Similarity Score ****************
ts = analyzer.tokenStream("content", new StringReader(doc.get("content")));
termAtt = ts.addAttribute(TermAttribute.class);
// Construct a HashMap of Train record content
contentMap = new HashMap<String , Integer>();
contentNI = 0;