package org.sf.mustru.search;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.utils.Constants;
import org.sf.mustru.utils.DbTools;
import org.sf.mustru.utils.LingpipeTools;
import org.sf.mustru.utils.StandardBgramAnalyzer;
import com.sleepycat.bind.tuple.TupleBinding;
import com.sleepycat.je.DatabaseEntry;
/**
 * A collection of search tools to get a multi-searcher, expand queries, and find similar documents.
 * <ol>
 * <li> getSearcher: Return an index searcher or a multi-searcher based on the passed index directory <br>
 * <li> expandQuery: Expand the passed query using the tokens and entities extracted from it <br>
 * <li> similarDocs: Extract the tokens from a document and generate a query <br>
 * <li> bestPassages: Accept a text chunk and query words and return the best passages from the text <br>
 * <li> qwordPattern: A regular expression to identify the question words in a query
 * </ol>
 */
public class SearchTools
{
public final static Logger logger = Logger.getLogger(SearchTools.class.getName() );
public final static Pattern qwordPattern = Pattern.compile("^(.*?)(how long will|what should|how long is|which month|what place|what month|where were|where does|which date|which year|what time|what were|what does|where was|where can|where are|when were|when does|what day|what age|what are|what did|what can|who were|who does|how were|how does|how many|how tall|how high|how much|how long|why were|why does|where is|where do|when was|when are|when did|what is|what do|who are|who was|who did|how are|how was|how did|how old|how far|why are|why did|where's|when is|when do|what's|who is|who do|how is|how do|why is|why do|when's|who's|how's|why's|where|which|whom|when|who|how|why)(.*$)",
Pattern.CASE_INSENSITIVE);
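//*-- Illustrative example of the question-word pattern (the question text is made up):
//*--   Matcher m = qwordPattern.matcher("How old is the universe?");
//*--   if (m.matches()) { String qword = m.group(2); }   //*-- qword = "How old", m.group(3) = " is the universe?"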
private final static StandardBgramAnalyzer analyzer = new StandardBgramAnalyzer();
private final static LingpipeTools sentExtractor = new LingpipeTools();
public final static Set stopWords = StopFilter.makeStopSet(Constants.getSTOP_WORDS());
/**
 * Return an index searcher or a multi-searcher depending on whether the index directory contains any sub-directories.
 * An array of directories will be scanned and sub-directories will be individually added.
 * @param indexDir The index directory containing the index files or a set of directories that each contain index files
 * @param create Flag to create a new index
 * @return Searcher a single or multi-index searcher
 * @throws IOException if an index directory cannot be opened
 */
public static Searcher getSearcher(String indexDir, boolean create) throws IOException
{ String dirs[] = {indexDir}; return getSearcher ( dirs, create); }
public static Searcher getSearcher(String[] dirs, boolean create) throws IOException
{
ArrayList <String> idirs = new ArrayList<String>();
for (int i = 0; i < dirs.length; i++)
{ String[] files = new File(dirs[i]).list();
if (files == null) continue;   //*-- skip missing or unreadable directories
for (int j = 0; j < files.length; j++)
{ String dirName = dirs[i] + File.separator + files[j];
if ( new File(dirName).isDirectory() ) idirs.add(dirName);
}
}
//*-- return IndexSearcher if zero sub-directories
if (idirs.size() == 0)
{ Directory directory = FSDirectory.getDirectory( dirs[0], create);
return( new IndexSearcher(directory) );
}
//*-- otherwise return MultiSearcher
IndexSearcher[] searchers = new IndexSearcher[ idirs.size() ];
for (int i = 0; i < idirs.size(); i++)
{ Directory directory = FSDirectory.getDirectory( idirs.get(i), create);
searchers[i] = new IndexSearcher(directory);
}
return (new MultiSearcher(searchers));
}
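//*-- Illustrative usage sketch (the index path is hypothetical): a directory of index files
//*-- returns an IndexSearcher, while a directory of sub-indexes returns a MultiSearcher.
//*--   Searcher searcher = SearchTools.getSearcher("/data/mustru/index", false);
//*--   Hits hits = searcher.search(new TermQuery(new Term("contents", "lucene")));
//*--   logger.info("Found " + hits.length() + " matching documents");
//*--   searcher.close();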
/**
 * Accept a search engine query and return an expanded boolean query built from the extracted tokens and entities
 * @param query search engine query
 * @return expanded search engine query
 * @throws IOException if the query text cannot be tokenized
 */
public static Query expandQuery(String query) throws IOException
{
//*-- extract a list of tokens from the passed text
if (query == null) return null;
//*-- extract a list of tokens with entities
analyzer.setExtractEntities(true);
Token[] tokens = tokensFromAnalysis(analyzer, query);
BooleanQuery fullQuery = new BooleanQuery();
LOOP: for (int i = 0; i < tokens.length; i++)
{
String word = tokens[i].termText().toLowerCase(Constants.locale);
//*-- skip punctuation and very short words
if ( (word.length() < 2) || (stopWords.contains(word)) ) continue LOOP;
TermQuery tq = new TermQuery(new Term("contents", word));
tq.setBoost((float) 10.0); //*-- apply a uniform boost to each query term, including entity/bigram tokens
fullQuery.add(tq, BooleanClause.Occur.SHOULD);
}
return (fullQuery);
}
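//*-- Illustrative usage sketch: expand a free-text query and run it against a searcher from
//*-- getSearcher (the query text and index path are made up).
//*--   Query expanded = SearchTools.expandQuery("who invented the telephone");
//*--   Searcher searcher = SearchTools.getSearcher("/data/mustru/index", false);
//*--   Hits hits = searcher.search(expanded);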
/**
* Accept the key of a document and generate a query using tokens found in the document
* @param key File name key to identify the contents of the document in the DB
* @param dbt Database tools handle
* @return Query
*/
public static Query similarDocs(String key, DbTools dbt)
{
//*-- open the database environment
boolean readOnly = true; boolean dupFlag = false;
dbt.openDB(Constants.EXT_FILES_DB, readOnly, dupFlag);
BooleanQuery query = new BooleanQuery();
TupleBinding tpb = new IndexableDoc().getBdbBinding();
DatabaseEntry data = new DatabaseEntry();
if ( dbt.fetch(key, data) )
{
//*-- extract the text of the document
IndexableDoc idoc = (IndexableDoc) tpb.entryToObject(data);
String docText = idoc.getContents().toString();
//*-- tokenize the text
analyzer.setExtractEntities(false);
Token[] tokens = null;
try { tokens = tokensFromAnalysis(analyzer, docText); }
catch (IOException ie) { logger.error("IO Error: Could not tokenize: " + ie.getMessage()); tokens = new Token[0]; }
//*-- build a query from the individual tokens taken from the document, limit the
//*-- number of tokens from the document to 100.
int numTokens = 0;
LOOP: for (int i = 0; i < tokens.length; i++)
{ String token = tokens[i].termText().toLowerCase(Constants.locale);
if ( (stopWords.contains(token)) || (token.length() < 3)) continue;
TermQuery tq = new TermQuery(new Term("contents", token));
query.add(tq, BooleanClause.Occur.SHOULD);
if (++numTokens >= 100) break LOOP;
} //*-- end of for
} //*-- end of if
dbt.closeDB();
return(query);
}
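//*-- Illustrative usage sketch: dbt is assumed to be an already initialized DbTools handle for the
//*-- document database, and the file name key below is hypothetical.
//*--   Query likeThis = SearchTools.similarDocs("/docs/report.txt", dbt);
//*--   Hits related = searcher.search(likeThis);   //*-- searcher obtained from getSearcher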
/**
 * Segment the passed text into passages. Score each passage based on the degree of overlap
 * between passage words and the nouns, verbs, adjectives, bigrams, and entities from the question
 * @param text String Text to be chunked
 * @param nouns String[] Nouns found in the question
 * @param verbs String[] Verbs found in the question
 * @param adjectives String[] Adjectives found in the question
 * @param bigrams String[] Bigrams found in the question
 * @param entities String[] Entity types found in the question
 * @return String[] top two passages from the passed text
 */
public static String[] bestPassages(String text, String[] nouns, String[] verbs, String[] adjectives, String[] bigrams, String[] entities)
throws IOException
{
//*-- limits for text size and weights for POS
int TEXT_LIMIT = 25000; //*-- limit the size of text that will be chunked
double BIGRAM_WT = 16.0; //*-- high weight for bigrams
double NOUN_WT = 2.0; //*-- moderate weight for nouns
double VERB_WT = 2.0; //*-- moderate weight for verbs
double ADJ_WT = 1.0; //*-- lower weight for adjectives
double ENTITY_WT = 32.0; //*-- highest weight for entities
//*-- flags to indicate if the query words were seen in the passages
boolean[] foundNouns = new boolean[nouns.length] ;
boolean[] foundAdjs = new boolean[adjectives.length];
boolean[] foundVerbs = new boolean[verbs.length];
boolean[] foundEnts = new boolean[entities.length];
boolean[] foundBigrams = new boolean[bigrams.length];
String[] topSentences = {"", ""}; //*-- initialize the top sentences returned to caller
//*-- limit the length of the text
if (text.length() > TEXT_LIMIT) text = text.substring(0, TEXT_LIMIT);
//*-- extract sentences from the text
sentExtractor.setMaxCharsPerSentence(250); sentExtractor.setMinCharsPerSentence(100);
sentExtractor.buildSentences(text);
analyzer.setExtractEntities(true);
String sentence = "";
double bestScore = 0.0;
double secondBestScore = 0.0;
String prevSentence = "";
for (int i = 0; i < bigrams.length; i++) bigrams[i] = bigrams[i].replace('_', ' ');
while ( (sentence = sentExtractor.nextSentence()) != null)
{
//*-- add part of the previous sentence, if it exists
if (sentence.length() < sentExtractor.getMaxCharsPerSentence() && (prevSentence.length() > 0))
{ int addChars = sentExtractor.getMaxCharsPerSentence() - sentence.length();
int beginIndex = (prevSentence.length() > addChars) ? prevSentence.length() - addChars: 0;
sentence = prevSentence.substring(beginIndex, prevSentence.length()) + sentence;
}
//*-- build a list of tokens from the sentence
TokenStream stream = analyzer.tokenStream("contents", new StringReader(sentence));
ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
while ( (token = stream.next()) != null) tokenList.add(token);
//*-- initialize the boolean arrays and scores
boolean[] foundWords = new boolean[tokenList.size()];
for (int i = 0; i < nouns.length; i++) foundNouns[i] = false;
for (int i = 0; i < verbs.length; i++) foundVerbs[i] = false;
for (int i = 0; i < adjectives.length; i++) foundAdjs[i] = false;
for (int i = 0; i < entities.length; i++) foundEnts[i] = false;
for (int i = 0; i < bigrams.length; i++) foundBigrams[i] = false;
for (int i = 0; i < tokenList.size(); i++) foundWords[i] = false;
//*-- loop thru the tokens and build the match score
double matchScore = 0.0;
for (int i = 0; i < tokenList.size(); i++)
{
String tokText = tokenList.get(i).termText();
boolean foundWord = false;
for (int j = 0; j < nouns.length; j++) //*-- scan the list of nouns
if (tokText.equalsIgnoreCase(nouns[j]) && !foundNouns[j])
{ matchScore += (NOUN_WT * (nouns.length - j) ); foundNouns[j] = true; foundWord = true; }
// { matchScore += (NOUN_WT ); foundNouns[j] = true; foundWord = true; }
for (int j = 0; j < verbs.length; j++) //*-- scan the list of verbs
if (tokText.equalsIgnoreCase(verbs[j]) && !foundVerbs[j])
{ matchScore += (VERB_WT * (verbs.length - j) ); foundVerbs[j] = true; foundWord = true; }
for (int j = 0; j < adjectives.length; j++) //*-- scan the list of adjectives
if (tokText.equalsIgnoreCase(adjectives[j]) && !foundAdjs[j])
{ matchScore += (ADJ_WT * (adjectives.length - j) ); foundAdjs[j] = true; foundWord = true; }
for (int j = 0; j < entities.length; j++) //*-- scan the list of entity types
if (tokText.equalsIgnoreCase(entities[j]) && !foundEnts[j])
{ matchScore += ENTITY_WT; foundEnts[j] = true; foundWord = true; }
String bigram = (tokenList.get(i).type().equals("<BIGRAM>")) ? tokenList.get(i).termText().replace('_', ' '):
(i > 0) ? tokenList.get(i-1).termText() + " " + tokText: null;
if (bigram != null)
for (int j = 0; j < bigrams.length; j++) //*-- scan the list of bigrams
if (bigram.equalsIgnoreCase(bigrams[j]) && !foundBigrams[j])
{ matchScore += BIGRAM_WT; foundBigrams[j] = true; foundWord = true; }
if (foundWord) foundWords[i] = true;
} //*-- end of for
//*-- compute the dispersion score -- how close are the query words
double dispScore = 0.0;
for (int i = 1; i < foundWords.length; i++) if (foundWords[i] && foundWords[i-1]) dispScore += BIGRAM_WT;
//*-- compute the mismatch score -- the number of query words not seen in the passage
double misMatchScore = 0.0;
for (int i = 0; i < nouns.length; i++) if (!foundNouns[i]) misMatchScore += NOUN_WT;
for (int i = 0; i < verbs.length; i++) if (!foundVerbs[i]) misMatchScore += VERB_WT;
for (int i = 0; i < adjectives.length; i++) if (!foundAdjs[i]) misMatchScore += ADJ_WT;
for (int i = 0; i < entities.length; i++) if (!foundEnts[i]) misMatchScore += ENTITY_WT;
misMatchScore *= 0.25;
//*-- compute the total score
double totalScore = matchScore + dispScore - misMatchScore; if (totalScore < 0.0) totalScore = 0.0;
if (totalScore >= bestScore)
{ if (bestScore >= secondBestScore) //*-- move the best sentence to second best
{ secondBestScore = bestScore; topSentences[1] = topSentences[0]; }
bestScore = totalScore; topSentences[0] = sentence;
}
else if (totalScore > secondBestScore)
{ secondBestScore = totalScore; topSentences[1] = sentence; }
prevSentence = sentence;
} //*-- end of while
return (topSentences);
}
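//*-- Illustrative usage sketch: the question words below are made up and would normally come from a
//*-- question analysis step; docText is a previously fetched document text, bigrams use the analyzer's
//*-- underscore form, and the entity list is left empty here.
//*--   String[] nouns = {"telephone"}; String[] verbs = {"invented"}; String[] adjs = {};
//*--   String[] bigrams = {"invented_telephone"}; String[] entities = {};
//*--   String[] passages = SearchTools.bestPassages(docText, nouns, verbs, adjs, bigrams, entities);
//*--   //*-- passages[0] and passages[1] hold the two highest scoring passages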
/**
* Use the passed analyzer to get a list of tokens from the text
*/
private static Token[] tokensFromAnalysis(Analyzer analyzer, String text) throws IOException
{
TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
while ( (token = stream.next()) != null) tokenList.add(token);
Token[] tokens = new Token[tokenList.size()];
for (int i = 0; i < tokens.length; i++) tokens[i] = tokenList.get(i);
return (tokens);
}
}