package org.sf.mustru.search;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.utils.Constants;
import org.sf.mustru.utils.DbTools;
import org.sf.mustru.utils.LingpipeTools;
import org.sf.mustru.utils.StandardBgramAnalyzer;
import com.sleepycat.bind.tuple.TupleBinding;
import com.sleepycat.je.DatabaseEntry;
/**
 * A collection of search tools to get a multi-searcher, expand queries, and find similar documents.
 * <ol>
 * <li> getSearcher: Return an index searcher or a multi-searcher based on the passed index directory <br>
 * <li> expandQuery: Expand the passed query using the tokens and entities extracted from it <br>
 * <li> similarDocs: Extract the tokens from a document and generate a query <br>
 * <li> bestPassages: Accept a text chunk and query words and return the best passages from the text <br>
 * <li> qwordPattern: A regular expression to identify the question words in a query
 * </ol>
 */
public class SearchTools
{
public final static Logger logger = Logger.getLogger(SearchTools.class.getName() );
public final static Pattern qwordPattern = Pattern.compile("^(.*?)(how long will|what should|how long is|which month|what place|what month|where were|where does|which date|which year|what time|what were|what does|where was|where can|where are|when were|when does|what day|what age|what are|what did|what can|who were|who does|how were|how does|how many|how tall|how high|how much|how long|why were|why does|where is|where do|when was|when are|when did|what is|what do|who are|who was|who did|how are|how was|how did|how old|how far|why are|why did|where's|when is|when do|what's|who is|who do|how is|how do|why is|why do|when's|who's|how's|why's|where|which|whom|when|who|how|why)(.*$)",
Pattern.CASE_INSENSITIVE);
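//*-- Illustrative example of the question-word pattern (the question text is made up):
//*--   Matcher m = qwordPattern.matcher("How old is the universe?");
//*--   if (m.matches()) { String qword = m.group(2); }   //*-- qword = "How old", m.group(3) = " is the universe?"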
private final static StandardBgramAnalyzer analyzer = new StandardBgramAnalyzer();
private final static LingpipeTools sentExtractor = new LingpipeTools();
public final static Set stopWords = StopFilter.makeStopSet(Constants.getSTOP_WORDS());
/**
 * Return an index searcher or a multi-searcher depending on whether the index directory contains any sub-directories.
 * An array of directories will be scanned and sub-directories will be individually added.
 * @param indexDir The index directory containing the index files or a set of directories that each contain index files
 * @param create Flag to create a new index
 * @return Searcher a single or multi-index searcher
 * @throws IOException if an index directory cannot be opened
 */
public static Searcher getSearcher(String indexDir, boolean create) throws IOException
{ String dirs[] = {indexDir}; return getSearcher ( dirs, create); }
public static Searcher getSearcher(String[] dirs, boolean create) throws IOException
{
ArrayList <String> idirs = new ArrayList<String>();
for (int i = 0; i < dirs.length; i++)
{ String[] files = new File(dirs[i]).list();
if (files == null) continue;   //*-- skip missing or unreadable directories
for (int j = 0; j < files.length; j++)
{ String dirName = dirs[i] + File.separator + files[j];
if ( new File(dirName).isDirectory() ) idirs.add(dirName);
}
}
//*-- return IndexSearcher if zero sub-directories
if (idirs.size() == 0)
{ Directory directory = FSDirectory.getDirectory( dirs[0], create);
return( new IndexSearcher(directory) );
}
//*-- otherwise return MultiSearcher
IndexSearcher[] searchers = new IndexSearcher[ idirs.size() ];
for (int i = 0; i < idirs.size(); i++)
{ Directory directory = FSDirectory.getDirectory( idirs.get(i), create);
searchers[i] = new IndexSearcher(directory);
}
return (new MultiSearcher(searchers));
}
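//*-- Illustrative usage sketch (the index path is hypothetical): a directory of index files
//*-- returns an IndexSearcher, while a directory of sub-indexes returns a MultiSearcher.
//*--   Searcher searcher = SearchTools.getSearcher("/data/mustru/index", false);
//*--   Hits hits = searcher.search(new TermQuery(new Term("contents", "lucene")));
//*--   logger.info("Found " + hits.length() + " matching documents");
//*--   searcher.close();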
/**
 * Accept a search engine query and return an expanded boolean query built from the extracted tokens and entities
 * @param query search engine query
 * @return expanded search engine query
 * @throws IOException if the query text cannot be tokenized
 */
public static Query expandQuery(String query) throws IOException
{
//*-- extract a list of tokens from the passed text
if (query == null) return null;
//*-- extract a list of tokens with entities
analyzer.setExtractEntities(true);
Token[] tokens = tokensFromAnalysis(analyzer, query);
BooleanQuery fullQuery = new BooleanQuery();
LOOP: for (int i = 0; i < tokens.length; i++)
{
String word = tokens[i].termText().toLowerCase(Constants.locale);
//*-- skip punctuation and very short words
if ( (word.length() < 2) || (stopWords.contains(word)) ) continue LOOP;
TermQuery tq = new TermQuery(new Term("contents", word));
tq.setBoost((float) 10.0); //*-- apply a uniform boost to each query term, including entity/bigram tokens
fullQuery.add(tq, BooleanClause.Occur.SHOULD);
}
return (fullQuery);
}
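//*-- Illustrative usage sketch: expand a free-text query and run it against a searcher from
//*-- getSearcher (the query text and index path are made up).
//*--   Query expanded = SearchTools.expandQuery("who invented the telephone");
//*--   Searcher searcher = SearchTools.getSearcher("/data/mustru/index", false);
//*--   Hits hits = searcher.search(expanded);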
/**
* Accept the key of a document and generate a query using tokens found in the document
* @param key File name key to identify the contents of the document in the DB
* @param dbt Database tools handle
* @return Query
*/
public static Query similarDocs(String key, DbTools dbt)
{
//*-- open the database environment
boolean readOnly = true; boolean dupFlag = false;
dbt.openDB(Constants.EXT_FILES_DB, readOnly, dupFlag);
BooleanQuery query = new BooleanQuery();
TupleBinding tpb = new IndexableDoc().getBdbBinding();
DatabaseEntry data = new DatabaseEntry();
if ( dbt.fetch(key, data) )
{
//*-- extract the text of the document
IndexableDoc idoc = (IndexableDoc) tpb.entryToObject(data);
String docText = idoc.getContents().toString();
//*-- tokenize the text
analyzer.setExtractEntities(false);
Token[] tokens = null;
try { tokens = tokensFromAnalysis(analyzer, docText); }
catch (IOException ie) { logger.error("IO Error: Could not tokenize: " + ie.getMessage()); tokens = new Token[0]; }
//*-- build a query from the individual tokens taken from the document, limit the
//*-- number of tokens from the document to 100.
int numTokens = 0;
LOOP: for (int i = 0; i < tokens.length; i++)
{ String token = tokens[i].termText().toLowerCase(Constants.locale);
if ( (stopWords.contains(token)) || (token.length() < 3)) continue;
TermQuery tq = new TermQuery(new Term("contents", token));
query.add(tq, BooleanClause.Occur.SHOULD);
if (++numTokens >= 100) break LOOP;
} //*-- end of for
} //*-- end of if
dbt.closeDB();
return(query);
}
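//*-- Illustrative usage sketch: dbt is assumed to be an already initialized DbTools handle for the
//*-- document database, and the file name key below is hypothetical.
//*--   Query likeThis = SearchTools.similarDocs("/docs/report.txt", dbt);
//*--   Hits related = searcher.search(likeThis);   //*-- searcher obtained from getSearcher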
/**
 * Segment the passed text into passages. Score each passage based on the degree of overlap
 * between passage words and the nouns, verbs, adjectives, bigrams, and entities from the question
 * @param text String Text to be chunked
 * @param nouns String[] Nouns found in the question
 * @param verbs String[] Verbs found in the question
 * @param adjectives String[] Adjectives found in the question
 * @param bigrams String[] Bigrams found in the question
 * @param entities String[] Entity types found in the question
 * @return String[] top two passages from the passed text
 */
public static String[] bestPassages(String text, String[] nouns, String[] verbs, String[] adjectives, String[] bigrams, String[] entities)
throws IOException
{
//*-- limits for text size and weights for POS
int TEXT_LIMIT = 25000; //*-- limit the size of text that will be chunked
double BIGRAM_WT = 16.0; //*-- high weight for bigrams
double NOUN_WT = 2.0; //*-- moderate weight for nouns
double VERB_WT = 2.0; //*-- moderate weight for verbs
double ADJ_WT = 1.0; //*-- lower weight for adjectives
double ENTITY_WT = 32.0; //*-- highest weight for entities
//*-- flags to indicate if the query words were seen in the passages
boolean[] foundNouns = new boolean[nouns.length] ;
boolean[] foundAdjs = new boolean[adjectives.length];
boolean[] foundVerbs = new boolean[verbs.length];
boolean[] foundEnts = new boolean[entities.length];
boolean[] foundBigrams = new boolean[bigrams.length];
String[] topSentences = {"", ""}; //*-- initialize the top sentences returned to caller
//*-- limit the length of the text
if (text.length() > TEXT_LIMIT) text = text.substring(0, TEXT_LIMIT);
//*-- extract sentences from the text
sentExtractor.setMaxCharsPerSentence(250); sentExtractor.setMinCharsPerSentence(100);
sentExtractor.buildSentences(text);
analyzer.setExtractEntities(true);
String sentence = "";
double bestScore = 0.0;
double secondBestScore = 0.0;
String prevSentence = "";
for (int i = 0; i < bigrams.length; i++) bigrams[i] = bigrams[i].replace('_', ' ');
while ( (sentence = sentExtractor.nextSentence()) != null)
{
//*-- add part of the previous sentence, if it exists
if (sentence.length() < sentExtractor.getMaxCharsPerSentence() && (prevSentence.length() > 0))
{ int addChars = sentExtractor.getMaxCharsPerSentence() - sentence.length();
int beginIndex = (prevSentence.length() > addChars) ? prevSentence.length() - addChars: 0;
sentence = prevSentence.substring(beginIndex, prevSentence.length()) + sentence;
}
//*-- build a list of tokens from the sentence
TokenStream stream = analyzer.tokenStream("contents", new StringReader(sentence));
ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
while ( (token = stream.next()) != null) tokenList.add(token);
//*-- initialize the boolean arrays and scores
boolean[] foundWords = new boolean[tokenList.size()];
for (int i = 0; i < nouns.length; i++) foundNouns[i] = false;
for (int i = 0; i < verbs.length; i++) foundVerbs[i] = false;
for (int i = 0; i < adjectives.length; i++) foundAdjs[i] = false;
for (int i = 0; i < entities.length; i++) foundEnts[i] = false;
for (int i = 0; i < bigrams.length; i++) foundBigrams[i] = false;
for (int i = 0; i < tokenList.size(); i++) foundWords[i] = false;
//*-- loop thru the tokens and build the match score
double matchScore = 0.0;
for (int i = 0; i < tokenList.size(); i++)
{
String tokText = tokenList.get(i).termText();
boolean foundWord = false;
for (int j = 0; j < nouns.length; j++) //*-- scan the list of nouns
if (tokText.equalsIgnoreCase(nouns[j]) && !foundNouns[j])
{ matchScore += (NOUN_WT * (nouns.length - j) ); foundNouns[j] = true; foundWord = true; }
// { matchScore += (NOUN_WT ); foundNouns[j] = true; foundWord = true; }
for (int j = 0; j < verbs.length; j++) //*-- scan the list of verbs
if (tokText.equalsIgnoreCase(verbs[j]) && !foundVerbs[j])
{ matchScore += (VERB_WT * (verbs.length - j) ); foundVerbs[j] = true; foundWord = true; }
for (int j = 0; j < adjectives.length; j++) //*-- scan the list of adjectives
if (tokText.equalsIgnoreCase(adjectives[j]) && !foundAdjs[j])
{ matchScore += (ADJ_WT * (adjectives.length - j) ); foundAdjs[j] = true; foundWord = true; }
for (int j = 0; j < entities.length; j++) //*-- scan the list of entity types
if (tokText.equalsIgnoreCase(entities[j]) && !foundEnts[j])
{ matchScore += ENTITY_WT; foundEnts[j] = true; foundWord = true; }
String bigram = (tokenList.get(i).type().equals("<BIGRAM>")) ? tokenList.get(i).termText().replace('_', ' '):
(i > 0) ? tokenList.get(i-1).termText() + " " + tokText: null;
if (bigram != null)
for (int j = 0; j < bigrams.length; j++) //*-- scan the list of bigrams
if (bigram.equalsIgnoreCase(bigrams[j]) && !foundBigrams[j])
{ matchScore += BIGRAM_WT; foundBigrams[j] = true; foundWord = true; }
if (foundWord) foundWords[i] = true;
} //*-- end of for
//*-- compute the dispersion score -- how close are the query words
double dispScore = 0.0;
for (int i = 1; i < foundWords.length; i++) if (foundWords[i] && foundWords[i-1]) dispScore += BIGRAM_WT;
//*-- compute the mismatch score -- the number of query words not seen in the passage
double misMatchScore = 0.0;
for (int i = 0; i < nouns.length; i++) if (!foundNouns[i]) misMatchScore += NOUN_WT;
for (int i = 0; i < verbs.length; i++) if (!foundVerbs[i]) misMatchScore += VERB_WT;
for (int i = 0; i < adjectives.length; i++) if (!foundAdjs[i]) misMatchScore += ADJ_WT;
for (int i = 0; i < entities.length; i++) if (!foundEnts[i]) misMatchScore += ENTITY_WT;
misMatchScore *= 0.25;
//*-- compute the total score
double totalScore = matchScore + dispScore - misMatchScore; if (totalScore < 0.0) totalScore = 0.0;
if (totalScore >= bestScore)
{ if (bestScore >= secondBestScore) //*-- move the best sentence to second best
{ secondBestScore = bestScore; topSentences[1] = topSentences[0]; }
bestScore = totalScore; topSentences[0] = sentence;
}
else if (totalScore > secondBestScore)
{ secondBestScore = totalScore; topSentences[1] = sentence; }
prevSentence = sentence;
} //*-- end of while
return (topSentences);
}
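//*-- Illustrative usage sketch: the question words below are made up and would normally come from a
//*-- question analysis step; docText is a previously fetched document text, bigrams use the analyzer's
//*-- underscore form, and the entity list is left empty here.
//*--   String[] nouns = {"telephone"}; String[] verbs = {"invented"}; String[] adjs = {};
//*--   String[] bigrams = {"invented_telephone"}; String[] entities = {};
//*--   String[] passages = SearchTools.bestPassages(docText, nouns, verbs, adjs, bigrams, entities);
//*--   //*-- passages[0] and passages[1] hold the two highest scoring passages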
/**
* Use the passed analyzer to get a list of tokens from the text
*/
private static Token[] tokensFromAnalysis(Analyzer analyzer, String text) throws IOException
{
TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
while ( (token = stream.next()) != null) tokenList.add(token);
Token[] tokens = new Token[tokenList.size()];
for (int i = 0; i < tokens.length; i++) tokens[i] = tokenList.get(i);
return (tokens);
}
}