package org.sf.mustru.search;
import java.io.IOException;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TermQuery;
import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.utils.Constants;
import org.sf.mustru.utils.DbTools;
import org.sf.mustru.utils.LingpipeTools;
import org.sf.mustru.utils.StandardBgramAnalyzer;
//import org.sf.mustru.utils.WordnetTools;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import com.sleepycat.je.DatabaseEntry;
import org.apache.log4j.PropertyConfigurator;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.Searcher;
/**
* Submit natural language questions to the search engine after conversion to a query.
* Return a list of hits
*/
public class SearchQuestion extends DefaultHandler
{
static Logger logger = Logger.getLogger(SearchQuestion.class.getName() );
private Query query = null; //*-- a lucene query
private Searcher is = null; //*-- lucene index searcher (null if the index failed to open)
private LingpipeTools posTagger; //*-- tools to tag the question with POS and entity type
private Set stopWords; //*-- list of stop words (raw Set: pre-generics Lucene StopFilter API)
private Set questionWords; //*-- list of question words
private ArrayList<String> adjectives = null; //*-- list of adjectives in the question
private ArrayList<String> nouns = null; //*-- list of nouns in the question
private ArrayList<String> verbs = null; //*-- list of verbs in the question
private ArrayList<String> bigrams = null; //*-- list of bigrams in the question
private ArrayList<String> trigrams = null; //*-- list of trigrams in the question
private ArrayList<String> entities = null; //*-- list of entities in the question
private ArrayList<String> qwords = null; //*-- list of question words
public final static Pattern whatPattern = Pattern.compile("^(.*?)(what is|what do|what's)(.*$)", Pattern.CASE_INSENSITIVE);
public final static Pattern wherePattern = Pattern.compile("^(.*?)(where is|where|where's)(.*$)", Pattern.CASE_INSENSITIVE);
private SAXParser parser; //*-- parser for the XML tagged question
private StringBuffer elementBuffer; //*-- string buffer to capture tagged tokens
private SearchQuery sq; //*-- object to dump hits from a query result
private HashMap<String, String> qhash = Constants.getQtypeEntXref(); //*-- question type entity xref
//*-- Weights for generating questions
//*-- NOTE: these are mutable class-level settings (see the static getters/setters below),
//*-- so changing them affects every SearchQuestion instance in the JVM
private static float WT_ENTITY = (float) 1.0; //*-- Weight for an entity type in the question
private static float WT_QTYPE = (float) 2.0; //*-- Weight for the question type entity
private static float WT_UNIGRAM = (float) 16.0; //*-- Weight for the unigram words
private static float WT_BIGRAM = (float) 128.0; //*-- Weight for the bigram words
private static float WT_SYNONYMS= (float) 0.0; //*-- Weight for question synonyms
private static float WT_TRANSFORM = (float) 1.0; //*-- Weight for the question transformations
public SearchQuestion()
{ PropertyConfigurator.configure (Constants.LOG4J_FILE);
//*-- create the index searcher
try { is = SearchTools.getSearcher(Constants.getINDEXDIR(), false); }
catch (IOException ie) { logger.error("IO Error in opening index"); }
//*-- set the db handler, part of speech tagger
//*-- modified the Lucene source code to prevent length normalization
float[] decoder = new float[256]; for (int i = 0; i < decoder.length; i++) decoder[i] = (float) 1.0;
Similarity.setNormDecoder(decoder);
is.setSimilarity(new SearchSimilarity());
String[] addEntities = {"Qword"};
posTagger = new LingpipeTools(); posTagger.setforPOS(addEntities);
sq = new SearchQuery();
elementBuffer = new StringBuffer();
stopWords = StopFilter.makeStopSet(Constants.getSTOP_WORDS());
questionWords = StopFilter.makeStopSet(Constants.getQwords());
try { SAXParserFactory spf = SAXParserFactory.newInstance(); parser = spf.newSAXParser(); }
catch (ParserConfigurationException pe)
{ logger.error("Cannot parse XML document Parse Config. Error" + pe.getMessage() ); }
catch (SAXException se)
{ logger.error("Cannot parse XML document SAX Error: " + se.getMessage() ); }
}
/**
 * Fetch the hits for the question. Translate the question into a search engine query
 * and submit to Lucene.
 * @param question A natural language question
 * @return Hits A hits object, or null when the search fails or the index is not open
 */
public Hits getHits(String question)
{
 //*-- the index searcher is null when the constructor failed to open the index;
 //*-- previously this caused a NullPointerException at is.search() below
 if (is == null)
 { logger.error("Index searcher is not open, cannot run query for: " + question); return null; }
 //*-- translate the question into a search engine query
 Hits hits = null;
 try { query = buildQuery(question);
       logger.info("Question: " + question + " is parsed to " + query);
       hits = is.search(query);
     }
 catch (IOException ie) { logger.error("IO Error in fetching hits for query " + question); hits = null; }
 return(hits);
}
/**
 * Dump the results of the query; delegates to the SearchQuery helper.
 * @param hits Hits object containing the list of hits for the question
 * @param explain Flag to show explanation of scores
 */
public void dumpHits(Hits hits, boolean explain)
{
 sq.dumpHits(hits, explain);
}
/**
 * Accept a question and return the explanation for the top hit only.
 * @param question Question string
 * @return String explanation
 */
public String explainAnswer(String question)
{
 //*-- default to explaining just the top-ranked hit (rank 0)
 int[] topRank = {0};
 return explainAnswer(question, topRank);
}
/**
 * Accept a question and a list of hit ranks, and return the score explanations
 * for the hits at those ranks (limited to the top 100 hits).
 * @param question Question string
 * @param ranks 0-based ranks of the hits to explain; null returns an empty string
 * @return String explanation text for the matching hits
 */
public String explainAnswer(String question, int[] ranks)
{
 StringBuffer retv = new StringBuffer();
 if (ranks == null) return ("");
 try
 {
  //*-- submit the question to the search engine and fetch the hits
  Hits hits = getHits(question);
  if (hits == null) throw new IOException("Could not find any hits for question " + question);
  //*-- build the list of answers
  DbTools dbt = Constants.getDbt();
  dbt.openDB(Constants.EXT_FILES_DB, true, false); //*-- read only access
  //*-- NOTE(review): this database handle is never explicitly closed here — confirm DbTools manages it
  Explanation explanation;
  LOOP: for (int i = 0; i < hits.length(); i++)
  {
   //*-- limit explanations to the top 100 hits (fixed off-by-one: "i > 100" allowed 101 hits)
   if (i >= 100) break LOOP; boolean foundHit = false;
   //*-- check if the hit rank matches the passed rank
   for (int j = 0; j < ranks.length; j++) if (ranks[j] == i) foundHit = true;
   if (!foundHit) continue LOOP;
   retv.append("Document: " + i + Constants.NEWLINE);
   explanation = is.explain(query, hits.id(i));
   Document doc = hits.doc(i);
   String key = doc.get("key");
   DatabaseEntry data = new DatabaseEntry();
   if (!dbt.fetch(key, data)) continue LOOP;
   //*-- extract the text of the document from the BDB record (truncated to 1000 chars)
   IndexableDoc idoc = new IndexableDoc();
   idoc = (IndexableDoc) idoc.getBdbBinding().entryToObject(data);
   String line= idoc.getContents().toString();
   if (line.length() > 1000) line = line.substring(0, 999);
   retv.append(" Score: " + hits.score(i) + " TEXT: " + line + Constants.NEWLINE);
   retv.append(explanation.toString());
   retv.append("------------------------------------------------------------------");
   retv.append(Constants.NEWLINE); retv.append(Constants.NEWLINE);
  }
 } //*-- end of try
 catch (IOException ie)
 { logger.error("IO Error " + ie.getMessage()); }
 return(retv.toString());
}
/**
 * Accept a natural language question and return a search engine query.
 * The query is assembled in parts: (1) question-type entities, (2) entities found
 * in the question, (3) weighted bigrams and unigrams, and (4) question
 * transformations for "what is"/"where is" style questions.
 * @param question natural language question (null returns a null query)
 * @return search engine query
 * @throws IOException if tokenization of the question fails
 */
public Query buildQuery(String question) throws IOException
{
 //*-- extract a list of tokens from the passed text
 if (question == null) return null;
 question = question.replaceAll("[^a-zA-Z0-9]", " ");
 question = question.trim();
 //*-- initialize the type lists for the question and tag the question
 //*-- use the SAX Parser to parse the tagged output and build the type lists
 //*-- parse the question and build the query in parts
 parseQuestion(question);
 //*-- get the list of synonyms for the first noun, adjective, and verb
 /* StringBuffer synBuffer = new StringBuffer();
 synBuffer.append( (nouns.size() > 0) ? wnetTools.getSynonyms(nouns.get(0), "n"):""); synBuffer.append(" ");
 synBuffer.append( (verbs.size() > 0) ? wnetTools.getSynonyms(verbs.get(0), "v"):""); synBuffer.append(" ");
 synBuffer.append( (adjectives.size() > 0) ? wnetTools.getSynonyms(adjectives.get(0), "a"):"");
 String[] synonyms = synBuffer.toString().trim().split(" ");
 */
 //*-- tokenize the question
 StandardBgramAnalyzer analyzer = new StandardBgramAnalyzer(); analyzer.setExtractEntities(true);
 TokenStream stream = analyzer.tokenStream("contents", new StringReader(question));
 ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
 entities = new ArrayList<String>(); //*-- list of entities in the question
 while ( (token = stream.next()) != null)
 { tokenList.add(token); if (token.type().equals("<ENTITY>")) entities.add(token.termText()); }
 //*-------------------------------------------------------------------
 //*-- build the query with the five components
 //*--
 //*-- 1. First identify the entity types for the query
 //*-------------------------------------------------------------------
 StringBuffer queryString = new StringBuffer();
 NumberFormat nf = NumberFormat.getInstance();
 nf.setMaximumIntegerDigits(3); nf.setMaximumFractionDigits(4);
 float wt = WT_QTYPE; //*--- Weight for question type entities
 BooleanQuery theQuery = new BooleanQuery();
 LOOP: for (int i = 0; i < tokenList.size(); i++)
 {
  //*-- first try two word query tokens and then single word tokens
  String etype = null;
  if (i > 0) etype = qhash.get( tokenList.get(i - 1).termText() + " " + tokenList.get(i).termText() );
  if ( (etype == null) || (etype.length() < 2)) etype = qhash.get( tokenList.get(i).termText() );
  if ( (etype != null) && (etype.length() > 2) )
  { String[] etypes = etype.split("OR");
    for (int j = 0; j < etypes.length; j++)
    { queryString.append("contents:" + etypes[j].trim() + "^" + nf.format(wt) + " ");
      TermQuery tq = new TermQuery( new Term("contents", etypes[j])); tq.setBoost(wt);
      theQuery.add(tq, BooleanClause.Occur.SHOULD);
      entities.add(etypes[j]);
    }
    break LOOP;
  }
 }
 //*-------------------------------------------
 //*-- 2. Find entities in the question words
 //*-------------------------------------------
 wt = WT_ENTITY;
 for (int i = 0; i < tokenList.size(); i++)
 { //*-- NOTE(review): this checks type "ENTITY" while the loop above checks "<ENTITY>" —
   //*-- confirm which type string StandardBgramAnalyzer emits; one of the two may never match
   if ( tokenList.get(i).type().equals("ENTITY") )
   { String qword = tokenList.get(i).termText();
     queryString.append("contents:" + qword + "^" + nf.format(wt) + " ");
     TermQuery tq = new TermQuery( new Term("contents", qword)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
   }
 }
 //*-------------------------------------------------------------------------------
 //*-- 3. Create a list of weighted trigrams/bigrams/unigrams from the query
 //*-------------------------------------------------------------------------------
 int numNouns = nouns.size(); int numVerbs = verbs.size(); int numAdjectives = adjectives.size();
 String[] queryWords = question.split("\\s+"); int wordsLength = queryWords.length;
 boolean[] contentWord = new boolean[wordsLength];
 for (int i = 0; i < wordsLength; i++)
 { queryWords[i] = queryWords[i].toLowerCase(Constants.locale);
   contentWord[i] = false;
   for (int j = 0; j < nouns.size(); j++) if (queryWords[i].equalsIgnoreCase(nouns.get(j))) contentWord[i] = true;
   for (int j = 0; j < verbs.size(); j++) if (queryWords[i].equalsIgnoreCase(verbs.get(j))) contentWord[i] = true;
   for (int j = 0; j < adjectives.size(); j++) if (queryWords[i].equalsIgnoreCase(adjectives.get(j))) contentWord[i] = true;
 }
 String joinChar;
 //*-- generate all possible bigrams with higher weights for bigrams that do not have stopwords
 //*-- halve the bigram weight once for every doubling of the question length past 4 words
 float WT_NORM_BIGRAM = WT_BIGRAM;
 for (int i = 1; i < 4; i++) if (wordsLength > (Math.pow(2, (i + 1)))) WT_NORM_BIGRAM /= 2;
 LOOP2: for (int i = 1; i < wordsLength; i++)
 {
  //*-- skip if the previous word was a question word
  //*-- if the previous word was a stop word use a underscore to build the bigram, otherwise use a space
  wt = 0;
  if ( !questionWords.contains(queryWords[i-1]) )
  {
   if (stopWords.contains(queryWords[i-1]) && stopWords.contains(queryWords[i])) continue LOOP2;
   joinChar = (stopWords.contains(queryWords[i-1]) || stopWords.contains(queryWords[i])) ? "_": " ";
   for (int j = i-1; j < i+1; j++) wt += (contentWord[j]) ? WT_NORM_BIGRAM: 0;
   String bigram = queryWords[i-1] + joinChar + queryWords[i];
   queryString.append("contents:\"" + bigram + "\"~0^" + wt + " ");
   PhraseQuery pq = new PhraseQuery(); pq.add( new Term("contents", bigram)); pq.setBoost(wt); pq.setSlop(0);
   theQuery.add(pq, BooleanClause.Occur.SHOULD);
   bigrams.add(bigram);
  }
 } //*-- end of for
 //*-- create unigrams from non-stop words and weigh unigrams near the start of the question
 //*-- higher than unigrams near the end of the question
 LOOP3: for (int i = 0; i < wordsLength; i++)
 { wt = WT_UNIGRAM;
   //*-- skip punctuation and very short words
   if ( (queryWords[i].length() < 2) || (!contentWord[i]) ) continue LOOP3;
   wt *= ( (numNouns > 0) && (nouns.get(0).equalsIgnoreCase(queryWords[i])) ) ? 8:
         ( (numNouns > 1) && (nouns.get(1).equalsIgnoreCase(queryWords[i])) ) ? 4: 1;
   wt *= ( (numVerbs > 0) && (verbs.get(0).equalsIgnoreCase(queryWords[i])) ) ? 4:
         ( (numVerbs > 1) && (verbs.get(1).equalsIgnoreCase(queryWords[i])) ) ? 2: 1;
   wt *= ( (numAdjectives > 0) && (adjectives.get(0).equalsIgnoreCase(queryWords[i])) ) ? 4:
         ( (numAdjectives > 1) && (adjectives.get(1).equalsIgnoreCase(queryWords[i])) ) ? 2: 1;
   queryString.append("contents:" + queryWords[i] + "^" + nf.format(wt) + " ");
   TermQuery tq = new TermQuery( new Term("contents", queryWords[i])); tq.setBoost(wt);
   theQuery.add(tq, BooleanClause.Occur.SHOULD);
 } //*-- end of for
 //*--------------------------------------------------------------------------
 //*-- 4. Add the query transformation for the part. query type and add the synonyms
 //*--------------------------------------------------------------------------
 /* wt = WT_SYNONYMS;
 for (int j = 0; j < synonyms.length; j++)
 { queryString.append("contents:" + synonyms[j] + "^" + nf.format(wt) + " ");
   TermQuery tq = new TermQuery( new Term("contents", synonyms[j])); tq.setBoost(wt);
   theQuery.add(tq, BooleanClause.Occur.SHOULD);
 }
 */
 wt = WT_TRANSFORM;
 Matcher matcher = whatPattern.matcher(question);
 if ( (matcher.matches()) && (nouns.size() > 0) )
 { String qTransform = "\"" + nouns.get(0) + "_is" + "\"";
   queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
   TermQuery tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
   theQuery.add(tq, BooleanClause.Occur.SHOULD);
   qTransform = "\"" + nouns.get(0) + "_was" + "\"";
   queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
   tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
   theQuery.add(tq, BooleanClause.Occur.SHOULD);
 }
 matcher = wherePattern.matcher(question);
 if ( (matcher.matches()) && (nouns.size() > 0) )
 { //*-- fixed: the opening quote was missing ("is_located" + "\"" produced the
   //*-- malformed phrase is_located"), matching the "located_at" transform below
   String qTransform = "\"is_located\"";
   queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
   TermQuery tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
   theQuery.add(tq, BooleanClause.Occur.SHOULD);
   qTransform = "\"located_at\"";
   queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
   tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
   theQuery.add(tq, BooleanClause.Occur.SHOULD);
 }
 //*-- prefer the parsed query string; fall back to the hand-built boolean query
 analyzer.setExtractEntities(false);
 QueryParser qp = new QueryParser("contents", analyzer);
 try { return(qp.parse(queryString.toString()) ); }
 catch(ParseException pe)
 { //*-- fixed: the parse failure was silently swallowed, hiding malformed query strings
   logger.error("Could not parse generated query string, using boolean query: " + pe.getMessage());
 }
 return(theQuery);
}
/**
 * Parse the question using a POS tagger. Wraps the tagger's output in a root
 * element to form a valid XML document, then runs this class's SAX handlers
 * over it to extract the nouns, verbs, adjectives, and question words.
 * @param question natural language question
 */
public void parseQuestion(String question)
{
 elementBuffer = new StringBuffer();
 //*-- build the tagged question as a single XML document string
 String taggedXml = "<?xml version='1.0' encoding='utf-8'?> <Question>"
                  + posTagger.getPOS(question)
                  + "</Question>";
 try { parser.parse(new java.io.ByteArrayInputStream(taggedXml.getBytes("UTF-8")), this); }
 catch (UnsupportedEncodingException ue) { logger.error("Encoding error " + ue.getMessage()); }
 catch (IOException ie) { logger.error("IO Error in parsing tagged question " + ie.getMessage()); }
 catch (SAXException se) { logger.error("Failed to parse question " + se.getMessage()); }
}
//*----------------------------------------------
//*-- Start XML parser overriden methods
//*----------------------------------------------
/** Reset the per-question word lists before a new tagged question is parsed. */
public void startDocument()
{
 adjectives = new ArrayList<String>(); //*-- adjectives in the question
 nouns      = new ArrayList<String>(); //*-- nouns in the question
 verbs      = new ArrayList<String>(); //*-- verbs in the question
 bigrams    = new ArrayList<String>(); //*-- bigrams in the question
 trigrams   = new ArrayList<String>(); //*-- trigrams in the question
 qwords     = new ArrayList<String>(); //*-- question words
}
/** Clear the element buffer so characters() captures only the current element's text. */
public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException
{ elementBuffer.setLength(0); }
/**
 * Capture element text. Undefined, non-space characters are scrubbed to spaces
 * in place before the span is appended to the element buffer.
 */
public void characters(char[] text, int start, int length)
{
 int end = start + length;
 for (int i = start; i < end; i++)
 { if ( !Character.isDefined(text[i]) && !Character.isSpaceChar(text[i]) ) text[i] = ' '; }
 elementBuffer.append(text, start, length);
}
//*-- Build the qwords, nouns, adjectives, and verbs lists
/** Route the buffered element text into the word list matching the closing tag. */
public void endElement(String uri, String localName, String qName)
{
 String tokenText = elementBuffer.toString().trim().toLowerCase(Constants.locale);
 if      (qName.equalsIgnoreCase("Qword"))     qwords.add(tokenText);
 else if (qName.equalsIgnoreCase("Noun"))      nouns.add(tokenText);
 else if (qName.equalsIgnoreCase("Adjective")) adjectives.add(tokenText);
 else if (qName.equalsIgnoreCase("Verb"))      verbs.add(tokenText);
}
//*-----------------------------------------
//*-- End XML Parser overriden methods
//*-----------------------------------------
/** @return the nouns extracted from the last parsed question */
public String[] getNouns()
{ //*-- idiomatic list-to-array conversion replaces the manual copy loop
  return nouns.toArray(new String[nouns.size()]);
}
/** @return the verbs extracted from the last parsed question */
public String[] getVerbs()
{ //*-- idiomatic list-to-array conversion replaces the manual copy loop
  return verbs.toArray(new String[verbs.size()]);
}
/** @return the adjectives extracted from the last parsed question */
public String[] getAdjectives()
{ //*-- idiomatic list-to-array conversion replaces the manual copy loop
  return adjectives.toArray(new String[adjectives.size()]);
}
/** @return the entities found while building the last query (see buildQuery) */
public String[] getEntities()
{ //*-- idiomatic list-to-array conversion replaces the manual copy loop
  return entities.toArray(new String[entities.size()]);
}
/** @return the bigrams generated while building the last query */
public String[] getBigrams()
{ //*-- idiomatic list-to-array conversion replaces the manual copy loop
  return bigrams.toArray(new String[bigrams.size()]);
}
/** @return the trigrams for the last parsed question (list is currently never populated) */
public String[] getTrigrams()
{ //*-- idiomatic list-to-array conversion replaces the manual copy loop
  return trigrams.toArray(new String[trigrams.size()]);
}
/** @return the formatted result strings accumulated by the SearchQuery helper */
public String[] getResults()
{ return sq.getResults(); }
//*-- Static accessors for the query-component weights. NOTE: the setters mutate
//*-- class-level state and therefore affect every SearchQuestion instance.
public static float getWT_BIGRAM()
{ return WT_BIGRAM; }
public static void setWT_BIGRAM(float wt_bigram)
{ WT_BIGRAM = wt_bigram; }
public static float getWT_ENTITY()
{ return WT_ENTITY; }
public static void setWT_ENTITY(float wt_entity)
{ WT_ENTITY = wt_entity; }
public static float getWT_QTYPE()
{ return WT_QTYPE; }
public static void setWT_QTYPE(float wt_qtype)
{ WT_QTYPE = wt_qtype; }
public static float getWT_SYNONYMS()
{ return WT_SYNONYMS; }
public static void setWT_SYNONYMS(float wt_synonyms)
{ WT_SYNONYMS = wt_synonyms; }
public static float getWT_TRANSFORM()
{ return WT_TRANSFORM; }
public static void setWT_TRANSFORM(float wt_transform)
{ WT_TRANSFORM = wt_transform; }
public static float getWT_UNIGRAM()
{ return WT_UNIGRAM; }
public static void setWT_UNIGRAM(float wt_unigram)
{ WT_UNIGRAM = wt_unigram; }
}