// Source listing: org.sf.mustru.search.SearchQuestion

package org.sf.mustru.search;

import java.io.IOException;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TermQuery;

import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.utils.Constants;
import org.sf.mustru.utils.DbTools;
import org.sf.mustru.utils.LingpipeTools;
import org.sf.mustru.utils.StandardBgramAnalyzer;
//import org.sf.mustru.utils.WordnetTools;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import com.sleepycat.je.DatabaseEntry;

import org.apache.log4j.PropertyConfigurator;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.Searcher;

/**
* Submit natural language questions to the search engine after conversion to a query.
* Return a list of hits
*/

public class SearchQuestion extends DefaultHandler
{
static Logger logger = Logger.getLogger(SearchQuestion.class.getName() );
private Query query = null;      //*-- a lucene query
private Searcher is = null;      //*-- lucene index searcher
private LingpipeTools posTagger;    //*-- tools to tag the question with POS and entity type
private Set stopWords;      //*-- list of stop words
private Set questionWords;      //*-- list of question words

private ArrayList<String> adjectives = null;   //*-- list of adjectives in the question
private ArrayList<String> nouns = null//*-- list of nouns in the question
private ArrayList<String> verbs = null//*-- list of verbs in the question
private ArrayList<String> bigrams = null//*-- list of bigrams in the question
private ArrayList<String> trigrams = null//*-- list of trigrams in the question
private ArrayList<String> entities = null//*-- list of entities in the question
private ArrayList<String> qwords = null//*-- list of question words
public final static Pattern whatPattern =  Pattern.compile("^(.*?)(what is|what do|what's)(.*$)", Pattern.CASE_INSENSITIVE);
public final static Pattern wherePattern =  Pattern.compile("^(.*?)(where is|where|where's)(.*$)", Pattern.CASE_INSENSITIVE);

private SAXParser parser;      //*-- parser for the XML tagged question
private StringBuffer elementBuffer;     //*-- string buffer to capture tagged tokens
private SearchQuery sq;      //*-- object to dump hits from a query result
private HashMap<String, String> qhash = Constants.getQtypeEntXref()//*-- question type entity xref
//*-- Weights for generating questions
private static float WT_ENTITY = (float) 1.0;    //*-- Weight for an entity type in the question
private static float WT_QTYPE = (float) 2.0;    //*-- Weight for the question type entity
private static float WT_UNIGRAM = (float) 16.0//*-- Weight for the unigram words
private static float WT_BIGRAM = (float) 128.0//*-- Weight for the bigram words
private static float WT_SYNONYMS= (float) 0.0//*-- Weight for question synonyms
private static float WT_TRANSFORM = (float) 1.0//*-- Weight for the question transformations

public SearchQuestion()
{ PropertyConfigurator.configure (Constants.LOG4J_FILE);
   //*-- create the index searcher
   try { is = SearchTools.getSearcher(Constants.getINDEXDIR(), false); }
   catch (IOException ie) { logger.error("IO Error in opening index"); }
  
   //*-- set the db handler, part of speech tagger
   //*-- modified the Lucene source code to prevent length normalization
   float[] decoder = new float[256]; for (int i = 0; i < decoder.length; i++) decoder[i] = (float) 1.0;
   Similarity.setNormDecoder(decoder);
   is.setSimilarity(new SearchSimilarity());
  
   String[] addEntities = {"Qword"};
   posTagger = new LingpipeTools(); posTagger.setforPOS(addEntities);
   sq = new SearchQuery();
   elementBuffer = new StringBuffer();
  
   stopWords = StopFilter.makeStopSet(Constants.getSTOP_WORDS());
   questionWords = StopFilter.makeStopSet(Constants.getQwords());
  
   try { SAXParserFactory spf = SAXParserFactory.newInstance(); parser = spf.newSAXParser(); }
   catch (ParserConfigurationException pe)
   { logger.error("Cannot parse XML document Parse Config. Error" + pe.getMessage() ); }
   catch (SAXException se)
   { logger.error("Cannot parse XML document SAX Error: " + se.getMessage() ); }

}

/**
  * Fetch the hits for the question. Translate the question into a search engine query
  * and submit to Lucene.
  * @param question A natural language question
  * @return Hits A hits object
  */
public Hits getHits(String question)
{
  //*-- translate the question into a search engine query
  Hits hits = null;
  try { query = buildQuery(question);
        logger.info("Question: " + question + " is parsed to " + query);
        hits = is.search(query);
       }
  catch (IOException ie) { logger.error("IO Error in fetching hits for query " + question); hits = null; }
  return(hits);
}

/**
  * Dump the results of the query into a string array
  * @param hits Hits object containing the list of hits for the question
  * @param explain Flag to show explanation of scores
  */
public void dumpHits(Hits hits, boolean explain
{ sq.dumpHits(hits, explain); }
/**
  * Accept a question and return the explanation for the top n hits
  * @param question Question string
  * @return String explanation
  */
public String explainAnswer(String question)
{ int[] ranks = {0}; return explainAnswer(question, ranks ); }
public String explainAnswer(String question, int[] ranks)
{
  StringBuffer retv = new StringBuffer();
  if (ranks == null) return ("");
 
  try
  {
   //*-- submit the question to the search engine and fetch the hits
   Hits hits = getHits(question);
   if (hits == null) throw new IOException("Could not find any hits for question " + question);
  
   //*-- build the list of answers
   DbTools dbt = Constants.getDbt();
   dbt.openDB(Constants.EXT_FILES_DB, true, false); //*-- read only access
  
   Explanation explanation;
   LOOP: for (int i = 0; i < hits.length(); i++)
   {
     //*-- limit explanations for the top 100 hits
     if (i > 100) break LOOP; boolean foundHit = false;
   
     //*-- check if the hit rank matches the passed rank
     for (int j = 0; j < ranks.length; j++) if (ranks[j] == i) foundHit = true;
     if (!foundHit) continue LOOP;
    
     retv.append("Document: " + i + Constants.NEWLINE);
     explanation = is.explain(query, hits.id(i))
     Document doc = hits.doc(i);
     String key = doc.get("key");
     DatabaseEntry data = new DatabaseEntry();
     if (!dbt.fetch(key, data)) continue LOOP;

     //*-- extract the text
     IndexableDoc idoc = new IndexableDoc();
     idoc = (IndexableDoc) idoc.getBdbBinding().entryToObject(data);
     String line= idoc.getContents().toString();
     if (line.length() > 1000) line = line.substring(0, 999);
 
     retv.append(" Score: " + hits.score(i) + " TEXT: " + line + Constants.NEWLINE);
     retv.append(explanation.toString());
     retv.append("------------------------------------------------------------------");
     retv.append(Constants.NEWLINE); retv.append(Constants.NEWLINE);
   }
  } //*-- end of try

  catch (IOException ie)
  { logger.error("IO Error " + ie.getMessage()); }
 
  return(retv.toString());
}
/**
  * Accept a natural language question and return a search engine query
  * @param question
  * @return search engine query
  */
public Query buildQuery(String question) throws IOException
{

  //*-- extract a list of tokens from the passed text
  if (question == null) return null;
  question = question.replaceAll("[^a-zA-Z0-9]", " ");
  question = question.trim();

  //*-- initialize the type lists for the question and tag the question
  //*-- use the SAX Parser to parse the tagged output and build the type lists
  //*-- parse the question and build the query in parts
  parseQuestion(question);

  //*-- get the list of synonyms for the first noun, adjective, and verb
/*  StringBuffer synBuffer = new StringBuffer();
  synBuffer.append( (nouns.size() > 0) ? wnetTools.getSynonyms(nouns.get(0), "n"):""); synBuffer.append(" ");
  synBuffer.append( (verbs.size() > 0) ? wnetTools.getSynonyms(verbs.get(0), "v"):""); synBuffer.append(" ");
  synBuffer.append( (adjectives.size() > 0) ? wnetTools.getSynonyms(adjectives.get(0), "a"):"");
  String[] synonyms = synBuffer.toString().trim().split(" ");
*/
  //*-- tokenize the question
  StandardBgramAnalyzer analyzer = new StandardBgramAnalyzer(); analyzer.setExtractEntities(true);
  TokenStream stream = analyzer.tokenStream("contents", new StringReader(question));
  ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
  entities = new ArrayList<String>();    //*-- list of entities in the question
  while ( (token = stream.next()) != null)
   { tokenList.add(token); if (token.type().equals("<ENTITY>")) entities.add(token.termText()); }
  //*-------------------------------------------------------------------
  //*-- build the query with the five components
  //*--
  //*-- 1. First identify the entity types for the query
  //*-------------------------------------------------------------------
  StringBuffer queryString = new StringBuffer();
  NumberFormat nf = NumberFormat.getInstance();
  nf.setMaximumIntegerDigits(3); nf.setMaximumFractionDigits(4);
  float wt = WT_QTYPE;      //*--- Weight for question type entities
  BooleanQuery theQuery = new BooleanQuery();
  LOOP: for (int i = 0; i < tokenList.size(); i++)
  {
   //*-- first try two word query tokens and then single word tokens
   String etype = null;
   if (i > 0) etype = qhash.get( tokenList.get(i - 1).termText() + " " + tokenList.get(i).termText() );
   if ( (etype == null) || (etype.length() < 2)) etype = qhash.get( tokenList.get(i).termText() );
    
   if ( (etype != null) && (etype.length() > 2) )
    { String[] etypes = etype.split("OR");
      for (int j = 0; j < etypes.length; j++)
      { queryString.append("contents:" + etypes[j].trim() + "^" + nf.format(wt) + " ");
        TermQuery tq = new TermQuery( new Term("contents", etypes[j])); tq.setBoost(wt);
        theQuery.add(tq, BooleanClause.Occur.SHOULD);
        entities.add(etypes[j]);
      }
     break LOOP;
    }
   }
  
  //*-------------------------------------------
  //*-- 2. Find entities in the question words
  //*-------------------------------------------
  wt = WT_ENTITY;
  for (int i = 0; i < tokenList.size(); i++)
  { if ( tokenList.get(i).type().equals("ENTITY") )  
    { String qword = tokenList.get(i).termText();
      queryString.append("contents:" + qword + "^" + nf.format(wt) + " ");
      TermQuery tq = new TermQuery( new Term("contents", qword)); tq.setBoost(wt);
      theQuery.add(tq, BooleanClause.Occur.SHOULD);
    }
  }
 
  //*-------------------------------------------------------------------------------
  //*-- 3. Create a list of weighted trigrams/bigrams/unigrams from the query
  //*-------------------------------------------------------------------------------
  int numNouns = nouns.size(); int numVerbs = verbs.size(); int numAdjectives = adjectives.size();
  String[] queryWords = question.split("\\s+"); int wordsLength = queryWords.length;
  boolean[] contentWord = new boolean[wordsLength];
  for (int i = 0; i < wordsLength; i++)
   { queryWords[i] = queryWords[i].toLowerCase(Constants.locale);
     contentWord[i] = false;
     for (int j = 0; j < nouns.size(); j++) if (queryWords[i].equalsIgnoreCase(nouns.get(j))) contentWord[i] = true;
     for (int j = 0; j < verbs.size(); j++) if (queryWords[i].equalsIgnoreCase(verbs.get(j))) contentWord[i] = true;
     for (int j = 0; j < adjectives.size(); j++) if (queryWords[i].equalsIgnoreCase(adjectives.get(j))) contentWord[i] = true;
   }
 
  String joinChar; 
  //*-- generate all possible bigrams with higher weights for bigrams that do not have stopwords
  float WT_NORM_BIGRAM = WT_BIGRAM;
  for (int i = 1; i < 4; i++) if (wordsLength > (Math.pow(2, (i + 1)))) WT_NORM_BIGRAM /= 2;
  LOOP2: for (int i = 1; i < wordsLength; i++)
  { 
   //*-- skip if the previous word was a question word
   //*-- if the previous word was a stop word use a underscore to build the bigram, otherwise use a space
   wt = 0;
   if ( !questionWords.contains(queryWords[i-1]) )
   {
     if (stopWords.contains(queryWords[i-1]) && stopWords.contains(queryWords[i])) continue LOOP2;
     joinChar = (stopWords.contains(queryWords[i-1]) || stopWords.contains(queryWords[i])) ? "_": " ";
     for (int j = i-1; j < i+1; j++) wt += (contentWord[j]) ? WT_NORM_BIGRAM: 0;
     String bigram = queryWords[i-1] + joinChar + queryWords[i];
     queryString.append("contents:\"" + bigram + "\"~0^" + wt + " ");
     PhraseQuery pq = new PhraseQuery(); pq.add( new Term("contents", bigram)); pq.setBoost(wt); pq.setSlop(0);
     theQuery.add(pq, BooleanClause.Occur.SHOULD);
     bigrams.add(bigram);
   }
  } //*-- end of for
 
  //*-- create unigrams from non-stop words and weigh unigrams near the start of the question
  //*-- higher than unigrams near the end of the question
  LOOP3: for (int i = 0; i < wordsLength; i++)
  { wt = WT_UNIGRAM;
 
    //*-- skip punctuation and very short words
    if ( (queryWords[i].length() < 2|| (!contentWord[i]) ) continue LOOP3;
   
    wt *=  ( (numNouns > 0) && (nouns.get(0).equalsIgnoreCase(queryWords[i])) ) ? 8:
           ( (numNouns > 1) && (nouns.get(1).equalsIgnoreCase(queryWords[i])) ) ? 4: 1;
    wt *=  ( (numVerbs > 0) && (verbs.get(0).equalsIgnoreCase(queryWords[i])) ) ? 4:
           ( (numVerbs > 1) && (verbs.get(1).equalsIgnoreCase(queryWords[i])) ) ? 2: 1;
    wt *=  ( (numAdjectives > 0) && (adjectives.get(0).equalsIgnoreCase(queryWords[i])) ) ? 4:
           ( (numAdjectives > 1) && (adjectives.get(1).equalsIgnoreCase(queryWords[i])) ) ? 2: 1;
  
   queryString.append("contents:" + queryWords[i] + "^" + nf.format(wt) + " ");
   TermQuery tq = new TermQuery( new Term("contents", queryWords[i])); tq.setBoost(wt);
   theQuery.add(tq, BooleanClause.Occur.SHOULD);
  } //*-- end of for

  //*--------------------------------------------------------------------------
  //*-- 4. Add the query transformation for the part. query type and add the synonyms
  //*--------------------------------------------------------------------------
/*  wt = WT_SYNONYMS;
  for (int j = 0; j < synonyms.length; j++)
  { queryString.append("contents:" + synonyms[j] + "^" + nf.format(wt) + " ");
    TermQuery tq = new TermQuery( new Term("contents", synonyms[j])); tq.setBoost(wt);
    theQuery.add(tq, BooleanClause.Occur.SHOULD);
  }
  */
  wt = WT_TRANSFORM;
  Matcher matcher = whatPattern.matcher(question);
  if ( (matcher.matches()) && (nouns.size() > 0) )
  {  String qTransform = "\"" + nouns.get(0) + "_is" + "\"";
     queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
     TermQuery tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
     qTransform = "\"" + nouns.get(0) + "_was" + "\"";
     queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
     tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
  }
 
  matcher = wherePattern.matcher(question);
  if ( (matcher.matches()) && (nouns.size() > 0) )
  {  String qTransform = "is_located" + "\"";
     queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
     TermQuery tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
     qTransform = "\"located_at\"";
     queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
     tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
  }
 
//  String query = queryString.toString();
//System.out.println("query string " + query);
//System.out.println("gen q: " + theQuery);

  analyzer.setExtractEntities(false);
  QueryParser qp = new QueryParser("contents", analyzer)
  try { return(qp.parse(queryString.toString()) ); }
  catch(ParseException pe) { }
 
  return(theQuery);
}

/**
  * Parse the question using a POS tagger. Extract the nouns, verbs, and adjectives.
  * Extract the question hypernyms and question transform
  * @param question
  */
public void parseQuestion(String question)
{
  elementBuffer = new StringBuffer();

  StringBuffer xmlOutput = new StringBuffer("<?xml version='1.0' encoding='utf-8'?> <Question>");
  xmlOutput.append(posTagger.getPOS(question));
  xmlOutput.append("</Question>");

  try { parser.parse(new java.io.ByteArrayInputStream(xmlOutput.toString().getBytes("UTF-8")), this); }
  catch (UnsupportedEncodingException ue) { logger.error("Encoding error " + ue.getMessage()); }
  catch (IOException ie) { logger.error("IO Error in parsing tagged question " + ie.getMessage()); }
  catch (SAXException se) { logger.error("Failed to parse question " + se.getMessage()); }
}

//*----------------------------------------------
//*-- Start XML parser overriden methods
//*----------------------------------------------
public void startDocument()
{ adjectives = new ArrayList<String>();   //*-- list of adjectives in the question
   nouns = new ArrayList<String>();    //*-- list of nouns in the question
   verbs = new ArrayList<String>();    //*-- list of verbs in the question
   bigrams = new ArrayList<String>();    //*-- list of bigrams in the question
   trigrams = new ArrayList<String>();    //*-- list of trigram in the question
   qwords = new ArrayList<String>();    //*-- list of question words
}

public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException
{ elementBuffer.setLength(0)}

public void characters(char[] text, int start, int length)
{  
  for (int i = start; i < (start + length) ; i++)
   if ( !Character.isDefined(text[i])  && !Character.isSpaceChar(text[i]) ) text[i] = ' ';
  elementBuffer.append(text, start, length);
}

//*-- Build the qwords, nouns, adjectives, and verbs lists
public void endElement(String uri, String localName, String qName)
{
  String taggedText = elementBuffer.toString().trim().toLowerCase(Constants.locale);
  if (qName.equalsIgnoreCase("Qword")) qwords.add(taggedText);
  if (qName.equalsIgnoreCase("Noun"))  nouns.add(taggedText);
  if (qName.equalsIgnoreCase("Adjective")) adjectives.add(taggedText)
  if (qName.equalsIgnoreCase("Verb")) verbs.add(taggedText)
}

//*-----------------------------------------
//*-- End XML Parser overriden methods
//*-----------------------------------------

public String[] getNouns()
{ String[] rnouns = new String[nouns.size()];
   for (int i = 0; i < nouns.size(); i++) rnouns[i] = nouns.get(i);
   return(rnouns);
}

public String[] getVerbs()
{ String[] rverbs = new String[verbs.size()];
   for (int i = 0; i < verbs.size(); i++) rverbs[i] = verbs.get(i);
   return(rverbs);
}

public String[] getAdjectives()
{ String[] radjectives = new String[adjectives.size()];
   for (int i = 0; i < adjectives.size(); i++) radjectives[i] = adjectives.get(i);
   return(radjectives);
}
public String[] getEntities()
{ String[] rentities = new String[entities.size()];
   for (int i = 0; i < entities.size(); i++) rentities[i] = entities.get(i);
   return(rentities);
}

public String[] getBigrams()
{ String[] rbigrams = new String[bigrams.size()];
   for (int i = 0; i < bigrams.size(); i++) rbigrams[i] = bigrams.get(i);
   return(rbigrams);
}

public String[] getTrigrams()
{ String[] rtrigrams = new String[trigrams.size()];
   for (int i = 0; i < trigrams.size(); i++) rtrigrams[i] = trigrams.get(i);
   return(rtrigrams);
}
public String[] getResults()
{ return sq.getResults(); }

public static float getWT_BIGRAM()
{ return WT_BIGRAM; }

public static void setWT_BIGRAM(float wt_bigram)
{ WT_BIGRAM = wt_bigram; }

public static float getWT_ENTITY()
{ return WT_ENTITY; }

public static void setWT_ENTITY(float wt_entity)
{ WT_ENTITY = wt_entity; }

public static float getWT_QTYPE()
{ return WT_QTYPE; }

public static void setWT_QTYPE(float wt_qtype)
{ WT_QTYPE = wt_qtype; }

public static float getWT_SYNONYMS()
{ return WT_SYNONYMS; }

public static void setWT_SYNONYMS(float wt_synonyms)
{ WT_SYNONYMS = wt_synonyms; }

public static float getWT_TRANSFORM()
{ return WT_TRANSFORM; }

public static void setWT_TRANSFORM(float wt_transform)
{ WT_TRANSFORM = wt_transform; }

public static float getWT_UNIGRAM()
{ return WT_UNIGRAM; }

public static void setWT_UNIGRAM(float wt_unigram)
{ WT_UNIGRAM = wt_unigram; }
}
// End of org.sf.mustru.search.SearchQuestion