package org.sf.mustru.search;
import java.io.IOException;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TermQuery;
import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.utils.Constants;
import org.sf.mustru.utils.DbTools;
import org.sf.mustru.utils.LingpipeTools;
import org.sf.mustru.utils.StandardBgramAnalyzer;
//import org.sf.mustru.utils.WordnetTools;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import com.sleepycat.je.DatabaseEntry;
import org.apache.log4j.PropertyConfigurator;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.Searcher;
/**
* Submit natural language questions to the search engine after conversion to a query.
* Return a list of hits
*/
public class SearchQuestion extends DefaultHandler
{
static Logger logger = Logger.getLogger(SearchQuestion.class.getName() );
private Query query = null; //*-- a lucene query
private Searcher is = null; //*-- lucene index searcher (null if the index failed to open)
private LingpipeTools posTagger; //*-- tools to tag the question with POS and entity type
private Set stopWords; //*-- list of stop words (raw Set: pre-generics Lucene StopFilter API)
private Set questionWords; //*-- list of question words
private ArrayList<String> adjectives = null; //*-- list of adjectives in the question
private ArrayList<String> nouns = null; //*-- list of nouns in the question
private ArrayList<String> verbs = null; //*-- list of verbs in the question
private ArrayList<String> bigrams = null; //*-- list of bigrams in the question
private ArrayList<String> trigrams = null; //*-- list of trigrams in the question
private ArrayList<String> entities = null; //*-- list of entities in the question
private ArrayList<String> qwords = null; //*-- list of question words
public final static Pattern whatPattern = Pattern.compile("^(.*?)(what is|what do|what's)(.*$)", Pattern.CASE_INSENSITIVE);
public final static Pattern wherePattern = Pattern.compile("^(.*?)(where is|where|where's)(.*$)", Pattern.CASE_INSENSITIVE);
private SAXParser parser; //*-- parser for the XML tagged question
private StringBuffer elementBuffer; //*-- string buffer to capture tagged tokens
private SearchQuery sq; //*-- object to dump hits from a query result
private HashMap<String, String> qhash = Constants.getQtypeEntXref(); //*-- question type entity xref
//*-- Weights for generating questions
//*-- NOTE: these are mutable class-level settings (see the static getters/setters below),
//*-- so changing them affects every SearchQuestion instance in the JVM
private static float WT_ENTITY = (float) 1.0; //*-- Weight for an entity type in the question
private static float WT_QTYPE = (float) 2.0; //*-- Weight for the question type entity
private static float WT_UNIGRAM = (float) 16.0; //*-- Weight for the unigram words
private static float WT_BIGRAM = (float) 128.0; //*-- Weight for the bigram words
private static float WT_SYNONYMS= (float) 0.0; //*-- Weight for question synonyms
private static float WT_TRANSFORM = (float) 1.0; //*-- Weight for the question transformations
public SearchQuestion()
{ PropertyConfigurator.configure (Constants.LOG4J_FILE);
//*-- create the index searcher
try { is = SearchTools.getSearcher(Constants.getINDEXDIR(), false); }
catch (IOException ie) { logger.error("IO Error in opening index"); }
//*-- set the db handler, part of speech tagger
//*-- modified the Lucene source code to prevent length normalization
float[] decoder = new float[256]; for (int i = 0; i < decoder.length; i++) decoder[i] = (float) 1.0;
Similarity.setNormDecoder(decoder);
is.setSimilarity(new SearchSimilarity());
String[] addEntities = {"Qword"};
posTagger = new LingpipeTools(); posTagger.setforPOS(addEntities);
sq = new SearchQuery();
elementBuffer = new StringBuffer();
stopWords = StopFilter.makeStopSet(Constants.getSTOP_WORDS());
questionWords = StopFilter.makeStopSet(Constants.getQwords());
try { SAXParserFactory spf = SAXParserFactory.newInstance(); parser = spf.newSAXParser(); }
catch (ParserConfigurationException pe)
{ logger.error("Cannot parse XML document Parse Config. Error" + pe.getMessage() ); }
catch (SAXException se)
{ logger.error("Cannot parse XML document SAX Error: " + se.getMessage() ); }
}
/**
 * Fetch the hits for the question. Translate the question into a search engine query
 * and submit to Lucene.
 * @param question A natural language question
 * @return Hits A hits object, or null when the search fails or the index is not open
 */
public Hits getHits(String question)
{
 //*-- the index searcher is null when the constructor failed to open the index;
 //*-- previously this caused a NullPointerException at is.search() below
 if (is == null)
 { logger.error("Index searcher is not open, cannot run query for: " + question); return null; }
 //*-- translate the question into a search engine query
 Hits hits = null;
 try { query = buildQuery(question);
       logger.info("Question: " + question + " is parsed to " + query);
       hits = is.search(query);
     }
 catch (IOException ie) { logger.error("IO Error in fetching hits for query " + question); hits = null; }
 return(hits);
}
/**
 * Dump the results of the query; delegates to the SearchQuery helper.
 * @param hits Hits object containing the list of hits for the question
 * @param explain Flag to show explanation of scores
 */
public void dumpHits(Hits hits, boolean explain)
{
 sq.dumpHits(hits, explain);
}
/**
 * Accept a question and return the explanation for the top hit only.
 * @param question Question string
 * @return String explanation
 */
public String explainAnswer(String question)
{
 //*-- default to explaining just the top-ranked hit (rank 0)
 int[] topRank = {0};
 return explainAnswer(question, topRank);
}
/**
 * Accept a question and a list of hit ranks, and return the score explanations
 * for the hits at those ranks (limited to the top 100 hits).
 * @param question Question string
 * @param ranks 0-based ranks of the hits to explain; null returns an empty string
 * @return String explanation text for the matching hits
 */
public String explainAnswer(String question, int[] ranks)
{
 StringBuffer retv = new StringBuffer();
 if (ranks == null) return ("");
 try
 {
  //*-- submit the question to the search engine and fetch the hits
  Hits hits = getHits(question);
  if (hits == null) throw new IOException("Could not find any hits for question " + question);
  //*-- build the list of answers
  DbTools dbt = Constants.getDbt();
  dbt.openDB(Constants.EXT_FILES_DB, true, false); //*-- read only access
  //*-- NOTE(review): this database handle is never explicitly closed here — confirm DbTools manages it
  Explanation explanation;
  LOOP: for (int i = 0; i < hits.length(); i++)
  {
   //*-- limit explanations to the top 100 hits (fixed off-by-one: "i > 100" allowed 101 hits)
   if (i >= 100) break LOOP; boolean foundHit = false;
   //*-- check if the hit rank matches the passed rank
   for (int j = 0; j < ranks.length; j++) if (ranks[j] == i) foundHit = true;
   if (!foundHit) continue LOOP;
   retv.append("Document: " + i + Constants.NEWLINE);
   explanation = is.explain(query, hits.id(i));
   Document doc = hits.doc(i);
   String key = doc.get("key");
   DatabaseEntry data = new DatabaseEntry();
   if (!dbt.fetch(key, data)) continue LOOP;
   //*-- extract the text of the document from the BDB record (truncated to 1000 chars)
   IndexableDoc idoc = new IndexableDoc();
   idoc = (IndexableDoc) idoc.getBdbBinding().entryToObject(data);
   String line= idoc.getContents().toString();
   if (line.length() > 1000) line = line.substring(0, 999);
   retv.append(" Score: " + hits.score(i) + " TEXT: " + line + Constants.NEWLINE);
   retv.append(explanation.toString());
   retv.append("------------------------------------------------------------------");
   retv.append(Constants.NEWLINE); retv.append(Constants.NEWLINE);
  }
 } //*-- end of try
 catch (IOException ie)
 { logger.error("IO Error " + ie.getMessage()); }
 return(retv.toString());
}
/**
 * Accept a natural language question and return a search engine query.
 * The query is assembled in parts: (1) question-type entities, (2) entities found
 * in the question, (3) weighted bigrams and unigrams, and (4) question
 * transformations for "what is"/"where is" style questions.
 * @param question natural language question (null returns a null query)
 * @return search engine query
 * @throws IOException if tokenization of the question fails
 */
public Query buildQuery(String question) throws IOException
{
 //*-- extract a list of tokens from the passed text
 if (question == null) return null;
 question = question.replaceAll("[^a-zA-Z0-9]", " ");
 question = question.trim();
 //*-- initialize the type lists for the question and tag the question
 //*-- use the SAX Parser to parse the tagged output and build the type lists
 //*-- parse the question and build the query in parts
 parseQuestion(question);
 //*-- get the list of synonyms for the first noun, adjective, and verb
 /* StringBuffer synBuffer = new StringBuffer();
 synBuffer.append( (nouns.size() > 0) ? wnetTools.getSynonyms(nouns.get(0), "n"):""); synBuffer.append(" ");
 synBuffer.append( (verbs.size() > 0) ? wnetTools.getSynonyms(verbs.get(0), "v"):""); synBuffer.append(" ");
 synBuffer.append( (adjectives.size() > 0) ? wnetTools.getSynonyms(adjectives.get(0), "a"):"");
 String[] synonyms = synBuffer.toString().trim().split(" ");
 */
 //*-- tokenize the question
 StandardBgramAnalyzer analyzer = new StandardBgramAnalyzer(); analyzer.setExtractEntities(true);
 TokenStream stream = analyzer.tokenStream("contents", new StringReader(question));
 ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
 entities = new ArrayList<String>(); //*-- list of entities in the question
 while ( (token = stream.next()) != null)
 { tokenList.add(token); if (token.type().equals("<ENTITY>")) entities.add(token.termText()); }
 //*-------------------------------------------------------------------
 //*-- build the query with the five components
 //*--
 //*-- 1. First identify the entity types for the query
 //*-------------------------------------------------------------------
 StringBuffer queryString = new StringBuffer();
 NumberFormat nf = NumberFormat.getInstance();
 nf.setMaximumIntegerDigits(3); nf.setMaximumFractionDigits(4);
 float wt = WT_QTYPE; //*--- Weight for question type entities
 BooleanQuery theQuery = new BooleanQuery();
 LOOP: for (int i = 0; i < tokenList.size(); i++)
 {
  //*-- first try two word query tokens and then single word tokens
  String etype = null;
  if (i > 0) etype = qhash.get( tokenList.get(i - 1).termText() + " " + tokenList.get(i).termText() );
  if ( (etype == null) || (etype.length() < 2)) etype = qhash.get( tokenList.get(i).termText() );
  if ( (etype != null) && (etype.length() > 2) )
  { String[] etypes = etype.split("OR");
    for (int j = 0; j < etypes.length; j++)
    { queryString.append("contents:" + etypes[j].trim() + "^" + nf.format(wt) + " ");
      TermQuery tq = new TermQuery( new Term("contents", etypes[j])); tq.setBoost(wt);
      theQuery.add(tq, BooleanClause.Occur.SHOULD);
      entities.add(etypes[j]);
    }
    break LOOP;
  }
 }
 //*-------------------------------------------
 //*-- 2. Find entities in the question words
 //*-------------------------------------------
 wt = WT_ENTITY;
 for (int i = 0; i < tokenList.size(); i++)
 { //*-- NOTE(review): this checks type "ENTITY" while the loop above checks "<ENTITY>" —
   //*-- confirm which type string StandardBgramAnalyzer emits; one of the two may never match
   if ( tokenList.get(i).type().equals("ENTITY") )
   { String qword = tokenList.get(i).termText();
     queryString.append("contents:" + qword + "^" + nf.format(wt) + " ");
     TermQuery tq = new TermQuery( new Term("contents", qword)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
   }
 }
 //*-------------------------------------------------------------------------------
 //*-- 3. Create a list of weighted trigrams/bigrams/unigrams from the query
 //*-------------------------------------------------------------------------------
 int numNouns = nouns.size(); int numVerbs = verbs.size(); int numAdjectives = adjectives.size();
 String[] queryWords = question.split("\\s+"); int wordsLength = queryWords.length;
 boolean[] contentWord = new boolean[wordsLength];
 for (int i = 0; i < wordsLength; i++)
 { queryWords[i] = queryWords[i].toLowerCase(Constants.locale);
   contentWord[i] = false;
   for (int j = 0; j < nouns.size(); j++) if (queryWords[i].equalsIgnoreCase(nouns.get(j))) contentWord[i] = true;
   for (int j = 0; j < verbs.size(); j++) if (queryWords[i].equalsIgnoreCase(verbs.get(j))) contentWord[i] = true;
   for (int j = 0; j < adjectives.size(); j++) if (queryWords[i].equalsIgnoreCase(adjectives.get(j))) contentWord[i] = true;
 }
 String joinChar;
 //*-- generate all possible bigrams with higher weights for bigrams that do not have stopwords
 //*-- halve the bigram weight once for every doubling of the question length past 4 words
 float WT_NORM_BIGRAM = WT_BIGRAM;
 for (int i = 1; i < 4; i++) if (wordsLength > (Math.pow(2, (i + 1)))) WT_NORM_BIGRAM /= 2;
 LOOP2: for (int i = 1; i < wordsLength; i++)
 {
  //*-- skip if the previous word was a question word
  //*-- if the previous word was a stop word use a underscore to build the bigram, otherwise use a space
  wt = 0;
  if ( !questionWords.contains(queryWords[i-1]) )
  {
   if (stopWords.contains(queryWords[i-1]) && stopWords.contains(queryWords[i])) continue LOOP2;
   joinChar = (stopWords.contains(queryWords[i-1]) || stopWords.contains(queryWords[i])) ? "_": " ";
   for (int j = i-1; j < i+1; j++) wt += (contentWord[j]) ? WT_NORM_BIGRAM: 0;
   String bigram = queryWords[i-1] + joinChar + queryWords[i];
   queryString.append("contents:\"" + bigram + "\"~0^" + wt + " ");
   PhraseQuery pq = new PhraseQuery(); pq.add( new Term("contents", bigram)); pq.setBoost(wt); pq.setSlop(0);
   theQuery.add(pq, BooleanClause.Occur.SHOULD);
   bigrams.add(bigram);
  }
 } //*-- end of for
 //*-- create unigrams from non-stop words and weigh unigrams near the start of the question
 //*-- higher than unigrams near the end of the question
 LOOP3: for (int i = 0; i < wordsLength; i++)
 { wt = WT_UNIGRAM;
   //*-- skip punctuation and very short words
   if ( (queryWords[i].length() < 2) || (!contentWord[i]) ) continue LOOP3;
   wt *= ( (numNouns > 0) && (nouns.get(0).equalsIgnoreCase(queryWords[i])) ) ? 8:
         ( (numNouns > 1) && (nouns.get(1).equalsIgnoreCase(queryWords[i])) ) ? 4: 1;
   wt *= ( (numVerbs > 0) && (verbs.get(0).equalsIgnoreCase(queryWords[i])) ) ? 4:
         ( (numVerbs > 1) && (verbs.get(1).equalsIgnoreCase(queryWords[i])) ) ? 2: 1;
   wt *= ( (numAdjectives > 0) && (adjectives.get(0).equalsIgnoreCase(queryWords[i])) ) ? 4:
         ( (numAdjectives > 1) && (adjectives.get(1).equalsIgnoreCase(queryWords[i])) ) ? 2: 1;
   queryString.append("contents:" + queryWords[i] + "^" + nf.format(wt) + " ");
   TermQuery tq = new TermQuery( new Term("contents", queryWords[i])); tq.setBoost(wt);
   theQuery.add(tq, BooleanClause.Occur.SHOULD);
 } //*-- end of for
 //*--------------------------------------------------------------------------
 //*-- 4. Add the query transformation for the part. query type and add the synonyms
 //*--------------------------------------------------------------------------
 /* wt = WT_SYNONYMS;
 for (int j = 0; j < synonyms.length; j++)
 { queryString.append("contents:" + synonyms[j] + "^" + nf.format(wt) + " ");
   TermQuery tq = new TermQuery( new Term("contents", synonyms[j])); tq.setBoost(wt);
   theQuery.add(tq, BooleanClause.Occur.SHOULD);
 }
 */
 wt = WT_TRANSFORM;
 Matcher matcher = whatPattern.matcher(question);
 if ( (matcher.matches()) && (nouns.size() > 0) )
 { String qTransform = "\"" + nouns.get(0) + "_is" + "\"";
   queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
   TermQuery tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
   theQuery.add(tq, BooleanClause.Occur.SHOULD);
   qTransform = "\"" + nouns.get(0) + "_was" + "\"";
   queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
   tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
   theQuery.add(tq, BooleanClause.Occur.SHOULD);
 }
 matcher = wherePattern.matcher(question);
 if ( (matcher.matches()) && (nouns.size() > 0) )
 { //*-- fixed: the opening quote was missing ("is_located" + "\"" produced the
   //*-- malformed phrase is_located"), matching the "located_at" transform below
   String qTransform = "\"is_located\"";
   queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
   TermQuery tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
   theQuery.add(tq, BooleanClause.Occur.SHOULD);
   qTransform = "\"located_at\"";
   queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
   tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
   theQuery.add(tq, BooleanClause.Occur.SHOULD);
 }
 //*-- prefer the parsed query string; fall back to the hand-built boolean query
 analyzer.setExtractEntities(false);
 QueryParser qp = new QueryParser("contents", analyzer);
 try { return(qp.parse(queryString.toString()) ); }
 catch(ParseException pe)
 { //*-- fixed: the parse failure was silently swallowed, hiding malformed query strings
   logger.error("Could not parse generated query string, using boolean query: " + pe.getMessage());
 }
 return(theQuery);
}
/**
 * Parse the question using a POS tagger. Wraps the tagger's output in a root
 * element to form a valid XML document, then runs this class's SAX handlers
 * over it to extract the nouns, verbs, adjectives, and question words.
 * @param question natural language question
 */
public void parseQuestion(String question)
{
 elementBuffer = new StringBuffer();
 //*-- build the tagged question as a single XML document string
 String taggedXml = "<?xml version='1.0' encoding='utf-8'?> <Question>"
                  + posTagger.getPOS(question)
                  + "</Question>";
 try { parser.parse(new java.io.ByteArrayInputStream(taggedXml.getBytes("UTF-8")), this); }
 catch (UnsupportedEncodingException ue) { logger.error("Encoding error " + ue.getMessage()); }
 catch (IOException ie) { logger.error("IO Error in parsing tagged question " + ie.getMessage()); }
 catch (SAXException se) { logger.error("Failed to parse question " + se.getMessage()); }
}
//*----------------------------------------------
//*-- Start XML parser overriden methods
//*----------------------------------------------
/** Reset the per-question word lists before a new tagged question is parsed. */
public void startDocument()
{
 adjectives = new ArrayList<String>(); //*-- adjectives in the question
 nouns      = new ArrayList<String>(); //*-- nouns in the question
 verbs      = new ArrayList<String>(); //*-- verbs in the question
 bigrams    = new ArrayList<String>(); //*-- bigrams in the question
 trigrams   = new ArrayList<String>(); //*-- trigrams in the question
 qwords     = new ArrayList<String>(); //*-- question words
}
/** Clear the element buffer so characters() captures only the current element's text. */
public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException
{ elementBuffer.setLength(0); }
/**
 * Capture element text. Undefined, non-space characters are scrubbed to spaces
 * in place before the span is appended to the element buffer.
 */
public void characters(char[] text, int start, int length)
{
 int end = start + length;
 for (int i = start; i < end; i++)
 { if ( !Character.isDefined(text[i]) && !Character.isSpaceChar(text[i]) ) text[i] = ' '; }
 elementBuffer.append(text, start, length);
}
//*-- Build the qwords, nouns, adjectives, and verbs lists
/** Route the buffered element text into the word list matching the closing tag. */
public void endElement(String uri, String localName, String qName)
{
 String tokenText = elementBuffer.toString().trim().toLowerCase(Constants.locale);
 if      (qName.equalsIgnoreCase("Qword"))     qwords.add(tokenText);
 else if (qName.equalsIgnoreCase("Noun"))      nouns.add(tokenText);
 else if (qName.equalsIgnoreCase("Adjective")) adjectives.add(tokenText);
 else if (qName.equalsIgnoreCase("Verb"))      verbs.add(tokenText);
}
//*-----------------------------------------
//*-- End XML Parser overriden methods
//*-----------------------------------------
/** @return the nouns extracted from the last parsed question */
public String[] getNouns()
{ //*-- idiomatic list-to-array conversion replaces the manual copy loop
  return nouns.toArray(new String[nouns.size()]);
}
/** @return the verbs extracted from the last parsed question */
public String[] getVerbs()
{ //*-- idiomatic list-to-array conversion replaces the manual copy loop
  return verbs.toArray(new String[verbs.size()]);
}
/** @return the adjectives extracted from the last parsed question */
public String[] getAdjectives()
{ //*-- idiomatic list-to-array conversion replaces the manual copy loop
  return adjectives.toArray(new String[adjectives.size()]);
}
/** @return the entities found while building the last query (see buildQuery) */
public String[] getEntities()
{ //*-- idiomatic list-to-array conversion replaces the manual copy loop
  return entities.toArray(new String[entities.size()]);
}
/** @return the bigrams generated while building the last query */
public String[] getBigrams()
{ //*-- idiomatic list-to-array conversion replaces the manual copy loop
  return bigrams.toArray(new String[bigrams.size()]);
}
/** @return the trigrams for the last parsed question (list is currently never populated) */
public String[] getTrigrams()
{ //*-- idiomatic list-to-array conversion replaces the manual copy loop
  return trigrams.toArray(new String[trigrams.size()]);
}
/** @return the formatted result strings accumulated by the SearchQuery helper */
public String[] getResults()
{ return sq.getResults(); }
//*-- Static accessors for the query-component weights. NOTE: the setters mutate
//*-- class-level state and therefore affect every SearchQuestion instance.
public static float getWT_BIGRAM()
{ return WT_BIGRAM; }
public static void setWT_BIGRAM(float wt_bigram)
{ WT_BIGRAM = wt_bigram; }
public static float getWT_ENTITY()
{ return WT_ENTITY; }
public static void setWT_ENTITY(float wt_entity)
{ WT_ENTITY = wt_entity; }
public static float getWT_QTYPE()
{ return WT_QTYPE; }
public static void setWT_QTYPE(float wt_qtype)
{ WT_QTYPE = wt_qtype; }
public static float getWT_SYNONYMS()
{ return WT_SYNONYMS; }
public static void setWT_SYNONYMS(float wt_synonyms)
{ WT_SYNONYMS = wt_synonyms; }
public static float getWT_TRANSFORM()
{ return WT_TRANSFORM; }
public static void setWT_TRANSFORM(float wt_transform)
{ WT_TRANSFORM = wt_transform; }
public static float getWT_UNIGRAM()
{ return WT_UNIGRAM; }
public static void setWT_UNIGRAM(float wt_unigram)
{ WT_UNIGRAM = wt_unigram; }
}