package org.sf.mustruweb;
import java.beans.IntrospectionException;
import java.beans.Introspector;
import java.beans.PropertyDescriptor;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.text.DateFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.servlet.RequestDispatcher;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.sf.mustru.crawl.CrawlConfig;
import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.search.SearchQuery;
import org.sf.mustru.search.SearchQuestion;
import org.sf.mustru.search.SearchTools;
import org.sf.mustru.utils.Constants;
import org.sf.mustru.utils.DbTools;
import org.sf.mustru.utils.StringTools;
import com.sleepycat.je.DatabaseEntry;
/**
 * Servlet that accepts search parameters from the client and generates a response.
 * A request of type ASK is answered with the question search function and rendered
 * by answers.jsp; every other type (SEARCH, EXPAND, SIMILAR) produces a document
 * hit list rendered by documents.jsp.
 *
 * Request parameters read by this servlet:
 *   q           - the query or question text (blank or missing forwards to index.html)
 *   type        - SEARCH, ASK, EXPAND or SIMILAR (a missing type is treated as SEARCH)
 *   dockey      - key of the reference document for a SIMILAR request
 *   pageInc     - number of hits per page (default 10)
 *   startPage   - start page for the pager (default 0)
 *   currentPage - zero based page to display (default 0)
 *
 * A servlet instance is shared by all request threads, so per-request values (the
 * query text, the noun highlight pattern) are passed as parameters and never stored
 * in instance fields.
 * NOTE(review): sQuery, sQuestion and dbt are still shared and are read after
 * getHits() / openDB() (getAlt(), getNouns(), fetch()), so they hold state between
 * calls; confirm whether those classes are safe under concurrent requests.
 */
public final class Search extends HttpServlet
{
    static final long serialVersionUID = 7225456138146845275L;

    //*-- supported values of the "type" request parameter
    private static final String TYPE_SEARCH  = "SEARCH";
    private static final String TYPE_ASK     = "ASK";
    private static final String TYPE_EXPAND  = "EXPAND";
    private static final String TYPE_SIMILAR = "SIMILAR";

    private static final int DEFAULT_PAGE_INC   = 10;    //*-- hits per page when pageInc is absent or invalid
    private static final int MAX_SNIPPET_SOURCE = 5000;  //*-- max. chars of a document scanned for a snippet
    private static final int MAX_PASSAGES       = 2;     //*-- max. passages (answers) taken from one document
    private static final int MIN_TERM_LENGTH    = 3;     //*-- shorter terms are not highlighted

    private static final String HLIGHT_OPEN  = "<span class=\"hlight\">";
    private static final String HLIGHT_CLOSE = "</span>";

    //*-- strips a trailing "_<digits>" suffix from a document key to get the file name
    private static final Pattern trimPattern = Pattern.compile("(_\\d+?)$");

    private SearchQuery sQuery;                //*-- Submit a query and get the list of hits
    private SearchQuestion sQuestion;          //*-- Submit a question and get the list of hits
    private Searcher is;                       //*-- Index searcher
    private DbTools dbt;                       //*-- Berkeley DB handle
    private Logger logger;                     //*-- Log4j logger
    Properties dprops = null;                  //*-- document properties file (doc type -> doc class name)

    /**
     * Load some initial data, set the index/database directories, open the
     * Berkeley DB environment read only and open the index searcher.
     * Failures to open the searcher or to read the document types file are
     * logged; the servlet still starts and degrades at request time.
     */
    public void init()
    {
        PropertyConfigurator.configure(Constants.LOG4J_FILE);
        logger = Logger.getLogger(Search.class.getName());

        //*-- set the index and database directories from the properties file
        CrawlConfig crawlConfig = new CrawlConfig(false);
        Constants.setDBDIR(crawlConfig.getDbDir());
        Constants.setINDEXDIR(crawlConfig.getIndexDir());

        //*-- Create the Berkeley DB environment for read only access
        dbt = new DbTools();
        dbt.openEnv(Constants.getDBDIR(), true);
        Constants.setDbt(dbt);

        try { is = SearchTools.getSearcher(Constants.getINDEXDIR(), false); }
        catch (IOException ie) { logger.error("IO Error " + ie.getMessage(), ie); }

        sQuestion = new SearchQuestion();
        sQuery = new SearchQuery();

        //*-- load the document types; the stream is always closed, even if load() fails
        dprops = new Properties();
        FileInputStream in = null;
        try
        {
            in = new FileInputStream(Constants.DOCTYPES_FILE);
            dprops.load(in);
        }
        catch (IOException e)
        { logger.error("Could not read " + Constants.DOCTYPES_FILE + " " + e.getMessage(), e); }
        finally
        {
            if (in != null)
            {
                try { in.close(); }
                catch (IOException ignored) { /* nothing useful can be done if close fails */ }
            }
        }
    }

    /**
     * Respond to a GET request from client.
     * I/O errors while searching, building the beans or forwarding are logged
     * and not propagated.
     *
     * @param request  Received from client
     * @param response Response to send to the client
     *
     * @exception ServletException for servlet error
     */
    public void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException
    {
        //*-- extract the query / question string; kept local because the servlet instance is shared
        String query = request.getParameter("q");

        //*-- redirect blank or missing queries to initial page
        if ( (query == null) || (query.trim().length() == 0) )
        {
            forward(request, response, "index.html", "IO Error: Could not forward to index.html: ");
            return;
        }

        //*-- a missing type is handled as a plain search instead of failing with a NullPointerException
        String requestType = request.getParameter("type");
        if (requestType == null) requestType = TYPE_SEARCH;
        boolean isQuestion = TYPE_ASK.equals(requestType);

        //*-- Fetch the hit list depending on the type of request - query, question, query expansion, or a similar document
        Hits hits = null;
        try { hits = fetchHits(request, query, requestType); }
        catch (IOException ie) { logger.error("IO Error: Could not fetch hits: " + ie.getMessage(), ie); }

        //*-- create the common bean containing the number of hits and paging information
        CommonBean cbean = createCbean(request, hits, query, requestType);
        request.setAttribute("cbean", cbean);

        //*-- generate the document or answer bean for queries and questions respectively
        try
        {
            if ( isQuestion && (sQuestion != null) )
            { request.setAttribute("abean", createAbean(hits, cbean)); }
            else
            { request.setAttribute("dbean", createDbean(hits, cbean, query)); }
        }
        catch (IOException ie) { logger.error("IO Error: Could not create doc/answer bean: " + ie.getMessage(), ie); }

        //*-- pass control to the appropriate JSP
        String address = isQuestion ? "/WEB-INF/answers.jsp" : "/WEB-INF/documents.jsp";
        forward(request, response, address, "IO Error: Could not forward request: ");
    }

    //*------------------------------------------------------------
    //*- Forward the request to the passed address. An I/O error is
    //*- logged with the passed message prefix and not propagated.
    //*------------------------------------------------------------
    private void forward(HttpServletRequest request, HttpServletResponse response, String address, String errorPrefix)
            throws ServletException
    {
        RequestDispatcher dispatcher = request.getRequestDispatcher(address);
        if (dispatcher == null)
        { logger.error("No request dispatcher available for " + address); return; }
        try { dispatcher.forward(request, response); }
        catch (IOException ie) { logger.error(errorPrefix + ie.getMessage(), ie); }
    }

    //*------------------------------------------------------------
    //*- Run the search that matches the request type and return the
    //*- hits, or null for an unknown type or an unavailable searcher
    //*------------------------------------------------------------
    private Hits fetchHits(HttpServletRequest request, String query, String requestType) throws IOException
    {
        if (TYPE_SEARCH.equals(requestType)) return sQuery.getHits(query);
        if (TYPE_ASK.equals(requestType)) return sQuestion.getHits(query);

        boolean expand = TYPE_EXPAND.equals(requestType);
        boolean similar = TYPE_SIMILAR.equals(requestType);
        if (!expand && !similar) return null;

        //*-- the searcher is null when it could not be opened in init()
        if (is == null)
        { logger.error("Index searcher is not available for a " + requestType + " request"); return null; }

        return expand ? is.search( SearchTools.expandQuery(query) )
                      : is.search( SearchTools.similarDocs(request.getParameter("dockey"), dbt) );
    }

    //*------------------------------------------------------------
    //*- Return the integer value of a request parameter, or the
    //*- default when the parameter is missing or not a number
    //*------------------------------------------------------------
    private int intParam(HttpServletRequest request, String name, int defaultValue)
    {
        String raw = request.getParameter(name);
        if (raw == null) return defaultValue;
        try { return Integer.parseInt(raw.trim()); }
        catch (NumberFormatException ne)
        {
            logger.warn("Ignoring non-numeric value for request parameter " + name);
            return defaultValue;
        }
    }

    //*------------------------------------------------------------
    //*- Return a common bean filled with values based on the hits object:
    //*- query text, number of hits, paging values and the request type.
    //*- Invalid paging parameters fall back to their defaults.
    //*------------------------------------------------------------
    private CommonBean createCbean(HttpServletRequest request, Hits hits, String query, String requestType)
    {
        CommonBean cbean = new CommonBean();

        //*-- build the fields for the common bean
        int hitsLen = (hits != null) ? hits.length() : 0;
        cbean.setQ(query);
        cbean.setNumHits(hitsLen);
        cbean.setAlt(sQuery.getAlt());

        //*-- set the page increment; zero or negative values would break the division below
        int pageInc = intParam(request, "pageInc", DEFAULT_PAGE_INC);
        if (pageInc <= 0) pageInc = DEFAULT_PAGE_INC;
        cbean.setPageInc(pageInc);

        //*-- set the total no. of pages (rounded up)
        int numPages = hitsLen / pageInc;
        if ( (hitsLen % pageInc) != 0 ) numPages++;
        cbean.setNumPages(numPages);

        //*-- set the start page
        int startPage = intParam(request, "startPage", 0);
        if ( (startPage < 0) || (startPage > numPages) ) startPage = 0;
        cbean.setStartPage(startPage);

        //*-- set the current page
        int currentPage = intParam(request, "currentPage", 0);
        if ( (currentPage < 0) || (currentPage > numPages) ) currentPage = 0;
        cbean.setCurrentPage(currentPage);

        //*-- set start/end index for the hit list; computed in long so a huge pageInc cannot overflow
        int start = (int) Math.min( (long) currentPage * pageInc, (long) hitsLen );
        cbean.setStart(start);
        int end = (int) Math.min( (long) start + pageInc, (long) hitsLen );
        cbean.setEnd(end);

        //*-- set the type of question / query
        cbean.setType(requestType);
        return cbean;
    }

    //*---------------------------------------------------------
    //*-- Create an answer bean containing the list of answers.
    //*-- Up to two passages are taken from every hit of the current
    //*-- page; a passage whose digest was already seen is a duplicate
    //*-- and ends the processing of that hit. The answers are null
    //*-- when there is no hit list.
    //*---------------------------------------------------------
    private AnswerBean createAbean(Hits hits, CommonBean cbean) throws IOException
    {
        AnswerBean abean = new AnswerBean();
        if (hits == null) { abean.setAnswers(null); return abean; }

        //*-- one pattern for all question nouns, local to this request
        Pattern nounPattern = buildNounPattern(sQuestion.getNouns());

        //*-- build the list of answers
        ArrayList<Answer> alist = new ArrayList<Answer>();
        ArrayList<String> digests = new ArrayList<String>();   //*-- digests of the passages already used
        int numPageHits = 0;

        dbt.openDB(Constants.EXT_FILES_DB, true, false);       //*-- read only access
        try
        {
            //*-- advance one hit at a time: stepping by the number of answers added skipped hits
            //*-- and never advanced when no answer was added
            for (int i = cbean.getStart(); i < hits.length(); i++)
            {
                //*-- use the key to fetch the matching database entry
                Document doc = hits.doc(i);
                String key = doc.get("key");
                if (key == null) continue;
                DatabaseEntry data = new DatabaseEntry();
                if (!dbt.fetch(key, data)) continue;

                //*-- extract the passage text
                IndexableDoc idoc = new IndexableDoc();
                idoc = (IndexableDoc) idoc.getBdbBinding().entryToObject(data);
                String line = idoc.getContents().toString();
                String[] passages = SearchTools.bestPassages(line, sQuestion.getNouns(), sQuestion.getVerbs(),
                        sQuestion.getAdjectives(), sQuestion.getBigrams(), sQuestion.getEntities());
                if (passages == null) continue;

                //*-- get the file name
                String fileName = trimPattern.matcher(key).replaceFirst("");

                //*-- check for dups from the top two sentences
                int added = 0;
                for (int j = 0; (j < MAX_PASSAGES) && (j < passages.length); j++)
                {
                    if (passages[j] == null) break;
                    String digest = StringTools.shaDigest(passages[j]);
                    if (digests.contains(digest)) break;      //*-- compare digests, not the passage text
                    digests.add(digest);

                    Answer answer = new Answer();
                    answer.setFileName(fileName);
                    answer.setPassage(hiliter(passages[j], nounPattern));
                    answer.setScore(hits.score(i));
                    alist.add(answer);
                    added++;
                }

                //*-- a hit counts towards the page only if it contributed an answer
                if ( (added > 0) && (++numPageHits >= cbean.getPageInc()) ) break;
            } //*-- end of for
        }
        finally { dbt.closeDB(); }                             //*-- close the database even if a hit fails

        //*-- set the answers array in the bean and return
        abean.setAnswers(alist.toArray(new Answer[alist.size()]));
        return abean;
    }

    //*---------------------------------------------------------
    //*-- Create a document bean containing the list of document hits
    //*-- for the current page. Every hit is fetched from the database
    //*-- that matches its type, converted to a Doc and given a
    //*-- highlighted snippet. The documents are null when there is
    //*-- no hit list.
    //*---------------------------------------------------------
    private DocBean createDbean(Hits hits, CommonBean cbean, String query) throws IOException
    {
        DocBean dbean = new DocBean();
        if (hits == null) { dbean.setDocuments(null); return dbean; }

        DateFormat df = DateFormat.getDateInstance(DateFormat.MEDIUM, Constants.locale);
        NumberFormat nf = NumberFormat.getInstance(Constants.locale);
        nf.setMaximumFractionDigits(1);
        nf.setMinimumFractionDigits(1);

        //*-- build the list of documents
        ArrayList<Doc> dlist = new ArrayList<Doc>();
        int numPageHits = 0;
        for (int i = cbean.getStart(); i < hits.length(); i++)
        {
            //*-- use the key to fetch the matching database entry
            Document doc = hits.doc(i);
            String key = doc.get("key");
            String ftype = doc.get("type");
            DatabaseEntry data = new DatabaseEntry();
            String dbname = "email".equalsIgnoreCase(ftype) ? Constants.EXT_MESSAGES_DB : Constants.EXT_FILES_DB;

            //*-- open the database for read only access and always close it, found or not
            boolean found;
            dbt.openDB(dbname, true, false);
            try { found = dbt.fetch(key, data); }
            finally { dbt.closeDB(); }
            if (!found) continue;

            //*-- use the type of the document to create a doc instance of the specified type
            try
            {
                String docClass = ( (ftype == null) || (dprops == null) ) ? null : dprops.getProperty(ftype);
                if ( (docClass == null) || (docClass.equals("")) ) docClass = "org.sf.mustru.docs.TextDoc";
                Class<?> docType = Class.forName(docClass);
                IndexableDoc idoc = (IndexableDoc) docType.newInstance();
                idoc = (IndexableDoc) idoc.getBdbBinding().entryToObject(data);

                //*-- set the snippet of the document for the bean
                String contents = StringTools.filterChars(idoc.getContents().toString());
                if (contents.length() > MAX_SNIPPET_SOURCE) contents = contents.substring(0, MAX_SNIPPET_SOURCE);
                idoc.setContents(new StringBuffer(snippet(query, contents)));

                //*-- copy the common properties and add the formatted display fields
                Doc document = new Doc();
                copyBean(idoc, document);
                document.setFormatFileLength( nf.format(document.getFileLength() / 1000.0) );
                document.setFormatMdate( df.format(new Date(document.getMdate())) );
                document.setScore(hits.score(i));
                dlist.add(document);
                if (++numPageHits >= cbean.getPageInc()) break;
            }
            catch (ClassNotFoundException ce)
            { logger.error("Could not get doc class : " + ce.getMessage(), ce); }
            catch (InstantiationException ie)
            { logger.error("Could not instantiate doc class: " + ie.getMessage(), ie); }
            catch (IllegalAccessException ae)
            { logger.error("Could not access class: " + ae.getMessage(), ae); }
        } //*-- end of for

        //*-- set the documents array in the bean and return
        dbean.setDocuments(dlist.toArray(new Doc[dlist.size()]));
        return dbean;
    }

    //*--------------------------------------------------
    //*-- copy values from one bean to another bean: every readable
    //*-- property of the source is written to the property of the
    //*-- same name in the target, if that property is writable.
    //*-- Copying stops at the first reflection error, which is logged.
    //*--------------------------------------------------
    private void copyBean(Object source, Object target)
    {
        PropertyDescriptor[] sourceProperties;
        PropertyDescriptor[] targetProperties;
        try
        {
            sourceProperties = Introspector.getBeanInfo(source.getClass()).getPropertyDescriptors();
            targetProperties = Introspector.getBeanInfo(target.getClass()).getPropertyDescriptors();
        }
        catch (IntrospectionException ie)
        {
            //*-- without descriptors there is nothing to copy
            logger.error("Introspection error " + ie.getMessage(), ie);
            return;
        }

        try
        {
            for (int i = 0; i < sourceProperties.length; i++)
            {
                Method read = sourceProperties[i].getReadMethod();
                if (read == null) continue;
                String name = sourceProperties[i].getName();
                for (int j = 0; j < targetProperties.length; j++)
                {
                    if (!targetProperties[j].getName().equals(name)) continue;
                    Method write = targetProperties[j].getWriteMethod();
                    if (write != null)
                    { write.invoke(target, new Object[] { read.invoke(source, (Object[]) null) }); }
                    break;  //*-- property names are unique, stop at the first match
                } //*-- end of inner for
            } //*-- end of outer for
        }
        catch (IllegalAccessException ie) { logger.error("Illegal access error " + ie.getMessage(), ie); }
        catch (InvocationTargetException ie) { logger.error("Invocation error: " + ie.getMessage(), ie); }
    }

    //*-------------------------------------------------
    //*- Return a string with query keywords highlighted,
    //*- using the default stop words of the standard analyzer
    //*-------------------------------------------------
    private String snippet(String query, String text) throws IOException
    { return snippet(query, text, null); }

    //*-------------------------------------------------
    //*- Return up to three fragments of the text with the query keywords
    //*- highlighted. If nothing can be highlighted, return the start of
    //*- the text as produced by StringTools.fillin.
    //*-------------------------------------------------
    private String snippet(String query, String text, String[] stopWords) throws IOException
    {
        //*-- first extract a list of tokens from the query string excluding stop words
        StandardAnalyzer sAnalyzer = (stopWords == null) ? new StandardAnalyzer() : new StandardAnalyzer(stopWords);
        Token[] tokens = extractTokens(sAnalyzer, query);

        //*-- next, build a new filtered query string from the list of tokens; the separator is
        //*-- added in front of each kept term so a skipped last token cannot leave a dangling OR
        StringBuffer queryString = new StringBuffer();
        for (int i = 0; i < tokens.length; i++)
        {
            String term = tokens[i].termText();
            if (term.length() < MIN_TERM_LENGTH) continue;
            if (queryString.length() > 0) queryString.append(" OR ");
            queryString.append(term);
        }

        //*-- parse the query using the standard analyzer and highlight the text
        String result = "";
        if (queryString.length() > 0)
        {
            try
            {
                QueryParser qp = new QueryParser("contents", sAnalyzer);
                Query parsed = qp.parse(queryString.toString());
                SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(HLIGHT_OPEN, HLIGHT_CLOSE);
                Highlighter highlighter = new Highlighter(formatter, new QueryScorer(parsed));
                Fragmenter fragmenter = new SimpleFragmenter(80);   //*-- use fragments of size 80
                highlighter.setTextFragmenter(fragmenter);
                TokenStream tokenStream = sAnalyzer.tokenStream("contents", new StringReader(text));
                result = highlighter.getBestFragments(tokenStream, text, 3, "...<br>"); //*-- collect upto three fragments
            }
            catch (ParseException pe) { logger.error("Query parse error " + pe.getMessage(), pe); }
        }

        //*-- if nothing was highlighted, then return the start of the original string
        return ( (result == null) || (result.length() == 0) ) ? StringTools.fillin(text, 250, true, '.', 3) : result;
    }

    //*-----------------------------------------------------------
    //*- Build one case insensitive pattern that matches any of the
    //*- passed nouns of three or more characters. The nouns are
    //*- quoted, i.e. matched literally. Returns null when no noun
    //*- qualifies.
    //*-----------------------------------------------------------
    private static Pattern buildNounPattern(String[] nouns)
    {
        if (nouns == null) return null;
        StringBuffer alternation = new StringBuffer();
        for (int i = 0; i < nouns.length; i++)
        {
            if ( (nouns[i] == null) || (nouns[i].length() < MIN_TERM_LENGTH) ) continue;
            if (alternation.length() > 0) alternation.append('|');
            alternation.append(Pattern.quote(nouns[i]));
        }
        return (alternation.length() == 0) ? null : Pattern.compile(alternation.toString(), Pattern.CASE_INSENSITIVE);
    }

    //*-----------------------------------------------------------
    //*- Highlight text using the passed noun pattern. All nouns are
    //*- replaced in a single pass, so a noun cannot match inside the
    //*- markup inserted for another noun, and the matched text ($0)
    //*- is kept so the passage keeps its original case.
    //*- NOTE(review): the passage is inserted into HTML as is; confirm
    //*- that document text is escaped or filtered before it gets here.
    //*-----------------------------------------------------------
    private static String hiliter(String passage, Pattern nounPattern)
    {
        if ( (passage == null) || (nounPattern == null) ) return passage;
        return nounPattern.matcher(passage).replaceAll(HLIGHT_OPEN + "$0" + HLIGHT_CLOSE);
    }

    //*-----------------------------------------------------------
    //*- Use the passed analyzer to get a list of tokens from the text
    //*-----------------------------------------------------------
    private static Token[] extractTokens(Analyzer analyzer, String text) throws IOException
    {
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
        ArrayList<Token> tokenList = new ArrayList<Token>();
        Token token;
        while ( (token = stream.next()) != null ) tokenList.add(token);
        return tokenList.toArray(new Token[tokenList.size()]);
    }

    /**
     * Release resources before ending the servlet: drop the search helpers and
     * close the Berkeley DB environment if it was created.
     */
    public void destroy()
    {
        sQuestion = null;
        sQuery = null;
        if (dbt != null) dbt.closeEnv();
    }
}