// Source Code of org.sf.mustruweb.Search

package org.sf.mustruweb;


import java.beans.IntrospectionException;
import java.beans.Introspector;
import java.beans.PropertyDescriptor;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.text.DateFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import javax.servlet.RequestDispatcher;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;


import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;


import org.sf.mustru.crawl.CrawlConfig;
import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.search.SearchQuery;
import org.sf.mustru.search.SearchQuestion;
import org.sf.mustru.search.SearchTools;
import org.sf.mustru.utils.Constants;
import org.sf.mustru.utils.DbTools;
import org.sf.mustru.utils.StringTools;


import com.sleepycat.je.DatabaseEntry;


/**
 * Accept search parameters from the client and generate a response.
 * If a question, then use the question search function, otherwise
 * run a standard search
 */


public final class Search extends HttpServlet 
{ 
 //*-- serialization version id for this servlet class
 static final long serialVersionUID = 7225456138146845275L;
 //*-- query / question text; assigned from the "q" request parameter in doGet and
 //*-- read later by createCbean and snippet.
 //*-- NOTE(review): this is per-request data kept in an instance field; if the container
 //*-- shares one servlet instance across request threads, concurrent requests can
 //*-- overwrite each other's value -- confirm the threading model.
 private String q;        //*-- query string
 private SearchQuery sQuery;      //*-- Submit a query and get the list of hits
 private SearchQuestion sQuestion;    //*-- Submit a question and get the list of hits
 //*-- obtained from SearchTools.getSearcher in init; used directly for EXPAND and SIMILAR requests
 private Searcher is;        //*-- Index searcher 
 //*-- environment is opened in init and closed in destroy; individual databases are
 //*-- opened and closed inside createAbean / createDbean
 private DbTools dbt;        //*-- Berkeley DB handle
 //*-- rebuilt in createAbean from sQuestion.getNouns() and consumed by hiliter
 //*-- NOTE(review): also per-request data in an instance field -- same concern as q
 private Pattern[] nounPatterns;    //*-- noun patterns to match in response to a question
 private Logger logger;      //*-- Log4j logger
 //*-- loaded from Constants.DOCTYPES_FILE in init; createDbean looks up the document
 //*-- type in it to obtain the name of the class used to decode the stored entry
 Properties dprops = null;      //*-- document properties file
 //*-- matches an underscore followed by digits at the end of a string; createAbean
 //*-- removes that suffix from the document key to obtain the file name
 private static Pattern trimPattern = Pattern.compile("(_\\d+?)$");


 /**
  * Load some initial data, set the index/database directories
  */
 public void init()
 { 
  
  PropertyConfigurator.configure (Constants.LOG4J_FILE); 
  logger = Logger.getLogger(Search.class.getName()); 


  //*-- set the index and database directories
  CrawlConfig crawlConfig = new CrawlConfig(false);   //*-- initialize from the properties file
  Constants.setDBDIR( crawlConfig.getDbDir() ); Constants.setINDEXDIR( crawlConfig.getIndexDir() );


  //*-- Create the Berkeley DB environment for read only access
  dbt = new DbTools(); dbt.openEnv(Constants.getDBDIR(), true); 
  Constants.setDbt(dbt);
  
  try { is = SearchTools.getSearcher(Constants.getINDEXDIR(), false); }
  catch (IOException ie) { logger.error("IO Error " + ie.getMessage() ); }


  sQuestion = new SearchQuestion(); sQuery = new SearchQuery();
  try { dprops = new Properties(); dprops.load(new FileInputStream(Constants.DOCTYPES_FILE)); }
  catch (IOException e)  { logger.error("Could not read " + Constants.DOCTYPES_FILE + " " + e.getMessage()); }
 }


 /**
  * Respond to a GET request from client
  *
  * @param request Received from client
  * @param response Response to send to the client
  *
  * @exception IOException for an I/O error 
  * @exception ServletException for servlet error 
  */
 public void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException
 {
  //*-- extract the query / question string
  q = request.getParameter("q"); 


  //*-- redirect blank queries to initial page
  if (q.equals("")) 
  { RequestDispatcher dispatcher = request.getRequestDispatcher("index.html");
    try { dispatcher.forward(request, response); }
    catch (IOException ie) { logger.error("IO Error: Could not forward to index.html: " + ie.getMessage()); }
    return;
  }


  //*-- Fetch the hit list depending on the type of request - query, question, query expansion, or a similar document
  Hits hits = null; String requestType = request.getParameter("type"  );
  try 
  { if (requestType.equals("SEARCH")) hits = sQuery.getHits(q);
    else if (requestType.equals("ASK")) hits =  sQuestion.getHits(q);
    else if (requestType.equals("EXPAND")) hits = is.search( SearchTools.expandQuery(q) );
    else if (requestType.equals("SIMILAR")) hits = is.search( SearchTools.similarDocs(request.getParameter("dockey"), dbt));
  }
  catch (IOException ie) { logger.error("IO Error: Could not fetch hits" + ie.getMessage()); }
  
  //*-- create the common bean containing the number of hits and paging information
  CommonBean cbean = createCbean(request, hits); request.setAttribute("cbean", cbean);
  
  //*-- generate the document or answer bean for queries and questions respectively
  try
  {  if ( (requestType.equals("ASK")) && (sQuestion != null) )  
       { AnswerBean abean = createAbean(hits, cbean);  request.setAttribute("abean", abean); }
     else  { DocBean dbean = createDbean(hits, cbean);  request.setAttribute("dbean", dbean); }  
  } //*-- end of try
  catch (IOException ie) { logger.error("IO Error: Could not create doc/answer bean: " + ie.getMessage()); }


  //*-- pass control to the appropriate JSP
  String address = (requestType.equals("ASK")) ? "/WEB-INF/answers.jsp": "/WEB-INF/documents.jsp";
  RequestDispatcher dispatcher = request.getRequestDispatcher(address);
  try { dispatcher.forward(request, response); }
  catch (IOException ie) { logger.error("IO Error: Could not forward request: " + ie.getMessage()); }
 }


 //*------------------------------------------------------------
 //*- Return a common bean filled with values based on the hits object
 //*------------------------------------------------------------
 private CommonBean createCbean(HttpServletRequest request, Hits hits)
 {
  CommonBean cbean = new CommonBean();
  
  //*-- build the fields for the common bean
  int hitsLen = (hits != null) ? hits.length(): 0;
  //cbean.setQ(q.replaceAll("\"", "&quot;")); 
  cbean.setQ(q);
  cbean.setNumHits(hitsLen);
  cbean.setAlt(sQuery.getAlt());
  
  //*-- set the page increment
  int pageInc = (request.getParameter("pageInc") == null) ? 10: Integer.parseInt(request.getParameter("pageInc"));
  cbean.setPageInc(pageInc); 
  
  //*-- set the total no. of pages 
  int numPages = hitsLen / pageInc; numPages += ( (hitsLen % pageInc) == 0) ? 0: 1;
  cbean.setNumPages(numPages); 


  //*-- set the start page 
  int startPage = (request.getParameter("startPage")== null) ?  0: Integer.parseInt(request.getParameter("startPage"));
  if (startPage > numPages) startPage = 0;
  cbean.setStartPage(startPage); 
  
  //*-- set the current page
  int currentPage = (request.getParameter("currentPage") == null) ? 0: Integer.parseInt(request.getParameter("currentPage"));
  if (currentPage > numPages) currentPage = 0;
  cbean.setCurrentPage(currentPage);
  
  //*-- set start/end index for the hit list
  int start = currentPage * pageInc;
  cbean.setStart(start); 
  int end = ( (start + pageInc) < hitsLen ) ? start + pageInc: hitsLen; cbean.setEnd(end);
  
  //*-- set the type of question / query
  cbean.setType(request.getParameter("type"));
  return cbean;
 }
 
 //*---------------------------------------------------------
 //*-- Create an answer bean containing the list of answers
 //*---------------------------------------------------------
 private AnswerBean createAbean(Hits hits, CommonBean cbean) throws IOException
 {
  AnswerBean abean = new AnswerBean(); 
  if (hits == null)  { abean.setAnswers(null); return (abean); }
 
  //*-- build the list of answers
  ArrayList<Answer> alist = new ArrayList<Answer>(); 
  Matcher matcher = null;
  dbt.openDB(Constants.EXT_FILES_DB, true, false); //*-- read only access


  //*-- allocate space for the digests of the passages
  String[] qnouns = sQuestion.getNouns();
  nounPatterns = new Pattern[qnouns.length];
  for (int i = 0; i < qnouns.length; i++) 
    nounPatterns[i] = Pattern.compile(qnouns[i], Pattern.CASE_INSENSITIVE);
  ArrayList<String> digests = new ArrayList<String>(); 
  int numPageHits = 0; int inc = 0;
  LOOP: for (int i = cbean.getStart(); i < hits.length(); i+= inc)
  { 
   //*-- use the key to fetch the matching database entry
   Document doc = hits.doc(i);
   String key = doc.get("key");
   DatabaseEntry data = new DatabaseEntry();             
   if (!dbt.fetch(key, data)) continue LOOP;


   //*-- extract the passage text
   IndexableDoc idoc = new IndexableDoc();
   idoc = (IndexableDoc) idoc.getBdbBinding().entryToObject(data);
   String line= idoc.getContents().toString();
   String passage[] = SearchTools.bestPassages(line, sQuestion.getNouns(), sQuestion.getVerbs(), sQuestion.getAdjectives(),
                     sQuestion.getBigrams(), sQuestion.getEntities());
   
   //*-- get the file name
   matcher = trimPattern.matcher(key); if (matcher != null) key = matcher.replaceFirst("");


   //*-- check for dups from the top two sentences
   inc = 0;
   for (int j = 0; j < 2; j++)
   {
    String digest = StringTools.shaDigest(passage[j]); 
    if (digests.indexOf(passage[j]) != -1) continue LOOP;
    digests.add(digest);


    Answer answer = new Answer();
    answer.setFileName(key); 
    answer.setPassage(hiliter(passage[j], sQuestion.getNouns()) );
    answer.setScore(hits.score(i));
    alist.add(answer);
    inc++;
   } 
   
   if (++numPageHits >= cbean.getPageInc()) break LOOP;  
  } //*-- end of for


  //*-- set the answers array in the bean and return
  Answer[] answers = new Answer[alist.size()]; alist.toArray(answers);
  abean.setAnswers(answers);
  dbt.closeDB();


  return(abean);
 }


 //*---------------------------------------------------------
 //*-- Create a document bean containing the list of document hits
 //*---------------------------------------------------------
 private DocBean createDbean(Hits hits, CommonBean cbean) throws IOException
 {
  DocBean dbean = new DocBean();
  if (hits == null) { dbean.setDocuments(null); return(dbean); }
  
  DateFormat df = DateFormat.getDateInstance(DateFormat.MEDIUM, Constants.locale);
  NumberFormat nf = NumberFormat.getInstance(Constants.locale);
  nf.setMaximumFractionDigits(1); nf.setMinimumFractionDigits(1);


  //*-- build the list of documents
  ArrayList<Doc> dlist = new ArrayList<Doc>();
  int numPageHits = 0;
  LOOP: for (int i = cbean.getStart(); i < hits.length(); i++)
  { 
   //*-- use the key to fetch the matching database entry
   Document doc = hits.doc(i);


   String key = doc.get("key"); String ftype = doc.get("type");
   DatabaseEntry data = new DatabaseEntry();


   String dbname = (ftype.equalsIgnoreCase("email")) ? Constants.EXT_MESSAGES_DB: Constants.EXT_FILES_DB;


   dbt.openDB(dbname, true, false); //*-- open the database for read only access             
   if (!dbt.fetch(key, data)) continue LOOP;
   dbt.closeDB();
   
   //*-- use the type of the document to create a doc instance of the specified type
   try
   {
    Doc document = new Doc();
    String docClass = dprops.getProperty(ftype); 
    if ( (docClass == null) || (docClass.equals("")) ) docClass = "org.sf.mustru.docs.TextDoc";
    Class docType = Class.forName(docClass); 
    IndexableDoc idoc =  (IndexableDoc) docType.newInstance(); 
    idoc = (IndexableDoc) idoc.getBdbBinding().entryToObject(data);  
   
    //*-- set the snippet of the document for the bean
    String contents = idoc.getContents().toString(); contents = StringTools.filterChars(contents);
    if (contents.length() > 5000) contents = contents.substring(0, 5000); 
    idoc.setContents(new StringBuffer(snippet(contents)));
       
    //*-- handle the different types of documents      
    copyBean(idoc, document); 
    document.setFormatFileLength(nf.format(document.getFileLength() / 1000.0) );
    document.setFormatMdate( df.format( new Date(document.getMdate()) ) );
    document.setScore(hits.score(i));
 
    dlist.add(document);
    if (++numPageHits >= cbean.getPageInc()) break LOOP;  
   }
   catch (ClassNotFoundException ce)
   { logger.error("Could not get doc class : " + ce.getMessage()); }
   catch (InstantiationException ie)
   { logger.error("Could not instantiate doc class: " + ie.getMessage() ); }
   catch (IllegalAccessException ae)
   { logger.error("Could not access class: " + ae.getMessage()); }


  } //*-- end of for


  //*-- set the answers array in the bean and return
  Doc[] docs = new Doc[dlist.size()]; dlist.toArray(docs); 
  dbean.setDocuments(docs);


  return(dbean);
 }


 //*--------------------------------------------------
 //*-- copy values from one bean to another bean 
 //*--------------------------------------------------
 private void copyBean (Object source, Object target )  
  {   
   PropertyDescriptor[]  sourceProperties = null; 
   PropertyDescriptor[]  targetProperties = null; 
   try 
    { sourceProperties = Introspector.getBeanInfo  (source.getClass()).getPropertyDescriptors(); 
      targetProperties = Introspector.getBeanInfo  (target.getClass()).getPropertyDescriptors(); 
    }  
   catch  (IntrospectionException ie )  {  logger.error("Introspection error " + ie.getMessage()); }  
   
   try
   {
   Object[] value =  {null};  
   for  ( int i = 0; i  <  sourceProperties.length; ++i )  
    {  
     String name = sourceProperties[i].getName(); 
     LOOP: for ( int j = 0; j  <  targetProperties.length; ++j )  
       if  ( targetProperties[j].getName().equals(name) )  
        {  
         Method read = sourceProperties[i].getReadMethod(); 
         Method write = targetProperties[j].getWriteMethod(); 
         if  ( read == null || write == null )  break LOOP; 
         value[0]  = read.invoke(source, (Object[]) null); 
         write.invoke(target, value); 
         break LOOP; 
        }  //*-- end of inner for
    } //*-- end of outer for   
   }
   catch (IllegalAccessException ie) { logger.error("Illegal access error " + ie.getMessage()); }
   catch (InvocationTargetException ie) { logger.error("Invocation error: " + ie.getMessage()); }
   
   return; 
  } 
 
 //*-------------------------------------------------
 //*-  Return a string with query keywords highlighted.
 //*-  Convenience form: delegates to the two argument
 //*-  version with no stop word list (null).
 //*-------------------------------------------------
 private String snippet(String text) throws IOException
 { 
  final String[] noStopWords = null;
  return snippet(text, noStopWords); 
 }


 private String snippet(String text, String[] stopWords) throws IOException
 {
  //*-- first extract a list of tokens from the query string excluding stop words
  StandardAnalyzer sAnalyzer = (stopWords == null) ? new StandardAnalyzer(): new StandardAnalyzer(stopWords);
  Token[] tokens = extractTokens(sAnalyzer, q);


  //*-- next, build a new filtered query string from the list of tokens
  StringBuffer queryString = new StringBuffer();
  LOOP: for (int i = 0; i < tokens.length; i++)
  { if (tokens[i].termText().length() < 3) continue LOOP;
    queryString.append(tokens[i].termText()); 
    if (i < (tokens.length - 1) ) queryString.append(" OR ");
  }


  //*-- parse the query using the standard analyzer and highlight the text
  QueryParser qp = new QueryParser("contents", sAnalyzer );
  String result = "";
  try 
  {
   if (queryString.length() > 0)
   { Query query = qp.parse(queryString.toString());
     QueryScorer qScorer = new QueryScorer(query);
     SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span class=\"hlight\">", "</span>");
     Highlighter highlighter = new Highlighter(formatter, qScorer);
     Fragmenter fragmenter = new SimpleFragmenter(80);        //*-- use fragments of 50 bytes each
     highlighter.setTextFragmenter(fragmenter);
     TokenStream tokenStream = sAnalyzer.tokenStream("contents", new StringReader(text));
     result = highlighter.getBestFragments(tokenStream, text, 3, "...<br>");  //*-- collect upto three fragments
   }
  }
  catch (ParseException pe) { logger.error("Query parse error " + pe.getMessage() ); }


  //*-- if no tokens were extracted, then return the original string
  return ( (result.length() == 0) ? StringTools.fillin(text, 250, true, '.', 3): result); 


 }


 //*-----------------------------------------------------------
 //*- Highlight text using the passed nouns
 //*-----------------------------------------------------------
 private String hiliter(String passage, String[] nouns)
 {
  //*-- match the nouns in a loop and highlight the nouns
  LOOP: for (int i = 0; i < nouns.length; i++)
   { if (nouns[i].length() < 3) continue LOOP;
     String replace = "<span class=\"hlight\">" + nouns[i] + "</span>";
     passage = nounPatterns[i].matcher(passage).replaceAll(replace);
   }
  return(passage);
 }
 
 //*-----------------------------------------------------------
 //*- Use the passed analyzer to get a list of tokens from the text 
 //*-----------------------------------------------------------
 private static Token[] extractTokens(Analyzer analyzer, String text) throws IOException 
 {
  TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
  ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
  while ( (token = stream.next()) != null) tokenList.add(token);
  Token[] tokens = new Token[tokenList.size()];
  for (int i = 0; i < tokens.length; i++) tokens[i] = tokenList.get(i);
  return (tokens);
 }


 /**
  * Release resources before ending the servlet: drop the query and question
  * search helpers and close the Berkeley DB environment opened in init.
  */
 public void destroy()
 { 
  sQuestion = null; 
  sQuery = null; 
  dbt.closeEnv(); 
 }


}
// Source Code of org.sf.mustruweb.Search

// Related Classes of org.sf.mustruweb.Search