package org.sf.mustru.crawl;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Date;
import java.util.Properties;
import org.apache.log4j.Logger;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.sf.mustru.docs.*;
import org.sf.mustru.filters.StarHandler;
import org.sf.mustru.utils.Constants;
import org.sf.mustru.utils.DbTools;
import org.sf.mustru.utils.StandardBgramAnalyzer;
import org.sf.mustru.utils.StringTools;
import com.sleepycat.je.DatabaseEntry;
/**
* Crawl thread that reads the task file for documents to index. Multiple threads
* share the workload. For each entry in the task file, the document text is
* extracted, classified, and indexed.
*/
public class CrawlThread extends Thread
{
static Logger logger = Logger.getLogger(CrawlThread.class.getName());
private CrawlManager crawlManager; //*-- instance of caller
private CrawlConfig crawlConfig; //*-- Crawler configuration object
private int threadNum; //*-- thread number of this instance
private IndexableDoc tempDoc; //*-- a reusable doc
private IndexableDocBinding tempBinding; //*-- a reusable DB binding
private File tempFile; //*-- a temporary reusable file handle
private int numThreads; //*-- number of threads in the ensemble
private int currentDoc; //*-- the index of the current document being processed
private int docsProcessed; //*-- the number of documents processed so far
private boolean running; //*-- flag to indicate if the thread should continue running
private DbTools dbt; //*-- tools to access the BDB
//*-- Lucene vars
private IndexWriter fileIW; //*-- File system based IndexWriter instance from CrawlManager
private IndexWriter ramIW; //*-- RAM based IndexWriter instance
private RAMDirectory ramDir; //*-- RAM based directory
private StandardBgramAnalyzer analyzer; //*-- the analyzer for the Lucene index
private ClassifyDoc cdoc; //*-- classifier for file type and content
/**
* Set the instance variables needed to process documents from the task file
* @param threadNum Thread number of current thread
* @param crawlConfig Crawler configuration
* @param caller Caller of this thread - the Crawl Manager
*/
public CrawlThread (int threadNum, CrawlConfig crawlConfig, CrawlManager caller) throws IOException
{
super();
setRunning(true);
totalTime -= new Date().getTime();
initTime -= new Date().getTime();
this.crawlConfig = crawlConfig; this.threadNum = threadNum; this.crawlManager = caller;
numThreads = crawlConfig.getNumThreads(); tempDoc = new IndexableDoc(); tempBinding = new IndexableDocBinding();
tempFile = null; fileIW = caller.getIw();
cdoc = new ClassifyDoc(); docsProcessed = 0;
dbt = Constants.getDbt();
//*-- create the RAM based Lucene IndexWriter
analyzer = new StandardBgramAnalyzer(); analyzer.setExtractEntities(true);
ramDir = new RAMDirectory(); //Similarity.setDefault(new SearchSimilarity());
createRamIW();
initTime += new Date().getTime();
}
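//*-- Typical usage (a sketch; only the constructor signature below comes from this class,
//*-- the manager's startup loop is an assumption):
//*--   CrawlThread t = new CrawlThread(i, crawlConfig, crawlManager); t.start();
//*-- The caller later joins the thread, or calls endThread() to ask it to stop.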
/**
* Scan the contents of the task file. Each line contains the path to an indexable document
* on the filesystem. Use the thread number and number of threads to distribute the load evenly
* across all threads. The text for each file is extracted, classified, and indexed.
*/
public void run()
{
//*-- read the filters file and get a generic handler to extract text
fileReadTime -= new Date().getTime();
Properties props = null; StarHandler sh = null;
try { props = new Properties(); props.load(new BufferedInputStream(new FileInputStream(Constants.FILTER_FILE)));
sh = new StarHandler(props); }
catch (IOException e) { crawlManager.cleanUp("Could not read filtersFile " + e.getMessage()); }
if (sh == null) crawlManager.cleanUp("The generic handler to filter files was not found");
//*-- read the document types file to classify documents into types - text, image, audio, book, letter, etc.
Properties dprops = null;
try { dprops = new Properties(); dprops.load(new BufferedInputStream(new FileInputStream(Constants.DOCTYPES_FILE))); }
catch (IOException e) { crawlManager.cleanUp("Could not read doctypesFile " + e.getMessage()); }
//*-- read the media suffixes and associated file types properties file
Properties mprops = new Properties();
try { mprops.load(new BufferedInputStream(new FileInputStream( Constants.MEDIATYPES_FILE))); }
catch ( IOException e) { crawlManager.cleanUp("Could not read media types file " + e.getMessage()); }
fileReadTime += new Date().getTime();
currentDoc = crawlManager.getCurrentDoc(threadNum);
long startMemory = Runtime.getRuntime().freeMemory();
logger.info("Free memory at start of scan: " + startMemory);
//*-- scan the list of files and process depending on thread number
String iDocument = ""; IndexableDoc idoc = null; boolean completed = false;
FILELOOP: while (!completed)
{
//*-- distribute the load evenly
if ( (++currentDoc % numThreads) != threadNum ) continue FILELOOP;
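//*-- e.g. with numThreads = 4, the thread with threadNum = 2 handles documents 2, 6, 10, ...;
//*-- every document index maps to exactly one thread, so no locking is needed to share the task file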
//*-- for a restart skip documents scanned earlier
if (currentDoc < crawlConfig.getStartPosition()) continue FILELOOP;
//*-- terminate if all files have been processed
if (currentDoc >= crawlManager.totalNumFiles() ) { completed = true; break FILELOOP; }
//*-- periodically synchronize the indexes
syncTime -= new Date().getTime();
long freeMemory = Runtime.getRuntime().freeMemory();
// if (freeMemory < startMemory) mergeIndexes(false);
if ( ( (currentDoc % 1000) == 0) && (docsProcessed > 0) ) mergeIndexes(false);
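//*-- merging every 1000 documents bounds the growth of the RAM based index between flushes to disk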
syncTime += new Date().getTime();
try
{
//*-- check if the document was indexed prior to the modification date of the file
iDocument = crawlManager.getFileName(currentDoc); initDoc(iDocument, tempDoc);
//logger.info(currentDoc + ": Started File: " + iDocument + "<----");
if (checkIndexed(iDocument)) continue FILELOOP;
//*-- extract the text from the document and create an indexable document
docFilterTime -= new Date().getTime();
sh.getDocument(iDocument, tempDoc);
docFilterTime += new Date().getTime();
//*-- get the signature of the document text and check for duplicates
if (duplicateDoc()) continue FILELOOP;
//*-- classify the text of the document into one of the document types
String docClass = setClass(tempDoc, dprops, mprops);
//*-- create a new instance of the specific type of document
docLoadTime -= new Date().getTime();
Class docType = Class.forName(docClass);
idoc = (IndexableDoc) docType.newInstance();
idoc.loadGeneric(tempDoc); //*-- copy generic information to the instance
classifyTextTime -= new Date().getTime();
idoc.loadSpecific(cdoc); //*-- create the specific information for the type of document
classifyTextTime += new Date().getTime();
docLoadTime += new Date().getTime();
//*-- increment the log for the type of document in the caller
crawlManager.updateLogdata( idoc.getFileType() );
//*-- index the document and extract entities
setIndex(idoc); initDoc("", idoc);
docsProcessed++;
// if (docsProcessed == 5000) completed = true;
}
catch (ClassNotFoundException ce)
{ logger.error("Could not get doc class : " + ce.getMessage()); }
catch (InstantiationException ie)
{ logger.error("Could not instantiate doc class: " + ie.getMessage() ); }
catch (IllegalAccessException ae)
{ logger.error("Could not access class: " + ae.getMessage()); }
catch (IOException ie)
{ logger.error("IO Error : " + ie.getMessage()); }
//*-- save the state for errors and exceptions for a restart of the thread
catch (OutOfMemoryError oe)
{ logger.error("Ran out of memory, will restart : " + oe.getMessage());
completed = true; crawlManager.setEnoughResources(false);
crawlManager.setCurrentDoc(threadNum, currentDoc - 1);
}
finally
{ logger.info(currentDoc + ": Finished File: " + iDocument + "<---- " + freeMemory);
if (!isRunning()) completed = true; //*-- if forced to stop by BuildIndex
}
} //*-- end of while
logger.info("Processed " + docsProcessed + " documents");
mergeIndexes(true);
dbt.closeDB();
totalTime += new Date().getTime();
updateTimers();
endThread();
}
/**
* Sync. the Berkeley DB environment and merge the Lucene indexes
* @param last Flag to indicate that this is the final call, after which no new RAM index is allocated
*/
public void mergeIndexes(boolean last)
{
logger.info("Sync. environment and indexes");
Directory[] dirs = {ramDir};
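//*-- the RAM writer must be closed first so that buffered documents are flushed to ramDir
//*-- before its contents are merged into the file system index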
synchronized(this)
{ try { dbt.syncEnv(); ramIW.close(); fileIW.addIndexes(dirs); }
catch (IOException ie ) { crawlManager.cleanUp("Could not merge Lucene indexes " + ie.getMessage() ); }
}
//*-- if not last iteration allocate the RAM index
if (!last) { createRamIW(); tempDoc = new IndexableDoc(); }
// Runtime.getRuntime().gc();
}
/**
* Return true or false depending on whether the file was indexed earlier
* @param iDocument Path of the document to check
* @return true if the document was indexed after its last modification date
*/
private boolean checkIndexed(String iDocument)
{
dbTime -= new Date().getTime();
boolean indexedEarlier = false; boolean noCreateflag = false; boolean dupFlag = false;
dbt.openDB(Constants.EXT_FILES_DB, noCreateflag, dupFlag);
DatabaseEntry dbData = new DatabaseEntry();
if (dbt.fetch(iDocument, dbData) )
{ long lastModified = new File(iDocument).lastModified(); //*-- get the last modified date
tempDoc = (IndexableDoc) tempBinding.entryToObject(dbData);
//*-- if indexed earlier skip
if (tempDoc.getIdate() > lastModified)
{ crawlManager.updateLogdata("indexed earlier"); indexedEarlier = true; }
//*-- otherwise, delete the existing index for the document
//*-- should occur only for modified documents
else
{ try
{ synchronized(this)
{ FSDirectory fsd = FSDirectory.getDirectory(new File(Constants.getINDEXDIR()), false);
IndexReader ir = IndexReader.open(fsd);
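//*-- remove the stale Lucene entry for the modified file so it can be re-indexed;
//*-- the "key" field is assumed to hold the file path used as the unique document key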
ir.deleteDocuments(new Term("key", iDocument ) );
ir.close();
}
}
catch (IOException ie) { logger.error("Failed to delete " + iDocument + " " + ie.getMessage() ); }
}
}
dbt.closeDB();
dbTime += new Date().getTime();
return(indexedEarlier);
}
/**
* Return true or false depending on whether the file content is a duplicate of another document
* @return true if the document text matches the stored signature of a previously indexed document
*/
private boolean duplicateDoc()
{
checkDupTime -= new Date().getTime();
long contentsLen = (tempDoc != null) ? tempDoc.getContents().length(): 0;
String signatureKey = (contentsLen < Constants.DB_DOC_STORE_LIMIT) ?
StringTools.shaDigest(tempDoc.getContents().toString()):
StringTools.shaDigest(tempDoc.getContents().substring(0, Constants.DB_DOC_STORE_LIMIT));
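//*-- the signature is a SHA digest of at most DB_DOC_STORE_LIMIT characters of text, matching the
//*-- truncation applied when documents are stored, so identical (truncated) content hashes to the same key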
//*-- check for a duplicate
boolean duplicateDoc = false; boolean noCreateflag = false; boolean dupFlag = false;
if (!crawlConfig.isKeepDups())
{
dbt.openDB(Constants.EXT_FILES_DB, noCreateflag, dupFlag);
dbt.openSecDB(Constants.EXT_FILES_SECDB, false, new SecKeyDoc( tempBinding ) );
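//*-- the secondary database is keyed by the content signature, so fetchSec returns any
//*-- previously stored document whose text produced the same SHA digest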
DatabaseEntry data = dbt.fetchSec(signatureKey);
if (data != null)
{
IndexableDoc tDoc = (IndexableDoc) tempBinding.entryToObject(data);
String filename = tDoc.getFileName();
//*-- verify that the file exists in the filesystem, a file may have been moved
File tfile = new File(filename);
if (tfile.exists())
{ logger.info("Duplicate object" + tempDoc.getFileName() + " of file: " + filename);
crawlManager.updateLogdata( "duplicates" ); duplicateDoc = true; }
else
{ logger.info("Duplicate object" + tempDoc.getFileName() + " of file: " + filename + " does not exist in filesystem"); }
} //*-- end of inner if
dbt.closeSecDB(); dbt.closeDB();
} //*-- end of outer if
if (!duplicateDoc) tempDoc.setFileSignature(signatureKey);
checkDupTime += new Date().getTime();
return(duplicateDoc);
}
/**
* Classify the document into a file type - book, article, letter, webpage, etc.
* and return the class to handle the particular document type
* @param doc Indexable document
* @param dprops Properties mapping document types to handler class names
* @param mprops Properties mapping media file suffixes to file types
* @return String Name of the class that handles the type of document
*/
public String setClass(IndexableDoc doc, Properties dprops, Properties mprops)
{
//*-- set the file type
classifyFtypeTime -= new Date().getTime();
String ftype = doc.getFileType();
if (ftype.equals("text")) ftype = cdoc.classifyFileType(doc);
else if (ftype.equals("media"))
{ ftype = mprops.getProperty(StringTools.getSuffix( doc.getFileName()) );
if ( (ftype == null) || (ftype.equals("") ) ) ftype = "unknown"; }
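//*-- e.g. a file ending in .mp3 might map to a file type such as "audio" (hypothetical mapping;
//*-- the actual suffix-to-type entries come from the media types properties file)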
doc.setFileType(ftype);
classifyFtypeTime += new Date().getTime();
//*-- find the class for the type of document
String docClass = dprops.getProperty(ftype);
if ( (docClass == null) || (docClass.equals("")) )
{ docClass = "org.sf.mustru.docs.TextDoc";
logger.error("Failed to get a file type for " + ftype);
}
return (docClass);
}
/**
* Create the Lucene index for the document and optionally extract entities using Annie
* @param idoc Document to index and store in the database
* @throws IOException
*/
public void setIndex(IndexableDoc idoc) throws IOException
{
if (idoc == null) return;
//*-- create the Lucene index
luceneTime -= new Date().getTime();
try { idoc.loadIndex(ramIW, false); }
catch (IOException ie) { logger.error("Could not create index for " + idoc.getFileName() + " " + ie.getMessage() ); }
luceneTime += new Date().getTime();
//*-- store the entry in the database, limit the length of the content stored in the database
//*-- create the key and data to store the entry
docStoreTime -= new Date().getTime();
dbt.openDB(Constants.EXT_FILES_DB, false);
dbt.openSecDB(Constants.EXT_FILES_SECDB, false, new SecKeyDoc( tempBinding ) );
int contentSizeLimit = Constants.DB_DOC_STORE_LIMIT;
int textSize = idoc.getContents().length();
if (textSize > contentSizeLimit )
idoc.setContents( (new StringBuffer( idoc.getContents().substring(0, contentSizeLimit).trim() ) ) );
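//*-- only the first DB_DOC_STORE_LIMIT characters are persisted in the database;
//*-- the full text has already been added to the Lucene index above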
idoc.setIdate(new Date().getTime() );
boolean allowDups = !crawlConfig.isFreshCrawl();
if (!dbt.insert( idoc.getFileName(), idoc, allowDups) )
logger.error("Could not store document " + idoc.getFileName() + " in the database");
dbt.closeSecDB(); dbt.closeDB();
//*-- train the spell checker
if (crawlConfig.isSpellCheck())
synchronized(this) { crawlManager.getSc().train(idoc.getContents().toString()); }
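//*-- note: this lock is on the per-thread instance; if the shared spell checker is not thread safe,
//*-- a lock on a shared object (e.g. the crawl manager) would be needed for cross-thread exclusion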
docStoreTime += new Date().getTime();
}
// *-- initialize the passed indexable doc
private void initDoc(String filename, IndexableDoc doc)
{
doc.setAuthor(""); doc.setBdbBinding(tempBinding); doc.setContextFile("");
doc.setFileLength(0); doc.setFileLocation(""); doc.setFileName(filename);
doc.setFileRank(0); doc.setFileSignature(""); doc.setIdate(0);
doc.setLanguage(""); doc.setMdate(0); doc.setMetadata("");
doc.setSummary(""); doc.setTitle("");
doc.setFileType(""); doc.setContents(new StringBuffer());
if (!(filename.equals(""))) {
tempFile = new File(filename);
doc.setFileLength(tempFile.length());
doc.setMdate(tempFile.lastModified());
}
}
/**
* Update the timers for this thread in the caller instance
*/
public void updateTimers()
{
int threadNum = getthreadNum();
miscTime = totalTime
- (initTime + fileReadTime + dbTime + docFilterTime + checkDupTime
+ classifyFtypeTime + classifyTextTime + docLoadTime + luceneTime
+ docStoreTime + syncTime);
crawlManager.updateTimers(threadNum, "initTime", initTime);
crawlManager.updateTimers(threadNum, "fileReadTime", fileReadTime);
crawlManager.updateTimers(threadNum, "dbTime", dbTime);
crawlManager.updateTimers(threadNum, "docFilterTime", docFilterTime);
crawlManager.updateTimers(threadNum, "checkDupTime", checkDupTime);
crawlManager.updateTimers(threadNum, "classifyFtypeTime", classifyFtypeTime);
crawlManager.updateTimers(threadNum, "classifyTextTime", classifyTextTime);
crawlManager.updateTimers(threadNum, "docLoadTime", docLoadTime);
crawlManager.updateTimers(threadNum, "luceneTime", luceneTime);
crawlManager.updateTimers(threadNum, "docStoreTime", docStoreTime);
crawlManager.updateTimers(threadNum, "syncTime", syncTime);
crawlManager.updateTimers(threadNum, "miscTime", miscTime);
crawlManager.updateTimers(threadNum, "totalTime", totalTime);
}
/**
* Create a RAM based Lucene index writer
*/
private void createRamIW()
{ try { ramIW = new IndexWriter(ramDir, analyzer, true);
ramIW.setMaxFieldLength(Constants.LUCENE_MAX_WORDS); }
catch (IOException ie) { logger.error("Could not create RAM index writer " + ie.getMessage()); }
}
public void endThread()
{
setRunning(false);
}
public boolean isRunning()
{
return running;
}
public void setRunning(boolean running)
{
this.running = running;
}
public int getthreadNum()
{
return this.threadNum;
}
// *-- timers for profiling
long initTime = 0; // *-- loading constructors
long fileReadTime = 0; // *-- reading file time
long dbTime = 0; // *-- database access time
long docFilterTime = 0; // *-- document conversion to text
long checkDupTime = 0; // *-- time to check for duplicates
long classifyFtypeTime = 0; // *-- time to classify document into one of the file types
long classifyTextTime = 0; // *-- time to classify document text
long docLoadTime = 0; // *-- time to create a specific instance of a document
long luceneTime = 0; // *-- indexing time
long docStoreTime = 0; // *-- time to store the document in the database
long syncTime = 0; // *-- time to synchronize the database and merge RAM indexes with file index
long miscTime = 0; // *-- time for other misc. activities
long totalTime = 0; //*-- time to complete the scan
}