Package org.sf.mustru.crawl

Source Code of org.sf.mustru.crawl.CrawlThread

package org.sf.mustru.crawl;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Date;
import java.util.Properties;

import org.apache.log4j.Logger;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;

import org.sf.mustru.docs.*;
import org.sf.mustru.crawl.CrawlConfig;
import org.sf.mustru.filters.StarHandler;
import org.sf.mustru.utils.Constants;
import org.sf.mustru.utils.DbTools;
import org.sf.mustru.utils.StandardBgramAnalyzer;
import org.sf.mustru.utils.StringTools;
import com.sleepycat.je.DatabaseEntry;

/**
* Crawl thread that reads the task file for documents to index. Multiple threads
* share the workload. Each entry in the task file is processed: the text is
* extracted, indexed, and categorized.
*/
public class CrawlThread extends Thread
{
static Logger logger = Logger.getLogger(CrawlThread.class.getName());

private CrawlManager crawlManager;    //*-- instance of caller
private CrawlConfig crawlConfig;    //*-- Crawler configuration object
private int threadNum;      //*-- thread number of this instance
private IndexableDoc tempDoc;    //*-- a reusable doc
private IndexableDocBinding tempBinding;  //*-- a reusable DB binding
private File tempFile;        //*-- a temporary reusable file handle
private int numThreads;      //*-- number of threads in the ensemble
private int currentDoc;      //*-- the index of the current document being processed
private int docsProcessed;      //*-- the number of documents processed so far
private boolean running;      //*-- flag to indicate if the thread should continue running
private DbTools dbt;        //*-- tools to access the BDB

//*-- Lucene vars
private IndexWriter fileIW;      //*-- File system based IndexWriter instance from CrawlManager
private IndexWriter ramIW;      //*-- RAM based instance IndexWriter
private RAMDirectory ramDir;      //*-- RAM based directory
private StandardBgramAnalyzer analyzer;  //*-- the analyzer for the Lucene index
private ClassifyDoc cdoc;      //*-- classifier for file type and content

/**
  * Set the instance variables needed to process documents from the task file
  * @param threadNum   Thread number of current thread
  * @param crawlConfig  Crawler configuration
  * @param caller  Caller of this thread - the Crawl Manager
  */
public CrawlThread (int threadNum, CrawlConfig crawlConfig, CrawlManager caller) throws IOException
{
  super();
  setRunning(true);
  totalTime -= new Date().getTime();
  initTime -= new Date().getTime();
  this.crawlConfig = crawlConfig; this.threadNum = threadNum; this.crawlManager = caller;
  numThreads = crawlConfig.getNumThreads(); tempDoc = new IndexableDoc(); tempBinding = new IndexableDocBinding();
  tempFile = null; fileIW = caller.getIw();
  cdoc = new ClassifyDoc(); docsProcessed = 0;
  dbt = Constants.getDbt();

  //*-- create the RAM based Lucene IndexWriter
  analyzer = new StandardBgramAnalyzer(); analyzer.setExtractEntities(true);
  ramDir = new RAMDirectory(); //Similarity.setDefault(new SearchSimilarity());
  createRamIW();
  initTime +=  new Date().getTime();
}
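
//*-- Illustrative only (not part of the original source): a minimal sketch, assuming the
//*-- CrawlManager launches one CrawlThread per configured thread and waits for all of
//*-- them. The constructor signature matches this file; the surrounding wiring is an
//*-- assumption.
//
//   CrawlThread[] threads = new CrawlThread[crawlConfig.getNumThreads()];
//   for (int i = 0; i < threads.length; i++)
//   { threads[i] = new CrawlThread(i, crawlConfig, manager); threads[i].start(); }
//   for (CrawlThread t : threads) t.join();   //*-- InterruptedException handling omitted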

/**
  *  Scan the contents of the task file. Each line contains the path to an indexable document
  *  on the filesystem. Use the thread number and number of threads to distribute the load evenly
  *  across all threads. The text for each file is extracted, classified, and indexed.
  */
public void run()
{     
  //*-- read the filters file and get a generic handler to extract text 
  fileReadTime -=  new Date().getTime();
  Properties props = null; StarHandler sh = null;
  try
  { props = new Properties();
    props.load(new BufferedInputStream(new FileInputStream(Constants.FILTER_FILE)));
    sh = new StarHandler(props);
  }
  catch (IOException e) { crawlManager.cleanUp("Could not read filtersFile " + e.getMessage()); }
  if (sh == null) crawlManager.cleanUp("The generic handler to filter files was not found");

  //*-- read the document types file to classify documents into types - text, image, audio, book, letter, etc.
  Properties dprops = null;
  try { dprops = new Properties(); dprops.load(new BufferedInputStream(new FileInputStream(Constants.DOCTYPES_FILE))); }
  catch (IOException e)  { crawlManager.cleanUp("Could not read doctypesFile " + e.getMessage()); }

  //*-- read the media suffixes and associated file types properties file
  Properties mprops = new Properties();
  try { mprops.load(new BufferedInputStream(new FileInputStream( Constants.MEDIATYPES_FILE))); }
  catch ( IOException e) { crawlManager.cleanUp("Could not read media types file " + e.getMessage()); }
  fileReadTime += new Date().getTime();

  currentDoc = crawlManager.getCurrentDoc(threadNum);

  long startMemory = Runtime.getRuntime().freeMemory();
  logger.info("Free memory at start of scan: " + startMemory);

  //*-- scan the list of files and process depending on thread number
  String iDocument = ""; IndexableDoc idoc = null; boolean completed = false;
  FILELOOP: while (!completed)
  {
   //*-- distribute the load evenly
   if ( (++currentDoc % numThreads) != threadNum ) continue FILELOOP;
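   //*-- e.g. with numThreads = 4 and threadNum = 2, this thread handles documents
   //*-- 2, 6, 10, ... so the ensemble covers every document exactly once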

   //*-- for a restart skip documents scanned earlier
   if (currentDoc < crawlConfig.getStartPosition()) continue FILELOOP;

   //*-- terminate if all files have been processed
   if (currentDoc >= crawlManager.totalNumFiles() ) { completed = true; break FILELOOP; }

   //*-- periodically synchronize the indexes
   syncTime -= new Date().getTime();
   long freeMemory = Runtime.getRuntime().freeMemory();
   //  if (freeMemory < startMemory)  mergeIndexes(false);
   if ( ( (currentDoc % 1000) == 0) && (docsProcessed > 0) ) mergeIndexes(false);
   syncTime += new Date().getTime();          
   try
   {
    //*-- check if the document was indexed prior to the modification date of the file
    iDocument = crawlManager.getFileName(currentDoc); initDoc(iDocument, tempDoc);
    //logger.info(currentDoc + ": Started File: " + iDocument + "<----");
    if (checkIndexed(iDocument)) continue FILELOOP;

    //*-- extract the text from the document and create an indexable document
    docFilterTime -= new Date().getTime();
    sh.getDocument(iDocument, tempDoc);
    docFilterTime += new Date().getTime();

    //*-- get the signature of the document text and check for duplicates
    if (duplicateDoc()) continue FILELOOP;

    //*-- classify the text of the document into one of the document types
    String docClass = setClass(tempDoc, dprops, mprops);

    //*-- create a new instance of the specific type of document
    docLoadTime -= new Date().getTime();
    Class docType = Class.forName(docClass);
    idoc =  (IndexableDoc) docType.newInstance();
    idoc.loadGeneric(tempDoc);    //*-- copy generic information to the instance

    classifyTextTime -= new Date().getTime();
    idoc.loadSpecific(cdoc);    //*-- create the specific information for the type of document
    classifyTextTime += new Date().getTime();
    docLoadTime += new Date().getTime();

    //*-- increment the log for the type of document in the caller
    crawlManager.updateLogdata( idoc.getFileType() );

    //*-- index the document and extract entities
    setIndex(idoc); initDoc("", idoc);               
    docsProcessed++;
    // if (docsProcessed == 5000) completed = true;
   }

   catch (ClassNotFoundException ce)
   { logger.error("Could not get doc class : " + ce.getMessage()); }
   catch (InstantiationException ie)
   { logger.error("Could not instantiate doc class: " + ie.getMessage() ); }
   catch (IllegalAccessException ae)
   { logger.error("Could not access class: " + ae.getMessage()); }
   catch (IOException ie)
   { logger.error("IO Error : " + ie.getMessage()); }
   //*-- save the state for errors and exceptions for a restart of the thread
   catch (OutOfMemoryError oe)
   { logger.error("Ran out of memory, will restart : " + oe.getMessage());
     completed = true;  crawlManager.setEnoughResources(false);
     crawlManager.setCurrentDoc(threadNum, currentDoc - 1);
   }
   finally
   { logger.info(currentDoc + ": Finished File: " + iDocument + "<---- " + freeMemory);
     if (!isRunning()) completed = true;   //*-- if forced to stop by BuildIndex
   }

  } //*-- end of while

  logger.info("Processed " + docsProcessed + " documents");
  mergeIndexes(true);
  dbt.closeDB();

  totalTime += new Date().getTime();
  updateTimers();
  endThread();
}

/**
  * Sync. the Berkeley DB environment and merge the Lucene indexes
  * @param last Flag to indicate the last time function will be called
  */
public void mergeIndexes(boolean last)
{
  logger.info("Sync. environment and indexes");
  Directory[] dirs = {ramDir};

  synchronized(this)
  { try { dbt.syncEnv(); ramIW.close(); fileIW.addIndexes(dirs); }
    catch (IOException ie ) { crawlManager.cleanUp("Could not merge Lucene indexes " + ie.getMessage() ); }
  }

  //*-- if not last iteration allocate the RAM index
  if (!last) { createRamIW(); tempDoc = new IndexableDoc(); }

// Runtime.getRuntime().gc();
}
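
//*-- Design note: documents are indexed into the in-memory ramDir first and merged into
//*-- the shared file-system index only periodically (every 1000 documents in run(), and
//*-- once more at the end), which batches disk I/O and keeps the synchronized section
//*-- on the shared fileIW short.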

/**
  * Return true or false depending on whether the file was indexed earlier
  */
private boolean checkIndexed(String iDocument)
{
  dbTime -= new Date().getTime();
  boolean indexedEarlier = false; boolean noCreateflag = false; boolean dupFlag = false;
  dbt.openDB(Constants.EXT_FILES_DB, noCreateflag, dupFlag);
  DatabaseEntry dbData = new DatabaseEntry();
  if (dbt.fetch(iDocument, dbData))
  { long lastModified = new File(iDocument).lastModified();   //*-- get the last modified date
    tempDoc = (IndexableDoc) tempBinding.entryToObject(dbData);

    //*-- if indexed earlier, skip
    if (tempDoc.getIdate() > lastModified)
    { crawlManager.updateLogdata("indexed earlier"); indexedEarlier = true; }

    //*-- otherwise, delete the existing index entry for the document;
    //*-- this should occur only for modified documents
    else
    { try
      { synchronized(this)
        { FSDirectory fsd = FSDirectory.getDirectory(new File(Constants.getINDEXDIR()), false);
          IndexReader ir = IndexReader.open(fsd);
          ir.deleteDocuments(new Term("key", iDocument));
          ir.close();
        }
      }
      catch (IOException ie) { logger.error("Failed to delete " + iDocument + " " + ie.getMessage()); }
    }
  }
  dbt.closeDB();
  dbTime += new Date().getTime();
  return(indexedEarlier);
}
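
//*-- Design note: a file modified after it was last indexed is deleted from the
//*-- file-system index by its "key" term (the file path) and then falls through to be
//*-- re-extracted and re-indexed like a new document.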

/**
  * Return true or false depending on whether the file content is a duplicate of another document
  */
private boolean duplicateDoc()
{  
  checkDupTime -= new Date().getTime();
  long contentsLen = (tempDoc != null) ? tempDoc.getContents().length() : 0;
  String signatureKey = (contentsLen < Constants.DB_DOC_STORE_LIMIT) ?
    StringTools.shaDigest(tempDoc.getContents().toString()) :
    StringTools.shaDigest(tempDoc.getContents().substring(0, Constants.DB_DOC_STORE_LIMIT));

  //*-- check for a duplicate 
  boolean duplicateDoc = false; boolean noCreateflag = false; boolean dupFlag = false;
  if (!crawlConfig.isKeepDups())
   { 
    dbt.openDB(Constants.EXT_FILES_DB, noCreateflag, dupFlag);
    dbt.openSecDB(Constants.EXT_FILES_SECDB, false, new SecKeyDoc( tempBinding ) );
    DatabaseEntry data = dbt.fetchSec(signatureKey);
    if (data != null)
    {
     IndexableDoc tDoc =  (IndexableDoc) tempBinding.entryToObject(data);
     String filename = tDoc.getFileName();

     //*-- verify that the file exists in the filesystem, a file may have been moved
     File tfile = new File(filename);
     if (tfile.exists())
     { logger.info("Duplicate object " + tempDoc.getFileName() + " of file: " + filename);
       crawlManager.updateLogdata("duplicates"); duplicateDoc = true; }
     else
     { logger.info("Duplicate object " + tempDoc.getFileName() + " of file: " + filename + " does not exist in filesystem"); }
    } //*-- end of inner if
    dbt.closeSecDB(); dbt.closeDB();
   } //*-- end of outer if

   if (!duplicateDoc) tempDoc.setFileSignature(signatureKey);
   checkDupTime += new Date().getTime();
   return(duplicateDoc);
}
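
//*-- Illustrative only: StringTools.shaDigest is this project's helper and its source is
//*-- not shown here; a typical implementation (an assumption) would hex-encode a SHA-1
//*-- hash of the text, roughly:
//
//   java.security.MessageDigest md = java.security.MessageDigest.getInstance("SHA-1");
//   byte[] hash = md.digest(text.getBytes("UTF-8"));   //*-- exception handling omitted
//   //*-- ...then hex-encode the 20 hash bytes to form the signature key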

/**
  * Classify the document into a file type - book, article, letter, webpage, etc.
  * and return the class to handle the particular document type
  * @param doc Indexable document
  * @param dprops Handle to document property types
  * @param mprops Handle to media types
  * @return String Return the name of the class depending on the type of document
  */
public String setClass(IndexableDoc doc, Properties dprops, Properties mprops)
{
  //*-- set the file type
  classifyFtypeTime -= new Date().getTime();
  String ftype = doc.getFileType();
  if (ftype.equals("text")) ftype = cdoc.classifyFileType(doc);
  else if (ftype.equals("media"))
  { ftype = mprops.getProperty(StringTools.getSuffix( doc.getFileName()) );
  if ( (ftype == null) || (ftype.equals("") ) ) ftype = "unknown"; }
  doc.setFileType(ftype);
  classifyFtypeTime += new Date().getTime();

  //*-- find the class for the type of document  
  String docClass = dprops.getProperty(ftype);
  if ( (docClass == null) || (docClass.equals("")) )
  { docClass = "org.sf.mustru.docs.TextDoc";
    logger.error("Failed to get a document class for file type " + ftype);
  }

  return (docClass);
}
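
//*-- Illustrative only: the contents of the two properties files are not shown in this
//*-- source; hypothetical entries of roughly this shape would drive setClass:
//*--   doctypes file:   article=org.sf.mustru.docs.TextDoc   (file type -> document class)
//*--   mediatypes file: mp3=audio                            (file suffix -> file type)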

/**
  * Create the Lucene index for the document and optionally extract entities using Annie
  * @param idoc
  * @throws IOException
  */
public void setIndex(IndexableDoc idoc) throws IOException
{
  if (idoc == null) return;

  //*-- create the Lucene index
  luceneTime -= new Date().getTime();

  try { idoc.loadIndex(ramIW, false); }
  catch (IOException ie) { logger.error("Could not create index for " + idoc.getFileName() + " " + ie.getMessage() ); }
  luceneTime += new Date().getTime();

  //*-- store the entry in the database, limit the length of the content stored in the database
  //*-- create the key and data to store the entry
  docStoreTime -= new Date().getTime();
  dbt.openDB(Constants.EXT_FILES_DB, false);
  dbt.openSecDB(Constants.EXT_FILES_SECDB, false, new SecKeyDoc( tempBinding ) );
  int contentSizeLimit = Constants.DB_DOC_STORE_LIMIT;
  int textSize = idoc.getContents().length();
  if (textSize > contentSizeLimit )
   idoc.setContents( (new StringBuffer( idoc.getContents().substring(0, contentSizeLimit).trim() ) ) );
  idoc.setIdate(new Date().getTime() );
  boolean allowDups = !crawlConfig.isFreshCrawl();
  if (!dbt.insert( idoc.getFileName(), idoc, allowDups) )
   logger.error("Could not store document " + idoc.getFileName() + " in the database");
  dbt.closeSecDB(); dbt.closeDB();
 
  //*-- train the spell checker
  if (crawlConfig.isSpellCheck())
    synchronized(this) { crawlManager.getSc().train(idoc.getContents().toString()); }
 
  docStoreTime += new Date().getTime();
}
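
//*-- Design note: only the first DB_DOC_STORE_LIMIT characters of the extracted text are
//*-- stored in the Berkeley DB record, and duplicateDoc() digests the same prefix, so
//*-- the stored record and the duplicate-check signature cover identical text.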

// *-- initialize the passed indexable doc
private void initDoc(String filename, IndexableDoc doc)
{
  doc.setAuthor(""); doc.setBdbBinding(tempBinding); doc.setContextFile("");
  doc.setFileLength(0); doc.setFileLocation(""); doc.setFileName(filename);
  doc.setFileRank(0); doc.setFileSignature(""); doc.setIdate(0);
  doc.setLanguage(""); doc.setMdate(0); doc.setMetadata("");
  doc.setSummary(""); doc.setTitle("");
  doc.setFileType(""); doc.setContents(new StringBuffer());

  if (!(filename.equals(""))) {
   tempFile = new File(filename);
   doc.setFileLength(tempFile.length());
   doc.setMdate(tempFile.lastModified());
  }
}

/**
  * Update the timers for this thread in the caller instance
  */
public void updateTimers()
{
  int threadNum = getthreadNum();
  miscTime = totalTime
  - (initTime + fileReadTime + dbTime + docFilterTime + checkDupTime
    + classifyFtypeTime + classifyTextTime + docLoadTime + luceneTime
    + docStoreTime + syncTime);
  crawlManager.updateTimers(threadNum, "initTime", initTime);
  crawlManager.updateTimers(threadNum, "fileReadTime", fileReadTime);
  crawlManager.updateTimers(threadNum, "dbTime", dbTime);
  crawlManager.updateTimers(threadNum, "docFilterTime", docFilterTime);
  crawlManager.updateTimers(threadNum, "checkDupTime", checkDupTime);
  crawlManager.updateTimers(threadNum, "classifyFtypeTime", classifyFtypeTime);
  crawlManager.updateTimers(threadNum, "classifyTextTime", classifyTextTime);
  crawlManager.updateTimers(threadNum, "docLoadTime", docLoadTime);
  crawlManager.updateTimers(threadNum, "luceneTime", luceneTime);
  crawlManager.updateTimers(threadNum, "docStoreTime", docStoreTime);
  crawlManager.updateTimers(threadNum, "syncTime", syncTime);
  crawlManager.updateTimers(threadNum, "miscTime", miscTime);
  crawlManager.updateTimers(threadNum, "totalTime", totalTime);
}

/*
  * Create a RAM based Lucene index
  */
private void createRamIW()
{ try
  { ramIW = new IndexWriter(ramDir, analyzer, true);
    ramIW.setMaxFieldLength(Constants.LUCENE_MAX_WORDS);
  }
  catch (IOException ie) { logger.error("Could not create RAM index writer " + ie.getMessage()); }
}
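
//*-- Note: the "true" create argument to the Lucene 2.x IndexWriter constructor starts a
//*-- fresh, empty index in ramDir each time, discarding documents already merged into
//*-- the file-system index by mergeIndexes().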

public void endThread()
{
  setRunning(false);
}

public boolean isRunning()
{
  return running;
}

public void setRunning(boolean running)
{
  this.running = running;
}

public int getthreadNum()
{
  return this.threadNum;
}

// *-- timers for profiling
long initTime = 0;     // *-- loading constructors
long fileReadTime = 0;   // *-- reading file time
long dbTime = 0;     // *-- database access time
long docFilterTime = 0;   // *-- document conversion to text
long checkDupTime = 0;   // *-- time to check for duplicates
long classifyFtypeTime = 0;   // *-- time to classify document into one of the file types
long classifyTextTime = 0;   // *-- time to classify document text
long docLoadTime = 0;     // *-- time to create a specific instance of a document
long luceneTime = 0;     // *-- indexing time
long docStoreTime = 0;   // *-- time to store the document in the database
long syncTime = 0;     // *-- time to synchronize the database and merge RAM indexes with file index
long miscTime = 0;     // *-- time for other misc. activities
long totalTime = 0;     // *-- time to complete the scan
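
//*-- Each timer uses the same accumulation idiom: subtract the clock before a step and
//*-- add it back after, so the timer gains exactly the elapsed interval, e.g.
//
//   dbTime -= new Date().getTime();   //*-- dbTime = dbTime - start
//   //*-- ... database work ...
//   dbTime += new Date().getTime();   //*-- dbTime = dbTime - start + end = elapsed
//
//*-- partial sums accumulate across iterations, giving a running total per activity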

}