package org.sf.mustru.crawl;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Date;
import java.util.Properties;
import org.apache.log4j.Logger;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.sf.mustru.docs.*;
import org.sf.mustru.filters.StarHandler;
import org.sf.mustru.utils.Constants;
import org.sf.mustru.utils.DbTools;
import org.sf.mustru.utils.StandardBgramAnalyzer;
import org.sf.mustru.utils.StringTools;
import com.sleepycat.je.DatabaseEntry;
/**
* Crawl thread that reads the task file for documents to index. Multiple threads
* share the workload. For each entry in the task file, the document text is
* extracted, classified, and indexed.
*/
public class CrawlThread extends Thread
{
static Logger logger = Logger.getLogger(CrawlThread.class.getName());
private CrawlManager crawlManager; //*-- instance of caller
private CrawlConfig crawlConfig; //*-- Crawler configuration object
private int threadNum; //*-- thread number of this instance
private IndexableDoc tempDoc; //*-- a reusable doc
private IndexableDocBinding tempBinding; //*-- a reusable DB binding
private File tempFile; //*-- a temporary reusable file handle
private int numThreads; //*-- number of threads in the ensemble
private int currentDoc; //*-- the index of the current document being processed
private int docsProcessed; //*-- the number of documents processed so far
private boolean running; //*-- flag to indicate if the thread should continue running
private DbTools dbt; //*-- tools to access the BDB
//*-- Lucene vars
private IndexWriter fileIW; //*-- File system based IndexWriter instance from CrawlManager
private IndexWriter ramIW; //*-- RAM based IndexWriter instance
private RAMDirectory ramDir; //*-- RAM based directory
private StandardBgramAnalyzer analyzer; //*-- the analyzer for the Lucene index
private ClassifyDoc cdoc; //*-- classifier for file type and content
/**
* Set the instance variables needed to process documents from the task file
* @param threadNum Thread number of current thread
* @param crawlConfig Crawler configuration
* @param caller Caller of this thread - the Crawl Manager
*/
public CrawlThread (int threadNum, CrawlConfig crawlConfig, CrawlManager caller) throws IOException
{
super();
setRunning(true);
totalTime -= new Date().getTime();
initTime -= new Date().getTime();
this.crawlConfig = crawlConfig; this.threadNum = threadNum; this.crawlManager = caller;
numThreads = crawlConfig.getNumThreads(); tempDoc = new IndexableDoc(); tempBinding = new IndexableDocBinding();
tempFile = null; fileIW = caller.getIw();
cdoc = new ClassifyDoc(); docsProcessed = 0;
dbt = Constants.getDbt();
//*-- create the RAM based Lucene IndexWriter
analyzer = new StandardBgramAnalyzer(); analyzer.setExtractEntities(true);
ramDir = new RAMDirectory(); //Similarity.setDefault(new SearchSimilarity());
createRamIW();
initTime += new Date().getTime();
}
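//*-- Typical usage (a sketch; only the constructor signature below comes from this class,
//*-- the manager's startup loop is an assumption):
//*--   CrawlThread t = new CrawlThread(i, crawlConfig, crawlManager); t.start();
//*-- The caller later joins the thread, or calls endThread() to ask it to stop.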
/**
* Scan the contents of the task file. Each line contains the path to an indexable document
* on the filesystem. Use the thread number and number of threads to distribute the load evenly
* across all threads. The text for each file is extracted, classified, and indexed.
*/
public void run()
{
//*-- read the filters file and get a generic handler to extract text
fileReadTime -= new Date().getTime();
Properties props = null; StarHandler sh = null;
try { props = new Properties(); props.load(new BufferedInputStream(new FileInputStream(Constants.FILTER_FILE)));
sh = new StarHandler(props); }
catch (IOException e) { crawlManager.cleanUp("Could not read filtersFile " + e.getMessage()); }
if (sh == null) crawlManager.cleanUp("The generic handler to filter files was not found");
//*-- read the document types file to classify documents into types - text, image, audio, book, letter, etc.
Properties dprops = null;
try { dprops = new Properties(); dprops.load(new BufferedInputStream(new FileInputStream(Constants.DOCTYPES_FILE))); }
catch (IOException e) { crawlManager.cleanUp("Could not read doctypesFile " + e.getMessage()); }
//*-- read the media suffixes and associated file types properties file
Properties mprops = new Properties();
try { mprops.load(new BufferedInputStream(new FileInputStream( Constants.MEDIATYPES_FILE))); }
catch ( IOException e) { crawlManager.cleanUp("Could not read media types file " + e.getMessage()); }
fileReadTime += new Date().getTime();
currentDoc = crawlManager.getCurrentDoc(threadNum);
long startMemory = Runtime.getRuntime().freeMemory();
logger.info("Free memory at start of scan: " + startMemory);
//*-- scan the list of files and process depending on thread number
String iDocument = ""; IndexableDoc idoc = null; boolean completed = false;
FILELOOP: while (!completed)
{
//*-- distribute the load evenly
if ( (++currentDoc % numThreads) != threadNum ) continue FILELOOP;
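//*-- e.g. with numThreads = 4, the thread with threadNum = 2 handles documents 2, 6, 10, ...;
//*-- every document index maps to exactly one thread, so no locking is needed to share the task file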
//*-- for a restart skip documents scanned earlier
if (currentDoc < crawlConfig.getStartPosition()) continue FILELOOP;
//*-- terminate if all files have been processed
if (currentDoc >= crawlManager.totalNumFiles() ) { completed = true; break FILELOOP; }
//*-- periodically synchronize the indexes
syncTime -= new Date().getTime();
long freeMemory = Runtime.getRuntime().freeMemory();
// if (freeMemory < startMemory) mergeIndexes(false);
if ( ( (currentDoc % 1000) == 0) && (docsProcessed > 0) ) mergeIndexes(false);
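//*-- merging every 1000 documents bounds the growth of the RAM based index between flushes to disk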
syncTime += new Date().getTime();
try
{
//*-- check if the document was indexed prior to the modification date of the file
iDocument = crawlManager.getFileName(currentDoc); initDoc(iDocument, tempDoc);
//logger.info(currentDoc + ": Started File: " + iDocument + "<----");
if (checkIndexed(iDocument)) continue FILELOOP;
//*-- extract the text from the document and create an indexable document
docFilterTime -= new Date().getTime();
sh.getDocument(iDocument, tempDoc);
docFilterTime += new Date().getTime();
//*-- get the signature of the document text and check for duplicates
if (duplicateDoc()) continue FILELOOP;
//*-- classify the text of the document into one of the document types
String docClass = setClass(tempDoc, dprops, mprops);
//*-- create a new instance of the specific type of document
docLoadTime -= new Date().getTime();
Class docType = Class.forName(docClass);
idoc = (IndexableDoc) docType.newInstance();
idoc.loadGeneric(tempDoc); //*-- copy generic information to the instance
classifyTextTime -= new Date().getTime();
idoc.loadSpecific(cdoc); //*-- create the specific information for the type of document
classifyTextTime += new Date().getTime();
docLoadTime += new Date().getTime();
//*-- increment the log for the type of document in the caller
crawlManager.updateLogdata( idoc.getFileType() );
//*-- index the document and extract entities
setIndex(idoc); initDoc("", idoc);
docsProcessed++;
// if (docsProcessed == 5000) completed = true;
}
catch (ClassNotFoundException ce)
{ logger.error("Could not get doc class : " + ce.getMessage()); }
catch (InstantiationException ie)
{ logger.error("Could not instantiate doc class: " + ie.getMessage() ); }
catch (IllegalAccessException ae)
{ logger.error("Could not access class: " + ae.getMessage()); }
catch (IOException ie)
{ logger.error("IO Error : " + ie.getMessage()); }
//*-- save the state for errors and exceptions for a restart of the thread
catch (OutOfMemoryError oe)
{ logger.error("Ran out of memory, will restart : " + oe.getMessage());
completed = true; crawlManager.setEnoughResources(false);
crawlManager.setCurrentDoc(threadNum, currentDoc - 1);
}
finally
{ logger.info(currentDoc + ": Finished File: " + iDocument + "<---- " + freeMemory);
if (!isRunning()) completed = true; //*-- if forced to stop by BuildIndex
}
} //*-- end of while
logger.info("Processed " + docsProcessed + " documents");
mergeIndexes(true);
dbt.closeDB();
totalTime += new Date().getTime();
updateTimers();
endThread();
}
/**
* Sync. the Berkeley DB environment and merge the Lucene indexes
* @param last Flag to indicate that this is the final call, after which no new RAM index is allocated
*/
public void mergeIndexes(boolean last)
{
logger.info("Sync. environment and indexes");
Directory[] dirs = {ramDir};
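//*-- the RAM writer must be closed first so that buffered documents are flushed to ramDir
//*-- before its contents are merged into the file system index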
synchronized(this)
{ try { dbt.syncEnv(); ramIW.close(); fileIW.addIndexes(dirs); }
catch (IOException ie ) { crawlManager.cleanUp("Could not merge Lucene indexes " + ie.getMessage() ); }
}
//*-- if not last iteration allocate the RAM index
if (!last) { createRamIW(); tempDoc = new IndexableDoc(); }
// Runtime.getRuntime().gc();
}
/**
* Return true or false depending on whether the file was indexed earlier
* @param iDocument Path of the document to check
* @return true if the document was indexed after its last modification date
*/
private boolean checkIndexed(String iDocument)
{
dbTime -= new Date().getTime();
boolean indexedEarlier = false; boolean noCreateflag = false; boolean dupFlag = false;
dbt.openDB(Constants.EXT_FILES_DB, noCreateflag, dupFlag);
DatabaseEntry dbData = new DatabaseEntry();
if (dbt.fetch(iDocument, dbData) )
{ long lastModified = new File(iDocument).lastModified(); //*-- get the last modified date
tempDoc = (IndexableDoc) tempBinding.entryToObject(dbData);
//*-- if indexed earlier skip
if (tempDoc.getIdate() > lastModified)
{ crawlManager.updateLogdata("indexed earlier"); indexedEarlier = true; }
//*-- otherwise, delete the existing index for the document
//*-- should occur only for modified documents
else
{ try
{ synchronized(this)
{ FSDirectory fsd = FSDirectory.getDirectory(new File(Constants.getINDEXDIR()), false);
IndexReader ir = IndexReader.open(fsd);
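//*-- remove the stale Lucene entry for the modified file so it can be re-indexed;
//*-- the "key" field is assumed to hold the file path used as the unique document key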
ir.deleteDocuments(new Term("key", iDocument ) );
ir.close();
}
}
catch (IOException ie) { logger.error("Failed to delete " + iDocument + " " + ie.getMessage() ); }
}
}
dbt.closeDB();
dbTime += new Date().getTime();
return(indexedEarlier);
}
/**
* Return true or false depending on whether the file content is a duplicate of another document
* @return true if the document text matches the stored signature of a previously indexed document
*/
private boolean duplicateDoc()
{
checkDupTime -= new Date().getTime();
long contentsLen = (tempDoc != null) ? tempDoc.getContents().length(): 0;
String signatureKey = (contentsLen < Constants.DB_DOC_STORE_LIMIT) ?
StringTools.shaDigest(tempDoc.getContents().toString()):
StringTools.shaDigest(tempDoc.getContents().substring(0, Constants.DB_DOC_STORE_LIMIT));
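//*-- the signature is a SHA digest of at most DB_DOC_STORE_LIMIT characters of text, matching the
//*-- truncation applied when documents are stored, so identical (truncated) content hashes to the same key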
//*-- check for a duplicate
boolean duplicateDoc = false; boolean noCreateflag = false; boolean dupFlag = false;
if (!crawlConfig.isKeepDups())
{
dbt.openDB(Constants.EXT_FILES_DB, noCreateflag, dupFlag);
dbt.openSecDB(Constants.EXT_FILES_SECDB, false, new SecKeyDoc( tempBinding ) );
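//*-- the secondary database is keyed by the content signature, so fetchSec returns any
//*-- previously stored document whose text produced the same SHA digest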
DatabaseEntry data = dbt.fetchSec(signatureKey);
if (data != null)
{
IndexableDoc tDoc = (IndexableDoc) tempBinding.entryToObject(data);
String filename = tDoc.getFileName();
//*-- verify that the file exists in the filesystem, a file may have been moved
File tfile = new File(filename);
if (tfile.exists())
{ logger.info("Duplicate object" + tempDoc.getFileName() + " of file: " + filename);
crawlManager.updateLogdata( "duplicates" ); duplicateDoc = true; }
else
{ logger.info("Duplicate object" + tempDoc.getFileName() + " of file: " + filename + " does not exist in filesystem"); }
} //*-- end of inner if
dbt.closeSecDB(); dbt.closeDB();
} //*-- end of outer if
if (!duplicateDoc) tempDoc.setFileSignature(signatureKey);
checkDupTime += new Date().getTime();
return(duplicateDoc);
}
/**
* Classify the document into a file type - book, article, letter, webpage, etc.
* and return the class to handle the particular document type
* @param doc Indexable document
* @param dprops Properties mapping document types to handler class names
* @param mprops Properties mapping media file suffixes to file types
* @return String Name of the class that handles the type of document
*/
public String setClass(IndexableDoc doc, Properties dprops, Properties mprops)
{
//*-- set the file type
classifyFtypeTime -= new Date().getTime();
String ftype = doc.getFileType();
if (ftype.equals("text")) ftype = cdoc.classifyFileType(doc);
else if (ftype.equals("media"))
{ ftype = mprops.getProperty(StringTools.getSuffix( doc.getFileName()) );
if ( (ftype == null) || (ftype.equals("") ) ) ftype = "unknown"; }
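//*-- e.g. a file ending in .mp3 might map to a file type such as "audio" (hypothetical mapping;
//*-- the actual suffix-to-type entries come from the media types properties file)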
doc.setFileType(ftype);
classifyFtypeTime += new Date().getTime();
//*-- find the class for the type of document
String docClass = dprops.getProperty(ftype);
if ( (docClass == null) || (docClass.equals("")) )
{ docClass = "org.sf.mustru.docs.TextDoc";
logger.error("Failed to get a file type for " + ftype);
}
return (docClass);
}
/**
* Create the Lucene index for the document and optionally extract entities using Annie
* @param idoc Document to index and store in the database
* @throws IOException
*/
public void setIndex(IndexableDoc idoc) throws IOException
{
if (idoc == null) return;
//*-- create the Lucene index
luceneTime -= new Date().getTime();
try { idoc.loadIndex(ramIW, false); }
catch (IOException ie) { logger.error("Could not create index for " + idoc.getFileName() + " " + ie.getMessage() ); }
luceneTime += new Date().getTime();
//*-- store the entry in the database, limit the length of the content stored in the database
//*-- create the key and data to store the entry
docStoreTime -= new Date().getTime();
dbt.openDB(Constants.EXT_FILES_DB, false);
dbt.openSecDB(Constants.EXT_FILES_SECDB, false, new SecKeyDoc( tempBinding ) );
int contentSizeLimit = Constants.DB_DOC_STORE_LIMIT;
int textSize = idoc.getContents().length();
if (textSize > contentSizeLimit )
idoc.setContents( (new StringBuffer( idoc.getContents().substring(0, contentSizeLimit).trim() ) ) );
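//*-- only the first DB_DOC_STORE_LIMIT characters are persisted in the database;
//*-- the full text has already been added to the Lucene index above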
idoc.setIdate(new Date().getTime() );
boolean allowDups = !crawlConfig.isFreshCrawl();
if (!dbt.insert( idoc.getFileName(), idoc, allowDups) )
logger.error("Could not store document " + idoc.getFileName() + " in the database");
dbt.closeSecDB(); dbt.closeDB();
//*-- train the spell checker
if (crawlConfig.isSpellCheck())
synchronized(this) { crawlManager.getSc().train(idoc.getContents().toString()); }
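//*-- note: this lock is on the per-thread instance; if the shared spell checker is not thread safe,
//*-- a lock on a shared object (e.g. the crawl manager) would be needed for cross-thread exclusion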
docStoreTime += new Date().getTime();
}
// *-- initialize the passed indexable doc
private void initDoc(String filename, IndexableDoc doc)
{
doc.setAuthor(""); doc.setBdbBinding(tempBinding); doc.setContextFile("");
doc.setFileLength(0); doc.setFileLocation(""); doc.setFileName(filename);
doc.setFileRank(0); doc.setFileSignature(""); doc.setIdate(0);
doc.setLanguage(""); doc.setMdate(0); doc.setMetadata("");
doc.setSummary(""); doc.setTitle("");
doc.setFileType(""); doc.setContents(new StringBuffer());
if (!(filename.equals(""))) {
tempFile = new File(filename);
doc.setFileLength(tempFile.length());
doc.setMdate(tempFile.lastModified());
}
}
/**
* Update the timers for this thread in the caller instance
*/
public void updateTimers()
{
int threadNum = getthreadNum();
miscTime = totalTime
- (initTime + fileReadTime + dbTime + docFilterTime + checkDupTime
+ classifyFtypeTime + classifyTextTime + docLoadTime + luceneTime
+ docStoreTime + syncTime);
crawlManager.updateTimers(threadNum, "initTime", initTime);
crawlManager.updateTimers(threadNum, "fileReadTime", fileReadTime);
crawlManager.updateTimers(threadNum, "dbTime", dbTime);
crawlManager.updateTimers(threadNum, "docFilterTime", docFilterTime);
crawlManager.updateTimers(threadNum, "checkDupTime", checkDupTime);
crawlManager.updateTimers(threadNum, "classifyFtypeTime", classifyFtypeTime);
crawlManager.updateTimers(threadNum, "classifyTextTime", classifyTextTime);
crawlManager.updateTimers(threadNum, "docLoadTime", docLoadTime);
crawlManager.updateTimers(threadNum, "luceneTime", luceneTime);
crawlManager.updateTimers(threadNum, "docStoreTime", docStoreTime);
crawlManager.updateTimers(threadNum, "syncTime", syncTime);
crawlManager.updateTimers(threadNum, "miscTime", miscTime);
crawlManager.updateTimers(threadNum, "totalTime", totalTime);
}
/**
* Create a RAM based Lucene index writer
*/
private void createRamIW()
{ try { ramIW = new IndexWriter(ramDir, analyzer, true);
ramIW.setMaxFieldLength(Constants.LUCENE_MAX_WORDS); }
catch (IOException ie) { logger.error("Could not create RAM index writer " + ie.getMessage()); }
}
public void endThread()
{
setRunning(false);
}
public boolean isRunning()
{
return running;
}
public void setRunning(boolean running)
{
this.running = running;
}
public int getthreadNum()
{
return this.threadNum;
}
// *-- timers for profiling
long initTime = 0; // *-- loading constructors
long fileReadTime = 0; // *-- reading file time
long dbTime = 0; // *-- database access time
long docFilterTime = 0; // *-- document conversion to text
long checkDupTime = 0; // *-- time to check for duplicates
long classifyFtypeTime = 0; // *-- time to classify document into one of the file types
long classifyTextTime = 0; // *-- time to classify document text
long docLoadTime = 0; // *-- time to create a specific instance of a document
long luceneTime = 0; // *-- indexing time
long docStoreTime = 0; // *-- time to store the document in the database
long syncTime = 0; // *-- time to synchronize the database and merge RAM indexes with file index
long miscTime = 0; // *-- time for other misc. activities
long totalTime = 0; //*-- time to complete the scan
}