package org.sf.mustru.crawl;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.util.Date;
import java.util.Enumeration;
import java.util.ArrayList;
import java.util.Properties;
import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.docs.SecKeyDoc;
import org.sf.mustru.search.SearchSimilarity;
import org.sf.mustru.utils.Constants;
import org.sf.mustru.utils.DbTools;
import org.sf.mustru.utils.StandardBgramAnalyzer;
import org.sf.mustru.utils.StandardBgramTokenizerFactory;
import org.sf.mustru.utils.StringTools;
import org.sf.mustru.utils.TrainSpellChecker;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.eclipse.core.internal.runtime.HashMapOfString;
import com.aliasi.lm.NGramProcessLM;
import com.aliasi.spell.FixedWeightEditDistance;
import com.aliasi.tokenizer.TokenizerFactory;
/**
* Manage the crawl <br>
* <ol>
* <li> Start the crawl - open the index and databases <br>
* <li> Read the task file from crawlTask and start the individual crawl threads <br>
* <li> Wait for the crawl threads to end and dump statistics <br>
* </ol>
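* <br>
* A rough sketch of the expected lifecycle (the CrawlConfig setup is elided and is assumed to be
* built by the caller): <br>
* <pre>
*   CrawlConfig config = ...;                                    // configured by the caller
*   CrawlManager manager = CrawlManager.getCrawlManager();       // singleton; opens the DB environment
*   manager.initCrawl(config);                                   // open the Lucene index and databases
*   CrawlThread[] threads = new CrawlThread[config.getNumThreads()];
*   manager.startThreads(config, threads);                       // read the task file and start the threads
*   manager.endThreads(threads);                                 // wait for the threads to finish
*   String stats = manager.dumpLogdata(config.getNumThreads());  // dump crawl statistics
*   manager.cleanUp("");                                         // "" indicates a normal shutdown
* </pre>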
*/
public class CrawlManager
{
static Logger logger = Logger.getLogger(CrawlManager.class.getName());
private static CrawlManager ctRef = null;
//*-- timers and statistics
private HashMapOfString stats = new HashMapOfString(); //*-- a hash to keep track of statistics by media type
private String[] timers = Constants.getTIMERS(); //*-- a list of timers
private HashMapOfString timerHash = new HashMapOfString(); //*-- hash to keep track of timers of various events
private String[] docTypes = null; //*-- types of documents that will be processed
//*-- Lucene vars
private IndexWriter iw = null; //*-- Filesystem based indexWriter object for Lucene
private FSDirectory fsd; //*-- Filesystem directory to store the index
private Analyzer analyzer; //*-- tokenizer for the search engine
//*-- Lingpipe vars
private TrainSpellChecker sc = null; //*-- spell checker from Lingpipe
private final static int NGRAM_LENGTH = 5; private final static double MATCH_WEIGHT = -0.0;
private final static double DELETE_WEIGHT = -4.0; private final static double INSERT_WEIGHT = -1.0;
private final static double SUBSTITUTE_WEIGHT = -2.0; private final static double TRANSPOSE_WEIGHT = -2.0;
//*-- vars for the task file
private RandomAccessFile taskFile = null; //*-- handle to read the task file
private long[] filePos = null; //*-- list of file positions in task file
private int[] currentDoc = null; //*-- current document positions in each thread
private boolean enoughResources; //*-- set to false when a thread runs out of memory or other resources
/**
* Private constructor: use {@link #getCrawlManager()} to obtain the single instance of this class
*/
private CrawlManager() { }
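/**
* Return the single instance of this class, opening the Berkeley DB environment on first use.
* Only one instance may exist at a time; call {@link #resetCrawlManager()} before starting a new crawl.
* @return CrawlManager instance
* @throws RuntimeException if an instance already exists
*/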
public static CrawlManager getCrawlManager() throws RuntimeException
{ if (ctRef != null)
{ logger.warn("Cannot run two instances of CrawlManager");
throw new RuntimeException("Cannot run two instances of CrawlManager"); }
//*-- instantiate a new crawl
ctRef = new CrawlManager();
if (Constants.getDbt() == null)
{ DbTools dbt = new DbTools();
dbt.openEnv(Constants.getDBDIR(), false); Constants.setDbt(dbt);
}
return(ctRef);
}
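/**
* Discard the current instance so that a new crawl can be started
*/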
public static void resetCrawlManager() { ctRef = null; }
/**
* Initialize the crawl. <br><br>
*
* 1. Create a Lucene IndexWriter <br>
* 2. Create the Berkeley databases <br>
* 3. Initialize a timer hash <br>
*
* @param crawlConfig Crawl configuration object
*/
public void initCrawl(CrawlConfig crawlConfig)
{
boolean freshCrawl = crawlConfig.isFreshCrawl();
int numThreads = crawlConfig.getNumThreads();
initTime = -new Date().getTime();    //*-- start the init timer (end time is added at the end of initCrawl)
Properties props = new Properties();
try { props.load(new BufferedInputStream(new FileInputStream(Constants.DOCTYPES_FILE))); }
catch ( IOException e) { ctRef.cleanUp("Could not open " + Constants.DOCTYPES_FILE + " " + e.getMessage()); }
//*-- load the types of media from the docTypes properties file
docTypes = new String[props.size()]; int i = 0; Integer integerZero = new Integer(0);
for (Enumeration keys = props.propertyNames(); keys.hasMoreElements(); )
{ String key = (String) keys.nextElement(); stats.put(key, integerZero.toString() );
docTypes[i++] = key; }
stats.put("duplicates", integerZero.toString() );
stats.put("indexed earlier", integerZero.toString() );
//*-- create the single filesystem based Lucene IndexWriter
//*-- freshIndex value: -1 = follow the freshCrawl flag, 0 = false, otherwise true
boolean freshIndex = (crawlConfig.getFreshIndex() == -1) ? freshCrawl:
(crawlConfig.getFreshIndex() == 0) ? false: true;
try
{
fsd = FSDirectory.getDirectory(new File(Constants.getINDEXDIR()), freshIndex);
analyzer = new StandardBgramAnalyzer(); iw = new IndexWriter(fsd, analyzer, freshIndex);
iw.setSimilarity(new SearchSimilarity());
ctRef.setIw(iw);
}
catch (IOException ie) { ctRef.cleanUp("Could not get IndexWriter " + ie.getMessage() ); }
//*-- create the spell checker
if (crawlConfig.isSpellCheck())
{ FixedWeightEditDistance fixedEdit = new FixedWeightEditDistance( MATCH_WEIGHT, DELETE_WEIGHT, INSERT_WEIGHT,
SUBSTITUTE_WEIGHT, TRANSPOSE_WEIGHT);
NGramProcessLM lm = new NGramProcessLM(NGRAM_LENGTH);
TokenizerFactory tokenizerFactory = new StandardBgramTokenizerFactory(false); //*-- do not extract entities
try { if ( (new File(Constants.SPELL_TRAIN_MODEL).exists() ) && !crawlConfig.isFreshCrawl() )
lm = readModel(Constants.SPELL_TRAIN_MODEL);
sc = new TrainSpellChecker(lm, fixedEdit, tokenizerFactory); }
catch (IOException ie) { logger.error("IO Error: Could not read spell train file " + ie.getMessage()); }
catch (ClassNotFoundException ce) { logger.error("Class error: " + ce.getMessage()); }
}
//*-- Create the databases, if necessary
DbTools dbt = Constants.getDbt();
boolean createFlag = true; boolean dupFlag = false;
if (freshCrawl)
{
//*-- initialize and create a new database for the list of extracted files and a secondary
//*-- database for the file signature
dbt.dropDB(Constants.EXT_FILES_DB);
dbt.createDB(Constants.EXT_FILES_DB, createFlag, dupFlag);
logger.info("Created py. database " + Constants.EXT_FILES_DB);
dbt.dropSecDB(Constants.EXT_FILES_SECDB);
SecKeyDoc skd = new SecKeyDoc( new IndexableDoc().getBdbBinding() );
if (dbt.createSecDB(Constants.EXT_FILES_SECDB, false, skd))
logger.info("Created sec. database " + Constants.EXT_FILES_SECDB);
dbt.closeSecDB(); dbt.closeDB();
//*-- create the messages database
dbt.dropDB(Constants.EXT_MESSAGES_DB);
dbt.createDB(Constants.EXT_MESSAGES_DB, createFlag, dupFlag);
logger.info("Created py. database " + Constants.EXT_MESSAGES_DB);
dbt.closeDB();
}
//*-- initialize the timer hash
timers = Constants.getTIMERS();
for (int j = 0; j < timers.length ; j++)
for (int k = 0; k < numThreads; k++)
timerHash.put(timers[j] + "_" + k, " ");
initTime += new Date().getTime();
}
/**
* Scan the contents of the task file. Each line contains the path to an indexable document
* on the filesystem. Use the thread number and number of threads to distribute the load evenly
* across all threads. The text for each file is extracted, classified, and indexed.
*
* @param crawlConfig Crawl configuration object
* @param crawlThread List of index doc threads
*/
public void startThreads(CrawlConfig crawlConfig, CrawlThread[] crawlThread)
{
//*-- read the list of files to process into an array
int numThreads = crawlConfig.getNumThreads();
ArrayList<Long> fileList = new ArrayList<Long>();
try
{ taskFile = new RandomAccessFile(Constants.TASK_FILE, "r"); taskFile.seek(0);
int i = 0; fileList.add(i, new Long( taskFile.getFilePointer() ) );
String fileName = "";
LOOP: while ((fileName = taskFile.readLine()) != null)
{ if ( (crawlConfig.getStartPosition() <= 0) && (!(new File(fileName).canRead())) )
continue LOOP; //*-- make sure that the file is readable before adding to the list
i++;
fileList.add(i, new Long( taskFile.getFilePointer() ) );
}
}
catch (FileNotFoundException fe)
{ cleanUp("Could not find task file " + Constants.TASK_FILE + " " + fe.getMessage()); }
catch (IOException ie)
{ cleanUp("Could not read task file " + Constants.TASK_FILE + " " + ie.getMessage()); }
//*-- set the file positions for each file in the task file
filePos = new long[fileList.size() - 1];
for (int i = 0; i < filePos.length; i++) filePos[i] = ( (Long) fileList.get(i)).longValue();
fileList = null;
//*-- set the Lucene parameter for buffering documents
iw.setMaxBufferedDocs( (totalNumFiles() > 9) ? Constants.LUCENE_MAX_BUFFERED_DOCS: 2);
//*-- initially, set the current document positions and passage counts
if (currentDoc == null)
{ currentDoc = new int[numThreads];
for (int i = 0; i < currentDoc.length; i++) currentDoc[i] = -1;
}
//*-- Start threads to process documents in the task file, stagger the start of threads
for (int i = 0; i < numThreads; i++)
{
try { crawlThread[i] = new CrawlThread (i, crawlConfig, this); }
catch (IOException ie) { cleanUp("Could not create independent threads " + ie.getMessage() ); }
crawlThread[i].start();
try { Thread.sleep(100); } catch (InterruptedException e) { }
}
return;
} //*-- end of startThreads
/**
* Increment the log to track the number and type of files processed
* @param docType type of document (media type) that was processed
*/
public synchronized void updateLogdata (String docType)
{
if (docType == null) docType = "unknown";
String countStr = stats.get(docType);
if (countStr == null) { docType = "unknown"; countStr = stats.get(docType); } //*-- fall back to "unknown" for untracked types
int count = (countStr == null) ? 0 : Integer.parseInt(countStr);
stats.put(docType, Integer.toString(count + 1));
} //*-- end of update log
/**
* Dump the log to a string
* @return String containing the statistics for the crawl
*/
public String dumpLogdata(int numThreads)
{
StringBuffer logData = new StringBuffer(); int numFiles = 0;
Integer io; String newLine = Constants.NEWLINE;
logData.append("----------------------------------------------" + newLine);
for (int i = 0; i < docTypes.length; i++)
{ io = Integer.parseInt( stats.get( docTypes[i] ) );
logData.append("No. of " + docTypes[i] + " files: " + io.intValue() + newLine);
numFiles += io.intValue();
}
io = Integer.parseInt(stats.get("duplicates") );
logData.append("No. of duplicates: " + io.intValue() + newLine);
numFiles += io.intValue();
io = Integer.parseInt(stats.get("indexed earlier") );
logData.append("No. indexed earlier: " + io.intValue() + newLine);
numFiles += io.intValue();
logData.append("----------------------------------------------" + newLine);
logData.append("Total no. of files: " + numFiles + newLine);
return (logData.toString());
} //*-- end of dump log
/**
* Clean up at the end of the crawl or on an error: close the Lucene index, the task file, and the
* database environment, and write out the spell checker models
* @param msg error message, or an empty string for a normal shutdown
*/
public synchronized void cleanUp(String msg)
{
//*-- clean up the Lucene index
try { if (iw != null)
{ logger.info("Closing Lucene index...");
//iw.optimize();
iw.close(); } }
catch (IOException ie) { logger.error("Could not close Lucene index " + ie.getMessage() ); }
//*-- close the task file
try { if (taskFile != null) taskFile.close(); }
catch (IOException ie) { /* ignore errors while closing the task file */ }
if (Constants.getDbt() != null) Constants.getDbt().closeEnv();
//*-- dump the spell checker model
if (sc != null)
{ try { sc.pruneTokens(5);
BufferedOutputStream bufOut = new BufferedOutputStream( new FileOutputStream(Constants.SPELL_CHECK_MODEL));
ObjectOutputStream objOut = new ObjectOutputStream(bufOut); sc.compileTo(objOut);
objOut.close(); bufOut.close();
bufOut = new BufferedOutputStream( new FileOutputStream(Constants.SPELL_TRAIN_MODEL));
objOut = new ObjectOutputStream(bufOut); sc.dumpTo(objOut);
objOut.close(); bufOut.close();
}
catch (IOException ie) { logger.error("IO Error: " + ie.getMessage()); }
}
if (!msg.equals(""))
{ logger.error(msg); logger.error("ERROR: This thread was aborted"); }
resetCrawlManager();
return;
}
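/**
* Print a profile of the crawl: the per-thread timers followed by the timers for the main thread
* @param numThreads number of crawl threads
*/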
public void dumpTimers(int numThreads)
{
//*-- dump the headers
System.out.println("");
System.out.println("Profile of Crawl");
System.out.println(""); System.out.print("Timer Type\t\t");
for (int i = 0; i < numThreads; i++) System.out.print( StringTools.fillin("Thread " + i, 10, true, ' ', 2) );
System.out.println("");
System.out.println("-------------------------------------------------------------------------------------------------------");
//*-- dump the timers
for (int i = 0; i < timers.length; i++)
{
System.out.print(StringTools.fillin(timers[i], 22, true, ' ', 22 - timers[i].length() ) );
for (int j = 0; j < numThreads; j++)
{ String timeT = (String) timerHash.get(timers[i] + "_" + j);
System.out.print( StringTools.fillin( timeT, 10, false, ' ', 10 - timeT.length() ) ); }
System.out.println("");
}
System.out.println("");
System.out.println(""); System.out.print("Timer Type\t\t");
System.out.print( StringTools.fillin("Main Thread", 15, true, ' ', 2) );
System.out.println(""); System.out.println("-----------------------------------------------------");
//*-- dump the timers
System.out.print(StringTools.fillin("initTime", 22, true, ' ', 22 - 8) );
System.out.println(initTime);
System.out.print(StringTools.fillin("fileReadTime", 22, true, ' ', 22 - 12) );
System.out.println(fileReadTime);
}
/**
* Terminate all threads
* @param crawlThread array of crawl threads to stop
*/
public void endThreads (CrawlThread[] crawlThread)
{
//*-- signal all threads to stop running
for (int i = 0; i < crawlThread.length; i++) crawlThread[i].endThread();
//*-- wait till all threads complete...
for (int i = 0; i < crawlThread.length; i++)
{ int numTimes = 0;
while ( (crawlThread[i] != null) && crawlThread[i].isAlive() )
{ logger.info("Waiting for indexing thread to terminate...");
try { Thread.sleep(3000); } catch (InterruptedException ie) { }
if (numTimes++ < 100) continue;     //*-- keep waiting on this thread
crawlThread[i] = null;              //*-- give up after ~100 waits (about 5 minutes)
} //*-- end of while
} //*-- end of for
}
/**
* Crawl Threads call this function to update the timers
* @param threadNum Integer thread number
* @param timerType String type of timer
* @param timerval long time in msecs.
*/
public synchronized void updateTimers(int threadNum, String timerType, long timerval)
{ String key = timerType + "_" + threadNum;
timerHash.put(key, Long.toString(timerval) ); }
/**
* return the file name at the index position in the task file
* @param index line position in the task file
* @return String
* @throws IOException
*/
public String getFileName(int index) throws IOException
{ if (taskFile == null) return ("");
taskFile.seek(filePos[index]); String fileName = taskFile.readLine();
return ( (fileName == null) ? "": fileName.trim() );
}
/**
* return the number of files processed so far
*/
public int getTotalFilesProcessed()
{
String[] keys = stats.keys(); int numFiles = 0;
for (int i = 0; i < keys.length; i++)
numFiles += Integer.parseInt( stats.get(keys[i]) );
return(numFiles);
}
//*-------------------------------------------------------------
//*-- Read the NGramProcessLM model from a file
//*-------------------------------------------------------------
private NGramProcessLM readModel(String filename) throws ClassNotFoundException, IOException
{
//*--- create object input stream from file
BufferedInputStream bufIn = new BufferedInputStream(new FileInputStream(new File(filename)));
ObjectInputStream objIn = new ObjectInputStream(bufIn);
//*-- read the spell checker
NGramProcessLM nLM = NGramProcessLM.readFrom(objIn);
// close the resources and return result
objIn.close(); bufIn.close();
return(nLM);
}
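//*-- getters and setters for shared crawl state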
public IndexWriter getIw()
{ return iw; }
public void setIw(IndexWriter iw)
{ this.iw = iw; }
public synchronized boolean isEnoughResources()
{ return enoughResources; }
public synchronized void setEnoughResources(boolean enoughMemory)
{ this.enoughResources = enoughMemory; }
public int totalNumFiles() { return filePos.length; }
public int getCurrentDoc(int threadNum)
{ return (currentDoc[threadNum]); }
public void setCurrentDoc(int threadNum, int docnum)
{ currentDoc[threadNum] = docnum; }
public TrainSpellChecker getSc()
{ return sc; }
public void setSc(TrainSpellChecker sc)
{ this.sc = sc; }
long initTime = 0;      //*-- time spent in initCrawl (msecs.)
long fileReadTime = 0;  //*-- file read time for the main thread (msecs.)
}