package org.sf.mustru.crawl;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.eclipse.core.internal.runtime.HashMapOfString;
import org.sf.mustru.crawl.CrawlManager;
import org.sf.mustru.utils.Constants;
import org.sf.mustru.utils.DirTools;
/**
 * Thread that builds the crawl task file: a list of files to crawl, written
 * to the path given by Constants.TASK_FILE.
 *
 * The include directories from the crawl configuration are scanned, duplicate
 * file names are removed, and any file whose name starts with one of the
 * exclude directories is dropped. On a restart (start position other than -1)
 * the existing task file is kept and only its lines are counted.
 */
public class CrawlTask extends Thread
{
  static Logger logger = Logger.getLogger(CrawlTask.class.getName());

  private CrawlConfig crawlConfig;  //*-- crawler configuration parameters
  private CrawlManager ctRef;       //*-- reference to caller
  private int numFiles = 0;         //*-- number of files processed

  /**
   * @param crawlConfig crawler configuration (include/exclude directories, scan flags, start position)
   * @param ctRef the crawl manager that started this task; receives timing updates and error clean up calls
   */
  public CrawlTask(CrawlConfig crawlConfig, CrawlManager ctRef)
  {
    this.crawlConfig = crawlConfig;
    this.ctRef = ctRef;
  }

  /**
   * Build the task file, or on a restart just count the entries of the existing one.
   *
   * The elapsed wall-clock time is added to ctRef.fileReadTime on every exit path.
   * If an I/O error occurs, ctRef.cleanUp is called exactly once with the error
   * message and the remaining steps are skipped.
   */
  public void run()
  {
    logger.info("Start creating task file for crawl");
    ctRef.fileReadTime -= System.currentTimeMillis();
    try
    {
      //*-- before creating a task file, check for a restart
      if (crawlConfig.getStartPosition() != -1)
      {
        setNumFiles(countLines(Constants.TASK_FILE));
        return;
      }
      if (buildTaskFile(Constants.TASK_FILE, Constants.FILTER_FILE))
        logger.info("Finished creating the task file");
    }
    finally
    {
      //*-- keep the timer balanced even if an unexpected runtime exception escapes
      ctRef.fileReadTime += System.currentTimeMillis();
    }
  }

  public int getNumFiles()
  { return numFiles; }

  public void setNumFiles(int numFiles)
  { this.numFiles = numFiles; }

  /**
   * Scan the include directories, then rewrite the task file with the unique,
   * non-excluded file names.
   *
   * @return true if the task file was written, false if an I/O error was
   *         reported to the crawl manager
   */
  private boolean buildTaskFile(String taskFile, String filtersFile)
  {
    String[] incDirs = splitDirs(crawlConfig.getIncDirs());
    String[] excDirs = splitDirs(crawlConfig.getExcDirs());
    boolean skipHidden = crawlConfig.isSkipHidden();
    boolean followLinks = crawlConfig.isFollowLinks();

    HashMapOfString uniqueNames;
    try
    {
      truncate(taskFile);

      //*-- Run the directory scan for each directory in the list
      //*-- NOTE(review): dirScan presumably appends the file names it finds to taskFile - confirm in DirTools
      DirTools dirTools = new DirTools();
      for (int i = 0; i < incDirs.length; i++)
      {
        logger.info("Start scanning " + incDirs[i]);
        dirTools.dirScan(incDirs[i], filtersFile, taskFile, skipHidden, followLinks);
        logger.info("Finished scanning " + incDirs[i]);
      }

      uniqueNames = readUniqueNames(taskFile);
    }
    catch (IOException ie)
    {
      //*-- stop here: rewriting the task file from a partial scan would hide the failure
      ctRef.cleanUp("IO Error in task file " + ie.getMessage());
      return false;
    }

    try
    {
      writeTaskFile(taskFile, uniqueNames.keys(), excDirs);
    }
    catch (IOException ie)
    {
      ctRef.cleanUp("IO Error: " + ie.getMessage());
      return false;
    }
    return true;
  }

  /**
   * Count the lines of the task file. An I/O error is logged and the number
   * of lines read so far is returned.
   */
  private int countLines(String taskFile)
  {
    int lines = 0;
    LineNumberReader reader = null;
    try
    {
      reader = new LineNumberReader(new FileReader(taskFile));
      while (reader.readLine() != null) lines++;
    }
    catch (IOException ie)
    { logger.error("IO Error in task file " + taskFile + " " + ie.getMessage()); }
    finally
    { closeQuietly(reader); }
    return lines;
  }

  /**
   * Split a semicolon separated directory list. A null or empty list yields
   * an empty array, and empty entries (as in "a;;b") are dropped so that they
   * are neither scanned nor treated as a prefix that matches every file.
   */
  private static String[] splitDirs(String dirList)
  {
    if (dirList == null || dirList.length() == 0) return new String[0];
    String[] parts = dirList.split(";");
    int kept = 0;
    for (int i = 0; i < parts.length; i++)
      if (parts[i].length() > 0) parts[kept++] = parts[i];
    String[] dirs = new String[kept];
    System.arraycopy(parts, 0, dirs, 0, kept);
    return dirs;
  }

  /** Create the task file if necessary and cut it back to zero length. */
  private static void truncate(String taskFile) throws IOException
  {
    RandomAccessFile tfile = new RandomAccessFile(taskFile, "rw");
    try { tfile.setLength(0); }
    finally { tfile.close(); }
  }

  /** Read every line of the task file into a map keyed by file name, which removes duplicates. */
  private HashMapOfString readUniqueNames(String taskFile) throws IOException
  {
    logger.info("Removing duplicate file names from task file");
    HashMapOfString names = new HashMapOfString();
    BufferedReader filein = new BufferedReader(new FileReader(taskFile));
    try
    {
      String line;
      while ((line = filein.readLine()) != null) names.put(line, "");
    }
    finally
    { closeQuietly(filein); }
    return names;
  }

  /**
   * Overwrite the task file with the given names, skipping excluded ones, and
   * record the number of names written via setNumFiles (also on failure).
   *
   * @throws IOException if the file cannot be opened or a write error is detected
   */
  private void writeTaskFile(String taskFile, String[] names, String[] excDirs) throws IOException
  {
    int written = 0;
    PrintWriter outp = new PrintWriter(new FileWriter(taskFile));  //*-- FileWriter truncates the file
    try
    {
      for (int i = 0; i < names.length; i++)
      {
        if (names[i] == null || isExcluded(names[i], excDirs)) continue;
        outp.println(names[i]);
        written++;
      }
      //*-- PrintWriter never throws on write; checkError flushes and reports a swallowed failure
      if (outp.checkError()) throw new IOException("could not write " + taskFile);
    }
    finally
    {
      outp.close();
      setNumFiles(written);
    }
  }

  /**
   * Check if the file name starts with one of the exclusion directories. The
   * directory is compared literally, so characters such as backslash, dot or
   * parenthesis in a path have no special meaning.
   */
  private static boolean isExcluded(String fileName, String[] excDirs)
  {
    for (int j = 0; j < excDirs.length; j++)
      if (fileName.startsWith(excDirs[j])) return true;
    return false;
  }

  /** Close the reader if it was opened; a failure to close is logged and otherwise ignored. */
  private static void closeQuietly(BufferedReader reader)
  {
    if (reader == null) return;
    try { reader.close(); }
    catch (IOException ie) { logger.error("Ignore error", ie); }
  }
}