package org.sf.mustru.crawl;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Properties;

import org.apache.log4j.Logger;
import org.eclipse.core.internal.runtime.HashMapOfString;
import org.sf.mustru.utils.Constants;
* Read, update, and store crawl configuration parameters from data/config/mustru.prp
public class CrawlConfig
static Logger logger = Logger.getLogger(CrawlConfig.class.getName());
private HashMapOfString paramHash; //*-- hash of crawler parameters and values
private final String[] PARAMETERS = { //*-- list of crawler parameters
"freshCrawl", //*-- flag to re-scan all documents
"keepDups", //*-- flag to keep duplicate documents
"skipHidden", //*-- flag to skip hidden files/directories i.e. starting with .
"numThreads", //*-- number of threads for this crawl
"spellCheck", //*-- create the spell checker ?
"incDirs", //*-- directories of include in this crawl
"excDirs", //*-- directories to exclude from the crawl
"followLinks", //*-- follow symbolic links
"indexDir", //*-- Lucene index directory
"dbDir", //*-- Berkeley DB dir
"webDir" //*-- optional Web root directory
private int freshIndex = -1; //*-- optional flag to create a fresh Lucene index
private int startPosition = -1; //*-- optional restart position indicator
* Crawler configuration bean class
public CrawlConfig(boolean init)
paramHash = new HashMapOfString();
String defaultIndexDir = Constants.MUSTRU_HOME + Constants.fs + "data" + Constants.fs + "index";
String defaultDbDir = Constants.MUSTRU_HOME + Constants.fs + "data" + Constants.fs + "bdb";
//*-- initialize the crawler configuration with default values
if (init)
{ setSkipHidden(true); setExcDirs(""); setFollowLinks(false); setFreshCrawl(true);
setIncDirs(""); setKeepDups(false); setNumThreads(1); setSpellCheck(false);
setDbDir(defaultDbDir); setIndexDir(defaultIndexDir); setWebDir("");
else loadHash();
Constants.setDBDIR(getDbDir()); Constants.setINDEXDIR(getIndexDir()); Constants.setWEBDIR(getWebDir());
private void loadHash()
//*-- Read the properties file to assign crawl parameters
Properties props = null; String paramFile = Constants.PROPFILE;
try { props = new Properties(); props.load(new FileInputStream(paramFile)); }
catch (IOException e) { logger.error("Could not read " + paramFile + " " + e.getMessage()); }
//*-- initialize the crawl configuration
String[] crawlParams = getPARAMETERS();
for (int i = 0; i < crawlParams.length; i++)
if (crawlParams[i].equals("indexDir")) setIndexDir( (String) props.getProperty(crawlParams[i]) );
else if (crawlParams[i].equals("dbDir")) setDbDir( (String) props.getProperty(crawlParams[i]) );
else if (crawlParams[i].equals("webDir")) setWebDir( (String) props.getProperty(crawlParams[i]) );
{ String val = (props.getProperty(crawlParams[i]) != null) ? props.getProperty(crawlParams[i]): "";
paramHash.put( crawlParams[i], val.trim() ); }
} //*-- end of for
* Return a list of parameters in the CrawlConfig class
* @return String[] List of names of parameters
public String[] getPARAMETERS()
String[] parameters = new String[PARAMETERS.length];
for (int i = 0; i < PARAMETERS.length; i++) parameters[i] = PARAMETERS[i];
return (parameters);
* Set the list of parameters in the CrawlConfig class
* @param pHash Hash containing parameter names and values
public void setPARAMETERS(HashMapOfString pHash)
String[] parameters = getPARAMETERS();
for (int i = 0; i < parameters.length; i++)
paramHash.put( parameters[i], (String) pHash.get(parameters[i]) );
* Generate a string containing the CrawlConfig parameters
* @return String - Dump the names of attribute names and values in a string
public String toString()
String[] parameters = getPARAMETERS(); StringBuffer sb = new StringBuffer();
for (int i = 0; i < parameters.length; i++)
{ String val = (String) paramHash.get(parameters[i]);
//*-- use the slash separator consistently across Windows and Linux
val = ( (val != null) && (val.length() > 0) ) ? val.replace('\\', '/'): "";
val = val.trim();
//*-- generate the line
sb.append(parameters[i]); sb.append(" = "); sb.append(val); sb.append(Constants.NEWLINE);
return(sb.toString() );
* Dump the crawl configuration to the properties file
public synchronized boolean dumpConfig()
boolean success = true;
String comments = "#*-----------------------------------------------------------------" + Constants.NEWLINE +
"#*- List of properties for a Mustru crawl" + Constants.NEWLINE +
"#*-" + Constants.NEWLINE +
"#*- freshCrawl: Re-create a new Lucene index and database" + Constants.NEWLINE +
"#*- keepDups: Index duplicate documents" + Constants.NEWLINE +
"#*- skipHidden: Skip directories that start with . " + Constants.NEWLINE +
"#*- numThreads: Number of threads to use during the index process" + Constants.NEWLINE +
"#*- spellCheck: Use a spell checker " + Constants.NEWLINE +
"#*- incDirs: Directories to search for indexable files" + Constants.NEWLINE +
"#*- excDirs: Directories that are excluded from the index process" + Constants.NEWLINE +
"#*- followLinks: Follow symbolic links in the crawl" + Constants.NEWLINE +
"#*- indexDir: Lucene index directory" + Constants.NEWLINE +
"#*- dbDir: Berkeley DB directory" + Constants.NEWLINE +
"#*- webDir: Optional Web server root directory" + Constants.NEWLINE +
"#*-" + Constants.NEWLINE +
"#*- Multiple directories are separated by ';'" + Constants.NEWLINE +
String filename = Constants.PROPFILE;
FileOutputStream fos = null;
try { fos = new FileOutputStream(new File(filename)); }
catch (FileNotFoundException fe) { success = false; }
PrintWriter pw = new PrintWriter(fos);
pw.println(comments); pw.println("" + this.toString());
try { if (fos != null) { fos.flush(); fos.close(); } }
catch (IOException ie) { success = false; }
public String getExcDirs()
{ return (String) paramHash.get("excDirs"); }
public void setExcDirs(String excDirs)
{ paramHash.put("excDirs", excDirs); }
public boolean isFreshCrawl()
{ String val = (String) paramHash.get("freshCrawl");
return ( val.equalsIgnoreCase("True") ? true: false ); }
public void setFreshCrawl(boolean freshCrawl)
{ paramHash.put("freshCrawl", Boolean.toString(freshCrawl)); }
public String getIncDirs()
{ return (String) paramHash.get("incDirs"); }
public void setIncDirs(String incDirs)
{ paramHash.put("incDirs", incDirs); }
public boolean isKeepDups()
{ String val = (String) paramHash.get("keepDups");
return ( val.equalsIgnoreCase("True") ? true: false ); }
public void setKeepDups(boolean keepDups)
{ paramHash.put("keepDups", Boolean.toString(keepDups)); }
public int getNumThreads()
{ return Integer.parseInt( (String) paramHash.get("numThreads") ); }
public void setNumThreads(int numThreads)
{ paramHash.put("numThreads", Integer.toString(numThreads)); }
public boolean isSpellCheck()
{ String val = (String) paramHash.get("spellCheck");
return ( val.equalsIgnoreCase("True") ? true: false ); }
public void setSpellCheck(boolean keepDups)
{ paramHash.put("spellCheck", Boolean.toString(keepDups)); }
public boolean isSkipHidden()
{ String val = (String) paramHash.get("skipHidden");
return ( val.equalsIgnoreCase("True") ? true: false ); }
public void setSkipHidden(boolean skipHidden)
{ paramHash.put("skipHidden", Boolean.toString(skipHidden)); }
public boolean isFollowLinks()
{ String val = (String) paramHash.get("followLinks");
return ( val.equalsIgnoreCase("True") ? true: false ); }
public void setFollowLinks(boolean followLinks)
{ paramHash.put("followLinks", Boolean.toString(followLinks)); }
public int getFreshIndex()
{ return freshIndex; }
public void setFreshIndex(int freshIndex)
{ this.freshIndex = freshIndex; }
public int getStartPosition()
{ return startPosition; }
public void setStartPosition(int startPosition)
{ this.startPosition = startPosition; }
public String getIndexDir()
{ return (String) paramHash.get("indexDir"); }
public void setIndexDir(String indexDir)
{ File file = new File(indexDir);
if (!file.exists())
{ indexDir = Constants.MUSTRU_HOME + Constants.fs + "data" + Constants.fs + indexDir;
file = new File(indexDir); if (!file.exists()) file.mkdir();
paramHash.put("indexDir", indexDir);
public String getDbDir()
{ return (String) paramHash.get("dbDir"); }
public void setDbDir(String dbDir)
{ File file = new File(dbDir);
if (!file.exists())
{ dbDir = Constants.MUSTRU_HOME + Constants.fs + "data" + Constants.fs + dbDir;
file = new File(dbDir); if (!file.exists()) file.mkdir();
paramHash.put("dbDir", dbDir);
public String getWebDir()
{ return (String) paramHash.get("webDir"); }
public void setWebDir(String webDir)
{ paramHash.put("webDir", webDir); }