Package org.sf.mustru.crawl

Source Code of org.sf.mustru.crawl.CrawlConfig

package org.sf.mustru.crawl;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Properties;

import org.apache.log4j.Logger;
import org.eclipse.core.internal.runtime.HashMapOfString;
import org.sf.mustru.utils.Constants;

/**
* Read, update, and store crawl configuration parameters from data/config/mustru.prp
*/
public class CrawlConfig
  static Logger logger = Logger.getLogger(CrawlConfig.class.getName());
  private HashMapOfString paramHash;      //*-- hash of crawler parameters and values
  private final String[] PARAMETERS  = {     //*-- list of crawler parameters
      "freshCrawl",       //*-- flag to re-scan all documents
      "keepDups",         //*-- flag to keep duplicate documents
      "skipHidden",       //*-- flag to skip hidden files/directories i.e. starting with .
      "numThreads",       //*-- number of threads for this crawl
      "spellCheck",        //*-- create the spell checker ?
      "incDirs",          //*-- directories of include in this crawl
      "excDirs",         //*-- directories to exclude from the crawl
      "followLinks",      //*-- follow symbolic links
      "indexDir",        //*-- Lucene index directory
      "dbDir",        //*-- Berkeley DB dir
      "webDir"        //*-- optional Web root directory
      };
  private int freshIndex = -1;        //*-- optional flag to create a fresh Lucene index
  private int startPosition = -1;      //*-- optional restart position indicator
 
  /**
   * Crawler configuration bean class
   */
  public CrawlConfig(boolean init)
   {
    paramHash = new HashMapOfString();
    String defaultIndexDir = Constants.MUSTRU_HOME + Constants.fs + "data" + Constants.fs + "index";
    String defaultDbDir = Constants.MUSTRU_HOME + Constants.fs + "data" + Constants.fs + "bdb";
   
    //*-- initialize the crawler configuration with default values
    if (init)
    { setSkipHidden(true);   setExcDirs("");   setFollowLinks(false);   setFreshCrawl(true);
      setIncDirs("");    setKeepDups(false);  setNumThreads(1);  setSpellCheck(false)
      setDbDir(defaultDbDir); setIndexDir(defaultIndexDir); setWebDir("");
    }
    else loadHash();
   
    Constants.setDBDIR(getDbDir()); Constants.setINDEXDIR(getIndexDir()); Constants.setWEBDIR(getWebDir());
   }

  private void loadHash()
  {
   //*-- Read the properties file to assign crawl parameters
   Properties props = null; String paramFile = Constants.PROPFILE;
   try { props = new Properties(); props.load(new FileInputStream(paramFile)); }
   catch (IOException e) { logger.error("Could not read " + paramFile + " " + e.getMessage()); }
    
   //*-- initialize the crawl configuration
   String[] crawlParams = getPARAMETERS();
   for (int i = 0; i < crawlParams.length; i++)
   {
    if (crawlParams[i].equals("indexDir")) setIndexDir( (String) props.getProperty(crawlParams[i]) );
    else if (crawlParams[i].equals("dbDir")) setDbDir( (String) props.getProperty(crawlParams[i]) );
    else if (crawlParams[i].equals("webDir")) setWebDir( (String) props.getProperty(crawlParams[i]) );
    else
    { String val = (props.getProperty(crawlParams[i]) != null) ? props.getProperty(crawlParams[i]): "";
      paramHash.put( crawlParams[i],  val.trim() ); }
   } //*-- end of for
  
  }
 
 
  /**
   * Return a list of parameters in the CrawlConfig class
   * @return String[] List of names of parameters
   */
  public String[] getPARAMETERS()
  {
   String[] parameters = new String[PARAMETERS.length];
   for (int i = 0; i < PARAMETERS.length; i++) parameters[i] = PARAMETERS[i];
   return (parameters);
  }

  /**
   * Set the list of parameters in the CrawlConfig class
   * @param pHash Hash containing parameter names and values
   */
  public void setPARAMETERS(HashMapOfString pHash)
  {
   String[] parameters = getPARAMETERS();
   for (int i = 0; i < parameters.length; i++)
     paramHash.put( parameters[i], (String) pHash.get(parameters[i]) );
  }
 
  /**
   * Generate a string containing the CrawlConfig parameters
   * @return String - Dump the names of attribute names and values in a string
   */
  public String toString()
  {
   String[] parameters = getPARAMETERS(); StringBuffer sb = new StringBuffer();
   for (int i = 0; i < parameters.length; i++)
    { String val = (String) paramHash.get(parameters[i]);

      //*-- use the slash separator consistently across Windows and Linux
      val =  ( (val != null) && (val.length() > 0) ) ? val.replace('\\', '/'): "";
      val = val.trim();
      //*-- generate the line
      sb.append(parameters[i]); sb.append(" = "); sb.append(val); sb.append(Constants.NEWLINE);
    }
   return(sb.toString() );
  }
 
  /**
   * Dump the crawl configuration to the properties file
   */
  public synchronized boolean dumpConfig()
  {
   boolean success = true;
   String comments =   "#*-----------------------------------------------------------------" + Constants.NEWLINE +
   "#*- List of properties for a Mustru crawl" + Constants.NEWLINE +
   "#*-" + Constants.NEWLINE +
   "#*- freshCrawl:  Re-create a new Lucene index and database" + Constants.NEWLINE +
   "#*- keepDups:    Index duplicate documents" + Constants.NEWLINE +
   "#*- skipHidden:  Skip directories that start with . " + Constants.NEWLINE +
   "#*- numThreads:  Number of threads to use during the index process" + Constants.NEWLINE +
   "#*- spellCheck:  Use a spell checker " + Constants.NEWLINE +
   "#*- incDirs:     Directories to search for indexable files" + Constants.NEWLINE +
   "#*- excDirs:     Directories that are excluded from the index process" + Constants.NEWLINE +
   "#*- followLinks: Follow symbolic links in the crawl" + Constants.NEWLINE +
   "#*- indexDir: Lucene index directory" + Constants.NEWLINE +
   "#*- dbDir: Berkeley DB directory" + Constants.NEWLINE +
   "#*- webDir: Optional Web server root directory" + Constants.NEWLINE +
   "#*-" + Constants.NEWLINE +
   "#*- Multiple directories are separated by ';'" + Constants.NEWLINE +
   "#*-----------------------------------------------------------------";

    String filename = Constants.PROPFILE;
    FileOutputStream fos = null;
    try { fos = new FileOutputStream(new File(filename)); }
    catch (FileNotFoundException fe) { success = false; }
    PrintWriter pw = new PrintWriter(fos);
    pw.println(comments); pw.println("" + this.toString());
    pw.close();
    try { if (fos != null)  { fos.flush(); fos.close(); } }
    catch (IOException ie) { success = false; }
    return(success);
  }
   
  public String getExcDirs()
  { return (String) paramHash.get("excDirs"); }

  public void setExcDirs(String excDirs)
  { paramHash.put("excDirs",  excDirs); }

  public boolean isFreshCrawl()
  { String val = (String) paramHash.get("freshCrawl");
    return ( val.equalsIgnoreCase("True") ? true: false ); }

  public void setFreshCrawl(boolean freshCrawl)
  { paramHash.put("freshCrawl",  Boolean.toString(freshCrawl)); }

  public String getIncDirs()
  { return (String) paramHash.get("incDirs"); }

  public void setIncDirs(String incDirs)
  { paramHash.put("incDirs",  incDirs); }

  public boolean isKeepDups()
  { String val = (String) paramHash.get("keepDups");
    return ( val.equalsIgnoreCase("True") ? true: false ); }

  public void setKeepDups(boolean keepDups)
  { paramHash.put("keepDups",  Boolean.toString(keepDups)); }

  public int getNumThreads()
  { return Integer.parseInt( (String) paramHash.get("numThreads") ); }

  public void setNumThreads(int numThreads)
  { paramHash.put("numThreads",  Integer.toString(numThreads)); }

  public boolean isSpellCheck()
  { String val = (String) paramHash.get("spellCheck");
    return ( val.equalsIgnoreCase("True") ? true: false ); }

  public void setSpellCheck(boolean keepDups)
  { paramHash.put("spellCheck",  Boolean.toString(keepDups)); }

  public boolean isSkipHidden()
  { String val = (String) paramHash.get("skipHidden");
    return ( val.equalsIgnoreCase("True") ? true: false ); }

  public void setSkipHidden(boolean skipHidden)
  { paramHash.put("skipHidden",  Boolean.toString(skipHidden)); }
 
  public boolean isFollowLinks()
  { String val = (String) paramHash.get("followLinks");
    return ( val.equalsIgnoreCase("True") ? true: false ); }

  public void setFollowLinks(boolean followLinks)
  { paramHash.put("followLinks",  Boolean.toString(followLinks)); }

  public int getFreshIndex()
  { return freshIndex; }

  public void setFreshIndex(int freshIndex)
  { this.freshIndex = freshIndex; }

  public int getStartPosition()
  { return startPosition; }

  public void setStartPosition(int startPosition)
  { this.startPosition = startPosition; }
 
  public String getIndexDir()
  { return (String) paramHash.get("indexDir"); }

  public void setIndexDir(String indexDir)
  { File file = new File(indexDir);
    if (!file.exists())
     { indexDir = Constants.MUSTRU_HOME + Constants.fs + "data" + Constants.fs + indexDir;
       file = new File(indexDir); if (!file.exists()) file.mkdir();
     }
    paramHash.put("indexDir",  indexDir);
  }
 
  public String getDbDir()
  { return (String) paramHash.get("dbDir"); }

  public void setDbDir(String dbDir)
  { File file = new File(dbDir);
    if (!file.exists())
      { dbDir = Constants.MUSTRU_HOME + Constants.fs + "data" + Constants.fs + dbDir;
        file = new File(dbDir); if (!file.exists()) file.mkdir();
     
    paramHash.put("dbDir",  dbDir);
  }
 
  public String getWebDir()
  { return (String) paramHash.get("webDir"); }

  public void setWebDir(String webDir)
  { paramHash.put("webDir",  webDir); }
 
}
TOP

Related Classes of org.sf.mustru.crawl.CrawlConfig

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.