Package org.sf.mustru.crawl

Source Code of org.sf.mustru.crawl.CrawlTask

package org.sf.mustru.crawl;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.eclipse.core.internal.runtime.HashMapOfString;

import org.sf.mustru.crawl.CrawlManager;
import org.sf.mustru.utils.Constants;
import org.sf.mustru.utils.DirTools;

/**
* Create a thread to scan the specified directories and generate
* a taskfile (taskfile.txt) in the data/config directory.
*/
public class CrawlTask extends Thread
{
  static Logger logger = Logger.getLogger(CrawlTask.class.getName());
  private CrawlConfig crawlConfig;      //*-- crawler configuration parameters
  private CrawlManager ctRef;        //*-- reference to caller
  private int numFiles = 0;        //*-- number of files processed
 
/**
   * @param crawlConfig
   * @param ctRef
   */
  public CrawlTask(CrawlConfig crawlConfig, CrawlManager ctRef)
  { this.crawlConfig = crawlConfig; this.ctRef = ctRef; }
 
  /**
   * Build a task file containing a list of files to crawl from the directories and
   * filters files. The list of directories will be recursively scanned for files
   * that can be indexed. A list of files will be generated in a task file. The files
   * that can be indexed are identified from a filters properties file that contains
   * a specific handler for each suffix type. The specific handlers convert files
   * to plain text and return a string.
   *
   */
  public void run()
  {
   String taskFile = Constants.TASK_FILE;     //*-- file that will contain the list of documents to scan
   String filtersFile = Constants.FILTER_FILE;    //*-- file containing the list of handlers
   String errMsg = null;        //*-- Error message
   RandomAccessFile tfile = null;       //*-- a random access file containing a list of files to scan
   BufferedReader filein = null;      //*-- reader to remove dups from the task file       
   PrintWriter outp = null;        //*-- writer to create the unique list of files
   HashMapOfString h = null;        //*-- hash set to save the list of files
  
   logger.info("Start creating task file for crawl");
  
   //*-- before creating a task file, check for a restart
   if (crawlConfig.getStartPosition() != -1)
   { ctRef.fileReadTime -= new Date().getTime();
     int numFiles = 0; LineNumberReader fileno = null;
     try
     { fileno = new LineNumberReader(new FileReader(taskFile));
       while ((fileno.readLine()) != null) numFiles++;
     }
     catch (IOException ie) { logger.error("IO Error in task file " + taskFile + " " + ie.getMessage()); }
     finally { try { if (fileno != null) fileno.close(); }
               catch (IOException ie) { logger.error("Ignore error"); } }
     setNumFiles(numFiles);
     ctRef.fileReadTime += new Date().getTime();
     return;
   }
  
   ctRef.fileReadTime -= new Date().getTime();
   String iDirs = crawlConfig.getIncDirs();
   String eDirs = crawlConfig.getExcDirs();
   boolean skipHidden = crawlConfig.isSkipHidden();
   boolean followLinks = crawlConfig.isFollowLinks();
 
   if ( !(iDirs.equals("")) && !(iDirs.endsWith(";")) ) iDirs += ";";
   if ( !(eDirs.equals("")) && !(eDirs.endsWith(";")) ) eDirs += ";";
   String[] incDirs = (iDirs.equals("") ) ? new String[0]: iDirs.split(";");
   String[] excDirs = (eDirs.equals("") ) ? new String[0]: eDirs.split(";");
   Pattern[] excPatts = new Pattern[excDirs.length];
   for (int i = 0; i < excDirs.length; i++) excPatts[i] = Pattern.compile("^" + excDirs[i] + ".*$");
    
   try
    {
     tfile = new RandomAccessFile(taskFile, "rw");
     tfile.setLength(0); tfile.close();

     // *-- Run the directory scan for each directory in the list
     DirTools dirTools = new DirTools();
     for (int i = 0; i < incDirs.length; i++)
      { logger.info("Start scanning " + incDirs[i]);
        dirTools.dirScan(incDirs[i], filtersFile, taskFile, skipHidden, followLinks);
        logger.info("Finished scanning " + incDirs[i]);
      }

     // *-- Read the task file into a hashset and remove dups.
     logger.info("Removing duplicate file names from task file");
     filein = new BufferedReader(new FileReader(taskFile));
     String s; h = new HashMapOfString();
     while ((s = filein.readLine()) != null) h.put(s, "");
    }
   catch (IOException ie)
     { errMsg = "IO Error in task file " + ie.getMessage(); }
   finally
    { if (filein != null) try { filein.close(); } catch (IOException ie) { logger.error("Ignore error");
      if (errMsg != null) ctRef.cleanUp(errMsg);
    }
     
   //*-- Dump the file back into tfile with a list of unique file names
   //*-- Check if any of the files belong to the list of exclusion directories
   try
   {
    tfile = new RandomAccessFile(taskFile, "rw");
    tfile.setLength(0); tfile.seek(0);
    outp = new PrintWriter(new FileWriter(taskFile));
   
    if (h != null)
     { String[] keys = h.keys();
       FLOOP: for (int i = 0; i < keys.length; i++)
        { for (int j = 0; j < excPatts.length; j++)     
           { Matcher matcher = excPatts[j].matcher(keys[i]);
             if (matcher.matches()) continue FLOOP;
          }
           outp.println(keys[i]); numFiles++;
        } //*-- end of outer for
     
     } //*-- end of if
   
    outp.flush();
   
   } //*-- end of try
  
   catch (NullPointerException e)  { errMsg = "Null pointer: " + e.getMessage()}
   catch (IOException e)            { errMsg = "IO Error: " + e.getMessage(); }
   finally
    { if (outp != null) outp.close();
      try { if (tfile != null) tfile.close(); }
      catch (IOException ie) { logger.error("Ignore error"); }
      if (errMsg != null) ctRef.cleanUp(errMsg);
    }
    setNumFiles(numFiles);
    logger.info("Finished creating the task file");
    ctRef.fileReadTime += new Date().getTime();
    return;
  }

public int getNumFiles()
{ return numFiles; }

public void setNumFiles(int numFiles)
{ this.numFiles = numFiles; }
 
TOP

Related Classes of org.sf.mustru.crawl.CrawlTask

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.