Source Code of net.sf.regain.crawler.Crawler

/*
* regain - A file search engine providing plenty of formats
* Copyright (C) 2004  Til Schneider
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
* Contact: Til Schneider, info@murfman.de
*
* CVS information:
*  $RCSfile$
*   $Source$
*     $Date: 2010-03-05 15:42:27 +0100 (Fr, 05 Mrz 2010) $
*   $Author: thtesche $
* $Revision: 455 $
*/
package net.sf.regain.crawler;

import com.sun.mail.imap.IMAPFolder;
import com.sun.mail.imap.IMAPSSLStore;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.net.URLEncoder;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;

import java.util.Map;
import java.util.Properties;
import java.util.Set;
import javax.mail.Folder;
import javax.mail.Message;
import javax.mail.MessagingException;
import javax.mail.Session;
import javax.mail.URLName;
import javax.mail.internet.MimeMessage;
import jcifs.smb.SmbFile;
import net.sf.regain.ImapToolkit;
import net.sf.regain.RegainException;
import net.sf.regain.RegainToolkit;
import net.sf.regain.crawler.access.AccountPasswordEntry;
import net.sf.regain.crawler.config.CrawlerConfig;
import net.sf.regain.crawler.config.StartUrl;
import net.sf.regain.crawler.config.UrlMatcher;
import net.sf.regain.crawler.config.UrlPattern;
import net.sf.regain.crawler.config.WhiteListEntry;
import net.sf.regain.crawler.document.RawDocument;

import org.apache.commons.codec.binary.Base64;
import org.apache.log4j.Logger;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;


/**
* Crawls all configured start pages for URLs. Depending on the configuration,
* the pages found are merely loaded, added to the search index, or in turn
* searched for further URLs.
* <p>
* For each URL the blacklist and the whitelist decide whether it is ignored
* or processed. If <CODE>loadUnparsedUrls</CODE> is set to <CODE>false</CODE>,
* URLs that are neither parsed nor indexed are ignored as well.
*
* @author Til Schneider, www.murfman.de
*/
public class Crawler implements ErrorLogger {

  /** The logger for this class. */
  private static Logger mLog = Logger.getLogger(Crawler.class);

  /** The configuration with the preferences. */
  private CrawlerConfig mConfiguration;
 
  /** The URL checker. */
  private UrlChecker mUrlChecker;

  /** The list of jobs that still have to be processed. */
  private LinkedList mJobList;

  /** The number of errors that occurred. */
  private int mErrorCount;
 
  /**
   * The number of fatal errors that occurred.
   * <p>
   * Fatal errors are errors that prevented the index from being created or
   * updated.
   */
  private int mFatalErrorCount;

  /** The current crawler job. May be null. */
  private CrawlerJob mCurrentJob;

  /**
   * Contains all found dead links.
   * <p>
   * Contains Object[]s with two elements: The first is the URL that couldn't be
   * found (a String), the second the URL of the document where the dead link
   * was found (a String).
   */
  private LinkedList mDeadlinkList;

  /** The UrlPattern the HTML-Parser should use to identify URLs. */
  private UrlPattern[] mHtmlParserUrlPatternArr;
  /**
   * The regular expressions that belong to the respective UrlPattern for the
   * HTML-Parser.
   *
   * @see #mHtmlParserUrlPatternArr
   */
  private RE[] mHtmlParserPatternReArr;

  /** The profiler that measures the whole crawler jobs. */
  private Profiler mCrawlerJobProfiler;
  /** The profiler that measures the HTML-Parser. */
  private Profiler mHtmlParsingProfiler;
 
  /** The IndexWriterManager to use for adding documents to the index. */
  private IndexWriterManager mIndexWriterManager;
 
  /** Specifies whether the crawler should pause as soon as possible. */
  private boolean mShouldPause;

  /** Username, password Map for configured hostnames. */
  Map <String, AccountPasswordEntry> accountPasswordStore;
 
  /**
   * Creates a new instance of Crawler.
   *
   * @param config The configuration.
   * @param authProps The properties containing the account/password entries
   *        for protected URLs.
   *
   * @throws RegainException If the regular expressions have errors.
   */
  public Crawler(CrawlerConfig config, Properties authProps) throws RegainException {
    Profiler.clearRegisteredProfilers();
   
    mCrawlerJobProfiler = new Profiler("Whole crawler jobs", "jobs");
    mHtmlParsingProfiler = new Profiler("Parsed HTML documents", "docs");
   
    mConfiguration = config;

    mJobList = new LinkedList();
    mDeadlinkList = new LinkedList();

    mFatalErrorCount = 0;

    RawDocument.setHttpTimeoutSecs(config.getHttpTimeoutSecs());

    mHtmlParserUrlPatternArr = config.getHtmlParserUrlPatterns();
    if(mHtmlParserUrlPatternArr.length > 0) {
      mLog.error("Entries in <htmlParserPatternList/> are no longer supported. Please remove " +
        "the Tag and it's children from your config file. Use <whitelist><prefix|regex " +
        "parse=true|false index=true|false >http://someULR</prefix|regex> instead.");
      mHtmlParserPatternReArr = new RE[mHtmlParserUrlPatternArr.length];
      for (int i = 0; i < mHtmlParserPatternReArr.length; i++) {
        String regex = mHtmlParserUrlPatternArr[i].getRegexPattern();
        try {
          mHtmlParserPatternReArr[i] = new RE(regex);
        }
        catch (RESyntaxException exc) {
          throw new RegainException("Regular exception of HTML parser pattern #"
            + (i + 1) + " has a wrong syntax: '" + regex + "'", exc);
        }
      }
    }
    accountPasswordStore = new HashMap<String, AccountPasswordEntry>();
    readAuthenticationProperties(authProps);
    mLog.debug(System.getenv().toString());
  }
 
 
  /**
   * Gets the number of processed documents.
   *
   * @return The number of processed documents.
   */
  public int getFinishedJobCount() {
    return mCrawlerJobProfiler.getMeasureCount();
  }


  /**
   * Gets the number of documents that were in the (old) index when the
   * IndexWriterManager was created.
   *
   * @return The initial number of documents in the index.
   */
  public int getInitialDocCount() {
    IndexWriterManager mng = mIndexWriterManager;
    return (mng == null) ? -1 : mng.getInitialDocCount();
  }


  /**
   * Gets the number of documents that were added to the index.
   *
   * @return The number of documents added to the index.
   */
  public int getAddedDocCount() {
    IndexWriterManager mng = mIndexWriterManager;
    return (mng == null) ? -1 : mng.getAddedDocCount();
  }


  /**
   * Gets the number of documents that will be removed from the index.
   *
   * @return The number of documents removed from the index.
   */
  public int getRemovedDocCount() {
    IndexWriterManager mng = mIndexWriterManager;
    return (mng == null) ? -1 : mng.getRemovedDocCount();
  }


  /**
   * Gets the URL of the current job. Returns null if the crawler currently has
   * no job.
   *
   * @return The URL of the current job.
   */
  public String getCurrentJobUrl() {
    // NOTE: We put the current job in a local variable to avoid it being set
    //       to null while this method is executed.
    CrawlerJob job = mCurrentJob;
    if (job == null) {
      return null;
    } else {
      return job.getUrl();
    }
  }


  /**
   * Gets the time the crawler has already been working on the current job.
   *
   * @return The current working time in milliseconds. Returns -1 if the
   *         crawler currently has no job.
   */
  public long getCurrentJobTime() {
    return mCrawlerJobProfiler.getCurrentMeasuringTime();
  }


  /**
   * Sets whether the crawler should pause.
   * 
   * @param shouldPause Whether the crawler should pause.
   */
  public void setShouldPause(boolean shouldPause) {
    mShouldPause = shouldPause;
  }


  /**
   * Gets whether the crawler is currently pausing or will pause soon.
   *
   * @return Whether the crawler is currently pausing.
   */
  public boolean getShouldPause() {
    return mShouldPause;
  }
 

  /**
   * Analyzes the URL and decides whether it should be processed or not.
   * <p>
   * If so, a new job is created and added to the job list.
   *
   * @param url The URL of the job to check.
   * @param sourceUrl The URL of the document in which the URL of the job to
   *        check was found.
   * @param shouldBeParsed Specifies whether the URL should be parsed.
   * @param shouldBeIndexed Specifies whether the URL should be indexed.
   * @param sourceLinkText The text of the link in which the URL was found. Is
   *        <code>null</code> if the URL was not found within a link (i.e. an
   *        a tag) or if no link text is available for other reasons.
   */
  private void addJob(String url, String sourceUrl, boolean shouldBeParsed,
    boolean shouldBeIndexed, String sourceLinkText)
  {
    mLog.debug("Try to add " + url + " referer " + sourceUrl + " as a new crawler job.");
   
    if (! mConfiguration.getBuildIndex()) {
      // Indexing is disabled
      shouldBeIndexed = false;
    }

    // Change all blanks to %20, since blanks are not allowed in URLs
    url = RegainToolkit.replace(url, " ", "%20");
    // Change all &amp; to &
    url = RegainToolkit.replace(url, "&amp;", "&");

    // Replace parts of the URL with an empty string. The patterns are the
    // URLCleaners
    url = CrawlerToolkit.cleanURL(url, mConfiguration.getURLCleaners());

    boolean alreadyAccepted = mUrlChecker.wasAlreadyAccepted(url);
    boolean alreadyIgnored = mUrlChecker.wasAlreadyIgnored(url);

    if ((! alreadyAccepted) && (! alreadyIgnored)) {
      // Check whether the url matches an entry in the whitelist and not an entry in the blacklist
      // We assume that the caller of addJob() detected the correct values for shouldBeParsed
      // and shouldBeIndexed.
      UrlMatcher urlMatch = mUrlChecker.isUrlAccepted(url);
      boolean accepted = urlMatch.getShouldBeParsed() || urlMatch.getShouldBeIndexed();
     
      int mMaxCycleCount = mConfiguration.getMaxCycleCount();

      if (mMaxCycleCount > 0 && accepted) {
        // Check for cycles in the path of an URI
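        // Illustrative only: a URI whose path repeats a segment sequence more
        // than maxCycleCount times (e.g. /a/b/a/b/a/b/...) would be rejected
        // here; the exact cycle detection is implemented in UrlChecker.hasNoCycles().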
        accepted = mUrlChecker.hasNoCycles(url, mMaxCycleCount);
        if (mLog.isDebugEnabled() && !accepted) {
          mLog.debug("URI seems to have cycles (maxCount=" + mMaxCycleCount + "): " + url);
        }
      }

      // Check whether this page has to be loaded at all
      if (! mConfiguration.getLoadUnparsedUrls()) {
        // Pages that are neither parsed nor indexed can be skipped
        if ((! shouldBeParsed) && (! shouldBeIndexed)) {
          accepted = false;
        }
      }

      if (accepted) {
        mUrlChecker.setAccepted(url);
        if (mLog.isDebugEnabled()) {
          mLog.debug("Found new URL: " + url + " in page: " + sourceUrl);
        }

        CrawlerJob job = new CrawlerJob(url, sourceUrl, sourceLinkText,
                                      shouldBeParsed, shouldBeIndexed);
       
        // NOTE: This is a little trick: We put documents that aren't parsed at
        //       the beginning of the job list and documents that are parsed at
        //       the end. This keeps the job list small, since all pending
        //       documents are processed before new documents are added.
        if (shouldBeParsed) {
          mJobList.addLast(job);
        } else {
          mJobList.addFirst(job);
        }
      } else {
        mUrlChecker.setIgnored(url);
        if (mLog.isDebugEnabled()) {
          mLog.debug("Ignoring URL: " + url + " in page: " + sourceUrl);
        }
      }
    }
  }


  /**
   * Executes the crawler process and prints out statistics, the dead link list
   * and the error list at the end.
   *
   * @param updateIndex Specifies whether an already existing index should be
   *        updated.
   * @param retryFailedDocs Specifies whether a document that couldn't be
   *        prepared the last time should be retried.
   * @param onlyEntriesArr The names of the white list entries, that should be
   *        updated. If <code>null</code> or empty, all entries will be updated.
   */
  public void run(boolean updateIndex, boolean retryFailedDocs,
    String[] onlyEntriesArr)
  {
    mLog.info("Starting crawling ...");
    mShouldPause = false;

    // Init the HTTP client
    CrawlerToolkit.initHttpClient(mConfiguration);

    // Initialize the IndexWriterManager if building the index is wanted
    mIndexWriterManager = null;
    if (mConfiguration.getBuildIndex()) {
      mLog.info("Preparing the index");
      try {
        mIndexWriterManager = new IndexWriterManager(mConfiguration, updateIndex, retryFailedDocs);
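        // Take over the manager's effective decision: it may differ from the
        // requested value (e.g. when there is no existing index to update yet -
        // an assumption based on this call, see IndexWriterManager).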
        updateIndex = mIndexWriterManager.getUpdateIndex();
      }
      catch (RegainException exc) {
        logError("Preparing the index failed!", exc, true);
        return;
      }
    }

    mLog.debug("Read whitelist entries from config");
    // Get the white list and set the "should be updated"-flags
    WhiteListEntry[] whiteList = mConfiguration.getWhiteList();
    whiteList = useOnlyWhiteListEntries(whiteList, onlyEntriesArr, updateIndex);

    // Create the UrlChecker
    mUrlChecker = new UrlChecker(whiteList, mConfiguration.getBlackList());


    // Add the start URLs
    mLog.info("Read start-URLs from config");
    addStartUrls();
   
    // Remember the last time when a breakpoint was created
    long lastBreakpointTime = System.currentTimeMillis();
   
    // Work on the job list
    while (! mJobList.isEmpty()) {
      mCrawlerJobProfiler.startMeasuring();

      mCurrentJob = (CrawlerJob) mJobList.removeFirst();
      String url = mCurrentJob.getUrl();

      boolean shouldBeParsed = mCurrentJob.shouldBeParsed();
      boolean shouldBeIndexed = mCurrentJob.shouldBeIndexed();

      if (url.startsWith("file://")) {
        // file system: Check whether this is a directory
        try {
          File file = RegainToolkit.urlToFile(url);
          // Check whether the file is readable.
          if (!file.canRead()) {
            mCrawlerJobProfiler.abortMeasuring();
            mLog.debug("File rights: canRead: " + file.canRead() +
                    " canExecute: " + file.canExecute() +
                    " canWrite: " + file.canWrite() + " exists: " + file.exists() +
                    " for url: " + url + ", canonical url: " + file.getCanonicalPath());
            logError("File is not readable: '" + url + "'", null, false);
            continue;
          } else if (file.isDirectory()) {
            // This IS a directory -> Add all child files as Jobs
            if (shouldBeParsed) {
              parseDirectory(file);
            }
           
            // A directory can't be indexed -> continue
            mCrawlerJobProfiler.stopMeasuring(0);
            continue;
          }
        }
        catch (Throwable thr) {
          mCrawlerJobProfiler.abortMeasuring();
          logError("Invalid URL: '" + url + "'", thr, false);
          continue;
        }
      } else if (url.startsWith("smb://")) {
        // Windows share: Check whether this is a directory
        try {
          SmbFile smbFile = RegainToolkit.urlToSmbFile(
            CrawlerToolkit.replaceAuthenticationValuesInURL(url,
            CrawlerToolkit.findAuthenticationValuesForURL(url, accountPasswordStore)));
          // Check whether the file is readable.
          if (!smbFile.canRead()) {
            mCrawlerJobProfiler.abortMeasuring();
            logError("File is not readable: '" + url + "'", null, false);
            continue;
          } else if (smbFile.isDirectory()) {
            // This IS a directory -> Add all child files as Jobs
            if (shouldBeParsed) {
              parseSmbDirectory(smbFile);
            }

            // A directory can't be indexed -> continue
            mCrawlerJobProfiler.stopMeasuring(0);
            continue;
          }
           
        }
        catch (Throwable thr) {
          mCrawlerJobProfiler.abortMeasuring();
          logError("Invalid URL: '" + url + "'", thr, false);
          continue;
        }
       
      } else if(url.startsWith("imap://") || url.startsWith("imaps://")) {
        // IMAP mail box: Check whether this is a folder or an e-mail url
        try {
          if( ImapToolkit.isMessageURL( url) == true) {
            // This is a URL which describes an e-mail, like
            // imap://user:password@mail.mailhost.com/INBOX/message_23(_attachment_1)
            // Mails are only indexed once
            if( mIndexWriterManager.isAlreadyIndexed(url)) {
              // do not crawl the mail again
              mCrawlerJobProfiler.stopMeasuring(0);
              continue;
            }
          } else {
            // If the URL is not an e-mail, it has to be a folder. Add all
            // subfolders and messages as jobs
            if (shouldBeParsed) {
              parseIMAPFolder(url);
            }
           
            // A folder can't be indexed -> continue
            mCrawlerJobProfiler.stopMeasuring(0);
            continue;
          }

        }
        catch (Throwable thr) {
          mCrawlerJobProfiler.abortMeasuring();
          logError("Invalid URL: '" + url + "'", thr, false);
          continue;
        }
      }

      // Create a raw document
      RawDocument rawDocument;
      try {
        rawDocument = new RawDocument(url, mCurrentJob.getSourceUrl(),
          mCurrentJob.getSourceLinkText(),
          CrawlerToolkit.findAuthenticationValuesForURL(url, accountPasswordStore));
       
      } catch (RedirectException exc) {
        String redirectUrl = exc.getRedirectUrl();
        mLog.info("Redirect '" + url +  "' -> '" + redirectUrl + "'");
        mUrlChecker.setIgnored(url);
        // The redirect URL inherits the shouldBeParsed and shouldBeIndexed
        // properties from the source URL. This is possibly not correct
        // according to the definitions in the whitelist
        addJob(redirectUrl, mCurrentJob.getSourceUrl(), shouldBeParsed,
               shouldBeIndexed, mCurrentJob.getSourceLinkText());
        mCrawlerJobProfiler.stopMeasuring(0);
        continue;
      }
      catch (RegainException exc) {
        // Check whether the exception was caused by a dead link
        handleDocumentLoadingException(exc, mCurrentJob);

        // This document does not exist -> We can't parse or index anything
        // -> continue
        mCrawlerJobProfiler.abortMeasuring();
        continue;
      }

      if( shouldBeIndexed || shouldBeParsed ){
        if (mLog.isDebugEnabled()) {
          mLog.debug("Parsing and indexing " + rawDocument.getUrl());
        }
        mHtmlParsingProfiler.startMeasuring();

        // Parse and index content and metadata
        if (shouldBeIndexed) {
           try {
            mIndexWriterManager.addToIndex(rawDocument, this);
          }
          catch (RegainException exc) {
            logError("Indexing failed for: " + rawDocument.getUrl(), exc, false);
          }
        }

        // Extract links from the document (parse=true). The real meaning of
        // 'parse' in this context is link extraction. The document is parsed
        // anyway (building an HTML node tree).
        if (shouldBeParsed) {
          if(!shouldBeIndexed){
            // The document was not prepared during indexing, so prepare it now
            mIndexWriterManager.getDocumentFactory().createDocument(rawDocument, this);
          }
          try {
            //parseHtmlDocument(rawDocument);
            createCrawlerJobs(rawDocument);
          }
          catch (RegainException exc) {
            logError("CrawlerJob creation failed for: " + rawDocument.getUrl(), exc, false);
          }
        }
        mHtmlParsingProfiler.stopMeasuring(rawDocument.getLength());
      }
      // Release the system resources of the RawDocument.
      rawDocument.dispose();

      // Stop measuring the time
      mCrawlerJobProfiler.stopMeasuring(rawDocument.getLength());
      mCurrentJob = null;
     
      // Check whether to create a breakpoint
      int breakpointInterval = mConfiguration.getBreakpointInterval();
      boolean breakpointIntervalIsOver = (breakpointInterval > 0)
        && (System.currentTimeMillis() > lastBreakpointTime + breakpointInterval * 60 * 1000);
      if (mShouldPause || breakpointIntervalIsOver) {
        try {
          mIndexWriterManager.createBreakpoint();
        }
        catch (RegainException exc) {
          logError("Creating breakpoint failed", exc, false);
        }
       
        // Pause
        while (mShouldPause) {
          try {
            Thread.sleep(1000);
            mLog.info("The crawler sleeps for 1 second.");
          } catch (InterruptedException exc) {}
        }
       
        lastBreakpointTime = System.currentTimeMillis();
      }
    } // while (! mJobList.isEmpty())

    // Remove documents from the index which no longer exist
    if (mConfiguration.getBuildIndex()) {
      mLog.info("Removing index entries of documents that do not exist any more...");
      try {
        mIndexWriterManager.removeObsoleteEntries(mUrlChecker);
      }
      catch (Throwable thr) {
        logError("Removing non-existing documents from index failed", thr, true);
      }
    }

    // Check whether the index is empty
    int entryCount = 0;
    try {
      entryCount = mIndexWriterManager.getIndexEntryCount();
      // NOTE: We've got to subtract the errors, because for each failed
      //       document a substitute document is added to the index
      //       (which should not be counted).
      entryCount -= mErrorCount;
      if (entryCount < 0) {
        entryCount = 0;
      }
    }
    catch (Throwable thr) {
      logError("Counting index entries failed", thr, true);
    }
    double failedPercent;
    if (entryCount == 0) {
      logError("The index is empty.", null, true);
      failedPercent = 1;
    } else {
      // Check whether the count of dead/error links reached the limit
      double failedDocCount = mDeadlinkList.size() + mErrorCount;
      double totalDocCount = failedDocCount + entryCount;
      failedPercent = failedDocCount / totalDocCount;
      double maxAbortedPercent = mConfiguration.getMaxFailedDocuments();
      if (failedPercent > maxAbortedPercent) {
        logError("There are more failed documents than allowed (Failed: " +
          RegainToolkit.toPercentString(failedPercent) + ", allowed: " +
          RegainToolkit.toPercentString(maxAbortedPercent) + ").",
          null, true);
      }
    }

    // Write the error and dead link lists
    writeDeadlinkAndErrorList();
    writeCrawledURLsList();

    // finalize index
    if (mIndexWriterManager != null) {
      boolean thereWereFatalErrors = (mFatalErrorCount > 0);
      if (thereWereFatalErrors) {
        mLog.warn("There were " + mFatalErrorCount + " fatal errors. " +
          "The index will be finished but put into quarantine.");
      } else {
        mLog.info("Finishing the index and providing it to the search mask");
      }
      try {
        mIndexWriterManager.close(thereWereFatalErrors);
      } catch (RegainException exc) {
        logError("Finishing index failed!", exc, true);
      }
      mIndexWriterManager = null;
    }

    mLog.info("... Finished crawling\n");

    mLog.info(Profiler.getProfilerResults());

    // get system specific line break
    String lineSeparator = RegainToolkit.getLineSeparator();

    mLog.info("Statistics:" + lineSeparator
      + "  Ignored URLs:       " + mUrlChecker.getIgnoredCount() + lineSeparator
      + "  Documents in index: " + entryCount + lineSeparator
      + "  Dead links:         " + mDeadlinkList.size() + lineSeparator
      + "  Errors:             " + mErrorCount + lineSeparator
      + "  Error ratio:        " + RegainToolkit.toPercentString(failedPercent));
  }

  private File createTempDir() {
    // Get the directory where the files should be put in
    File listDir;
    if (mConfiguration.getBuildIndex()) {
      listDir = new File(mConfiguration.getIndexDir() + File.separator + "temp" + File.separator + "log");
    } else {
      listDir = new File("log");
    }
    try {
      // Create the directory if it doesn't exist
      if (!listDir.exists()) {
        if (!listDir.mkdir()) {
          throw new IOException("Creating directory failed: " + listDir.getAbsolutePath());
        }
      }
    } catch (IOException exc) {
      logError("Writing deadlink list and error list failed", exc, false);
    }
    return listDir;
  }


  /**
   * Handles an exception caused by a failed document loading. Checks whether
   * the exception was caused by a dead link and adds it to the dead link list
   * if necessary.
   *
   * @param exc The exception to check.
   * @param job The job of the document.
   */
  private void handleDocumentLoadingException(RegainException exc, CrawlerJob job) {
    if (isExceptionFromDeadLink(exc)) {
      // Don't put this exception in the error list, because it's already in
      // the dead link list. (Use mLog.error() directly)
      mLog.error("Dead link: '" + job.getUrl() + "'. Found in '" + job.getSourceUrl()
                 + "'", exc);
      mDeadlinkList.add(new Object[] { job.getUrl(), job.getSourceUrl() });
    } else {
      logError("Loading " + job.getUrl() + " failed!", exc, false);
    }
  }


  /**
   * Adds all start URLs to the job list.
   */
  private void addStartUrls() {
    // Get the start URLs from the config
    StartUrl[] startUrlArr = mConfiguration.getStartUrls();
   
    // Normalize the start URLs
    startUrlArr = mUrlChecker.normalizeStartUrls(startUrlArr);
    mLog.info("Found " + startUrlArr.length + " startURLs.");
   
    // Add the start URLs as jobs
    for (int i = 0; i < startUrlArr.length; i++) {
      String url = startUrlArr[i].getUrl();
      boolean shouldBeParsed = startUrlArr[i].getShouldBeParsed();
      boolean shouldBeIndexed = startUrlArr[i].getShouldBeIndexed();

      addJob(url, "Start URL from configuration", shouldBeParsed,
          shouldBeIndexed, null);
    }
  }

  /**
   * Reads the authentication properties of all entries.
   *
   * @param authProps The properties to read the entries from.
   */
  private void readAuthenticationProperties(Properties authProps) {

    try {
      mLog.info("Read authentication entries from authentication properties.");

      // Java 6
      Set<String> keys = new HashSet<String>();
      //keys = authProps.stringPropertyNames();
      // Java 5
      for (Enumeration e = authProps.propertyNames(); e.hasMoreElements();) {
        keys.add((String) e.nextElement());
      }

      Iterator iter = keys.iterator();
      // Iterate over all keys
      while (iter.hasNext()) {
        String key = (String) iter.next();
        String parts[] = key.split("\\.");

        String url = CrawlerToolkit.createURLFromProps(parts);
        // Check for url in HashMap
        AccountPasswordEntry acPassEntry;
        if (accountPasswordStore.containsKey(url)) {
          acPassEntry = accountPasswordStore.get(url);
        } else {
          acPassEntry = new AccountPasswordEntry();
        }

        if (key.indexOf(".account") != -1) {
          acPassEntry.setAccountName(URLEncoder.encode(authProps.getProperty(key), "UTF-8"));
          mLog.debug("Found account name: " + acPassEntry.getAccountName() + " for auth entry: " + url);
        } else if (key.indexOf(".password") != -1) {
          Base64 decoder = new Base64();
          acPassEntry.setPassword(new String(decoder.decode(authProps.getProperty(key))));
          mLog.debug("Found password for auth entry: " + url);
        }
        // Write the updated entry back to the map
        mLog.debug("write entry for url >" + url + "<" + " with username/password into authentication store.");
        accountPasswordStore.put(url, acPassEntry);

      }

    } catch (Exception e) {
      mLog.error("Error handling authentication.properties. ", e);
    }

  }

  /**
   * Sets the "should be updated"-flag for each entry in the white list.
   *
   * @param whiteList The white list to process.
   * @param onlyEntriesArr The names of the white list entries, that should be
   *        updated. If <code>null</code> or empty, all entries will be updated.
   * @param updateIndex Specifies whether an already existing index will be
   *        updated in this crawler run.
   * @return The processed white list.
   */
  private WhiteListEntry[] useOnlyWhiteListEntries(WhiteListEntry[] whiteList,
    String[] onlyEntriesArr, boolean updateIndex)
  {
    // NOTE: At that moment all white list entries are set to "should be updated"

    if ((onlyEntriesArr != null) && (onlyEntriesArr.length != 0)) {
      if (updateIndex) {
        // First set all white list entries to "should NOT be updated".
        for (int i = 0; i < whiteList.length; i++) {
          whiteList[i].setShouldBeUpdated(false);
        }

        // Now set those entries to "should be updated" that are in the list
        for (int i = 0; i < onlyEntriesArr.length; i++) {
          // Find the matching white list entry
          WhiteListEntry entry = null;
          for (int j = 0; j < whiteList.length; j++) {
            if (onlyEntriesArr[i].equals(whiteList[j].getName())) {
              entry = whiteList[j];
              break;
            }
          }

          if (entry == null) {
            // No matching white list entry found
            logError("There is no white list entry named '" + onlyEntriesArr[i]
                     + "'", null, true);
          } else {
            entry.setShouldBeUpdated(true);
          }
        }

        // Log all ignored entries
        for (int i = 0; i < whiteList.length; i++) {
          if (! whiteList[i].shouldBeUpdated()) {
            mLog.info("Ignoring white list entry: " + whiteList[i].getUrlMatcher());
          }
        }
      } else {
        mLog.warn("Unable to ignore white list entries, because a new index " +
                  "will be created");
      }
    }
   
    return whiteList;
  }

  /**
   * Writes the URLs of all crawl jobs into a file.
   */
  private void writeCrawledURLsList() {
    if (mUrlChecker.getmAcceptedUrlSet() != null) {

      File listDir = createTempDir();
      FileOutputStream stream = null;
      PrintStream printer = null;

      Iterator crawledURLsIter = mUrlChecker.getmAcceptedUrlSet().iterator();
      try {
        stream = new FileOutputStream(new File(listDir, "crawledURLs.txt"));
        printer = new PrintStream(stream);

        while (crawledURLsIter.hasNext()) {
          printer.println((String) crawledURLsIter.next());
        }
        printer.close();
        stream.close();

      } catch (IOException exc) {
        logError("Writing crawled URLs failed", exc, false);
      } finally {
        if (printer != null) {
          printer.close();
        }
        if (stream != null) {
          try {
            stream.close();
          } catch (IOException exc) {
          }
        }
      }
    }
  }

  /**
   * Writes the dead link and error lists to the log file and additionally to
   * files of their own. These files are located in a subdirectory named 'log'.
   * If indexing is enabled, this subdirectory is located in the index;
   * if indexing is disabled, it is located in the current directory.
   */
  private void writeDeadlinkAndErrorList() {
    if (mDeadlinkList.isEmpty() && (mErrorCount == 0)) {
      // Nothing to do
      return;
    }
    File listDir = createTempDir();

    String msg;
    FileOutputStream stream = null;
    PrintStream printer = null;
    try {

      // Write the deadlink list
      if (! mDeadlinkList.isEmpty()) {
        stream = new FileOutputStream(new File(listDir, "deadlinks.txt"));
        printer = new PrintStream(stream);

        msg = "There were " + mDeadlinkList.size() + " dead links:";
        System.out.println(msg);
        printer.println(msg);

        Iterator iter = mDeadlinkList.iterator();
        for (int i = 0; iter.hasNext(); i++) {
          Object[] tupel = (Object[]) iter.next();
          String url = (String) tupel[0];
          String sourceUrl = (String) tupel[1];

          msg = "  Dead link #" + (i + 1) + ": '" + url + "' found in '" + sourceUrl + "'";
          System.out.println(msg);
          printer.println(msg);
        }

        printer.close();
        stream.close();
      }

      // Write the error list
      if (mErrorCount > 0) {
        mLog.warn("There were " + mErrorCount + " errors");
      }
    }
    catch (IOException exc) {
      logError("Writing deadlink list and error list failed", exc, false);
    }
    finally {
      if (printer != null) {
        printer.close();
      }
      if (stream != null) {
        try { stream.close(); } catch (IOException exc) {}
      }
    }
  }



  /**
   * Checks whether the exception was caused by a dead link.
   *
   * @param thr The exception to check.
   * @return Whether the exception was caused by a dead link.
   */
  private boolean isExceptionFromDeadLink(Throwable thr) {
    if (thr instanceof HttpStreamException) {
      HttpStreamException exc = (HttpStreamException) thr;
      return exc.isHttpReturnCodeFromDeadLink();
    }
    else if (thr instanceof RegainException) {
      RegainException exc = (RegainException) thr;
      return isExceptionFromDeadLink(exc.getCause());
    }
    else {
      return false;
    }
  }


  /**
   * Searches a directory for URLs, i.e. files and subdirectories.
   * The method creates a new job for every match.
   *
   * @param dir the directory to parse
   * @throws RegainException If encoding of the found URLs failed.
   */
  private void parseDirectory(File dir) throws RegainException {
    // Get the URL for the directory
    String sourceUrl = RegainToolkit.fileToUrl(dir);

    // Parse the directory
    File[] childArr = dir.listFiles();

    // dir.listFiles() can return null, because of "(Access denied)" / "(Zugriff verweigert)"
    if (childArr == null) {
      if (dir.canRead() == false) {
        throw new RegainException("canRead() on file returned: false. Maybe no access rights for sourceURL: " + sourceUrl);
      } else {
        throw new RegainException("listFiles() returned: null array. Maybe no access rights for sourceURL: " + sourceUrl);
      }
    }

    for (int childIdx = 0; childIdx < childArr.length; childIdx++) {
      // Get the URL for the current child file
      String url = RegainToolkit.fileToUrl(childArr[childIdx]);

      // Check whether this is a directory
      if (childArr[childIdx].isDirectory()) {
        // It's a directory -> Add a parse job
        addJob(url, sourceUrl, true, false, null);
      } else {
        // It's a file -> Add an index job
        addJob(url, sourceUrl, false, true, null);
      }
    }
  }

  /**
   * Searches a Samba directory for URLs, i.e. files and subdirectories.
   * The method creates a new job for every match.
   *
   * @param dir the directory to parse
   * @throws RegainException If encoding of the found URLs failed.
   */
  private void parseSmbDirectory(SmbFile dir) throws RegainException {
 
    try {
      // Get the URL for the directory
      String sourceUrl = dir.getCanonicalPath();

      // Parse the directory
      SmbFile[] childArr = dir.listFiles();
      for (int childIdx = 0; childIdx < childArr.length; childIdx++) {
        // Get the URL for the current child file
        String url = childArr[childIdx].getCanonicalPath();
        // Remove domain, username, password
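        // Illustrative: smb://DOMAIN;user:password@server/share/file
        //            -> smb://server/share/file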
        if (url.contains("@")) {
          url = "smb://" + url.substring(url.indexOf("@") + 1);
        }
        // Check whether this is a directory
        if (childArr[childIdx].isDirectory()) {
          // It's a directory -> Add a parse job
          addJob(url, sourceUrl, true, false, null);
        } else {
          // It's a file -> Add an index job
          addJob(url, sourceUrl, false, true, null);
        }
      }
    } catch( Exception ex ) {
      throw new RegainException(ex.getMessage(), ex);
    }
        
  }

  /**
   * Searches an IMAP directory for folders and counts the contained messages.
   * The method creates a new job for every non-empty folder.
   *
   * @param folderUrl the URL of the folder to parse
   * @throws RegainException If encoding of the found URLs failed.
   */
  private void parseIMAPFolder(String folderUrl) throws RegainException {

    mLog.debug("Determine IMAP subfolder for: " + folderUrl);
    Session session = Session.getInstance(new Properties());

    URLName originURLName = new URLName(
      CrawlerToolkit.replaceAuthenticationValuesInURL(folderUrl,
      CrawlerToolkit.findAuthenticationValuesForURL(folderUrl, accountPasswordStore)));
    // Replace all %20 with whitespace in folder paths
    String folder = "";
    if (originURLName.getFile() != null) {
      folder = originURLName.getFile().replaceAll("%20", " ");
    }
    URLName urlName = new URLName(originURLName.getProtocol(), originURLName.getHost(),
      originURLName.getPort(), folder, originURLName.getUsername(), originURLName.getPassword());

    Map<String, Integer> folderList = new Hashtable<String, Integer>();
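    // Maps each subfolder name to its message count (filled by
    // ImapToolkit.getAllFolders() below).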

    try {
      IMAPSSLStore imapStore = new IMAPSSLStore(session, urlName);

      imapStore.connect();
      IMAPFolder startFolder;

      if (urlName.getFile() == null) {
        // There is no folder given
        startFolder = (IMAPFolder) imapStore.getDefaultFolder();
      } else {
        startFolder = (IMAPFolder) imapStore.getFolder(urlName.getFile());
      }

      // Find messages (if the folder exists and could be opened)
      if (startFolder.exists()) {
        try {
          startFolder.open(Folder.READ_ONLY);
          Message[] msgs = startFolder.getMessages();
          for (int i = 0; i < msgs.length; i++) {
            MimeMessage message = (MimeMessage) msgs[i];
            // It's a message -> Add an index job
            addJob(folderUrl + "/message_" + startFolder.getUID(message), folderUrl, false, true, null);
          }
          startFolder.close(false);

        } catch (MessagingException messageEx) {
          mLog.debug("Could not open folder for reading but this is not an errror. Folder URL is " + folderUrl);
        }
      }
      // Find all subfolders
      folderList = ImapToolkit.getAllFolders(startFolder, false);

      // Iterate over all subfolders
      for (Map.Entry<String, Integer> entry : folderList.entrySet()) {
        // It's a directory -> Add a parse job
        String newFolder;
        if( folderUrl == null || folderUrl.length() == 0 || !folderUrl.endsWith("/")) {
          newFolder =  "/" + (String) entry.getKey();
        } else {
          newFolder =  (String) entry.getKey();
        }
        addJob(folderUrl + newFolder, folderUrl, true, false, null);
      }
      imapStore.close();

    } catch (Exception ex) {
      throw new RegainException("Couldn't determine IMAP entries.", ex);
    }
  }

  /**
   * Creates crawler jobs from the enclosed links. Every link is checked
   * against the white/black list.
   *
   * @param rawDocument A document with or without links
   * @throws net.sf.regain.RegainException if an exception occurs during job creation
   */
  private void createCrawlerJobs(RawDocument rawDocument) throws RegainException {
    if( rawDocument.hasLinks() ){
      // Iterate over all found links in the document
      for (Iterator iter = rawDocument.getLinks().entrySet().iterator(); iter.hasNext();){
        Map.Entry entry = (Map.Entry)iter.next();
        // The intention of this call is only to determine the link-extraction and indexing property
        UrlMatcher urlMatch = mUrlChecker.isUrlAccepted((String)entry.getKey());
        // Add the job
        addJob((String)entry.getKey(), rawDocument.getUrl(),
          urlMatch.getShouldBeParsed(), urlMatch.getShouldBeIndexed(), (String)entry.getValue());
      }
    }
  }

  /**
   * Searches the content of an HTML document for URLs and creates a new job
   * for every match.
   *
   * @param rawDocument The document to search.
   * @throws RegainException If the document could not be read.
   */
  private void parseHtmlDocument(RawDocument rawDocument) throws RegainException {
    for (int i = 0; i < mHtmlParserPatternReArr.length; i++) {
      RE re = mHtmlParserPatternReArr[i];
      int urlGroup = mHtmlParserUrlPatternArr[i].getRegexUrlGroup();
      boolean shouldBeParsed = mHtmlParserUrlPatternArr[i].getShouldBeParsed();
      boolean shouldBeIndexed = mHtmlParserUrlPatternArr[i].getShouldBeIndexed();

      int offset = 0;
      String contentAsString = rawDocument.getContentAsString();
      try {
        while (re.match(contentAsString, offset)) {
          offset = re.getParenEnd(0);

          String parentUrl = rawDocument.getUrl();
          String url = re.getParen(urlGroup);

          if (url != null) {
            // Convert the URL to an absolute URL
            url = CrawlerToolkit.toAbsoluteUrl(url, parentUrl);

            // Try to get a link text
            String linkText = getLinkText(contentAsString, offset);

            // Add the job
            addJob(url, parentUrl, shouldBeParsed, shouldBeIndexed, linkText);
          }
        }
      } catch( Throwable ex ) {
        throw new RegainException("Too many links in document.", ex);
       
      }
    }
  }

  /**
   * Tries to extract a link text from a position where a URL was found.
   *
   * @param content The content to extract the link text from
   * @param offset The offset where to start looking
   * @return A link text or <code>null</code> if there was no link text found.
   */
  private String getLinkText(String content, int offset) {
    // NOTE: If there is a link text, the content that follows must be something
    //       like: ' someParam="someValue">The link text</a>',
    //       assuming the tag started with '<a href="aDocument.doc"'

    // Find the end of the current tag
    int tagEnd = content.indexOf('>', offset);
    if (tagEnd == -1) {
      // No tag end found
      return null;
    }

    // If there is a link text the next part must be: 'The link text</a>'
    // -> Find the start of the next tag
    int tagStart = content.indexOf('<', tagEnd);
    if (tagStart == -1) {
      // No starting tag found
      return null;
    }

    // Check whether the starting tag is a '</a>' tag.
    if ((content.length() > tagStart + 3)
      && (content.charAt(tagStart + 1) == '/')
      && (content.charAt(tagStart + 2) == 'a')
      && (content.charAt(tagStart + 3) == '>'))
    {
      // We have a link text
      String linkText = content.substring(tagEnd + 1, tagStart);
      linkText = linkText.trim();
      if (linkText.length() == 0) {
        linkText = null;
      }

      return linkText;
    } else {
      // The tag was not a </a> tag, so the text was not a link text
      return null;
    }
  }


  /**
   * Returns the number of errors (this includes fatal and non-fatal errors).
   *
   * @return The number of errors.
   * @see #getFatalErrorCount()
   */
  public int getErrorCount() {
    return mErrorCount;
  }


  /**
   * Returns the number of fatal errors.
   * <p>
   * Fatal errors are errors that prevented the index from being created or
   * updated.
   *
   * @return The number of fatal errors.
   * @see #getErrorCount()
   */
  public int getFatalErrorCount() {
    return mFatalErrorCount;
  }


  /**
   * Logs an error.
   *
   * @param msg The error message.
   * @param thr The error. May be <code>null</code>.
   * @param fatal Specifies whether the error was fatal. An error is fatal if
   *        it prevented the index from being created.
   */
  public void logError(String msg, Throwable thr, boolean fatal) {
    if (fatal) {
      msg = "Fatal: " + msg;
    }
    mLog.error(msg, thr);
    try {
      if (mIndexWriterManager != null) {
        mIndexWriterManager.logError(msg, thr);
      }
    }
    catch (RegainException exc) {
      mLog.error("Logging error in error log of index failed", exc);
    }
   
    mErrorCount ++;
    if (fatal) {
      mFatalErrorCount++;
    }
  }

}
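
/*
 * Usage sketch (not part of the original class): how a crawler run is typically
 * wired up around this class. The configuration class "XmlCrawlerConfig" and the
 * file names below are assumptions for illustration only; the actual CrawlerConfig
 * implementation and paths depend on the regain distribution in use.
 *
 *   CrawlerConfig config = new XmlCrawlerConfig(new File("CrawlerConfiguration.xml"));
 *
 *   Properties authProps = new Properties();
 *   authProps.load(new FileInputStream("authentication.properties"));
 *
 *   Crawler crawler = new Crawler(config, authProps);
 *   // Update an existing index, retry previously failed documents and
 *   // process all white list entries (null = no restriction).
 *   crawler.run(true, true, null);
 */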