/*
* regain - A file search engine providing plenty of formats
* Copyright (C) 2004 Til Schneider
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Contact: Til Schneider, info@murfman.de
*
* CVS information:
* $RCSfile$
* $Source$
* $Date: 2010-04-13 12:47:22 +0200 (Di, 13 Apr 2010) $
* $Author: thtesche $
* $Revision: 456 $
*/
package net.sf.regain.crawler;
import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.net.MalformedURLException;
import java.net.URL;
import net.sf.regain.RegainException;
import net.sf.regain.RegainToolkit;
import net.sf.regain.crawler.config.StartUrl;
import net.sf.regain.crawler.config.UrlMatcher;
import net.sf.regain.crawler.config.UrlMatcherResult;
import net.sf.regain.crawler.config.WhiteListEntry;
import org.apache.log4j.Logger;
/**
* Decides whether a URL was already accepted or ignored.
* <p>
 * For this decision we take advantage of a specialty of the processing of file
 * URLs: Since directories are searched by tree traversal, we can be sure that
 * we never find the same file twice in one crawler run. The only thing we have
 * to ensure is that the start URLs are prefix-free among each other
 * (which is done by {@link #normalizeStartUrls(StartUrl[])}).
 * <p>
 * For http-URLs we have to remember all accepted and ignored URLs, because
 * http URLs are found by page parsing, which may discover any URL in any order.
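 * <p>
 * A minimal usage sketch (the white and black lists are assumed to come from
 * the crawler configuration):
 * <pre>
 *   UrlChecker checker = new UrlChecker(whiteList, blackList);
 *   String url = "http://www.example.com/docs/index.html";
 *   UrlMatcher match = checker.isUrlAccepted(url);
 *   if (match.getShouldBeIndexed()) {
 *     checker.setAccepted(url);
 *   }
 * </pre>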
*
* @author Til Schneider, www.murfman.de
*/
public class UrlChecker {
/** The logger for this class. */
private static Logger mLog = Logger.getLogger(UrlChecker.class);
/** Contains all http-URLs that have been accepted. */
private HashSet<String> mAcceptedUrlSet;
/** Contains all http-URLs that have been ignored. */
private HashSet<String> mIgnoredUrlSet;
/** The number of URLs that have been ignored. */
private int mIgnoredCount;
/**
* The white list.
* <p>
 * The white list is an array of WhiteListEntry; a URL <i>must</i> match one of
 * these entries in order to be processed.
*/
private WhiteListEntry[] mWhiteListEntryArr;
/**
* The black list.
* <p>
 * The black list is an array of UrlMatchers; a URL <i>must not</i> match any of
 * these in order to be processed.
*/
private UrlMatcher[] mBlackListArr;
/**
* Creates a new instance of UrlChecker.
*
 * @param whiteList The white list. The white list is an array of
 *        WhiteListEntry; a URL <i>must</i> match one of these entries in order
 *        to be processed.
 * @param blackList The black list. The black list is an array of UrlMatchers;
 *        a URL <i>must not</i> match any of these in order to be processed.
*/
public UrlChecker(WhiteListEntry[] whiteList, UrlMatcher[] blackList) {
mAcceptedUrlSet = new HashSet<String>();
mIgnoredUrlSet = new HashSet<String>();
mIgnoredCount = 0;
mWhiteListEntryArr = whiteList;
mBlackListArr = blackList;
}
/**
 * Normalizes the start URLs by removing every file URL that is already covered
 * by (i.e. has as prefix) another start URL.
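 * <p>
 * Example (hypothetical paths): if the start URLs contain both
 * <code>file:///home/til</code> and <code>file:///home/til/docs</code>, the
 * second entry is removed, because everything below it is already reached when
 * traversing the first one.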
*
* @param urlArr The start URLs to normalize.
* @return The normalized start URLs.
*/
public StartUrl[] normalizeStartUrls(StartUrl[] urlArr) {
// Check whether the file-URLs are prefix free.
boolean foundPrefix = false;
for (int i = 0; i < urlArr.length; i++) {
String currUrl = urlArr[i].getUrl();
if (currUrl.startsWith("file://")) {
// Check whether another URL is a prefix of this URL
for (int j = 0; j < urlArr.length; j++) {
if ((i != j) && (urlArr[j] != null)) {
if (currUrl.startsWith(urlArr[j].getUrl())) {
// URL i is already covered by searching URL j
// (Example: i = "file:///home/til/docs/abc", j = "file:///home/til")
// -> Remove URL i to avoid finding the same file twice
mLog.info("Ignoring start URL '" + currUrl + "', because it is " +
"covered by start URL '" + urlArr[j].getUrl() + "'");
urlArr[i] = null;
foundPrefix = true;
break;
}
}
}
}
}
if (foundPrefix) {
// There were entries removed -> We have to build a new array
      ArrayList<StartUrl> list = new ArrayList<StartUrl>(urlArr.length);
for (int i = 0; i < urlArr.length; i++) {
if (urlArr[i] != null) {
list.add(urlArr[i]);
}
}
urlArr = new StartUrl[list.size()];
list.toArray(urlArr);
}
return urlArr;
}
/**
 * This method tries to detect cycles in a URI. All parts of the path are
 * compared with each other. If the number of duplicate path parts reaches
 * maxCycles, the URI is treated as a 'cycle URI'.
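 * <p>
 * Example (hypothetical URL, assuming empty path parts are dropped when
 * splitting): with maxCycles = 2 the URL
 * <code>http://www.example.com/a/b/a/b/a</code> contains the part 'a' three
 * times and 'b' twice, so the method returns <code>false</code>.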
*
 * @param url The URI to be checked.
 * @param maxCycles The number of duplicate path parts from which on the URI is
 *        treated as a cycle.
 * @return true if the URI has no cycles, false if cycles were detected.
*/
public boolean hasNoCycles(String url, int maxCycles) {
String mPath = "";
boolean mResult = true;
try {
URL mUrl = new URL(url);
mPath = mUrl.getPath();
} catch (MalformedURLException ex) {
      // This should never happen. We assume all URLs were checked before.
return mResult;
}
if (mPath.length() < 2) {
return mResult;
}
String[] mParts = RegainToolkit.splitString(mPath, "/");
    HashSet<String> uniqueParts = new HashSet<String>();
    // Add every part to a set. Duplicate parts collapse into a single entry,
    // so the set ends up smaller than the part array if the path repeats itself.
for (int i = 0; i < mParts.length; i++) {
if (mLog.isDebugEnabled()) {
mLog.debug("Add part: '" + mParts[i] + "'");
}
uniqueParts.add(mParts[i]);
}
if (mLog.isDebugEnabled()) {
mLog.debug("uniqueParts.size(): " + uniqueParts.size());
mLog.debug("mParts.length: " + mParts.length);
mLog.debug("maxCycles: " + maxCycles);
}
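    // The path contains duplicate parts. Report a cycle only if the number of
    // duplicates (mParts.length - uniqueParts.size()) reaches maxCycles.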
if (uniqueParts.size() != mParts.length) {
if (uniqueParts.size() <= mParts.length - maxCycles) {
mResult = false;
}
}
return mResult;
}
/**
 * Checks whether the URL is accepted by the black and the white list.
 * <p>
 * This is the case if it matches no entry from the black list and at least one
 * entry from the white list.
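 * <p>
 * A sketch with hypothetical list contents: if the white list matches
 * <code>http://www.example.com/</code> and the black list matches
 * <code>http://www.example.com/private/</code>, then for the URL
 * <code>http://www.example.com/private/a.html</code> the returned matcher
 * reports <code>false</code> for both parsing and indexing.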
*
 * @param url The URL to check.
 * @return A UrlMatcher telling whether the URL should be parsed and/or indexed.
*/
public UrlMatcher isUrlAccepted(String url) {
UrlMatcher urlMatchResult = new UrlMatcherResult(false, false);
mLog.debug("isUrlAccepted for url: " + url);
    // Check whether this URL matches a white list entry
for (int i = 0; i < mWhiteListEntryArr.length; i++) {
if (mWhiteListEntryArr[i].shouldBeUpdated()) {
UrlMatcher matcher = mWhiteListEntryArr[i].getUrlMatcher();
if (matcher.matches(url)) {
// get the values for link extraction and indexing
// from the current matcher hit
urlMatchResult.setShouldBeParsed(matcher.getShouldBeParsed());
urlMatchResult.setShouldBeIndexed(matcher.getShouldBeIndexed());
mLog.debug("Whitelist matches for url: " + url);
break;
}
}
}
    // Check whether this URL matches a black list entry.
    // This check is only needed if there was a white list hit.
if( urlMatchResult.getShouldBeParsed() || urlMatchResult.getShouldBeIndexed() ) {
for (int i = 0; i < mBlackListArr.length; i++) {
if (mBlackListArr[i].matches(url)) {
urlMatchResult.setShouldBeParsed(false);
urlMatchResult.setShouldBeIndexed(false);
mLog.debug("Blacklist matches for url: " + url);
}
}
}
return urlMatchResult;
}
/**
* Creates an array of UrlMatchers that identify URLs that should not be
* deleted from the search index.
* <p>
 * This list corresponds to the white list entries whose
 * <code>shouldBeUpdated</code> flag is <code>false</code>.
*
* @return An array of UrlMatchers that identify URLs that should not be
* deleted from the search index.
* @see WhiteListEntry#shouldBeUpdated()
*/
public UrlMatcher[] createPreserveUrlMatcherArr() {
    ArrayList<UrlMatcher> list = new ArrayList<UrlMatcher>();
for (int i = 0; i < mWhiteListEntryArr.length; i++) {
if (! mWhiteListEntryArr[i].shouldBeUpdated()) {
list.add(mWhiteListEntryArr[i].getUrlMatcher());
}
}
UrlMatcher[] asArr = new UrlMatcher[list.size()];
list.toArray(asArr);
return asArr;
}
/**
* Decides whether the given URL was already accepted in a crawler run.
*
* @param url The URL to check.
* @return Whether the URL was already accepted.
*/
public boolean wasAlreadyAccepted(String url) {
if (url.startsWith("file://")) {
// This is a file URL -> We haven't found it yet (Why? See class javadoc)
return false;
} else {
return getmAcceptedUrlSet().contains(url);
}
}
/**
* Decides whether the given URL was already ignored in a crawler run.
*
* @param url The URL to check.
* @return Whether the URL was already ignored.
*/
public boolean wasAlreadyIgnored(String url) {
if (url.startsWith("file://")) {
// This is a file URL -> We haven't found it yet (Why? See class javadoc)
return false;
} else {
return mIgnoredUrlSet.contains(url);
}
}
/**
* Decides whether the given URL should be kept in the search index.
*
* @param url The URL to check.
* @return Whether the URL should be kept in the search index.
* @throws RegainException If the url is invalid.
*/
public boolean shouldBeKeptInIndex(String url) throws RegainException {
if (url.startsWith("file://")) {
// This is a file URL -> We have no information whether this file exists
// since we didn't remember whether it was accepted or not.
// Check whether the url is accepted by the white and black list
UrlMatcher urlMatch = isUrlAccepted(url);
if (! urlMatch.getShouldBeIndexed() ) {
// This file is not accepted -> Remove it from the index
return false;
}
// Check whether the file exists
File file = RegainToolkit.urlToFile(url);
if (! file.exists()) {
// This file does not exist -> Remove it from the index
return false;
}
// All tests passed -> Keep the file
return true;
} else {
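      // http-URLs are kept only if they were accepted (i.e. found again) in
      // this crawler run.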
return getmAcceptedUrlSet().contains(url);
}
}
/**
* Used by the crawler to set the accepted state for a certain URL.
*
* @param url The URL that was accepted by the crawler.
*/
public void setAccepted(String url) {
if (url.startsWith("file://")) {
      // This is a file URL -> We don't have to remember it (Why? See class javadoc)
} else {
getmAcceptedUrlSet().add(url);
}
}
/**
* Used by the crawler to set the ignored state for a certain URL.
*
* @param url The URL that was ignored by the crawler.
*/
public void setIgnored(String url) {
mIgnoredCount++;
if (url.startsWith("file://")) {
      // This is a file URL -> We don't have to remember it (Why? See class javadoc)
} else {
mIgnoredUrlSet.add(url);
}
}
/**
* Gets the number of URLs that have been ignored.
*
* @return The number of URLs that have been ignored.
*/
public int getIgnoredCount() {
return mIgnoredCount;
}
/**
* Gets the set of accepted URLs.
*
 * @return The set of accepted http-URLs.
*/
  public HashSet<String> getmAcceptedUrlSet() {
return mAcceptedUrlSet;
}
}