// Source code of net.matuschek.spider.RegExpURLCheck

package net.matuschek.spider;


/************************************************
    Copyright (c) 2001/2002 by Daniel Matuschek
 *************************************************/




import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.net.URL;
import java.util.StringTokenizer;
import java.util.Vector;


import org.apache.regexp.RESyntaxException;




/**
 * This URLChecker checks a URL using a list of regular expressions
 * that should be allowed or denied.
 * 
 * @author Daniel Matuschek
 * @version $Revision: 1.4 $
 */
public class RegExpURLCheck 
implements URLCheck
{
  /** vector to store the rules */
  private Vector<RegExpRule> rules = null;


  /** default check result if no matching regexp was found */
  private boolean defaultResult = true;


  /** initializes the object with an empty rule set */
  public RegExpURLCheck() {
    rules = new Vector<RegExpRule>();
  }


  /** 
   * <p>initialized the object with a rule set from an 
   * input stream (e.g. a file)</p>
   *
   * <p>every line of this stream has the format 
   * <code>allow|deny expression</code></p>
   *
   * <p>default value can be set with
   * <code>allow|deny .</code> at the end of the file</p>
   *  
   * <p>lines that start with "#" and empty lines will be
   * ignored</p>
   */
  public RegExpURLCheck(Reader r) 
  throws IOException, 
  org.apache.regexp.RESyntaxException
  {
    this();


    BufferedReader reader = 
      new BufferedReader(r);


    String line = "";
    int lineno=0;


    while (line != null) {
      line=reader.readLine();
      lineno++;


      if ((line != null) &&
          (! line.trim().equals("")) &&
          (! line.startsWith("#"))) {
        StringTokenizer st = new StringTokenizer(line);
        // did we get 2 tokens ?
        if (st.countTokens() != 2) {
          throw new IOException("line "+lineno+" don't consists of 2 fields");
        }


        String allowStr = st.nextToken();
        boolean allow = true;
        String expression = st.nextToken();


        // allow or deny ?
        if (allowStr.equalsIgnoreCase("allow")) {
          allow=true;
        } else if (allowStr.equalsIgnoreCase("deny")) {
          allow=false;
        } else {
          throw new IOException("first token in line "+lineno+
          " has to be allow or deny");
        }


        addRule(expression,allow);
      }
    }
  }




  /** 
   * Sets the default result that will be returned if no matching
   * regular expression was found
   * @param default the default result
   */
  public void setDefaultResult(boolean defaultResult) {
    this.defaultResult = defaultResult;
  }


  /** 
   * Gets the default result that will be returned if no matching
   * regular expression was found
   * @return the default result
   */
  public boolean getDefaultResult() {
    return defaultResult;
  }


  /**
   * Gets the list of rules
   * @return a vector of RegExpRule objects
   */
  public Vector getRules() {
    return rules;
  }


  /**
   * Sets the list of rules
   * @param rules a vector of RegExpRule objects
   */
  public void setRules(Vector<RegExpRule> rules) {
    this.rules=rules;
  }




  /**
   * adds a allow or deny rule
   * @param regExp a String containing the regular expression
   * @param allow allow (TRUE) or deny (FALSE)
   */
  public void addRule(String regExp, boolean allow) 
  throws RESyntaxException
  {
    RegExpRule rule = new RegExpRule();
    rule.setPattern(regExp);
    rule.setAllow(allow);
    rules.add(rule);
  }




  /** 
   * Checks if a given URL is allowed or denied by the rules
   *
   * @return true if a matching "allow" rule was found, 
   * false if a matching "deny" rule was found,
   * the default value if no rule was found
   * @see #setDefaultResult(boolean)
   */
  public boolean checkURL(URL u) {
    String urlStr = u.toString();


    for (int i=0; i<rules.size(); i++) {
      RegExpRule rule = rules.elementAt(i);


      if (rule.match(urlStr)) {
        return rule.getAllow();
      }
    }


    return defaultResult;
  }


  /** 
   * Checks if a given URL is allowed or denied by the rules for processing
   *
   * @return true if a matching "allow" rule was found, 
   * false if a matching "deny" rule was found,
   * the default value if no rule was found
   * @see #setDefaultResult(boolean)
   */
  public boolean checkURLForProcessing(URL u) {
    String urlStr = u.toString();


    for (int i=0; i<rules.size(); i++) {
      RegExpRule rule = rules.elementAt(i);


      if (rule.match(urlStr)) {
        return rule.getProcessAllowed();
      }
    }


    return defaultResult;
  }


} // RegExpURLCheck
// Source code of net.matuschek.spider.RegExpURLCheck
//
// Related classes of net.matuschek.spider.RegExpURLCheck