package net.matuschek.spider;
/************************************************
Copyright (c) 2001/2002 by Daniel Matuschek
*************************************************/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.net.URL;
import java.util.StringTokenizer;
import java.util.Vector;
import org.apache.regexp.RESyntaxException;
/**
 * A {@link URLCheck} that decides whether a URL is allowed or denied by
 * testing it against an ordered list of regular-expression rules.
 *
 * <p>Rules are evaluated in the order in which they were added; the first
 * rule that matches the URL decides the result. If no rule matches, the
 * configurable default result is returned.</p>
 *
 * @author Daniel Matuschek
 * @version $Revision: 1.4 $
 */
public class RegExpURLCheck
  implements URLCheck
{
  /** Ordered rule list; the first matching rule wins. */
  private Vector<RegExpRule> rules;

  /** Result returned by the check methods when no rule matches. */
  private boolean defaultResult = true;

  /** Initializes the object with an empty rule set. */
  public RegExpURLCheck() {
    rules = new Vector<RegExpRule>();
  }

  /**
   * <p>Initializes the object with a rule set read from a character
   * stream (e.g. a file).</p>
   *
   * <p>Every rule line has the format
   * <code>allow|deny expression</code> (exactly two whitespace-separated
   * fields; the keyword is case-insensitive).</p>
   *
   * <p>A default value can be set with
   * <code>allow|deny .</code> at the end of the file.</p>
   *
   * <p>Blank lines and lines whose first non-whitespace character is
   * "#" are ignored.</p>
   *
   * <p>The given reader is wrapped in a {@link BufferedReader} and read
   * until end of stream; it is not closed by this constructor.</p>
   *
   * @param r source of the rule definitions
   * @throws IOException if reading fails, a rule line does not consist of
   *         exactly two fields, or the first field is neither
   *         "allow" nor "deny"
   * @throws org.apache.regexp.RESyntaxException propagated from
   *         {@link #addRule(String, boolean)} for a rule expression
   */
  public RegExpURLCheck(Reader r)
    throws IOException,
           org.apache.regexp.RESyntaxException
  {
    this();

    BufferedReader reader = new BufferedReader(r);
    int lineno = 0;
    String line;

    while ((line = reader.readLine()) != null) {
      lineno++;

      // Trim first so that indented comments and whitespace-only lines
      // are skipped as well, instead of being parsed as (invalid) rules.
      String trimmed = line.trim();
      if (trimmed.equals("") || trimmed.startsWith("#")) {
        continue;
      }

      StringTokenizer st = new StringTokenizer(trimmed);
      // did we get 2 tokens ?
      if (st.countTokens() != 2) {
        throw new IOException("line "+lineno+" don't consists of 2 fields");
      }

      String allowStr = st.nextToken();
      String expression = st.nextToken();

      // allow or deny ?
      boolean allow;
      if (allowStr.equalsIgnoreCase("allow")) {
        allow = true;
      } else if (allowStr.equalsIgnoreCase("deny")) {
        allow = false;
      } else {
        throw new IOException("first token in line "+lineno+
                              " has to be allow or deny");
      }

      addRule(expression, allow);
    }
  }

  /**
   * Sets the default result that will be returned if no matching
   * regular expression was found.
   *
   * @param defaultResult the default result
   */
  public void setDefaultResult(boolean defaultResult) {
    this.defaultResult = defaultResult;
  }

  /**
   * Gets the default result that will be returned if no matching
   * regular expression was found.
   *
   * @return the default result
   */
  public boolean getDefaultResult() {
    return defaultResult;
  }

  /**
   * Gets the list of rules.
   *
   * @return the internal vector of RegExpRule objects (not a copy)
   */
  public Vector<RegExpRule> getRules() {
    return rules;
  }

  /**
   * Sets the list of rules. The vector is stored as-is (not copied).
   *
   * @param rules a vector of RegExpRule objects
   */
  public void setRules(Vector<RegExpRule> rules) {
    this.rules = rules;
  }

  /**
   * Appends an allow or deny rule to the end of the rule list.
   *
   * @param regExp a String containing the regular expression
   * @param allow allow (TRUE) or deny (FALSE)
   * @throws RESyntaxException declared for the pattern setup of the new
   *         rule
   */
  public void addRule(String regExp, boolean allow)
    throws RESyntaxException
  {
    RegExpRule rule = new RegExpRule();
    rule.setPattern(regExp);
    rule.setAllow(allow);
    rules.add(rule);
  }

  /**
   * Checks if a given URL is allowed or denied by the rules.
   *
   * @param u the URL to check
   * @return true if a matching "allow" rule was found,
   *         false if a matching "deny" rule was found,
   *         the default value if no rule was found
   * @see #setDefaultResult(boolean)
   */
  public boolean checkURL(URL u) {
    RegExpRule rule = findMatchingRule(u);
    return (rule != null) ? rule.getAllow() : defaultResult;
  }

  /**
   * Checks if a given URL is allowed or denied by the rules for processing.
   *
   * @param u the URL to check
   * @return the "process allowed" setting of the first matching rule,
   *         or the default value if no rule was found
   * @see #setDefaultResult(boolean)
   */
  public boolean checkURLForProcessing(URL u) {
    RegExpRule rule = findMatchingRule(u);
    return (rule != null) ? rule.getProcessAllowed() : defaultResult;
  }

  /**
   * Returns the first rule, in insertion order, that matches the string
   * form of the given URL, or null if no rule matches.
   */
  private RegExpRule findMatchingRule(URL u) {
    String urlStr = u.toString();
    for (int i = 0; i < rules.size(); i++) {
      RegExpRule rule = rules.elementAt(i);
      if (rule.match(urlStr)) {
        return rule;
      }
    }
    return null;
  }
} // RegExpURLCheck