package net.matuschek.jobo;
/************************************************
Copyright (c) 2001/2002 by Daniel Matuschek
*************************************************/
import java.io.File;
import java.io.FileWriter;
import java.io.Writer;
import net.matuschek.http.DownloadRuleSet;
import net.matuschek.http.HttpDocToFile;
import net.matuschek.http.HttpToolCallback;
import net.matuschek.spider.RegExpURLCheck;
import net.matuschek.spider.WebRobot;
import net.matuschek.spider.WebRobotCallback;
import net.matuschek.spider.docfilter.FilterChain;
import net.matuschek.spider.docfilter.LinkLocalizer;
import org.apache.log4j.Category;
import org.exolab.castor.mapping.Mapping;
import org.exolab.castor.xml.Marshaller;
import org.exolab.castor.xml.Unmarshaller;
import org.xml.sax.InputSource;
/**
 * Bundles everything JoBo needs to run: the web robot, the URL check,
 * the download rules, the document store and the document filter chain.
 * An instance can be written to and read from XML via Castor
 * (see {@link #saveConfig(String)} and {@link #createFromXML(String)}).
 *
 * @author Daniel Matuschek
 * @version $Revision: 1.21 $
 */
public class JoBoBase {

  /**
   * Log4J logging. Bound once to this class; it is static and must not be
   * re-assigned per instance, because the static factory methods log too.
   */
  private static final Category log = Category.getInstance(JoBoBase.class);

  /** Name of the file used for the XML-to-Java mapping */
  private static final String MAPPING_FILE = "mapping.xml";

  /** Name of the file holding the JoBo configuration in XML */
  private static final String XML_CONFIG = "jobo.xml";

  /** Base directory handed to the document store */
  private String storageDirectory = "/tmp";

  private WebRobot robot = null;
  private RegExpURLCheck urlcheck = null;
  private DownloadRuleSet downloadrules = null;
  private HttpDocToFile docstore = null;

  /** Filter to localize included links */
  private LinkLocalizer linkLocalizer = null;

  /** FilterChain with all filters */
  private FilterChain filters = null;

  /**
   * Creates the document store, the default filter chain and a new
   * robot that uses this filter chain.
   *
   * @exception ClassNotFoundException if the Robot could not be instantiated
   * for some reason
   */
  public JoBoBase()
    throws ClassNotFoundException
  {
    docstore = new HttpDocToFile(storageDirectory);
    // NOTE(review): initializeFilters() is public and overridable but is
    // called from the constructor; kept for backward compatibility.
    initializeFilters();
    robot = new WebRobot();
    robot.setFilters(filters);
  }

  /**
   * Set the default filter chain: a new chain that contains only the
   * link localizer.
   */
  public void initializeFilters() {
    filters = new FilterChain();
    linkLocalizer = new LinkLocalizer();
    filters.add(linkLocalizer);
  }

  /**
   * Write the settings to an XML file. The mapping file (mapping.xml)
   * is looked up relative to the current working directory. Errors are
   * logged, not thrown.
   *
   * @param filename name of the XML file to write
   */
  public void saveConfig(String filename) {
    File mappingFile = new File(MAPPING_FILE);
    if (!mappingFile.exists()) {
      log.error("mapping file " + MAPPING_FILE + " not found");
      return;
    }

    Mapping mapping = new Mapping();
    Writer writer = null;
    try {
      mapping.loadMapping(MAPPING_FILE);
      writer = new FileWriter(filename);
      Marshaller marshaller = new Marshaller(writer);
      marshaller.setMapping(mapping);
      marshaller.marshal(this);
      // close on the success path inside the try, so that a failing
      // close (lost data) is reported as an error as well
      writer.close();
      writer = null;
      log.info("written to XML");
    } catch (Exception e) {
      log.error("could not write configuration to " + filename, e);
    } finally {
      // only reached with an open writer if something above failed;
      // make sure the file handle is released in that case
      if (writer != null) {
        try {
          writer.close();
        } catch (java.io.IOException closeError) {
          log.warn("could not close " + filename, closeError);
        }
      }
    }
  }

  /**
   * Registers a callback for the HTTP tool with the robot.
   *
   * @param cb the callback to pass to the robot
   */
  public void registerHttpToolCallback(HttpToolCallback cb) {
    robot.setHttpToolCallback(cb);
  }

  /**
   * Registers a robot callback with the robot.
   *
   * @param cb the callback to pass to the robot
   */
  public void registerWebRobotCallback(WebRobotCallback cb) {
    robot.setWebRobotCallback(cb);
  }

  /**
   * Registers the regexpurlcheck, the download rules, the document
   * store and the filter chain with the robot.
   */
  public void configureRobot() {
    robot.setURLCheck(urlcheck);
    robot.setDownloadRuleSet(downloadrules);
    robot.setDocManager(docstore);
    robot.setFilters(filters);
  }

  /**
   * Get the value of urlcheck.
   * @return Value of urlcheck.
   */
  public RegExpURLCheck getURLCheck () {
    return urlcheck;
  }

  /**
   * Set the value of urlcheck. It is passed to the robot by
   * {@link #configureRobot()}.
   * @param urlcheck Value to assign to urlcheck.
   */
  public void setURLCheck(RegExpURLCheck urlcheck ) {
    this.urlcheck = urlcheck;
  }

  /**
   * Get the value of robot.
   * @return Value of robot.
   */
  public WebRobot getRobot () {
    return robot;
  }

  /**
   * Set the value of robot. The new robot will use the
   * filters that are defined in JoBoBase, even if it had
   * its own FilterChain before.
   *
   * @param robot WebRobot object to use
   */
  public void setRobot(WebRobot robot) {
    this.robot = robot;
    robot.setFilters(filters);
  }

  /**
   * Localize links ?
   *
   * @param localize if this is true, JoBo will try to replace
   * absolute links by relative ones
   */
  public void setLocalizeLinks(boolean localize)
  {
    if (localize) {
      linkLocalizer.enable();
    } else {
      linkLocalizer.disable();
    }
  }

  /**
   * Is link localization enabled ?
   *
   * @return the enabled state of the link localizer
   */
  public boolean getLocalizeLinks() {
    return linkLocalizer.isEnabled();
  }

  /**
   * Get the value of downloadRules.
   * @return Value of downloadRules.
   */
  public DownloadRuleSet getDownloadRuleSet () {
    return downloadrules;
  }

  /**
   * Set the value of downloadRules. It is passed to the robot by
   * {@link #configureRobot()}.
   * @param downloadRuleSet Value to assign to downloadRules.
   */
  public void setDownloadRuleSet(DownloadRuleSet downloadRuleSet) {
    this.downloadrules = downloadRuleSet;
  }

  /**
   * Get the value of storageDirectory.
   * @return Value of storageDirectory.
   */
  public String getStorageDirectory () {
    return storageDirectory;
  }

  /**
   * Set the value of storageDirectory and use it as the base
   * directory of the document store.
   * @param storageDirectory Value to assign to storageDirectory.
   */
  public void setStorageDirectory(String storageDirectory ) {
    this.storageDirectory = storageDirectory;
    docstore.setBaseDir(storageDirectory);
  }

  /**
   * Enable/disable storing of dynamic documents (with a "?"
   * somewhere in the URL). The setting is kept in the document store.
   *
   * @param storeCGI true: enable storing of <b>all</b> documents,
   * false: store only documents with an URL without "?"
   */
  public void setStoreCGI(boolean storeCGI) {
    this.docstore.setStoreCGI(storeCGI);
  }

  /**
   * Get the status of storeCGI
   *
   * @return the current status of storeCGI
   * @see #setStoreCGI for more information
   */
  public boolean getStoreCGI() {
    return this.docstore.getStoreCGI();
  }

  /**
   * Unmarshal the object from an XML file (jobo.xml) in the current
   * directory
   *
   * @return the configured JoBoBase object
   * @exception ClassNotFoundException if the Robot could not be instantiated
   * for some reason
   */
  public static JoBoBase createFromXML()
    throws ClassNotFoundException
  {
    return createFromXML(".");
  }

  /**
   * Unmarshal the object from an XML file. If one of the files is
   * missing or unmarshalling fails, the problem is logged and a
   * JoBoBase with default settings is returned instead. In both cases
   * {@link #configureRobot()} has been called on the returned object.
   *
   * @param configDirectory name of the directory where jobo.xml and
   * mapping.xml should be read from.
   * @return the configured JoBoBase object, never null
   * @exception ClassNotFoundException if the Robot could not be instantiated
   * for some reason
   */
  public static JoBoBase createFromXML(String configDirectory)
    throws ClassNotFoundException
  {
    JoBoBase baseobj = null;
    File mappingFile = new File(configDirectory, MAPPING_FILE);
    File configFile = new File(configDirectory, XML_CONFIG);

    if (mappingFile.exists() && configFile.exists()) {
      Mapping mapping = new Mapping();
      try {
        mapping.loadMapping(mappingFile.getPath());
        Unmarshaller unmar = new Unmarshaller(mapping);
        unmar.setDebug(true);
        // NOTE(review): the file path is used as the SAX system id;
        // presumably the parser resolves plain paths - confirm for
        // paths that are not valid URIs.
        baseobj = (JoBoBase) unmar.unmarshal(
            new InputSource(configFile.getPath()));
        log.info("configured from XML");
      } catch (Exception e) {
        log.error("could not read configuration from "
                  + configFile.getPath(), e);
      }
    } else {
      log.error("mapping and/or configfile not found");
    }

    // fall back to the built-in defaults if no configuration was read
    if (baseobj == null) {
      baseobj = new JoBoBase();
    }
    baseobj.configureRobot();
    return baseobj;
  }

} // JoBoBase