package net.matuschek.jobo;
/************************************************
Copyright (c) 2001/2002 by Daniel Matuschek
*************************************************/
import java.io.File;
import java.net.URL;
import net.matuschek.getopt.GetOpt;
import net.matuschek.http.HttpDocToFile;
import net.matuschek.http.SystemOutHttpToolCallback;
import net.matuschek.spider.WebRobot;
import org.apache.log4j.Category;
import org.apache.log4j.PropertyConfigurator;
import org.apache.log4j.BasicConfigurator;
/**
* This is the JoBo command line interface.
*
* @author Daniel Matuschek
* @version $Id $
*/
public class JoBo {

  /** Root log4j category (obtained with the empty name). */
  protected static Category log = Category.getInstance("");

  /**
   * Prints the list of supported command line options to standard output.
   */
  public static void printUsage() {
    final String usageInfo =
      "command line options: \n"+
      " [-r http://...] set start referer (default \"-\")\n"+
      " [-d maxdepth] set maximal search depth (default 0)\n"+
      " [-o] allow walk to other hosts (default no)\n"+
      " [-s directory] directory to store retrieved documents\n"+
      " (default \".\")\n"+
      " [-m minsize] store only files larger then this size in bytes\n"+
      " (default 0)\n"+
      " [-a agentName] set user agent name\n"+
      " (default \"JoBo\")\n"+
      " [-i] ignore robots.txt\n"+
      " [-w seconds] wait n seconds after retrieving a file to limit\n"+
      " load on the remote server (default 60)\n"+
      " [-v] verbose mode, useful, if something is wrong\n"+
      " with the XML configuration\n"+
      " url start URL";
    System.out.println(usageInfo+"\n\n");
  }

  /**
   * Initializes the log4j logging subsystem.
   *
   * If a file named "logging.conf" exists, the logging properties are
   * read from it; otherwise log4j's basic configuration is used.
   */
  public static void initializeLogging() {
    final String configfile = "logging.conf";
    File f=new File(configfile);
    if (f.exists()) {
      // read the logging properties from configuration file
      PropertyConfigurator.configure(configfile);
    } else {
      BasicConfigurator.configure();
    }
  }

  /**
   * Reports a command line value that could not be used as a number.
   * Uses the same wording for every numeric option.
   *
   * @param what  name of the setting the value was meant for
   * @param value the offending command line value
   */
  private static void reportWrongNumber(String what, String value) {
    System.out.println("Wrong number for "+what+": "+value);
  }

  /**
   * Command line entry point: configures a robot from the command line
   * options and runs it, starting at the URL given as last argument.
   *
   * Numeric options that cannot be parsed are reported on standard
   * output and ignored, so the previously configured value stays in
   * effect.
   *
   * @param argv command line arguments; the last one is used as start URL
   * @throws Exception if the robot setup fails or the last argument
   *         is not a valid URL
   */
  public static void main(String[] argv)
    throws Exception
  {
    String basedir=".";
    int minSize=0;

    initializeLogging();

    if (argv.length<1) {
      printUsage();
      return;
    }

    // get command line options
    GetOpt opt = new GetOpt(argv);
    String option = null;

    JoBoBase jobobase = JoBoBase.createFromXML();
    WebRobot robby = jobobase.getRobot();

    // referer
    option=opt.getOptionString("r");
    if (option != null) {
      robby.setStartReferer(option);
    }

    // maximal depth
    option=opt.getOptionString("d");
    if (option != null) {
      try {
        int maxDepth=Integer.parseInt(option);
        robby.setMaxDepth(maxDepth);
      } catch (NumberFormatException e) {
        reportWrongNumber("maxDepth", option);
      }
    }

    // walk to other hosts ?
    if (opt.getOptionBoolean("o")) {
      robby.setWalkToOtherHosts(true);
    }

    // store directory
    option=opt.getOptionString("s");
    if (option != null) {
      basedir=option;
    }

    // minimal file size
    option=opt.getOptionString("m");
    if (option != null) {
      try {
        minSize=Integer.parseInt(option);
      } catch (NumberFormatException e) {
        // keep the default, but tell the user the value was ignored
        reportWrongNumber("minSize", option);
      }
    }

    // agent name
    option=opt.getOptionString("a");
    if (option != null) {
      robby.setAgentName(option);
    }

    // ignore robots.txt
    if (opt.getOptionBoolean("i")) {
      robby.setIgnoreRobotsTxt(true);
    }

    // wait time (given in seconds, passed on multiplied by 1000)
    option=opt.getOptionString("w");
    if (option != null) {
      try {
        int waitTime=Integer.parseInt(option);
        // waitTime*1000 is an int multiplication; refuse values that
        // would silently overflow instead of passing on a garbage value
        if (waitTime > Integer.MAX_VALUE/1000
            || waitTime < Integer.MIN_VALUE/1000) {
          reportWrongNumber("waitTime", option);
        } else {
          robby.setSleepTime(waitTime*1000);
        }
      } catch (NumberFormatException e) {
        // keep the configured sleep time, but tell the user
        reportWrongNumber("waitTime", option);
      }
    }

    // print usage
    if (opt.getOptionBoolean("?")) {
      printUsage();
      return;
    }

    // the last command line argument is the start URL
    URL u = new URL(argv[argv.length-1]);

    HttpDocToFile docStore=new HttpDocToFile(basedir);
    docStore.setMinFileSize(minSize);

    SystemOutHttpToolCallback statusInfo = new SystemOutHttpToolCallback();

    robby.setStartURL(u);
    robby.setDocManager(docStore);
    robby.setHttpToolCallback(statusInfo);

    robby.run();
  }
}