Package net.matuschek.jobo

Source Code of net.matuschek.jobo.JoBo

package net.matuschek.jobo;

/************************************************
    Copyright (c) 2001/2002 by Daniel Matuschek
*************************************************/

import java.io.File;
import java.net.URL;

import net.matuschek.getopt.GetOpt;
import net.matuschek.http.HttpDocToFile;
import net.matuschek.http.SystemOutHttpToolCallback;
import net.matuschek.spider.WebRobot;

import org.apache.log4j.Category;
import org.apache.log4j.PropertyConfigurator;
import org.apache.log4j.BasicConfigurator;

/**
* This is the JoBo command line interface.
*
* @author Daniel Matuschek
* @version $Id $
*/
public class JoBo {

  protected static Category log = Category.getInstance("");

  public static void printUsage() {
    final String usageInfo =
      "command line options: \n"+
      " [-r http://...]   set start referer (default \"-\")\n"+
      " [-d maxdepth]     set maximal search depth (default 0)\n"+
      " [-o]              allow walk to other hosts (default no)\n"+
      " [-s directory]    directory to store retrieved documents\n"+
      "                   (default \".\")\n"+
      " [-m minsize]      store only files larger then this size in bytes\n"+
      "                   (default 0)\n"+
      " [-a agentName]    set user agent name\n"+
      "                   (default \"JoBo\")\n"+
      " [-i]              ignore robots.txt\n"+
      " [-w seconds]      wait n seconds after retrieving a file to limit\n"+
      "                   load on the remote server (default 60)\n"+
      " [-v]              verbose mode, useful, if something is wrong\n"+
      "                   with the XML configuration\n"+
      " url               start URL";

    System.out.println(usageInfo+"\n\n");
  }


  /**
     initialize log4j logging subsystem
   **/
  public static void initializeLogging() {
    final String configfile = "logging.conf";

   
    File f=new File(configfile);
    if (f.exists()) {
      // read the logging properties from configuration file
      PropertyConfigurator.configure(configfile);
    } else {
      BasicConfigurator.configure();
    }
  }
 


  public static void main(String[] argv)
    throws Exception
  {
    String basedir=".";
    int minSize=0;

    initializeLogging();
  
    if (argv.length<1) {
      printUsage();
      return;
    }

    // get command line options
    GetOpt opt = new GetOpt(argv);
    String option = null;

    JoBoBase jobobase = JoBoBase.createFromXML();
    WebRobot robby = jobobase.getRobot();

    // referer
    option=opt.getOptionString("r");
    if (option != null) {
      robby.setStartReferer(option);
    }
   
    // maximal depth
    option=opt.getOptionString("d");
    if (option != null) {
  try {
    int maxDepth=Integer.parseInt(option);
    robby.setMaxDepth(maxDepth);
  } catch (NumberFormatException e) {
    System.out.println("Wrong number for maxDepth: "+option);
  }
    }
   
    // walk to other hosts ?
    if (opt.getOptionBoolean("o")) {
      robby.setWalkToOtherHosts(true);
    }
   
    // store directory
    option=opt.getOptionString("s");
    if (option != null) {
      basedir=option;
    }
   
    // minimal file size
    option=opt.getOptionString("m");
    if (option != null) {
      try {
  minSize=Integer.parseInt(option);
      } catch (NumberFormatException e) {}
    }
   
    // agent name
    option=opt.getOptionString("a");
    if (option != null) {
      robby.setAgentName(option);
    }
   
    // ignore robots.txt
    if (opt.getOptionBoolean("i")) {
      robby.setIgnoreRobotsTxt(true);
    }
   
    // wait time
    option=opt.getOptionString("w");
    if (option != null) {
      try {
  int waitTime=Integer.parseInt(option);
  robby.setSleepTime(waitTime*1000);
      } catch (NumberFormatException e) {}
    }
   
    // print usage
    if (opt.getOptionBoolean("?")) {
      printUsage();
      return;
    }
   
    URL u = new URL(argv[argv.length-1]);

    HttpDocToFile docStore=new HttpDocToFile(basedir);
    docStore.setMinFileSize(minSize);

    SystemOutHttpToolCallback statusInfo = new SystemOutHttpToolCallback();

    robby.setStartURL(u);
    robby.setDocManager(docStore);
    robby.setHttpToolCallback(statusInfo);
   
    robby.run();
   
  }
}
TOP

Related Classes of net.matuschek.jobo.JoBo

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.