
/* Nutchwax
 *
 * $Id: Nutchwax.java 1896 2007-08-01 21:44:31Z jlee-archive $
 *
 * Created on Feb 14, 2006
 *
 * Copyright (C) 2006 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.access.nutch;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.global.Global;
import org.apache.nutch.indexer.DeleteDuplicates;
import org.apache.nutch.indexer.IndexMerger;
import org.archive.access.nutch.jobs.ImportArcs;
import org.archive.access.nutch.jobs.NutchwaxCrawlDb;
import org.archive.access.nutch.jobs.NutchwaxIndexer;
import org.archive.access.nutch.jobs.NutchwaxLinkDb;
import org.archive.access.nutch.jobs.NutchwaxPagerank;
import org.archive.util.ArchiveUtils;



/**
 * Script to run all the NutchWAX indexing jobs, from import of ARCs through
 * merge of the final index.
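 *
 * <p>Example invocation (matching the usage strings printed by this class):
 * <pre>hadoop jar nutchwax.jar all input output collection</pre>
 * See usage() below for the full list of jobs.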
*/
public class Nutchwax
{
  public static final Log LOG =
    LogFactory.getLog(Nutchwax.class.getName());
   
  private static final String KEY_COLLECTION_PREFIX = "c=";
  private static final String KEY_COLLECTION_SUFFIX = ",u=";
  private static final Pattern COLLECTION =
    Pattern.compile("^\\s*c=([^,]+),u=(.*)\\s*", Pattern.DOTALL);
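  // Keys are of the form "c=<collection>,u=<url>", e.g.
  // "c=mycollection,u=http://example.org/" (collection name illustrative).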

  private final static List<String> JOBS = Arrays.asList(new String[] {
    "import", "update", "invert", "pagerank", "index", "dedup", "merge",
    "all", "class", "search", "multiple", "version"});
   

  // Lazily initialize these two variables so any complaint about hadoop
  // being absent -- if it is -- is delayed until command-line processing
  // is done.
  private FileSystem fs = null;
  private JobConf conf = null;
   
  /**
   * Default constructor.
   * @throws IOException
   */
  public Nutchwax() throws IOException
  {
    super();
  }
   
  public synchronized JobConf getJobConf()
  {
    if (this.conf == null) {
      this.conf = new JobConf(NutchwaxConfiguration.getConfiguration());
    }
    
    return this.conf;
  }
   
  public synchronized FileSystem getFS() throws IOException
  {
    if (this.fs == null) {
      this.fs = FileSystem.get(getJobConf());
    }
    
    return this.fs;
  }
   
  public class OutputDirectories
  {
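    // All job output lives under a single base directory:
    //   <output>/crawldb, <output>/linkdb, <output>/pagerank,
    //   <output>/segments, <output>/indexes, <output>/index,
    // plus a local temporary directory used by the index merge.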
    private final Path output;
    private final Path crawlDb;
    private final Path linkDb;
    private final Path pagerank;
    private final Path segments;
    private final Path indexes;
    private final Path index;
    private final Path tmpDir;

    public OutputDirectories(final Path output) throws IOException
    {
      this.output = output;
      this.crawlDb = new Path(output, "crawldb");
      this.linkDb = new Path(output, "linkdb");
      this.pagerank = new Path(output, "pagerank");
      this.segments = new Path(output, "segments");
      this.indexes = new Path(output, "indexes");
      this.index = new Path(output, "index");
      this.tmpDir = getJobConf().getLocalPath("mapred.temp.dir",
        Generator.generateSegmentName());
    }

    public Path getCrawlDb()
    {
      return crawlDb;
    }

    public Path getIndexes()
    {
      return indexes;
    }

    public Path getLinkDb()
    {
      return linkDb;
    }
   
    public Path getPagerank()
    {
      return pagerank;
    }

    public Path getSegments()
    {
      return segments;
    }

    public Path getTmpDir()
    {
      return tmpDir;
    }

    public Path getIndex()
    {
      return index;
    }

    public Path getOutput()
    {
      return output;
    }
  }

  /**
   * Run the full set of mapreduce indexing jobs, always in this order:
   * import, update, invert, pagerank, index, dedup, merge.
   *
   * @throws Exception
   */
  protected void doAll(final Path input, final String collectionName,
    final OutputDirectories od)
    throws Exception
  {
    doImport(input, collectionName, od);
    doUpdate(od);
    doInvert(od);
    doPagerank(od);
    doIndexing(od);
    doDedup(od);
    doMerge(od);
     
    LOG.info("Nutchwax finished.");
  }
   
  protected void doImport(final Path input, String collectionName,
    final OutputDirectories od)
    throws IOException
  {
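    // Segment names come from Generator.generateSegmentName(), typically a
    // timestamp such as "20060214123000"; "-<collectionName>" is appended
    // when a collection name is supplied.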
    Path segment = new Path(od.getSegments(),
      Generator.generateSegmentName() +
        ((collectionName == null || collectionName.length() <= 0)?
          "": "-" + collectionName));
       
    new ImportArcs(getJobConf()).importArcs(input, segment, collectionName);
  }
   
  protected void doUpdate(final OutputDirectories od)
    throws IOException
  {
    doUpdate(od, null);
  }
   
  protected void doUpdate(final OutputDirectories od,
    final String[] segments)
    throws IOException
  {
    LOG.info("updating crawldb " + od.getCrawlDb());
     
    // Need to make sure the db dir exists before progressing.
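    // (CrawlDb.CURRENT_NAME names the "current" subdirectory of the crawldb.)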
    Path dbPath = new Path(od.getCrawlDb(), CrawlDb.CURRENT_NAME);
       
    if (!getFS().exists(dbPath))
    {
      getFS().mkdirs(dbPath);
    }
     
    CrawlDb cdb = new NutchwaxCrawlDb(getJobConf());
     
    if (segments != null)
    {
      List<Path> paths = new ArrayList<Path>(segments.length);
       
      for (int i = 0; i < segments.length; i++)
      {
        Path p = new Path(segments[i]);
         
        if (!getFS().exists(p))
        {
          throw new FileNotFoundException(p.toString());
        }

        paths.add(p);
      }
       
      cdb.update(od.getCrawlDb(), paths.toArray(new Path[paths.size()]),
        true, true);
    }
    else
    {
      Path[] allSegments = getSegments(od);
       
      // This just does the last segment created.
      cdb.update(od.getCrawlDb(),
        new Path[] {allSegments[allSegments.length - 1]}, true, true);
    }
  }

  protected Path [] getSegments(final OutputDirectories od)
    throws IOException
  {
    Path[] allSegments = getFS().listPaths(od.getSegments());
       
    if (allSegments == null || allSegments.length <= 0)
    {
      throw new FileNotFoundException(od.getSegments().toString());
    }
     
    return allSegments;
  }
   
  protected void doInvert(final OutputDirectories od, final Path [] segments)
    throws IOException
  {
    createLinkdb(od);
     
    new NutchwaxLinkDb(getJobConf()).
      invert(od.getLinkDb(), segments, true, true, false);
  }
   
  protected void doInvert(final OutputDirectories od)
    throws IOException
  {
    LOG.info("inverting links in " + od.getSegments());

    new NutchwaxLinkDb(getJobConf()).
      invert(od.getLinkDb(), getSegments(od), true, true, false);
  }

  protected boolean createLinkdb(final OutputDirectories od)
    throws IOException
  {
    boolean result = false;

    // Make sure the linkdb exists.  Otherwise the install step, where the
    // temporary location is moved to the permanent one, fails.
    if (getFS().mkdirs(new Path(od.getLinkDb(),
      NutchwaxLinkDb.CURRENT_NAME)))
    {
      LOG.info("Created " + od.getLinkDb());

      result = true;
    }

    return result;
  }

  protected void doPagerank(final OutputDirectories od)
    throws IOException
  {
    LOG.info("computing pagerank scores in " + od.getPagerank());

    new NutchwaxPagerank(getJobConf()).process(getSegments(od),
      od.getPagerank());
  }
 
  protected void doIndexing(final OutputDirectories od)
    throws IOException
  {
    doIndexing(od, getFS().listPaths(od.getSegments()));
  }

  protected void doIndexing(final OutputDirectories od,
    final Path [] segments)
    throws IOException
  {
    LOG.info(" indexing " + segments);

    new NutchwaxIndexer(getJobConf()).index(od.getIndexes(), od.getPagerank(), od.getCrawlDb(), od.getLinkDb(), segments);
  }

  protected void doDedup(final OutputDirectories od) throws IOException
  {
    LOG.info("dedup " + od.getIndex());

    new DeleteDuplicates(getJobConf()).dedup(new Path[] {od.getIndexes()});
  }
   
  protected void doMerge(final OutputDirectories od) throws IOException
  {
    LOG.info("index merge " + od.getOutput() + " using tmpDir=" +
      od.getTmpDir());

    new IndexMerger(getJobConf()).merge(getFS().listPaths(od.getIndexes()),
      od.getIndex(), od.getTmpDir());
  }

  static String [] rewriteArgs(final String [] args, final int offset)
  {
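    // Drops the first 'offset' arguments, e.g.
    // rewriteArgs({"class", "Foo", "-v"}, 2) yields {"-v"}.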
    final String [] newArgs = new String[args.length - offset];

    for (int i = offset; i < args.length; i++)
    {
      newArgs[i - offset] = args[i];
    }

    return newArgs;
  }

  static Object doClassMain(final String [] args)
  {
    // Strip the leading nutchwax 'class' command and the class name itself.
    final String className = args[1];
    String [] newArgs = rewriteArgs(args, 2);

    // From http://www.javaworld.com/javaworld/javaqa/1999-06/01-outside.html
    Class [] argTypes = new Class[1];
    argTypes[0] = String[].class;
    Object result = null;

    try
    {
      Method mainMethod =
        Class.forName(className).getDeclaredMethod("main", argTypes);
      // main is static, so the receiver argument is ignored; pass null.
      result = mainMethod.invoke(null, new Object [] {newArgs});
    }
    catch (Throwable t)
    {
      t.printStackTrace();
    }

    return result;
  }
   
  protected Object doSearch(final String [] args)
  {
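    // Rewrites {"search", <query>...} into
    // {"search", "<NutchwaxBean class name>", <query>...}; doClassMain then
    // strips the first two arguments and invokes NutchwaxBean's main.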
    String [] newArgs = new String[args.length + 1];
    newArgs[0] = args[0];
    newArgs[1] = NutchwaxBean.class.getName();

    for (int i = 1; i < args.length; i++)
    {
      newArgs[i + 1] = args[i];
    }

    return doClassMain(newArgs);
  }

  protected void doMultiple(final String [] args) throws Exception
  {
    (new Multiple()).run(rewriteArgs(args, 1));
  }
   
  protected void doVersion(final String [] args) throws Exception
  {
    JobConf job = getJobConf();
    String collectionType = job.get(Global.COLLECTION_TYPE);
    System.out.println("Collection type: " + collectionType);
  }

  protected void doJob(final String jobName, final String [] args)
    throws Exception
  {
    if (jobName.equals("import"))
    {
      // Usage: hadoop jar nutchwax.jar import input output name
      if (args.length != 4)
      {
        ImportArcs.doImportUsage(
          "ERROR: Wrong number of arguments passed.", 2);
      }

      final Path input = new Path(args[1]);
      final Path output = new Path(args[2]);
      final String collectionName = args[3];

      checkArcsDir(input);
      OutputDirectories od = new OutputDirectories(output);
      doImport(input, collectionName, od);
    }
    else if (jobName.equals("update"))
    {
      // Usage: hadoop jar nutchwax.jar update output
      if (args.length < 2)
      {
        doUpdateUsage("ERROR: Wrong number of arguments passed.", 2);
      }

      OutputDirectories od = new OutputDirectories(new Path(args[1]));

      if (args.length == 2)
      {
        doUpdate(od);
      }
      else
      {
        for (int i = 2; i < args.length; i++)
        {
          doUpdate(od, new String [] {args[i]});
        }
      }
    }
    else if (jobName.equals("invert"))
    {
      // Usage: hadoop jar nutchwax.jar invert output
      if (args.length < 2)
      {
        doInvertUsage("ERROR: Wrong number of arguments passed.", 2);
      }

      OutputDirectories od = new OutputDirectories(new Path(args[1]));

      if (args.length == 2)
      {
        doInvert(od);
      }
      else
      {
        final int offset = 2;
        Path [] segments = new Path[args.length - offset];

        for (int i = offset; i < args.length; i++)
        {
          Path f = new Path(args[i]);

          if (! getFS().exists(f))
          {
            throw new FileNotFoundException(f.toString());
          }

          segments[i - offset] = f;
        }

        doInvert(od, segments);
      }
    }
    /* TODO MC */
    else if (jobName.equals("pagerank"))
    {
      // Usage: hadoop jar nutchwax.jar pagerank output
      if (args.length != 2)
      {
        doPagerankUsage("ERROR: Wrong number of arguments passed.", 2);
      }

      OutputDirectories od = new OutputDirectories(new Path(args[1]));
      doPagerank(od);
    }
    /* TODO MC */
    else if (jobName.equals("index"))
    {
      // Usage: hadoop jar nutchwax.jar index output
      if (args.length < 2)
      {
        doIndexUsage("ERROR: Wrong number of arguments passed.", 2);
      }

      OutputDirectories od = new OutputDirectories(new Path(args[1]));

      if (args.length == 2)
      {
        doIndexing(od);
      }
      else
      {
        final int offset = 2;
        Path [] segments = new Path[args.length - offset];

        for (int i = offset; i < args.length; i++)
        {
          Path f = new Path(args[i]);

          if (! getFS().exists(f))
          {
            throw new FileNotFoundException(f.toString());
          }

          segments[i - offset] = f;
        }

        doIndexing(od, segments);
      }
    }
    else if (jobName.equals("dedup"))
    {
      // Usage: hadoop jar nutchwax.jar dedup output
      if (args.length != 2)
      {
        doDedupUsage("Wrong number of arguments passed.", 2);
      }

      doDedup(new OutputDirectories(new Path(args[1])));
    }
    else if (jobName.equals("merge"))
    {
      // Usage: hadoop jar nutchwax.jar merge output
      if (args.length != 2)
      {
        doMergeUsage("ERROR: Wrong number of arguments passed.", 2);
      }

      doMerge(new OutputDirectories(new Path(args[1])));
    }
    else if (jobName.equals("all"))
    {
      // Usage: hadoop jar nutchwax.jar all input output name
      if (args.length != 4)
      {
        doAllUsage("ERROR: Wrong number of arguments passed.", 2);
      }

      final Path input = new Path(args[1]);
      final Path output = new Path(args[2]);
      final String collectionName = args[3];

      checkArcsDir(input);

      OutputDirectories od = new OutputDirectories(output);

      doAll(input, collectionName, od);
    }
    else if (jobName.equals("class"))
    {
      if (args.length < 2)
      {
        doClassUsage("ERROR: Wrong number of arguments passed.", 2);
      }

      doClassMain(args);
    }
    else if (jobName.equals("search"))
    {
      if (args.length < 2)
      {
        doSearchUsage("ERROR: Wrong number of arguments passed.", 2);
      }

      doSearch(args);
    }
    else if (jobName.equals("multiple"))
    {
      doMultiple(args);
    }
    else if (jobName.equals("version"))
    {
      doVersion(args);
    }   
    else
    {
      // usage() exits the VM itself.
      usage("ERROR: No handler for job name " + jobName, 4);
    }
  }

  /**
   * Check that the arcs dir exists and looks like it holds files that list
   * ARCs (rather than the ARCs themselves).
   *
   * @param arcsDir Directory to examine.
   * @throws IOException
   */
  protected void checkArcsDir(final Path arcsDir)
    throws IOException
  {
    if (! getFS().exists(arcsDir))
    {
      throw new IOException(arcsDir + " does not exist.");
    }

    if (! getFS().isDirectory(arcsDir))
    {
      throw new IOException(arcsDir + " is not a directory.");
    }

    final Path [] files = getFS().listPaths(arcsDir);

    for (int i = 0; i < files.length; i++)
    {
      if (! getFS().isFile(files[i]))
      {
        throw new IOException(files[i] + " is not a file.");
      }

      if (files[i].getName().toLowerCase().endsWith(".arc.gz"))
      {
        throw new IOException(files[i] + " is an ARC file (ARCSDIR " +
          "should contain text file listing ARCs rather than " +
          "actual ARCs).");
      }
    }
  }

  public static Text generateWaxKey(WritableComparable key,
    final String collection)
  {
    return generateWaxKey(key.toString(), collection);
  }
   
  public static Text generateWaxKey(final String keyStr,
    final String collection)
  {
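    // e.g. generateWaxKey("http://example.org/", "mycollection") yields
    // "c=mycollection,u=http://example.org/".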
    if (collection == null)
    {
      throw new NullPointerException("Collection is null for " + keyStr);
    }
   
    if (keyStr == null)
    {
      throw new NullPointerException("keyStr is null");
    }
   
    if (keyStr.startsWith(KEY_COLLECTION_PREFIX))
    {
      LOG.warn("Key already has collection prefix: " + keyStr
        + ". Skipping.");
       
      return new Text(keyStr);
    }

    return new Text(KEY_COLLECTION_PREFIX + collection.trim() +
      KEY_COLLECTION_SUFFIX + keyStr.trim());
  }

  public static String getCollectionFromWaxKey(final WritableComparable key)
    throws IOException
  {
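    // e.g. returns "mycollection" for key "c=mycollection,u=http://example.org/".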
    Matcher m = COLLECTION.matcher(key.toString());
   
    if (m == null || !m.matches())
    {
      throw new IOException("Key doesn't have collection " +
        "prefix <" + key.toString() + ">");
    }
   
    return m.group(1);
  }

  public static String getUrlFromWaxKey(final WritableComparable key)
    throws IOException
  {
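    // e.g. returns "http://example.org/" for key "c=mycollection,u=http://example.org/".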
    Matcher m = COLLECTION.matcher(key.toString());
   
    if (m == null || !m.matches())
    {
      throw new IOException("Key doesn't have collection " +
        "prefix: " + key);
    }
   
    return m.group(2);
  }
   
  public static long getDate(String d) throws IOException
  {
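    // d is expected to be an ARC-style timestamp that ArchiveUtils.getDate
    // understands, e.g. the 14-digit form "20060214123000" (an assumption
    // based on the ARC format; shorter variants may also be accepted).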
    long date = 0;
   
    try
    {
      date = ArchiveUtils.getDate(d).getTime();
    }
    catch (final java.text.ParseException e)
    {
      throw new IOException("Failed parse of date: " + d + ": " +
        e.getMessage());
    }
   
    // Date can be < 0 if pre-1970 (Seen in some old ARCs).
    return date >= 0? date: 0;
  }

  public static void usage(final String message, final int exitCode)
  {
    if (message != null && message.length() > 0)
    {
      System.out.println(message);
    }

    System.out.println("Usage: hadoop jar nutchwax.jar <job> [args]");
    System.out.println("Launch NutchWAX job(s) on a hadoop platform.");
    System.out.println("Type 'hadoop jar nutchwax.jar help <job>' for" +
      " help on a specific job.");
    System.out.println("Jobs (usually) must be run in the order " +
      "listed below.");
    System.out.println("Available jobs:");
    System.out.println(" import   Import ARCs.");
    System.out.println(" update   Update dbs with recent imports.");
    System.out.println(" invert   Invert links.");
    System.out.println(" pagerank Compute pagerank.")// TODO MC
    System.out.println(" index    Index segments.");
    System.out.println(" dedup    Deduplicate by URL or content MD5.");
    System.out.println(" merge    Merge segment indices into one.");
    System.out.println(" all      Runs all above jobs in order.");
    System.out.println(" class    Run the passed class's main.");
    System.out.println(" search   Run a query against index under " +
      "property 'searcher.dir'");   
    System.out.println(" multiple Run multiple concurrent tasks.");
    System.out.println(" version Indicates the software version.");
   
    System.exit(exitCode);
  }
   
  public static void doUpdateUsage(final String message,
    final int exitCode)
  {
    if (message != null && message.length() > 0)
    {
      System.out.println(message);
    }
   
    System.out.println("Usage: hadoop jar nutchwax.jar update <output> " +
      "[<segments>...]");
    System.out.println("Arguments:");
    System.out.println(" output    Directory to write crawldb under.");
    System.out.println("Options:");
    System.out.println(" segments  List of segments to update crawldb " +
      "with. If none supplied, updates");
    System.out.println("            using latest segment found.");

    System.exit(exitCode);
  }
   
  public static void doInvertUsage(final String message,
    final int exitCode)
  {
    if (message != null && message.length() > 0)
    {
      System.out.println(message);
    }
   
    System.out.println("Usage: hadoop jar nutchwax.jar invert <output> " +
      "[<segments>...]");
    System.out.println("Arguments:");
    System.out.println(" output    Directory to write linkdb under.");
    System.out.println("Options:");
    System.out.println(" segments  List of segments to update linkdb " +
      "with. If none supplied, all under");
    System.out.println("           '<output>/segments/' " +
      "are passed.");

    System.exit(exitCode);
  }
 
  /* TODO MC */
  public static void doPagerankUsage(final String message,
      final int exitCode)
  {
    if (message != null && message.length() > 0)
    {
      System.out.println(message);
    }

    System.out.println("Usage: hadoop jar nutchwax.jar pagerank <output> ");
    System.out.println("Arguments:");
    System.out.println(" output    Directory to write pagerank under.");   
    System.exit(exitCode);
  }
  /* TODO MC */ 
 
  public static void doIndexUsage(final String message,
    final int exitCode)
  {
    if (message != null && message.length() > 0)
    {
      System.out.println(message);
    }
   
    System.out.println("Usage: hadoop jar nutchwax.jar index <output> " +
      "[<segments>...]");
    System.out.println("Arguments:");
    System.out.println(" output    Directory to write indexes under.");
    System.out.println("Options:");
    System.out.println(" segments  List of segments to index. " +
      "If none supplied, all under");
    System.out.println("           '<output>/segments/' " +
      "are indexed.");

    System.exit(exitCode);
  }
   
  public static void doDedupUsage(final String message,
    final int exitCode)
  {
    if (message != null && message.length() > 0)
    {
      System.out.println(message);
    }
   
    System.out.println("Usage: hadoop jar nutchwax.jar dedup <output>");
    System.out.println("Arguments:");
    System.out.println(" output  Directory in which indices" +
      " to dedup reside.");

    System.exit(exitCode);
  }
   
  public static void doMergeUsage(final String message,
    final int exitCode)
  {
    if (message != null && message.length() > 0)
    {
      System.out.println(message);
    }

    System.out.println("Usage: hadoop jar nutchwax.jar merge <output>");
    System.out.println("Arguments:");
    System.out.println(" output  Directory in which indices" +
      " to merge reside.");

    System.exit(exitCode);
  }

  public static void doMultipleUsage(final String message,
    final int exitCode)
  {
    if (message != null && message.length() > 0)
    {
      System.out.println(message);
    }
   
    Multiple.usage();
   
    System.exit(exitCode);
  }
   
  public static void doSearchUsage(final String message,
    final int exitCode)
  {
    if (message != null && message.length() > 0)
    {
      System.out.println(message);
    }
   
    System.out.println("Usage: hadoop jar nutchwax.jar search <query>");
    System.out.println("Arguments:");
    System.out.println(" query  Query string to run against index under " +
      "property 'searcher.dir'");

    System.exit(exitCode);
  }
   
  public static void doAllUsage(final String message, final int exitCode)
  {
    if (message != null && message.length() > 0)
    {
      System.out.println(message);
    }

    System.out.println("Usage: hadoop jar nutchwax.jar import <input>" +
      " <output> <collection>");
    System.out.println("Arguments:");
    System.out.println(" input       Directory of files" +
      " listing ARC URLs to import");
    System.out.println(" output      Directory to import to. Inport is " +
      "written to a subdir named");
    System.out.println("             for current date plus collection " +
      "under '<output>/segments/'");
    System.out.println(" collection  Collection name. Added to" +
      " each resource.");
   
    System.exit(exitCode);
  }
   
  public static void doClassUsage(final String message, final int exitCode)
  {
    if (message != null && message.length() > 0)
    {
      System.out.println(message);
    }
   
    System.out.println("Usage: hadoop jar nutchwax.jar class CLASS ...");
    System.out.println("Arguments:");
    System.out.println(" CLASS    Name of class to run. Invokes main " +
      "passing command-line arguments.");
    System.out.println("          For example, use to run nutch " +
      "commands. Below is list of command");
    System.out.println("          name and implementing class. " +
      "Pass name of class only and emits usage.");
    System.out.println();
    System.out.println("          readdb      " +
      "org.apache.nutch.crawl.CrawlDbReader");
    System.out.println("          mergedb     " +
      "org.apache.nutch.crawl.CrawlDbMerger");
    System.out.println("          readlinkdb  " +
      "org.apache.nutch.crawl.LinkDbReader");
    System.out.println("          segread     " +
      "org.apache.nutch.segment.SegmentReader");
    System.out.println("          mergesegs   " +
      "org.apache.nutch.segment.SegmentMerger");
    System.out.println("          mergelinkdb " +
      "org.apache.nutch.crawl.LinkDbMerger");
    System.exit(exitCode);
  }

  static void doJobHelp(final String jobName)
  {
    if (! JOBS.contains(jobName))
    {
      usage("ERROR: Unknown job " + jobName, 1);
    }

    if (jobName.equals("import"))
    {
      ImportArcs.doImportUsage(null, 1);
    }
    else if (jobName.equals("update"))
    {
      doUpdateUsage(null, 1);
    }
    else if (jobName.equals("invert"))
    {
      doInvertUsage(null, 1);
    }
    /* TODO MC */   
    else if (jobName.equals("pagerank"))
    {
      doPagerankUsage(null, 1);
    }
    /* TODO MC */
    else if (jobName.equals("index"))
    {
      doIndexUsage(null, 1);
    }
    else if (jobName.equals("dedup"))
    {
      doDedupUsage(null, 1);
    }
    else if (jobName.equals("merge"))
    {
      doMergeUsage(null, 1);
    }
    else if (jobName.equals("all"))
    {
      doAllUsage(null, 1);
    }
    else if (jobName.equals("search"))
    {
      doSearchUsage(null, 1);
    }
    else if (jobName.equals("multiple"))
    {
      doMultipleUsage(null, 1);
    }
    else if (jobName.equals("class"))
    {
      doClassUsage(null, 1);
    }
    else
    {
      usage("ERROR: No help for job name " + jobName, 4);
    }
  }

  public static void main(String args[]) throws Exception
  {
    if (args.length < 1)
    {
      usage(null, 0);
      return;
    }

    if (args[0].toLowerCase().equals("help"))
    {
      if (args.length == 1)
      {
        usage("ERROR: Add command you need help on.", 0);
        return;
      }
     
      // doJobHelp exits via the usage methods; return here for clarity.
      doJobHelp(args[1].toLowerCase());
      return;
    }
       
    final String jobName = args[0].toLowerCase();
   
    if (! JOBS.contains(jobName))
    {
      usage("ERROR: Unknown <job> " + jobName, 1);
    }
       
    Nutchwax ia = new Nutchwax();
    ia.doJob(jobName, args);
  }
}