Source Code of org.apache.nutch.indexer.field.BasicFields$Flipper

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.indexer.field;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Random;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.document.DateTools;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.scoring.webgraph.LinkDatum;
import org.apache.nutch.scoring.webgraph.Node;
import org.apache.nutch.scoring.webgraph.WebGraph;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.URLUtil;

/**
* Creates the basic FieldWritable objects.  The basic fields are the main
 * fields used in indexing segments.  Many other field jobs rely on the urls
 * being present in the basic fields output to create their own fields for
 * indexing.
*
 * Basic fields are extracted from segments.  Only urls that were successfully
 * fetched and parsed will be converted.  This job also implements a portion of
 * redirect logic.  If a url has both a representative (redirect) url and an
 * original url, the two are compared by their link analysis scores and the
 * higher scoring url is used for display in the index.  This ensures that
 * content is indexed under the best, most popular url, which is most often
 * the one users expect.
*
* The BasicFields tool can accept one or more segments to convert to fields.
* If multiple segments have overlapping content, only the latest successfully
* fetched content will be converted.
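 *
 * A typical invocation might look like the following (illustrative paths;
 * see {@link #run(String[])} for the exact options):
 * <pre>
 * bin/nutch org.apache.nutch.indexer.field.BasicFields \
 *   -webgraphdb crawl/webgraphdb \
 *   -output crawl/fields/basic \
 *   -segment crawl/segments/20090101000000
 * </pre>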
*/
public class BasicFields
  extends Configured
  implements Tool {

  public static final Log LOG = LogFactory.getLog(BasicFields.class);

  /**
   * Runs the Extractor job. Extracts basic fields from segments.
   *
   * @param nodeDb The node database
   * @param segment A single segment to process.
   * @param outputDir The extractor output.
   *
   * @throws IOException If an error occurs while processing the segment.
   */
  private void runExtractor(Path nodeDb, Path segment, Path outputDir)
    throws IOException {

    LOG.info("BasicFields: starting extractor");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);

    LOG.info("BasicFields: extractor adding segment: " + segment);
    FileInputFormat.addInputPath(job, new Path(segment,
      CrawlDatum.FETCH_DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
    FileInputFormat.addInputPath(job, nodeDb);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Extractor.class);
    job.setReducerClass(Extractor.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FieldsWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
      LOG.info("BasicFields: finished extractor");
    }
  }

  /**
   * Runs the Flipper job. Flipper is the first of a two part job to implement
   * redirect logic.
   *
   * @param basicFields The basic fields temporary output.
   * @param nodeDb The node database.
   * @param outputDir The flipper output.
   *
   * @throws IOException If an error occurs while processing.
   */
  private void runFlipper(Path basicFields, Path nodeDb, Path outputDir)
    throws IOException {

    LOG.info("BasicFields: starting flipper");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);
    FileInputFormat.addInputPath(job, nodeDb);
    FileInputFormat.addInputPath(job, basicFields);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Flipper.class);
    job.setReducerClass(Flipper.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LinkDatum.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
      LOG.info("BasicFields: finished flipper");
    }
  }

  /**
   * Runs the Scorer job. Scorer is the second of a two part job to implement
   * redirect logic.
   *
   * @param basicFields The basic fields temporary output.
   * @param links The temporary output holding urls and any redirects.
   * @param outputDir The scorer output.
   *
   * @throws IOException If an error occurs while processing.
   */
  private void runScorer(Path basicFields, Path links, Path outputDir)
    throws IOException {

    LOG.info("BasicFields: starting scorer");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);
    FileInputFormat.addInputPath(job, links);
    FileInputFormat.addInputPath(job, basicFields);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Scorer.class);
    job.setReducerClass(Scorer.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FieldsWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
      LOG.info("BasicFields: finished scorer");
    }
  }

  /**
   * Runs the Merger job. Merger ensures that the most recent set of fields for
   * any given url is collected.
   *
   * @param basicFields The basic fields final output.
   * @param outputDir The merger output.
   *
   * @throws IOException If an error occurs while processing.
   */
  private void runMerger(Path[] basicFields, Path outputDir)
    throws IOException {

    LOG.info("BasicFields: starting merger");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);
    for (Path basic : basicFields) {
      FileInputFormat.addInputPath(job, basic);
    }
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setReducerClass(Merger.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FieldsWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
      LOG.info("BasicFields: finished merger");
    }
  }

  /**
   * Extracts basic fields from a single segment.
   */
  private static class Extractor
    extends Configured
    implements Mapper<Text, Writable, Text, ObjectWritable>,
    Reducer<Text, ObjectWritable, Text, FieldsWritable> {

    private int MAX_TITLE_LENGTH;
    private Configuration conf;

    /**
     * Default constructor.
     */
    public Extractor() {

    }

    /**
     * Configurable constructor.
     */
    public Extractor(Configuration conf) {
      setConf(conf);
    }

    /**
     * Configures the job.
     */
    public void configure(JobConf conf) {
      this.conf = conf;
      this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
    }

    public void close() {
    }

    /**
     * Wraps values in ObjectWritable.
     */
    public void map(Text key, Writable value,
      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
      throws IOException {
      ObjectWritable objWrite = new ObjectWritable();
      objWrite.set(value);
      output.collect(key, objWrite);
    }

    /**
     * Creates basic fields from a single segment.
     */
    public void reduce(Text key, Iterator<ObjectWritable> values,
      OutputCollector<Text, FieldsWritable> output, Reporter reporter)
      throws IOException {

      Node nodeDb = null;
      List<CrawlDatum> fetchDatums = new ArrayList<CrawlDatum>();
      ParseData parseData = null;
      ParseText parseText = null;
      List<FieldWritable> fieldsList = new ArrayList<FieldWritable>();

      // assign values, url must be successfully fetched and parsed
      while (values.hasNext()) {

        ObjectWritable objWrite = values.next();
        Object value = objWrite.get();
        if (value instanceof CrawlDatum) {
          CrawlDatum datum = (CrawlDatum)value;
          if (datum.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
            fetchDatums.add(datum);
          }
        }
        else if (value instanceof Node) {
          nodeDb = (Node)value;
        }
        else if (value instanceof ParseData
          && ((ParseData)value).getStatus().isSuccess()) {
          parseData = (ParseData)value;
        }
        else if (value instanceof ParseText) {
          parseText = (ParseText)value;
        }
      }

      // if not successfully fetched and parsed then stop processing
      int numDatums = fetchDatums.size();
      if (numDatums == 0 || nodeDb == null || parseText == null
        || parseData == null) {
        return;
      }

      // get the most recent fetch time; duplicates can occur inside a single
      // segment, usually due to redirects
      CrawlDatum fetchDatum = null;
      long mostRecent = 0L;
      for (CrawlDatum cur : fetchDatums) {
        long fetchTime = cur.getFetchTime();
        if (fetchDatum == null || fetchTime > mostRecent) {
          fetchDatum = cur;
          mostRecent = fetchTime;
        }
      }

      // get parse metadata
      Metadata metadata = parseData.getContentMeta();
      Parse parse = new ParseImpl(parseText, parseData);

      // handle redirect urls
      Text reprUrlText = (Text)fetchDatum.getMetaData().get(
        Nutch.WRITABLE_REPR_URL_KEY);
      String reprUrl = reprUrlText != null ? reprUrlText.toString() : null;
      String url = key.toString();
      String fieldUrl = (reprUrl != null) ? reprUrl : url;
      String host = URLUtil.getHost(fieldUrl);
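
      // note: the FieldWritable arguments used below appear to follow the
      // pattern (name, value, type, indexed, stored, tokenized), judging from
      // the comments on each field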

      // add segment, used to map from merged index back to segment files
      FieldWritable segField = new FieldWritable(Fields.SEGMENT,
        metadata.get(Nutch.SEGMENT_NAME_KEY), FieldType.CONTENT, false, true,
        false);
      fieldsList.add(segField);

      // add digest, used by dedup
      FieldWritable digestField = new FieldWritable(Fields.DIGEST,
        metadata.get(Nutch.SIGNATURE_KEY), FieldType.CONTENT, false, true,
        false);
      fieldsList.add(digestField);

      // url is both stored and indexed, so it's both searchable and returned
      fieldsList.add(new FieldWritable(Fields.URL, fieldUrl, FieldType.CONTENT,
        true, true, true));
      fieldsList.add(new FieldWritable(Fields.SEG_URL, url, FieldType.CONTENT,
        false, true, false));

      if (reprUrl != null) {
        // also store the original url, both stored and indexed
        fieldsList.add(new FieldWritable(Fields.ORIG_URL, url,
          FieldType.CONTENT, true, true, true));
      }

      if (host != null) {
        // add host as un-stored, indexed and tokenized
        FieldWritable hostField = new FieldWritable(Fields.HOST, host,
          FieldType.CONTENT, true, false, true);
        fieldsList.add(hostField);

        // add site as un-stored, indexed and un-tokenized
        FieldWritable siteField = new FieldWritable(Fields.SITE, host,
          FieldType.CONTENT, true, false, false);
        fieldsList.add(siteField);
      }

      // content is indexed, so that it's searchable, but not stored in index
      fieldsList.add(new FieldWritable(Fields.CONTENT, parse.getText(),
        FieldType.CONTENT, true, false, true));

      // title
      String title = parse.getData().getTitle();
      if (title.length() > MAX_TITLE_LENGTH) { // truncate title if needed
        title = title.substring(0, MAX_TITLE_LENGTH);
      }
      // add title indexed and stored so that it can be displayed
      fieldsList.add(new FieldWritable(Fields.TITLE, title, FieldType.CONTENT,
        true, true, true));

      // add cached content/summary display policy, if available
      String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
      if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
        fieldsList.add(new FieldWritable(Fields.CACHE, caching,
          FieldType.CONTENT, false, true, false));
      }

      // add timestamp when fetched, for deduplication
      fieldsList.add(new FieldWritable(Fields.TSTAMP, DateTools.timeToString(
        fetchDatum.getFetchTime(), DateTools.Resolution.MILLISECOND),
        FieldType.CONTENT, false, true, false));

      FieldsWritable fields = new FieldsWritable();
      fields.setFieldsList(fieldsList);
      output.collect(key, fields);
    }
  }

  /**
   * Runs the first part of redirect logic.  Breaks out fields if a page
   * contains a redirect.
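   *
   * For example (illustrative): if the fields under segment key
   * http://example.org/a carry a URL field of http://example.org/b (the
   * redirect target), the map phase re-emits the key under both urls.  The
   * reduce phase then joins each url against its NodeDb entry and emits a
   * LinkDatum holding that url and its inlink score, keyed by the original
   * segment key, for the Scorer to compare.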
   */
  public static class Flipper
    extends Configured
    implements Mapper<Text, Writable, Text, ObjectWritable>,
    Reducer<Text, ObjectWritable, Text, LinkDatum> {

    private JobConf conf;

    /**
     * Configures the job.
     */
    public void configure(JobConf conf) {
      this.conf = conf;
    }

    public void close() {
    }

    /**
     * Breaks out the collection of fields for the url and, if one exists,
     * its redirect url.
     */
    public void map(Text key, Writable value,
      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
      throws IOException {

      ObjectWritable objUrl = new ObjectWritable();
      objUrl.set(key);

      if (value instanceof FieldsWritable) {

        // collect the fields for the url
        FieldsWritable fields = (FieldsWritable)value;
        FieldWritable url = fields.getField(Fields.URL);
        FieldWritable orig = fields.getField(Fields.ORIG_URL);
        output.collect(new Text(url.getValue()), objUrl);

        // collect for the orig / redirect url if one exists
        if (orig != null) {
          output.collect(new Text(orig.getValue()), objUrl);
        }
      }
      else {
       
        // anything else passes through
        ObjectWritable objWrite = new ObjectWritable();
        objWrite.set(value);
        output.collect(key, objWrite);
      }
    }

    /**
     * Collects redirect and original links for a given url key.  This will be
     * used in the Scorer to handle redirects.
     */
    public void reduce(Text key, Iterator<ObjectWritable> values,
      OutputCollector<Text, LinkDatum> output, Reporter reporter)
      throws IOException {

      Node node = null;
      List<String> urls = new ArrayList<String>();

      while (values.hasNext()) {
        ObjectWritable objWrite = values.next();
        Object obj = objWrite.get();
        if (obj instanceof Node) {
          node = (Node)obj;
        }
        else if (obj instanceof Text) {
          urls.add(obj.toString());
        }
      }

      if (urls.size() > 0) {
        float score = (node != null) ? node.getInlinkScore() : 0.0f;
        for (String url : urls) {
          LinkDatum datum = new LinkDatum(key.toString());
          datum.setScore(score);
          output.collect(new Text(url), datum);
        }
      }
    }
  }

  /**
   * The Scorer job sets the boost field from the NodeDb score.
   *
   * It also runs the second part of redirect logic, determining the highest
   * scoring url for pages that contain redirects.
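   *
   * For example (illustrative): if http://example.org/a was fetched and
   * redirected to http://example.org/b, the URL field holds
   * http://example.org/b and ORIG_URL holds http://example.org/a; whichever
   * carries the higher link analysis score becomes the display url (and
   * output key) for the fields.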
   */
  public static class Scorer
    extends Configured
    implements Mapper<Text, Writable, Text, ObjectWritable>,
    Reducer<Text, ObjectWritable, Text, FieldsWritable> {

    private JobConf conf;

    /**
     * Configures the job.
     */
    public void configure(JobConf conf) {
      this.conf = conf;
    }

    public void close() {
    }

    /**
     * Wraps values in ObjectWritable.
     */
    public void map(Text key, Writable value,
      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
      throws IOException {

      ObjectWritable objWrite = new ObjectWritable();
      objWrite.set(value);
      output.collect(key, objWrite);
    }

    /**
     * Sets a document boost field from the NodeDb and determines the best
     * scoring url for pages that have redirects.  Uses the highest scoring url
     * as the display url in the index.
     */
    public void reduce(Text key, Iterator<ObjectWritable> values,
      OutputCollector<Text, FieldsWritable> output, Reporter reporter)
      throws IOException {

      FieldsWritable fields = null;
      List<LinkDatum> datums = new ArrayList<LinkDatum>();

      while (values.hasNext()) {
        ObjectWritable objWrite = values.next();
        Object obj = objWrite.get();
        if (obj instanceof FieldsWritable) {
          fields = (FieldsWritable)obj;
        }
        else if (obj instanceof LinkDatum) {
          datums.add((LinkDatum)obj);
        }
      }

      int numDatums = datums.size();
      if (fields != null && numDatums > 0) {

        // if no redirect for the page just assign the linkrank boost
        List<FieldWritable> fieldsList = fields.getFieldsList();
        if (numDatums == 1) {
          float linkRank = datums.get(0).getScore();
          fieldsList.add(new FieldWritable(Fields.BOOST, "linkrank",
            FieldType.BOOST, linkRank));
          output.collect(new Text(key), fields);
        }
        else {

          // get both the url and any redirect url stored
          FieldWritable url = fields.getField(Fields.URL);
          FieldWritable orig = fields.getField(Fields.ORIG_URL);
          float urlScore = 0.0f;
          float origScore = 0.0f;

          // get the scores for each
          for (LinkDatum datum : datums) {
            String curUrl = datum.getUrl();
            if (curUrl.equals(url.getValue())) {
              urlScore = datum.getScore();
            }
            else if (curUrl.equals(orig.getValue())) {
              origScore = datum.getScore();
            }
          }

          // if the highest scoring url is not the one currently displayed in
          // the index under the current basic fields, then switch it
          String urlKey = url.getValue();
          float linkRank = urlScore;
          if (origScore > urlScore) {
            url.setName(Fields.ORIG_URL);
            orig.setName(Fields.URL);

            // We also need to fix the host because we are changing urls
            String host = URLUtil.getHost(orig.getValue());
            if (host != null) {
              fieldsList.remove(fields.getField(Fields.SITE));
              fieldsList.remove(fields.getField(Fields.HOST));
              fieldsList.add(new FieldWritable(Fields.HOST, host,
                FieldType.CONTENT, true, false, true));
              fieldsList.add(new FieldWritable(Fields.SITE, host,
                FieldType.CONTENT, true, false, false));
            }

            linkRank = origScore;
            urlKey = orig.getValue();
          }

          // create the final document boost field
          fieldsList.add(new FieldWritable(Fields.BOOST, "linkrank",
            FieldType.BOOST, linkRank));
          output.collect(new Text(urlKey), fields);
        }
      }
    }
  }

  /**
   * Merges output of all segments fields collecting only the most recent set
   * of fields for any given url.
   */
  public static class Merger
    extends Configured
    implements Reducer<Text, FieldsWritable, Text, FieldsWritable> {

    private JobConf conf;

    /**
     * Configures the job.
     */
    public void configure(JobConf conf) {
      this.conf = conf;
    }

    public void close() {
    }

    /**
     * Collects the most recent set of fields for any url.
     */
    public void reduce(Text key, Iterator<FieldsWritable> values,
      OutputCollector<Text, FieldsWritable> output, Reporter reporter)
      throws IOException {

      List<FieldsWritable> fields = new ArrayList<FieldsWritable>();

      // collects the various sets of fields
      while (values.hasNext()) {
        fields.add((FieldsWritable)WritableUtils.clone(values.next(), conf));
      }

      // if there is only one set of fields for a given url, pass it through
      FieldsWritable outFields = null;
      int numFields = fields.size();
      if (numFields == 1) {
        outFields = fields.get(0);
      }
      else if (numFields > 1) {

        // more than one set of fields means the url has been fetched more
        // than once; collect only the most recent set of fields
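        // (TSTAMP was written with DateTools.timeToString at MILLISECOND
        // resolution, which yields a yyyyMMddHHmmssSSS digit string, so
        // parsing it as a long and comparing numerically should preserve
        // chronological order)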
        FieldsWritable mostRecent = null;
        long recentTime = 0L;
        for (int i = 0; i < numFields; i++) {
          FieldsWritable cur = fields.get(i);
          String tStampStr = cur.getField(Fields.TSTAMP).getValue();
          long timestamp = Long.parseLong(tStampStr);
          if (mostRecent == null || recentTime < timestamp) {
            recentTime = timestamp;
            mostRecent = cur;
          }
        }

        outFields = mostRecent;
      }

      output.collect(key, outFields);
    }
  }

  /**
   * Runs the BasicFields jobs for every segment and aggregates and filters
   * the output to create a final database of FieldWritable objects.
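   *
   * For each segment the pipeline runs the extractor, flipper, and scorer
   * jobs in order; a single merger pass over the per-segment outputs then
   * produces the final output.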
   *
   * @param nodeDb The node database.
   * @param segments The array of segments to process.
   * @param output The BasicFields output.
   *
   * @throws IOException If an error occurs while processing the segments.
   */
  public void createFields(Path nodeDb, Path[] segments, Path output)
    throws IOException {

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Path tempOutput = new Path(output.toString() + "-temp");
    fs.mkdirs(tempOutput);
    int numSegments = segments.length;
    Path[] basicFields = new Path[numSegments];

    // one pass per segment to extract and create the basic fields
    for (int i = 0; i < numSegments; i++) {

      Path segment = segments[i];
      Path segOutput = new Path(tempOutput, String.valueOf(i));
      Path tempBasic = new Path(tempOutput, "basic-"
        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
      Path tempFlip = new Path(tempOutput, "flip-"
        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

      runExtractor(nodeDb, segment, tempBasic);
      runFlipper(tempBasic, nodeDb, tempFlip);
      runScorer(tempBasic, tempFlip, segOutput);

      fs.delete(tempBasic, true);
      fs.delete(tempFlip, true);
      basicFields[i] = segOutput;
    }

    // merge all of the segments and delete any temporary output
    runMerger(basicFields, output);
    fs.delete(tempOutput, true);
  }

  public static void main(String[] args)
    throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new BasicFields(),
      args);
    System.exit(res);
  }

  /**
   * Runs the BasicFields tool.
   */
  public int run(String[] args)
    throws Exception {

    Options options = new Options();
    Option helpOpts = OptionBuilder.withArgName("help").withDescription(
      "show this help message").create("help");
    Option outputOpts = OptionBuilder.withArgName("output").hasArg().withDescription(
      "the output index directory").create("output");
    Option webGraphOpts = OptionBuilder.withArgName("webgraphdb").hasArg().withDescription(
      "the webgraphdb to use").create("webgraphdb");
    Option segOpts = OptionBuilder.withArgName("segment").hasArgs().withDescription(
      "the segment(s) to use").create("segment");
    options.addOption(helpOpts);
    options.addOption(webGraphOpts);
    options.addOption(segOpts);
    options.addOption(outputOpts);

    CommandLineParser parser = new GnuParser();
    try {

      CommandLine line = parser.parse(options, args);
      if (line.hasOption("help") || !line.hasOption("webgraphdb")
        || !line.hasOption("output") || !line.hasOption("segment")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("BasicFields", options);
        return -1;
      }

      // get the command line options and all of the segments
      String webGraphDb = line.getOptionValue("webgraphdb");
      String output = line.getOptionValue("output");
      String[] segments = line.getOptionValues("segment");
      Path[] segPaths = new Path[segments.length];
      for (int i = 0; i < segments.length; i++) {
        segPaths[i] = new Path(segments[i]);
      }

      createFields(new Path(webGraphDb, WebGraph.NODE_DIR), segPaths, new Path(
        output));
      return 0;
    }
    catch (Exception e) {
      LOG.fatal("BasicFields: " + StringUtils.stringifyException(e));
      return -2;
    }
  }
}