Source Code of org.apache.nutch.tools.arc.ArcSegmentCreator

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.tools.arc;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.fetcher.FetcherOutputFormat;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;

/**
* <p>The <code>ArcSegmentCreator</code> is a replacement for the fetcher that
* takes arc files as input and produces a Nutch segment as output.</p>
*
* <p>Arc files are concatenations of individually gzipped records and are
* produced by both the Internet Archive project and the Grub distributed
* crawler project.</p>
*
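* <p>Usage: <code>ArcSegmentCreator &lt;arcFiles&gt; &lt;segmentsOutDir&gt;</code>,
* where <code>arcFiles</code> is a directory of arc files and
* <code>segmentsOutDir</code> is the directory under which a new,
* timestamp-named segment is written.</p>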
*/
public class ArcSegmentCreator
  extends Configured
  implements Tool, Mapper<Text, BytesWritable, Text, NutchWritable> {

  public static final Log LOG = LogFactory.getLog(ArcSegmentCreator.class);
  public static final String URL_VERSION = "arc.url.version";
  private JobConf jobConf;
  private URLFilters urlFilters;
  private ScoringFilters scfilters;
  private ParseUtil parseUtil;
  private URLNormalizers normalizers;
  private int interval;

  private static SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");

  public ArcSegmentCreator() {

  }

  /**
   * <p>Constructor that sets the job configuration.</p>
   *
   * @param conf The job configuration.
   */
  public ArcSegmentCreator(Configuration conf) {
    setConf(conf);
  }

  /**
   * Generates a timestamp-based name for the segment, formatted from the
   * current time as <code>yyyyMMddHHmmss</code>.
   *
   * @return The generated segment name.
   */
  public static synchronized String generateSegmentName() {
    try {
      // sleep for one full second so that successive calls fall into different
      // seconds and therefore produce distinct segment names
      Thread.sleep(1000);
    }
    catch (Throwable t) {
      // ignore an interrupted sleep; the timestamp below is still usable
    }
    return sdf.format(new Date(System.currentTimeMillis()));
  }

  /**
   * <p>Configures the job.  Sets the url filters, scoring filters, url normalizers
   * and other relevant data.</p>
   *
   * @param job The job configuration.
   */
  public void configure(JobConf job) {

    // set the url filters, scoring filters the parse util and the url
    // normalizers
    this.jobConf = job;
    this.urlFilters = new URLFilters(jobConf);
    this.scfilters = new ScoringFilters(jobConf);
    this.parseUtil = new ParseUtil(jobConf);
    this.normalizers = new URLNormalizers(jobConf, URLNormalizers.SCOPE_FETCHER);
    // default re-fetch interval: 2592000 seconds = 30 days
    interval = jobConf.getInt("db.fetch.interval.default", 2592000);
  }

  public void close() {
  }

  /**
   * <p>Parses the raw content of a single record to create output.  This method
   * is almost the same as the output method of {@link org.apache.nutch.fetcher.Fetcher}
   * in terms of processing and output.</p>
   *
   * @param output  The job output collector.
   * @param segmentName The name of the segment to create.
   * @param key The url of the record.
   * @param datum The CrawlDatum of the record.
   * @param content The raw content of the record.
   * @param pstatus The protocol status.
   * @param status The fetch status.
   *
   * @return The result of the parse in a ParseStatus object.
   */
  private ParseStatus output(OutputCollector<Text, NutchWritable> output, String segmentName,
    Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus,
    int status) {

    // set the fetch status and the fetch time
    datum.setStatus(status);
    datum.setFetchTime(System.currentTimeMillis());
    if (pstatus != null)
      datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

    ParseResult parseResult = null;
    if (content != null) {
      Metadata metadata = content.getMetadata();
      // add segment to metadata
      metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
      // add score to content metadata so that ParseSegment can pick it up.
      try {
        scfilters.passScoreBeforeParsing(key, datum, content);
      }
      catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          e.printStackTrace(LogUtil.getWarnStream(LOG));
          LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
        }
      }

      try {

        // parse the content
        parseResult = this.parseUtil.parse(content);
      }
      catch (Exception e) {
        LOG.warn("Error parsing: " + key + ": "
          + StringUtils.stringifyException(e));
      }

      // when parsing failed, set the content signature from an empty parse
      if (parseResult == null) {
        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
          content, new ParseStatus().getEmptyParse(getConf()));
        datum.setSignature(signature);
      }

      try {
        output.collect(key, new NutchWritable(datum));
        output.collect(key, new NutchWritable(content));

        if (parseResult != null) {
          for (Entry <Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();

            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }

            // Calculate page signature.
            byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
              content, parse);
            // Ensure segment name and score are in parseData metadata
            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
              segmentName);
            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
              StringUtil.toHexString(signature));
            // Pass fetch time to content meta
            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
              Long.toString(datum.getFetchTime()));
            if (url.equals(key))
              datum.setSignature(signature);
            try {
              scfilters.passScoreAfterParsing(url, content, parse);
            }
            catch (Exception e) {
              if (LOG.isWarnEnabled()) {
                e.printStackTrace(LogUtil.getWarnStream(LOG));
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }
            output.collect(url, new NutchWritable(new ParseImpl(new ParseText(
              parse.getText()), parse.getData(), parse.isCanonical())));
          }
        }
      }
      catch (IOException e) {
        if (LOG.isFatalEnabled()) {
          LOG.fatal("ArcSegmentCreator caught:" + StringUtils.stringifyException(e));
        }
      }

      // return the parse status if it exists
      if (parseResult != null && !parseResult.isEmpty()) {
        Parse p = parseResult.get(content.getUrl());
        if (p != null) {
          return p.getData().getStatus();
        }
      }
    }
   
    return null;
  }

  /**
   * <p>Logs any error that occurs during conversion.</p>
   *
   * @param url The url we are parsing.
   * @param t The error that occurred.
   */
  private void logError(Text url, Throwable t) {
    if (LOG.isInfoEnabled()) {
      LOG.info("Conversion of " + url + " failed with: " +
          StringUtils.stringifyException(t));
    }
  }

  /**
   * <p>Runs the Map job to translate an arc record into output for Nutch
   * segments.</p>
   *
   * @param key The arc record header.
   * @param bytes The arc record raw content bytes.
   * @param output The output collector.
   * @param reporter The progress reporter.
   */
  public void map(Text key, BytesWritable bytes,
    OutputCollector<Text, NutchWritable> output, Reporter reporter)
    throws IOException {

    // split the whitespace-separated arc record header passed in as the key;
    // pull out the url, the version, and the content type fields
    String[] headers = key.toString().split("\\s+");
    String urlStr = headers[0];
    String version = headers[2];
    String contentType = headers[3];
   
    // arc files start with a file description record; for now we ignore it as
    // it is not a content record
    if (urlStr.startsWith("filedesc://")) {
      LOG.info("Ignoring file header: " + urlStr);
      return;
    }
    LOG.info("Processing: " + urlStr);

    // get the raw bytes from the arc file and create a new CrawlDatum
    Text url = new Text();
    CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, interval,
      1.0f);
    String segmentName = getConf().get(Nutch.SEGMENT_NAME_KEY);

    // normalize and filter the urls
    try {
      urlStr = normalizers.normalize(urlStr, URLNormalizers.SCOPE_FETCHER);
      urlStr = urlFilters.filter(urlStr); // filter the url
    }
    catch (Exception e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Skipping " + url + ":" + e);
      }
      urlStr = null;
    }

    // if still a good url then process
    if (urlStr != null) {

      url.set(urlStr);
      try {

        // set the protocol status to success and the crawl status to success
        // create the content from the normalized url and the raw bytes from
        // the arc file.  TODO: currently this doesn't handle the text of error
        // pages (i.e. 404, etc.); we assume we won't get those.
        ProtocolStatus status = ProtocolStatus.STATUS_SUCCESS;
        Content content = new Content(urlStr, urlStr, bytes.get(), contentType,
          new Metadata(), getConf());
       
        // set the url version into the metadata
        content.getMetadata().set(URL_VERSION, version);
        ParseStatus pstatus = null;
        pstatus = output(output, segmentName, url, datum, content, status,
          CrawlDatum.STATUS_FETCH_SUCCESS);
        reporter.progress();
      }
      catch (Throwable t) { // unexpected exception
        logError(url, t);
        output(output, segmentName, url, datum, null, null,
          CrawlDatum.STATUS_FETCH_RETRY);
      }
    }
  }

  /**
   * <p>Creates the arc files to segments job.</p>
   *
   * @param arcFiles The path to the directory holding the arc files
   * @param segmentsOutDir The output directory for writing the segments
   *
   * @throws IOException If an IO error occurs while running the job.
   */
  public void createSegments(Path arcFiles, Path segmentsOutDir)
    throws IOException {

    if (LOG.isInfoEnabled()) {
      LOG.info("ArcSegmentCreator: starting");
      LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("ArcSegmentCreator " + arcFiles);
    String segName = generateSegmentName();
    job.set(Nutch.SEGMENT_NAME_KEY, segName);
    FileInputFormat.addInputPath(job, arcFiles);
    job.setInputFormat(ArcInputFormat.class);
    job.setMapperClass(ArcSegmentCreator.class);
    FileOutputFormat.setOutputPath(job, new Path(segmentsOutDir, segName));
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
      LOG.info("ArcSegmentCreator: done");
    }
  }

  public static void main(String args[])
    throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new ArcSegmentCreator(), args);
    System.exit(res);
  }

  public int run(String[] args)
    throws Exception {

    String usage = "Usage: ArcSegmentCreator <arcFiles> <segmentsOutDir>";

    if (args.length < 2) {
      System.err.println(usage);
      return -1;
    }

    // set the arc files directory and the segments output directory
    Path arcFiles = new Path(args[0]);
    Path segmentsOutDir = new Path(args[1]);

    try {
      // create the segments from the arc files
      createSegments(arcFiles, segmentsOutDir);
      return 0;
    }
    catch (Exception e) {
      LOG.fatal("ArcSegmentCreator: " + StringUtils.stringifyException(e));
      return -1;
    }
  }
}
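
For reference, a minimal programmatic usage sketch of the class above; the
input and output paths are illustrative and the wrapper class name is
hypothetical, not part of the Nutch source:

import org.apache.hadoop.fs.Path;
import org.apache.nutch.tools.arc.ArcSegmentCreator;
import org.apache.nutch.util.NutchConfiguration;

public class ArcSegmentCreatorUsage {
  public static void main(String[] args) throws Exception {
    // Read every arc file under "arcs" and write a new, timestamp-named
    // segment under "crawl/segments" (both paths are examples only).
    ArcSegmentCreator creator = new ArcSegmentCreator(NutchConfiguration.create());
    creator.createSegments(new Path("arcs"), new Path("crawl/segments"));
  }
}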