Package org.archive.access.nutch.jobs

Source Code of org.archive.access.nutch.jobs.ImportArcs (including the inner classes WaxFetcherOutputFormat and WaxParseOutputFormat)

/*
* $Id: ImportArcs.java 1521 2007-02-27 18:01:29Z stack-sf $
*
* Copyright (C) 2003 Internet Archive.
*
* This file is part of the archive-access tools project
* (http://sourceforge.net/projects/archive-access).
*
* The archive-access tools are free software; you can redistribute them and/or
* modify them under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or any
* later version.
*
* The archive-access tools are distributed in the hope that they will be
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
* Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License along with
* the archive-access tools; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/

package org.archive.access.nutch.jobs;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.NumberFormat;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.Header;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolBase;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.MapWritable;
import org.apache.nutch.fetcher.Fetcher;
import org.apache.nutch.fetcher.FetcherOutput;
import org.apache.nutch.fetcher.FetcherOutputFormat;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseOutputFormat;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.mime.MimeType;
import org.apache.nutch.util.mime.MimeTypeException;
import org.apache.nutch.util.mime.MimeTypes;
import org.archive.access.nutch.Nutchwax;
import org.archive.access.nutch.NutchwaxConfiguration;
import org.archive.access.nutch.jobs.sql.SqlSearcher;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.mapred.ARCMapRunner;
import org.archive.mapred.ARCRecordMapper;
import org.archive.mapred.ARCReporter;
import org.archive.util.Base32;
import org.archive.util.MimetypeUtils;
import org.archive.util.TextUtils;
import org.apache.nutch.global.Global;


/**
 * Ingests ARCs, writing each ARC record's parse as Nutch FetcherOutputFormat.
 * FOF has five outputs:
 * <ul><li>crawl_fetch holds a fat CrawlDatum of all vitals including metadata.
 * It's written below by our {@link WaxFetcherOutputFormat} (in Nutch by
 * {@link FetcherOutputFormat}).  Here is an example CD: <pre>  Version: 4
 *  Status: 5 (fetch_success)
 *  Fetch time: Wed Mar 15 12:38:49 PST 2006
 *  Modified time: Wed Dec 31 16:00:00 PST 1969
 *  Retries since fetch: 0
 *  Retry interval: 0.0 days
 *  Score: 1.0
 *  Signature: null
 *  Metadata: collection:test arcname:IAH-20060315203614-00000-debord arcoffset:5127
 * </pre></li>
 * <li>crawl_parse has CrawlDatums carrying MD5 signatures.  Used when making
 * the CrawlDB.  It's obtained from the fat crawl_fetch CrawlDatum above and
 * written out as part of the parse output done by {@link WaxParseOutputFormat}.
 * That class writes three outputs: this crawl_parse plus the parse_text and
 * parse_data described next.</li>
 * <li>parse_text has the text from the parse.</li>
 * <li>parse_data has other metadata found by the parse (depends on the
 * parser).  It is the only input to the linkdb.  The html parser adds the
 * outlinks it found here, as well as the content-type and the discovered
 * and advertised encodings, etc.</li>
 * <li>cdx has a summary line for every record processed.</li>
 * </ul>
 */
public class ImportArcs extends ToolBase implements ARCRecordMapper
{
  public final Log LOG = LogFactory.getLog(ImportArcs.class);
  private final NumberFormat numberFormatter = NumberFormat.getInstance();

  private static final String WHITESPACE = "\\s+";

  public static final String ARCFILENAME_KEY = "arcname";
  public static final String ARCFILEOFFSET_KEY = "arcoffset";
  private static final String CONTENT_TYPE_KEY = "content-type";
  private static final String TEXT_TYPE = "text/";
  private static final String APPLICATION_TYPE = "application/";
  public static final String ARCCOLLECTION_KEY = "collection";
  public static final String WAX_SUFFIX = "wax.";
  public static final String WAX_COLLECTION_KEY = WAX_SUFFIX + ARCCOLLECTION_KEY;

  private static final String PDF_TYPE = "application/pdf";
   
  private boolean indexAll;
  private int contentLimit;
  private int pdfContentLimit;
  private MimeTypes mimeTypes;
  private String segmentName;
  private String collectionName;
  private int parseThreshold = -1;
  private boolean indexRedirects;
  private boolean sha1 = false;
  private boolean arcNameFromFirstRecord = true ;
  private String arcName; 
  private String collectionType;
  private int timeoutIndexingDocument;
 

  /**
   * Usually the URL in the first record looks like this:
   * filedesc://IAH-20060315203614-00000-debord.arc.  But in old
   * ARCs, it can look like this: filedesc://19961022/IA-000001.arc.
   */
  private static final Pattern FILEDESC_PATTERN =
    Pattern.compile("^(?:filedesc://)(?:[0-9]+\\/)?(.+)(?:\\.arc)$");

  private static final Pattern TAIL_PATTERN =
    Pattern.compile("(?:.*(?:/|\\\\))?(.+)(?:\\.arc|\\.arc\\.gz)$");

  /**
   * Buffer to reuse on each ARCRecord indexing.
   */
  private final byte[] buffer = new byte[1024 * 16];

  private final ByteArrayOutputStream contentBuffer =
    new ByteArrayOutputStream(1024 * 16);

  private URLNormalizers urlNormalizers;
  private URLFilters filters;

  private ParseUtil parseUtil;

  private static final Text CDXKEY = new Text("cdx");
   
  // One pool with only one thread per map; it does not need to be static.
  private TimeoutParsingThreadPool threadPool = new TimeoutParsingThreadPool();
 
 

  public ImportArcs()
  {
    super();
  }

  public ImportArcs(Configuration conf)
  {
    setConf(conf);
  }

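  /**
   * Configures and runs the Hadoop import job: takes a directory of files
   * listing ARC URLs as input, maps each ARC with this class (run via
   * ARCMapRunner), and writes FetcherOutput to the segment using
   * {@link WaxFetcherOutputFormat}.
   * @param arcUrlsDir Directory of files listing ARC URLs to import.
   * @param segment Segment directory to write to.
   * @param collection Collection name to pass to the tasks (if non-empty).
   */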
  public void importArcs(final Path arcUrlsDir, final Path segment,
    final String collection)
    throws IOException
  {
    LOG.info("ImportArcs segment: " + segment + ", src: " + arcUrlsDir);

    final JobConf job = new JobConf(getConf(), this.getClass());

    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    job.setInputPath(arcUrlsDir);

    //job.setMapRunnerClass(job.getClass("wax.import.maprunner", ARCMapRunner.class));
    //job.setMapperClass(job.getClass("wax.import.mapper", this.getClass()));
    job.setMapRunnerClass(ARCMapRunner.class); // compatible with hadoop 0.14 TODO MC
    job.setMapperClass(this.getClass());

    job.setInputFormat(TextInputFormat.class);

    job.setOutputPath(segment);
    job.setOutputFormat(WaxFetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FetcherOutput.class);
   
    // Pass the collection name out to the tasks IF non-null.
    if ((collection != null) && (collection.length() > 0))
    {
      job.set(ImportArcs.WAX_SUFFIX + ImportArcs.ARCCOLLECTION_KEY,
        collection);
    }   
    job.setJobName("import " + arcUrlsDir + " " + segment);

    JobClient.runJob(job);
    LOG.info("ImportArcs: done");
  }

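  /**
   * Reads per-job settings: wax.index.all, http.content.limit (and the
   * wax.pdf.size.multiplicand used for the pdf limit), mime.types.file, the
   * segment name, wax.parse.rate.threshold, wax.index.redirects,
   * wax.digest.sha1, URL normalizers and filters, the collection name and
   * type, and the per-document indexing timeout.
   */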
  public void configure(final JobConf job)
  {
    setConf(job);
    this.indexAll = job.getBoolean("wax.index.all", false);

    this.contentLimit = job.getInt("http.content.limit", 1024 * 100);
    final int pdfMultiplicand = job.getInt("wax.pdf.size.multiplicand", 10);
    this.pdfContentLimit = (this.contentLimit == -1) ? this.contentLimit
      : pdfMultiplicand * this.contentLimit;
    this.mimeTypes = MimeTypes.get(job.get("mime.types.file"));
    this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY);

    // Get the rsync protocol handler into the mix.
    System.setProperty("java.protocol.handler.pkgs", "org.archive.net");

    // Format numbers output by parse rate logging.
    this.numberFormatter.setMaximumFractionDigits(2);
    this.numberFormatter.setMinimumFractionDigits(2);
    this.parseThreshold = job.getInt("wax.parse.rate.threshold", -1);

    this.indexRedirects = job.getBoolean("wax.index.redirects", false);

    this.sha1 = job.getBoolean("wax.digest.sha1", false);

    this.urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_FETCHER);
    this.filters = new URLFilters(job);

    this.parseUtil = new ParseUtil(job);

    this.collectionName = job.get(ImportArcs.WAX_SUFFIX + ImportArcs.ARCCOLLECTION_KEY);

    // Get ARCName by reading first record in ARC?  Otherwise, we parse
    // the name of the file we've been passed to find an ARC name.
    this.arcNameFromFirstRecord = job.getBoolean("wax.arcname.from.first.record", true);
   
    this.collectionType = job.get(Global.COLLECTION_TYPE);
    this.timeoutIndexingDocument = job.getInt(Global.TIMEOUT_INDEXING_DOCUMENT, -1);  
   
    LOG.info("ImportArcs collectionType: " + collectionType);
  }

  public Configuration getConf()
  {
    return this.conf;
  }

  public void setConf(Configuration c)
  {
    this.conf = c;
  }

  public void onARCOpen()
  {
    // Nothing to do.
  }

  public void onARCClose()
  {
    threadPool.closeAll(); // close the only thread created for this map
  }

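  /**
   * Maps one ARC record: skips records that are not 2xx (or, when
   * wax.index.redirects is set, 3xx), normalizes and filters the URL,
   * resolves the mimetype (record metadata, then HTTP headers, then content
   * magic), reads up to the content limit, digests the content, builds a
   * CrawlDatum carrying collection/arcname/arcoffset metadata, parses via the
   * timeout thread pool, and collects a FetcherOutput keyed by the wax key
   * (collection plus URL).
   */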
  public void map(final WritableComparable key, final Writable value,
    final OutputCollector output, final Reporter r)
    throws IOException
  {
    // Assumption is that this map is being run by ARCMapRunner.
    // Otherwise, the below casts fail.
    String url = key.toString();
       
    ARCRecord rec = (ARCRecord)((ObjectWritable)value).get();
    ARCReporter reporter = (ARCReporter)r;      

    // The arc name is null the first time map is called on an ARC.
    checkArcName(rec);  
    if (! isIndex(rec))
    {
      return;
    }
    checkCollectionName();
   
    final ARCRecordMetaData arcData = rec.getMetaData();
    String oldUrl = url;
   
    try
    {
      url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_FETCHER);
      url = filters.filter(url); // filter the url
    }
    catch (Exception e)
    {
      LOG.warn("Skipping record. Didn't pass normalization/filter " +
        oldUrl + ": " + e.toString());

      return;
    }

    final long b = arcData.getContentBegin();
    final long l = arcData.getLength();
    final long recordLength = (l > b)? (l - b): l;

    // Look at the ARCRecord metadata line mimetype.  It can be empty.  If so,
    // we get two more chances at figuring it: by looking at the HTTP headers
    // or at the first couple of bytes of the file.  See below.
    String mimetype =
      getMimetype(arcData.getMimetype(), this.mimeTypes, url);
   
    if (skip(mimetype))
    {
      return;
    }

    // Copy http headers to nutch metadata.
    final Metadata metaData = new Metadata();
    final Header[] headers = rec.getHttpHeaders();
    for (int j = 0; j < headers.length; j++)
    {
      final Header header = headers[j];
     
      if (mimetype == null)
      {
        // Special handling.  If mimetype is still null, try getting it
        // from the http header.  I've seen arc record lines with an empty
        // content-type and a file ending MIME can't parse; e.g. .MID.
        if ((header.getName() != null) &&
          header.getName().toLowerCase().equals(ImportArcs.CONTENT_TYPE_KEY))
        {
          mimetype = getMimetype(header.getValue(), null, null);
         
          if (skip(mimetype))
          {
            return;
          }
        }
      }
     
      metaData.set(header.getName(), header.getValue());
    }

    // This call to reporter setStatus pings the tasktracker telling it our
    // status and telling the task tracker we're still alive (so it doesn't
    // time us out).
    final String noSpacesMimetype =
      TextUtils.replaceAll(ImportArcs.WHITESPACE,
      ((mimetype == null || mimetype.length() <= 0)?
      "TODO": mimetype),
      "-");
    final String recordLengthAsStr = Long.toString(recordLength);
   
    reporter.setStatus(getStatus(url, oldUrl, recordLengthAsStr, noSpacesMimetype));

    // This is a nutch 'more' field.
    metaData.set("contentLength", recordLengthAsStr);

    rec.skipHttpHeader();
    reporter.setStatusIfElapse("read headers on " + url);

    // TODO: Skip if unindexable type.
    int total = 0;
   
    // Read in first block. If mimetype still null, look for MAGIC.
    int len = rec.read(this.buffer, 0, this.buffer.length);
   
    if (mimetype == null)
    {
      MimeType mt = this.mimeTypes.getMimeType(this.buffer);
     
      if (mt == null || mt.getName() == null)
      {
        LOG.warn("Failed to get mimetype for: " + url);
       
        return;
      }
     
      mimetype = mt.getName();
    }
   
    metaData.set(ImportArcs.CONTENT_TYPE_KEY, mimetype);

    // How much do we read total? If pdf, we will read more. If equal to -1,
    // read all.
    int readLimit = (ImportArcs.PDF_TYPE.equals(mimetype))?
      this.pdfContentLimit : this.contentLimit;
   
    // Reset our contentBuffer so we can reuse it.  Over the life of an ARC,
    // it will grow to the maximum record size processed.
    this.contentBuffer.reset();
    while ((len != -1) && ((readLimit == -1) || (total < readLimit)))
    {
      total += len;
      this.contentBuffer.write(this.buffer, 0, len);
      len = rec.read(this.buffer, 0, this.buffer.length);
      reporter.setStatusIfElapse("reading " + url);
    }

    // Close the Record.  We're done with it.  Side-effect is calculation
    // of digest -- if we're digesting.
    rec.close();
    reporter.setStatusIfElapse("closed " + url);

    final byte[] contentBytes = this.contentBuffer.toByteArray();
    final CrawlDatum datum = new CrawlDatum();
    datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);

    // Calculate digest or use precalculated sha1.
    String digest = (this.sha1)? rec.getDigestStr():
      MD5Hash.digest(contentBytes).toString();
    metaData.set(Nutch.SIGNATURE_KEY, digest);
   
    // Set the digest back into the arcData so it's available later when we
    // write the CDX line.
    arcData.setDigest(digest);

    metaData.set(Nutch.SEGMENT_NAME_KEY, this.segmentName);
   
    // Score at this stage is 1.0f.
    metaData.set(Nutch.SCORE_KEY, Float.toString(datum.getScore()));

    final long startTime = System.currentTimeMillis();
    final Content content = new Content(url, url, contentBytes, mimetype,
      metaData, getConf());
    datum.setFetchTime(Nutchwax.getDate(arcData.getDate()));

    MapWritable mw = datum.getMetaData();
   
    if (mw == null)
    {
      mw = new MapWritable();
    }
           
    if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {
      mw.put(new Text(ImportArcs.ARCCOLLECTION_KEY), new Text(
        SqlSearcher.getCollectionNameWithTimestamp(collectionName, arcData.getDate())));
    }
    else {
      mw.put(new Text(ImportArcs.ARCCOLLECTION_KEY), new Text(collectionName));
    }
    mw.put(new Text(ImportArcs.ARCFILENAME_KEY), new Text(arcName));
    mw.put(new Text(ImportArcs.ARCFILEOFFSET_KEY),
      new Text(Long.toString(arcData.getOffset())));
    datum.setMetaData(mw);
         
    TimeoutParsingThread tout = threadPool.getThread(
      Thread.currentThread().getId(), timeoutIndexingDocument);
    tout.setUrl(url);
    tout.setContent(content);
    tout.setParseUtil(parseUtil);
    tout.wakeupAndWait();

    ParseStatus parseStatus = tout.getParseStatus();
    Parse parse = tout.getParse();
    reporter.setStatusIfElapse("parsed " + url);
    
    if (!parseStatus.isSuccess()) {
      final String status = formatToOneLine(parseStatus.toString());
      LOG.warn("Error parsing: " + mimetype + " " + url + ": " + status);
      parse = null;
    }
    else {
      // Was it a slow parse?
      final double kbPerSecond = getParseRate(startTime,
        (contentBytes != null) ? contentBytes.length : 0);
     
      if (LOG.isDebugEnabled())
      {
        LOG.debug(getParseRateLogMessage(url,
          noSpacesMimetype, kbPerSecond));
      }
      else if (kbPerSecond < this.parseThreshold)
      {
        LOG.warn(getParseRateLogMessage(url, noSpacesMimetype,
          kbPerSecond));
      }
    }

    Writable v = new FetcherOutput(datum, null,
      parse != null ? new ParseImpl(parse) : null);      
    if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {
      LOG.info("multiple: " + SqlSearcher.getCollectionNameWithTimestamp(
        this.collectionName, arcData.getDate()) + " " + url);
      output.collect(Nutchwax.generateWaxKey(url,
        SqlSearcher.getCollectionNameWithTimestamp(this.collectionName, arcData.getDate())), v);
    }
    else {
      output.collect(Nutchwax.generateWaxKey(url, this.collectionName), v);
    }
  }

  public void setCollectionName(String collectionName)
  {
    this.collectionName = collectionName;
    checkCollectionName();
  }

  public String getArcName()
  {
    return this.arcName;
  }

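  /**
   * Derives the ARC name from the record's underlying ARC file name,
   * stripping any ".arc.gz" suffix.
   */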
  public void checkArcName(ARCRecord rec)
  {
    this.arcName = rec.getMetaData().getArcFile().getName();
    this.arcName = this.arcName.replace(".arc.gz", "");
  }

  protected boolean checkCollectionName()
  {
    if ((this.collectionName != null) && this.collectionName.length() > 0)
    {
      return true;
    }

    throw new NullPointerException("Collection name can't be empty");
  }

  /**
   * @param rec ARC Record to test.
   * @return True if we are to index this record.
   */
  protected boolean isIndex(final ARCRecord rec)
  {
    return ((rec.getStatusCode() >= 200) && (rec.getStatusCode() < 300))
      || (this.indexRedirects && ((rec.getStatusCode() >= 300) &&
      (rec.getStatusCode() < 400)));
  }

  protected String getStatus(final String url, String oldUrl,
    final String recordLengthAsStr, final String noSpacesMimetype)
  {
    // If oldUrl is the same as url, don't log it.  Otherwise, log the
    // original so we keep the url as originally imported.
    if (oldUrl.equals(url))
    {
      oldUrl = "-";
    }
   
    StringBuilder sb = new StringBuilder(128);
    sb.append("adding ");
    sb.append(url);
    sb.append(" ");
    sb.append(oldUrl);
    sb.append(" ");
    sb.append(recordLengthAsStr);
    sb.append(" ");
    sb.append(noSpacesMimetype);
   
    return sb.toString();
  }

  protected String formatToOneLine(final String s)
  {
    final StringBuffer sb = new StringBuffer(s.length());
   
    for (final StringTokenizer st = new StringTokenizer(s, "\t\n\r");
      st.hasMoreTokens(); sb.append(st.nextToken()))
    {
      ;
    }
   
    return sb.toString();
  }


  protected String getParseRateLogMessage(final String url,
    final String mimetype, final double kbPerSecond)
  {
    return url + " " + mimetype + " parse KB/Sec "
      + this.numberFormatter.format(kbPerSecond);
  }

  protected double getParseRate(final long startTime, final long len)
  {
    // Get indexing rate:
    long elapsedTime = System.currentTimeMillis() - startTime;
    elapsedTime = (elapsedTime == 0) ? 1 : elapsedTime;
   
    return (len != 0) ? ((double) len / 1024)
      / ((double) elapsedTime / 1000) : 0;
  }

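  /**
   * @param mimetype Mimetype to consider.
   * @return True if the record should be skipped; unless wax.index.all is
   * set, anything that is not text/* or application/* is skipped.
   */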
  protected boolean skip(final String mimetype)
  {
    boolean decision = false;
   
    // Are we to index all content?
    if (!this.indexAll)
    {
      if ((mimetype == null)
        || (!mimetype.startsWith(ImportArcs.TEXT_TYPE) && !mimetype
        .startsWith(ImportArcs.APPLICATION_TYPE)))
      {
        // Skip any but basic types.
        decision = true;
      }
    }
   
    return decision;
  }

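  /**
   * Resolves a mimetype: first from the passed string (lowercased and
   * sanity-checked), otherwise by looking up the URL in the MimeTypes
   * registry; returns null if neither yields a usable type.
   */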
  protected String getMimetype(final String mimetype, final MimeTypes mts,
    final String url)
  {
    if (mimetype != null && mimetype.length() > 0)
    {
      return checkMimetype(mimetype.toLowerCase());
    }
   
    if (mts != null && url != null)
    {
      final MimeType mt = mts.getMimeType(url);
     
      if (mt != null)
      {
        return checkMimetype(mt.getName().toLowerCase());
      }
    }
   
    return null;
  }

  protected static String checkMimetype(String mimetype)
  {
    if ((mimetype == null) || (mimetype.length() <= 0) ||
      mimetype.startsWith(MimetypeUtils.NO_TYPE_MIMETYPE))
    {
      return null;
    }

    // Test the mimetype makes sense. If not, clear it.
    try
    {
      new MimeType(mimetype);
    }
    catch (final MimeTypeException e)
    {
      mimetype = null;
    }
   
    return mimetype;
  }

  /**
   * Override of nutch FetcherOutputFormat so I can substitute my own
   * ParseOutputFormat, {@link WaxParseOutputFormat}.  While I'm here,
   * removed content references.  NutchWAX doesn't save content.
   * @author stack
   */
  public static class WaxFetcherOutputFormat extends FetcherOutputFormat
  {
    public RecordWriter getRecordWriter(final FileSystem fs,
      final JobConf job, final String name, Progressable progress)
      throws IOException
    {
      Path f = new Path(job.getOutputPath(), CrawlDatum.FETCH_DIR_NAME);
      final Path fetch = new Path(f, name);
      final MapFile.Writer fetchOut = new MapFile.Writer(job, fs,
        fetch.toString(), Text.class, CrawlDatum.class);

      // Write a cdx file.  Write w/o compression.
      Path cdx = new Path(new Path(job.getOutputPath(), "cdx"), name);
      final SequenceFile.Writer cdxOut = SequenceFile.createWriter(fs,
        job, cdx, Text.class, Text.class,
        SequenceFile.CompressionType.NONE);

      return new RecordWriter()
      {
        private RecordWriter parseOut;
                         
        // Initialization
        {
          if (Fetcher.isParsing(job))
          {
            // Here is the nutchwax change: using WaxParseOutputFormat
            // instead of ParseOutputFormat.
            this.parseOut = new WaxParseOutputFormat().
              getRecordWriter(fs, job, name, null);
          }
        }

        public void write(WritableComparable key, Writable value)
          throws IOException
        {                
          FetcherOutput fo = (FetcherOutput)value;
          MapWritable mw = fo.getCrawlDatum().getMetaData();
          Text cdxLine = (Text)mw.get(ImportArcs.CDXKEY);
         
          if (cdxLine != null)
          {
            cdxOut.append(key, cdxLine);
          }
         
          mw.remove(ImportArcs.CDXKEY);
          fetchOut.append(key, fo.getCrawlDatum());
         
          if (fo.getParse() != null)
          {
            parseOut.write(key, fo.getParse());        
          }
        }

        public void close(Reporter reporter) throws IOException
        {
          fetchOut.close();
          cdxOut.close();
         
          if (parseOut != null)
          {
            parseOut.close(reporter);
          }
        }
      };
    }
  }

  /**
   * Copy so I can add collection prefix to produced signature and link
   * CrawlDatums.
   * @author stack
   */
  public static class WaxParseOutputFormat extends ParseOutputFormat
  {
    public final Log LOG = LogFactory.getLog(WaxParseOutputFormat.class);

    private URLNormalizers urlNormalizers;
    private URLFilters filters;
    private ScoringFilters scfilters;
   
    public RecordWriter getRecordWriter(FileSystem fs, JobConf job,
      String name, Progressable progress)
      throws IOException
    {
      // Extract collection prefix from key to use later when adding
      // signature and link crawldatums.

      this.urlNormalizers =
        new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
      this.filters = new URLFilters(job);
      this.scfilters = new ScoringFilters(job);

      final float interval =
        job.getFloat("db.default.fetch.interval", 30f);
      final boolean ignoreExternalLinks =
        job.getBoolean("db.ignore.external.links", false);
      final boolean sha1 = job.getBoolean("wax.digest.sha1", false);

      Path text = new Path(new Path(job.getOutputPath(),
        ParseText.DIR_NAME), name);
      Path data = new Path(new Path(job.getOutputPath(),
        ParseData.DIR_NAME), name);
      Path crawl = new Path(new Path(job.getOutputPath(),
        CrawlDatum.PARSE_DIR_NAME), name);

      final MapFile.Writer textOut = new MapFile.Writer(job, fs,
        text.toString(), Text.class, ParseText.class,
        CompressionType.RECORD);

      final MapFile.Writer dataOut = new MapFile.Writer(job, fs,
        data.toString(), Text.class, ParseData.class);

      final SequenceFile.Writer crawlOut = SequenceFile.createWriter(fs,
        job, crawl, Text.class, CrawlDatum.class);

      return new RecordWriter()
      {
        public void write(WritableComparable key, Writable value)
          throws IOException
        {
          // Test that I can parse the key before I do anything
          // else. If not, write nothing for this record.
          String collection = null;
          String fromUrl = null;
          String fromHost = null;
          String toHost = null;             
         
          try
          {
            collection = Nutchwax.getCollectionFromWaxKey(key);
            fromUrl = Nutchwax.getUrlFromWaxKey(key);
          }
          catch (IOException ioe)
          {
            LOG.warn("Skipping record. Can't parse " + key, ioe);
           
            return;
          }
         
          if (fromUrl == null || collection == null)
          {
            LOG.warn("Skipping record. Null from or collection " +
              key);
           
            return;
          }

          Parse parse = (Parse)value;

          textOut.append(key, new ParseText(parse.getText()));
          ParseData parseData = parse.getData();

          // recover the signature prepared by Fetcher or ParseSegment
          String sig = parseData.getContentMeta().get(
            Nutch.SIGNATURE_KEY);
           
          if (sig != null)
          {
            byte[] signature = (sha1)?
              Base32.decode(sig): StringUtil.fromHexString(sig);
           
            if (signature != null)
            {
              // append a CrawlDatum with a signature
              CrawlDatum d = new CrawlDatum(
                CrawlDatum.STATUS_SIGNATURE, 0.0f);
              d.setSignature(signature);
              crawlOut.append(key, d);
            }
          }

          // collect outlinks for subsequent db update
          Outlink[] links = parseData.getOutlinks();
          if (ignoreExternalLinks)
          {
            try
            {
              fromHost = new URL(fromUrl).getHost().toLowerCase();
            }
            catch (MalformedURLException e)
            {
              fromHost = null;
            }
          }
          else
          {
            fromHost = null;
          }

          String[] toUrls = new String[links.length];
          int validCount = 0;
         
          for (int i = 0; i < links.length; i++)
          {
            String toUrl = links[i].getToUrl();
           
            try
            {
              toUrl = urlNormalizers.normalize(toUrl,URLNormalizers.SCOPE_OUTLINK);            
              toUrl = filters.filter(toUrl); // filter the url
              if (toUrl == null) {
                LOG.warn("Skipping target url because it is null."); // TODO MC remove
              }
            }
            catch (Exception e)
            {
              toUrl = null;
            }
           
            // ignore links to self (or anchors within the page)
            if (fromUrl.equals(toUrl))
            {
              toUrl = null;
            }
           
            if (toUrl != null)
            {
              validCount++;
            }
           
            toUrls[i] = toUrl;
          }

          CrawlDatum adjust = null;
         
          // compute score contributions and adjustment to the
          // original score         
          for (int i = 0; i < toUrls.length; i++)
          {
            if (toUrls[i] == null)
            {
              continue;
            }
           
            if (ignoreExternalLinks)
            {
              try
              {
                toHost = new URL(toUrls[i]).getHost().
                  toLowerCase();
              }
              catch (MalformedURLException e)
              {
                toHost = null;
              }
             
              if (toHost == null || ! toHost.equals(fromHost))
              {
                // external links
                continue; // skip it
              }
            }

            CrawlDatum target = new CrawlDatum(
              CrawlDatum.STATUS_LINKED, interval);
            Text fromURLUTF8 = new Text(fromUrl);
            Text targetUrl = new Text(toUrls[i]);
            adjust = null;
           
            try
            {
              // Scoring now expects first two arguments to be
              // URLs (More reason to do our own scoring).
              // St.Ack
              adjust = scfilters.distributeScoreToOutlink(
                fromURLUTF8, targetUrl, parseData,
                target, null, links.length, validCount);          
            }
            catch (ScoringFilterException e)
            {
              if (LOG.isWarnEnabled())
              {
                LOG.warn("Cannot distribute score from " + key
                  + " to " + target + " - skipped ("
                  + e.getMessage() + ")");
              }
             
              continue;
            }
           
            Text targetKey =
              Nutchwax.generateWaxKey(targetUrl, collection);
            crawlOut.append(targetKey, target);                
            if (adjust != null)
            {
              crawlOut.append(key, adjust);           
            }
          }

          dataOut.append(key, parseData);
        }

        public void close(Reporter reporter) throws IOException
        {
          textOut.close();
          dataOut.close();
          crawlOut.close();
        }
      };
    }
  }

  public void close()
  {
    // Nothing to close.
  }

  public static void doImportUsage(final String message,
    final int exitCode)
  {
    if (message != null && message.length() > 0)
    {
      System.out.println(message);
    }
   
    System.out.println("Usage: hadoop jar nutchwax.jar import <input>" +
      " <output> <collection>");
    System.out.println("Arguments:");
    System.out.println(" input       Directory of files" +
      " listing ARC URLs to import");
    System.out.println(" output      Directory to import to. Import is " +
      "written to a subdir named");
    System.out.println("             for current date plus collection " +
      "under '<output>/segments/'");
    System.out.println(" collection  Collection name. Added to" +
      " each resource.");
    System.exit(exitCode);
  }

  public static void main(String[] args) throws Exception
  {   
    int res = new ImportArcs().
      doMain(NutchwaxConfiguration.getConfiguration(), args);
   
    System.exit(res);
  }

  public int run(final String[] args) throws Exception
  {
    if (args.length != 3)
    {
      doImportUsage("ERROR: Wrong number of arguments passed.", 2);
    }
   
    // The list of ARC URLs is the first arg, the output dir the second, and
    // the collection name the third.
    try
    {
      importArcs(new Path(args[0]), new Path(args[1]), args[2]);
      return 0;
    }
    catch(Exception e)
    {
      LOG.fatal("ImportARCs: " + StringUtils.stringifyException(e));
     
      return -1;
    }
  }
}
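
For context, here is a minimal driver sketch showing how the import might be invoked programmatically rather than through the "hadoop jar nutchwax.jar import <input> <output> <collection>" command line printed by doImportUsage. The driver class name and the paths are hypothetical placeholders, not part of the original source.

import org.apache.hadoop.fs.Path;
import org.archive.access.nutch.NutchwaxConfiguration;
import org.archive.access.nutch.jobs.ImportArcs;

public class ImportArcsDriver
{
  public static void main(String[] args) throws Exception
  {
    // Equivalent to: hadoop jar nutchwax.jar import <input> <output> <collection>
    ImportArcs importer = new ImportArcs(NutchwaxConfiguration.getConfiguration());
    importer.importArcs(
      new Path("arc-urls"),                       // directory of files listing ARC URLs (placeholder)
      new Path("crawl/segments/20060315203614"),  // segment directory to write to (placeholder)
      "test");                                    // collection name added to each resource
  }
}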