/*
* $Id: ImportArcs.java 1521 2007-02-27 18:01:29Z stack-sf $
*
* Copyright (C) 2003 Internet Archive.
*
* This file is part of the archive-access tools project
* (http://sourceforge.net/projects/archive-access).
*
* The archive-access tools are free software; you can redistribute them and/or
* modify them under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or any
* later version.
*
* The archive-access tools are distributed in the hope that they will be
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
* Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License along with
* the archive-access tools; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.archive.access.nutch.jobs;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.NumberFormat;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.Header;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolBase;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.MapWritable;
import org.apache.nutch.fetcher.Fetcher;
import org.apache.nutch.fetcher.FetcherOutput;
import org.apache.nutch.fetcher.FetcherOutputFormat;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseOutputFormat;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.mime.MimeType;
import org.apache.nutch.util.mime.MimeTypeException;
import org.apache.nutch.util.mime.MimeTypes;
import org.archive.access.nutch.Nutchwax;
import org.archive.access.nutch.NutchwaxConfiguration;
import org.archive.access.nutch.jobs.sql.SqlSearcher;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.mapred.ARCMapRunner;
import org.archive.mapred.ARCRecordMapper;
import org.archive.mapred.ARCReporter;
import org.archive.util.Base32;
import org.archive.util.MimetypeUtils;
import org.archive.util.TextUtils;
import org.apache.nutch.global.Global;
/**
 * Ingests ARCs, writing each ARC record's parse as Nutch FetcherOutputFormat.
 * The FetcherOutputFormat (FOF) has five outputs:
 * <ul><li>crawl_fetch holds a fat CrawlDatum of all vitals including metadata.
 * It's written below by our {@link WaxFetcherOutputFormat} (in Nutch by
 * {@link FetcherOutputFormat}). Here is an example CrawlDatum: <pre> Version: 4
* Status: 5 (fetch_success)
* Fetch time: Wed Mar 15 12:38:49 PST 2006
* Modified time: Wed Dec 31 16:00:00 PST 1969
* Retries since fetch: 0
* Retry interval: 0.0 days
* Score: 1.0
* Signature: null
* Metadata: collection:test arcname:IAH-20060315203614-00000-debord arcoffset:5127
* </pre></li>
 * <li>crawl_parse has a CrawlDatum of MD5s. It is used when building the CrawlDB.
 * It's obtained from the fat crawl_fetch CrawlDatum above and written
 * out as part of the parse output done by {@link WaxParseOutputFormat}.
 * This latter class writes three outputs: this crawl_parse plus
 * the following parse_text and parse_data.</li>
* <li>parse_text has text from parse.</li>
 * <li>parse_data has other metadata found by the parse (depends on the
 * parser). This is the only input to the linkdb. The html parser
 * adds the outlinks it found here, plus the content-type, the discovered
 * encoding, the advertised encoding, etc.</li>
* <li>cdx has a summary line for every record processed.</li>
* </ul>
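 * <p>Typical invocation (see {@link #doImportUsage(String, int)}):
 * <pre>hadoop jar nutchwax.jar import &lt;input&gt; &lt;output&gt; &lt;collection&gt;</pre>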
*/
public class ImportArcs extends ToolBase implements ARCRecordMapper
{
public final Log LOG = LogFactory.getLog(ImportArcs.class);
private final NumberFormat numberFormatter = NumberFormat.getInstance();
private static final String WHITESPACE = "\\s+";
public static final String ARCFILENAME_KEY = "arcname";
public static final String ARCFILEOFFSET_KEY = "arcoffset";
private static final String CONTENT_TYPE_KEY = "content-type";
private static final String TEXT_TYPE = "text/";
private static final String APPLICATION_TYPE = "application/";
public static final String ARCCOLLECTION_KEY = "collection";
public static final String WAX_SUFFIX = "wax.";
public static final String WAX_COLLECTION_KEY = WAX_SUFFIX + ARCCOLLECTION_KEY;
private static final String PDF_TYPE = "application/pdf";
private boolean indexAll;
private int contentLimit;
private int pdfContentLimit;
private MimeTypes mimeTypes;
private String segmentName;
private String collectionName;
private int parseThreshold = -1;
private boolean indexRedirects;
private boolean sha1 = false;
private boolean arcNameFromFirstRecord = true;
private String arcName;
private String collectionType;
private int timeoutIndexingDocument;
/**
 * Usually the URL in the first record looks like this:
 * filedesc://IAH-20060315203614-00000-debord.arc. But in old
 * ARCs it can look like this: filedesc://19961022/IA-000001.arc.
*/
private static final Pattern FILEDESC_PATTERN =
Pattern.compile("^(?:filedesc://)(?:[0-9]+\\/)?(.+)(?:\\.arc)$");
private static final Pattern TAIL_PATTERN =
Pattern.compile("(?:.*(?:/|\\\\))?(.+)(?:\\.arc|\\.arc\\.gz)$");
/**
* Buffer to reuse on each ARCRecord indexing.
*/
private final byte[] buffer = new byte[1024 * 16];
private final ByteArrayOutputStream contentBuffer =
new ByteArrayOutputStream(1024 * 16);
private URLNormalizers urlNormalizers;
private URLFilters filters;
private ParseUtil parseUtil;
private static final Text CDXKEY = new Text("cdx");
// A pool holding a single parsing thread per map task; it does not need to be static.
private TimeoutParsingThreadPool threadPool = new TimeoutParsingThreadPool();
public ImportArcs()
{
super();
}
public ImportArcs(Configuration conf)
{
setConf(conf);
}
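/**
 * Configure and run the import MapReduce job.
 * @param arcUrlsDir Directory of files listing ARC URLs to import.
 * @param segment Segment directory the job output is written into.
 * @param collection Collection name stamped onto each record (may be null or empty).
 * @throws IOException If the job fails.
 */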
public void importArcs(final Path arcUrlsDir, final Path segment,
final String collection)
throws IOException
{
LOG.info("ImportArcs segment: " + segment + ", src: " + arcUrlsDir);
final JobConf job = new JobConf(getConf(), this.getClass());
job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());
job.setInputPath(arcUrlsDir);
//job.setMapRunnerClass(job.getClass("wax.import.maprunner", ARCMapRunner.class));
//job.setMapperClass(job.getClass("wax.import.mapper", this.getClass()));
job.setMapRunnerClass(ARCMapRunner.class); // compatible with hadoop 0.14 TODO MC
job.setMapperClass(this.getClass());
job.setInputFormat(TextInputFormat.class);
job.setOutputPath(segment);
job.setOutputFormat(WaxFetcherOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FetcherOutput.class);
// Pass the collection name out to the tasks IF non-null.
if ((collection != null) && (collection.length() > 0))
{
job.set(ImportArcs.WAX_SUFFIX + ImportArcs.ARCCOLLECTION_KEY,
collection);
}
job.setJobName("import " + arcUrlsDir + " " + segment);
JobClient.runJob(job);
LOG.info("ImportArcs: done");
}
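/**
 * Pull task settings -- content limits, digest type, collection name and
 * type, parse-rate threshold, indexing timeout -- from the passed JobConf.
 * @param job The task's JobConf.
 */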
public void configure(final JobConf job)
{
setConf(job);
this.indexAll = job.getBoolean("wax.index.all", false);
this.contentLimit = job.getInt("http.content.limit", 1024 * 100);
final int pdfMultiplicand = job.getInt("wax.pdf.size.multiplicand", 10);
this.pdfContentLimit = (this.contentLimit == -1) ? this.contentLimit
: pdfMultiplicand * this.contentLimit;
this.mimeTypes = MimeTypes.get(job.get("mime.types.file"));
this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY);
// Get the rsync protocol handler into the mix.
System.setProperty("java.protocol.handler.pkgs", "org.archive.net");
// Format numbers output by parse rate logging.
this.numberFormatter.setMaximumFractionDigits(2);
this.numberFormatter.setMinimumFractionDigits(2);
this.parseThreshold = job.getInt("wax.parse.rate.threshold", -1);
this.indexRedirects = job.getBoolean("wax.index.redirects", false);
this.sha1 = job.getBoolean("wax.digest.sha1", false);
this.urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_FETCHER);
this.filters = new URLFilters(job);
this.parseUtil = new ParseUtil(job);
this.collectionName = job.get(ImportArcs.WAX_SUFFIX + ImportArcs.ARCCOLLECTION_KEY);
// Get the ARC name by reading the first record in the ARC? Otherwise, we
// parse the name of the file we've been passed to find an ARC name.
this.arcNameFromFirstRecord = job.getBoolean("wax.arcname.from.first.record", true);
this.collectionType = job.get(Global.COLLECTION_TYPE);
this.timeoutIndexingDocument = job.getInt(Global.TIMEOUT_INDEXING_DOCUMENT, -1);
LOG.info("ImportArcs collectionType: " + collectionType);
}
public Configuration getConf()
{
return this.conf;
}
public void setConf(Configuration c)
{
this.conf = c;
}
public void onARCOpen()
{
// Nothing to do.
}
public void onARCClose()
{
threadPool.closeAll(); // close the only thread created for this map
}
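/**
 * Map a single ARCRecord: normalize and filter the URL, copy HTTP headers
 * and mimetype into Nutch metadata, read the record content (bounded by the
 * content limit), digest it, parse it on the timeout thread, and collect a
 * FetcherOutput keyed by the wax key (collection + URL).
 * @param key URL of the record (as passed by ARCMapRunner).
 * @param value ObjectWritable wrapping the ARCRecord.
 * @param output Collector for the FetcherOutput.
 * @param r Reporter; must be an ARCReporter.
 */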
public void map(final WritableComparable key, final Writable value,
final OutputCollector output, final Reporter r)
throws IOException
{
// Assumption is that this map is being run by ARCMapRunner.
// Otherwise, the below casts fail.
String url = key.toString();
ARCRecord rec = (ARCRecord)((ObjectWritable)value).get();
ARCReporter reporter = (ARCReporter)r;
// The arcName is null the first time map is called on an ARC; (re)set it from this record.
checkArcName(rec);
if (! isIndex(rec))
{
return;
}
checkCollectionName();
final ARCRecordMetaData arcData = rec.getMetaData();
String oldUrl = url;
try
{
url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_FETCHER);
url = filters.filter(url); // filter the url
}
catch (Exception e)
{
LOG.warn("Skipping record. Didn't pass normalization/filter " +
oldUrl + ": " + e.toString());
return;
}
final long b = arcData.getContentBegin();
final long l = arcData.getLength();
final long recordLength = (l > b)? (l - b): l;
// Look at the ARCRecord metadata line mimetype. It can be empty. If so,
// we get two more chances at figuring it out: by looking at the HTTP headers
// or at the first few bytes of the file. See below.
String mimetype =
getMimetype(arcData.getMimetype(), this.mimeTypes, url);
if (skip(mimetype))
{
return;
}
// Copy http headers to nutch metadata.
final Metadata metaData = new Metadata();
final Header[] headers = rec.getHttpHeaders();
for (int j = 0; j < headers.length; j++)
{
final Header header = headers[j];
if (mimetype == null)
{
// Special handling. If mimetype is still null, try getting it
// from the http header. I've seen arc record lines with empty
// content-type and a MIME-unparseable file ending; e.g. .MID.
if ((header.getName() != null) &&
header.getName().toLowerCase().equals(ImportArcs.CONTENT_TYPE_KEY))
{
mimetype = getMimetype(header.getValue(), null, null);
if (skip(mimetype))
{
return;
}
}
}
metaData.set(header.getName(), header.getValue());
}
// This call to reporter setStatus pings the tasktracker telling it our
// status and telling the task tracker we're still alive (so it doesn't
// time us out).
final String noSpacesMimetype =
TextUtils.replaceAll(ImportArcs.WHITESPACE,
((mimetype == null || mimetype.length() <= 0)?
"TODO": mimetype),
"-");
final String recordLengthAsStr = Long.toString(recordLength);
reporter.setStatus(getStatus(url, oldUrl, recordLengthAsStr, noSpacesMimetype));
// This is a nutch 'more' field.
metaData.set("contentLength", recordLengthAsStr);
rec.skipHttpHeader();
reporter.setStatusIfElapse("read headers on " + url);
// TODO: Skip if unindexable type.
int total = 0;
// Read in first block. If mimetype still null, look for MAGIC.
int len = rec.read(this.buffer, 0, this.buffer.length);
if (mimetype == null)
{
MimeType mt = this.mimeTypes.getMimeType(this.buffer);
if (mt == null || mt.getName() == null)
{
LOG.warn("Failed to get mimetype for: " + url);
return;
}
mimetype = mt.getName();
}
metaData.set(ImportArcs.CONTENT_TYPE_KEY, mimetype);
// How much do we read total? If pdf, we will read more. If equal to -1,
// read all.
int readLimit = (ImportArcs.PDF_TYPE.equals(mimetype))?
this.pdfContentLimit : this.contentLimit;
// Reset our contentBuffer so we can reuse it. Over the life of an ARC's
// processing it will grow to the maximum record size.
this.contentBuffer.reset();
while ((len != -1) && ((readLimit == -1) || (total < readLimit)))
{
total += len;
this.contentBuffer.write(this.buffer, 0, len);
len = rec.read(this.buffer, 0, this.buffer.length);
reporter.setStatusIfElapse("reading " + url);
}
// Close the Record. We're done with it. Side-effect is calculation
// of digest -- if we're digesting.
rec.close();
reporter.setStatusIfElapse("closed " + url);
final byte[] contentBytes = this.contentBuffer.toByteArray();
final CrawlDatum datum = new CrawlDatum();
datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
// Calculate digest or use precalculated sha1.
String digest = (this.sha1)? rec.getDigestStr():
MD5Hash.digest(contentBytes).toString();
metaData.set(Nutch.SIGNATURE_KEY, digest);
// Set digest back into the arcData so available later when we write
// CDX line.
arcData.setDigest(digest);
metaData.set(Nutch.SEGMENT_NAME_KEY, this.segmentName);
// Score at this stage is 1.0f.
metaData.set(Nutch.SCORE_KEY, Float.toString(datum.getScore()));
final long startTime = System.currentTimeMillis();
final Content content = new Content(url, url, contentBytes, mimetype,
metaData, getConf());
datum.setFetchTime(Nutchwax.getDate(arcData.getDate()));
MapWritable mw = datum.getMetaData();
if (mw == null)
{
mw = new MapWritable();
}
if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {
mw.put(new Text(ImportArcs.ARCCOLLECTION_KEY), new Text(SqlSearcher.getCollectionNameWithTimestamp(collectionName, arcData.getDate())));
}
else {
mw.put(new Text(ImportArcs.ARCCOLLECTION_KEY), new Text(collectionName));
}
mw.put(new Text(ImportArcs.ARCFILENAME_KEY), new Text(arcName));
mw.put(new Text(ImportArcs.ARCFILEOFFSET_KEY),
new Text(Long.toString(arcData.getOffset())));
datum.setMetaData(mw);
TimeoutParsingThread tout = threadPool.getThread(Thread.currentThread().getId(), timeoutIndexingDocument);
tout.setUrl(url);
tout.setContent(content);
tout.setParseUtil(parseUtil);
tout.wakeupAndWait();
ParseStatus parseStatus = tout.getParseStatus();
Parse parse = tout.getParse();
reporter.setStatusIfElapse("parsed " + url);
if (!parseStatus.isSuccess()) {
final String status = formatToOneLine(parseStatus.toString());
LOG.warn("Error parsing: " + mimetype + " " + url + ": " + status);
parse = null;
}
else {
// Was it a slow parse?
final double kbPerSecond = getParseRate(startTime,
(contentBytes != null) ? contentBytes.length : 0);
if (LOG.isDebugEnabled())
{
LOG.debug(getParseRateLogMessage(url,
noSpacesMimetype, kbPerSecond));
}
else if (kbPerSecond < this.parseThreshold)
{
LOG.warn(getParseRateLogMessage(url, noSpacesMimetype,
kbPerSecond));
}
}
Writable v = new FetcherOutput(datum, null,
parse != null ? new ParseImpl(parse) : null);
if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {
LOG.info("multiple: "+SqlSearcher.getCollectionNameWithTimestamp(this.collectionName,arcData.getDate())+" "+url);
output.collect(Nutchwax.generateWaxKey(url,SqlSearcher.getCollectionNameWithTimestamp(this.collectionName,arcData.getDate())), v);
}
else {
output.collect(Nutchwax.generateWaxKey(url, this.collectionName), v);
}
}
public void setCollectionName(String collectionName)
{
this.collectionName = collectionName;
checkCollectionName();
}
public String getArcName()
{
return this.arcName;
}
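/**
 * Set the ARC name from the record's backing file, stripping any
 * '.arc.gz' suffix.
 * @param rec Record whose ARC file name we use.
 */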
public void checkArcName(ARCRecord rec)
{
this.arcName = rec.getMetaData().getArcFile().getName();
this.arcName = this.arcName.replace(".arc.gz", "");
}
protected boolean checkCollectionName()
{
if ((this.collectionName != null) && this.collectionName.length() > 0)
{
return true;
}
throw new NullPointerException("Collection name can't be empty");
}
/**
* @param rec ARC Record to test.
* @return True if we are to index this record.
*/
protected boolean isIndex(final ARCRecord rec)
{
return ((rec.getStatusCode() >= 200) && (rec.getStatusCode() < 300))
|| (this.indexRedirects && ((rec.getStatusCode() >= 300) &&
(rec.getStatusCode() < 400)));
}
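/**
 * Build the status line reported to the tasktracker: url, original url
 * (or '-' if unchanged), record length and mimetype.
 */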
protected String getStatus(final String url, String oldUrl,
final String recordLengthAsStr, final String noSpacesMimetype)
{
// If oldUrl is the same as url, don't log it. Otherwise, log the original
// so we keep the url as originally imported.
if (oldUrl.equals(url))
{
oldUrl = "-";
}
StringBuilder sb = new StringBuilder(128);
sb.append("adding ");
sb.append(url);
sb.append(" ");
sb.append(oldUrl);
sb.append(" ");
sb.append(recordLengthAsStr);
sb.append(" ");
sb.append(noSpacesMimetype);
return sb.toString();
}
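/**
 * Collapse a multi-line string to a single line by dropping tabs,
 * newlines and carriage returns.
 */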
protected String formatToOneLine(final String s)
{
final StringBuffer sb = new StringBuffer(s.length());
for (final StringTokenizer st = new StringTokenizer(s, "\t\n\r");
st.hasMoreTokens();)
{
sb.append(st.nextToken());
}
return sb.toString();
}
protected String getParseRateLogMessage(final String url,
final String mimetype, final double kbPerSecond)
{
return url + " " + mimetype + " parse KB/Sec "
+ this.numberFormatter.format(kbPerSecond);
}
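/**
 * @param startTime Time the parse started, in milliseconds.
 * @param len Content length in bytes.
 * @return Parse rate in KB per second (0 if len is 0).
 */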
protected double getParseRate(final long startTime, final long len)
{
// Compute the parse rate (KB/sec), guarding against a zero elapsed time:
long elapsedTime = System.currentTimeMillis() - startTime;
elapsedTime = (elapsedTime == 0) ? 1 : elapsedTime;
return (len != 0) ? ((double) len / 1024)
/ ((double) elapsedTime / 1000) : 0;
}
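/**
 * @param mimetype Mimetype to test (may be null).
 * @return True if this record should be skipped: i.e. we're not indexing
 * all content and the mimetype is null or neither 'text/*' nor 'application/*'.
 */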
protected boolean skip(final String mimetype)
{
boolean decision = false;
// Are we to index all content?
if (!this.indexAll)
{
if ((mimetype == null)
|| (!mimetype.startsWith(ImportArcs.TEXT_TYPE) && !mimetype
.startsWith(ImportArcs.APPLICATION_TYPE)))
{
// Skip any but basic types.
decision = true;
}
}
return decision;
}
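/**
 * Figure the mimetype: first from the passed mimetype string and, failing
 * that, by looking the URL up in the passed MimeTypes registry.
 * @return A sane, lowercased mimetype, or null if none could be determined.
 */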
protected String getMimetype(final String mimetype, final MimeTypes mts,
final String url)
{
if (mimetype != null && mimetype.length() > 0)
{
return checkMimetype(mimetype.toLowerCase());
}
if (mts != null && url != null)
{
final MimeType mt = mts.getMimeType(url);
if (mt != null)
{
return checkMimetype(mt.getName().toLowerCase());
}
}
return null;
}
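/**
 * Sanity-check a mimetype string.
 * @return The passed mimetype, or null if it is empty, is of the 'no-type'
 * type, or does not parse as a valid MimeType.
 */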
protected static String checkMimetype(String mimetype)
{
if ((mimetype == null) || (mimetype.length() <= 0) ||
mimetype.startsWith(MimetypeUtils.NO_TYPE_MIMETYPE))
{
return null;
}
// Test the mimetype makes sense. If not, clear it.
try
{
new MimeType(mimetype);
}
catch (final MimeTypeException e)
{
mimetype = null;
}
return mimetype;
}
/**
 * Override of the Nutch FetcherOutputFormat so I can substitute my own
 * ParseOutputFormat, {@link WaxParseOutputFormat}. While I'm here,
 * content references are removed; NutchWAX doesn't save content.
* @author stack
*/
public static class WaxFetcherOutputFormat extends FetcherOutputFormat
{
public RecordWriter getRecordWriter(final FileSystem fs,
final JobConf job, final String name, Progressable progress)
throws IOException
{
Path f = new Path(job.getOutputPath(), CrawlDatum.FETCH_DIR_NAME);
final Path fetch = new Path(f, name);
final MapFile.Writer fetchOut = new MapFile.Writer(job, fs,
fetch.toString(), Text.class, CrawlDatum.class);
// Write a cdx file. Write w/o compression.
Path cdx = new Path(new Path(job.getOutputPath(), "cdx"), name);
final SequenceFile.Writer cdxOut = SequenceFile.createWriter(fs,
job, cdx, Text.class, Text.class,
SequenceFile.CompressionType.NONE);
return new RecordWriter()
{
private RecordWriter parseOut;
// Initialization
{
if (Fetcher.isParsing(job))
{
// Here is the NutchWAX change: use WaxParseOutputFormat
// instead of ParseOutputFormat.
this.parseOut = new WaxParseOutputFormat().
getRecordWriter(fs, job, name, null);
}
}
public void write(WritableComparable key, Writable value)
throws IOException
{
FetcherOutput fo = (FetcherOutput)value;
MapWritable mw = fo.getCrawlDatum().getMetaData();
Text cdxLine = (Text)mw.get(ImportArcs.CDXKEY);
if (cdxLine != null)
{
cdxOut.append(key, cdxLine);
}
mw.remove(ImportArcs.CDXKEY);
fetchOut.append(key, fo.getCrawlDatum());
if (fo.getParse() != null)
{
parseOut.write(key, fo.getParse());
}
}
public void close(Reporter reporter) throws IOException
{
fetchOut.close();
cdxOut.close();
if (parseOut != null)
{
parseOut.close(reporter);
}
}
};
}
}
/**
 * Copy of the Nutch ParseOutputFormat so I can add the collection prefix
 * to the produced signature and link CrawlDatums.
* @author stack
*/
public static class WaxParseOutputFormat extends ParseOutputFormat
{
public final Log LOG = LogFactory.getLog(WaxParseOutputFormat.class);
private URLNormalizers urlNormalizers;
private URLFilters filters;
private ScoringFilters scfilters;
public RecordWriter getRecordWriter(FileSystem fs, JobConf job,
String name, Progressable progress)
throws IOException
{
// Extract collection prefix from key to use later when adding
// signature and link crawldatums.
this.urlNormalizers =
new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
this.filters = new URLFilters(job);
this.scfilters = new ScoringFilters(job);
final float interval =
job.getFloat("db.default.fetch.interval", 30f);
final boolean ignoreExternalLinks =
job.getBoolean("db.ignore.external.links", false);
final boolean sha1 = job.getBoolean("wax.digest.sha1", false);
Path text = new Path(new Path(job.getOutputPath(),
ParseText.DIR_NAME), name);
Path data = new Path(new Path(job.getOutputPath(),
ParseData.DIR_NAME), name);
Path crawl = new Path(new Path(job.getOutputPath(),
CrawlDatum.PARSE_DIR_NAME), name);
final MapFile.Writer textOut = new MapFile.Writer(job, fs,
text.toString(), Text.class, ParseText.class,
CompressionType.RECORD);
final MapFile.Writer dataOut = new MapFile.Writer(job, fs,
data.toString(), Text.class, ParseData.class);
final SequenceFile.Writer crawlOut = SequenceFile.createWriter(fs,
job, crawl, Text.class, CrawlDatum.class);
return new RecordWriter()
{
public void write(WritableComparable key, Writable value)
throws IOException
{
// Test that I can parse the key before I do anything
// else. If not, write nothing for this record.
String collection = null;
String fromUrl = null;
String fromHost = null;
String toHost = null;
try
{
collection = Nutchwax.getCollectionFromWaxKey(key);
fromUrl = Nutchwax.getUrlFromWaxKey(key);
}
catch (IOException ioe)
{
LOG.warn("Skipping record. Can't parse " + key, ioe);
return;
}
if (fromUrl == null || collection == null)
{
LOG.warn("Skipping record. Null from or collection " +
key);
return;
}
Parse parse = (Parse)value;
textOut.append(key, new ParseText(parse.getText()));
ParseData parseData = parse.getData();
// recover the signature prepared by Fetcher or ParseSegment
String sig = parseData.getContentMeta().get(
Nutch.SIGNATURE_KEY);
if (sig != null)
{
byte[] signature = (sha1)?
Base32.decode(sig): StringUtil.fromHexString(sig);
if (signature != null)
{
// append a CrawlDatum with a signature
CrawlDatum d = new CrawlDatum(
CrawlDatum.STATUS_SIGNATURE, 0.0f);
d.setSignature(signature);
crawlOut.append(key, d);
}
}
// collect outlinks for subsequent db update
Outlink[] links = parseData.getOutlinks();
if (ignoreExternalLinks)
{
try
{
fromHost = new URL(fromUrl).getHost().toLowerCase();
}
catch (MalformedURLException e)
{
fromHost = null;
}
}
else
{
fromHost = null;
}
String[] toUrls = new String[links.length];
int validCount = 0;
for (int i = 0; i < links.length; i++)
{
String toUrl = links[i].getToUrl();
try
{
toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK);
toUrl = filters.filter(toUrl); // filter the url
if (toUrl == null) {
LOG.warn("Skipping url (target) because it is null."); // TODO MC remove
}
}
catch (Exception e)
{
toUrl = null;
}
// ignore links to self (or anchors within the page)
if (fromUrl.equals(toUrl))
{
toUrl = null;
}
if (toUrl != null)
{
validCount++;
}
toUrls[i] = toUrl;
}
CrawlDatum adjust = null;
// compute score contributions and adjustment to the
// original score
for (int i = 0; i < toUrls.length; i++)
{
if (toUrls[i] == null)
{
continue;
}
if (ignoreExternalLinks)
{
try
{
toHost = new URL(toUrls[i]).getHost().
toLowerCase();
}
catch (MalformedURLException e)
{
toHost = null;
}
if (toHost == null || ! toHost.equals(fromHost))
{
// external links
continue; // skip it
}
}
CrawlDatum target = new CrawlDatum(
CrawlDatum.STATUS_LINKED, interval);
Text fromURLUTF8 = new Text(fromUrl);
Text targetUrl = new Text(toUrls[i]);
adjust = null;
try
{
// Scoring now expects first two arguments to be
// URLs (More reason to do our own scoring).
// St.Ack
adjust = scfilters.distributeScoreToOutlink(
fromURLUTF8, targetUrl, parseData,
target, null, links.length, validCount);
}
catch (ScoringFilterException e)
{
if (LOG.isWarnEnabled())
{
LOG.warn("Cannot distribute score from " + key
+ " to " + target + " - skipped ("
+ e.getMessage());
}
continue;
}
Text targetKey =
Nutchwax.generateWaxKey(targetUrl, collection);
crawlOut.append(targetKey, target);
if (adjust != null)
{
crawlOut.append(key, adjust);
}
}
dataOut.append(key, parseData);
}
public void close(Reporter reporter) throws IOException
{
textOut.close();
dataOut.close();
crawlOut.close();
}
};
}
}
public void close()
{
// Nothing to close.
}
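/**
 * Print usage for the import command and exit.
 * @param message Optional message printed before the usage text.
 * @param exitCode Code passed to System.exit.
 */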
public static void doImportUsage(final String message,
final int exitCode)
{
if (message != null && message.length() > 0)
{
System.out.println(message);
}
System.out.println("Usage: hadoop jar nutchwax.jar import <input>" +
" <output> <collection>");
System.out.println("Arguments:");
System.out.println(" input Directory of files" +
" listing ARC URLs to import");
System.out.println(" output Directory to import to. Inport is " +
"written to a subdir named");
System.out.println(" for current date plus collection " +
"under '<output>/segments/'");
System.out.println(" collection Collection name. Added to" +
" each resource.");
System.exit(exitCode);
}
public static void main(String[] args) throws Exception
{
int res = new ImportArcs().
doMain(NutchwaxConfiguration.getConfiguration(), args);
System.exit(res);
}
public int run(final String[] args) throws Exception
{
if (args.length != 3)
{
doImportUsage("ERROR: Wrong number of arguments passed.", 2);
}
// Assume the list of ARC URLs is the first arg, the output dir the second,
// and the collection name the third.
try
{
importArcs(new Path(args[0]), new Path(args[1]), args[2]);
return 0;
}
catch(Exception e)
{
LOG.fatal("ImportARCs: " + StringUtils.stringifyException(e));
return -1;
}
}
}