{
// The assumption is that this map is being run by ARCMapRunner;
// otherwise, the casts below fail.
String url = key.toString();
ARCRecord rec = (ARCRecord)((ObjectWritable)value).get();
ARCReporter reporter = (ARCReporter)r;
// It's null the first time map is called on an ARC.
checkArcName(rec);
if (! isIndex(rec))
{
return;
}
checkCollectionName();
final ARCRecordMetaData arcData = rec.getMetaData();
String oldUrl = url;
try
{
url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_FETCHER);
url = filters.filter(url); // filter the url
}
catch (Exception e)
{
LOG.warn("Skipping record. Didn't pass normalization/filter " +
oldUrl + ": " + e.toString());
return;
}
if (url == null)
{
// Nutch URLFilters signal rejection by returning null.
LOG.warn("Skipping record. Filtered out: " + oldUrl);
return;
}
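// Content length is the record length minus the offset at which the
// content begins (i.e. minus the HTTP response header bytes). If the
// begin offset doesn't fall inside the record, fall back to the full
// length.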
final long b = arcData.getContentBegin();
final long l = arcData.getLength();
final long recordLength = (l > b)? (l - b): l;
// Look at the ARCRecord metadata line's mimetype. It can be empty. If
// so, we get two more chances at figuring it out: the HTTP headers or
// the first few bytes of the file. See below.
String mimetype =
getMimetype(arcData.getMimetype(), this.mimeTypes, url);
if (skip(mimetype))
{
return;
}
// Copy HTTP headers to Nutch metadata.
final Metadata metaData = new Metadata();
final Header[] headers = rec.getHttpHeaders();
for (int j = 0; j < headers.length; j++)
{
final Header header = headers[j];
if (mimetype == null)
{
// Special handling. If the mimetype is still null, try getting it
// from the HTTP header. I've seen ARC record lines with an empty
// content-type and a file ending MIME can't parse; e.g. .MID.
if ((header.getName() != null) &&
header.getName().toLowerCase().equals(ImportArcs.CONTENT_TYPE_KEY))
{
mimetype = getMimetype(header.getValue(), null, null);
if (skip(mimetype))
{
return;
}
}
}
metaData.set(header.getName(), header.getValue());
}
// This call to reporter.setStatus pings the tasktracker, reporting
// our status and letting it know we're still alive (so it doesn't
// time us out).
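// Whitespace in the mimetype is collapsed to '-' so the status string
// stays a single token; 'TODO' stands in when the mimetype is empty.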
final String noSpacesMimetype =
TextUtils.replaceAll(ImportArcs.WHITESPACE,
((mimetype == null || mimetype.length() <= 0)?
"TODO": mimetype),
"-");
final String recordLengthAsStr = Long.toString(recordLength);
reporter.setStatus(getStatus(url, oldUrl, recordLengthAsStr, noSpacesMimetype));
// This is a Nutch 'more' field.
metaData.set("contentLength", recordLengthAsStr);
rec.skipHttpHeader();
reporter.setStatusIfElapse("read headers on " + url);
// TODO: Skip if unindexable type.
int total = 0;
// Read in the first block. If the mimetype is still null, sniff it
// from the content's magic bytes.
int len = rec.read(this.buffer, 0, this.buffer.length);
if (mimetype == null)
{
MimeType mt = this.mimeTypes.getMimeType(this.buffer);
if (mt == null || mt.getName() == null)
{
LOG.warn("Failed to get mimetype for: " + url);
return;
}
mimetype = mt.getName();
}
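// By now the mimetype is non-null: it came from the ARC metadata
// line, an HTTP header, or the content sniffing above.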
metaData.set(ImportArcs.CONTENT_TYPE_KEY, mimetype);
// How much do we read in total? For PDFs we read more. A limit of -1
// means read everything.
int readLimit = (ImportArcs.PDF_TYPE.equals(mimetype))?
this.pdfContentLimit : this.contentLimit;
// Reset our contentBuffer so we can reuse it. Over the life of an
// ARC's processing it will grow to the maximum record size.
this.contentBuffer.reset();
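// Note the limit is checked per block, so total may overshoot
// readLimit by up to one buffer's worth.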
while ((len != -1) && ((readLimit == -1) || (total < readLimit)))
{
total += len;
this.contentBuffer.write(this.buffer, 0, len);
len = rec.read(this.buffer, 0, this.buffer.length);
reporter.setStatusIfElapse("reading " + url);
}
// Close the record. We're done with it. A side-effect is calculation
// of the digest, if we're digesting.
rec.close();
reporter.setStatusIfElapse("closed " + url);
final byte[] contentBytes = this.contentBuffer.toByteArray();
final CrawlDatum datum = new CrawlDatum();
datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
// Calculate an MD5 digest or use the precalculated SHA1 from the ARC.
String digest = (this.sha1)? rec.getDigestStr():
MD5Hash.digest(contentBytes).toString();
metaData.set(Nutch.SIGNATURE_KEY, digest);
// Set the digest back into the arcData so it's available later when
// we write the CDX line.