// Assumption is that this map is being run by ARCMapRunner.
// Otherwise, the below casts fail.
String url = key.toString();
ARCRecord rec = (ARCRecord)((ObjectWritable)value).get();
ARCReporter reporter = (ARCReporter)r;
// It's null the first time map is called on an ARC.
checkArcName(rec);
if (! isIndex(rec))
{
return;
}
checkCollectionName();
final ARCRecordMetaData arcData = rec.getMetaData();
String oldUrl = url;
try
{
url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_FETCHER);
url = filters.filter(url); // filter the url
}
catch (Exception e)
{
LOG.warn("Skipping record. Didn't pass normalization/filter " +
oldUrl + ": " + e.toString());
return;
}
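// The reported record length is the body length: total length minus the offset
// where content begins (falling back to the full length if contentBegin isn't set).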
final long b = arcData.getContentBegin();
final long l = arcData.getLength();
final long recordLength = (l > b)? (l - b): l;
// Look at the mimetype on the ARCRecord metadata line. It can be empty. If so,
// we get two more chances at figuring it out: by looking at the HTTP headers or
// by looking at the first couple of bytes of the file. See below.
String mimetype =
getMimetype(arcData.getMimetype(), this.mimeTypes, url);
if (skip(mimetype))
{
return;
}
// Copy http headers to nutch metadata.
final Metadata metaData = new Metadata();
final Header[] headers = rec.getHttpHeaders();
for (int j = 0; j < headers.length; j++)
{
final Header header = headers[j];
if (mimetype == null)
{
// Special handling. If the mimetype is still null, try getting it
// from the HTTP header. I've seen ARC record lines with an empty
// content-type and a file ending MIME can't parse, e.g. .MID.
if ((header.getName() != null) &&
header.getName().toLowerCase().equals(ImportArcs.CONTENT_TYPE_KEY))
{
mimetype = getMimetype(header.getValue(), null, null);
if (skip(mimetype))
{
return;
}
}
}
metaData.set(header.getName(), header.getValue());
}
// The reporter.setStatus call below pings the tasktracker, telling it our
// status and that we're still alive (so it doesn't time us out).
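// Task status strings shouldn't contain whitespace, so collapse any whitespace
// in the mimetype to '-' (or use a placeholder when we still have no mimetype).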
final String noSpacesMimetype =
TextUtils.replaceAll(ImportArcs.WHITESPACE,
((mimetype == null || mimetype.length() <= 0)?
"TODO": mimetype),
"-");
final String recordLengthAsStr = Long.toString(recordLength);
reporter.setStatus(getStatus(url, oldUrl, recordLengthAsStr, noSpacesMimetype));
// This is a nutch 'more' field.
metaData.set("contentLength", recordLengthAsStr);
rec.skipHttpHeader();
reporter.setStatusIfElapse("read headers on " + url);
// TODO: Skip if unindexable type.
int total = 0;
// Read in first block. If mimetype still null, look for MAGIC.
int len = rec.read(this.buffer, 0, this.buffer.length);
if (mimetype == null)
{
MimeType mt = this.mimeTypes.getMimeType(this.buffer);
if (mt == null || mt.getName() == null)
{
LOG.warn("Failed to get mimetype for: " + url);
return;
}
mimetype = mt.getName();
}
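// Record whatever mimetype we ended up with (ARC metadata line, HTTP header,
// or magic-byte sniffing) in the nutch metadata.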
metaData.set(ImportArcs.CONTENT_TYPE_KEY, mimetype);
// How much do we read in total? For PDFs we read more. A limit of -1 means
// read everything.
int readLimit = (ImportArcs.PDF_TYPE.equals(mimetype))?
this.pdfContentLimit : this.contentLimit;
// Reset our contentBuffer so we can reuse it. Over the life of ARC
// processing it will grow to the maximum record size seen.
this.contentBuffer.reset();
while ((len != -1) && ((readLimit == -1) || (total < readLimit)))
{
total += len;
this.contentBuffer.write(this.buffer, 0, len);
len = rec.read(this.buffer, 0, this.buffer.length);
reporter.setStatusIfElapse("reading " + url);
}
// Close the record; we're done with it. A side-effect is calculation
// of the digest -- if we're digesting.
rec.close();
reporter.setStatusIfElapse("closed " + url);
final byte[] contentBytes = this.contentBuffer.toByteArray();
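// Build the CrawlDatum that records this fetch.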
final CrawlDatum datum = new CrawlDatum();
datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
// Use the ARC record's precalculated sha1 digest, or else calculate an MD5 over the content.
String digest = (this.sha1)? rec.getDigestStr():
MD5Hash.digest(contentBytes).toString();
metaData.set(Nutch.SIGNATURE_KEY, digest);
// Set the digest back into the arcData so it's available later when we write
// the CDX line.
arcData.setDigest(digest);
metaData.set(Nutch.SEGMENT_NAME_KEY, this.segmentName);
// Score at this stage is 1.0f.
metaData.set(Nutch.SCORE_KEY, Float.toString(datum.getScore()));
final long startTime = System.currentTimeMillis();
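// Wrap the fetched bytes in a nutch Content object; the normalized url serves
// as both the content url and its base url.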
final Content content = new Content(url, url, contentBytes, mimetype,
metaData, getConf());
datum.setFetchTime(Nutchwax.getDate(arcData.getDate()));
MapWritable mw = datum.getMetaData();
if (mw == null)
{
mw = new MapWritable();
}
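// Tag the datum with the collection it belongs to. For the 'multiple'
// collection type, the name presumably carries the ARC date timestamp (via
// SqlSearcher) so each record maps to a per-crawl collection.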
if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE))
{
mw.put(new Text(ImportArcs.ARCCOLLECTION_KEY),
new Text(SqlSearcher.getCollectionNameWithTimestamp(collectionName,
arcData.getDate())));
}
else
{
mw.put(new Text(ImportArcs.ARCCOLLECTION_KEY), new Text(collectionName));
}
mw.put(new Text(ImportArcs.ARCFILENAME_KEY), new Text(arcName));
mw.put(new Text(ImportArcs.ARCFILEOFFSET_KEY),
new Text(Long.toString(arcData.getOffset())));
datum.setMetaData(mw);
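// Hand the content to a pooled TimeoutParsingThread so parsing of a single
// document is bounded by timeoutIndexingDocument.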
TimeoutParsingThread tout = threadPool.getThread(Thread.currentThread().getId(),
timeoutIndexingDocument);
tout.setUrl(url);
tout.setContent(content);
tout.setParseUtil(parseUtil);
tout.wakeupAndWait();
ParseStatus parseStatus = tout.getParseStatus();
Parse parse = tout.getParse();
reporter.setStatusIfElapse("parsed " + url);
if (!parseStatus.isSuccess())
{
final String status = formatToOneLine(parseStatus.toString());
LOG.warn("Error parsing: " + mimetype + " " + url + ": " + status);
parse = null;