/**
 * Writes a WARC {@code metadata} record for the given URI.
 *
 * <p>The record body is an ANVL document assembled from the CrawlURI:
 * a {@code seed} label for seeds, otherwise optional {@code force-fetch},
 * {@code via}, {@code hopsFromSeed} and {@code sourceTag} entries; then
 * {@code fetchTimeMs}, {@code ftpFetchStatus},
 * {@code charsetForLinkExtraction}, charset-related annotations and one
 * {@code outlink} entry per outlink, each only when available.
 *
 * @param w writer the finished record is handed to
 * @param timestamp value passed to the record's
 *     {@code setCreate14DigitDate}
 * @param baseid base id from which this record's id is derived via
 *     {@code qualifyRecordID}
 * @param curi URI whose crawl metadata is recorded
 * @param namedFields extra headers attached to the record
 * @return the record id assigned to the written record
 * @throws IOException if the record cannot be written
 */
protected URI writeMetadata(final WARCWriter w,
        final String timestamp,
        final URI baseid, final CrawlURI curi,
        final ANVLRecord namedFields)
throws IOException {
    WARCRecordInfo recordInfo = new WARCRecordInfo();
    recordInfo.setType(WARCRecordType.metadata);
    recordInfo.setUrl(curi.toString());
    recordInfo.setCreate14DigitDate(timestamp);
    recordInfo.setMimetype(ANVLRecord.MIMETYPE);
    recordInfo.setExtraHeaders(namedFields);
    recordInfo.setEnforceLength(true);
    recordInfo.setRecordId(qualifyRecordID(baseid, TYPE, WARCRecordType.metadata.toString()));

    // Get some metadata from the curi.
    // TODO: Get all curi metadata.
    // TODO: Use other than ANVL (or rename ANVL as NameValue or use
    // RFC822 (commons-httpclient?).
    ANVLRecord r = new ANVLRecord();
    if (curi.isSeed()) {
        r.addLabel("seed");
    } else {
        if (curi.forceFetch()) {
            r.addLabel("force-fetch");
        }
        // Evaluate each value once and reuse it for both the check and the write.
        final String via = flattenVia(curi);
        if (StringUtils.isNotBlank(via)) {
            r.addLabelValue("via", via);
        }
        final String pathFromSeed = curi.getPathFromSeed();
        if (StringUtils.isNotBlank(pathFromSeed)) {
            r.addLabelValue("hopsFromSeed", pathFromSeed);
        }
        if (curi.containsDataKey(A_SOURCE_TAG)) {
            r.addLabelValue("sourceTag",
                    (String) curi.getData().get(A_SOURCE_TAG));
        }
    }

    // A negative difference is not recorded.
    long duration = curi.getFetchCompletedTime() - curi.getFetchBeginTime();
    if (duration > -1) {
        r.addLabelValue("fetchTimeMs", Long.toString(duration));
    }

    if (curi.getData().containsKey(A_FTP_FETCH_STATUS)) {
        r.addLabelValue("ftpFetchStatus", curi.getData().get(A_FTP_FETCH_STATUS).toString());
    }

    if (curi.getRecorder() != null && curi.getRecorder().getCharset() != null) {
        r.addLabelValue("charsetForLinkExtraction", curi.getRecorder().getCharset().name());
    }

    for (String annotation : curi.getAnnotations()) {
        if (annotation.startsWith("usingCharsetIn") || annotation.startsWith("inconsistentCharsetIn")) {
            // Split into "name:value" on the first colon only, so any
            // further colons stay in the value.
            String[] kv = annotation.split(":", 2);
            if (kv.length == 2) {
                r.addLabelValue(kv[0], kv[1]);
            } else {
                // No colon in the annotation: keep it as a bare label
                // instead of failing on the missing value.
                r.addLabel(kv[0]);
            }
        }
    }

    // Add outlinks though they are effectively useless without anchor text.
    Collection<CrawlURI> links = curi.getOutLinks();
    if (links != null) {
        for (CrawlURI link : links) {
            r.addLabelValue("outlink", link.getURI());
        }
    }

    // TODO: Other curi fields to write to metadata.
    //
    // Credentials
    //
    // fetch-began-time: 1154569278774
    // fetch-completed-time: 1154569281816
    //
    // Annotations.

    // The declared content length is the byte length of the serialized body.
    byte[] b = r.getUTF8Bytes();
    recordInfo.setContentStream(new ByteArrayInputStream(b));
    recordInfo.setContentLength((long) b.length);

    w.writeRecord(recordInfo);
    return recordInfo.getRecordId();
}