/**
 * Returns the descriptive metadata for this crawl as ANVL record text.
 *
 * <p>If {@code cachedMetadata} is non-null it is returned unchanged.
 * Otherwise a fresh {@code ANVLRecord} is assembled from:
 * <ul>
 *   <li>fixed values: {@code software}, {@code format}, {@code conformsTo};</li>
 *   <li>the local host, when it can be resolved: {@code ip}, {@code hostname};</li>
 *   <li>non-blank values from the metadata provider: {@code operator},
 *       {@code publisher}, {@code audience}, {@code isPartOf},
 *       {@code description}, {@code robots}, {@code http-header-user-agent},
 *       {@code http-header-from}.</li>
 * </ul>
 *
 * @return {@code cachedMetadata} when it is set; otherwise an immutable
 *         single-element list holding the string form of the new record
 */
public List<String> getMetadata() {
    if (cachedMetadata != null) {
        return cachedMetadata;
    }
    // NOTE(review): nothing in this method assigns cachedMetadata, so unless
    // it is populated elsewhere the record (including the host lookup below)
    // is rebuilt on every call -- confirm whether the result should be stored.

    ANVLRecord record = new ANVLRecord();
    record.addLabelValue("software", "Heritrix/" +
            ArchiveUtils.VERSION + " http://crawler.archive.org");

    // Host details are best-effort: a failed lookup is logged and the
    // "ip"/"hostname" labels are simply left out of the record.
    try {
        InetAddress host = InetAddress.getLocalHost();
        record.addLabelValue("ip", host.getHostAddress());
        record.addLabelValue("hostname", host.getCanonicalHostName());
    } catch (UnknownHostException e) {
        logger.log(Level.WARNING, "unable to obtain local crawl engine host", e);
    }

    // conforms to ISO 28500:2009 as of May 2009
    // as described at http://bibnum.bnf.fr/WARC/
    // latest draft as of November 2008
    record.addLabelValue("format", "WARC File Format 1.0");
    record.addLabelValue("conformsTo", "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");

    // Get other values from metadata provider
    CrawlMetadata provider = getMetadataProvider();
    addIfNotBlank(record, "operator", provider.getOperator());
    addIfNotBlank(record, "publisher", provider.getOrganization());
    addIfNotBlank(record, "audience", provider.getAudience());
    addIfNotBlank(record, "isPartOf", provider.getJobName());

    // TODO: make date match 'job creation date' as in Heritrix 1.x
    // until then, leave out (plenty of dates already in WARC
    // records)
    // String rawDate = provider.getBeginDate();
    // if(StringUtils.isNotBlank(rawDate)) {
    //     Date date;
    //     try {
    //         date = ArchiveUtils.parse14DigitDate(rawDate);
    //         addIfNotBlank(record,"created",ArchiveUtils.getLog14Date(date));
    //     } catch (ParseException e) {
    //         logger.log(Level.WARNING,"obtaining warc created date",e);
    //     }
    // }

    addIfNotBlank(record, "description", provider.getDescription());

    // Guard against a missing policy name instead of dereferencing it, and
    // lower-case with a fixed locale so the archived value does not vary
    // with the JVM's default locale (e.g. Turkish dotless 'i').
    String robotsPolicy = provider.getRobotsPolicyName();
    if (robotsPolicy != null) {
        addIfNotBlank(record, "robots",
                robotsPolicy.toLowerCase(java.util.Locale.ROOT));
    }

    addIfNotBlank(record, "http-header-user-agent",
            provider.getUserAgent());
    addIfNotBlank(record, "http-header-from",
            provider.getOperatorFrom());

    // really ugly to return as List<String>, but changing would require
    // larger refactoring
    return Collections.singletonList(record.toString());
}