/**
 * Builds a {@link SearchResult} describing a single ARC record.
 *
 * <p>The result always carries the ARC file name (directory part removed),
 * the record offset, digest, MIME type, capture date and original URL. The
 * HTTP code defaults to "-". For DNS records (URL starting with
 * {@code WaybackConstants.DNS_URL_PREFIX}) the host is the remainder of the
 * URL after the prefix, the redirect URL is "-" and the URL key is the URL
 * itself. For all other records that have a host, the HTTP status code, the
 * redirect URL resolved from the first "Location" header (or "-") and the
 * canonicalized URL key are added. Records whose URL has no host are logged
 * and returned without host, redirect and URL-key fields.
 *
 * @param rec the ARC record to adapt; it is closed by this method
 * @return the populated result, or {@code null} if the record is the ARC
 *         filedesc record (URL starts with {@code ARCRecord.ARC_MAGIC_NUMBER})
 * @throws IOException propagated from closing the record or from parsing the
 *         record URL
 */
private SearchResult adaptInner(ARCRecord rec) throws IOException {
    // NOTE(review): presumably the record has to be fully consumed/closed
    // before its digest can be read below -- confirm against ARCRecord.
    rec.close();
    ARCRecordMetaData meta = rec.getMetaData();

    String uriStr = meta.getUrl();
    if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) {
        // skip filedesc record altogether; checked before any result is
        // built so no work is wasted on a record that is discarded.
        return null;
    }

    SearchResult result = new SearchResult();
    result.put(WaybackConstants.RESULT_ARC_FILE, arcBaseName(meta.getArc()));
    result.put(WaybackConstants.RESULT_OFFSET,
            String.valueOf(meta.getOffset()));
    // initialize with default HTTP code...
    result.put(WaybackConstants.RESULT_HTTP_CODE, "-");
    result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr());
    result.put(WaybackConstants.RESULT_MIME_TYPE, meta.getMimetype());
    result.put(WaybackConstants.RESULT_CAPTURE_DATE, meta.getDate());

    if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) {
        // skip URL + HTTP header processing for dns records...
        String origHost = uriStr.substring(
                WaybackConstants.DNS_URL_PREFIX.length());
        result.put(WaybackConstants.RESULT_ORIG_HOST, origHost);
        result.put(WaybackConstants.RESULT_REDIRECT_URL, "-");
        result.put(WaybackConstants.RESULT_URL, uriStr);
        result.put(WaybackConstants.RESULT_URL_KEY, uriStr);
        return result;
    }

    UURI uri = UURIFactory.getInstance(uriStr);
    result.put(WaybackConstants.RESULT_URL, uriStr);
    String uriHost = uri.getHost();
    if (uriHost == null) {
        // Host-less URLs are still returned, but without host, redirect
        // and URL-key fields (behavior kept as-is for existing callers).
        LOGGER.info("No host in " + uriStr + " in " + meta.getArc());
        return result;
    }

    result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost);
    String statusCode = meta.getStatusCode();
    result.put(WaybackConstants.RESULT_HTTP_CODE,
            (statusCode == null) ? "-" : statusCode);
    result.put(WaybackConstants.RESULT_REDIRECT_URL,
            resolveRedirectUrl(rec.getHttpHeaders(), uri, uriStr,
                    meta.getArc()));
    result.put(WaybackConstants.RESULT_URL_KEY,
            canonicalizer.urlStringToKey(uriStr));
    return result;
}

/**
 * Returns the part of {@code arcPath} after the last file separator.
 * The path is returned unchanged when it contains no separator or when the
 * separator is its last character.
 */
private String arcBaseName(String arcPath) {
    int index = arcPath.lastIndexOf(File.separator);
    // ">= 0" so that a separator in the very first position ("/foo.arc")
    // is stripped as well.
    if (index >= 0 && (index + 1) < arcPath.length()) {
        return arcPath.substring(index + 1);
    }
    return arcPath;
}

/**
 * Resolves the value of the first "Location" header against {@code base}.
 * Returns "-" when there are no headers, no Location header, or the
 * Location value cannot be resolved (the failure is logged).
 */
private String resolveRedirectUrl(Header[] headers, UURI base,
        String uriStr, String arcPath) {
    if (headers == null) {
        return "-";
    }
    for (int i = 0; i < headers.length; i++) {
        // HTTP header field names are case-insensitive (RFC 2616, 4.2);
        // calling on the constant also tolerates a null header name.
        if (!WaybackConstants.LOCATION_HTTP_HEADER.equalsIgnoreCase(
                headers[i].getName())) {
            continue;
        }
        String locationStr = headers[i].getValue();
        // TODO: "Location" is supposed to be absolute:
        // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
        // (section 14.30) but Content-Location can be relative.
        // Is it correct to resolve a relative Location, as we are?
        // It's also possible to have both in the HTTP headers...
        // should we prefer one over the other?
        // Right now, we're ignoring "Content-Location".
        try {
            UURI uriRedirect = UURIFactory.getInstance(base, locationStr);
            return uriRedirect.getEscapedURI();
        } catch (URIException e) {
            LOGGER.info("Bad Location: " + locationStr
                    + " for " + uriStr + " in "
                    + arcPath + " Skipped");
        }
        // Only the first Location header is considered, even if it was bad.
        break;
    }
    return "-";
}