transformWarcFilename(header.getReaderIdentifier()));
result.put(WaybackConstants.RESULT_OFFSET,
String.valueOf(header.getOffset()));
String origUrl = header.getUrl();
UURI uri = addUrlDataToSearchResult(result,origUrl);
// Need to parse the document's HTTP status line and headers here:
// WARCReader does not implement this... yet.
byte [] statusBytes = HttpParser.readRawLine(rec);
int eolCharCount = getEolCharsCount(statusBytes);
if (eolCharCount <= 0) {
throw new RecoverableIOException("Failed to read http status where one " +
" was expected: " + new String(statusBytes));
}
String statusLine = EncodingUtil.getString(statusBytes, 0,
statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
if ((statusLine == null) ||
!StatusLine.startsWithHTTP(statusLine)) {
throw new RecoverableIOException("Failed parse of http status line.");
}
StatusLine status = new StatusLine(statusLine);
result.put(WaybackConstants.RESULT_HTTP_CODE,
String.valueOf(status.getStatusCode()));
Header[] headers = HttpParser.parseHeaders(rec,
ARCConstants.DEFAULT_ENCODING);
rec.close();
result.put(WaybackConstants.RESULT_MD5_DIGEST,
transformDigest(header.getHeaderValue(
WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
if (headers != null) {
for (Header httpHeader : headers) {
if (httpHeader.getName().equals(
WaybackConstants.LOCATION_HTTP_HEADER)) {
String locationStr = httpHeader.getValue();
// TODO: "Location" is supposed to be absolute per RFC 2616
// (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html,
// section 14.30), but "Content-Location" can be relative.
// Is it correct to resolve a relative Location against the
// captured URL, as we are doing here? (Note: the later
// RFC 7231, section 7.1.2, does permit a relative reference
// in Location.)
// It's also possible to have both headers in the HTTP
// response... should we prefer one over the other?
// Right now, we're ignoring "Content-Location".
try {
UURI uriRedirect = UURIFactory.getInstance(uri,
locationStr);
result.put(WaybackConstants.RESULT_REDIRECT_URL,
uriRedirect.getEscapedURI());
} catch (URIException e) {
LOGGER.info("Bad Location: " + locationStr
+ " for " + origUrl + " in "
+ header.getReaderIdentifier() + " Skipped");
}