// Read WARC records from the raw input until the first record of type
// "response" is found, feed its content to the parser, then record the
// document URI and title in the metadata.
WarcRecord warcRecord = null;
// NOTE: this wrapper is not closed in this block.
DataInputStream dis = new DataInputStream(rawContent);
// Regardless of what the stream gives us, we read and return
// the first entry which is a response.
WarcHTMLResponseRecord warcResponse = null;
while ((warcRecord = WarcRecord.readNextWarcRecord(dis)) != null) {
    // Ignore records that are not WARC responses. The literal is the
    // receiver of equals() so that a record without a type header is
    // skipped instead of raising a NullPointerException.
    if ("response".equals(warcRecord.getHeaderRecordType())) {
        warcResponse = new WarcHTMLResponseRecord(warcRecord);
        break;
    }
}
if (warcResponse != null) {
    // parse the HTML content (skip HTTP header)
    Reader reader = warcResponse.getContentReader();
    try {
        int len;
        // read() returns a negative value at end of stream, which ends the loop.
        while ((len = reader.read(text, 0, text.length)) >= 0) {
            parser.parse(text, 0, len);
        }
    } finally {
        // Release the reader even when reading or parsing fails.
        reader.close();
    }
    // We use the TrecID for the URI
    metadata.put(MetadataKeys.URI, warcResponse.getTargetTrecID());
    // Set the title
    metadata.put(MetadataKeys.TITLE, textExtractor.title.trim());
    // Flag that a response record was found and processed.
    parsed = true;
}