int max = 300;
int count = 0;
int validRecords = 0;
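// Walk up to 'max' archive records, converting each plain http record into
// a FetchedDatum tuple.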
while (count++ < max && iterator.hasNext()) {
    ArchiveRecord archiveRecord = iterator.next();
    ArchiveRecordHeader header = archiveRecord.getHeader();
    String url = header.getUrl();
    String protocol = "";
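    // Parse the protocol from the record's URL; malformed URLs end up with an
    // empty protocol and get filtered out below.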
    try {
        protocol = new URL(url).getProtocol();
    } catch (MalformedURLException e) {
        // Ignore; the empty protocol causes this record to be skipped below.
    }

    if (protocol.equals("http")) {
        validRecords += 1;
        int contentOffset = header.getContentBegin();
        long totalLength = header.getLength();
        int contentLength = (int) (totalLength - contentOffset);
        archiveRecord.skip(contentOffset);

        // read() isn't guaranteed to fill the buffer in one call, so loop
        // until the buffer is full or the record ends.
        byte[] content = new byte[contentLength];
        int bytesRead = 0;
        while (bytesRead < contentLength) {
            int n = archiveRecord.read(content, bytesRead, contentLength - bytesRead);
            if (n == -1) {
                break;
            }
            bytesRead += n;
        }
        String mimetype = header.getMimetype();

        // The Arc headers != HTTP headers, but it's at least some data we can jam
        // into the FetchedDatum as a test. Note that the Arc header values include
        // types other than String, so we have to do the conversion.
        HttpHeaders headers = new HttpHeaders();
        Set<String> keys = header.getHeaderFieldKeys();
        for (String key : keys) {
            String value = header.getHeaderValue(key).toString();
            headers.add(key, value);
        }
        FetchedDatum contentTuple = new FetchedDatum(url, url, System.currentTimeMillis(),
            headers, new ContentBytes(content), mimetype, 0);
        write.add(contentTuple.getTuple());