// Source tap: binary scheme over FetchedDatum.FIELDS, located at the test input path.
BasePath inputPath = platform.makePath("build/test/ParserPipeTest/in");
Tap in = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), inputPath);
// Sink tap: binary scheme over ParsedDatum.FIELDS, created with SinkMode.REPLACE.
BasePath outputPath = platform.makePath("build/test/ParserPipeTest/out");
Tap out = platform.makeTap(platform.makeBinaryScheme(ParsedDatum.FIELDS), outputPath, SinkMode.REPLACE);
// Collector used below to fill the source tap with FetchedDatum tuples.
TupleEntryCollector write = in.openForWrite(platform.makeFlowProcess());
// Fill the source tap from a sample ARC file. At most 'max' records are examined;
// only records whose URL parses and uses the "http" protocol are turned into
// FetchedDatum tuples. validRecords counts the tuples that were written.
ArchiveReader archiveReader = ArchiveReaderFactory.get("src/test/resources/someHtml.arc");
Iterator<ArchiveRecord> iterator = archiveReader.iterator();
int max = 300;
int count = 0;
int validRecords = 0;
try {
    while (count++ < max && iterator.hasNext()) {
        ArchiveRecord archiveRecord = iterator.next();
        ArchiveRecordHeader header = archiveRecord.getHeader();
        String url = header.getUrl();
        String protocol = "";
        try {
            protocol = new URL(url).getProtocol();
        } catch (MalformedURLException e) {
            // Unparseable URL: protocol stays empty, so the record is skipped below.
        }
        if (!"http".equals(protocol)) {
            continue;
        }

        validRecords += 1;
        int contentOffset = header.getContentBegin();
        long totalLength = header.getLength();
        // Subtract in long arithmetic first, then narrow once.
        int contentLength = (int) (totalLength - contentOffset);
        archiveRecord.skip(contentOffset);

        // A single read() may return fewer bytes than requested, so keep reading
        // until the buffer is full or the record ends.
        byte[] content = new byte[contentLength];
        int filled = 0;
        while (filled < contentLength) {
            int justRead = archiveRecord.read(content, filled, contentLength - filled);
            if (justRead < 0) {
                break;
            }
            filled += justRead;
        }
        if (filled < contentLength) {
            // Record ended early: keep only the bytes actually read instead of
            // passing zero padding on to the parser.
            content = java.util.Arrays.copyOf(content, filled);
        }

        String mimetype = header.getMimetype();
        // The Arc headers != HTTP headers, but it's at least some data we can jam
        // into the FetchedDatum as a test. Note that the Arc headers will have value
        // types other than a long, so we have to do the conversion.
        HttpHeaders headers = new HttpHeaders();
        Set<String> keys = header.getHeaderFieldKeys();
        for (String key : keys) {
            String value = header.getHeaderValue(key).toString();
            headers.add(key, value);
        }
        FetchedDatum contentTuple = new FetchedDatum(url, url, System.currentTimeMillis(), headers, new ContentBytes(content), mimetype, 0);
        write.add(contentTuple.getTuple());
    }
} finally {
    // Release the collector and the archive reader even if reading a record fails.
    try {
        write.close();
    } finally {
        archiveReader.close();
    }
}
// Wire the source tap to the sink tap through parserPipe and run the resulting flow.
FlowConnector flowConnector = platform.makeFlowConnector();
Flow flow = flowConnector.connect(in, out, parserPipe);
// NOTE(review): complete() presumably blocks until the flow has finished — confirm.
flow.complete();
// Currently many of the docs fail parsing: