assertTrue(historyStore().store.isEmpty());
Server server = newHttpServer();
FetchHTTP fetcher = FetchHTTPTests.newTestFetchHttp(getClass().getName());
WARCWriterProcessor warcWriter = WARCWriterProcessorTest.newTestWarcWriter(getClass().getName());
warcWriter.setServerCache(fetcher.getServerCache());
for (File dir: warcWriter.calcOutputDirs()) {
/* make sure we don't have other stuff hanging around that will
* confuse the warc reader checks later */
FileUtils.deleteDirectory(dir);
}
try {
server.start();
warcWriter.start();
fetcher.start();
CrawlURI curi1 = makeCrawlURI("http://127.0.0.1:7777/url1");
CrawlURI curi2 = makeCrawlURI("http://127.0.0.1:7777/url2");
final String expectedDigest = "sha1:TQ5R6YVOZLTQENRIIENVGXHOPX3YCRNJ";
fetcher.process(curi1);
assertEquals(200, curi1.getFetchStatus());
assertEquals(141, curi1.getContentSize());
assertEquals(expectedDigest, curi1.getContentDigestSchemeString());
assertFalse(curi1.hasContentDigestHistory());
loader().process(curi1);
assertTrue(curi1.hasContentDigestHistory());
assertTrue(curi1.getContentDigestHistory().isEmpty());
warcWriter.process(curi1);
assertEquals(curi1.getUURI().toString(), curi1.getContentDigestHistory().get(A_ORIGINAL_URL));
assertEquals(1, curi1.getContentDigestHistory().get(A_CONTENT_DIGEST_COUNT));
String report = warcWriter.report();
assertTrue(report.contains("Total CrawlURIs: 1\n"));
assertTrue(report.contains("Revisit records: 0\n"));
storer().process(curi1);
assertEquals(1, historyStore().store.size());
assertNotNull(historyStore().store.get(expectedDigest));
assertEquals(curi1.getUURI().toString(), historyStore().store.get(expectedDigest).get(A_ORIGINAL_URL));
assertEquals(1, historyStore().store.get(expectedDigest).get(A_CONTENT_DIGEST_COUNT));
fetcher.process(curi2);
// Check the result of the fetch that just ran (curi2), not curi1 again:
// curi1 was already verified right after its own fetch, so re-asserting on
// it here would pass even if the curi2 fetch had failed.
assertEquals(200, curi2.getFetchStatus());
assertEquals(141, curi2.getContentSize());
assertEquals(expectedDigest, curi2.getContentDigestSchemeString());
assertFalse(curi2.hasContentDigestHistory());
loader().process(curi2);
assertTrue(curi2.hasContentDigestHistory());
assertEquals(curi1.getUURI().toString(), curi2.getContentDigestHistory().get(A_ORIGINAL_URL));
assertNotSame(curi2.getUURI().toString(), curi2.getContentDigestHistory().get(A_ORIGINAL_URL));
assertEquals(1, curi2.getContentDigestHistory().get(A_CONTENT_DIGEST_COUNT));
warcWriter.process(curi2);
assertTrue(curi2.getAnnotations().contains("duplicate:digest"));
assertEquals(curi1.getUURI().toString(), curi2.getContentDigestHistory().get(A_ORIGINAL_URL));
assertNotSame(curi2.getUURI().toString(), curi2.getContentDigestHistory().get(A_ORIGINAL_URL));
assertEquals(2, curi2.getContentDigestHistory().get(A_CONTENT_DIGEST_COUNT));
report = warcWriter.report();
assertTrue(report.contains("Total CrawlURIs: 2\n"));
assertTrue(report.contains("Revisit records: 1\n"));
storer().process(curi2);
assertEquals(1, historyStore().store.size());
assertNotNull(historyStore().store.get(expectedDigest));
assertEquals(curi1.getUURI().toString(), historyStore().store.get(expectedDigest).get(A_ORIGINAL_URL));
assertEquals(2, historyStore().store.get(expectedDigest).get(A_CONTENT_DIGEST_COUNT));
warcWriter.stop();
String payloadRecordIdWithBrackets = "<"
+ historyStore().store.get(expectedDigest).get(
A_WARC_RECORD_ID) + ">";
// check the warc records
List<File> warcDirs = warcWriter.calcOutputDirs();
assertEquals(1, warcDirs.size());
String[] warcs = warcDirs.get(0).list();
assertEquals(1, warcs.length);
WARCReader warcReader = WARCReaderFactory.get(new File(warcDirs.get(0), warcs[0]));
Iterator<ArchiveRecord> recordIterator = warcReader.iterator();
ArchiveRecord record = recordIterator.next();
assertEquals(WARCRecordType.warcinfo.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
assertTrue(recordIterator.hasNext());
record = recordIterator.next();
assertEquals(WARCRecordType.response.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
assertEquals("141", record.getHeader().getHeaderValue(CONTENT_LENGTH));
assertEquals(expectedDigest, record.getHeader().getHeaderValue(HEADER_KEY_PAYLOAD_DIGEST));
assertEquals(curi1.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));
assertEquals(payloadRecordIdWithBrackets, record.getHeader().getHeaderValue(HEADER_KEY_ID));
assertTrue(recordIterator.hasNext());
record = recordIterator.next();
assertEquals(WARCRecordType.request.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
assertEquals(curi1.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));
assertEquals(payloadRecordIdWithBrackets, record.getHeader().getHeaderValue(HEADER_KEY_CONCURRENT_TO));
assertTrue(recordIterator.hasNext());
record = recordIterator.next();
assertEquals(WARCRecordType.metadata.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
assertEquals(curi1.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));
assertEquals(payloadRecordIdWithBrackets, record.getHeader().getHeaderValue(HEADER_KEY_CONCURRENT_TO));
// the all-important revisit record
assertTrue(recordIterator.hasNext());
record = recordIterator.next();
assertEquals(WARCRecordType.revisit.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
assertEquals(curi2.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));
assertEquals(payloadRecordIdWithBrackets, record.getHeader().getHeaderValue(HEADER_KEY_REFERS_TO));
assertEquals(NAMED_FIELD_TRUNCATED_VALUE_LENGTH, record.getHeader().getHeaderValue(HEADER_KEY_TRUNCATED));
assertEquals(HTTP_RESPONSE_MIMETYPE, record.getHeader().getHeaderValue(CONTENT_TYPE));
assertEquals(expectedDigest, record.getHeader().getHeaderValue(HEADER_KEY_PAYLOAD_DIGEST));
assertEquals(PROFILE_REVISIT_IDENTICAL_DIGEST,
record.getHeader().getHeaderValue(HEADER_KEY_PROFILE));
assertEquals(curi1.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_REFERS_TO_TARGET_URI));
assertEquals(historyStore().store.get(expectedDigest).get(A_ORIGINAL_DATE),
record.getHeader().getHeaderValue(HEADER_KEY_REFERS_TO_DATE));
assertNull(record.getHeader().getHeaderValue(HEADER_KEY_REFERS_TO_FILENAME));
assertNull(record.getHeader().getHeaderValue(HEADER_KEY_REFERS_TO_FILE_OFFSET));
assertTrue(recordIterator.hasNext());
record = recordIterator.next();
assertEquals(WARCRecordType.request.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
assertEquals(curi2.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));
assertTrue(recordIterator.hasNext());
record = recordIterator.next();
assertEquals(WARCRecordType.metadata.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
assertEquals(curi2.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));
assertFalse(recordIterator.hasNext());
} finally {
warcWriter.stop();
fetcher.stop();
server.stop();
}
}