Package org.archive.modules.writer

Examples of org.archive.modules.writer.WARCWriterProcessor


        assertTrue(historyStore().store.isEmpty());

        Server server = newHttpServer();

        FetchHTTP fetcher = FetchHTTPTests.newTestFetchHttp(getClass().getName());
        WARCWriterProcessor warcWriter = WARCWriterProcessorTest.newTestWarcWriter(getClass().getName());
        warcWriter.setServerCache(fetcher.getServerCache());
        for (File dir: warcWriter.calcOutputDirs()) {
            /* make sure we don't have other stuff hanging around that will
             * confuse the warc reader checks later */
            FileUtils.deleteDirectory(dir);
        }

        try {
            server.start();
            warcWriter.start();
            fetcher.start();

            CrawlURI curi1 = makeCrawlURI("http://127.0.0.1:7777/url1");
            CrawlURI curi2 = makeCrawlURI("http://127.0.0.1:7777/url2");
            final String expectedDigest = "sha1:TQ5R6YVOZLTQENRIIENVGXHOPX3YCRNJ";

            fetcher.process(curi1);
            assertEquals(200, curi1.getFetchStatus());
            assertEquals(141, curi1.getContentSize());
            assertEquals(expectedDigest, curi1.getContentDigestSchemeString());
            assertFalse(curi1.hasContentDigestHistory());

            loader().process(curi1);
            assertTrue(curi1.hasContentDigestHistory());
            assertTrue(curi1.getContentDigestHistory().isEmpty());

            warcWriter.process(curi1);
            assertEquals(curi1.getUURI().toString(), curi1.getContentDigestHistory().get(A_ORIGINAL_URL));
            assertEquals(1, curi1.getContentDigestHistory().get(A_CONTENT_DIGEST_COUNT));
            String report = warcWriter.report();
            assertTrue(report.contains("Total CrawlURIs:   1\n"));
            assertTrue(report.contains("Revisit records:   0\n"));

            storer().process(curi1);
            assertEquals(1, historyStore().store.size());
            assertNotNull(historyStore().store.get(expectedDigest));
            assertEquals(curi1.getUURI().toString(), historyStore().store.get(expectedDigest).get(A_ORIGINAL_URL));
            assertEquals(1, historyStore().store.get(expectedDigest).get(A_CONTENT_DIGEST_COUNT));

            fetcher.process(curi2);
            assertEquals(200, curi1.getFetchStatus());
            assertEquals(141, curi1.getContentSize());
            assertEquals(expectedDigest, curi1.getContentDigestSchemeString());
            assertFalse(curi2.hasContentDigestHistory());

            loader().process(curi2);
            assertTrue(curi2.hasContentDigestHistory());
            assertEquals(curi1.getUURI().toString(), curi2.getContentDigestHistory().get(A_ORIGINAL_URL));
            assertNotSame(curi2.getUURI().toString(), curi2.getContentDigestHistory().get(A_ORIGINAL_URL));
            assertEquals(1, curi2.getContentDigestHistory().get(A_CONTENT_DIGEST_COUNT));

            warcWriter.process(curi2);
            assertTrue(curi2.getAnnotations().contains("duplicate:digest"));
            assertEquals(curi1.getUURI().toString(), curi2.getContentDigestHistory().get(A_ORIGINAL_URL));
            assertNotSame(curi2.getUURI().toString(), curi2.getContentDigestHistory().get(A_ORIGINAL_URL));
            assertEquals(2, curi2.getContentDigestHistory().get(A_CONTENT_DIGEST_COUNT));
            report = warcWriter.report();
            assertTrue(report.contains("Total CrawlURIs:   2\n"));
            assertTrue(report.contains("Revisit records:   1\n"));

            storer().process(curi2);
            assertEquals(1, historyStore().store.size());
            assertNotNull(historyStore().store.get(expectedDigest));
            assertEquals(curi1.getUURI().toString(), historyStore().store.get(expectedDigest).get(A_ORIGINAL_URL));
            assertEquals(2, historyStore().store.get(expectedDigest).get(A_CONTENT_DIGEST_COUNT));

            warcWriter.stop();
           
            String payloadRecordIdWithBrackets = "<"
                    + historyStore().store.get(expectedDigest).get(
                            A_WARC_RECORD_ID) + ">";
           
            // check the warc records
            List<File> warcDirs = warcWriter.calcOutputDirs();
            assertEquals(1, warcDirs.size());
            String[] warcs = warcDirs.get(0).list();
            assertEquals(1, warcs.length);
            WARCReader warcReader = WARCReaderFactory.get(new File(warcDirs.get(0), warcs[0]));
            Iterator<ArchiveRecord> recordIterator = warcReader.iterator();
           
            ArchiveRecord record = recordIterator.next();
            assertEquals(WARCRecordType.warcinfo.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
           
            assertTrue(recordIterator.hasNext());
            record = recordIterator.next();
            assertEquals(WARCRecordType.response.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
            assertEquals("141", record.getHeader().getHeaderValue(CONTENT_LENGTH));
            assertEquals(expectedDigest, record.getHeader().getHeaderValue(HEADER_KEY_PAYLOAD_DIGEST));
            assertEquals(curi1.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));
            assertEquals(payloadRecordIdWithBrackets, record.getHeader().getHeaderValue(HEADER_KEY_ID));
           
            assertTrue(recordIterator.hasNext());
            record = recordIterator.next();
            assertEquals(WARCRecordType.request.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
            assertEquals(curi1.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));
            assertEquals(payloadRecordIdWithBrackets, record.getHeader().getHeaderValue(HEADER_KEY_CONCURRENT_TO));
           
            assertTrue(recordIterator.hasNext());
            record = recordIterator.next();
            assertEquals(WARCRecordType.metadata.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
            assertEquals(curi1.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));
            assertEquals(payloadRecordIdWithBrackets, record.getHeader().getHeaderValue(HEADER_KEY_CONCURRENT_TO));
           
            // the all-important revisit record
            assertTrue(recordIterator.hasNext());
            record = recordIterator.next();
            assertEquals(WARCRecordType.revisit.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
            assertEquals(curi2.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));
            assertEquals(payloadRecordIdWithBrackets, record.getHeader().getHeaderValue(HEADER_KEY_REFERS_TO));
            assertEquals(NAMED_FIELD_TRUNCATED_VALUE_LENGTH, record.getHeader().getHeaderValue(HEADER_KEY_TRUNCATED));
            assertEquals(HTTP_RESPONSE_MIMETYPE, record.getHeader().getHeaderValue(CONTENT_TYPE));
            assertEquals(expectedDigest, record.getHeader().getHeaderValue(HEADER_KEY_PAYLOAD_DIGEST));
            assertEquals(PROFILE_REVISIT_IDENTICAL_DIGEST,
                    record.getHeader().getHeaderValue(HEADER_KEY_PROFILE));
            assertEquals(curi1.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_REFERS_TO_TARGET_URI));
            assertEquals(historyStore().store.get(expectedDigest).get(A_ORIGINAL_DATE),
                    record.getHeader().getHeaderValue(HEADER_KEY_REFERS_TO_DATE));
            assertNull(record.getHeader().getHeaderValue(HEADER_KEY_REFERS_TO_FILENAME));
            assertNull(record.getHeader().getHeaderValue(HEADER_KEY_REFERS_TO_FILE_OFFSET));

            assertTrue(recordIterator.hasNext());
            record = recordIterator.next();
            assertEquals(WARCRecordType.request.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
            assertEquals(curi2.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));
           
            assertTrue(recordIterator.hasNext());
            record = recordIterator.next();
            assertEquals(WARCRecordType.metadata.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
            assertEquals(curi2.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));

            assertFalse(recordIterator.hasNext());
           
        } finally {
            warcWriter.stop();
            fetcher.stop();
            server.stop();
        }
    }
View Full Code Here

TOP

Related Classes of org.archive.modules.writer.WARCWriterProcessor

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.