Package org.archive.io

Examples of org.archive.io.ArchiveRecord


        Iterator<ArchiveRecord> iterator = archiveReader.iterator();
        int max = 300;
        int count = 0;
        int validRecords = 0;
        while (count++ < max && iterator.hasNext()) {
            ArchiveRecord archiveRecord = iterator.next();
            ArchiveRecordHeader header = archiveRecord.getHeader();
            String url = header.getUrl();

            String protocol = "";
            try {
                protocol = new URL(url).getProtocol();
            } catch (MalformedURLException e) {
                // Ignore and skip
            }

            if (protocol.equals("http")) {
                validRecords += 1;
                int contentOffset = header.getContentBegin();
                long totalLength = header.getLength();
                int contentLength = (int) totalLength - contentOffset;

                archiveRecord.skip(contentOffset);
                byte[] content = new byte[contentLength];
                archiveRecord.read(content);

                String mimetype = header.getMimetype();
                // The Arc headers != HTTP headers, but it's at least some data we can jam
                // into the FetchedDatum as a test. Note that the Arc headers will have value
                // types other than a long, so we have to do the conversion.
View Full Code Here


            }
        }
        reader.close();
       
        reader = ARCReaderFactory.get(arcFile, offset);
        ArchiveRecord ar = reader.get();
        assertEquals(ar.getHeader().getUrl(), url);
        ar.close();
        reader.close();
       
        // Get reader again.  See how iterator works with offset
        reader = ARCReaderFactory.get(arcFile, offset);
        int count = 0;
View Full Code Here

        File arcFile = writeRecords("writeRecord", true,
            DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
        ARCReader reader = ARCReaderFactory.get(arcFile);
        Iterator<ArchiveRecord> it = reader.iterator();
        while (it.hasNext()) {
            ArchiveRecord next = it.next();
            next.close();
        }
        reader.close();
    }
View Full Code Here

        ArchiveReader reader = null;
        try {
            reader = ArchiveReaderFactory.
                get(new URL("file:////" + arc.getAbsolutePath()));
            for (Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();) {
                ArchiveRecord r = (ArchiveRecord)i.next();
                assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype()));
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
View Full Code Here

        File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
        ArchiveReader reader = null;
        try {
            reader = ArchiveReaderFactory.get(arc.getAbsoluteFile());
            for (Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();) {
                ArchiveRecord r = (ArchiveRecord)i.next();
                assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype()));
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
View Full Code Here

        File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
        ArchiveReader reader = null;
        try {
            reader = ArchiveReaderFactory.get(arc.getAbsoluteFile().getAbsolutePath());
            for (Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();) {
                ArchiveRecord r = (ArchiveRecord)i.next();
                assertTrue("mime unread",StringUtils.isNotBlank(r.getHeader().getMimetype()));
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
View Full Code Here

            String[] warcs = warcDirs.get(0).list();
            assertEquals(1, warcs.length);
            WARCReader warcReader = WARCReaderFactory.get(new File(warcDirs.get(0), warcs[0]));
            Iterator<ArchiveRecord> recordIterator = warcReader.iterator();
           
            ArchiveRecord record = recordIterator.next();
            assertEquals(WARCRecordType.warcinfo.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
           
            assertTrue(recordIterator.hasNext());
            record = recordIterator.next();
            assertEquals(WARCRecordType.response.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
            assertEquals("141", record.getHeader().getHeaderValue(CONTENT_LENGTH));
            assertEquals(expectedDigest, record.getHeader().getHeaderValue(HEADER_KEY_PAYLOAD_DIGEST));
            assertEquals(curi1.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));
            assertEquals(payloadRecordIdWithBrackets, record.getHeader().getHeaderValue(HEADER_KEY_ID));
           
            assertTrue(recordIterator.hasNext());
            record = recordIterator.next();
            assertEquals(WARCRecordType.request.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
            assertEquals(curi1.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));
            assertEquals(payloadRecordIdWithBrackets, record.getHeader().getHeaderValue(HEADER_KEY_CONCURRENT_TO));
           
            assertTrue(recordIterator.hasNext());
            record = recordIterator.next();
            assertEquals(WARCRecordType.metadata.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
            assertEquals(curi1.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));
            assertEquals(payloadRecordIdWithBrackets, record.getHeader().getHeaderValue(HEADER_KEY_CONCURRENT_TO));
           
            // the all-important revisit record
            assertTrue(recordIterator.hasNext());
            record = recordIterator.next();
            assertEquals(WARCRecordType.revisit.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
            assertEquals(curi2.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));
            assertEquals(payloadRecordIdWithBrackets, record.getHeader().getHeaderValue(HEADER_KEY_REFERS_TO));
            assertEquals(NAMED_FIELD_TRUNCATED_VALUE_LENGTH, record.getHeader().getHeaderValue(HEADER_KEY_TRUNCATED));
            assertEquals(HTTP_RESPONSE_MIMETYPE, record.getHeader().getHeaderValue(CONTENT_TYPE));
            assertEquals(expectedDigest, record.getHeader().getHeaderValue(HEADER_KEY_PAYLOAD_DIGEST));
            assertEquals(PROFILE_REVISIT_IDENTICAL_DIGEST,
                    record.getHeader().getHeaderValue(HEADER_KEY_PROFILE));
            assertEquals(curi1.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_REFERS_TO_TARGET_URI));
            assertEquals(historyStore().store.get(expectedDigest).get(A_ORIGINAL_DATE),
                    record.getHeader().getHeaderValue(HEADER_KEY_REFERS_TO_DATE));
            assertNull(record.getHeader().getHeaderValue(HEADER_KEY_REFERS_TO_FILENAME));
            assertNull(record.getHeader().getHeaderValue(HEADER_KEY_REFERS_TO_FILE_OFFSET));

            assertTrue(recordIterator.hasNext());
            record = recordIterator.next();
            assertEquals(WARCRecordType.request.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
            assertEquals(curi2.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));
           
            assertTrue(recordIterator.hasNext());
            record = recordIterator.next();
            assertEquals(WARCRecordType.metadata.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE));
            assertEquals(curi2.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI));

            assertFalse(recordIterator.hasNext());
           
        } finally {
            warcWriter.stop();
View Full Code Here

        // and the previous.
       
        for (int i = headers.size() - 1; i >= 0; i--) {
            reader = WARCReaderFactory.get(f);
            ArchiveRecordHeader h = (ArchiveRecordHeader)headers.get(i);
            ArchiveRecord r = reader.get(h.getOffset());
            String mimeType = r.getHeader().getMimetype();
            assertTrue("Record is bogus",
                mimeType != null && mimeType.length() > 0);
            reader.close();
        }
       
        assertTrue("Metadatas not equal", headers.size() == recordCount);
        for (Iterator<ArchiveRecordHeader> i = headers.iterator(); i.hasNext();) {
            ArchiveRecordHeader r = (ArchiveRecordHeader)i.next();
            assertTrue("Record is empty", r.getLength() > 0);
        }
    }
View Full Code Here

            }
        }
        reader.close();
       
        reader = WARCReaderFactory.get(f, offset);
        ArchiveRecord ar = reader.get();
        assertEquals(ar.getHeader().getUrl(), url);
        ar.close();
        reader.close();
       
        // Get reader again.  See how iterator works with offset
        reader = WARCReaderFactory.get(f, offset);
        int count = 0;
View Full Code Here

   
    protected int iterateRecords(WARCReader r)
    throws IOException {
        int count = 0;
        for (Iterator<ArchiveRecord> i = r.iterator(); i.hasNext();) {
            ArchiveRecord ar = i.next();
            ar.close();
            if (count != 0) {
                assertTrue("Unexpected URL " + ar.getHeader().getUrl(),
                    ar.getHeader().getUrl().equals(SOME_URL));
            }
            count++;
        }
        return count;
    }
View Full Code Here

TOP

Related Classes of org.archive.io.ArchiveRecord

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.