Source Code of org.archive.io.arc.ARCWriterTest

/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/

package org.archive.io.arc;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.NullInputStream;
import org.apache.commons.io.output.NullOutputStream;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.io.WriterPoolSettings;
import org.archive.util.ArchiveUtils;
import org.archive.util.TmpDirTestCase;

import com.google.common.io.Closeables;


/**
* Test ARCWriter class.
*
* This code exercises ARCWriter AND ARCReader.  First it writes ARCs w/
* ARCWriter.  Then it validates what was written w/ ARCReader.
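*
* <p>A minimal write-then-read sketch of the pattern these tests follow
* (here {@code settings} is assumed to be a configured WriterPoolSettings
* and {@code content} an HTTP response held as bytes):
* <pre>{@code
* ARCWriter w = new ARCWriter(new AtomicInteger(), settings);
* w.write("http://www.example.com/", "text/html", "127.0.0.1",
*     System.currentTimeMillis(), content.length,
*     new ByteArrayInputStream(content));
* w.close();
* ARCReader r = ARCReaderFactory.get(w.getFile());
* for (Iterator<ArchiveRecord> i = r.iterator(); i.hasNext();) {
*     i.next().close();
* }
* r.close();
* }</pre>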
*
* @author stack
*/
public class ARCWriterTest
extends TmpDirTestCase implements ARCConstants {
    /**
     * Utility class for writing bad ARCs (with trailing junk)
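     * setEndJunk() supplies bytes that postWriteRecordTasks() appends after
     * each record written, simulating an ARC with trailing garbage.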
     */
    public class CorruptibleARCWriter extends ARCWriter {
        byte[] endJunk = null;

        public CorruptibleARCWriter(AtomicInteger serial_no, WriterPoolSettings settings) {
            super(serial_no, settings);
        }

        @Override
        protected void postWriteRecordTasks() throws IOException {
            if (endJunk != null) {
                this.write(endJunk);
            }
            super.postWriteRecordTasks();
        }

        public void setEndJunk(byte[] b) throws IOException {
            this.endJunk = b;
        }
    }

    /**
     * Suffix to use for ARC files made by JUNIT.
     */
    private static final String SUFFIX = "JUNIT";
   
    private static final String SOME_URL = "http://www.archive.org/test/";

   
    private static final AtomicInteger SERIAL_NO = new AtomicInteger();

    /*
     * @see TestCase#setUp()
     */
    protected void setUp() throws Exception {
        super.setUp();
    }

    /*
     * @see TestCase#tearDown()
     */
    protected void tearDown() throws Exception {
        super.tearDown();
    }
   
    protected static String getContent() {
        return getContent(null);
    }
   
    protected static String getContent(String indexStr) {
        String page = (indexStr != null)? "Page #" + indexStr: "Some Page";
        return "HTTP/1.1 200 OK\r\n" +
        "Content-Type: text/html\r\n\r\n" +
        "<html><head><title>" + page +
        "</title></head>" +
        "<body>" + page +
        "</body></html>";
    }
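    // For reference, getContent("1") above yields a minimal HTTP response:
    //
    //   HTTP/1.1 200 OK
    //   Content-Type: text/html
    //
    //   <html><head><title>Page #1</title></head><body>Page #1</body></html>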

    @SuppressWarnings("deprecation")
    protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index)
    throws IOException {
        String indexStr = Integer.toString(index);
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // Timestamp the record with the current 14-digit date (RFC 2540 format).
        String now = ArchiveUtils.get14DigitDate();
        int recordLength = 0;
        byte[] record = (getContent(indexStr)).getBytes();
        recordLength += record.length;
        baos.write(record);
        // Add the newline between records back in
        baos.write("\n".getBytes());
        recordLength += 1;
        arcWriter.write("http://www.one.net/id=" + indexStr, "text/html",
            "0.1.2.3", Long.parseLong(now), recordLength, baos);
        return recordLength;
    }

    private File writeRecords(String baseName, boolean compress,
        long maxSize, int recordCount)
    throws IOException {
        cleanUpOldFiles(baseName);
        File [] files = {getTmpDir()};
        ARCWriter arcWriter =
            new ARCWriter(
                SERIAL_NO,
                new WriterPoolSettingsData(
                        baseName,
                        "${prefix}-"+SUFFIX,
                        maxSize,
                        compress,
                        Arrays.asList(files),
                        null));
        assertNotNull(arcWriter);
        for (int i = 0; i < recordCount; i++) {
            writeRandomHTTPRecord(arcWriter, i);
        }
        arcWriter.close();
        assertTrue("Doesn't exist: " +
                arcWriter.getFile().getAbsolutePath(),
            arcWriter.getFile().exists());
        return arcWriter.getFile();
    }

    private void validate(File arcFile, int recordCount)
    throws FileNotFoundException, IOException {
        ARCReader reader = ARCReaderFactory.get(arcFile);
        assertNotNull(reader);
        List<ArchiveRecordHeader> metaDatas = null;
        if (recordCount == -1) {
            metaDatas = reader.validate();
        } else {
            metaDatas = reader.validate(recordCount);
        }
        reader.close();
        // Now run through each of the records doing an absolute get, going
        // from the end to the start.  Reopen the ARC each time so no state
        // carries over from the previous get.
       
        for (int i = metaDatas.size() - 1; i >= 0; i--) {
            reader = ARCReaderFactory.get(arcFile);
            ARCRecordMetaData meta = (ARCRecordMetaData)metaDatas.get(i);
            ArchiveRecord r = reader.get(meta.getOffset());
            String mimeType = r.getHeader().getMimetype();
            assertTrue("Record is bogus",
                mimeType != null && mimeType.length() > 0);
            reader.close();
        }
        assertEquals("Metadata count not as expected",recordCount, metaDatas.size());
        for (Iterator<ArchiveRecordHeader> i = metaDatas.iterator(); i.hasNext();) {
                ARCRecordMetaData r = (ARCRecordMetaData)i.next();
                assertTrue("Record is empty", r.getLength() > 0);
        }
    }

    public void testCheckARCFileSize()
    throws IOException {
        runCheckARCFileSizeTest("checkARCFileSize", false);
    }

    public void testCheckARCFileSizeCompressed()
    throws IOException {
        runCheckARCFileSizeTest("checkARCFileSize", true);
    }

    public void testWriteRecord() throws IOException {
        final int recordCount = 2;
        File arcFile = writeRecords("writeRecord", false,
                DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
        validate(arcFile, recordCount + 1); // Header record.
    }
   
    public void testRandomAccess() throws IOException {
        final int recordCount = 3;
        File arcFile = writeRecords("writeRecord", true,
            DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
        ARCReader reader = ARCReaderFactory.get(arcFile);
        // Get to second record.  Get its offset for later use.
        boolean readFirst = false;
        String url = null;
        long offset = -1;
        long totalRecords = 0;
        boolean readSecond = false;
        for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext(); totalRecords++) {
            ARCRecord ar = (ARCRecord)i.next();
            if (!readFirst) {
                readFirst = true;
                continue;
            }
            if (!readSecond) {
                url = ar.getMetaData().getUrl();
                offset = ar.getMetaData().getOffset();
                readSecond = true;
            }
        }
        reader.close();
       
        reader = ARCReaderFactory.get(arcFile, offset);
        ArchiveRecord ar = reader.get();
        assertEquals(ar.getHeader().getUrl(), url);
        ar.close();
        reader.close();
       
        // Get reader again.  See how iterator works with offset
        reader = ARCReaderFactory.get(arcFile, offset);
        int count = 0;
        for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext(); i.next()) {
            count++;
        }
        reader.close();
        assertEquals(totalRecords - 1, count);
    }

    public void testWriteRecordCompressed() throws IOException {
        final int recordCount = 2;
        File arcFile = writeRecords("writeRecordCompressed", true,
                DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
        validate(arcFile, recordCount + 1 /*Header record*/);
    }
   
    public void testWriteGiantRecord() throws IOException {
        PrintStream dummyStream = new PrintStream(new NullOutputStream());
        ARCWriter arcWriter =
            new ARCWriter(
                    SERIAL_NO,
                    dummyStream,
                    new File("dummy"),
                    new WriterPoolSettingsData(
                            "",
                            "",
                            -1,
                            false,
                            null,
                            null));
        assertNotNull(arcWriter);

        // Timestamp the record with the current time in milliseconds.
        long now = System.currentTimeMillis();
        long recordLength = org.apache.commons.io.FileUtils.ONE_GB * 3;
      
        arcWriter.write("dummy:uri", "application/octet-stream",
            "0.1.2.3", now, recordLength, new NullInputStream(recordLength));
        arcWriter.close();
    }
   
    private void runCheckARCFileSizeTest(String baseName, boolean compress)
    throws FileNotFoundException, IOException  {
        File f = writeRecords(baseName, compress, 1024, 15);
        validate(f, 15+1);
    }
   
    protected CorruptibleARCWriter createARCWriter(String name, boolean compress) {
        File [] files = {getTmpDir()};
        return new CorruptibleARCWriter(
                    SERIAL_NO,
                    new WriterPoolSettingsData(
                            name,
                            "${prefix}-"+SUFFIX,
                            DEFAULT_MAX_ARC_FILE_SIZE,
                            compress,
                            Arrays.asList(files),
                            null));
    }
   
    protected static ByteArrayInputStream getBais(String str)
    throws IOException {
        return new ByteArrayInputStream(str.getBytes());
    }
   
    /**
     * Writes a record, suppressing normal length-checks (so that
     * intentionally malformed records may be written).
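     * For example, the lengthTooShort and lengthTooLong helpers below rely on
     * this to declare lengths that deliberately disagree with the bytes
     * actually supplied.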
     */
    protected static void writeRecord(ARCWriter writer, String url,
        String type, int len, ByteArrayInputStream bais)
    throws IOException {
        writer.write(url, type, "192.168.1.1", (new Date()).getTime(), len,
            bais, false);
    }
   
    protected int iterateRecords(ARCReader r)
    throws IOException {
        int count = 0;
        for (Iterator<ArchiveRecord> i = r.iterator(); i.hasNext();) {
            ARCRecord rec = (ARCRecord)i.next();
            rec.close();
            if (count != 0) {
                assertTrue("Unexpected URL " + rec.getMetaData().getUrl(),
                    rec.getMetaData().getUrl().startsWith(SOME_URL));
            }
            count++;
        }
        return count;
    }
   
    protected CorruptibleARCWriter createArcWithOneRecord(String name,
        boolean compressed)
    throws IOException {
        CorruptibleARCWriter writer = createARCWriter(name, compressed);
        String content = getContent();
        writeRecord(writer, SOME_URL, "text/html",
            content.length(), getBais(content));
        return writer;
    }
   
    public void testSpaceInURL() {
        String eMessage = null;
        try {
            holeyUrl("testSpaceInURL", false, " ");
        } catch (IOException e) {
            eMessage = e.getMessage();
        }
        assertTrue("Didn't get expected exception: " + eMessage,
            eMessage.startsWith("Metadata line doesn't match"));
    }

    public void testTabInURL() {       
        String eMessage = null;
        try {
            holeyUrl("testTabInURL", false, "\t");
        } catch (IOException e) {
            eMessage = e.getMessage();
        }
        assertTrue("Didn't get expected exception: " + eMessage,
            eMessage.startsWith("Metadata line doesn't match"));
    }
   
    protected void holeyUrl(String name, boolean compress, String urlInsert)
    throws IOException {
        ARCWriter writer = null;
        try {
            writer = createArcWithOneRecord(name, compress);
            // Write a record whose URL contains the problematic character.
            String content = getContent();
            writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html",
                content.length(), getBais(content));
        } finally {
            Closeables.close(writer, true);
        }
    }
   
// If uncompressed, length has to be right or parse will fail.
//
//    public void testLengthTooShort() throws IOException {
//        lengthTooShort("testLengthTooShort-" + PREFIX, false);
//    }
   
    public void testLengthTooShortCompressed() throws IOException {
        lengthTooShort("testLengthTooShortCompressed", true, false);
    }
   
    public void testLengthTooShortCompressedStrict()
    throws IOException {     
        String eMessage = null;
        try {
            lengthTooShort("testLengthTooShortCompressedStrict",
                true, true);
        } catch (RuntimeException e) {
            eMessage = e.getMessage();
        }
        assertTrue("Didn't get expected exception: " + eMessage,
            eMessage.startsWith("java.io.IOException: Record STARTING at"));
    }
    
    protected void lengthTooShort(String name, boolean compress, boolean strict)
    throws IOException {
        CorruptibleARCWriter writer = null;
        try {
            writer = createArcWithOneRecord(name, compress);
            // Add some bytes on the end to mess up the record.
            String content = getContent();
            ByteArrayInputStream bais = getBais(content+"SOME TRAILING BYTES");
            writeRecord(writer, SOME_URL, "text/html",
                content.length(), bais);
            writer.setEndJunk("SOME TRAILING BYTES".getBytes());
            writeRecord(writer, SOME_URL, "text/html",
                content.length(), getBais(content));
        } finally {
            Closeables.close(writer, true);
        }
       
        // Catch System.err into a byte stream.
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        PrintStream origErr = System.err;
        ARCReader r = null;
        try {
            System.setErr(new PrintStream(os));
           
            r = ARCReaderFactory.get(writer.getFile());
            r.setStrict(strict);
            int count = iterateRecords(r);
            assertTrue("Count wrong " + count, count == 4);
   
            // Make sure we get the warning string which complains about the
            // trailing bytes.
            String err = os.toString();
            assertTrue("No message " + err, err.startsWith("WARNING") &&
                (err.indexOf("Record STARTING at") > 0));
            r.close();
        } finally {
            Closeables.close(r, true);
            System.setErr(origErr);
        }
    }
   
//  If uncompressed, length has to be right or parse will fail.
//
//    public void testLengthTooLong()
//    throws IOException {
//        lengthTooLong("testLengthTooLongCompressed-" + PREFIX,
//            false, false);
//    }
   
    public void testLengthTooLongCompressed()
    throws IOException {
        lengthTooLong("testLengthTooLongCompressed",
            true, false);
    }
   
    public void testLengthTooLongCompressedStrict() {
        String eMessage = null;
        try {
            lengthTooLong("testLengthTooLongCompressed",
                true, true);
        } catch (IOException e) {
            eMessage = e.getMessage();
        }
        assertTrue("Didn't get expected exception: " + eMessage,
            eMessage.startsWith("Premature EOF before end-of-record"));
    }
   
    protected void lengthTooLong(String name, boolean compress,
            boolean strict)
    throws IOException {
        ARCWriter writer = createArcWithOneRecord(name, compress);
        // Add a record with a length that is too long.
        String content = getContent();
        writeRecord(writer, SOME_URL+"2", "text/html",
            content.length() + 10, getBais(content));
        writeRecord(writer, SOME_URL+"3", "text/html",
            content.length(), getBais(content));
        writer.close();
       
        // Catch System.err.
        ByteArrayOutputStream os = new ByteArrayOutputStream();
       
        PrintStream origErr = System.err;
        ARCReader r = null;
        try {
            System.setErr(new PrintStream(os));
           
            r = ARCReaderFactory.get(writer.getFile());
            r.setStrict(strict);
            int count = iterateRecords(r);
            assertTrue("Count wrong " + count, count == 4);
           
            // Make sure we get the warning string that complains about
            // premature EOF before end-of-record.
            String err = os.toString();
            assertTrue("No message " + err,
                err.startsWith("WARNING Premature EOF before end-of-record"));
        } finally {
            Closeables.close(r, true);
            System.setErr(origErr);
        }
    }
   
    public void testGapError() throws IOException {
      ARCWriter writer = createArcWithOneRecord("testGapError", true);
        String content = getContent();
        // Make a 'weird' RIS that returns bad 'remaining' length
        // awhen remaining should be 0
        ReplayInputStream ris = new ReplayInputStream(content.getBytes(),
                content.length(), null) {
            public long remaining() {
                return (super.remaining()==0) ? -1 : super.remaining();
            }
        };
        String message = null;
        try {
        writer.write(SOME_URL, "text/html", "192.168.1.1",
            (new Date()).getTime(), content.length(), ris);
        } catch (IOException e) {
            message = e.getMessage();
        } finally {
            IOUtils.closeQuietly(ris);
        }
        writer.close();
        assertTrue("No gap when should be",
            message != null &&
            message.indexOf("Gap between expected and actual") >= 0);
    }
   
    /**
     * Write an arc file for other tests to use.
     * @param arcdir Directory to write to.
     * @param compress True if file should be compressed.
     * @return ARC written.
     * @throws IOException
     */
    public static File createARCFile(File arcdir, boolean compress)
    throws IOException {
        File [] files = {arcdir};
        ARCWriter writer = new ARCWriter(SERIAL_NO,
                new WriterPoolSettingsData(
                        "",
                        "test",
                        DEFAULT_MAX_ARC_FILE_SIZE,
                        compress,
                        Arrays.asList(files),
                        null));
        String content = getContent();
        writeRecord(writer, SOME_URL, "text/html", content.length(),
            getBais(content));
        writer.close();
        return writer.getFile();
    }
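    // Hypothetical use of createARCFile from another test (names illustrative
    // only), assuming a writable temp directory:
    //
    //   File arc = ARCWriterTest.createARCFile(tmpDir, true);
    //   ARCReader reader = ARCReaderFactory.get(arc);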
   
//    public void testSpeed() throws IOException {
//        ARCWriter writer = createArcWithOneRecord("speed", true);
//        // Add a record with a length that is too long.
//        String content = getContent();
//        final int count = 100000;
//        logger.info("Starting speed write of " + count + " records.");
//        for (int i = 0; i < count; i++) {
//            writeRecord(writer, SOME_URL, "text/html", content.length(),
//                    getBais(content));
//        }
//        writer.close();
//        logger.info("Finished speed write test.");
//    }
   
   
    public void testValidateMetaLine() throws Exception {
        final String line = "http://www.aandw.net/images/walden2.png " +
            "128.197.34.86 20060111174224 image/png 2160";
        ARCWriter w = createARCWriter("testValidateMetaLine", true);
        try {
            w.validateMetaLine(line);
            w.validateMetaLine(line + LINE_SEPARATOR);
            w.validateMetaLine(line + "\\r\\n");
        } finally {
            w.close();
        }
    }
   
    public void testArcRecordOffsetReads() throws Exception {
        ARCReader r = getSingleRecordReader("testArcRecordInBufferStream");
        ARCRecord ar = getSingleRecord(r);
        // Now try getting some random set of bytes out of it
        // at an odd offset (used to fail because we were
        // doing bad math to find where in buffer to read).
        final byte[] buffer = new byte[17];
        final int maxRead = 4;
        int totalRead = 0;
        while (totalRead < maxRead) {
            totalRead = totalRead
                + ar.read(buffer, 13 + totalRead, maxRead - totalRead);
            assertTrue(totalRead > 0);
        }
        r.close();
    }
   
    // available should always be >= 0; extra read()s should all give EOF
    public void testArchiveRecordAvailableConsistent() throws Exception {
        // first test reading byte-at-a-time via no-param read()
        ARCReader r = getSingleRecordReader("testArchiveRecordAvailableConsistent");
        ARCRecord record = getSingleRecord(r);
        int c = record.read();
        while(c>=0) {
            c = record.read();
        }
        // consecutive reads after EOR should always give -1 and still report a non-negative available()
        for (int i=0; i<5; i++) {
            assertTrue("available negative:"+record.available(), record.available()>=0);
            assertEquals(-1, record.read());           
        }
        r.close();
    }
   
    // should always give -1 on repeated reads past EOR
    public void testArchiveRecordEORConsistent() throws Exception {
        ARCReader r = getSingleRecordReader("testArchiveRecordEORConsistent");
        ARCRecord record = getSingleRecord(r);
        this.readToEOS(record);
        // consecutive reads after EOR should always give -1
        for (int i=0; i<5; i++) {
            assertEquals(-1, record.read(new byte[1]));           
        }
        r.close();
    }
   
    // should not throw premature EOF when wrapped with BufferedInputStream
    // [HER-1450] showed this was the case using Apache Tika
    public void testArchiveRecordMarkSupport() throws Exception {
        ARCReader r = getSingleRecordReader("testArchiveRecordMarkSupport");
        ARCRecord record = getSingleRecord(r);
        record.setStrict(true);
        // ensure mark support
        InputStream stream = new BufferedInputStream(record);
        if (stream.markSupported()) {
            for (int i=0; i<3; i++) {
                this.readToEOS(stream);
                stream.mark(stream.available());
                stream.reset();
            }
            stream.close();
        }
        r.close();
    }
   
    /**
     * Test a particular style of using the reader iterator. (Should
     * possibly be on a reader-centric test class, but the best setup
     * functionality is here.)
     *
     * @throws IOException
     */
    public void testReadIterator() throws IOException {
        final int recordCount = 3;
        File arcFile = writeRecords("writeRecord", true,
            DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
        ARCReader reader = ARCReaderFactory.get(arcFile);
        Iterator<ArchiveRecord> it = reader.iterator();
        while (it.hasNext()) {
            ArchiveRecord next = it.next();
            next.close();
        }
        reader.close();
    }

    protected void readToEOS(InputStream in) throws Exception {
        byte [] buf = new byte[1024];
        int read = 0;
        while (read >= 0) {
            read = in.read(buf);
            // System.out.println("readToEOS read " + read + " bytes");
        }
    }
   
    protected ARCReader getSingleRecordReader(String name) throws Exception {
        // Get an ARC with one record.
        WriterPoolMember w = createArcWithOneRecord(name, true);
        w.close();
        // Get reader on said ARC.
        ARCReader r = ARCReaderFactory.get(w.getFile());
        return r;
    }
   
    protected ARCRecord getSingleRecord(ARCReader r) {
        final Iterator<ArchiveRecord> i = r.iterator();
        // Skip first ARC meta record.
        i.next();
        i.hasNext();
        // Now we're at first and only record in ARC.
        return (ARCRecord) i.next();
    }
}