Package org.jwat.gzip

Source Code of org.jwat.gzip.GzipReader

/**
* Java Web Archive Toolkit - Software to read and validate ARC, WARC
* and GZip files. (http://jwat.org/)
* Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.jwat.gzip;

import org.jwat.common.ByteCountingPushBackInputStream;
import org.jwat.common.Diagnosis;
import org.jwat.common.DiagnosisType;
import org.jwat.common.Diagnostics;
import org.jwat.common.ISO8859_1;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
import java.util.zip.CRC32;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;

/**
* A reader for (multi-part) GZip files. Validates headers and CRCs.
* Entries are read sequentially from the input stream. Random access is
* supported indirectly if the used input stream supports this.
* Compressed data is available through an uncompressing input stream wrapper.
*
* @author nicl
*/
public class GzipReader implements Closeable {

    /**
     * Maximum number of bytes requested per read while the entry input
     * stream skips over uncompressed data.
     */
    public static final int SKIP_READ_BUFFER_SIZE = 8192;

    /**
     * Scratch buffer receiving the uncompressed bytes that the entry input
     * stream discards in its skip and close methods.
     */
    protected byte[] skip_read_buffer = new byte[SKIP_READ_BUFFER_SIZE];

    /**
     * Default input buffer size. Both constructors pass it to the push back
     * stream and use it as the length of the inflater input buffer.
     */
    public static final int DEFAULT_INPUT_BUFFER_SIZE = 8192;

    /** Input stream of GZip (multi-part) file; set to null by close(). */
    protected ByteCountingPushBackInputStream pbin;
    /**
     * Inflater used to uncompress GZip entries. Created in "nowrap" mode so
     * it handles raw deflate data; the GZip header and trailer are parsed by
     * this reader. Reset for every entry and set to null by close().
     */
    protected Inflater inf = new Inflater( true );
    /**
     * CRC32 accumulator. It is first fed the header bytes, whose low 16 bits
     * give the computed CRC16, then reset and fed the uncompressed data to
     * produce the computed CRC32.
     */
    protected CRC32 crc = new CRC32();
    /**
     * Number of bytes placed in the input buffer by the most recent read;
     * reset to 0 for each new entry. Used to work out which surplus bytes to
     * push back once the inflater has finished.
     */
    protected int lastInput;
    /** Input buffer used to feed the inflater. */
    protected byte[] inputBytes;

    /** ISO-8859-1 validating de-/encoder used for the FNAME and FCOMMENT fields. */
    protected final ISO8859_1 iso8859_1 = new ISO8859_1();

    /**
     * Compliance status for records parsed up to now; cleared as soon as an
     * entry or the file itself yields an error or a warning.
     */
    protected boolean bIsCompliant = true;

    /** Validation errors and warnings that concern the file rather than one entry. */
    public final Diagnostics<Diagnosis> diagnostics = new Diagnostics<Diagnosis>();

    /** Number of entries whose header has been read completely. */
    protected int entries = 0;

    /**
     * Offset of the most recently read entry header, or -1 if none has been
     * read. Set when a header is read; close() overwrites it with the stream
     * position reached after closing an entry that was still open.
     */
    protected long startOffset = -1;

    /**
     * Number of bytes consumed by this reader; grows by the size of an entry
     * each time that entry's trailer has been read.
     */
    protected long consumed;

    /** Current GZip entry object, or null when no entry is open. */
    protected GzipEntry gzipEntry;

    /**
     * Partial GZip entry which could not be completely read because the
     * stream ended inside its header.
     */
    public GzipEntry partialEntry;

    /** Buffer used to read the fixed 10 byte header.  */
    protected byte[] headerBytes = new byte[10];
    /** Buffer used to read the XLEN value. */
    protected byte[] xlenBytes = new byte[2];
    /** Buffer holding the raw FNAME data, without its zero terminator. */
    protected byte[] fnameBytes;
    /** Buffer holding the raw FCOMMENT data, without its zero terminator. */
    protected byte[] fcommentBytes;
    /** Buffer used to read the CRC16 value. */
    protected byte[] crc16Bytes = new byte[2];
    /** Buffer used to read the 8 byte trailer (CRC32 followed by ISIZE). */
    protected byte[] trailerBytes = new byte[8];

    /**
     * Check head of <code>PushBackInputStream</code> for a GZip magic number.
     * The state of the <code>PushBackInputStream</code> is the same after the
     * call as before the call.
     * @param pbin <code>PushBackInputStream</code> with GZip entries
     * @return boolean indicating presence of a GZip magic number
     * @throws IOException if an i/o error occurs while examining head of stream
     */
    public static boolean isGzipped(ByteCountingPushBackInputStream pbin) throws IOException {
        if (pbin == null) {
            throw new IllegalArgumentException("'pbin'is null!");
        }
        byte[] magicBytes = new byte[2];
        int magicNumber = 0xdeadbeef;
        // Look for the leading 2 magic bytes in front of every valid GZip entry.
        int read = pbin.readFully(magicBytes);
        if (read == 2) {
            magicNumber = ((magicBytes[1] & 255) << 8) | (magicBytes[0] & 255);
        }
        if (read > 0) {
            pbin.unread(magicBytes, 0, read);
        }
        return (magicNumber == GzipConstants.GZIP_MAGIC);
    }

    /**
     * Construct a GZip reader with a default input buffer size of
     * DEFAULT_INPUT_BUFFER_SIZE.
     * @param in input stream of GZip file
     */
    public GzipReader(InputStream in) {
        if (in == null) {
            throw new IllegalArgumentException("in is null!");
        }
        pbin = new ByteCountingPushBackInputStream(in, DEFAULT_INPUT_BUFFER_SIZE);
        inputBytes = new byte[DEFAULT_INPUT_BUFFER_SIZE];
    }

    /**
     * Construct a GZip reader with the specified input buffer size.
     * @param in input stream of GZip file
     * @param buffer_size input buffer size to use
     */
    public GzipReader(InputStream in, int buffer_size) {
        if (in == null) {
            throw new IllegalArgumentException("in is null!");
        }
        if (buffer_size <= 0) {
            throw new IllegalArgumentException(
                    "buffer_size is less or equals to zero: " + buffer_size);
        }
        in = new BufferedInputStream(in, buffer_size);
        pbin = new ByteCountingPushBackInputStream(in, DEFAULT_INPUT_BUFFER_SIZE);
        inputBytes = new byte[DEFAULT_INPUT_BUFFER_SIZE];
    }

    /**
     * Release resources associated with this reader.
     * Also closes the <code>InputStream</code> supplied to the constructor.
     * @throws IOException if an i/o error occurs while closing reader
     */
    public void close() throws IOException {
        if (gzipEntry != null) {
            gzipEntry.close();
            startOffset = pbin.getConsumed();
            gzipEntry = null;
        }
        if (inf != null) {
            inf.end();
            inf = null;
        }
        if (pbin != null){
            pbin.close();
        }
        pbin = null;
    }

    /**
     * Returns a boolean indicating whether all entries parsed so far are compliant.
     * @return a boolean indicating whether all entries parsed so far are compliant
     */
    public boolean isCompliant() {
        return bIsCompliant;
    }

    /**
     * Returns the offset of the current entry or -1 if none have been read.
     * @return the offset of the current entry or -1
     */
    public long getStartOffset() {
        return startOffset;
    }

    /**
     * Returns the current offset in the input stream. Which could be anywhere
     * in the (multi-part) GZip file.
     * @return current offset in the input stream
     */
    public long getOffset() {
        if (pbin != null) {
            return pbin.getConsumed();
        } else {
            return consumed;
        }
    }

    /**
     * Get number of bytes consumed by this reader.
     * Updated after each record has been read.
     * @return number of bytes consumed by this reader
     */
    public long getConsumed() {
        return consumed;
    }

    /**
     * Get the next GZip entry header and prepare the compressed data for
     * input stream retrieval.
     * @return GZip entry or null
     * @throws IOException if an i/o error occurs while reading entry
     */
    public GzipEntry getNextEntry() throws IOException {
        if (gzipEntry != null) {
            gzipEntry.close();
            gzipEntry = null;
        }
        int read = pbin.readFully(headerBytes);
        if (read == 10) {
            try {
                crc.reset();
                inf.reset();
                startOffset = pbin.getConsumed() - 10;
                gzipEntry = new GzipEntry();
                gzipEntry.reader = this;
                gzipEntry.startOffset = startOffset;
                /*
                 * Header.
                 */
                gzipEntry.magic = ((headerBytes[1] & 255) << 8) | (headerBytes[0] & 255);
                gzipEntry.cm = (short)(headerBytes[2] & 255);
                gzipEntry.flg = (short)(headerBytes[3] & 255);
                gzipEntry.mtime = ((headerBytes[7] & 255) << 24) | ((headerBytes[6] & 255) << 16) | ((headerBytes[5] & 255) << 8) | (headerBytes[4] & 255);
                gzipEntry.date = (gzipEntry.mtime != 0) ? new Date(gzipEntry.mtime * 1000) : null;
                gzipEntry.xfl = (short)(headerBytes[8] & 255);
                gzipEntry.os = (short)(headerBytes[9] & 255);
                crc.update(headerBytes);
                if (gzipEntry.magic != GzipConstants.GZIP_MAGIC) {
                    gzipEntry.diagnostics.addError(
                            new Diagnosis(
                                    DiagnosisType.INVALID_EXPECTED,
                                    "Magic Value",
                                    Integer.toHexString(gzipEntry.magic),
                                    Integer.toHexString(GzipConstants.GZIP_MAGIC)
                                )
                            );
                }
                if (gzipEntry.cm != GzipConstants.CM_DEFLATE) {
                    // Currently only the deflate compression method is supported in GZip.
                    gzipEntry.diagnostics.addError(
                            new Diagnosis(
                                    DiagnosisType.INVALID_EXPECTED,
                                    "Compression Method",
                                    Integer.toHexString(gzipEntry.cm),
                                    Integer.toHexString(GzipConstants.CM_DEFLATE)
                                )
                            );
                } else {
                    // Currently only the deflate compression method is supported in GZip.
                    // Check to see whether some xfl reserved bits have been used.
                    if ((gzipEntry.xfl & GzipConstants.DEFLATE_XLF_RESERVED) != 0) {
                        gzipEntry.diagnostics.addWarning(
                                new Diagnosis(
                                        DiagnosisType.RESERVED,
                                        "eXtra FLags",
                                        Integer.toHexString(gzipEntry.xfl & GzipConstants.DEFLATE_XLF_RESERVED)
                                    )
                                );
                    }
                    if ((gzipEntry.xfl & GzipConstants.DEFLATE_XFL_COMPRESSION_MASK) == GzipConstants.DEFLATE_XFL_COMPRESSION_MASK) {
                        gzipEntry.diagnostics.addError(
                                new Diagnosis(
                                        DiagnosisType.INVALID_DATA,
                                        "eXtra FLags",
                                        Integer.toHexString(gzipEntry.xfl & GzipConstants.DEFLATE_XFL_COMPRESSION_MASK)
                                    )
                                );
                    }
                }
                if ((gzipEntry.flg & GzipConstants.FLG_FRESERVED) != 0) {
                    gzipEntry.diagnostics.addWarning(
                            new Diagnosis(
                                    DiagnosisType.RESERVED,
                                    "FLaGs",
                                    Integer.toHexString(gzipEntry.flg & GzipConstants.FLG_FRESERVED)
                                )
                            );
                }
                if (!GzipConstants.osIdxStr.containsKey((int)gzipEntry.os)) {
                    gzipEntry.diagnostics.addWarning(
                            new Diagnosis(
                                    DiagnosisType.UNKNOWN,
                                    "Operating System",
                                    Integer.toString(gzipEntry.os)
                        )
                    );
                }
                /*
                 * FTEXT.
                 */
                if ((gzipEntry.flg & GzipConstants.FLG_FTEXT) == GzipConstants.FLG_FTEXT) {
                    gzipEntry.bFText = true;
                }
                /*
                 * FEXTRA.
                 */
                if ((gzipEntry.flg & GzipConstants.FLG_FEXTRA) == GzipConstants.FLG_FEXTRA) {
                    read = pbin.read(xlenBytes);
                    if (read == 2) {
                        gzipEntry.xlen = ((xlenBytes[1] & 255) << 8) | (xlenBytes[0] & 255);
                        if (gzipEntry.xlen > 0) {
                            gzipEntry.extraBytes = new byte[gzipEntry.xlen];
                            read = pbin.readFully(gzipEntry.extraBytes);
                            if (read != gzipEntry.xlen) {
                                throw new EOFException("Unexpected EOF!");
                            }
                            int idx = 0;
                            boolean b = true;
                            GzipExtraData extraData;
                            int len;
                            while (b) {
                                if (idx <= gzipEntry.extraBytes.length - 4) {
                                    extraData = new GzipExtraData();
                                    extraData.si1 = (byte)(gzipEntry.extraBytes[idx++] & 255);
                                    extraData.si2 = (byte)(gzipEntry.extraBytes[idx++] & 255);
                                    len = ((gzipEntry.extraBytes[idx+1] & 255) << 8) | (gzipEntry.extraBytes[idx] & 255);
                                    idx += 2;
                                    if (idx + len <= gzipEntry.extraBytes.length) {
                                        extraData.data = new byte[len];
                                        System.arraycopy(gzipEntry.extraBytes, idx, extraData.data, 0, len);
                                        idx += len;
                                        gzipEntry.extraData.add(extraData);
                                    } else {
                                        b = false;
                                    }
                                } else {
                                    b = false;
                                }
                            }
                            if (idx != gzipEntry.extraBytes.length) {
                                gzipEntry.diagnostics.addError(
                                        new Diagnosis(
                                                DiagnosisType.INVALID_DATA,
                                                "FEXTRA",
                                                "Invalid structure",
                                                "Data truncated"
                                            )
                                        );
                            }
                        } else {
                            gzipEntry.extraBytes = new byte[0];
                        }
                    } else {
                        throw new EOFException("Unexpected EOF!");
                    }
                    crc.update(xlenBytes);
                    crc.update(gzipEntry.extraBytes);
                }
                /*
                 * FNAME.
                 */
                if ((gzipEntry.flg & GzipConstants.FLG_FNAME) == GzipConstants.FLG_FNAME) {
                    fnameBytes = readZeroTerminated();
                    if (fnameBytes == null) {
                        throw new EOFException("Unexpected EOF!");
                    }
                    if (!iso8859_1.decode(fnameBytes, "")) {
                        gzipEntry.diagnostics.addWarning(
                                new Diagnosis(
                                        DiagnosisType.INVALID_ENCODING,
                                        "FName",
                                        iso8859_1.decoded,
                                        "ISO-8859-1"
                                    )
                                );
                    }
                    gzipEntry.fname = iso8859_1.decoded;
                    crc.update(fnameBytes);
                    crc.update(0);
                }
                /*
                 * FCOMMENT.
                 */
                if ((gzipEntry.flg & GzipConstants.FLG_FCOMMENT) == GzipConstants.FLG_FCOMMENT) {
                    fcommentBytes = readZeroTerminated();
                    if (fcommentBytes == null) {
                        throw new EOFException("Unexpected EOF!");
                    }
                    if (!iso8859_1.decode(fcommentBytes, "\n")) {
                        gzipEntry.diagnostics.addWarning(
                                new Diagnosis(
                                        DiagnosisType.INVALID_ENCODING,
                                        "FComment",
                                        iso8859_1.decoded,
                                        "ISO-8859-1"
                                    )
                                );
                    }
                    gzipEntry.fcomment = iso8859_1.decoded;
                    crc.update(fcommentBytes);
                    crc.update(0);
                }
                /*
                 * FHCRC.
                 */
                if ((gzipEntry.flg & GzipConstants.FLG_FHCRC) == GzipConstants.FLG_FHCRC) {
                    read = pbin.read(crc16Bytes);
                    if (read == 2) {
                        gzipEntry.bFhCrc = true;
                        gzipEntry.crc16 = ((crc16Bytes[1] & 255) << 8) | (crc16Bytes[0] & 255);
                    } else {
                        throw new EOFException("Unexpected EOF!");
                    }
                }
                /*
                 * Computed crc16.
                 */
                gzipEntry.comp_crc16 = ((int)crc.getValue()) & 0x0000ffff;
                crc.reset();
                if (gzipEntry.crc16 != null && gzipEntry.crc16 != gzipEntry.comp_crc16) {
                    gzipEntry.diagnostics.addError(
                            new Diagnosis(
                                    DiagnosisType.INVALID_EXPECTED,
                                    "CRC16",
                                    Integer.toHexString(gzipEntry.crc16),
                                    Integer.toHexString(gzipEntry.comp_crc16)
                                )
                            );
                }
                /*
                 * Prepare Entry InputStream.
                 */
                lastInput = 0;
                gzipEntry.in = new GzipEntryInputStream(this, gzipEntry);
                // Compliance
                if (gzipEntry.diagnostics.hasErrors() || gzipEntry.diagnostics.hasWarnings()) {
                    gzipEntry.bIsCompliant = false;
                } else {
                    gzipEntry.bIsCompliant = true;
                }
                bIsCompliant &= gzipEntry.bIsCompliant;
                ++entries;
            } catch (EOFException e) {
                partialEntry = gzipEntry;
                diagnostics.addError(new Diagnosis(DiagnosisType.INVALID_DATA, "GZip file", "Unexpected EOF!"));
                bIsCompliant = false;
                gzipEntry = null;
            }
        } else {
            // Require one or more entries to be present.
            if (entries == 0) {
                diagnostics.addError(new Diagnosis(DiagnosisType.ERROR_EXPECTED, "GZip file", "One or more records"));
                bIsCompliant = false;
            }
            if (pbin.read() != -1) {
                diagnostics.addError(new Diagnosis(DiagnosisType.INVALID_DATA, "GZip file", "Unexpected trailing data!"));
                bIsCompliant = false;
            }
        }
        return gzipEntry;
    }

    /**
     * Read non-compressed zero terminated data.
     * @return zero terminated data
     * @throws IOException if an i/o error occurs while reading data
     */
    protected byte[] readZeroTerminated() throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream(32);
        int b;
        while ((b = pbin.read()) > 0) {
            out.write(b);
        }
        return (b != -1) ? out.toByteArray() : null;
    }

    /**
     * Read the non-compressed trailing 8 bytes with CRC32 and ISize values.
     * @param entry Gzip entry
     * @throws IOException if an i/o error occurs while reading data
     */
    protected void readTrailer(GzipEntry entry) throws IOException {
        int read = pbin.readFully(trailerBytes);
        entry.consumed = pbin.getConsumed() - entry.startOffset;
        entry.compressed_size = inf.getBytesRead();
        entry.uncompressed_size = inf.getBytesWritten();
        consumed += entry.consumed;
        entry.reader = null;
        if (read == 8) {
            entry.crc32 = ((trailerBytes[3] & 255) << 24) | ((trailerBytes[2] & 255) << 16) | ((trailerBytes[1] & 255) << 8) | (trailerBytes[0] & 255);
            entry.isize = ((trailerBytes[7] & 255) << 24) | ((trailerBytes[6] & 255) << 16) | ((trailerBytes[5] & 255) << 8) | (trailerBytes[4] & 255);
            entry.comp_crc32 = (int)(crc.getValue() & 0xffffffff);
            entry.comp_isize = (int)(inf.getBytesWritten() & 0xffffffff);
            if (entry.comp_crc32 != entry.crc32) {
                entry.diagnostics.addError(
                        new Diagnosis(
                                DiagnosisType.INVALID_EXPECTED,
                                "CRC32",
                                Integer.toHexString(entry.crc32),
                                Integer.toHexString(entry.comp_crc32)
                            )
                        );
            }
            if (entry.comp_isize != entry.isize) {
                entry.diagnostics.addError(
                        new Diagnosis(
                                DiagnosisType.INVALID_EXPECTED,
                                "ISize",
                                Long.toString(entry.isize),
                                Long.toString(entry.comp_isize)
                            )
                        );
            }
        } else {
            gzipEntry.diagnostics.addError(new Diagnosis(DiagnosisType.INVALID_DATA, "GZip file", "Unexpected EOF!"));
            bIsCompliant = false;
        }
        // Compliance
        if (gzipEntry.diagnostics.hasErrors() || gzipEntry.diagnostics.hasWarnings()) {
            gzipEntry.bIsCompliant = false;
        } else {
            gzipEntry.bIsCompliant = true;
        }
        bIsCompliant &= gzipEntry.bIsCompliant;
    }

    /**
     * Read and uncompress data into a buffer.
     * @param b destination buffer for uncompressed data
     * @param off offset in buffer
     * @param len length of uncompressed data to read
     * @return number of bytes uncompressed or -1
     * @throws DataFormatException if the compressed data is in an unknown format
     * @throws IOException if an i/o error occurs while reading data
     */
    protected int readInflated(byte[] b, int off, int len)
                                    throws DataFormatException, IOException {
        int inflated = 0;
        while((inflated = inf.inflate(b, off, len)) == 0) {
            if (inf.finished()) {
                return -1;
            } else if (inf.needsDictionary()) {
                gzipEntry.diagnostics.addError(new Diagnosis(DiagnosisType.INVALID_DATA, "GZip file", "Unexpected EOF!"));
                bIsCompliant = false;
                throw new DataFormatException("Dictionary needed!");
            } else if (inf.needsInput()) {
                lastInput = pbin.read(inputBytes, 0, inputBytes.length);
                if (lastInput == -1) {
                    gzipEntry.diagnostics.addError(new Diagnosis(DiagnosisType.INVALID_DATA, "GZip file", "Unexpected EOF!"));
                    bIsCompliant = false;
                    throw new DataFormatException("Data missing!");
                }
                inf.setInput(inputBytes, 0, lastInput);
            } else {
                gzipEntry.diagnostics.addError(new Diagnosis(DiagnosisType.INVALID_DATA, "GZip file", "Unexpected EOF!"));
                bIsCompliant = false;
                throw new DataFormatException("Inflater malfunction!");
            }
        }
        return inflated;
    }

    /**
     * <code>InputStream</code> to expose GZip'ed data in a controlled fashion.
     *
     * @author nicl
     */
    protected static class GzipEntryInputStream extends InputStream {

        /** GZip reader used to inflate. */
        GzipReader reader;

        /** Associated GZip entry. */
        GzipEntry gzipEntry;

        /** End of uncompressed file status. */
        boolean bEof = false;

        /** Small buffer used by the read() method. */
        byte[] singleByteArray = new byte[1];

        /**
         * Construct input stream bound to a specific reader and entry.
         * @param reader GZip reader
         * @param gzipEntry GZip entry
         */
        public GzipEntryInputStream(GzipReader reader,
                                    GzipEntry gzipEntry) {
            this.reader = reader;
            this.gzipEntry = gzipEntry;
        }

        @Override
        public void close() throws IOException {
            while (!bEof) {
                skip(reader.skip_read_buffer.length);
            }
            reader = null;
            gzipEntry = null;
            singleByteArray = null;
            bEof = true;
        }

        @Override
        public int available() throws IOException {
            return !bEof ? 1 : 0;
        }

        @Override
        public int read() throws IOException {
            return read(singleByteArray, 0, 1) != -1 ? (singleByteArray[0] & 255) : -1;
        }

        @Override
        public int read(byte[] b) throws IOException {
            return read(b, 0, b.length);
        }

        @Override
        public int read(byte[] b, int off, int len) throws IOException {
            if (bEof) {
                return -1;
            }
            int read;
            try {
                read = reader.readInflated(b, off, len);
            } catch (DataFormatException e) {
                gzipEntry.diagnostics.addError(new Diagnosis(DiagnosisType.INVALID_DATA, "GZip file", "Unexpected EOF!"));
                reader.bIsCompliant = false;
                throw new IOException(e);
            }
            if (read != -1) {
                reader.crc.update(b, off, read);
            }
            else {
                int remaining = reader.inf.getRemaining();
                if (remaining > reader.lastInput) {
                    throw new IOException("Remaining larger than lastInput!");
                }
                reader.pbin.unread(reader.inputBytes, reader.lastInput - remaining,
                                   remaining);
                bEof = true;
                reader.readTrailer(gzipEntry);
            }
            return read;
        }

        @Override
        public long skip(long n) throws IOException {
            if (bEof) {
                return 0;
            }
            long remaining = n;
            long skipped = 0;
            long readLast = 0;
            while (remaining > 0 && readLast != -1) {
                remaining -= readLast;
                skipped += readLast;
                if (remaining > 0) {
                    readLast = read(reader.skip_read_buffer, 0,
                            (int) Math.min(remaining, SKIP_READ_BUFFER_SIZE));
                }
            }
            return skipped;
        }

        @Override
        public boolean markSupported() {
            return false;
        }

        @Override
        public synchronized void mark(int readlimit) {
        }

        @Override
        public synchronized void reset() throws IOException {
            throw new UnsupportedOperationException();
        }

    }

}
TOP

Related Classes of org.jwat.gzip.GzipReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.