Package net.sf.samtools.util

Source Code of net.sf.samtools.util.BlockCompressedInputStream

/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package net.sf.samtools.util;


import net.sf.samtools.FileTruncatedException;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.util.Arrays;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.net.URL;

/*
* Utility class for reading BGZF block compressed files.  The caller can treat this file like any other InputStream.
* It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering.
* The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the
* entire file up to the location being sought.  Note that seeking is only possible if the ctor(File) is used.
*
* c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format
*/
public class BlockCompressedInputStream
        extends InputStream
{

    private InputStream mStream = null;
    private SeekableStream mFile = null;
    private byte[] mFileBuffer = null;
    private byte[] mCurrentBlock = null;
    private int mCurrentOffset = 0;
    private long mBlockAddress = 0;
    private int mLastBlockLength = 0;
    private final BlockGunzipper blockGunzipper = new BlockGunzipper();


    /**
     * Note that seek() is not supported if this ctor is used.
     */
    public BlockCompressedInputStream(final InputStream stream) {
        mStream = IOUtil.toBufferedStream(stream);
        mFile = null;
    }

    /**
     * Use this ctor if you wish to call seek()
     */
    public BlockCompressedInputStream(final File file)
        throws IOException {
        mFile = new SeekableFileStream(file);
        mStream = null;

    }

    public BlockCompressedInputStream(final URL url) {
        mFile = new SeekableBufferedStream(new SeekableHTTPStream(url));
        //mFile = new SeekableHTTPStream(url);
        mStream = null;
    }

    /**
     * @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the
     * next caller of a method for this input stream. The next caller might be the same thread or another thread.
     */
    public int available()
        throws IOException {
        if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) {
            readBlock();
        }
        if (mCurrentBlock == null) {
            return 0;
        }
        return mCurrentBlock.length - mCurrentOffset;
    }

    /**
     * Closes the underlying InputStream or RandomAccessFile
     */
    public void close()
        throws IOException {
        if (mFile != null) {
            mFile.close();
            mFile = null;
        } else if (mStream != null) {
            mStream.close();
            mStream = null;
        }
        // Encourage garbage collection
        mFileBuffer = null;
        mCurrentBlock = null;
    }

    /**
     * Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255.
     * If no byte is available because the end of the stream has been reached, the value -1 is returned.
     * This method blocks until input data is available, the end of the stream is detected, or an exception is thrown.

     * @return the next byte of data, or -1 if the end of the stream is reached.
     */
    public int read()
        throws IOException {
        return (available() > 0) ? mCurrentBlock[mCurrentOffset++] : -1;
    }

    /**
     * Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes
     * actually read is returned as an integer. This method blocks until input data is available, end of file is detected,
     * or an exception is thrown.
     *
     * read(buf) has the same effect as read(buf, 0, buf.length).
     *
     * @param buffer the buffer into which the data is read.
     * @return the total number of bytes read into the buffer, or -1 is there is no more data because the end of
     * the stream has been reached.
     */
    public int read(final byte[] buffer)
        throws IOException {
        return read(buffer, 0, buffer.length);
    }

    /**
     * Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read
     * as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer.
     *
     * This method blocks until input data is available, end of file is detected, or an exception is thrown.
     *
     * @param buffer buffer into which data is read.
     * @param offset the start offset in array b  at which the data is written.
     * @param length the maximum number of bytes to read.
     * @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
     * the stream has been reached.
     */
    public int read(final byte[] buffer, int offset, int length)
        throws IOException {
        final int originalLength = length;
        while (length > 0) {
            final int available = available();
            if (available == 0) {
                // Signal EOF to caller
                if (originalLength == length) {
                    return -1;
                }
                break;
            }
            final int copyLength = Math.min(length, available);
            System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength);
            mCurrentOffset += copyLength;
            offset += copyLength;
            length -= copyLength;
        }
        return originalLength - length;
    }

    /**
     * Seek to the given position in the file.  Note that pos is a special virtual file pointer,
     * not an actual byte offset.
     *
     * @param pos virtual file pointer
     */
    public void seek(final long pos)
        throws IOException {
        if (mFile == null) {
            throw new IOException("Cannot seek on stream based file");
        }
        // Decode virtual file pointer
        // Upper 48 bits is the byte offset into the compressed stream of a block.
        // Lower 16 bits is the byte offset into the uncompressed stream inside the block.
        final long compressedOffset = pos >> 16;
        final int uncompressedOffset = (int) (pos & 0xFFFF);
        final int available;
        if (mBlockAddress == compressedOffset && mCurrentBlock != null) {
            available = mCurrentBlock.length;
        } else {
            mFile.seek(compressedOffset);
            mBlockAddress = compressedOffset;
            mLastBlockLength = 0;
            readBlock();
            available = available();
        }
        if (uncompressedOffset > available ||
                (uncompressedOffset == available && !eof())) {
            throw new IOException("Invalid file pointer: " + pos);
        }
        mCurrentOffset = uncompressedOffset;
    }

    private boolean eof() throws IOException {
        if (mFile.eof()) {
            return true;
        }
        // If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF.
        return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
    }

    /**
     * @return virtual file pointer that can be passed to seek() to return to the current position.  This is
     * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
     * the two.
     */
    public long getFilePointer() {
        if (mCurrentOffset == mCurrentBlock.length) {
            // If current offset is at the end of the current block, file pointer should point
            // to the beginning of the next block.
            return (mBlockAddress + mLastBlockLength) << 16;
        }
        return ((mBlockAddress << 16) | mCurrentOffset);
    }

    /**
     * @param stream Must be at start of file.  Throws RuntimeException if !stream.markSupported().
     * @return true if the given file looks like a valid BGZF file.
     */
    public static boolean isValidFile(final InputStream stream)
        throws IOException {
        if (!stream.markSupported()) {
            throw new RuntimeException("Cannot test non-buffered stream");
        }
        stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
        final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH];
        final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
        stream.reset();
        return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer);
    }

    private static boolean isValidBlockHeader(final byte[] buffer) {
        return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 &&
                (buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 &&
                (buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 &&
                buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN &&
                buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 &&
                buffer[13] == BlockCompressedStreamConstants.BGZF_ID2);
    }

    private void readBlock()
        throws IOException {

        if (mFileBuffer == null) {
            mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE];
        }
        int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
        if (count == 0) {
            // Handle case where there is no empty gzip block at end.
            mCurrentOffset = 0;
            mBlockAddress += mLastBlockLength;
            mCurrentBlock = new byte[0];
            return;
        }
        if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) {
            throw new IOException("Premature end of file");
        }
        final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1;
        if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) {
            throw new IOException("Unexpected compressed block length: " + blockLength);
        }
        final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH;
        count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining);
        if (count != remaining) {
            throw new FileTruncatedException("Premature end of file");
        }
        inflateBlock(mFileBuffer, blockLength);
        mCurrentOffset = 0;
        mBlockAddress += mLastBlockLength;
        mLastBlockLength = blockLength;
    }

    private void inflateBlock(final byte[] compressedBlock, final int compressedLength)
        throws IOException {
        final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4);
        byte[] buffer = mCurrentBlock;
        mCurrentBlock = null;
        if (buffer == null || buffer.length != uncompressedLength) {
            buffer = new byte[uncompressedLength];
        }
        blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength);
        mCurrentBlock = buffer;
    }

    private int readBytes(final byte[] buffer, final int offset, final int length)
        throws IOException {
        if (mFile != null) {
            return readBytes(mFile, buffer, offset, length);
        } else if (mStream != null) {
            return readBytes(mStream, buffer, offset, length);
        } else {
            return 0;
        }
    }

    private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length)
        throws IOException {
        int bytesRead = 0;
        while (bytesRead < length) {
            final int count = file.read(buffer, offset + bytesRead, length - bytesRead);
            if (count <= 0) {
                break;
            }
            bytesRead += count;
        }
        return bytesRead;
    }

    private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length)
        throws IOException {
        int bytesRead = 0;
        while (bytesRead < length) {
            final int count = stream.read(buffer, offset + bytesRead, length - bytesRead);
            if (count <= 0) {
                break;
            }
            bytesRead += count;
        }
        return bytesRead;
    }

    private int unpackInt16(final byte[] buffer, final int offset) {
        return ((buffer[offset] & 0xFF) |
                ((buffer[offset+1] & 0xFF) << 8));
    }

    private int unpackInt32(final byte[] buffer, final int offset) {
        return ((buffer[offset] & 0xFF) |
                ((buffer[offset+1] & 0xFF) << 8) |
                ((buffer[offset+2] & 0xFF) << 16) |
                ((buffer[offset+3] & 0xFF) << 24));
    }

    public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE}

    public static FileTermination checkTermination(final File file)
        throws IOException {
        final long fileSize = file.length();
        if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) {
            return FileTermination.DEFECTIVE;
        }
        final RandomAccessFile raFile = new RandomAccessFile(file, "r");
        try {
            raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
            byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length];
            raFile.readFully(buf);
            if (Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) {
                return FileTermination.HAS_TERMINATOR_BLOCK;
            }
            final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE);
            buf = new byte[bufsize];
            raFile.seek(fileSize - bufsize);
            raFile.read(buf);
            for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length;
                    i >= 0; --i) {
                if (!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE,
                        buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) {
                    continue;
                }
                final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4);
                byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
                final int totalBlockSizeMinusOne =  byteBuffer.getShort() & 0xFFFF;
                if (buf.length - i == totalBlockSizeMinusOne + 1) {
                    return FileTermination.HAS_HEALTHY_LAST_BLOCK;
                } else {
                    return FileTermination.DEFECTIVE;
                }
            }
            return FileTermination.DEFECTIVE;
        } finally {
            raFile.close();
        }
    }

    private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) {
        for (int i = 0; i < length; ++i) {
            if (preamble[i] != buf[i + startOffset]) {
                return false;
            }
        }
        return true;
    }
}

TOP

Related Classes of net.sf.samtools.util.BlockCompressedInputStream

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.