Source Code of org.broad.igv.tdf.TDFReader$IndexEntry

/*
 * Copyright (c) 2007-2012 The Broad Institute, Inc.
 * SOFTWARE COPYRIGHT NOTICE
 * This software and its documentation are the copyright of the Broad Institute, Inc. All rights are reserved.
 *
 * This software is supplied without any warranty or guaranteed support whatsoever. The Broad Institute is not responsible for its use, misuse, or functionality.
 *
 * This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
 * Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
 */


/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package org.broad.igv.tdf;


import htsjdk.samtools.seekablestream.SeekableStream;
import org.apache.log4j.Logger;
import org.broad.igv.Globals;
import org.broad.igv.exceptions.DataLoadException;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.track.TrackType;
import org.broad.igv.track.WindowFunction;
import org.broad.igv.util.CompressionUtils;
import org.broad.igv.util.ResourceLocator;
import org.broad.igv.util.StringUtils;
import org.broad.igv.util.collections.LRUCache;
import org.broad.igv.util.stream.IGVSeekableStreamFactory;


import java.io.IOException;
import java.nio.BufferUnderflowException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.*;


/**
 * @author jrobinso
 */
public class TDFReader {


    static final Logger log = Logger.getLogger(TDFReader.class);
    public static final int GZIP_FLAG = 0x1;


    private SeekableStream seekableStream = null;
    private int version;
    private Map<String, IndexEntry> datasetIndex;
    private Map<String, IndexEntry> groupIndex;
    private TrackType trackType;
    private String trackLine;
    private String[] trackNames;
    private String genomeId;
    LRUCache<String, TDFGroup> groupCache = new LRUCache(20);
    LRUCache<String, TDFDataset> datasetCache = new LRUCache(20);
    TDFTile wgTile;


    Map<WindowFunction, Double> valueCache = new HashMap();
    private List<WindowFunction> windowFunctions;
    ResourceLocator locator;


    boolean compressed = false;


    Set<String> chrNames;
    private final CompressionUtils compressionUtils;


    //private String path;


    public static TDFReader getReader(String path) {
        return getReader(new ResourceLocator(path));
    }


    public static TDFReader getReader(ResourceLocator locator) {
        return new TDFReader(locator);
    }


    public TDFReader(ResourceLocator locator) {
        //this.path = path;
        this.locator = locator;
        try {
            log.debug("Getting stream");
            seekableStream = IGVSeekableStreamFactory.getInstance().getStreamFor(locator.getPath());
            log.debug("Reading header");
            readHeader();
            log.debug("Done reading header");


        } catch (IOException ex) {
            log.error("Error loading file: " + locator.getPath(), ex);
            throw new DataLoadException("Error loading file: " + ex.toString(), locator.getPath());
        }
        compressionUtils = new CompressionUtils();
    }


    public void close() {
        try {
            seekableStream.close();
        } catch (IOException e) {
            log.error("Error closing reader for: " + getPath(), e);
        }
    }


    public String getPath() {
        return locator.getPath();
    }


    private void readHeader() throws IOException {


        // Buffer for the magic number, version, index position, and index
        // byte count + header byte count  (4 + 4 + 8 + 4 + 4)
        //byte[] buffer = new byte[24];
        //readFully(buffer);
        byte[] buffer = readBytes(0, 24);
        ByteBuffer byteBuffer = ByteBuffer.wrap(buffer);
        byteBuffer.order(ByteOrder.LITTLE_ENDIAN);


        int magicNumber = byteBuffer.getInt();


        byte[] magicBytes = new byte[4];
        System.arraycopy(buffer, 0, magicBytes, 0, 4);
        String magicString = new String(magicBytes);


        if (!(magicString.startsWith("TDF") || !magicString.startsWith("IBF"))) {
            String msg = "Error reading header: bad magic number.";
            //    throw new DataLoadException(msg, locator.getPath());
        }


        version = byteBuffer.getInt();
        long idxPosition = byteBuffer.getLong();
        int idxByteCount = byteBuffer.getInt();
        int nHeaderBytes = byteBuffer.getInt();


        byte[] bytes = readBytes(24, nHeaderBytes);
        byteBuffer = ByteBuffer.wrap(bytes);
        byteBuffer.order(ByteOrder.LITTLE_ENDIAN);


        if (version >= 2) {
            int nWFs = byteBuffer.getInt();
            log.debug("nWFs: " + nWFs);
            this.windowFunctions = new ArrayList(nWFs);
            for (int i = 0; i < nWFs; i++) {
                String wfName = StringUtils.readString(byteBuffer);
                try {
                    windowFunctions.add(WindowFunction.valueOf(wfName));
                } catch (Exception e) {
                    log.error("Error creating window function: " + wfName, e);
                }
            }
        } else {
            windowFunctions = Arrays.asList(WindowFunction.mean);
        }


        // Track type
        try {
            trackType = TrackType.valueOf(StringUtils.readString(byteBuffer));


        } catch (Exception e) {
            trackType = TrackType.OTHER;
        }


        // Track line
        trackLine = StringUtils.readString(byteBuffer).trim();




        int nTracks = byteBuffer.getInt();
        trackNames = new String[nTracks];
        for (int i = 0; i < nTracks; i++) {
            trackNames[i] = StringUtils.readString(byteBuffer);
        }


        // Flags
        if (version > 2) {
            genomeId = StringUtils.readString(byteBuffer);
            int flags = byteBuffer.getInt();
            compressed = (flags & GZIP_FLAG) != 0;
        } else {
            compressed = false;
        }




        readMasterIndex(idxPosition, idxByteCount);


    }




    private void readMasterIndex(long idxPosition, int nBytes) throws IOException {


        try {
//fis.seek(idxPosition);
            //byte[] bytes = new byte[nBytes];
            //readFully(bytes);
            byte[] bytes = readBytes(idxPosition, nBytes);
            ByteBuffer byteBuffer = ByteBuffer.wrap(bytes);
            byteBuffer.order(ByteOrder.LITTLE_ENDIAN);


            int nDatasets = byteBuffer.getInt();


            datasetIndex = new LinkedHashMap(nDatasets);
            for (int i = 0; i < nDatasets; i++) {
                String name = StringUtils.readString(byteBuffer);
                long fPosition = byteBuffer.getLong();
                int n = byteBuffer.getInt();
                datasetIndex.put(name, new IndexEntry(fPosition, n));
            }


            int nGroups = byteBuffer.getInt();
            groupIndex = new LinkedHashMap(nGroups);
            for (int i = 0; i < nGroups; i++) {
                String name = StringUtils.readString(byteBuffer);
                long fPosition = byteBuffer.getLong();
                int n = byteBuffer.getInt();
                groupIndex.put(name, new IndexEntry(fPosition, n));
            }
        } catch (BufferUnderflowException e) {
            // We intermittently see this exception in this method.  Log as much info as possible
            log.error("BufferUnderflowException.  path=" + getPath() + "  idxPosition=" + idxPosition + "  nBytes=" + nBytes);
            throw e;
        }
    }


    public TDFDataset getDataset(String chr, int zoom, WindowFunction windowFunction) {


        // Version 1 only had mean
        String wf = getVersion() < 2 ? "" : "/" + windowFunction.toString();
        String zoomString = chr.equals(Globals.CHR_ALL) ? "0" : String.valueOf(zoom);
        String dsName = "/" + chr + "/z" + zoomString + wf;


        TDFDataset ds = getDataset(dsName);




        return ds;
    }


    public synchronized TDFDataset getDataset(String name) {


        if (datasetCache.containsKey(name)) {
            return datasetCache.get(name);
        }


        try {
            if (datasetIndex.containsKey(name)) {
                IndexEntry ie = datasetIndex.get(name);
                long position = ie.position;
                int nBytes = ie.nBytes;


                //fis.seek(position);
                //byte[] buffer = new byte[nBytes];
                //readFully(buffer);
                byte[] buffer = readBytes(position, nBytes);
                ByteBuffer byteBuffer = ByteBuffer.wrap(buffer);
                byteBuffer.order(ByteOrder.LITTLE_ENDIAN);


                TDFDataset ds = new TDFDataset(name, byteBuffer, this);
                datasetCache.put(name, ds);
                return ds;
            } else {
                datasetCache.put(name, null);
                return null;
            }


        } catch (IOException ex) {
            log.error("Error reading dataset: " + getPath() + " (" + name + ")", ex);
            throw new RuntimeException("System error occured while reading dataset: " + name);
        }
    }


    public Collection<String> getDatasetNames() {
        return datasetIndex.keySet();
    }


    public Collection<String> getGroupNames() {
        return groupIndex.keySet();
    }


    public synchronized TDFGroup getGroup(String name) {
        if (groupCache.containsKey(name)) {
            return groupCache.get(name);
        }


        try {
            IndexEntry ie = groupIndex.get(name);
            long position = ie.position;
            int nBytes = ie.nBytes;


            //fis.seek(position);
            //byte[] buffer = new byte[nBytes];
            //readFully(buffer);
            byte[] buffer = readBytes(position, nBytes);
            ByteBuffer byteBuffer = ByteBuffer.wrap(buffer);
            byteBuffer.order(ByteOrder.LITTLE_ENDIAN);


            TDFGroup group = new TDFGroup(name, byteBuffer);


            groupCache.put(name, group);


            return group;


        } catch (IOException ex) {
            log.error("Error reading group: " + name, ex);
            throw new RuntimeException("System error occured while reading group: " + name);
        }
    }


    // TODO -- move to dataset class


    public TDFTile readTile(TDFDataset ds, int tileNumber) {


        try {
            if (tileNumber >= ds.tilePositions.length) {
                // TODO - return empty tile
                return null;
            }


            long position = ds.tilePositions[tileNumber];
            if (position < 0) {


                // Indicates empty tile
                // TODO -- return an empty tile?
                return null;
            }


            int nBytes = ds.tileSizes[tileNumber];
            //fis.seek(position);
            //byte[] buffer = new byte[nBytes];
            //readFully(buffer);
            byte[] buffer = readBytes(position, nBytes);
            if (compressed) {
                buffer = compressionUtils.decompress(buffer);


            }


            return TileFactory.createTile(buffer, trackNames.length);
        } catch (IOException ex) {
            String tileName = ds.getName() + "[" + tileNumber + "]";
            log.error("Error reading data tile: " + tileName, ex);
            throw new RuntimeException("System error occured while reading tile: " + tileName);
        }
    }


    /**
     * @return the version
     */
    public int getVersion() {
        return version;
    }


    /**
     * @return the trackType
     */
    public TrackType getTrackType() {
        return trackType;
    }


    /**
     * @return the trackLine
     */
    public String getTrackLine() {
        return trackLine;
    }


    /**
     * @return the trackNames
     */
    public String[] getTrackNames() {
        return trackNames;
    }


    /**
     * Only available for version >= 3.
     *
     * @return
     */
    public String getGenomeId() {
        return genomeId;
    }


    private Double getValue(WindowFunction wf) {
        if (!valueCache.containsKey(wf)) {
            TDFGroup rootGroup = getGroup("/");
            String maxString = rootGroup.getAttribute(wf.getValue());
            try {
                valueCache.put(wf, Double.parseDouble(maxString));
            } catch (Exception e) {
                log.info("Warning: value '" + wf.toString() + "' not found in tdf value " + getPath());
                valueCache.put(wf, null);
            }
        }
        return valueCache.get(wf);
    }


    /**
     * Return the default upper value for the data range.  A check is made to see if both upper and lower limits
     * are equal to zero, within a tolerance.  If they are the upper limit is arbitrarily set to "100".  This is
     * protection against the pathological case that can occur with datasets consisting of largely zero values.
     *
     * @return
     */
    public double getUpperLimit() {
        Double val = getValue(WindowFunction.percentile98);
        double upperLimit = val == null ? getDataMax() : val;
        if (upperLimit < 1.0e-30 && getLowerLimit() < 1.0e-30) {
            upperLimit = 100;
        }
        return upperLimit;
    }


    public double getLowerLimit() {
        Double val = getValue(WindowFunction.percentile2);
        return val == null ? getDataMin() : val;
    }


    public double getDataMax() {
        Double val = getValue(WindowFunction.max);
        return val == null ? 100 : val;
    }


    public double getDataMin() {
        Double val = getValue(WindowFunction.min);
        return val == null ? 0 : val;
    }




    public synchronized byte[] readBytes(long position, int nBytes) throws IOException {
        seekableStream.seek(position);
        byte[] buffer = new byte[nBytes];
        int read = seekableStream.read(buffer, 0, nBytes);
        return buffer;
    }


    /**
     * @return the windowFunctions
     */
    public List<WindowFunction> getWindowFunctions() {
        return windowFunctions;
    }


    /**
     * Return a list of all chromosome names represented in this dataset
     * TODO -- record this explicitly in the TDF file
     *
     * @return
     */


    public Set<String> getChromosomeNames() {
        if (chrNames == null) {
            ///DatasetIndex /chr1/z0/mean=org.broad.igv.tdf.TDFReader$IndexEntry@6a493b65
            chrNames = new HashSet();
            for (String key : datasetIndex.keySet()) {
                String[] tokens = Globals.forwardSlashPattern.split(key);
                int nTokens = tokens.length;
                if (nTokens > 1) {
                    chrNames.add(tokens[1]);
                }
            }
        }
        return chrNames;
    }


    class IndexEntry {


        long position;
        int nBytes;


        public IndexEntry(long position, int nBytes) {
            this.position = position;
            this.nBytes = nBytes;
        }
    }




    /**
     * Print index entries (for debugging)
     */
    public void dumpIndex() {


        for (Map.Entry<String, IndexEntry> entry : datasetIndex.entrySet()) {


            String dsName = entry.getKey();


            TDFDataset ds = getDataset(dsName);


            int size = 0;
            for (int sz : ds.tileSizes) {
                size += sz;
            }
            System.out.println(dsName + "\t" + size);


            datasetCache.clear();
        }
    }


    public TDFTile getWholeGenomeTile(Genome genome, WindowFunction wf) {


        if (wgTile == null) {


            int binCount = 700;
            int nTracks = this.getTrackNames().length; // TODO -- is there a more direct way to know this?
            double binSize = (genome.getNominalLength() / 1000) / binCount;
            Accumulator[][] accumulators = new Accumulator[nTracks][binCount];


            for (String chrName : genome.getLongChromosomeNames()) {


                TDFDataset chrDataset = getDataset(chrName, 0, wf);
                if(chrDataset == null) continue;


                List<TDFTile> chrTiles = chrDataset.getTiles();
                chrDataset.clearCache(); // Don't cache these
                for (TDFTile t : chrTiles) {
                    int[] chrStart = t.getStart();
                    int[] chrEnd = t.getEnd();


                    for (int p = 0; p < chrStart.length; p++) {


                        int gStart = genome.getGenomeCoordinate(chrName, chrStart[p]);
                        int gEnd = genome.getGenomeCoordinate(chrName, chrEnd[p]);


                        int binStart = (int) (gStart / binSize);
                        int binEnd = Math.min(binCount - 1, (int) (gEnd / binSize));
                        for (int b = binStart; b <= binEnd; b++) {


                            for (int n = 0; n < nTracks; n++) {


                                Accumulator acc = accumulators[n][b];
                                if (acc == null) {
                                    acc = new Accumulator(WindowFunction.mean);
                                    accumulators[n][b] = acc;
                                }


                                int basesCovered = Math.min(gEnd, binEnd) - Math.max(gStart, binStart);
                                acc.add(basesCovered, t.getData(n)[p], null);
                            }
                        }
                    }
                }
            }


            // Fill data array for "wg tile"
            float[][] data = new float[nTracks][binCount];
            for (int n = 0; n < nTracks; n++) {
                Accumulator[] accArray = accumulators[n];
                for (int p = 0; p < accArray.length; p++) {
                    Accumulator acc = accArray[p];
                    if (acc != null) {
                        acc.finish();
                        data[n][p] = acc.getValue();


                    }
                }
            }


            //    public TDFFixedTile(int tileStart, int start, double span, float[][] data) {
            wgTile = new TDFFixedTile(0, 0, binSize, data);


        }
        return wgTile;
    }
}
Source Code of org.broad.igv.tdf.TDFReader$IndexEntry

Related Classes of org.broad.igv.tdf.TDFReader$IndexEntry