Source Code of org.broad.igv.bbfile.RPTree

/*
 * Copyright (c) 2007-2012 The Broad Institute, Inc.
 * SOFTWARE COPYRIGHT NOTICE
 * This software and its documentation are the copyright of the Broad Institute, Inc. All rights are reserved.
 *
 * This software is supplied without any warranty or guaranteed support whatsoever. The Broad Institute is not responsible for its use, misuse, or functionality.
 *
 * This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
 * Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
 */


package org.broad.igv.bbfile;


import htsjdk.samtools.seekablestream.SeekableStream;
import org.apache.log4j.Logger;
import htsjdk.tribble.util.LittleEndianInputStream;


import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.ArrayList;


/**
 * Created by IntelliJ IDEA.
 * User: martind
 * Date: Jan 6, 2010
 * Time: 4:05:31 PM
 * To change this template use File | Settings | File Templates.
 */
/*
*   RPTree class will construct a R+ tree from a binary Bed/Wig BBFile.
*   (or by insertion of tree nodes - TBD see insert method)
*
*   1) RPTree will first read in the R+ tree header with RPTreeHeader class.
*
*   2) Starting with the root node, the readRPTreeNode method will read in the
*   node format, determine if the node contains child nodes (isLeaf = false)
*   or leaf items (isLeaf = true).
*
*   3) If the node is a leaf node, all leaf items are read in to the node's leaf array.
*
*   4) If node is a child node, readRPTreeNode will be called recursively,
*   until a leaf node is encountered, where step 3 is performed.
*
*   5) The child nodes will be populated with their child node items in reverse order
*   of recursion from step 4, until the tree is completely populated
*   back up to the root node.
*
*   6) The getChromosomeKey is provided to construct a valid key for B+
*   chromosome tree searches, and getChromosomeID returns a chromosome ID for
*   searches in the R+ index tree.
*
**/
public class RPTree {


    private static Logger log = Logger.getLogger(RPTree.class);


    public static final int RPTREE_NODE_FORMAT_SIZE = 4;       // node format size
    public static final int RPTREE_NODE_LEAF_ITEM_SIZE = 32;   // leaf item size
    public static final int RPTREE_NODE_CHILD_ITEM_SIZE = 24;  // child item size


    // R+ tree access variables   - for reading in R+ tree nodes from a file
    private int uncompressBuffSize;    // decompression buffer size; or 0 for uncompressed data
    private boolean isLowToHigh;       // binary data low to high if true; else high to low
    private long rpTreeOffset;         // file offset to the R+ tree


    // R+ tree index header - Table K
    private RPTreeHeader rpTreeHeader; // R+ tree header (Table K for BBFile)


    // R+ tree bounds
    private RPChromosomeRegion chromosomeBounds;  // R+ tree's bounding chromosome region


    // R+ tree nodal variables
    private int order;         // R+ tree order: maximum number of leaves per node
    private RPTreeNode rootNode;  // root node for R+ tree
    private long nodeCount;        // number of nodes defined in the R+ tree
    private long leafCount;        // number of leaves in the R+ tree




    /*
   * Constructor for reading in a B+ tree from a BBFile/input stream.
   * */
    /*
    *   Constructor for R+ chromosome data locator tree
    *
    *   Parameters:
    *       fis - file input stream handle
    *       fileOffset - location for R+ tree header
    *       isLowToHigh - binary values are low to high if true; else high to low
    *       uncompressBuffSize - buffer size for decompression; else 0 for uncompressed data
    * */


    public RPTree(SeekableStream fis, long fileOffset, boolean isLowToHigh, int uncompressBuffSize, boolean forceDescend) {


        // save the seekable file handle  and B+ Tree file offset
        // Note: the offset is the file position just after the B+ Tree Header
        // mBBFis = fis;
        rpTreeOffset = fileOffset;
        this.uncompressBuffSize = uncompressBuffSize;
        this.isLowToHigh = isLowToHigh;


        // read in R+ tree header - verify the R+ tree info exits
        rpTreeHeader = new RPTreeHeader(fis, rpTreeOffset, isLowToHigh);


        // log error if header not found and throw exception
        if (!rpTreeHeader.isHeaderOK()) {
            int badMagic = rpTreeHeader.getMagic();
            log.error("Error reading R+ tree header: bad magic = " + badMagic);
            throw new RuntimeException("Error reading R+ tree header: bad magic = " + badMagic);
        }


        // assigns R+ tree organization from the header
        order = rpTreeHeader.getBlockSize();
        chromosomeBounds = new RPChromosomeRegion(rpTreeHeader.getStartChromID(), rpTreeHeader.getStartBase(),
                rpTreeHeader.getEndChromID(), rpTreeHeader.getEndBase());


        // populate the tree - read in the nodes
        long nodeOffset = rpTreeOffset + rpTreeHeader.getHeaderSize();
        RPTreeNode parentNode = null;      // parent node of the root is itself, or null


        // start constructing the R+ tree - get the root node
        rootNode = readRPTreeNode(fis, nodeOffset, isLowToHigh, forceDescend);
    }


    /*
     *  Constructs an R+ Tree which conforms to the supplied information
     *      order -  the items per node factor, sometimes called the m factor,
     *      where any node must have at least m/2 items and no more than m items.
     *      keySize - the number of significant bytes in a item key.
     * */


    public RPTree(int order) {


        // R+ tree node specification
        this.order = order;


        // Note: acknowledge no bounds specified as a null  object
        chromosomeBounds = null;


    }


    /*
    *   Method returns size of buffer required for data decompression.
    *
    *   Returns:
    *       Data decompression buffer size in bytes; else 0 for uncompressed data in file.
    * */


    public int getUncompressBuffSize() {
        return uncompressBuffSize;
    }


    /*
    *   Method returns if file contains formatted data in low to high byte order.
    *
    *   Returns:
    *       Returns true if data ordered in low to high byte order; false if high to low.
    * */


    public boolean isIsLowToHigh() {
        return isLowToHigh;
    }


    /*
    *   Method returns the R+ tree order, the maximum number of leaf items per node.
    *
    *   Returns:
    *       Maximum number of leaf items per node..
    * */


    public int getOrder() {
        return order;
    }


    /*
    *   Method returns the R+ tree index header.
    *
    *   Returns:
    *       R+ tree index header.
    * */


    public RPTreeHeader getRPTreeHeader() {
        return rpTreeHeader;
    }


    /*
    *   Method returns the total number of chromosomes or contigs in the R+ tree.
    *
    *   Returns:
    *       Total number of chromosomes or contigs in the R+ tree.
    * */


    public long getItemCount() {
        return rpTreeHeader.getItemCount();
    }


    /*
    *   Method returns the chromosome bounding region for all R+ tree data.
    *
    *   Returns:
    *       chromosome bounding region for all R+ tree data
    * */


    public RPChromosomeRegion getChromosomeBounds() {
        return chromosomeBounds;
    }


    /*
    *   Method returns the total node count for the R+ tree.
    *
    *   Returns:
    *       Node count for R+ tree; or 0 if tree was constructed without nodes.
    * */


    public long getNodeCount() {
        return nodeCount;
    }


    /*
    *   Method finds the bounding chromosome region in R+ tree for a chromosome ID range.
    *
    *   Parameters:
    *       startChromID - start chromosome for the region
    *       endChromID - end chromosome for the region
    *
    *   Returns:
    *       Region which bounds the extremes of chromosome ID range
    * */


    public RPChromosomeRegion getChromosomeRegion(int startChromID, int endChromID) {


        RPChromosomeRegion region;


        // Search the R+ tree to extract the chromosome region.
        RPTreeNode thisNode = rootNode;
        RPChromosomeRegion seedRegion = null;  // null until a chromosome match


        region = findChromosomeRegion(thisNode, startChromID, endChromID, seedRegion);


        return region;
    }


    /*
    *   Method returns list of all chromosome regions found for the chromosome ID range.
    *
    *   Returns:
    *       List of all chromosome regions in the chromosome ID range.
    * */


    public ArrayList<RPChromosomeRegion> getAllChromosomeRegions() {


        // Search the R+ tree to extract the chromosome regions
        RPTreeNode thisNode = rootNode;


        ArrayList<RPChromosomeRegion> regionList = new ArrayList<RPChromosomeRegion>();


        findAllChromosomeRegions(thisNode, regionList);


        return regionList;
    }


    /*
    *   Method extracts a hit list of chromosome data file locations for a specified chromosome region.
    *
    *   Parameters:
    *       chromosomeRegion - chromosome region for feature extraction consists of:
    *           startChromID - start chromosome ID for region
    *           mStartBase - starting base for data extraction
    *           endChromID - end chromosome ID for region
    *           mEndBase - ending base for data extraction
    *       contained - if true indicates all returned data must be
    *           completely contained within the extraction region;
    *           else if false, returns all intersecting region features
    *
    *   Note: The selection region will be limited to accommodate  maxLeafHits; which terminates
    *       selection at the leaf node at which maxLeafHits is reached. Total number of selected
    *       items may exceed maxLeafHits, but only by the number of leaves in the cutoff leaf node.
    *
    *   Returns:
    *       List of chromosome leaf items which identify file locations for bed data
    *       of a chromosome region, or a sub-region subject to maxLeafHits.
    *
    *       Check returned leaf item bounds for cutoff limits on selection region due to maxLeafHits.
    * */


    public ArrayList<RPTreeLeafNodeItem> getChromosomeDataHits(RPChromosomeRegion selectionRegion,
                                                               boolean contained) {


        ArrayList<RPTreeLeafNodeItem> leafHitItems = new ArrayList<RPTreeLeafNodeItem>();


        // check for valid selection region - return empty collection if null
        if (selectionRegion == null)
            return leafHitItems;


        // limit the hit list size
        /*
        if(maxLeafHits > 0)
            mMaxLeafHits = maxLeafHits;
        else
            mMaxLeafHits = mRPTreeHeader.getBlockSize();
        */




        findChromosomeRegionItems(rootNode, selectionRegion, leafHitItems);


        return leafHitItems;
    }


    // prints out the R+ tree  header, nodes, and leaves


    public void print() {


        // check if read in
        if (!rpTreeHeader.isHeaderOK()) {
            int badMagic = rpTreeHeader.getMagic();
            log.error("Error reading R+ tree header: bad magic = " + badMagic);
            return;
        }


        // print R+ tree header
        rpTreeHeader.print();


        // print  R+ tree node and leaf items - recursively
        if (rootNode != null)
            rootNode.printItems();


    }


    /*
    *   Method finds and returns the bounding chromosome region for the specified
    *   chromosome ID range.
    *
    *   Parameters:
    *       thisNode - tree node to start search
    *       startChromID - start chromosome ID for region
    *       endChromID - end chromosome ID for region
    *       region  - leaf region contains extremes for given chromosome ID range
    *
    *   Note: region grows recursively to match the extremes found for the
    *   specified chromosome ID range.  Starting base comes from the startChromID
    *   match and ending base comes form the endChromID match.
    *
    *   Returns:
    *       Chromosome region if found in the R+ tree node passed in;
    *       else null region
    * */


    private RPChromosomeRegion findChromosomeRegion(RPTreeNode thisNode,
                                                    int startChromID, int endChromID, RPChromosomeRegion region) {


        int hitValue;
        RPChromosomeRegion bounds;


        // search down the tree recursively starting with the root node
        if (thisNode.isLeaf()) {
            int nLeaves = thisNode.getItemCount();
            for (int index = 0; index < nLeaves; ++index) {


                RPTreeLeafNodeItem leaf = (RPTreeLeafNodeItem) thisNode.getItem(index);


                // get leaf region bounds
                bounds = leaf.getChromosomeBounds();


                // test this leaf's chromosome ID's for chromosome hit, then include its base bounds
                if (startChromID >= bounds.getStartChromID() && startChromID <= bounds.getEndChromID() ||
                        endChromID >= bounds.getStartChromID() && endChromID <= bounds.getEndChromID()) {


                    // Note: need a start region before comparing other regions for extremes
                    if (region == null)
                        region = new RPChromosomeRegion(bounds); // seed extreme region
                    else
                        region = region.getExtremes(bounds); // update seed extreme region
                }
            }
        } else {
            // check all child nodes
            int nNodes = thisNode.getItemCount();
            for (int index = 0; index < nNodes; ++index) {


                RPTreeChildNodeItem childItem = (RPTreeChildNodeItem) thisNode.getItem(index);


                // get bounding region and compare chromosome ID's
                bounds = childItem.getChromosomeBounds();


                // test node chromosome ID range for any leaf hits for either startChromID or endChromID
                if (startChromID >= bounds.getStartChromID() && startChromID <= bounds.getEndChromID() ||
                        endChromID >= bounds.getStartChromID() && endChromID <= bounds.getEndChromID()) {


                    RPTreeNode childNode = childItem.getChildNode();
                    region = findChromosomeRegion(childNode, startChromID, endChromID, region);
                }


                // check next node
            }
        }


        return region;
    }


    /*
    *   Method finds and returns all chromosome regions in the R+ chromosome data tree.
    *
    *   parameters:
    *       thisNode - tree node to start search
    *       chromosomeList - list of all chromosome names found.
    *
    *   Returns:
    *       Adds chromosome regions if found in the chromosome region list passed in.
    * */


    private void findAllChromosomeRegions(RPTreeNode thisNode,
                                          ArrayList<RPChromosomeRegion> regionList) {


        // search down the tree recursively starting with the root node
        if (thisNode.isLeaf()) {
            int nLeaves = thisNode.getItemCount();
            for (int index = 0; index < nLeaves; ++index) {


                RPTreeLeafNodeItem leaf = (RPTreeLeafNodeItem) thisNode.getItem(index);


                // add all leaf regions
                RPChromosomeRegion region = leaf.getChromosomeBounds();
                regionList.add(region);
            }
        } else {
            // get all child nodes
            int nNodes = thisNode.getItemCount();
            for (int index = 0; index < nNodes; ++index) {


                RPTreeChildNodeItem childItem = (RPTreeChildNodeItem) thisNode.getItem(index);
                RPTreeNode childNode = childItem.getChildNode();


                findAllChromosomeRegions(childNode, regionList);
            }
        }


    }


    /*
    *   Method returns an array of chromosome leaf items for the chromosome test region.
    *
    *   Note: At the leaf item level, any hit is valid. Use of contained is exercised
    *       when the leaf item data region is examined.
    *   Parameters:
    *
    *       thisNode - tree node to start search
    *       testRegion - bounding region for feature extraction consists of:
    *           start chromosome ID for region
    *           starting base for data extraction
    *           end chromosome ID for region
    *           ending base for data extraction
    *       leafHitItems - array containing previous leaf hit items
    *
    *   Note: leaf hit items will be limited to leaves in the current leaf node,
    *       once the maximum number of leaf hits mMaxLeafHits is reached.
    * 
    *   Returns:
    *       ArrayList of leaf hit items containing updated hit regions and data offsets;
    *       else an empty array if hit regions not found.
    * */


    private void findChromosomeRegionItems(RPTreeNode thisNode, RPChromosomeRegion selectionRegion,
                                           ArrayList<RPTreeLeafNodeItem> leafHitItems) {


        int hitValue;


        // check for valid selection region - ignore request if null
        if (selectionRegion == null)
            return;


        // check if node is disjoint
        hitValue = thisNode.compareRegions(selectionRegion);
        if (Math.abs(hitValue) >= 2)
            return;


        // search down the tree recursively starting with the root node
        if (thisNode.isLeaf()) {
            int nLeaves = thisNode.getItemCount();
            for (int index = 0; index < nLeaves; ++index) {
                RPTreeLeafNodeItem leafItem = (RPTreeLeafNodeItem) thisNode.getItem(index);


                // compute the region hit value
                hitValue = leafItem.compareRegions(selectionRegion);


                // select contained or intersected leaf regions - item selection is by iterator
                if (Math.abs(hitValue) < 2) {
                    leafHitItems.add(leafItem);
                }


                // ascending regions will continue to be disjoint so terminate nodal search
                else if (hitValue > 1)
                    break;


                // check next leaf
            }
        } else {
            // check all child nodes
            int nNodes = thisNode.getItemCount();
            for (int index = 0; index < nNodes; ++index) {
                RPTreeChildNodeItem childItem = (RPTreeChildNodeItem) thisNode.getItem(index);


                // check for region intersection at the node level
                hitValue = childItem.compareRegions(selectionRegion);


                // test this node and get any leaf hits; intersections and containing
                if (Math.abs(hitValue) < 2) {
                    RPTreeNode childNode = childItem.getChildNode();
                    findChromosomeRegionItems(childNode, selectionRegion, leafHitItems);
                }


                // ascending regions will continue to be disjoint so terminate nodal search
                else if (hitValue > 1)
                    break;
            }
        }
    }


    /*
    *   Method reads in the R+ tree nodes recursively.
    *
    *   Note: If node is a child node, the node is examined recursively,
    *       until the leaves are found.
    *
    *   Parameters:
    *       fis - file input stream handle
    *       fileOffset - file location for node specification (Table L)
    *       parent - parent node of this node
    *       isLowToHigh - indicates formatted data is low to high byte order if true;
    *           else is high to low byte order
    *
    *   Returns:
    *       A tree node, for success, or null for failure to find the node information.


    * */


    static RPTreeNode readRPTreeNode(SeekableStream fis, long fileOffset, boolean isLowToHigh, boolean forceDescend) {


        LittleEndianInputStream lbdis = null; // low o high byte stream reader
        DataInputStream bdis = null;    // high to low byte stream reader


        byte[] buffer = new byte[RPTREE_NODE_FORMAT_SIZE];
        RPTreeNode thisNode = null;


        try {


            // Read node format into a buffer
            fis.seek(fileOffset);
            fis.readFully(buffer);


            if (isLowToHigh) {
                lbdis = new LittleEndianInputStream(new ByteArrayInputStream(buffer));
            } else {
                bdis = new DataInputStream(new ByteArrayInputStream(buffer));
            }


            // find node type
            byte type;
            if (isLowToHigh) {
                type = lbdis.readByte();
            } else {
                type = bdis.readByte();
            }


            boolean isLeaf;
            int itemSize;
            if (type == 1) {
                isLeaf = true;
                itemSize = RPTREE_NODE_LEAF_ITEM_SIZE;
                thisNode = new RPTreeNode(true);
            } else {
                isLeaf = false;
                itemSize = RPTREE_NODE_CHILD_ITEM_SIZE;
                thisNode = new RPTreeNode(false);
            }
            //nodeCount++;


            int itemCount;
            if (isLowToHigh) {
                lbdis.readByte();          // reserved - not currently used
                itemCount = lbdis.readShort();
            } else {
                bdis.readByte();          // reserved - not currently used
                itemCount = bdis.readShort();
            }


            int itemBlockSize = itemCount * itemSize;
            buffer = new byte[itemBlockSize];            // allocate buffer for item sisze
            fis.readFully(buffer);
            if (isLowToHigh)
                lbdis = new LittleEndianInputStream(new ByteArrayInputStream(buffer));
            else
                bdis = new DataInputStream(new ByteArrayInputStream(buffer));


            // get the node items - leaves or child nodes
            int startChromID, endChromID;
            int startBase, endBase;
            for (int item = 0; item < itemCount; ++item) {


                // always extract the bounding rectangle
                if (isLowToHigh) {
                    startChromID = lbdis.readInt();
                    startBase = lbdis.readInt();
                    endChromID = lbdis.readInt();
                    endBase = lbdis.readInt();
                } else {
                    startChromID = bdis.readInt();
                    startBase = bdis.readInt();
                    endChromID = bdis.readInt();
                    endBase = bdis.readInt();
                }


                if (isLeaf) {
                    long dataOffset;
                    long dataSize;
                    if (isLowToHigh) {
                        dataOffset = lbdis.readLong();
                        dataSize = lbdis.readLong();
                    } else {
                        dataOffset = bdis.readLong();
                        dataSize = bdis.readLong();
                    }


                    thisNode.insertItem(new RPTreeLeafNodeItem(startChromID, startBase, endChromID, endBase,
                            dataOffset, dataSize));
                } else {
                    // get the child node pointed to in the node item
                    long nodeOffset;
                    if (isLowToHigh) {
                        nodeOffset = lbdis.readLong();
                    } else {
                        nodeOffset = bdis.readLong();
                    }


                    // Recursive call to read next child node
                    // The test on chromIds is designed to stop the descent when the tree reaches the level of an
                    // individual chromosome.  These are loaded later on demand.


                    RPTreeChildNodeItem childNodeItem;
                    if (startChromID != endChromID || forceDescend) {
                        RPTreeNode childNode = readRPTreeNode(fis, nodeOffset, isLowToHigh, forceDescend);
                        childNodeItem = new RPTreeChildNodeItem(startChromID, startBase, endChromID,
                                endBase, childNode);
                    } else {
                        RPTreeNodeProxy proxy = new RPTreeNodeProxy(fis, nodeOffset, isLowToHigh, startChromID);
                        childNodeItem = new RPTreeChildNodeItem(startChromID, startBase, endChromID,
                                endBase, proxy);
                    }
                    thisNode.insertItem(childNodeItem);
                }


                fileOffset += itemSize;
            }


        } catch (IOException ex) {
            log.error("Error reading in R+ tree nodes: " + ex);
            throw new RuntimeException("Error reading R+ tree nodes: \n", ex);
        }


        // return success
        return thisNode;
    }
}
Source Code of org.broad.igv.bbfile.RPTree

Related Classes of org.broad.igv.bbfile.RPTree