// Source Code of org.broad.igv.bbfile.BPTree

/*
 * Copyright (c) 2007-2012 The Broad Institute, Inc.
 * SOFTWARE COPYRIGHT NOTICE
 * This software and its documentation are the copyright of the Broad Institute, Inc. All rights are reserved.
 *
 * This software is supplied without any warranty or guaranteed support whatsoever. The Broad Institute is not responsible for its use, misuse, or functionality.
 *
 * This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
 * Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
 */


package org.broad.igv.bbfile;


import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.tribble.util.LittleEndianInputStream;
import org.apache.log4j.Logger;


import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;


/**
 * Created by IntelliJ IDEA.
 * User: martind
 * Date: Dec 17, 2009
 * Time: 12:28:30 PM
 * To change this template use File | Settings | File Templates.
 */
/*
*   B+ Tree class will construct a B+ tree from a binary Bed/Wig BBFile.
*   (or by insertion of tree nodes - TBD see insert method)
*
*   1) BPTree will first read in the B+ tree header with BPTreeHeader class.
*
*   2) Starting with the root node, the readBPTreeNode method will read in the
*   node format, determine if the node contains child nodes (isLeaf = false)
*   or leaf items (isLeaf = true).
*
*   3) If node is a leaf node, all leaf items are read in to the node's leaf array.
*
*   4) If node is a child node, readBPTreeNode will be called recursively,
*   until the leaf node is encountered, where step 3 is performed.
*
*   5) The child nodes will be populated with their child node items in reverse order
*   of recursion from step 4, until the tree is completely populated
*   back up to the root node.
*
*   6) The getChromosomeKey method constructs a valid key for B+
*   chromosome tree searches, and getChromosomeID returns a chromosome ID for
*   searches in the R+ index tree.
*
**/ 
public class BPTree {


    private static Logger log = Logger.getLogger(BPTree.class);


    public static final int BPTREE_NODE_FORMAT_SIZE = 4;   // node format size
    public static final int BPTREE_NODE_ITEM_SIZE = 8;     // Plus keySize to be added


    // B+ tree access variables   - for reading in B+ tree nodes from a file
    private SeekableStream fis;      // file handle - BBFile input stream
    private long treeOffset;         // mChromosome B+ tree file offset
    private BPTreeHeader treeHeader; // B+ tree header (Table E for BBFile)


    // B+ tree organizational variables  - derived from Table E
    private int blockSize;     // number of children per block
    private int keySize;       // character size of primary key
    private int valueSize;     // number of bytes in value being indexed
    private long itemCount;    //  number of contig/mChromosome items in tree


    // B+ tree nodal variables
    private BPTreeNode rootNode;   // B+ tree root node
    private long nodeCount;        // number of nodes defined in the B+ tree
    private long leafCount;        // number of leaves in the B+ tree


   /*
   *    Constructor for reading in a B+ tree from a BBFile/input stream.
   *
   *    Parameters:
   *        fis - file input stream handle
   *        fileOffset - file offset to the B+ tree header
   *        isLowToHigh - indicates byte order is low to high, else is high to low
   * */
    public BPTree(SeekableStream fis, long fileOffset, boolean isLowToHigh) {


        // Save the seekable file handle and B+ Tree file offset
        // Note: the offset is the B+ Tree Header Table E file location
        this.fis = fis;
        treeOffset = fileOffset;


        // read in B+ tree header - verify the B+ tree info exits
        treeHeader = new BPTreeHeader(this.fis, treeOffset, isLowToHigh);


        // log error if header not found and throw exception
        if(!treeHeader.isHeaderOK()){
            int badMagic = treeHeader.getMagic();
            log.error("Error reading B+ tree header: bad magic = " + badMagic);
            throw new RuntimeException("Error reading B+ tree header: bad magic = "
                    +  badMagic);
        }


        // assign B+ tree specifications from the header
        blockSize = treeHeader.getBlockSize();
        keySize =  treeHeader.getKeySize();
        valueSize = treeHeader.getValSize();
        itemCount = treeHeader.getItemCount();


        // populate the tree - read in the nodes
        long nodeOffset = treeOffset + treeHeader.BPTREE_HEADER_SIZE;
        BPTreeNode parentNode = null;  // parent node of the root is itself, or null


        // get the root node - which recursively populates the remaining nodes
        rootNode =  readBPTreeNode(this.fis, nodeOffset, parentNode, isLowToHigh);


    }


    /*
    *   Method returns the file input stream handle
    * */
    public SeekableStream getFis() {
        return fis;
    }


    /*
    *   Method returns the B+ tree file location
    * */
    public long getBPTreeOffset() {
        return treeOffset;
    }


    /*
    *   Method returns the B+ tree header (Table E).
    * */
    public BPTreeHeader getTreeHeader(){
        return treeHeader;
    }


    /*
    *   Method returns the node block size (B+ order).
    * */
    public int getBlockSize() {
        return blockSize;
    }


    /*
    *   Method returns the chromosome name key size, which is
    *   the number of valid characters for chromosome name.
    * */
    public int getKeySize() {
        return keySize;
    }


    /*
    *   Method returns the indexing value size (currently 8).
    * */
    public int getValueSize() {
          return valueSize;
    }


    /*
    *   Method returns the number of chromosome/contig names.
    * */
    public long getItemCount() {
        return itemCount;
    }


    /*
    *   Method returns the number of nodes in the B+ tree.
    * */
    public long getNodeCount() {
        return nodeCount;
    }


    /*
    *   Method returns the root node, from which all other nodes
    *   can be extracted.
    *
    *   Returns:
    *       Root node
    * */
    public BPTreeNode getRootNode() {
        return rootNode;
    }




    Map<String, String> chromosomeKeyCache = new HashMap();
    /*
    *   Returns a search key for the mChromosome region  which  can
    *   be used to search for a corresponding section in the B+ tree.
    *
    *   According the the spec the key is the "first keySize characters of chromosome name, padded with zeroes if needed.
    * */
    public String getChromosomeKey(String chromosome) {


        String key = chromosomeKeyCache.get(chromosome);
        if(key == null) {
            char [] keyChars = new char[keySize];
            char [] chrChars = chromosome.toCharArray();
            System.arraycopy(chrChars, 0, keyChars, 0, Math.min(keySize, chrChars.length));
            key = new String(keyChars);
            chromosomeKeyCache.put(chromosome, key);
        }
        return key;
    }


    /*
    *   Returns a chromosome ID  which  can be used to search for a
    *   corresponding data section in the R+ tree for data.
    *
       Parameters:
    *       chromKey - chromosome name of valid key size.
    *
    *
    *   Note: A chromosomeID of -1 means chromosome name not included in B+ tree.
    *
    * */
    public int getChromosomeID(String chromKey) {
         int chromosomeID;


        // Search the B+ tree to extract the Chromosome ID.
        BPTreeNode thisNode = rootNode;


        chromosomeID = findChromosomeID(thisNode, chromKey);


        return chromosomeID;
    }


    /*
    *   Returns a chromosome name which is the B+ key for returning the
    *   chromosome ID for lookup in the R+ tree for data.
    *
    *   Parameters:
    *       chromID - chromosome ID expected in B+ tree
    *
    *   Returns:
    *       Chromosome name key; a null string means chromosome ID not found.
    *
    * */
    public String getChromosomeName(int chromID) {
         String chromKey;


        // Search the B+ tree to extract the Chromosome ID.
        BPTreeNode thisNode = rootNode;


        chromKey = findChromosomeName(thisNode, chromID);


        return chromKey;
    }


    /*
    *   Method returns all chromosome key names in B+ tree.
    *
    *   Returns:
    *   Collection of all (chromosome ID, chromosome name)entries
    * */
    public ArrayList<String> getChromosomeNames(){


        // Search the B+ tree to extract the chromosome ID.
        BPTreeNode thisNode = rootNode;


        ArrayList<String> chromosomeList = new ArrayList<String>();


        findAllChromosomeNames(thisNode, chromosomeList);


        return chromosomeList;
    }


     /*
    *   Method returns all chromosome name, chromosome ID pairs for a given ID range.
    *
    *   Parameters:
    *       startChromID - starting ID for chromosome range expected in B+ tree
    *       endChromID - ending ID for chromosome range expected in B+ tree
    *
    *   Returns:
    *       Collection of (chromosome ID, chromosome name key) hash items;
    *       where an empty collection means ID range was not found.
    *
    * */
    public HashMap<Integer, String> getChromosomeIDMap(int startChromID, int endChromID){


        // Search the B+ tree to extract the chromosome ID.
        BPTreeNode thisNode = rootNode;


        HashMap<Integer, String> chromosomeIDMap = new HashMap<Integer, String>();


        findChromosomeMap(thisNode, startChromID, endChromID, chromosomeIDMap);


        return chromosomeIDMap;
    }


    // prints out the B+ Tree  nodes and leaves
    public void print() {


       // check if read in
       if(!treeHeader.isHeaderOK()){
            int badMagic = treeHeader.getMagic();
            log.error("Error reading B+ tree header: bad magic = " + badMagic);
           return;
       }


        // print B+ tree header
        treeHeader.print();


        // print  B+ tree node and leaf items - recursively
        if(rootNode != null)
            rootNode.printItems();
   }


    /*
    *   Method finds and returns the chromosome ID for the specified chromosome key.
    *
    *   Note: This method recursively calls itself, traversing the full B+ tree until
    *       either the chromosome name key is found and returns a valid chromosome ID,
    *       or exits with a -1 value.
    *
    *   Parameters:
    *       thisNode - tree node to start search
    *       chromKey - chromosome name key of valid key size.
    *
    *   Returns:
    *       Valid chromosome ID if >= 0; else -1 for not found.
    * */
    private int findChromosomeID( BPTreeNode thisNode, String chromKey){
        int chromID = -1;    // until found


        // search down the tree recursively starting with the root node
        if(thisNode.isLeaf())
        {
           int nLeaves = thisNode.getItemCount();
           for(int index = 0; index < nLeaves; ++index){
               BPTreeLeafNodeItem leaf = (BPTreeLeafNodeItem)thisNode.getItem(index);
               if(leaf == null){
                    log.error("Error finding B+ tree leaf nodes, corruption suspected");
                    throw new RuntimeException("Error reading B+ tree leaf nodes, corruption suspected");
               }


               // test chromosome key match
               if(leaf.chromKeysMatch(chromKey)){
                   chromID = leaf.getChromID();
                   break;
               }
               // else check next leaf
           }
        }
        else {
           // check all child nodes
           int nNodes = thisNode.getItemCount();
           for(int index = 0; index < nNodes; ++index){


               BPTreeChildNodeItem childItem = (BPTreeChildNodeItem)thisNode.getItem(index);              
               BPTreeNode childNode =  childItem.getChildNode();


               // check if key is in the node range
               String lowestKey = childNode.getLowestChromKey();
               String highestKey = childNode.getHighestChromKey();


               // test name key against key range
               if(chromKey.compareTo(lowestKey) >= 0
                       && chromKey.compareTo(highestKey) <= 0) {


                    // keep going until leaf items are checked
                    chromID = findChromosomeID(childNode, chromKey);


                    // check for chromKey match
                    if(chromID >= 0)
                        break;
               }
           }
        }


        return chromID;
    }


    /*
    *   Method finds and returns the chromosome name for the specified chromosome ID.
    *
    *   Parameters:
    *       thisNode - tree node to start search
    *       chromID - B+ tree chromosome ID supplied for the chromosome key
    *
    *   Returns:
    *       chromosome name if found; else a null string.
    * */
    private String findChromosomeName( BPTreeNode thisNode, int chromID){


        String chromKey = null; // mark unfound condition as an empty string


        // search down the tree recursively starting with the root node
        if(thisNode.isLeaf())
        {
           int nLeaves = thisNode.getItemCount();
           for(int index = 0; index < nLeaves; ++index){
               BPTreeLeafNodeItem leaf = (BPTreeLeafNodeItem)thisNode.getItem(index);


               if(leaf.getChromID() == chromID){ // mChromosome key match
                   chromKey = leaf.getChromKey();
                   break;
               }
               // else check next leaf
           }
        }
        else {
           // check all child nodes
           int nNodes = thisNode.getItemCount();
           for(int index = 0; index < nNodes; ++index){


               BPTreeChildNodeItem childItem = (BPTreeChildNodeItem)thisNode.getItem(index);
               BPTreeNode childNode =  childItem.getChildNode();


               // check if key is in the node range
               int lowestID = childNode.getLowestChromID();
               int highestID = childNode.getHighestChromID();


               // test chromosome ID against node ID range
               if(chromID >= lowestID && chromID <= highestID) {


                    // keep going until leaf items are checked
                    chromKey = findChromosomeName(childNode, chromID);


                    // check for chromosome ID match
                    if(chromKey != null)
                        break;
               }
           }
        }


        return chromKey;
    }


    /*
    *   Method finds and returns all chromosome names in the B+ tree.
    *
    *   Note: This method calls itself recursively until the full B+ tree is traversed.
    *
    *   Parameters:
    *       thisNode - tree node to start search
    *       chromosomeList - list of all chromosome names found.
    *
    *   Returns:
    *       Chromosome names found are added to the chromosome list passed in.
    * */
    public void findAllChromosomeNames( BPTreeNode thisNode, ArrayList<String> chromosomeList){


        // search down the tree recursively starting with the root node
        if(thisNode.isLeaf())
        {
           // add all leaf names
           int nLeaves = thisNode.getItemCount();
           for(int index = 0; index < nLeaves; ++index){


               BPTreeLeafNodeItem leaf = (BPTreeLeafNodeItem)thisNode.getItem(index);
               chromosomeList.add(leaf.getChromKey());
           }
        }
        else {
           // get all child nodes
           int nNodes = thisNode.getItemCount();
           for(int index = 0; index < nNodes; ++index){


               BPTreeChildNodeItem childItem = (BPTreeChildNodeItem)thisNode.getItem(index);
               BPTreeNode childNode = childItem.getChildNode();


               // keep going until leaf items are extracted
               findAllChromosomeNames(childNode, chromosomeList);
           }
        }
    }


    /*
    *   Method finds and returns (chromosome ID, chromosome key name) pairs for the specified ID range.
    *
    *   Parameters:
    *       thisNode - tree node to start search
    *       startChromID - starting chromosome ID for the chromosome range
    *       endChromID - ending chromosome ID for the chromosome range
    *
    *   Returns:
    *       (chromosome ID, chromosome key name) items are added to the collection passed in.
    * */
    private void findChromosomeMap( BPTreeNode thisNode, int startChromID, int endChromID,
                                        HashMap<Integer, String> chromosomeMap){
        int chromID;
        int lowestID;
        int highestID;


        // check if node is disjoint
        lowestID = thisNode.getLowestChromID();
        if(lowestID > endChromID)
            return;


        highestID = thisNode.getHighestChromID();
        if(highestID < startChromID)
            return; 


        // search down the tree recursively starting with the root node
        if(thisNode.isLeaf())
        {
           int nLeaves = thisNode.getItemCount();
           for(int index = 0; index < nLeaves; ++index){


               BPTreeLeafNodeItem leaf = (BPTreeLeafNodeItem)thisNode.getItem(index);
               chromID = leaf.getChromID();


               // check for chromosome range match
               if( chromID >= startChromID && chromID <= endChromID ){
                   chromosomeMap.put(chromID, leaf.getChromKey());
               }
               // leaf ID's are in ascending order; check for going out of range
               else if(chromID > endChromID)
                   break;
           }
        }
        else {
           // check all child nodes
           int nNodes = thisNode.getItemCount();
           for(int index = 0; index < nNodes; ++index){


               BPTreeChildNodeItem childItem = (BPTreeChildNodeItem)thisNode.getItem(index);
               BPTreeNode childNode =  childItem.getChildNode();


               // check if keys are in the node range
               lowestID = childNode.getLowestChromID();
               highestID = childNode.getHighestChromID();


               // test for chromosome range intersections
               if( lowestID <= endChromID && highestID >= startChromID )
                    findChromosomeMap(childNode, startChromID, endChromID, chromosomeMap);


               // test node ID range which is always in ascending order going out of range
               else if(lowestID > endChromID)
                   break;   //
           }
        }
    }


    /*
    *   Method reads in the B+ tree nodes from the file, recursively.
    *
    *   Parameters:
    *       fis - file input stream handle
    *       fileOffset - file offset for B+ tree header
    *       keySize - chromosome name key size in characters
    *       parent - parent node
    *       isLowToHigh - if true, indicates byte order is low to high; else is high to low
    *
    *   Returns:
     *      Boolean which indicates if the B+ tree header was read correctly, with
    *       true for success, false for failure to find the header information.
    * */
    private BPTreeNode readBPTreeNode(SeekableStream fis, long fileOffset,
                                      BPTreeNode parent, boolean isLowToHigh){


        LittleEndianInputStream lbdis = null;     // low to high byte reader
        DataInputStream bdis = null;        // high to low byte reader


        // set up for node format
        byte[] buffer = new byte[BPTREE_NODE_FORMAT_SIZE];
        BPTreeNode thisNode = null;
        BPTreeNode childNode = null;


        byte type;
        byte bval;
        int itemCount;
        int itemSize;
        boolean isLeaf;


        try {


           // Read node format into a buffer
           fis.seek(fileOffset);
           fis.readFully(buffer);


           if(isLowToHigh)
                lbdis = new LittleEndianInputStream(new ByteArrayInputStream(buffer));
           else
                bdis = new DataInputStream(new ByteArrayInputStream(buffer));


           // find node type
           if(isLowToHigh)
                type = lbdis.readByte();
           else
                type = bdis.readByte();


           // create the B+ tree node
           if(type == 1) {
               isLeaf = true;
               thisNode = new BPTreeLeafNode(++nodeCount);
           }
           else {
               isLeaf = false;
               thisNode = new BPTreeChildNode(++nodeCount);
           }


           if(isLowToHigh) {
                bval = lbdis.readByte();      // reserved - not currently used
                itemCount = lbdis.readShort();
           }
           else {
                bval = bdis.readByte();      // reserved - not currently used
                itemCount = bdis.readShort();
           }


            // Note: B+ tree node item size is the same for leaf and child items
            itemSize =  BPTREE_NODE_ITEM_SIZE + this.keySize;
            int totalSize = itemSize * itemCount;
            byte[] itemBuffer = new byte[totalSize];
            fis.readFully(itemBuffer);


            if(isLowToHigh)
                 lbdis = new LittleEndianInputStream(new ByteArrayInputStream(itemBuffer));
             else
                 bdis = new DataInputStream(new ByteArrayInputStream(itemBuffer));


            // get the node items - leaves or child nodes
            for(int item = 0; item < itemCount; ++item) {


               // always extract the key from the node format
               char[] keychars = new char[keySize];  // + 1 for 0 byte
               int index;
               for(index = 0; index < keySize; ++index) {


                    if(isLowToHigh)
                        bval = lbdis.readByte();
                    else
                        bval = bdis.readByte();


                    keychars[index] = (char)bval;
               }


               String key = new String(keychars).trim();
                
               int chromID;
               int chromSize;
               long childOffset;


               if(isLeaf) {
                    if(isLowToHigh) {
                        chromID = lbdis.readInt();
                        chromSize = lbdis.readInt();
                    }
                    else {
                        chromID = bdis.readInt();
                        chromSize = bdis.readInt();
                    }


                    // insert leaf items
                    BPTreeLeafNodeItem leafItem = new BPTreeLeafNodeItem(++leafCount, key, chromID, chromSize);
                    thisNode.insertItem(leafItem);
               }
               else {
                   // get the child node pointed to in the node item
                   if(isLowToHigh)
                        childOffset =  lbdis.readLong();
                   else
                        childOffset =  bdis.readLong();


                   childNode = readBPTreeNode(this.fis, childOffset, thisNode, isLowToHigh);


                   // insert child node item 
                   BPTreeChildNodeItem childItem = new BPTreeChildNodeItem(item, key, childNode);
                   thisNode.insertItem(childItem);
                }


                 fileOffset += itemSize;
           }


        }catch(IOException ex) {
           log.error("Error reading B+ tree node " + ex);
           throw new RuntimeException("Error reading B+ tree node \n ", ex);
        }


        // success: return node
        return thisNode;
   }




}
// Source Code of org.broad.igv.bbfile.BPTree

// Related Classes of org.broad.igv.bbfile.BPTree