/*
* Copyright (C) 2006 http://www.chaidb.org
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
package org.chaidb.db.index.btree;
import org.apache.log4j.Logger;
import org.chaidb.db.KernelContext;
import org.chaidb.db.api.KeyFactory;
import org.chaidb.db.exception.ChaiDBException;
import org.chaidb.db.exception.ErrorCode;
import org.chaidb.db.helper.ByteTool;
import org.chaidb.db.index.Key;
import org.chaidb.db.index.btree.bufmgr.PageBufferManager;
import org.chaidb.db.index.btree.bufmgr.PageNumber;
import org.chaidb.db.log.logrecord.BTreeAddRemoveLogRecord;
import org.chaidb.db.log.logrecord.BTreeFreeOverflowPageLogRecord;
import org.chaidb.db.log.logrecord.BTreeReplLogRecord;
import java.util.Vector;
/**
* For BTree pages, nodes are either key/pgno pairs (internal pages)
* or key/data pairs (leaf pages).
* Each form is stored the same, so I can take a shortcut here and just
* use a single class for either one.
* <P>Each node has the following format:
* <pre>
* Size Description
* 4 Key Size
* 4 Page number this node points to: a BTreePage for internal nodes, a DataPage for leaf nodes
* 1 Flags: 0x01 for overflow data, 0x02 for overflow key
* 2 Offset, within the DataPage, of the data node that a leaf node points to,
* or the space allocated for data nodes.
* n Key/Data
* </pre>
*/
final public class BTreeNode {
/**
 * Logger of this class. It is named after BTreeNode itself so that messages
 * emitted here are attributed to the right category.
 */
private static final Logger logger = Logger.getLogger(BTreeNode.class);
/**
 * Largest accepted key size in bytes; bigger values read from a page are
 * rejected so that a corrupted size cannot trigger a huge allocation.
 */
private static final int MAX_KEY_SIZE = 1048576; //1M
/**
 * Smallest accepted key size in bytes
 */
private static final int MIN_KEY_SIZE = 1;
/**
 * Origin page of this node
 */
private BTreePage page;
/**
 * Location of this node on that page
 */
private int nodeOffset;
/**
 * Location in DataPage of data node pointed by this leaf node.
 * Meaningless for internal nodes
 */
private short dataNodeOffset;
/**
 * Length of key
 */
private int keySize;
/**
 * Page number this node points to; it may be either a btree page or a data page
 */
private PageNumber pageNumber;
/**
 * Bit flags: 0x01 if data is on an overflow page, 0x02 if the key is on overflow pages
 */
private byte flags;
/**
 * @return true when flag bit 0x02 (overflow key) is set
 */
boolean isOverflowKey() {
    return (this.flags & 0x02) == 0x02;
}
/**
 * @return true when flag bit 0x01 (overflow data) is set
 */
boolean isOverflowData() {
    return (this.flags & 0x01) == 0x01;
}
/**
 * @return offset of this node inside its page
 */
int getNodeOffset() {
    return this.nodeOffset;
}
/**
 * @return the stored data node offset, widened to int
 */
public int getDataNodeOffset() {
    return this.dataNodeOffset;
}
/**
 * @param dataNodeOff new data node offset to remember in this node object
 */
void setDataNodeOffset(short dataNodeOff) {
    this.dataNodeOffset = dataNodeOff;
}
/**
 * @return the page this node belongs to
 */
public BTreePage getBTreePage() {
    return this.page;
}
/**
 * Stamps the given tree id onto the page number held by this node.
 */
void setTreeId(int treeid) {
    pageNumber.setTreeId(treeid);
}
/**
 * Replaces the whole flag byte of this node object.
 */
void setFlags(byte flags) {
    this.flags = flags;
}
/**
 * @return the page number this node points to
 */
public PageNumber getPageNumber() {
    return this.pageNumber;
}
/**
 * Points this node at a freshly created PageNumber built from the raw value.
 */
void setPageNumber(int _pageNumber) {
    PageNumber replacement = new PageNumber(_pageNumber);
    this.pageNumber = replacement;
}
/**
 * @return whatever the owning page reports from isLeaf()
 */
boolean isLeafNode() {
    return this.page.isLeaf();
}
/**
 * Reads the node header stored at the given offset of the page: key size,
 * page number, flags and data node offset.
 *
 * @param page   the page where this node stays
 * @param offset the offset of the node in the page
 * @throws ChaiDBException with BTREE_INVALID_BTREEPAGE when the header does
 *                         not fit inside the page at that offset, or when
 *                         the key size or the flags read from the page are
 *                         out of range
 */
public BTreeNode(BTreePage page, int offset) throws ChaiDBException {
    this.page = page;
    nodeOffset = offset;
    /* need read the node from the page */
    byte[] rawPage = this.page.getPage();
    // The whole header must lie inside the page. Checking only
    // "offset > length" let negative offsets and headers running past the
    // end through, which surfaced as ArrayIndexOutOfBoundsException below.
    if (nodeOffset < 0 || nodeOffset > rawPage.length - BTreeSpec.NODE_HEADER_SIZE) {
        throw new ChaiDBException(ErrorCode.BTREE_INVALID_BTREEPAGE, "offset of btree node is out of bound:" + offset);
    }
    keySize = ByteTool.bytesToInt(rawPage, nodeOffset + BTreeSpec.NODE_OFF_KEYSIZE, this.page.btreeSpec.isMsbFirst());
    /* to prevent OOM */
    if (keySize > MAX_KEY_SIZE || keySize < MIN_KEY_SIZE) {
        throw new ChaiDBException(ErrorCode.BTREE_INVALID_BTREEPAGE, "keysize is out of bound:" + keySize);
    }
    pageNumber = new PageNumber(ByteTool.bytesToInt(rawPage, nodeOffset + BTreeSpec.NODE_OFF_PAGENUMBER, this.page.btreeSpec.isMsbFirst()));
    pageNumber.setTreeId(page.btreeSpec.btree.getBtreeId());
    flags = rawPage[nodeOffset + BTreeSpec.NODE_OFF_FLAGS];
    // only the two low bits (overflow data / overflow key) are valid
    if (flags < 0 || flags > 3) {
        throw new ChaiDBException(ErrorCode.BTREE_INVALID_BTREEPAGE, "flags is out of bound:" + flags);
    }
    // read for every node; the value is only meaningful for leaf nodes
    dataNodeOffset = ByteTool.bytesToShort(rawPage, nodeOffset + BTreeSpec.NODE_OFF_DATANODEOFFSET, this.page.btreeSpec.isMsbFirst());
}
/**
 * Creates a blank node object; used only by the static factory in this class.
 */
private BTreeNode() {
}
/**
 * Builds an in-memory node object for the given key and page number.
 * Nothing is read from or written to the page here. The flags start at 0
 * and the data node offset keeps its default value.
 *
 * @param page        the page the node belongs to
 * @param offset      the offset of the node in that page
 * @param key         the key; only its length is recorded
 * @param dataPageNum the page number the node points to (copied)
 * @throws ChaiDBException with BTREE_INVALID_BTREEPAGE when the key length is
 *                         outside the accepted range
 */
public static BTreeNode createNewBTreeNode(BTreePage page, int offset, byte[] key, PageNumber dataPageNum) throws ChaiDBException {
    final int length = key.length;
    if (length < MIN_KEY_SIZE || length > MAX_KEY_SIZE) {
        throw new ChaiDBException(ErrorCode.BTREE_INVALID_BTREEPAGE, "keysize is out of bound:" + length);
    }
    BTreeNode created = new BTreeNode();
    created.page = page;
    created.nodeOffset = offset;
    created.keySize = length;
    created.pageNumber = new PageNumber(dataPageNum);
    created.flags = 0;
    return created;
}
/*
public BTreeNode(BTreePage page, int offset, byte[] key, PageNumber pageNumber) {
this.page = page;
this.nodeOffset = offset;
keySize = key.length;
this.pageNumber = new PageNumber(pageNumber);
this.flags = 0;
}
public BTreeNode(BTreePage page, int offset, byte[] key, byte[] data) {
this.page = page;
this.nodeOffset = offset;
keySize = key.length;
pageNumber = new PageNumber(ByteTool.bytesToInt(data, 0, page.btreeSpec.isMsbFirst()));
pageNumber.setTreeId(page.btreeSpec.btree.getBtreeId());
flags = 0;
}
*/
/**
 * Serializes the header of this node into a new array of
 * BTreeSpec.NODE_HEADER_SIZE bytes: key size, page number, flags and, for
 * nodes on a leaf page only, the data node offset.
 *
 * @return the freshly built header bytes
 */
public byte[] getHeader() {
    final byte[] header = new byte[BTreeSpec.NODE_HEADER_SIZE];
    final byte[] keySizeBytes = ByteTool.intToBytes(keySize);
    System.arraycopy(keySizeBytes, 0, header, BTreeSpec.NODE_OFF_KEYSIZE, 4);
    // NOTE(review): the original author flagged this line as possibly needing a rewrite - confirm
    final byte[] pageNumberBytes = ByteTool.intToBytes(pageNumber.getPageNumber());
    System.arraycopy(pageNumberBytes, 0, header, BTreeSpec.NODE_OFF_PAGENUMBER, 4);
    header[BTreeSpec.NODE_OFF_FLAGS] = flags;
    if (isLeafNode()) {
        final byte[] dataOffsetBytes = ByteTool.shortToBytes(dataNodeOffset);
        System.arraycopy(dataOffsetBytes, 0, header, BTreeSpec.NODE_OFF_DATANODEOFFSET, 2);
    }
    return header;
}
/**
 * Fetch key from page to a new copy. A key stored inline is copied straight
 * from the node; an overflow key is gathered from its chain of overflow
 * pages.
 *
 * @return A new copy of key in this node
 * @throws ChaiDBException with BTREE_BUFFER_OVERFLOW when copying from an
 *                         overflow page fails, or DATA_LENGTH_NOT_MATCH when
 *                         the overflow chain supplies fewer bytes than the
 *                         recorded key size
 */
// did NOT fix / unfix the current page ###
public Key getKey() throws ChaiDBException {
    //TODO Check cached Key.
    byte[] key = new byte[keySize];
    if (isOverflowKey()) {
        readOverflowKey(key);
    } else {
        System.arraycopy(page.getPage(), nodeOffset + BTreeSpec.NODE_HEADER_SIZE, key, 0, keySize);
    }
    return KeyFactory.createInstance((int) page.getKeyType(), key);
}

/**
 * Fills key with the bytes stored on the overflow page chain whose first
 * page number is kept right behind the node header. Every overflow page
 * obtained here is released again, on the failure paths as well.
 */
private void readOverflowKey(byte[] key) throws ChaiDBException {
    final int pageSize = page.btreeSpec.getPageSize();
    PageNumber overflowPageNumber = new PageNumber(ByteTool.bytesToInt(page.getPage(), nodeOffset + BTreeSpec.NODE_HEADER_SIZE, page.btreeSpec.isMsbFirst()));
    BTreePage overflowPage = new BTreePage(pageNumber.getTreeId(), overflowPageNumber, page.btreeSpec, page.buffer); //pageNumber must have valid treeid
    // true while the page currently referenced by overflowPage still has to be released
    boolean overflowPageFixed = true;
    int keyLeft = keySize;
    int keyStartPos = 0;
    // the payload of an overflow page lies between upperBound and the end of the page
    int keyLength = Math.min(keyLeft, pageSize - overflowPage.upperBound);
    while (keyLeft > 0 && keyLength > 0) {
        try {
            System.arraycopy(overflowPage.getPage(), overflowPage.upperBound, key, keyStartPos, keyLength);
        } catch (Exception e) {
            logger.error("nodeoffset=0x" + Integer.toHexString(nodeOffset) + " keysize=0x" + Integer.toHexString(keySize) + " node pg=" + page.pageNumber.toHexString() + " overflowpg=0x" + Integer.toHexString(overflowPage.pageNumber.getPageNumber()) + " of file " + PageBufferManager.getInstance().getBTreeName(overflowPage.pageNumber.getTreeId()));
            logger.error(e);
            Debug.pageHistory(page.pageNumber);
            String details = e.toString() + ". nodeoffset=0x" + Integer.toHexString(nodeOffset) + " keysize=0x" + Integer.toHexString(keySize) + " pg=0x" + Integer.toHexString(overflowPage.pageNumber.getPageNumber()) + " of file " + PageBufferManager.getInstance().getBTreeName(overflowPage.pageNumber.getTreeId()) + ".";
            // do not leave the overflow page behind unreleased when giving up
            releaseOverflowPageAfterFailure(overflowPage);
            throw new ChaiDBException(ErrorCode.BTREE_BUFFER_OVERFLOW, details);
        }
        // unfix / release the overflow page
        page.buffer.releasePage(pageNumber.getTreeId(), overflowPage.pageNumber, false);
        overflowPageFixed = false;
        keyStartPos += keyLength;
        keyLeft -= keyLength;
        if (keyLeft > 0 && overflowPage.nextPage.getPageNumber() > 0) {
            overflowPage = new BTreePage(pageNumber.getTreeId(), overflowPage.nextPage, page.btreeSpec, page.buffer);
            overflowPageFixed = true;
            keyLength = Math.min(keyLeft, pageSize - overflowPage.upperBound);
        } else if (keyLeft > 0 && overflowPage.nextPage.getPageNumber() <= 0) {
            // the chain ended although part of the key is still missing
            logger.error("[" + Thread.currentThread().getName() + "] time=" + System.currentTimeMillis() + " node offset=0x" + Integer.toHexString(getNodeOffset()) + " keyLeft=0x" + Integer.toHexString(keyLeft) + " keySize=0x" + Integer.toHexString(keySize) + " node pageNumber=0x" + Integer.toHexString(page.pageNumber.getPageNumber()) + " overflowpage=0x" + Integer.toHexString(overflowPage.pageNumber.getPageNumber()) + " of " + page.buffer.getBTreeName(overflowPage.pageNumber.getTreeId()));
            //page in mem!
            Debug.dumpPage(page.pageNumber, page.buffer.getPage(page.pageNumber.getTreeId(), page.pageNumber));
            //page we have got!
            Debug.dumpPage(page.pageNumber, ByteTool.append("Error??\r\n".getBytes(), page.getPage()));
            Debug.flushPages();
            // details -ranjeet
            String details = "Non-match key length in overflow page. The node page number is " + pageNumber.toString() + ".";
            throw new ChaiDBException(ErrorCode.DATA_LENGTH_NOT_MATCH, details);
        }
    }
    // The loop can stop on a page that offers no key bytes; that page was
    // obtained but never consumed, so it still has to be released.
    if (overflowPageFixed) {
        page.buffer.releasePage(pageNumber.getTreeId(), overflowPage.pageNumber, false);
    }
    if (keyLeft > 0) {
        // returning now would hand out a key whose tail is only zero padding
        String details = "Overflow page 0x" + Integer.toHexString(overflowPage.pageNumber.getPageNumber()) + " holds no key bytes while 0x" + Integer.toHexString(keyLeft) + " bytes of the key are still unread. The node page is " + page.pageNumber.toHexString() + ".";
        throw new ChaiDBException(ErrorCode.DATA_LENGTH_NOT_MATCH, details);
    }
}

/**
 * Releases an overflow page while an error is already being reported; a
 * failure of the release itself is only logged so that it cannot hide the
 * original problem.
 */
private void releaseOverflowPageAfterFailure(BTreePage overflowPage) {
    try {
        page.buffer.releasePage(pageNumber.getTreeId(), overflowPage.pageNumber, false);
    } catch (Exception releaseFailure) {
        logger.error(releaseFailure);
    }
}
/**
 * Fetch data from a leaf page to a new copy; guaranteed to be data.
 * The data page this node points to is opened, the data node at the stored
 * data node offset is read, and the data page is released again, also when
 * reading the data node fails.
 *
 * @param kContext kernel context of the caller (not used by this method)
 * @return A new copy of data in this node
 * @throws ChaiDBException rethrown unchanged after the node page and tree
 *                         name have been logged
 */
// did NOT fix / unfix the current page ###
public byte[] getData(KernelContext kContext) throws ChaiDBException {
    try {
        //lock the page pointed by this node
        DataPage dataPage = new DataPage(page.getBtreeSpec().btree.getBtreeId(), pageNumber, page.getBtreeSpec(), page.getBuffer());
        try {
            return new DataNode(dataPage, dataNodeOffset).getData();
        } finally {
            // release on every path; previously a failing read left the data page unreleased
            PageBufferManager buffer = page.getBuffer();
            buffer.releasePage(pageNumber.getTreeId(), pageNumber, false);
        }
    } catch (ChaiDBException e) {
        String details = page.pageNumber.toHexString() + " of " + page.getBtreeSpec().btree.getBTreeName();
        logger.error(details);
        throw e;
    }
}
/**
 * First time set up: writes this node into its page. The overflow-key flag
 * must already be set correctly before calling, because it decides how the
 * key is stored:
 * <ul>
 * <li>not overflow: header and key are written inline at the node offset;</li>
 * <li>overflow: the key is spread over newly allocated overflow pages and the
 * node stores the header followed by the 4-byte number of the first overflow
 * page.</li>
 * </ul>
 * When the kernel context asks for logging, log records are written before
 * the page bytes are changed.
 *
 * @param key      the key bytes to store
 * @param kContext supplies the locker (transaction) id and the need-log switch
 */
// did NOT unfix / fix the current page ###
public void setInternalNode(byte[] key, KernelContext kContext) throws ChaiDBException {
    int txnId = kContext.getLocker();
    boolean needLog = kContext.getNeedLog();
    if (!isOverflowKey()) {
        // add new node
        /******************* Add by leon, Sep 30 2001 *************/
        if (needLog) {
            int pgno = page.pageNumber.getPageNumber();
            byte[] newNode = ByteTool.copyByteArray(getHeader(), 0, BTreeSpec.NODE_HEADER_SIZE);
            newNode = ByteTool.append(newNode, key);
            byte[] oldNode = ByteTool.copyByteArray(page.getPage(), nodeOffset, BTreeSpec.NODE_HEADER_SIZE + keySize);
            byte[] emptyNode = new byte[BTreeSpec.NODE_HEADER_SIZE + keySize];
            boolean slotInUse = ByteTool.compare(oldNode, 0, emptyNode, 0, BTreeSpec.NODE_HEADER_SIZE + keySize) != 0;
            // compare contents: "oldNode != newNode" only compared references and was always true
            if (slotInUse && !sameBytes(oldNode, newNode)) {
                BTreeReplLogRecord lr = new BTreeReplLogRecord(page.getPageNumber().getTreeId(), pgno, txnId, nodeOffset, oldNode, newNode, page.btreeSpec.btree.getType());
                lr.log();
            }
        }
        /*************************************************************/
        System.arraycopy(this.getHeader(), 0, page.getPage(), nodeOffset, BTreeSpec.NODE_HEADER_SIZE);
        System.arraycopy(key, 0, page.getPage(), nodeOffset + BTreeSpec.NODE_HEADER_SIZE, keySize);
    } else {
        //=====>>> Build overflow pages
        // payload capacity of one overflow page
        int freeSpace = page.btreeSpec.getPageSize() - BTreeSpec.PAGE_HEADER_SIZE;
        // number of overflow pages needed (rounded up)
        int overflows = key.length / freeSpace + (key.length % freeSpace > 0 ? 1 : 0);
        int keyStartPos = 0;
        int keyLeft = key.length;
        DataPage last = null;
        PageNumber firstOverflowPageNo = null;
        for (int i = 0; i < overflows; i++) {
            if (keyLeft <= 0) break;
            // number of key bytes that go onto this overflow page
            int chunk = (keyLeft > freeSpace) ? freeSpace : keyLeft;
            DataPage overflowPage = DataPage.newPage(page.btreeSpec, page.buffer, true, kContext, 0);
            /********* add by leon *******
             * Here a little trick is played to reduce the log record number.
             * First we turn off the needLog switch to avoiding log individually.
             * After done, we can log in batch.
             */
            overflowPage.setLogInfo(txnId, false);
            if (needLog) {
                // build the page header as it will look after the setters below and log it once
                int pgno = overflowPage.getPageNumber().getPageNumber();
                byte[] oldV = ByteTool.copyByteArray(overflowPage.getPage(), 0, BTreeSpec.PAGE_HEADER_SIZE);
                byte[] newV = new byte[oldV.length];
                System.arraycopy(oldV, 0, newV, 0, oldV.length);
                System.arraycopy(ByteTool.intToBytes(4), 0, newV, BTreeSpec.OFF_FLAGS, 4);
                System.arraycopy(ByteTool.shortToBytes((short) (overflowPage.upperBound - chunk)), 0, newV, BTreeSpec.OFF_UPPERBOUND, 2);
                System.arraycopy(ByteTool.intToBytes(i == 0 ? page.pageNumber.getPageNumber() : last.pageNumber.getPageNumber()), 0, newV, BTreeSpec.OFF_PREVPAGE, 4);
                if (ByteTool.compare(newV, 0, oldV, 0, oldV.length) != 0) {
                    BTreeReplLogRecord lr = new BTreeReplLogRecord(overflowPage.getPageNumber().getTreeId(), pgno, txnId, 0, oldV, newV, overflowPage.btreeSpec.btree.getType());
                    lr.log();
                }
            }
            overflowPage.setOverflow();
            overflowPage.setUpperBound((short) (overflowPage.upperBound - chunk));
            if (i == 0) {
                firstOverflowPageNo = overflowPage.pageNumber;
                overflowPage.setPrevPage(page.pageNumber);
            } else {
                overflowPage.setPrevPage(last.pageNumber);
            }
            overflowPage.setLogInfo(txnId, needLog);
            //set the previous page's next page
            if (i > 0) last.setNextPage(overflowPage.pageNumber);
            //add the new node
            if (chunk > 0) {
                /******************* Add by leon, Sep 30 2001 *************/
                if (needLog) {
                    int newPageNo = overflowPage.pageNumber.getPageNumber();
                    BTreeAddRemoveLogRecord logRec = new BTreeAddRemoveLogRecord(page.getPageNumber().getTreeId(), newPageNo, txnId, BTreeAddRemoveLogRecord.ADD_FLAG, overflowPage.upperBound, ByteTool.copyByteArray(key, keyStartPos, chunk), page.btreeSpec.btree.getType());
                    logRec.log();
                }
                /*************************************************************/
                System.arraycopy(key, keyStartPos, overflowPage.page, overflowPage.upperBound, chunk);
                keyStartPos += chunk;
                keyLeft -= chunk;
            }
            //release the previous overflow page. Don't release the non-overflow page
            if (i > 0) page.buffer.releasePage(last.pageNumber.getTreeId(), last.pageNumber, true);
            last = overflowPage;
        }
        //release the last overflow page
        page.buffer.releasePage(last.pageNumber.getTreeId(), last.pageNumber, true);
        //Since newing a page will set its next,previous page to be BTreeSpec.INVALID_PGNO,
        //the last one has already been BTreeSpec.INVALID_PGNO and
        //the overflow page chain has a end!
        // add the new node to the current page
        /***************** Add by leon,2001-9-27 14:56 ********************/
        if (needLog) {
            int pgno = page.pageNumber.getPageNumber();
            // an overflow node occupies only the header plus a 4-byte page number
            int storedSize = BTreeSpec.NODE_HEADER_SIZE + 4;
            byte[] newNode = ByteTool.copyByteArray(getHeader(), 0, BTreeSpec.NODE_HEADER_SIZE);
            newNode = ByteTool.append(newNode, ByteTool.intToBytes(firstOverflowPageNo.getPageNumber()));
            byte[] oldNode = ByteTool.copyByteArray(page.getPage(), nodeOffset, storedSize);
            // compare exactly the bytes held in oldNode; using the full key
            // size here ran past the end of oldNode
            byte[] emptyNode = new byte[storedSize];
            if (ByteTool.compare(oldNode, 0, emptyNode, 0, storedSize) == 0) {
                BTreeAddRemoveLogRecord lr = new BTreeAddRemoveLogRecord(page.getPageNumber().getTreeId(), pgno, txnId, BTreeAddRemoveLogRecord.ADD_FLAG, nodeOffset, newNode, page.btreeSpec.btree.getType());
                lr.log();
            } else if (!sameBytes(oldNode, newNode)) {
                BTreeReplLogRecord lr = new BTreeReplLogRecord(page.getPageNumber().getTreeId(), pgno, txnId, nodeOffset, oldNode, newNode, page.btreeSpec.btree.getType());
                lr.log();
            }
        }
        /******************************************************************/
        System.arraycopy(this.getHeader(), 0, page.getPage(), nodeOffset, BTreeSpec.NODE_HEADER_SIZE);
        // the node stores the number of the FIRST overflow page of the chain
        System.arraycopy(ByteTool.intToBytes(firstOverflowPageNo.getPageNumber()), 0, page.getPage(), nodeOffset + BTreeSpec.NODE_HEADER_SIZE, 4);
    }
}

/**
 * Tells whether two byte arrays have the same length and the same content.
 */
private static boolean sameBytes(byte[] first, byte[] second) {
    return first.length == second.length && ByteTool.compare(first, 0, second, 0, first.length) == 0;
}
/**
 * For internal node, replace the old key with new key.
 * <ol>
 * <li>fixes the page of this node through the buffer manager;</li>
 * <li>if the old key is an overflow key, puts every page of its overflow
 * chain on the free list (logging each one when logging is on);</li>
 * <li>sets the overflow-key flag when header plus new key exceed the
 * internal node size of the tree spec, clears the flags otherwise;</li>
 * <li>writes the node with the new key and releases the page as modified.</li>
 * </ol>
 * The page is released again even when one of the steps fails.
 *
 * @param newKey   the key that replaces the current one
 * @param kContext supplies the locker (transaction) id and the need-log switch
 * @throws ChaiDBException when the page cannot be obtained or a later step fails
 */
public void internalReplaceKey(byte[] newKey, KernelContext kContext) throws ChaiDBException {
    int txnId = kContext.getLocker();
    boolean needLog = kContext.getNeedLog();
    // becomes true once the page was obtained and therefore has to be released
    boolean pageFixed = false;
    try {
        // fix the current page
        page.page = page.buffer.getPage(pageNumber.getTreeId(), page.pageNumber);
        if (page.page == null)
            throw new ChaiDBException(ErrorCode.BTREE_INVALID_DATAPAGE, "Page is null: " + page.pageNumber.toHexString() + " of " + page.getPageNumber().getTreeId());
        pageFixed = true;
        // 1. remove old key and free the overflow pages if true
        if (isOverflowKey()) {
            PageNumber overflowPageNumber = new PageNumber(ByteTool.bytesToInt(page.page, nodeOffset + BTreeSpec.NODE_HEADER_SIZE, page.btreeSpec.isMsbFirst()));
            overflowPageNumber.setTreeId(pageNumber.getTreeId());
            BTreePage overflowPage = new BTreePage(pageNumber.getTreeId(), overflowPageNumber, page.btreeSpec, page.buffer); //pageNumber must have current treeid
            freeOverflowPage(overflowPage, txnId, needLog);
            while (overflowPage.nextPage.getPageNumber() > 0) {
                overflowPage = new BTreePage(pageNumber.getTreeId(), overflowPage.nextPage, page.btreeSpec, page.buffer); //pageNumber must have current treeid
                freeOverflowPage(overflowPage, txnId, needLog);
            }
        }
        // 2. decide whether newKey needs overflow pages
        keySize = newKey.length;
        int internalNodeSize = page.btreeSpec.getInternalNodeSize();
        if (BTreeSpec.NODE_HEADER_SIZE + newKey.length > internalNodeSize) {
            // set overflow key
            setFlags((byte) 2);
        } else {
            // not overflow
            setFlags((byte) 0);
        }
        setInternalNode(newKey, kContext);
    } finally {
        // unfix the current page, on the failure path too, so it is not left
        // fixed forever. It is released as modified in both cases, which is
        // the conservative choice.
        if (pageFixed) {
            page.buffer.releasePage(pageNumber.getTreeId(), page.pageNumber, true);
        }
    }
}

/**
 * Puts one overflow page on the free list; when logging is on, a free
 * overflow page log record holding the page header and the bytes from its
 * upper bound onwards is written first.
 */
private void freeOverflowPage(BTreePage overflowPage, int txnId, boolean needLog) throws ChaiDBException {
    /******************* Add by Leon, Sep 29 *****************/
    if (needLog) {
        int pgno = overflowPage.pageNumber.getPageNumber();
        short upBound = overflowPage.upperBound;
        short lowBound = (short) BTreeSpec.PAGE_HEADER_SIZE;
        // NOTE(review): uses the constant BTreeSpec.PAGE_SIZE while other code in this class
        // uses btreeSpec.getPageSize() - confirm both always agree
        BTreeFreeOverflowPageLogRecord lr = new BTreeFreeOverflowPageLogRecord(overflowPage.getPageNumber().getTreeId(), pgno, txnId, ByteTool.copyByteArray(overflowPage.page, 0, lowBound), ByteTool.copyByteArray(overflowPage.page, upBound, BTreeSpec.PAGE_SIZE - upBound), overflowPage.btreeSpec.btree.getType());
        lr.log();
    }
    /***************************/
    // put into freeList
    page.buffer.addToFreeList(pageNumber.getTreeId(), overflowPage.pageNumber, needLog ? new Integer(txnId) : null);
}
}