/*
* @(#)$Id: BTree.java 3619 2008-03-26 07:23:03Z yui $
*
* Copyright 2006-2008 Makoto YUI
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Contributors:
* Makoto YUI - ported from Apache Xindice and various modifications are made
*/
/*
* Copyright 1999-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package xbird.storage.index;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.util.Arrays;
import javax.annotation.CheckForNull;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import xbird.config.Settings;
import xbird.storage.DbException;
import xbird.storage.indexer.BasicIndexQuery;
import xbird.storage.indexer.IndexQuery;
import xbird.util.codec.VariableByteCodec;
import xbird.util.collections.longs.PurgeOptObservableLongLRUMap;
import xbird.util.collections.longs.LongHash.BucketEntry;
import xbird.util.collections.longs.LongHash.Cleaner;
import xbird.util.io.FastMultiByteArrayOutputStream;
import xbird.util.lang.ArrayUtils;
import xbird.util.primitive.Primitives;
/**
* BTree represents a Variable Magnitude Simple-Prefix B+Tree File.
* <DIV lang="en"></DIV>
* <DIV lang="ja"></DIV>
*
* @author Makoto YUI (yuin405+xbird@gmail.com)
*/
public class BTree extends Paged {
/** Logger shared by the tree and its nodes. */
private static final Log LOG = LogFactory.getLog(BTree.class);
/** If page size is 4k, 16m (4k * 4096) cache */
public static final int DEFAULT_IN_MEMORY_NODES;
/** Purge unit handed to the node cache at construction time. */
private static final int BTREE_NODECACHE_PURGE_UNIT;
static {
// Both values are read from Settings and fall back to the literal default given here.
DEFAULT_IN_MEMORY_NODES = Primitives.parseInt(Settings.get("xbird.storage.index.btree.nodecache_size"), 4096); // 16m
// NOTE(review): the key says "bfile" although this is the B-tree cache -- possibly a copy/paste; confirm.
BTREE_NODECACHE_PURGE_UNIT = Primitives.parseInt(Settings.get("xbird.storage.index.bfile.nodecache_purgeunit"), 8); // 32k
}
/** Returned by the lookup/remove methods when the key is not present in a leaf. */
public static final int KEY_NOT_FOUND = -1;
/** A node is never split while it would hold fewer than this many keys (see needSplit). */
private static final int LEAST_KEYS = 5;
private static final byte[] EmptyBytes = new byte[0];
/** Shared zero-length value, used as the prefix of nodes without a common prefix. */
private static final Value EmptyValue = new Value(EmptyBytes);
/** Page status marking a leaf node. */
protected static final byte LEAF = 1;
/** Page status marking a branch (inner) node. */
protected static final byte BRANCH = 2;
/**
* Cache of the recently used tree nodes, keyed by page number.
*
* The map is created with a {@link Synchronizer} cleaner which writes dirty nodes back;
* presumably the map invokes it when entries are purged (TODO confirm against the map implementation).
* Most accesses in this file are wrapped in {@code synchronized(_cache)}.
*/
private final PurgeOptObservableLongLRUMap<BTreeNode> _cache;
/** Cache capacity (number of nodes) requested at construction time. */
private final int numNodeCaches;
/** File header of this tree; carries the root page number and the duplicate-allowed flag. */
private final BTreeFileHeader _fileHeader;
/** Root descriptor; assigned in open() / create(). */
private BTreeRootInfo _rootInfo;
/** Root node; assigned in open() / create(). */
private BTreeNode _rootNode;
/**
* Creates a tree over the given file that allows duplicate keys.
*
* @param file backing file
*/
public BTree(File file) {
this(file, true);
}
/**
* Creates a tree over the given file using {@code DEFAULT_PAGESIZE} and
* {@link #DEFAULT_IN_MEMORY_NODES} cached nodes.
*
* @param file backing file
* @param duplicateAllowed whether the same key may be stored more than once
*/
public BTree(File file, boolean duplicateAllowed) {
this(file, DEFAULT_PAGESIZE, DEFAULT_IN_MEMORY_NODES, duplicateAllowed);
}
/**
 * Creates a tree over the given file.
 * The file itself is not touched here; call {@link #init(boolean)}, {@link #open()}
 * or {@link #create(boolean)} afterwards.
 *
 * @param file backing file
 * @param pageSize page size handed to the superclass
 * @param caches capacity of the node cache
 * @param duplicateAllowed whether the same key may be stored more than once
 */
public BTree(File file, int pageSize, int caches, boolean duplicateAllowed) {
    super(file, pageSize);
    final BTreeFileHeader header = getFileHeader();
    header.incrTotalPageCount(); // for root page
    header._duplicateAllowed = duplicateAllowed;
    this._fileHeader = header;
    this.numNodeCaches = caches;
    // dirty nodes leaving the cache are written back by the Synchronizer
    this._cache = new PurgeOptObservableLongLRUMap<BTreeNode>(caches, BTREE_NODECACHE_PURGE_UNIT, new Synchronizer());
}
/**
 * Prepares the backing file: creates it when it does not exist yet, otherwise opens it.
 *
 * @param bulkload not consulted by this method
 * @throws DbException propagated from {@code create} / {@code open}
 * @throws IllegalStateException if {@code create} reports failure
 */
public void init(boolean bulkload) throws DbException {
    if(exists()) {
        open();
        return;
    }
    if(!create(false)) {
        throw new IllegalStateException("create B+Tree file failed: "
                + _file.getAbsolutePath());
    }
}
/**
 * Adjusts how many nodes the cache purges at once.
 * <p>
 * When enabled, the purge unit becomes {@code nodeCachePurgePerc} of the cache capacity
 * (at least one node). The previous implementation took the maximum of that figure and
 * the capacity itself, so the percentage never had any effect.
 * When disabled, the purge unit is set to the full cache capacity, as before.
 * NOTE(review): the constructor starts with BTREE_NODECACHE_PURGE_UNIT; confirm that
 * "disabled" is really meant to use the whole capacity rather than that default.
 *
 * @param enable whether bulk-loading mode is switched on
 * @param nodeCachePurgePerc fraction in (0, 1]; only validated and used when {@code enable} is true
 * @throws IllegalArgumentException if enabling with a fraction outside (0, 1] (NaN included)
 */
public void setBulkloading(boolean enable, float nodeCachePurgePerc) {
    if(!enable) {
        _cache.setPurgeUnits(numNodeCaches);
        return;
    }
    // written as a negated range test so that NaN is rejected as well
    if(!(nodeCachePurgePerc > 0 && nodeCachePurgePerc <= 1)) {
        throw new IllegalArgumentException("nodeCachePurgePerc is illegal as percentage: "
                + nodeCachePurgePerc);
    }
    final int units = Math.max((int) (numNodeCaches * nodeCachePurgePerc), 1);
    _cache.setPurgeUnits(units);
}
/**
 * Cache cleaner that writes a dirty node back before it is dropped.
 * Checked failures are rethrown as {@link IllegalStateException}.
 */
private static final class Synchronizer implements Cleaner<BTreeNode> {
    Synchronizer() {}

    public void cleanup(long key, BTreeNode node) {
        if(node.dirty) {
            try {
                node.write();
            } catch (IOException ioe) {
                throw new IllegalStateException(ioe);
            } catch (DbException dbe) {
                throw new IllegalStateException(dbe);
            }
        }
    }
}
/**
 * Opens the underlying file and loads the root node recorded in the file header.
 *
 * @return {@code false} if the superclass could not open the file, {@code true} otherwise
 */
@Override
public boolean open() throws DbException {
    if(!super.open()) {
        return false;
    }
    final long rootPage = _fileHeader.getRootPage();
    this._rootInfo = new BTreeRootInfo(rootPage);
    this._rootNode = getBTreeNode(_rootInfo, rootPage, null);
    return true;
}
/**
 * Creates the underlying file and writes an empty leaf as the root node.
 *
 * @param close whether to close the file again once the root has been written
 * @return {@code false} if the superclass could not create the file, {@code true} otherwise
 * @throws DbException on failure, including an I/O error while writing the root
 */
@Override
public boolean create(boolean close) throws DbException {
    if(!super.create(false)) {
        return false;
    }
    // Don't call this.open() as it will try to read rootNode from the disk
    super.open();

    // Initialize root node
    final long rootPage = _fileHeader.getRootPage();
    this._rootInfo = new BTreeRootInfo(rootPage);
    this._rootNode = new BTreeNode(_rootInfo, getPage(rootPage), null);
    _rootNode.ph.setStatus(LEAF);
    _rootNode.set(new Value[0], new long[0]);
    try {
        _rootNode.write();
    } catch (IOException ioe) {
        throw new DbException(ioe);
    }
    synchronized(_cache) {
        _cache.put(_rootNode.page.getPageNum(), _rootNode);
    }
    if(close) {
        close();
    }
    return true;
}
/** @return the duplicate-allowed flag stored in the file header */
protected final boolean isDuplicateAllowed() {
    final boolean allowed = _fileHeader._duplicateAllowed;
    return allowed;
}
/**
 * Adds a Value to the BTree and associates a pointer with it.
 * The pointer is an opaque long; the tree does not interpret it.
 *
 * @param value The Value to add
 * @param pointer The pointer to associate with it
 * @return the pointer of an already stored equal key, or -1 if the key was new
 * @throws DbException on failure; an {@link IOException} from the node is wrapped
 */
public synchronized long addValue(Value value, long pointer) throws DbException {
    final long previous;
    try {
        previous = _rootNode.addValue(value, pointer);
    } catch (IOException ioe) {
        throw new DbException(ioe);
    }
    return previous;
}
/**
 * Removes a Value from the BTree and returns the pointer that was stored with it.
 *
 * @param value The Value to remove
 * @return The pointer that was associated with it, or {@link #KEY_NOT_FOUND}
 * @throws DbException on failure; an {@link IOException} from the node is wrapped
 */
public synchronized long removeValue(Value value) throws DbException {
    final long removed;
    try {
        removed = _rootNode.removeValue(value);
    } catch (IOException ioe) {
        throw new DbException(ioe);
    }
    return removed;
}
/**
 * Removes the entries for {@code value} whose pointer equals {@code pointer}.
 *
 * @param value The Value to remove
 * @param pointer only entries carrying this pointer are removed
 * @return the removed pointers; empty if nothing matched
 * @throws DbException on failure; an {@link IOException} from the node is wrapped
 */
public synchronized long[] removeValue(Value value, long pointer) throws DbException {
    final long[] removed;
    try {
        removed = _rootNode.removeValue(value, pointer);
    } catch (IOException ioe) {
        throw new DbException(ioe);
    }
    return removed;
}
/**
 * Looks a Value up in the BTree, starting at the root node.
 *
 * @param value The Value to find
 * @return The pointer that was associated with it, or {@link #KEY_NOT_FOUND}
 */
public synchronized long findValue(Value value) throws DbException {
    final BTreeNode start = _rootNode;
    return start.findValue(value);
}
/**
 * How a leaf is located when descending from the root.
 * In this file the *_MOST variants are requested without a key, the other two with one.
 */
public enum SearchType {
    /** first leaf of the tree */
    LEFT_MOST,
    /** leaf for a key, left-biased (normal) */
    LEFT,
    /** leaf for a key, right-biased */
    RIGHT,
    /** last leaf of the tree */
    RIGHT_MOST
}
/**
 * Runs a query against the BTree and reports every matching entry to the callback.
 * <p>
 * The operator decides which chain of leaves is scanned; the per-leaf filtering is
 * done by the leaf scan itself.
 * <p>
 * For the negated operators (NE, NBW, NOT_IN, NOT_START_WITH, NBWX) two ranges are
 * scanned: leftmost..left and right..rightmost. When the left and right boundary
 * leaves are the same page, that page is already covered by the first range, so the
 * second range starts at its successor. Previously those trailing leaves were skipped.
 *
 * @param query The IndexQuery to use
 * @param callback The callback instance
 * @throws IllegalArgumentException if {@code query} is null
 * @throws IllegalStateException if the leaf chain is found to be broken
 * @throws DbException on failure; an {@link IOException} is wrapped
 */
public synchronized void search(IndexQuery query, CallbackHandler callback) throws DbException {
    if(query == null) {
        throw new IllegalArgumentException();
    }
    final BTreeNode root = _rootNode;
    final Value[] keys = query.getOperands();
    final int op = query.getOperator();
    try {
        switch(op) {
            case BasicIndexQuery.EQ: {
                if(isDuplicateAllowed()) {
                    BTreeNode left = root.getLeafNode(SearchType.LEFT, keys[0]);
                    BTreeNode right = root.getLeafNode(SearchType.RIGHT, keys[0]);
                    scanRange(left, right, query, callback);
                } else {
                    BTreeNode left = root.getLeafNode(SearchType.LEFT, keys[0]);
                    left.scanLeaf(query, callback, true);
                }
                break;
            }
            case BasicIndexQuery.GT:
            case BasicIndexQuery.GE: {
                BTreeNode right = root.getLeafNode(SearchType.LEFT, keys[keys.length - 1]);
                BTreeNode rightmost = root.getLeafNode(SearchType.RIGHT_MOST, null);
                scanRange(right, rightmost, query, callback);
                break;
            }
            case BasicIndexQuery.LE:
            case BasicIndexQuery.LT: {
                BTreeNode left = root.getLeafNode(SearchType.LEFT, keys[0]);
                BTreeNode leftmost = root.getLeafNode(SearchType.LEFT_MOST, null);
                scanRange(leftmost, left, query, callback);
                break;
            }
            case BasicIndexQuery.NE:
            case BasicIndexQuery.NBW:
            case BasicIndexQuery.NOT_IN:
            case BasicIndexQuery.NOT_START_WITH:
            case BasicIndexQuery.NBWX: {
                BTreeNode leftmost = root.getLeafNode(SearchType.LEFT_MOST, null);
                BTreeNode left = root.getLeafNode(SearchType.LEFT, keys[0]);
                BTreeNode rightmost = root.getLeafNode(SearchType.RIGHT_MOST, null);
                BTreeNode right = root.getLeafNode(SearchType.RIGHT, keys[keys.length - 1]);
                scanRange(leftmost, left, query, callback);
                final long lp = left.page.getPageNum();
                final long rp = right.page.getPageNum();
                if(lp != rp) {
                    scanRange(right, rightmost, query, callback);
                } else if(rp != rightmost.page.getPageNum()) {
                    // left == right was scanned above; continue with the leaves behind it
                    final long next = right._next;
                    if(next == -1L || next == rp) {
                        throw new IllegalStateException("broken leaf link at page#" + rp);
                    }
                    scanRange(getBTreeNode(_rootInfo, next, null), rightmost, query, callback);
                }
                break;
            }
            case BasicIndexQuery.BW:
            case BasicIndexQuery.START_WITH:
            case BasicIndexQuery.IN:
            case BasicIndexQuery.BWX: {
                BTreeNode left = root.getLeafNode(SearchType.LEFT, keys[0]);
                BTreeNode right = root.getLeafNode(SearchType.RIGHT, keys[keys.length - 1]);
                scanRange(left, right, query, callback);
                break;
            }
            default: {
                // any other operator: full scan over the leaf chain
                BTreeNode leftmost = root.getLeafNode(SearchType.LEFT_MOST, null);
                BTreeNode rightmost = root.getLeafNode(SearchType.RIGHT_MOST, null);
                scanRange(leftmost, rightmost, query, callback);
                break;
            }
        }
    } catch (IOException e) {
        throw new DbException(e);
    }
}
/**
 * Scans the chain of leaves from {@code left} to {@code right} (both inclusive),
 * following the next-links. The first and the last leaf are scanned as "edge" leaves.
 *
 * @throws IllegalStateException if a leaf links to itself or the chain ends before {@code right}
 */
private final void scanRange(BTreeNode left, BTreeNode right, IndexQuery query, CallbackHandler callback)
        throws DbException {
    final long rightmostPageNum = right.page.getPageNum();
    if(LOG.isDebugEnabled()) {
        LOG.debug("scan range [" + left.page.getPageNum() + ", " + rightmostPageNum + "] start");
    }
    int scaned = 0;
    BTreeNode cur = left;
    for(;;) {
        final long curPageNum = cur.page.getPageNum();
        final boolean isLast = (curPageNum == rightmostPageNum);
        cur.scanLeaf(query, callback, isLast || scaned == 0);
        ++scaned;
        if(isLast) {
            break;
        }
        final long next = cur._next;
        if(next == curPageNum) {
            throw new IllegalStateException("detected a cyclic link at page#" + curPageNum);
        }
        if(next == -1L) {
            throw new IllegalStateException("range scan failed... bug?");
        }
        cur = getBTreeNode(_rootInfo, next, null);
    }
    if(LOG.isDebugEnabled()) {
        LOG.debug("scan range end. total scaned pages: " + scaned);
    }
}
/** Factory for the file header type used by this tree. */
protected FileHeader createFileHeader(int pageSize) {
    final BTreeFileHeader header = new BTreeFileHeader(pageSize);
    return header;
}
/** Factory for the page header type used by this tree. */
protected PageHeader createPageHeader() {
    final BTreePageHeader header = new BTreePageHeader();
    return header;
}
/** @return the superclass' file header, narrowed to {@code BTreeFileHeader} */
@Override
protected BTreeFileHeader getFileHeader() {
    final BTreeFileHeader header = (BTreeFileHeader) super.getFileHeader();
    return header;
}
/**
 * Retrieves the BTree node for the specified root object.
 * The tree's own root is answered from the field; any other root is fetched by page.
 *
 * @param root The root object to retrieve with
 * @return The root node
 */
protected final BTreeNode getRootNode(BTreeRootInfo root) throws DbException {
    final boolean isOwnRoot = (root.page == _rootInfo.page);
    return isOwnRoot ? _rootNode : getBTreeNode(root, root.getPage(), null);
}
/**
 * Returns the node stored at {@code page}, reading it into the cache on a miss.
 * On a cache hit a non-null {@code parent} is recorded on the cached node.
 *
 * @throws DbException if the page cannot be read
 */
private final BTreeNode getBTreeNode(BTreeRootInfo root, long page, BTreeNode parent)
        throws DbException {
    synchronized(_cache) {
        final BTreeNode cached = _cache.get(page);
        if(cached != null) {
            if(parent != null) {
                cached.setParent(parent);
            }
            return cached;
        }
        final BTreeNode fresh = new BTreeNode(root, getPage(page), parent);
        try {
            fresh.read();
        } catch (IOException ioe) {
            throw new DbException("failed to read page#" + page, ioe);
        }
        if(LOG.isDebugEnabled()) {
            LOG.debug("read node page#" + page + ", keys: " + fresh.keys.length);
        }
        _cache.put(page, fresh);
        return fresh;
    }
}
/**
 * Returns the node stored at {@code page}, reading it into the cache on a miss.
 * Unlike the three-argument variant, no parent is passed or recorded.
 *
 * @throws DbException if the page cannot be read
 */
private final BTreeNode getBTreeNode(BTreeRootInfo root, long page) throws DbException {
    synchronized(_cache) {
        final BTreeNode cached = _cache.get(page);
        if(cached != null) {
            return cached;
        }
        final BTreeNode fresh = new BTreeNode(root, getPage(page));
        try {
            fresh.read();
        } catch (IOException ioe) {
            throw new DbException("failed to read page#" + page, ioe);
        }
        if(LOG.isDebugEnabled()) {
            LOG.debug("read node page#" + page + ", keys: " + fresh.keys.length);
        }
        _cache.put(page, fresh);
        return fresh;
    }
}
/**
 * Allocates a free page, wraps it in a new node of the given status and caches it.
 * Keys and pointers are left unset; the caller is expected to call {@code set}.
 *
 * @param parent must not be null, despite the annotation
 * @throws IllegalArgumentException if {@code parent} is null
 */
private final BTreeNode createBTreeNode(BTreeRootInfo root, byte status, @CheckForNull BTreeNode parent)
        throws DbException {
    if(parent == null) {
        throw new IllegalArgumentException();
    }
    final Page freePage = getFreePage();
    final BTreeNode created = new BTreeNode(root, freePage, parent);
    created.ph.setStatus(status);
    synchronized(_cache) {
        _cache.put(freePage.getPageNum(), created);
    }
    return created;
}
/**
 * Flushes the tree to its file.
 *
 * @param purge when true, every cached node is asked to write itself (clean nodes are skipped by the node)
 * @param clear when true, the node cache is emptied afterwards
 * @throws DbException on failure; an {@link IOException} from a node is wrapped
 */
public synchronized void flush(boolean purge, boolean clear) throws DbException {
    if(purge) {
        try {
            for(BucketEntry<BTreeNode> entry : _cache) {
                final BTreeNode cached = entry.getValue();
                if(cached == null) {
                    continue;
                }
                cached.write();
            }
        } catch (IOException ioe) {
            throw new DbException(ioe);
        }
    }
    if(clear) {
        _cache.clear();
    }
    super.flush();
}
/** Immutable holder of a root page number. */
private static final class BTreeRootInfo {
    private final long page;

    private BTreeRootInfo(long rootPage) {
        this.page = rootPage;
    }

    /** @return the root page number */
    public long getPage() {
        return this.page;
    }
}
private final class BTreeNode implements Comparable<BTreeNode> {
/** Root descriptor this node was created with. */
private final BTreeRootInfo root;
/** Backing page of this node. */
private final Page page;
/** Header of {@link #page}, narrowed to the B-tree page header. */
private final BTreePageHeader ph;
// cached entry
/** Cached parent node; cleared by write() and re-resolved from ph.parentPage in getParent(). */
private BTreeNode parentCache;
/** Keys of this node, replaced as a whole by set(). */
private Value[] keys;
/** Pointers of this node; split() builds branch nodes with one more pointer than keys, leaves with as many. */
private long[] ptrs;
/** Page number of the next leaf, -1 if none; written and read for leaf nodes only. */
private long _next = -1;
private long _prev = -1; // internal entry for debugging
/** Common prefix of the keys as computed by set(); null until set() or read() ran. */
private Value prefix = null;
/** Set once read() has loaded this node from its page. */
private boolean loaded = false;
/** Cached data length of this node; -1 means "not calculated yet". */
private int currentDataLen = -1;
/** True when the in-memory state has changes that write() still has to store. */
private boolean dirty = false;
//--------------------------------------------
protected BTreeNode(final BTreeRootInfo root, final Page page, final BTreeNode parentNode) {
this.root = root;
this.page = page;
this.ph = (BTreePageHeader) page.getPageHeader();
ph.setParent(parentNode);
this.parentCache = parentNode;
}
protected BTreeNode(final BTreeRootInfo root, final Page page) {
this.root = root;
this.page = page;
this.ph = (BTreePageHeader) page.getPageHeader();
}
/**
 * Returns the parent node, resolving and caching it from the page header when needed.
 *
 * @return the parent, or null if the header records no parent page
 * @throws IllegalStateException if the parent page cannot be loaded
 */
private BTreeNode getParent() {
    if(parentCache == null) {
        final long parentPage = ph.parentPage;
        if(parentPage == Paged.NO_PAGE) {
            return null;
        }
        try {
            parentCache = getBTreeNode(_rootInfo, parentPage);
        } catch (DbException e) {
            throw new IllegalStateException("failed to get parent page #" + parentPage, e);
        }
    }
    return parentCache;
}
/**
 * Records {@code node} as the parent. Does nothing when the header already
 * names that page; otherwise the node becomes dirty.
 */
private void setParent(BTreeNode node) {
    final long newParentPage = node.page.getPageNum();
    if(newParentPage == ph.parentPage) {
        return;
    }
    ph.parentPage = newParentPage;
    parentCache = node;
    dirty = true;
}
/**
 * Inserts {@code value -> pointer} below this node.
 * A branch delegates to the matching child; a leaf inserts the entry behind the
 * rightmost equal key (if any) and splits itself when it has grown too large.
 *
 * @return the pointer of the rightmost equal key that was already stored, or -1
 * @throws IllegalArgumentException if {@code value} is null
 * @throws BTreeCorruptException on a duplicate in a unique index or an unknown page type
 */
long addValue(Value value, long pointer) throws IOException, DbException {
    if(value == null) {
        throw new IllegalArgumentException("Can't add a null Value");
    }
    final int found = searchRightmostKey(keys, value, keys.length);
    final byte status = ph.getStatus();
    if(status == BRANCH) {
        final int childIdx = (found < 0) ? -(found + 1) : found + 1;
        return getChildNode(childIdx).addValue(value, pointer);
    }
    if(status != LEAF) {
        throw new BTreeCorruptException("Invalid Page Type '" + ph.getStatus()
                + "' was detected for page#" + page.getPageNum());
    }
    final long oldPtr;
    final int insertAt;
    Value stored = value;
    if(found >= 0) {
        if(!isDuplicateAllowed()) {
            throw new BTreeCorruptException("Attempt to add duplicate key to the unique index: "
                    + value);
        }
        oldPtr = ptrs[found];
        // reuse the stored instance: write() recognises duplicates by identity
        stored = keys[found];
        insertAt = found + 1;
    } else {
        oldPtr = -1;
        insertAt = -(found + 1);
    }
    set(ArrayUtils.<Value> insert(keys, insertAt, stored), ArrayUtils.insert(ptrs, insertAt, pointer));
    incrDataLength(stored, pointer);
    // Check to see if we've exhausted the block
    if(needSplit()) {
        split();
    }
    return oldPtr;
}
/**
 * Binary search for {@code key} in {@code ary[0, to)}; among equal keys the
 * leftmost index is returned.
 * <p>
 * For unique indexes the search is delegated to {@code ArrayUtils.binarySearch}.
 * That call now searches {@code ary} as well; it used to search the node's own
 * {@code keys} field regardless of the array passed in.
 *
 * @param ary sorted keys to search
 * @param key key to look for
 * @param to exclusive upper bound of the searched range
 * @return index of the key if found, otherwise {@code -(insertionPoint + 1)}
 */
private int searchLeftmostKey(final Value[] ary, final Value key, final int to) {
    if(!_fileHeader._duplicateAllowed) {
        return ArrayUtils.binarySearch(ary, 0, to, key);
    }
    int low = 0;
    int high = to - 1;
    while(low <= high) {
        int mid = (low + high) >>> 1;
        Value midVal = ary[mid];
        int cmp = midVal.compareTo(key);
        if(cmp < 0) {
            low = mid + 1;
        } else if(cmp > 0) {
            high = mid - 1;
        } else {
            // walk left over the run of equal keys
            for(int i = mid - 1; i >= 0; i--) {
                Value nxtVal = ary[i];
                cmp = midVal.compareTo(nxtVal);
                if(cmp != 0) {
                    break;
                }
                mid = i;
            }
            return mid; // key found
        }
    }
    return -(low + 1); // key not found.
}
/**
 * Binary search for {@code key} in {@code ary[0, to)}; among equal keys the
 * rightmost index is returned.
 * <p>
 * For unique indexes the search is delegated to {@code ArrayUtils.binarySearch}.
 * That call now searches {@code ary} as well; it used to search the node's own
 * {@code keys} field regardless of the array passed in.
 *
 * @param ary sorted keys to search
 * @param key key to look for
 * @param to exclusive upper bound of the searched range
 * @return index of the key if found, otherwise {@code -(insertionPoint + 1)}
 */
private int searchRightmostKey(final Value[] ary, final Value key, final int to) {
    if(!_fileHeader._duplicateAllowed) {
        return ArrayUtils.binarySearch(ary, 0, to, key);
    }
    int low = 0;
    int high = to - 1;
    while(low <= high) {
        int mid = (low + high) >>> 1;
        Value midVal = ary[mid];
        int cmp = midVal.compareTo(key);
        if(cmp < 0) {
            low = mid + 1;
        } else if(cmp > 0) {
            high = mid - 1;
        } else {
            // walk right over the run of equal keys; everything behind `high` is
            // already known to be greater than the key
            for(int i = mid + 1; i <= high; i++) {
                Value nxtVal = ary[i];
                cmp = midVal.compareTo(nxtVal);
                if(cmp != 0) {
                    break;
                }
                mid = i;
            }
            return mid; // key found
        }
    }
    return -(low + 1); // key not found.
}
/**
 * Removes the leftmost entry matching {@code searchKey} below this node.
 * NOTE(review): the data-length bookkeeping is done on {@code searchKey}, not on the
 * stored key instance -- confirm that this is what the ref-counting expects.
 *
 * @return pointer of left-most matched item, or {@code KEY_NOT_FOUND}
 * @throws BTreeCorruptException on an unknown page type
 */
long removeValue(Value searchKey) throws IOException, DbException {
    final int found = searchLeftmostKey(keys, searchKey, keys.length);
    final byte status = ph.getStatus();
    if(status == BRANCH) {
        final int childIdx = (found < 0) ? -(found + 1) : found + 1;
        return getChildNode(childIdx).removeValue(searchKey);
    }
    if(status == LEAF) {
        if(found < 0) {
            return KEY_NOT_FOUND;
        }
        final long oldPtr = ptrs[found];
        set(ArrayUtils.remove(keys, found), ArrayUtils.remove(ptrs, found));
        decrDataLength(searchKey);
        return oldPtr;
    }
    throw new BTreeCorruptException("Invalid page type '" + ph.getStatus()
            + "' in removeValue");
}
/**
 * Removes every entry for {@code searchKey} in the reached leaf whose pointer
 * equals {@code pointer}.
 * Only one leaf is inspected, see the FIXME below.
 *
 * @return pointer of matched items; empty if nothing matched
 * @throws BTreeCorruptException on an unknown page type
 */
@Deprecated
long[] removeValue(Value searchKey, long pointer) throws IOException, DbException {
    final int firstIdx = searchLeftmostKey(keys, searchKey, keys.length);
    final int lastIdx = isDuplicateAllowed() ? searchRightmostKey(keys, searchKey, keys.length)
            : firstIdx;
    final byte status = ph.getStatus();
    if(status == BRANCH) {
        final int childIdx = (firstIdx < 0) ? -(firstIdx + 1) : firstIdx + 1;
        //FIXME keys may be separated nodes
        return getChildNode(childIdx).removeValue(searchKey, pointer);
    }
    if(status != LEAF) {
        throw new BTreeCorruptException("Invalid page type '" + ph.getStatus()
                + "' in removeValue");
    }
    if(firstIdx < 0) {
        return new long[0];
    }
    final long[] matched = new long[lastIdx - firstIdx + 1];
    int founds = 0;
    int cursor = firstIdx;
    int end = lastIdx;
    while(cursor <= end) {
        if(ptrs[cursor] == pointer) {
            // the arrays shrink by one: stay on this index and pull the end in
            set(ArrayUtils.remove(keys, cursor), ArrayUtils.remove(ptrs, cursor));
            decrDataLength(searchKey);
            matched[founds++] = pointer;
            end--;
        } else {
            cursor++;
        }
    }
    if(founds == 0) {
        return new long[0];
    }
    if(founds == matched.length) {
        return matched;
    }
    return ArrayUtils.copyOfRange(matched, 0, founds);
}
/**
 * Fetches the child behind pointer {@code idx}, registering this node as its parent.
 * Internal (to the BTreeNode) method, not synchronized.
 *
 * @return the child, or null if this is not a branch or {@code idx} is out of range
 */
private BTreeNode getChildNode(final int idx) throws DbException {
    final boolean valid = ph.getStatus() == BRANCH && idx >= 0 && idx < ptrs.length;
    return valid ? getBTreeNode(root, ptrs[idx], this) : null;
}
/**
 * Need to split this node after adding one more value?
 * Never below {@code LEAST_KEYS} keys, always beyond {@code Short.MAX_VALUE} keys,
 * otherwise when the calculated data length exceeds the header's work size.
 *
 * @see #write()
 */
private boolean needSplit() {
    final int afterKeysLength = keys.length + 1;
    if(afterKeysLength < LEAST_KEYS) {// at least 5 elements in a node
        return false;
    }
    if(afterKeysLength > Short.MAX_VALUE) {
        return true;
    }
    assert (prefix != null);
    return calculateDataLength() > _fileHeader.getWorkSize();
}
/**
 * Splits this node in the middle.
 * <p>
 * A root keeps its page: its content moves into two new children and the root
 * becomes a branch holding the separator. Any other node keeps the left half,
 * gets a new right sibling and promotes the separator to its parent.
 * <p>
 * Changes against the previous implementation:
 * <ul>
 * <li>the leftmost duplicate of the pivot key is searched in {@code keys[0, pivot)};
 * the old bound {@code pivot - 1} is exclusive and left {@code keys[pivot - 1]} out,
 * which produced a wrong left-lookup value;</li>
 * <li>the left-lookup is also recorded when a root leaf is split between duplicates,
 * otherwise {@code findValue} cannot walk back to the leftmost duplicate;</li>
 * <li>the cached data length is reset before it is recalculated, because
 * {@code calculateDataLength()} returns a cached positive value unchanged.</li>
 * </ul>
 * Internal to the BTreeNode method.
 *
 * @throws BTreeCorruptException on an unknown page type
 */
private void split() throws IOException, DbException {
    final Value[] leftVals;
    final Value[] rightVals;
    final long[] leftPtrs;
    final long[] rightPtrs;
    final Value separator;
    final short vc = ph.getValueCount();
    final int pivot = vc / 2;
    // Split the node into two nodes
    final byte pageType = ph.getStatus();
    int leftLookup = 0;
    switch(pageType) {
        case BRANCH: {
            // the key at the pivot moves up, it is kept in neither half
            leftVals = new Value[pivot];
            leftPtrs = new long[leftVals.length + 1];
            rightVals = new Value[vc - (pivot + 1)];
            rightPtrs = new long[rightVals.length + 1];
            System.arraycopy(keys, 0, leftVals, 0, leftVals.length);
            System.arraycopy(ptrs, 0, leftPtrs, 0, leftPtrs.length);
            System.arraycopy(keys, leftVals.length + 1, rightVals, 0, rightVals.length);
            System.arraycopy(ptrs, leftPtrs.length, rightPtrs, 0, rightPtrs.length);
            separator = keys[leftVals.length];
            break;
        }
        case LEAF: {
            Value pivotLeft = keys[pivot - 1];
            Value pivotRight = keys[pivot];
            if(pivotLeft.equals(pivotRight)) {
                // a run of duplicates is cut: remember how far it reaches into the left half
                int leftmost = searchLeftmostKey(keys, pivotLeft, pivot);
                int diff = pivot - leftmost;
                if(leftmost < 0 || diff > Short.MAX_VALUE) {
                    throw new IllegalStateException("pivot: " + pivot + ", leftmost: "
                            + leftmost + "\nkeys: " + Arrays.toString(keys));
                }
                leftLookup = diff;
            }
            leftVals = new Value[pivot];
            leftPtrs = new long[leftVals.length];
            rightVals = new Value[vc - pivot];
            rightPtrs = new long[rightVals.length];
            System.arraycopy(keys, 0, leftVals, 0, leftVals.length);
            System.arraycopy(ptrs, 0, leftPtrs, 0, leftPtrs.length);
            System.arraycopy(keys, leftVals.length, rightVals, 0, rightVals.length);
            System.arraycopy(ptrs, leftPtrs.length, rightPtrs, 0, rightPtrs.length);
            separator = getSeparator(leftVals[leftVals.length - 1], rightVals[0]);
            break;
        }
        default:
            throw new BTreeCorruptException("Invalid page type in split: " + pageType);
    }
    // Promote the pivot to the parent branch
    final BTreeNode parent = getParent(); // this node may be GC'd
    if(parent == null) {
        // This can only happen if this is the root
        BTreeNode lNode = createBTreeNode(root, pageType, this);
        lNode.set(leftVals, leftPtrs);
        lNode.currentDataLen = -1; // force a real recalculation
        lNode.calculateDataLength();
        lNode.setAsParent();
        BTreeNode rNode = createBTreeNode(root, pageType, this);
        rNode.set(rightVals, rightPtrs);
        rNode.currentDataLen = -1;
        rNode.calculateDataLength();
        rNode.setAsParent();
        if(pageType == LEAF) {
            setLeavesLinked(lNode, rNode);
            if(leftLookup > 0) {
                rNode.ph.setLeftLookup(leftLookup);
            }
        }
        ph.setStatus(BRANCH);
        set(new Value[] { separator }, new long[] { lNode.page.getPageNum(),
                rNode.page.getPageNum() });
        this.currentDataLen = -1;
        calculateDataLength();
    } else {
        set(leftVals, leftPtrs);
        this.currentDataLen = -1; // the cached length still describes the unsplit node
        calculateDataLength();
        BTreeNode rNode = createBTreeNode(root, pageType, parent);
        rNode.set(rightVals, rightPtrs);
        rNode.currentDataLen = -1;
        rNode.calculateDataLength();
        rNode.setAsParent();
        if(pageType == LEAF) {
            setLeavesLinked(this, rNode);
            if(leftLookup > 0) {
                rNode.ph.setLeftLookup(leftLookup);
            }
        }
        long leftPtr = page.getPageNum();
        long rightPtr = rNode.page.getPageNum();
        parent.promoteValue(separator, leftPtr, rightPtr);
    }
}
/** Makes every child of this branch node point back to this node; no-op for other node types. */
private void setAsParent() throws DbException {
    if(ph.getStatus() != BRANCH) {
        return;
    }
    final long[] children = ptrs;
    for(int i = 0; i < children.length; i++) {
        getBTreeNode(_rootInfo, children[i], this).setParent(this);
    }
}
/**
 * Inserts {@code right} into the leaf chain directly behind {@code left}.
 * A former successor of {@code left} becomes the successor of {@code right}
 * and is marked dirty.
 */
private void setLeavesLinked(final BTreeNode left, final BTreeNode right)
        throws DbException {
    final long rightPageNum = right.page.getPageNum();
    final long formerNext = left._next;
    if(formerNext != -1L) {
        right._next = formerNext;
        final BTreeNode successor = getBTreeNode(root, formerNext);
        successor._prev = rightPageNum;
        successor.setDirty(true);
    }
    left._next = rightPageNum;
    right._prev = left.page.getPageNum();
}
/**
 * Inserts a separator coming up from a split child: {@code key} is placed at the
 * slot whose pointer is {@code leftPtr}, {@code rightPtr} directly behind that pointer.
 * Splits this node in turn when it has grown too large.
 *
 * @throws IllegalStateException if {@code leftPtr} is not among the pointers at or
 *         before the key's insertion point
 */
private void promoteValue(final Value key, final long leftPtr, final long rightPtr)
        throws IOException, DbException {
    final int found = searchRightmostKey(keys, key, keys.length);
    final int start = (found < 0) ? -(found + 1) : found + 1;
    // look backwards from the insertion point for the slot of the left child
    int insertPoint = -1;
    for(int i = start; i >= 0; i--) {
        if(ptrs[i] == leftPtr) {
            insertPoint = i;
            break;
        }
    }
    if(insertPoint < 0) {
        throw new IllegalStateException("page#" + page.getPageNum() + ", insertPoint: "
                + start + ", leftPtr: " + leftPtr + ", ptrs: "
                + Arrays.toString(ptrs));
    }
    set(ArrayUtils.<Value> insert(keys, insertPoint, key), ArrayUtils.insert(ptrs, insertPoint + 1, rightPtr));
    incrDataLength(key, rightPtr);
    // Check to see if we've exhausted the block
    if(needSplit()) {
        split();
    }
}
/**
 * Gets shortest-possible separator for the pivot.
 * The magnitude of {@code value1.compareTo(value2)} is used as the separator length
 * (presumably the position of the first difference, 1-based -- confirm against Value);
 * equal values yield a clone of {@code value1}.
 */
private Value getSeparator(final Value value1, final Value value2) {
    final int cmp = value1.compareTo(value2);
    if(cmp == 0) {
        return value1.clone();
    }
    final byte[] head = new byte[Math.abs(cmp)];
    value2.copyTo(head, 0, head.length);
    return new Value(head);
}
/**
 * Sets values and pointers, updates the value count and common prefix in the
 * page header and marks the node dirty.
 * A change of the prefix length is added to the cached data length.
 * Internal (to the BTreeNode) method, not synchronized.
 *
 * @throws IllegalArgumentException if more than {@code Short.MAX_VALUE} values are passed
 */
private void set(final Value[] values, final long[] ptrs) {
    final int count = values.length;
    if(count > Short.MAX_VALUE) {
        throw new IllegalArgumentException("entries exceeds limit: " + count);
    }
    this.keys = values;
    this.ptrs = ptrs;
    ph.setValueCount((short) count);
    if(count <= 1) {
        // zero or one key: no common prefix
        this.prefix = EmptyValue;
        ph.setPrefixLength((short) 0);
    } else {
        final int oldPrefixLen = ph.getPrefixLength();
        final Value common = getPrefix(values[0], values[count - 1]);
        this.prefix = common;
        final int newPrefixLen = common.getLength();
        assert (newPrefixLen <= Short.MAX_VALUE) : newPrefixLen;
        if(newPrefixLen != oldPrefixLen) {
            currentDataLen += newPrefixLen - oldPrefixLen;
            ph.setPrefixLength((short) newPrefixLen);
        }
    }
    setDirty(true);
}
/** Sets the dirty flag that write() checks before storing the node. */
private void setDirty(final boolean dirt) {
    dirty = dirt;
}
/**
 * Derives the common prefix of two keys from the magnitude of their comparison
 * result (minus one); a non-positive length yields the shared empty value.
 * The prefix is a view on the data of {@code v2}.
 */
private Value getPrefix(final Value v1, final Value v2) {
    final int len = Math.abs(v1.compareTo(v2)) - 1;
    if(len <= 0) {
        return EmptyValue;
    }
    final byte[] data = v2.getData();
    return new Value(data, v2.getPosition(), len);
}
/**
 * Reads node only if it is not loaded yet.
 * <p>
 * Layout, mirroring {@code write()}: the common prefix (length taken from the page
 * header), then per key a 4-byte size followed by the key bytes without the prefix
 * (size -1 repeats the previous key), then the pointers, and for leaves the
 * previous/next page numbers.
 * <p>
 * Byte blocks are read with {@code readFully}; the return value of a plain
 * {@code read} was ignored before, so a short read would have gone unnoticed.
 *
 * @throws IOException if the stored value is shorter than the header implies
 * @throws BTreeCorruptException if a key size is invalid
 */
private void read() throws IOException, DbException {
    if(this.loaded) {
        return;
    }
    final Value v = readValue(page);
    final DataInputStream in = new DataInputStream(v.getInputStream());
    // Read in the common prefix (if any)
    final short pfxLen = ph.getPrefixLength();
    final byte[] pfxBytes;
    if(pfxLen > 0) {
        pfxBytes = new byte[pfxLen];
        in.readFully(pfxBytes);
        this.prefix = new Value(pfxBytes);
    } else {
        pfxBytes = EmptyBytes;
        this.prefix = EmptyValue;
    }
    // Read in the Values
    Value prevKey = null;
    final int keyslen = ph.getValueCount();
    keys = new Value[keyslen];
    for(int i = 0; i < keyslen; i++) {
        final int valSize = in.readInt();
        if(valSize == -1) {
            // same key as the one before: share the instance
            if(prevKey == null) {
                throw new BTreeCorruptException("Duplicate-key marker without a preceding key in page#"
                        + page.getPageNum());
            }
            prevKey.incrRefCount();
            keys[i] = prevKey;
        } else {
            if(valSize < -1) {
                throw new BTreeCorruptException("Invalid key size '" + valSize
                        + "' was detected for page#" + page.getPageNum());
            }
            final byte[] b = new byte[pfxLen + valSize];
            if(pfxLen > 0) {
                System.arraycopy(pfxBytes, 0, b, 0, pfxLen);
            }
            if(valSize > 0) {
                in.readFully(b, pfxLen, valSize);
            }
            prevKey = new Value(b);
            keys[i] = prevKey;
        }
    }
    // Read in the pointers
    final int ptrslen = ph.getPointerCount();
    ptrs = new long[ptrslen];
    for(int i = 0; i < ptrslen; i++) {
        ptrs[i] = VariableByteCodec.decodeLong(in);
    }
    // Read in the links if current node is a leaf
    if(ph.getStatus() == LEAF) {
        this._prev = in.readLong();
        this._next = in.readLong();
    }
    this.currentDataLen = v.getLength();
    this.loaded = true;
}
/**
 * Serializes this node into its page; does nothing unless the node is dirty.
 * <p>
 * Layout: the common prefix (if any), per key a 4-byte size plus the key bytes
 * without the prefix (-1 for a key that is the same instance as its predecessor),
 * the pointers, and for leaves the previous/next page numbers.
 * Afterwards the parent cache is dropped and the dirty flag cleared.
 */
private void write() throws IOException, DbException {
    if(!dirty) {
        return;
    }
    if(LOG.isTraceEnabled()) {
        LOG.trace((ph.getStatus() == LEAF ? "Leaf " : "Branch ") + "Node#"
                + page.getPageNum() + " - " + Arrays.toString(keys));
    }
    final FastMultiByteArrayOutputStream buffer = new FastMultiByteArrayOutputStream(_fileHeader.getWorkSize());
    final DataOutputStream out = new DataOutputStream(buffer);
    // write out the prefix
    final short prefixlen = ph.getPrefixLength();
    if(prefixlen > 0) {
        prefix.writeTo(out);
    }
    // Write out the Values
    Value previous = null;
    for(final Value key : keys) {
        if(key == previous) {
            out.writeInt(-1);
        } else {
            final int suffixLen = key.getLength() - prefixlen;
            out.writeInt(suffixLen);
            if(suffixLen > 0) {
                key.writeTo(out, prefixlen, suffixLen);
            }
        }
        previous = key;
    }
    // Write out the pointers
    for(final long ptr : ptrs) {
        VariableByteCodec.encodeLong(ptr, out);
    }
    // Write out link if current node is a leaf
    if(ph.getStatus() == LEAF) {
        out.writeLong(_prev);
        out.writeLong(_next);
    }
    writeValue(page, new Value(buffer.toByteArray()));
    this.parentCache = null;
    setDirty(false);
}
/**
 * Returns the data length of this node, computing and caching it when no positive
 * value is cached yet.
 * <p>
 * The figure follows what {@code write()} emits: the prefix once, a 4-byte size
 * field per key, the key bytes without the prefix (once per run of identical
 * instances), every pointer in its variable-byte encoding, and 16 bytes of links
 * for a leaf.
 * <p>
 * Changes against the previous implementation: the size fields were counted as
 * {@code keys / 4} instead of {@code keys * 4} bytes, and only as many pointers as
 * there are keys were counted, which missed the last pointer of a branch node.
 *
 * @return the cached or freshly calculated length
 */
private int calculateDataLength() {
    if(currentDataLen > 0) {
        return currentDataLen;
    }
    final int vlen = keys.length;
    final short prefixlen = ph.getPrefixLength();
    int datalen = prefixlen + (vlen << 2) /* key size */;
    for(final long ptr : ptrs) {
        datalen += VariableByteCodec.requiredBytes(ptr);
    }
    Value prevValue = null;
    for(final Value v : keys) {
        if(v == prevValue) {
            continue; // written as a -1 marker only
        }
        datalen += v.getLength() - prefixlen; /* actual keys length */
        prevValue = v;
    }
    if(ph.getStatus() == LEAF) {
        datalen += 16; // previous/next links
    }
    this.currentDataLen = datalen;
    return datalen;
}
/**
 * Accounts for one added entry in the cached data length: the key bytes when the
 * value's reference count becomes 1, the encoded pointer and 4 bytes of key size.
 * An uncalculated length (-1) is calculated first.
 */
private void incrDataLength(final Value value, final long ptr) {
    int total = (currentDataLen == -1) ? calculateDataLength() : currentDataLen;
    if(value.incrRefCount() == 1) {
        total += value.getLength();
    }
    total += VariableByteCodec.requiredBytes(ptr);
    total += 4 /* key size */;
    this.currentDataLen = total;
}
/**
 * Accounts for one removed entry in the cached data length: the key bytes when the
 * value's reference count drops to 0, plus a fixed 4 + 8 bytes for key size and pointer.
 */
private void decrDataLength(final Value value) {
    int total = currentDataLen;
    if(value.decrRefCount() == 0) {
        total -= value.getLength();
    }
    total -= (4 + 8);
    this.currentDataLen = total;
}
/**
 * Finds the left-most entry matching the key below this node.
 * A branch delegates to the matching child. A leaf answers from its own arrays,
 * unless the match is its first key and the page header carries a left-lookup:
 * then the previous leaves are walked to find where the run of duplicates starts.
 *
 * @return the pointer of the match, or {@code KEY_NOT_FOUND}
 * @throws DbException if the key is null
 * @throws BTreeCorruptException on an unknown page type, or if the walk ends on a
 *         leaf that does not contain the key
 */
long findValue(Value serarchKey) throws DbException {
    if(serarchKey == null) {
        throw new DbException("Can't search on null Value");
    }
    final int found = searchLeftmostKey(keys, serarchKey, keys.length);
    final byte status = ph.getStatus();
    if(status == BRANCH) {
        final int childIdx = (found < 0) ? -(found + 1) : found + 1;
        return getChildNode(childIdx).findValue(serarchKey);
    }
    if(status != LEAF) {
        throw new BTreeCorruptException("Invalid page type '" + ph.getStatus()
                + "' in findValue");
    }
    if(found < 0) {
        return KEY_NOT_FOUND;
    }
    final boolean continuesOnLeft = (found == 0) && (ph.getLeftLookup() > 0);
    if(!continuesOnLeft) {
        return ptrs[found];
    }
    // step back until a leaf does not start with the key or has no left-lookup itself
    BTreeNode leftmostNode = this;
    do {
        leftmostNode = getBTreeNode(root, leftmostNode._prev);
        assert (leftmostNode.keys.length > 0);
    } while(leftmostNode.keys[0].equals(serarchKey) && leftmostNode.ph.getLeftLookup() != 0);
    final Value[] lmKeys = leftmostNode.keys;
    final int lmIdx = leftmostNode.searchLeftmostKey(lmKeys, serarchKey, lmKeys.length);
    if(lmIdx < 0) {
        throw new BTreeCorruptException("Duplicated key was not found: "
                + serarchKey);
    }
    return leftmostNode.ptrs[lmIdx];
}
/**
 * Scans this leaf node and reports every (key, pointer) pair selected by
 * the query operator to the callback.
 * Note that keys might be shortest-possible value.
 * <p>
 * The leaf keys are searched with {@code searchLeftmostKey} /
 * {@code searchRightmostKey}. A non-negative result is the index of a
 * matching key; a negative result {@code r} is decoded as {@code -(r + 1)},
 * the insertion point, i.e. the index of the first key that is greater than
 * the operand (or {@code keys.length} when no such key exists).
 *
 * @param query the query supplying the operator and its operands
 * @param callback receives {@code indexInfo(key, ptr)} for each reported entry
 * @param edge when {@code false}, the EQ branch reports every entry of this
 *        leaf and the BW/BWX/START_WITH/IN branch reports every entry accepted
 *        by {@code query.testValue} without searching for the range bounds;
 *        the other operators ignore this flag
 *        (presumably {@code true} marks a boundary leaf of the scanned range
 *        -- confirm against the caller)
 */
void scanLeaf(IndexQuery query, CallbackHandler callback, boolean edge) {
    assert (ph.getStatus() == LEAF) : ph.getStatus();
    Value[] conds = query.getOperands();
    switch(query.getOperator()) {
        case BasicIndexQuery.EQ: {
            if(!edge) {
                for(int i = 0; i < keys.length; i++) {
                    callback.indexInfo(keys[i], ptrs[i]);
                }
                return;
            }
            final int leftIdx = searchLeftmostKey(keys, conds[0], keys.length);
            if(leftIdx >= 0) {
                assert (isDuplicateAllowed());
                final int rightIdx = searchRightmostKey(keys, conds[conds.length - 1], keys.length);
                for(int i = leftIdx; i <= rightIdx; i++) {
                    callback.indexInfo(keys[i], ptrs[i]);
                }
            }
            break;
        }
        case BasicIndexQuery.NE: {
            // When the operand is absent leftIdx (and hence rightIdx) is negative,
            // so "i > rightIdx" holds for every entry and the whole leaf is reported.
            int leftIdx = searchLeftmostKey(keys, conds[0], keys.length);
            int rightIdx = isDuplicateAllowed() ? searchRightmostKey(keys, conds[conds.length - 1], keys.length)
                    : leftIdx;
            for(int i = 0; i < ptrs.length; i++) {
                if(i < leftIdx || i > rightIdx) {
                    callback.indexInfo(keys[i], ptrs[i]);
                }
            }
            break;
        }
        case BasicIndexQuery.BWX:
        case BasicIndexQuery.BW:
        case BasicIndexQuery.START_WITH:
        case BasicIndexQuery.IN: {
            if(!edge) {
                for(int i = 0; i < keys.length; i++) {
                    if(query.testValue(keys[i])) {
                        callback.indexInfo(keys[i], ptrs[i]);
                    }
                }
                return;
            }
            int leftIdx = searchLeftmostKey(keys, conds[0], keys.length);
            if(leftIdx < 0) {
                leftIdx = -(leftIdx + 1);
            }
            int rightIdx = searchRightmostKey(keys, conds[conds.length - 1], keys.length);
            if(rightIdx < 0) {
                rightIdx = -(rightIdx + 1);
            }
            // The index window is only a coarse pre-filter; testValue makes the final decision.
            for(int i = leftIdx; i < ptrs.length; i++) {
                if(i <= rightIdx && query.testValue(keys[i])) {
                    callback.indexInfo(keys[i], ptrs[i]);
                }
            }
            break;
        }
        case BasicIndexQuery.NBWX:
        case BasicIndexQuery.NBW:
        case BasicIndexQuery.NOT_START_WITH: {
            int leftIdx = searchLeftmostKey(keys, conds[0], keys.length);
            if(leftIdx < 0) {
                leftIdx = -(leftIdx + 1);
            }
            int rightIdx = searchRightmostKey(keys, conds[conds.length - 1], keys.length);
            if(rightIdx < 0) {
                rightIdx = -(rightIdx + 1);
            }
            // Entries outside (or on the border of) the window are candidates; testValue decides.
            for(int i = 0; i < ptrs.length; i++) {
                if((i <= leftIdx || i >= rightIdx) && query.testValue(keys[i])) {
                    callback.indexInfo(keys[i], ptrs[i]);
                }
            }
            break;
        }
        case BasicIndexQuery.LT: {
            int leftIdx = searchLeftmostKey(keys, conds[0], keys.length);
            if(leftIdx < 0) {
                leftIdx = -(leftIdx + 1); // insertion point
            }
            for(int i = 0; i < leftIdx; i++) {
                callback.indexInfo(keys[i], ptrs[i]);
            }
            break;
        }
        case BasicIndexQuery.LE: {
            final int found = searchRightmostKey(keys, conds[0], keys.length);
            // Exclusive upper bound of the reported range:
            //  - operand present: one past its rightmost occurrence;
            //  - operand absent: the insertion point itself, because the key stored
            //    there is already greater than the operand and must not be reported.
            int end = (found >= 0) ? found + 1 : -(found + 1);
            if(end > ptrs.length) {
                end = ptrs.length;
            }
            for(int i = 0; i < end; i++) {
                callback.indexInfo(keys[i], ptrs[i]);
            }
            break;
        }
        case BasicIndexQuery.GT: {
            final int found = searchRightmostKey(keys, conds[0], keys.length);
            // First index to report:
            //  - operand present: one past its rightmost occurrence;
            //  - operand absent: the insertion point itself, because the key stored
            //    there is the first one greater than the operand and must be reported.
            final int start = (found >= 0) ? found + 1 : -(found + 1);
            for(int i = start; i < ptrs.length; i++) {
                callback.indexInfo(keys[i], ptrs[i]);
            }
            break;
        }
        case BasicIndexQuery.GE: {
            int rightIdx = searchLeftmostKey(keys, conds[0], keys.length);
            if(rightIdx < 0) {
                rightIdx = -(rightIdx + 1);
            }
            for(int i = rightIdx; i < ptrs.length; i++) {
                callback.indexInfo(keys[i], ptrs[i]);
            }
            break;
        }
        case BasicIndexQuery.ANY:
            for(int i = 0; i < ptrs.length; i++) {
                callback.indexInfo(keys[i], ptrs[i]);
            }
            break;
        case BasicIndexQuery.NOT_IN:
        default:
            // No index window can be derived; fall back to testing every entry.
            for(int i = 0; i < ptrs.length; i++) {
                if(query.testValue(keys[i])) {
                    callback.indexInfo(keys[i], ptrs[i]);
                }
            }
            break;
    }
}
/**
 * Resolves the leaf node selected by {@code searchType} and {@code key},
 * starting from this node.
 * <p>
 * On a branch page the child to follow is chosen (by key search for
 * LEFT/RIGHT, first child for LEFT_MOST, last child for RIGHT_MOST) and the
 * lookup recurses into it. On a leaf page the node itself is returned,
 * except for LEFT where the chain of previous leaves is followed while the
 * left-lookup counter of the page header indicates that the run of
 * {@code key} continues on earlier pages.
 *
 * @param searchType which leaf is requested
 * @param key the key that drives LEFT/RIGHT lookups
 * @return the resolved leaf node
 * @throws IllegalStateException if a LEFT_MOST/RIGHT_MOST lookup ends on a
 *         leaf that still has a sibling on that side, or if the search type
 *         is not handled on a branch page
 * @throws BTreeCorruptException if the page is neither a branch nor a leaf
 */
BTreeNode getLeafNode(SearchType searchType, Value key) throws IOException, DbException {
    final byte status = ph.getStatus();
    switch(status) {
        case BRANCH: {
            int childIdx;
            switch(searchType) {
                case LEFT: {
                    final int found = searchLeftmostKey(keys, key, keys.length);
                    // hit: follow the pointer right of the key; miss: decode the insertion point
                    childIdx = (found >= 0) ? found + 1 : -(found + 1);
                    break;
                }
                case RIGHT: {
                    final int found = searchRightmostKey(keys, key, keys.length);
                    childIdx = (found >= 0) ? found + 1 : -(found + 1);
                    break;
                }
                case LEFT_MOST:
                    childIdx = 0;
                    break;
                case RIGHT_MOST:
                    childIdx = ptrs.length - 1;
                    assert (childIdx >= 0);
                    break;
                default:
                    throw new IllegalStateException();
            }
            return getChildNode(childIdx).getLeafNode(searchType, key);
        }
        case LEAF: {
            switch(searchType) {
                case LEFT: {
                    if(keys.length == 0) {
                        return this;
                    }
                    BTreeNode cursor = this;
                    if(keys[0].equals(key)) {
                        // Walk towards earlier leaves while the left-lookup counter
                        // says the duplicates of the key started before this page.
                        for(int remaining = ph.getLeftLookup(); remaining > 0;) {
                            cursor = getBTreeNode(root, cursor._prev);
                            if(remaining < cursor.keys.length) {
                                break;
                            }
                            remaining = cursor.ph.getLeftLookup();
                            if(remaining == 0 || !cursor.keys[0].equals(key)) {
                                break;
                            }
                        }
                    }
                    return cursor;
                }
                case RIGHT_MOST: {
                    if(_next != -1L) {
                        final BTreeNode sibling = getBTreeNode(root, _next);
                        final BTreeNode parentNode = getParent();
                        final String msg = "next=" + _next + ".. more leaf [" + sibling
                                + "] exists on the right side of leaf [" + this.toString()
                                + "]\n parent-ptrs: " + Arrays.toString(parentNode.ptrs);
                        throw new IllegalStateException(msg);
                    }
                    return this;
                }
                case LEFT_MOST: {
                    if(_prev != -1L) {
                        final BTreeNode sibling = getBTreeNode(root, _prev);
                        final BTreeNode parentNode = getParent();
                        final String msg = "prev=" + _prev + ".. more leaf [" + sibling
                                + "] exists on the left side of leaf [" + this.toString()
                                + "]\n parent-ptrs: " + Arrays.toString(parentNode.ptrs);
                        throw new IllegalStateException(msg);
                    }
                    return this;
                }
                default:
                    return this;
            }
        }
        default:
            throw new BTreeCorruptException("Invalid page type in query: " + status);
    }
}
/**
 * Renders the chain of page numbers from this node up through its parents,
 * joined by {@code "<-"}. When the chain ends on a page other than the root
 * page, {@code "<-?"} is appended to flag the missing link.
 */
@Override
public String toString() {
    final StringBuilder out = new StringBuilder();
    final long rootPageNum = root.getPage();
    long lastPageNum = this.page.getPageNum();
    out.append(lastPageNum);
    for(BTreeNode ancestor = this.getParent(); ancestor != null; ancestor = ancestor.getParent()) {
        out.append("<-");
        lastPageNum = ancestor.page.getPageNum();
        out.append(lastPageNum);
    }
    // the walk stopped without reaching the root page
    if(lastPageNum != rootPageNum) {
        out.append("<-?");
    }
    return out.toString();
}
/**
 * Compares this node with another one by delegating to the comparison of
 * their {@code page} fields.
 *
 * @param other the node to compare against
 * @return the result of {@code page.compareTo(other.page)}
 */
public int compareTo(BTreeNode other) {
return page.compareTo(other.page);
}
}
/**
 * File header of the B-tree. In addition to what the base header stores,
 * it persists the duplicate-allowed flag followed by the root page number.
 */
protected class BTreeFileHeader extends FileHeader {

    private long _rootPage = 0;
    private boolean _duplicateAllowed = true;

    public BTreeFileHeader(int pageSize) {
        super(pageSize);
    }

    /** The root page of the storage tree */
    public final long getRootPage() {
        return this._rootPage;
    }

    /** The root page of the storage tree; marks the header dirty. */
    @Deprecated
    public final void setRootPage(long rootPage) {
        _rootPage = rootPage;
        setDirty(true);
    }

    /**
     * Writes the base header, then the duplicate-allowed flag and the root
     * page number, in that order.
     */
    @Override
    public synchronized void write(RandomAccessFile raf) throws IOException {
        super.write(raf);
        raf.writeBoolean(this._duplicateAllowed);
        raf.writeLong(this._rootPage);
    }

    /**
     * Reads the base header, then the duplicate-allowed flag and the root
     * page number, mirroring the layout produced by {@code write}.
     */
    @Override
    public synchronized void read(RandomAccessFile raf) throws IOException {
        super.read(raf);
        _duplicateAllowed = raf.readBoolean();
        _rootPage = raf.readLong();
    }
}
/**
 * Page header of a B-tree page. After the base header it stores the parent
 * page number, the value count, the prefix length and the left-lookup
 * counter, in that order.
 */
protected class BTreePageHeader extends PageHeader {

    private long parentPage;
    private short valueCount = 0;
    private short prefixLength = 0;
    private int leftLookup = 0;

    public BTreePageHeader() {
        super();
    }

    @Deprecated
    public BTreePageHeader(ByteBuffer buf) {
        super(buf);
    }

    /**
     * Reads the base header and, unless the page status is UNUSED, the
     * B-tree specific fields that follow it.
     */
    @Override
    public void read(ByteBuffer buf) {
        super.read(buf);
        if(getStatus() != UNUSED) {
            this.parentPage = buf.getLong();
            this.valueCount = buf.getShort();
            this.prefixLength = buf.getShort();
            this.leftLookup = buf.getInt();
        }
    }

    /** Writes the base header followed by the B-tree specific fields. */
    @Override
    public void write(ByteBuffer buf) {
        super.write(buf);
        buf.putLong(this.parentPage);
        buf.putShort(this.valueCount);
        buf.putShort(this.prefixLength);
        buf.putInt(this.leftLookup);
    }

    /**
     * Records the page number of the given parent node, or
     * {@code Paged.NO_PAGE} when there is no parent.
     */
    public void setParent(BTreeNode parentNode) {
        if(parentNode != null) {
            parentPage = parentNode.page.getPageNum();
        } else {
            parentPage = Paged.NO_PAGE;
        }
    }

    /** The number of values stored by this page */
    public final short getValueCount() {
        return this.valueCount;
    }

    /** The number of values stored by this page */
    public final void setValueCount(short valueCount) {
        this.valueCount = valueCount;
    }

    /** The number of pointers stored by this page */
    public final int getPointerCount() {
        // a branch page carries one pointer more than it has values
        if(getStatus() != BRANCH) {
            return this.valueCount;
        }
        return this.valueCount + 1;
    }

    public final short getPrefixLength() {
        return this.prefixLength;
    }

    public final void setPrefixLength(short prefixLength) {
        this.prefixLength = prefixLength;
    }

    public final int getLeftLookup() {
        return this.leftLookup;
    }

    public final void setLeftLookup(int leftLookup) {
        this.leftLookup = leftLookup;
    }
}
/**
 * Unchecked exception thrown when the B-tree structure is found to be
 * inconsistent.
 */
public static final class BTreeCorruptException extends RuntimeException {

    private static final long serialVersionUID = 5609947858701765326L;

    /**
     * @param message description of the detected inconsistency
     * @param cause the underlying failure
     */
    public BTreeCorruptException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param message description of the detected inconsistency
     */
    public BTreeCorruptException(String message) {
        super(message);
    }
}
}