/*
* Copyright (C) 2010-2012 The Async HBase Authors. All rights reserved.
* This file is part of Async HBase.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the StumbleUpon nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
package org.hbase.async;
import java.io.IOException;
import com.google.protobuf.AbstractMessageLite;
import com.google.protobuf.CodedOutputStream;
import com.google.protobuf.InvalidProtocolBufferException;
import com.google.protobuf.Parser;
import org.jboss.netty.buffer.ChannelBuffer;
import org.jboss.netty.buffer.ChannelBuffers;
import org.jboss.netty.util.CharsetUtil;
import com.stumbleupon.async.Deferred;
/**
* Abstract base class for all RPC requests going out to HBase.
* <p>
* Implementations of this class are <b>not</b> expected to be synchronized.
*
* <h1>A note on passing {@code byte} arrays in argument</h1>
* None of the method that receive a {@code byte[]} in argument will copy it.
* If you change the contents of any byte array you give to an instance of
* this class, you <em>may</em> affect the behavior of the request in an
* <strong>unpredictable</strong> way. If you need to change the byte array,
* {@link Object#clone() clone} it before giving it to this class. For those
* familiar with the term "defensive copy", we don't do it in order to avoid
unnecessary memory copies when you know you won't be changing (or even
* holding a reference to) the byte array, which is frequently the case.
*/
public abstract class HBaseRpc {
/**
 * An RPC from which you can get a table name.
 * @since 1.1
 */
public interface HasTable {
  /**
   * Returns the name of the table this RPC is for.
   * <p>
   * <strong>DO NOT MODIFY THE CONTENTS OF THE ARRAY RETURNED.</strong>
   * @return The table name; not a defensive copy (see the class notes on
   * byte arrays), hence the warning above.
   */
  public byte[] table();
}
/**
 * An RPC from which you can get a row key name.
 * @since 1.1
 */
public interface HasKey {
  /**
   * Returns the row key this RPC is for.
   * <p>
   * <strong>DO NOT MODIFY THE CONTENTS OF THE ARRAY RETURNED.</strong>
   * @return The row key; not a defensive copy.
   */
  public byte[] key();
}
/**
 * An RPC from which you can get a family name.
 * @since 1.1
 */
public interface HasFamily {
  /**
   * Returns the family this RPC is for.
   * <p>
   * <strong>DO NOT MODIFY THE CONTENTS OF THE ARRAY RETURNED.</strong>
   * @return The column family name; not a defensive copy.
   */
  public byte[] family();
}
/**
 * An RPC from which you can get a column qualifier name.
 * @since 1.1
 */
public interface HasQualifier {
  /**
   * Returns the column qualifier this RPC is for.
   * <p>
   * <strong>DO NOT MODIFY THE CONTENTS OF THE ARRAY RETURNED.</strong>
   * @return The column qualifier; not a defensive copy.
   */
  public byte[] qualifier();
}
/**
 * An RPC from which you can get multiple column qualifier names.
 * @since 1.1
 */
public interface HasQualifiers {
  /**
   * Returns the column qualifiers this RPC is for.
   * <p>
   * <strong>DO NOT MODIFY THE CONTENTS OF THE ARRAY RETURNED.</strong>
   * @return The column qualifiers; neither the outer array nor the inner
   * arrays are defensive copies.
   */
  public byte[][] qualifiers();
}
/**
 * An RPC from which you can get a value.
 * @since 1.1
 */
public interface HasValue {
  /**
   * Returns the value contained in this RPC.
   * <p>
   * <strong>DO NOT MODIFY THE CONTENTS OF THE ARRAY RETURNED.</strong>
   * @return The value; not a defensive copy.
   */
  public byte[] value();
}
/**
 * An RPC from which you can get multiple values.
 * @since 1.3
 */
public interface HasValues {
  /**
   * Returns the values contained in this RPC.
   * <p>
   * <strong>DO NOT MODIFY THE CONTENTS OF THE ARRAY RETURNED.</strong>
   * @return The values; neither the outer array nor the inner arrays are
   * defensive copies.
   */
  public byte[][] values();
}
/**
 * An RPC from which you can get a timestamp.
 * @since 1.2
 */
public interface HasTimestamp {
  /**
   * Returns the strictly positive timestamp contained in this RPC.
   * @return A strictly positive timestamp.
   */
  public long timestamp();
}
/**
* Package-private interface to mark RPCs that are changing data in HBase.
* @since 1.4
*/
interface IsEdit {
/** RPC method name to use with HBase 0.95+. */
static final byte[] MUTATE = { 'M', 'u', 't', 'a', 't', 'e' };
}
/*
* This class, although it's part of the public API, is mostly here to make
* it easier for this library to manipulate the HBase RPC protocol.
*
*
* Unofficial Hadoop / HBase RPC protocol documentation
* ****************************************************
*
* HBase uses a modified version of the Hadoop RPC protocol. They took
* Hadoop's RPC code, copy-pasted it into HBase, and tweaked it a little
* bit (mostly in a desperate attempt to try to make it less inefficient).
*
* RPCs are numbered with an arbitrary 32-bit ID. It is customary, but not
* mandatory, to start at 0 and increment by 1 every time you send out an
* RPC. The ID is allowed to wrap around and become negative. As long as
* no 2 RPCs share the same ID at the same time, we're fine.
*
* When requests are written out to the wire, they're framed. Meaning, a
* 4 byte integer value is first written in order to specify how many bytes
* are in the request (excluding the first 4 bytes themselves). The size -1
* is special. The client uses it to send a "ping" to the server at regular
* intervals, and the server specifically ignores any RPC with size -1. We
* don't do this in this client, because it's mostly useless, and we rely on
* TCP keepalive instead.
*
* Then the RPC ID is written (4 bytes). BTW, all integer values are
* encoded in big endian, as it's the default in Java world (Sun, SPARC...).
*
* Then the length of the method name is written on 2 bytes (I guess 1 byte
* wasn't enough in case you wanted to have 32768 byte long method names).
*
* Then the method name itself is written as-is (as a byte array).
*
* The last 4 fields are what constitute the "RPC header". The remaining
* bytes are the parameters of the request. First, there is a 4-byte int
* that specifies how many parameters follow (this way you can have up to
* 2 147 483 648 parameters, which may come in handy in a few centuries).
*
* In HBase 0.92 and above, 3 more fields have been added in the header as
* previously described. The first is a one byte version number that comes
* right before the method name, indicating how the parameters of the RPC
* have been serialized. Then there is a 8 byte (!) client version that's
* right after the method name, followed by a 4 byte "fingerprint", which
* is a sort of hash code of the method's signature (name, return type, and
* parameters types). Note that the client version seems to be always set
* to zero...
*
* In Hadoop RPC, the name of the class is first serialized (2 bytes
* specifying the length of the string, followed by that number of bytes
* of a UTF-8 encoded string in case you name your classes with Kanjis).
* In HBase RPC, a 1-byte ID representing the class name is written instead
* of writing the full class name. Those IDs are hard-coded in a central
* location (`HbaseObjectWritable', HBase's copy-pasted-hacked version of
* Hadoop's `ObjectWritable').
*
* The way each parameter is serialized depends on the object type being
* serialized. Since Hadoop doesn't use any automatic serialization
* framework, every class is free to serialize itself however it wants.
* The way it works is that for built-in types, they'll handle the
* serialization manually, and for other objects, they require that those
* objects implement their `Writable' interface which requires that a method
* named `readFields' and a method named `write' be implemented to
* de-serialize / serialize the object. So since the RPC layer knows the
* name of the class of the parameter, it will grab its `Class' using the
* Java Classloader and then `newInstance' it and then use `readFields' to
* populate the newly created instance. Thankfully most objects use a
* common library to serialize their own fields recursively, however things
* aren't always consistent, particularly when HBase chose to diverge from
* Hadoop in certain (but not all) code paths.
*
* The way RPC responses are encoded is as follows. First comes the 4-byte
* RPC ID. Then 1 byte containing flags indicating whether or not the
* request failed (0x01) on the remote side, and whether the response is
* framed (0x02). If flags are only 0x00, this is an old-style (pre 0.92)
* successful response that is not framed. Framed responses contain a
* 4-byte integer with the length of the entire response, including the
* leading RPC ID, flags, and the length itself. If there is a length, it
* is always followed by a 4-byte integer with the state of the RPC.
* As of 0.92, this state is mostly useless. If the request failed (flag 0x01
* is set), the rest of the response is just 2 Hadoop-encoded
* strings (2-byte length, followed by a UTF-8 string). The first string is
* the name of the class of the exception and the second is the message of
* the exception (which typically includes some of the server-side stack
* trace). Note that if the response is NOT framed, it's not easy to tell
* ahead of time how many bytes to expect or where the next response starts.
*
* If the RPC was successful, the remaining of the payload is serialized
* using the same method as the RPC parameters are serialized (see above).
*
* Before the very first RPC, the server expects a "hello" message that
* starts with 4-byte magic number, followed by the RPC version (1 byte).
* Then comes 4 bytes to specify the rest of the length of the "hello"
* message. The remaining is a `Writable' instance serialized that
* specifies which authentication provider to use and give our credentials.
* In HBase 0.92 and above, the `Writable' should represent what protocol
* the client wants to speak, which should be the name of an interface.
* "org.apache.hadoop.hbase.ipc.HRegionInterface" should be used.
* The "hello" message is implemented in `RegionClient#helloRpc'. In order
* to support HBase 0.92, we always piggy back a `getProtocolVersion' RPC
* right after the header, so we can tell what version the server is using
* and how to serialize RPCs and read its responses.
*/
// ------ //
// Flags. //
// ------ //
// 5th byte into the response.
// See ipc/ResponseFlag.java in HBase's source code.
/** Flag value: the RPC completed without a server-side error. */
static final byte RPC_SUCCESS = 0x00;

/** Flag value: the RPC failed on the remote side. */
static final byte RPC_ERROR = 0x01;

/**
 * Indicates that the next byte is an integer with the length of the response.
 * This can be found on both successful ({@link #RPC_SUCCESS}) or failed
 * ({@link #RPC_ERROR}) responses.
 * @since HBase 0.92
 */
static final byte RPC_FRAMED = 0x02;

// ----------- //
// RPC Status. //
// ----------- //
// 4 byte integer (on wire), located 9 bytes into the response, only if
// {@link #RPC_FRAMED} is set.
// See ipc/Status.java in HBase's source code.

/**
 * Indicates that an error prevented the RPC from being executed.
 * This is a somewhat misleading name.  It indicates that the RPC couldn't
 * be executed, typically because of a protocol version mismatch, an
 * incorrectly encoded RPC (or possibly corrupted on-wire such that the
 * server couldn't deserialize it), or an authentication error (unsure about
 * that one).
 */
static final byte RPC_FATAL = -1;
/**
 * To be implemented by the concrete sub-type.
 * This method is expected to instantiate a {@link ChannelBuffer} using
 * {@link #newBuffer} and return it
 * properly populated so it's ready to be written out to the wire (except
 * for the "RPC header" that contains the RPC ID and method name and such,
 * which is going to be populated automatically just before sending the RPC
 * out, see {@link RegionClient#encode}).
 *
 * Notice that this method is package-private, so only classes within this
 * package can use this as a base class.
 *
 * @param server_version What RPC protocol version the server is running.
 * @return A buffer containing the serialized form of this RPC's payload.
 */
abstract ChannelBuffer serialize(byte server_version);
/**
 * To be implemented by the concrete sub-type.
 * This method is expected to de-serialize a response received for the
 * current RPC, when communicating with HBase 0.95 and newer.
 *
 * Notice that this method is package-private, so only classes within this
 * package can use this as a base class.
 *
 * @param buf The buffer from which to de-serialize the response.
 * @param cell_size The size, in bytes, of the "cell block" that follows the
 * protobuf of the RPC response.  If 0, then there is just the protobuf.
 * The value is guaranteed to be both non-negative and of a "reasonable"
 * size.
 * @return The de-serialized response (the concrete type is RPC-specific).
 */
abstract Object deserialize(ChannelBuffer buf, int cell_size);
/**
 * Sanity check helper: fails loudly if a response carried a cell block.
 * @param cell_size The size, in bytes, of the cell block that followed
 * the protobuf response.
 * @throws InvalidResponseException if {@code cell_size} is non-zero.
 */
static void ensureNoCell(final int cell_size) {
  if (cell_size == 0) {
    return;  // Expected: no cell block follows the protobuf.
  }
  throw new InvalidResponseException(
    "Should not have gotten any cell blocks, yet there are "
    + cell_size + " bytes that follow the protobuf response."
    + " This should never happen."
    + " Are you using an incompatible version of HBase?", null);
}
/**
 * The Deferred that will be invoked when this RPC completes or fails.
 * In case of a successful completion, this Deferred's first callback
 * will be invoked with an {@link Object} containing the de-serialized
 * RPC response in argument.
 * Once an RPC has been used, we create a new Deferred for it, in case
 * the user wants to re-use it (see {@link #callback}).
 */
private Deferred<Object> deferred;

// The next 3 fields are package-private so subclasses can access them
// without them being part of the interface (unlike with `protected').

/**
 * The table for which this RPC is.
 * {@code null} if this RPC isn't for a particular table.
 * Invariants:
 *   table == null => key == null
 *   table != null => key != null
 */
final byte[] table;  // package-private for subclasses, not other classes.

/**
 * The row key for which this RPC is.
 * {@code null} if this RPC isn't for a particular row key.
 * Invariants:
 *   table == null => key == null
 *   table != null => key != null
 */
final byte[] key;  // package-private for subclasses, not other classes.

/**
 * The region for which this RPC is.
 * {@code null} if this RPC isn't for a single specific region.
 * Invariants:
 *   table == null => region == null
 *   table != null => region != null before {@link #serialize} gets called
 */
RegionInfo region;  // package-private for subclasses, not other classes.

/**
 * How many times have we retried this RPC?.
 * Only used by the low-level retry logic in {@link RegionClient} in order
 * to detect broken META tables (e.g. we keep getting an NSRE but META keeps
 * sending us to the same RegionServer that sends us the NSREs, or we keep
 * looking up the same row in META because of a "hole" in META).
 * Reset to 0 by {@link #callback} when the RPC completes.
 * <p>
 * Proper synchronization is required, although in practice most of the code
 * that accesses this attribute will have a happens-before relationship with
 * the rest of the code, due to other existing synchronization.
 */
byte attempt;  // package-private for RegionClient and HBaseClient only.

/**
 * If true, this RPC should fail-fast as soon as we know we have a problem.
 * See {@link #setFailfast}.
 */
boolean failfast = false;
/**
 * Sets whether or not this RPC should fail-fast instead of being retried
 * upon encountering a problem.
 * <p>
 * RPCs can be retried for various legitimate reasons (e.g. NSRE due to a
 * region moving), but under certain failure circumstances (such as a node
 * going down) we want to give up and be alerted as soon as possible.
 * @param failfast If {@code true}, this RPC should fail-fast as soon as
 * we know we have a problem.
 * @return The {@code failfast} value that was just stored.
 * @since 1.5
 */
public final boolean setFailfast(final boolean failfast) {
  this.failfast = failfast;
  return this.failfast;
}
/**
 * Returns whether or not this RPC will fail-fast instead of being retried
 * upon encountering a problem.
 * @return {@code true} if this RPC fails-fast, {@code false} otherwise.
 * @see #setFailfast
 * @since 1.5
 */
public final boolean failfast() {
  return failfast;
}
/**
 * Package private constructor for RPCs that aren't for any region.
 * Leaves both {@link #table} and {@link #key} {@code null}, consistent
 * with the invariants documented on those fields.
 */
HBaseRpc() {
  table = null;
  key = null;
}
/**
 * Package private constructor for RPCs that are for a region.
 * Both arguments are validated by {@link KeyValue#checkTable} and
 * {@link KeyValue#checkKey}, and are stored without being copied
 * (see the class documentation about byte arrays).
 * @param table The name of the table this RPC is for.
 * @param key The row key this RPC is for.
 */
HBaseRpc(final byte[] table, final byte[] key) {
  KeyValue.checkTable(table);
  KeyValue.checkKey(key);
  this.table = table;
  this.key = key;
}
// ---------------------- //
// Package private stuff. //
// ---------------------- //

/**
 * Package private way of getting the name of the RPC method.
 * @param server_version What RPC protocol version the server is running.
 * @return The method name as the raw bytes to put on the wire.
 */
abstract byte[] method(byte server_version);
/**
 * Sets the region this RPC is going to.
 * <p>
 * This method is invoked by {@link HBaseClient} once the user gives it
 * their {@code HBaseRpc}, because {@link HBaseClient} is the only thing
 * that knows about and keeps track of regions.  Users don't care about
 * regions.
 * @param region The target region of this RPC.
 * @throws AssertionError if this RPC wasn't created with a table (an
 * RPC without a table cannot be routed to a region).
 */
final void setRegion(final RegionInfo region) {
  if (table != null) {
    this.region = region;
    return;
  }
  throw new AssertionError("Can't use setRegion if no table was given.");
}
/**
 * Returns the region this RPC is supposed to go to.
 * @return The target region, or {@code null} if none was set.
 */
final RegionInfo getRegion() {
  return this.region;
}
/**
 * Package private way of accessing / creating the Deferred of this RPC.
 * Lazily instantiates the Deferred on first use (a completed RPC clears
 * it, so a later call here hands out a fresh one).
 */
final Deferred<Object> getDeferred() {
  Deferred<Object> d = deferred;
  if (d == null) {
    d = new Deferred<Object>();
    deferred = d;
  }
  return d;
}
/**
 * Package private way of making an RPC complete by giving it its result.
 * If this RPC has no {@link Deferred} associated to it, nothing will
 * happen.  This may happen if the RPC was already called back.
 * <p>
 * Once this call to this method completes, this object can be re-used to
 * re-send the same RPC, provided that no other thread still believes this
 * RPC to be in-flight (guaranteeing this may be hard in error cases).
 */
final void callback(final Object result) {
  final Deferred<Object> cb = deferred;
  if (cb == null) {
    return;  // Nobody is waiting on this RPC (or it already completed).
  }
  // Reset our state *before* firing the callback chain, so that the RPC
  // can be re-used from within one of its own callbacks.
  deferred = null;
  attempt = 0;
  cb.callback(result);
}
/**
 * Checks whether or not this RPC has a Deferred, without creating one
 * as a side effect (unlike {@link #getDeferred}).
 */
final boolean hasDeferred() {
  return this.deferred != null;
}
/**
 * Returns a human-readable representation of this RPC, for debugging.
 * Includes the method name, table, key, region and attempt count.
 */
@Override  // Fix: was missing the @Override annotation on Object#toString.
public String toString() {
  // Fix: decode the method name with an explicit charset instead of the
  // platform default, which is a latent portability bug in new String(byte[]).
  final String method = new String(this.method((byte) 0), CharsetUtil.UTF_8);
  // Try to rightsize the buffer.
  final StringBuilder buf = new StringBuilder(16 + method.length() + 2
    + 8 + (table == null ? 4 : table.length + 2)  // Assumption: ASCII => +2
    + 6 + (key == null ? 4 : key.length * 2)      // Assumption: binary => *2
    + 9 + (region == null ? 4 : region.stringSizeHint())
    + 10 + 1 + 1);
  buf.append("HBaseRpc(method=");
  buf.append(method);
  buf.append(", table=");
  Bytes.pretty(buf, table);
  buf.append(", key=");
  Bytes.pretty(buf, key);
  buf.append(", region=");
  if (region == null) {
    buf.append("null");
  } else {
    region.toStringbuf(buf);
  }
  buf.append(", attempt=").append(attempt);
  buf.append(')');
  return buf.toString();
}
/**
 * Helper for subclass's {@link #toString} implementations.
 * <p>
 * Convenience overload that prints neither values nor extra fields;
 * simply delegates to the 5-argument variant.  Used by subclasses such
 * as {@link DeleteRequest} or {@link GetRequest} to avoid duplication.
 * @param classname The name of the class of the caller.
 * @param family A possibly null family name.
 * @param qualifiers A non-empty list of qualifiers or null.
 * @return A human-readable string for debugging.
 */
final String toStringWithQualifiers(final String classname,
                                    final byte[] family,
                                    final byte[][] qualifiers) {
  return toStringWithQualifiers(classname, family, qualifiers, null, "");
}
/**
 * Helper for subclass's {@link #toString} implementations.
 * <p>
 * This is used by subclasses such as {@link DeleteRequest}
 * or {@link GetRequest}, to avoid code duplication.
 * @param classname The name of the class of the caller.
 * @param family A possibly null family name.
 * @param qualifiers A non-empty list of qualifiers or null.
 * @param values A non-empty list of values or null.
 * @param fields Additional fields to include in the output.
 * @return A human-readable string for debugging.
 */
final String toStringWithQualifiers(final String classname,
                                    final byte[] family,
                                    final byte[][] qualifiers,
                                    final byte[][] values,
                                    final String fields) {
  final StringBuilder out = new StringBuilder(256  // min=182
                                              + fields.length());
  out.append(classname).append("(table=");
  Bytes.pretty(out, table);
  out.append(", key=");
  Bytes.pretty(out, key);
  out.append(", family=");
  Bytes.pretty(out, family);
  out.append(", qualifiers=");
  Bytes.pretty(out, qualifiers);
  if (values != null) {  // Values are optional; omit the field entirely.
    out.append(", values=");
    Bytes.pretty(out, values);
  }
  out.append(fields)
     .append(", attempt=").append(attempt)
     .append(", region=");
  if (region == null) {
    out.append("null");
  } else {
    region.toStringbuf(out);
  }
  return out.append(')').toString();
}
/**
 * Helper for subclass's {@link #toString} implementations.
 * <p>
 * Single-qualifier variant of {@link #toStringWithQualifiers}.  Used by
 * subclasses such as {@link DeleteRequest} or {@link GetRequest} to
 * avoid code duplication.
 * @param classname The name of the class of the caller.
 * @param family A possibly null family name.
 * @param qualifier A possibly null column qualifier.
 * @param fields Additional fields to include in the output.
 * @return A human-readable string for debugging.
 */
final String toStringWithQualifier(final String classname,
                                   final byte[] family,
                                   final byte[] qualifier,
                                   final String fields) {
  final StringBuilder out = new StringBuilder(256  // min=181
                                              + fields.length());
  out.append(classname).append("(table=");
  Bytes.pretty(out, table);
  out.append(", key=");
  Bytes.pretty(out, key);
  out.append(", family=");
  Bytes.pretty(out, family);
  out.append(", qualifier=");
  Bytes.pretty(out, qualifier);
  out.append(fields)
     .append(", attempt=").append(attempt)
     .append(", region=");
  if (region == null) {
    out.append("null");
  } else {
    region.toStringbuf(out);
  }
  return out.append(')').toString();
}
// --------------------- //
// RPC utility functions //
// --------------------- //
/*
* The remaining of this file is just a whole bunch of functions to make
* it easier to deal with the absolutely horrible Hadoop RPC protocol.
*
* One should assume that all the following functions can throw an
* IndexOutOfBoundsException when reading past the end of a buffer
* or writing past the end of a fixed-length buffer.
*
* A number of functions, particularly those reading something, will
* throw an IllegalArgumentException if the buffer they're asked to
* parse contains junk or otherwise corrupted or suspicious data.
*/
/**
 * Creates a new fixed-length buffer on the heap.
 * The returned buffer's writer index is already advanced past the space
 * reserved for the RPC header, so callers can immediately write their
 * payload.
 * @param server_version What RPC protocol version the server is running.
 * @param max_payload_size A good approximation of the size of the payload.
 * The approximation must be an upper bound on the expected size of the
 * payload as trying to store more than {@code max_payload_size} bytes in
 * the buffer returned will cause an {@link ArrayIndexOutOfBoundsException}.
 */
final ChannelBuffer newBuffer(final byte server_version,
                              final int max_payload_size) {
  // Space reserved for the RPC header:
  //   4 bytes: Payload size (always present, even in HBase 0.95+).
  //   4 bytes: RPC ID.
  //   2 bytes: Length of the method name.
  //   N bytes: The method name.
  int header = 4 + 4 + 2 + method(server_version).length;
  if (server_version >= RegionClient.SERVER_VERSION_092_OR_ABOVE) {
    // Extra header fields used in HBase 0.92 and above:
    //   1 byte:  RPC header version.
    //   8 bytes: Client version.  Yeah, 8 bytes, WTF seriously.
    //   4 bytes: Method fingerprint.
    header += 1 + 8 + 4;
  }
  // Note: with HBase 0.95 and up, the size of the protobuf header varies.
  // It is currently made of (see RequestHeader in RPC.proto):
  //   - uint32 callId: varint 1 to 5 bytes.
  //   - RPCTInfo traceInfo: two uint64 varint so 4 to 20 bytes.
  //   - string methodName: varint length (1 byte) and method name.
  //   - bool requestParam: 1 byte.
  //   - CellBlockMeta cellBlockMeta: one uint32 varint so 2 to 6 bytes.
  // Additionally each field costs an extra 1 byte, and there is a varint
  // prior to the header for the size of the header.  We don't set traceInfo
  // right now so that leaves us with 4 fields for a total maximum size of
  // 1 varint + 4 fields + 5 + 1 + N + 1 + 6 = 18 bytes max + method name.
  // Since for HBase 0.92 we reserve 19 bytes, we're good, we over-allocate
  // at most 1 byte.  So the logic above doesn't need to change for 0.95+.
  final ChannelBuffer buf = ChannelBuffers.buffer(header + max_payload_size);
  buf.setIndex(0, header);  // Advance the writerIndex past the header.
  return buf;
}
/**
 * Serializes the given protobuf object into a Netty {@link ChannelBuffer}.
 * @param method The name of the method of the RPC we're going to send.
 * @param pb The protobuf to serialize.
 * @return A new channel buffer containing the serialized protobuf, with
 * enough free space at the beginning to tack on the RPC header.
 */
static final ChannelBuffer toChannelBuffer(final byte[] method,
                                           final AbstractMessageLite pb) {
  final int pblen = pb.getSerializedSize();
  final int vlen = CodedOutputStream.computeRawVarint32Size(pblen);
  // Reserve 4 + 19 bytes plus the method name at the front of the array
  // for the RPC header, which gets filled in later.
  final int header = 4 + 19 + method.length;
  final byte[] payload = new byte[header + vlen + pblen];
  try {
    final CodedOutputStream out =
      CodedOutputStream.newInstance(payload, header, vlen + pblen);
    out.writeRawVarint32(pblen);  // Varint length prefix for the protobuf.
    pb.writeTo(out);
    out.checkNoSpaceLeft();  // Sanity check: we sized the array exactly.
  } catch (IOException e) {
    throw new RuntimeException("Should never happen", e);
  }
  return ChannelBuffers.wrappedBuffer(payload);
}
/**
 * Writes a {@link Boolean boolean} as an HBase RPC parameter.
 * @param buf The buffer to serialize the boolean to.
 * @param b The boolean value to serialize.
 */
static void writeHBaseBool(final ChannelBuffer buf, final boolean b) {
  buf.writeByte(1);  // Code for Boolean.class in HbaseObjectWritable
  if (b) {
    buf.writeByte(0x01);
  } else {
    buf.writeByte(0x00);
  }
}
/**
 * Writes an {@link Integer int} as an HBase RPC parameter.
 * Emits the one-byte type code followed by the 4-byte big-endian value.
 * @param buf The buffer to serialize the value to.
 * @param v The value to serialize.
 */
static void writeHBaseInt(final ChannelBuffer buf, final int v) {
  buf.writeByte(5);  // Code for Integer.class in HbaseObjectWritable
  buf.writeInt(v);
}
/**
 * Writes a {@link Long long} as an HBase RPC parameter.
 * Emits the one-byte type code followed by the 8-byte big-endian value.
 * @param buf The buffer to serialize the value to.
 * @param v The value to serialize.
 */
static void writeHBaseLong(final ChannelBuffer buf, final long v) {
  buf.writeByte(6);  // Code for Long.class in HbaseObjectWritable
  buf.writeLong(v);
}
/**
 * Writes a {@link String} as an HBase RPC parameter.
 * The string is UTF-8 encoded and prefixed with its vlong-encoded length.
 * @param buf The buffer to serialize the string to.
 * @param s The string to serialize.
 */
static void writeHBaseString(final ChannelBuffer buf, final String s) {
  buf.writeByte(10);  // Code for String.class in HbaseObjectWritable
  final byte[] utf8 = s.getBytes(CharsetUtil.UTF_8);
  writeVLong(buf, utf8.length);
  buf.writeBytes(utf8);
}
/**
 * Writes a byte array as an HBase RPC parameter.
 * Emits the one-byte type code and then delegates to
 * {@link #writeByteArray} for the length-prefixed payload.
 * @param buf The buffer to serialize the array to.
 * @param b The byte array to serialize.
 */
static void writeHBaseByteArray(final ChannelBuffer buf, final byte[] b) {
  buf.writeByte(11);  // Code for byte[].class in HbaseObjectWritable
  writeByteArray(buf, b);
}
/**
 * Writes a byte array, prefixed by its vlong-encoded length.
 * @param buf The buffer to serialize the array to.
 * @param b The byte array to serialize.
 */
static void writeByteArray(final ChannelBuffer buf, final byte[] b) {
  writeVLong(buf, b.length);  // Length prefix first...
  buf.writeBytes(b);          // ... then the raw bytes.
}
/**
 * Serializes a `null' reference as an HBase RPC parameter.
 * Writes the three-byte encoding HBase uses for nulls: the `Writable'
 * type code, the `NullInstance' code, and the declared type code
 * (`Writable' again).
 * @param buf The buffer to write to.
 */
static void writeHBaseNull(final ChannelBuffer buf) {
  buf.writeByte(14);  // Code type for `Writable'.
  buf.writeByte(17);  // Code type for `NullInstance'.
  buf.writeByte(14);  // Code type for `Writable'.
}
/**
 * Upper bound on the size of a byte array we de-serialize.
 * This is to prevent HBase from OOM'ing us, should there be a bug or
 * undetected corruption of an RPC on the network, which would turn
 * an innocuous RPC into something allocating a ton of memory.
 * The Hadoop RPC protocol doesn't do any checksumming as they probably
 * assumed that TCP checksums would be sufficient (they're not).
 * Any length with one of these bits set is rejected: that covers both
 * negative values (sign bit set) and values larger than 256MB.
 */
static final long MAX_BYTE_ARRAY_MASK =
  0xFFFFFFFFF0000000L;  // => max = 256MB
/**
 * Verifies that the given length looks like a reasonable array length.
 * This method accepts 0 as a valid length.
 * @param buf The buffer from which the length was read (used in the
 * error message only).
 * @param length The length to validate.
 * @throws IllegalArgumentException if the length is negative or
 * suspiciously large.
 */
static void checkArrayLength(final ChannelBuffer buf, final long length) {
  // 2 checks in 1.  If any of the high bits are set, we know the value is
  // either too large, or is negative (if the most-significant bit is set).
  if ((length & MAX_BYTE_ARRAY_MASK) == 0) {
    return;  // Looks sane.
  }
  if (length < 0) {
    throw new IllegalArgumentException("Read negative byte array length: "
      + length + " in buf=" + buf + '=' + Bytes.pretty(buf));
  }
  throw new IllegalArgumentException("Read byte array length that's too"
    + " large: " + length + " > " + ~MAX_BYTE_ARRAY_MASK + " in buf="
    + buf + '=' + Bytes.pretty(buf));
}
/**
 * Verifies that the given array looks like a reasonably big array.
 * This method accepts empty arrays.
 * @param array The array to check.
 * @throws IllegalArgumentException if the length of the array is
 * suspiciously large.
 * @throws NullPointerException if the array is {@code null}.
 */
static void checkArrayLength(final byte[] array) {
  if ((array.length & MAX_BYTE_ARRAY_MASK) == 0) {
    return;  // Looks sane.
  }
  if (array.length < 0) {  // Not possible unless there's a JVM bug.
    throw new AssertionError("Negative byte array length: "
      + array.length + ' ' + Bytes.pretty(array));
  }
  throw new IllegalArgumentException("Byte array length too big: "
    + array.length + " > " + ~MAX_BYTE_ARRAY_MASK);
  // Don't dump the gigantic byte array in the exception message.
}
/**
 * Verifies that the given length looks like a reasonable array length.
 * This method does not accept 0 as a valid length.
 * @param buf The buffer from which the length was read (used in the
 * error message only).
 * @param length The length to validate.
 * @throws IllegalArgumentException if the length is zero, negative or
 * suspiciously large.
 */
static void checkNonEmptyArrayLength(final ChannelBuffer buf,
                                     final long length) {
  if (length == 0) {
    // Fix: the message previously contained a doubled space because of a
    // stray trailing blank in the first string fragment.
    throw new IllegalArgumentException("Read zero-length byte array"
      + " in buf=" + buf + '=' + Bytes.pretty(buf));
  }
  checkArrayLength(buf, length);  // Also reject negative / huge lengths.
}
/**
 * Reads a byte array prefixed by its vlong-encoded length.
 * @param buf The buffer from which to read the array.
 * @return A possibly empty but guaranteed non-{@code null} byte array.
 * @throws IllegalArgumentException if the length we read for the byte array
 * is out of reasonable bounds.
 */
static byte[] readByteArray(final ChannelBuffer buf) {
  final long length = readVLong(buf);
  checkArrayLength(buf, length);  // Reject negative / huge lengths.
  final byte[] result = new byte[(int) length];
  buf.readBytes(result);
  return result;
}
/**
 * Reads a string encoded by {@code hadoop.io.WritableUtils#readString}:
 * a 4-byte length followed by that many bytes of UTF-8 text.
 * @param buf The buffer from which to read the string.
 * @throws IllegalArgumentException if the length we read for the string
 * is out of reasonable bounds.
 */
static String readHadoopString(final ChannelBuffer buf) {
  final int length = buf.readInt();  // 4-byte big-endian length prefix.
  checkArrayLength(buf, length);
  final byte[] utf8 = new byte[length];
  buf.readBytes(utf8);
  return new String(utf8, CharsetUtil.UTF_8);
}
/**
 * De-serializes a protobuf from the given buffer.
 * <p>
 * The protobuf is assumed to be prefixed by a varint indicating its size.
 * In both branches below the buffer's reader index ends up just past the
 * protobuf, so callers can continue reading what follows (e.g. a cell
 * block).
 * @param buf The buffer to de-serialize the protobuf from.
 * @param parser The protobuf parser to use for this type of protobuf.
 * @return An instance of the de-serialized type.
 * @throws InvalidResponseException if the buffer contained an invalid
 * protobuf that couldn't be de-serialized.
 */
static <T> T readProtobuf(final ChannelBuffer buf, final Parser<T> parser) {
  final int length = HBaseRpc.readProtoBufVarint(buf);
  HBaseRpc.checkArrayLength(buf, length);
  final byte[] payload;
  final int offset;
  if (buf.hasArray()) {  // Zero copy.
    payload = buf.array();
    offset = buf.arrayOffset() + buf.readerIndex();
    // Fix: consume the bytes we're about to parse.  The copying branch
    // below advances the reader index via readBytes(), but this branch
    // previously left it untouched, leaving the buffer in an inconsistent
    // state for subsequent reads.
    buf.skipBytes(length);
  } else {  // We have to copy the entire payload out of the buffer :(
    payload = new byte[length];
    buf.readBytes(payload);
    offset = 0;
  }
  try {
    return parser.parseFrom(payload, offset, length);
  } catch (InvalidProtocolBufferException e) {
    final String msg = "Invalid RPC response: length=" + length
      + ", payload=" + Bytes.pretty(payload);
    throw new InvalidResponseException(msg, e);
  }
}
// -------------------------------------- //
// Variable-length integer value encoding //
// -------------------------------------- //
/*
* Unofficial documentation of the Hadoop VLong encoding
* *****************************************************
*
* The notation used to refer to binary numbers here is `0b' followed by
* the bits, as is printed by Python's built-in `bin' function for example.
*
* Values between
* -112 0b10010000
* and
* 127 0b01111111
* (inclusive) are encoded on a single byte using their normal
* representation. The boundary "-112" sounds weird at first (and it is)
* but it'll become clearer once you understand the format.
*
* Values outside of the boundaries above are encoded by first having
* 1 byte of meta-data followed by a variable number of bytes that make up
* the value being encoded.
*
* The "meta-data byte" always starts with the prefix 0b1000. Its format
* is as follows:
* 1 0 0 0 | S | L L L
* The bit `S' is the sign bit (1 = positive value, 0 = negative, yes
* that's weird, I would've done it the other way around).
* The 3 bits labeled `L' indicate how many bytes make up this variable
* length value. They're encoded like so:
* 1 1 1 = 1 byte follows
* 1 1 0 = 2 bytes follow
* 1 0 1 = 3 bytes follow
* 1 0 0 = 4 bytes follow
* 0 1 1 = 5 bytes follow
* 0 1 0 = 6 bytes follow
* 0 0 1 = 7 bytes follow
* 0 0 0 = 8 bytes follow
* Yes, this is weird too, it goes backwards, requires more operations to
* convert the length into something human readable, and makes sorting the
* numbers unnecessarily complicated.
* Notice that the prefix wastes 3 bits. Also, there's no "VInt", all
* variable length encoded values are eventually transformed to `long'.
*
* The remaining bytes are just the original number, as-is, without the
* unnecessary leading bytes (that are all zeroes).
*
* Examples:
* 42 is encoded as 00101010 (as-is, 1 byte)
* 127 is encoded as 01111111 (as-is, 1 byte)
* 128 is encoded as 10001111 10000000 (2 bytes)
* 255 is encoded as 10001111 11111111 (2 bytes)
* 256 is encoded as 10001110 00000001 00000000 (3 bytes)
* -1 is encoded as 11111111 (as-is, 1 byte)
* -42 is encoded as 11010110 (as-is, 1 byte)
* -112 is encoded as 10010000 (as-is, 1 byte)
* -113 is encoded as 10000111 01110000 (2 bytes)
* -256 is encoded as 10000111 11111111 (2 bytes)
* -257 is encoded as 10000110 00000001 00000000 (3 bytes)
*
* The implementations of writeVLong and readVLong below are on average
* 14% faster than Hadoop's implementation given a uniformly distributed
* input (lots of values of all sizes), and can be up to 40% faster on
* certain input sizes (e.g. big values that fit on 8 bytes). This is due
* to two main things: fewer arithmetic and logic operations, and processing
* multiple bytes together when possible.
* Reading is about 6% faster than writing (negligible difference).
* My MacBook Pro with a 2.66 GHz Intel Core 2 Duo easily does 5000 calls to
* readVLong or writeVLong per millisecond.
*
* However, since we use Netty, we don't have to deal with the stupid Java
* I/O library, so unlike Hadoop we don't use DataOutputStream and
* ByteArrayOutputStream, instead we use ChannelBuffer. This gives us a
* significant extra performance boost over Hadoop. The 14%-40% difference
* above becomes a 70% to 80% difference! Yes, that's >4 times faster! With
* the code below my MacBook Pro with a 2.66 GHz Intel Core 2 Duo easily
* does 11000 writeVLong/ms or 13500 readVLong/ms (notice that reading is
* 18% faster) when using a properly sized dynamicBuffer. When using a
* fixed-size buffer, writing (14200/s) is almost as fast as reading
* (14500/s).
*
* So there's really no reason on Earth to use java.io. Its API is horrible
* and so is its performance.
*/
/**
 * Writes a variable-length {@link Long} value.
 * <p>
 * Implements the Hadoop VLong encoding described in the large comment
 * above: values in [-112, 127] are written as-is on a single byte; any
 * other value gets a meta-data byte (0b1000 prefix, sign bit, 3-bit byte
 * count) followed by its significant bytes in big-endian order.  Negative
 * values are bitwise-complemented first so that only significant bytes
 * need to be written.
 * @param buf The buffer to write to.
 * @param n The value to write.
 */
@SuppressWarnings("fallthrough")  // The fall through in case 0x03 is intentional.
static void writeVLong(final ChannelBuffer buf, long n) {
// All those values can be encoded on 1 byte.
if (n >= -112 && n <= 127) {
buf.writeByte((byte) n);
return;
}
// Set the high bit to indicate that more bytes are to come.
// Both 0x90 and 0x88 have the high bit set (and are thus negative).
byte b = (byte) 0x90; // 0b10010000
if (n < 0) {
n = ~n;
b = (byte) 0x88; // 0b10001000
}
{
// Count how many bytes `n' occupies by decrementing `b' once per byte.
long tmp = n;
do {
tmp >>>= 8;
// The first time we decrement `b' here, it's going to move the
// rightmost `1' in `b' to the right, due to the way 2's complement
// representation works. So if `n' is positive, and we started with
// b = 0b10010000, now we'll have b = 0b10001111, which correctly
// indicates that `n' is positive (5th bit set) and has 1 byte so far
// (last 3 bits are set). If `n' is negative, and we started with
// b = 0b10001000, now we'll have b = 0b10000111, which correctly
// indicates that `n' is negative (5th bit not set) and has 1 byte.
// Each time we keep decrementing this value, the last remaining 3
// bits are going to change according to the format described above.
b--;
} while (tmp != 0);
}
buf.writeByte(b);
// Write exactly the number of bytes announced by the meta-data byte,
// most-significant byte first.
switch (b & 0x07) { // Look at the low 3 bits (the length).
case 0x00: // "000" = 8 bytes follow.
buf.writeLong(n);
break;
case 0x01: // "001" = 7 bytes follow (4 + 3).
buf.writeInt((int) (n >>> 24));
buf.writeMedium((int) n);
break;
case 0x02: // "010" = 6 bytes follow (3 + 3).
buf.writeMedium((int) (n >>> 24));
buf.writeMedium((int) n);
break;
case 0x03: // "011" = 5 bytes follow (1 + 4).
buf.writeByte((byte) (n >>> 32));
// Intentional fall through to write the low 4 bytes.
case 0x04: // "100" = 4 bytes follow.
buf.writeInt((int) n);
break;
case 0x05: // "101" = 3 bytes follow.
buf.writeMedium((int) n);
break;
case 0x06: // "110" = 2 bytes follow.
buf.writeShort((short) n);
break;
case 0x07: // "111" = 1 byte follows.
buf.writeByte((byte) n);
}
}
/**
 * Reads a variable-length {@link Long} value.
 * <p>
 * Decodes the Hadoop VLong format documented in the large comment above;
 * this is the exact inverse of {@link #writeVLong}.
 * @param buf The buffer to read from.
 * @return The value read.
 */
@SuppressWarnings("fallthrough")  // The fall through in case 0x03 is intentional.
static long readVLong(final ChannelBuffer buf) {
  byte b = buf.readByte();
  // Unless the first half of the first byte starts with 0b1000, we're
  // dealing with a single-byte value.
  if ((b & 0xF0) != 0x80) { // 0xF0 = 0b11110000, 0x80 = 0b10000000
    return b;
  }
  // The value is negative if the 5th bit is 0.
  final boolean negate = (b & 0x08) == 0; // 0x08 = 0b00001000
  long result = 0;
  switch (b & 0x07) { // Look at the low 3 bits (the length).
    case 0x00: // "000" = 8 bytes follow.
      result = buf.readLong();
      break;
    case 0x01: // "001" = 7 bytes follow (4 + 3).
      result = buf.readUnsignedInt();
      // BUGFIX: the 4 bytes just read are the *top* 4 of 7, so they must
      // be shifted up by 3 bytes (24 bits), not 32, before OR-ing in the
      // low 3 bytes.  This mirrors writeVLong's writeInt(n >>> 24).
      result <<= 24;
      result |= buf.readUnsignedMedium();
      break;
    case 0x02: // "010" = 6 bytes follow (3 + 3).
      result = buf.readUnsignedMedium();
      result <<= 24;
      result |= buf.readUnsignedMedium();
      break;
    case 0x03: // "011" = 5 bytes follow (1 + 4).
      b = buf.readByte();
      result <<= 8;   // No-op (result is still 0); kept for symmetry.
      result |= b & 0xFF;
      // Intentional fall through to read the low 4 bytes.
    case 0x04: // "100" = 4 bytes follow.
      result <<= 32;
      result |= buf.readUnsignedInt();
      break;
    case 0x05: // "101" = 3 bytes follow.
      result |= buf.readUnsignedMedium();
      break;
    case 0x06: // "110" = 2 bytes follow.
      result |= buf.readUnsignedShort();
      break;
    case 0x07: // "111" = 1 byte follows.
      b = buf.readByte();
      result <<= 8;   // No-op (result is still 0); kept for symmetry.
      result |= b & 0xFF;
  }
  // Negative values were stored bitwise-complemented (see writeVLong).
  return negate ? ~result : result;
}
/**
 * Reads a 32-bit variable-length integer value as used in Protocol Buffers.
 * <p>
 * Each byte contributes its low 7 bits, least-significant group first; the
 * high bit of each byte indicates whether another byte follows.  A valid
 * 32-bit varint spans at most 5 bytes.
 * @param buf The buffer to read from.
 * @return The integer read.
 * @throws IllegalArgumentException if the varint doesn't terminate by its
 * 5th byte.
 */
static int readProtoBufVarint(final ChannelBuffer buf) {
  int result = 0;
  // The first 4 bytes each contribute 7 bits (28 bits total).
  for (int shift = 0; shift < 28; shift += 7) {
    final byte current = buf.readByte();
    result |= (current & 0x7F) << shift;
    if (current >= 0) {  // High bit clear: this was the last byte.
      return result;
    }
  }
  // 5th byte: only its low 4 bits fit in a 32-bit value, and its high bit
  // must be clear, otherwise the varint is too long to be a 32-bit value.
  final byte b = buf.readByte();
  result |= b << 28;  // Only bits 0-3 of `b' survive the shift.
  if (b >= 0) {
    return result;
  }
  throw new IllegalArgumentException("Not a 32 bit varint: " + result
    + " (5th byte: " + b + ")");
}
}