/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.serde2.lazybinary;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.lazybinary.objectinspector.LazyBinaryObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.WritableUtils;
/**
 * Static helpers for the lazy binary serialization format: decoding
 * fixed-width big-endian integers, reading and writing zero-compressed
 * (variable-length) integers, locating a field inside a serialized row,
 * and looking up the object inspector for a given type.
 */
public class LazyBinaryUtils {

  /**
   * Convert four bytes of the array, starting at the given offset, to an int.
   * The byte at {@code offset} is the most significant byte.
   * Adapted from code by aeden on DZone Snippets.
   *
   * @param b the byte array
   * @param offset the array offset of the first (most significant) byte
   * @return the integer
   */
  public static int byteArrayToInt(byte[] b, int offset) {
    int value = 0;
    for (int i = 0; i < 4; i++) {
      int shift = (4 - 1 - i) * 8;
      // Mask to strip the sign extension of the byte before shifting.
      value += (b[i + offset] & 0x000000FF) << shift;
    }
    return value;
  }

  /**
   * Convert eight bytes of the array, starting at the given offset, to a long.
   * The byte at {@code offset} is the most significant byte.
   *
   * @param b the byte array
   * @param offset the array offset of the first (most significant) byte
   * @return the long
   */
  public static long byteArrayToLong(byte[] b, int offset) {
    long value = 0;
    for (int i = 0; i < 8; i++) {
      int shift = (8 - 1 - i) * 8;
      // Widen to long before shifting so shifts of 32 bits or more are not lost.
      value += ((long) (b[i + offset] & 0x00000000000000FF)) << shift;
    }
    return value;
  }

  /**
   * Convert two bytes of the array, starting at the given offset, to a short.
   * The byte at {@code offset} is the most significant byte.
   *
   * @param b the byte array
   * @param offset the array offset of the first (most significant) byte
   * @return the short
   */
  public static short byteArrayToShort(byte[] b, int offset) {
    short value = 0;
    value += (b[offset] & 0x000000FF) << 8;
    value += (b[offset + 1] & 0x000000FF);
    return value;
  }

  /**
   * Record is the unit that data is serialized in.
   * A record includes two parts. The first part stores the
   * size of the element and the second part stores the
   * real element.
   * <pre>
   *            size  element
   * record -> |----|-------------------------|
   * </pre>
   * A RecordInfo stores two pieces of information about a record:
   * the size of the "size" part, which is the element offset,
   * and the size of the element part, which is the element size.
   */
  public static class RecordInfo {
    public RecordInfo() {
      elementOffset = 0;
      elementSize = 0;
    }

    /** Number of bytes taken by the "size" part, i.e. where the element starts. */
    public byte elementOffset;
    /** Number of bytes taken by the element itself. */
    public int elementSize;
  }

  /**
   * Shared scratch VInt. It is no longer used by
   * {@link #checkObjectByteInfo}, because a single static instance can be
   * overwritten by concurrent callers. The field is kept unchanged in case
   * other classes in this package still reference it.
   */
  static VInt vInt = new LazyBinaryUtils.VInt();

  /**
   * Check a particular field and set its size and offset in bytes
   * based on the field type and the bytes arrays.
   *
   * <ul>
   * <li>void, boolean, byte: offset 0, size 1.</li>
   * <li>short: offset 0, size 2.</li>
   * <li>float: offset 0, size 4.</li>
   * <li>double: offset 0, size 8.</li>
   * <li>int, long: offset 0; the size is whatever
   * {@code WritableUtils.decodeVIntSize} reports for the byte at
   * {@code offset}.</li>
   * <li>string: the field starts with a zero-compressed int holding the
   * element size; the offset is the number of bytes that int occupies.</li>
   * <li>list, map, struct: the first four bytes at {@code offset} hold the
   * element size, so the offset is 4.</li>
   * </ul>
   *
   * @param objectInspector object inspector of the field
   * @param bytes bytes arrays store the table row
   * @param offset offset of this field
   * @param recordInfo modified in place to hold the computed offset and size
   * @throws RuntimeException if the category or primitive category is not
   *         one of those listed above
   */
  public static void checkObjectByteInfo(ObjectInspector objectInspector, byte[] bytes,
      int offset, RecordInfo recordInfo) {
    Category category = objectInspector.getCategory();
    switch (category) {
    case PRIMITIVE: {
      PrimitiveCategory primitiveCategory =
          ((PrimitiveObjectInspector) objectInspector).getPrimitiveCategory();
      switch (primitiveCategory) {
      case VOID:
      case BOOLEAN:
      case BYTE:
        recordInfo.elementOffset = 0;
        recordInfo.elementSize = 1;
        break;
      case SHORT:
        recordInfo.elementOffset = 0;
        recordInfo.elementSize = 2;
        break;
      case FLOAT:
        recordInfo.elementOffset = 0;
        recordInfo.elementSize = 4;
        break;
      case DOUBLE:
        recordInfo.elementOffset = 0;
        recordInfo.elementSize = 8;
        break;
      case INT:
      case LONG:
        // Both are stored zero-compressed; the first byte encodes the length.
        recordInfo.elementOffset = 0;
        recordInfo.elementSize = WritableUtils.decodeVIntSize(bytes[offset]);
        break;
      case STRING: {
        // The length prefix is a vint rather than 4 fixed bytes. Decode it
        // into a local holder so concurrent callers cannot overwrite each
        // other's result through shared static state.
        VInt stringLength = new VInt();
        LazyBinaryUtils.readVInt(bytes, offset, stringLength);
        recordInfo.elementOffset = stringLength.length;
        recordInfo.elementSize = stringLength.value;
        break;
      }
      default:
        throw new RuntimeException("Unrecognized primitive type: " + primitiveCategory);
      }
      break;
    }
    case LIST:
    case MAP:
    case STRUCT:
      recordInfo.elementOffset = 4;
      recordInfo.elementSize = LazyBinaryUtils.byteArrayToInt(bytes, offset);
      break;
    default:
      throw new RuntimeException("Unrecognized non-primitive type: " + category);
    }
  }

  /**
   * A zero-compressed encoded long, together with the number of bytes its
   * encoding occupies.
   */
  public static class VLong {
    public VLong() {
      value = 0;
      length = 0;
    }

    /** The decoded value. */
    public long value;
    /** Number of bytes of the encoded form, including the first byte. */
    public byte length;
  }

  /**
   * Reads a zero-compressed encoded long from a byte array and stores it,
   * with its encoded length, in {@code vlong}.
   *
   * The encoded length is obtained from the first byte through
   * {@code WritableUtils.decodeVIntSize}. When that length is 1 the first
   * byte is the value itself. Otherwise the following {@code length - 1}
   * bytes are combined most significant byte first, and the result is
   * bitwise complemented when {@code WritableUtils.isNegativeVInt} reports
   * the first byte as negative.
   *
   * @param bytes the byte array
   * @param offset offset of the array to read from
   * @param vlong storing the deserialized long and its size in byte
   */
  public static void readVLong(byte[] bytes, int offset, VLong vlong) {
    byte firstByte = bytes[offset];
    vlong.length = (byte) WritableUtils.decodeVIntSize(firstByte);
    if (vlong.length == 1) {
      vlong.value = firstByte;
      return;
    }
    long i = 0;
    for (int idx = 0; idx < vlong.length - 1; idx++) {
      byte b = bytes[offset + 1 + idx];
      i = i << 8;
      i = i | (b & 0xFF);
    }
    vlong.value = (WritableUtils.isNegativeVInt(firstByte) ? (i ^ -1L) : i);
  }

  /**
   * A zero-compressed encoded integer, together with the number of bytes its
   * encoding occupies.
   */
  public static class VInt {
    public VInt() {
      value = 0;
      length = 0;
    }

    /** The decoded value. */
    public int value;
    /** Number of bytes of the encoded form, including the first byte. */
    public byte length;
  }

  /**
   * Reads a zero-compressed encoded int from a byte array and stores it,
   * with its encoded length, in {@code vInt}.
   *
   * The decoding steps are the same as in
   * {@link #readVLong(byte[], int, VLong)}, accumulated in an int.
   *
   * @param bytes the byte array
   * @param offset offset of the array to read from
   * @param vInt storing the deserialized int and its size in byte
   */
  public static void readVInt(byte[] bytes, int offset, VInt vInt) {
    byte firstByte = bytes[offset];
    vInt.length = (byte) WritableUtils.decodeVIntSize(firstByte);
    if (vInt.length == 1) {
      vInt.value = firstByte;
      return;
    }
    int i = 0;
    for (int idx = 0; idx < vInt.length - 1; idx++) {
      byte b = bytes[offset + 1 + idx];
      i = i << 8;
      i = i | (b & 0xFF);
    }
    vInt.value = (WritableUtils.isNegativeVInt(firstByte) ? (i ^ -1) : i);
  }

  /**
   * Writes a zero-compressed encoded int to a byte array.
   * Delegates to {@link #writeVLong(Output, long)}.
   *
   * @param byteStream the byte array/stream
   * @param i the int
   */
  public static void writeVInt(Output byteStream, int i) {
    writeVLong(byteStream, i);
  }

  /**
   * Write a zero-compressed encoded long to a byte array.
   *
   * Values from -112 to 127 inclusive are written as a single byte.
   * Any other value is written as one marker byte followed by the
   * significant bytes of the value, most significant first; a negative value
   * is replaced by its one's complement before its bytes are written. The
   * marker starts at -112 for non-negative values and -120 for negative
   * values and is decremented once per significant byte.
   *
   * @param byteStream the byte array/stream
   * @param l the long
   */
  public static void writeVLong(Output byteStream, long l) {
    if (l >= -112 && l <= 127) {
      byteStream.write((byte) l);
      return;
    }

    int len = -112;
    if (l < 0) {
      l ^= -1L; // take one's complement
      len = -120;
    }

    // Count the significant bytes by shifting until nothing is left.
    long tmp = l;
    while (tmp != 0) {
      tmp = tmp >> 8;
      len--;
    }

    byteStream.write((byte) len);

    // Recover the number of value bytes from the marker.
    len = (len < -120) ? -(len + 120) : -(len + 112);

    for (int idx = len; idx != 0; idx--) {
      int shiftbits = (idx - 1) * 8;
      long mask = 0xFFL << shiftbits;
      byteStream.write((byte) ((l & mask) >> shiftbits));
    }
  }

  /** Cache of object inspectors already built, keyed by type. */
  static HashMap<TypeInfo, ObjectInspector> cachedLazyBinaryObjectInspector =
      new HashMap<TypeInfo, ObjectInspector>();

  /**
   * Guards every access this class makes to
   * {@link #cachedLazyBinaryObjectInspector}, which is a plain HashMap and
   * must not be read and written by several threads without synchronization.
   */
  private static final Object INSPECTOR_CACHE_LOCK = new Object();

  /**
   * Returns the lazy binary object inspector that can be used to inspect a
   * lazy binary object of that typeInfo.
   *
   * For primitive types, we use the standard writable object inspector.
   * Results are cached per typeInfo. The cache is only locked around the
   * lookup and the insertion, never while an inspector is being built.
   *
   * @param typeInfo the type to get an object inspector for
   * @return the object inspector, or null if the category of typeInfo is not
   *         primitive, list, map or struct
   */
  public static ObjectInspector getLazyBinaryObjectInspectorFromTypeInfo(TypeInfo typeInfo) {
    synchronized (INSPECTOR_CACHE_LOCK) {
      ObjectInspector cached = cachedLazyBinaryObjectInspector.get(typeInfo);
      if (cached != null) {
        return cached;
      }
    }

    ObjectInspector result = createLazyBinaryObjectInspector(typeInfo);

    synchronized (INSPECTOR_CACHE_LOCK) {
      // Another thread may have cached an inspector for this type while this
      // one was being built; hand out the cached instance in that case.
      ObjectInspector existing = cachedLazyBinaryObjectInspector.get(typeInfo);
      if (existing != null) {
        return existing;
      }
      cachedLazyBinaryObjectInspector.put(typeInfo, result);
    }
    return result;
  }

  /**
   * Builds the object inspector for typeInfo without consulting the cache
   * for typeInfo itself. Nested types (list element, map key and value,
   * struct fields) are resolved through
   * {@link #getLazyBinaryObjectInspectorFromTypeInfo(TypeInfo)}.
   * Returns null for a category that is not handled.
   */
  private static ObjectInspector createLazyBinaryObjectInspector(TypeInfo typeInfo) {
    switch (typeInfo.getCategory()) {
    case PRIMITIVE: {
      return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
          ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory());
    }
    case LIST: {
      ObjectInspector elementObjectInspector = getLazyBinaryObjectInspectorFromTypeInfo(
          ((ListTypeInfo) typeInfo).getListElementTypeInfo());
      return LazyBinaryObjectInspectorFactory.getLazyBinaryListObjectInspector(
          elementObjectInspector);
    }
    case MAP: {
      MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo;
      ObjectInspector keyObjectInspector =
          getLazyBinaryObjectInspectorFromTypeInfo(mapTypeInfo.getMapKeyTypeInfo());
      ObjectInspector valueObjectInspector =
          getLazyBinaryObjectInspectorFromTypeInfo(mapTypeInfo.getMapValueTypeInfo());
      return LazyBinaryObjectInspectorFactory.getLazyBinaryMapObjectInspector(
          keyObjectInspector, valueObjectInspector);
    }
    case STRUCT: {
      StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
      List<String> fieldNames = structTypeInfo.getAllStructFieldNames();
      List<TypeInfo> fieldTypeInfos = structTypeInfo.getAllStructFieldTypeInfos();
      List<ObjectInspector> fieldObjectInspectors =
          new ArrayList<ObjectInspector>(fieldTypeInfos.size());
      for (TypeInfo fieldTypeInfo : fieldTypeInfos) {
        fieldObjectInspectors.add(getLazyBinaryObjectInspectorFromTypeInfo(fieldTypeInfo));
      }
      return LazyBinaryObjectInspectorFactory.getLazyBinaryStructObjectInspector(
          fieldNames, fieldObjectInspectors);
    }
    default: {
      return null;
    }
    }
  }
}