package com.infochimps.hbase.pig;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.conf.Configuration;
import org.apache.pig.LoadCaster;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadPushDown;
import org.apache.pig.LoadStoreCaster;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.StoreFuncInterface;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.backend.hadoop.hbase.HBaseTableInputFormat.HBaseTableIFBuilder;
import org.apache.pig.backend.hadoop.hbase.HBaseBinaryConverter;
import org.apache.pig.builtin.Utf8StorageConverter;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.data.BagFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.util.Utils;
import org.apache.pig.backend.hadoop.hbase.HBaseTableInputFormat;
import com.google.common.collect.Lists;
/**
 * A Pig LoadFunc and StoreFunc for HBase tables. Columns are requested as a
 * space-delimited list of family:qualifier entries; a bare family: entry loads
 * the whole column family as a bag of (qualifier, value) tuples.
 *
 * FIXME! I can't read data right now.
 */
public class HBaseStorage extends LoadFunc implements StoreFuncInterface, LoadPushDown {
private static final Log LOG = LogFactory.getLog(HBaseStorage.class);
private final static String STRING_CASTER = "UTF8StorageConverter";
private final static String BYTE_CASTER = "HBaseBinaryConverter";
private final static String CASTER_PROPERTY = "pig.hbase.caster";
private List<byte[]> columnList_ = Lists.newArrayList();
private List<byte[][]> input_columnList_ = Lists.newArrayList();
private HTable m_table;
private Configuration m_conf;
private RecordReader reader;
private RecordWriter writer;
private Scan scan;
private final CommandLine configuredOptions_;
private final static Options validOptions_ = new Options();
private final static CommandLineParser parser_ = new GnuParser();
private boolean loadRowKey_;
private final long limit_;
private final int tsField_;
private final int caching_;
protected transient byte[] gt_;
protected transient byte[] gte_;
protected transient byte[] lt_;
protected transient byte[] lte_;
private LoadCaster caster_;
private ResourceSchema schema_;
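/**
 * Registers the command-line options accepted in the optString constructor argument.
 */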
private static void populateValidOptions() {
validOptions_.addOption("loadKey", false, "Load Key");
validOptions_.addOption("gt", true, "Records must be greater than this value " +
"(binary, double-slash-escaped)");
validOptions_.addOption("lt", true, "Records must be less than this value (binary, double-slash-escaped)");
validOptions_.addOption("gte", true, "Records must be greater than or equal to this value");
validOptions_.addOption("lte", true, "Records must be less than or equal to this value");
validOptions_.addOption("caching", true, "Number of rows scanners should cache");
validOptions_.addOption("limit", true, "Per-region limit");
validOptions_.addOption("timestamp_field", true, "Zero based index of the field to use as the timestamp");
validOptions_.addOption("caster", true, "Caster to use for converting values. A class name, " +
"HBaseBinaryConverter, or Utf8StorageConverter. For storage, casters must implement LoadStoreCaster.");
}
/**
 * Constructor. Constructs an HBase table LoadFunc and StoreFunc to load or store the cells of the
 * provided columns.
 *
 * @param columnList
 * space-delimited list of columns, each given as family:qualifier or as a bare family:
 * @throws ParseException when unable to parse the arguments
 * @throws IOException
 */
public HBaseStorage(String columnList) throws ParseException, IOException {
this(columnList,"");
}
/**
 * Constructor. Constructs an HBase table LoadFunc and StoreFunc to load or store.
 * @param columnList space-delimited list of columns, each given as family:qualifier or as a bare family:
 * @param optString Loader options. Known options:<ul>
 * <li>-loadKey Load the row key as the first field of each tuple
 * <li>-gt=minKeyVal
 * <li>-lt=maxKeyVal
 * <li>-gte=minKeyVal
 * <li>-lte=maxKeyVal
 * <li>-limit=numRows maximum number of rows to retrieve per region
 * <li>-caching=numRows number of rows to cache (faster scans, more memory).
 * <li>-timestamp_field=index zero-based index of the tuple field to use as the cell timestamp when storing
 * <li>-caster Caster to use for converting values. A class name, HBaseBinaryConverter, or Utf8StorageConverter. For storage, casters must implement LoadStoreCaster.
 * </ul>
 * See the illustrative usage sketch after this constructor.
 * @throws ParseException
 * @throws IOException
 */
public HBaseStorage(String columnList, String optString) throws ParseException, IOException {
populateValidOptions();
String[] colNames = columnList.split(" ");
String[] optsArr = optString.split(" ");
try {
configuredOptions_ = parser_.parse(validOptions_, optsArr);
} catch (ParseException e) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp( "[-loadKey] [-gt] [-gte] [-lt] [-lte] [-limit] [-caching] [-caster] [-timestamp_field]", validOptions_ );
throw e;
}
loadRowKey_ = configuredOptions_.hasOption("loadKey");
for (String colName : colNames) {
columnList_.add(Bytes.toBytes(colName));
input_columnList_.add(Bytes.toByteArrays(colName.split(":")));
}
m_conf = HBaseConfiguration.create();
String defaultCaster = m_conf.get(CASTER_PROPERTY, STRING_CASTER);
String casterOption = configuredOptions_.getOptionValue("caster", defaultCaster);
if (STRING_CASTER.equalsIgnoreCase(casterOption)) {
caster_ = new Utf8StorageConverter();
} else if (BYTE_CASTER.equalsIgnoreCase(casterOption)) {
caster_ = new HBaseBinaryConverter();
} else {
try {
@SuppressWarnings("unchecked")
Class<LoadCaster> casterClass = (Class<LoadCaster>) Class.forName(casterOption);
caster_ = casterClass.newInstance();
} catch (ClassCastException e) {
LOG.error("Congifured caster does not implement LoadCaster interface.");
throw new IOException(e);
} catch (ClassNotFoundException e) {
LOG.error("Configured caster class not found.", e);
throw new IOException(e);
} catch (InstantiationException e) {
LOG.error("Unable to instantiate configured caster " + casterOption, e);
throw new IOException(e);
} catch (IllegalAccessException e) {
LOG.error("Illegal Access Exception for configured caster " + casterOption, e);
throw new IOException(e);
}
}
caching_ = Integer.valueOf(configuredOptions_.getOptionValue("caching", "100"));
limit_ = Long.valueOf(configuredOptions_.getOptionValue("limit", "-1"));
tsField_ = Integer.valueOf(configuredOptions_.getOptionValue("timestamp_field", "-1"));
initScan();
}
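/*
 * Illustrative usage sketch (not part of the original source; the table and column
 * names here are hypothetical). Loading two qualified columns plus the row key, with
 * 500 rows cached per scanner round trip, would be constructed as:
 *
 *   new HBaseStorage("info:first_name info:last_name", "-loadKey -caching 500");
 *
 * A bare family such as "links:" would instead come back from getNext() as a bag of
 * (qualifier, value) tuples.
 */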
private void initScan() {
scan = new Scan();
// Set filters, if any.
if (configuredOptions_.hasOption("gt")) {
gt_ = Bytes.toBytesBinary(Utils.slashisize(configuredOptions_.getOptionValue("gt")));
addFilter(CompareOp.GREATER, gt_);
}
if (configuredOptions_.hasOption("lt")) {
lt_ = Bytes.toBytesBinary(Utils.slashisize(configuredOptions_.getOptionValue("lt")));
addFilter(CompareOp.LESS, lt_);
}
if (configuredOptions_.hasOption("gte")) {
gte_ = Bytes.toBytesBinary(Utils.slashisize(configuredOptions_.getOptionValue("gte")));
addFilter(CompareOp.GREATER_OR_EQUAL, gte_);
}
if (configuredOptions_.hasOption("lte")) {
lte_ = Bytes.toBytesBinary(Utils.slashisize(configuredOptions_.getOptionValue("lte")));
addFilter(CompareOp.LESS_OR_EQUAL, lte_);
}
}
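/**
 * Adds a RowFilter with the given comparison operator and value to the scan's
 * FilterList, creating the list if one is not already set.
 */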
private void addFilter(CompareOp op, byte[] val) {
LOG.info("Adding filter " + op.toString() + " with value " + Bytes.toStringBinary(val));
FilterList scanFilter = (FilterList) scan.getFilter();
if (scanFilter == null) {
scanFilter = new FilterList();
}
scanFilter.addFilter(new RowFilter(op, new BinaryComparator(val)));
scan.setFilter(scanFilter);
}
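/**
 * Reads the next row from the scan and converts it to a tuple. When -loadKey is set,
 * the row key is emitted as the first field. Each family:qualifier column becomes a
 * DataByteArray field (null if the cell is missing); each bare family: column becomes
 * a bag of (qualifier, value) tuples.
 */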
@Override
public Tuple getNext() throws IOException {
try {
if (reader.nextKeyValue()) {
ImmutableBytesWritable rowKey = (ImmutableBytesWritable) reader
.getCurrentKey();
Result result = (Result) reader.getCurrentValue();
int tupleSize=columnList_.size();
if (loadRowKey_){
tupleSize++;
}
Tuple tuple=TupleFactory.getInstance().newTuple(tupleSize);
int startIndex=0;
if (loadRowKey_){
tuple.set(0, new DataByteArray(rowKey.get()));
startIndex++;
}
for (int i=0;i<columnList_.size();++i){
byte[][] col = input_columnList_.get(i);
if (col.length==2) { // we've got a colfam:colname pair
byte[] cell=result.getValue(col[0],col[1]);
if (cell!=null) {
if (cell.length > 0) {
tuple.set(i+startIndex, new DataByteArray(cell));
} else {
tuple.set(i+startIndex, new DataByteArray("1")); // As in, the column exists but is empty?
}
} else {
tuple.set(i+startIndex, null);
}
} else { // we've just got a colfam:
NavigableMap<byte[],byte[]> family = result.getFamilyMap(col[0]);
DataBag bagged_family = BagFactory.getInstance().newDefaultBag();
if (family != null) { // defensive: getFamilyMap can return null for an empty Result
for (Map.Entry<byte[],byte[]> keyvalue : family.entrySet()) {
Tuple entry = TupleFactory.getInstance().newTuple(2);
entry.set(0, new DataByteArray(keyvalue.getKey()));
entry.set(1, new DataByteArray(keyvalue.getValue()));
bagged_family.add(entry);
}
}
tuple.set(i+startIndex, bagged_family);
}
}
return tuple;
}
} catch (InterruptedException e) {
throw new IOException(e);
}
return null;
}
@Override
public InputFormat getInputFormat() {
HBaseTableInputFormat inputFormat = new HBaseTableIFBuilder()
.withLimit(limit_)
.withGt(gt_)
.withGte(gte_)
.withLt(lt_)
.withLte(lte_)
.withConf(m_conf)
.build();
return inputFormat;
}
@Override
public void prepareToRead(RecordReader reader, PigSplit split) {
this.reader = reader;
}
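/**
 * Points the loader at the given table (stripping an optional hbase:// prefix), sets
 * scanner caching, adds the requested columns and families to the scan, and serializes
 * the scan into the configuration read by TableInputFormat.
 */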
@Override
public void setLocation(String location, Job job) throws IOException {
String tablename = location;
if (location.startsWith("hbase://")){
tablename = location.substring(8);
}
if (m_table == null) {
m_table = new HTable(m_conf, tablename);
}
m_table.setScannerCaching(caching_);
m_conf.set(TableInputFormat.INPUT_TABLE, tablename);
for (byte[][] col : input_columnList_) {
if (col.length==2) {
scan.addColumn(col[0], col[1]);
} else {
scan.addFamily(col[0]);
}
}
m_conf.set(TableInputFormat.SCAN, convertScanToString(scan));
m_conf.setBoolean("pig.splitCombination", false);
}
@Override
public String relativeToAbsolutePath(String location, Path curDir)
throws IOException {
return location;
}
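/**
 * Serializes the scan as a Base64-encoded string so it can be handed to
 * TableInputFormat through the configuration.
 */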
private static String convertScanToString(Scan scan) {
try {
ByteArrayOutputStream out = new ByteArrayOutputStream();
DataOutputStream dos = new DataOutputStream(out);
scan.write(dos);
return Base64.encodeBytes(out.toByteArray());
} catch (IOException e) {
LOG.error(e);
return "";
}
}
/**
 * Returns the caster used for converting values read from, and written to, HBase.
 */
@Override
public LoadCaster getLoadCaster() throws IOException {
return caster_;
}
/*
* StoreFunc Methods
* @see org.apache.pig.StoreFuncInterface#getOutputFormat()
*/
@Override
public OutputFormat getOutputFormat() throws IOException {
TableOutputFormat outputFormat = new TableOutputFormat();
return outputFormat;
}
@Override
public void checkSchema(ResourceSchema s) throws IOException {
if (! (caster_ instanceof LoadStoreCaster)) {
LOG.error("Caster must implement LoadStoreCaster for writing to HBase.");
throw new IOException("Bad Caster " + caster_.getClass());
}
schema_ = s;
}
// Suppressing rawtypes warnings for RecordWriter, which StoreFuncInterface leaves unparameterized
@Override
public void prepareToWrite(@SuppressWarnings("rawtypes") RecordWriter writer) throws IOException {
this.writer = writer;
}
// Suppressing unchecked warnings for RecordWriter, which is not parameterized by StoreFuncInterface
@SuppressWarnings("unchecked")
@Override
public void putNext(Tuple t) throws IOException {
ResourceFieldSchema[] fieldSchemas = (schema_ == null) ? null : schema_.getFields();
byte[] rowKey = objToBytes(t.get(0), (fieldSchemas == null) ? DataType.findType(t.get(0)) : fieldSchemas[0].getType());
if(rowKey != null) {
Put put = new Put(rowKey); // reuse the row-key bytes computed above
put.setWriteToWAL(false); // skip the write-ahead log for speed; rows may be lost if a region server fails before a flush
long ts = System.currentTimeMillis();
if (tsField_!=-1) {
try {
ts = Long.valueOf(t.get(tsField_).toString());
} catch (Exception e) {
ts = System.currentTimeMillis();
}
}
for (int i=1;i<t.size();++i){
byte[] colVal = objToBytes(t.get(i), (fieldSchemas == null) ? DataType.findType(t.get(i)) : fieldSchemas[i].getType());
if (colVal!=null) { // Don't add null column values
put.add(columnList_.get(i-1), ts, colVal);
}
}
try {
if (!put.isEmpty()) { // Don't try to write a row with 0 columns
writer.write(null, put);
}
} catch (InterruptedException e) {
throw new IOException(e);
}
}
}
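/**
 * Converts a single tuple field to bytes with the configured LoadStoreCaster,
 * dispatching on the field's Pig DataType.
 */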
@SuppressWarnings("unchecked")
private byte[] objToBytes(Object o, byte type) throws IOException {
LoadStoreCaster caster = (LoadStoreCaster) caster_;
switch (type) {
case DataType.BYTEARRAY: return ((DataByteArray) o).get();
case DataType.BAG: return caster.toBytes((DataBag) o);
case DataType.CHARARRAY: return caster.toBytes((String) o);
case DataType.DOUBLE: return caster.toBytes((Double) o);
case DataType.FLOAT: return caster.toBytes((Float) o);
case DataType.INTEGER: return caster.toBytes((Integer) o);
case DataType.LONG: return caster.toBytes((Long) o);
// The type conversion here is unchecked.
// Relying on DataType.findType to do the right thing.
case DataType.MAP: return caster.toBytes((Map<String, Object>) o);
case DataType.NULL: return null;
case DataType.TUPLE: return caster.toBytes((Tuple) o);
case DataType.ERROR: throw new IOException("Unable to determine type of " + o.getClass());
default: throw new IOException("Unable to find a converter for tuple field " + o);
}
}
@Override
public String relToAbsPathForStoreLocation(String location, Path curDir)
throws IOException {
return location;
}
@Override
public void setStoreFuncUDFContextSignature(String signature) { }
@Override
public void setStoreLocation(String location, Job job) throws IOException {
if (location.startsWith("hbase://")){
job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, location.substring(8));
}else{
job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, location);
}
}
@Override
public void cleanupOnFailure(String location, Job job) throws IOException {
}
/*
* LoadPushDown Methods.
*/
@Override
public List<OperatorSet> getFeatures() {
return Arrays.asList(LoadPushDown.OperatorSet.PROJECTION);
}
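/**
 * Trims the scanned columns down to the fields Pig actually requires. If the row key
 * is no longer required, -loadKey is switched off; when the row key was loaded, the
 * remaining field indexes are shifted by one to account for it occupying position 0
 * of the loaded schema.
 */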
@Override
public RequiredFieldResponse pushProjection(
RequiredFieldList requiredFieldList) throws FrontendException {
List<RequiredField> requiredFields = requiredFieldList.getFields();
List<byte[]> newColumns = Lists.newArrayListWithExpectedSize(requiredFields.size());
// HBase Row Key is the first column in the schema when it's loaded,
// and is not included in the columnList (since it's not a proper column).
int offset = loadRowKey_ ? 1 : 0;
if (loadRowKey_) {
if (requiredFields.size() < 1 || requiredFields.get(0).getIndex() != 0) {
loadRowKey_ = false;
} else {
// The row key is the first required field; it is handled above, so drop it before mapping the rest.
requiredFields.remove(0);
}
}
for (RequiredField field : requiredFields) {
int fieldIndex = field.getIndex();
newColumns.add(columnList_.get(fieldIndex - offset));
}
LOG.info("pushProjection After Projection: loadRowKey is " + loadRowKey_) ;
for (byte[] col : newColumns) {
LOG.info("pushProjection -- col: " + Bytes.toStringBinary(col));
}
columnList_ = newColumns;
return new RequiredFieldResponse(true);
}
}