package com.infochimps.hbase.pig;
import java.io.IOException;
import java.util.Iterator;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.pig.StoreFunc;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
//
// This can work. Note: 1. A total sort needs to be run over the whole
// data set so rows reach the reducers in order. 2. The KeyValues for a
// row need to be sorted before they are written (handled below with a
// TreeSet ordered by KeyValue.COMPARATOR). Simply calling ORDER in the
// Pig script itself may be enough for the total sort.
//
public class HFileStorage extends StoreFunc {
private static final Log LOG = LogFactory.getLog(HFileStorage.class);
protected RecordWriter writer = null;
private String tableURI;
private byte[] tableName;
private byte[] columnFamily;
private String[] columnNames;
/**
* Constructs an HFile StoreFunc that writes data out as HFiles. These
* HFiles then have to be imported into HBase with the hbase/bin/loadtable.rb tool.
* @param tN The HBase table name the data will ultimately wind up in. It does not need to exist ahead of time.
* @param cF The HBase column family name for the table the data will wind up in. It does not need to exist ahead of time.
* @param names A comma-separated list of column names describing the fields in a tuple.
*/
public HFileStorage(String tN, String cF, String names) {
this.tableName = Bytes.toBytes(tN);
this.columnFamily = Bytes.toBytes(cF);
this.columnNames = names.split(",");
}
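//
// A sketch of how this storer might be invoked from a Pig script; the
// relation name, output path, table name, and column list below are
// hypothetical. The relation being stored must already have the shape
// (row_key, {(field),(field),...}) expected by putNext(), and ordering
// it by the row key keeps the generated HFiles sorted:
//
//   sorted = ORDER records BY $0;
//   STORE sorted INTO '/tmp/hfile_output'
//     USING com.infochimps.hbase.pig.HFileStorage('my_table', 'my_cf', 'row_key,name,age');
//
// The HFiles written under the output path are then loaded with the
// hbase/bin/loadtable.rb tool mentioned above.
//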
//
// getOutputFormat()
//
// This method is called by Pig to get the OutputFormat used by the
// storer. The methods in the OutputFormat (and the underlying
// RecordWriter and OutputCommitter) are called by Pig in the same
// manner (and in the same context) as by Hadoop in a MapReduce Java
// program. Whether the OutputFormat is a Hadoop-packaged one or a
// custom one, it should be based on the new API under
// org.apache.hadoop.mapreduce. The checkOutputSpecs() method of the
// OutputFormat is called by Pig to check the output location
// up-front; it is also called as part of the Hadoop call sequence
// when the job is launched, so implementations should ensure it can
// be called multiple times without inconsistent side effects.
public OutputFormat getOutputFormat() throws IOException {
return new HFileOutputFormat();
}
//
// setStoreLocation()
// This method is called by Pig to communicate the store location to
// the storer, which should pass the same information on to the
// underlying OutputFormat. Because Pig calls this method multiple
// times, implementations should ensure there are no inconsistent
// side effects from the repeated calls.
public void setStoreLocation(String location, Job job) throws IOException {
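// Note: the separator setting below is only meaningful to TextOutputFormat;
// HFileOutputFormat ignores it, so it appears to be a harmless leftover.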
job.getConfiguration().set("mapred.textoutputformat.separator", "");
FileOutputFormat.setOutputPath(job, new Path(location));
}
//
// prepareToWrite()
//
// In the new API, data is written through the OutputFormat provided by
// the StoreFunc. In prepareToWrite() the RecordWriter associated with
// that OutputFormat is passed to the StoreFunc. The RecordWriter can
// then be used by putNext() to write a tuple representing a record of
// data in the manner the RecordWriter expects.
public void prepareToWrite(@SuppressWarnings("rawtypes") RecordWriter writer) throws IOException {
this.writer = writer;
}
//
//
// Here we receive tuples of the form:
//
// (row_key, {(field1),(field2),...})
//
// We iterate through the tuples in the bag and insert the resulting
// KeyValues into a TreeSet for sorting, then iterate through the
// sorted set and write out each column.
//
@SuppressWarnings("unchecked")
public void putNext(Tuple t) throws IOException {
try {
byte[] rowKey = Bytes.toBytes(t.get(0).toString()); // use zeroth field as row key
//byte[] rowKey = ((DataByteArray)t.get(0)).get();
DataBag columns = (DataBag)t.get(1);
ImmutableBytesWritable hbaseRowKey = new ImmutableBytesWritable(rowKey);
TreeSet<KeyValue> map = sortedKeyValues(rowKey, columns);
for (KeyValue kv: map) {
writer.write(hbaseRowKey, kv);
}
} catch (InterruptedException e) {
throw new IOException("Interrupted while writing KeyValues", e);
} catch (NullPointerException e) {
// Skip records with null fields rather than failing the whole job.
LOG.warn("Skipping record with null field", e);
}
}
private TreeSet<KeyValue> sortedKeyValues(byte[] rowKey, DataBag columns) throws IOException {
TreeSet<KeyValue> map = new TreeSet<KeyValue>(KeyValue.COMPARATOR);
long ts = System.currentTimeMillis();
int idx = 0;
Iterator<Tuple> tupleIter = columns.iterator();
while(tupleIter.hasNext()) {
byte[] columnName = Bytes.toBytes(columnNames[idx]);
byte[] value = Bytes.toBytes((String)tupleIter.next().get(0));
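// The entry at index 0 is assumed to correspond to columnNames[0], the row
// key field, which is already carried in rowKey, so it is skipped below.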
if (idx != 0) {
KeyValue kv = new KeyValue(rowKey, columnFamily, columnName, ts, value);
map.add(kv.clone());
}
idx += 1;
}
return map;
}
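//
// A worked example with hypothetical data: given columnNames =
// {"row_key", "name", "age"} and the input tuple
//
// ('u1', {('u1'),('bob'),('42')})
//
// putNext() uses 'u1' as the row key and sortedKeyValues() skips the bag
// entry at index 0 (assumed to mirror the row key), producing the two
// KeyValues u1/columnFamily:name='bob' and u1/columnFamily:age='42',
// ordered by KeyValue.COMPARATOR before being written.
//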
// private byte[] getValue(Object field) throws IOException {
// byte[] value = null;
// switch (DataType.findType(field)) {
// case DataType.INTEGER: {
// value = ((Integer)field).toString().getBytes();
// break;
// }
// case DataType.LONG: {
// value = ((Long)field).toString().getBytes();
// break;
// }
//
// case DataType.FLOAT: {
// value = ((Float)field).toString().getBytes();
// break;
// }
//
// case DataType.DOUBLE: {
// value = ((Double)field).toString().getBytes();
// break;
// }
//
// case DataType.CHARARRAY: {
// value = ((String)field).getBytes("UTF-8");
// break;
// }
//
// case DataType.BYTEARRAY: {
// value = ((DataByteArray)field).get();
// break;
// }
//
// default: {
// throw new IllegalArgumentException("You fail:" + DataType.findType(field));
// }
// }
// return value;
// }
}