Package com.tellapart.cascading.hbase

Source Code of com.tellapart.cascading.hbase.SerializingHBaseScheme

package com.tellapart.cascading.hbase;

import java.io.IOException;

import cascading.CascadingException;

import cascading.hbase.HBaseScheme;

import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import org.apache.hadoop.hbase.io.Cell;
import org.apache.hadoop.mapred.JobConf;

// Yes, we're using the old Hadoop APIs until Cascading is ready for the new ones.
// Sigh.
@SuppressWarnings("deprecation")

/**
* The SerializingHBaseScheme class is a {@link HBaseScheme} subclass.
* It is used in conjunction with the {@HBaseTap} to allow for the reading and writing of data to
* and from a HBase cluster.
* Unlike HBaseScheme, it supports serialization of the types inside the Tuples, as well as
* returning the timestamp of the HBase cell that was read.
*
* This must extend HBaseScheme since HBaseTap only takes an HBaseScheme.
* The other option is to make our own HBaseTap, or modify HBaseScheme directly.
*
* @see HBaseTap
*/
public class SerializingHBaseScheme extends HBaseScheme {
  private static final long serialVersionUID = 1L;

  // Indicates if the scheme is used for reading or writing.
  // This is a bit of a hack: Cascading doesn't allow a tap equal to another tap
  // to be used both as a source and a sink for a Flow.
  // For HBaseTaps, this is sometimes desirable -- if we want to read from a column
  // in HBase, do some transformation, and write back to the same column. In such
  // a case, normally, the HBaseTaps and schemes are equal so Cascading throws an error.
  // To overcome this, we can mark one of the schemes for read and the other for write,
  // and make sure (in the equals() method), that this is used for distinguishing between
  // these kinds of taps. That way we can tell Cascading that it's OK to both read and
  // write from this same source.
  public enum Direction {
    FOR_READ,
    FOR_WRITE,
  }

  // Serialization object used to serialize and deserialize values.
  private HBaseSerialization mSerialization;

  // Indicates if the scheme will return timestamped results.
  private boolean mIsTimestamped = false;

  // The Direction of this scheme (READ or WRITE).
  private Direction mDirection;

  // Indicates which classes we expect the HBase table to contain.
  private Class<?>[] mClasses;

  // Corresponds to what we should set hbase.client.scanner.caching for this scheme.
  // If not set, we do not override the jobconf value.
  private Integer mClientScanCaching = null;

  /**
   * Constructor SerializingHBaseScheme creates a new SerializingHBaseScheme instance using fully
   * qualified column names. This means that the names of the fields in valueFields should
   * correspond to the column family names in hbase that we will be pulling our data from.
   *
   * @param keyField    the Field that will be used for the row key.
   * @param valueFields the Fields, from the input pipe into this scheme, that will be written
   *                    out to HBase. Each Field must be an HBase column.
   * @param valueClasses the Classes that correspond to the values inside each of the valueFields.
   *                     When this is used as a source, these classes are used to determine
   *                     how to deserialize the object corresponding to that field.
   * @param isTimestamped When this is used as a source, determines whether to return the raw
   *                      deserialized object in the output tuple, or the CellElement
   *                      wrapper object which will carry the deserialized object as well as the
   *                      HBase timestamp of the cell it came from.
   * @param use The Direction (FOR_READ or FOR_WRITE) of this scheme.
   */
  public SerializingHBaseScheme(Fields keyField, Fields valueFields, Class<?>[] valueClasses,
        boolean isTimestamped, Direction direction) {
    super(keyField, valueFields);
    validateClasses(valueFields, valueClasses);
    mClasses = valueClasses;
    mIsTimestamped = isTimestamped;
    mDirection = direction;
  }

  public SerializingHBaseScheme(Fields keyField, String familyName, Fields valueFields,
      Class<?>[] valueClasses, boolean isTimestamped, Direction direction) {
    super(keyField, familyName, valueFields);
    validateClasses(valueFields, valueClasses);
    mClasses = valueClasses;
    mIsTimestamped = isTimestamped;
    mDirection = direction;
  }

  /**
   * Sets the hbase.client.scanner.caching property for this scheme.
   * This determines how many rows are cached for HBase reads.
   * If not set, this defers to the hbase.client.scanner.caching set in the job conf
   * that invoked this scheme, or to the default of "10".
   * @param clientScanCaching how many rows to cache.
   */
  public void setClientScanCaching(int clientScanCaching) {
    mClientScanCaching = clientScanCaching;
  }

  /**
   * Validates that the number of classes passed in matches the number of fields.
   */
  private void validateClasses(Fields valueFields, Class<?>[] valueClasses) {
    if (valueClasses.length != valueFields.size()) {
      throw new CascadingException("Expected same number of classes and value fields for "
          + "SerializingHBaseScheme. "
          + "Got: " + valueClasses.length + " classes, and " + valueFields.size() + " fields.");
    }
  }

  @Override
  public void sinkInit(Tap tap, JobConf conf) throws IOException {
    super.sinkInit(tap, conf);
    // We can only initialize serialization here and not in the constructor since we don't have
    // JobConf available to us in the constructor.
    initSerialization(conf);
  }

  @Override
  public void sourceInit(Tap tap, JobConf conf) throws IOException {
    super.sourceInit(tap, conf);
    // We can only initialize serialization here and not in the constructor since we don't have
    // JobConf available to us in the constructor.
    initSerialization(conf);
    if (mClientScanCaching != null) {
      conf.set("hbase.client.scanner.caching", mClientScanCaching.toString());
    }
  }

  /**
   * Initialize the HBaseSerialization used to serialize and deserialize classes.
   */
  private void initSerialization(JobConf conf) {
    mSerialization = new HBaseSerialization(conf);
  }

  @Override
  protected byte[] toCell(Tuple tuple, int pos) {
    return toCell(tuple.getObject(pos));
  }

  /**
   * Serializes the given object using HBaseSerialization.
   * @param object object to serialize.
   * @return byte[] serialized bytes of the object.
   */
  protected byte[] toCell(Object object) {
    return mSerialization.serialize(object);
  }

  @Override
  protected Object fromCell(Cell cell, int fieldIndex) {
    if (cell == null) {
      return null;
    }

    // Deserialize this element based on the Class we expect this column to contain.
    Object element = mSerialization.deserialize(cell.getValue(), mClasses[fieldIndex]);
    // Wrap the element in a CellElement if requested.
    if (mIsTimestamped && element != null) {
      return new CellElement(element, cell.getTimestamp());
    } else {
      return element;
    }
  }

  public boolean equals(Object other) {
    boolean isEqual = super.equals(other);
    if (isEqual) {
      SerializingHBaseScheme otherScheme = (SerializingHBaseScheme)other;
      isEqual &= (otherScheme.mIsTimestamped == mIsTimestamped);
      isEqual &= (otherScheme.mDirection == mDirection);
    }
    return isEqual;
  }

  public String toString() {
    return super.toString() + "," + mIsTimestamped + "," + mDirection;
  }
}
TOP

Related Classes of com.tellapart.cascading.hbase.SerializingHBaseScheme

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.