Package com.tellapart.cascading.hbase

Source Code of com.tellapart.cascading.hbase.HBaseSerialization

package com.tellapart.cascading.hbase;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.nio.charset.Charset;

import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.mapred.JobConf;

import cascading.CascadingException;

// Using the old hadoop API until Cascading supports the newer one.
@SuppressWarnings("deprecation")

/**
* HBaseSerialization is a wrapper around the Apache Hadoop SerializationFactory
* and allows us to serialize and deserialize objects using Hadoop Serialization into
* HBase.
* In addition to the base Hadoop Serialization, HBaseSerialization also supports
* serializing primitives (Integer, Long, Double, Boolean, String, byte[], etc.).
* Serializing primitives is done by converting them to a Hadoop Writable, and
* serializing that object. This is transparent to the end user.
* HBaseSerialization must be serializable since Cascading serializes and unserializes
* the FullHBaseScheme that uses it when running on a cluster.
* SerializationFactory is not serializable, so to save the state we save the ioSerializations
* hadoop property that's used to find Serialization classes that we support.
*/
public class HBaseSerialization implements Serializable {
  private static final long serialVersionUID = 1L;
  private transient SerializationFactory mFactory = null;

  private String ioSerializations = null;

  /**
   * Create an HBaseSerialization using a JobConf that must have io.serializations
   * filled in to the list of Hadoop serializations we will support.
   * @param conf the JobConf with the settings we'll use.
   */
  public HBaseSerialization(JobConf conf) {
    ioSerializations = conf.get("io.serializations");
    initialize();
  }

  /**
   * Initialize the SerializationFactory used to access Hadoop serializers.
   */
  private void initialize() {
    JobConf conf = new JobConf();
    conf.set("io.serializations", ioSerializations);
    mFactory = new SerializationFactory(conf);
  }

  /**
   * @return a Serializer for the given class.
   * @throws a CascadingException if the Serializer for that particular class
   * could not be loaded.
   */
  private Serializer<?> getNewSerializer(Class<?> type ) {
    try {
      return mFactory.getSerializer(type);
    } catch( NullPointerException exception ) {
      throw new CascadingException("Unable to load serializer for: " + type.getName()
          + " from: " + mFactory.getClass().getName() );
    }
  }

  /**
   * @return a Deserializer for the given class.
   * @throws a CascadingException if the Deserializer for that particular class
   * could not be loaded.
   */
  private Deserializer<?> getNewDeserializer(Class<?> type ) {
    try {
      return mFactory.getDeserializer(type);
    } catch( NullPointerException exception ) {
      throw new CascadingException("Unable to load deserializer for: " + type.getName()
          + " from: " + mFactory.getClass().getName() );
    }
  }

  /**
   * Serializes the given object into a byte array.
   * This supports serialization for bytes, Strings, and any object that is handled
   * by an Apache Hadoop serialization.
   * @param o the Object to serialize
   * @return byte[] array of the serialized object.
   *         If o is null, returns null.
   */
  public byte[] serialize(Object o) {
    if (o == null) {
      return null;
    }
    if (o instanceof String) {
      return ((String)o).getBytes();
    } else if (o instanceof byte[]) {
      return (byte[])o;
    } else {
      // Try to convert the object to a Writable,
      // and serialize the result of that, if we have a writable.
      Writable oWritable = toWritable(o);
      if (oWritable == null) {
        return internalSerialize(o);
      } else {
        return internalSerialize(oWritable);
      }
    }
  }

  /**
   * Converts primitives (Integer, Long, Double, Boolean) to a Writable
   * for writing them, since the serialization schemes we know about don't
   * support serializing these types by themselves.
   * @param o the object to serialize
   * @return a Writable wrapping this object, or null if we don't know how
   * to convert this object to a writable.
   */
  private Writable toWritable(Object o) {
    if (o instanceof Boolean) {
      return new BooleanWritable((Boolean)o);
    } else if (o instanceof Integer) {
      return new IntWritable((Integer)o);
    } else if (o instanceof Long) {
      return new LongWritable((Long)o);
    } else if (o instanceof Double) {
      return new DoubleWritable((Double)o);
    }
    return null;
  }

  /**
   * Converts Writables of primitives (Boolean, Int, Long, Double) to their
   * associated primitive.
   * This is used to read objects that were written as Writables but should
   * be returned to the user as the primitive value inside the Writable.
   * @param o the object to serialize
   * @return a Writable wrapping this object, or null if we don't know how
   * to convert this object to a writable.
   */
  private Object fromWritable(Object o) {
    if (o instanceof BooleanWritable) {
      return ((BooleanWritable)o).get();
    } else if (o instanceof IntWritable) {
      return ((IntWritable)o).get();
    } else if (o instanceof LongWritable) {
      return ((LongWritable)o).get();
    } else if (o instanceof DoubleWritable) {
      return ((DoubleWritable)o).get();
    }
    return null;
  }

  /**
   * @return the Writable version of a class. The user of this class can then specify
   * that a given column is of Boolean or Long type, and this will return the correct
   * Writable object that was written to HBase and that we should use for deserializing
   * the class.
   */
  private Class<?> getWritableVersionOfClass(Class<?> clazz) {
    if (clazz.equals(Boolean.class)) {
      return BooleanWritable.class;
    } else if (clazz.equals(Integer.class)) {
      return IntWritable.class;
    } else if (clazz.equals(Long.class)) {
      return LongWritable.class;
    } else if (clazz.equals(Double.class)) {
      return DoubleWritable.class;
    }
    return null;
  }

  /**
   * Serializes o by getting a Hadoop Serializer based of the object's class.
   * @param o object to serialize
   * @return the serialized byte[] array.
   */
  private byte[] internalSerialize(Object o) {
    Serializer serializer = getNewSerializer(o.getClass());

    // Write the object into an in-memory byte array.
    ByteArrayOutputStream byteOs = new ByteArrayOutputStream();
    try {
      serializer.open(byteOs);
      serializer.serialize(o);
      serializer.close();
    } catch (IOException ex) {
      // This should never happen since we're writing to an in-memory ByteArrayOutputStream.
      throw new RuntimeException(ex);
    }
    return byteOs.toByteArray();
  }

  /**
   * Deserialize a byte array into an object based on the given class.
   * @param bytes the bytes to deserialize.
   * @param clazz the expected class these bytes represent.
   * @return the deserialized Object.
   */
  public Object deserialize(byte[] bytes, Class<?> clazz) {
    try {
      // If null, return null.
      if (bytes == null) {
        return null;
      }

      // First check for our special handling byte[] and Strings.
      if (clazz.equals(byte[].class)) {
        return bytes;
      } else if (clazz.equals(String.class)) {
        return new String(bytes);
      } else {
        Class<?> writableVersionOfClazz = getWritableVersionOfClass(clazz);
        if (writableVersionOfClazz != null) {
          clazz = writableVersionOfClazz;
        }

        // Now deserialize using the hadoop deserializer.
        Deserializer deserializer = getNewDeserializer(clazz);
        deserializer.open(new ByteArrayInputStream(bytes));
        Object o = clazz.newInstance();
        o = deserializer.deserialize(o);
        deserializer.close();

        if (writableVersionOfClazz != null) {
          return fromWritable(o);
        } else {
          return o;
        }
      }
    } catch(Exception ex) {
      throw new RuntimeException(ex);
    }
  }

  // We add a hook to the deserialization to also re-initialize the SerializationFactory class.
  private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
    in.defaultReadObject();
    initialize();
  }
}
TOP

Related Classes of com.tellapart.cascading.hbase.HBaseSerialization

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.