* Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
* Project and contact information: http://www.cascading.org/
* This file is part of the Cascading project.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
package cascading.tuple.hadoop;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import cascading.CascadingException;
import cascading.flow.FlowProcess;
import cascading.flow.FlowProps;
import cascading.tuple.Comparison;
import cascading.tuple.Tuple;
import cascading.tuple.TupleException;
import cascading.tuple.hadoop.io.HadoopTupleOutputStream;
import cascading.tuple.hadoop.io.IndexTupleDeserializer;
import cascading.tuple.hadoop.io.IndexTupleSerializer;
import cascading.tuple.hadoop.io.TupleDeserializer;
import cascading.tuple.hadoop.io.TuplePairDeserializer;
import cascading.tuple.hadoop.io.TuplePairSerializer;
import cascading.tuple.hadoop.io.TupleSerializer;
import cascading.tuple.io.IndexTuple;
import cascading.tuple.io.TupleInputStream;
import cascading.tuple.io.TupleOutputStream;
import cascading.tuple.io.TuplePair;
import cascading.util.Util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.Serialization;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.io.serializer.WritableSerialization;
import org.apache.hadoop.util.ReflectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static cascading.tuple.hadoop.TupleSerializationProps.HADOOP_IO_SERIALIZATIONS;
* Class TupleSerialization is an implementation of Hadoop's {@link Serialization} interface.
* <p/>
* Typically developers will not use this implementation directly as it is automatically added
* to any relevant MapReduce jobs via the {@link org.apache.hadoop.conf.Configuration}.
* <p/>
* By default, all primitive types are natively handled, and {@link org.apache.hadoop.io.BytesWritable}
* has a pre-configured serialization token since byte arrays are not handled natively by {@link Tuple}.
* <p/>
* To add or manipulate Hadoop serializations or Cascading serialization tokens, see
* {@link TupleSerializationProps} for a fluent property builder class.
* <p/>
* By default this Serialization interface registers the class {@link org.apache.hadoop.io.BytesWritable} as
* token 127.
tokens = {127},
classNames = {"org.apache.hadoop.io.BytesWritable"})
public class TupleSerialization extends Configured implements Serialization
/** Field LOG */
private static final Logger LOG = LoggerFactory.getLogger( TupleSerialization.class );
/** Field defaultComparator, assigned by {@link #setConf(Configuration)} whenever a non-null conf is given */
private Comparator defaultComparator;
/** Field classCache, classes already resolved by {@link #getClass(String)}, keyed by class name */
private final Map<String, Class> classCache = new HashMap<String, Class>();
/** Field serializationFactory, created on first use by {@link #getSerializationFactory()} */
private SerializationFactory serializationFactory;
/** Field tokenClassesMap, maps a token to its class name, created by {@link #initTokenMaps()} */
private HashMap<Integer, String> tokenClassesMap;
/** Field classesTokensMap, maps a class name to its token, created by {@link #initTokenMaps()} */
private HashMap<String, Integer> classesTokensMap;
/** Field tokensSize, the size of tokenClassesMap as recorded at the end of {@link #initTokenMaps()}, zero before that */
private long tokensSize = 0;
* Adds the given token and className pair as a serialization token property. During object serialization and deserialization,
* the given token will be used instead of the className when an instance of the className is encountered.
* <p/>
* This method has moved to {@link TupleSerializationProps#addSerializationToken(java.util.Map, int, String)}.
* @param properties of type Map
* @param token of type int
* @param className of type String
public static void addSerializationToken( Map<Object, Object> properties, int token, String className )
TupleSerializationProps.addSerializationToken( properties, token, className );
* Returns the serialization tokens property.
* <p/>
* This method has moved to {@link TupleSerializationProps#getSerializationTokens(java.util.Map)}.
* @param properties of type Map
* @return returns a String
public static String getSerializationTokens( Map<Object, Object> properties )
return TupleSerializationProps.getSerializationTokens( properties );
static String getSerializationTokens( Configuration jobConf )
return jobConf.get( TupleSerializationProps.SERIALIZATION_TOKENS );
* Adds the given className as a Hadoop IO serialization class.
* <p/>
* This method has moved to {@link TupleSerializationProps#addSerialization(java.util.Map, String)}.
* @param properties of type Map
* @param className of type String
public static void addSerialization( Map<Object, Object> properties, String className )
TupleSerializationProps.addSerialization( properties, className );
* Adds this class as a Hadoop Serialization class. This method is safe to call redundantly.
* <p/>
* This method will guarantee {@link TupleSerialization} and {@link WritableSerialization} are
* first in the list, as both are required.
* @param jobConf of type JobConf
public static void setSerializations( Configuration jobConf )
String serializations = getSerializations( jobConf );
LinkedList<String> list = new LinkedList<String>();
if( serializations != null && !serializations.isEmpty() )
Collections.addAll( list, serializations.split( "," ) );
// required by MultiInputSplit
String writable = WritableSerialization.class.getName();
String tuple = TupleSerialization.class.getName();
list.remove( writable );
list.remove( tuple );
list.addFirst( writable );
list.addFirst( tuple );
// make writable last
jobConf.set( HADOOP_IO_SERIALIZATIONS, Util.join( list, "," ) );
static String getSerializations( Configuration jobConf )
return jobConf.get( HADOOP_IO_SERIALIZATIONS, null );
public static Comparator getDefaultComparator( Comparator comparator, Configuration jobConf )
String typeName = jobConf.get( FlowProps.DEFAULT_ELEMENT_COMPARATOR );
if( Util.isEmpty( typeName ) )
return null;
if( comparator == null )
return createComparator( jobConf, typeName );
if( comparator.getClass().getName().equals( typeName ) && !( comparator instanceof Configured ) )
return comparator;
return createComparator( jobConf, typeName );
public static Comparator getDefaultComparator( Configuration jobConf )
String typeName = jobConf.get( FlowProps.DEFAULT_ELEMENT_COMPARATOR );
if( Util.isEmpty( typeName ) )
return null;
return createComparator( jobConf, typeName );
private static Comparator createComparator( Configuration jobConf, String typeName )
LOG.debug( "using default comparator: {}", typeName );
Class<Comparator> type = (Class<Comparator>) TupleSerialization.class.getClassLoader().loadClass( typeName );
return ReflectionUtils.newInstance( type, jobConf );
catch( ClassNotFoundException exception )
throw new CascadingException( "unable to load class: " + typeName, exception );
/** Constructor TupleSerialization creates a new TupleSerialization instance; no configuration is passed in. */
public TupleSerialization()
  {
  super();
  }
/**
 * Constructor TupleSerialization creates a new TupleSerialization instance whose configuration answers
 * {@code get} lookups from the properties of the given flowProcess.
 *
 * @param flowProcess the process whose properties back the configuration lookups
 */
public TupleSerialization( final FlowProcess<? extends Configuration> flowProcess )
  {
  super( new Configuration()
  {
  @Override
  public String get( String name )
    {
    return get( name, null );
    }

  @Override
  public String get( String name, String defaultValue )
    {
    Object property = flowProcess.getProperty( name );

    if( property == null )
      return defaultValue;

    return String.valueOf( property );
    }
  } );
  }
/**
 * Constructor TupleSerialization creates a new TupleSerialization instance backed by the given configuration.
 *
 * @param conf of type Configuration
 */
public TupleSerialization( Configuration conf )
  {
  super( conf );
  }
public void setConf( Configuration conf )
super.setConf( conf );
if( conf != null )
defaultComparator = getDefaultComparator( conf );
public Configuration getConf()
if( super.getConf() == null )
setConf( new Configuration() );
return super.getConf();
SerializationFactory getSerializationFactory()
if( serializationFactory == null )
serializationFactory = new SerializationFactory( getConf() );
return serializationFactory;
/** Must be called before {@link #getClassNameFor(int)} and {@link #getTokenFor(String)} methods. */
void initTokenMaps()
if( tokenClassesMap != null )
tokenClassesMap = new HashMap<Integer, String>();
classesTokensMap = new HashMap<String, Integer>();
String tokenProperty = getSerializationTokens( getConf() );
if( tokenProperty != null )
tokenProperty = tokenProperty.replaceAll( "\\s", "" ); // allow for whitespace in token set
for( String pair : tokenProperty.split( "," ) )
String[] elements = pair.split( "=" );
addToken( null, Integer.parseInt( elements[ 0 ] ), elements[ 1 ] );
String serializationsString = getSerializations( getConf() );
LOG.debug( "using hadoop serializations from the job conf: {} ", serializationsString );
if( serializationsString == null )
String[] serializations = serializationsString.split( "," );
for( String serializationName : serializations )
Class type = getConf().getClassByName( serializationName );
SerializationToken tokenAnnotation = (SerializationToken) type.getAnnotation( SerializationToken.class );
if( tokenAnnotation == null )
if( tokenAnnotation.tokens().length != tokenAnnotation.classNames().length )
throw new CascadingException( "serialization annotation tokens and classNames must be the same length" );
int[] tokens = tokenAnnotation.tokens();
for( int i = 0; i < tokens.length; i++ )
addToken( type, tokens[ i ], tokenAnnotation.classNames()[ i ] );
catch( ClassNotFoundException exception )
LOG.warn( "unable to load serialization class: {}", serializationName, exception );
tokensSize = tokenClassesMap.size();
private void addToken( Class type, int token, String className )
if( type != null && !type.getName().startsWith( "cascading." ) && token < 128 )
throw new CascadingException( "serialization annotation tokens may not be less than 128, was: " + token );
if( tokenClassesMap.containsKey( token ) )
if( type == null )
throw new IllegalStateException( "duplicate serialization token: " + token + " for class: " + className + " found in properties" );
throw new IllegalStateException( "duplicate serialization token: " + token + " for class: " + className + " on serialization: " + type.getName() );
if( classesTokensMap.containsKey( className ) )
if( type == null )
throw new IllegalStateException( "duplicate serialization classname: " + className + " for token: " + token + " found in properties " );
throw new IllegalStateException( "duplicate serialization classname: " + className + " for token: " + token + " on serialization: " + type.getName() );
LOG.debug( "adding serialization token: {}, for classname: {}", token, className );
tokenClassesMap.put( token, className );
classesTokensMap.put( className, token );
* Returns the className for the given token.
* @param token of type int
* @return a String
final String getClassNameFor( int token )
if( tokensSize == 0 )
return null;
return tokenClassesMap.get( token );
final long getTokensMapSize()
return tokensSize;
* Returns the token for the given className.
* @param className of type String
* @return an Integer
final Integer getTokenFor( String className )
if( tokensSize == 0 )
return null;
return classesTokensMap.get( className );
public Comparator getDefaultComparator()
return defaultComparator;
public Comparator getComparator( Class type )
Serialization serialization = getSerialization( type );
Comparator comparator = null;
if( serialization instanceof Comparison )
comparator = ( (Comparison) serialization ).getComparator( type );
if( comparator != null )
return comparator;
return defaultComparator;
Serialization getSerialization( String className )
return getSerialization( getClass( className ) );
Serialization getSerialization( Class type )
return getSerializationFactory().getSerialization( type );
Serializer getNewSerializer( Class type )
Serializer serializer = getSerializationFactory().getSerializer( type );
if( serializer == null )
throw new CascadingException( "unable to load serializer for: " + type.getName() + " from: " + getSerializationFactory().getClass().getName() );
return serializer;
catch( NullPointerException exception )
throw new CascadingException( "unable to load serializer for: " + type.getName() + " from: " + getSerializationFactory().getClass().getName() );
Deserializer getNewDeserializer( String className )
Deserializer deserializer = getSerializationFactory().getDeserializer( getClass( className ) );
if( deserializer == null )
throw new CascadingException( "unable to load deserializer for: " + className + " from: " + getSerializationFactory().getClass().getName() );
return deserializer;
catch( NullPointerException exception )
throw new CascadingException( "unable to load deserializer for: " + className + " from: " + getSerializationFactory().getClass().getName() );
TuplePairDeserializer getTuplePairDeserializer()
return new TuplePairDeserializer( getElementReader() );
* Method getElementReader returns the elementReader of this TupleSerialization object.
* @return the elementReader (type SerializationElementReader) of this TupleSerialization object.
public SerializationElementReader getElementReader()
return new SerializationElementReader( this );
TupleDeserializer getTupleDeserializer()
return new TupleDeserializer( getElementReader() );
private TuplePairSerializer getTuplePairSerializer()
return new TuplePairSerializer( getElementWriter() );
IndexTupleDeserializer getIndexTupleDeserializer()
return new IndexTupleDeserializer( getElementReader() );
* Method getElementWriter returns the elementWriter of this TupleSerialization object.
* @return the elementWriter (type SerializationElementWriter) of this TupleSerialization object.
public SerializationElementWriter getElementWriter()
return new SerializationElementWriter( this );
private TupleSerializer getTupleSerializer()
return new TupleSerializer( getElementWriter() );
private IndexTupleSerializer getIndexTupleSerializer()
return new IndexTupleSerializer( getElementWriter() );
* Method accept implements {@link Serialization#accept(Class)}.
* @param c of type Class
* @return boolean
public boolean accept( Class c )
return Tuple.class == c || TuplePair.class == c || IndexTuple.class == c;
* Method getDeserializer implements {@link Serialization#getDeserializer(Class)}.
* @param c of type Class
* @return Deserializer
public Deserializer getDeserializer( Class c )
if( c == Tuple.class )
return getTupleDeserializer();
else if( c == TuplePair.class )
return getTuplePairDeserializer();
else if( c == IndexTuple.class )
return getIndexTupleDeserializer();
throw new IllegalArgumentException( "unknown class, cannot deserialize: " + c.getName() );
* Method getSerializer implements {@link Serialization#getSerializer(Class)}.
* @param c of type Class
* @return Serializer
public Serializer getSerializer( Class c )
if( c == Tuple.class )
return getTupleSerializer();
else if( c == TuplePair.class )
return getTuplePairSerializer();
else if( c == IndexTuple.class )
return getIndexTupleSerializer();
throw new IllegalArgumentException( "unknown class, cannot serialize: " + c.getName() );
public Class getClass( String className )
Class type = classCache.get( className );
if( type != null )
return type;
if( className.charAt( 0 ) == '[' )
type = Class.forName( className, true, Thread.currentThread().getContextClassLoader() );
type = Thread.currentThread().getContextClassLoader().loadClass( className );
catch( ClassNotFoundException exception )
throw new TupleException( "unable to load class named: " + className, exception );
classCache.put( className, type );
return type;
public static class SerializationElementReader implements TupleInputStream.ElementReader
/** Field LOG */
private static final Logger LOG = LoggerFactory.getLogger( SerializationElementReader.class );
/** Field tupleSerialization */
private final TupleSerialization tupleSerialization;
/** Field deserializers */
final Map<String, Deserializer> deserializers = new HashMap<String, Deserializer>();
* Constructor SerializationElementReader creates a new SerializationElementReader instance.
* @param tupleSerialization of type TupleSerialization
public SerializationElementReader( TupleSerialization tupleSerialization )
this.tupleSerialization = tupleSerialization;
public Object read( int token, DataInputStream inputStream ) throws IOException
String className = getClassNameFor( token, inputStream );
Deserializer deserializer = getDeserializerFor( inputStream, className );
Object foundObject = null;
Object object;
object = deserializer.deserialize( foundObject );
catch( IOException exception )
LOG.error( "failed deserializing token: " + token + " with classname: " + className, exception );
throw exception;
return object;
public Comparator getComparatorFor( int token, DataInputStream inputStream ) throws IOException
Class type = tupleSerialization.getClass( getClassNameFor( token, inputStream ) );
return tupleSerialization.getComparator( type );
private Deserializer getDeserializerFor( DataInputStream inputStream, String className ) throws IOException
Deserializer deserializer = deserializers.get( className );
if( deserializer == null )
deserializer = tupleSerialization.getNewDeserializer( className );
deserializer.open( inputStream );
deserializers.put( className, deserializer );
return deserializer;
public String getClassNameFor( int token, DataInputStream inputStream ) throws IOException
String className = tupleSerialization.getClassNameFor( token );
if( className == null )
className = WritableUtils.readString( inputStream );
catch( IOException exception )
LOG.error( "unable to resolve token: {}, to a valid classname, with token map of size: {}, rethrowing IOException", token, tupleSerialization.getTokensMapSize() );
throw exception;
return className;
public void close()
if( deserializers.size() == 0 )
Collection<Deserializer> clone = new ArrayList<Deserializer>( deserializers.values() );
for( Deserializer deserializer : clone )
catch( IOException exception )
// do nothing
public static class SerializationElementWriter implements TupleOutputStream.ElementWriter
/** Field LOG */
private static final Logger LOG = LoggerFactory.getLogger( SerializationElementWriter.class );
/** Field tupleSerialization */
private final TupleSerialization tupleSerialization;
/** Field serializers */
final Map<Class, Serializer> serializers = new HashMap<Class, Serializer>();
public SerializationElementWriter( TupleSerialization tupleSerialization )
this.tupleSerialization = tupleSerialization;
public void write( DataOutputStream outputStream, Object object ) throws IOException
Class<?> type = object.getClass();
String className = type.getName();
Integer token = tupleSerialization.getTokenFor( className );
if( token == null )
LOG.debug( "no serialization token found for classname: {}", className );
WritableUtils.writeVInt( outputStream, HadoopTupleOutputStream.WRITABLE_TOKEN ); // denotes to punt to hadoop serialization
WritableUtils.writeString( outputStream, className );
WritableUtils.writeVInt( outputStream, token );
Serializer serializer = serializers.get( type );
if( serializer == null )
serializer = tupleSerialization.getNewSerializer( type );
serializer.open( outputStream );
serializers.put( type, serializer );
serializer.serialize( object );
catch( IOException exception )
LOG.error( "failed serializing token: " + token + " with classname: " + className, exception );
throw exception;
public void close()
if( serializers.size() == 0 )
Collection<Serializer> clone = new ArrayList<Serializer>( serializers.values() );
for( Serializer serializer : clone )
catch( IOException exception )
// do nothing