package com.ebay.erl.mobius.core.mapred;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import com.ebay.erl.mobius.core.ConfigureConstants;
import com.ebay.erl.mobius.core.builder.Dataset;
import com.ebay.erl.mobius.core.collection.BigTupleList;
import com.ebay.erl.mobius.core.criterion.TupleCriterion;
import com.ebay.erl.mobius.core.datajoin.DataJoinReducer;
import com.ebay.erl.mobius.core.datajoin.DataJoinValueGroup;
import com.ebay.erl.mobius.core.function.base.ExtendFunction;
import com.ebay.erl.mobius.core.function.base.GroupFunction;
import com.ebay.erl.mobius.core.function.base.Projectable;
import com.ebay.erl.mobius.core.model.ReadFieldImpl;
import com.ebay.erl.mobius.core.model.Tuple;
import com.ebay.erl.mobius.util.SerializableUtil;
import com.ebay.erl.mobius.util.Util;
/**
* Reducer that handles Mobius join and
* group-by jobs.
*
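* <p>
* A minimal sketch of how this reducer could be attached to a job
* (hypothetical wiring; in practice the Mobius job builders perform
* this setup):
* <pre>{@code
* JobConf job = new JobConf();
* job.setReducerClass(DefaultMobiusReducer.class);
* job.setOutputKeyClass(NullWritable.class);
* job.setOutputValueClass(Tuple.class);
* }</pre>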
* <p>
* This product is licensed under the Apache License, Version 2.0,
* available at http://www.apache.org/licenses/LICENSE-2.0.
*
* This product contains portions derived from Apache Hadoop, which is
* licensed under the Apache License, Version 2.0, available at
* http://hadoop.apache.org.
*
* © 2007 – 2012 eBay Inc., Evan Chiu, Woody Zhou, Neel Sundaresan
*/
@SuppressWarnings({"deprecation", "unchecked"})
public class DefaultMobiusReducer extends DataJoinReducer<Tuple, Tuple, NullWritable, WritableComparable<?>>
{
private static final Log LOGGER = LogFactory.getLog(DefaultMobiusReducer.class);
/**
* IDs of all the participating {@link Dataset}s, in
* left-to-right order in a join job.
*
* If this is a join job, this array is used to verify that
* the current dataset is the expected one; if it is not, we
* cannot perform an inner join.
*/
private Byte[] _allDatasetIDs;
/**
* array storing all the dataset IDs except the
* last one.
*/
private Byte[] _allButNotLastDatasetIDs;
/**
* A quick reference to get the last dataset ID.
*/
private Byte _lastDatasetID;
/**
* Hadoop job config.
*/
private JobConf conf;
/**
* the criteria specified by the user, applied
* just before the persist step.
*/
protected TupleCriterion _persistantCriteria;
/**
* the final projection functions.
*/
protected Projectable[] _projections = null;
/**
* The final projected column names, in user
* specified order.
*/
protected String[] outputColumnNames = null;
/**
* A flag to indicate if we have set the reference
* of Hadoop reporter to every projectable functions
* or not.
*/
protected boolean reporterSet = false;
/**
* When set to true, there is at least one
* projectable function that requires columns from
* different datasets as its input.
*/
protected boolean requirePreCrossProduct = false;
/**
* list of group functions that need columns from multiple
* datasets as the input.
*/
protected List<GroupFunction> multiDatasetGroupFunction = new LinkedList<GroupFunction>();
/**
* list of extend functions that need columns from multiple
* datasets as the input.
*/
// initialized eagerly, like multiDatasetGroupFunction, so that later
// size() checks don't throw NullPointerException when no multi-dataset
// extend function exists.
protected List<ExtendFunction> multiDatasetExtendFunction = new LinkedList<ExtendFunction>();
/**
* mapping from a datasetID to a list of group functions that
* require columns only from that datasetID.
*/
protected Map<Byte, List<GroupFunction>> singleDatasetGroupFunction = new HashMap<Byte, List<GroupFunction>>();
/**
* mapping from a datasetID to a list of extend functions that
* require columns only from that datasetID.
*/
protected Map<Byte, List<ExtendFunction>> singleDatasetExtendFunction = new HashMap<Byte, List<ExtendFunction>>();
/**
* mapping from a datasetID to the result of its extend functions
* that require only the columns from the dataset.
*/
protected Map<Byte, BigTupleList> singleDatasetExtendFunResult = new HashMap<Byte, BigTupleList>();
/**
* only used when <code>requirePreCrossProduct</code> is true.
*/
protected Map<Byte, BigTupleList> valuesForAllDatasets = new HashMap<Byte, BigTupleList>();
/**
* A mapping to remember the schema of each dataset.
*/
private Map<Byte/*dataset ID*/, String[]/* schema belongs to the dataset*/> datasetToSchemaMapping
= new HashMap<Byte, String[]>();
/**
* a boolean flag to indicate this job is outer join (
* including left-outer-join and right-outer-join) or
* not.
*/
protected boolean isOuterJoin;
/**
* the replacement specified by user to replace
* the null columns for outer-join job.
*/
protected Object nullReplacement;
/**
* If the extend functions for a given dataset
* require only the join key, the engine computes
* them once per group instead of once per value.
*/
private Map<Byte, Boolean> onlyHasGroupKeyExtendFunctions = new HashMap<Byte, Boolean>();
@Override
public void configure(JobConf conf)
{
super.configure(conf);
this.conf = conf;
///////////////////////////////////////////
// setup the criteria to be applied in the
// final projections
///////////////////////////////////////////
if( this.conf.get(ConfigureConstants.PERSISTANT_CRITERIA, null)!=null )
{
try
{
this._persistantCriteria = (TupleCriterion)SerializableUtil.deserializeFromBase64(this.conf.get(ConfigureConstants.PERSISTANT_CRITERIA), this.conf);
}
catch (IOException e)
{
throw new IllegalArgumentException("Cannot deserialize "+ConfigureConstants.PERSISTANT_CRITERIA+
" from ["+this.conf.get(ConfigureConstants.PERSISTANT_CRITERIA)+"]", e);
}
}
////////////////////////////////////////
// setup <code>_allDatasetIDs</code>
////////////////////////////////////////
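// e.g. (hypothetical value) ALL_DATASET_IDS = "0,1,2"
// yields _allDatasetIDs = {0, 1, 2}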
String[] allDSIDs = this.conf.getStrings(ConfigureConstants.ALL_DATASET_IDS, Util.ZERO_SIZE_STRING_ARRAY);
this._allDatasetIDs = new Byte[allDSIDs.length];
for( int i=0;i<allDSIDs.length;i++ )
{
this._allDatasetIDs[i] = Byte.valueOf(allDSIDs[i]);
}
if( this._allDatasetIDs.length==0 )
throw new IllegalStateException(ConfigureConstants.ALL_DATASET_IDS+" is not set.");
////////////////////////////////////////
// setup <code>_lastDatasetID</code>
////////////////////////////////////////
this._lastDatasetID = this._allDatasetIDs[this._allDatasetIDs.length-1];
this._allButNotLastDatasetIDs = new Byte[this._allDatasetIDs.length-1];
for( int i=0;i<this._allDatasetIDs.length-1;i++ )
this._allButNotLastDatasetIDs[i] = this._allDatasetIDs[i];
///////////////////////////////////////
// setup <code>_projections</code>
//////////////////////////////////////
try
{
this._projections = (Projectable[]) SerializableUtil.deserializeFromBase64(this.conf.get(ConfigureConstants.PROJECTION_COLUMNS), this.conf);
List<String> outputColumnNameList = new ArrayList<String>();
for(Projectable p:this._projections )
{
p.setCalledByCombiner(false);
// save the output columns in user-specified order
// so that the tuples in the final projections
// emit the columns in the user-expected ordering.
for( String name:p.getOutputSchema() )
outputColumnNameList.add(name);
}
this.outputColumnNames = outputColumnNameList.toArray(new String[outputColumnNameList.size()]);
}
catch (IOException e)
{
throw new IllegalArgumentException(e);
}
// classify the projectable functions; they are used in the final cross-product.
for( Projectable func: this._projections )
{
if( func.requireDataFromMultiDatasets() )
{
// there is at least one projectable function that requires
// columns from different datasets; such functions need the
// cross product of the values from those datasets to compute
// their results, so set this flag to true and perform the
// cross product later.
requirePreCrossProduct = true;
if( func instanceof GroupFunction )
{
if( this.multiDatasetGroupFunction==null )
this.multiDatasetGroupFunction = new LinkedList<GroupFunction>();
this.multiDatasetGroupFunction.add((GroupFunction)func);
}
else if ( func instanceof ExtendFunction )
{
if( this.multiDatasetExtendFunction==null )
this.multiDatasetExtendFunction = new LinkedList<ExtendFunction>();
this.multiDatasetExtendFunction.add((ExtendFunction)func);
}
else
{
throw new IllegalArgumentException(func.getClass().getCanonicalName()+" is neither a sub-class of "+
GroupFunction.class.getCanonicalName()+" nor "+
ExtendFunction.class.getCanonicalName()+".");
}
}
else
{
// projectable functions that require columns from one dataset only.
boolean onlyUseGroupKey = true;
Byte datasetID = func.getParticipatedDataset().toArray(new Dataset[0])[0].getID();
if( func instanceof GroupFunction )
{
List<GroupFunction> funcs = null;
if( (funcs=this.singleDatasetGroupFunction.get(datasetID))==null ){
funcs = new LinkedList<GroupFunction>();
this.singleDatasetGroupFunction.put(datasetID, funcs);
}
funcs.add((GroupFunction)func);
}
else if ( func instanceof ExtendFunction )
{
List<ExtendFunction> funcs = null;
if( (funcs=this.singleDatasetExtendFunction.get(datasetID))==null ){
funcs = new LinkedList<ExtendFunction>();
this.singleDatasetExtendFunction.put(datasetID, funcs);
}
funcs.add((ExtendFunction)func);
if( !func.useGroupKeyOnly() )
onlyUseGroupKey = false;
}
else
{
throw new IllegalArgumentException(func.getClass().getCanonicalName()+" is neither a sub-class of "+
GroupFunction.class.getCanonicalName()+" nor "+
ExtendFunction.class.getCanonicalName()+".");
}
// AND with any previously recorded value so that one extend
// function needing more than the group key keeps the flag
// false for the whole dataset.
Boolean previousFlag = this.onlyHasGroupKeyExtendFunctions.get(datasetID);
this.onlyHasGroupKeyExtendFunctions.put(datasetID, onlyUseGroupKey && (previousFlag==null || previousFlag));
}
}
this.isOuterJoin = this.conf.getBoolean(ConfigureConstants.IS_OUTER_JOIN, false);
////////////////////////////////////////
// setup <code>nullReplacement</code>
////////////////////////////////////////
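// the replacement value is stored as the Base64 encoding of a single
// serialized Tuple field, with its type byte kept separately under
// NULL_REPLACEMENT_TYPE; e.g. (hypothetical) a string replacement
// "N/A" would be decoded back into an Object here via ReadFieldImpl.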
try
{
if( this.conf.get(ConfigureConstants.NULL_REPLACEMENT, null)!=null )
{
byte[] binary = Base64.decodeBase64(this.conf.get(ConfigureConstants.NULL_REPLACEMENT).getBytes("UTF-8"));
byte type = (byte)this.conf.getInt(ConfigureConstants.NULL_REPLACEMENT_TYPE, -1);
ByteArrayInputStream buffer = new ByteArrayInputStream(binary);
DataInputStream input = new DataInputStream(buffer);
List<Object> temp = new LinkedList<Object>();
ReadFieldImpl reader = new ReadFieldImpl(temp, input, this.conf);
reader.handle(type);
this.nullReplacement = temp.remove(0);
}
}
catch(IOException e)
{
throw new RuntimeException("Cannot deserialize null_replacement from:" +
"["+this.conf.get(ConfigureConstants.NULL_REPLACEMENT)+"]", e);
}
}
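/**
 * Called once per join/group key. <code>values</code> are grouped by
 * dataset ID in left-to-right join order; for example (hypothetical
 * data), in a two-dataset join the group for a key might be
 * {DS0: [t1, t2], DS1: [t3]}, and the emitted rows are the final
 * projections over the cross product [t1, t2] x [t3].
 */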
@Override
public void joinreduce(Tuple key, DataJoinValueGroup<Tuple> values, OutputCollector<NullWritable, WritableComparable<?>> output, Reporter reporter)
throws IOException
{
// set the reporter on all projectable functions
if( !reporterSet )
{
for( Projectable p:this._projections )
{
LOGGER.info("Set reporter to "+p.getClass().getCanonicalName());
p.setReporter(reporter);
}
reporterSet = true;
}
this.clearPreviousResults();
int expectingDatasetIDX = 0;
// don't store the values from the last dataset in a
// {@link BigTupleList}; use the iterator to stream
// through them when performing the cross product.
Iterator<Tuple> valuesFromLastDataset = null;
//////////////////////////////////////////
// compute the result for the projections
//////////////////////////////////////////
while( values.hasNext() )
{
Byte datasetID = values.nextDatasetID();
if ( !datasetID.equals(_allDatasetIDs[expectingDatasetIDX]) )
{
// no records from the expected dataset, which means
// 1) the groups are not fully inner-joinable,
// 2) no records from the left dataset in a left-outer-join, or
// 3) no records from the right dataset in a right-outer-join; return
return;
}
expectingDatasetIDX++;
if( !datasetID.equals(this._lastDatasetID) )// values not from the last dataset
{
Iterator<Tuple> valuesForCurrentDataset = values.next();
computeSingleDSFunctionsResults(valuesForCurrentDataset, datasetID, reporter);
}
else
{
// the remaining values are all from the last
// dataset; keep a reference to the value iterator
// to perform the cross product later
valuesFromLastDataset = values.next();
break;
}
}
if ( valuesFromLastDataset == null )
{
if( !this.isOuterJoin )
{
// no records from the last dataset; unable to
// perform a full inner join, so return
return;
}
else
{
// no records from the last dataset, but this is
// an outer-join job, so continue.
}
}
//////////////////////////////////////////////////////////////
// cross product the results of all the datasets except
// the last one; the result contains only the projectable
// functions that don't require columns from multiple datasets.
/////////////////////////////////////////////////////////////
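// e.g. (hypothetical tuples) DS0 results {t1, t2} crossed with
// DS1 results {t3} produce {merge(t1,t3), merge(t2,t3)}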
Iterable<Tuple> resultsFromOtherDatasets = this.crossProduct(reporter, false, _allButNotLastDatasetIDs);
List<Iterable<Tuple> > toBeCrossProduct = new ArrayList<Iterable<Tuple>>();
if( resultsFromOtherDatasets!=null)
toBeCrossProduct.add(resultsFromOtherDatasets);
/////////////////////////////////////////////////////
// start to compute the results for the final dataset
/////////////////////////////////////////////////////
boolean hasMultiDSFunctions = this.requirePreCrossProduct;
if( hasMultiDSFunctions )
{
// there are functions that require columns from multiple
// datasets, so save the values from the last dataset
// into a BigTupleList so that we can iterate over them
// multiple times.
if( valuesFromLastDataset!=null )
{
while( valuesFromLastDataset.hasNext() )
{
Tuple aRow = valuesFromLastDataset.next();
this.rememberTuple(_lastDatasetID, aRow, reporter);
}
Iterable<Tuple> preCrossProduct = Util.crossProduct(conf, reporter, this.valuesForAllDatasets.values().toArray(new BigTupleList[0]));
BigTupleList btl = new BigTupleList(reporter);
for( Tuple aRow:preCrossProduct )
{
this.computeExtendFunctions(aRow, btl, this.multiDatasetExtendFunction);
this.computeGroupFunctions(aRow, this.multiDatasetGroupFunction);
}
if( btl.size()>0 )
toBeCrossProduct.add(btl);
for(GroupFunction fun:this.multiDatasetGroupFunction )
toBeCrossProduct.add(fun.getResult());
valuesFromLastDataset = this.valuesForAllDatasets.get(_lastDatasetID).iterator();
}
else
{
if( this.multiDatasetExtendFunction.size()>0 )
{
BigTupleList btl = new BigTupleList(reporter);
this.computeExtendFunctions(null, btl, this.multiDatasetExtendFunction);
toBeCrossProduct.add(btl);
}
for(GroupFunction fun:this.multiDatasetGroupFunction )
toBeCrossProduct.add(fun.getNoMatchResult(nullReplacement));
}
}
// finished the computation of multi-dataset functions; start
// to compute the projectable function results for the last
// dataset.
//
// first compute the cross product of all other functions
Iterable<Tuple> others = null;
if( toBeCrossProduct.size()>0 )
{
Iterable<Tuple>[] array = new Iterable[toBeCrossProduct.size()];
for( int i=0;i<toBeCrossProduct.size();i++ )
{
array[i] = toBeCrossProduct.get(i);
}
others = Util.crossProduct(conf, reporter, array);
}
if( valuesFromLastDataset==null )
{// outer join, so <code>others</code> is never null.
List<BigTupleList> nullResult = new ArrayList<BigTupleList>();
if( this.singleDatasetExtendFunction.get(_lastDatasetID)!=null )
{
BigTupleList btl = new BigTupleList(reporter);
this.computeExtendFunctions(null, btl, this.singleDatasetExtendFunction.get(_lastDatasetID));
nullResult.add(btl);
}
if( this.singleDatasetGroupFunction.get(_lastDatasetID)!=null )
{
for(GroupFunction fun:this.singleDatasetGroupFunction.get(_lastDatasetID) )
nullResult.add(fun.getNoMatchResult(nullReplacement));
}
for( Tuple t1:Util.crossProduct(conf, reporter, nullResult) )
{
for( Tuple t2:others )
{
this.output(Tuple.merge(t1, t2), output, reporter);
}
}
}
else
{
boolean hasNoGroupFunctionForLastDS = this.singleDatasetGroupFunction.get(this._lastDatasetID)==null;
while( valuesFromLastDataset.hasNext() )
{
Tuple aRow = valuesFromLastDataset.next();
aRow.setSchema(this.getSchemaByDatasetID(_lastDatasetID));
if(hasNoGroupFunctionForLastDS)
{
// there is no group function for the last DS, so we
// can optimize: as we stream over the values of the
// last dataset, we emit outputs at the same time.
Tuple merged = new Tuple();
// guard: the last dataset may have no extend functions registered
List<ExtendFunction> lastDSExtendFunctions = this.singleDatasetExtendFunction.get(_lastDatasetID);
if( lastDSExtendFunctions!=null )
{
for( ExtendFunction func:lastDSExtendFunctions )
{
merged = Tuple.merge(merged, func.getResult(aRow));
}
}
if( others!=null )
{
for(Tuple t:others)
{
this.output(Tuple.merge(t, merged), output, reporter);
}
}
else
{
this.output(merged, output, reporter);
}
}
else
{
this.processExtendFunctions(_lastDatasetID, aRow, reporter);
this.computeGroupFunctions(_lastDatasetID, aRow);
}
}
if(!hasNoGroupFunctionForLastDS)
{
for( Tuple t1: this.crossProduct(reporter, false, _lastDatasetID) )
{
if( others!=null )
{
for(Tuple t2:others)
{
this.output(Tuple.merge(t1, t2), output, reporter);
}
}
else
{
this.output(t1, output, reporter);
}
}
}
}
}
/**
* compute the functions that require columns from the given datasetID only.
*/
private void computeSingleDSFunctionsResults(Iterator<Tuple> tuples, Byte datasetID, Reporter reporter)
{
while( tuples.hasNext() )
{
Tuple aTuple = tuples.next();
aTuple.setSchema(this.getSchemaByDatasetID(datasetID));
if( this.requirePreCrossProduct )
{
// some functions need columns from multiple
// datasets, so remember this value for the
// later cross product
rememberTuple(datasetID, aTuple, reporter);
}
this.processExtendFunctions(datasetID, aTuple, reporter);
this.computeGroupFunctions(datasetID, aTuple);
}
}
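/**
 * Emit <code>aTuple</code> to the output, applying the user-specified
 * criteria first when one is configured; accepted rows increment the
 * "EMITTED" counter, rejected rows the "FILTERED" counter.
 */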
private void output(Tuple aTuple, OutputCollector<NullWritable, WritableComparable<?>> output, Reporter reporter)
throws IOException
{
aTuple.setToStringOrdering(this.outputColumnNames);
if( this._persistantCriteria!=null )
{
if( this._persistantCriteria.accept(aTuple, this.conf) )
{
output.collect(NullWritable.get(), aTuple);
reporter.getCounter("Join/Grouping Records", "EMITTED").increment(1);
}
else
{
reporter.getCounter("Join/Grouping Records", "FILTERED").increment(1);
}
}
else
{
output.collect(NullWritable.get(), aTuple);
reporter.getCounter("Join/Grouping Records", "EMITTED").increment(1);
}
}
private void rememberTuple(Byte datasetID, Tuple aTuple, Reporter reporter){
BigTupleList tuples = null;
if( (tuples=this.valuesForAllDatasets.get(datasetID))==null )
{
tuples = new BigTupleList(reporter);
this.valuesForAllDatasets.put(datasetID, tuples);
}
tuples.add(aTuple);
}
/**
* compute the extend functions for the given datasetID, using
* <code>aRow</code> as the input, and save the result for the
* final cross-product.
*/
private void processExtendFunctions(Byte datasetID, Tuple aRow, Reporter reporter)
{
// process extend function for this current dataset and save the result
List<ExtendFunction> extendFunctions = this.singleDatasetExtendFunction.get(datasetID);
if( extendFunctions==null )
return;
BigTupleList computedResult = null;
if( (computedResult=this.singleDatasetExtendFunResult.get(datasetID))==null )
{
computedResult = new BigTupleList(reporter);
this.singleDatasetExtendFunResult.put(datasetID, computedResult);
}
if( onlyHasGroupKeyExtendFunctions.get(datasetID) )
{
if(computedResult.size()==0 )
this.computeExtendFunctions(aRow, computedResult, extendFunctions);
}
else
{
this.computeExtendFunctions(aRow, computedResult, extendFunctions);
}
}
/**
* compute the extend function results using <code>aRow</code>,
* merge the tuples from all the functions into a single one, and
* add it to the <code>result</code> list for the final cross-product.
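* <p>
* For example (hypothetical functions), if f1 produces (colA=1) and f2
* produces (colB=2) for the same input row, the single tuple
* (colA=1, colB=2) is added to <code>result</code>.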
*/
private void computeExtendFunctions(Tuple aRow, BigTupleList result, List<ExtendFunction> functions)
{
if( functions!=null && !functions.isEmpty() )
{
Tuple mergedResult = new Tuple();
for( ExtendFunction aFunction:functions )
{
if( aRow!=null )
mergedResult = Tuple.merge(mergedResult, aFunction.getResult(aRow));
else
mergedResult = Tuple.merge(mergedResult, aFunction.getNoMatchResult(nullReplacement));
}
result.add(mergedResult);
}
}
/**
* For each group function of the given datasetID,
* call its consume method with <code>aRow</code>
* as the input.
*/
private void computeGroupFunctions(Byte datasetID, Tuple aRow)
{
List<GroupFunction> groupFunctions = this.singleDatasetGroupFunction.get(datasetID);
this.computeGroupFunctions(aRow, groupFunctions);
}
private void computeGroupFunctions(Tuple aRow, List<GroupFunction> functions)
{
if( functions!=null && !functions.isEmpty()){
for( GroupFunction aFunction:functions )
{
if( aRow!=null)
aFunction.consume(aRow);
}
}
}
private void clearPreviousResults()
{
for( BigTupleList list:this.singleDatasetExtendFunResult.values() )
{
list.clear();
}
for( Projectable fun:this._projections ){
if( fun instanceof GroupFunction ){
((GroupFunction)fun).reset();
}
}
for( BigTupleList list:this.valuesForAllDatasets.values() )
{
list.clear();
}
// clear the map itself; removing entries while iterating over
// keySet() throws ConcurrentModificationException.
this.valuesForAllDatasets.clear();
}
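/**
 * Look up (and cache) the value schema of the given dataset ID from the
 * job configuration key <code>&lt;datasetID&gt;.value.columns</code>;
 * e.g. (hypothetical entry) <code>1.value.columns=SELLER_ID,ITEM_ID</code>
 * yields the schema {"SELLER_ID", "ITEM_ID"} for dataset 1.
 */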
protected String[] getSchemaByDatasetID(Byte datasetID)
{
String[] schema = null;
if( (schema=this.datasetToSchemaMapping.get(datasetID))==null )
{
schema = this.conf.getStrings(datasetID+".value.columns", Util.ZERO_SIZE_STRING_ARRAY);
if( schema.length==0 )
{
// should never happen
throw new IllegalStateException("Schema for dataset:"+datasetID+" is not set.");
}
this.datasetToSchemaMapping.put(datasetID, schema);
}
return schema;
}
/**
* compute the cross-product result of the given dataset IDs.
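* <p>
* For example (hypothetical results), if a dataset has extend-function
* results {e1} and one group function with results {g1, g2}, the returned
* iterable contains merge(e1, g1) and merge(e1, g2).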
*/
private Iterable<Tuple> crossProduct(Reporter reporter, boolean usingNull, Byte... datasetIDs)
throws IOException
{
if( datasetIDs==null || datasetIDs.length==0 )
return null;
List<BigTupleList> resultsToBeCrossProducts = new ArrayList<BigTupleList>();
for( Byte datasetID:datasetIDs )
{
if( this.singleDatasetExtendFunResult.get(datasetID)!=null )
{
resultsToBeCrossProducts.add(this.singleDatasetExtendFunResult.get(datasetID));
}
if( this.singleDatasetGroupFunction.get(datasetID)!=null )
{
for( GroupFunction fun:this.singleDatasetGroupFunction.get(datasetID) )
{
if( usingNull )
resultsToBeCrossProducts.add(fun.getNoMatchResult(this.nullReplacement));
else
resultsToBeCrossProducts.add(fun.getResult());
}
}
}
return Util.crossProduct(conf, reporter, resultsToBeCrossProducts);
}
}