// Source Code of com.ebay.erl.mobius.core.Persistable

package com.ebay.erl.mobius.core;


import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;


import com.ebay.erl.mobius.core.builder.AbstractDatasetBuilder;
import com.ebay.erl.mobius.core.builder.Dataset;
import com.ebay.erl.mobius.core.builder.DatasetBuildersFactory;
import com.ebay.erl.mobius.core.criterion.TupleCriterion;
import com.ebay.erl.mobius.core.datajoin.DataJoinKey;
import com.ebay.erl.mobius.core.datajoin.DataJoinKeyPartitioner;
import com.ebay.erl.mobius.core.datajoin.DataJoinValue;
import com.ebay.erl.mobius.core.function.base.GroupFunction;
import com.ebay.erl.mobius.core.function.base.Projectable;
import com.ebay.erl.mobius.core.mapred.DefaultMobiusCombiner;
import com.ebay.erl.mobius.core.mapred.DefaultMobiusReducer;
import com.ebay.erl.mobius.core.model.Column;
import com.ebay.erl.mobius.util.SerializableUtil;
import com.ebay.erl.mobius.util.Util;


/**
 * Sets the projections (columns to be saved on disk ) 
 * for join or group-by jobs.
 * <p>
 * 
 * The user cannot create an instance of this class
 * directly.  To get an instance of this class, use 
 * {@link JoinOnConfigure} for join type jobs, or 
 * {@link GroupByConfigure} for group-by jobs.
 * <p>
 * 
 * See {@link MobiusJob#innerJoin(Dataset...)} or
 * {@link MobiusJob#group(Dataset)} for information
 * on creating a join or group-by job.
 * 
 * 
 * 
 * This product is licensed under the Apache License,  Version 2.0, 
 * available at http://www.apache.org/licenses/LICENSE-2.0.
 * 
 * This product contains portions derived from Apache hadoop which is 
 * licensed under the Apache License, Version 2.0, available at 
 * http://hadoop.apache.org.
 * 
 * © 2007 – 2012 eBay Inc., Evan Chiu, Woody Zhou, Neel Sundaresan
 */
@SuppressWarnings({"deprecation", "unchecked"})
public class Persistable 
{
  private Configuration userDefinedConf;
  
  private JobConf jobConf;
  
  private Dataset[] datasets;
  
  private static final Log LOGGER = LogFactory.getLog(Persistable.class);
  
  
  Persistable(Configuration jobConf, Dataset... datasets)
  {
    this.jobConf  = new JobConf(jobConf);
    this.datasets  = datasets;
  }
  
  
  
  /**
   * set a configuration property to this job's 
   * configuration.
   * <p>
   * 
   * @param name a property name in a Hadoop job configuration.
   * @param value the value for the property name in a Hadoop 
   * job configuration.
   */
  public Persistable setConf(String name, String value)
  {
    if( userDefinedConf==null )
    {
      this.userDefinedConf = new Configuration(false);
    }
    this.userDefinedConf.set(name, value);
    return this;
  }
  
  
  
  /**
   * Specify the name of this job.
   */
  public Persistable setJobName(String newJobName)
  {
    this.jobConf.set("mapred.job.name", newJobName);
    return this;
  }
  
  
  
  /**
   * Specify the number of reducer of this job.
   */
  public Persistable setReducersNumber(int reducerNumber)
  {
    if( reducerNumber<=0 )
      throw new IllegalArgumentException("number of reducer must grater than 0.");
    
    this.jobConf.setInt("mapred.reduce.tasks", reducerNumber);
    return this;
  }
  
  
  
  /**
   * Build the dataset and store the <code>projections</code>
   * into a temporal path (under hadoop.tmp.dir) in the format of
   * {@link SequenceFileOutputFormat}.
   */
  public Dataset build(MobiusJob job, Projectable... projections)
    throws IOException
  {
    return this.build(job, SequenceFileOutputFormat.class, projections);
  }
  
  
  
  /**
   * Build the dataset and store the <code>projections</code>
   * into a temporal path (under hadoop.tmp.dir) in the format of
   * {@link SequenceFileOutputFormat}.
   * <p>
   * 
   * Only the rows that meet the <code>criteria</code> will be 
   * stored.  The <code>criteria</code> can only evaluate the 
   * columns specified in the <code>projections</code>.
   */
  public Dataset build(MobiusJob job, TupleCriterion criteria, Projectable... projections)
    throws IOException
  {
    return this.build(job, SequenceFileOutputFormat.class, criteria, projections);
  }
  
  
  
  /**
   * Build the dataset and store the <code>projections</code>
   * into a temporal path (under hadoop.tmp.dir) in the format of
   * the given <code>outputFormat</code>.
   * <p>
   */
  public Dataset build(MobiusJob job, Class<? extends FileOutputFormat> outputFormat, Projectable... projections)
    throws IOException
  {
    return this.build(job, outputFormat, null, projections);
  }
  
  
  
  /**
   * Build the dataset and store the <code>projections</code>
   * into a temporal path (under hadoop.tmp.dir) in the format of
   * {@link SequenceFileOutputFormat}.
   * <p>
   * 
   * Only the rows that meet the <code>criteria</code> will be 
   * stored.  The <code>criteria</code> can only evaluate the 
   * columns specified in the <code>projections</code>.
   * 
   * @param job 
   * @param outputFormat
   * @param criteria if specified (not null), only rows that satisfy the given <code>criteria</code>
   * will be saved.  Note that, <code>criteria</code> is applied just before the persistant step, so
   * it can only operate on the columns in the output schema of this job.
   * @param projections the columns to be saved in the returned {@link Dataset}.
   * @return a {@link Dataset} with the specified columns ()
   * @throws IOException
   */
  public Dataset build(MobiusJob job, Class<? extends FileOutputFormat> outputFormat, TupleCriterion criteria, Projectable... projections)
    throws IOException
  {
    return this.save(job, job.newTempPath(), outputFormat, criteria, projections);
  }
  
  
  
  /**
   * Save the dataset and store the <code>projections</code>
   * into a the specified <code>output</code> path in the 
   * format of {@link TextOutputFormat}.
   * <p>
   * 
   * <code>output</code> will be deleted before the job gets started.
   */
  public Dataset save(MobiusJob job, Path output, Projectable... projections)
    throws IOException
  {
    return this.save(job, output, TextOutputFormat.class, null, projections);
  }
  
  
  /**
   * Save the dataset and store the <code>projections</code>
   * into a the specified <code>output</code> path in the 
   * format of {@link TextOutputFormat}.
   * <p>
   * 
   * Only the rows that meet the <code>criteria</code> will be 
   * stored.  The <code>criteria</code> can only evaluate the 
   * columns specified in the <code>projections</code>.
   * <p>
   * 
   * <code>output</code> will be deleted before the job gets started.
   */
  public Dataset save(MobiusJob job, Path output, TupleCriterion criteria, Projectable... projections)
    throws IOException
  {
    return this.save(job, output, TextOutputFormat.class, criteria, projections);
  }
  
  
  /**
   * Save the dataset and store the <code>projections</code>
   * into a the specified <code>output</code> path in the 
   * format of the given <code>outputFormat</code>.
   * <p>
   * 
   * <code>output</code> will be deleted before the job gets started.
   */
  public Dataset save(MobiusJob job, Path output, Class<? extends FileOutputFormat> outputFormat, Projectable... projections)
    throws IOException
  {
    return this.save(job, output, outputFormat, null, projections);
  }
  
  
  /**
   * Save the dataset and store the <code>projections</code>
   * into a the specified <code>output</code> path in the 
   * format of the given <code>outputFormat</code>.
   * <p>
   * 
   * Only the rows that meet the <code>criteria</code> will be 
   * stored.  The <code>criteria</code> can only evaluate the 
   * columns specified in the <code>projections</code>.
   * <p>
   * 
   * <code>output</code> will be deleted before the job gets started.
   */
  public Dataset save(MobiusJob job, Path output, Class<? extends FileOutputFormat> outputFormat, TupleCriterion criteria, Projectable... projections)
    throws IOException
  {
    if( projections==null || projections.length==0 )
      throw new IllegalArgumentException("Please specify the output columns.");
    
    
    
    // - VALIDATION - make sure no ambiguous column names.
    //
    // make sure the projections don't have two or more different columns that
    // have the same name but in different dataset, as we are going the use 
    // the {@link Column#getOutputColumnName} as the output schema of the
    // returned dataset.
    Set<String> columnNames = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
    for( Projectable aColumn:projections )
    {      
      String[] outputSchema = aColumn.getOutputSchema();
      for(String anOutput:outputSchema)
      {
        if( !columnNames.contains(anOutput) )
        {
          columnNames.add(anOutput);
        }
        else
        {
          throw new IllegalArgumentException(columnNames+" from "+aColumn.toString()+" is ambiguous, it has the same name" +
              "as aother selected projected in different dataset, please use Column#setNewName(String) to" +
              "change it.");
        }
      }
    }
    
    // - VALIDATION - if <code>criteria</code> is not null, need to make
    // sure the columns used in the criteria are in the output columns.
    if ( criteria!=null )
    {
      TupleCriterion.validate(columnNames, criteria);
      this.jobConf.set(ConfigureConstants.PERSISTANT_CRITERIA, SerializableUtil.serializeToBase64(criteria));
    }
    
    
    // setup {@link Dataset} to {@link Column} mapping so we can setup projection columns
    // for each dataset, and also perform validation on making sure all the projection columns 
    // are from the selected <code>datasets</code> only,
    Map<Dataset, List<Column> > datasetToColumns = new HashMap<Dataset, List<Column>>();
    
    for( Projectable aFunc:projections )
    {
      Column[] requiredInputColumns = aFunc.getInputColumns();
      for( Column aColumn:requiredInputColumns )
      {
        Dataset aDataset = aColumn.getDataset();
        // make sure the <code>aDataset</code> within the participated datasets
        boolean withinSelectedDataset = false;
        for( Dataset aSelectedDataset:this.datasets )
        {
          if( aSelectedDataset.equals(aDataset) )
          {
            withinSelectedDataset = true;
            break;
          }
        }
        
        if( !withinSelectedDataset )
        {
          // user select a column from a dataset that doesn't
          // in the selected datasets in this join/group by job.
          throw new IllegalArgumentException(aColumn.toString()+" does not within the selected datasets " +
              "in this join/group task, please select columns only from the selected datasets.");
        }
        
        List<Column> projectablesInADataset = null;
        if ( (projectablesInADataset=datasetToColumns.get(aDataset))==null )
        {
          projectablesInADataset = new LinkedList<Column>();
          datasetToColumns.put(aDataset, projectablesInADataset);
        }
        
        if( !projectablesInADataset.contains(aColumn) )
          projectablesInADataset.add(aColumn);
      }
    }
    
    if( datasetToColumns.keySet().size()!=this.datasets.length )
    {
      throw new IllegalArgumentException("Please select at least one column from each dataset in the join/group-by job.");
    }
    
    // SETUP JOB
    if( this.userDefinedConf!=null )
    {
      this.jobConf = new JobConf(Util.merge(this.jobConf, this.userDefinedConf));
    }
    this.jobConf.setJarByClass(job.getClass());
    this.jobConf.setMapOutputKeyClass(DataJoinKey.class);
    this.jobConf.setMapOutputValueClass(DataJoinValue.class);
    this.jobConf.setPartitionerClass (DataJoinKeyPartitioner.class);
    this.jobConf.setOutputValueGroupingComparator (DataJoinKey.Comparator.class);
    this.jobConf.setOutputKeyComparatorClass (DataJoinKey.class);
    this.jobConf.setReducerClass(DefaultMobiusReducer.class);
    this.jobConf.set(ConfigureConstants.PROJECTION_COLUMNS, SerializableUtil.serializeToBase64(projections));
    
    
    
    JobSetup.setupOutputs(this.jobConf, output, outputFormat);
    
    // setup input paths, projection columns for each datasets.
    for( byte assignedDatasetID=0;assignedDatasetID<this.datasets.length;assignedDatasetID++)
    {
      Dataset aDataset = this.datasets[assignedDatasetID];
      
      // setup input for each dataset
      JobSetup.setupInputs(jobConf, aDataset, assignedDatasetID);
      
      // setup projection for each dataset
      JobSetup.setupProjections(jobConf, aDataset, assignedDatasetID, datasetToColumns.get(aDataset).toArray(new Column[0]));
    }
    
    // setup all dataset IDs
    for( int i=0;i<this.datasets.length;i++)
    {
      Byte id = this.datasets[i].getID();
      if( !this.jobConf.get(ConfigureConstants.ALL_DATASET_IDS, "").isEmpty() )
      {
        this.jobConf.set(ConfigureConstants.ALL_DATASET_IDS, this.jobConf.get(ConfigureConstants.ALL_DATASET_IDS)+","+id);
      }
      else
      {
        this.jobConf.set(ConfigureConstants.ALL_DATASET_IDS, id.toString());
      }
    }
    
    
    boolean isCombinable = true;
    for( Projectable aFunc:projections )
    {
      aFunc.setConf(jobConf);
      
      if( !aFunc.isCombinable() )
      {
        isCombinable = false;
        LOGGER.info(aFunc.toString()+" is not combinable, #isCombinable() return false.");
        break;
      }
      if( aFunc instanceof GroupFunction && aFunc.useGroupKeyOnly() )
      {
        LOGGER.info(aFunc.toString()+" is a group function and use group key as its input only, disable combiner.");
        isCombinable = false;
        break;
      }
    }
    
    LOGGER.info("Using Combiner? "+isCombinable);
    if( isCombinable )
    {  
      jobConf.setCombinerClass(DefaultMobiusCombiner.class);
    }
    
    job.addToExecQueue(jobConf);
    
    AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(job).getBuilder(outputFormat, "Dataset_"+output.getName());
    
    // form the output column from the projections
    List<String> outputColumns = new ArrayList<String>();
    for( Projectable func:projections )
    {
      String[] aProjectOutputs = func.getOutputSchema();
      for(String anOutputName:aProjectOutputs)
      {
        outputColumns.add(anOutputName);
      }
    }
    
    
    
    
    return builder.buildFromPreviousJob(jobConf, outputFormat, outputColumns.toArray(new String[0]));
  }
}
// Source Code of com.ebay.erl.mobius.core.Persistable
//
// Related Classes of com.ebay.erl.mobius.core.Persistable