Source Code of com.datasalt.pangool.tuplemr.TupleMRBuilder

/**
* Copyright [2012] [Datasalt Systems S.L.]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datasalt.pangool.tuplemr;

import static com.datasalt.pangool.tuplemr.TupleMRException.failIfEmpty;
import static com.datasalt.pangool.tuplemr.TupleMRException.failIfNull;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.UUID;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.datasalt.pangool.io.DatumWrapper;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.tuplemr.MultipleInputsInterface.Input;
import com.datasalt.pangool.tuplemr.NamedOutputsInterface.Output;
import com.datasalt.pangool.tuplemr.mapred.GroupComparator;
import com.datasalt.pangool.tuplemr.mapred.RollupReducer;
import com.datasalt.pangool.tuplemr.mapred.SimpleCombiner;
import com.datasalt.pangool.tuplemr.mapred.SimpleReducer;
import com.datasalt.pangool.tuplemr.mapred.SortComparator;
import com.datasalt.pangool.tuplemr.mapred.TupleHashPartitioner;
import com.datasalt.pangool.tuplemr.mapred.lib.input.PangoolMultipleInputs;
import com.datasalt.pangool.tuplemr.mapred.lib.input.TupleInputFormat;
import com.datasalt.pangool.tuplemr.mapred.lib.output.ProxyOutputFormat;
import com.datasalt.pangool.tuplemr.mapred.lib.output.TupleOutputFormat;
import com.datasalt.pangool.tuplemr.serialization.TupleSerialization;
import com.datasalt.pangool.utils.InstancesDistributor;

/**
*
* TupleMRBuilder creates Tuple-based Map-Reduce jobs.
* <p>
*
* One of the key concepts of Tuple-based Map-Reduce is that Hadoop Key-Value pairs are no longer used. Instead, they
* are replaced by tuples.<br>
*
* Tuples (see {@link ITuple}) are just an ordered list of elements whose types are defined in a {@link Schema}.
* TupleMRBuilder contains several methods to define how grouping and sorting among tuples will be performed, avoiding
* the complex task of defining custom binary {@link SortComparator}, {@link GroupComparator} and
* {@link TupleHashPartitioner} implementations.
* <p>
*
* A Tuple-based Map-Reduce job, in its simplest form, requires the following to be defined (see the usage sketch
* after this list):<br>
* <ul>
*
* <li><b>Intermediate schemas:</b><br>
* A schema specifies the names and types of a Tuple's fields. Several schemas can be defined in order to perform
* joins among different input data. It is mandatory to specify at least one schema using
* {@link #addIntermediateSchema(Schema)}<br>
*
* <li><b>Group-by fields:</b><br>
* Needed to specify how the tuples will be grouped. Tuples that share the same values in the group-by fields will be
* grouped and reduced together in the Reduce phase.<br>
*
* <li><b>Tuple-based Mapper:</b><br>
* The job needs to specify a {@link TupleMapper} instance, the Tuple-based implementation of Hadoop's {@link Mapper}.
* Unlike Hadoop's Mappers, Tuple-based mappers are configured using stateful serializable instances and not static
* class definitions.<br>
*
* <li><b>Tuple-based Reducer:</b> Similar to mapper instances, the job needs to specify a {@link TupleReducer}
* instance, the Tuple-based implementation of Hadoop's {@link Reducer}.<br>
*
* </ul>
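*
* <p>
* A minimal usage sketch follows. {@code MyMapper} and {@code MyReducer} stand for hypothetical {@link TupleMapper}
* and {@link TupleReducer} implementations; {@code Fields.parse}, {@code setGroupByFields} (inherited from
* {@link TupleMRConfigBuilder}) and {@code HadoopInputFormat} are assumed from the surrounding Pangool API:
*
* <pre>
* {@code
* // MyMapper and MyReducer are hypothetical; HadoopInputFormat is assumed from Pangool's input lib
* Configuration conf = new Configuration();
* Schema schema = new Schema("counts", Fields.parse("word:string, count:int"));
*
* TupleMRBuilder builder = new TupleMRBuilder(conf, "word-count");
* builder.addIntermediateSchema(schema);
* builder.setGroupByFields("word");
* builder.addInput(new Path("in"), new HadoopInputFormat(TextInputFormat.class), new MyMapper());
* builder.setTupleReducer(new MyReducer());
* builder.setTupleOutput(new Path("out"), schema);
*
* Job job = builder.createJob();
* try {
*   job.waitForCompletion(true);
* } finally {
*   builder.cleanUpInstanceFiles(); // always remove the serialized instance files
* }
* }
* </pre>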
*
* @see ITuple
* @see Schema
* @see TupleMapper
* @see TupleReducer
*
*/
@SuppressWarnings("rawtypes")
public class TupleMRBuilder extends TupleMRConfigBuilder {

  private Configuration conf;

  private TupleReducer tupleReducer;
  private TupleReducer tupleCombiner;
  private OutputFormat outputFormat;
  private Class<?> jarByClass;
  private Class<?> outputKeyClass;
  private Class<?> outputValueClass;
  private String jobName;

  private Path outputPath;

  private MultipleInputsInterface multipleInputs;
  private NamedOutputsInterface namedOutputs;
 
  private Set<String> instanceFilesCreated = new HashSet<String>();

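  /**
   * @param conf
   *          Configuration instance
   */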
  public TupleMRBuilder(Configuration conf) {
    this.conf = conf;
    multipleInputs = new MultipleInputsInterface(this.conf);
    namedOutputs = new NamedOutputsInterface(this.conf);
  }

  /**
   * @param conf
   *          Configuration instance
   * @param name
   *          Job's name as in {@link Job}
   */
  public TupleMRBuilder(Configuration conf, String name) {
    this(conf);
    this.jobName = name;
  }

  public Configuration getConf() {
    return conf;
  }

  /**
   * Sets the jar by class, as in {@link Job#setJarByClass(Class)}
   */
  public void setJarByClass(Class<?> jarByClass) {
    this.jarByClass = jarByClass;
  }

  /**
   * Defines an input as in {@link PangoolMultipleInputs} using {@link TupleInputFormat}
   *
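   * <p>
   * A sketch, assuming the path contains Tuple files written with a matching schema by a previous job
   * ({@code MyMapper} is a hypothetical {@link TupleMapper}):
   *
   * <pre>
   * {@code
   * // MyMapper is hypothetical; it receives ITuple keys and NullWritable values
   * builder.addTupleInput(new Path("previous-job-out"), new MyMapper());
   * }
   * </pre>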
   * @see PangoolMultipleInputs
   */
  public void addTupleInput(Path path, TupleMapper<ITuple, NullWritable> tupleMapper) {
    multipleInputs.getMultiInputs().add(new Input(path, new TupleInputFormat(), tupleMapper));
  }

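  /**
   * Adds a named output with its own {@link OutputFormat} and key/value classes, in addition to the job's main
   * output.
   */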
  public void addNamedOutput(String namedOutput, OutputFormat outputFormat, Class keyClass,
      Class valueClass) throws TupleMRException {
    addNamedOutput(namedOutput, outputFormat, keyClass, valueClass, null);
  }

  public void addNamedOutput(String namedOutput, OutputFormat outputFormat, Class keyClass,
      Class valueClass, Map<String, String> specificContext) throws TupleMRException {
    namedOutputs.add(new Output(namedOutput, outputFormat, keyClass, valueClass, specificContext));
  }

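  /**
   * Adds a named Tuple-based output, backed by a {@link TupleOutputFormat} with the given schema.
   */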
  public void addNamedTupleOutput(String namedOutput, Schema outputSchema) throws TupleMRException {
    Output output = new Output(namedOutput, new TupleOutputFormat(outputSchema), ITuple.class,
        NullWritable.class, null);
    namedOutputs.add(output);
  }

  /**
   * Defines an input as in {@link PangoolMultipleInputs}
   *
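   * <p>
   * A sketch; {@code HadoopInputFormat} (a serializable wrapper around standard Hadoop input format classes) and
   * {@code MyMapper} are assumed:
   *
   * <pre>
   * {@code
   * // HadoopInputFormat and MyMapper are assumed/hypothetical
   * builder.addInput(new Path("in"), new HadoopInputFormat(TextInputFormat.class), new MyMapper());
   * }
   * </pre>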
   * @see PangoolMultipleInputs
   */
  public void addInput(Path path, InputFormat inputFormat, TupleMapper inputProcessor) {
    multipleInputs.getMultiInputs().add(new Input(path, inputFormat, inputProcessor));
  }

  /**
   * Sets the {@link TupleReducer} instance to be used as a combiner. Combining is performed by
   * {@link SimpleCombiner}; rollup is not supported in the combiner.
   */
  public void setTupleCombiner(TupleReducer tupleCombiner) {
    this.tupleCombiner = tupleCombiner;
  }

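  /**
   * Sets the job's main output: the output path (as in {@link FileOutputFormat#setOutputPath(Job, Path)}), the
   * {@link OutputFormat} instance and the output key/value classes.
   */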
  public void setOutput(Path outputPath, OutputFormat outputFormat, Class<?> outputKeyClass,
      Class<?> outputValueClass) {
    this.outputFormat = outputFormat;
    this.outputKeyClass = outputKeyClass;
    this.outputValueClass = outputValueClass;
    this.outputPath = outputPath;
  }

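  /**
   * Sets a Tuple-based main output: a {@link TupleOutputFormat} with the given schema, emitting {@link ITuple} keys
   * and {@link NullWritable} values.
   */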
  public void setTupleOutput(Path outputPath, Schema schema) {
    this.outputPath = outputPath;
    this.outputFormat = new TupleOutputFormat(schema);
    this.outputKeyClass = ITuple.class;
    this.outputValueClass = NullWritable.class;
  }

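  /**
   * Sets the {@link TupleReducer} instance that will process each group of tuples in the Reduce phase.
   */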
  public void setTupleReducer(TupleReducer tupleReducer) {
    this.tupleReducer = tupleReducer;
  }

  /**
   * Run this method after your Job has finished so that the instance files created by this builder are properly
   * cleaned up. Typically invoked in a {@code finally} block.
   * @throws IOException
   */
  public void cleanUpInstanceFiles() throws IOException {
    for(String instanceFile: instanceFilesCreated) {
      InstancesDistributor.removeFromCache(conf, instanceFile);
    }
  }
 
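  /**
   * Builds the Hadoop {@link Job}: validates the builder state, serializes the reducer, combiner and output format
   * instances via {@link InstancesDistributor} and wires Pangool's comparators, partitioner and
   * {@link ProxyOutputFormat}.
   */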
  public Job createJob() throws IOException, TupleMRException {

    failIfNull(tupleReducer, "Need to set a group handler");
    failIfEmpty(multipleInputs.getMultiInputs(), "Need to add at least one input");
    failIfNull(outputFormat, "Need to set output format");
    failIfNull(outputKeyClass, "Need to set outputKeyClass");
    failIfNull(outputValueClass, "Need to set outputValueClass");
    failIfNull(outputPath, "Need to set outputPath");

    // perform a deep copy of the Configuration
    this.conf = new Configuration(this.conf);
   
    TupleMRConfig tupleMRConf = buildConf();
    // Serialize PangoolConf in Hadoop Configuration
    instanceFilesCreated.addAll(TupleMRConfig.set(tupleMRConf, conf));
    Job job = (jobName == null) ? new Job(conf) : new Job(conf, jobName);
    if(tupleMRConf.getRollupFrom() != null) {
      job.setReducerClass(RollupReducer.class);
    } else {
      job.setReducerClass(SimpleReducer.class);
    }

    if(tupleCombiner != null) {
      job.setCombinerClass(SimpleCombiner.class); // rollup is not supported by the combiner for now
      // Set Combiner Handler
      String uniqueName = UUID.randomUUID().toString() + '.' + "combiner-handler.dat";
      try {
        InstancesDistributor.distribute(tupleCombiner, uniqueName, job.getConfiguration());
        instanceFilesCreated.add(uniqueName);
        job.getConfiguration().set(SimpleCombiner.CONF_COMBINER_HANDLER, uniqueName);
      } catch(URISyntaxException e1) {
        throw new TupleMRException(e1);
      }
    }

    // Set Tuple Reducer
    try {
      String uniqueName = UUID.randomUUID().toString() + '.' + "group-handler.dat";
      InstancesDistributor.distribute(tupleReducer, uniqueName, job.getConfiguration());
      instanceFilesCreated.add(uniqueName);
      job.getConfiguration().set(SimpleReducer.CONF_REDUCER_HANDLER, uniqueName);
    } catch(URISyntaxException e1) {
      throw new TupleMRException(e1);
    }

    // Enabling serialization
    TupleSerialization.enableSerialization(job.getConfiguration());

    job.setJarByClass((jarByClass != null) ? jarByClass : tupleReducer.getClass());
    job.setMapOutputKeyClass(DatumWrapper.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setPartitionerClass(TupleHashPartitioner.class);
    job.setGroupingComparatorClass(GroupComparator.class);
    job.setSortComparatorClass(SortComparator.class);
    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);
    FileOutputFormat.setOutputPath(job, outputPath);
    instanceFilesCreated.addAll(multipleInputs.configureJob(job));
    instanceFilesCreated.addAll(namedOutputs.configureJob(job));
    // Configure a {@link ProxyOutputFormat} for Pangool's Multiple Outputs to
    // work: {@link PangoolMultipleOutput}
    String uniqueName = UUID.randomUUID().toString() + '.' + "out-format.dat";
    try {
      InstancesDistributor.distribute(outputFormat, uniqueName, conf);
      instanceFilesCreated.add(uniqueName);
    } catch(URISyntaxException e1) {
      throw new TupleMRException(e1);
    }
    job.getConfiguration().set(ProxyOutputFormat.PROXIED_OUTPUT_FORMAT_CONF, uniqueName);
    job.setOutputFormatClass(ProxyOutputFormat.class);

    return job;
  }
}