package com.splout.db.benchmark;
/*
* #%L
* Splout SQL Hadoop library
* %%
* Copyright (C) 2012 Datasalt Systems S.L.
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterException;
import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.tuplemr.IdentityTupleMapper;
import com.datasalt.pangool.tuplemr.IdentityTupleReducer;
import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.mapred.lib.input.TupleTextInputFormat;
import com.datasalt.pangool.tuplemr.mapred.lib.input.TupleTextInputFormat.FieldSelector;
import com.datasalt.pangool.tuplemr.mapred.lib.output.TupleTextOutputFormat;
import com.datasalt.pangool.utils.HadoopUtils;
/**
 * This Job implements an identity Map/Reduce in two ways: one using the plain Hadoop API and the other using the Pangool API
 * for parsing CSV files. With this Job we can measure 1) the overhead of using the Splout store generator tools compared to a plain identity Hadoop Job,
 * and 2) which part of that overhead is due solely to parsing CSV files with Pangool.
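 * <p>
 * A minimal example of launching this Tool programmatically (the paths and the schema string below are
 * illustrative placeholders; the schema uses the "name:type" list format accepted by Pangool's Fields.parse):
 *
 * <pre>
 * // Plain Hadoop identity pass:
 * ToolRunner.run(new IdentityJob(), new String[] { "-i", "in", "-o", "out" });
 *
 * // Pangool CSV-parsing pass (hypothetical two-field schema, grouped by the first field):
 * ToolRunner.run(new IdentityJob(), new String[] { "-i", "in", "-o", "out",
 *     "-ps", "id:int,payload:string", "-gb", "id", "-sep", "," });
 * </pre>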
*/
public class IdentityJob implements Tool {
@Parameter(required = true, names = { "-i", "--inputpath" }, description = "The input path for the identity Job. Must be textual files.")
private String inputPath;
@Parameter(required = true, names = { "-o", "--outputpath" }, description = "The output path for the identity Job.")
private String outputPath;
@Parameter(required = false, names = { "-ps", "--pangoolSchema" }, description = "Provide a Pangool-schema and Pangool will be used for parsing the input text file into a Tuple. Using this option one can measure the overhead of using Pangool's textual input format.")
private String pangoolSchema = null;
@Parameter(required = false, names = { "-gb", "--groupBy" }, description = "If pangoolSchema is provided, a groupBy clause must be provided too. Use a field in your schema that makes the data as evenly spread across reducers as possible.")
private String groupBy = null;
  // Basic CSV parsing parameters, only used if pangoolSchema != null; they can be overridden
// --------------------------------//
@Parameter(names = { "-sep", "--separator" }, description = "The separator character of your text input file, defaults to a space.")
private String separator = " ";
@Parameter(names = { "-quo", "--quotes" }, description = "The quotes character of your input file, defaults to none.")
private String quotes = TupleTextInputFormat.NO_QUOTE_CHARACTER + "";
@Parameter(names = { "-esc", "--escape" }, description = "The escape character of your input file, defaults to none.")
private String escape = TupleTextInputFormat.NO_ESCAPE_CHARACTER + "";
@Parameter(names = { "-sh", "--skipheading" }, description = "Specify this flag for skipping the header line of your text file.")
private boolean skipHeading = false;
// --------------------------------//
private Configuration conf;
@Override
public Configuration getConf() {
    return conf;
}
@Override
public void setConf(Configuration conf) {
this.conf = conf;
}
@Override
public int run(String[] params) throws Exception {
// Validate params etc
JCommander jComm = new JCommander(this);
jComm.setProgramName("Identity Job");
try {
jComm.parse(params);
} catch(ParameterException e) {
System.err.println(e.getMessage());
jComm.usage();
System.exit(-1);
}
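    // Delete any previous output so repeated benchmark runs can reuse the same output path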
Path outP = new Path(outputPath);
HadoopUtils.deleteIfExists(FileSystem.get(conf), outP);
if(pangoolSchema == null) {
// Use plain Hadoop API
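      // No Mapper or Reducer is set, so Hadoop's default (identity) Mapper and Reducer simply pass
      // each record through; this measures the raw cost of reading, shuffling and writing the data.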
Job job = new Job(conf);
job.setInputFormatClass(TextInputFormat.class);
FileInputFormat.setInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, outP);
      return job.waitForCompletion(true) ? 0 : -1;
} else {
if(groupBy == null) {
System.err.println("If pangoolSchema is used, groupBy must also be used.");
jComm.usage();
System.exit(-1);
}
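      // Build the Pangool Schema from the "name:type" field list given in --pangoolSchema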
Schema schema = new Schema("sch", Fields.parse(pangoolSchema));
Path inputP = new Path(inputPath);
// Use Pangool API - parse CSV, etc
TupleMRBuilder builder = new TupleMRBuilder(conf);
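      // The text input format parses each CSV line into a Tuple following the provided schema,
      // honoring the configured separator, quote and escape characters (and skipping the header if asked).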
TupleTextInputFormat parsingInputFormat = new TupleTextInputFormat(schema, skipHeading, false,
separator.charAt(0), quotes.charAt(0), escape.charAt(0), FieldSelector.NONE, null);
TupleTextOutputFormat outputFormat = new TupleTextOutputFormat(schema, false, separator.charAt(0),
quotes.charAt(0), escape.charAt(0));
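      // Wire the identity flow: tuples pass through an identity mapper, are grouped by the requested
      // field(s), go through an identity reducer and are written back out as text with the same schema.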
builder.addIntermediateSchema(schema);
builder.addInput(inputP, parsingInputFormat, new IdentityTupleMapper());
builder.setGroupByFields(groupBy);
builder.setOutput(outP, outputFormat, ITuple.class, NullWritable.class);
builder.setTupleReducer(new IdentityTupleReducer());
builder.setJarByClass(this.getClass());
      return builder.createJob().waitForCompletion(true) ? 0 : -1;
    }
}
public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new IdentityJob(), args));
}
}