/*
* Copyright 2012 Alex Holmes
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.alexholmes.hadooputils.sort;
import com.hadoop.compression.lzo.LzoIndexer;
import com.hadoop.compression.lzo.LzopCodec;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.InputSampler;
import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
/**
* This is a simple MapReduce sorting utility, modeled after the Linux sort utility.
* It supports a subset of the options available with Linux sort.
*/
public class Sort<K, V> extends Configured implements Tool {
/**
* Details about the job.
*/
private RunningJob jobResult = null;
/**
* Usage string.
*/
private static final String[] USAGE = {
"bin/hadoop jar hadoop-utils-<version>.jar com.alexholmes.hadooputils.sort.Sort "
+ "[OPTION]... INPUT_DIR OUTPUT_DIR",
"",
"Ordering options:",
"-f, --ignore-case",
" Fold lower case to upper case characters.",
"",
"Other options:",
"",
"-m MAPS",
" The number of map tasks.",
"-r REDUCERS",
" The number of reduce tasks.",
"-k, --key POS1[,POS2]",
" Start a key at POS1 (origin 1), end it at POS2 (default end of line).",
"-t, --field-separator SEP",
" Use SEP instead of non-blank to blank transition.",
"-u, --unique",
" Output only the first of an equal run.",
"--total-order PCNT NUM_SAMPLES MAX_SPLITS",
" Produce total order across all reducer files.",
" PCNT = Probability with which a key will be chosen (range 0.0 - 1.0).",
" NUM_SAMPLES = Number of samples which will be extracted.",
" MAX_SPLITS = Number of input splits to extract samples from.",
"--map-codec CODEC",
" Compression codec for map intermediary outputs.",
"--codec CODEC",
" Compression codec for final outputs.",
"--lzop-index",
" Creates LZOP indexes for the output files.",
};
/**
* Print the usage.
*
* @return the Java exit code
*/
static int printUsage() {
System.out.println(StringUtils.join(USAGE, "\n"));
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
/**
* The driver for sort program which works with command-line arguments.
*
* @param args command-line arguments
* @return 0 if everything went well, non-zero for everything else
* @throws Exception When there is communication problems with the
* job tracker.
*/
@SuppressWarnings("unchecked")
public int run(final String[] args) throws Exception {
SortConfig sortConfig = new SortConfig(getConf());
Integer numMapTasks = null;
Integer numReduceTasks = null;
List<String> otherArgs = new ArrayList<String>();
InputSampler.Sampler<K, V> sampler = null;
Class<? extends CompressionCodec> codecClass = null;
Class<? extends CompressionCodec> mapCodecClass = null;
boolean createLzopIndex = false;
for (int i = 0; i < args.length; ++i) {
try {
if ("-m".equals(args[i])) {
numMapTasks = Integer.parseInt(args[++i]);
} else if ("-r".equals(args[i])) {
numReduceTasks = Integer.parseInt(args[++i]);
} else if ("-f".equals(args[i]) || "--ignore-case".equals(args[i])) {
sortConfig.setIgnoreCase(true);
} else if ("-u".equals(args[i]) || "--unique".equals(args[i])) {
sortConfig.setUnique(true);
} else if ("-k".equals(args[i]) || "--key".equals(args[i])) {
String[] parts = StringUtils.split(args[++i], ",");
sortConfig.setStartKey(Integer.valueOf(parts[0]));
if (parts.length > 1) {
sortConfig.setEndKey(Integer.valueOf(parts[1]));
}
} else if ("-t".equals(args[i]) || "--field-separator".equals(args[i])) {
sortConfig.setFieldSeparator(args[++i]);
} else if ("--total-order".equals(args[i])) {
double pcnt = Double.parseDouble(args[++i]);
int numSamples = Integer.parseInt(args[++i]);
int maxSplits = Integer.parseInt(args[++i]);
if (0 >= maxSplits) {
maxSplits = Integer.MAX_VALUE;
}
sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
} else if ("--map-codec".equals(args[i])) {
mapCodecClass = (Class<? extends CompressionCodec>) Class.forName(args[++i]);
} else if ("--codec".equals(args[i])) {
codecClass = (Class<? extends CompressionCodec>) Class.forName(args[++i]);
} else if ("--lzop-index".equals(args[i])) {
createLzopIndex = true;
} else {
otherArgs.add(args[i]);
}
} catch (NumberFormatException except) {
System.out.println("ERROR: Integer expected instead of " + args[i]);
return printUsage();
} catch (ArrayIndexOutOfBoundsException except) {
System.out.println("ERROR: Required parameter missing from "
+ args[i - 1]);
return printUsage(); // exits
}
}
// Make sure there are exactly 2 parameters left.
if (otherArgs.size() != 2) {
System.out.println("ERROR: Wrong number of parameters: "
+ otherArgs.size() + " instead of 2.");
return printUsage();
}
if (runJob(new JobConf(sortConfig.getConfig()), numMapTasks, numReduceTasks, sampler,
codecClass, mapCodecClass, createLzopIndex, otherArgs.get(0), otherArgs.get(1))) {
return 0;
}
return 1;
}
/**
* The driver for the sort MapReduce job.
*
* @param jobConf sort configuration
* @param numMapTasks number of map tasks
* @param numReduceTasks number of reduce tasks
* @param sampler sampler, if required
* @param codecClass the compression codec for compressing final outputs
* @param mapCodecClass the compression codec for compressing intermediary map outputs
* @param createLzopIndexes whether or not a MR job should be launched to create LZOP indexes
* for the job output files
* @param inputDirAsString input directory in CSV-form
* @param outputDirAsString output directory
* @return true if the job completed successfully
* @throws IOException if something went wrong
* @throws URISyntaxException if a URI wasn't correctly formed
*/
public boolean runJob(final JobConf jobConf, final Integer numMapTasks,
final Integer numReduceTasks, final InputSampler.Sampler<K, V> sampler,
final Class<? extends CompressionCodec> codecClass,
final Class<? extends CompressionCodec> mapCodecClass,
final boolean createLzopIndexes,
final String inputDirAsString, final String outputDirAsString)
throws IOException, URISyntaxException {
jobConf.setJarByClass(Sort.class);
jobConf.setJobName("sorter");
JobClient client = new JobClient(jobConf);
ClusterStatus cluster = client.getClusterStatus();
if (numMapTasks != null) {
jobConf.setNumMapTasks(numMapTasks);
}
if (numReduceTasks != null) {
jobConf.setNumReduceTasks(numReduceTasks);
} else {
int numReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
String sortReduces = jobConf.get("test.sort.reduces_per_host");
if (sortReduces != null) {
numReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces);
}
// Set user-supplied (possibly default) job configs
jobConf.setNumReduceTasks(numReduces);
}
jobConf.setMapperClass(IdentityMapper.class);
jobConf.setReducerClass(SortReduce.class);
jobConf.setInputFormat(SortInputFormat.class);
jobConf.setMapOutputKeyClass(Text.class);
jobConf.setMapOutputValueClass(Text.class);
jobConf.setOutputKeyClass(Text.class);
jobConf.setOutputValueClass(Text.class);
if (mapCodecClass != null) {
jobConf.setMapOutputCompressorClass(mapCodecClass);
}
if (codecClass != null) {
jobConf.setBoolean("mapred.output.compress", true);
jobConf.setClass("mapred.output.compression.codec",
codecClass, CompressionCodec.class);
}
FileInputFormat.setInputPaths(jobConf, inputDirAsString);
FileOutputFormat.setOutputPath(jobConf, new Path(outputDirAsString));
if (sampler != null) {
System.out.println("Sampling input to effect total-order sort...");
jobConf.setPartitionerClass(TotalOrderPartitioner.class);
Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
FileSystem fileSystem = FileSystem.get(jobConf);
if (fileSystem.exists(inputDir) && fileSystem.isFile(inputDir)) {
inputDir = inputDir.getParent();
}
inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
Path partitionFile = new Path(inputDir, "_sortPartitioning");
TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
InputSampler.writePartitionFile(jobConf, sampler);
URI partitionUri = new URI(partitionFile.toString()
+ "#" + "_sortPartitioning");
DistributedCache.addCacheFile(partitionUri, jobConf);
DistributedCache.createSymlink(jobConf);
}
System.out.println("Running on "
+ cluster.getTaskTrackers()
+ " nodes to sort from "
+ FileInputFormat.getInputPaths(jobConf)[0] + " into "
+ FileOutputFormat.getOutputPath(jobConf)
+ " with " + jobConf.getNumReduceTasks() + " reduces.");
Date startTime = new Date();
System.out.println("Job started: " + startTime);
jobResult = JobClient.runJob(jobConf);
Date endTime = new Date();
System.out.println("Job ended: " + endTime);
System.out.println("The job took "
+ TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime())
+ " seconds.");
if (jobResult.isSuccessful()) {
if (createLzopIndexes && codecClass != null && LzopCodec.class.equals(codecClass)) {
new LzoIndexer(jobConf).index(new Path(outputDirAsString));
}
return true;
}
return false;
}
/**
* Main entry point for the utility.
*
* @param args arguments
* @throws Exception when something goes wrong
*/
public static void main(final String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new Sort(), args);
System.exit(res);
}
/**
* Get the last job that was run using this instance.
*
* @return the results of the last job that was run
*/
public RunningJob getResult() {
return jobResult;
}
}