* Copyright 2012 Twitter, Inc.
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
package com.twitter.elephanttwin.lzo.retrieval;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import com.google.common.base.Joiner;
import com.twitter.common.args.Arg;
import com.twitter.common.args.ArgFilters;
import com.twitter.common.args.ArgScanner;
import com.twitter.common.args.CmdLine;
import com.twitter.common.args.constraints.NotNull;
import com.twitter.elephanttwin.indexing.AbstractBlockIndexingJob;
import com.twitter.elephanttwin.util.HdfsUtils;
* LZOBlockLevelIndexingJobs takes a list of input paths, finds all lzo
* files under those paths (recursively), and indexes each file using a
* user-provided InputFormat class and value class
* (either a Thrift or a Protocol Buffer class)
* on a user-provided column. The indexed files are stored under the
* user-provided IndexDir directory. The actual indexes are stored in MapFile format.
* There is also an index meta file for each indexed file, which records which
* columns have been indexed for that input file. <p>
* The number of index jobs to run in parallel can be specified via: <br>
* -jobpoolsize=10 <br>
* An example indexing job:<br>
hadoop jar my.jar
public class LZOBlockLevelIndexingJobs extends AbstractBlockIndexingJob implements Tool {
// NOTE(review): the logger declaration below is cut off in this view (no
// initializer call and no terminating ';') -- confirm against the full file.
protected Logger LOG =Logger
// String constant "input_file_base_directory". It is not referenced anywhere
// else in the visible code -- presumably a key looked up in the YAML
// configuration file (see -configfile); TODO confirm.
public static final String YAML_INPUT_DIR = "input_file_base_directory";
// -input: comma-separated list of input paths; returned by getInput() and
// included in the job name. The Arg is created with an explicit null default
// and carries a @NotNull constraint.
@NotNull @CmdLine(name = "input", help = "one or more paths to input data, comma" +
" separated.")
public static Arg<List<String>> input = Arg.create(null);
// -index: location of the index; returned by getIndex(). Carries @NotNull.
@NotNull @CmdLine(name = "index", help = "index location")
public static Arg<String> index = Arg.create();
// -inputformat: class name of the InputFormat to use; returned by
// getInputFormat(). Carries @NotNull.
// NOTE(review): the help string below ends on a dangling '+' in this view;
// the rest of the literal and the closing ')' are not visible -- confirm.
@NotNull @CmdLine(name = "inputformat", help = "actual inputformat class name to" +
" be used (example com.twitter.elephantbird.mapreduce." +
public static Arg<String> inputformat = Arg.create();
/**
 * {@code -value_class}: name of the class used to read values from HDFS;
 * returned by {@code getValueClass()}. Carries a {@code @NotNull} constraint.
 * The help text's example is now closed with the parenthesis it was missing.
 */
@NotNull @CmdLine(name = "value_class", help = "class name used to read values " +
"from HDFS (example com.twitter.clientapp.gen.LogEvent)")
public static Arg<String> value_class = Arg.create();
// -columnname: name of the column to index; returned by getColumnName().
// Carries @NotNull.
@NotNull @CmdLine(name = "columnname", help = "column name to be indexed")
public static Arg<String> columnname = Arg.create();
// -keyvalue: value searched for on the column; returned by getKeyValue().
// No @NotNull constraint and no default supplied here.
@CmdLine(name = "keyvalue", help = "searched value on column")
public static Arg<String> keyvalue = Arg.create();
// -num_partitions: number of reducers used to create partial index files;
// returned by getNumPartitions(). No default supplied here.
@CmdLine(name = "num_partitions", help = "number of reducers to use to " +
"create partial index files")
public static Arg<Integer> numPartitions = Arg.create();
// -sleeptime: seconds to wait per loop iteration while index jobs finish;
// returned by getSleepTime(). Defaults to 3.
@CmdLine(name = "sleeptime", help = "# of seconds (in a loop) to wait" +
" for all index jobs to finish")
public static Arg<Integer> sleepTime = Arg.create(3);
// -configfile: path to a YAML configuration file.
// NOTE(review): this Arg is not read anywhere in the visible code -- confirm
// whether it is still used.
@CmdLine(name = "configfile", help = "Path to yaml configuration file")
private static Arg<String> configFile = Arg.create();
// -overwrite: whether to reindex files that already have indexes; returned by
// doOverwrite(). Defaults to true.
@CmdLine(name = "overwrite", help = " reindex files (from -input) even " +
"if previous indexes exist, true by default ")
private static Arg<Boolean> overwrite = Arg.create(true);
// -dryrun: print job information without submitting MapReduce jobs; returned
// by isDryRun(). Defaults to false.
@CmdLine(name = "dryrun", help = "won't submit mapreduce jobs, just" +
" print information before submitting MR jobs")
private static Arg<Boolean> dryrun = Arg.create(false);
// -jobpoolsize: number of concurrent MapReduce jobs; returned by
// getJobPoolSize(). Defaults to 10.
@CmdLine(name = "jobpoolsize", help = "the number of concurrent" +
" MR jobs to be used pool")
private static Arg<Integer> jobPoolSize = Arg.create(10);
* Indexes all lzo files found (recursively) under the user-provided input paths. <br>
* Currently each file is indexed separately.
// Tool entry point: parses the command-line arguments, then hands off to
// work(...) and returns its result as the exit code.
public int run(String[] args) throws Exception {
ArgScanner scanner = new ArgScanner();
// Parse args against the Arg fields selected for this object's runtime class.
// NOTE(review): any result returned by parse(...) is not inspected here --
// confirm that parse failures are signalled by exception.
scanner.parse(ArgFilters.selectClass(getClass()), Arrays.asList(args));
// work(...) is not defined in the visible code (presumably inherited from
// AbstractBlockIndexingJob); the meaning of (null, null, 0) should be
// confirmed against its declaration.
return work(null, null, 0);
/**
 * Builds the job name: the fixed prefix {@code "LZOBlockLevelIndexingJobs:input="}
 * followed by the {@code -input} paths joined with commas.
 *
 * @return the job name string
 */
protected String getJobName() {
return "LZOBlockLevelIndexingJobs:input="
+ Joiner.on(",").join(input.get());
/**
 * Command-line entry point.
 *
 * @param args raw command-line arguments
 * @throws Exception propagated from option parsing or from the job itself
 */
public static void main(String[] args) throws Exception {
// Hand the raw arguments to Hadoop's GenericOptionsParser; the arguments it
// leaves over are forwarded to a new LZOBlockLevelIndexingJobs instance below.
GenericOptionsParser optParser = new GenericOptionsParser(args);
// NOTE(review): the next line starts mid-expression in this view -- the call
// it belongs to (presumably ToolRunner.run(...)) is not visible; confirm
// against the full file.
new LZOBlockLevelIndexingJobs(), optParser.getRemainingArgs());
* Currently, we skip files that don't have regular lzo indexes.
* In the future we may want to index them on the fly.
/**
 * Reports whether the given file has a companion file at the same path with
 * {@code ".index"} appended.
 *
 * @param stat status of the candidate input file
 * @param fs file system used for the existence check
 * @return {@code true} if the {@code ".index"} companion exists on {@code fs}
 * @throws IOException if the existence check fails
 */
protected boolean fileIsOk(FileStatus stat, FileSystem fs)
throws IOException {
// Path of the companion file: the input file's path plus the ".index" suffix.
Path lzoIndexFile = stat.getPath().suffix(".index");
return fs.exists(lzoIndexFile);
/**
 * Returns the given job unchanged; no mapper configuration is applied here.
 *
 * @param job the job passed in by the caller
 * @return the same {@code job} instance
 */
protected Job setMapper(Job job) {
return job;
/** Returns the value of the {@code -overwrite} argument (declared with a default of {@code true}). */
protected Boolean doOverwrite() {
return overwrite.get();
/** Returns the value of the {@code -columnname} argument (the column to be indexed). */
protected String getColumnName() {
return columnname.get();
/** Returns the value of the {@code -index} argument (the index location). */
protected String getIndex() {
return index.get();
/** Returns the value of the {@code -input} argument (the list of input paths). */
protected List<String> getInput() {
return input.get();
/** Returns the value of the {@code -inputformat} argument (an InputFormat class name). */
protected String getInputFormat() {
return inputformat.get();
/** Returns the value of the {@code -jobpoolsize} argument (declared with a default of 10). */
protected Integer getJobPoolSize() {
return jobPoolSize.get();
/**
 * Returns the value of the {@code -keyvalue} argument (the value searched for
 * on the column). The Arg is declared without a default; what {@code get()}
 * yields when the flag is omitted is not visible here.
 */
protected String getKeyValue() {
return keyvalue.get();
/**
 * Returns the value of the {@code -num_partitions} argument (number of
 * reducers). The Arg is declared without a default; what {@code get()} yields
 * when the flag is omitted is not visible here.
 */
protected Integer getNumPartitions() {
return numPartitions.get();
/** Returns the value of the {@code -sleeptime} argument in seconds (declared with a default of 3). */
protected Integer getSleepTime() {
return sleepTime.get();
/** Returns the value of the {@code -value_class} argument (class name used to read values). */
protected String getValueClass() {
return value_class.get();
/** Returns the value of the {@code -dryrun} argument (declared with a default of {@code false}). */
protected Boolean isDryRun() {
return dryrun.get();
/** Returns the shared {@code HdfsUtils.lzoFileFilter} used to select input files. */
protected PathFilter getFileFilter() {
return HdfsUtils.lzoFileFilter;