// Distcp: Aegisthus distributed-copy MapReduce job (contains Distcp$Partition).
// NOTE: the package declaration was stripped from this extract; restore it
// (e.g. "package com.netflix.aegisthus.distcp;") before compiling.

import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.List;
import java.util.UUID;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.google.common.collect.Lists;
import com.netflix.aegisthus.tools.DirectoryWalker;
import com.netflix.aegisthus.tools.StorageHelper;
import com.netflix.aegisthus.tools.Utils;

import com.netflix.hadoop.output.CleanOutputFormat;

public class Distcp extends Configured implements Tool {
  public static class Map extends Mapper<LongWritable, Text, LongWritable, Text> {
    private long count = 0;

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
      context.write(new LongWritable(count++), value);
    }
  }

  public static class Partition extends Partitioner<LongWritable, Text> {

    @Override
    public int getPartition(LongWritable arg0, Text arg1, int arg2) {
      return Long.valueOf(arg0.get()).intValue() % arg2;
    }
  }

  public static class Reduce extends Reducer<LongWritable, Text, Text, Text> {

    @Override
    protected void reduce(LongWritable key, Iterable<Text> values, Context ctx) throws IOException,
        InterruptedException {
      StorageHelper helper = new StorageHelper(ctx);
      boolean snappy = ctx.getConfiguration().getBoolean(CFG_PREFIX + OPT_PRIAM, false);
      String base = ctx.getConfiguration().get(CFG_PREFIX + OPT_RECURSIVE);
      if (base != null && !base.endsWith("/")) {
        base = base + "/";
      }
      for (Text file : values) {
        String fileString = cleanS3(file.toString().trim());
        if (base == null) {
          helper.copyToTemp(fileString, snappy);
        } else {
          String prefix = fileString.substring(base.length());
          prefix = prefix.replaceAll("/[^/]+$", "");
          helper.copyToTemp(fileString, prefix, snappy);
        }

      }
    }
  }

  // Configuration key holding the final copy destination (used by writeManifest).
  private static final String OPT_DISTCP_TARGET = "distcp.target";
  // Command-line option names (see getOptions for their descriptions).
  private static final String OPT_INPUT_FILE = "input";
  private static final String OPT_MANIFEST_OUT = "manifestOut";
  private static final String OPT_MANIFEST_IN = "manifest";
  private static final String OPT_OVERWRITE = "overwrite";
  private static final String OPT_PRIAM = "priam";
  private static final String OPT_RECURSIVE = "recursive";
  // Prefix applied to option names when they are stored in the job Configuration.
  private static final String CFG_PREFIX = "distcp.";

  private static final Log LOG = LogFactory.getLog(Distcp.class);

  // Upper bound on reducers when mapred.reduce.tasks is left at its default of 1.
  private static final int MAX_REDUCERS = 800;

  private static final String OUTPUT = "output";

  @SuppressWarnings("static-access")
  public static CommandLine getOptions(String[] args) {
    Options opts = new Options();
    opts.addOption(OptionBuilder
        .withArgName(OPT_INPUT_FILE)
        .withDescription("Each input location")
        .hasArgs()
        .create(OPT_INPUT_FILE));
    opts.addOption(OptionBuilder
        .withArgName(OPT_PRIAM)
        .withDescription("the input is snappy stream compressed and should be decompressed (priam backup)")
        .create(OPT_PRIAM));
    opts.addOption(OptionBuilder
        .withArgName(OPT_RECURSIVE)
        .withDescription("retain directory structure under this directory")
        .hasArg()
        .create(OPT_RECURSIVE));
    opts.addOption(OptionBuilder
        .withArgName(OPT_MANIFEST_IN)
        .withDescription("a manifest of the files to be copied")
        .hasArg()
        .create(OPT_MANIFEST_IN));
    opts.addOption(OptionBuilder
        .withArgName(OPT_MANIFEST_IN)
        .withDescription("a manifest of the files to be copied")
        .hasArg()
        .create(OPT_MANIFEST_IN));
    opts.addOption(OptionBuilder
        .withArgName(OPT_MANIFEST_OUT)
        .withDescription("write out a manifest file of movement")
        .create(OPT_MANIFEST_OUT));
    opts.addOption(OptionBuilder
        .withArgName(OPT_OVERWRITE)
        .withDescription("overwrite the target directory if it exists.")
        .create(OPT_OVERWRITE));
    opts.addOption(OptionBuilder
        .withArgName(OUTPUT)
        .isRequired()
        .withDescription("output location")
        .hasArg()
        .create(OUTPUT));
    CommandLineParser parser = new GnuParser();

    try {
      CommandLine cl = parser.parse(opts, args, true);
      if (cl.hasOption(OPT_MANIFEST_IN) && cl.hasOption(OPT_INPUT_FILE)) {
        System.out.println("Cannot have both a manifest and input files");
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(String.format("hadoop jar distcp.jar %s", Distcp.class.getName()), opts);
        return null;
      }
      return cl;
    } catch (ParseException e) {
      System.out.println("Unexpected exception:" + e.getMessage());
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp(String.format("hadoop jar distcp.jar %s", Distcp.class.getName()), opts);
      return null;
    }

  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new Distcp(), args);

    System.exit(res);
  }

  /**
   * checks to see if the output directory exists and throws an error if it
   * does.
   *
   * TODO: extend this to allow overwrite if set.
   *
   * @throws IOException
   */
  protected void checkOutputDirectory(Job job, String outputDir, boolean overwrite) throws IOException {
    Path out = new Path(outputDir);
    FileSystem fsOut = out.getFileSystem(job.getConfiguration());
    if (fsOut.exists(out)) {
      if (overwrite) {
        fsOut.delete(out, true);
      } else {
        String error = String.format("Ouput directory (%s) exists, failing", outputDir);
        LOG.error(error);
        throw new IOException(error);
      }
    }
  }

  /**
   * Hook for subclasses to supply a precomputed input listing.
   * The base implementation returns {@code null} (no precomputed inputs);
   * callers must handle the null return.
   */
  protected List<FileStatus> getInputs() {
    return null;
  }

  /**
   * Builds the distcp MapReduce job: plain-text input of file paths,
   * sequence-numbered map output, and the Map/Partition/Reduce classes
   * defined in this file, writing through {@link CleanOutputFormat}.
   *
   * @return the configured (but not yet submitted) job
   * @throws IOException if the job cannot be created
   */
  protected Job initializeJob() throws IOException {
    Job job = new Job(getConf());
    job.setJarByClass(Distcp.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(CleanOutputFormat.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setPartitionerClass(Partition.class);
    // Log where the reducers will stage copies (resolved by StorageHelper).
    StorageHelper sh = new StorageHelper(job.getConfiguration());
    LOG.info(String.format("temp location for job: %s", sh.getBaseTempLocation()));

    return job;
  }

  protected static final String cleanS3(String file) {
    return file.replaceFirst("s3://", "s3n://");
  }

  /**
   * Hook for subclasses to stage temporary directories before the job runs;
   * a no-op in the base implementation.
   */
  protected void setupTempDirs() {
  }

  protected void setReducers(Job job, int fileCount) {
    int reducers = job.getConfiguration().getInt("mapred.reduce.tasks", 1);
    LOG.info(String.format("fileCount: %d - set reducers: %d", fileCount, reducers));
    if (reducers == 1) {
      job.getConfiguration().setInt("mapred.reduce.tasks", Math.min(fileCount, MAX_REDUCERS));
    } else {
      job.getConfiguration().setInt("mapred.reduce.tasks", Math.min(fileCount, reducers));
    }
  }

  protected int setupInput(Job job, Path inputPath, String[] inputFiles, String manifestPath) throws IOException {
    int size = 0;
    if (manifestPath == null) {
      FileSystem fs = inputPath.getFileSystem(job.getConfiguration());
      DataOutputStream dos = fs.create(inputPath);
      List<String> inputs = Lists.newArrayList(inputFiles);
      for (int i = 0; i < inputs.size(); i++) {
        inputs.set(i, cleanS3(inputs.get(i)));
      }
      List<FileStatus> files = Lists.newArrayList(DirectoryWalker
          .with(job.getConfiguration())
          .addAll(inputs)
          .statuses());

      for (FileStatus file : files) {
        dos.writeBytes(file.getPath().toUri().toString());
        dos.write('\n');
      }
      dos.close();
      size = files.size();
    } else {
      Utils.copy(new Path(manifestPath), inputPath, false, job.getConfiguration());
      FileSystem fs = inputPath.getFileSystem(job.getConfiguration());
      BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(inputPath)));
      String l;
      while ((l = br.readLine()) != null) {
        LOG.info(String.format("inputfile: %s", l));
        size++;
      }
    }
    return size;
  }

  @Override
  public int run(String[] args) throws Exception {

    CommandLine cl = getOptions(args);
    if (cl == null) {
      return 1;
    }

    Job job = initializeJob();
    String outputDir = cl.getOptionValue(OUTPUT);
    StorageHelper helper = new StorageHelper(job.getConfiguration());
    helper.setFinalPath(outputDir);

    checkOutputDirectory(job, outputDir, cl.hasOption(OPT_OVERWRITE));

    job.getConfiguration().setBoolean(CFG_PREFIX + OPT_PRIAM, cl.hasOption(OPT_PRIAM));
    if (cl.hasOption(OPT_RECURSIVE)) {
      job.getConfiguration().set(CFG_PREFIX + OPT_RECURSIVE, cleanS3(cl.getOptionValue(OPT_RECURSIVE)));
    }

    String pathTemp = String.format("/tmp/%s", UUID.randomUUID().toString());
    LOG.info(String.format("writing to %s", pathTemp));

    Path tmp = new Path("/tmp");
    FileSystem fs = tmp.getFileSystem(job.getConfiguration());
    fs.mkdirs(new Path(pathTemp));
    Path inputPath = new Path(new Path(pathTemp), "input.txt");
    Path tmpPath = new Path(new Path(pathTemp), "out");

    int fileCount = setupInputjob,
                  inputPath,
                  cl.getOptionValues(OPT_INPUT_FILE),
                  cl.getOptionValue(OPT_MANIFEST_IN));
    setReducers(job, fileCount);

    TextInputFormat.setInputPaths(job, inputPath.toUri().toString());
    FileOutputFormat.setOutputPath(job, tmpPath);

    boolean success = runJob(job, cl);
    // TODO: output manifest
    /*
     * if (success && cl.hasOption(OPT_MANIFEST_OUT)) { writeManifest(job,
     * files); }
     */
    fs.delete(new Path(pathTemp), true);
    return success ? 0 : 1;
  }

  /**
   * Submits the job, prints its ID and tracking URL for the operator, and
   * blocks until it finishes.
   *
   * @return true if the job completed successfully
   */
  protected boolean runJob(Job job, CommandLine cl) throws IOException, InterruptedException, ClassNotFoundException {
    job.submit();
    System.out.println(job.getJobID());
    System.out.println(job.getTrackingURL());
    return job.waitForCompletion(true);
  }

  protected void writeManifest(Job job, List<FileStatus> files) throws IOException {
    Path out = new Path(job.getConfiguration().get(OPT_DISTCP_TARGET));
    FileSystem fsOut = out.getFileSystem(job.getConfiguration());
    DataOutputStream dos = fsOut.create(new Path(out, "_manifest/.manifest"));
    for (FileStatus file : files) {
      Path output = new Path(out, file.getPath().getName());
      dos.writeBytes(output.toUri().toString());
      dos.write('\n');
    }
    dos.close();
  }
}
// End of Distcp. (Code-listing site footer/boilerplate removed from this extract.)