Package com.linkedin.camus.sweeper

Source Code of com.linkedin.camus.sweeper.CamusSweeper

package com.linkedin.camus.sweeper;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.joda.time.DateTimeZone;

import com.linkedin.camus.sweeper.mapreduce.CamusSweeperJob;
import com.linkedin.camus.sweeper.utils.PriorityExecutor;
import com.linkedin.camus.sweeper.utils.PriorityExecutor.Important;
import com.linkedin.camus.sweeper.utils.Utils;

@SuppressWarnings("deprecation")
public class CamusSweeper extends Configured implements Tool
{
  private static final String DEFAULT_NUM_THREADS = "5";
  private static final String CAMUS_SWEEPER_PRIORITY_LIST = "camus.sweeper.priority.list";
  private List<SweeperError> errorMessages;
  private List<Job> runningJobs;

  private Properties props;
  private ExecutorService executorService;
  private FsPermission perm = new FsPermission(FsAction.ALL, FsAction.READ_EXECUTE, FsAction.READ_EXECUTE);
 
  private String destSubdir;
  private String sourceSubdir;
 
  private static Logger log = Logger.getLogger(CamusSweeper.class);
 
  private CamusSweeperPlanner planner;
 
  private Map<String, Integer> priorityTopics = new HashMap<String, Integer>();


  public CamusSweeper()
  {
    props = new Properties();
  }

  public CamusSweeper(Properties props)
  {
    this.props = props;
    init();
  }

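  /**
   * Parses camus.sweeper.priority.list (comma-separated topic=priority entries), sets the
   * default time zone, reads the source/dest subdir properties, and instantiates the
   * configured CamusSweeperPlanner via reflection.
   */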
  private void init()
  {
    for (String str : props.getProperty(CAMUS_SWEEPER_PRIORITY_LIST, "").split(",")){
      String[] tokens = str.split("=");
      String topic = tokens[0];
      int priority = tokens.length > 1 ? Integer.parseInt(tokens[1]) : 1;
     
      priorityTopics.put(topic, priority);
    }
   
    this.errorMessages = Collections.synchronizedList(new ArrayList<SweeperError>());
    DateTimeZone.setDefault(DateTimeZone.forID(props.getProperty("default.timezone")));
    this.runningJobs = Collections.synchronizedList(new ArrayList<Job>());
    sourceSubdir = props.getProperty("camus.sweeper.source.subdir");
    destSubdir = props.getProperty("camus.sweeper.dest.subdir");

    try
    {
      planner = ((CamusSweeperPlanner) Class.forName(props.getProperty("camus.sweeper.planner.class")).newInstance()).setPropertiesLogger(props, log);
    }
    catch (Exception e)
    {
      throw new RuntimeException(e);
    }
   
  }

  // TODO:
  // Work out the cancellation logic. Should cancel() kill every job that has already been
  // launched, or is it enough to let the running jobs finish, since each job only depends
  // on the files it was given to read?
  public void cancel() throws Exception
  {
    executorService.shutdownNow();

    for (Job hadoopJob : runningJobs)
    {
      if (!hadoopJob.isComplete())
      {
        try
        {
          hadoopJob.killJob();
        }
        catch (IOException e)
        {
          e.printStackTrace();
        }
      }
    }
  }
 
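  /**
   * Scans the directories under the given input path for topic directories, i.e. directories
   * that contain the topic subdir and pass the whitelist/blacklist filter, and maps each one
   * to its namespace prefix relative to the input root.
   */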
  private Map<FileStatus, String> findAllTopics(Path input, PathFilter filter, String topicSubdir, FileSystem fs) throws IOException{
    Map<FileStatus, String> topics = new HashMap<FileStatus, String>();
    for (FileStatus f : fs.listStatus(input)){
      // skipping first level, in search of the topic subdir
      findAllTopics(f.getPath(), filter, topicSubdir, "", fs, topics);
    }
    return topics;
  }
 
  private void findAllTopics(Path input, PathFilter filter, String topicSubdir, String topicNameSpace, FileSystem fs, Map<FileStatus, String> topics) throws IOException{
    for (FileStatus f : fs.listStatus(input)){
      if (! f.isDir())
        return;
      if (f.getPath().getName().equals(topicSubdir) && filter.accept(f.getPath().getParent())){
        topics.put(fs.getFileStatus(f.getPath().getParent()), topicNameSpace);
      } else {
        findAllTopics(f.getPath(), filter, topicSubdir, (topicNameSpace.isEmpty() ? "" : topicNameSpace + "/") + f.getPath().getParent().getName(), fs, topics);
      }
    }
  }

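  /**
   * Runs the full sweep: builds the priority executor, locates every topic directory under
   * the source dir, submits collector jobs for each topic via the planner, waits for the
   * executor to drain, and throws if any job reported an error.
   */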
  public void run() throws Exception
  {
    log.info("Starting kafka sweeper");
    int numThreads = Integer.parseInt(props.getProperty("num.threads", DEFAULT_NUM_THREADS));

    executorService = new PriorityExecutor(numThreads);

    String fromLocation = props.getProperty("camus.sweeper.source.dir");
    String destLocation = props.getProperty("camus.sweeper.dest.dir", "");
    String tmpLocation = props.getProperty("camus.sweeper.tmp.dir", "");
   
    if (destLocation.isEmpty())
      destLocation = fromLocation;
   
    if (tmpLocation.isEmpty())
      tmpLocation = "/tmp/camus-sweeper-tmp";
    else
      tmpLocation += "/camus-sweeper-tmp";
   
    props.setProperty("camus.sweeper.tmp.dir", tmpLocation);

    log.info("fromLocation: " + fromLocation);
    log.info("destLocation: " + destLocation);

    List<String> blacklist = Utils.getStringList(props, "camus.sweeper.blacklist");
    List<String> whitelist = Utils.getStringList(props, "camus.sweeper.whitelist");
    Configuration conf = new Configuration();

    for (Entry<Object, Object> pair : props.entrySet())
    {
      String key = (String) pair.getKey();
      conf.set(key, (String) pair.getValue());
    }

    FileSystem fs = FileSystem.get(conf);
   
    Path tmpPath = new Path(tmpLocation);
    fs.mkdirs(tmpPath, perm);
    String user = UserGroupInformation.getCurrentUser().getUserName();
    fs.setOwner(tmpPath, user, user);
   
    Map<FileStatus, String> topics = findAllTopics(new Path(fromLocation), new WhiteBlackListPathFilter(whitelist, blacklist), sourceSubdir, fs);
    for (FileStatus topic : topics.keySet())
    {
      String topicNameSpace = topics.get(topic).replaceAll("/", "_");
      String topicName =  (topicNameSpace.isEmpty() ? "" : topicNameSpace + "_") + topic.getPath().getName();
      log.info("Processing topic " + topicName);

      Path destinationPath = new Path(destLocation + "/" + topics.get(topic) + "/" + topic.getPath().getName() + "/" + destSubdir);
      try
      {
        runCollectorForTopicDir(fs, topicName, new Path(topic.getPath(), sourceSubdir), destinationPath);
      }
      catch (Exception e)
      {
        System.err.println("unable to process " + topicName + " skipping...");
        e.printStackTrace();
      }
    }
   
    log.info("Shutting down priority executor");
    executorService.shutdown();
    while (!executorService.isTerminated())
    {
      executorService.awaitTermination(30, TimeUnit.SECONDS);
    }
   
    log.info("Shutting down");

    if (!errorMessages.isEmpty())
    {
      for (SweeperError error : errorMessages)
      {
        System.err.println("Error occurred in " + error.getTopic() + " at " + error.getInputPath().toString()
            + " message " + error.getException().getMessage());
        error.e.printStackTrace();
      }
      throw new RuntimeException("Sweeper Failed");
    }
  }

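  /**
   * Asks the planner for the set of job properties covering this topic's source dir and
   * submits one collector per job; completion is awaited in run() when the executor is
   * shut down.
   */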
  private void runCollectorForTopicDir(FileSystem fs, String topic, Path topicSourceDir, Path topicDestDir) throws Exception
  {
    log.info("Running collector for topic " + topic + " source:" + topicSourceDir + " dest:" + topicDestDir);
    ArrayList<Future<?>> tasksToComplete = new ArrayList<Future<?>>();
   
    List<Properties> jobPropsList = planner.createSweeperJobProps(topic, topicSourceDir, topicDestDir, fs);
   
    for (Properties jobProps : jobPropsList){
      tasksToComplete.add(runCollector(jobProps, topic));
    }

    log.info("Finishing processing for topic " + topic);
  }

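  /**
   * Builds a unique job name and tmp path for a single sweep unit, applies any per-topic
   * reducer-count override, and submits a KafkaCollectorRunner to the executor.
   */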
  @SuppressWarnings("unchecked")
  private Future runCollector(Properties props, String topic)
  {
    String jobName = topic + "-" + UUID.randomUUID().toString();
    props.put("tmp.path", props.getProperty("camus.sweeper.tmp.dir") + "/" + jobName + "_" + System.currentTimeMillis());

    // Keep the override as a String so it can be copied into the job Configuration later.
    if (props.containsKey("reduce.count.override." + topic))
      props.setProperty("reducer.count", props.getProperty("reduce.count.override." + topic));

    log.info("Processing " + props.get("input.paths"));
   
    return executorService.submit(new KafkaCollectorRunner(jobName, props, errorMessages, topic));
  }

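  /**
   * Runnable that executes a single KafkaCollector and records any failure in the shared
   * error list; its priority is taken from camus.sweeper.priority.list.
   */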
  public class KafkaCollectorRunner implements Runnable, Important
  {
    private Properties props;
    private String name;
    private List<SweeperError> errorQueue;
    private String topic;
    private int priority;

    public KafkaCollectorRunner(String name, Properties props, List<SweeperError> errorQueue, String topic)
    {
      this.name = name;
      this.props = props;
      this.errorQueue = errorQueue;
      this.topic = topic;
     
      priority = priorityTopics.containsKey(topic) ? priorityTopics.get(topic) : 0;
    }

    public void run()
    {
      KafkaCollector collector = null;
      try
      {
        log.info("Starting runner for " + name);
        collector = new KafkaCollector(props, name, topic);
        log.info("Running " + name + " for input " + props.getProperty("input.paths"));
        collector.run();
      }
      catch (Throwable e) // Sometimes the error is the Throwable, e.g. java.lang.NoClassDefFoundError
      {
        e.printStackTrace();
        log.error("Failed for " + name
                  + ", job: " + (collector == null ? null : collector.getJob())
                  + " failed for " + props.getProperty("input.paths") + " Exception: "
                  + e.getLocalizedMessage());
        errorQueue.add(new SweeperError(name, props.get("input.paths").toString(), e));
      }
    }

    @Override
    public int getPriority() {
      return priority;
    }
  }

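  /**
   * Configures and runs one Hadoop job that compacts the configured input paths into the
   * destination path, writing to a tmp directory first and then swapping it into place.
   */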
  private class KafkaCollector
  {
    private static final String TARGET_FILE_SIZE = "camus.sweeper.target.file.size";
    private static final long TARGET_FILE_SIZE_DEFAULT = 1536L * 1024L * 1024L;
    private long targetFileSize;
    private final String jobName;
    private final Properties props;
    private final String topicName;
   
    private Job job;

    public KafkaCollector(Properties props, String jobName, String topicName) throws IOException
    {
      this.jobName = jobName;
      this.props = props;
      this.topicName = topicName;
      this.targetFileSize = props.containsKey(TARGET_FILE_SIZE) ? Long.parseLong(props.getProperty(TARGET_FILE_SIZE)) : TARGET_FILE_SIZE_DEFAULT;
     
      job = new Job(getConf());
      job.setJarByClass(CamusSweeper.class);
      job.setJobName(jobName);

      for (Entry<Object, Object> pair : props.entrySet())
      {
        String key = (String) pair.getKey();
        job.getConfiguration().set(key, (String) pair.getValue());
      }
    }

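    /**
     * Adds the input paths, sizes the job (reducer count or split size) from the total input
     * bytes and the target file size, submits the Hadoop job and waits for it, then swaps
     * the tmp output into dest.path, keeping the previous output until the swap succeeds.
     */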
    public void run() throws Exception
    {
      FileSystem fs = FileSystem.get(job.getConfiguration());
      List<String> strPaths = Utils.getStringList(props, "input.paths");
      Path[] inputPaths = new Path[strPaths.size()];

      for (int i = 0; i < strPaths.size(); i++)
        inputPaths[i] = new Path(strPaths.get(i));

      for (Path path : inputPaths)
      {
        FileInputFormat.addInputPath(job, path);
      }

      job.getConfiguration().set("mapred.compress.map.output", "true");

      Path tmpPath = new Path(job.getConfiguration().get("tmp.path"));
      Path outputPath = new Path(job.getConfiguration().get("dest.path"));

      FileOutputFormat.setOutputPath(job, tmpPath);
      ((CamusSweeperJob) Class.forName(props.getProperty("camus.sweeper.io.configurer.class"))
          .newInstance()).setLogger(log).configureJob(topicName, job);
     
      long dus = 0;
      for (Path p : inputPaths)
        dus += fs.getContentSummary(p).getLength();
     
      int maxFiles = job.getConfiguration().getInt("max.files", 24);
      int numTasks = Math.min((int) (dus / targetFileSize) + 1, maxFiles);
     
      if (job.getNumReduceTasks() != 0){
        int numReducers;
        if (job.getConfiguration().get("reducer.count") != null)
        {
          numReducers = job.getConfiguration().getInt("reducer.count", 45);
        }
        else
        {
          numReducers = numTasks;
        }
        log.info("Setting reducer " + numReducers);
        job.setNumReduceTasks(numReducers);
      } else {
        long targetSplitSize = dus / numTasks;
       
        log.info("Setting target split size " + targetSplitSize);
       
        job.getConfiguration().setLong("mapred.max.split.size", targetSplitSize);
        job.getConfiguration().setLong("mapred.min.split.size", targetSplitSize);
      }   
     
      job.submit();
      runningJobs.add(job);
      log.info("job running: " + job.getTrackingURL() + " for: " + jobName);
      job.waitForCompletion(false);

      if (!job.isSuccessful())
      {
        System.err.println("hadoop job failed");
        throw new RuntimeException("hadoop job failed.");
      }

      Path oldPath = null;
      if (fs.exists(outputPath))
      {
        oldPath = new Path("/tmp", "_old_" + job.getJobID());
        log.info("Path " + outputPath + " exists. Overwriting.");
        if (!fs.rename(outputPath, oldPath))
        {
          fs.delete(tmpPath, true);
          throw new RuntimeException("Error: cannot rename " + outputPath + " to " + oldPath);
        }
      }

      log.info("Swapping " + tmpPath + " to " + outputPath);
      mkdirs(fs, outputPath.getParent(), perm);
 
      if (!fs.rename(tmpPath, outputPath))
      {
        if (oldPath != null)
          fs.rename(oldPath, outputPath); // restore the previous output
        fs.delete(tmpPath, true);
        throw new RuntimeException("Error: cannot rename " + tmpPath + " to " + outputPath);
      }

      if (oldPath != null && fs.exists(oldPath))
      {
        log.info("Deleting " + oldPath);
        fs.delete(oldPath, true);
      }
    }
   
    protected String getJobName() {
      return jobName;
    }
   
    protected String getTopicName() {
      return topicName;
    }

    protected Job getJob() {
      return job;
    }
   
    protected Properties getProps() {
      return this.props;
    }
  }
 
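  /**
   * Recursively creates the path and any missing parents, applying the given permission to
   * each directory created.
   */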
  private void mkdirs(FileSystem fs, Path path, FsPermission perm) throws IOException{
    log.info("mkdir: " + path);
    if (! fs.exists(path.getParent()))
      mkdirs(fs, path.getParent(), perm);
    fs.mkdirs(path, perm);
  }
 
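  /** Joins the given patterns into a single alternation, e.g. (foo|bar|baz). */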
  private Pattern compileMultiPattern(Collection<String> list){
    String patternStr = "(";
   
    for (String str : list){
      patternStr += str + "|";
    }
   
    patternStr = patternStr.substring(0, patternStr.length() - 1) + ")";
    return Pattern.compile(patternStr);
  }

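  /**
   * PathFilter that accepts a path only if its name matches the whitelist, does not match
   * the blacklist, and does not start with "." or "_".
   */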
  private class WhiteBlackListPathFilter implements PathFilter
  {
    private Pattern whitelist;
    private Pattern blacklist;

    public WhiteBlackListPathFilter(Collection<String> whitelist, Collection<String> blacklist)
    {
      if (whitelist.isEmpty())
        this.whitelist = Pattern.compile(".*"); // whitelist everything
      else
        this.whitelist = compileMultiPattern(whitelist);
     
      if (blacklist.isEmpty())
        this.blacklist = Pattern.compile("a^"); // blacklist nothing
      else
        this.blacklist = compileMultiPattern(blacklist);
      log.info("whitelist: " + this.whitelist.toString());
      log.info("blacklist: " + this.blacklist.toString());
    }

    @Override
    public boolean accept(Path path)
    {
      String name = path.getName();
      return whitelist.matcher(name).matches() && ! ( blacklist.matcher(name).matches()
          || name.startsWith(".") || name.startsWith("_"));
    }
  }

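  /** Immutable record of a failed sweep: the topic, its input path, and the thrown error. */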
  private static class SweeperError
  {
    private final String topic;
    private final String input;
    private final Throwable e;

    public SweeperError(String topic, String input, Throwable e)
    {
      this.topic = topic;
      this.input = input;
      this.e = e;
    }

    public String getTopic()
    {
      return topic;
    }

    public String getInputPath()
    {
      return input;
    }

    public Throwable getException()
    {
      return e;
    }
  }

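  /**
   * Tool entry point: loads properties from the classpath (-p) and/or an external file (-P),
   * applies -D overrides, then calls init() and runs the sweep.
   */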
  public int run(String[] args) throws Exception
  {
    Options options = new Options();

    options.addOption("p", true, "properties filename from the classpath");
    options.addOption("P", true, "external properties filename");

    options.addOption(OptionBuilder.withArgName("property=value")
                                   .hasArgs(2)
                                   .withValueSeparator()
                                   .withDescription("use value for given property")
                                   .create("D"));

    CommandLineParser parser = new PosixParser();
    CommandLine cmd = parser.parse(options, args);

    if (!(cmd.hasOption('p') || cmd.hasOption('P')))
    {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp("CamusSweeper.java", options);
      return 1;
    }

    if (cmd.hasOption('p'))
      props.load(this.getClass().getResourceAsStream(cmd.getOptionValue('p')));

    if (cmd.hasOption('P'))
    {
      File file = new File(cmd.getOptionValue('P'));
      FileInputStream fStream = new FileInputStream(file);
      props.load(fStream);
    }

    props.putAll(cmd.getOptionProperties("D"));

    init();
    run();
    return 0;
  }

  public static void main(String args[]) throws Exception
  {
    CamusSweeper job = new CamusSweeper();
    ToolRunner.run(job, args);
  }
}
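
Usage sketch (not part of the original source): the snippet below shows one hypothetical way to
launch the sweeper, mirroring main() above and the -P option handled in run(String[]). The
launcher class name and the properties file path are placeholders.

import org.apache.hadoop.util.ToolRunner;

import com.linkedin.camus.sweeper.CamusSweeper;

public class CamusSweeperLauncher
{
  public static void main(String[] args) throws Exception
  {
    // Load settings from an external properties file, exactly as CamusSweeper.run(String[])
    // does for the -P option. The path is a placeholder; the file would define the keys read
    // above, e.g. camus.sweeper.source.dir, camus.sweeper.dest.dir, camus.sweeper.source.subdir,
    // camus.sweeper.dest.subdir, default.timezone, camus.sweeper.planner.class (a
    // CamusSweeperPlanner implementation) and camus.sweeper.io.configurer.class (a
    // CamusSweeperJob implementation).
    int exitCode = ToolRunner.run(new CamusSweeper(),
                                  new String[] { "-P", "/etc/camus/sweeper.properties" });
    System.exit(exitCode);
  }
}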