// Source Code of fm.last.hadoop.programs.labs.trackstats.TrackStatisticsProgram

/*
 * Copyright 2008 Last.fm.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package fm.last.hadoop.programs.labs.trackstats;


import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.mapred.jobcontrol.JobControl;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.MultipleInputs;


import fm.last.hadoop.io.records.TrackStats;


/**
 * Program that calculates various track-related statistics from raw listening data.
 */
public class TrackStatisticsProgram {


  /** Logger shared by the program and its nested mapper classes. */
  public static final Log log = LogFactory.getLog(TrackStatisticsProgram.class);


  // values below indicate position in raw data for each value
  // (zero-based index into the array obtained by splitting a raw line on single spaces)


  /** Column holding the id of the listening user. */
  private static final int COL_USERID = 0;
  /** Column holding the id of the track. */
  private static final int COL_TRACKID = 1;
  /** Column holding the scrobble count. */
  private static final int COL_SCROBBLES = 2;
  /** Column holding the radio listen count. */
  private static final int COL_RADIO = 3;
  /** Column holding the skip count. */
  private static final int COL_SKIP = 4;


  /** Configuration used by start() to obtain the FileSystem; replaceable via setConf(). */
  private Configuration conf;


  /**
   * Constructs a new TrackStatisticsProgram, using a default Configuration.
   */
  public TrackStatisticsProgram() {
    this.conf = new Configuration();
  }


  /**
   * Enumeration for Hadoop error counters.
   */
  private enum COUNTER_KEYS {
    INVALID_LINES, NOT_LISTEN
  };


  /**
   * Mapper that takes in raw listening data and outputs the number of unique listeners per track.
   */
  public static class UniqueListenersMapper extends MapReduceBase implements
      Mapper<LongWritable, Text, IntWritable, IntWritable> {


    public void map(LongWritable position, Text rawLine, OutputCollector<IntWritable, IntWritable> output,
        Reporter reporter) throws IOException {


      String line = (rawLine).toString();
      if (line.trim().isEmpty()) { // if the line is empty, report error and ignore
        reporter.incrCounter(COUNTER_KEYS.INVALID_LINES, 1);
        return;
      }


      String[] parts = line.split(" "); // raw data is whitespace delimited
      try {
        int scrobbles = Integer.parseInt(parts[TrackStatisticsProgram.COL_SCROBBLES]);
        int radioListens = Integer.parseInt(parts[TrackStatisticsProgram.COL_RADIO]);
        if (scrobbles <= 0 && radioListens <= 0) {
          // if track somehow is marked with zero plays, report error and ignore
          reporter.incrCounter(COUNTER_KEYS.NOT_LISTEN, 1);
          return;
        }
        // if we get to here then user has listened to track, so output user id against track id
        IntWritable trackId = new IntWritable(Integer.parseInt(parts[TrackStatisticsProgram.COL_TRACKID]));
        IntWritable userId = new IntWritable(Integer.parseInt(parts[TrackStatisticsProgram.COL_USERID]));
        output.collect(trackId, userId);
      } catch (NumberFormatException e) {
        reporter.incrCounter(COUNTER_KEYS.INVALID_LINES, 1);
        reporter.setStatus("Invalid line in listening data: " + rawLine);
        return;
      }
    }
  }


  /**
   * Combiner that improves efficiency by removing duplicate user ids from mapper output.
   */
  public static class UniqueListenersCombiner extends MapReduceBase implements
      Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {


    public void reduce(IntWritable trackId, Iterator<IntWritable> values,
        OutputCollector<IntWritable, IntWritable> output, Reporter reporter) throws IOException {
      
      Set<IntWritable> userIds = new HashSet<IntWritable>();
      while (values.hasNext()) {
        IntWritable userId = values.next();
        if (!userIds.contains(userId)) {
          // if this user hasn't already been marked as listening to the track, add them to set and output them
          userIds.add(new IntWritable(userId.get()));
          output.collect(trackId, userId);
        }
      }
    }
  }


  /**
   * Reducer that outputs only unique listener ids per track (i.e. it removes any duplicated). Final output is number of
   * unique listeners per track.
   */
  public static class UniqueListenersReducer extends MapReduceBase implements
      Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {


    public void reduce(IntWritable trackId, Iterator<IntWritable> values,
        OutputCollector<IntWritable, IntWritable> output, Reporter reporter) throws IOException {
      
      Set<Integer> userIds = new HashSet<Integer>();
      // add all userIds to the set, duplicates automatically removed (set contract)
      while (values.hasNext()) {
        IntWritable userId = values.next();
        userIds.add(Integer.valueOf(userId.get()));
      }
      // output trackId -> number of unique listeners per track
      output.collect(trackId, new IntWritable(userIds.size()));
    }


  }


  /**
   * Mapper that summarizes various statistics per track. Input is raw listening data, output is a partially filled in
   * TrackStatistics object per track id.
   */
  public static class SumMapper extends MapReduceBase implements Mapper<LongWritable, Text, IntWritable, TrackStats> {


    public void map(LongWritable position, Text rawLine, OutputCollector<IntWritable, TrackStats> output,
        Reporter reporter) throws IOException {


      String line = (rawLine).toString();
      if (line.trim().isEmpty()) { // ignore empty lines
        reporter.incrCounter(COUNTER_KEYS.INVALID_LINES, 1);
        return;
      }


      String[] parts = line.split(" ");
      try {
        int trackId = Integer.parseInt(parts[TrackStatisticsProgram.COL_TRACKID]);
        int scrobbles = Integer.parseInt(parts[TrackStatisticsProgram.COL_SCROBBLES]);
        int radio = Integer.parseInt(parts[TrackStatisticsProgram.COL_RADIO]);
        int skip = Integer.parseInt(parts[TrackStatisticsProgram.COL_SKIP]);
        // set number of listeners to 0 (this is calculated later) and other values as provided in text file
        TrackStats trackstat = new TrackStats(0, scrobbles + radio, scrobbles, radio, skip);
        output.collect(new IntWritable(trackId), trackstat);
      } catch (NumberFormatException e) {
        reporter.incrCounter(COUNTER_KEYS.INVALID_LINES, 1);
        log.warn("Invalid line in listening data: " + rawLine);
      }
    }
  }


  /**
   * Sum up the track statistics per track. Output is a TrackStatistics object per track id.
   */
  public static class SumReducer extends MapReduceBase implements
      Reducer<IntWritable, TrackStats, IntWritable, TrackStats> {


    @Override
    public void reduce(IntWritable trackId, Iterator<TrackStats> values,
        OutputCollector<IntWritable, TrackStats> output, Reporter reporter) throws IOException {
      
      TrackStats sum = new TrackStats(); // holds the totals for this track
      while (values.hasNext()) {
        TrackStats trackStats = (TrackStats) values.next();
        sum.setListeners(sum.getListeners() + trackStats.getListeners());
        sum.setPlays(sum.getPlays() + trackStats.getPlays());
        sum.setSkips(sum.getSkips() + trackStats.getSkips());
        sum.setScrobbles(sum.getScrobbles() + trackStats.getScrobbles());
        sum.setRadioPlays(sum.getRadioPlays() + trackStats.getRadioPlays());
      }
      output.collect(trackId, sum);
    }
  }


  /**
   * Mapper that takes the number of listeners for a track and converts this to a TrackStats object which is output
   * against each track id.
   */
  public static class MergeListenersMapper extends MapReduceBase implements
      Mapper<IntWritable, IntWritable, IntWritable, TrackStats> {


    public void map(IntWritable trackId, IntWritable uniqueListenerCount,
        OutputCollector<IntWritable, TrackStats> output, Reporter reporter) throws IOException {
      
      TrackStats trackStats = new TrackStats();
      trackStats.setListeners(uniqueListenerCount.get());
      output.collect(trackId, trackStats);
    }
  }


  /**
   * Create a JobConf for a Job that will calculate the number of unique listeners per track.
   * 
   * @param inputDir The path to the folder containing the raw listening data files.
   * @return The unique listeners JobConf.
   */
  private JobConf getUniqueListenersJobConf(Path inputDir) {
    log.info("Creating configuration for unique listeners Job");


    // output results to a temporary intermediate folder, this will get deleted by start() method
    Path uniqueListenersOutput = new Path("uniqueListeners");


    JobConf conf = new JobConf(TrackStatisticsProgram.class);
    conf.setOutputKeyClass(IntWritable.class); // track id
    conf.setOutputValueClass(IntWritable.class); // number of unique listeners
    conf.setInputFormat(TextInputFormat.class); // raw listening data
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapperClass(UniqueListenersMapper.class);
    conf.setCombinerClass(UniqueListenersCombiner.class);
    conf.setReducerClass(UniqueListenersReducer.class);


    FileInputFormat.addInputPath(conf, inputDir);
    FileOutputFormat.setOutputPath(conf, uniqueListenersOutput);
    conf.setJobName("uniqueListeners");
    return conf;
  }


  /**
   * Creates a JobConf for a Job that will sum up the TrackStatistics per track.
   * 
   * @param inputDir The path to the folder containing the raw input data files.
   * @return The sum JobConf.
   */
  private JobConf getSumJobConf(Path inputDir) {
    log.info("Creating configuration for sum job");
    // output results to a temporary intermediate folder, this will get deleted by start() method
    Path playsOutput = new Path("sum");


    JobConf conf = new JobConf(TrackStatisticsProgram.class);
    conf.setOutputKeyClass(IntWritable.class); // track id
    conf.setOutputValueClass(TrackStats.class); // statistics for a track
    conf.setInputFormat(TextInputFormat.class); // raw listening data
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapperClass(SumMapper.class);
    conf.setCombinerClass(SumReducer.class);
    conf.setReducerClass(SumReducer.class);


    FileInputFormat.addInputPath(conf, inputDir);
    FileOutputFormat.setOutputPath(conf, playsOutput);
    conf.setJobName("sum");
    return conf;
  }


  /**
   * Creates a JobConf for a Job that will merge the unique listeners and track statistics.
   * 
   * @param outputPath The path for the results to be output to.
   * @param sumInputDir The path containing the data from the sum Job.
   * @param listenersInputDir The path containing the data from the unique listeners job.
   * @return The merge JobConf.
   */
  private JobConf getMergeConf(Path outputPath, Path sumInputDir, Path listenersInputDir) {
    log.info("Creating configuration for merge job");
    JobConf conf = new JobConf(TrackStatisticsProgram.class);
    conf.setOutputKeyClass(IntWritable.class); // track id
    conf.setOutputValueClass(TrackStats.class); // overall track statistics
    conf.setCombinerClass(SumReducer.class); // safe to re-use reducer as a combiner here
    conf.setReducerClass(SumReducer.class);
    conf.setOutputFormat(TextOutputFormat.class);


    FileOutputFormat.setOutputPath(conf, outputPath);


    MultipleInputs.addInputPath(conf, sumInputDir, SequenceFileInputFormat.class, IdentityMapper.class);
    MultipleInputs.addInputPath(conf, listenersInputDir, SequenceFileInputFormat.class, MergeListenersMapper.class);
    conf.setJobName("merge");
    return conf;
  }


  /**
   * Start the program.
   * 
   * @param inputDir The path to the folder containing the raw listening data files.
   * @param outputPath The path for the results to be output to.
   * @throws IOException If an error occurs retrieving data from the file system or an error occurs running the job.
   */
  public void start(Path inputDir, Path outputDir) throws IOException {
    FileSystem fs = FileSystem.get(this.conf);


    JobConf uniqueListenersConf = getUniqueListenersJobConf(inputDir);
    Path listenersOutputDir = FileOutputFormat.getOutputPath(uniqueListenersConf);
    Job listenersJob = new Job(uniqueListenersConf);
    // delete any output that might exist from a previous run of this job
    if (fs.exists(FileOutputFormat.getOutputPath(uniqueListenersConf))) {
      fs.delete(FileOutputFormat.getOutputPath(uniqueListenersConf), true);
    }


    JobConf sumConf = getSumJobConf(inputDir);
    Path sumOutputDir = FileOutputFormat.getOutputPath(sumConf);
    Job sumJob = new Job(sumConf);
    // delete any output that might exist from a previous run of this job
    if (fs.exists(FileOutputFormat.getOutputPath(sumConf))) {
      fs.delete(FileOutputFormat.getOutputPath(sumConf), true);
    }


    // the merge job depends on the other two jobs
    ArrayList<Job> mergeDependencies = new ArrayList<Job>();
    mergeDependencies.add(listenersJob);
    mergeDependencies.add(sumJob);
    JobConf mergeConf = getMergeConf(outputDir, sumOutputDir, listenersOutputDir);
    Job mergeJob = new Job(mergeConf, mergeDependencies);
    // delete any output that might exist from a previous run of this job
    if (fs.exists(FileOutputFormat.getOutputPath(mergeConf))) {
      fs.delete(FileOutputFormat.getOutputPath(mergeConf), true);
    }


    // store the output paths of the intermediate jobs so this can be cleaned up after a successful run
    List<Path> deletePaths = new ArrayList<Path>();
    deletePaths.add(FileOutputFormat.getOutputPath(uniqueListenersConf));
    deletePaths.add(FileOutputFormat.getOutputPath(sumConf));


    JobControl control = new JobControl("TrackStatisticsProgram");
    control.addJob(listenersJob);
    control.addJob(sumJob);
    control.addJob(mergeJob);


    // execute the jobs
    try {
      Thread jobControlThread = new Thread(control, "jobcontrol");
      jobControlThread.start();
      while (!control.allFinished()) {
        Thread.sleep(1000);
      }
      if (control.getFailedJobs().size() > 0) {
        throw new IOException("One or more jobs failed");
      }
    } catch (InterruptedException e) {
      throw new IOException("Interrupted while waiting for job control to finish", e);
    }


    // remove intermediate output paths
    for (Path deletePath : deletePaths) {
      fs.delete(deletePath, true);
    }
  }


  /**
   * Set the Configuration used by this Program.
   * 
   * @param conf The new Configuration to use by this program.
   */
  public void setConf(Configuration conf) {
    this.conf = conf; // this will usually only be set by unit test.
  }


  /**
   * Gets the Configuration used by this program.
   * 
   * @return This program's Configuration.
   */
  public Configuration getConf() {
    return conf;
  }


  /**
   * Main method used to run the TrackStatisticsProgram from the command line. This takes two parameters - first the
   * path to the folder containing the raw input data; and second the path for the data to be output to.
   * 
   * @param args Command line arguments.
   * @throws IOException If an error occurs running the program.
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      log.info("Args: <input directory> <output directory>");
      return;
    }


    Path inputPath = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    log.info("Running on input directories: " + inputPath);
    TrackStatisticsProgram listeners = new TrackStatisticsProgram();
    listeners.start(inputPath, outputDir);
  }


}
// Source Code of fm.last.hadoop.programs.labs.trackstats.TrackStatisticsProgram
//
// Related Classes of fm.last.hadoop.programs.labs.trackstats.TrackStatisticsProgram