Package mrdp.ch4

Source Code of mrdp.ch4.Binning$BinningMapper

package mrdp.ch4;

import java.io.IOException;
import java.util.Map;

import mrdp.utils.MRDPUtils;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class Binning {

  public static class BinningMapper extends
      Mapper<Object, Text, Text, NullWritable> {

    private MultipleOutputs<Text, NullWritable> mos = null;

    @SuppressWarnings({ "unchecked", "rawtypes" })
    @Override
    protected void setup(Context context) {
      // Create a new MultipleOutputs using the context object
      mos = new MultipleOutputs(context);
    }

    @Override
    protected void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {

      // Parse the input string into a nice map
      Map<String, String> parsed = MRDPUtils.transformXmlToMap(value
          .toString());

      String rawtags = parsed.get("Tags");
      if (rawtags == null) {
        return;
      }

      // Tags are delimited by ><. i.e. <tag1><tag2><tag3>
      String[] tagTokens = StringEscapeUtils.unescapeHtml(rawtags).split(
          "><");

      // For each tag
      for (String tag : tagTokens) {
        // Remove any > or < from the token
        String groomed = tag.replaceAll(">|<", "").toLowerCase();

        // If this tag is one of the following, write to the named bin
        if (groomed.equalsIgnoreCase("hadoop")) {
          mos.write("bins", value, NullWritable.get(), "hadoop-tag");
        }

        if (groomed.equalsIgnoreCase("pig")) {
          mos.write("bins", value, NullWritable.get(), "pig-tag");
        }

        if (groomed.equalsIgnoreCase("hive")) {
          mos.write("bins", value, NullWritable.get(), "hive-tag");
        }

        if (groomed.equalsIgnoreCase("hbase")) {
          mos.write("bins", value, NullWritable.get(), "hbase-tag");
        }
      }

      // Get the body of the post
      String post = parsed.get("Body");

      if (post == null) {
        return;
      }

      // If the post contains the word "hadoop", write it to its own bin
      if (post.toLowerCase().contains("hadoop")) {
        mos.write("bins", value, NullWritable.get(), "hadoop-post");
      }
    }

    @Override
    protected void cleanup(Context context) throws IOException,
        InterruptedException {
      // Close multiple outputs!
      mos.close();
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args)
        .getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: Binning <posts> <outdir>");
      System.exit(1);
    }

    Job job = new Job(conf, "Binning");
    job.setJarByClass(Binning.class);
    job.setMapperClass(BinningMapper.class);
    job.setNumReduceTasks(0);

    TextInputFormat.setInputPaths(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    // Configure the MultipleOutputs by adding an output called "bins"
    // With the proper output format and mapper key/value pairs
    MultipleOutputs.addNamedOutput(job, "bins", TextOutputFormat.class,
        Text.class, NullWritable.class);

    // Enable the counters for the job
    // If there is a significant number of different named outputs, this
    // should be disabled
    MultipleOutputs.setCountersEnabled(job, true);

    System.exit(job.waitForCompletion(true) ? 0 : 2);
  }
}
TOP

Related Classes of mrdp.ch4.Binning$BinningMapper

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.