package mrdp.ch3;

import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URI;
import java.util.Map;

import mrdp.utils.MRDPUtils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
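
/**
 * Map-only job that filters StackOverflow comments by user reputation.
 * A Bloom filter shipped through the DistributedCache serves as a cheap
 * membership pre-test, so only records whose user ID is probably in the
 * filter pay the cost of an HBase lookup.
 */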
public class QueryBloomFiltering {

    public static class BloomFilteringMapper extends
            Mapper<Object, Text, Text, NullWritable> {

        private BloomFilter filter = new BloomFilter();
        private HTable table = null;
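
        // Deserialize the Bloom filter from the DistributedCache and open
        // the HBase table once per task, before any records are mapped.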
        @Override
        protected void setup(Context context) throws IOException,
                InterruptedException {
            URI[] files = DistributedCache.getCacheFiles(context
                    .getConfiguration());

            // If the files in the distributed cache are set
            if (files != null && files.length == 1) {
                System.out.println("Reading Bloom filter from: "
                        + files[0].getPath());

                // Open local file for read.
                DataInputStream strm = new DataInputStream(new FileInputStream(
                        files[0].getPath()));

                // Read into our Bloom filter.
                filter.readFields(strm);
                strm.close();
            } else {
                throw new IOException(
                        "Bloom filter file not set in the DistributedCache.");
            }

            // Get HBase table of user info
            Configuration hconf = HBaseConfiguration.create();
            table = new HTable(hconf, "user_table");
        }
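
        // Test each record against the Bloom filter first; hit HBase only
        // for probable members, then verify the reputation threshold.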
        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Parse the input into a nice map.
            Map<String, String> parsed = MRDPUtils.transformXmlToMap(value
                    .toString());

            // Get the user ID for this comment
            String userid = parsed.get("UserId");

            // If it is null, skip this record
            if (userid == null) {
                return;
            }

            // If this user ID is probably in the set
            if (filter.membershipTest(new Key(userid.getBytes()))) {
                // Get the reputation from the HBase table
                Result r = table.get(new Get(userid.getBytes()));

                // The Bloom filter can report false positives, so the row
                // may not exist in the table; skip the record if it doesn't.
                byte[] reputationBytes = r.getValue("attr".getBytes(),
                        "Reputation".getBytes());
                if (reputationBytes == null) {
                    return;
                }

                int reputation = Integer.parseInt(new String(reputationBytes));

                // If the reputation is at least 1,500,
                // write the record to the file system
                if (reputation >= 1500) {
                    context.write(value, NullWritable.get());
                }
            }
        }
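
        @Override
        protected void cleanup(Context context) throws IOException,
                InterruptedException {
            // Release the HBase connection opened in setup.
            if (table != null) {
                table.close();
            }
        }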
    }
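
    // Driver: parses <in> <cachefile> <out> and configures the map-only job.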
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length != 3) {
            System.err
                    .println("Usage: QueryBloomFiltering <in> <cachefile> <out>");
            System.exit(1);
        }

        // Delete the output directory if it exists so the job can rerun.
        FileSystem.get(conf).delete(new Path(otherArgs[2]), true);

        Job job = new Job(conf, "StackOverflow Bloom Filtering");
        job.setJarByClass(QueryBloomFiltering.class);
        job.setMapperClass(BloomFilteringMapper.class);

        // Map-only job: no reducers are needed for a filtering pattern.
        job.setNumReduceTasks(0);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));

        // Ship the serialized Bloom filter file to every task.
        DistributedCache.addCacheFile(
                FileSystem.get(conf).makeQualified(new Path(otherArgs[1]))
                        .toUri(), job.getConfiguration());

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
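
// Example invocation (jar name is a placeholder):
//   hadoop jar mrdp.jar mrdp.ch3.QueryBloomFiltering \
//       <comments input dir> <serialized bloom filter> <output dir>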