/*
Copyright 2012 Urban Airship and Contributors
*/
package com.urbanairship.datacube.backfill;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableExistsException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.hfile.Compression.Algorithm;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.regionserver.StoreFile.BloomType;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.mapreduce.Job;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Takes a "snapshot" of an HBase column family in two steps:
 * - A mapreduce job writes all KeyValues of the source CF into HFiles on disk.
 * - LoadIncrementalHFiles bulk-loads those HFiles into the target CF.
*
* The snapshot isn't a true snapshot because writers could alter the source data
* while we're mapreducing over it.
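 *
 * A minimal usage sketch (the table names, column family, and output path below are
 * hypothetical placeholders):
 *
 * <pre>{@code
 * Configuration conf = HBaseConfiguration.create();
 * HBaseSnapshotter snapshotter = new HBaseSnapshotter(conf, "events".getBytes(),
 *         "d".getBytes(), "events_snapshot".getBytes(),
 *         new Path("/tmp/events_snapshot_hfiles"), false, null, null);
 * snapshotter.runWithCheckedExceptions();
 * }</pre>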
*/
public class HBaseSnapshotter implements Runnable {
private static final Logger log = LoggerFactory.getLogger(HBaseSnapshotter.class);
private final byte[] sourceTableName;
private final byte[] destTableName;
private final byte[] cf;
private final Configuration conf;
private final Path hfileOutputPath;
private final boolean okIfTableExists;
private final byte[] startKey;
    private final byte[] stopKey;

/**
     * @param conf the Hadoop/HBase configuration to use for the mapreduce job and table access
     * @param sourceTable name of the table to snapshot
     * @param cf the column family to copy
     * @param destTable name of the table to bulk-load the snapshot into
     * @param hfileOutputPath HDFS directory where the intermediate HFiles will be written
     * @param okIfTableExists if this is false and the destination table already exists, a
     *        TableExistsException will be thrown
* @param startKey if non-null, the HBase scan will use this key as its start row
* @param stopKey if non-null, the HBase scan will use this key as its stop row
*/
public HBaseSnapshotter(Configuration conf, byte[] sourceTable, byte[] cf,
byte[] destTable, Path hfileOutputPath, boolean okIfTableExists, byte[] startKey,
byte[] stopKey) {
this.sourceTableName = sourceTable;
this.destTableName = destTable;
this.conf = conf;
this.hfileOutputPath = hfileOutputPath;
this.cf = cf;
this.okIfTableExists = okIfTableExists;
this.startKey = startKey;
this.stopKey = stopKey;
    }

/**
     * A wrapper around {@link #runWithCheckedExceptions()} that rethrows checked exceptions as
     * RuntimeExceptions.
*/
@Override
public void run() {
try {
this.runWithCheckedExceptions();
} catch (Exception e) {
throw new RuntimeException(e);
}
    }

public boolean runWithCheckedExceptions() throws IOException, InterruptedException {
HTable destHTable = null;
HTable sourceHTable = null;
ResultScanner destScanner = null;
try {
Job job = new Job(conf);
sourceHTable = new HTable(conf, sourceTableName);
Pair<byte[][],byte[][]> regionStartsEnds = sourceHTable.getStartEndKeys();
HBaseAdmin admin = new HBaseAdmin(conf);
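            // If the destination table doesn't exist yet, create it pre-split with the source
            // table's region boundaries so the bulk load lines up with existing regions.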
if(admin.tableExists(destTableName)) {
if(!okIfTableExists) {
throw new TableExistsException(new String(destTableName) + " already exists");
}
} else {
createSnapshotTable(conf, destTableName,
BackfillUtil.getSplitKeys(regionStartsEnds), cf);
}
destHTable = new HTable(conf, destTableName);
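            // Scan only the requested column family; a large caching value cuts down on RPC
            // round trips during the map phase.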
Scan scan = new Scan();
scan.setCaching(5000);
scan.addFamily(cf);
if(startKey != null) {
scan.setStartRow(startKey);
}
if(stopKey != null) {
scan.setStopRow(stopKey);
}
TableMapReduceUtil.initTableMapperJob(new String(sourceTableName), scan,
ResultToKvsMapper.class, ImmutableBytesWritable.class, KeyValue.class,
job);
job.setJobName("DataCube HBase snapshotter");
job.setJarByClass(HBaseSnapshotter.class);
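            // configureIncrementalLoad sets up the reducer and a TotalOrderPartitioner so the
            // HFiles written by this job line up with the destination table's regions.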
HFileOutputFormat.configureIncrementalLoad(job, destHTable);
HFileOutputFormat.setOutputPath(job, hfileOutputPath);
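            // Disable speculative execution so duplicate task attempts don't write conflicting
            // HFile output.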
job.getConfiguration().set("mapred.map.tasks.speculative.execution", "false");
job.getConfiguration().set("mapred.reduce.tasks.speculative.execution", "false");
log.debug("Starting HBase mapreduce snapshotter");
if(!job.waitForCompletion(true)) {
log.error("Job return false, mapreduce must have failed");
return false;
}
log.debug("Starting HBase bulkloader to load snapshot from HFiles");
try {
new LoadIncrementalHFiles(conf).doBulkLoad(hfileOutputPath, destHTable);
} catch (Exception e) {
throw new IOException("Bulkloader couldn't run", e);
}
// Delete the mapreduce output directory if it's empty. This will prevent future
// mapreduce jobs from quitting with "target directory already exists".
FileSystem fs = FileSystem.get(hfileOutputPath.toUri(), conf);
FileStatus stat = fs.getFileStatus(hfileOutputPath);
FileStatus[] dirListing = fs.listStatus(hfileOutputPath);
if(stat.isDir() && dirListing.length <= 3) {
                // After a successful bulk load there should be at most three entries left in the
                // HFile directory: _SUCCESS, _logs, and the (now empty) column family directory.
                // Go ahead and delete.
fs.delete(hfileOutputPath, true);
} else {
List<String> fileNames = new ArrayList<String>();
for(FileStatus dentry: dirListing) {
fileNames.add(dentry.getPath().toString());
}
final String errMsg = "Mapreduce output dir had unexpected contents, won't delete: " +
hfileOutputPath + " contains " + fileNames;
log.error(errMsg);
throw new RuntimeException(errMsg);
}
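            // Sanity check: an empty destination CF after the bulk load usually means the source
            // scan matched nothing.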
destScanner = destHTable.getScanner(cf);
if(!destScanner.iterator().hasNext()) {
log.warn("Destination CF was empty after snapshotting");
}
return true;
} catch (ClassNotFoundException e) { // Mapreduce throws this
throw new RuntimeException(e);
} finally {
if(destScanner != null) {
destScanner.close();
}
if (destHTable != null) {
destHTable.close();
}
if(sourceHTable != null) {
sourceHTable.close();
}
}
    }

private static void createSnapshotTable(Configuration conf, byte[] tableName, byte[][] splitKeys,
byte[] cf) throws IOException {
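        // The snapshot CF keeps only a single version, with no bloom filter and no compression
        // (see the TODO below about enabling snappy).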
HBaseAdmin hba = new HBaseAdmin(conf);
HColumnDescriptor cfDesc = new HColumnDescriptor(cf);
cfDesc.setBloomFilterType(BloomType.NONE);
cfDesc.setMaxVersions(1);
cfDesc.setCompressionType(Algorithm.NONE); // TODO change to snappy in 0.92
HTableDescriptor tableDesc = new HTableDescriptor(tableName);
tableDesc.addFamily(cfDesc);
hba.createTable(tableDesc, splitKeys);
    }

public static class ResultToKvsMapper extends TableMapper<ImmutableBytesWritable,KeyValue> {
@Override
protected void map(ImmutableBytesWritable key, Result result,
Context context) throws IOException, InterruptedException {
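            // Re-emit each KeyValue of the row, keyed by the row, so HFileOutputFormat's
            // reducer can sort them into HFiles.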
for(KeyValue kv: result.list()) {
context.write(key, kv);
}
}
}
}