Package org.apache.crunch.io.hbase

Source Code of org.apache.crunch.io.hbase.WordCountHBaseIT

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.crunch.io.hbase;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.Map;
import java.util.Random;

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.MapFn;
import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pair;
import org.apache.crunch.Pipeline;
import org.apache.crunch.PipelineResult;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.test.TemporaryPath;
import org.apache.crunch.test.TemporaryPaths;
import org.apache.crunch.types.writable.Writables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;

import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableSet;

public class WordCountHBaseIT {

  static class StringifyFn extends MapFn<Pair<ImmutableBytesWritable, Pair<Result, Result>>, String> {
    @Override
    public String map(Pair<ImmutableBytesWritable, Pair<Result, Result>> input) {
      byte[] firstStrBytes = input.second().first().getValue(WORD_COLFAM, null);
      byte[] secondStrBytes = input.second().second().getValue(WORD_COLFAM, null);
      if (firstStrBytes != null && secondStrBytes != null) {
        return Joiner.on(',').join(new String(firstStrBytes), new String(secondStrBytes));
      }
      return "";
    }
  }

  @Rule
  public TemporaryPath tmpDir = TemporaryPaths.create();

  private static final byte[] COUNTS_COLFAM = Bytes.toBytes("cf");
  private static final byte[] WORD_COLFAM = Bytes.toBytes("cf");

  private HBaseTestingUtility hbaseTestUtil;

  @SuppressWarnings("serial")
  public static PCollection<Put> wordCount(PTable<ImmutableBytesWritable, Result> words) {
    PTable<String, Long> counts = words.parallelDo(
        new DoFn<Pair<ImmutableBytesWritable, Result>, String>() {
          @Override
          public void process(Pair<ImmutableBytesWritable, Result> row, Emitter<String> emitter) {
            byte[] word = row.second().getValue(WORD_COLFAM, null);
            if (word != null) {
              emitter.emit(Bytes.toString(word));
            }
          }
        }, words.getTypeFamily().strings()).count();

    return counts.parallelDo("convert to put", new DoFn<Pair<String, Long>, Put>() {
      @Override
      public void process(Pair<String, Long> input, Emitter<Put> emitter) {
        Put put = new Put(Bytes.toBytes(input.first()));
        put.add(COUNTS_COLFAM, null, Bytes.toBytes(input.second()));
        emitter.emit(put);
      }

    }, HBaseTypes.puts());
  }
 
  @SuppressWarnings("serial")
  public static PCollection<Delete> clearCounts(PTable<ImmutableBytesWritable, Result> counts) {
    return counts.parallelDo("convert to delete", new DoFn<Pair<ImmutableBytesWritable, Result>, Delete>() {
      @Override
      public void process(Pair<ImmutableBytesWritable, Result> input, Emitter<Delete> emitter) {
        Delete delete = new Delete(input.first().get());
        emitter.emit(delete);
      }

    }, HBaseTypes.deletes());
  }

  @Before
  public void setUp() throws Exception {
    Configuration conf = HBaseConfiguration.create(tmpDir.getDefaultConfiguration());
    hbaseTestUtil = new HBaseTestingUtility(conf);
    hbaseTestUtil.startMiniZKCluster();
    hbaseTestUtil.startMiniHBaseCluster(1, 1);
  }

  @Test
  public void testWordCount() throws Exception {
    run(new MRPipeline(WordCountHBaseIT.class, hbaseTestUtil.getConfiguration()));
  }

  @After
  public void tearDown() throws Exception {
    hbaseTestUtil.shutdownMiniHBaseCluster();
    hbaseTestUtil.shutdownMiniZKCluster();
  }

  public void run(Pipeline pipeline) throws Exception {

    Random rand = new Random();
    int postFix = Math.abs(rand.nextInt());
    String inputTableName = "crunch_words_" + postFix;
    String outputTableName = "crunch_counts_" + postFix;
    String otherTableName = "crunch_other_" + postFix;
    String joinTableName = "crunch_join_words_" + postFix;

    HTable inputTable = hbaseTestUtil.createTable(Bytes.toBytes(inputTableName), WORD_COLFAM);
    HTable outputTable = hbaseTestUtil.createTable(Bytes.toBytes(outputTableName), COUNTS_COLFAM);
    HTable otherTable = hbaseTestUtil.createTable(Bytes.toBytes(otherTableName), COUNTS_COLFAM);

    int key = 0;
    key = put(inputTable, key, "cat");
    key = put(inputTable, key, "cat");
    key = put(inputTable, key, "dog");
    inputTable.flushCommits();

    Scan scan = new Scan();
    scan.addFamily(WORD_COLFAM);
    HBaseSourceTarget source = new HBaseSourceTarget(inputTableName, scan);
    PTable<ImmutableBytesWritable, Result> words = pipeline.read(source);

    Map<ImmutableBytesWritable, Result> materialized = words.materializeToMap();
    assertEquals(3, materialized.size());
    PCollection<Put> puts = wordCount(words);
    pipeline.write(puts, new HBaseTarget(outputTableName));
    pipeline.write(puts, new HBaseTarget(otherTableName));
    PipelineResult res = pipeline.done();
    assertTrue(res.succeeded());

    assertIsLong(otherTable, "cat", 2);
    assertIsLong(otherTable, "dog", 1);
    assertIsLong(outputTable, "cat", 2);
    assertIsLong(outputTable, "dog", 1);

    // verify we can do joins.
    HTable joinTable = hbaseTestUtil.createTable(Bytes.toBytes(joinTableName), WORD_COLFAM);

    key = 0;
    key = put(joinTable, key, "zebra");
    key = put(joinTable, key, "donkey");
    key = put(joinTable, key, "bird");
    key = put(joinTable, key, "horse");
    joinTable.flushCommits();

    Scan joinScan = new Scan();
    joinScan.addFamily(WORD_COLFAM);
    PTable<ImmutableBytesWritable, Result> other = pipeline.read(FromHBase.table(joinTableName, joinScan));
    PCollection<String> joined = words.join(other).parallelDo(new StringifyFn(), Writables.strings());
    assertEquals(ImmutableSet.of("cat,zebra", "cat,donkey", "dog,bird"),
        ImmutableSet.copyOf(joined.materialize()));
    pipeline.done();

    //verify HBaseTarget supports deletes.
    Scan clearScan = new Scan();
    clearScan.addFamily(COUNTS_COLFAM);
    pipeline = new MRPipeline(WordCountHBaseIT.class, hbaseTestUtil.getConfiguration());
    HBaseSourceTarget clearSource = new HBaseSourceTarget(outputTableName, clearScan);
    PTable<ImmutableBytesWritable, Result> counts = pipeline.read(clearSource);
    pipeline.write(clearCounts(counts), new HBaseTarget(outputTableName));
    pipeline.done();

    assertDeleted(outputTable, "cat");
    assertDeleted(outputTable, "dog");
  }

  protected int put(HTable table, int key, String value) throws IOException {
    Put put = new Put(Bytes.toBytes(key));
    put.add(WORD_COLFAM, null, Bytes.toBytes(value));
    table.put(put);
    return key + 1;
  }

  protected static void assertIsLong(HTable table, String key, long i) throws IOException {
    Get get = new Get(Bytes.toBytes(key));
    get.addFamily(COUNTS_COLFAM);
    Result result = table.get(get);

    byte[] rawCount = result.getValue(COUNTS_COLFAM, null);
    assertNotNull(rawCount);
    assertEquals(i, Bytes.toLong(rawCount));
  }
 
  protected static void assertDeleted(HTable table, String key) throws IOException {
    Get get = new Get(Bytes.toBytes(key));
    get.addFamily(COUNTS_COLFAM);
    Result result = table.get(get);
    assertTrue(result.isEmpty());
  }

}
TOP

Related Classes of org.apache.crunch.io.hbase.WordCountHBaseIT

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.