/*
* Copyright (C) 2012 Daniel Aioanei.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.uzaygezen.core.hbase;
import java.io.IOException;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.junit.Assert;
import org.junit.Test;
import com.google.common.base.Charsets;
import com.google.common.base.Functions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterators;
import com.google.common.primitives.Ints;
import com.google.uzaygezen.core.BacktrackingQueryBuilder;
import com.google.uzaygezen.core.BigIntegerContent;
import com.google.uzaygezen.core.BitVector;
import com.google.uzaygezen.core.BitVectorFactories;
import com.google.uzaygezen.core.BitVectorMath;
import com.google.uzaygezen.core.BoundedRollup;
import com.google.uzaygezen.core.CompactHilbertCurve;
import com.google.uzaygezen.core.FilteredIndexRange;
import com.google.uzaygezen.core.HilbertIndexMasks;
import com.google.uzaygezen.core.MapNode;
import com.google.uzaygezen.core.MapRegionInspector;
import com.google.uzaygezen.core.MultiDimensionalSpec;
import com.google.uzaygezen.core.NodeValue;
import com.google.uzaygezen.core.PlainFilterCombiner;
import com.google.uzaygezen.core.Pow2LengthBitSetRange;
import com.google.uzaygezen.core.Pow2LengthBitSetRangeFactory;
import com.google.uzaygezen.core.Query;
import com.google.uzaygezen.core.QueryBuilder;
import com.google.uzaygezen.core.RegionInspector;
import com.google.uzaygezen.core.SimpleRegionInspector;
import com.google.uzaygezen.core.SpaceFillingCurve;
import com.google.uzaygezen.core.StreamingRollup;
import com.google.uzaygezen.core.TestUtils;
import com.google.uzaygezen.core.ZoomingSpaceVisitorAdapter;
import com.google.uzaygezen.core.ranges.BigIntegerRange;
import com.google.uzaygezen.core.ranges.BigIntegerRangeHome;
import com.google.uzaygezen.core.ranges.RangeUtil;
/**
* Test case that also serves as an example of how to use the query
* functionality. While this class relies on BigInteger, BigIntegerContent,
* BigIntegerRange and BigIntegerRangeHome, it is recommended to use the
* parallel Long, LongContent, LongRange and LongRangeHome classes when the
* total precision of the Hilbert space is less than 63 bits.
*
* @author Daniel Aioanei
*/
public class HBaseQueryTest {
// Logger used for the progress messages of this test; it is named after the
// simple (unqualified) class name.
private static final Logger logger = Logger.getLogger(HBaseQueryTest.class.getSimpleName());
/**
 * Populates a mock HBase table with generated points and verifies that
 * Hilbert-index range queries return the same points, in the same order, as
 * filtering and sorting the generated data directly. The check is run once
 * without and once with the rolled-up cache.
 * <p>
 * With more than 62 bits (using {@link BigInteger} rather than plain
 * {@link Long}) and without any cached rollup of the data (see
 * {@link BoundedRollup}), this way of building the queries is likely to be
 * quite slow, but it shows off the capability to perform queries on
 * non-cached arbitrary-precision data.
 */
@Test
public void queryHBase() throws IOException, InterruptedException {
  final byte[] family = "FAMILY".getBytes(Charsets.ISO_8859_1);
  /*
   * We choose not to store the coordinates themselves, since storing the
   * Hilbert index is sufficient to recover the coordinate values. So let's
   * use a dummy column.
   */
  final byte[][] qualifiers = {"NICE".getBytes(Charsets.ISO_8859_1)};
  MultiDimensionalSpec spec = new MultiDimensionalSpec(Ints.asList(30, 10, 25));
  SpaceFillingCurve sfc = new CompactHilbertCurve(spec);
  MockHTable table = MockHTable.create();

  // Add some data.
  Random rnd = new Random(TestUtils.SEED);
  int[][] data = generateData(spec, 1 << 16, rnd);
  logger.log(Level.INFO, "Populating table with up to {0} rows.", data.length);
  populateTable(family, qualifiers, spec, data, sfc, table);

  // The cache is optional.
  int cacheSize = 1 << 8;
  logger.log(Level.INFO, "Building cache of size {0}.", cacheSize);
  Map<Pow2LengthBitSetRange, NodeValue<BigIntegerContent>> rolledupMap =
      createRolledupCache(table, spec, sfc, cacheSize);
  logger.log(Level.INFO, "Constructed cache of actual size {0}.", rolledupMap.size());

  final int dimensions = spec.getBitsPerDimension().size();
  for (int trial = 0; trial < 1; ++trial) {
    logger.log(Level.INFO, "trial={0}", trial);
    int[] maxLengthPerDimension = new int[dimensions];
    for (boolean useCache : new boolean[] {false, true}) {
      /*
       * For testing purposes limit the range size to m values for each
       * dimension to speed up query computation. In practice, query volume
       * should be enforced to be small, and when a certain query volume is
       * exceeded, a full table scan will probably be faster anyway.
       */
      final int m;
      final Map<Pow2LengthBitSetRange, NodeValue<BigIntegerContent>> cache;
      if (useCache) {
        m = 256;
        cache = rolledupMap;
      } else {
        m = 32;
        cache = null;
      }
      Arrays.fill(maxLengthPerDimension, m);
      int[][] ranges = generateRanges(spec, maxLengthPerDimension, rnd);
      logger.log(Level.INFO, "ranges={0}", Arrays.deepToString(ranges));
      // Limit the maximum number of ranges.
      int maxRanges = 1 + rnd.nextInt(32);

      List<int[]> actual = queryAndFilter(table, spec, sfc, ranges, maxRanges, cache);
      // Rows are keyed by Hilbert index, so duplicate points collapse to one row.
      List<int[]> expected = uniq(fullScanQuery(data, sfc, ranges));
      logger.log(Level.INFO, "expected.size()={0}", expected.size());
      Assert.assertEquals(expected.size(), actual.size());
      int position = 0;
      for (int[] expectedPoint : expected) {
        Assert.assertArrayEquals(expectedPoint, actual.get(position));
        ++position;
      }
    }
  }
}
/**
 * Builds the optional rolled-up cache by scanning the whole table and feeding
 * every row key, split into a path of bit vectors, into a bounded rollup.
 *
 * @param table table whose row keys are read as big-endian Hilbert indexes
 * @param spec specification supplying the total number of index bits
 * @param sfc curve whose spec determines the lengths of the path elements
 * @param cacheSize size bound handed to {@link BoundedRollup#create}
 * @return the map produced by {@link Pow2LengthBitSetRangeFactory} from the
 *         rolled-up tree
 * @throws IOException if the table scanner cannot be opened
 */
public Map<Pow2LengthBitSetRange, NodeValue<BigIntegerContent>> createRolledupCache(
    MockHTable table, MultiDimensionalSpec spec, SpaceFillingCurve sfc, int cacheSize)
    throws IOException {
  int[] elementLengths = Ints.toArray(new HilbertIndexMasks(sfc.getSpec()).cardinalities());
  BitVector[] path = new BitVector[elementLengths.length];
  for (int i = 0; i < path.length; ++i) {
    // Path elements are sized from the cardinalities taken in reverse order.
    path[i] = BitVectorFactories.OPTIMAL.apply(elementLengths[path.length - i - 1]);
  }
  StreamingRollup<BitVector, BigIntegerContent> rollup = BoundedRollup.create(
      new BigIntegerContent(BigInteger.ZERO), cacheSize);
  BitVector hilbertIndex = BitVectorFactories.OPTIMAL.apply(spec.sumBitsPerDimension());
  // The scanner is a closeable resource; release it even if feeding the rollup
  // fails part-way through.
  try (ResultScanner scanner = table.getScanner(new Scan())) {
    for (Result row : scanner) {
      hilbertIndex.copyFromBigEndian(row.getRow());
      // Work on fresh copies for every row; presumably the rollup keeps
      // references to the path elements it was fed.
      for (int i = 0; i < path.length; ++i) {
        path[i] = path[i].clone();
      }
      BitVectorMath.split(hilbertIndex, path);
      // We should say the exact number of times. Saying one is correct, but
      // suboptimal.
      BigIntegerContent v = new BigIntegerContent(BigInteger.ONE);
      rollup.feedRow(Iterators.<BitVector>forArray(path), v);
    }
  }
  MapNode<BitVector, BigIntegerContent> rolledupTree = rollup.finish();
  Pow2LengthBitSetRangeFactory<BigIntegerContent> factory =
      Pow2LengthBitSetRangeFactory.create(Ints.asList(elementLengths));
  return factory.apply(rolledupTree);
}
/**
 * Computes the expected query result directly from the generated data: keeps
 * the points contained in {@code ranges} and returns them sorted by their
 * Hilbert index (ties broken by position in {@code data}).
 *
 * @param data generated points, one {@code int[]} per point
 * @param sfc curve used to compute the index of each point
 * @param ranges query region, one range per dimension
 * @return the matching rows of {@code data} in index order; duplicates are kept
 */
public List<int[]> fullScanQuery(int[][] data, SpaceFillingCurve sfc, int[][] ranges) {
  MultiDimensionalSpec spec = sfc.getSpec();
  List<Integer> bitsPerDimension = spec.getBitsPerDimension();
  int dimensions = bitsPerDimension.size();

  // Reusable per-dimension bit vectors holding the coordinates of one point.
  BitVector[] point = new BitVector[dimensions];
  for (int dim = 0; dim < dimensions; ++dim) {
    point[dim] = BitVectorFactories.OPTIMAL.apply(bitsPerDimension.get(dim));
  }

  List<Integer> matchingRows = filter(data, ranges);
  List<Pair<BitVector, Integer>> indexed = new ArrayList<>(matchingRows.size());
  for (Integer rowId : matchingRows) {
    int[] coordinates = data[rowId];
    // int has 32 bits, which fits in each dimension.
    for (int dim = 0; dim < dimensions; ++dim) {
      point[dim].copyFrom(coordinates[dim]);
    }
    BitVector index = BitVectorFactories.OPTIMAL.apply(spec.sumBitsPerDimension());
    sfc.index(point, 0, index);
    indexed.add(Pair.of(index.clone(), rowId));
  }

  // Sort by Hilbert index.
  Collections.sort(indexed);

  List<int[]> expected = new ArrayList<>(indexed.size());
  for (Pair<BitVector, Integer> entry : indexed) {
    int rowId = entry.getRight();
    expected.add(data[rowId]);
  }
  return expected;
}
/**
 * Returns the positions, in increasing order, of the rows of {@code data}
 * for which {@code RangeUtil.contains(ranges, row)} holds.
 */
private static List<Integer> filter(int[][] data, int[][] ranges) {
  List<Integer> matching = new ArrayList<>();
  int position = 0;
  for (int[] row : data) {
    if (RangeUtil.contains(ranges, row)) {
      matching.add(position);
    }
    ++position;
  }
  return matching;
}
/**
 * Queries the table for the points inside {@code ranges}: the region is
 * translated into Hilbert index ranges, each range is scanned, and rows whose
 * decoded point lies outside the region are dropped.
 * <p>
 * Along the way it asserts that at most {@code maxRanges} index ranges are
 * produced, that consecutive ranges are strictly ordered, and that ranges
 * not flagged as potentially over-selective only yield points inside the
 * region.
 *
 * @param table table whose row keys are read as big-endian Hilbert indexes
 * @param spec bits per dimension of the indexed space
 * @param sfc curve used to decode row keys back into points
 * @param ranges query region, one {start, end} pair per dimension
 * @param maxRanges upper bound on the number of index ranges to scan
 * @param rolledupMap optional rolled-up cache; {@code null} to query without it
 * @return the matching points in scan order
 * @throws IOException if a table scanner cannot be opened
 */
public List<int[]> queryAndFilter(
    MockHTable table, MultiDimensionalSpec spec, SpaceFillingCurve sfc, int[][] ranges,
    int maxRanges, Map<Pow2LengthBitSetRange, NodeValue<BigIntegerContent>> rolledupMap)
    throws IOException {
  List<BigIntegerRange> region = rangesToQueryRegion(ranges);
  List<FilteredIndexRange<Object, BigIntegerRange>> indexRanges = query(
      table, region, sfc, maxRanges, rolledupMap);
  Assert.assertTrue(indexRanges.size() <= maxRanges);
  logger.log(Level.INFO, "indexRanges={0}", indexRanges);
  // The ranges are in strictly increasing hilbert index order.
  for (int i = 0; i < indexRanges.size() - 1; ++i) {
    FilteredIndexRange<Object, BigIntegerRange> a = indexRanges.get(i);
    FilteredIndexRange<Object, BigIntegerRange> b = indexRanges.get(i + 1);
    Assert.assertTrue(a.getIndexRange().getEnd().compareTo(b.getIndexRange().getStart()) < 0);
  }

  // Translate every index range into a scan over the corresponding row keys.
  BitVector start = BitVectorFactories.OPTIMAL.apply(spec.sumBitsPerDimension());
  BitVector end = BitVectorFactories.OPTIMAL.apply(spec.sumBitsPerDimension());
  Scan[] scans = new Scan[indexRanges.size()];
  for (int i = 0; i < indexRanges.size(); ++i) {
    FilteredIndexRange<Object, BigIntegerRange> indexRange = indexRanges.get(i);
    BigInteger startBigInteger = indexRange.getIndexRange().getStart();
    start.copyFrom(startBigInteger);
    BigInteger endBigInteger = indexRange.getIndexRange().getEnd();
    final Scan scan;
    if (endBigInteger.testBit(spec.sumBitsPerDimension())) {
      // The end has the bit just above the index precision set, so it does
      // not fit in the fixed-width bit vector; use a scan without a stop row.
      scan = new Scan(start.toBigEndianByteArray());
    } else {
      end.copyFrom(endBigInteger);
      scan = new Scan(start.toBigEndianByteArray(), end.toBigEndianByteArray());
    }
    scans[i] = scan;
  }

  // Reusable buffers for decoding a row key back into a point.
  BitVector[] point = new BitVector[spec.getBitsPerDimension().size()];
  BitVector index = BitVectorFactories.OPTIMAL.apply(spec.sumBitsPerDimension());
  for (int j = 0; j < spec.getBitsPerDimension().size(); ++j) {
    point[j] = BitVectorFactories.OPTIMAL.apply(spec.getBitsPerDimension().get(j));
  }

  List<int[]> actual = new ArrayList<>();
  for (int i = 0; i < indexRanges.size(); ++i) {
    // Each scanner is a closeable resource; release it even when one of the
    // assertions below fails in the middle of the iteration.
    try (ResultScanner scanner = table.getScanner(scans[i])) {
      FilteredIndexRange<Object, BigIntegerRange> indexRange = indexRanges.get(i);
      logger.log(Level.FINE, "indexRange={0}", indexRange);
      for (Result result : scanner) {
        byte[] row = result.getRow();
        index.copyFromBigEndian(row);
        sfc.indexInverse(index, point);
        boolean isContained = RangeUtil.containsBigInteger(
            region, Arrays.asList(bitVectorPointToBigIntegerPoint(point)));
        // Only ranges flagged as potentially over-selective may return rows
        // outside the query region.
        if (!indexRange.isPotentialOverSelectivity()) {
          Assert.assertTrue(isContained);
        }
        if (isContained) {
          int[] e = new int[point.length];
          for (int j = 0; j < e.length; ++j) {
            e[j] = (int) point[j].toExactLong();
          }
          actual.add(e);
        }
      }
    }
  }
  return actual;
}
/**
 * Converts each coordinate of {@code point} with {@code toBigInteger()},
 * returning a new array of the same length and order.
 */
private BigInteger[] bitVectorPointToBigIntegerPoint(BitVector[] point) {
  BigInteger[] converted = new BigInteger[point.length];
  int position = 0;
  for (BitVector coordinate : point) {
    converted[position] = coordinate.toBigInteger();
    ++position;
  }
  return converted;
}
/**
 * Builds the filtered index ranges covering {@code region} by letting a
 * backtracking query builder visit the space-filling curve.
 *
 * @param table not used by this method
 * @param region query region, one range per dimension
 * @param sfc curve to visit
 * @param maxRanges maximum number of ranges handed to the query builder
 * @param rolledupMap optional rolled-up cache; when {@code null} only the
 *        simple region inspector is used
 * @return the filtered index ranges of the resulting query
 */
private List<FilteredIndexRange<Object, BigIntegerRange>> query(
    MockHTable table, List<BigIntegerRange> region, SpaceFillingCurve sfc, int maxRanges,
    Map<Pow2LengthBitSetRange, NodeValue<BigIntegerContent>> rolledupMap) {
  BigIntegerContent zero = new BigIntegerContent(BigInteger.ZERO);
  BigIntegerContent one = new BigIntegerContent(BigInteger.ONE);
  Object filter = "";
  List<? extends List<BigIntegerRange>> regions = ImmutableList.of(region);

  RegionInspector<Object, BigIntegerContent> simpleInspector = SimpleRegionInspector.create(
      regions, one, Functions.constant(filter), BigIntegerRangeHome.INSTANCE, zero);
  // Consult the rolled-up cache first when one was supplied.
  RegionInspector<Object, BigIntegerContent> regionInspector = simpleInspector;
  if (rolledupMap != null) {
    regionInspector = MapRegionInspector.create(
        rolledupMap, simpleInspector, false, zero, one);
  }

  // Not using sub-ranges here.
  PlainFilterCombiner<Object, BigInteger, BigIntegerContent, BigIntegerRange> combiner =
      new PlainFilterCombiner<>(filter);
  QueryBuilder<Object, BigIntegerRange> queryBuilder = BacktrackingQueryBuilder.create(
      regionInspector, combiner, maxRanges, true, BigIntegerRangeHome.INSTANCE, zero);
  sfc.accept(new ZoomingSpaceVisitorAdapter(sfc, queryBuilder));
  Query<Object, BigIntegerRange> builtQuery = queryBuilder.get();
  return builtQuery.getFilteredIndexRanges();
}
/**
 * Converts per-dimension {start, end} pairs into a list of
 * {@link BigIntegerRange}s, keeping the dimension order.
 */
private static List<BigIntegerRange> rangesToQueryRegion(int[][] ranges) {
  List<BigIntegerRange> region = new ArrayList<>();
  for (int[] range : ranges) {
    int start = range[0];
    int end = range[1];
    region.add(BigIntegerRange.of(start, end));
  }
  return region;
}
/**
 * Generates one random {start, end} pair per dimension, placed around the
 * middle of the dimension's value space. For each dimension two random
 * offsets below {@code min(2^bits, maxLengthPerDimension[j])} are drawn, the
 * first for the start and the second for the end.
 *
 * @param spec supplies the number of bits of every dimension
 * @param maxLengthPerDimension per-dimension bound on the random offsets
 * @param rnd source of randomness
 * @return an array with one {start, end} row per dimension
 */
private static int[][] generateRanges(
    MultiDimensionalSpec spec, int[] maxLengthPerDimension, Random rnd) {
  List<Integer> bitsPerDimension = spec.getBitsPerDimension();
  int dimensions = bitsPerDimension.size();
  int[][] ranges = new int[dimensions][];
  for (int dim = 0; dim < dimensions; ++dim) {
    int bound = 1 << bitsPerDimension.get(dim);
    int offsetLimit = Math.min(bound, maxLengthPerDimension[dim]);
    // Draw order matters for reproducibility: start first, then end.
    int start = bound / 2 - rnd.nextInt(offsetLimit) / 2;
    assert start >= 0;
    int end = (bound + 1) / 2 + rnd.nextInt(offsetLimit) / 2;
    assert end <= bound;
    ranges[dim] = new int[] {start, end};
  }
  return ranges;
}
/**
 * Writes one row per data point into the table. The row key is the
 * big-endian byte form of the point's index on {@code sfc}; every qualifier
 * column holds the {@code Arrays.toString} form of the point.
 *
 * @param family column family of the dummy columns
 * @param qualifiers qualifiers of the dummy columns
 * @param spec supplies the bit widths of the dimensions and of the index
 * @param data points to store, one {@code int[]} per point
 * @param sfc curve used to compute the row keys
 * @param table destination table
 */
private static void populateTable(
    final byte[] family, final byte[][] qualifiers, MultiDimensionalSpec spec, int[][] data,
    SpaceFillingCurve sfc, MockHTable table) throws IOException, InterruptedException {
  List<Integer> bitsPerDimension = spec.getBitsPerDimension();
  int dimensions = bitsPerDimension.size();
  BitVector index = BitVectorFactories.OPTIMAL.apply(spec.sumBitsPerDimension());
  BitVector[] point = new BitVector[dimensions];
  for (int dim = 0; dim < dimensions; ++dim) {
    point[dim] = BitVectorFactories.OPTIMAL.apply(bitsPerDimension.get(dim));
  }

  List<Put> puts = new ArrayList<>(data.length);
  for (int[] coordinates : data) {
    // int has 32 bits, which fits in each dimension.
    for (int dim = 0; dim < dimensions; ++dim) {
      point[dim].copyFrom(coordinates[dim]);
    }
    sfc.index(point, 0, index);
    byte[] row = index.toBigEndianByteArray();

    // Put a nice string representation of the data point in the dummy
    // column.
    String text = Arrays.toString(coordinates);
    KeyValue[] keyValues = new KeyValue[qualifiers.length];
    int column = 0;
    for (byte[] qualifier : qualifiers) {
      keyValues[column] = new KeyValue(
          row, family, qualifier, text.getBytes(Charsets.ISO_8859_1));
      ++column;
    }

    Put put = new Put(row);
    put.setFamilyMap(ImmutableMap.of(family, Arrays.asList(keyValues)));
    puts.add(put);
  }
  table.batch(puts);
}
/**
 * Generates {@code n} random points whose coordinates are Gaussian
 * deviations from the middle of each dimension, clamped to the dimension's
 * value space. It may generate duplicates.
 *
 * @param spec supplies the number of bits of every dimension
 * @param n number of points to generate
 * @param rnd source of randomness
 * @return an {@code n}-row array with one coordinate per dimension
 */
private static int[][] generateData(MultiDimensionalSpec spec, int n, Random rnd) {
  List<Integer> bitsPerDimension = spec.getBitsPerDimension();
  int dimensions = bitsPerDimension.size();
  int[][] data = new int[n][dimensions];
  for (int[] row : data) {
    // int has 32 bits, which fits in each dimension.
    for (int dim = 0; dim < dimensions; ++dim) {
      int bits = bitsPerDimension.get(dim);
      int bound = 1 << bits;
      double gauss = rnd.nextGaussian();
      // The deviation is scaled by 2^(bits / 2) / 1024 and truncated to an int.
      // NOTE(review): an earlier comment claimed a standard deviation of 1024,
      // which this expression does not produce - confirm the intended spread.
      int value = bound / 2 + (int) (gauss * (1 << (bits / 2)) / 1024);
      // Clamp into the dimension's value space: lower limit first, then upper.
      value = Math.max(value, 0);
      if (value >= bound) {
        value = bound - 1;
      }
      row[dim] = value;
    }
  }
  return data;
}
/**
 * Collapses every run of consecutive, content-equal arrays into a single
 * element, keeping the last array of each run. Non-adjacent duplicates are
 * not removed, so the input is expected to be sorted when full
 * de-duplication is wanted.
 *
 * @param data list to de-duplicate; it is not modified
 * @return a new list holding one array per run, in the original order
 */
public static List<int[]> uniq(List<int[]> data) {
  List<int[]> distinct = new ArrayList<>();
  boolean hasPending = false;
  int[] pending = null;
  for (int[] current : data) {
    // A run ends as soon as the next element differs from the pending one.
    if (hasPending && !Arrays.equals(pending, current)) {
      distinct.add(pending);
    }
    pending = current;
    hasPending = true;
  }
  // The final run is closed by the end of the list.
  if (hasPending) {
    distinct.add(pending);
  }
  return distinct;
}
}