Package net.agkn.hll

Source Code of net.agkn.hll.SparseHLLTest

package net.agkn.hll;

/*
* Copyright 2013 Aggregate Knowledge, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import static org.powermock.reflect.Whitebox.getInternalState;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
import it.unimi.dsi.fastutil.ints.Int2ByteOpenHashMap;
import java.util.Random;

import net.agkn.hll.serialization.ISchemaVersion;
import net.agkn.hll.serialization.SerializationUtil;
import net.agkn.hll.util.HLLUtil;

import org.testng.annotations.Test;

/**
* Tests {@link HLL} of type {@link HLLType#SPARSE}.
*
* @author timon
*/
public class SparseHLLTest {
    private static final int log2m = 11;

    /**
     * Tests {@link HLL#addRaw(long)}.
     */
    @Test
    public void addTest() {
        { // insert an element with register value 1 (minimum set value)
            final int registerIndex = 0;
            final int registerValue = 1;
            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
            hll.addRaw(rawValue);

            assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
        }
        { // insert an element with register value 31 (maximum set value)
            final int registerIndex = 0;
            final int registerValue = 31;
            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
            hll.addRaw(rawValue);

            assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
        }
        { // insert an element that could overflow the register (past 31)
            final int registerIndex = 0;
            final int registerValue = 36;
            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
            hll.addRaw(rawValue);

            assertOneRegisterSet(hll, (short)registerIndex, (byte)31/*register max*/);
        }
        { // insert duplicate elements, observe no change
            final int registerIndex = 0;
            final int registerValue = 1;
            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
            hll.addRaw(rawValue);
            hll.addRaw(rawValue);

            assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
        }
        { // insert elements that increase a register's value
            final int registerIndex = 0;
            final int registerValue = 1;
            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
            hll.addRaw(rawValue);

            final int registerValue2 = 2;
            final long rawValue2 = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue2);
            hll.addRaw(rawValue2);

            assertOneRegisterSet(hll, registerIndex, (byte)registerValue2);
        }
        { // insert elements that have lower register values, observe no change
            final int registerIndex = 0;
            final int registerValue = 2;
            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
            hll.addRaw(rawValue);

            final int registerValue2 = 1;
            final long rawValue2 = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue2);
            hll.addRaw(rawValue2);

            assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
        }
    }

    /**
     * Smoke test for {@link HLL#cardinality()} and the proper use of the small
     * range correction.
     */
    @Test
    public void smallRangeSmokeTest() {
        final int log2m = 11;
        final int m = (1 << log2m);
        final int regwidth = 5;

        // only one register set
        {
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
            hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 0, 1));

            final long cardinality = hll.cardinality();

            // Trivially true that small correction conditions hold: one register
            // set implies zeroes exist, and estimator trivially smaller than 5m/2.
            // Small range correction: m * log(m/V)
            final long expected = (long)Math.ceil(m * Math.log((double)m / (m - 1)/*# of zeroes*/));
            assertEquals(cardinality, expected);
        }

        // all but one register set
        {
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
            for(int i=0; i<(m - 1); i++) {
                hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, 1));
            }

            // Trivially true that small correction conditions hold: all but
            // one register set implies a zero exists, and estimator trivially
            // smaller than 5m/2 since it's alpha / ((m-1)/2)
            final long cardinality = hll.cardinality();

            // Small range correction: m * log(m/V)
            final long expected = (long)Math.ceil(m * Math.log((double)m / 1/*# of zeroes*/));
            assertEquals(cardinality, expected);
        }
    }

    /**
     * Smoke test for {@link HLL#cardinality()} and the proper use of the
     * uncorrected estimator.
     */
    @Test
    public void normalRangeSmokeTest() {
        final int log2m = 11;
        final int m = (1 << log2m);
        final int regwidth = 5;
        // regwidth = 5, so hash space is
        // log2m + (2^5 - 1 - 1), so L = log2m + 30
        final int l = log2m + 30;

        // all registers at 'medium' value
        {
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE);

            final int registerValue = 7/*chosen to ensure neither correction kicks in*/;
            for(int i=0; i<m; i++) {
                hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
            }

            final long cardinality = hll.cardinality();

            // Simplified estimator when all registers take same value: alpha / (m/2^val)
            final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));

            // Assert conditions for uncorrected range
            assertTrue(estimator <= Math.pow(2,l)/30);
            assertTrue(estimator > (5 * m /(double)2));

            final long expected = (long)Math.ceil(estimator);
            assertEquals(cardinality, expected);
        }
    }

    /**
     * Smoke test for {@link HLL#cardinality()} and the proper use of the large
     * range correction.
     */
    @Test
    public void largeRangeSmokeTest() {
        final int log2m = 11;
        final int m = (1 << log2m);
        final int regwidth = 5;
        // regwidth = 5, so hash space is
        // log2m + (2^5 - 1 - 1), so L = log2m + 30
        final int l = log2m + 30;

        // all registers at large value
        {
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE);

            final int registerValue = 31/*chosen to ensure large correction kicks in*/;
            for(int i=0; i<m; i++) {
                hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
            }

            final long cardinality = hll.cardinality();


            // Simplified estimator when all registers take same value: alpha / (m/2^val)
            final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));

            // Assert conditions for large range
            assertTrue(estimator > Math.pow(2, l)/30);

            // Large range correction: -2^32 * log(1 - E/2^32)
            final long expected = (long)Math.ceil(-1.0 * Math.pow(2, l) * Math.log(1.0 - estimator/Math.pow(2, l)));
            assertEquals(cardinality, expected);
        }
    }

    /**
     * Tests {@link HLL#union(HLL)}.
     */
    @Test
    public void unionTest() {
        final int log2m = 11/*arbitrary*/;
        final int sparseThreshold = 256/*arbitrary*/;

        { // two empty multisets should union to an empty set
            final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);

            hllA.union(hllB);

            assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
            assertEquals(hllA.cardinality(), 0L);
        }
        { // two disjoint multisets should union properly
            final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 1));
            final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 2, 1));


            hllA.union(hllB);

            assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
            assertEquals(hllA.cardinality(), 3L/*precomputed*/);
            assertRegisterPresent(hllA, 1, (byte)1);
            assertRegisterPresent(hllA, 2, (byte)1);
        }
        { // two exactly overlapping multisets should union properly
            final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 10));
            final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 13));

            hllA.union(hllB);

            assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
            assertEquals(hllA.cardinality(), 2L/*precomputed*/);
            assertOneRegisterSet(hllA, 1, (byte)13/*max(10,13)*/);
        }
        { // overlapping multisets should union properly
            final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            // register index = 3
            final long rawValueA = ProbabilisticTestUtil.constructHLLValue(log2m, 3, 11);

            // register index = 4
            final long rawValueB = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 13);
            final long rawValueBPrime = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 21);

            // register index = 5
            final long rawValueC = ProbabilisticTestUtil.constructHLLValue(log2m, 5, 14);

            hllA.addRaw(rawValueA);
            hllA.addRaw(rawValueB);

            hllB.addRaw(rawValueBPrime);
            hllB.addRaw(rawValueC);

            hllA.union(hllB);
            // union should have three registers set, with partition B set to the
            // max of the two registers
            assertRegisterPresent(hllA, 3, (byte)11);
            assertRegisterPresent(hllA, 4, (byte)21/*max(21,13)*/);
            assertRegisterPresent(hllA, 5, (byte)14);
        }
        { // too-large unions should promote
            final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);

            // fill up sets to maxCapacity
            for(int i=0; i<sparseThreshold; i++) {
                hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, 1));
                hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, (i + sparseThreshold)/*non-overlapping*/, 1));
            }

            hllA.union(hllB);

            assertEquals(hllA.getType(), HLLType.FULL);
        }
    }

    /**
     * Tests {@link HLL#clear()}.
     */
    @Test
    public void clearTest() {
        final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.SPARSE);
        hll.addRaw(1L);
        hll.clear();
        assertEquals(hll.cardinality(), 0L);
    }

    /**
     * Tests {@link HLL#toBytes(ISchemaVersion)} and
     * {@link HLL#fromBytes(byte[])}.
     */
    @Test
    public void toFromBytesTest() {
        final int log2m = 11/*arbitrary*/;
        final int regwidth = 5/*arbitrary*/;
        final int sparseThreshold = 256/*arbitrary*/;
        final int shortWordLength = 16/*log2m + regwidth = 11 + 5*/;

        final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION;
        final HLLType type = HLLType.SPARSE;
        final int padding = schemaVersion.paddingBytes(type);

        {// Should work on an empty element
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            final byte[] bytes = hll.toBytes(schemaVersion);

            // output should just be padding since no registers are used
            assertEquals(bytes.length, padding);

            final HLL inHLL = HLL.fromBytes(bytes);

            // assert register values correct
            assertElementsEqual(hll, inHLL);
        }
        {// Should work on a partially filled element
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);

            for(int i=0; i<3; i++) {
                final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i+9));
                hll.addRaw(rawValue);
            }

            final byte[] bytes = hll.toBytes(schemaVersion);

            assertEquals(bytes.length, padding + ProbabilisticTestUtil.getRequiredBytes(shortWordLength, 3/*registerCount*/));

            final HLL inHLL = HLL.fromBytes(bytes);

            // assert register values correct
            assertElementsEqual(hll, inHLL);
        }
        {// Should work on a full set
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);

            for(int i=0; i<sparseThreshold; i++) {
                final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i % 9) + 1);
                hll.addRaw(rawValue);
            }

            final byte[] bytes = hll.toBytes(schemaVersion);

            // 'short words' should be 12 bits + 5 bits = 17 bits long
            assertEquals(bytes.length, padding + ProbabilisticTestUtil.getRequiredBytes(shortWordLength, sparseThreshold));

            final HLL inHLL = HLL.fromBytes(bytes);

            // assert register values correct
            assertElementsEqual(hll, inHLL);
        }
    }

    /**
     * Smoke tests the multisets by adding random values.
     */
    @Test
    public void randomValuesTest() {
        final int log2m = 11/*arbitrary*/;
        final int regwidth = 5/*arbitrary*/;
        final int sparseThreshold = 256/*arbitrary*/;

        final long seed = 1L;
        final Random random = new Random(seed);

        for(int run=0; run<100; run++) {
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);

            final Int2ByteOpenHashMap map = new Int2ByteOpenHashMap();
            map.defaultReturnValue((byte)0);

            for(int i=0; i<sparseThreshold; i++) {
                final long rawValue = random.nextLong();

                final short registerIndex = ProbabilisticTestUtil.getRegisterIndex(rawValue, log2m);
                final byte registerValue = ProbabilisticTestUtil.getRegisterValue(rawValue, log2m);
                if(map.get(registerIndex) < registerValue) {
                    map.put(registerIndex, registerValue);
                }

                hll.addRaw(rawValue);
            }

            for(int key : map.keySet()) {
                final byte expectedRegisterValue = map.get(key);
                assertRegisterPresent(hll, key, expectedRegisterValue);
            }
        }
    }

    //*************************************************************************
    // assertion helpers
    /**
     * Asserts that the register at the specified index is set to the specified
     * value.
     */
    private static void assertRegisterPresent(final HLL hll,
                                              final int registerIndex,
                                              final int registerValue) {
        final Int2ByteOpenHashMap sparseProbabilisticStorage = (Int2ByteOpenHashMap)getInternalState(hll, "sparseProbabilisticStorage");
        assertEquals(sparseProbabilisticStorage.get(registerIndex), registerValue);
    }

    /**
     * Asserts that only the specified register is set and has the specified value.
     */
    private static void assertOneRegisterSet(final HLL hll,
                                             final int registerIndex,
                                             final byte registerValue) {
        final Int2ByteOpenHashMap sparseProbabilisticStorage = (Int2ByteOpenHashMap)getInternalState(hll, "sparseProbabilisticStorage");
        assertEquals(sparseProbabilisticStorage.size(), 1);
        assertEquals(sparseProbabilisticStorage.get(registerIndex), registerValue);
    }

    /**
     * Asserts that all registers in the two {@link HLL} instances are identical.
     */
    private static void assertElementsEqual(final HLL hllA, final HLL hllB) {
        final Int2ByteOpenHashMap sparseProbabilisticStorageA = (Int2ByteOpenHashMap)getInternalState(hllA, "sparseProbabilisticStorage");
        final Int2ByteOpenHashMap sparseProbabilisticStorageB = (Int2ByteOpenHashMap)getInternalState(hllB, "sparseProbabilisticStorage");
        assertEquals(sparseProbabilisticStorageA.size(), sparseProbabilisticStorageB.size());
        for(final int index : sparseProbabilisticStorageA.keySet()) {
            assertEquals(sparseProbabilisticStorageA.get(index), sparseProbabilisticStorageB.get(index));
        }
    }
}
TOP

Related Classes of net.agkn.hll.SparseHLLTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.