Package net.agkn.hll

Source Code of net.agkn.hll.HLL

package net.agkn.hll;

/*
* Copyright 2013 Aggregate Knowledge, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.util.Arrays;

import it.unimi.dsi.fastutil.ints.Int2ByteOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import net.agkn.hll.serialization.HLLMetadata;
import net.agkn.hll.serialization.IHLLMetadata;
import net.agkn.hll.serialization.ISchemaVersion;
import net.agkn.hll.serialization.IWordDeserializer;
import net.agkn.hll.serialization.IWordSerializer;
import net.agkn.hll.serialization.SerializationUtil;
import net.agkn.hll.util.BitVector;
import net.agkn.hll.util.BitUtil;
import net.agkn.hll.util.HLLUtil;
import net.agkn.hll.util.LongIterator;
import net.agkn.hll.util.NumberUtil;

/**
* A probabilistic set of hashed <code>long</code> elements. Useful for computing
* the approximate cardinality of a stream of data in very small storage.<p/>
*
* A modified version of the <a href="http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf">
* 'HyperLogLog' data structure and algorithm</a> is used, which combines both
* probabilistic and non-probabilistic techniques to improve the accuracy and
* storage requirements of the original algorithm.<p/>
*
* More specifically, initializing and storing a new {@link HLL} will
* allocate a sentinel value symbolizing the empty set ({@link HLLType#EMPTY}).
* After adding the first few values, a sorted list of unique integers is
* stored in a {@link HLLType#EXPLICIT} hash set. When configured, accuracy can
* be sacrificed for memory footprint: the values in the sorted list are
* "promoted" to a "{@link HLLType#SPARSE}" map-based HyperLogLog structure.
* Finally, when enough registers are set, the map-based HLL will be converted
* to a bit-packed "{@link HLLType#FULL}" HyperLogLog structure.<p/>
*
* This data structure is interoperable with the implementations found at:
* <ul>
*   <li><a href="https://github.com/aggregateknowledge/postgresql-hll">postgresql-hll</a>, and</li>
*   <li><a href="https://github.com/aggregateknowledge/js-hll">js-hll</a></li>
* </ul>
* when <a href="https://github.com/aggregateknowledge/postgresql-hll/blob/master/STORAGE.markdown">properly serialized</a>.
*
* @author timon
*/
public class HLL implements Cloneable {
    // minimum and maximum values for the log-base-2 of the number of registers
    // in the HLL
    public static final int MINIMUM_LOG2M_PARAM = 4;
    public static final int MAXIMUM_LOG2M_PARAM = 30;

    // minimum and maximum values for the register width of the HLL
    public static final int MINIMUM_REGWIDTH_PARAM = 1;
    public static final int MAXIMUM_REGWIDTH_PARAM = 8;

    // minimum and maximum values for the 'expthresh' parameter of the
    // constructor that is meant to match the PostgreSQL implementation's
    // constructor and parameter names
    public static final int MINIMUM_EXPTHRESH_PARAM = -1;
    public static final int MAXIMUM_EXPTHRESH_PARAM = 18;
    public static final int MAXIMUM_EXPLICIT_THRESHOLD = (1 << (MAXIMUM_EXPTHRESH_PARAM - 1)/*per storage spec*/);

    // ************************************************************************
    // Storage
    // storage used when #type is EXPLICIT, null otherwise
    private LongOpenHashSet explicitStorage;
    // storage used when #type is SPARSE, null otherwise
    private Int2ByteOpenHashMap sparseProbabilisticStorage;
    // storage used when #type is FULL, null otherwise
    private BitVector probabilisticStorage;

    // current type of this HLL instance, if this changes then so should the
    // storage used (see above)
    private HLLType type;

    // ------------------------------------------------------------------------
    // Characteristic parameters
    // NOTE:  These members are named to match the PostgreSQL implementation's
    //        parameters.
    // log2(the number of probabilistic HLL registers)
    private final int log2m;
    // the size (width) each register in bits
    private final int regwidth;

    // ------------------------------------------------------------------------
    // Computed constants
    // ........................................................................
    // EXPLICIT-specific constants
    // flag indicating if the EXPLICIT representation should NOT be used
    private final boolean explicitOff;
    // flag indicating that the promotion threshold from EXPLICIT should be
    // computed automatically
    // NOTE:  this only has meaning when 'explicitOff' is false
    private final boolean explicitAuto;
    // threshold (in element count) at which a EXPLICIT HLL is converted to a
    // SPARSE or FULL HLL, always greater than or equal to zero and always a
    // power of two OR simply zero
    // NOTE:  this only has meaning when 'explicitOff' is false
    private final int explicitThreshold;

    // ........................................................................
    // SPARSE-specific constants
    // the computed width of the short words
    private final int shortWordLength;
    // flag indicating if the SPARSE representation should not be used
    private final boolean sparseOff;
    // threshold (in register count) at which a SPARSE HLL is converted to a
    // FULL HLL, always greater than zero
    private final int sparseThreshold;

    // ........................................................................
    // Probabilistic algorithm constants
    // the number of registers, will always be a power of 2
    private final int m;
    // a mask of the log2m bits set to one and the rest to zero
    private final int mBitsMask;
    // a mask as wide as a register (see #fromBytes())
    private final int valueMask;
    // mask used to ensure that p(w) does not overflow register (see #Constructor() and #addRaw())
    private final long pwMaxMask;
    // alpha * m^2 (the constant in the "'raw' HyperLogLog estimator")
    private final double alphaMSquared;
    // the cutoff value of the estimator for using the "small" range cardinality
    // correction formula
    private final double smallEstimatorCutoff;
    // the cutoff value of the estimator for using the "large" range cardinality
    // correction formula
    private final double largeEstimatorCutoff;

    // ========================================================================
    /**
     * NOTE: Arguments here are named and structured identically to those in the
     *       PostgreSQL implementation, which can be found
     *       <a href="https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning">here</a>.
     *
     * @param log2m log-base-2 of the number of registers used in the HyperLogLog
     *        algorithm. Must be at least 4 and at most 30.
     * @param regwidth number of bits used per register in the HyperLogLog
     *        algorithm. Must be at least 1 and at most 8.
     * @param expthresh tunes when the {@link HLLType#EXPLICIT} to
     *        {@link HLLType#SPARSE} promotion occurs,
     *        based on the set's cardinality. Must be at least -1 and at most 18.
     *        <table>
     *        <thead><tr><th><code>expthresh</code> value</th><th>Meaning</th></tr></thead>
     *        <tbody>
     *        <tr>
     *            <td>-1</td>
     *            <td>Promote at whatever cutoff makes sense for optimal memory usage. ('auto' mode)</td>
     *        </tr>
     *        <tr>
     *            <td>0</td>
     *            <td>Skip <code>EXPLICIT</code> representation in hierarchy.</td>
     *        </tr>
     *        <tr>
     *            <td>1-18</td>
     *            <td>Promote at 2<sup>expthresh - 1</sup> cardinality</td>
     *        </tr>
     *        </tbody>
     *        </table>
     * @param sparseon Flag indicating if the {@link HLLType#SPARSE}
     *        representation should be used.
     * @param type the type in the promotion hierarchy which this instance should
     *        start at. This cannot be <code>null</code>.
     */
    public HLL(final int log2m, final int regwidth, final int expthresh, final boolean sparseon, final HLLType type) {
        this.log2m = log2m;
        if((log2m < MINIMUM_LOG2M_PARAM) || (log2m > MAXIMUM_LOG2M_PARAM)) {
            throw new IllegalArgumentException("'log2m' must be at least " + MINIMUM_LOG2M_PARAM + " and at most " + MAXIMUM_LOG2M_PARAM + " (was: " + log2m + ")");
        }

        this.regwidth = regwidth;
        if((regwidth < MINIMUM_REGWIDTH_PARAM) || (regwidth > MAXIMUM_REGWIDTH_PARAM)) {
            throw new IllegalArgumentException("'regwidth' must be at least " + MINIMUM_REGWIDTH_PARAM + " and at most " + MAXIMUM_REGWIDTH_PARAM + " (was: " + regwidth + ")");
        }

        this.m = (1 << log2m);
        this.mBitsMask = m - 1;
        this.valueMask = (1 << regwidth) - 1;
        this.pwMaxMask = HLLUtil.pwMaxMask(regwidth);
        this.alphaMSquared = HLLUtil.alphaMSquared(m);
        this.smallEstimatorCutoff = HLLUtil.smallEstimatorCutoff(m);
        this.largeEstimatorCutoff = HLLUtil.largeEstimatorCutoff(log2m, regwidth);

        if(expthresh == -1) {
            this.explicitAuto = true;
            this.explicitOff = false;

            // NOTE:  This math matches the size calculation in the PostgreSQL impl.
            final long fullRepresentationSize = (this.regwidth * (long)this.m + 7/*round up to next whole byte*/)/Byte.SIZE;
            final int numLongs = (int)(fullRepresentationSize / 8/*integer division to round down*/);

            if(numLongs > MAXIMUM_EXPLICIT_THRESHOLD) {
                this.explicitThreshold = MAXIMUM_EXPLICIT_THRESHOLD;
            } else {
                this.explicitThreshold = numLongs;
            }
        } else if(expthresh == 0) {
            this.explicitAuto = false;
            this.explicitOff = true;
            this.explicitThreshold = 0;
        } else if((expthresh > 0) && (expthresh <= MAXIMUM_EXPTHRESH_PARAM)){
            this.explicitAuto = false;
            this.explicitOff = false;
            this.explicitThreshold = (1 << (expthresh - 1));
        } else {
            throw new IllegalArgumentException("'expthresh' must be at least " + MINIMUM_EXPTHRESH_PARAM + " and at most " + MAXIMUM_EXPTHRESH_PARAM + " (was: " + expthresh + ")");
        }

        this.shortWordLength = (regwidth + log2m);
        this.sparseOff = !sparseon;
        if(this.sparseOff) {
            this.sparseThreshold = 0;
        } else {
            // TODO improve this cutoff to include the cost overhead of Java
            //      members/objects
            final int largestPow2LessThanCutoff =
                    (int)NumberUtil.log2((this.m * this.regwidth) / this.shortWordLength);
            this.sparseThreshold = (1 << largestPow2LessThanCutoff);
        }

        initializeStorage(type);
    }

    /**
     *  Construct an empty HLL with the given {@code log2m} and {@code regwidth}.<p/>
     *
     *  This is equivalent to calling <code>HLL(log2m, regwidth, -1, true, HLLType.EMPTY)</code>.
     *
     * @param log2m log-base-2 of the number of registers used in the HyperLogLog
     *        algorithm. Must be at least 4 and at most 30.
     * @param regwidth number of bits used per register in the HyperLogLog
     *        algorithm. Must be at least 1 and at most 8.
     *
     * @see #HLL(int, int, int, boolean, HLLType)
     */
    public HLL(final int log2m, final int regwidth) {
        this(log2m, regwidth, -1, true, HLLType.EMPTY);
    }

    // -------------------------------------------------------------------------
    /**
     * Convenience constructor for testing. Assumes that both {@link HLLType#EXPLICIT}
     * and {@link HLLType#SPARSE} representations should be enabled.
     *
     * @param log2m log-base-2 of the number of registers used in the HyperLogLog
     *        algorithm. Must be at least 4 and at most 30.
     * @param regwidth number of bits used per register in the HyperLogLog
     *        algorithm. Must be at least 1 and at most 8.
     * @param explicitThreshold cardinality threshold at which the {@link HLLType#EXPLICIT}
     *        representation should be promoted to {@link HLLType#SPARSE}.
     *        This must be greater than zero and less than or equal to {@value #MAXIMUM_EXPLICIT_THRESHOLD}.
     * @param sparseThreshold register count threshold at which the {@link HLLType#SPARSE}
     *        representation should be promoted to {@link HLLType#FULL}.
     *        This must be greater than zero.
     * @param type the type in the promotion hierarchy which this instance should
     *        start at. This cannot be <code>null</code>.
     */
    /*package, for testing*/ HLL(final int log2m, final int regwidth, final int explicitThreshold, final int sparseThreshold, final HLLType type) {
        this.log2m = log2m;
        if((log2m < MINIMUM_LOG2M_PARAM) || (log2m > MAXIMUM_LOG2M_PARAM)) {
            throw new IllegalArgumentException("'log2m' must be at least " + MINIMUM_LOG2M_PARAM + " and at most " + MAXIMUM_LOG2M_PARAM + " (was: " + log2m + ")");
        }

        this.regwidth = regwidth;
        if((regwidth < MINIMUM_REGWIDTH_PARAM) || (regwidth > MAXIMUM_REGWIDTH_PARAM)) {
            throw new IllegalArgumentException("'regwidth' must be at least " + MINIMUM_REGWIDTH_PARAM + " and at most " + MAXIMUM_REGWIDTH_PARAM + " (was: " + regwidth + ")");
        }

        this.m = (1 << log2m);
        this.mBitsMask = m - 1;
        this.valueMask = (1 << regwidth) - 1;
        this.pwMaxMask = HLLUtil.pwMaxMask(regwidth);
        this.alphaMSquared = HLLUtil.alphaMSquared(m);
        this.smallEstimatorCutoff = HLLUtil.smallEstimatorCutoff(m);
        this.largeEstimatorCutoff = HLLUtil.largeEstimatorCutoff(log2m, regwidth);

        this.explicitAuto = false;
        this.explicitOff = false;
        this.explicitThreshold = explicitThreshold;
        if((explicitThreshold < 1) || (explicitThreshold > MAXIMUM_EXPLICIT_THRESHOLD)) {
            throw new IllegalArgumentException("'explicitThreshold' must be at least 1 and at most " + MAXIMUM_EXPLICIT_THRESHOLD + " (was: " + explicitThreshold + ")");
        }

        this.shortWordLength = (regwidth + log2m);
        this.sparseOff = false;
        this.sparseThreshold = sparseThreshold;

        initializeStorage(type);
    }

    /**
     * @return the type in the promotion hierarchy of this instance. This will
     *         never be <code>null</code>.
     */
    public HLLType getType() { return type; }

    // ========================================================================
    // Add
    /**
     * Adds <code>rawValue</code> directly to the HLL.
     *
     * @param  rawValue the value to be added. It is very important that this
     *         value <em>already be hashed</em> with a strong (but not
     *         necessarily cryptographic) hash function. For instance, the
     *         Murmur3 implementation in
     *         <a href="http://guava-libraries.googlecode.com/git/guava/src/com/google/common/hash/Murmur3_128HashFunction.java">
     *         Google's Guava</a> library is an excellent hash function for this
     *         purpose and, for seeds greater than zero, matches the output
     *         of the hash provided in the PostgreSQL implementation.
     */
    public void addRaw(final long rawValue) {
        switch(type) {
            case EMPTY: {
                // NOTE:  EMPTY type is always promoted on #addRaw()
                if(explicitThreshold > 0) {
                    initializeStorage(HLLType.EXPLICIT);
                    explicitStorage.add(rawValue);
                } else if(!sparseOff) {
                    initializeStorage(HLLType.SPARSE);
                    addRawSparseProbabilistic(rawValue);
                } else {
                    initializeStorage(HLLType.FULL);
                    addRawProbabilistic(rawValue);
                }
                return;
            }
            case EXPLICIT: {
                explicitStorage.add(rawValue);

                // promotion, if necessary
                if(explicitStorage.size() > explicitThreshold) {
                    if(!sparseOff) {
                        initializeStorage(HLLType.SPARSE);
                        for(final long value : explicitStorage) {
                            addRawSparseProbabilistic(value);
                        }
                    } else {
                        initializeStorage(HLLType.FULL);
                        for(final long value : explicitStorage) {
                            addRawProbabilistic(value);
                        }
                    }
                    explicitStorage = null;
                }
                return;
            }
            case SPARSE: {
                addRawSparseProbabilistic(rawValue);

                // promotion, if necessary
                if(sparseProbabilisticStorage.size() > sparseThreshold) {
                    initializeStorage(HLLType.FULL);
                    for(final int registerIndex : sparseProbabilisticStorage.keySet()) {
                        final byte registerValue = sparseProbabilisticStorage.get(registerIndex);
                        probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                    }
                    sparseProbabilisticStorage = null;
                }
                return;
            }
            case FULL:
                addRawProbabilistic(rawValue);
                return;
            default:
                throw new RuntimeException("Unsupported HLL type " + type);
        }
    }

    // ------------------------------------------------------------------------
    // #addRaw(..) helpers
    /**
     * Adds the raw value to the {@link #sparseProbabilisticStorage}.
     * {@link #type} must be {@link HLLType#SPARSE}.
     *
     * @param rawValue the raw value to add to the sparse storage.
     */
    private void addRawSparseProbabilistic(final long rawValue) {
        // p(w): position of the least significant set bit (one-indexed)
        // By contract: p(w) <= 2^(registerValueInBits) - 1 (the max register value)
        //
        // By construction of pwMaxMask (see #Constructor()),
        //      lsb(pwMaxMask) = 2^(registerValueInBits) - 2,
        // thus lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) - 2,
        // thus 1 + lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) -1.
        final long substreamValue = (rawValue >>> log2m);
        final byte p_w;

        if(substreamValue == 0L) {
            // The paper does not cover p(0x0), so the special value 0 is used.
            // 0 is the original initialization value of the registers, so by
            // doing this the multiset simply ignores it. This is acceptable
            // because the probability is 1/(2^(2^registerSizeInBits)).
            p_w = 0;
        } else {
            p_w = (byte)(1 + BitUtil.leastSignificantBit(substreamValue| pwMaxMask));
        }

        // Short-circuit if the register is being set to zero, since algorithmically
        // this corresponds to an "unset" register, and "unset" registers aren't
        // stored to save memory. (The very reason this sparse implementation
        // exists.) If a register is set to zero it will break the #algorithmCardinality
        // code.
        if(p_w == 0) {
            return;
        }

        // NOTE:  no +1 as in paper since 0-based indexing
        final int j = (int)(rawValue & mBitsMask);

        final byte currentValue = sparseProbabilisticStorage.get(j);
        if(p_w > currentValue) {
            sparseProbabilisticStorage.put(j, p_w);
        }
    }

    /**
     * Adds the raw value to the {@link #probabilisticStorage}.
     * {@link #type} must be {@link HLLType#FULL}.
     *
     * @param rawValue the raw value to add to the full probabilistic storage.
     */
    private void addRawProbabilistic(final long rawValue) {
        // p(w): position of the least significant set bit (one-indexed)
        // By contract: p(w) <= 2^(registerValueInBits) - 1 (the max register value)
        //
        // By construction of pwMaxMask (see #Constructor()),
        //      lsb(pwMaxMask) = 2^(registerValueInBits) - 2,
        // thus lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) - 2,
        // thus 1 + lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) -1.
        final long substreamValue = (rawValue >>> log2m);
        final byte p_w;

        if (substreamValue == 0L) {
            // The paper does not cover p(0x0), so the special value 0 is used.
            // 0 is the original initialization value of the registers, so by
            // doing this the multiset simply ignores it. This is acceptable
            // because the probability is 1/(2^(2^registerSizeInBits)).
            p_w = 0;
        } else {
            p_w = (byte)(1 + BitUtil.leastSignificantBit(substreamValue| pwMaxMask));
        }

        // Short-circuit if the register is being set to zero, since algorithmically
        // this corresponds to an "unset" register, and "unset" registers aren't
        // stored to save memory. (The very reason this sparse implementation
        // exists.) If a register is set to zero it will break the #algorithmCardinality
        // code.
        if(p_w == 0) {
            return;
        }

        // NOTE:  no +1 as in paper since 0-based indexing
        final int j = (int)(rawValue & mBitsMask);

        probabilisticStorage.setMaxRegister(j, p_w);
    }

    // ------------------------------------------------------------------------
    // Storage helper
    /**
     * Initializes storage for the specified {@link HLLType} and changes the
     * instance's {@link #type}.
     *
     * @param type the {@link HLLType} to initialize storage for. This cannot be
     *        <code>null</code> and must be an instantiable type. (For instance,
     *        it cannot be {@link HLLType#UNDEFINED}.)
     */
    private void initializeStorage(final HLLType type) {
        this.type = type;
        switch(type) {
            case EMPTY:
                // nothing to be done
                break;
            case EXPLICIT:
                this.explicitStorage = new LongOpenHashSet();
                break;
            case SPARSE:
                this.sparseProbabilisticStorage = new Int2ByteOpenHashMap();
                break;
            case FULL:
                this.probabilisticStorage = new BitVector(regwidth, m);
                break;
            default:
                throw new RuntimeException("Unsupported HLL type " + type);
        }
    }

    // ========================================================================
    // Cardinality
    /**
     * Computes the cardinality of the HLL.
     *
     * @return the cardinality of HLL. This will never be negative.
     */
    public long cardinality() {
        switch(type) {
            case EMPTY:
                return 0/*by definition*/;
            case EXPLICIT:
                return explicitStorage.size();
            case SPARSE:
                return (long)Math.ceil(sparseProbabilisticAlgorithmCardinality());
            case FULL:
                return (long)Math.ceil(fullProbabilisticAlgorithmCardinality());
            default:
                throw new RuntimeException("Unsupported HLL type " + type);
        }
    }

    // ------------------------------------------------------------------------
    // Cardinality helpers
    /**
     * Computes the exact cardinality value returned by the HLL algorithm when
     * represented as a {@link HLLType#SPARSE} HLL. Kept
     * separate from {@link #cardinality()} for testing purposes. {@link #type}
     * must be {@link HLLType#SPARSE}.
     *
     * @return the exact, unrounded cardinality given by the HLL algorithm
     */
    /*package, for testing*/ double sparseProbabilisticAlgorithmCardinality() {
        final int m = this.m/*for performance*/;

        // compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the
        // 'j'th register value
        double sum = 0;
        int numberOfZeroes = 0/*"V" in the paper*/;
        for(int j=0; j<m; j++) {
            final long register = sparseProbabilisticStorage.get(j);

            sum += 1.0 / (1L << register);
            if(register == 0L) numberOfZeroes++;
        }

        // apply the estimate and correction to the indicator function
        final double estimator = alphaMSquared / sum;
        if((numberOfZeroes != 0) && (estimator < smallEstimatorCutoff)) {
            return HLLUtil.smallEstimator(m, numberOfZeroes);
        } else if(estimator <= largeEstimatorCutoff) {
            return estimator;
        } else {
            return HLLUtil.largeEstimator(log2m, regwidth, estimator);
        }
    }

    /**
     * Computes the exact cardinality value returned by the HLL algorithm when
     * represented as a {@link HLLType#FULL} HLL. Kept
     * separate from {@link #cardinality()} for testing purposes. {@link #type}
     * must be {@link HLLType#FULL}.
     *
     * @return the exact, unrounded cardinality given by the HLL algorithm
     */
    /*package, for testing*/ double fullProbabilisticAlgorithmCardinality() {
        final int m = this.m/*for performance*/;

        // compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the
        // 'j'th register value
        double sum = 0;
        int numberOfZeroes = 0/*"V" in the paper*/;
        final LongIterator iterator = probabilisticStorage.registerIterator();
        while(iterator.hasNext()) {
            final long register = iterator.next();

            sum += 1.0 / (1L << register);
            if(register == 0L) numberOfZeroes++;
        }

        // apply the estimate and correction to the indicator function
        final double estimator = alphaMSquared / sum;
        if((numberOfZeroes != 0) && (estimator < smallEstimatorCutoff)) {
            return HLLUtil.smallEstimator(m, numberOfZeroes);
        } else if(estimator <= largeEstimatorCutoff) {
            return estimator;
        } else {
            return HLLUtil.largeEstimator(log2m, regwidth, estimator);
        }
    }

    // ========================================================================
    // Clear
    /**
     * Clears the HLL. The HLL will have cardinality zero and will act as if no
     * elements have been added.<p/>
     *
     * NOTE: Unlike {@link #addRaw(long)}, <code>clear</code> does NOT handle
     * transitions between {@link HLLType}s - a probabilistic type will remain
     * probabilistic after being cleared.
     */
    public void clear() {
        switch(type) {
            case EMPTY:
                return /*do nothing*/;
            case EXPLICIT:
                explicitStorage.clear();
                return;
            case SPARSE:
                sparseProbabilisticStorage.clear();
                return;
            case FULL:
                probabilisticStorage.fill(0);
                return;
            default:
                throw new RuntimeException("Unsupported HLL type " + type);
        }
    }

    // ========================================================================
    // Union
    /**
     * Computes the union of HLLs and stores the result in this instance.
     *
     * @param other the other {@link HLL} instance to union into this one. This
     *        cannot be <code>null</code>.
     */
    public void union(final HLL other) {
        // TODO: verify HLLs are compatible
        final HLLType otherType = other.getType();

        if(type.equals(otherType)) {
            homogeneousUnion(other);
            return;
        } else {
            heterogenousUnion(other);
            return;
        }
    }

    // ------------------------------------------------------------------------
    // Union helpers
    /**
     * Computes the union of two HLLs, of different types, and stores the
     * result in this instance.
     *
     * @param other the other {@link HLL} instance to union into this one. This
     *        cannot be <code>null</code>.
     */
    /*package, for testing*/ void heterogenousUnion(final HLL other) {
        /*
         * The logic here is divided into two sections: unions with an EMPTY
         * HLL, and unions between EXPLICIT/SPARSE/FULL
         * HLL.
         *
         * Between those two sections, all possible heterogeneous unions are
         * covered. Should another type be added to HLLType whose unions
         * are not easily reduced (say, as EMPTY's are below) this may be more
         * easily implemented as Strategies. However, that is unnecessary as it
         * stands.
         */

        // ....................................................................
        // Union with an EMPTY
        if(HLLType.EMPTY.equals(type)) {
            // NOTE:  The union of empty with non-empty HLL is just a
            //        clone of the non-empty.

            switch(other.getType()) {
                case EXPLICIT: {
                    // src:  EXPLICIT
                    // dest: EMPTY

                    if(other.explicitStorage.size() <= explicitThreshold) {
                        type = HLLType.EXPLICIT;
                        explicitStorage = other.explicitStorage.clone();
                    } else {
                        if(!sparseOff) {
                            initializeStorage(HLLType.SPARSE);
                        } else {
                            initializeStorage(HLLType.FULL);
                        }
                        for(final long value : other.explicitStorage) {
                            addRaw(value);
                        }
                    }
                    return;
                }
                case SPARSE: {
                    // src:  SPARSE
                    // dest: EMPTY

                    if(!sparseOff) {
                        type = HLLType.SPARSE;
                        sparseProbabilisticStorage = other.sparseProbabilisticStorage.clone();
                    } else {
                        initializeStorage(HLLType.FULL);
                        for(final int registerIndex : other.sparseProbabilisticStorage.keySet()) {
                            final byte registerValue = other.sparseProbabilisticStorage.get(registerIndex);
                            probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                        }
                    }
                    return;
                }
                default/*case FULL*/: {
                    // src:  FULL
                    // dest: EMPTY

                    type = HLLType.FULL;
                    probabilisticStorage = other.probabilisticStorage.clone();
                    return;
                }
            }
        } else if (HLLType.EMPTY.equals(other.getType())) {
            // source is empty, so just return destination since it is unchanged
            return;
        } /* else -- both of the sets are not empty */

        // ....................................................................
        // NOTE: Since EMPTY is handled above, the HLLs are non-EMPTY below
        switch(type) {
            case EXPLICIT: {
                // src:  FULL/SPARSE
                // dest: EXPLICIT
                // "Storing into destination" cannot be done (since destination
                // is by definition of smaller capacity than source), so a clone
                // of source is made and values from destination are inserted
                // into that.

                // Determine source and destination storage.
                // NOTE:  destination storage may change through promotion if
                //        source is SPARSE.
                if(HLLType.SPARSE.equals(other.getType())) {
                    if(!sparseOff) {
                        type = HLLType.SPARSE;
                        sparseProbabilisticStorage = other.sparseProbabilisticStorage.clone();
                    } else {
                        initializeStorage(HLLType.FULL);
                        for(final int registerIndex : other.sparseProbabilisticStorage.keySet()) {
                            final byte registerValue = other.sparseProbabilisticStorage.get(registerIndex);
                            probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                        }
                    }
                } else /*source is HLLType.FULL*/ {
                    type = HLLType.FULL;
                    probabilisticStorage = other.probabilisticStorage.clone();
                }
                for(final long value : explicitStorage) {
                    addRaw(value);
                }
                explicitStorage = null;
                return;
            }
            case SPARSE: {
                if(HLLType.EXPLICIT.equals(other.getType())) {
                    // src:  EXPLICIT
                    // dest: SPARSE
                    // Add the raw values from the source to the destination.

                    for(final long value : other.explicitStorage) {
                        addRaw(value);
                    }
                    // NOTE:  addRaw will handle promotion cleanup
                } else /*source is HLLType.FULL*/ {
                    // src:  FULL
                    // dest: SPARSE
                    // "Storing into destination" cannot be done (since destination
                    // is by definition of smaller capacity than source), so a
                    // clone of source is made and registers from the destination
                    // are merged into the clone.

                    type = HLLType.FULL;
                    probabilisticStorage = other.probabilisticStorage.clone();
                    for(final int registerIndex : sparseProbabilisticStorage.keySet()) {
                        final byte registerValue = sparseProbabilisticStorage.get(registerIndex);
                        probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                    }
                    sparseProbabilisticStorage = null;
                }
                return;
            }
            default/*destination is HLLType.FULL*/: {
                if(HLLType.EXPLICIT.equals(other.getType())) {
                    // src:  EXPLICIT
                    // dest: FULL
                    // Add the raw values from the source to the destination.
                    // Promotion is not possible, so don't bother checking.

                    for(final long value : other.explicitStorage) {
                        addRaw(value);
                    }
                } else /*source is HLLType.SPARSE*/ {
                    // src:  SPARSE
                    // dest: FULL
                    // Merge the registers from the source into the destination.
                    // Promotion is not possible, so don't bother checking.

                    for(final int registerIndex : other.sparseProbabilisticStorage.keySet()) {
                        final byte registerValue = other.sparseProbabilisticStorage.get(registerIndex);
                        probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                    }
                }

            }
        }
    }

    /**
     * Computes the union of two HLLs of the same type, and stores the
     * result in this instance.
     *
     * @param other the other {@link HLL} instance to union into this one. This
     *        cannot be <code>null</code>.
     */
    private void homogeneousUnion(final HLL other) {
        switch(type) {
            case EMPTY:
                // union of empty and empty is empty
                return;
        case EXPLICIT:
            for(final long value : other.explicitStorage) {
                addRaw(value);
            }
            // NOTE:  #addRaw() will handle promotion, if necessary
            return;
        case SPARSE:
            for(final int registerIndex : other.sparseProbabilisticStorage.keySet()) {
                final byte registerValue = other.sparseProbabilisticStorage.get(registerIndex);
                final byte currentRegisterValue = sparseProbabilisticStorage.get(registerIndex);
                if(registerValue > currentRegisterValue) {
                    sparseProbabilisticStorage.put(registerIndex, registerValue);
                }
            }

            // promotion, if necessary
            if(sparseProbabilisticStorage.size() > sparseThreshold) {
                initializeStorage(HLLType.FULL);
                for(final int registerIndex : sparseProbabilisticStorage.keySet()) {
                    final byte registerValue = sparseProbabilisticStorage.get(registerIndex);
                    probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                }
                sparseProbabilisticStorage = null;
            }
            return;
        case FULL:
            for(int i=0; i<m; i++) {
                final long registerValue = other.probabilisticStorage.getRegister(i);
                probabilisticStorage.setMaxRegister(i, registerValue);
            }
            return;
        default:
                throw new RuntimeException("Unsupported HLL type " + type);
        }
    }

    // ========================================================================
    // Serialization
    /**
     * Serializes the HLL to an array of bytes in correspondence with the format
     * of the default schema version, {@link SerializationUtil#DEFAULT_SCHEMA_VERSION}.
     *
     * @return the array of bytes representing the HLL. This will never be
     *         <code>null</code> or empty.
     */
    public byte[] toBytes() {
        return toBytes(SerializationUtil.DEFAULT_SCHEMA_VERSION);
    }

    /**
     * Serializes the HLL to an array of bytes in correspondence with the format
     * of the specified schema version.
     *
     * @param  schemaVersion the schema version dictating the serialization format
     * @return the array of bytes representing the HLL. This will never be
     *         <code>null</code> or empty.
     */
    public byte[] toBytes(final ISchemaVersion schemaVersion) {
        final byte[] bytes;
        switch(type) {
            case EMPTY:
                bytes = new byte[schemaVersion.paddingBytes(type)];
                break;
            case EXPLICIT: {
                final IWordSerializer serializer =
                    schemaVersion.getSerializer(type, Long.SIZE, explicitStorage.size());

                final long[] values = explicitStorage.toLongArray();
                Arrays.sort(values);
                for(final long value : values) {
                    serializer.writeWord(value);
                }

                bytes = serializer.getBytes();
                break;
            }
            case SPARSE: {
                final IWordSerializer serializer =
                        schemaVersion.getSerializer(type, shortWordLength, sparseProbabilisticStorage.size());

                final int[] indices = sparseProbabilisticStorage.keySet().toIntArray();
                Arrays.sort(indices);
                for(final int registerIndex : indices) {
                    final long registerValue = sparseProbabilisticStorage.get(registerIndex);
                    // pack index and value into "short word"
                    final long shortWord = ((registerIndex << regwidth) | registerValue);
                    serializer.writeWord(shortWord);
                }

                bytes = serializer.getBytes();
                break;
            }
            case FULL: {
                final IWordSerializer serializer = schemaVersion.getSerializer(type, regwidth, m);
                probabilisticStorage.getRegisterContents(serializer);

                bytes = serializer.getBytes();
                break;
            }
            default:
                throw new RuntimeException("Unsupported HLL type " + type);
        }

        final IHLLMetadata metadata = new HLLMetadata(schemaVersion.schemaVersionNumber(),
                                                      type,
                                                      log2m,
                                                      regwidth,
                                                      (int)NumberUtil.log2(explicitThreshold),
                                                      explicitOff,
                                                      explicitAuto,
                                                      !sparseOff);
        schemaVersion.writeMetadata(bytes, metadata);

        return bytes;
    }

    /**
     * Deserializes the HLL (in {@link #toBytes(ISchemaVersion)} format) serialized
     * into <code>bytes</code>.<p/>
     *
     * @param  bytes the serialized bytes of new HLL
     * @return the deserialized HLL. This will never be <code>null</code>.
     *
     * @see #toBytes(ISchemaVersion)
     */
    public static HLL fromBytes(final byte[] bytes) {
        final ISchemaVersion schemaVersion = SerializationUtil.getSchemaVersion(bytes);
        final IHLLMetadata metadata = schemaVersion.readMetadata(bytes);

        final HLLType type = metadata.HLLType();
        final int regwidth = metadata.registerWidth();
        final int log2m = metadata.registerCountLog2();
        final boolean sparseon = metadata.sparseEnabled();

        final int expthresh;
        if(metadata.explicitAuto()) {
            expthresh = -1;
        } else if(metadata.explicitOff()) {
            expthresh = 0;
        } else {
            // NOTE: take into account that the postgres-compatible constructor
            //       subtracts one before taking a power of two.
            expthresh = metadata.log2ExplicitCutoff() + 1;
        }

        final HLL hll = new HLL(log2m, regwidth, expthresh, sparseon, type);

        // Short-circuit on empty, which needs no other deserialization.
        if(HLLType.EMPTY.equals(type)) {
            return hll;
        }

        final int wordLength;
        switch(type) {
            case EXPLICIT:
                wordLength = Long.SIZE;
                break;
            case SPARSE:
                wordLength = hll.shortWordLength;
                break;
            case FULL:
                wordLength = hll.regwidth;
                break;
            default:
                throw new RuntimeException("Unsupported HLL type " + type);
        }

        final IWordDeserializer deserializer =
                schemaVersion.getDeserializer(type, wordLength, bytes);
        switch(type) {
            case EXPLICIT:
                // NOTE:  This should not exceed expthresh and this will always
                //        be exactly the number of words that were encoded,
                //        because the word length is at least a byte wide.
                // SEE:   IWordDeserializer#totalWordCount()
                for(int i=0; i<deserializer.totalWordCount(); i++) {
                    hll.explicitStorage.add(deserializer.readWord());
                }
                break;
            case SPARSE:
                // NOTE:  If the shortWordLength were smaller than 8 bits
                //        (1 byte) there would be a possibility (because of
                //        padding arithmetic) of having one or more extra
                //        registers read. However, this is not relevant as the
                //        extra registers will be all zeroes, which are ignored
                //        in the sparse representation.
                for(int i=0; i<deserializer.totalWordCount(); i++) {
                    final long shortWord = deserializer.readWord();
                    final byte registerValue = (byte)(shortWord & hll.valueMask);
                    // Only set non-zero registers.
                    if (registerValue != 0) {
                        hll.sparseProbabilisticStorage.put((int)(shortWord >>> hll.regwidth), registerValue);
                    }
                }
                break;
            case FULL:
                // NOTE:  Iteration is done using m (register count) and NOT
                //        deserializer#totalWordCount() because regwidth may be
                //        less than 8 and as such the padding on the 'last' byte
                //        may be larger than regwidth, causing an extra register
                //        to be read.
                // SEE: IWordDeserializer#totalWordCount()
                for(long i=0; i<hll.m; i++) {
                    hll.probabilisticStorage.setRegister(i, deserializer.readWord());
                }
                break;
            default:
                throw new RuntimeException("Unsupported HLL type " + type);
        }

        return hll;
    }

    /**
     * Create a deep copy of this HLL.
     *
     * @see java.lang.Object#clone()
     */
    @Override
    public HLL clone() throws CloneNotSupportedException {
        // NOTE: Since the package-only constructor assumes both explicit and
        //       sparse are enabled, the easiest thing to do here is to re-derive
        //       the expthresh parameter and create a new HLL with the public
        //       constructor.
        // TODO: add a more sensible constructor to make this less obfuscated
        final int copyExpthresh;
        if(explicitAuto) {
            copyExpthresh = -1;
        } else if(explicitOff) {
            copyExpthresh = 0;
        } else {
            // explicitThreshold is defined as:
            //
            //      this.explicitThreshold = (1 << (expthresh - 1));
            //
            // Since explicitThreshold is a power of two and only has a single
            // bit set, finding the LSB is the same as finding the inverse
            copyExpthresh = BitUtil.leastSignificantBit(explicitThreshold) + 1;
        }
        final HLL copy = new HLL(log2m, regwidth, copyExpthresh, !sparseOff/*sparseOn*/, type);
        switch(type) {
            case EMPTY:
                // nothing to be done
                break;
            case EXPLICIT:
                copy.explicitStorage = this.explicitStorage.clone();
                break;
            case SPARSE:
                copy.sparseProbabilisticStorage = this.sparseProbabilisticStorage.clone();
                break;
            case FULL:
                copy.probabilisticStorage = this.probabilisticStorage.clone();
                break;
            default:
                throw new RuntimeException("Unsupported HLL type " + type);
        }
        return copy;
    }
}
TOP

Related Classes of net.agkn.hll.HLL

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.