Package brickhouse.analytics.uniques

Source Code of brickhouse.analytics.uniques.SketchSet

package brickhouse.analytics.uniques;
/**
* Copyright 2012 Klout, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**/


import java.math.BigDecimal;
import java.math.BigInteger;
import java.math.RoundingMode;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.SortedMap;
import java.util.TreeMap;

import com.google.common.hash.HashCode;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

public class SketchSet implements ICountDistinct {
  static final int SIZEOF_LONG = 64;
 
    public  static int DEFAULT_MAX_ITEMS = 5000;
    private  int maxItems = DEFAULT_MAX_ITEMS;
  private TreeMap<Long,String> sortedMap;
  private static HashFunction HASH = Hashing.md5();
 
   
    public SketchSet() {
      sortedMap = new TreeMap<Long,String>();
    }
     
   
    public SketchSet(int max ) {
      this.maxItems = max;
      sortedMap = new TreeMap<Long,String>();
    }
   
    public void addHashItem( long hash, String str) {
      if(sortedMap.size() < maxItems) {
        sortedMap.put( hash, str);
      } else {
        Long hashLong = hash;
        if(! sortedMap.containsKey( hashLong)) {
          long maxHash = sortedMap.lastKey();
          if( hash< maxHash) {
            sortedMap.remove(maxHash);
            sortedMap.put( hashLong,str);
          }
        }
      }
    }
  /**
   *   for testing
   * @param hash
   */
  public void addHash( long hash) {
    addHashItem( hash, Long.toString( hash));
  }
 
  public void addItem( String str) {
    HashCode hc = HASH.hashUnencodedChars( str);
    this.addHashItem( hc.asLong(), str);
  }
 
  public List<String> getMinHashItems() {
    return new ArrayList(this.sortedMap.values());
  }
 
  public SortedMap<Long,String> getHashItemMap() {
    return this.sortedMap;
  }
 
  public List<Long> getMinHashes() {
     return new ArrayList( this.sortedMap.keySet());
  }
 
  public void clear() {
    this.sortedMap.clear();
  }
 
  public int getMaxItems() {
    return maxItems;
  }
 
  public long lastHash() {
    return sortedMap.lastKey();
  }
 
  public String lastItem() {
    return sortedMap.lastEntry().getValue();
  }
 
  public double estimateReach() {
    if(sortedMap.size() < maxItems) {
      return sortedMap.size();
    }
    long maxHash = sortedMap.lastKey();
    return EstimatedReach(maxHash, maxItems);
  }
 
  static public double EstimatedReach( String lastItem, int maxItems) {
    long maxHash = HASH.hashUnencodedChars(lastItem).asLong();
    return EstimatedReach( maxHash, maxItems);
  }
 
  static public double EstimatedReach( long maxHash, int maxItems) {
    BigDecimal maxHashShifted = new BigDecimal(BigInteger.valueOf( maxHash).add( BigInteger.valueOf( Long.MAX_VALUE)));
   
    BigDecimal bigMaxItems = new BigDecimal( maxItems*2).multiply( BigDecimal.valueOf( Long.MAX_VALUE));
    BigDecimal ratio = bigMaxItems.divide(maxHashShifted, RoundingMode.HALF_EVEN);
    return ratio.doubleValue();
  }
 
 
  public long calculateSimHash() {
    int[] sumTable = new int[ SIZEOF_LONG];
   
    Iterator<Long> hashes = getHashItemMap().keySet().iterator();
    while( hashes.hasNext() ) {
      long hash = hashes.next();
      long mask = 1l;
      for(int pos =0; pos < SIZEOF_LONG; ++pos ) {
        if( (hash & mask) != 0l) {
          sumTable[pos]++;
        } else {
          sumTable[pos]--;
        }
        mask <<=  1;
      }
    }
    long simHash = 0l;
    long mask = 1l;
    for(int pos=0; pos <SIZEOF_LONG; ++pos) {
      if( sumTable[pos] > 0) {
        simHash |= mask;
      }
      mask <<=1;
    }
    return simHash;
  }
 
 
 
  public void combine( SketchSet other) {
    for( Entry<Long,String> entry: other.sortedMap.entrySet() ) {
      addHashItem( entry.getKey(), entry.getValue());
    }
  }
 
}
TOP

Related Classes of brickhouse.analytics.uniques.SketchSet

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.