Package brickhouse.analytics.uniques

Source Code of brickhouse.analytics.uniques.SketchSetTest

package brickhouse.analytics.uniques;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.SortedMap;
import java.util.UUID;

import org.junit.Assert;
import org.junit.Test;

import brickhouse.udf.sketch.SetSimilarityUDF;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

public class SketchSetTest {

  @Test
  public void testSketchSet() {
    SketchSet ss = new SketchSet();
    ss.addHash(1);
    ss.addHash(2);
    ss.addHash(3);
   
    double card = ss.estimateReach();
    Assert.assertEquals( 3.0, card, 0.0);
  }
 
  @Test
  public void testDupSketchSet() {
    SketchSet ss = new SketchSet();
    ss.addHash(-11);
    ss.addHash(-11);
    ss.addHash(1);
    ss.addHash(2);
    ss.addHash(2);
    ss.addHash(3);
    ss.addHash(3);
   
    double card = ss.estimateReach();
    Assert.assertEquals( 4.0, card, 0.0);
  }
 
  @Test
  public void testDupSketchSetOver5000() {
    SketchSet ss = new SketchSet();
    int numHashes = (int) ( 5000 + Math.random()*1024*4);
    long minHash = Long.MAX_VALUE;
    for( int i=0; i<numHashes; ++i ) {
      double randHash = (Math.random()*((double)Long.MAX_VALUE)*2 ) - (double)Long.MAX_VALUE;
      if( randHash < minHash)
        minHash = (long)randHash;
      ss.addItem( "" + randHash);
    }
    double ratio = ss.estimateReach()/(double)numHashes;
    System.out.println(" Estimate reach = " + ss.estimateReach() + " size = "  + numHashes  + " ratio = " + ratio);
    Assert.assertTrue( ratio > 0.9 && ratio < 1.1);
   
   
    ss.addHash( minHash -1 );
    numHashes++;
    ratio = ss.estimateReach()/(double)numHashes;
    System.out.println(" Estimate reach = " + ss.estimateReach() + " size = "  + numHashes  + " ratio = " + ratio);
    Assert.assertTrue( ratio > 0.95 && ratio < 1.05);

    ss.addHash( minHash -1 );
    ratio = ss.estimateReach()/(double)numHashes;
    System.out.println(" Estimate reach = " + ss.estimateReach() + " size = "  + numHashes  + " ratio = " + ratio);
    Assert.assertTrue( ratio > 0.95 && ratio < 1.05);

    ss.addHash( minHash -2 );
    numHashes++;
    ratio = ss.estimateReach()/(double)numHashes;
    System.out.println(" Estimate reach = " + ss.estimateReach() + " size = "  + numHashes  + " ratio = " + ratio);
    Assert.assertTrue( ratio > 0.95 && ratio < 1.05);

    ss.addHash( minHash -2 );
    ratio = ss.estimateReach()/(double)numHashes;
    System.out.println(" Estimate reach = " + ss.estimateReach() + " size = "  + numHashes  + " ratio = " + ratio);
    Assert.assertTrue( ratio > 0.95 && ratio < 1.05);
   
   
    long lastHash = ss.lastHash();
    ss.addHash( lastHash - 1);
    numHashes++;
    ratio = ss.estimateReach()/(double)numHashes;
    System.out.println(" Estimate reach = " + ss.estimateReach() + " size = "  + numHashes  + " ratio = " + ratio);
    Assert.assertTrue( ratio > 0.95 && ratio < 1.05);
   
  }



  @Test
  public void testRandomHashes() {
    SketchSet ss = new SketchSet();
    int numHashes = 1024*1024;
    for( int i=0; i<numHashes; ++i ) {
      double randHash = Math.random()*((double)Long.MAX_VALUE);
      if(Math.random() < 0.5) {
        randHash = -1*randHash;
      }
      ss.addHash((long)randHash);
    }
   
    double card = ss.estimateReach();
    double tolerance = 0.05;
   
   
    int diff = (int) Math.abs( card - numHashes);
    double diffRatio = ((double)diff)/numHashes;
    System.out.println(" Estimated cardinality is " +card + " ; Expected " + numHashes + " ; Difference was " + diff + " ; diff ratio is " + diffRatio);
    Assert.assertTrue( diffRatio <= tolerance);
   
    System.out.println(" Estimated cardinality is " +card);
    Assert.assertEquals( numHashes, card, numHashes*tolerance);
  }

 
  ///@Test
  public void testManyRandomHashes() {
    double maxDiff = 0;
    double totDiff = 0;
    int numRuns =512;
    long now = System.currentTimeMillis();
    for(int j=0; j<numRuns; ++j) {
      SketchSet ss = new SketchSet();
      int numHashes = (int) (Math.random()*1024*514);
      for( int i=0; i<numHashes; ++i ) {
        double randHash = Math.random()*((double)Long.MAX_VALUE);
        if(Math.random() < 0.5) {
          randHash = -1*randHash;
        }
        ss.addHash((long)randHash);
      }

      double card = ss.estimateReach();
      double tolerance = 0.05;

      int diff = (int) Math.abs( card - numHashes);
      double diffRatio = ((double)diff)/numHashes;
      System.out.println(" J = " + j);
      System.out.println(" Estimated cardinality is " +card + " ; Expected " + numHashes + " ; Difference was " + diff + " ; diff ratio is " + diffRatio);
      Assert.assertTrue( diffRatio <= tolerance);
      if(diffRatio > maxDiff)
        maxDiff = diffRatio;

      totDiff+= diffRatio;
      System.out.println(" Estimated cardinality is " +card);
      Assert.assertEquals( numHashes, card, numHashes*tolerance);
    }
    long later = System.currentTimeMillis();
    int numSecs = (int) ((later -now)/1000.0);
    System.out.println(" Max Diff Ratio = " + maxDiff);
    double avgDiff = totDiff/(double)numRuns;
    System.out.println(" Avg Diff Ratio = " + avgDiff);
   
    System.out.print( numRuns + " took " + numSecs);
  }


  @Test
  public void testHashStrings() {
    SketchSet ss = new SketchSet(5000);
    int numHashes = 512*1024;
    for(int i=0; i<numHashes; ++i) {
      UUID randomUUID = UUID.randomUUID();
      ///System.out.println(" RandomUUID " + randomUUID.toString());
      ss.addItem( randomUUID.toString());
    }
   
    double card = ss.estimateReach();
    double tolerance = 0.05;
   
    int diff = (int) Math.abs( card - numHashes);
    double diffRatio = ((double)diff)/numHashes;
    System.out.println(" Estimated cardinality is " +card + " ; Expected " + numHashes + " ; Difference was " + diff + " ; diff ratio is " + diffRatio);
    Assert.assertTrue( diffRatio <= tolerance);
  }
 
  @Test
  public void testDistinctSets() {
   
    SketchSet ss = new SketchSet(5000);
    int numHashes1 = (int) ((double)(1024*512)*Math.random());
    System.out.println(" Number of hashes one = " + numHashes1);
    for(int i=0; i<numHashes1; ++i) {
      UUID randomUUID = UUID.randomUUID();
      ///System.out.println(" RandomUUID " + randomUUID.toString());
      ss.addItem( randomUUID.toString());
    }
   
    double card = ss.estimateReach();
    System.out.println(" Estimated Card 1 = " + card);
   
   
    SketchSet ss2 = new SketchSet(5000);
    int numHashes2 = (int) ((double)(1024*512)*Math.random());
    System.out.println(" Number of hashes two = " + numHashes2);
    for(int i=0; i<numHashes2; ++i) {
      UUID randomUUID = UUID.randomUUID();
      ///System.out.println(" RandomUUID " + randomUUID.toString());
      ss2.addItem( randomUUID.toString());
    }
   
    double card2 = ss2.estimateReach();
    System.out.println(" Estimated Card 2 = " + card2);
   
    ss.combine( ss2);
   
    double newCard = ss.estimateReach();
    System.out.println(" Sum of hashes  = " + ( card + card2));
    System.out.println(" Estimated Combined Card = " + newCard);
   
   
    System.out.println(" New Card = " + newCard);
   
  }
 
  @Test
  public void testOverlapSets() {
   
    SketchSet ss = new SketchSet(5000);
    int numHashes1 = (int) ((double)(1024*512)*Math.random());
    System.out.println(" Number of hashes one = " + numHashes1);
    for(int i=0; i<numHashes1; ++i) {
      UUID randomUUID = UUID.randomUUID();
      ///System.out.println(" RandomUUID " + randomUUID.toString());
      ss.addItem( randomUUID.toString());
    }
   
    double card = ss.estimateReach();
    System.out.println(" Estimated Card 1 = " + card);
   
   
    SketchSet ss2 = new SketchSet(5000);
    int numHashes2 = (int) ((double)(1024*512)*Math.random());
    System.out.println(" Number of hashes two = " + numHashes2);
    for(int i=0; i<numHashes2; ++i) {
      UUID randomUUID = UUID.randomUUID();
      ///System.out.println(" RandomUUID " + randomUUID.toString());
      ss2.addItem( randomUUID.toString());
    }
   
    double card2 = ss2.estimateReach();
    System.out.println(" Estimated Card 2 = " + card2);
   
    SketchSet ss3 = new SketchSet(5000);
    int numHashes3 = (int) ((double)(1024*512)*Math.random());
    System.out.println(" Number of hashes three = " + numHashes3);
    for(int i=0; i<numHashes3; ++i) {
      UUID randomUUID = UUID.randomUUID();
      ///System.out.println(" RandomUUID " + randomUUID.toString());
      ss3.addItem( randomUUID.toString());
    }
   
    double card3 = ss3.estimateReach();
    System.out.println(" Estimated Card 3= " + card3);
   
    ss.combine( ss2);
   
   
    ss.combine( ss2);
   
    double cardCombine1 = ss.estimateReach();
    System.out.println(" Combine 1 = " + cardCombine1);
    System.out.println(" 1 +2 " + ( numHashes1 + numHashes2 ));
   
    ss3.combine( ss2);
    double cardCombine3 = ss3.estimateReach();
    System.out.println(" Combine 3 = " + cardCombine3);
    System.out.println(" 2  + 3 " + ( numHashes3 + numHashes2 ));
   
    SketchSet overLap = new SketchSet();
    overLap.combine( ss);
    overLap.combine( ss3 );
   
    double cardOverlap = overLap.estimateReach();
    System.out.println(" Overlap = " + cardOverlap);
    System.out.println(" All hashes = " + ( numHashes1 + numHashes2 + numHashes3));
   
  }
 

  @Test
  public void testObama() throws IOException {
    SketchSet ss = new SketchSet();
    SketchSet ss2 = new SketchSet();
   
    HashFunction md5 = Hashing.md5();
   
    System.out.println(" Directory is " + System.getProperty("user.dir"));
    FileInputStream fs = new FileInputStream("src/test/resources/obama.txt");
    int cnt =0;
    BufferedReader reader = new BufferedReader( new InputStreamReader(fs ));
    String line;
    while( (line = reader.readLine() ) != null) {
      ss.addItem( line);
      ss2.addHashItemmd5.hashUnencodedChars( line).asLong(), line );
      cnt++;
    }
   
    System.out.println(" Estimated Reach = " + ss.estimateReach() + " count = " + cnt);
    double diff = cnt - ss.estimateReach();
    double pctDiff = Math.abs( diff/(double)cnt);
    System.out.println( " Difference is " + pctDiff);
   
    Assert.assertTrue( pctDiff < 0.03);
   
    System.out.println(" Estimated Reach = " + ss2.estimateReach() + " count = " + cnt);
     diff = cnt - ss2.estimateReach();
     pctDiff = Math.abs( diff/(double)cnt);
    System.out.println( " Difference is " + pctDiff);
   
    Assert.assertTrue( pctDiff < 0.03);
   
    SortedMap<Long,String> hashItemMap = ss.getHashItemMap();
    System.out.println( " First Key is " +  hashItemMap.firstKey() );
    System.out.println( " Last Key is " +  hashItemMap.lastKey() );
  }
 
 
  @Test
  public void testGetMinHashes() {
    SketchSet ss = new SketchSet();

    int numHashes =  5100 + (int)(Math.random()*15000);
    for(int i=0; i<numHashes; ++i) {
      UUID randomUUID = UUID.randomUUID();
      ///System.out.println(" RandomUUID " + randomUUID.toString());
      ss.addItem( randomUUID.toString());
    }
   
    List<Long> md5Hashes = ss.getMinHashes();
    long last = Long.MIN_VALUE;
    for( long md5 : md5Hashes) {
      Assert.assertTrue( md5 > last);
      last = md5;
    }
    double estReach = SketchSet.EstimatedReach( last, ss.getMaxItems());
    double ratio = estReach/(double)numHashes;
    System.out.println( " Estimated Reach = " + estReach + " num Hashes  = " + numHashes + " ; Ratio = " + ratio);
    Assert.assertTrue( ratio < 1.05 && ratio > 0.95);
   
  }
 
  @Test
  public void testSetSimilarity() {
    int numHashes = 200000;
    SketchSet a = new SketchSet();
    SketchSet b = new SketchSet();
    SketchSet c = new SketchSet();
   
    for(int i=0; i<numHashes; ++i) {
      UUID randomUUID = UUID.randomUUID();
      ///System.out.println(" RandomUUID " + randomUUID.toString());
      a.addItem( randomUUID.toString());
      randomUUID = UUID.randomUUID();
      b.addItem( randomUUID.toString());
      randomUUID = UUID.randomUUID();
      c.addItem( randomUUID.toString());
    }
    SetSimilarityUDF simUDF = new SetSimilarityUDF();
   
    double same = simUDF.evaluate(a.getMinHashItems(), a.getMinHashItems());
    System.out.println( "Similarity with self = " + same);
    Assert.assertEquals( 1.0, same, 0);
   
    double diff = simUDF.evaluate(a.getMinHashItems(), b.getMinHashItems());
    System.out.println( "Similarity with different  = " + diff);
    Assert.assertEquals( 0, diff , 0.03); /// Might not be quite zero
   
    a.combine(c);
    b.combine(c);
   
    double mixed = simUDF.evaluate( a.getMinHashItems(), b.getMinHashItems());
    System.out.println("Similarity with mixed = " +mixed);
    //// Should be about a third
    Assert.assertEquals( 0.333333333, mixed, 0.03);
   
  }



}
TOP

Related Classes of brickhouse.analytics.uniques.SketchSetTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.