Package brickhouse.udf.bloom

Source Code of brickhouse.udf.bloom.BloomFactory

package brickhouse.udf.bloom;
/**
* Copyright 2012 Klout, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**/


import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Filter;
import org.apache.hadoop.util.hash.Hash;
import org.apache.log4j.Logger;

/**
*  Utility class for construction and serialization of BloomFilters ...
*
*/
public class BloomFactory {
  private static final Logger LOG = Logger.getLogger( BloomFactory.class);
  private static Map<String,Filter> localBloomMap = new HashMap<String,Filter>();

  public static final int DEFAULT_NUM_ELEMENTS = 10000000;
  public static final double DEFAULT_FALSE_POS_PROB = 0.005;
  public static final int DEFAULT_HASH_TYPE = Hash.JENKINS_HASH;
  public static final int NUMBER_OF_BLOOMS = 5;

 
  public static Filter NewBloomInstance() {
    return NewBloomInstance( DEFAULT_NUM_ELEMENTS, DEFAULT_FALSE_POS_PROB);
  }

  static Filter NewVesselBloom() {
    return new BloomFilter();
  }
 
  public static Filter NewBloomInstance( int expectedNumberOfElements, double falsePositiveProbability ) {
    return NewBloomInstance(Math.ceil(-(Math.log(falsePositiveProbability) / Math.log(2))) / Math.log(2), // c = k / ln(2)
                   expectedNumberOfElements,
                   (int)Math.ceil(-(Math.log(falsePositiveProbability) / Math.log(2)))); // k = ceil(-log_2(false prob.))
   
  }
 
  public static Filter NewBloomInstance( double c, int n, int k) {
    LOG.info("Creating new Bloom filter C = " + c + " N =  " + n  + " K = " + k );
    BloomFilter dbf = new BloomFilter((int)Math.ceil(c*n),
              k , DEFAULT_HASH_TYPE);
    return dbf;
  }
  /**
   *   Generic method for getting BloomFilter from a string.
   *   First, the local map is checked for a bloom loaded from
   *   the distributed cache. Next the bloom is attempted to be
   *    parsed from UUencoded format.
   * @param name
   * @return
   */
  public static Filter GetBloomFilter( String str) {
    Filter bloom = GetNamedBloomFilter( str);
    if( bloom == null) {
      try {
        bloom = ReadBloomFromString( str);
        return bloom;
      } catch (IOException e) {
        LOG.error(" Unable to get bloom for string " + str);
        return null;
      }
    } else {
      return bloom;
    }
  }
 
  public static Filter GetNamedBloomFilter( String name) {
    return localBloomMap.get( name);
  }
 
  public static void PutNamedBloomFilter( String name, Filter bloom) {
    localBloomMap.put( name,bloom);
  }
 
 
  public static Filter ReadBloomFromStream( InputStream stream) throws IOException {
    /// Need to UUDecode first,
    /// TODO - read bytes directly when hive handles byte arrays better
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    byte[] bufferArr = new byte[4096];
    int len=0;
    while((len=stream.read(bufferArr, 0, 4096)) > 0) {
      buffer.write( bufferArr, 0, len);
    }
    if( buffer.size() ==0) {
      return BloomFactory.NewBloomInstance();
    }
    return ReadBloomFromString( new String(buffer.toByteArray()));
  }
 
  public static void WriteBloomToStream( OutputStream stream, Filter bloom) throws IOException {
      stream.write( WriteBloomToString(bloom).getBytes() );
      stream.flush();
  }
 
  public static Filter ReadBloomFromString( String str) throws IOException {
    if( str != null ) {
      Filter filter = NewVesselBloom();
      byte[] decoded = Base64.decodeBase64( str.getBytes());
      DataInputStream dataInput = new DataInputStream( new ByteArrayInputStream(decoded));
   
      filter.readFields(dataInput);
      return filter;
    } else {
      return NewBloomInstance();
    }
  }

  public static String WriteBloomToString( Filter bloom) throws IOException {
    if( bloom != null ) {
      ByteArrayOutputStream buffer = new ByteArrayOutputStream();
      bloom.write( new DataOutputStream(buffer));
      byte[] encodedBloom = Base64.encodeBase64( buffer.toByteArray());
      return new String(encodedBloom);
    } else {
      return null;
    }
  }


}
TOP

Related Classes of brickhouse.udf.bloom.BloomFactory

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.