// Calculate our vector size and optimal K value based on approximations
int vectorSize = getOptimalBloomFilterSize(numMembers, falsePosRate);
int nbHash = getOptimalK(numMembers, vectorSize);
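
// The two helpers above are not part of this listing. A minimal sketch of
// them, assuming the standard Bloom filter approximations: the optimal bit
// vector size is m = -n * ln(p) / (ln 2)^2 and the optimal hash function
// count is k = (m / n) * ln 2, for n expected members and false positive
// rate p. These definitions would live elsewhere in the enclosing class,
// and their exact signatures are assumptions.
// public static int getOptimalBloomFilterSize(int numMembers,
//         float falsePosRate) {
//     return (int) (-numMembers * Math.log(falsePosRate)
//             / Math.pow(Math.log(2), 2));
// }
//
// public static int getOptimalK(float numMembers, float vectorSize) {
//     return (int) Math.round(vectorSize / numMembers * Math.log(2));
// }
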
// Create a new Bloom filter using Hadoop's murmur hash implementation
BloomFilter filter = new BloomFilter(vectorSize, nbHash, Hash.MURMUR_HASH);
System.out.println("Training Bloom filter of size " + vectorSize
        + " with " + nbHash + " hash functions, " + numMembers
        + " approximate number of records, and " + falsePosRate
        + " false positive rate");
String line = null;
int numRecords = 0;
// Open each file in the input directory for reading
for (FileStatus status : fs.listStatus(inputFile)) {
    BufferedReader rdr;
    // If the file is gzipped, wrap it in a GZIPInputStream
    if (status.getPath().getName().endsWith(".gz")) {
        rdr = new BufferedReader(new InputStreamReader(
                new GZIPInputStream(fs.open(status.getPath()))));
    } else {
        rdr = new BufferedReader(new InputStreamReader(
                fs.open(status.getPath())));
    }
System.out.println("Reading " + status.getPath());
while ((line = rdr.readLine()) != null) {
filter.add(new Key(line.getBytes()));
++numRecords;
}
rdr.close();
}
System.out.println("Trained Bloom filter with " + numRecords
+ " entries.");
System.out.println("Serializing Bloom filter to HDFS at " + bfFile);
FSDataOutputStream strm = fs.create(bfFile);
filter.write(strm);
strm.flush();
strm.close();
System.out.println("Done training Bloom filter.");