// Load the spam percentile scores from spamScoresPath and build the docid
// lookup table; newDocids is indexed by a posting's docno further down.
// NOTE(review): presumably spamSortDocids reorders docids by spam score --
// confirm against DocumentUtility.
SpamPercentileScore spamScores = new SpamPercentileScore();
spamScores.initialize(spamScoresPath, fs);
int[] newDocids = DocumentUtility.spamSortDocids(spamScores);

// Upper bound for the term-id loop below.
int collectionSize = env.readCollectionTermCount();

// Single Posting instance that the postings reader fills in on every call.
Posting posting = new Posting();

// Declared here because the same variable is reused for the configuration
// file and for each of the signature files created in the loop below.
FSDataOutputStream out;

// Configuration parameters: document count, collectionSize, number of hash
// functions and bits per element.
BloomConfig bloomConfig = new BloomConfig((int) env.getDocumentCount(),
    collectionSize, nbHash, bitsPerElement);

// Deletes the output path (recursively) if it already exists.
fs.delete(new Path(outputPath), true);

// Serialize and write the configuration parameters. The stream is closed in
// a finally clause so that it is released even when the write fails.
out = fs.create(new Path(outputPath + "/" + BloomConfig.CONFIG_FILE));
try {
    bloomConfig.write(out);
} finally {
    out.close();
}
for(int i = 0; i <= collectionSize; i++) {
if(i % 100000 == 0) {
if(i != 0) {
out.close();
}
out = fs.create(new Path(outputPath + "/" + i));
}
try {
PostingsList pl = env.getPostingsList(env.getTermFromId(i));
PostingsReader reader = pl.getPostingsReader();
Signature filter = null;
//Decide which filter to use based on the configuration parameters
int df = pl.getDf();
if (df <= bloomConfig.getIdentityHashThreshold()) {
filter = new BloomFilterHash(df * bloomConfig.getBitsPerElement(),
bloomConfig.getHashCount());
} else {
filter = new BloomFilterIdentityHash(bloomConfig.getDocumentCount());
}
while (reader.nextPosting(posting)) {
filter.add(newDocids[posting.getDocno()]);
}
out.writeInt(i);
out.writeInt(df);
filter.write(out);