sumOfSquares = sumOfSquares.add(bigSize.pow(2));
sum = sum.add(bigSize);
numberOfShards++;
if(numberOfShards % 1000 == 0) {
GenomeLoc boundingRegion = getBoundingRegion(filePointer,genomeLocParser);
logger.info(String.format("PROGRESS: Calculating mean and variance: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size));
}
}
// Print out the mean and stddev, where stddev = sqrt(variance) and variance = (sum(x^2) - (1/N)*sum(x)^2)/N
long mean = sum.divide(BigInteger.valueOf(numberOfShards)).longValue();
long stddev = (long)(Math.sqrt(sumOfSquares.subtract(sum.pow(2).divide(BigInteger.valueOf(numberOfShards))).divide(BigInteger.valueOf(numberOfShards)).doubleValue()));
logger.info(String.format("Number of shards: %d; mean uncompressed size = %d; stddev uncompressed size = %d%n",numberOfShards,mean,stddev));
// Crank through the shards again, this time reporting on the shards significantly larger than the mean.
long threshold = mean + stddev*5;
logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize"));
out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n");
sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet,IntervalMergingRule.ALL);
while(sharder.hasNext()) {
FilePointer filePointer = sharder.next();
// Bounding region.
GenomeLoc boundingRegion = getBoundingRegion(filePointer,genomeLocParser);
// Size of the file pointer.
final long size = filePointer.size();
numberOfShards++;
if(filePointer.size() <= threshold) {
if(numberOfShards % 1000 == 0)
logger.info(String.format("PROGRESS: Searching for large shards: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size));
continue;
}
out.printf("%s\t%d\t%d\t%d%n",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size);
}
return 0;
}