package it.unimi.dsi.mg4j.test;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.mg4j.index.DiskBasedIndex;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.Util;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.util.Properties;
import java.io.FileReader;
import java.io.IOException;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
/** Selects part of a stats using global frequency.
final public class SelectStats {
private final static Logger LOGGER = Util.getLogger( SelectStats.class );
private SelectStats() {}
/** A reasonable format for real numbers. */
private static final java.text.NumberFormat formatDouble = new java.text.DecimalFormat( "#,##0.00000" );
/** Formats a number.
* <P>This method formats a double separating thousands and printing just two fractional digits.
* @param d a number.
* @return a string containing a pretty print of the number.
public static String format( final double d ) {
final StringBuffer s = new StringBuffer();
return formatDouble.format( d, s, new java.text.FieldPosition( 0 ) ).toString();
public static void main( final String[] arg ) throws IOException, JSAPException, ConfigurationException {
SimpleJSAP jsap = new SimpleJSAP( SelectStats.class.getName(), "Prints or selects parts of a stat file using global counts.",
new Parameter[] {
new Switch( "print", 'p', "print", "Just print global occurrences." ),
new FlaggedOption( "globalFrequency", JSAP.DOUBLE_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'g', "global-frequency", "The global count divided by the sum of document lengths that will be used to choose words to dump." ),
new FlaggedOption( "quantumBitLength", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'q', "quantum-bit-length", "The quantum bit length that will be used to choose words to dump." ),
new FlaggedOption( "error", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'e', "error", "The error w.r.t. frequency (as a percentage) that will be used to choose words to dump." ),
new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The index basename." ),
new UnflaggedOption( "statFile", JSAP.STRING_PARSER, JSAP.REQUIRED, "The stat file to be scanned." )
JSAPResult jsapResult = jsap.parse( arg );
if ( jsap.messagePrinted() ) return;
final boolean print = jsapResult.getBoolean( "print" );
final String basename = jsapResult.getString( "basename" );
final String statFile = jsapResult.getString( "statFile" );
final int quantumBitLength = jsapResult.getInt( "quantumBitLength", 0 );
final double globalFrequency = jsapResult.getDouble( "globalFrequency", 0 );
final int error = jsapResult.getInt( "error", 1 );
final double lowGlobFreq = globalFrequency * ( 1 - error / 100.0 );
final double highGlobFreq = globalFrequency * ( 1 + error / 100.0 );
final int lowQbl= (int)Math.round(quantumBitLength * ( 1 - error / 100.0 ));
final int highQbl = (int)Math.round( quantumBitLength* ( 1 + error / 100.0 ) );
final Properties properties = new Properties( basename + DiskBasedIndex.PROPERTIES_EXTENSION );
final int numberOfTerms = properties.getInt( Index.PropertyKeys.TERMS );
final long numberOfoccurrences = properties.getLong( Index.PropertyKeys.OCCURRENCES );
final InputBitStream globCounts = new InputBitStream( basename + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
long gc[] = new long[ numberOfTerms ];
for( int t = 0; t < numberOfTerms; t++ ) gc[ t ] = globCounts.readLongGamma();
final MutableString line = new MutableString();
MutableString number;
final FastBufferedReader reader = new FastBufferedReader( new FileReader( statFile ) );
boolean dumping = false;
int f, q;
reader.readLine( line );
while( reader.readLine( line ) != null ) {
if ( line.charAt( 0 ) == '#' ) {
number = line.substring( 2 );
f = Integer.parseInt( number.delete( number.indexOf( ' ' ), number.length() ).toString() );
double freq = (double)gc[ f ] / numberOfoccurrences;
if ( print ) System.out.println( line + " " + format( freq ) );
else {
if ( quantumBitLength != 0 ) {
// We choose using the quantum bit length
number = line.substring( 2 );
number = number.substring( number.indexOf( ' ' ) + 1 );
q = Integer.parseInt( number.delete( number.indexOf( ' ' ), number.length() ).toString() );
dumping = q >= lowQbl && q <= highQbl;
else dumping = freq >= lowGlobFreq && freq <= highGlobFreq;
if ( dumping ) line.println( System.out );
else if ( ! print && dumping ) {
line.println( System.out );