package it.unimi.dsi.mg4j.test;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.Util;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import org.apache.log4j.Logger;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.UnflaggedOption;
/** Reads a sequence of documents represented as blank-separated
* sequences of words, where documents are separated by new-lines.
* Produces and prints <var>q</var> DNF queries (OR's of AND's)
* as follows: for every query, <var>k</var> documents are selected
* at random, and from each of them <var>h</var> words at most are
* selected. The query is a <var>k</var>-ary OR of the corresponding
* AND's.
*/
final public class ProduceDNFFromLines {
    private final static Logger LOGGER = Util.getLogger( ProduceDNFFromLines.class );

    /** This class only hosts {@link #main(String[])} and cannot be instantiated. */
    private ProduceDNFFromLines() {}

    /** Command-line entry point.
     *
     * <p>Parses the options, randomly picks the documents (line numbers) used by each query,
     * then scans standard input (UTF-8, one document per line, words separated by a single space)
     * and picks at random up to <code>wordsperdoc</code> words from each selected line.
     *
     * <p>Two forms are printed for every query:
     * <ul>
     * <li>on standard output, an OR over the selected documents of the AND of the words picked
     * from each document;
     * <li>on standard error, an AND over word positions of the OR of the words that the selected
     * documents have at that position.
     * </ul>
     *
     * <p>If the input ends before a selected line is reached, the corresponding clause is
     * printed empty, as <code>()</code>.
     *
     * @param arg the command-line arguments.
     * @throws IOException if reading from standard input fails.
     * @throws JSAPException if the command-line parser cannot be configured.
     */
    public static void main( final String[] arg ) throws IOException, JSAPException {
        final SimpleJSAP jsap = new SimpleJSAP( ProduceDNFFromLines.class.getName(), "Produces random DNF queries (ORs of ANDs) from documents read from standard input, one document per line.",
            new Parameter[] {
                new UnflaggedOption( "numberOfDocuments", JSAP.INTEGER_PARSER, JSAP.REQUIRED, "The number of documents." ),
                new FlaggedOption( "queries", JSAP.INTEGER_PARSER, "1", JSAP.NOT_REQUIRED, 'q', "queries", "The number of queries to be produced." ),
                new FlaggedOption( "docperquery", JSAP.INTEGER_PARSER, "2", JSAP.NOT_REQUIRED, 'd', "docperquery", "The number of documents per query." ),
                new FlaggedOption( "wordsperdoc", JSAP.INTEGER_PARSER, "2", JSAP.NOT_REQUIRED, 'w', "words", "The (maximum) number of words per document." ),
            });

        final JSAPResult jsapResult = jsap.parse( arg );
        if ( jsap.messagePrinted() ) return;

        final int numberOfDocuments = jsapResult.getInt( "numberOfDocuments" );
        final int queries = jsapResult.getInt( "queries" );
        final int docperquery = jsapResult.getInt( "docperquery" );
        final int wordsperdoc = jsapResult.getInt( "wordsperdoc" );

        if ( docperquery > numberOfDocuments ) {
            System.err.println( "There are not enough documents for the number of documents/query required" );
            System.exit( 1 );
        }

        // docs is a permutation of 0..numberOfDocuments-1 that keeps being partially reshuffled.
        final int docs[] = new int[ numberOfDocuments ];
        // docForQuery[ q ] holds the (sorted) line numbers of the documents chosen for query q.
        final int docForQuery[][] = new int[ queries ][ docperquery ];
        // query[ q ][ d ][ w ] is the w-th word picked from the d-th document of query q (null if none).
        final String query[][][] = new String[ queries ][ docperquery ][ wordsperdoc ];
        // coveredForQuery[ q ] counts how many documents of query q have already been read.
        final int coveredForQuery[] = new int[ queries ];
        // used[ n ] is true iff line n is needed by at least one query.
        final boolean[] used = new boolean[ numberOfDocuments ];
        int maxDoc = 0;

        for ( int i = 0; i < numberOfDocuments; i++ ) docs[ i ] = i;

        for ( int q = 0; q < queries; q++ ) {
            // Partial Fisher-Yates shuffle: the first docperquery entries become a random sample
            // of distinct documents.
            for ( int i = 0; i < docperquery; i++ ) {
                final int j = i + (int)( ( numberOfDocuments - i ) * Math.random() );
                final int t = docs[ i ]; docs[ i ] = docs[ j ]; docs[ j ] = t;
                docForQuery[ q ][ i ] = docs[ i ];
                used[ docs[ i ] ] = true;
                if ( docs[ i ] > maxDoc ) maxDoc = docs[ i ];
            }
            // Sorted so that the documents of a query are met in order during the sequential scan.
            Arrays.sort( docForQuery[ q ] );
        }

        int words[] = new int[ 1024 ];
        final FastBufferedReader reader = new FastBufferedReader( new InputStreamReader( System.in, "UTF-8" ) );
        int lineNumber = 0;
        // Number of queries that still miss at least one document.
        int numberOfPartialQueries = queries;

        final ProgressLogger pl = new ProgressLogger( LOGGER );
        pl.itemsName = "Klines";
        pl.expectedUpdates = maxDoc / 1000;
        pl.start( "Generating queries..." );

        final MutableString line = new MutableString();
        // The cheap checks come first, so no line is read once every query is complete, and
        // used[] is never indexed past numberOfDocuments (every selected line is below that bound).
        while( numberOfPartialQueries > 0 && lineNumber < numberOfDocuments && reader.readLine( line ) != null ) {
            if ( used[ lineNumber ] ) {
                for ( int q = 0; q < queries; q++ )
                    if ( coveredForQuery[ q ] < docperquery && docForQuery[ q ][ coveredForQuery[ q ] ] == lineNumber ) {
                        final String[] split = line.toString().split( " " );
                        final int nw = split.length;
                        words = IntArrays.ensureCapacity( words, nw + 1 );
                        for ( int i = 0; i < nw; i++ ) words[ i ] = i;
                        // Partial shuffle of word indices: picks min(wordsperdoc, nw) distinct positions.
                        final int picked = Math.min( wordsperdoc, nw );
                        for ( int i = 0; i < picked; i++ ) {
                            final int j = i + (int)( ( nw - i ) * Math.random() );
                            final int t = words[ i ]; words[ i ] = words[ j ]; words[ j ] = t;
                            query[ q ][ coveredForQuery[ q ] ][ i ] = split[ words[ i ] ];
                        }
                        coveredForQuery[ q ]++;
                        if ( coveredForQuery[ q ] == docperquery ) numberOfPartialQueries--;
                    }
            }
            lineNumber++;
            if ( lineNumber % 1000 == 0 ) pl.update();
        }

        pl.done();

        // Scratch strings reused for both output forms: indexed by document in the first pass
        // and by word position in the second, hence the max of the two sizes.
        final MutableString p[] = new MutableString[ Math.max( docperquery, wordsperdoc ) ];
        final MutableString s = new MutableString();
        for( int i = 0; i < p.length; i++ ) p[ i ] = new MutableString();

        // First form (standard output): OR over documents of the AND of each document's words.
        for ( int q = 0; q < queries; q++ ) {
            for( int d = 0; d < docperquery; d++ ) {
                // Words are stored contiguously from index 0; stop at the first empty slot.
                int last = 0;
                while( last < wordsperdoc && query[ q ][ d ][ last ] != null ) last++;
                p[ d ].replace( '(' ).append( query[ q ][ d ], 0, last, " AND " ).append( ')' );
            }
            System.out.println( s.length( 0 ).append( p, 0, docperquery, " OR " ) );
        }

        // Second form (standard error): AND over word positions of the OR, across documents,
        // of the words found at that position.
        final ArrayList<String> l = new ArrayList<String>();
        final String[] emptyArray = new String[ 0 ];
        for ( int q = 0; q < queries; q++ ) {
            for( int w = 0; w < wordsperdoc; w++ ) {
                l.clear();
                for( int d = 0; d < docperquery; d++ ) if ( query[ q ][ d ][ w ] != null ) l.add( query[ q ][ d ][ w ] );
                p[ w ].replace( '(' ).append( l.toArray( emptyArray ), " OR " ).append( ')' );
            }
            System.err.println( s.length( 0 ).append( p, 0, wordsperdoc, " AND " ) );
        }
    }
}