Package it.unimi.dsi.mg4j.test

Source Code of it.unimi.dsi.mg4j.test.ProduceDNFFromLines

package it.unimi.dsi.mg4j.test;

import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.Util;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;

import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.log4j.Logger;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.UnflaggedOption;

/** Reads a sequence of documents represented as blank-separated
* sequences of words, where documents are separated by new-lines.
* Produces and prints <var>q</var> DNF queries (OR's of AND's)
* as follows: for every query, <var>k</var> documents are selected
* at random, and from each of them <var>h</var> words at most are
* selected. The query is a <var>k</var>-ary OR of the corresponding
* AND's.
*/

final public class ProduceDNFFromLines {
  private final static Logger LOGGER = Util.getLogger( ProduceDNFFromLines.class );
 
  private ProduceDNFFromLines() {}

  public static void main( final String[] arg ) throws IOException, JSAPException {

    SimpleJSAP jsap = new SimpleJSAP( ProduceDNFFromLines.class.getName(), "Prints or selects parts of a stat file using global counts.",
      new Parameter[] {
        new UnflaggedOption( "numberOfDocuments", JSAP.INTEGER_PARSER, JSAP.REQUIRED, "The number of documents." ),
        new FlaggedOption( "queries", JSAP.INTEGER_PARSER, "1", JSAP.NOT_REQUIRED, 'q', "queries", "The number of queries to be produced." ),
        new FlaggedOption( "docperquery", JSAP.INTEGER_PARSER, "2", JSAP.NOT_REQUIRED, 'd', "docperquery", "The number of documents per query." ),
        new FlaggedOption( "wordsperdoc", JSAP.INTEGER_PARSER, "2", JSAP.NOT_REQUIRED, 'w', "words", "The (maximum) number of words per document." ),
     
    });

    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;

    final int numberOfDocuments = jsapResult.getInt( "numberOfDocuments" );
    final int queries = jsapResult.getInt( "queries" );
    final int docperquery = jsapResult.getInt( "docperquery" );
    final int wordsperdoc = jsapResult.getInt( "wordsperdoc" );
   
    if ( docperquery > numberOfDocuments ) {
      System.err.println( "There are not enough documents for the number of documents/query required" );
      System.exit( 1 );
    }
   
    int i, j, q, t;

    final int docs[] = new int[ numberOfDocuments ];
    final int docForQuery[][] = new int[ queries ][ docperquery ];
    final String query[][][] = new String[ queries ][ docperquery ][ wordsperdoc ];

    final int coveredForQuery[] = new int [ queries ];
    int maxDoc = 0;
    final boolean[] used = new boolean[ numberOfDocuments ];
    for ( i = 0; i < numberOfDocuments; i++ ) docs[ i ] = i;
    for ( q = 0; q < queries; q++ ) {
      for ( i = 0; i < docperquery; i++ ) {
        j = i + (int)( ( numberOfDocuments - i ) * Math.random() );
        t = docs[ i ]; docs[ i ] = docs[ j ]; docs[ j ] = t;
        docForQuery[ q ][ i ] = docs[ i ];
        used[ docs[ i ] ] = true;
        if ( docs[ i ] > maxDoc ) maxDoc = docs[ i ];
      }
      Arrays.sort( docForQuery[ q ] );
    }
   
    //for ( q = 0; q < queries; q++ )  System.out.println( "Query " + q + ": " + new IntArrayList( docForQuery[ q ] ) );
     
   
    String split[];
    int words[] = new int[ 1024 ];
    final FastBufferedReader reader = new FastBufferedReader( new InputStreamReader( System.in, "UTF-8" ) );
   
    int lineNumber = 0;
    int numberOfPartialQueries = queries;
    ProgressLogger pl = new ProgressLogger( LOGGER );
    pl.itemsName = "Klines";
    pl.expectedUpdates = maxDoc / 1000;
    pl.start( "Generating queries..." );
    MutableString line = new MutableString();
    while( reader.readLine( line ) != null && numberOfPartialQueries > 0 ) {
      if ( used[ lineNumber ] ) {
        for ( q = 0; q < queries; q++ )
          if ( coveredForQuery[ q ] < docperquery && docForQuery[ q ][ coveredForQuery[ q ] ] == lineNumber ) {
            split = line.toString().split( " " );
            int nw = split.length;
            words = IntArrays.ensureCapacity( words, nw + 1 );
            for ( i = 0; i < nw; i++ ) words[ i ] = i;
            for ( i = 0; i < Math.min( wordsperdoc, nw ); i++ ) {
              j = i + (int)( ( nw - i ) * Math.random() );
              t = words[ i ]; words[ i ] = words[ j ]; words[ j ] = t;
              query[ q ][ coveredForQuery[ q ] ][ i ] = split[ words[ i ] ];
            }
            coveredForQuery[ q ]++;
            if ( coveredForQuery[ q ] == docperquery ) numberOfPartialQueries--;
          }
      }
      lineNumber++;
      if ( lineNumber % 1000 == 0 ) pl.update();
    }
    pl.done();

    MutableString p[] = new MutableString[ Math.max( queries, wordsperdoc ) ], s = new MutableString();
    for( i = 0; i < p.length; i++ ) p[ i ] = new MutableString();

    for ( q = 0; q < queries; q++ ) {
      for( int d = 0; d < wordsperdoc; d++ ) {
        int last = 0;
        while( last < wordsperdoc && query[ q ][ d ][ last ] != null ) last++;
        p[ d ].replace( '(' ).append( query[ q ][ d ], 0, last, " AND " ).append( ')' );
      }
      System.out.println( s.length( 0 ).append( p, 0, queries, " OR " ) );
    }

    ArrayList<String> l = new ArrayList<String>();
    final String[] emptyArray = new String[ 0 ];
    for ( q = 0; q < queries; q++ ) {
      for( int w = 0; w < wordsperdoc; w++ ) {
        l.clear();
        for( int d = 0; d < wordsperdoc; d++ ) if ( query[ q ][ d ][ w ] != null ) l.add( query[ q ][ d ][ w ] );
        p[ w ].replace( '(' ).append( l.toArray( emptyArray ), " OR " ).append( ')' );
      }
      System.err.println( s.length( 0 ).append( p, 0, wordsperdoc, " AND " ) );
    }

  }
}
TOP

Related Classes of it.unimi.dsi.mg4j.test.ProduceDNFFromLines

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.