Package com.martiansoftware.jsap

Examples of com.martiansoftware.jsap.JSAPResult


      new FlaggedOption( "skipBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE ), JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "skip-buffer-size", "The size of the internal temporary buffer used while creating an index with skips." ),
      new UnflaggedOption( "inputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the global index." ),
      new UnflaggedOption( "outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the local indices." )
    });
   
    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;
    String inputBasename = jsapResult.getString( "inputBasename" );
    String outputBasename = jsapResult.getString( "outputBasename" );
    String strategyFilename = jsapResult.getString( "strategy" );
    DocumentalPartitioningStrategy strategy = null;

    if ( jsapResult.userSpecified( "uniformStrategy" ) ) {
      strategy = DocumentalStrategies.uniform( jsapResult.getInt( "uniformStrategy" ), Index.getInstance( inputBasename ).numberOfDocuments );
      BinIO.storeObject( strategy, strategyFilename = outputBasename + IndexCluster.STRATEGY_DEFAULT_EXTENSION );
    }
    else if ( strategyFilename != null ) strategy = (DocumentalPartitioningStrategy)BinIO.loadObject( strategyFilename );
    else throw new IllegalArgumentException( "You must specify a partitioning strategy" );
   
    final boolean skips = ! jsapResult.getBoolean( "noSkips" );
    final boolean interleaved = jsapResult.getBoolean( "interleaved" );
    if ( ! skips && ( jsapResult.userSpecified( "quantum" ) || jsapResult.userSpecified( "height" ) ) ) throw new IllegalArgumentException( "You specified quantum or height, but you also disabled skips." );

    new PartitionDocumentally( inputBasename,
        outputBasename,
        strategy,
        strategyFilename,
        jsapResult.getInt( "bloom" ),
        jsapResult.getInt( "bufferSize" ),
        CompressionFlags.valueOf( jsapResult.getStringArray( "comp" ), CompressionFlags.DEFAULT_STANDARD_INDEX ),
        interleaved,
        skips,
        jsapResult.getInt( "quantum" ),
        jsapResult.getInt( "height" ),
        jsapResult.getInt( "skipBufferSize" ),
        jsapResult.getLong( "logInterval" ) ).run();
  }     
View Full Code Here


        new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ),
        new FlaggedOption( "titles", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 't', "titles", "The resulting document titles." ),
        new FlaggedOption( "uris", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'u', "uris", "The resulting document URIs." ),
    });

    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;

    DocumentSequence documentSequence = Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER );

    if ( ! jsapResult.userSpecified( "uris" ) && ! jsapResult.userSpecified( "titles" ) )
      throw new IllegalArgumentException( "You specify either a title or a URI output file" );
   
    Util.ensureLog4JIsConfigured();

    final DocumentIterator documentIterator = documentSequence.iterator();

    Document document;
    FastBufferedOutputStream uriStream = null, titleStream = null;
   
    if ( jsapResult.userSpecified( "uris" ) ) uriStream = new FastBufferedOutputStream( new FileOutputStream( jsapResult.getString( "uris" ) ) );
    if ( jsapResult.userSpecified( "titles" ) ) titleStream = new FastBufferedOutputStream( new FileOutputStream( jsapResult.getString( "titles" ) ) );
   
    MutableString s = new MutableString();

    ProgressLogger progressLogger = new ProgressLogger( LOGGER, jsapResult.getLong( "logInterval" ), "documents" );
    if ( documentSequence instanceof DocumentCollection ) progressLogger.expectedUpdates = ((DocumentCollection)documentSequence).size();
    progressLogger.start( "Scanning..." );
   
    while( ( document = documentIterator.nextDocument() ) != null ) {
      if ( uriStream != null ) {
View Full Code Here

        new Parameter[] {
          new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The filename of the collection." ),
        }
    );
   
    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;
   
    optimize( jsapResult.getString( "basename" ) );
  }
View Full Code Here

          new FlaggedOption( "where", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'w', "where", "The the WHERE part (without the WHERE keyword) of the SQL query generating the collection." )       

        }
    );
   
    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;

    // We run the query to get meta-information about the columns.
    @SuppressWarnings("unused") Class<?> jdbcDriver = Class.forName( jsapResult.getString( "jdbcDriver" ) );
    Connection connection = DriverManager.getConnection( jsapResult.getString( "dburi" ) );
    Statement s = connection.createStatement();
    ResultSet rs = s.executeQuery( "SELECT " + jsapResult.getString( "select" ) );
    ResultSetMetaData metaData = rs.getMetaData();   
    String[] column = new String[ metaData.getColumnCount() - 2 ];
    for( int i = 3; i <= metaData.getColumnCount(); i++ ) column[ i - 3 ] = metaData.getColumnName( i );
    rs.close();
    s.close();
    connection.close();
   
    final DocumentFactory[] factory = new DocumentFactory[ column.length ];
    final Class<?>[] factoryClass = jsapResult.getClassArray( "factory" );
    final String[] property = jsapResult.getStringArray( "property" );
    for( int i = 0; i < factory.length; i++ ) {
      factory[ i ] = PropertyBasedDocumentFactory.getInstance( factoryClass[ Math.min( i, factoryClass.length - 1 ) ], property );
      if ( factory[ i ].numberOfFields() > 1 && ! jsapResult.userSpecified( "fieldName" ) ) throw new IllegalArgumentException( "For factories with more than one field you must specify the name of each field of the composed factory" );
    }

    if ( jsapResult.userSpecified"fieldName" ) ) column = jsapResult.getStringArray( "fieldName" );
   
    BinIO.storeObject( new JdbcDocumentCollection(
                jsapResult.getString( "dburi" ),
                jsapResult.getString( "jdbcDriver" ),
                jsapResult.getString( "select" ),
                jsapResult.getString( "idSpec" ),
                jsapResult.getString( "where" ),
                CompositeDocumentFactory.getFactory( factory, column )
              ), jsapResult.getString( "collection" ) );
  }
View Full Code Here

          new UnflaggedOption( "fileName", JSAP.STRING_PARSER, JSAP.REQUIRED, "The filename of the source CSV file." ),
          new UnflaggedOption( "column", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.GREEDY, "Columns names that will be indexed." ),
        }
    );
   
    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;
   
    final int titleColumn = jsapResult.getInt( "titleColumn" );
    final String collection = jsapResult.getString( "collection" );
    final String fileName = jsapResult.getString( "fileName" );
    final String separator = jsapResult.getString( "separator" ).equals( "\\t" ) ? "\t" : jsapResult.getString( "separator" );
    final String[] column = jsapResult.getStringArray( "column" );
   
    final DocumentFactory[] factory = new DocumentFactory[ column.length ];
    for( int i = 0; i < factory.length; i++ )
      factory[ i ] = PropertyBasedDocumentFactory.getInstance( jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ) );
   
    BinIO.storeObject( new CSVDocumentCollection( fileName, separator, column, titleColumn, CompositeDocumentFactory.getFactory( factory, column ) ), collection );
  }
View Full Code Here

        new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ),
        new FlaggedOption( "tempDir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'T', "temp-dir", "A directory for all temporary batch files." ),
        new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the resulting index." )
    });

    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;

    if ( ( jsapResult.userSpecified( "builderClass" ) || jsapResult.userSpecified( "exact" ) ) && ! jsapResult.userSpecified( "buildCollection" ) )  throw new IllegalArgumentException( "To specify options about the collection building process, you must specify a basename first." );
    if ( jsapResult.userSpecified( "sequence" ) && jsapResult.userSpecified( "objectSequence" ) ) throw new IllegalArgumentException( "You cannot specify both a serialised and an parseable-object sequence" );
   
    final DocumentSequence documentSequence = jsapResult.userSpecified( "objectSequence" ) ? (DocumentSequence)jsapResult.getObject( "objectSequence" ) : Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER );
    final DocumentFactory factory = documentSequence.factory();

    final int[] indexedField = Scan.parseFieldNames( jsapResult.getStringArray( "indexedField" ), factory, jsapResult.getBoolean( "allFields" ) );
    final VirtualDocumentResolver[] virtualDocumentResolver = Scan.parseVirtualDocumentResolver( jsapResult.getStringArray( "virtualDocumentResolver" ), indexedField, factory );
    final int[] virtualDocumentGap = Scan.parseVirtualDocumentGap( jsapResult.getStringArray( "virtualDocumentGap" ), indexedField, factory );

    final TermProcessor termProcessor = jsapResult.getBoolean( "downcase" ) ? DowncaseTermProcessor.getInstance() :
      ObjectParser.fromSpec( jsapResult.getString( "termProcessor" ), TermProcessor.class, MG4JClassParser.PACKAGE, new String[] { "getInstance" } );

    final boolean skips = ! jsapResult.getBoolean( "noSkips" );
    final boolean interleaved = jsapResult.getBoolean( "interleaved" );
    if ( ! skips && ( jsapResult.userSpecified( "quantum" ) || jsapResult.userSpecified( "height" ) ) ) throw new IllegalArgumentException( "You specified quantum or height, but you also disabled skips." );

    DocumentCollectionBuilder builder = null;
    if ( jsapResult.userSpecified( "buildCollection" ) ) {
      final Class<? extends DocumentCollectionBuilder> builderClass = jsapResult.getClass( "builderClass" );
      builder = builderClass != null ? builderClass.getConstructor( String.class, DocumentFactory.class, boolean.class ).newInstance(
          jsapResult.getString( "buildCollection" ),
          documentSequence.factory().numberOfFields() == indexedField.length ? documentSequence.factory().copy() : new SubDocumentFactory( documentSequence.factory().copy(), indexedField ),
          Boolean.valueOf( jsapResult.getBoolean( "exact" ) ) ) : null;
    }

    final IndexBuilder indexBuilder = new IndexBuilder( jsapResult.getString( "basename" ), documentSequence )
    .termProcessor( termProcessor )
    .builder( builder )
    .scanBufferSize( jsapResult.getInt( "scanBufferSize" ) )
    .skipBufferSize( jsapResult.getInt( "skipBufferSize" ) )
    .pasteBufferSize( jsapResult.getInt( "pasteBufferSize" ) )
    .combineBufferSize( jsapResult.getInt( "combineBufferSize" ) )
    .documentsPerBatch( jsapResult.getInt( "batchSize" ) )
    .maxTerms( jsapResult.getInt( "maxTerms" ) )
    .keepBatches( jsapResult.getBoolean( "keepBatches" ) )
    .termMapClass( jsapResult.getClass( "termMap" ) )
    .indexedFields( indexedField )
    .skips( skips )
    .interleaved( interleaved )
    .quantum( jsapResult.getInt( "quantum" ) )
    .height( jsapResult.getInt( "height" ) )
    .logInterval( jsapResult.getLong( "logInterval" ) )
    .batchDirName( jsapResult.getString( "tempDir" ) );
   
    for( int i = 0; i < virtualDocumentResolver.length; i++ ) if ( virtualDocumentResolver[ i ] != null ) indexBuilder.virtualDocumentResolvers.put( i, virtualDocumentResolver[ i ] );
    for( int i = 0; i < virtualDocumentGap.length; i++ ) indexBuilder.virtualDocumentGaps.put( i, virtualDocumentGap[ i ] );
   
    if ( jsapResult.userSpecified( "comp" ) ) indexBuilder.standardWriterFlags( CompressionFlags.valueOf( jsapResult.getStringArray( "comp" ), CompressionFlags.DEFAULT_STANDARD_INDEX ) );
    if ( jsapResult.userSpecified( "compPayload" ) ) indexBuilder.payloadWriterFlags( CompressionFlags.valueOf( jsapResult.getStringArray( "compPayload" ), CompressionFlags.DEFAULT_PAYLOAD_INDEX ) );
    if ( jsapResult.userSpecified( "renumber" ) ) indexBuilder.mapFile( jsapResult.getString( "renumber" ) );
   
    indexBuilder.run();
  }
View Full Code Here

        new FlaggedOption( "error", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'e', "error", "The error w.r.t. frequency (as a percentage) that will be used to choose words to dump." ),
        new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The index basename." ),
        new UnflaggedOption( "statFile", JSAP.STRING_PARSER, JSAP.REQUIRED, "The stat file to be scanned." )
    });

    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;

    final boolean print = jsapResult.getBoolean( "print" );
    final String basename = jsapResult.getString( "basename" );
    final String statFile = jsapResult.getString( "statFile" );
    final int quantumBitLength = jsapResult.getInt( "quantumBitLength", 0 );
    final double globalFrequency = jsapResult.getDouble( "globalFrequency", 0 );
    final int error = jsapResult.getInt( "error", 1 );
    final double lowGlobFreq = globalFrequency * ( 1 - error / 100.0 );
    final double highGlobFreq = globalFrequency * ( 1 + error / 100.0 );
    final int lowQbl= (int)Math.round(quantumBitLength * ( 1 - error / 100.0 ));
    final int highQbl = (int)Math.round( quantumBitLength* ( 1 + error / 100.0 ) );

View Full Code Here

          new FlaggedOption( "termFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "offline", "Read terms from this file (without loading them into core memory) instead of standard input." ),
          new FlaggedOption( "uniqueUris", JSAP.INTSIZE_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'U', "unique-uris", "Force URIs to be unique by adding random garbage at the end of duplicates; the argument is an upper bound for the number of URIs that will be read, and will be used to create a Bloom filter." ),
          new UnflaggedOption( "resolver", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the resolver." )
    });
   
    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;
   
    final int bufferSize = jsapResult.getInt( "bufferSize" );
    final String resolverName = jsapResult.getString( "resolver" );
    //final Class<?> tableClass = jsapResult.getClass( "class" );
    final boolean iso = jsapResult.getBoolean( "iso" );
    String termFile = jsapResult.getString( "termFile" );
   
    BloomFilter filter = null;
    final boolean uniqueURIs = jsapResult.userSpecified( "uniqueUris" );
    if ( uniqueURIs ) filter = new BloomFilter( jsapResult.getInt( "uniqueUris" ) );
   
    final Collection<? extends CharSequence> collection;
    if ( termFile == null ) {
      ArrayList<MutableString> termList = new ArrayList<MutableString>();
      final ProgressLogger pl = new ProgressLogger();
      pl.itemsName = "URIs";
      final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( System.in, "UTF-8" ), bufferSize ), pl );
     
      pl.start( "Reading URIs..." );
      MutableString uri;
      while( termIterator.hasNext() ) {
        uri = termIterator.next();
        if ( uniqueURIs ) makeUnique( filter, uri );
        termList.add( uri.copy() );
      }
      pl.done();
     
      collection = termList;
    }
    else {
      if ( uniqueURIs ) {
        // Create temporary file with unique URIs
        final ProgressLogger pl = new ProgressLogger();
        pl.itemsName = "URIs";
        pl.start( "Copying URIs..." );
        final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( new FileInputStream( termFile ) ), bufferSize ), pl );
        File temp = File.createTempFile( URLMPHVirtualDocumentResolver.class.getName(), ".uniqueuris" );
        temp.deleteOnExit();
        termFile = temp.toString();
        final FastBufferedOutputStream outputStream = new FastBufferedOutputStream( new FileOutputStream( termFile ), bufferSize );
        MutableString uri;
        while( termIterator.hasNext() ) {
          uri = termIterator.next();
          makeUnique( filter, uri );
          uri.writeUTF8( outputStream );
          outputStream.write( '\n' );
        }
        pl.done();
        outputStream.close();
      }
      collection = new FileLinesCollection( termFile, "UTF-8" );
    }
    LOGGER.debug( "Building function..." );
    final int width = jsapResult.getInt( "width" );
    if ( jsapResult.getBoolean( "sorted" ) ) BinIO.storeObject( new URLMPHVirtualDocumentResolver( new ShiftAddXorSignedStringMap( collection.iterator(), new TwoStepsLcpMonotoneMinimalPerfectHashFunction<CharSequence>( collection, iso ? TransformationStrategies.prefixFreeIso() : TransformationStrategies.prefixFreeUtf16() ), width ) ), resolverName );
    else BinIO.storeObject( new URLMPHVirtualDocumentResolver( new ShiftAddXorSignedStringMap( collection.iterator(), new MWHCFunction<CharSequence>( collection, iso ? TransformationStrategies.iso() : TransformationStrategies.utf16() ), width ) ), resolverName );
    LOGGER.debug( " done." );
    }
View Full Code Here

    SimpleJSAP jsap = new SimpleJSAP( ComputeNumBitsPositions.class.getName(), "Scans and prints to standard output metadata of a collection. All line terminators in the metadata will be substituted with spaces.",
      new Parameter[] {
        new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the index." ),
    });

    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;
    final String basename = jsapResult.getString( "basename" );

    // Just to check that the index is of the right type
    final BitStreamHPIndex index = (BitStreamHPIndex)Index.getInstance( basename, false, false );
   
    final InputBitStream ibs = new InputBitStream( basename + DiskBasedIndex.INDEX_EXTENSION );
View Full Code Here

          new Switch( "noSkip", JSAP.NO_SHORTFLAG, "no-skip", "Skip \"all-skips\" check." ),
          new Switch( "noComp", JSAP.NO_SHORTFLAG, "no-comp", "Skip composite iterator check." ),
          new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the index." )
      });
   
    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;
   
    DocumentSequence documentSequence = it.unimi.dsi.mg4j.tool.Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER );
   
    final DocumentFactory factory = documentSequence.factory();
    final boolean stem = jsapResult.getBoolean( "stem" );
    final boolean termLists = jsapResult.getBoolean( "termLists" );
    final int[] indexedField = it.unimi.dsi.mg4j.tool.Scan.parseFieldNames( jsapResult.getStringArray( "indexedField" ), factory, jsapResult.getBoolean( "allFields" )  );
   
    LOGGER.debug( "Parsed indexed field: " + IntArrayList.wrap( indexedField ) );
   
    final String basename = jsapResult.getString( "basename" );
    final String permutationFile = jsapResult.getString( "renumber" );

    final boolean isVirtual = jsapResult.getBoolean( "virtual" );

    int i, t = 0;

    final ProgressLogger pl = new ProgressLogger( LOGGER, jsapResult.getLong( "logInterval" ), "ints" );
    final Index[] index = stem ? new Index[ indexedField.length ] : new Index[ 1 ];
    final int numberOfTerms[] = new int[ indexedField.length ];
    final ObjectArrayList<MutableString>[] terms = new ObjectArrayList[ indexedField.length ];
    final IndexReader[] indexReader = new IndexReader[ index.length ];
    final InputBitStream[] frequencies = new InputBitStream[ index.length ];
    final int[][] count = new int[ index.length ][];
    final int[] permutation = permutationFile != null ? BinIO.loadInts( permutationFile ) : null;
    final int[][] occ = new int[ index.length ][];
    final int[][] wordInPos = new int[ index.length ][];
    final Int2IntMap[] termsInDoc = new Int2IntOpenHashMap[ index.length ];
    int totalTerms = 0;
   
    boolean allBitStreamIndices = true;
   
    for( i = 0; i < index.length; i++ ) {
      final String basenameField = basename + (stem ? "-" + factory.fieldName( indexedField[ i ] ) : "" );
      index[ i ] = Index.getInstance( basenameField );
      if ( ! ( index[ i ] instanceof BitStreamIndex ) ) allBitStreamIndices = false;
     
      if ( termLists ) {
        terms[ i ] = new ObjectArrayList<MutableString>( new FileLinesCollection( basenameField + DiskBasedIndex.TERMS_EXTENSION, "UTF-8" ).allLines() );
        numberOfTerms[ i ] = terms[ i ].size();
      }
      else numberOfTerms[ i ] = index[ i ].numberOfTerms;
      totalTerms += numberOfTerms[ i ];
     
      // This will be matched with the number of occurrences per document
      count[ i ] = new int[ index[ i ].numberOfDocuments ];

      occ[ i ] = index[ i ].maxCount > 0 ? new int[ index[ i ].maxCount ] : IntArrays.EMPTY_ARRAY;
      wordInPos[ i ] = new int[ Math.max( 0, index[ i ].properties.getInt( Index.PropertyKeys.MAXDOCSIZE ) ) ];
      indexReader[ i ] = index[ i ].getReader();
     
      if ( new File( basenameField + DiskBasedIndex.FREQUENCIES_EXTENSION ).exists() ) frequencies[ i ] = new InputBitStream( basenameField + DiskBasedIndex.FREQUENCIES_EXTENSION );
      termsInDoc[ i ] = new Int2IntOpenHashMap();
    }


    int currDoc = 0,
    // Term position in the current document.
    pos = 0, f = 0, p;

    pl.itemsName = "lists";
    pl.expectedUpdates = totalTerms;
   
    int indexFrequency = -1;
   
    // Sequential scan
    if ( !jsapResult.getBoolean( "noSeq" ) ) {
      try {
        for ( i = 0; i < index.length; i++ ) {
          int numberOfPostings = 0;
          pl.expectedUpdates = numberOfTerms[ i ];
          pl.start( "Verifying sequentially index " + index[ i ] + "..." );

          if ( allBitStreamIndices ) {
            for ( t = 0; t < numberOfTerms[ i ]; t++ ) {
              pl.update();
              IndexIterator indexIterator = indexReader[ i ].nextIterator();
              indexFrequency = indexIterator.frequency();
              numberOfPostings += indexFrequency;
              if ( frequencies[ i ] != null && indexFrequency != ( f = frequencies[ i ].readGamma() ) ) {
                System.err.println( "Error in frequency for term " + t + ": expected " + f + " documents, found " + indexFrequency );
                return;
              }

              while ( indexFrequency-- != 0 ) {
                p = indexIterator.nextDocument();
                if (index[i].hasCounts) count[i][p] += indexIterator.count();
                if (index[i].hasPositions) indexIterator.positionArray(); // Just to force reading in high-performance indices
              }
              if ( indexIterator.nextDocument() != -1 ) throw new AssertionError( "nextDocument() is not -1 after exhaustive iteration" );
            }
           
            // Check document sizes
            if ( ! isVirtual && ( (BitStreamIndex) index[ i ] ).sizes != null && index[ i ].hasCounts )
              for ( p = 0; p < index[ i ].numberOfDocuments; p++ )
                if ( index[ i ].sizes.getInt( p ) != count[ i ][ p ] )
                  System.err.println( "Document " + p + " has size " + ( (BitStreamIndex) index[ i ] ).sizes.getInt( p ) + " but " + count[ i ][ p ] + " occurrences have been stored." );
           
          }
          else { // Non-bitstream indices
            for (t = 0; t < numberOfTerms[ i ]; t++) {
              pl.update();
              IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
              indexFrequency = indexIterator.frequency();
              numberOfPostings += indexFrequency;
              if (frequencies[i] != null && indexFrequency != (f = frequencies[i].readGamma())) {
                System.err.println("Error in frequency for term " + t
                    + ": expected " + f + " documents, found "
                    + indexFrequency);
                return;
              }
             
              int prevp = -1;
              while (indexFrequency-- != 0) {
                p = indexIterator.nextDocument();
                if ( prevp >= p ) throw new AssertionError( "previous pointer: " + prevp + "; current pointer: " + p );
                prevp = p;
                if (index[i].hasCounts) count[i][p] += indexIterator.count();
              }
            }
          }
          pl.done();
         
          if ( ! isVirtual && numberOfPostings != index[ i ].numberOfPostings ) System.err.println( "Index declares " + index[ i ].numberOfPostings + " postings, but we found " + numberOfPostings );
          long numberOfOccurrences = 0;
          if ( index[ i ].hasCounts ) {
            for ( p = 0; p < index[ i ].numberOfDocuments; p++ ) numberOfOccurrences += count[ i ][ p ];
            if ( numberOfOccurrences != index[ i ].numberOfOccurrences ) System.err.println( "Index declares " + index[ i ].numberOfOccurrences + " occurrences, but we found " + numberOfOccurrences );
          }
        }
      } catch ( Exception e ) {
        System.err.println( "Exception while scanning sequentially term " + t + " of index " + index[ i ] );
        System.err.println( "Term frequency was " + f + " and position " + ( f - indexFrequency - 1 ) );
        throw e;
      }
    }
 
    IntArrayList l = new IntArrayList();
    ObjectArrayList<int[]> positions = new ObjectArrayList<int[]>();
   
    if ( ! jsapResult.getBoolean( "noSkip" ) ) {
      int start = 0, end = 0, result;
      try {
        for (i = 0; i < index.length; i++) {
         
          pl.expectedUpdates = numberOfTerms[ i ];
          pl.start("Verifying all skips in " + index[i] + "...");

          for (t = 0; t < numberOfTerms[ i ]; t++) {
            l.clear();
            positions.clear();
            IndexIterator documents = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
            int d;
            while( ( d = documents.nextDocument() ) != -1 ) {
              l.add( d );
              if ( index[ i ].hasPositions ) positions.add( ArrayUtils.subarray( documents.positionArray(), 0, documents.count() ) );
            }
           
            for( start = 0; start < l.size(); start++ ) {
              for( end = start + 1; end < l.size(); end++ ) {
                IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
               
                result = indexIterator.skipTo( l.getInt( start ) );
                if ( indexIterator.document() != l.getInt( start ) || result != l.getInt( start ) ) throw new AssertionError( "Trying to skip to document " + l.getInt( start ) + " (term " + t + ") moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );
                result = indexIterator.skipTo( l.getInt( end ) );
                if ( indexIterator.document() != l.getInt( end ) || result != l.getInt( end ) ) throw new AssertionError( "Trying to skip to document " + l.getInt( end ) + " (term " + t + ") after a skip to " + start + " moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );
               
                if ( index[ i ].hasPositions ) {
                  // This catches wrong state reconstruction after skips.
                  indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
                  indexIterator.skipTo( l.getInt( start ) );
                  if ( indexIterator.document() != l.getInt( start ) ) throw new AssertionError(indexIterator.document() + " != " + l.getInt( start ) );
                  if ( indexIterator.count() != positions.get( start ).length ) throw new AssertionError(indexIterator.count() + " != " + positions.get( start ).length );
                  if ( ! Arrays.equals( positions.get( start ), ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) )
                     ) throw new AssertionError(Arrays.toString( positions.get( start ) ) + "!=" + Arrays.toString( ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) ) );
                  indexIterator.skipTo( l.getInt( end ) );
                  if ( indexIterator.document() != l.getInt( end )  ) throw new AssertionError(indexIterator.document() + " != " + l.getInt( end ) );
                  if ( indexIterator.count() != positions.get( end ).length ) throw new AssertionError(indexIterator.count() + " != " + positions.get( end ).length );
                  if ( ! Arrays.equals( positions.get( end ), ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) )
                     ) throw new AssertionError(Arrays.toString( positions.get( end ) ) + "!=" + Arrays.toString( ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) ) );
                }
               
              }
             
              IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
             
              result = indexIterator.skipTo( l.getInt( start ) );
              if ( indexIterator.document() != l.getInt( start ) || result != l.getInt( start ) ) throw new AssertionError("Trying to skip to document " + l.getInt( start ) + " (term " + t + ") moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );
              result = indexIterator.skipTo( Integer.MAX_VALUE );
              if ( indexIterator.hasNext() || result != Integer.MAX_VALUE ) throw new AssertionError("Trying to skip beyond end of list (term " + t + ") after a skip to " + start + " returned " + result + " (hasNext()=" + indexIterator.hasNext() + ")" );
             
             
            }
            pl.update();
          }
          pl.done();
        }
      }
      catch( Throwable e  ) {
        System.err.println( "Exception during all-skip test (index=" + index[ i ] + ", term=" + t + ", start=" + start + ", end=" + end + ")" );
        throw e;
      }
     }
   

    if ( ! jsapResult.getBoolean( "noComp" ) ) {
      IndexReader additionalReader;
      IntLinkedOpenHashSet s0 = new IntLinkedOpenHashSet();
      IntOpenHashSet s1 = new IntOpenHashSet();
      IntAVLTreeSet s2 = new IntAVLTreeSet();
      IntIterator it;
      IndexIterator indexIterator, additionalIterator;
      it.unimi.dsi.mg4j.search.DocumentIterator documentIterator;
      int u = 0;
     
      try {
        for (i = 0; i < index.length; i++) {
          pl.expectedUpdates = numberOfTerms[ i ];
          pl.start("Verifying composite iterators in " + index[i] + "...");
          additionalReader = index[ i ].getReader();
         
          for (t = 0; t < numberOfTerms[ i ]; t++) {
            for (u = 0; u < numberOfTerms[ i ]; u++) {
              s0.clear();
              s1.clear();
              // TODO: in case we have positions, we should check them, too
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t ), s0 );
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( u ) ) : indexReader[ i ].documents( u ), s1 );
              s0.retainAll( s1 );
              indexIterator =  termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
              additionalIterator = termLists ? additionalReader.documents( terms[ i ].get( u ) ) : additionalReader.documents( u );
              it = s0.iterator();
              documentIterator = AndDocumentIterator.getInstance( indexIterator, additionalIterator );
              for( int j = s0.size(); j-- != 0; ) if ( it.nextInt() != documentIterator.nextDocument() ) throw new AssertionError();
              if ( documentIterator.hasNext() ) throw new AssertionError();

              s2.clear();
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t ), s2 );
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( u ) ) : indexReader[ i ].documents( u ), s2 );

              indexIterator =  termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
              additionalIterator = termLists ? additionalReader.documents( terms[ i ].get( u ) ) : additionalReader.documents( u );

              it = s2.iterator();
              documentIterator = OrDocumentIterator.getInstance( indexIterator, additionalIterator );
              for( int j = s2.size(); j-- != 0; ) if ( it.nextInt() != documentIterator.nextDocument() ) throw new AssertionError();
              if ( documentIterator.hasNext() ) throw new AssertionError();
           
            pl.update();
          }
          pl.done();
          additionalReader.close();
        }
      }
      catch( Throwable e  ) {
        System.err.println( "Exception during composite iterator test (index=" + index[ i ] + ", first term=" + t + ", second term =" + u + ")" );
        throw e;
     
    }
   
    if ( ! isVirtual && jsapResult.getBoolean( "random" ) ) {
     
      // Random access scan
      pl.expectedUpdates = index[ 0 ].numberOfDocuments;
      pl.itemsName = "documents";
      pl.start( "Verifying random access..." );
View Full Code Here

TOP

Related Classes of com.martiansoftware.jsap.JSAPResult

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.