Source Code of it.unimi.dsi.mg4j.tool.PartitionDocumentally

package it.unimi.dsi.mg4j.tool;


/*     
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2006-2010 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */


import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.mg4j.index.BitStreamIndex;
import it.unimi.dsi.mg4j.index.CachingOutputBitStream;
import it.unimi.dsi.mg4j.index.CompressionFlags;
import it.unimi.dsi.mg4j.index.DiskBasedIndex;
import it.unimi.dsi.mg4j.index.BitStreamHPIndexWriter;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.index.IndexIterator;
import it.unimi.dsi.mg4j.index.IndexReader;
import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;
import it.unimi.dsi.mg4j.index.IndexWriter;
import it.unimi.dsi.mg4j.index.SkipBitStreamIndexWriter;
import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
import it.unimi.dsi.mg4j.index.cluster.ContiguousDocumentalStrategy;
import it.unimi.dsi.mg4j.index.cluster.DocumentalCluster;
import it.unimi.dsi.mg4j.index.cluster.DocumentalConcatenatedCluster;
import it.unimi.dsi.mg4j.index.cluster.DocumentalMergedCluster;
import it.unimi.dsi.mg4j.index.cluster.DocumentalPartitioningStrategy;
import it.unimi.dsi.mg4j.index.cluster.DocumentalStrategies;
import it.unimi.dsi.mg4j.index.cluster.IndexCluster;
import it.unimi.dsi.mg4j.index.payload.Payload;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.Util;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.BloomFilter;
import it.unimi.dsi.util.ImmutableExternalPrefixMap;
import it.unimi.dsi.util.PrefixMap;
import it.unimi.dsi.util.Properties;
import it.unimi.dsi.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.util.StringMap;


import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URISyntaxException;
import java.util.Map;


import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.ConfigurationMap;
import org.apache.log4j.Logger;


import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;




/** Partitions an index documentally.
 *
 * <p>A global index is partitioned documentally by providing a {@link DocumentalPartitioningStrategy}
 * that specifies a destination local index for each document, and a local document pointer. The global index
 * is scanned, and the postings are partitioned among the local indices using the provided strategy. For instance,
 * a {@link ContiguousDocumentalStrategy} divides an index into blocks of contiguous documents.
 * 
 * <p>Since each local index contains a (proper) subset of the original set of documents, it contains in general a (proper)
 * subset of the terms in the global index. Thus, the local term numbers and the global term numbers will not in general coincide.
 * As a result, when a set of local indices is accessed transparently as a single index
 * using a {@link it.unimi.dsi.mg4j.index.cluster.DocumentalCluster}, 
 * a call to {@link it.unimi.dsi.mg4j.index.Index#documents(int)} will throw an {@link java.lang.UnsupportedOperationException},
 * because there is no way to map the global term numbers to local term numbers.
 * 
 * <p>On the other hand, a call to {@link it.unimi.dsi.mg4j.index.Index#documents(CharSequence)} will be passed each local index to
 * build a global iterator. To speed up this phase for not-so-frequent terms, when partitioning an index you can require
 * the construction of {@linkplain BloomFilter Bloom filters} that will be used to try to avoid
 * inquiring indices that do not contain a term. The precision of the filters is settable.
 *
 * <p>The property file will use a {@link it.unimi.dsi.mg4j.index.cluster.DocumentalMergedCluster} unless you provide
 * a {@link ContiguousDocumentalStrategy}, in which case a 
 * {@link it.unimi.dsi.mg4j.index.cluster.DocumentalConcatenatedCluster} will be used instead. Note that there might
 * be other cases in which the latter is appropriate, in which case you can manually edit the property file.
 * 
 * <p><strong>Important</strong>: this class just partitions the index. No auxiliary files (most notably, {@linkplain StringMap term maps} 
 * or {@linkplain PrefixMap prefix maps}) will be generated. Please refer to a {@link StringMap} implementation (e.g.,
 * {@link ShiftAddXorSignedStringMap} or {@link ImmutableExternalPrefixMap}).
 * 
 * <p><strong>Warning</strong>: variable quanta are not supported by this class, as it is impossible to predict accurately
 * the number of bits used for positions when partitioning documentally. If you want to use variable quanta, use
 * simple interleaved indices without skips as an intermediate step, and pass them through {@link Combine}.
 * 
 * <h2>Write-once output and distributed index partitioning</h2>
 * 
 * Please see {@link it.unimi.dsi.mg4j.tool.PartitionLexically}&mdash;the same comments apply.
 * 
 * @author Alessandro Arrabito
 * @author Sebastiano Vigna
 * 
 * @since 1.0.1
 */


public class PartitionDocumentally {
  private final static Logger LOGGER = Util.getLogger( PartitionDocumentally.class );


  /**  The default buffer size for all involved indices. */
  public final static int DEFAULT_BUFFER_SIZE = 1024 * 1024;
  
  /** The number of local indices. */
  private final int numIndices;
  /** The output basenames. */
  private final String outputBasename;
  /** The array of local output basenames. */
  private final String[] localBasename;
  /** The input basename. */
  private final String inputBasename;
  /** The properties of the input index. */
  private final Properties inputProperties;
  /** The size of I/O buffers. */
  private final int bufferSize;
  /** The filename of the strategy used to partition the index. */
  private final String strategyFilename;
  /** The strategy used to perform the partitioning. */
  private final DocumentalPartitioningStrategy strategy;
  /** The additional local properties of each local index. */
  private final Properties[] strategyProperties;
  /** The logging interval. */
  private final long logInterval;
  /** The global index to be partitioned. */
  private final BitStreamIndex globalIndex;
  /** A reader on {@link #globalIndex}. */
  private final IndexReader indexReader;
  /** A reader for the terms of the global index. */
  private final FastBufferedReader terms;
  /** An index writer for each local index. */
  private final IndexWriter[] indexWriter;
  /** Whether each {@link #indexWriter} has counts. */
  private final boolean haveCounts;
  /** Whether each {@link #indexWriter} has positions. */
  private final boolean havePositions;
  /** Whether each {@link #indexWriter} has payloads. */
  private final boolean havePayloads;
  /** A bit output stream for global counts of each local index. */
  private final OutputBitStream[] localGlobCounts;
  /** A bit output stream for the frequencies of each local index. */
  private final OutputBitStream[] localFrequencies;
  /** A print writer for the terms of each local index. */
  private final PrintWriter[] localTerms;
  /** The maximum size of a document in each local index. */
  private final int[] maxDocSize;
  /** The maximum number of positions in each local index. */
  private final int[] maxDocPos;
  /** The number of terms in each local index. */
  private final int[] numTerms;
  /** The number of postings in each local index. */
  private final long[] numPostings;
  /** The number of occurrences in each local index. */
  private final long[] numOccurrences;
  /** The global count for each local index. */
  private final long[] globCount;
  /** The required precision for Bloom filters (0 means no filter). */
  private final int bloomFilterPrecision;


  
  
  
  public PartitionDocumentally( final String inputBasename, 
      final String outputBasename,
      final DocumentalPartitioningStrategy strategy,
      final String strategyFilename,
      final int bloomFilterPrecision,
      final int bufferSize,
      final Map<Component,Coding> writerFlags,
      boolean interleaved,
      boolean skips,
      final int quantum,
      final int height,
      final int skipBufferSize,
      final long logInterval ) throws ConfigurationException, IOException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException {


    this.inputBasename = inputBasename;
    this.outputBasename = outputBasename;
    this.strategy = strategy;
    this.strategyFilename = strategyFilename;
    this.strategyProperties = strategy.properties();
    this.bufferSize = bufferSize;
    this.logInterval = logInterval;
    this.bloomFilterPrecision = bloomFilterPrecision;


    numIndices = strategy.numberOfLocalIndices();


    final Coding positionCoding = writerFlags.get( Component.POSITIONS );


    inputProperties = new Properties( inputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
    globalIndex = DiskBasedIndex.getInstance( inputBasename, inputProperties, false, positionCoding == Coding.GOLOMB || positionCoding == Coding.INTERPOLATIVE, false, null );
    indexReader = globalIndex.getReader();


    localBasename = new String[ numIndices ];
    for( int i = 0; i < numIndices; i++ ) localBasename[ i ] = outputBasename + "-" + i;


    localGlobCounts = new OutputBitStream[ numIndices ];
    localFrequencies = new OutputBitStream[ numIndices ];
    localTerms = new PrintWriter[ numIndices ];
    maxDocSize = new int[ numIndices ];
    maxDocPos = new int[ numIndices ];
    numTerms = new int[ numIndices ];
    globCount = new long[ numIndices ];
    numOccurrences = new long[ numIndices ];
    numPostings = new long[ numIndices ];
    indexWriter = new IndexWriter[ numIndices ];
    
    if ( ( havePayloads = writerFlags.containsKey( Component.PAYLOADS ) ) && ! globalIndex.hasPayloads ) 
      throw new IllegalArgumentException( "You requested payloads, but the global index does not contain them." );
    if ( ( haveCounts = writerFlags.containsKey( Component.COUNTS ) ) && ! globalIndex.hasCounts ) 
      throw new IllegalArgumentException( "You requested counts, but the global index does not contain them." );
    if ( ( havePositions = writerFlags.containsKey( Component.POSITIONS ) ) && !  globalIndex.hasPositions ) 
      throw new IllegalArgumentException( "You requested positions, but the global index does not contain them." );


    interleaved |= ! havePositions || havePayloads;
    skips |= ! interleaved;
    if ( skips && ( quantum <= 0 || height < 0 ) ) throw new IllegalArgumentException( "You must specify a positive quantum and a nonnegative height (variable quanta are not available when partitioning documentally)." );
    
    for ( int i = 0; i < numIndices; i++ ) {
      String name = localBasename[ i ]; 
      if ( ! interleaved ) indexWriter[ i ] = new BitStreamHPIndexWriter( localBasename[ i ], strategy.numberOfDocuments( i ), true, skipBufferSize, writerFlags, quantum, height );
      else if ( ! skips ) indexWriter[ i ] = new BitStreamIndexWriter( localBasename[ i ], strategy.numberOfDocuments( i ), true, writerFlags );
      else indexWriter[ i ] = new SkipBitStreamIndexWriter( localBasename[ i ], strategy.numberOfDocuments( i ), true, skipBufferSize, writerFlags, quantum, height );
      
      if ( haveCounts ) localGlobCounts[ i ] = new OutputBitStream( name + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
      localFrequencies[ i ] = new OutputBitStream( name + DiskBasedIndex.FREQUENCIES_EXTENSION );
      localTerms[ i ] = new PrintWriter( new BufferedWriter( new OutputStreamWriter( new FileOutputStream( localBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) ) );      
    }
    
    terms = new FastBufferedReader( new InputStreamReader( new FileInputStream( inputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );
  }
  
  private void partitionSizes() throws IOException {      
    final File sizesFile = new File( inputBasename + DiskBasedIndex.SIZES_EXTENSION );
    if ( sizesFile.exists() ) {
      LOGGER.info( "Partitioning sizes..." );
      final InputBitStream sizes = new InputBitStream ( sizesFile );
      final OutputBitStream localSizes[] = new OutputBitStream[ numIndices ];
      for ( int i = 0; i < numIndices; i++ ) localSizes[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.SIZES_EXTENSION );


      // ALERT: for the time being, we decide whether to "fill the gaps" in sizes using as sole indicator the equality between global and local number of documents.
      int size, localIndex;
      if ( globalIndex.numberOfDocuments == strategy.numberOfDocuments( 0 ) ) {
        for( int i = 0; i < globalIndex.numberOfDocuments; i++ ) {
          localSizes[ localIndex = strategy.localIndex( i ) ].writeGamma( size = sizes.readGamma() );
          if ( maxDocSize[ localIndex ] < size ) maxDocSize[ localIndex ] = size;
          for( int l = numIndices; l-- != 0; ) if ( l != localIndex ) localSizes[ l ].writeGamma( 0 ); 
        }
      }
      else { 
        for( int i = 0; i < globalIndex.numberOfDocuments; i++ ) {
          localSizes[ localIndex = strategy.localIndex( i ) ].writeGamma( size = sizes.readGamma() );
          if ( maxDocSize[ localIndex ] < size ) maxDocSize[ localIndex ] = size;
        }
      }


      sizes.close();
      for ( int i = 0; i < numIndices; i++ ) localSizes[ i ].close();
    }
  }
  
  public void run() throws Exception {
    final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval );
    final IntList sizeList = globalIndex.sizes;
    partitionSizes();
    
    final int[] position = new int[ globalIndex.maxCount ];  
    final int[] localFrequency = new int[ numIndices ];  
    final int[] usedIndex = new int[ numIndices ];
    final InputBitStream[] direct = new InputBitStream[ numIndices ];
    final InputBitStream[] indirect = new InputBitStream[ numIndices ];
    final BloomFilter[] bloomFilter = bloomFilterPrecision != 0 ? new BloomFilter[ numIndices ] : null;
    final File[] tempFile = new File[ numIndices ];
    final CachingOutputBitStream[] temp = new CachingOutputBitStream[ numIndices ];
    IndexIterator indexIterator;
    
    for ( int i = 0; i < numIndices; i++ ) {
      tempFile[ i ] = new File( localBasename[ i ] + ".temp" );
      temp[ i ] = new CachingOutputBitStream( tempFile[ i ], bufferSize );
      direct[ i ] = new InputBitStream( temp[ i ].buffer() );
      indirect[ i ] = new InputBitStream( tempFile[ i ] );
      if ( bloomFilterPrecision != 0 ) bloomFilter[ i ] = new BloomFilter( globalIndex.numberOfTerms, bloomFilterPrecision );
    }
    int usedIndices;
    MutableString currentTerm = new MutableString();
    Payload payload = null;
    int frequency, globalPointer, localIndex, localPointer, count = -1;


    pl.expectedUpdates = globalIndex.numberOfPostings;
    pl.itemsName = "postings";
    pl.logInterval = logInterval;
    pl.start( "Partitioning index..." );


    for ( int t = 0; t < globalIndex.numberOfTerms; t++ ) {
      terms.readLine( currentTerm );
      indexIterator = indexReader.nextIterator();
      usedIndices = 0;
      frequency = indexIterator.frequency();
      
      for ( int j = 0; j < frequency; j++ ) {
        globalPointer = indexIterator.nextDocument();                
        localIndex = strategy.localIndex( globalPointer );  


        if ( localFrequency[ localIndex ] == 0 ) {
          // First time we see a document for this index.
          currentTerm.println( localTerms[ localIndex ] );
          numTerms[ localIndex ]++;
          usedIndex[ usedIndices++ ] = localIndex;
          if ( bloomFilterPrecision != 0 ) bloomFilter[ localIndex ].add( currentTerm );
        }
        
        /* Store temporarily posting data; note that we save the global pointer as we
         * will have to access the size list. */
        
        localFrequency[ localIndex ]++;
        numPostings[ localIndex ]++;
        temp[ localIndex ].writeGamma( globalPointer );


        if ( globalIndex.hasPayloads ) payload = indexIterator.payload();
        if ( havePayloads ) payload.write( temp[ localIndex ] );
        
        if ( haveCounts ) {
          count = indexIterator.count();
          temp[ localIndex ].writeGamma( count );
          globCount[ localIndex ] += count;        
          if ( maxDocPos[ localIndex ] < count ) maxDocPos[ localIndex ] = count;         
          if ( havePositions ) {
            final int[] pos = indexIterator.positionArray();
            // TODO: compress this stuff
            for( int p = 0; p < count; p++ ) temp[ localIndex ].writeGamma( pos[ p ] ); 
          }
        }
      }
      
      // We now run through the indices used by this term and copy from the temporary buffer.


      OutputBitStream obs;
      
      for( int k = 0; k < usedIndices; k++ ) {
        final int i = usedIndex[ k ];


        localFrequencies[ i ].writeGamma( localFrequency[ i ] );
        if ( haveCounts ) numOccurrences[ i ] += globCount[ i ];
        if ( localGlobCounts[ i ] != null ) localGlobCounts[ i ].writeLongGamma( globCount[ i ] );
        globCount[ i ] = 0;
        
        InputBitStream ibs;
        indexWriter[ i ].newInvertedList();


        temp[ i ].align();
        if ( temp[ i ].buffer() != null ) ibs = direct[ i ];
        else {
          // We cannot read directly from the internal buffer.
          ibs = indirect[ i ];
          ibs.flush();
          temp[ i ].flush();
        }


        ibs.position( 0 );
          
        indexWriter[ i ].writeFrequency( localFrequency[ i ] );
        for( int j = 0; j < localFrequency[ i ]; j++ ) {
          obs = indexWriter[ i ].newDocumentRecord();
          globalPointer = ibs.readGamma();
          localPointer = strategy.localPointer( globalPointer );  
          indexWriter[ i ].writeDocumentPointer( obs, localPointer );
          if ( havePayloads ) {
            payload.read( ibs );
            indexWriter[ i ].writePayload( obs, payload );
          }
          if ( haveCounts ) indexWriter[ i ].writePositionCount( obs, count = ibs.readGamma() );
          if ( havePositions ) {
            for( int p = 0; p < count; p++ ) position[ p ] = ibs.readGamma();
            indexWriter[ i ].writeDocumentPositions( obs, position, 0, count, sizeList != null ? sizeList.getInt( globalPointer ) : -1 );
          }
          
        }
        temp[ i ].position( 0 );
        temp[ i ].writtenBits( 0 );
        localFrequency[ i ] = 0;
      }
      
      usedIndices = 0;
      pl.count += frequency - 1;
      pl.update();
    }


    pl.done();


    Properties globalProperties = new Properties();
    globalProperties.setProperty( Index.PropertyKeys.FIELD, inputProperties.getProperty( Index.PropertyKeys.FIELD ) );
    globalProperties.setProperty( Index.PropertyKeys.TERMPROCESSOR, inputProperties.getProperty( Index.PropertyKeys.TERMPROCESSOR ) );
    
    for ( int i = 0; i < numIndices; i++ ) {
      localFrequencies[ i ].close();
      if ( localGlobCounts[ i ] != null ) localGlobCounts[ i ].close();
      localTerms[ i ].close(); 
      indexWriter[ i ].close();
      if ( bloomFilterPrecision != 0 ) BinIO.storeObject( bloomFilter[ i ], localBasename[ i ] + DocumentalCluster.BLOOM_EXTENSION );
      temp[ i ].close();
      tempFile[ i ].delete();
      
      Properties localProperties = indexWriter[ i ].properties();
      localProperties.addAll( globalProperties );
      localProperties.setProperty( Index.PropertyKeys.MAXCOUNT, String.valueOf( maxDocPos[ i ] ) );
      localProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize[ i ] );
      localProperties.setProperty( Index.PropertyKeys.FIELD, globalProperties.getProperty( Index.PropertyKeys.FIELD ) );
      localProperties.setProperty( Index.PropertyKeys.OCCURRENCES, haveCounts ? numOccurrences[ i ] : -1 );
      localProperties.setProperty( Index.PropertyKeys.POSTINGS, numPostings[ i ] );
      localProperties.setProperty( Index.PropertyKeys.TERMS, numTerms[ i ] );
      if ( havePayloads ) localProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
      if ( strategyProperties[ i ] != null ) localProperties.addAll( strategyProperties[ i ] );
      localProperties.save( localBasename[ i ] + DiskBasedIndex.PROPERTIES_EXTENSION );
    }


    if ( strategyFilename != null ) globalProperties.setProperty( IndexCluster.PropertyKeys.STRATEGY, strategyFilename );
    for( int i = 0; i < numIndices; i++ ) globalProperties.addProperty( IndexCluster.PropertyKeys.LOCALINDEX, localBasename[ i ] );
    globalProperties.setProperty( DocumentalCluster.PropertyKeys.BLOOM, bloomFilterPrecision != 0 );
    // If we partition an index with a single term, by definition we have a flat cluster
    globalProperties.setProperty( DocumentalCluster.PropertyKeys.FLAT, inputProperties.getInt( Index.PropertyKeys.TERMS ) <= 1 );
    globalProperties.setProperty( Index.PropertyKeys.MAXCOUNT, inputProperties.getProperty( Index.PropertyKeys.MAXCOUNT ) );
    globalProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, inputProperties.getProperty( Index.PropertyKeys.MAXDOCSIZE ) );
    globalProperties.setProperty( Index.PropertyKeys.POSTINGS, inputProperties.getProperty( Index.PropertyKeys.POSTINGS ) );
    globalProperties.setProperty( Index.PropertyKeys.OCCURRENCES, inputProperties.getProperty( Index.PropertyKeys.OCCURRENCES ) );
    globalProperties.setProperty( Index.PropertyKeys.DOCUMENTS, inputProperties.getProperty( Index.PropertyKeys.DOCUMENTS ) );
    globalProperties.setProperty( Index.PropertyKeys.TERMS, inputProperties.getProperty( Index.PropertyKeys.TERMS ) );
    if ( havePayloads ) globalProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );


    /* For the general case, we must rely on a merged cluster. However, if we detect a contiguous
     * strategy we can optimise a bit. */
    
    globalProperties.setProperty( Index.PropertyKeys.INDEXCLASS, 
        strategy instanceof ContiguousDocumentalStrategy ?
            DocumentalConcatenatedCluster.class.getName() :
            DocumentalMergedCluster.class.getName() );
    
    globalProperties.save(  outputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
    LOGGER.debug( "Properties for clustered index " + outputBasename + ": " + new ConfigurationMap( globalProperties ) );
    
  }


  
  public static void main( final String arg[] ) throws ConfigurationException, IOException, URISyntaxException, ClassNotFoundException, Exception {    
    
    SimpleJSAP jsap = new SimpleJSAP( PartitionDocumentally.class.getName(), "Partitions an index documentally.",
        new Parameter[] {
      new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer." ),
      new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ),
      new FlaggedOption( "strategy", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', "strategy", "A serialised documental partitioning strategy." ),
      new FlaggedOption( "uniformStrategy", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'u', "uniform", "Requires a uniform partitioning in the given number of parts." ),
      new FlaggedOption( "bloom", JSAP.INTEGER_PARSER, "0", JSAP.NOT_REQUIRED, 'B', "bloom", "Generates Bloom filters with given precision." ),
      new FlaggedOption( "comp", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'c', "comp", "A compression flag for the index (may be specified several times)." ).setAllowMultipleDeclarations( true ),
      new Switch( "noSkips", JSAP.NO_SHORTFLAG, "no-skips", "Disables skips." ),
      new Switch( "interleaved", JSAP.NO_SHORTFLAG, "interleaved", "Forces an interleaved index." ),
      new FlaggedOption( "quantum", JSAP.INTSIZE_PARSER, "32", JSAP.NOT_REQUIRED, 'Q', "quantum", "The skip quantum." ),
      new FlaggedOption( "height", JSAP.INTSIZE_PARSER, Integer.toString( BitStreamIndex.DEFAULT_HEIGHT ), JSAP.NOT_REQUIRED, 'H', "height", "The skip height." ),
      new FlaggedOption( "skipBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE ), JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "skip-buffer-size", "The size of the internal temporary buffer used while creating an index with skips." ),
      new UnflaggedOption( "inputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the global index." ),
      new UnflaggedOption( "outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the local indices." )
    });
    
    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;
    String inputBasename = jsapResult.getString( "inputBasename" );
    String outputBasename = jsapResult.getString( "outputBasename" );
    String strategyFilename = jsapResult.getString( "strategy" );
    DocumentalPartitioningStrategy strategy = null;


    if ( jsapResult.userSpecified( "uniformStrategy" ) ) {
      strategy = DocumentalStrategies.uniform( jsapResult.getInt( "uniformStrategy" ), Index.getInstance( inputBasename ).numberOfDocuments );
      BinIO.storeObject( strategy, strategyFilename = outputBasename + IndexCluster.STRATEGY_DEFAULT_EXTENSION );
    }
    else if ( strategyFilename != null ) strategy = (DocumentalPartitioningStrategy)BinIO.loadObject( strategyFilename );
    else throw new IllegalArgumentException( "You must specify a partitioning strategy" );
    
    final boolean skips = ! jsapResult.getBoolean( "noSkips" );
    final boolean interleaved = jsapResult.getBoolean( "interleaved" );
    if ( ! skips && ( jsapResult.userSpecified( "quantum" ) || jsapResult.userSpecified( "height" ) ) ) throw new IllegalArgumentException( "You specified quantum or height, but you also disabled skips." );


    new PartitionDocumentally( inputBasename,
        outputBasename, 
        strategy, 
        strategyFilename,
        jsapResult.getInt( "bloom" ),
        jsapResult.getInt( "bufferSize" ),
        CompressionFlags.valueOf( jsapResult.getStringArray( "comp" ), CompressionFlags.DEFAULT_STANDARD_INDEX ),
        interleaved,
        skips,
        jsapResult.getInt( "quantum" ),
        jsapResult.getInt( "height" ),
        jsapResult.getInt( "skipBufferSize" ),
        jsapResult.getLong( "logInterval" ) ).run();
  }      
}
Source Code of it.unimi.dsi.mg4j.tool.PartitionDocumentally

Related Classes of it.unimi.dsi.mg4j.tool.PartitionDocumentally