package it.unimi.dsi.mg4j.tool;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2006-2010 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.mg4j.index.BitStreamHPIndexWriter;
import it.unimi.dsi.mg4j.index.BitStreamIndex;
import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;
import it.unimi.dsi.mg4j.index.CachingOutputBitStream;
import it.unimi.dsi.mg4j.index.CompressionFlags;
import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
import it.unimi.dsi.mg4j.index.DiskBasedIndex;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.index.IndexIterator;
import it.unimi.dsi.mg4j.index.IndexReader;
import it.unimi.dsi.mg4j.index.IndexWriter;
import it.unimi.dsi.mg4j.index.SkipBitStreamIndexWriter;
import it.unimi.dsi.mg4j.index.cluster.ContiguousDocumentalStrategy;
import it.unimi.dsi.mg4j.index.cluster.DocumentalCluster;
import it.unimi.dsi.mg4j.index.cluster.DocumentalConcatenatedCluster;
import it.unimi.dsi.mg4j.index.cluster.DocumentalMergedCluster;
import it.unimi.dsi.mg4j.index.cluster.DocumentalPartitioningStrategy;
import it.unimi.dsi.mg4j.index.cluster.DocumentalStrategies;
import it.unimi.dsi.mg4j.index.cluster.IndexCluster;
import it.unimi.dsi.mg4j.index.payload.Payload;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.Util;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.BloomFilter;
import it.unimi.dsi.util.ImmutableExternalPrefixMap;
import it.unimi.dsi.util.PrefixMap;
import it.unimi.dsi.util.Properties;
import it.unimi.dsi.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.util.StringMap;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Map;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.ConfigurationMap;
import org.apache.log4j.Logger;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
/** Partitions an index documentally.
*
* <p>A global index is partitioned documentally by providing a {@link DocumentalPartitioningStrategy}
* that specifies a destination local index for each document, and a local document pointer. The global index
* is scanned, and the postings are partitioned among the local indices using the provided strategy. For instance,
* a {@link ContiguousDocumentalStrategy} divides an index into blocks of contiguous documents.
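*
* <p>For instance, assuming {@link ContiguousDocumentalStrategy}'s cutpoint-based constructor (a minimal
* sketch; the cutpoints are illustrative, and must start at 0 and end at the number of documents):
* <pre>
* // Documents [0..1000), [1000..2000) and [2000..3000) go to local indices 0, 1 and 2.
* DocumentalPartitioningStrategy strategy = new ContiguousDocumentalStrategy( 0, 1000, 2000, 3000 );
* strategy.localIndex( 1500 );   // returns 1
* strategy.localPointer( 1500 ); // returns 500, the local document pointer
* </pre>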
*
* <p>Since each local index contains a (proper) subset of the original set of documents, in general it will contain a (proper)
* subset of the terms of the global index. Thus, local and global term numbers will not in general coincide.
* As a result, when a set of local indices is accessed transparently as a single index
* using a {@link it.unimi.dsi.mg4j.index.cluster.DocumentalCluster},
* a call to {@link it.unimi.dsi.mg4j.index.Index#documents(int)} will throw an {@link java.lang.UnsupportedOperationException},
* because there is no way to map the global term numbers to local term numbers.
*
* <p>On the other hand, a call to {@link it.unimi.dsi.mg4j.index.Index#documents(CharSequence)} will be forwarded to each local index,
* and the resulting iterators will be merged into a global iterator. To speed up this phase for not-so-frequent terms, when partitioning an index you can require
* the construction of {@linkplain BloomFilter Bloom filters} that will be used to try to avoid
* querying local indices that do not contain a term. The precision of the filters (which bounds their false-positive rate) is settable.
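*
* <p>For instance, once the local indices and the cluster property file have been generated, the cluster
* can be opened and queried like any other index (a sketch; the basename <samp>local</samp> and the term are illustrative):
* <pre>
* Index cluster = Index.getInstance( "local" );  // the clustered index
* IndexIterator it = cluster.documents( "foo" ); // Bloom filters, if present, are checked here
* for( int j = it.frequency(); j-- != 0; ) System.out.println( it.nextDocument() );
* </pre>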
*
* <p>The property file will use a {@link it.unimi.dsi.mg4j.index.cluster.DocumentalMergedCluster} unless you provide
* a {@link ContiguousDocumentalStrategy}, in which case a
* {@link it.unimi.dsi.mg4j.index.cluster.DocumentalConcatenatedCluster} will be used instead. Note that there might
* be other cases in which the latter is appropriate; if so, you can manually edit the property file.
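*
* <p>For reference, the property file of the cluster will resemble the following sketch (values are illustrative):
* <pre>
* indexclass = it.unimi.dsi.mg4j.index.cluster.DocumentalMergedCluster
* localindex = local-0
* localindex = local-1
* strategy = local.strategy
* flat = false
* bloom = false
* </pre>
* Switching from a merged to a concatenated cluster amounts to changing the <samp>indexclass</samp> property.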
*
* <strong>Important</strong>: this class just partitions the index. No auxiliary files (most notably, {@linkplain StringMap term maps}
* or {@linkplain PrefixMap prefix maps}) will be generated. To build them, please use a {@link StringMap} implementation (e.g.,
* {@link ShiftAddXorSignedStringMap} or {@link ImmutableExternalPrefixMap}).
*
* <strong>Warning</strong>: variable quanta are not supported by this class, as it is impossible to predict accurately
* the number of bits used for positions when partitioning documentally. If you want to use variable quanta, generate
* simple interleaved indices without skips as an intermediate step, and pass them through {@link Combine}.
*
* <h2>Write-once output and distributed index partitioning</h2>
*
* Please see {@link it.unimi.dsi.mg4j.tool.PartitionLexically}; the same comments apply.
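*
* <p>For instance, a uniform partition in four local indices can be generated programmatically as follows
* (a minimal sketch mirroring the logic of {@link #main(String[])}; basenames are illustrative and
* exception handling is omitted):
* <pre>
* DocumentalPartitioningStrategy strategy = DocumentalStrategies.uniform( 4, Index.getInstance( "global" ).numberOfDocuments );
* BinIO.storeObject( strategy, "local" + IndexCluster.STRATEGY_DEFAULT_EXTENSION );
* new PartitionDocumentally( "global", "local", strategy, "local" + IndexCluster.STRATEGY_DEFAULT_EXTENSION,
* 	0, DEFAULT_BUFFER_SIZE, CompressionFlags.DEFAULT_STANDARD_INDEX, false, true,
* 	32, BitStreamIndex.DEFAULT_HEIGHT, SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE,
* 	ProgressLogger.DEFAULT_LOG_INTERVAL ).run();
* </pre>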
*
* @author Alessandro Arrabito
* @author Sebastiano Vigna
*
* @since 1.0.1
*/
public class PartitionDocumentally {
private final static Logger LOGGER = Util.getLogger( PartitionDocumentally.class );
/** The default buffer size for all involved indices. */
public final static int DEFAULT_BUFFER_SIZE = 1024 * 1024;
/** The number of local indices. */
private final int numIndices;
/** The output basenames. */
private final String outputBasename;
/** The array of local output basenames. */
private final String[] localBasename;
/** The input basename. */
private final String inputBasename;
/** The properties of the input index. */
private final Properties inputProperties;
/** The size of I/O buffers. */
private final int bufferSize;
/** The filename of the strategy used to partition the index. */
private final String strategyFilename;
/** The strategy used to perform the partitioning. */
private final DocumentalPartitioningStrategy strategy;
/** The additional local properties of each local index. */
private final Properties[] strategyProperties;
/** The logging interval. */
private final long logInterval;
/** The global index to be partitioned. */
private final BitStreamIndex globalIndex;
/** A reader on {@link #globalIndex}. */
private final IndexReader indexReader;
/** A reader for the terms of the global index. */
private final FastBufferedReader terms;
/** An index writer for each local index. */
private final IndexWriter[] indexWriter;
/** Whether each {@link #indexWriter} has counts. */
private final boolean haveCounts;
/** Whether each {@link #indexWriter} has positions. */
private final boolean havePositions;
/** Whether each {@link #indexWriter} has payloads. */
private final boolean havePayloads;
/** A bit output stream for global counts of each local index. */
private final OutputBitStream[] localGlobCounts;
/** A bit output stream for the frequencies of each local index. */
private final OutputBitStream[] localFrequencies;
/** A print writer for the terms of each local index. */
private final PrintWriter[] localTerms;
/** The maximum size of a document in each local index. */
private final int[] maxDocSize;
/** The maximum count (number of positions of a term within a single document) in each local index. */
private final int[] maxDocPos;
/** The number of terms in each local index. */
private final int[] numTerms;
/** The number of postings in each local index. */
private final long[] numPostings;
/** The number of occurrences in each local index. */
private final long[] numOccurrences;
/** The global count accumulated for the current term, for each local index. */
private final long[] globCount;
/** The required precision for Bloom filters (0 means no filter). */
private final int bloomFilterPrecision;
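/** Creates a documental partitioner.
*
* @param inputBasename the basename of the global index.
* @param outputBasename the basename of the local indices.
* @param strategy the documental partitioning strategy.
* @param strategyFilename the filename of the serialised strategy, or <code>null</code>.
* @param bloomFilterPrecision the precision of the Bloom filters (0 means no filters).
* @param bufferSize the size of the I/O buffers.
* @param writerFlags the compression flags for the local indices.
* @param interleaved whether to force interleaved local indices.
* @param skips whether the local indices should have skips.
* @param quantum the skip quantum.
* @param height the skip height.
* @param skipBufferSize the size of the internal temporary buffer used while creating local indices with skips.
* @param logInterval the minimum time interval between activity logs, in milliseconds.
*/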
public PartitionDocumentally( final String inputBasename,
final String outputBasename,
final DocumentalPartitioningStrategy strategy,
final String strategyFilename,
final int bloomFilterPrecision,
final int bufferSize,
final Map<Component,Coding> writerFlags,
boolean interleaved,
boolean skips,
final int quantum,
final int height,
final int skipBufferSize,
final long logInterval ) throws ConfigurationException, IOException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException {
this.inputBasename = inputBasename;
this.outputBasename = outputBasename;
this.strategy = strategy;
this.strategyFilename = strategyFilename;
this.strategyProperties = strategy.properties();
this.bufferSize = bufferSize;
this.logInterval = logInterval;
this.bloomFilterPrecision = bloomFilterPrecision;
numIndices = strategy.numberOfLocalIndices();
final Coding positionCoding = writerFlags.get( Component.POSITIONS );
inputProperties = new Properties( inputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
globalIndex = DiskBasedIndex.getInstance( inputBasename, inputProperties, false, positionCoding == Coding.GOLOMB || positionCoding == Coding.INTERPOLATIVE, false, null );
indexReader = globalIndex.getReader();
localBasename = new String[ numIndices ];
for( int i = 0; i < numIndices; i++ ) localBasename[ i ] = outputBasename + "-" + i;
localGlobCounts = new OutputBitStream[ numIndices ];
localFrequencies = new OutputBitStream[ numIndices ];
localTerms = new PrintWriter[ numIndices ];
maxDocSize = new int[ numIndices ];
maxDocPos = new int[ numIndices ];
numTerms = new int[ numIndices ];
globCount = new long[ numIndices ];
numOccurrences = new long[ numIndices ];
numPostings = new long[ numIndices ];
indexWriter = new IndexWriter[ numIndices ];
if ( ( havePayloads = writerFlags.containsKey( Component.PAYLOADS ) ) && ! globalIndex.hasPayloads )
throw new IllegalArgumentException( "You requested payloads, but the global index does not contain them." );
if ( ( haveCounts = writerFlags.containsKey( Component.COUNTS ) ) && ! globalIndex.hasCounts )
throw new IllegalArgumentException( "You requested counts, but the global index does not contain them." );
if ( ( havePositions = writerFlags.containsKey( Component.POSITIONS ) ) && ! globalIndex.hasPositions )
throw new IllegalArgumentException( "You requested positions, but the global index does not contain them." );
interleaved |= ! havePositions || havePayloads;
skips |= ! interleaved;
if ( skips && ( quantum <= 0 || height < 0 ) ) throw new IllegalArgumentException( "You must specify a positive quantum and a nonnegative height (variable quanta are not available when partitioning documentally)." );
for ( int i = 0; i < numIndices; i++ ) {
final String name = localBasename[ i ];
if ( ! interleaved ) indexWriter[ i ] = new BitStreamHPIndexWriter( name, strategy.numberOfDocuments( i ), true, skipBufferSize, writerFlags, quantum, height );
else if ( ! skips ) indexWriter[ i ] = new BitStreamIndexWriter( name, strategy.numberOfDocuments( i ), true, writerFlags );
else indexWriter[ i ] = new SkipBitStreamIndexWriter( name, strategy.numberOfDocuments( i ), true, skipBufferSize, writerFlags, quantum, height );
if ( haveCounts ) localGlobCounts[ i ] = new OutputBitStream( name + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
localFrequencies[ i ] = new OutputBitStream( name + DiskBasedIndex.FREQUENCIES_EXTENSION );
localTerms[ i ] = new PrintWriter( new BufferedWriter( new OutputStreamWriter( new FileOutputStream( name + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) ) );
}
terms = new FastBufferedReader( new InputStreamReader( new FileInputStream( inputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );
}
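/** Partitions the size file of the global index, if present, generating one gamma-coded size file per local index and updating {@link #maxDocSize}. */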
private void partitionSizes() throws IOException {
final File sizesFile = new File( inputBasename + DiskBasedIndex.SIZES_EXTENSION );
if ( sizesFile.exists() ) {
LOGGER.info( "Partitioning sizes..." );
final InputBitStream sizes = new InputBitStream( sizesFile );
final OutputBitStream localSizes[] = new OutputBitStream[ numIndices ];
for ( int i = 0; i < numIndices; i++ ) localSizes[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.SIZES_EXTENSION );
// ALERT: for the time being, we decide whether to "fill the gaps" in sizes using as sole indicator the equality between global and local number of documents.
int size, localIndex;
if ( globalIndex.numberOfDocuments == strategy.numberOfDocuments( 0 ) ) {
for( int i = 0; i < globalIndex.numberOfDocuments; i++ ) {
localSizes[ localIndex = strategy.localIndex( i ) ].writeGamma( size = sizes.readGamma() );
if ( maxDocSize[ localIndex ] < size ) maxDocSize[ localIndex ] = size;
for( int l = numIndices; l-- != 0; ) if ( l != localIndex ) localSizes[ l ].writeGamma( 0 );
}
}
else {
for( int i = 0; i < globalIndex.numberOfDocuments; i++ ) {
localSizes[ localIndex = strategy.localIndex( i ) ].writeGamma( size = sizes.readGamma() );
if ( maxDocSize[ localIndex ] < size ) maxDocSize[ localIndex ] = size;
}
}
sizes.close();
for ( int i = 0; i < numIndices; i++ ) localSizes[ i ].close();
}
}
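/** Partitions the global index, generating the local indices, their auxiliary files and the property file of the resulting {@linkplain DocumentalCluster documental cluster}. */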
public void run() throws Exception {
final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval );
final IntList sizeList = globalIndex.sizes;
partitionSizes();
final int[] position = new int[ globalIndex.maxCount ];
final int[] localFrequency = new int[ numIndices ];
final int[] usedIndex = new int[ numIndices ];
final InputBitStream[] direct = new InputBitStream[ numIndices ];
final InputBitStream[] indirect = new InputBitStream[ numIndices ];
final BloomFilter[] bloomFilter = bloomFilterPrecision != 0 ? new BloomFilter[ numIndices ] : null;
final File[] tempFile = new File[ numIndices ];
final CachingOutputBitStream[] temp = new CachingOutputBitStream[ numIndices ];
IndexIterator indexIterator;
for ( int i = 0; i < numIndices; i++ ) {
tempFile[ i ] = new File( localBasename[ i ] + ".temp" );
temp[ i ] = new CachingOutputBitStream( tempFile[ i ], bufferSize );
direct[ i ] = new InputBitStream( temp[ i ].buffer() );
indirect[ i ] = new InputBitStream( tempFile[ i ] );
if ( bloomFilterPrecision != 0 ) bloomFilter[ i ] = new BloomFilter( globalIndex.numberOfTerms, bloomFilterPrecision );
}
int usedIndices;
MutableString currentTerm = new MutableString();
Payload payload = null;
int frequency, globalPointer, localIndex, localPointer, count = -1;
pl.expectedUpdates = globalIndex.numberOfPostings;
pl.itemsName = "postings";
pl.logInterval = logInterval;
pl.start( "Partitioning index..." );
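/* Main loop: for each term of the global index, we scatter its postings to per-index
* temporary buffers, and then copy each nonempty buffer to the corresponding local
* index writer, remapping global document pointers to local ones. */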
for ( int t = 0; t < globalIndex.numberOfTerms; t++ ) {
terms.readLine( currentTerm );
indexIterator = indexReader.nextIterator();
usedIndices = 0;
frequency = indexIterator.frequency();
for ( int j = 0; j < frequency; j++ ) {
globalPointer = indexIterator.nextDocument();
localIndex = strategy.localIndex( globalPointer );
if ( localFrequency[ localIndex ] == 0 ) {
// First time we see a document for this index.
currentTerm.println( localTerms[ localIndex ] );
numTerms[ localIndex ]++;
usedIndex[ usedIndices++ ] = localIndex;
if ( bloomFilterPrecision != 0 ) bloomFilter[ localIndex ].add( currentTerm );
}
/* Temporarily store posting data; note that we save the global pointer, as we
* will need it later to access the size list. */
localFrequency[ localIndex ]++;
numPostings[ localIndex ]++;
temp[ localIndex ].writeGamma( globalPointer );
if ( globalIndex.hasPayloads ) payload = indexIterator.payload();
if ( havePayloads ) payload.write( temp[ localIndex ] );
if ( haveCounts ) {
count = indexIterator.count();
temp[ localIndex ].writeGamma( count );
globCount[ localIndex ] += count;
if ( maxDocPos[ localIndex ] < count ) maxDocPos[ localIndex ] = count;
if ( havePositions ) {
final int[] pos = indexIterator.positionArray();
// TODO: compress this stuff
for( int p = 0; p < count; p++ ) temp[ localIndex ].writeGamma( pos[ p ] );
}
}
}
// We now run through the indices used by this term and copy from the temporary buffer.
OutputBitStream obs;
for( int k = 0; k < usedIndices; k++ ) {
final int i = usedIndex[ k ];
localFrequencies[ i ].writeGamma( localFrequency[ i ] );
if ( haveCounts ) numOccurrences[ i ] += globCount[ i ];
if ( localGlobCounts[ i ] != null ) localGlobCounts[ i ].writeLongGamma( globCount[ i ] );
globCount[ i ] = 0;
InputBitStream ibs;
indexWriter[ i ].newInvertedList();
temp[ i ].align();
if ( temp[ i ].buffer() != null ) ibs = direct[ i ];
else {
// We cannot read directly from the internal buffer.
ibs = indirect[ i ];
ibs.flush();
temp[ i ].flush();
}
ibs.position( 0 );
indexWriter[ i ].writeFrequency( localFrequency[ i ] );
for( int j = 0; j < localFrequency[ i ]; j++ ) {
obs = indexWriter[ i ].newDocumentRecord();
globalPointer = ibs.readGamma();
localPointer = strategy.localPointer( globalPointer );
indexWriter[ i ].writeDocumentPointer( obs, localPointer );
if ( havePayloads ) {
payload.read( ibs );
indexWriter[ i ].writePayload( obs, payload );
}
if ( haveCounts ) indexWriter[ i ].writePositionCount( obs, count = ibs.readGamma() );
if ( havePositions ) {
for( int p = 0; p < count; p++ ) position[ p ] = ibs.readGamma();
indexWriter[ i ].writeDocumentPositions( obs, position, 0, count, sizeList != null ? sizeList.getInt( globalPointer ) : -1 );
}
}
temp[ i ].position( 0 );
temp[ i ].writtenBits( 0 );
localFrequency[ i ] = 0;
}
usedIndices = 0;
pl.count += frequency - 1;
pl.update();
}
pl.done();
Properties globalProperties = new Properties();
globalProperties.setProperty( Index.PropertyKeys.FIELD, inputProperties.getProperty( Index.PropertyKeys.FIELD ) );
globalProperties.setProperty( Index.PropertyKeys.TERMPROCESSOR, inputProperties.getProperty( Index.PropertyKeys.TERMPROCESSOR ) );
for ( int i = 0; i < numIndices; i++ ) {
localFrequencies[ i ].close();
if ( localGlobCounts[ i ] != null ) localGlobCounts[ i ].close();
localTerms[ i ].close();
indexWriter[ i ].close();
if ( bloomFilterPrecision != 0 ) BinIO.storeObject( bloomFilter[ i ], localBasename[ i ] + DocumentalCluster.BLOOM_EXTENSION );
temp[ i ].close();
tempFile[ i ].delete();
Properties localProperties = indexWriter[ i ].properties();
localProperties.addAll( globalProperties );
localProperties.setProperty( Index.PropertyKeys.MAXCOUNT, String.valueOf( maxDocPos[ i ] ) );
localProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize[ i ] );
localProperties.setProperty( Index.PropertyKeys.FIELD, globalProperties.getProperty( Index.PropertyKeys.FIELD ) );
localProperties.setProperty( Index.PropertyKeys.OCCURRENCES, haveCounts ? numOccurrences[ i ] : -1 );
localProperties.setProperty( Index.PropertyKeys.POSTINGS, numPostings[ i ] );
localProperties.setProperty( Index.PropertyKeys.TERMS, numTerms[ i ] );
if ( havePayloads ) localProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
if ( strategyProperties[ i ] != null ) localProperties.addAll( strategyProperties[ i ] );
localProperties.save( localBasename[ i ] + DiskBasedIndex.PROPERTIES_EXTENSION );
}
if ( strategyFilename != null ) globalProperties.setProperty( IndexCluster.PropertyKeys.STRATEGY, strategyFilename );
for( int i = 0; i < numIndices; i++ ) globalProperties.addProperty( IndexCluster.PropertyKeys.LOCALINDEX, localBasename[ i ] );
globalProperties.setProperty( DocumentalCluster.PropertyKeys.BLOOM, bloomFilterPrecision != 0 );
// If we partition an index with a single term, by definition we have a flat cluster
globalProperties.setProperty( DocumentalCluster.PropertyKeys.FLAT, inputProperties.getInt( Index.PropertyKeys.TERMS ) <= 1 );
globalProperties.setProperty( Index.PropertyKeys.MAXCOUNT, inputProperties.getProperty( Index.PropertyKeys.MAXCOUNT ) );
globalProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, inputProperties.getProperty( Index.PropertyKeys.MAXDOCSIZE ) );
globalProperties.setProperty( Index.PropertyKeys.POSTINGS, inputProperties.getProperty( Index.PropertyKeys.POSTINGS ) );
globalProperties.setProperty( Index.PropertyKeys.OCCURRENCES, inputProperties.getProperty( Index.PropertyKeys.OCCURRENCES ) );
globalProperties.setProperty( Index.PropertyKeys.DOCUMENTS, inputProperties.getProperty( Index.PropertyKeys.DOCUMENTS ) );
globalProperties.setProperty( Index.PropertyKeys.TERMS, inputProperties.getProperty( Index.PropertyKeys.TERMS ) );
if ( havePayloads ) globalProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
/* For the general case, we must rely on a merged cluster. However, if we detect a contiguous
* strategy we can optimise a bit. */
globalProperties.setProperty( Index.PropertyKeys.INDEXCLASS,
strategy instanceof ContiguousDocumentalStrategy ?
DocumentalConcatenatedCluster.class.getName() :
DocumentalMergedCluster.class.getName() );
globalProperties.save( outputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
LOGGER.debug( "Properties for clustered index " + outputBasename + ": " + new ConfigurationMap( globalProperties ) );
}
public static void main( final String arg[] ) throws Exception {
SimpleJSAP jsap = new SimpleJSAP( PartitionDocumentally.class.getName(), "Partitions an index documentally.",
new Parameter[] {
new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer." ),
new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ),
new FlaggedOption( "strategy", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', "strategy", "A serialised documental partitioning strategy." ),
new FlaggedOption( "uniformStrategy", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'u', "uniform", "Requires a uniform partitioning in the given number of parts." ),
new FlaggedOption( "bloom", JSAP.INTEGER_PARSER, "0", JSAP.NOT_REQUIRED, 'B', "bloom", "Generates Bloom filters with given precision." ),
new FlaggedOption( "comp", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'c', "comp", "A compression flag for the index (may be specified several times)." ).setAllowMultipleDeclarations( true ),
new Switch( "noSkips", JSAP.NO_SHORTFLAG, "no-skips", "Disables skips." ),
new Switch( "interleaved", JSAP.NO_SHORTFLAG, "interleaved", "Forces an interleaved index." ),
new FlaggedOption( "quantum", JSAP.INTSIZE_PARSER, "32", JSAP.NOT_REQUIRED, 'Q', "quantum", "The skip quantum." ),
new FlaggedOption( "height", JSAP.INTSIZE_PARSER, Integer.toString( BitStreamIndex.DEFAULT_HEIGHT ), JSAP.NOT_REQUIRED, 'H', "height", "The skip height." ),
new FlaggedOption( "skipBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE ), JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "skip-buffer-size", "The size of the internal temporary buffer used while creating an index with skips." ),
new UnflaggedOption( "inputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the global index." ),
new UnflaggedOption( "outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the local indices." )
});
JSAPResult jsapResult = jsap.parse( arg );
if ( jsap.messagePrinted() ) return;
String inputBasename = jsapResult.getString( "inputBasename" );
String outputBasename = jsapResult.getString( "outputBasename" );
String strategyFilename = jsapResult.getString( "strategy" );
DocumentalPartitioningStrategy strategy = null;
if ( jsapResult.userSpecified( "uniformStrategy" ) ) {
strategy = DocumentalStrategies.uniform( jsapResult.getInt( "uniformStrategy" ), Index.getInstance( inputBasename ).numberOfDocuments );
BinIO.storeObject( strategy, strategyFilename = outputBasename + IndexCluster.STRATEGY_DEFAULT_EXTENSION );
}
else if ( strategyFilename != null ) strategy = (DocumentalPartitioningStrategy)BinIO.loadObject( strategyFilename );
else throw new IllegalArgumentException( "You must specify a partitioning strategy." );
final boolean skips = ! jsapResult.getBoolean( "noSkips" );
final boolean interleaved = jsapResult.getBoolean( "interleaved" );
if ( ! skips && ( jsapResult.userSpecified( "quantum" ) || jsapResult.userSpecified( "height" ) ) ) throw new IllegalArgumentException( "You specified quantum or height, but you also disabled skips." );
new PartitionDocumentally( inputBasename,
outputBasename,
strategy,
strategyFilename,
jsapResult.getInt( "bloom" ),
jsapResult.getInt( "bufferSize" ),
CompressionFlags.valueOf( jsapResult.getStringArray( "comp" ), CompressionFlags.DEFAULT_STANDARD_INDEX ),
interleaved,
skips,
jsapResult.getInt( "quantum" ),
jsapResult.getInt( "height" ),
jsapResult.getInt( "skipBufferSize" ),
jsapResult.getLong( "logInterval" ) ).run();
}
}