indirect[ i ] = new InputBitStream( tempFile[ i ] );
if ( bloomFilterPrecision != 0 ) bloomFilter[ i ] = new BloomFilter( globalIndex.numberOfTerms, bloomFilterPrecision );
}
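// Working variables reused while scanning the global index term by term.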
int usedIndices;
MutableString currentTerm = new MutableString();
Payload payload = null;
int frequency, globalPointer, localIndex, localPointer, count = -1;
pl.expectedUpdates = globalIndex.numberOfPostings;
pl.itemsName = "postings";
pl.logInterval = logInterval;
pl.start( "Partitioning index..." );
for ( int t = 0; t < globalIndex.numberOfTerms; t++ ) {
terms.readLine( currentTerm );
indexIterator = indexReader.nextIterator();
usedIndices = 0;
frequency = indexIterator.frequency();
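// First pass over the posting list: route each posting to its local index and buffer it in that index's temporary bit stream.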
for ( int j = 0; j < frequency; j++ ) {
globalPointer = indexIterator.nextDocument();
localIndex = strategy.localIndex( globalPointer );
if ( localFrequency[ localIndex ] == 0 ) {
// First time we see a document for this index.
currentTerm.println( localTerms[ localIndex ] );
numTerms[ localIndex ]++;
usedIndex[ usedIndices++ ] = localIndex;
if ( bloomFilterPrecision != 0 ) bloomFilter[ localIndex ].add( currentTerm );
}
/* Temporarily store the posting data; note that we save the global pointer, as we
 * will need it later to access the size list. */
localFrequency[ localIndex ]++;
numPostings[ localIndex ]++;
temp[ localIndex ].writeGamma( globalPointer );
if ( havePayloads ) {
payload = indexIterator.payload();
payload.write( temp[ localIndex ] );
}
if ( haveCounts ) {
count = indexIterator.count();
temp[ localIndex ].writeGamma( count );
globCount[ localIndex ] += count;
if ( maxDocPos[ localIndex ] < count ) maxDocPos[ localIndex ] = count;
if ( havePositions ) {
final int[] pos = indexIterator.positionArray();
// TODO: compress this stuff
for( int p = 0; p < count; p++ ) temp[ localIndex ].writeGamma( pos[ p ] );
}
}
}
// We now run through the indices used by this term and copy from the temporary buffer.
OutputBitStream obs;
for( int k = 0; k < usedIndices; k++ ) {
final int i = usedIndex[ k ];
localFrequencies[ i ].writeGamma( localFrequency[ i ] );
if ( haveCounts ) numOccurrences[ i ] += globCount[ i ];
if ( localGlobCounts[ i ] != null ) localGlobCounts[ i ].writeLongGamma( globCount[ i ] );
globCount[ i ] = 0;
InputBitStream ibs;
indexWriter[ i ].newInvertedList();
temp[ i ].align();
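// Read back the buffered postings: directly from the in-memory buffer when it is still available, through the temporary file otherwise.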
if ( temp[ i ].buffer() != null ) ibs = direct[ i ];
else {
// We cannot read directly from the internal buffer.
ibs = indirect[ i ];
ibs.flush();
temp[ i ].flush();
}
ibs.position( 0 );
indexWriter[ i ].writeFrequency( localFrequency[ i ] );
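// Second pass: replay the buffered postings, remapping each global document pointer to its local pointer before handing it to the index writer.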
for( int j = 0; j < localFrequency[ i ]; j++ ) {
obs = indexWriter[ i ].newDocumentRecord();
globalPointer = ibs.readGamma();
localPointer = strategy.localPointer( globalPointer );
indexWriter[ i ].writeDocumentPointer( obs, localPointer );
if ( havePayloads ) {
payload.read( ibs );
indexWriter[ i ].writePayload( obs, payload );
}
if ( haveCounts ) indexWriter[ i ].writePositionCount( obs, count = ibs.readGamma() );
if ( havePositions ) {
for( int p = 0; p < count; p++ ) position[ p ] = ibs.readGamma();
indexWriter[ i ].writeDocumentPositions( obs, position, 0, count, sizeList != null ? sizeList.getInt( globalPointer ) : -1 );
}
}
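// Rewind the temporary stream and reset the local frequency so they can be reused by the next term.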
temp[ i ].position( 0 );
temp[ i ].writtenBits( 0 );
localFrequency[ i ] = 0;
}
usedIndices = 0;
pl.count += frequency - 1;
pl.update();
}
pl.done();
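// Properties shared by all local indices, copied from the input index.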
Properties globalProperties = new Properties();
globalProperties.setProperty( Index.PropertyKeys.FIELD, inputProperties.getProperty( Index.PropertyKeys.FIELD ) );
globalProperties.setProperty( Index.PropertyKeys.TERMPROCESSOR, inputProperties.getProperty( Index.PropertyKeys.TERMPROCESSOR ) );
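// Close the per-index resources and write a property file (and, if requested, a Bloom filter) for each local index.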
for ( int i = 0; i < numIndices; i++ ) {
localFrequencies[ i ].close();
if ( localGlobCounts[ i ] != null ) localGlobCounts[ i ].close();
localTerms[ i ].close();
indexWriter[ i ].close();
if ( bloomFilterPrecision != 0 ) BinIO.storeObject( bloomFilter[ i ], localBasename[ i ] + DocumentalCluster.BLOOM_EXTENSION );
temp[ i ].close();
tempFile[ i ].delete();
Properties localProperties = indexWriter[ i ].properties();
localProperties.addAll( globalProperties );
localProperties.setProperty( Index.PropertyKeys.MAXCOUNT, String.valueOf( maxDocPos[ i ] ) );
localProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize[ i ] );
localProperties.setProperty( Index.PropertyKeys.FIELD, globalProperties.getProperty( Index.PropertyKeys.FIELD ) );
localProperties.setProperty( Index.PropertyKeys.OCCURRENCES, haveCounts ? numOccurrences[ i ] : -1 );
localProperties.setProperty( Index.PropertyKeys.POSTINGS, numPostings[ i ] );
localProperties.setProperty( Index.PropertyKeys.TERMS, numTerms[ i ] );
if ( havePayloads ) localProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
if ( strategyProperties[ i ] != null ) localProperties.addAll( strategyProperties[ i ] );
localProperties.save( localBasename[ i ] + DiskBasedIndex.PROPERTIES_EXTENSION );
}
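// Global properties describing the documental cluster that ties the local indices together.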
if ( strategyFilename != null ) globalProperties.setProperty( IndexCluster.PropertyKeys.STRATEGY, strategyFilename );
for( int i = 0; i < numIndices; i++ ) globalProperties.addProperty( IndexCluster.PropertyKeys.LOCALINDEX, localBasename[ i ] );
globalProperties.setProperty( DocumentalCluster.PropertyKeys.BLOOM, bloomFilterPrecision != 0 );
// If we partition an index with a single term, by definition we have a flat cluster.
globalProperties.setProperty( DocumentalCluster.PropertyKeys.FLAT, inputProperties.getInt( Index.PropertyKeys.TERMS ) <= 1 );
globalProperties.setProperty( Index.PropertyKeys.MAXCOUNT, inputProperties.getProperty( Index.PropertyKeys.MAXCOUNT ) );
globalProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, inputProperties.getProperty( Index.PropertyKeys.MAXDOCSIZE ) );
globalProperties.setProperty( Index.PropertyKeys.POSTINGS, inputProperties.getProperty( Index.PropertyKeys.POSTINGS ) );
globalProperties.setProperty( Index.PropertyKeys.OCCURRENCES, inputProperties.getProperty( Index.PropertyKeys.OCCURRENCES ) );
globalProperties.setProperty( Index.PropertyKeys.DOCUMENTS, inputProperties.getProperty( Index.PropertyKeys.DOCUMENTS ) );
globalProperties.setProperty( Index.PropertyKeys.TERMS, inputProperties.getProperty( Index.PropertyKeys.TERMS ) );
if ( havePayloads ) globalProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
/* For the general case, we must rely on a merged cluster. However, if we detect a contiguous
* strategy we can optimise a bit. */
globalProperties.setProperty( Index.PropertyKeys.INDEXCLASS,