final int maxDocSize;
if ( writeSizes ) {
logger.info( "Combining sizes..." );
final OutputBitStream sizesOutputBitStream = new OutputBitStream( outputBasename + DiskBasedIndex.SIZES_EXTENSION, bufferSize );
maxDocSize = combineSizes( sizesOutputBitStream );
sizesOutputBitStream.close();
logger.info( "Sizes combined." );
}
else maxDocSize = -1;
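// -1 acts as a sentinel meaning that no document sizes were combined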
// To write the global count of each term
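// (the global count of a term is the total number of its occurrences in the whole collection)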
final OutputBitStream outputGlobCounts = writeGlobCounts ? new OutputBitStream( outputBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION ) : null;
// To write the frequency of each term
final OutputBitStream frequencies = new OutputBitStream( outputBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
// To write the new term list
final PrintWriter termFile = new PrintWriter( new BufferedWriter( new OutputStreamWriter( new FileOutputStream( outputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ), bufferSize ) );
// The current term
MutableString currTerm;
// Total number of pointers
long numPointers = 0;
pl.expectedUpdates = writeGlobCounts ? numberOfOccurrences : -1;
pl.itemsName = writeGlobCounts ? "occurrences" : "terms";
pl.logInterval = logInterval;
pl.start( "Combining lists..." );
int totalFrequency, numTerms = 0, numUsedIndices, k;
long totalGlobCount = 0;
predictedSize = -1;
predictedLengthNumBits = -1;
// Discard first zero from offsets
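// (each offsets stream is a sequence of gamma-coded gaps between the bit offsets of consecutive inverted lists, so after discarding the initial zero every subsequent read returns the length in bits of the next list)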
if ( p != 0 ) for( InputBitStream ibs: offsets ) ibs.readGamma();
// TODO: use the front of the queue?
while( ! termQueue.isEmpty() ) {
numUsedIndices = 0;
// We read a new term from the queue, copy it and write it to the term file
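// (termQueue.first() returns the index of the reader currently holding the smallest term; the term must be copied, as term[ k ] is overwritten by the following readLine())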
currTerm = term[ k = usedIndex[ numUsedIndices++ ] = termQueue.first() ].copy();
if ( DEBUG ) System.err.println( "Merging term " + currTerm );
currTerm.println( termFile );
if ( termReader[ k ].readLine( term[ k ] ) == null ) termQueue.dequeue();
else termQueue.changed();
// Then, we extract all equal terms from the queue, accumulating their indices in usedIndex
while( ! termQueue.isEmpty() && term[ termQueue.first() ].equals( currTerm ) ) {
k = usedIndex[ numUsedIndices++ ] = termQueue.first();
if ( termReader[ k ].readLine( term[ k ] ) == null ) termQueue.dequeue();
else termQueue.changed();
}
if ( numUsedIndices > 1 ) Arrays.sort( usedIndex, 0, numUsedIndices );
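// Sorting keeps the used indices in increasing order, so combine() examines the sub-lists in a consistent order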
// Load index iterators
for( int i = numUsedIndices; i-- != 0; ) indexIterator[ usedIndex[ i ] ] = indexReader[ usedIndex[ i ] ].nextIterator();
numTerms++;
if ( writeGlobCounts ) {
// Compute and write the total global count. This works for all kinds of indices.
totalGlobCount = 0;
for( int i = 0; i < numUsedIndices; i++ ) totalGlobCount += globCounts[ usedIndex[ i ] ].readLongGamma();
outputGlobCounts.writeLongGamma( totalGlobCount );
}
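// Predict the size in bits of the combined inverted list and, separately, of its positions, for use by combine()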
if ( p != 0 ) {
predictedSize = 0;
predictedLengthNumBits = 0;
for( int i = numUsedIndices; i-- != 0; ) {
if ( index[ usedIndex[ i ] ] instanceof BitStreamHPIndex ) {
predictedSize += offsets[ usedIndex[ i ] ].readLongGamma();
if ( hasPositions ) predictedLengthNumBits += posNumBits[ usedIndex[ i ] ].readLongGamma();
}
else {
// Interleaved index: we must subtract the number of bits used for positions from the length of the overall inverted list
final long t = hasPositions ? posNumBits[ usedIndex[ i ] ].readLongGamma() : 0;
predictedSize += offsets[ usedIndex[ i ] ].readLongGamma() - t;
predictedLengthNumBits += t;
}
}
}
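// combine() merges and writes the inverted lists of the current term from all used indices, returning the total frequency (the number of documents in the combined list)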
totalFrequency = combine( numUsedIndices );
frequencies.writeGamma( totalFrequency );
numPointers += totalFrequency;
/* A trick to get a correct prediction: pl.update() counts one item per call, so when logging
 * occurrences we pre-add totalGlobCount - 1 and each term advances the progress by its global count. */
if ( writeGlobCounts ) pl.count += totalGlobCount - 1;
pl.update();
}
pl.done();
if ( writeGlobCounts ) outputGlobCounts.close();
termFile.close();
frequencies.close();
if ( ! metadataOnly ) {
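// Close the readers of the underlying indices and their ancillary streams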
for( int i = numIndices; i-- != 0; ) {
indexReader[ i ].close();
if ( writeGlobCounts ) globCounts[ i ].close();