final PrintWriter pw = new PrintWriter( new OutputStreamWriter( new FastBufferedOutputStream( new FileOutputStream( batchBasename + DiskBasedIndex.TERMS_EXTENSION ), bufferSize ), "UTF-8" ) );
for ( MutableString t : termArray ) t.println( pw );
pw.close();
try {
final OutputBitStream frequencies = new OutputBitStream( batchBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
final OutputBitStream globCounts = new OutputBitStream( batchBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
if ( indexingIsStandard ) {
final OutputBitStream index = new OutputBitStream( batchBasename + DiskBasedIndex.INDEX_EXTENSION );
final OutputBitStream offsets = new OutputBitStream( batchBasename + DiskBasedIndex.OFFSETS_EXTENSION );
final OutputBitStream posNumBits = new OutputBitStream( batchBasename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION );
ByteArrayPostingList baps;
int maxCount = 0, frequency;
long bitLength, postings = 0, prevOffset = 0;
offsets.writeGamma( 0 );
for ( int i = 0; i < numTerms; i++ ) {
baps = termMap.get( termArray[ i ] );
frequency = baps.frequency;
if ( maxCount < baps.maxCount ) maxCount = baps.maxCount;
bitLength = baps.writtenBits();
baps.align();
postings += frequency;
index.writeGamma( frequency - 1 );
// We need special treatment for terms appearing in all documents
if ( frequency == documentCount ) baps.stripPointers( index, bitLength );
else index.write( baps.buffer, bitLength );
frequencies.writeGamma( frequency );
globCounts.writeLongGamma( baps.globCount );
offsets.writeLongGamma( index.writtenBits() - prevOffset );
posNumBits.writeLongGamma( baps.posNumBits );
prevOffset = index.writtenBits();
}
totPostings += postings;
final Properties properties = new Properties();
properties.setProperty( Index.PropertyKeys.DOCUMENTS, documentCount );
properties.setProperty( Index.PropertyKeys.TERMS, numTerms );
properties.setProperty( Index.PropertyKeys.POSTINGS, postings );
properties.setProperty( Index.PropertyKeys.MAXCOUNT, maxCount );
properties.setProperty( Index.PropertyKeys.INDEXCLASS, FileIndex.class.getName() );
properties.addProperty( Index.PropertyKeys.CODING, "FREQUENCIES:GAMMA" );
properties.addProperty( Index.PropertyKeys.CODING, "POINTERS:DELTA" );
if ( completeness.compareTo( Completeness.COUNTS ) >= 0 ) properties.addProperty( Index.PropertyKeys.CODING, "COUNTS:GAMMA" );
if ( completeness.compareTo( Completeness.POSITIONS ) >= 0 ) properties.addProperty( Index.PropertyKeys.CODING, "POSITIONS:DELTA" );
properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( termProcessor ) );
properties.setProperty( Index.PropertyKeys.OCCURRENCES, numOccurrences );
properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize );
properties.setProperty( Index.PropertyKeys.SIZE, index.writtenBits() );
if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
properties.save( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
index.close();
offsets.close();
posNumBits.close();
}
else {
final IndexWriter indexWriter = new BitStreamIndexWriter( batchBasename, maxDocInBatch + 1, true, flags );
ByteArrayPostingList bapl;
OutputBitStream obs;
int maxCount = -1, maxFrequency = 0, frequency;
// Compute max frequency and allocate position array.
for ( ByteArrayPostingList b : termMap.values() ) {
b.close();
b.align();
if ( maxFrequency < b.frequency ) maxFrequency = b.frequency;
if ( maxCount < b.maxCount ) maxCount = b.maxCount;
}
final long[] bitPos = new long[ maxFrequency ];
final int[] pointer = new int[ maxFrequency ];
int[] pos = new int[ maxCount ];
final boolean hasCounts = completeness.compareTo( Completeness.COUNTS ) >= 0;
final boolean hasPositions = completeness.compareTo( Completeness.POSITIONS ) >= 0;
int count = -1, moreCount = -1;
for ( int i = 0; i < numTerms; i++ ) {
bapl = termMap.get( termArray[ i ] );
final InputBitStream ibs = new InputBitStream( bapl.buffer );
frequency = bapl.frequency; // This could be much more than the actual frequency in virtual indices
// Calculate posting bit positions and corresponding pointers
for ( int j = 0; j < frequency; j++ ) {
bitPos[ j ] = ibs.readBits(); // Cache bit poisition
pointer[ j ] = ibs.readDelta(); // Cache pointer
if ( hasCounts ) count = ibs.readGamma() + 1;
if ( hasPositions ) ibs.skipDeltas( count ); // Skip document positions
}
// Sort stably pointers and positions by increasing pointer
it.unimi.dsi.fastutil.Arrays.quickSort( 0, frequency, new AbstractIntComparator() {
/** Orders two posting slots by document pointer; equal pointers are ordered by their cached bit position. */
public int compare( final int i0, final int i1 ) {
	final int pointerDelta = pointer[ i0 ] - pointer[ i1 ];
	if ( pointerDelta == 0 ) {
		// Same pointer: fall back on the cached bit position so that the sort behaves stably.
		final long bitDelta = bitPos[ i0 ] - bitPos[ i1 ];
		if ( bitDelta < 0 ) return -1;
		if ( bitDelta > 0 ) return 1;
		return 0;
	}
	return pointerDelta;
}
},
new Swapper() {
/** Exchanges slots i0 and i1 in both parallel arrays (pointer and bitPos) so they stay aligned. */
public void swap( final int i0, final int i1 ) {
	final int savedPointer = pointer[ i0 ];
	pointer[ i0 ] = pointer[ i1 ];
	pointer[ i1 ] = savedPointer;
	final long savedBitPos = bitPos[ i0 ];
	bitPos[ i0 ] = bitPos[ i1 ];
	bitPos[ i1 ] = savedBitPos;
}
} );
int actualFrequency = frequency;
// Compute actual frequency for virtual indices
if ( indexingIsVirtual ) {
actualFrequency = 1;
for ( int j = 1; j < frequency; j++ ) if ( pointer[ j ] != pointer[ j - 1 ] ) actualFrequency++;
if ( ASSERTS ) {
for ( int j = 1; j < frequency; j++ ) {
assert pointer[ j ] >= pointer[ j - 1 ];
assert pointer[ j ] != pointer[ j - 1 ] || bitPos[ j ] > bitPos[ j - 1 ];
}
}
}
indexWriter.newInvertedList();
indexWriter.writeFrequency( actualFrequency );
int currPointer;
for ( int j = 0; j < frequency; j++ ) {
ibs.position( bitPos[ j ] );
obs = indexWriter.newDocumentRecord();
indexWriter.writeDocumentPointer( obs, currPointer = ibs.readDelta() );
if ( ASSERTS ) assert currPointer == pointer[ j ];
if ( hasCounts ) count = ibs.readGamma() + 1;
if ( hasPositions ) {
ibs.readDeltas( pos, count );
for ( int p = 1; p < count; p++ ) pos[ p ] += pos[ p - 1 ] + 1;
}
if ( indexingIsVirtual ) {
while( j < frequency - 1 ) {
ibs.position( bitPos[ j + 1 ] );
if ( currPointer != ibs.readDelta() ) break;
j++;
if ( hasCounts ) moreCount = ibs.readGamma() + 1;
if ( hasPositions ) {
pos = IntArrays.grow( pos, count + moreCount, count );
pos[ count ] = ibs.readDelta();
if ( ASSERTS ) assert pos[ count ] > pos[ count - 1 ];
for ( int p = 1; p < moreCount; p++ ) pos[ count + p ] = pos[ count + p - 1 ] + 1 + ibs.readDelta();
}
count += moreCount;
}
if ( maxCount < count ) maxCount = count;
}
if ( hasCounts ) indexWriter.writePositionCount( obs, count );
if ( hasPositions ) indexWriter.writeDocumentPositions( obs, pos, 0, count, -1 );
}
frequencies.writeGamma( actualFrequency );
globCounts.writeLongGamma( bapl.globCount );
}
indexWriter.close();
final Properties properties = indexWriter.properties();
totPostings += properties.getLong( "postings" );
properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( termProcessor ) );
properties.setProperty( Index.PropertyKeys.OCCURRENCES, numOccurrences );
properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize );
properties.setProperty( Index.PropertyKeys.SIZE, indexWriter.writtenBits() );
if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
properties.save( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
if ( indexingIsRemapped ) {
// We must permute sizes
final int[] document = new int[ documentCount ], size = new int[ documentCount ];
final InputBitStream sizes = new InputBitStream( batchBasename + DiskBasedIndex.SIZES_EXTENSION );
for ( int i = 0; i < documentCount; i++ ) {
document[ i ] = sizes.readGamma();
size[ i ] = sizes.readGamma();
}
sizes.close();
it.unimi.dsi.fastutil.Arrays.quickSort( 0, documentCount, new AbstractIntComparator() {
/** Orders two slots by the document number stored at each. */
public int compare( int x, int y ) {
	final int left = document[ x ];
	final int right = document[ y ];
	return left - right;
}
}, new Swapper() {
/** Exchanges slots x and y in both document and size, keeping the two arrays aligned. */
public void swap( int x, int y ) {
	final int savedDocument = document[ x ];
	final int savedSize = size[ x ];
	document[ x ] = document[ y ];
	size[ x ] = size[ y ];
	document[ y ] = savedDocument;
	size[ y ] = savedSize;
}
} );
final OutputBitStream permutedSizes = new OutputBitStream( batchBasename( batch, basename, batchDir ) + DiskBasedIndex.SIZES_EXTENSION );
for ( int i = 0, d = 0; i < documentCount; i++ ) {
while ( d++ < document[ i ] )
permutedSizes.writeGamma( 0 );
permutedSizes.writeGamma( size[ i ] );
}
permutedSizes.close();
}
}
if ( indexingIsVirtual ) {
final OutputBitStream sizes = new OutputBitStream( batchBasename( batch, basename, batchDir ) + DiskBasedIndex.SIZES_EXTENSION );
for ( int i = 0; i < currSize.length; i++ ) sizes.writeGamma( currSize[ i ] );
sizes.close();
}
globCounts.close();
frequencies.close();
termMap.clear();