package it.unimi.dsi.mg4j.io;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2007-2010 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
import static it.unimi.dsi.io.OutputBitStream.DELTA;
import static it.unimi.dsi.io.OutputBitStream.GAMMA;
import static it.unimi.dsi.io.OutputBitStream.MAX_PRECOMPUTED;
import static it.unimi.dsi.mg4j.tool.Scan.Completeness.COUNTS;
import static it.unimi.dsi.mg4j.tool.Scan.Completeness.POSITIONS;
import it.unimi.dsi.bits.Fast;
import it.unimi.dsi.fastutil.bytes.ByteArrays;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;
import it.unimi.dsi.mg4j.tool.Scan;
import it.unimi.dsi.mg4j.tool.Scan.Completeness;
import java.io.Closeable;
import java.io.File;
import java.io.Flushable;
import java.io.IOException;
/** Lightweight posting accumulator with format similar to that generated by {@link BitStreamIndexWriter}.
 *
 * <p>This class is essentially a dirty trick: it borrows some code and precomputed tables from {@link OutputBitStream}
 * and exposes two simple methods ({@link #setDocumentPointer(int)} and {@link #addPosition(int)}) with obvious
 * semantics. The resulting posting list is compressed exactly like a {@link BitStreamIndexWriter} would do (also in this
 * case, duplicating some logic found therein). As a result, after completing the calls and after a call to {@link #close()}
 * the internal {@link #buffer} can be written directly to a bit stream to build an index (but see {@link #stripPointers(OutputBitStream, long)}).
 *
 * <p>{@link Scan} uses an instance of this class for each indexed term. Instances can be <em>differential</em>, in which
 * case they assume {@link #setDocumentPointer(int)} will be called with increasing values and store gaps rather
 * than document pointers. A {@linkplain Completeness completeness level} can be used to set whether an instance of this class
 * should store positions or counts.
 *
 * @author Sebastiano Vigna
 * @since 1.2
 */
public class ByteArrayPostingList implements Flushable, Closeable {
    private final static boolean DEBUG = false;

    /** If the enlargement of the backing array causes an out-of-memory error, we set {@link #outOfMemoryError} and try again with a very small increment. This
     * should help in the unlikely (but entirely possible) circumstance that there is not enough memory to double a posting list. */
    private final static int POSTINGS_EMERGENCY_INCREMENT = 1024;

    /** If the enlargement of the position array causes an out-of-memory error, we set {@link #outOfMemoryError} and try again with a very small increment. This
     * should help in the unlikely (but entirely possible) circumstance that there is not enough memory to enlarge the position array. */
    private final static int POSITIONS_EMERGENCY_INCREMENT = 64;

    /** The internal buffer. */
    public byte[] buffer;

    /** The current frequency (number of calls to {@link #setDocumentPointer(int)} that actually changed the current document pointer). */
    public int frequency;

    /** The current global count (sum of the counts flushed so far). */
    public long globCount;

    /** The number of bits used for positions. */
    public long posNumBits;

    /** The current count (number of calls to {@link #addPosition(int)} since the last flush; when positions are stored, also the number of valid entries in {@link #position}). */
    private int count;

    /** The maximum count ever seen. */
    public int maxCount;

    /** If true, this list experienced an {@link OutOfMemoryError} during some buffer reallocation. */
    public boolean outOfMemoryError;

    /** Current bit buffer. */
    private int current;

    /** Current number of free bits in the bit buffer (the bits in the buffer are stored high). */
    private int free;

    /** Current position in the byte buffer. */
    private int pos;

    /** Current number of bytes available in the byte buffer. */
    private int avail;

    /** A small, local cache for positions (allocated only when positions are stored). */
    private int[] position;

    /** The last document pointer passed to {@link #setDocumentPointer(int)}, or -1 if no pointer has been set yet. */
    private int lastPointer;

    /** Whether this stream is differential. */
    private final boolean differential;

    /** The completeness level of this stream (more precisely, its {@linkplain Completeness#ordinal() ordinal}). */
    private final int completeness;

    /** Creates a new posting list wrapping a given byte array.
     *
     * @param a the byte array to wrap; it is used as the initial backing buffer and replaced by a larger array when full.
     * @param differential whether this stream should be differential (e.g., whether it should store document pointers as gaps).
     * @param completeness the completeness level: counts are written only if the level is at least {@link Completeness#COUNTS},
     * and positions are cached and written only if the level is at least {@link Completeness#POSITIONS}.
     */
    public ByteArrayPostingList( final byte[] a, final boolean differential, final Completeness completeness ) {
        this.differential = differential;
        this.completeness = completeness.ordinal();
        free = 8;
        buffer = a;
        avail = a.length;
        if ( this.completeness >= POSITIONS.ordinal() ) position = new int[ 2 ];
        lastPointer = -1;
    }

    /** Appends a byte to the byte buffer, enlarging it if necessary.
     *
     * <p>Should the normal enlargement fail with an {@link OutOfMemoryError}, {@link #outOfMemoryError}
     * is set and the buffer is enlarged by just {@link #POSTINGS_EMERGENCY_INCREMENT} bytes going through a temporary file.
     *
     * @param b the byte to be written (only the lower eight bits are stored).
     */
    private void write( final int b ) {
        if ( avail == 0 ) {
            final int oldLength = buffer.length;
            try {
                buffer = ByteArrays.grow( buffer, buffer.length + 1 );
            }
            catch( OutOfMemoryError e ) {
                outOfMemoryError = true;
                enlargeBufferThroughDisk( oldLength + POSTINGS_EMERGENCY_INCREMENT );
            }
            avail += buffer.length - oldLength;
        }
        avail--;
        buffer[ pos++ ] = (byte)b;
    }

    /** Replaces {@link #buffer} with an array of the given length with the same content, passing through a temporary file.
     *
     * <p>We try at all costs to avoid out-of-memory errors: we dump the buffer, drop it,
     * try to allocate a slightly larger array and reload it. The temporary file is deleted
     * whether or not the operation succeeds.
     *
     * @param newLength the length of the new buffer.
     * @throws RuntimeException wrapping any {@link IOException} raised while dumping or reloading.
     */
    private void enlargeBufferThroughDisk( final int newLength ) {
        File temp = null;
        try {
            temp = File.createTempFile( ByteArrayPostingList.class.getSimpleName(), "dump" );
            temp.deleteOnExit();
            BinIO.storeBytes( buffer, temp );
            // Drop the reference before allocating, so that the old array can be collected.
            buffer = null;
            buffer = new byte[ newLength ];
            BinIO.loadBytes( temp, buffer );
        }
        catch ( IOException f ) {
            throw new RuntimeException( f );
        }
        finally {
            if ( temp != null ) temp.delete();
        }
    }

    /** Replaces {@link #position} with an array of the given length with the same content, passing through a temporary file.
     *
     * <p>The strategy is the same used for the byte buffer: dump, drop, reallocate, reload. The temporary
     * file is deleted whether or not the operation succeeds.
     *
     * @param newLength the length of the new position array.
     * @throws RuntimeException wrapping any {@link IOException} raised while dumping or reloading.
     */
    private void enlargePositionsThroughDisk( final int newLength ) {
        File temp = null;
        try {
            temp = File.createTempFile( ByteArrayPostingList.class.getSimpleName(), "dump" );
            temp.deleteOnExit();
            BinIO.storeInts( position, temp );
            // Drop the reference before allocating, so that the old array can be collected.
            position = null;
            position = new int[ newLength ];
            BinIO.loadInts( temp, position );
        }
        catch( IOException f ) {
            throw new RuntimeException( f );
        }
        finally {
            if ( temp != null ) temp.delete();
        }
    }

    /** Flushes the internal bit buffer to the {@linkplain #buffer byte buffer}, padding the current byte with zeroes.
     *
     * @return the number of (padding) bits written, or zero if the bit buffer was empty.
     */
    public int align() {
        if ( free != 8 ) return writeInCurrent( 0, free );
        else return 0;
    }

    /*
     * The code below is copied from OutputBitStream.
     */

    /** Writes the lower <code>len</code> bits of <code>b</code> in the bit buffer, emitting a byte if the bit buffer gets full.
     *
     * @param b the bits to be written.
     * @param len the number of bits to be written; it must not exceed {@link #free} (this is checked only if {@link #DEBUG} is true).
     * @return <code>len</code>.
     */
    private int writeInCurrent( final int b, final int len ) {
        if ( DEBUG ) if ( len > free ) throw new IllegalArgumentException( Integer.toString( len ) + " bit(s) to write, " + free + " available." );
        // Bits are stored high: the new bits go just below those already in the bit buffer.
        current |= ( b & ( ( 1 << len ) - 1 ) ) << ( free -= len );
        if ( free == 0 ) {
            write( current );
            free = 8;
            current = 0;
        }
        return len;
    }

    /** Writes the lower <code>len</code> bits of an integer, most significant bit first.
     *
     * @param x the integer whose lower bits are to be written.
     * @param len the number of bits to be written (between 0 and 32, inclusive).
     * @return <code>len</code>.
     * @throws IllegalArgumentException if <code>len</code> is not between 0 and 32.
     */
    private int writeInt( int x, final int len ) {
        if ( len < 0 || len > 32 ) throw new IllegalArgumentException( "You cannot write " + len + " bits to an integer." );
        if ( len <= free ) return writeInCurrent( x, len );
        // Number of bits that will remain after filling up the current byte.
        int i = len - free;
        // Number of trailing bits that do not fill a whole byte.
        final int queue = i & 7;
        if ( free != 0 ) writeInCurrent( x >>> i, free );
        // Dirty trick: since queue < 8, we pre-write the last bits in the bit buffer.
        if ( queue != 0 ) {
            i -= queue;
            writeInCurrent( x, queue );
            x >>>= queue;
        }
        // Now i is a multiple of 8: the remaining whole bytes go straight to the byte buffer.
        if ( i == 32 ) write( x >>> 24 );
        if ( i > 23 ) write( x >>> 16 );
        if ( i > 15 ) write( x >>> 8 );
        if ( i > 7 ) write( x );
        return len;
    }

    /** Writes a natural number in unary coding (<code>x</code> zeroes followed by a one).
     *
     * @param x a natural number.
     * @return the number of bits written.
     * @throws IllegalArgumentException if <code>x</code> is negative.
     */
    private int writeUnary( int x ) {
        if ( x < 0 ) throw new IllegalArgumentException( "The argument " + x + " is negative" );
        if ( x < free ) return writeInCurrent( 1, x + 1 );
        // The zeroes fill up (at least) the current byte: emit it and then emit whole zero bytes.
        final int shift = free;
        x -= shift;
        write( current );
        free = 8;
        current = 0;
        int i = x >> 3;
        while( i-- != 0 ) write( 0 );
        writeInCurrent( 1, ( x & 7 ) + 1 );
        return x + shift + 1;
    }

    /** Writes a natural number in &gamma; coding.
     *
     * <p>Values smaller than {@link OutputBitStream#MAX_PRECOMPUTED} are written using the precomputed table
     * {@link OutputBitStream#GAMMA}, whose entries are used here with the code length in the upper six bits.
     * Otherwise, the most significant bit position of <code>x</code>&nbsp;+&nbsp;1 is written in unary, followed by
     * the bits of <code>x</code>&nbsp;+&nbsp;1 below it.
     *
     * @param x a natural number.
     * @return the number of bits written.
     * @throws IllegalArgumentException if <code>x</code> is negative.
     */
    private int writeGamma( int x ) {
        if ( x < 0 ) throw new IllegalArgumentException( "The argument " + x + " is negative" );
        if ( x < MAX_PRECOMPUTED ) return writeInt( GAMMA[ x ], GAMMA[ x ] >>> 26 );
        final int msb = Fast.mostSignificantBit( ++x );
        final int l = writeUnary( msb );
        return l + ( msb != 0 ? writeInt( x, msb ) : 0 );
    }

    /** Writes a natural number in &delta; coding.
     *
     * <p>Values smaller than {@link OutputBitStream#MAX_PRECOMPUTED} are written using the precomputed table
     * {@link OutputBitStream#DELTA}, whose entries are used here with the code length in the upper six bits.
     * Otherwise, the most significant bit position of <code>x</code>&nbsp;+&nbsp;1 is written in &gamma; coding, followed by
     * the bits of <code>x</code>&nbsp;+&nbsp;1 below it.
     *
     * @param x a natural number.
     * @return the number of bits written.
     * @throws IllegalArgumentException if <code>x</code> is negative.
     */
    private int writeDelta( int x ) {
        if ( x < 0 ) throw new IllegalArgumentException( "The argument " + x + " is negative" );
        if ( x < MAX_PRECOMPUTED ) return writeInt( DELTA[ x ], DELTA[ x ] >>> 26 );
        final int msb = Fast.mostSignificantBit( ++x );
        final int l = writeGamma( msb );
        return l + ( msb != 0 ? writeInt( x, msb ) : 0 );
    }

    /** Flushes the positions cached internally.
     *
     * <p>If at least one position was added since the last flush, this method writes the count
     * (decremented by one, in &gamma; coding) if counts are stored, updates {@link #globCount} and {@link #maxCount},
     * and writes the positions (the first one as it is, the following ones as gaps decremented by one,
     * all in &delta; coding) if positions are stored. Note that bits may remain in the bit buffer:
     * use {@link #align()} to dump them to the byte buffer.
     */
    @Override
    public void flush() {
        if ( count != 0 ) {
            if ( completeness >= COUNTS.ordinal() ) writeGamma( count - 1 );
            globCount += count;
            if ( maxCount < count ) maxCount = count;
            if ( completeness >= POSITIONS.ordinal() ) {
                posNumBits += writeDelta( position[ 0 ] );
                for( int i = 1; i < count; i++ ) posNumBits += writeDelta( position[ i ] - position[ i - 1 ] - 1 );
            }
            count = 0;
        }
    }

    /** Sets the current document pointer.
     *
     * <p>If the document pointer is changed since the last call, the positions currently
     * stored are {@linkplain #flush() flushed} and the new pointer is written to the stream
     * (as a gap decremented by one if this list is differential).
     *
     * @param pointer a document pointer.
     * @throws IllegalArgumentException if the value to be written is negative (e.g., if this list is differential
     * and <code>pointer</code> is smaller than the last pointer set).
     */
    public void setDocumentPointer( final int pointer ) {
        if ( pointer != lastPointer ) {
            flush();
            writeDelta( differential ? pointer - lastPointer - 1 : pointer );
            lastPointer = pointer;
            frequency++;
        }
    }

    /** Adds a new position for the current document pointer.
     *
     * <p>It is mandatory that successive calls to this method for
     * the same document pointer have increasing arguments.
     *
     * <p>If this list does not store positions, the position is just counted.
     *
     * @param pos a position.
     * @throws IllegalStateException if no document pointer has been set yet.
     */
    public void addPosition( final int pos ) {
        if ( lastPointer == -1 ) throw new IllegalStateException( "No document pointer has been set" );
        if ( completeness >= POSITIONS.ordinal() ) {
            if ( count == position.length ) {
                try {
                    position = IntArrays.grow( position, count + 1 );
                }
                catch( OutOfMemoryError e ) {
                    outOfMemoryError = true;
                    enlargePositionsThroughDisk( position.length + POSITIONS_EMERGENCY_INCREMENT );
                }
            }
            position[ count ] = pos;
        }
        count++;
    }

    /** Returns the number of bits written by this posting list.
     *
     * <p>The result includes the bits still floating in the bit buffer.
     *
     * @return the number of bits written by this posting list.
     */
    public long writtenBits() {
        return pos * 8L + 8 - free;
    }

    /** Writes the given number of bits of the internal buffer to the provided output bit stream,
     * stripping all document pointers.
     *
     * <p>This method is a horrible kluge solving the problem of terms appearing in all documents:
     * {@link BitStreamIndexWriter} would <em>not</em> write pointers in this case, but we do not know
     * whether we will need pointers or not while we are filling the internal buffer. Thus, for
     * those (hopefully few) terms appearing in all documents this method can be used to
     * dump the internal buffer stripping all pointers.
     *
     * <p>Note that the valid number of bits should be retrieved using {@link #writtenBits()}
     * after a {@link #flush()}. Then, a call to {@link #align()} will dump to the buffer
     * the bits still floating in the bit buffer; at that point this method can be called safely.
     *
     * @param obs an output bit stream.
     * @param bitLength the number of bits to be scanned.
     * @throws IOException if an I/O error occurs while reading the internal buffer or writing to <code>obs</code>.
     */
    public void stripPointers( final OutputBitStream obs, final long bitLength ) throws IOException {
        final InputBitStream ibs = new InputBitStream( buffer );
        int count;
        while( ibs.readBits() < bitLength ) {
            ibs.readDelta(); // Discard pointer
            if ( completeness >= COUNTS.ordinal() ) {
                count = ibs.readGamma() + 1;
                obs.writeGamma( count - 1 );
                // Positions are copied verbatim (i.e., as the values that were stored).
                if ( completeness >= POSITIONS.ordinal() ) while( count-- != 0 ) obs.writeDelta( ibs.readDelta() );
            }
        }
    }

    /** Calls {@link #flush()} and then releases resources allocated by this byte-array posting list, keeping just the internal buffer.
     *
     * <p>The position cache is released, so no position should be added to a list storing positions after this method has been called.
     */
    @Override
    public void close() {
        flush();
        position = null;
    }
}