// Package it.unimi.dsi.mg4j.io
//
// Source Code of it.unimi.dsi.mg4j.io.ByteArrayPostingList

package it.unimi.dsi.mg4j.io;

/*    
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2007-2010 Sebastiano Vigna
*
*  This library is free software; you can redistribute it and/or modify it
*  under the terms of the GNU Lesser General Public License as published by the Free
*  Software Foundation; either version 3 of the License, or (at your option)
*  any later version.
*
*  This library is distributed in the hope that it will be useful, but
*  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
*  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
*  for more details.
*
*  You should have received a copy of the GNU Lesser General Public License
*  along with this program; if not, write to the Free Software
*  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/

import static it.unimi.dsi.io.OutputBitStream.DELTA;
import static it.unimi.dsi.io.OutputBitStream.GAMMA;
import static it.unimi.dsi.io.OutputBitStream.MAX_PRECOMPUTED;
import static it.unimi.dsi.mg4j.tool.Scan.Completeness.COUNTS;
import static it.unimi.dsi.mg4j.tool.Scan.Completeness.POSITIONS;
import it.unimi.dsi.bits.Fast;
import it.unimi.dsi.fastutil.bytes.ByteArrays;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;
import it.unimi.dsi.mg4j.tool.Scan;
import it.unimi.dsi.mg4j.tool.Scan.Completeness;

import java.io.Closeable;
import java.io.File;
import java.io.Flushable;
import java.io.IOException;

/** Lightweight posting accumulator with format similar to that generated by {@link BitStreamIndexWriter}.
*
* <p>This class is essentially a dirty trick: it borrows some code and precomputed tables from {@link OutputBitStream}
* and exposes two simple methods ({@link #setDocumentPointer(int)} and {@link #addPosition(int)}) with obvious
* semantics. The resulting posting list is compressed exactly like a {@link BitStreamIndexWriter} would do (also in this
* case, duplicating some logic found therein). As a result, after completing the calls and after a call to {@link #close()}
* the internal {@link #buffer} can be written directly to a bit stream to build an index (but see {@link #stripPointers(OutputBitStream, long)}).
*
* <p>{@link Scan} uses an instance of this class for each indexed term. Instances can be <em>differential</em>, in which
* case they assume {@link #setDocumentPointer(int)} will be called with increasing values and store gaps rather
* than document pointers. A {@linkplain Completeness completeness level} can be used to set whether an instance of this class
* should store positions or counts.
*
* @author Sebastiano Vigna
* @since 1.2
*/

public class ByteArrayPostingList implements Flushable, Closeable {
  private final static boolean DEBUG = false;

  /** If the enlargement of the backing array causes an out-of-memory error, we set {@link #outOfMemoryError} and try again with a very small increment. This
   * should help in the unlikely (but entirely possible) circumstance that there is not enough memory to double a posting list. */
  private final static int POSTINGS_EMERGENCY_INCREMENT = 1024;
  /** If the enlargement of the position array causes an out-of-memory error, we set {@link #outOfMemoryError} and try again with a very small increment. This
   * should help in the unlikely (but entirely possible) circumstance that there is not enough memory to double a posting list. */
  private final static int POSITIONS_EMERGENCY_INCREMENT = 64;

  /** The internal buffer. */
  public byte[] buffer;
  /** The current frequency (number of calls to {@link #setDocumentPointer(int)}). */
  public int frequency;
  /** The current global count. */
  public long globCount;
  /** The number of bits used for positions. */
  public long posNumBits;
  /** The current count (number of valid entries in {@link #position}). */
  private int count;
  /** The maximum count ever seen. */
  public int maxCount;
  /** If true, this list experienced an {@link OutOfMemoryError} during some buffer reallocation. */
  public boolean outOfMemoryError;
  /** Current bit buffer. */
  private int current;
  /** Current number of free bits in the bit buffer (the bits in the buffer are stored high). */
  private int free;
  /** Current position in the byte buffer. */
  private int pos;
  /** Current number of bytes available in the byte buffer. */
  private int avail;
  /** A small, local cache for positions. */
  private int[] position;
  /** The last document pointer passed to {@link #setDocumentPointer(int)}. */
  private int lastPointer;
  /** Whether this stream is differential. */
  private final boolean differential;
  /** The completeness level of this stream (more precisely, its {@linkplain Completeness#ordinal() ordinal}). */
  private final int completeness;
 
 
  /** Creates a new posting list wrapping a given byte array.
   *
   * @param a the byte array to wrap.
   * @param differential whether this stream should be differential (e.g., whether it should store document pointers as gaps).
   * @param completeness
   */
  public ByteArrayPostingList( final byte[] a, final boolean differential, final Completeness completeness ) {
    this.differential = differential;
    this.completeness = completeness.ordinal();
    free = 8;
    buffer = a;
    avail = a.length;
    if ( this.completeness >= POSITIONS.ordinal() ) position = new int[ 2 ];
    lastPointer = -1;
  }

  private void write( final int b ) {
    if ( avail == 0 ) {
      final int oldLength = buffer.length;
      try {
        buffer = ByteArrays.grow( buffer, buffer.length + 1 );
      }
      catch( OutOfMemoryError e ) {
        outOfMemoryError = true;
        try {
          // We try at all costs to avoid out-of-memory errors: we dump the buffer, try to allocate a slightly larger array and reload it.
          File temp = File.createTempFile( ByteArrayPostingList.class.getSimpleName(), "dump" );
          temp.deleteOnExit();
          BinIO.storeBytes( buffer, temp );
          buffer = null;
          buffer = new byte[ oldLength + POSTINGS_EMERGENCY_INCREMENT ];
          BinIO.loadBytes( temp, buffer );
          temp.delete();
        }
        catch ( IOException f ) {
          throw new RuntimeException( f );
        }
      }
      avail += buffer.length - oldLength;
    }

    avail--;
    buffer[ pos++ ] = (byte)b;
  }

  /** Flushes the internal bit buffer to the {@linkplain #buffer byte buffer}.
   *
   * @return the number of bits written.
   */
 
  public int align() {
    if ( free != 8 ) return writeInCurrent( 0, free );
    else return 0;
  }

  /*
   * The code below is copied from OutputBitStream.
   */
 
  private int writeInCurrent( final int b, final int len ) {
    if ( DEBUG ) if ( len > free ) throw new IllegalArgumentException( Integer.toString( len ) + " bit(s) to write, " + free + " available." );

    current |= ( b & ( ( 1 << len ) - 1 ) ) << ( free -= len );
    if ( free == 0 ) {
      write( current );
      free = 8;
      current = 0;
    }

    return len;
  }


  private int writeInt( int x, final int len ) {

    if ( len < 0 || len > 32 ) throw new IllegalArgumentException( "You cannot write " + len + " bits to an integer." );

    if ( len <= free ) return writeInCurrent( x, len );

    int i = len - free;
    final int queue = i & 7;
   
    if ( free != 0 ) writeInCurrent( x >>> i, free );

    // Dirty trick: since queue < 8, we pre-write the last bits in the bit buffer.
    if ( queue != 0 ) {
      i -= queue;
      writeInCurrent( x, queue );
      x >>>= queue;
    }

    if ( i == 32 ) write( x >>> 24 );
    if ( i > 23 ) write( x >>> 16 );
    if ( i > 15 ) write( x >>> 8 );
    if ( i > 7 ) write( x );
   
    return len;
  }

  private int writeUnary( int x ) {
    if ( x < 0 ) throw new IllegalArgumentException( "The argument " + x + " is negative" );

    if ( x < free ) return writeInCurrent( 1, x + 1 );

    final int shift = free;
    x -= shift;

    write( current );
    free = 8;
    current = 0;

    int i = x >> 3;

    while( i-- != 0 ) write( 0 );

    writeInCurrent( 1, ( x & 7 ) + 1 );

    return x + shift + 1;
  }

  private int writeGamma( int x ) {
    if ( x < 0 ) throw new IllegalArgumentException( "The argument " + x + " is negative" );
    if ( x < MAX_PRECOMPUTED ) return writeInt( GAMMA[ x ], GAMMA[ x ] >>> 26 );
   
    final int msb = Fast.mostSignificantBit( ++x );
    final int l = writeUnary( msb );
    return l + ( msb != 0 ? writeInt( x, msb ) : 0 );
  }

  private int writeDelta( int x ) {
    if ( x < 0 ) throw new IllegalArgumentException( "The argument " + x + " is negative" );
    if ( x < MAX_PRECOMPUTED ) return writeInt( DELTA[ x ], DELTA[ x ] >>> 26 );

    final int msb = Fast.mostSignificantBit( ++x );
    final int l = writeGamma( msb );
    return l + ( msb != 0 ? writeInt( x, msb ) : 0 );
  }

  /** Flushes the positions cached internally.
   *
   */

  public void flush() {
    if ( count != 0 ) {
      if ( completeness >= COUNTS.ordinal() ) writeGamma( count - 1 );
      globCount += count;
      if ( maxCount < count ) maxCount = count;
      if ( completeness >= POSITIONS.ordinal() )  {
        posNumBits += writeDelta( position[ 0 ] );
        for( int i = 1; i < count; i++ ) posNumBits += writeDelta( position[ i ] - position[ i - 1 ] - 1 );
      }
      count = 0;
    }
  }
  /** Sets the current document pointer.
   *
   * <p>If the document pointer is changed since the last call, the positions currently
   * stored are {@linkplain #flush() flushed} and the new pointer is written to the stream.
   *
   * @param pointer a document pointer.
   */
  public void setDocumentPointer( final int pointer ) {
    if ( pointer != lastPointer ) {
      flush();
      writeDelta( differential ? pointer - lastPointer - 1 : pointer );
      lastPointer = pointer;
      frequency++;
    }
  }

  /** Adds a new position for the current document pointer.
   *
   * <p>It is mandatory that successive calls to this method for
   * the same document pointer have increasing arguments.
   *
   * @param pos a position.
   */

  public void addPosition( final int pos ) {
    if ( lastPointer == -1 ) throw new IllegalStateException();
    if ( completeness >= POSITIONS.ordinal() ) {
      if ( count == position.length ) {
        try {
          position = IntArrays.grow( position, count + 1 );
        }
        catch( OutOfMemoryError e ) {
          outOfMemoryError = true;
          try {
            // We try at all costs to avoid out-of-memory errors: we dump the buffer, try to allocate a slightly larger array and reload it.
            File temp = File.createTempFile( ByteArrayPostingList.class.getSimpleName(), "dump" );
            temp.deleteOnExit();
            BinIO.storeInts( position, temp );
            final int oldLength = position.length;
            position = null;
            position = new int[ oldLength + POSITIONS_EMERGENCY_INCREMENT ];
            BinIO.loadInts( temp, position );
            temp.delete();
          }
          catch( IOException f ) {
            throw new RuntimeException( f );
          }
        }
      }
      position[ count ] = pos;
    }
    count++;
  }
 
 
  /** Returns the number of bits written by this posting list.
   *
   * @return the number of bits written by this posting list.
   */
  public long writtenBits() {
    return pos * 8L + 8 - free;
  }
 
  /** Writes the given number of bits of the internal buffer to the provided output bit stream,
   * stripping all document pointers.
   *
   * <p>This method is a horrible kluge solving the problem of terms appearing in all documents:
   * {@link BitStreamIndexWriter} would <em>not</em> write pointers in this case, but we do not know
   * whether we will need pointers or not while we are filling the internal buffer. Thus, for
   * those (hopefully few) termas appearing in all documents this method can be used to
   * dump the internal buffer stripping all pointers.
   *
   * <p>Note that the valid number of bits should be retrieved using {@link #writtenBits()}
   * after a {@link #flush()}. Then, a call to {@link #align()} will dump to the buffer
   * the bits still floating in the bit buffer; at that point this method can be called safely.
   *
   * @param obs an output bit stream.
   * @param bitLength the number of bits to be scanned.
   * @throws IOException
   */
  public void stripPointers( final OutputBitStream obs, final long bitLength ) throws IOException {
    final InputBitStream ibs = new InputBitStream( buffer );
    int count;
    while( ibs.readBits() < bitLength ) {
      ibs.readDelta(); // Discard pointer
      if ( completeness >= COUNTS.ordinal() ) {
        count = ibs.readGamma() + 1;
        obs.writeGamma( count - 1 );
        if ( completeness >= POSITIONS.ordinal() ) while( count-- != 0 ) obs.writeDelta( ibs.readDelta() );
      }
    }
  }
 
  /** Calls {@link #flush()} and then releases resources allocated by this byte-array posting list, keeping just the internal buffer. */
  public void close() {
    flush();
    position = null;
  }
}
// TOP
//
// Related Classes of it.unimi.dsi.mg4j.io.ByteArrayPostingList
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.