Package it.unimi.dsi.mg4j.index

Source Code of it.unimi.dsi.mg4j.index.BitStreamIndexWriter

package it.unimi.dsi.mg4j.index;

/*    
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2003-2010 Paolo Boldi and Sebastiano Vigna
*
*  This library is free software; you can redistribute it and/or modify it
*  under the terms of the GNU Lesser General Public License as published by the Free
*  Software Foundation; either version 3 of the License, or (at your option)
*  any later version.
*
*  This library is distributed in the hope that it will be useful, but
*  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
*  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
*  for more details.
*
*  You should have received a copy of the GNU Lesser General Public License
*  along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/

import it.unimi.dsi.bits.Fast;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
import it.unimi.dsi.mg4j.index.payload.Payload;
import it.unimi.dsi.mg4j.io.InterpolativeCoding;
import it.unimi.dsi.util.Properties;
import java.io.IOException;
import java.util.Map;

/** Writes a bitstream-based interleaved index.
*
* <H2>Offsets bit stream</H2>
*
* <P>An inverted index may have an associated {@link OutputBitStream} of
* offsets: this file contains <code>T+1</code> integers, where <code>T</code>
* is the number of inverted lists (i.e., the number of terms), and the
* <code>i</code>-th entry is the position in bits where
* the <code>i</code>-th inverted list starts (the last entry is actually the
* length, in bits, of the inverted index bit stream itself).
*
* <p>The file actually contains &gamma;-coded gaps: thus, in practice, it is formed by
* the number zero (the offset of the first list) followed by the length of each inverted list.
*
* @author Paolo Boldi
* @author Sebastiano Vigna
* @since 0.6
*/


public class BitStreamIndexWriter extends AbstractBitStreamIndexWriter {
  private static final boolean ASSERTS = false;
 
  /** This value of {@link #state} means that we should call {@link #newInvertedList()}.*/
  protected static final int BEFORE_INVERTED_LIST = 0;

  /** This value of {@link #state} means that we are positioned at the start of an inverted list,
   * and we should call {@link #writeFrequency(int)}.*/
  protected static final int BEFORE_FREQUENCY = 1;

  /** This value of {@link #state} means that we are ready to call {@link #newDocumentRecord()}. */
  protected static final int BEFORE_DOCUMENT_RECORD = 2;

  /** This value of {@link #state} means that we just started a new document record, and we
   * should call {@link #writeDocumentPointer(OutputBitStream, int)}. */
  protected static final int BEFORE_POINTER = 3;

  /** This value of {@link #state} can be assumed only in indices that contain payloads; it
   * means that we are positioned just before the payload for the current document record. */
  protected static final int BEFORE_PAYLOAD = 4;

  /** This value of {@link #state} can be assumed only in indices that contain counts; it
   * means that we are positioned just before the count for the current document record. */
  protected static final int BEFORE_COUNT = 5;

  /** This value of {@link #state} can be assumed only in indices that contain document positions;
   * it means that we are positioned just before the position list of the current document record. */
  protected static final int BEFORE_POSITIONS = 6;

  /** This is the first unused state. Subclasses may start from this value to define new states. */
  protected static final int FIRST_UNUSED_STATE = 7;

  /** The underlying {@link OutputBitStream}. */
  protected OutputBitStream obs;
  /** The offsets {@link OutputBitStream}. */
  private OutputBitStream offsets;
  /** The {@link OutputBitStream} for the number of bits for positions. */
  private OutputBitStream posNumBits;
  /** The current state of the writer. */
  protected int state;
  /** The number of document records that the current inverted list will contain. */
  protected int frequency;
  /** The number of document records already written for the current inverted list. */
  protected int writtenDocuments;
  /** The current document pointer. */
  protected int currentDocument;
  /** The last document pointer in the current list. */
  protected int lastDocument;
  /** The position (in bytes) where the last inverted list started. */
  protected long lastInvertedListPos;
  /** The number of bits spent for positions in this the current inverted list. */
  private long currPosNumBits;
  /** The parameter <code>b</code> for Golomb coding of pointers. */
  protected int b;
  /** The parameter <code>log2b</code> for Golomb coding of pointers; it is the most significant bit of {@link #b}. */
  protected int log2b;
  /** The maximum number of positions in a document record so far. */
  public int maxCount;
 
 
  /** Creates a new index writer, with the specified basename. The index will be written on a file (stemmed with <samp>.index</samp>).
   *  If <code>writeOffsets</code>, also an offset file will be produced (stemmed with <samp>.offsets</samp>).
   *  When {@link #close()} will be called, the property file will also be produced (stemmed with <samp>.properties</samp>),
   *  or enriched if it already exists.
   *
   * @param basename the basename.
   * @param numberOfDocuments the number of documents in the collection to be indexed.
   * @param writeOffsets if <code>true</code>, the offset file will also be produced.
   * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
   */
  public BitStreamIndexWriter( final CharSequence basename, final int numberOfDocuments, final boolean writeOffsets, final Map<Component,Coding> flags ) throws IOException {
    this(
      new OutputBitStream( basename + DiskBasedIndex.INDEX_EXTENSION ),
      writeOffsets ? new OutputBitStream( basename + DiskBasedIndex.OFFSETS_EXTENSION ) : null,
      writeOffsets && flags.get( Component.POSITIONS ) != null ? new OutputBitStream( basename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION ) : null,
      numberOfDocuments,
      flags
     );
  }

  /** Creates a new index writer with payloads using the specified underlying {@link OutputBitStream}.
   *
   * @param obs the underlying output bit stream.
   * @param offset the offset bit stream, or <code>null</code> if offsets should not be written.
   * @param posNumBits the bit stream for positions bit lengths, or <code>null</code> if such lengths should not be written.
   * @param numberOfDocuments the number of documents in the collection to be indexed.
   * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
   */
  public BitStreamIndexWriter( final OutputBitStream obs, final OutputBitStream offset, final OutputBitStream posNumBits, final int numberOfDocuments, final Map<Component,Coding> flags ) {
    super( numberOfDocuments, flags );
    this.obs = obs;
    this.posNumBits = posNumBits;
    this.offsets = offset;
    this.frequency = -1;
    this.currentTerm = -1;
    this.maxCount = 0;
    this.currPosNumBits = -1;

    if ( ! hasCounts && hasPositions ) throw new IllegalArgumentException( "Index would have positions but no counts (this can't happen)" );
  }

  /** Creates a new index writer, with the specified underlying {@link OutputBitStream},
   *  without additional bit streams.
   *
   * @param obs the underlying output bit stream.
   * @param numberOfDocuments the number of documents in the collection to be indexed.
   * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
   */
  public BitStreamIndexWriter( final OutputBitStream obs, final int numberOfDocuments, final Map<Component,Coding> flags ) {
    this( obs, null,null, numberOfDocuments, flags );
  }

  public long newInvertedList() throws IOException {
    if ( frequency >= 0 && frequency != writtenDocuments ) throw new IllegalStateException( "The number of document records (" + this.writtenDocuments + ") does not match the frequency (" + this.frequency + ")" );
    if ( state != BEFORE_INVERTED_LIST && state != BEFORE_DOCUMENT_RECORD ) throw new IllegalStateException( "Trying to start new inverted list in state " + state );

    // The position (in bits) where the new inverted list starts
    long pos = obs.writtenBits();
    // Reset variables
    writtenDocuments = 0;
    currentTerm++;
    currentDocument = -1;

    // If needed, write the offset
    if ( offsets != null ) offsets.writeLongGamma( pos - lastInvertedListPos );
    lastInvertedListPos = pos;
    if ( posNumBits != null && currPosNumBits != -1 ) {
      posNumBits.writeLongGamma( currPosNumBits );
      currPosNumBits = 0;
    }
    state = BEFORE_FREQUENCY;
    return pos;
  }

  public int writeFrequency( final int frequency ) throws IOException {
    if ( state != BEFORE_FREQUENCY ) throw new IllegalStateException( "Trying to write frequency in state " + state );

    int bitCount;
    // Write the frequency
    switch( frequencyCoding ) {
    case SHIFTED_GAMMA:
      bitCount = obs.writeShiftedGamma( frequency - 1 ); // frequency cannot be 0
      break;
    case GAMMA:
      bitCount = obs.writeGamma( frequency - 1 ); // frequency cannot be 0
      break;
    case DELTA:
      bitCount = obs.writeDelta( frequency - 1 ); // frequency cannot be 0
      break;
    default:
      throw new IllegalStateException( "The required frequency coding (" + frequencyCoding + ") is not supported." );
    }

    this.frequency = frequency;

    // We compute the modulus used for pointer Golomb coding
    if ( pointerCoding == Coding.GOLOMB ) {
      b = BitStreamIndex.golombModulus( frequency, numberOfDocuments );
      log2b = Fast.mostSignificantBit( b );
    }

    state = BEFORE_DOCUMENT_RECORD;
    bitsForFrequencies += bitCount;
    return bitCount;
  }

  public OutputBitStream newDocumentRecord() throws IOException {
    if ( frequency == writtenDocuments ) throw new IllegalStateException( "Document record overflow (written " + this.frequency + " already)" );
    if ( state != BEFORE_DOCUMENT_RECORD ) throw new IllegalStateException( "Trying to start new document record in state " + state );

    writtenDocuments++;
    numberOfPostings++;
    lastDocument = currentDocument;
    state = BEFORE_POINTER;
    return obs;
  }

  public int writeDocumentPointer( final OutputBitStream out, final int pointer ) throws IOException {
    if ( state != BEFORE_POINTER ) throw new IllegalStateException( "Trying to write pointer in state " + state );

    currentDocument = pointer;
    int bitCount = 0;

    if ( frequency != numberOfDocuments ) { // We do not write pointers for everywhere occurring documents.
      switch( pointerCoding ) {
        case SHIFTED_GAMMA:
          bitCount = out.writeShiftedGamma( pointer - lastDocument - 1 );
          break;
        case UNARY:
          bitCount = out.writeUnary( pointer - lastDocument - 1 );
          break;
        case GAMMA:
          bitCount = out.writeGamma( pointer - lastDocument - 1 );
          break;
        case DELTA:
          bitCount = out.writeDelta( pointer - lastDocument - 1 );
          break;
        case GOLOMB:
          bitCount = out.writeGolomb( pointer - lastDocument - 1, b, log2b );
          break;
        default:
          throw new IllegalStateException( "The required pointer coding (" + pointerCoding + ") is not supported." );
      }
    }
    else if ( pointer - lastDocument != 1 ) throw new IllegalStateException( "Term " + currentTerm + " has frequency equal to the number of documents, but pointers are not consecutive integers" );

    state = hasPayloads ? BEFORE_PAYLOAD : hasCounts ? BEFORE_COUNT : BEFORE_DOCUMENT_RECORD;
    bitsForPointers += bitCount;
    return bitCount;
  }

  public int writePayload( final OutputBitStream out, final Payload payload ) throws IOException {
    if ( frequency < 0 ) throw new IllegalStateException( "Trying to write payload without calling newInvertedList" );
    if ( state != BEFORE_PAYLOAD ) throw new IllegalStateException( "Trying to write payload in state " + state );
    final int count = payload.write( out );
    bitsForPayloads += count;
    state = hasCounts ? BEFORE_COUNT : BEFORE_DOCUMENT_RECORD;
    return count;
  }


  public void close() throws IOException {
    if ( state != BEFORE_DOCUMENT_RECORD && state != BEFORE_INVERTED_LIST ) throw new IllegalStateException( "Trying to close index in state " + state );
    if ( frequency >= 0 && frequency != writtenDocuments ) throw new IllegalStateException( "The number of document records (" + this.writtenDocuments + ") does not match the frequency (" + this.frequency + ")" );

    if ( writtenBits() != obs.writtenBits() )
      throw new IllegalStateException( "Written bits count mismatch: we say " + writtenBits() + ", the stream says " + obs.writtenBits() );

    if ( offsets != null ) {
      offsets.writeLongGamma( obs.writtenBits() - lastInvertedListPos );
      offsets.close();
    }
   
    if ( posNumBits != null ) {
      if ( currPosNumBits != -1 ) posNumBits.writeLongGamma( currPosNumBits );
      posNumBits.close();
    }

    obs.close();
  }
 

  public int writePositionCount( final OutputBitStream out, final int count ) throws IOException {
    if ( frequency < 0 ) throw new IllegalStateException( "Trying to write count without calling newInvertedList" );
    if ( state != BEFORE_COUNT ) throw new IllegalStateException( "Trying to write count in state " + state );
    final int bitCount;

    numberOfOccurrences += count;
    switch( countCoding ) {
      case SHIFTED_GAMMA:
        bitCount = out.writeShiftedGamma( count - 1 );
        break;
      case GAMMA:
        bitCount = out.writeGamma( count - 1 );
        break;
      case UNARY:
        bitCount = out.writeUnary( count - 1 );
        break;
      case DELTA:
        bitCount = out.writeDelta( count - 1 );
        break;
      default:
        throw new IllegalStateException( "The required count coding (" + countCoding + ") is not supported." );
    }
   
    state = hasPositions ? BEFORE_POSITIONS : BEFORE_DOCUMENT_RECORD;
    bitsForCounts += bitCount;
    return bitCount;
  }

  public int writeDocumentPositions( final OutputBitStream out, final int[] occ, final int offset, final int len, final int docSize ) throws IOException {
    if ( frequency < 0 ) throw new IllegalStateException( "Trying to write occurrences without calling newInvertedList" );
    if ( state != BEFORE_POSITIONS ) throw new IllegalStateException( "Trying to write positions in state " + state );

    if ( ASSERTS ) if ( docSize > 0 ) for( int i = 0; i< len; i++ ) assert occ[ offset + i ] < docSize : "Position " + occ[ offset + i ] + " for document " + currentDocument + " is too large; size is " + docSize;
   
    int i;
    int prev = -1;
    int bitCount = 0;
    final int end = offset + len;

    switch( positionCoding ) {
      case GAMMA:
        for( i = offset; i < end; i++ ) {
          bitCount += out.writeGamma( occ[ i ] - prev - 1 );
          prev = occ[ i ];
        }
        break;
      case DELTA:
        for( i = offset; i < end; i++ ) {
          bitCount += out.writeDelta( occ[ i ] - prev - 1 );
          prev = occ[ i ];
        }
        break;
      case SHIFTED_GAMMA:
        for( i = offset; i < end; i++ ) {
          bitCount += out.writeShiftedGamma( occ[ i ] - prev - 1 );
          prev = occ[ i ];
        }
        break;
      case GOLOMB:
        if ( len < 3 ) {
          for( i = 0; i < len; i++ ) bitCount += out.writeMinimalBinary( occ[ i ], docSize );
          break;
        }

        // We compute b and log2b for positions
        final int positionB = BitStreamIndex.golombModulus( len, docSize );
        final int positionLog2b = Fast.mostSignificantBit( positionB );

        for( i = offset; i < end; i++ ) {
          bitCount += out.writeGolomb( occ[ i ] - prev - 1, positionB, positionLog2b );
          prev = occ[ i ];
        }
        break;
      case INTERPOLATIVE:
        bitCount = InterpolativeCoding.write( out, occ, 0, len, 0, docSize - 1 );
        break;
      default:
        throw new IllegalStateException( "The required position coding (" + positionCoding + ") is not supported." );
    }

    state = BEFORE_DOCUMENT_RECORD;
    bitsForPositions += bitCount;
    currPosNumBits += bitCount;
    if ( len > maxCount ) maxCount = len;
    return bitCount;
  }

  public long writtenBits() {
    return bitsForFrequencies + bitsForPointers + bitsForPayloads + bitsForCounts + bitsForPositions;
  }

  public Properties properties() {
    Properties result = new Properties();
    result.setProperty( Index.PropertyKeys.DOCUMENTS, numberOfDocuments );
    result.setProperty( Index.PropertyKeys.TERMS, currentTerm + 1 );
    result.setProperty( Index.PropertyKeys.POSTINGS, numberOfPostings );
    result.setProperty( Index.PropertyKeys.MAXCOUNT, maxCount );
    result.setProperty( Index.PropertyKeys.INDEXCLASS, FileIndex.class.getName() );
    // We save all flags, except for the PAYLOAD component, which is just used internally.
    for( Map.Entry<Component,Coding> e: flags.entrySet() )
      if ( e.getKey() != Component.PAYLOADS ) result.addProperty( Index.PropertyKeys.CODING, new MutableString().append( e.getKey() ).append( ':' ).append( e.getValue() ) );
    return result;
  }
}
TOP

Related Classes of it.unimi.dsi.mg4j.index.BitStreamIndexWriter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.