Source Code of it.unimi.dsi.mg4j.tool.Paste$DocumentIndexComparator

package it.unimi.dsi.mg4j.tool;


/*     
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2005-2010 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */


import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.ints.AbstractIntComparator;
import it.unimi.dsi.fastutil.ints.IntHeapPriorityQueue;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.mg4j.index.CachingOutputBitStream;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.index.IndexIterator;
import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
import it.unimi.dsi.mg4j.index.CompressionFlags.Component;


import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.net.URISyntaxException;
import java.util.Map;


import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;


import com.martiansoftware.jsap.JSAPException;


/** Pastes several indices.
 * 
 * <p>Pasting is a very slow way of combining indices: we assume
 * that not only documents, but also document occurrences might be scattered
 * throughout several indices. When a document appears in several indices,
 * its occurrences in a given index are combined. We have two possibilities:
 * <ul>
 * <li><em>standard</em> pasting: position lists are simply concatenated&mdash;it
 * is the responsibility of the caller to guarantee that they have been numbered
 * in an increasing fashion; the sizes of the last input index are the sizes of
 * the pasted index;
 * <li><em>incremental</em> pasting: position lists are concatenated, but each
 * list is renumbered by adding to all positions the sum of the sizes of the
 * current document for all indices that precede the current one (this kind
 * of pasting was the only one available before version 3.0).
 * </ul>
 * 
 * <p>Standard pasting is used, for instance, to paste the batches of a
 * {@linkplain it.unimi.dsi.mg4j.document.DocumentFactory.FieldType#VIRTUAL virtual field}
 * generated by {@link Scan}; the latter takes care of numbering positions
 * correctly. If, however, you index parts of the same document collection on 
 * different machines using the same {@link VirtualDocumentResolver}, then
 * the resulting indices for virtual fields will 
 * have all positions starting from zero, and they will need an incremental
 * pasting to be combined correctly.
 * 
 * <p>Conceptually, this operation is equivalent to splitting a collection
 * <em>vertically</em>: each document is divided into a fixed number <var>n</var> 
 * of consecutive segments (possibly of length 0), and a set of <var>n</var> indices
 * is created using the <var>k</var>-th segment of all documents. Pasting the
 * resulting indices will produce an index that is identical to the index generated
 * by the original collection. The behaviour is analogous to that of the UN*X
 * <samp>paste</samp> command if documents are single-line lists of words. 
 * 
 * <p>Note that in case every document appears at most in one index pasting
 * is equivalent to {@linkplain it.unimi.dsi.mg4j.tool.Merge merging}. It is, however,
 * significantly slower, as the presence of the same document in several lists makes
 * it necessary to scan completely the inverted lists to be pasted to compute the
 * frequency. To do so, an in-memory buffer is allocated. If an inverted list does not fit
 * in the memory buffer, it is spilled on disk. Sizing correctly the buffer, and choosing a fast
 * file system for the temporary directory can significantly affect performance.
 * 
 * <p><strong>Warning</strong>: incremental pasting is very memory-intensive, as
 * a list of sizes must be loaded for each index. You can use the URI option
 * <samp>succinctsizes=1</samp> to load sizes in a succinct format, which will
 * ease the problem.
 *   
 * @author Sebastiano Vigna
 * @since 1.0
 */


final public class Paste extends Combine {
  @SuppressWarnings("unused")
  private static final Logger LOGGER = Util.getLogger( Paste.class );
  
  /** The default size of the temporary bit stream buffer used while pasting. Posting lists larger
   * than this size will be precomputed on disk and then added to the index. */
  public final static int DEFAULT_MEMORY_BUFFER_SIZE = 16 * 1024 * 1024;
  
  /** The reference array of the document queue: for each input index, the document pointer
   * most recently returned by its iterator in {@link #combine(int)}. */
  protected final int[] doc;
  /** Whether this paste is incremental. */
  private final boolean incremental;
  /** The queue of input-index numbers, ordered by the corresponding entry of {@link #doc}
   * and, in case of a tie, by index number. */
  protected final IntHeapPriorityQueue documentQueue;
  /** The temporary cache file used by {@link #combine(int)}. */
  private final File tempFile;
  /** The temporary output bit stream for {@link #combine(int)}, writing to {@link #tempFile}. */
  private final CachingOutputBitStream cacheBitStreamOut;
  /** The temporary input bit stream for {@link #combine(int)}, reading from {@link #tempFile};
   * it is used when {@link #cacheBitStreamOut} no longer exposes an in-memory buffer. */
  private final InputBitStream cacheBitStreamIn;
  /** The input bit stream used to wrap directly {@link #cacheBitStreamOut}'s buffer. */
  private final InputBitStream cacheBitStreamInWrapper;
  /** The size of the size list for each index, or <code>null</code> if this paste is not incremental. */
  private final int[] sizesSize;
  
  /** Pastes several indices into one.
   * 
   * @param outputBasename the basename of the combined index.
   * @param inputBasename the basenames of the input indices.
   * @param metadataOnly if true, we save only metadata (term list, frequencies, global counts).
   * @param incremental if true, we perform an incremental paste (needs sizes).
   * @param bufferSize the buffer size for index readers.
   * @param tempFileDir the directory of the temporary file used when pasting.
   * @param tempBufferSize the size of the in-memory buffer used when pasting.
   * @param writerFlags the flags for the index writer.
   * @param interleaved forces an interleaved index.
   * @param skips whether to insert skips in case <code>interleaved</code> is true.
   * @param quantum the quantum of skipping structures; if negative, a percentage of space for variable-quantum indices (irrelevant if <code>skips</code> is false).
   * @param height the height of skipping towers (irrelevant if <code>skips</code> is false).
   * @param skipBufferSize the size of the buffer used to hold temporarily inverted lists during the skipping structure construction.
   * @param logInterval how often we log.
   */
  public Paste( final String outputBasename,
      final String[] inputBasename,
      final boolean metadataOnly,
      final boolean incremental,
      final int bufferSize,
      final File tempFileDir,
      final int tempBufferSize,
      final Map<Component,Coding> writerFlags,
      final boolean interleaved,
      final boolean skips,
      final int quantum,
      final int height,
      final int skipBufferSize,
      final long logInterval ) throws IOException, ConfigurationException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    super( outputBasename, inputBasename, metadataOnly, incremental, bufferSize, writerFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval );
    this.incremental = incremental;


    tempFile = File.createTempFile( "MG4J", ".data", tempFileDir );
    cacheBitStreamOut = new CachingOutputBitStream( tempFile, tempBufferSize );
    cacheBitStreamIn = new InputBitStream( tempFile, bufferSize );
    cacheBitStreamInWrapper = new InputBitStream( cacheBitStreamOut.buffer() );
    /* In this case, we must reallocate position as by merging occurences we might
     * obtain an occurrence list as large as the concatenation of all largest
     * lists. We use this estimate to allocate position, and update maxCount in
     * combine() to get the real maxCount. */
    int estimateForMaxCount = 0, tempSize = 0;
    sizesSize = incremental ? new int[ numIndices ] : null;
    if ( incremental ) for( int i = index.length; i-- != 0; ) sizesSize[ i ] = index[ i ].sizes.size();
  
    for( int i = 0; i < numIndices; i++ ) {
      if ( index[ i ].hasPayloads ) throw new IllegalArgumentException( "You cannot paste indices with payloads" );
      estimateForMaxCount += index[ i ].maxCount;
      tempSize = Math.max( tempSize, index[ i ].maxCount );
    }


    if ( hasPositions ) position = new int[ estimateForMaxCount ];
    doc = new int[ numIndices ];
    documentQueue = new IntHeapPriorityQueue( numIndices, new DocumentIndexComparator( doc ) );
  }


  /** A comparator making an integer priority queue work much like an indirect
   * priority queue, with the additional property of using the reference index as secondary key.
   */
  
  private final static class DocumentIndexComparator extends AbstractIntComparator {
    private final int[] refArray;


    public DocumentIndexComparator( final int[] refArray ) {
      this.refArray = refArray;
    }
     
    public int compare( final int i, final int j ) {
      final int t = refArray[ i ] - refArray[ j ];
      return t != 0 ? t : i - j;
    }
  }
  
  
  protected int combineNumberOfDocuments() {
    int n = 0;
    for( int i = 0; i < numIndices; i++ ) n = Math.max( n, index[ i ].numberOfDocuments );
    return n;
  }


  protected int combineSizes( final OutputBitStream sizesOutputBitStream ) throws IOException {
    int currDoc = 0, maxDocSize = 0;
    
    if ( incremental ) {
      // We accumulate document sizes in an array.
      size = new int[ numberOfDocuments ];
      for( int i = 0; i < numIndices; i++ ) {
        final IntIterator sizes = sizes( i );
        int j = index[ i ].numberOfDocuments;
        currDoc = 0;
        while( j-- != 0 ) maxDocSize = Math.max( maxDocSize, size[ currDoc++ ] += sizes.nextInt() );
        if ( sizes instanceof Closeable ) ((Closeable)sizes).close();
      }
      // We write the array.
      for( int s: size ) sizesOutputBitStream.writeGamma( s );
      // We keep it if we need sizes.
      if ( ! needsSizes ) size = null;
    }
    else {
      if ( needsSizes ) size = new int[ numberOfDocuments ]; 
      final IntIterator sizes = sizes( numIndices - 1 );
      int s = 0;
      // We copy the last file size, and store the elements in an array if needsSizes is true.
      for( int j = 0; j < numberOfDocuments; j++ ) {
        s = sizes.nextInt();
        if ( needsSizes ) size[ j ] = s;
        maxDocSize = Math.max( maxDocSize, s );
        sizesOutputBitStream.writeGamma( s );
      }
      if ( sizes instanceof Closeable ) ((Closeable)sizes).close();
      // We keep the array if we need sizes.
      if ( ! needsSizes ) size = null;
    }
    return maxDocSize;
  }




  protected int combine( final int numUsedIndices ) throws IOException {
    /* If we're merging just one list, merging is fine, and moreover
     * maxCount need not be updated, as it is already initialised to
     * the maximum over all indices. */
    int currIndex, prevDoc = -1, currDoc, count;
    int temp[];
    OutputBitStream obs;
    Index i;
    IndexIterator ii;
  
    // Note that the total frequency can be computed only during the merge.
    for( int k = numUsedIndices; k-- != 0; ) {
      currIndex = usedIndex[ k ];
      frequency[ currIndex ] = indexIterator[ currIndex ].frequency();
      doc[ currIndex ] = indexIterator[ currIndex ].nextDocument();
      documentQueue.enqueue( currIndex );
    }
    
    // First phase: we write the inverted list using a quick-and-dirty format in the cache.
    cacheBitStreamOut.position( 0 );
    int  totalFrequency = 0, increment, prevIndex, totalCount;
    
    while( ! documentQueue.isEmpty() ) {
      // We extract the smallest document pointer, and enqueue it in the new index.
      currDoc = doc[ currIndex = documentQueue.firstInt() ];
      totalFrequency++;
      if ( ! metadataOnly ) cacheBitStreamOut.writeDelta( currDoc - prevDoc - 1 );
      
      totalCount = prevIndex = increment = 0;
      
      do {
        if ( incremental) 
          while( prevIndex < currIndex ) {
            /* Note that some virtual documents could not exist at all in some index (in which
             * case we extend the size list with zeroes). */ 
            if ( sizesSize[ prevIndex ] > currDoc ) increment += index[ prevIndex ].sizes.getInt( currDoc );
            prevIndex++;
          }
        i = index[ currIndex ];


        i = index[ currIndex ];
        ii = indexIterator[ currIndex ];
      
        if ( ! metadataOnly && i.hasCounts ) {
          count = ii.count();
          if ( i.hasPositions ) {
            temp = ii.positionArray();
            if ( ! incremental && totalCount > 0 && temp[ 0 ] <= position[ totalCount - 1 ] ) throw new IllegalStateException( "Positions in document " + currDoc + " are not increasing; you probably need to require an incremental pasting" );
            for( int k = count; k-- != 0; ) position[ totalCount + k ] = temp[ k ] + increment;
          }
          totalCount += count;
        }
        
        // If we just wrote the last document pointer of this term in index j, we dequeue it.
        if ( --frequency[ currIndex ] == 0 ) documentQueue.dequeue();
        else {
          doc[ currIndex ] = ii.nextDocument();
          documentQueue.changed();
        }
      } while( ! documentQueue.isEmpty() && doc[ currIndex = documentQueue.firstInt() ] == currDoc );
  
      if ( totalCount > maxCount ) maxCount = totalCount;
  
      if ( ! metadataOnly && hasCounts ) { 
        cacheBitStreamOut.writeGamma( totalCount );
        if ( hasPositions ) {
          cacheBitStreamOut.writeDelta( position[ 0 ] );
          for( int k = 1; k < totalCount; k++ ) cacheBitStreamOut.writeDelta( position[ k ] - position[ k - 1 ] - 1 );
        }
      }
  
      prevDoc = currDoc;
    }
  
    if ( ! metadataOnly ) {
      // Finally, we pour the data into the actual index.


      if ( p != 0 ) variableQuantumIndexWriter.newInvertedList( totalFrequency, p, predictedSize, predictedLengthNumBits ); 
      else indexWriter.newInvertedList();


      indexWriter.writeFrequency( totalFrequency );
      cacheBitStreamOut.align();
      final InputBitStream ibs;


      if ( cacheBitStreamOut.buffer() != null ) ibs = cacheBitStreamInWrapper;
      else {
        cacheBitStreamOut.flush();
        ibs = cacheBitStreamIn;
        ibs.flush();
      }


      ibs.position( 0 );


      currDoc = -1;
      for( int j = totalFrequency; j-- != 0; ) {
        obs = indexWriter.newDocumentRecord();
        indexWriter.writeDocumentPointer( obs, currDoc = ibs.readDelta() + currDoc + 1 );
        if ( hasCounts ) {
          count = ibs.readGamma();
          indexWriter.writePositionCount( obs, count );
          if ( hasPositions ) {
            position[ 0 ] = ibs.readDelta();
            for( int k = 1; k < count; k++ ) position[ k ] = position[ k - 1 ] + ibs.readDelta() + 1;
            indexWriter.writeDocumentPositions( obs, position, 0, count, size != null ? size[ currDoc ] : -1 );
          }
        }
      }


    }
    
    return totalFrequency;
  }
  
  public void run() throws ConfigurationException, IOException {
    super.run();
    cacheBitStreamOut.close();
    tempFile.delete();
  }


  public static void main( String arg[] ) throws ConfigurationException, SecurityException, JSAPException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    Combine.main( arg, Paste.class );
  }
}
Source Code of it.unimi.dsi.mg4j.tool.Paste$DocumentIndexComparator

Related Classes of it.unimi.dsi.mg4j.tool.Paste$DocumentIndexComparator