package it.unimi.dsi.mg4j.tool;
/*
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2005-2010 Sebastiano Vigna
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 3 of the License, or (at your option)
 * any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.ints.AbstractIntComparator;
import it.unimi.dsi.fastutil.ints.IntHeapPriorityQueue;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.mg4j.index.CachingOutputBitStream;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.index.IndexIterator;
import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
import java.lang.reflect.InvocationTargetException;
import java.util.Map;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;
import com.martiansoftware.jsap.JSAPException;
/** Pastes several indices.
* <p>Pasting is a very slow way of combining indices: we assume
* that not only documents, but also document occurrences might be scattered
* throughout several indices. When a document appears in several indices,
* its occurrences in a given index are combined. We have two possibilities:
* <ul>
* <li><em>standard</em> pasting: position lists are simply concatenated—it
* is the responsibility of the caller to guarantee that they have been numbered
* in an increasing fashion; the sizes of the last input index are the sizes of
* the pasted index;
* <li><em>incremental</em> pasting: position lists are concatenated, but each
* list is renumbered by adding to all positions the sum of the sizes of the
* current document for all indices that precede the current one (this kind
* of pasting was the only one available before version 3.0).
* </ul>
* <p>Standard pasting is used, for instance, to paste the batches of a
* {@linkplain it.unimi.dsi.mg4j.document.DocumentFactory.FieldType#VIRTUAL virtual field}
* generated by {@link Scan}; the latter takes care of numbering positions
* correctly. If, however, you index parts of the same document collection on
* different machines using the same {@link VirtualDocumentResolver}, then
* the resulting indices for virtual fields will
* have all positions starting from zero, and they will need an incremental
* pasting to be combined correctly.
* <p>Conceptually, this operation is equivalent to splitting a collection
* <em>vertically</em>: each document is divided into a fixed number <var>n</var>
* of consecutive segments (possibly of length 0), and a set of <var>n</var> indices
* is created using the <var>k</var>-th segment of all documents. Pasting the
* resulting indices will produce an index that is identical to the index generated
* by the original collection. The behaviour is analogous to that of the UN*X
* <samp>paste</samp> command if documents are single-line lists of words.
* <p>Note that in case every document appears at most in one index pasting
* is equivalent to {@linkplain it.unimi.dsi.mg4j.tool.Merge merging}. It is, however,
* significantly slower, as the presence of the same document in several lists makes
* it necessary to scan completely the inverted lists to be pasted to compute the
* frequency. To do so, an in-memory buffer is allocated. If an inverted list does not fit
* in the memory buffer, it is spilled on disk. Sizing correctly the buffer, and choosing a fast
* file system for the temporary directory can significantly affect performance.
* <p><strong>Warning</strong>: incremental pasting is very memory-intensive, as
* a list of sizes must be loaded for each index. You can use the URI option
* <samp>succinctsizes=1</samp> to load sizes in a succinct format, which will
* ease the problem.
* @author Sebastiano Vigna
* @since 1.0
*/
final public class Paste extends Combine {
/** The logger of this class. */
private static final Logger LOGGER = Util.getLogger( Paste.class );
/** The default size of the temporary bit stream buffer used while pasting. Posting lists larger
* than this size will be precomputed on disk and then added to the index. */
public final static int DEFAULT_MEMORY_BUFFER_SIZE = 16 * 1024 * 1024;
/** The reference array of the document queue: for each input index, the document pointer
* most recently read from that index while pasting a term. */
protected final int[] doc;
/** Whether this paste is incremental. */
private final boolean incremental;
/** The queue of input index numbers, ordered by the corresponding entry of {@link #doc}
* (ties are broken by index number). */
protected final IntHeapPriorityQueue documentQueue;
/** The temporary cache file used by {@link #combine(int)}. */
private final File tempFile;
/** The temporary output bit stream for {@link #combine(int)}, created over {@link #tempFile}. */
private final CachingOutputBitStream cacheBitStreamOut;
/** The temporary input bit stream for {@link #combine(int)}, reading back from {@link #tempFile}. */
private final InputBitStream cacheBitStreamIn;
/** The input bit stream used to wrap directly {@link #cacheBitStreamOut}'s buffer. */
private final InputBitStream cacheBitStreamInWrapper;
/** The size of the size list for each index, or <code>null</code> if this paste is not incremental. */
private final int[] sizesSize;
/** Pastes several indices into one.
 *
 * @param outputBasename the basename of the combined index.
 * @param inputBasename the basenames of the input indices.
 * @param metadataOnly if true, we save only metadata (term list, frequencies, global counts).
 * @param incremental if true, we perform an incremental paste (needs sizes).
 * @param bufferSize the buffer size for index readers.
 * @param tempFileDir the directory of the temporary file used when pasting.
 * @param tempBufferSize the size of the in-memory buffer used when pasting.
 * @param writerFlags the flags for the index writer.
 * @param interleaved forces an interleaved index.
 * @param skips whether to insert skips in case <code>interleaved</code> is true.
 * @param quantum the quantum of skipping structures; if negative, a percentage of space for variable-quantum indices (irrelevant if <code>skips</code> is false).
 * @param height the height of skipping towers (irrelevant if <code>skips</code> is false).
 * @param skipBufferSize the size of the buffer used to hold temporarily inverted lists during the skipping structure construction.
 * @param logInterval how often we log.
 * @throws IllegalArgumentException if any of the input indices has payloads.
 */
public Paste( final String outputBasename,
		final String[] inputBasename,
		final boolean metadataOnly,
		final boolean incremental,
		final int bufferSize,
		final File tempFileDir,
		final int tempBufferSize,
		final Map<Component,Coding> writerFlags,
		final boolean interleaved,
		final boolean skips,
		final int quantum,
		final int height,
		final int skipBufferSize,
		final long logInterval ) throws IOException, ConfigurationException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
	super( outputBasename, inputBasename, metadataOnly, incremental, bufferSize, writerFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval );
	this.incremental = incremental;

	/* We validate the input indices before touching the file system, so that a rejected
	 * index does not leave a stray temporary file behind.
	 *
	 * By merging occurrences we might obtain an occurrence list as large as the
	 * concatenation of all largest lists. We use this estimate to allocate position,
	 * and update maxCount in combine() to get the real maxCount. */
	int estimateForMaxCount = 0;
	for( int i = 0; i < numIndices; i++ ) {
		if ( index[ i ].hasPayloads ) throw new IllegalArgumentException( "You cannot paste indices with payloads" );
		estimateForMaxCount += index[ i ].maxCount;
	}

	tempFile = File.createTempFile( "MG4J", ".data", tempFileDir );
	cacheBitStreamOut = new CachingOutputBitStream( tempFile, tempBufferSize );
	cacheBitStreamIn = new InputBitStream( tempFile, bufferSize );
	cacheBitStreamInWrapper = new InputBitStream( cacheBitStreamOut.buffer() );

	// The length of each size list is needed only to renumber positions in incremental pasting.
	sizesSize = incremental ? new int[ numIndices ] : null;
	if ( incremental ) for( int i = index.length; i-- != 0; ) sizesSize[ i ] = index[ i ].sizes.size();

	if ( hasPositions ) position = new int[ estimateForMaxCount ];
	doc = new int[ numIndices ];
	documentQueue = new IntHeapPriorityQueue( numIndices, new DocumentIndexComparator( doc ) );
}
/** A comparator making an integer priority queue work much like an indirect
* priority queue, with the additional property of using the reference index as secondary key.
private final static class DocumentIndexComparator extends AbstractIntComparator {
private final int[] refArray;
public DocumentIndexComparator( final int[] refArray ) {
this.refArray = refArray;
public int compare( final int i, final int j ) {
final int t = refArray[ i ] - refArray[ j ];
return t != 0 ? t : i - j;
protected int combineNumberOfDocuments() {
int n = 0;
for( int i = 0; i < numIndices; i++ ) n = Math.max( n, index[ i ].numberOfDocuments );
return n;
protected int combineSizes( final OutputBitStream sizesOutputBitStream ) throws IOException {
int currDoc = 0, maxDocSize = 0;
if ( incremental ) {
// We accumulate document sizes in an array.
size = new int[ numberOfDocuments ];
for( int i = 0; i < numIndices; i++ ) {
final IntIterator sizes = sizes( i );
int j = index[ i ].numberOfDocuments;
currDoc = 0;
while( j-- != 0 ) maxDocSize = Math.max( maxDocSize, size[ currDoc++ ] += sizes.nextInt() );
if ( sizes instanceof Closeable ) ((Closeable)sizes).close();
// We write the array.
for( int s: size ) sizesOutputBitStream.writeGamma( s );
// We keep it if we need sizes.
if ( ! needsSizes ) size = null;
else {
if ( needsSizes ) size = new int[ numberOfDocuments ];
final IntIterator sizes = sizes( numIndices - 1 );
int s = 0;
// We copy the last file size, and store the elements in an array if needsSizes is true.
for( int j = 0; j < numberOfDocuments; j++ ) {
s = sizes.nextInt();
if ( needsSizes ) size[ j ] = s;
maxDocSize = Math.max( maxDocSize, s );
sizesOutputBitStream.writeGamma( s );
if ( sizes instanceof Closeable ) ((Closeable)sizes).close();
// We keep the array if we need sizes.
if ( ! needsSizes ) size = null;
return maxDocSize;
protected int combine( final int numUsedIndices ) throws IOException {
/* If we're merging just one list, merging is fine, and moreover
* maxCount need not be updated, as it is already initialised to
* the maximum over all indices. */
int currIndex, prevDoc = -1, currDoc, count;
int temp[];
OutputBitStream obs;
Index i;
IndexIterator ii;
// Note that the total frequency can be computed only during the merge.
for( int k = numUsedIndices; k-- != 0; ) {
currIndex = usedIndex[ k ];
frequency[ currIndex ] = indexIterator[ currIndex ].frequency();
doc[ currIndex ] = indexIterator[ currIndex ].nextDocument();
documentQueue.enqueue( currIndex );
// First phase: we write the inverted list using a quick-and-dirty format in the cache.
cacheBitStreamOut.position( 0 );
int totalFrequency = 0, increment, prevIndex, totalCount;
while( ! documentQueue.isEmpty() ) {
// We extract the smallest document pointer, and enqueue it in the new index.
currDoc = doc[ currIndex = documentQueue.firstInt() ];
if ( ! metadataOnly ) cacheBitStreamOut.writeDelta( currDoc - prevDoc - 1 );
totalCount = prevIndex = increment = 0;
do {
if ( incremental)
while( prevIndex < currIndex ) {
/* Note that some virtual documents could not exist at all in some index (in which
* case we extend the size list with zeroes). */
if ( sizesSize[ prevIndex ] > currDoc ) increment += index[ prevIndex ].sizes.getInt( currDoc );
i = index[ currIndex ];
i = index[ currIndex ];
ii = indexIterator[ currIndex ];
if ( ! metadataOnly && i.hasCounts ) {
count = ii.count();
if ( i.hasPositions ) {
temp = ii.positionArray();
if ( ! incremental && totalCount > 0 && temp[ 0 ] <= position[ totalCount - 1 ] ) throw new IllegalStateException( "Positions in document " + currDoc + " are not increasing; you probably need to require an incremental pasting" );
for( int k = count; k-- != 0; ) position[ totalCount + k ] = temp[ k ] + increment;
totalCount += count;
// If we just wrote the last document pointer of this term in index j, we dequeue it.
if ( --frequency[ currIndex ] == 0 ) documentQueue.dequeue();
else {
doc[ currIndex ] = ii.nextDocument();
} while( ! documentQueue.isEmpty() && doc[ currIndex = documentQueue.firstInt() ] == currDoc );
if ( totalCount > maxCount ) maxCount = totalCount;
if ( ! metadataOnly && hasCounts ) {
cacheBitStreamOut.writeGamma( totalCount );
if ( hasPositions ) {
cacheBitStreamOut.writeDelta( position[ 0 ] );
for( int k = 1; k < totalCount; k++ ) cacheBitStreamOut.writeDelta( position[ k ] - position[ k - 1 ] - 1 );
prevDoc = currDoc;
if ( ! metadataOnly ) {
// Finally, we pour the data into the actual index.
if ( p != 0 ) variableQuantumIndexWriter.newInvertedList( totalFrequency, p, predictedSize, predictedLengthNumBits );
else indexWriter.newInvertedList();
indexWriter.writeFrequency( totalFrequency );
final InputBitStream ibs;
if ( cacheBitStreamOut.buffer() != null ) ibs = cacheBitStreamInWrapper;
else {
ibs = cacheBitStreamIn;
ibs.position( 0 );
currDoc = -1;
for( int j = totalFrequency; j-- != 0; ) {
obs = indexWriter.newDocumentRecord();
indexWriter.writeDocumentPointer( obs, currDoc = ibs.readDelta() + currDoc + 1 );
if ( hasCounts ) {
count = ibs.readGamma();
indexWriter.writePositionCount( obs, count );
if ( hasPositions ) {
position[ 0 ] = ibs.readDelta();
for( int k = 1; k < count; k++ ) position[ k ] = position[ k - 1 ] + ibs.readDelta() + 1;
indexWriter.writeDocumentPositions( obs, position, 0, count, size != null ? size[ currDoc ] : -1 );
return totalFrequency;
public void run() throws ConfigurationException, IOException {;
public static void main( String arg[] ) throws ConfigurationException, SecurityException, JSAPException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
Combine.main( arg, Paste.class );