package it.unimi.dsi.mg4j.tool;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2006-2010 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.mg4j.index.BitStreamHPIndex;
import it.unimi.dsi.mg4j.index.BitStreamIndex;
import it.unimi.dsi.mg4j.index.DiskBasedIndex;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.index.cluster.ContiguousLexicalStrategy;
import it.unimi.dsi.mg4j.index.cluster.DocumentalCluster;
import it.unimi.dsi.mg4j.index.cluster.IndexCluster;
import it.unimi.dsi.mg4j.index.cluster.LexicalCluster;
import it.unimi.dsi.mg4j.index.cluster.LexicalPartitioningStrategy;
import it.unimi.dsi.mg4j.index.cluster.LexicalStrategies;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.mg4j.search.score.BM25Scorer;
import it.unimi.dsi.Util;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.BloomFilter;
import it.unimi.dsi.util.ImmutableExternalPrefixMap;
import it.unimi.dsi.util.PrefixMap;
import it.unimi.dsi.util.Properties;
import it.unimi.dsi.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.util.StringMap;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.ConfigurationMap;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
/** Partitions an index lexically.
*
* <p>A global index is partitioned lexically by providing a {@link LexicalPartitioningStrategy}
* that specifies a destination local index for each term, and a local term number. The global index
* is read directly at the bit level, and the posting lists are divided among the
* local indices using the provided strategy. For instance,
 * a {@link ContiguousLexicalStrategy} divides the terms into
 * contiguous blocks, assigning each block to a distinct local index.
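 *
 * <p>As a minimal sketch (the method names and signatures are those used by this tool;
 * real strategies must also be serialisable, as they are stored and loaded with {@link BinIO}),
 * a strategy sending even-numbered terms to index 0 and odd-numbered terms to index 1
 * could look as follows:
 * <pre style="margin: 1em 0">
 * public class EvenOddStrategy implements LexicalPartitioningStrategy, java.io.Serializable {
 *     private static final long serialVersionUID = 1L;
 *     public int numberOfLocalIndices() { return 2; }
 *     public int localIndex( final int globalNumber ) { return globalNumber % 2; }
 *     public int localNumber( final int globalNumber ) { return globalNumber / 2; }
 *     public Properties[] properties() { return new Properties[ 2 ]; } // no additional properties
 * }
 * </pre>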
*
 * <p>By design, document pointers are not remapped. Thus, it may happen that one of the local indices
 * contains <em>no</em> postings for a certain document. Computing the subset of documents appearing
 * in each local index so as to remap them to a contiguous interval is not worthwhile, however,
 * as that subset is usually large.
*
 * <p>To speed up locating the right local index for a not-so-frequent term (in
 * particular with a {@linkplain it.unimi.dsi.mg4j.index.cluster.ChainedLexicalClusteringStrategy chained strategy}),
 * after partitioning an index you can create {@linkplain BloomFilter Bloom filters} that will be used to
 * avoid querying local indices that do not contain a term. The filters are loaded automatically
 * by {@link it.unimi.dsi.mg4j.index.cluster.IndexCluster#getInstance(CharSequence, boolean, boolean)}.
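 *
 * <p>For instance (a sketch: the basename is hypothetical, and the boolean arguments are
 * assumed, by analogy with other index factory methods, to request random access and the
 * loading of document sizes):
 * <pre style="margin: 1em 0">
 * Index cluster = IndexCluster.getInstance( "partitioned-basename", true, false );
 * </pre>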
*
* <p>Note that the size file is the same for each local index and <em>is not copied</em>. Please use
* standard operating system features such as symbolic links to provide size files to
* local indices.
*
* <p>If you plan to {@linkplain LexicalCluster cluster} the partitioned indices and you need document sizes
* (e.g., for {@linkplain BM25Scorer BM25 scoring}), you can use the index property
* {@link it.unimi.dsi.mg4j.index.Index.UriKeys#SIZES} to load the original size file.
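 *
 * <p>For instance (a sketch: basenames are hypothetical, and the <samp>sizes</samp> option
 * name is assumed to be the lowercased form of the key):
 * <pre style="margin: 1em 0">
 * Index local = Index.getInstance( "index-0?sizes=global.sizes" );
 * </pre>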
*
 * <p>If you plan on partitioning an index that requires
 * document sizes, you should consider a custom index loading scheme
 * that shares the {@linkplain it.unimi.dsi.mg4j.index.BitStreamIndex#sizes size list}
 * among all local indices.
*
 * <p><strong>Important</strong>: this class just partitions the index. No auxiliary files (most notably, {@linkplain StringMap term maps}
 * or {@linkplain PrefixMap prefix maps}) will be generated. Please refer to a {@link StringMap} implementation (e.g.,
 * {@link ShiftAddXorSignedStringMap} or {@link ImmutableExternalPrefixMap}) to build them for the local term lists.
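 *
 * <p>For instance, a term map for a local index might be generated along these lines
 * (a sketch: <samp>FileLinesCollection</samp> is part of the DSI utilities, and the
 * <samp>.termmap</samp> extension is just an example):
 * <pre style="margin: 1em 0">
 * ImmutableExternalPrefixMap map = new ImmutableExternalPrefixMap(
 *     new it.unimi.dsi.io.FileLinesCollection( "index-0.terms", "UTF-8" ) );
 * BinIO.storeObject( map, "index-0.termmap" );
 * </pre>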
*
* <h2>Write-once output and distributed index partitioning</h2>
*
 * <p>The partitioning process writes each index file sequentially exactly once, so index partitioning
 * can output its results to <em>pipes</em>, which in
 * turn can spill their content, for instance, through the network. In other words, although this
 * class nominally creates a number of local indices on disk, those indices can be
 * replaced by suitable pipes feeding remote local indices without affecting the partitioning process.
 * For instance, the following <samp>bash</samp> code creates three sets of pipes:
* <pre style="margin: 1em 0">
 * for i in 0 1 2; do
 *   for e in frequencies globcounts index offsets properties sizes terms; do
 *     mkfifo pipe-$i.$e
 *   done
 * done
* </pre>
*
* <p>Each pipe must be emptied elsewhere, for instance (assuming
 * you want local indices <samp>index-0</samp>, <samp>index-1</samp> and <samp>index-2</samp> on <samp>example.com</samp>):
* <pre style="margin: 1em 0">
 * for i in 0 1 2; do
 *   for e in frequencies globcounts index offsets properties sizes terms; do
 *     (cat pipe-$i.$e | ssh -x example.com "cat >index-$i.$e" &)
 *   done
 * done
* </pre>
 * <p>If we now start a partitioning process generating three local indices named <samp>pipe-0</samp>,
 * <samp>pipe-1</samp> and <samp>pipe-2</samp>,
 * all pipes will be written to by the process, and the data will remotely create the
 * indices <samp>index-0</samp>, <samp>index-1</samp> and <samp>index-2</samp>.
*
* @author Sebastiano Vigna
*
* @since 1.0.1
*/
public class PartitionLexically {
private static final Logger LOGGER = Util.getLogger( PartitionLexically.class );
/** The default buffer size for all involved indices. */
public final static int DEFAULT_BUFFER_SIZE = 1024 * 1024;
/** The number of local indices. */
private final int numIndices;
	/** The output basename. */
private final String outputBasename;
/** The array of local output basenames. */
private final String[] localBasename;
/** The input basename. */
private final String inputBasename;
/** The size of I/O buffers. */
private final int bufferSize;
/** The filename of the strategy used to partition the index. */
private final String strategyFilename;
/** The strategy used to partition the index. */
private final LexicalPartitioningStrategy strategy;
/** The additional local properties of each local index. */
private final Properties[] strategyProperties;
/** The logging interval. */
private final long logInterval;
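	/** Creates a new lexical partitioner.
	 *
	 * @param inputBasename the basename of the global index.
	 * @param outputBasename the basename of the local indices.
	 * @param strategy the strategy that assigns each term to a local index.
	 * @param strategyFilename the filename of the serialised strategy, or <code>null</code>.
	 * @param bufferSize the size of the I/O buffers.
	 * @param logInterval the minimum number of milliseconds between activity logs.
	 */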
public PartitionLexically( final String inputBasename,
final String outputBasename,
final LexicalPartitioningStrategy strategy,
final String strategyFilename,
final int bufferSize,
final long logInterval ) {
this.inputBasename = inputBasename;
this.outputBasename = outputBasename;
this.strategy = strategy;
this.strategyFilename = strategyFilename;
this.bufferSize = bufferSize;
this.logInterval = logInterval;
numIndices = strategy.numberOfLocalIndices();
strategyProperties = strategy.properties();
localBasename = new String[ numIndices ];
for( int i = 0; i < numIndices; i++ ) localBasename[ i ] = outputBasename + "-" + i;
}
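	/** Partitions just the term list, writing one term file per local index. */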
public void runTermsOnly() throws IOException {
final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval );
final PrintWriter[] localTerms = new PrintWriter[ numIndices ];
		final int[] numTerms = new int[ numIndices ];
final FastBufferedReader terms = new FastBufferedReader( new InputStreamReader( new FileInputStream( inputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );
for( int i = 0; i < numIndices; i++ ) localTerms[ i ] = new PrintWriter( new OutputStreamWriter( new FastBufferedOutputStream( new FileOutputStream( localBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ) ), "UTF-8" ) );
// The current term
final MutableString currTerm = new MutableString();
pl.itemsName = "terms";
pl.logInterval = logInterval;
pl.start( "Partitioning index terms..." );
int termNumber = 0, k;
while( terms.readLine( currTerm ) != null ) {
k = strategy.localIndex( termNumber ); // The local index for this term
			if ( numTerms[ k ] != strategy.localNumber( termNumber ) ) throw new IllegalStateException( "The local number of term " + termNumber + " should be " + numTerms[ k ] + ", but the strategy claims it is " + strategy.localNumber( termNumber ) );
numTerms[ k ]++;
currTerm.println( localTerms[ k ] );
pl.update();
termNumber++;
}
terms.close();
for( int i = 0; i < numIndices; i++ ) localTerms[ i ].close();
pl.done();
}
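	/** Partitions the index, pouring each posting list into the local index selected by the
	 * strategy, and generates offsets, frequencies, global counts, term lists, size files
	 * and properties for all local indices. */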
public void run() throws ConfigurationException, IOException, ClassNotFoundException {
final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval );
final byte[] buffer = new byte[ bufferSize ];
final OutputBitStream[] localIndexStream = new OutputBitStream[ numIndices ];
final OutputBitStream[] localPositionsStream = new OutputBitStream[ numIndices ];
final OutputBitStream[] localOffsets = new OutputBitStream[ numIndices ];
final OutputBitStream[] localPosNumBits = new OutputBitStream[ numIndices ];
final OutputBitStream[] localFrequencies = new OutputBitStream[ numIndices ];
final OutputBitStream[] localGlobCounts = new OutputBitStream[ numIndices ];
final PrintWriter[] localTerms = new PrintWriter[ numIndices ];
		final int[] numTerms = new int[ numIndices ];
		final long[] numberOfOccurrences = new long[ numIndices ];
		final long[] numberOfPostings = new long[ numIndices ];
final boolean isHighPerformance = BitStreamHPIndex.class.isAssignableFrom( Class.forName( new Properties( inputBasename + DiskBasedIndex.PROPERTIES_EXTENSION ).getString( Index.PropertyKeys.INDEXCLASS ) ) );
final InputBitStream globalIndex = new InputBitStream( inputBasename + DiskBasedIndex.INDEX_EXTENSION, bufferSize );
final long globalPositionsLength = new File( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION ).length();
final InputBitStream globalPositions = isHighPerformance ? new InputBitStream( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION, bufferSize ) : null;
final FastBufferedReader terms = new FastBufferedReader( new InputStreamReader( new FileInputStream( inputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );
final InputBitStream offsets = new InputBitStream( inputBasename + DiskBasedIndex.OFFSETS_EXTENSION );
final File posNumBitsFile = new File( inputBasename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION );
final InputBitStream posNumBits = posNumBitsFile.exists() ? new InputBitStream( inputBasename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION ) : null;
final InputBitStream frequencies = new InputBitStream( inputBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
final InputBitStream globCounts = new InputBitStream( inputBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
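		// The global offset list starts with an initial zero, which we discard; each local
		// offset list gets its own initial zero below.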
offsets.readGamma();
for( int i = 0; i < numIndices; i++ ) {
localIndexStream[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.INDEX_EXTENSION, bufferSize );
if ( isHighPerformance ) localPositionsStream[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.POSITIONS_EXTENSION, bufferSize );
localFrequencies[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.FREQUENCIES_EXTENSION );
localGlobCounts[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
localTerms[ i ] = new PrintWriter( new OutputStreamWriter( new FastBufferedOutputStream( new FileOutputStream( localBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ) ), "UTF-8" ) );
localOffsets[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.OFFSETS_EXTENSION );
if ( posNumBits != null ) localPosNumBits[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION );
localOffsets[ i ].writeGamma( 0 );
}
// The current term
final MutableString currTerm = new MutableString();
pl.expectedUpdates = ( new File( inputBasename + DiskBasedIndex.INDEX_EXTENSION ).length() + ( isHighPerformance ? new File( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION ).length() : 0 ) ) * 8;
pl.itemsName = "bits";
pl.logInterval = logInterval;
pl.start( "Partitioning index..." );
int termNumber = 0, k, prevK = -1, previousHeaderLength = 0, newHeaderLength = 0;
long length, count, positionsOffset = 0;
int res, frequency;
while( terms.readLine( currTerm ) != null ) {
k = strategy.localIndex( termNumber ); // The local index for this term
			if ( numTerms[ k ] != strategy.localNumber( termNumber ) ) throw new IllegalStateException( "The local number of term " + termNumber + " should be " + numTerms[ k ] + ", but the strategy claims it is " + strategy.localNumber( termNumber ) );
numTerms[ k ]++;
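			// In a high-performance index, positions are stored in a separate file, and the
			// header of each posting list is a delta-coded bit offset into that file. We read
			// the current term's offset, pour the positions of the previous term into its local
			// positions stream, and write a new header pointing into the local positions stream.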
if ( isHighPerformance ) {
final long temp = globalIndex.readBits();
positionsOffset = globalIndex.readLongDelta();
previousHeaderLength = (int)( globalIndex.readBits() - temp );
if ( prevK != -1 ) {
length = positionsOffset - globalPositions.readBits();
pl.count += length;
while( length > 0 ) {
res = (int)Math.min( bufferSize * 8, length );
globalPositions.read( buffer, res );
localPositionsStream[ prevK ].write( buffer, res );
length -= res;
}
}
newHeaderLength = localIndexStream[ k ].writeLongDelta( localPositionsStream[ k ].writtenBits() );
}
frequency = frequencies.readGamma();
localFrequencies[ k ].writeGamma( frequency );
numberOfPostings[ k ] += frequency;
if ( posNumBits != null ) localPosNumBits[ k ].writeGamma( posNumBits.readGamma() );
count = globCounts.readLongGamma();
numberOfOccurrences[ k ] += count;
localGlobCounts[ k ].writeLongGamma( count );
currTerm.println( localTerms[ k ] );
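			// Copy the body of the posting list verbatim (the header, if any, was rewritten
			// above); the -1 compensates for the unit increment performed by pl.update() below.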
length = offsets.readLongGamma() - previousHeaderLength;
localOffsets[ k ].writeLongGamma( length + newHeaderLength );
pl.count += length + previousHeaderLength - 1;
while( length > 0 ) {
res = (int)Math.min( bufferSize * 8, length );
globalIndex.read( buffer, res );
localIndexStream[ k ].write( buffer, res );
length -= res;
}
pl.update();
prevK = k;
termNumber++;
}
// We pour the last piece of positions
if ( isHighPerformance ) {
if ( prevK != -1 ) {
length = globalPositionsLength * 8 - globalPositions.readBits();
while( length > 0 ) {
res = (int)Math.min( bufferSize * 8, length );
globalPositions.read( buffer, res );
localPositionsStream[ prevK ].write( buffer, res );
length -= res;
}
}
}
pl.done();
terms.close();
offsets.close();
frequencies.close();
globCounts.close();
globalIndex.close();
if ( posNumBits != null ) posNumBits.close();
if ( isHighPerformance ) globalPositions.close();
// We copy the relevant properties from the original
Properties properties = new Properties( inputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
Properties globalProperties = new Properties();
if ( strategyFilename != null ) globalProperties.setProperty( IndexCluster.PropertyKeys.STRATEGY, strategyFilename );
globalProperties.setProperty( DocumentalCluster.PropertyKeys.BLOOM, false );
globalProperties.setProperty( Index.PropertyKeys.INDEXCLASS, LexicalCluster.class.getName() );
for( int i = 0; i < numIndices; i++ ) globalProperties.addProperty( IndexCluster.PropertyKeys.LOCALINDEX, localBasename[ i ] );
globalProperties.setProperty( Index.PropertyKeys.FIELD, properties.getProperty( Index.PropertyKeys.FIELD ) );
globalProperties.setProperty( Index.PropertyKeys.POSTINGS, properties.getProperty( Index.PropertyKeys.POSTINGS ) );
globalProperties.setProperty( Index.PropertyKeys.OCCURRENCES, properties.getProperty( Index.PropertyKeys.OCCURRENCES ) );
globalProperties.setProperty( Index.PropertyKeys.DOCUMENTS, properties.getProperty( Index.PropertyKeys.DOCUMENTS ) );
globalProperties.setProperty( Index.PropertyKeys.TERMS, properties.getProperty( Index.PropertyKeys.TERMS ) );
globalProperties.setProperty( Index.PropertyKeys.TERMPROCESSOR, properties.getProperty( Index.PropertyKeys.TERMPROCESSOR ) );
globalProperties.setProperty( Index.PropertyKeys.MAXCOUNT, properties.getProperty( Index.PropertyKeys.MAXCOUNT ) );
globalProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, properties.getProperty( Index.PropertyKeys.MAXDOCSIZE ) );
globalProperties.save( outputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
LOGGER.debug( "Properties for clustered index " + outputBasename + ": " + new ConfigurationMap( globalProperties ) );
for( int i = 0; i < numIndices; i++ ) {
localIndexStream[ i ].close();
if ( isHighPerformance ) localPositionsStream[ i ].close();
localOffsets[ i ].close();
if ( posNumBits != null ) localPosNumBits[ i ].close();
localFrequencies[ i ].close();
localGlobCounts[ i ].close();
localTerms[ i ].close();
final InputStream input = new FileInputStream( inputBasename + DiskBasedIndex.SIZES_EXTENSION );
final OutputStream output = new FileOutputStream( localBasename[ i ] + DiskBasedIndex.SIZES_EXTENSION );
IOUtils.copy( input, output );
input.close();
output.close();
Properties localProperties = new Properties();
localProperties.addAll( globalProperties );
localProperties.setProperty( Index.PropertyKeys.TERMS, numTerms[ i ] );
localProperties.setProperty( Index.PropertyKeys.OCCURRENCES, numberOfOccurrences[ i ] );
localProperties.setProperty( Index.PropertyKeys.POSTINGS, numberOfPostings[ i ] );
localProperties.setProperty( Index.PropertyKeys.INDEXCLASS, properties.getProperty( Index.PropertyKeys.INDEXCLASS ) );
localProperties.addProperties( Index.PropertyKeys.CODING, properties.getStringArray( Index.PropertyKeys.CODING ) );
localProperties.setProperty( BitStreamIndex.PropertyKeys.SKIPQUANTUM, properties.getProperty( BitStreamIndex.PropertyKeys.SKIPQUANTUM ) );
localProperties.setProperty( BitStreamIndex.PropertyKeys.SKIPHEIGHT, properties.getProperty( BitStreamIndex.PropertyKeys.SKIPHEIGHT ) );
if ( strategyProperties[ i ] != null ) localProperties.addAll( strategyProperties[ i ] );
localProperties.save( localBasename[ i ] + DiskBasedIndex.PROPERTIES_EXTENSION );
LOGGER.debug( "Post-partitioning properties for index " + localBasename[ i ] + ": " + new ConfigurationMap( localProperties ) );
}
}
public static void main( final String[] arg ) throws JSAPException, ConfigurationException, IOException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException {
SimpleJSAP jsap = new SimpleJSAP( PartitionLexically.class.getName(), "Partitions an index lexically.",
new Parameter[] {
new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer." ),
new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ),
new FlaggedOption( "strategy", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', "strategy", "A serialised lexical partitioning strategy." ),
new FlaggedOption( "uniformStrategy", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'u', "uniform", "Requires a uniform partitioning in the given number of parts." ),
new Switch( "termsOnly", 't', "terms-only", "Just partition the term list." ),
new UnflaggedOption( "inputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the global index." ),
new UnflaggedOption( "outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the local indices." )
});
JSAPResult jsapResult = jsap.parse( arg );
if ( jsap.messagePrinted() ) return;
String inputBasename = jsapResult.getString( "inputBasename" );
String outputBasename = jsapResult.getString( "outputBasename" );
String strategyFilename = jsapResult.getString( "strategy" );
LexicalPartitioningStrategy strategy = null;
if ( jsapResult.userSpecified( "uniformStrategy" ) ) {
strategy = LexicalStrategies.uniform( jsapResult.getInt( "uniformStrategy" ), DiskBasedIndex.getInstance( inputBasename, false, false, true ) );
BinIO.storeObject( strategy, strategyFilename = outputBasename + IndexCluster.STRATEGY_DEFAULT_EXTENSION );
}
else if ( strategyFilename != null ) strategy = (LexicalPartitioningStrategy)BinIO.loadObject( strategyFilename );
		else throw new IllegalArgumentException( "You must specify a partitioning strategy" );
final PartitionLexically partitionLexically = new PartitionLexically( inputBasename,
outputBasename,
strategy,
strategyFilename,
jsapResult.getInt( "bufferSize" ),
jsapResult.getLong( "logInterval" ) );
if ( jsapResult.getBoolean( "termsOnly" ) ) partitionLexically.runTermsOnly();
else partitionLexically.run();
}
}