Source Code of it.unimi.dsi.mg4j.search.score.BM25FScorer

package it.unimi.dsi.mg4j.search.score;


/*     
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2006-2010 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */


import it.unimi.dsi.fastutil.doubles.DoubleArrays;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongList;
import it.unimi.dsi.fastutil.objects.Object2DoubleMap;
import it.unimi.dsi.fastutil.objects.Object2DoubleOpenHashMap;
import it.unimi.dsi.fastutil.objects.Reference2DoubleMap;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.query.Query;
import it.unimi.dsi.mg4j.query.nodes.MultiIndexTermExpander;
import it.unimi.dsi.mg4j.search.DocumentIterator;
import it.unimi.dsi.mg4j.search.visitor.CounterCollectionVisitor;
import it.unimi.dsi.mg4j.search.visitor.CounterSetupVisitor;
import it.unimi.dsi.mg4j.search.visitor.TermCollectionVisitor;
import it.unimi.dsi.mg4j.tool.Paste;
import it.unimi.dsi.util.ImmutableExternalPrefixMap;
import it.unimi.dsi.util.SemiExternalGammaList;
import it.unimi.dsi.util.StringMap;


import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;


import org.apache.log4j.Logger;


/** A scorer that implements the BM25F ranking scheme.
 * 
 * <p>BM25F is an evolution of {@linkplain BM25Scorer BM25} described by
 * Stephen Robertson, Hugo Zaragoza and Michael Taylor in &ldquo;Simple BM25 extension to multiple weighted fields&rdquo;,
 * <i>CIKM '04: Proceedings of the thirteenth ACM international Conference on Information and Knowledge Management</i>, 
 * pages 42&minus;49, ACM Press, 2004.
 * 
 * <p>The idea behind BM25F is that adding up (albeit with weights) BM25 scores from different fields breaks down the nonlinearity
 * of BM25. Instead, we should work on a <em>virtual document collection</em>: more precisely,
 * we should behave as if all fields were concatenated in a single stream of text.
 * For instance, if weights are integers, the formula behaves as if the text of each field is concatenated as many times as its weight to form a global
 * text, which is then scored using BM25.
 *  
 * <p>Note that, for this to happen, we would need to know the corresponding frequency&mdash;that is, for each term, the number of documents in which
 * the term appears <em>in at least one of the fields</em>. This number must be provided at construction time: more precisely, you must specify
 * a {@link StringMap} that maps each term appearing in some field to an index into a {@link LongList} containing
 * the correct frequencies. These data is
 * accessed only in the preparatory phase, so access can be reasonably slow.
 * 
 * <p><strong>Important</strong>: the only source of knowledge about the overall set of indices involved in query resolution is given
 * by calls to {@link #setWeights(it.unimi.dsi.fastutil.objects.Reference2DoubleMap)}. That is, this scorer will assume that all indices
 * appearing in a query are also keys of the weight function passed to {@link #setWeights(it.unimi.dsi.fastutil.objects.Reference2DoubleMap)}. An
 * exception will be raised if these guidelines are not followed.
 * 
 * <h2>Computing frequency data</h2>
 * 
 * <p>The tool {@link Paste} can be used to create the metadata of the virtual collection. To do so, simply run {@link Paste} on
 * the indices of <em>all</em> fields over which you want to compute BM25F with the <samp>--metadata-only</samp> option.
 * The resulting frequency file is what you need to pass 
 * to the constructor, and from the term file you can build a {@link StringMap} (e.g., using an {@link ImmutableExternalPrefixMap})
 * that will be used to index the frequencies.
 * 
 * <h2>Boldi's variant</h2>
 * 
 * <p>Providing global frequency data makes it possible to compute the classical BM25F formula. 
 * If no frequency data is provided, this class implements Paolo Boldi's variant of BM25. In this case, we multiply the
 * IDF score by the weighted count of each term to form the virtual count that will be passed through BM25's nonlinear function.
 * 
 * <h2>Using this scorer</h2>
 * 
 * <p>This scorer assigns to each pair index/term {@linkplain DocumentIterator#acceptOnTruePaths(it.unimi.dsi.mg4j.search.visitor.DocumentIteratorVisitor) reachable by true paths}
 * a score that depends on the <em>virtual count</em> of the term, which is the count of the term for the given index multiplied by the weight of the index.
 * To obtain the &ldquo;classical&rdquo; BM25F score you must write a query <var>q</var> that contains no index selector and multiplexes it on all indices, 
 * e.g., <samp>a:<var>q</var> | b:<var>q</var> | c:<var>q</var></samp>. If a term appears only in some specific index/query pair, its score will be computed
 * using a smaller virtual count, obtained just by adding up the values associated to the actually present index/query pairs. Usually, the
 * simplest way to obtain this result is to use a {@link MultiIndexTermExpander}, which can be even set from the command-line interface
 * provided by {@link Query}.
 *
 * <h2>Correctness</h2>
 * 
 * <p>The code in this scorer is verified by unit tests developed jointly with Hugo Zaragoza. 
 * This is an important point, as the definition of BM25F contains many subtleties.
 * 
 * @see BM25Scorer
 * @author Sebastiano Vigna
 */


public class BM25FScorer extends AbstractWeightedScorer implements DelegatingScorer {
  private static final Logger LOGGER = Logger.getLogger( BM25FScorer.class );
  private static final boolean DEBUG = false;


  /** The default value used for the parameter <var>k</var><sub>1</sub>. */
  public final static double DEFAULT_K1 = 1.2;
  /** The default value used for the parameter <var>b</var>. */
  public final static double DEFAULT_B = 0.5;
  /** The value of the document-frequency part for terms appearing in more than half of the documents. */
  public final static double EPSILON_SCORE = 1E-6;
  
  /** The counter collection visitor used to estimate counts. */
  private final CounterCollectionVisitor counterCollectionVisitor;
  /** The counter setup visitor used to estimate counts. */
  private final CounterSetupVisitor setupVisitor;
  /** The term collection visitor used to estimate counts. */
  private final TermCollectionVisitor termVisitor;


  /** The parameter <var>k</var><sub>1</sub>. */
  public final double k1;
  /** The parameter <var>b</var>; you must provide one value for each index. */
  public final Reference2DoubleMap<Index> bByIndex;


  /** The parameter {@link #k1} plus one, precomputed. */
  private final double k1Plus1;
  /** A virtual average document size obtained by the weighted average of the average document size of each index. */
  private double[] avgDocumentSize;
  /** An array (parallel to {@link #currIndex}) that caches size lists. */
  private IntList sizes[];
  /** An array indexed by offsets that caches the inverse document-frequency part of the formula, multiplied by the index weight. */
  private double[] idfPart;
  /** An array indexed by offsets that caches the weight corresponding to each pair. */
  private double[] offset2Weight;
  /** An array indexed by offsets that gives the unique id of each term in the query. */
  private int[] offset2TermId;
  /** A term map to index {@link #frequencies}. */
  private final StringMap<? extends CharSequence> termMap;
  /** The list of virtual frequencies (possibly approximated using just the frequencies of the main field). */
  private final LongList frequencies;
  /** An array indexed by offsets mapping each offset to the corresponding index number. */
  private int[] offset2Index;
  /** An array indexed by term ids used by {@link #score()} to compute virtual counts. */
  private double[] virtualCount;
  /** For expected IDF runs, an array indexed by term ids used by {@link #score()} to compute virtual counts combined with IDF scoring. */
  private double[] virtualIdfCount;
  /** An array (parallel to {@link #currIndex}) used by {@link #score()} to cache the current document sizes. */
  private int[] size;
  /** The weight of each index. */
  private double[] weight;
  /** An array indexed by offsets mapping each offset to the parameter <var>b</var> of the corresponding index. */
  private double[] index2B;
  private Object2DoubleMap<String> bByName;
  
  /** Creates a BM25F scorer.
   * @param k1 the <var>k</var><sub>1</sub> parameter.
   * @param b the <var>b</var> parameter, specified as a map from indices to values.
   * @param termMap a map from terms to positions in <code>frequencies</code>, or <code>null</code> if <code>frequencies</code> is <code>null</code>.
   * @param frequencies the frequencies, or <code>null</code> for Boldi's variant.
   */
  public BM25FScorer( final double k1, final Reference2DoubleMap<Index> b, final StringMap<? extends CharSequence> termMap, final LongList frequencies ) {
    this.termMap = termMap;
    termVisitor = new TermCollectionVisitor();
    setupVisitor = new CounterSetupVisitor( termVisitor );
    counterCollectionVisitor = new CounterCollectionVisitor( setupVisitor );
    this.k1 = k1;
    this.bByIndex = b;
    this.frequencies = frequencies;
    k1Plus1 = k1 + 1;
    bByName = null;
  }


  /** Creates a BM25F scorer.
   * 
   * <p>This constructor exists to provide a typed counterpart to the {@linkplain #BM25FScorer(String...) string-based constructor} (mainly
   * for documentation purposes).
   * 
   * @param k1 the <var>k</var><sub>1</sub> parameter.
   * @param termMap a map from terms to positions in <code>frequencies</code>, or <code>null</code> if <code>frequencies</code> is <code>null</code>.
   * @param frequencies the frequencies, or <code>null</code> for Boldi's variant.
   * @param b the <var>b</var> parameter, specified as a map from indices to values.
   */
  public BM25FScorer( final double k1, final StringMap<? extends CharSequence> termMap, final LongList frequencies, final Object2DoubleMap<String> b ) {
    this.termMap = termMap;
    termVisitor = new TermCollectionVisitor();
    setupVisitor = new CounterSetupVisitor( termVisitor );
    counterCollectionVisitor = new CounterCollectionVisitor( setupVisitor );
    this.k1 = k1;
    this.bByName = b;
    this.frequencies = frequencies;
    k1Plus1 = k1 + 1;
    bByIndex = null;
  }


  /** Creates a BM25F scorer using Boldi's variant (frequencies are not needed).
   * @param k1 the <var>k</var><sub>1</sub> parameter.
   * @param b the <var>b</var> parameter, specified as a map from indices to values.
   */
  public BM25FScorer( final double k1, final Reference2DoubleMap<Index> b ) {
    this( k1, b, null, null );
  }


  private static Object2DoubleMap<String> parseBArray( final String[] b ) {
    final Object2DoubleOpenHashMap<String> result = new Object2DoubleOpenHashMap<String>();
    for( int i = 3; i < b.length; i++ ) {
      final String[] part = b[ i ].split( "=" );
      result.put( part[ 0 ], Double.parseDouble( part[ 1 ] ) ); 
    }
    return result;
  }
  
  /** Creates a BM25F scorer using parameters specified by strings.
   * 
   * <p>This constructor has string parameters tha correspond to the arguments of {@link #BM25FScorer(double, StringMap, LongList, Object2DoubleMap)}.
   * The two middle arguments can be omitted by specifying them as empty. The last argument is represented by a number of
   * assignments <samp><var>index</var>=<var>b</var></samp>, separated by commas (as if they were multiple arguments), which
   * will be compacted into a function representing the values of <var>b</var>. 
   */
  @SuppressWarnings("unchecked")
  public BM25FScorer( String... arg ) throws NumberFormatException, FileNotFoundException, IOException, ClassNotFoundException {
    this( 
        Double.parseDouble( arg[ 0 ] ), // k1
        arg[ 1 ].length() == 0 ? null : (StringMap<? extends CharSequence>)BinIO.loadObject( arg[ 1 ] ), // termMap
        arg[ 2 ].length() == 0 ? null : new SemiExternalGammaList( new InputBitStream( arg[ 2 ] ) ), // frequencies 
        parseBArray( arg )
    );
  }


  
  public synchronized BM25FScorer copy() {
    final BM25FScorer scorer = new BM25FScorer( k1, bByIndex, termMap, frequencies );
    scorer.setWeights( index2Weight );
    return scorer;
  }


  public double score() throws IOException {
    setupVisitor.clear();
    documentIterator.acceptOnTruePaths( counterCollectionVisitor );
    
    final int document = documentIterator.document();
    final int[] count = setupVisitor.count;
    final double[] offset2Weight = this.offset2Weight;
    final int[] offset2TermId = this.offset2TermId;
    final double[] idfPart = this.idfPart;
    final double[] virtualCount = this.virtualCount;
    final double[] virtualIdfCount = this.virtualIdfCount;
    final double[] index2B = this.index2B;
    
    final int[] size = this.size;
    for( int i = currIndex.length; i-- != 0; ) size[ i ] = sizes[ i ].getInt( document );


    // Compute virtual size
    int term2Index, termId;
    DoubleArrays.fill( virtualCount, 0 );
    double score = 0, v;
    if ( termMap != null ) {
      for ( int i = offset2TermId.length; i-- != 0; ) {
        term2Index = offset2Index[ i ];
        virtualCount[ offset2TermId[ i ] ] += count[ i ] * offset2Weight[ i ] / ( ( 1 - index2B[ term2Index ] ) + index2B[ term2Index ] * size[ term2Index ] / avgDocumentSize[ term2Index ] );
      }


      for ( int i = virtualCount.length; i-- != 0; ) {
        v = virtualCount[ i ];
        score += ( k1Plus1 * v ) / ( v + k1 ) * idfPart[ i ];
      }
    }
    else {
      DoubleArrays.fill( virtualIdfCount, 0 );
      for ( int i = offset2TermId.length; i-- != 0; ) {
        term2Index = offset2Index[ i ];
        termId = offset2TermId[ i ];
        v = count[ i ] * offset2Weight[ i ] / ( ( 1 - index2B[ term2Index ] ) + index2B[ term2Index ] * size[ term2Index ] / avgDocumentSize[ term2Index ] );
        virtualCount[ termId ] += v; 
        virtualIdfCount[ termId ] += idfPart[ i ] * v;
      }


      for ( int i = virtualCount.length; i-- != 0; )
        score += ( k1Plus1 * virtualIdfCount[ i ] ) / ( virtualCount[ i ] + k1 );
    }
    return score;
}


  public double score( final Index index ) {
    throw new UnsupportedOperationException();
  }




  public void wrap( DocumentIterator d ) throws IOException {
    documentIterator = d;
    
    // Note that we use the index array provided by the weight function, *not* by the visitor or by the iterator.
    termVisitor.prepare( index2Weight.keySet() );
    if ( DEBUG ) LOGGER.debug( "Weight map: " + index2Weight );
    
    d.accept( termVisitor );
    if ( DEBUG ) LOGGER.debug( "Term Visitor found " + termVisitor.numberOfPairs() + " leaves" );


    final Index[] index = termVisitor.indices();
    if ( DEBUG ) LOGGER.debug( "Indices: " + Arrays.toString( index ) );
    if ( ! index2Weight.keySet().containsAll( Arrays.asList( index ) ) ) throw new IllegalArgumentException( "A BM25F scorer must have a weight for all indices involved in a query" );
    
    for( Index i: index )
      if ( bByIndex != null && ! bByIndex.containsKey( i ) || bByName != null && ! bByName.containsKey( i.field ) )
        throw new IllegalArgumentException( "A BM25F scorer must have a b parameter for all indices involved in a query" );


    // Some caching of frequently-used values
    sizes = new IntList[ index.length ];
    for( int i = index.length; i-- != 0; )
      if ( ( sizes[ i ] = index[ i ].sizes ) == null ) throw new IllegalStateException( "A BM25F scorer requires document sizes" );
    
    setupVisitor.prepare();
    d.accept( setupVisitor );


    avgDocumentSize = new double[ index.length ];


    weight = new double[ index.length ];
    for( int i = weight.length; i-- != 0; ) {
      weight[ i ] = index2Weight.getDouble( index[ i ] );
      avgDocumentSize[ i ] = (double)index[ i ].numberOfOccurrences / index[ i ].numberOfDocuments;
    }
    
    offset2TermId = setupVisitor.offset2TermId;
    offset2Index = setupVisitor.indexNumber;
    offset2Weight = new double[ offset2Index.length ];
    index2B = new double[ index.length ];
    for( int i = 0;  i < index2B.length; i++ ) index2B[ i ] = bByIndex != null ? bByIndex.getDouble( index[ i ] ) : bByName.getDouble( index[ i ].field );
    
    for( int i = offset2Weight.length; i-- != 0; ) offset2Weight[ i ] = index2Weight.getDouble( index[ offset2Index[ i ] ] );


    // We do all logs here
    idfPart = new double[ termVisitor.numberOfPairs() ];


    if ( termMap != null ) {
      // Classical BM25F, based on global frequency data
      for( int i = idfPart.length; i-- != 0; ) {
        final int id = (int)termMap.getLong( setupVisitor.termId2Term[ setupVisitor.offset2TermId[ i ] ] );
        if ( id == -1 ) throw new IllegalStateException( "The term map passed to a BM25F scorer must contain all terms appearing in all indices" );
        final long f = frequencies.getLong( id );
        idfPart[ i ] = Math.max( EPSILON_SCORE, Math.log( ( index[ 0 ].numberOfDocuments - f + 0.5 ) / ( f + 0.5 ) ) );
      }
    }
    else {
      // Modified BM25F, using expected IDF
      final int[] frequency = setupVisitor.frequency;
      final int[] indexNumber = setupVisitor.indexNumber;
      for( int i = idfPart.length; i-- != 0; ) {
        idfPart[ i ] = Math.max( EPSILON_SCORE,  
            Math.log( ( index[ indexNumber[ i ] ].numberOfDocuments - frequency[ i ] + 0.5 ) / ( frequency[ i ] + 0.5 ) ) );
      }
    }


    size = new int[ index.length ];
    virtualCount = new double[ setupVisitor.termId2Term.length ];
    if ( termMap == null ) virtualIdfCount = new double[ setupVisitor.termId2Term.length ];
    currIndex = index;
  }
  
  public boolean usesIntervals() {
    return false;
  }
}
Source Code of it.unimi.dsi.mg4j.search.score.BM25FScorer

Related Classes of it.unimi.dsi.mg4j.search.score.BM25FScorer