Source Code of it.unimi.dsi.mg4j.index.MultiTermIndexIterator$MultiTermIntervalIterator

package it.unimi.dsi.mg4j.index;


/*     
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2003-2010 Paolo Boldi and Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */


import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntHeapSemiIndirectPriorityQueue;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntIterators;
import it.unimi.dsi.fastutil.ints.IntSet;
import it.unimi.dsi.fastutil.objects.ObjectHeapIndirectPriorityQueue;
import it.unimi.dsi.mg4j.index.payload.Payload;
import it.unimi.dsi.mg4j.search.AbstractCompositeDocumentIterator;
import it.unimi.dsi.mg4j.search.AbstractUnionDocumentIterator;
import it.unimi.dsi.mg4j.search.DocumentIterator;
import it.unimi.dsi.util.Interval;
import it.unimi.dsi.mg4j.search.IntervalIterator;
import it.unimi.dsi.mg4j.search.OrDocumentIterator;
import it.unimi.dsi.mg4j.search.score.BM25Scorer;
import it.unimi.dsi.mg4j.search.score.Scorer;
import it.unimi.dsi.mg4j.search.visitor.DocumentIteratorVisitor;


import java.io.IOException;


/** A virtual {@linkplain IndexIterator index iterator} that merges several component index iterators.
*
* <P>This class adds to {@link it.unimi.dsi.mg4j.search.AbstractUnionDocumentIterator}
* an interval iterator generating the OR of the intervals returned for each of the documents involved.
* The main difference with an {@link OrDocumentIterator} built on the same array of component iterators
* is that this class implements {@link IndexIterator} and hence provides a {@link #count()} (the sum
* of counts of those component iterators positioned on the current document) and a {@link #frequency()}. The
* latter is by default the maximum frequency of a component iterator, but it can be set 
* at {@link MultiTermIndexIterator#getInstance(int, Index, IndexIterator[]) construction time}.
* 
* <p>The main <i>raison d'&ecirc;tre</i> of this class is support for query expansion: a blind application
* of {@link OrDocumentIterator} to an array of index iterators would mislead {@linkplain Scorer scorers} such as {@link BM25Scorer}
* because low-frequency terms (e.g., <i>hapax legomena</i>) would be responsible for most of the score.
* 
* <p>Note that {@linkplain DocumentIteratorVisitor} has a {@linkplain DocumentIteratorVisitor#visit(IndexIterator) visit method for generic index iterator}
* and a {@linkplain DocumentIteratorVisitor#visit(MultiTermIndexIterator) visit method for instances of this class}.
* This approach provides additional flexibility&mdash;a scorer, for instance, might treat an instance of
* this class as a standard {@link IndexIterator}, or it might choose to {@linkplain #front(IndexIterator[]) query which terms actually appear}
* and do something more sophisticated (for instance, using {@linkplain DocumentIterator#weight() weights}).
*/


public class MultiTermIndexIterator extends AbstractUnionDocumentIterator implements IndexIterator {
  /** Compile-time switch for assertion code; it is not referenced by the outer class itself. */
  @SuppressWarnings("unused")
  private static final boolean ASSERTS = false;
  
  /** Value to be used for term frequency, or {@link Integer#MIN_VALUE} to use the max; in any case, this attribute is used to cache
   *  frequency after the first call to {@link #frequency()}. */
  private int frequency;
  /** The term of this iterator, as set by {@link #term(CharSequence)} (possibly <code>null</code>). */
  protected String term;
  /** The id of this iterator, as set by {@link #id(int)}. */
  protected int id;
  /** The cached count for the current document, or -1 if it has not been computed yet;
   *  it is reset to -1 whenever the iterator is moved and filled lazily by {@link #count()}. */
  private int count = -1;
  /** Whether all underlying index iterators have counts. */
  private final boolean hasCounts; 
  /** Whether all underlying index iterators have positions. */
  private final boolean hasPositions;
  
  /** Returns an index iterator that merges the given array of iterators.
   *  This method requires that at least one iterator is provided. The frequency is computed as a max,
   *  and {@link #index()} will return the result of the same method on the first iterator.
   * 
   * @param indexIterator the iterators to be joined (at least one).
   * @return a merged index iterator. 
   * @throws IllegalArgumentException if no iterators were provided.
   */
  public static IndexIterator getInstance( final IndexIterator... indexIterator  ) throws IOException {
    return getInstance( Integer.MIN_VALUE, indexIterator );
  }


  /** Returns an index iterator that merges the given array of iterators.
   * 
   * <P>Note that the special case of the empty and of the singleton arrays
   * are handled efficiently. The frequency is computed as a max, and
   * {@link #index()} will return <code>index</code>.
   * 
   * @param index the index that wil be returned by {@link #index()}.
   * @param indexIterator the iterators to be joined.
   * @return a merged index iterator. 
   */
  public static IndexIterator getInstance( final Index index, final IndexIterator... indexIterator  ) throws IOException {
    return getInstance( Integer.MIN_VALUE, index, indexIterator );
  }


  /** Returns an index iterator that merges the given array of iterators.
   *  This method requires that at least one iterator is provided.
   * 
   * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max).
   * @param indexIterator the iterators to be joined (at least one).
   * @return a merged index iterator. 
   * @throws IllegalArgumentException if no iterators were provided, or they run on different indices.
   */
  public static IndexIterator getInstance( final int defaultFrequency, final IndexIterator... indexIterator  ) throws IOException {
    if ( indexIterator.length == 0 ) throw new IllegalArgumentException();
    return getInstance( defaultFrequency, indexIterator[ 0 ].index(), indexIterator );
  }


  /** Returns an index iterator that merges the given array of iterators.
   * 
   * <P>Note that the special case of the empty and of the singleton arrays
   * are handled efficiently. 
   * 
   * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max).
   * @param index the index that wil be returned by {@link #index()}.
   * @param indexIterator the iterators to be joined.
   * @return a merged index iterator. 
   * @throws IllegalArgumentException if there is some iterator on an index different from <code>index</code>.
   */
  public static IndexIterator getInstance( final int defaultFrequency, final Index index, final IndexIterator... indexIterator  ) throws IOException {
    if ( indexIterator.length == 0 ) return index.getEmptyIndexIterator();
    if ( indexIterator.length == 1 ) return indexIterator[ 0 ];
    return new MultiTermIndexIterator( defaultFrequency, indexIterator );
  }


  
  /** Creates a new document iterator that merges the given array of iterators. 
   * 
   * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max).
     * @param indexIterator the iterators to be joined.
   */
  @SuppressWarnings("cast")
  protected MultiTermIndexIterator( final int defaultFrequency, final IndexIterator... indexIterator ) throws IOException {
    super( (DocumentIterator[]) indexIterator );
    this.frequency = defaultFrequency;
    boolean havePositions = true, haveCounts = true;
    for( IndexIterator i: indexIterator ) {
      if ( ! i.index().hasCounts ) haveCounts = false;
      if ( ! i.index().hasPositions ) havePositions = false;
        
    }
    
    hasCounts = haveCounts;
    hasPositions = havePositions;
  }


  protected IntervalIterator getComposedIntervalIterator( final Index index ) {
    return new MultiTermIntervalIterator();
  }


  @Override
  public int skipTo( final int n ) throws IOException {
    if ( last >= n ) return last;
    // We invalidate count before calling the superclass method.
    count = -1;
    return super.skipTo( n );
  }
  
  public int nextDocument() throws IOException {
    // We invalidate count before calling the superclass method.
    count = -1;
    return super.nextDocument();
  }
  
  /** The count is the sum of counts of those component iterators positioned on the current document.
   * 
   *  @return the sum of counts.
   */
  public int count() throws IOException {
    if ( ! hasCounts ) throw new IllegalStateException( "Some of the underlying iterators do not have counts" );
    if ( last == -1 ) throw new IllegalStateException();
    if ( count == -1 ) {
      int count = 0;
      for ( int i = computeFront(); i-- != 0; ) count += indexIterator[ front[ i ] ].count();
      this.count = count;
    }
    return count;
  }


  /** Fills the given array with the index iterators composing the current front.
   * 
   * <p>This method is essentially a safe exposure of the {@linkplain ObjectHeapIndirectPriorityQueue#front(int[]) front of the queue}
   * merging the component {@linkplain IndexIterator index iterators}.
   * After a call to {@link #nextDocument()}, you can use this method to know
   * which terms actually appear in the current document. You can use the public
   * field {@link AbstractCompositeDocumentIterator#n} to size the argument
   * array appropriately.
   * 
   * @param indexIterator an array, at least large as the number of component index iterators,
   * that will be partially filled with the index iterators corresponding to terms appearing in the current document.
   * @return the number of iterators written into <code>indexIterator</code>. 
   */
  public int front( final IndexIterator[] indexIterator ) {
    final int s = computeFront();
    for( int i = s; i-- != 0; ) indexIterator[ i ] = this.indexIterator[ front[ i ] ];
    return s;
  }
  
  /** The frequency is either the default frequency set at construction time, or the maximum frequency of the component iterators. 
   * 
   * @return the frequency.
   */
  public int frequency() throws IOException {
    if ( frequency != Integer.MIN_VALUE ) return frequency;
    int frequency = Integer.MIN_VALUE;
    for ( int i = n; i-- != 0; ) frequency = Math.max( frequency, indexIterator[ i ].frequency() );
    return this.frequency = frequency; // caching it!
  }


  public IndexIterator term( final CharSequence term ) {
    this.term = term == null ? null : term.toString();
    return this;
  }


  public String term() { 
    return term;
  }


  public int termNumber() {
    // TODO: this is not particularly sensible
    return indexIterator[ 0 ].termNumber();
  }
  
  public IndexIterator id( final int id ) {
    this.id = id;
    return this;
  }
  
  public int id() {
    return id;
  }


  public Index index() {
    return soleIndex;
  }


  /** This method is not implemented by this class.
   */
  public Payload payload() {
    throw new UnsupportedOperationException();
  }


  public int[] positionArray() throws IOException {
    if ( ! hasPositions ) throw new IllegalStateException( "Some of the underlying iterators do not have positions" );


    // If the front contains a single element, we can just use its position array.
    if ( computeFront() == 1 ) return indexIterator[ front[ 0 ] ].positionArray();
    
    final MultiTermIntervalIterator multiTermIntervalIterator = (MultiTermIntervalIterator)intervalIterator();
    multiTermIntervalIterator.drain();
    return multiTermIntervalIterator.cache;
  }


  public IntIterator positions() throws IOException {    
    return IntIterators.wrap( positionArray(), 0, count );
  }


  public int positions( int[] position ) throws IOException {
    int c = count;
    if ( position.length < c ) return -c;
    final int[] cache = positionArray();
    for( int i = c; i-- != 0; ) position[ i ] = cache[ i ];
    return c;
  }


  @Override
  public IndexIterator weight( final double weight ) {
    super.weight( weight );
    return this;
  }
  
  @Override
  public <T> T accept( DocumentIteratorVisitor<T> visitor ) throws IOException {
    return visitor.visit( this );
  }


  @Override
  public <T> T acceptOnTruePaths( DocumentIteratorVisitor<T> visitor ) throws IOException {
    return visitor.visit( this );
  }
  
  public <T> T acceptDeep( DocumentIteratorVisitor<T> visitor ) throws IOException {
    return super.accept( visitor );
  }


  public <T> T acceptDeepOnTruePaths( DocumentIteratorVisitor<T> visitor ) throws IOException {
    return super.accept( visitor );
  }  
  
  /** An optimised interval iterator with the same semantics as that implemented
   *  by {@link OrDocumentIterator}, but not allowing duplicate positions.
   *  
   *  <p>This iterator provides an additional {@link #drain()} method that exhausts the
   *  merge queue, leaving however the returned elements in the {@link #cache} array. Moreover,
   *  the internal state of the iterator is modified so that it continues to behave normally,
   *  returning however its results from {@link #cache}. In this way we can easily provide
   *  efficient implementations for {@link IndexIterator#positions()}, {@link IndexIterator#positionArray()},
   *  and {@link IndexIterator#positions(int[])}.
   */
  private class MultiTermIntervalIterator extends AbstractCompositeIndexIntervalIterator implements IntervalIterator {
    @SuppressWarnings({ "unused" })
    private final static boolean DEBUG = false;
    @SuppressWarnings("hiding")
    private final static boolean ASSERTS = false;


    /** A heap-based indirect priority queue used to keep track of the currently scanned positions. */
    private final IntHeapSemiIndirectPriorityQueue positionQueue;
    /** The cached results of this iterator. */
    public int[] cache;
    /** The number of results emitted by this iterator since the last call to {@link #reset()}. */
    private int emitted;
    /** The number of results extracted in {@link #cache} since the last call to {@link #reset()}. */
    private int extracted;


    public MultiTermIntervalIterator() {
      super( n );
      positionQueue = new IntHeapSemiIndirectPriorityQueue( curr );
      cache = new int[ 4 ];
    }


    public void reset() throws IOException {
      emitted = extracted = 0;
      next = null;
      positionQueue.clear();


      for ( int i = computeFront(), k; i-- != 0; ) {
        k = front[ i ];
        position[ k ] = indexIterator[ k ].positionArray();
        count[ k ] = indexIterator[ k ].count();
        curr[ k ] = position[ k ][ 0 ];
        currPos[ k ] = 0;
        positionQueue.enqueue( k );
      }


      if ( ASSERTS ) assert ! positionQueue.isEmpty();
    }


    public void intervalTerms( final IntSet terms ) {
      // TODO: this is not particularly sensible
      terms.add( indexIterator[ 0 ].termNumber() );
    }
    
    public Interval nextInterval() {
      if ( next != null ) {
        final Interval result = next;
        next = null;
        return result;
      }
      
      if ( emitted < extracted ) return Interval.valueOf( cache[ emitted++ ] );


      if ( positionQueue.isEmpty() ) return null;


      final int first = positionQueue.first();


      if ( extracted == cache.length ) cache = IntArrays.grow( cache, extracted + 1 );
      cache[ extracted++ ] = curr[ first ];


      if ( ++currPos[ first ] < count[ first ] ) {
        curr[ first ] = position[ first ][ currPos[ first ] ];
        positionQueue.changed();
        if ( curr[ positionQueue.first() ] == cache[ extracted - 1 ] ) throw new IllegalArgumentException( "Duplicate positions in " + this );
      }
      else positionQueue.dequeue();
        
      return Interval.valueOf( cache[ emitted++ ] );
    }


    public int extent() {
      return 1;
    }
    
    /** Drains all elements from the queue, stores them in {@link #cache} and
     * restores {@link #emitted} so that the iterators continues to work transparently. 
     */
    
    public void drain() {
      final int emittedNow = emitted - ( next != null ? 1 : 0 );
      next = null;
      while( nextInterval() != null );
      emitted = emittedNow;
    }
   }
}
Source Code of it.unimi.dsi.mg4j.index.MultiTermIndexIterator$MultiTermIntervalIterator

Related Classes of it.unimi.dsi.mg4j.index.MultiTermIndexIterator$MultiTermIntervalIterator