package it.unimi.dsi.mg4j.index;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2003-2010 Paolo Boldi and Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntHeapSemiIndirectPriorityQueue;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntIterators;
import it.unimi.dsi.fastutil.ints.IntSet;
import it.unimi.dsi.fastutil.objects.ObjectHeapIndirectPriorityQueue;
import it.unimi.dsi.mg4j.index.payload.Payload;
import it.unimi.dsi.mg4j.search.AbstractCompositeDocumentIterator;
import it.unimi.dsi.mg4j.search.AbstractUnionDocumentIterator;
import it.unimi.dsi.mg4j.search.DocumentIterator;
import it.unimi.dsi.util.Interval;
import it.unimi.dsi.mg4j.search.IntervalIterator;
import it.unimi.dsi.mg4j.search.OrDocumentIterator;
import it.unimi.dsi.mg4j.search.score.BM25Scorer;
import it.unimi.dsi.mg4j.search.score.Scorer;
import it.unimi.dsi.mg4j.search.visitor.DocumentIteratorVisitor;
import java.io.IOException;
/** A virtual {@linkplain IndexIterator index iterator} that merges several component index iterators.
*
* <P>This class adds to {@link it.unimi.dsi.mg4j.search.AbstractUnionDocumentIterator}
* an interval iterator generating the OR of the intervals returned for each of the documents involved.
* The main difference with an {@link OrDocumentIterator} built on the same array of component iterators
* is that this class implements {@link IndexIterator} and hence provides a {@link #count()} (the sum
* of counts of those component iterators positioned on the current document) and a {@link #frequency()}. The
* latter is by default the maximum frequency of a component iterator, but it can be set
* at {@link MultiTermIndexIterator#getInstance(int, Index, IndexIterator[]) construction time}.
*
* <p>The main <i>raison d'être</i> of this class is support for query expansion: a blind application
* of {@link OrDocumentIterator} to an array of index iterators would mislead {@linkplain Scorer scorers} such as {@link BM25Scorer}
* because low-frequency terms (e.g., <i>hapax legomena</i>) would be responsible for most of the score.
*
* <p>Note that {@link DocumentIteratorVisitor} has a {@linkplain DocumentIteratorVisitor#visit(IndexIterator) visit method for generic index iterators}
* and a {@linkplain DocumentIteratorVisitor#visit(MultiTermIndexIterator) visit method for instances of this class}.
* This approach provides additional flexibility—a scorer, for instance, might treat an instance of
* this class as a standard {@link IndexIterator}, or it might choose to {@linkplain #front(IndexIterator[]) query which terms actually appear}
* and do something more sophisticated (for instance, using {@linkplain DocumentIterator#weight() weights}).
*/
public class MultiTermIndexIterator extends AbstractUnionDocumentIterator implements IndexIterator {
    @SuppressWarnings("unused")
    private static final boolean ASSERTS = false;

    /** Value to be used for term frequency, or {@link Integer#MIN_VALUE} to use the max; in any case, this attribute is used to cache
     * frequency after the first call to {@link #frequency()}. */
    private int frequency;
    /** The term of this iterator. */
    protected String term;
    /** The id of this iterator. */
    protected int id;
    /** The count of the last returned document, or -1 if it has not been computed yet for the current document. */
    private int count = -1;
    /** Whether all underlying index iterators have counts. */
    private final boolean hasCounts;
    /** Whether all underlying index iterators have positions. */
    private final boolean hasPositions;

    /** Returns an index iterator that merges the given array of iterators.
     * This method requires that at least one iterator is provided. The frequency is computed as a max,
     * and the index of the first iterator is passed on as reference index.
     *
     * @param indexIterator the iterators to be joined (at least one).
     * @return a merged index iterator.
     * @throws IllegalArgumentException if no iterators were provided.
     */
    public static IndexIterator getInstance( final IndexIterator... indexIterator ) throws IOException {
        return getInstance( Integer.MIN_VALUE, indexIterator );
    }

    /** Returns an index iterator that merges the given array of iterators.
     *
     * <P>Note that the special case of the empty and of the singleton arrays
     * are handled efficiently. The frequency is computed as a max.
     *
     * @param index the reference index (see {@link #getInstance(int, Index, IndexIterator[])}).
     * @param indexIterator the iterators to be joined.
     * @return a merged index iterator.
     */
    public static IndexIterator getInstance( final Index index, final IndexIterator... indexIterator ) throws IOException {
        return getInstance( Integer.MIN_VALUE, index, indexIterator );
    }

    /** Returns an index iterator that merges the given array of iterators.
     * This method requires that at least one iterator is provided; the index
     * of the first iterator is passed on as reference index.
     *
     * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max).
     * @param indexIterator the iterators to be joined (at least one).
     * @return a merged index iterator.
     * @throws IllegalArgumentException if no iterators were provided.
     */
    public static IndexIterator getInstance( final int defaultFrequency, final IndexIterator... indexIterator ) throws IOException {
        if ( indexIterator.length == 0 ) throw new IllegalArgumentException();
        return getInstance( defaultFrequency, indexIterator[ 0 ].index(), indexIterator );
    }

    /** Returns an index iterator that merges the given array of iterators.
     *
     * <P>Note that the special case of the empty and of the singleton arrays
     * are handled efficiently: an empty array yields the empty iterator of <code>index</code>,
     * and a singleton array yields its only element, unwrapped.
     *
     * <p>NOTE(review): when two or more iterators are provided, <code>index</code> is not used by this
     * method, and no check that the iterators run on <code>index</code> is performed here; {@link #index()}
     * returns whatever the superclass computed. Confirm whether the superclass constructor validates this.
     *
     * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max).
     * @param index the reference index; it is used to obtain an empty iterator if no iterators are provided.
     * @param indexIterator the iterators to be joined.
     * @return a merged index iterator.
     */
    public static IndexIterator getInstance( final int defaultFrequency, final Index index, final IndexIterator... indexIterator ) throws IOException {
        if ( indexIterator.length == 0 ) return index.getEmptyIndexIterator();
        if ( indexIterator.length == 1 ) return indexIterator[ 0 ];
        return new MultiTermIndexIterator( defaultFrequency, indexIterator );
    }

    /** Creates a new document iterator that merges the given array of iterators.
     *
     * <p>Counts (positions) are considered available only if the index of <em>every</em>
     * component iterator has counts (positions).
     *
     * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max).
     * @param indexIterator the iterators to be joined.
     */
    @SuppressWarnings("cast")
    protected MultiTermIndexIterator( final int defaultFrequency, final IndexIterator... indexIterator ) throws IOException {
        super( (DocumentIterator[]) indexIterator );
        this.frequency = defaultFrequency;
        boolean havePositions = true, haveCounts = true;
        for( IndexIterator i: indexIterator ) {
            if ( ! i.index().hasCounts ) haveCounts = false;
            if ( ! i.index().hasPositions ) havePositions = false;
        }
        hasCounts = haveCounts;
        hasPositions = havePositions;
    }

    /** Returns a new interval iterator merging the positions of the component iterators in the current front.
     *
     * @param index ignored by this implementation.
     * @return a new {@link MultiTermIntervalIterator}.
     */
    protected IntervalIterator getComposedIntervalIterator( final Index index ) {
        return new MultiTermIntervalIterator();
    }

    /** Skips to the given document, invalidating the cached count if the iterator actually moves.
     *
     * @param n a document pointer.
     * @return the last returned document if it is already at or beyond <code>n</code>; otherwise,
     * the result of the superclass method.
     */
    @Override
    public int skipTo( final int n ) throws IOException {
        if ( last >= n ) return last;
        // We invalidate count before calling the superclass method.
        count = -1;
        return super.skipTo( n );
    }

    /** Moves to the next document, invalidating the cached count.
     *
     * @return the result of the superclass method.
     */
    @Override
    public int nextDocument() throws IOException {
        // We invalidate count before calling the superclass method.
        count = -1;
        return super.nextDocument();
    }

    /** The count is the sum of counts of those component iterators positioned on the current document.
     * The result is cached until the iterator is moved.
     *
     * @return the sum of counts.
     * @throws IllegalStateException if some component iterator has no counts, or if
     * no document has been returned yet.
     */
    public int count() throws IOException {
        if ( ! hasCounts ) throw new IllegalStateException( "Some of the underlying iterators do not have counts" );
        if ( last == -1 ) throw new IllegalStateException();
        if ( count == -1 ) {
            int sum = 0;
            for ( int i = computeFront(); i-- != 0; ) sum += indexIterator[ front[ i ] ].count();
            count = sum;
        }
        return count;
    }

    /** Fills the given array with the index iterators composing the current front.
     *
     * <p>This method is essentially a safe exposure of the {@linkplain ObjectHeapIndirectPriorityQueue#front(int[]) front of the queue}
     * merging the component {@linkplain IndexIterator index iterators}.
     * After a call to {@link #nextDocument()}, you can use this method to know
     * which terms actually appear in the current document. You can use the public
     * field {@link AbstractCompositeDocumentIterator#n} to size the argument
     * array appropriately.
     *
     * @param indexIterator an array, at least large as the number of component index iterators,
     * that will be partially filled with the index iterators corresponding to terms appearing in the current document.
     * @return the number of iterators written into <code>indexIterator</code>.
     */
    public int front( final IndexIterator[] indexIterator ) {
        final int s = computeFront();
        for( int i = s; i-- != 0; ) indexIterator[ i ] = this.indexIterator[ front[ i ] ];
        return s;
    }

    /** The frequency is either the default frequency set at construction time, or the maximum frequency of the component iterators.
     * In the latter case the maximum is computed on the first call and cached.
     *
     * @return the frequency.
     */
    public int frequency() throws IOException {
        if ( frequency != Integer.MIN_VALUE ) return frequency;
        int max = Integer.MIN_VALUE;
        for ( int i = n; i-- != 0; ) max = Math.max( max, indexIterator[ i ].frequency() );
        return frequency = max; // caching it!
    }

    /** Sets the term of this iterator.
     *
     * @param term the new term, or <code>null</code>.
     * @return this iterator.
     */
    public IndexIterator term( final CharSequence term ) {
        this.term = term == null ? null : term.toString();
        return this;
    }

    /** Returns the term set by {@link #term(CharSequence)}, if any.
     *
     * @return the term of this iterator, or <code>null</code>.
     */
    public String term() {
        return term;
    }

    /** Returns the term number of the first component iterator.
     *
     * @return the term number of the first component iterator.
     */
    public int termNumber() {
        // TODO: this is not particularly sensible
        return indexIterator[ 0 ].termNumber();
    }

    /** Sets the id of this iterator.
     *
     * @param id the new id.
     * @return this iterator.
     */
    public IndexIterator id( final int id ) {
        this.id = id;
        return this;
    }

    /** Returns the id set by {@link #id(int)}.
     *
     * @return the id of this iterator.
     */
    public int id() {
        return id;
    }

    /** Returns the sole index, as stored by the superclass.
     *
     * @return the value of <code>soleIndex</code>.
     */
    public Index index() {
        return soleIndex;
    }

    /** This method is not implemented by this class.
     *
     * @throws UnsupportedOperationException always.
     */
    public Payload payload() {
        throw new UnsupportedOperationException();
    }

    /** Returns an array containing the merged positions for the current document.
     *
     * <p>If the front contains a single component iterator, its own position array is returned.
     * Otherwise, the interval iterator is {@linkplain MultiTermIntervalIterator#drain() drained} and
     * its internal cache is returned. In both cases the returned array should be treated as read-only,
     * and only the first {@link #count()} elements are meaningful (the cache, in particular, can be longer).
     *
     * @return an array whose first {@link #count()} elements are the positions.
     * @throws IllegalStateException if some component iterator has no positions.
     */
    public int[] positionArray() throws IOException {
        if ( ! hasPositions ) throw new IllegalStateException( "Some of the underlying iterators do not have positions" );
        // If the front contains a single element, we can just use its position array.
        if ( computeFront() == 1 ) return indexIterator[ front[ 0 ] ].positionArray();
        final MultiTermIntervalIterator multiTermIntervalIterator = (MultiTermIntervalIterator)intervalIterator();
        multiTermIntervalIterator.drain();
        return multiTermIntervalIterator.cache;
    }

    /** Returns an iterator over the merged positions for the current document.
     *
     * @return an iterator over the first {@link #count()} elements of {@link #positionArray()}.
     */
    public IntIterator positions() throws IOException {
        final int[] positions = positionArray();
        // The count field is invalidated (-1) whenever the iterator moves: we must go through count().
        return IntIterators.wrap( positions, 0, count() );
    }

    /** Copies the merged positions for the current document into the given array.
     *
     * @param position an array that will be filled with the positions.
     * @return the number of positions copied, or the negated number of positions if
     * <code>position</code> is too short to contain them (in which case nothing is copied).
     */
    public int positions( int[] position ) throws IOException {
        // The count field is invalidated (-1) whenever the iterator moves: we must go through count().
        final int c = count();
        if ( position.length < c ) return -c;
        System.arraycopy( positionArray(), 0, position, 0, c );
        return c;
    }

    @Override
    public IndexIterator weight( final double weight ) {
        super.weight( weight );
        return this;
    }

    /** Visits this iterator as a whole, without descending into the component iterators. */
    @Override
    public <T> T accept( DocumentIteratorVisitor<T> visitor ) throws IOException {
        return visitor.visit( this );
    }

    /** Visits this iterator as a whole, without descending into the component iterators. */
    @Override
    public <T> T acceptOnTruePaths( DocumentIteratorVisitor<T> visitor ) throws IOException {
        return visitor.visit( this );
    }

    /** Delegates to the superclass implementation of <code>accept()</code>,
     * bypassing the override provided by this class. */
    public <T> T acceptDeep( DocumentIteratorVisitor<T> visitor ) throws IOException {
        return super.accept( visitor );
    }

    /** Delegates to the superclass implementation of <code>acceptOnTruePaths()</code>,
     * bypassing the override provided by this class. */
    public <T> T acceptDeepOnTruePaths( DocumentIteratorVisitor<T> visitor ) throws IOException {
        return super.acceptOnTruePaths( visitor );
    }

    /** An optimised interval iterator with the same semantics as that implemented
     * by {@link OrDocumentIterator}, but not allowing duplicate positions.
     *
     * <p>This iterator provides an additional {@link #drain()} method that exhausts the
     * merge queue, leaving however the returned elements in the {@link #cache} array. Moreover,
     * the internal state of the iterator is modified so that it continues to behave normally,
     * returning however its results from {@link #cache}. In this way we can easily provide
     * efficient implementations for {@link IndexIterator#positions()}, {@link IndexIterator#positionArray()},
     * and {@link IndexIterator#positions(int[])}.
     */
    private class MultiTermIntervalIterator extends AbstractCompositeIndexIntervalIterator implements IntervalIterator {
        @SuppressWarnings({ "unused" })
        private final static boolean DEBUG = false;
        @SuppressWarnings("hiding")
        private final static boolean ASSERTS = false;
        /** A heap-based indirect priority queue used to keep track of the currently scanned positions. */
        private final IntHeapSemiIndirectPriorityQueue positionQueue;
        /** The cached results of this iterator; only the first {@link #extracted} elements are valid. */
        public int[] cache;
        /** The number of results emitted by this iterator since the last call to {@link #reset()}. */
        private int emitted;
        /** The number of results extracted in {@link #cache} since the last call to {@link #reset()}. */
        private int extracted;

        public MultiTermIntervalIterator() {
            super( n );
            positionQueue = new IntHeapSemiIndirectPriorityQueue( curr );
            cache = new int[ 4 ];
        }

        /** Forgets all cached results and enqueues again the component iterators in the current front,
         * each positioned on the first element of its position array. */
        public void reset() throws IOException {
            emitted = extracted = 0;
            next = null;
            positionQueue.clear();

            for ( int i = computeFront(), k; i-- != 0; ) {
                k = front[ i ];
                position[ k ] = indexIterator[ k ].positionArray();
                count[ k ] = indexIterator[ k ].count();
                // Assumes that every iterator in the front has at least one position.
                curr[ k ] = position[ k ][ 0 ];
                currPos[ k ] = 0;
                positionQueue.enqueue( k );
            }

            if ( ASSERTS ) assert ! positionQueue.isEmpty();
        }

        public void intervalTerms( final IntSet terms ) {
            // TODO: this is not particularly sensible
            terms.add( indexIterator[ 0 ].termNumber() );
        }

        /** Returns the next singleton interval, in increasing position order.
         *
         * <p>Results are served, in order of priority, from a pending <code>next</code> interval,
         * from the part of {@link #cache} not yet emitted, and finally from the merge queue (in which
         * case the extracted position is also appended to {@link #cache}).
         *
         * @return the next interval, or <code>null</code> if there are no more intervals.
         * @throws IllegalArgumentException if two component iterators are found on the same position.
         */
        public Interval nextInterval() {
            if ( next != null ) {
                final Interval result = next;
                next = null;
                return result;
            }

            // Results already extracted (e.g., by drain()) are replayed from the cache.
            if ( emitted < extracted ) return Interval.valueOf( cache[ emitted++ ] );

            if ( positionQueue.isEmpty() ) return null;

            final int first = positionQueue.first();

            if ( extracted == cache.length ) cache = IntArrays.grow( cache, extracted + 1 );
            cache[ extracted++ ] = curr[ first ];

            if ( ++currPos[ first ] < count[ first ] ) {
                curr[ first ] = position[ first ][ currPos[ first ] ];
                positionQueue.changed();
                if ( curr[ positionQueue.first() ] == cache[ extracted - 1 ] ) throw new IllegalArgumentException( "Duplicate positions in " + this );
            }
            // NOTE(review): no duplicate check is performed on this path, so a duplicate position held
            // by the new top of the queue goes undetected when the extracted iterator is exhausted.
            else positionQueue.dequeue();

            return Interval.valueOf( cache[ emitted++ ] );
        }

        public int extent() {
            return 1;
        }

        /** Drains all elements from the queue, stores them in {@link #cache} and
         * restores {@link #emitted} so that the iterators continues to work transparently.
         */
        public void drain() {
            // A pending next interval has already been counted in emitted, but not returned yet:
            // we drop it and make emitted point at it again, so that it will be replayed from the cache.
            final int emittedNow = emitted - ( next != null ? 1 : 0 );
            next = null;
            while( nextInterval() != null );
            emitted = emittedNow;
        }
    }
}