package it.unimi.dsi.mg4j.search;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2003-2010 Paolo Boldi and Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.fastutil.ints.IntHeapSemiIndirectPriorityQueue;
import it.unimi.dsi.fastutil.ints.IntSet;
import it.unimi.dsi.fastutil.objects.ObjectArrays;
import it.unimi.dsi.fastutil.objects.ObjectHeapSemiIndirectPriorityQueue;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.util.Interval;
import it.unimi.dsi.util.Intervals;
import java.io.IOException;
/** A document iterator that returns the AND of a number of document iterators.
*
* <P>This class adds to {@link it.unimi.dsi.mg4j.search.AbstractIntersectionDocumentIterator}
* an interval iterator generating the AND of the intervals returned for each of the documents involved.
*/
public class AndDocumentIterator extends AbstractIntersectionDocumentIterator {
private final static boolean DEBUG = false;
/** Returns a document iterator that computes the AND of the given array of iterators.
*
* <P>Note that the special case of the empty and of the singleton arrays
* are handled efficiently.
*
* @param index the default index; relevant only if <code>it</code> has zero length.
* @param documentIterator the iterators to be joined.
* @return a document iterator that computes the AND of <code>it</code>.
* @throws IOException
*/
public static DocumentIterator getInstance( final Index index, final DocumentIterator... documentIterator ) throws IOException {
if ( documentIterator.length == 0 ) return TrueDocumentIterator.getInstance( index );
if ( documentIterator.length == 1 ) return documentIterator[ 0 ];
return new AndDocumentIterator( documentIterator );
}
/** Returns a document iterator that computes the AND of the given nonzero-length array of iterators.
*
* <P>Note that the special case of the singleton array is handled efficiently.
*
* @param documentIterator the iterators to be joined (at least one).
* @return a document iterator that computes the AND of <code>it</code>.
* @throws IOException
*/
public static DocumentIterator getInstance( final DocumentIterator... documentIterator ) throws IOException {
if ( documentIterator.length == 0 ) throw new IllegalArgumentException();
return getInstance( null, documentIterator );
}
protected AndDocumentIterator( final DocumentIterator[] documentIterator ) throws IOException {
super( documentIterator );
}
protected IntervalIterator getComposedIntervalIterator( final Index index ) {
return indexIterator == null ? new AndIntervalIterator( index ) : new AndIndexIntervalIterator( index );
}
/** An interval iterator returning the AND (in the Clarke−Cormack−Burkowski lattice) of the component interval iterators. */
private class AndIntervalIterator extends AbstractCompositeIntervalIterator implements IntervalIterator {
/** The index of this iterator. */
final private Index index;
/** A heap-based indirect priority queue used to keep track of the currently scanned intervals. */
final private ObjectHeapSemiIndirectPriorityQueue<Interval> queue;
/** Whether the scan is over. */
private boolean endOfProcess;
/** The left extreme of the last returned interval, or {@link Integer#MIN_VALUE} after a {@link #reset()}. */
private int lastLeft;
/** The maximum right extreme currently in the queue. */
private int maxRight;
/** Creates a new AND interval iterator.
*
* @param index the index of the iterator.
*/
public AndIntervalIterator( final Index index ) {
super( n );
// We just set up some internal data, but we perform no initialisation.
this.index = index;
queue = new ObjectHeapSemiIndirectPriorityQueue<Interval>( curr, Intervals.STARTS_BEFORE_OR_PROLONGS );
}
public void reset() throws IOException {
ObjectArrays.fill( curr, null );
queue.clear();
next = null;
maxRight = Integer.MIN_VALUE;
lastLeft = Integer.MIN_VALUE;
endOfProcess = false;
for ( int i = 0; i < n; i++ ) {
intervalIterator[ i ] = documentIterator[ i ].intervalIterator( index );
if ( ! intervalIterator[ i ].hasNext() ) {
// If by any chance we meet an empty (false) iterator we are just false
endOfProcess = true;
return;
}
// TRUE iterators need a special treatment
if ( intervalIterator[ i ] != IntervalIterators.TRUE ) {
curr[ i ] = intervalIterator[ i ].nextInterval();
queue.enqueue( i );
maxRight = Math.max( maxRight, curr[ i ].right );
}
}
/* At this point the queue could not be full because of TRUE iterators, but it's OK,
* because TRUE is the neutral element of conjunction. However, we cannot handle here
* the case in which all conjuncts are TRUE, as this case must be handled at a higher level. */
if ( queue.isEmpty() ) throw new IllegalArgumentException();
}
public void intervalTerms( final IntSet terms ) {
for( int i = n; i-- != 0; ) intervalIterator[ i ].intervalTerms( terms );
}
public Interval nextInterval() throws IOException {
if ( next != null ) {
final Interval result = next;
next = null;
return result;
}
if ( endOfProcess ) return null;
int first;
while ( curr[ first = queue.first() ].left == lastLeft ) {
if ( ( curr[ first ] = intervalIterator[ first ].nextInterval() ) == null ) {
endOfProcess = true;
return null;
}
maxRight = Math.max( maxRight, curr[ first ].right );
queue.changed();
}
int nextLeft, nextRight;
do {
nextLeft = curr[ first = queue.first() ].left;
nextRight = maxRight;
if ( DEBUG ) System.err.println( this + " is saving interval " + Interval.valueOf( nextLeft, nextRight ) );
/* We check whether the current top is equal to the span of the queue, and
* whether the top interval iterator is exhausted. In both cases, the
* current span is guaranteed to be a minimal interval. */
if ( curr[ first ].right == maxRight || ( endOfProcess = ( curr[ first ] = intervalIterator[ first ].nextInterval() ) == null ) ) break;
maxRight = Math.max( maxRight, curr[ first ].right );
queue.changed();
} while ( maxRight == nextRight );
return Interval.valueOf( lastLeft = nextLeft, nextRight );
}
public int extent() {
int s = 0;
for ( int i = n; i-- != 0; ) s += intervalIterator[ i ].extent();
return s;
}
}
/** An interval iterator returning the AND (in the Clarke−Cormack−Burkowski lattice) of the component interval iterators. */
private class AndIndexIntervalIterator extends AbstractCompositeIndexIntervalIterator implements IntervalIterator {
/** The index of this iterator. */
final private Index index;
/** A heap-based indirect priority queue used to keep track of the currently scanned positions. */
final private IntHeapSemiIndirectPriorityQueue queue;
/** Whether the scan is over. */
private boolean endOfProcess;
/** The left extreme of the last returned interval, or {@link Integer#MIN_VALUE} after a {@link #reset()}. */
private int lastLeft;
/** The maximum right extreme currently in the queue. */
private int maxRight;
/** Creates a new AND interval iterator.
*
* @param index the index of the iterator.
*/
public AndIndexIntervalIterator( final Index index ) {
super( n );
// We just set up some internal data, but we perform no initialisation.
this.index = index;
queue = new IntHeapSemiIndirectPriorityQueue( curr );
}
public void reset() throws IOException {
queue.clear();
maxRight = Integer.MIN_VALUE;
lastLeft = Integer.MIN_VALUE;
next = null;
endOfProcess = false;
for ( int i = 0; i < n; i++ ) {
// The case != index is identical to the TRUE case in AndIntervalIterator.
if ( indexIterator[ i ].index() == index ) {
position[ i ] = indexIterator[ i ].positionArray();
count[ i ] = indexIterator[ i ].count();
curr[ i ] = position[ i ][ currPos[ i ] = 0 ];
queue.enqueue( i );
maxRight = Math.max( maxRight, curr[ i ] );
}
}
/* At this point the queue could not be full because of TRUE iterators, but it's OK,
* because TRUE is the neutral element of conjunction. However, we cannot handle here
* the case in which all conjuncts are TRUE, as this case must be handled at a higher level. */
if ( queue.isEmpty() ) throw new IllegalArgumentException();
}
public void intervalTerms( final IntSet terms ) {
for( int i = n; i-- != 0; ) terms.add( indexIterator[ i ].termNumber() );
}
public Interval nextInterval() {
if ( next != null ) {
final Interval result = next;
next = null;
return result;
}
if ( endOfProcess ) return null;
int first;
while ( curr[ first = queue.first() ] == lastLeft ) {
if ( ++currPos[ first ] == count[ first ] ) {
endOfProcess = true;
return null;
}
curr[ first ] = position[ first ][ currPos[ first ] ];
maxRight = Math.max( maxRight, curr[ first ] );
queue.changed();
}
int nextLeft, nextRight;
do {
nextLeft = curr[ first = queue.first() ];
nextRight = maxRight;
if ( DEBUG ) System.err.println( this + " is saving interval " + Interval.valueOf( nextLeft, nextRight ) );
/* We check whether all iterators are on the same position, and
* whether the top interval iterator is exhausted. In both cases, the
* current span is guaranteed to be a minimal interval. */
if ( curr[ first ] == maxRight || ( endOfProcess = ++currPos[ first ] == count[ first ] ) ) break;
curr[ first ] = position[ first ][ currPos[ first ] ];
if ( maxRight < curr[ first ] ) maxRight = curr[ first ];
queue.changed();
} while ( maxRight == nextRight );
return Interval.valueOf( lastLeft = nextLeft, nextRight );
}
public int extent() {
return n;
}
}
}