package it.unimi.dsi.mg4j.search;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2003-2010 Paolo Boldi and Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.fastutil.objects.Reference2ReferenceArrayMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.index.IndexIterator;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
/** An abstract iterator on documents, generating the intersection of the documents returned by
 * a number of document iterators.
 *
 * <P>To be usable, this class must be subclassed so as to provide also an iterator on intervals.
 * Such iterators must be instantiable using {@link #getComposedIntervalIterator(Index)}.
 * The latter is an example of a <em>non-static factory method</em>, that is, a factory method which
 * depends on the enclosing instance. This pattern allows to easily specialise this class to iterators
 * that do different things, such as {@link it.unimi.dsi.mg4j.search.AndDocumentIterator} and
 * {@link it.unimi.dsi.mg4j.search.ConsecutiveDocumentIterator}, but that have a similar semantics at the document
 * level (the semantics may in fact be slightly different: for instance, not all documents belonging
 * to all components will actually appear in a consecutive iterator, as there may be documents filtered
 * at the interval level).
 *
 * <P>The important invariant is that <em>only</em> after a call to {@link #nextDocument()}, a call
 * to {@link #intervalIterator(Index)} will return an interval iterator over the document
 * just returned, and that for at least one index in {@link #indices()} the iterator will not be empty
 * or {@link it.unimi.dsi.mg4j.search.IntervalIterators#TRUE TRUE}.
 *
 * <h2>The intersection algorithm</h2>
 *
 * <p>Since MG4J 1.1, this class implements a new intersection algorithm that should be significantly
 * faster than the previous one. The main idea is that of letting sparser iterators interact as much
 * as possible to obtain a candidate common document, and then trying to align the others. At construction
 * time, the component iterators are sorted so that index iterators are separated, and sorted by frequency.
 * Then, each time we have to align the iterators we align them greedily starting from the index
 * iterators, in frequency order. This has the effect of skipping very quickly (and usually by
 * large jumps, which are handled nicely by indices with skips),
 * as the main interaction happens between low-frequency index iterators.
 *
 * <p>Moreover, this class treats in a special way
 * {@linkplain PayloadPredicateDocumentIterator index iterators coming from payload-based indices}. Such
 * iterators are checked at the end of the alignment process,
 * after all standard index iterators (and general document iterators)
 * are aligned. At that point, the special method {@link PayloadPredicateDocumentIterator#skipUnconditionallyTo(int)}
 * is used to position unconditionally such iterators and check whether the payload predicate is satisfied.
 * If this doesn't happen, the current candidate (obtained by alignment of standard iterators) is increased and the
 * whole process is restarted. This procedure guarantees that we will never search exhaustively in a
 * payload-based index a document record satisfying the predicate (unless, of course, we have a query
 * containing just {@link PayloadPredicateDocumentIterator}s), which is very efficient if the payload-based
 * index uses skipping.
 *
 */
public abstract class AbstractIntersectionDocumentIterator extends AbstractCompositeDocumentIterator {
    /** When true, tracing messages are written to standard error. */
    private final static boolean DEBUG = false;
    /** When true, internal consistency assertions are evaluated. */
    private final static boolean ASSERTS = false;

    /** The order imposed on component iterators at construction time:
     * {@link PayloadPredicateDocumentIterator}s come first (in no particular relative order),
     * then iterators that are not {@link IndexIterator}s (in no particular relative order),
     * and finally index iterators by decreasing frequency, so that the index iterator of smallest
     * frequency ends up in the last position.
     *
     * <p>The comparator is stateless, so a single shared instance is used. */
    private static final Comparator<DocumentIterator> COMPONENT_ORDER = new Comparator<DocumentIterator>() {
        public int compare( final DocumentIterator d0, final DocumentIterator d1 ) {
            final boolean payload0 = d0 instanceof PayloadPredicateDocumentIterator;
            final boolean payload1 = d1 instanceof PayloadPredicateDocumentIterator;
            if ( payload0 && payload1 ) return 0;
            if ( payload0 ) return -1;
            if ( payload1 ) return 1;

            final IndexIterator i0 = d0 instanceof IndexIterator ? (IndexIterator)d0 : null;
            final IndexIterator i1 = d1 instanceof IndexIterator ? (IndexIterator)d1 : null;
            if ( i0 == null && i1 == null ) return 0;
            // Exactly one of the two is an index iterator: it goes after the other one.
            if ( i0 == null ) return -1;
            if ( i1 == null ) return 1;

            try {
                final int f1 = i1.frequency();
                final int f0 = i0.frequency();
                // Decreasing frequency; explicit comparisons cannot overflow, unlike a subtraction.
                return f0 < f1 ? 1 : ( f0 > f1 ? -1 : 0 );
            }
            catch ( IOException e ) {
                // Comparator.compare() cannot throw checked exceptions.
                throw new RuntimeException( e );
            }
        }
    };

    /** A map from indices to interval iterators. */
    final protected Reference2ReferenceArrayMap<Index,IntervalIterator> intervalIterators;
    /** A map from indices to the iterators returned for the current document. The key set may
     * not contain an index because the related iterator has never been requested. Moreover,
     * the iterator in this map for a given index may differ from the one in {@link #intervalIterators}
     * because it could be {@link IntervalIterators#TRUE TRUE} (in fact, in that case it may even
     * happen that {@link #intervalIterators} does not contain the index). */
    final protected Reference2ReferenceArrayMap<Index,IntervalIterator> currentIterators;
    /** An unmodifiable wrapper around {@link #currentIterators}. */
    final protected Reference2ReferenceMap<Index,IntervalIterator> unmodifiableCurrentIterators;
    /** The provided document iterators, suitably sorted. */
    final private DocumentIterator[] sortedIterator;
    /** The last element of {@link #sortedIterator}. */
    final private DocumentIterator lastIterator;
    /** The prefix of {@link #sortedIterator} made of {@link PayloadPredicateDocumentIterator}s. */
    final private PayloadPredicateDocumentIterator[] payloadPredicateDocumentIterator;
    /** Iterators in {@link #sortedIterator} up to this position (exclusive) are instances of {@link PayloadPredicateDocumentIterator}. */
    final private int predicateStart;
    /** When true, the intersection list has been exhausted (of course, if {@link #next} is not -1 there is still an element to be returned). */
    private boolean exhausted;
    /** The current element on which all iterators are aligned, unless {@link #exhausted} is true, in which case the value is undefined. */
    private int curr;

    /** Creates a new intersection iterator using a given array of iterators and a given index.
     *
     * <p>The component iterators are copied and reordered, and then advanced to their first
     * common document, if any.
     *
     * @param index an index that will be passed to {@link AbstractCompositeDocumentIterator#AbstractCompositeDocumentIterator(Index, DocumentIterator...)}.
     * @param documentIterator the iterators to be intersected (at least one).
     * @throws IllegalArgumentException if <code>documentIterator</code> is empty.
     */
    protected AbstractIntersectionDocumentIterator( final Index index, final DocumentIterator[] documentIterator ) throws IOException {
        super( index, documentIterator );
        if ( documentIterator.length == 0 ) throw new IllegalArgumentException( "At least one document iterator is required" );

        sortedIterator = documentIterator.clone(); // We need a copy to reorder iterators
        intervalIterators = new Reference2ReferenceArrayMap<Index,IntervalIterator>( indices.size() );
        currentIterators = new Reference2ReferenceArrayMap<Index,IntervalIterator>( indices.size() );
        unmodifiableCurrentIterators = Reference2ReferenceMaps.unmodifiable( currentIterators );

        // We now reorder iterators putting in the back the index iterators of smallest frequency and moving to front payload-predicate iterators.
        Arrays.sort( sortedIterator, COMPONENT_ORDER );
        lastIterator = sortedIterator[ n - 1 ];

        // After sorting, payload-predicate iterators form a prefix: we locate its end by scanning backwards.
        int i;
        for( i = n; i-- != 0; ) if ( sortedIterator[ i ] instanceof PayloadPredicateDocumentIterator ) break;
        predicateStart = i + 1;
        payloadPredicateDocumentIterator = new PayloadPredicateDocumentIterator[ predicateStart ];
        for( i = predicateStart; i-- != 0; ) payloadPredicateDocumentIterator[ i ] = (PayloadPredicateDocumentIterator)sortedIterator[ i ];

        if ( DEBUG ) System.err.println( "Sorted iterators: " + Arrays.toString( sortedIterator ) );

        /* We now advance all iterators to their first common pointer, if any. Note
         * that the difference between documentIterator and this.documentIterator
         * is immaterial here. */
        for ( i = n; i-- != 0; )
            if ( ! sortedIterator[ i ].hasNext() ) {
                // If any of the iterators is empty, we're over.
                exhausted = true;
                return;
            }

        if ( align() ) next = curr;
        else exhausted = true;
    }

    /** Creates a new intersection iterator using a given array of iterators.
     *
     * <p>This constructor delegates to
     * {@link #AbstractIntersectionDocumentIterator(Index, DocumentIterator[])} with a <code>null</code> index.
     *
     * @param documentIterator the iterators to be intersected (at least one).
     */
    protected AbstractIntersectionDocumentIterator( final DocumentIterator[] documentIterator ) throws IOException {
        this( null, documentIterator );
    }

    /** Aligns all iterators on a common document pointer, starting the search from {@link #curr}.
     *
     * <P>After a call to this method, all component iterators are positioned
     * on the same document (and {@link #curr} is set to that document),
     * unless the method returns false, in which case there
     * are no more document pointers making alignment possible. Note that this
     * method does <em>not</em> modify {@link #exhausted}: it is the caller's
     * responsibility to set it when false is returned.
     *
     * <P>This method must not be called once {@link #exhausted} is true.
     *
     * @return true if all component iterators are aligned, false if
     * no alignment position could be found.
     */
    private boolean align() throws IOException {
        if ( ASSERTS ) assert ! exhausted;
        if ( DEBUG ) System.err.println( this + ".align()..." );

        int i, res;
        int candidate = curr;

        for(;;) {
            // We scan the iterators from the last one; payload-predicate iterators (the prefix) are thus tested last.
            for( i = n; i-- != 0 ; ) {
                if ( i < predicateStart ) res = payloadPredicateDocumentIterator[ i ].skipUnconditionallyTo( candidate );
                else res = sortedIterator[ i ].skipTo( candidate );

                // A component has no more documents: no further alignment is possible.
                if ( res == Integer.MAX_VALUE ) return false;

                if ( res != candidate ) {
                    // Note that for payload-predicate document iterators res might be negative
                    if ( res > candidate ) candidate = res;
                    else {
                        if ( ASSERTS ) assert res < 0;
                        candidate++;
                    }
                    // The candidate has changed: we restart the scan from the last iterator.
                    break;
                }
            }

            // The inner loop ran to completion: every iterator agreed on the candidate.
            if ( i == -1 ) {
                curr = candidate;
                return true;
            }
        }
    }

    /** Skips to the first document of the intersection whose pointer is not smaller than the given one.
     *
     * <p>If the last returned document already satisfies the request, it is returned
     * and the state of this iterator is left unchanged. Once the intersection is exhausted,
     * component iterators are no longer touched.
     *
     * @param n a document pointer.
     * @return the document pointer this iterator is now positioned on, or {@link Integer#MAX_VALUE}
     * if the intersection contains no such document.
     */
    public int skipTo( final int n ) throws IOException {
        if ( last >= n ) return last;

        last = next = -1;
        currentIterators.clear();

        // Once exhausted, curr is undefined and align() must not be invoked again.
        if ( exhausted ) return Integer.MAX_VALUE;

        if ( curr < n ) {
            if ( ( curr = lastIterator.skipTo( n ) ) == Integer.MAX_VALUE ) {
                exhausted = true;
                return Integer.MAX_VALUE;
            }
            else exhausted = ! align();
        }

        return exhausted ? Integer.MAX_VALUE : ( last = curr );
    }

    /** Returns the next document of the intersection.
     *
     * <p>If a document has already been computed and cached in {@link #next}, it is returned;
     * otherwise, the last iterator is advanced and all other iterators are aligned to it.
     * Once the intersection is exhausted, component iterators are no longer touched.
     *
     * @return the next document pointer, or -1 if the intersection is exhausted.
     */
    public int nextDocument() throws IOException {
        if ( DEBUG ) System.err.println( this + ".nextDocument()" );

        if ( next >= 0 ) {
            // We already know what to return
            last = next;
            next = -1;
            return last;
        }

        last = next = -1;
        currentIterators.clear();

        // Once exhausted, curr is undefined and align() must not be invoked again.
        if ( exhausted ) return -1;

        if ( ( curr = lastIterator.nextDocument() ) == -1 ) exhausted = true;
        else exhausted = ! align();

        if ( exhausted ) return -1;
        return last = curr;
    }

    /** Returns an unmodifiable map from indices to interval iterators for the current document.
     *
     * <p>An interval iterator is requested through {@link #intervalIterator(Index)} for each
     * index of this iterator, so that {@link #currentIterators} is fully populated.
     *
     * @return an unmodifiable view of {@link #currentIterators}.
     * @throws IllegalStateException if there is no current document and this iterator has at least one index.
     */
    public Reference2ReferenceMap<Index,IntervalIterator> intervalIterators() throws IOException {
        final Iterator<Index> i = indices.iterator();
        while( i.hasNext() ) intervalIterator( i.next() );
        return unmodifiableCurrentIterators;
    }

    /** Returns the interval iterator of this iterator for the current document and the given index.
     *
     * <p>The result is cached in {@link #currentIterators} until the current document changes.
     *
     * @param index an index.
     * @return {@link IntervalIterators#TRUE TRUE} if <code>index</code> is not among the indices of
     * this iterator or if all components return {@link IntervalIterators#TRUE TRUE};
     * {@link IntervalIterators#FALSE FALSE} if some component returns {@link IntervalIterators#FALSE FALSE};
     * otherwise, the (reset) iterator obtained from {@link #getComposedIntervalIterator(Index)}.
     * @throws IllegalStateException if there is no current document.
     */
    public IntervalIterator intervalIterator( final Index index ) throws IOException {
        if ( DEBUG ) System.err.println( this + ".intervalIterator(" + index + ")" );
        if ( last == -1 ) throw new IllegalStateException();
        if ( ! indices.contains( index ) ) return IntervalIterators.TRUE;

        IntervalIterator intervalIterator;
        // If the iterator has been created and it's ready, we just return it.
        if ( ( intervalIterator = currentIterators.get( index ) ) != null ) return intervalIterator;

        int i, c;
        /* None of the iterators may be FALSE. Otherwise, if all iterators are TRUE, we return TRUE. Otherwise, we return
         * (possibly after creation) the underlying interval iterator.
         *
         * In the case of index iterators, we can avoid the check. No index iterator can return FALSE, and
         * at least one must return an iterator != TRUE (as indices.contains(index)).
         */
        if ( indexIterator == null )
            for( i = c = 0; i < n; i++ ) {
                intervalIterator = documentIterator[ i ].intervalIterator( index );
                // We cannot be on a document one of whose iterators is FALSE.
                if ( intervalIterator == IntervalIterators.FALSE ) break;
                if ( intervalIterator != IntervalIterators.TRUE ) c++;
            }
        else i = c = n;

        // Note that we cannot optimise the case c == 1 because of gaps in ConsecutiveDocumentIterator.
        if ( i < n ) intervalIterator = IntervalIterators.FALSE;
        else if ( c == 0 ) intervalIterator = IntervalIterators.TRUE;
        else {
            // The composed iterator is created lazily, once per index, and reset for each new document.
            intervalIterator = intervalIterators.get( index );
            if ( intervalIterator == null ) intervalIterators.put( index, intervalIterator = getComposedIntervalIterator( index ) );
            intervalIterator.reset();
        }

        currentIterators.put( index, intervalIterator );
        return intervalIterator;
    }

    /** Creates the interval iterator combining the interval iterators of the components for the given index.
     *
     * <p>This non-static factory method is invoked at most once per index by
     * {@link #intervalIterator(Index)}, which caches the result in {@link #intervalIterators}.
     *
     * @param index an index.
     * @return a new interval iterator for <code>index</code>.
     */
    abstract protected IntervalIterator getComposedIntervalIterator( Index index );
}