Source Code of it.unimi.dsi.mg4j.search.AbstractIntersectionDocumentIterator

package it.unimi.dsi.mg4j.search;


/*     
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2003-2010 Paolo Boldi and Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */


import it.unimi.dsi.fastutil.objects.Reference2ReferenceArrayMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.index.IndexIterator;


import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;




/** An abstract iterator on documents, generating the intersection of the documents returned by 
 * a number of document iterators.
 * 
 * <P>To be usable, this class must be subclassed so as to also provide an iterator on intervals.
 * Such iterators must be instantiable using {@link #getComposedIntervalIterator(Index)}.
 * The latter is an example of a <em>non-static factory method</em>, that is, a factory method which
 * depends on the enclosing instance. This pattern makes it easy to specialise this class to iterators
 * that do different things, such as {@link it.unimi.dsi.mg4j.search.AndDocumentIterator} and
 * {@link it.unimi.dsi.mg4j.search.ConsecutiveDocumentIterator}, but that have similar semantics at the document
 * level (the semantics may in fact be slightly different: for instance, not all documents belonging
 * to all components will actually appear in a consecutive iterator, as there may be documents filtered
 * at the interval level).
 * 
 * <P>The important invariant is that <em>only</em> after a call to {@link #nextDocument()}, a call
 * to {@link #intervalIterator(Index)} will return an interval iterator over the document
 * just returned, and that for at least one index in {@link #indices()} the iterator will not be empty
 * or {@link it.unimi.dsi.mg4j.search.IntervalIterators#TRUE TRUE}.
 * 
 * <h2>The intersection algorithm</h2>
 * 
 * <p>Since MG4J 1.1, this class implements a new intersection algorithm that should be significantly
 * faster than the previous one. The main idea is that of letting sparser iterators interact as much
 * as possible to obtain a candidate common document, and then trying to align the others. At construction
 * time, the component iterators are sorted so that index iterators are separated, and sorted by frequency.
 * Then, each time we have to align the iterators we align them greedily starting from the index
 * iterators, from the lowest frequency to the highest. This has the effect of skipping very quickly (and usually by
 * large jumps, which are handled nicely by indices with skips),
 * as the main interaction happens between low-frequency index iterators.
 * 
 * <p>Moreover, this class treats in a special way
 *  {@linkplain PayloadPredicateDocumentIterator index iterators coming from payload-based indices}. Such
 * iterators are checked at the end of the alignment process, 
 * after all standard index iterators (and general document iterators)
 * are aligned. At that point, the special method {@link PayloadPredicateDocumentIterator#skipUnconditionallyTo(int)}
 * is used to position unconditionally such iterators and check whether the payload predicate is satisfied.
 * If this doesn't happen, the current candidate (obtained by alignment of standard iterators) is increased and the
 * whole process is restarted. This procedure guarantees that we will never search exhaustively in a 
 * payload-based index a document record satisfying the predicate (unless, of course, we have a query
 * containing just {@link PayloadPredicateDocumentIterator}s), which is very efficient if the payload-based
 * index uses skipping.
 * 
 */


public abstract class AbstractIntersectionDocumentIterator extends AbstractCompositeDocumentIterator {
  private final static boolean DEBUG = false;
  private final static boolean ASSERTS = false;


  /** A map from indices to interval iterators. */
  final protected Reference2ReferenceArrayMap<Index,IntervalIterator> intervalIterators;
  /** A map from indices to the iterators returned for the current document. The key set may
   * not contain an index because the related iterator has never been requested. Moreover,
   * the iterator in this map for a given index may differ from the one in {@link #intervalIterators}
   * because it could be {@link IntervalIterators#TRUE TRUE} (in fact, in that case it may even
   * happen that {@link #intervalIterators} does not contain the index). */
  final protected Reference2ReferenceArrayMap<Index,IntervalIterator> currentIterators;
  /** An unmodifiable wrapper around {@link #currentIterators}. */
  final protected Reference2ReferenceMap<Index,IntervalIterator> unmodifiableCurrentIterators;
  /** The provided document iterators, suitably sorted. */
  final private DocumentIterator[] sortedIterator;
  /** The last element of {@link #sortedIterator}. */
  final private DocumentIterator lastIterator;
  /** The prefix of {@link #sortedIterator} made of {@link PayloadPredicateDocumentIterator}s. */
  final private PayloadPredicateDocumentIterator[] payloadPredicateDocumentIterator;
  /** Iterators in {@link #sortedIterator} up to this position (exclusive) are instances of {@link PayloadPredicateDocumentIterator}. */
  final private int predicateStart;
  /** When true, the intersection list has been exhausted (of course, if {@link #next} is not -1 there is still an element to be returned). */
  private boolean exhausted;
  /** The current element on which all iterators are aligned, unless {@link #exhausted} is true, in which case the value is undefined. */
  private int curr;


  /** Creates a new intersection iterator using a given array of iterators and a given index.
   *  @param index an index that will be passed to {@link AbstractCompositeDocumentIterator#AbstractCompositeDocumentIterator(Index, DocumentIterator...)}.
   *  @param documentIterator the iterators to be intersected (at least one).
   */
  protected AbstractIntersectionDocumentIterator( final Index index, final DocumentIterator[] documentIterator ) throws IOException {
    super( index, documentIterator ); 
    if ( documentIterator.length == 0 ) throw new IllegalArgumentException();
    sortedIterator = documentIterator.clone(); // We need a copy to reorder iterators
    
    intervalIterators = new Reference2ReferenceArrayMap<Index,IntervalIterator>( indices.size() );
    currentIterators = new Reference2ReferenceArrayMap<Index,IntervalIterator>( indices.size() );
    unmodifiableCurrentIterators = Reference2ReferenceMaps.unmodifiable( currentIterators );
    
    // We now reorder iterators putting in the back the index iterators of smallest frequency and moving to front payload-predicate iterators.
    Arrays.sort( sortedIterator, new Comparator<DocumentIterator>() {
      public int compare( final DocumentIterator d0, final DocumentIterator d1 ) {
        final PayloadPredicateDocumentIterator p0 = d0 instanceof PayloadPredicateDocumentIterator ? (PayloadPredicateDocumentIterator)d0 : null;
        final PayloadPredicateDocumentIterator p1 = d1 instanceof PayloadPredicateDocumentIterator ? (PayloadPredicateDocumentIterator)d1 : null;
        
        if ( p0 != null && p1 != null ) return 0;
        if ( p0 != null ) return -1;
        if ( p1 != null ) return 1;
        
        final IndexIterator i0 = d0 instanceof IndexIterator ? (IndexIterator)d0 : null;
        final IndexIterator i1 = d1 instanceof IndexIterator ? (IndexIterator)d1 : null;
        if ( i0 == null && i1 == null ) return 0;
        if ( ( i0 != null ) != ( i1 != null ) ) return ( i0 != null ) ? 1 : -1;
        try {
          return i1.frequency() - i0.frequency();
        }
        catch ( IOException e ) {
          throw new RuntimeException( e );
        }
      }
    }
    );


    lastIterator = sortedIterator[ n - 1 ];
    int i;
    for( i = n; i-- != 0; ) if ( sortedIterator[ i ] instanceof PayloadPredicateDocumentIterator ) break;
    predicateStart = i + 1;
    payloadPredicateDocumentIterator = new PayloadPredicateDocumentIterator[ predicateStart ];
    for( i = predicateStart; i-- != 0; ) payloadPredicateDocumentIterator[ i ] = (PayloadPredicateDocumentIterator)sortedIterator[ i ];
    
    if ( DEBUG ) System.err.println( "Sorted iterators: " + Arrays.toString( sortedIterator ) );
    
    /* We now advance all iterators to their first common pointer, if any. Note
     * that the difference between documentIterator and this.documentIterator
     * is immaterial here. */


    for ( i = n; i-- != 0; ) 
      if ( ! sortedIterator[ i ].hasNext() ) {
        // If any of the iterators is empty, we're over.
        exhausted = true;
        return;
      }


    if ( align() ) next = curr;
    else exhausted = true;
  }


  /** Creates a new intersection iterator using a given array of iterators.
   *  @param documentIterator the iterators to be insersected (at least one).
   */
  protected AbstractIntersectionDocumentIterator( final DocumentIterator[] documentIterator ) throws IOException {
    this( null, documentIterator );
  }
  
  /** Align all iterators on the first common document pointer after {@link #curr}.
   * 
   * <P>After a call to this method, all component iterators are positioned
   * on the same document (and {@link #curr} is set to that document), 
   * unless the method returns false, in which case there
   * are no more document pointer making alignment possible (and {@link #exhausted} is true).
   * 
   * @return true if all component iterators are aligned, false if
   * no alignment position could be found.
   */
  private boolean align() throws IOException {
    if ( ASSERTS ) assert ! exhausted;
    if ( DEBUG ) System.err.println( this + ".align()..." );
    
    int i, res;
    int candidate = curr;
    
    for(;;) {
      for( i = n; i-- != 0 ; ) {
        if ( i < predicateStart ) res = payloadPredicateDocumentIterator[ i ].skipUnconditionallyTo( candidate );
        else res = sortedIterator[ i ].skipTo( candidate );


        if ( res == Integer.MAX_VALUE ) return false;
        
        if ( res != candidate ) {
          // Note that for payload-predicate document iterators res might be negative
          if ( res > candidate ) candidate = res;
          else {
            if ( ASSERTS ) assert res < 0;
            candidate++;
          }
          break;
        }
      }
      
      if ( i == -1 ) {
        curr = candidate;
        return true;
      }
    }
  }


  public int skipTo( final int n ) throws IOException {
    if ( last >= n ) return last;
    last = next = -1;
    currentIterators.clear();
    if ( curr < n ) {
      if ( ( curr = lastIterator.skipTo( n ) ) == Integer.MAX_VALUE ) {
        exhausted = true;
        return Integer.MAX_VALUE;
      }
      else exhausted = ! align();
    }
    return exhausted ? Integer.MAX_VALUE : ( last = curr );
  }
  
  public int nextDocument() throws IOException {
    if ( DEBUG ) System.err.println( this + ".hasNext()" );


    if ( next >= 0 ) {
      // We already know what to return
      last = next;
      next = -1;
      return last;
    }


    last = next = -1;
    currentIterators.clear();
    if ( ( curr = lastIterator.nextDocument() ) == -1 ) exhausted = true;
    else exhausted = ! align();
    if ( exhausted ) return -1;
    return last = curr;
  }


  public Reference2ReferenceMap<Index,IntervalIterator> intervalIterators() throws IOException {
    final Iterator<Index> i = indices.iterator();
    while( i.hasNext() ) intervalIterator( i.next() );
    return unmodifiableCurrentIterators;
  }


  public IntervalIterator intervalIterator( final Index index ) throws IOException {
    if ( DEBUG ) System.err.println( this + ".intervalIterator(" + index + ")" );
    if ( last == -1 ) throw new IllegalStateException(); 
    if ( ! indices.contains( index ) ) return IntervalIterators.TRUE;
    IntervalIterator intervalIterator;


    // If the iterator has been created and it's ready, we just return it.    
    if ( ( intervalIterator = currentIterators.get( index ) ) != null ) return intervalIterator;


    int i, c;
      
    /* None of the iterators may be FALSE. Otherwise, if all iterators are TRUE, we return TRUE. Otherwise, we return
     * (possibly after creation) the underlying interval iterator.
     * 
     * In the case of index iterators, we can avoid the check. No index iterator can return FALSE, and
     * at least one must return an iterator != TRUE (as indices.contains(index)).
     */
    if ( indexIterator == null ) 
      for( i = c = 0; i < n; i++ ) {
        intervalIterator = documentIterator[ i ].intervalIterator( index );
        // We cannot be on a document one of whose iterators if FALSE.
        if ( intervalIterator == IntervalIterators.FALSE ) break;
        if ( intervalIterator != IntervalIterators.TRUE ) c++;
      }
    else i = c = n;
    
    // Note that we cannot optimise the case c == 1 because of gaps in ConsecutiveDocumentIterator.
    if ( i < n ) intervalIterator = IntervalIterators.FALSE;
    else if ( c == 0 ) intervalIterator = IntervalIterators.TRUE;
    else {
      intervalIterator = intervalIterators.get( index );
      if ( intervalIterator == null ) intervalIterators.put( index, intervalIterator = getComposedIntervalIterator( index ) );
      intervalIterator.reset();
    }


    currentIterators.put( index, intervalIterator );  
    return intervalIterator;
  }
  
  abstract protected IntervalIterator getComposedIntervalIterator( Index index );
}
Source Code of it.unimi.dsi.mg4j.search.AbstractIntersectionDocumentIterator

Related Classes of it.unimi.dsi.mg4j.search.AbstractIntersectionDocumentIterator