Source Code of it.unimi.dsi.mg4j.search.ConsecutiveDocumentIterator$ConsecutiveIndexIntervalIterator

package it.unimi.dsi.mg4j.search;


/*     
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2003-2010 Paolo Boldi and Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */


import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntSet;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.util.Interval;
import it.unimi.dsi.util.Intervals;


import java.io.IOException;


/** An iterator returning documents containing consecutive intervals (in query order) 
 * satisfying the underlying queries.
 * 
 * <p>As an additional service, this class makes it possible to specify <em>gaps</em> between
 * intervals. If gaps are specified, a match will satisfy the condition
 * that the left extreme of the first interval is larger than or equal to the
 * first gap, the left extreme of the second interval is larger than
 * the right extreme of the first interval plus the second gap, and so on.  The standard
 * semantics corresponds thus to the everywhere zero gap array. 
 * 
 * <p>This semantics
 * makes it possible to peform phrasal searches &ldquo;with holes&rdquo;, typically
 * because of stopwords that have not been indexed. Note that it is possible to specify
 * a gap <em>before</em> the first interval, but not <em>after</em> the last interval,
 * as in general the document length is not known at this level of query resolution.
 * 
 * <p>This class will handle correctly {@link IntervalIterators#TRUE TRUE} iterators; in this
 * case, the semantics is defined as follows: an interval is in the output if it is formed by the union of disjoint intervals,
 * one from each input list, and each gap of value <var>k</var> corresponds to <var>k</var> iterators
 * returning all document positions as singleton intervals. Since {@link IntervalIterators#TRUE TRUE} represents a list containing just
 * the empty interval, the result is equivalent to dropping {@link IntervalIterators#TRUE TRUE} iterators from the input; as
 * a consequence, the gap of a {@link IntervalIterators#TRUE TRUE} iterator is merged with that of the following iterator.
 * 
 * <p><strong>Warning</strong>: In case gaps are specified, the mathematically correct semantics would require that
 * gaps before {@link IntervalIterators#TRUE TRUE} iterators that are not followed by any non-{@link IntervalIterators#TRUE TRUE} iterators
 * have the effect of enlarging the resulting intervals <em>on the right side</em>. However,
 * this behaviour is very difficult to implement at this level because document lengths are not known. For this
 * reason, if one or more {@link IntervalIterators#TRUE TRUE} iterators appear a the end of the component iterator list they will be simply dropped.  
 */


public class ConsecutiveDocumentIterator extends AbstractOrderedIntervalDocumentIterator {
  /** The gap array. This is essentially the array provided at construction time; however, if a {@link ConsecutiveIndexIntervalIterator}
   * is requested by {@link #getComposedIntervalIterator(Index)} this array will be used to store <em>cumulative</em> gaps. */
  private final int gap[];
  
  /** Returns a document iterator that computes the consecutive AND of the given array of iterators.
   * 
   * <P>Note that the special case of the empty and of the singleton arrays
   * are handled efficiently.
   * 
    * @param index the default index; relevant only if <code>it</code> has zero length.
   * @param documentIterator the iterators to be composed.
   * @return a document iterator that computes the consecutive AND of <code>it</code>. 
   * @throws IOException 
   */
  public static DocumentIterator getInstance( final Index index, final DocumentIterator... documentIterator ) throws IOException {
    if ( documentIterator.length == 0 ) return TrueDocumentIterator.getInstance( index );
    if ( documentIterator.length == 1 ) return documentIterator[ 0 ];
    return new ConsecutiveDocumentIterator( documentIterator, null );
  }
  
  /** Returns a document iterator that computes the consecutive AND of the given nonzero-length array of iterators.
   * 
   * <P>Note that the special case of the singleton array is handled efficiently.
   * 
   * @param documentIterator the iterators to be composed (at least one).
   * @return a document iterator that computes the consecutive AND of <code>documentIterator</code>. 
   * @throws IOException 
   */
  public static DocumentIterator getInstance( final DocumentIterator... documentIterator ) throws IOException {
    if ( documentIterator.length == 1 ) return documentIterator[ 0 ];
    return getInstance( null, documentIterator );
  }
  
  /** Returns a document iterator that computes the consecutive AND of the given nonzero-length array of iterators, adding
   * gaps between intervals.
   * 
   * <p>A match will satisfy the condition
   * that the left extreme of the first interval is larger than or equal to the
   * first gap, the left extreme of the second interval is larger than 
   * the right extreme of the first interval plus the second gap, and so on. This semantics
   * makes it possible to perform phrasal searches &ldquo;with holes&rdquo;, typically
   * because of stopwords that have not been indexed.
   * 
   * @param documentIterator the iterators to be composed (at least one).
   * @param gap an array of gaps parallel to <code>documentIterator</code>, or <code>null</code> for no gaps. 
   * @return a document iterator that computes the consecutive AND of <code>documentIterator</code> using the given gaps.  
   * @throws IOException 
   */
  public static DocumentIterator getInstance( final DocumentIterator documentIterator[], final int gap[] ) throws IOException {
    if ( gap != null && gap.length != documentIterator.length ) throw new IllegalArgumentException( "The number of gaps (" + gap.length + ") is not equal to the number of document iterators (" + documentIterator.length +")" );
    if ( documentIterator.length == 1 && ( gap == null || gap[ 0 ] == 0 ) ) return documentIterator[ 0 ];
    return new ConsecutiveDocumentIterator( documentIterator, gap );
  }
  
  protected ConsecutiveDocumentIterator( final DocumentIterator[] documentIterator, final int[] gap ) throws IOException {
    super( documentIterator );
    if ( gap == null ) this.gap = new int[ n ];
    else this.gap = gap.clone();
  }


  protected IntervalIterator getComposedIntervalIterator( final Index unused ) {
    if ( ASSERTS ) assert unused == soleIndex;
    if ( indexIterator == null ) return new ConsecutiveIntervalIterator( gap );
    // In this case, gap must be made cumulative.
    for( int i = 1; i < n; i++ ) gap[ i ] += gap[ i - 1 ] + 1;
    return new ConsecutiveIndexIntervalIterator( gap );
  }


  
  /** An interval iterator returning the BLOCK of the component iterator 
   * (i.e., intervals made of sequences of consecutive intervals
   * of the component iterator, in the given order). 
   * 
   * <p>In this implementation, {@link #advanced} is
   * never true when {@link AbstractOrderedIntervalIterator#endOfProcess} is true. 
   */
  
  private class ConsecutiveIntervalIterator extends AbstractOrderedIntervalIterator {
    /** A cached reference to the gap array. */
    @SuppressWarnings("hiding")
    private final int[] gap;
    /** The actual gaps. They depend on whether some {@link IntervalIterators#TRUE} iterator reduces the iterator array. */
    private final int[] actualGap;
    /** Whether the scan is over. */
    private boolean endOfProcess;
    /** The number of non-{@link IntervalIterators#TRUE} interval iterator. */
    private int m;


    public ConsecutiveIntervalIterator( final int[] gap ) {
      this.gap = gap;
      // The enlargement is made necessary by the filling long in reset().
      this.actualGap = new int[ n + 1 ];
    }


    /** Loads {@link #curr} with the first interval from each non-{@link IntervalIterators#TRUE} iterator, leaving
     * in {@link #m} the number of non-{@link IntervalIterators#TRUE} iterators, and in {@link #actualGap}
     * the gaps to be used for those {@link #m} iterators.
     */


    public void reset() throws IOException {
      final int[] actualGap = this.actualGap;
      final int[] gap = this.gap;
      final IntervalIterator[] intervalIterator = this.intervalIterator;
      
      actualGap[ m = 0 ] = 0;


      int i;
      for( i = 0; i < n; i++ ) {
        intervalIterator[ m ] = documentIterator[ i ].intervalIterator();
        if ( ASSERTS ) assert intervalIterator[ m ].hasNext();
        actualGap[ m ] += gap[ i ];
        curr[ m ] = Intervals.MINUS_INFINITY;
        if ( intervalIterator[ m ] != IntervalIterators.TRUE ) actualGap[ ++m ] = 1;
      }


      if ( m == 0 ) throw new IllegalStateException();      


      next = null;
      do  curr[ 0 ] = intervalIterator[ 0 ].nextInterval(); while( curr[ 0 ] != null && curr[ 0 ].left < actualGap[ 0 ] );
      if ( ! ( endOfProcess = curr[ 0 ] == null ) ) next = align();
    }
    
    public void intervalTerms( final IntSet terms ) {
      for( int i = m; i-- != 0; ) intervalIterator[ i ].intervalTerms( terms );
    }
    
    private Interval align() throws IOException {
      if ( DEBUG ) System.err.println( this + ".align()" );
      
      final Interval[] curr = this.curr;
      final int[] actualGap = this.actualGap;
      final IntervalIterator[] intervalIterator = this.intervalIterator;
      
      if ( DEBUG ) System.err.println( java.util.Arrays.asList( curr ) );
      int k = 0;
      Interval interval;


      while( k < m ) {
        for ( k = 1; k < m; k++ ) {
          while ( curr[ k ].left < curr[ k - 1 ].right + actualGap[ k ] )
            if ( ( curr[ k ] = intervalIterator[ k ].nextInterval() ) == null ) {
              endOfProcess = true;
              return null;
            }


          if ( curr[ k ].left > curr[ k - 1 ].right + actualGap[ k ] ) {
            final int limit = curr[ k ].left - actualGap[ k ];
            /* Note that we are not interested in intervals whole right extreme is
             * smaller than limit, as it is impossible to find a match in that case. */
            while ( ( interval = intervalIterator[ 0 ].nextInterval() ) != null && interval.right < limit );
            if ( endOfProcess = interval == null ) return null;
            curr[ 0 ] = interval;
            break;
          } 
        }
      }
      
      return Interval.valueOf( curr[ 0 ].left - actualGap[ 0 ], curr[ m - 1 ].right );
    }
    
    public Interval nextInterval() throws IOException {
      if ( next != null ) {
        final Interval result = next;
        next = null; 
        return result;
      }


      if ( endOfProcess ) return null;
      
      if ( ( curr[ 0 ] = intervalIterator[ 0 ].nextInterval() ) == null ) {
        endOfProcess = true;
        return null;
      }


      return align();
    }
    
    public int extent() {
      int s = 0;
      for ( int i = m; i-- != 0; ) s += intervalIterator[ i ].extent() + actualGap[ i ];
      return s;
    }


  }




  private class ConsecutiveIndexIntervalIterator extends AbstractOrderedIndexIntervalIterator {
    /** A cached reference to the gap array. */
    @SuppressWarnings("hiding")
    private final int[] gap;
    /** Whether the scan is over. */
    private boolean endOfProcess;
    
    public ConsecutiveIndexIntervalIterator( final int[] gap ) {
      this.gap = gap;
    }
    
    /** Resets the iterator by calling the superclass method, and then aligning all iterators.
     * 
     * <p>Note that in this class {@link #curr} is used to denote the value of the current position
     * minus the corresponding {@linkplain #gap cumulative gap}; this method updates {@link #curr} accordingly. */


    public void reset() throws IOException {
      final int[][] position = this.position;
      final int[] currPos = this.currPos;
      final int[] curr = this.curr;
      final int[] count = this.count;
      final int[] gap = this.gap;


      IntArrays.fill( currPos, 0 );
      for( int i = n; i-- != 0; ) {
        count[ i ] = indexIterator[ i ].count();
        position[ i ] = indexIterator[ i ].positionArray();
        curr[ i ] = position[ i ][ 0 ];
      }
      endOfProcess = false;
      for( int i = n; i-- != 0; ) curr[ i ] -= gap[ i ];
      
      if ( gap[ 0 ] != 0 ) {
        // Go beyond the 0-th gap. This must be done just once.
        next = null;
        while ( curr[ 0 ] < 0 && ++currPos[ 0 ] < count[ 0 ] ) curr[ 0 ] = position[ 0 ][ currPos[ 0 ] ] - gap[ 0 ];
        endOfProcess = currPos[ 0 ] == count[ 0 ];
      }
      if ( ! endOfProcess ) next = align();
    }


    public void intervalTerms( final IntSet terms ) {
      for( int i = n; i-- != 0; ) terms.add( indexIterator[ i ].termNumber() );
    }
    
    private Interval align() {
      if ( DEBUG ) System.err.println( this + ".align()" );


      final int[][] position = this.position;
      final int[] currPos = this.currPos;
      final int[] curr = this.curr;
      final int[] count = this.count;
      final int[] gap = this.gap;
      
      int c, k = 1, l = n <= 2 ? 0 : 2; // This is actually 2 % n
      boolean oneRoundDone = false;
      int[] p;
      int start = curr[ 0 ];
      
      for(;;) {
        p = position[ k ];
        c = currPos[ k ];
        // First, we try to align the k-th term.
        while ( c < count[ k ] && p[ c ] - gap[ k ] < start ) c++;
        // If we exhaust the term positions, it's all over.
        if ( c == count[ k ] ) {
          endOfProcess = true;
          return null;
        }
        curr[ k ] = p[ currPos[ k ] = c ] - gap[ k ];
        // If we went beyond start + k, we must update start.
        if ( curr[ k ] > start ) start = curr[ k ]; 
        
        // If k == 0, it means we have made a full round of alignment, so the next check is now valid.
        oneRoundDone |= ( k == 0 );
        // If oneRoundDone, all current normalised positions (curr[ x ] - gap[ x ]) are squeezed between start and curr[ l ].
        if ( oneRoundDone && curr[ l ] == start ) return Interval.valueOf( curr[ 0 ], curr[ 0 ] + gap[ n - 1 ] ); 
        k = l;
        
        if ( ( l = l + 1 ) == n ) l = 0;
      }
    }
    
    public Interval nextInterval() {
      if ( next != null ) {
        final Interval result = next;
        next = null; 
        return result;
      }


      if ( endOfProcess ) return null;
      
      if ( ++currPos[ 0 ] < count[ 0 ] ) curr[ 0 ] = position[ 0 ][ currPos[ 0 ] ] - gap[ 0 ];
      else {
        endOfProcess = true;
        return null;
      }
      return align();      
    }
  }
}
Source Code of it.unimi.dsi.mg4j.search.ConsecutiveDocumentIterator$ConsecutiveIndexIntervalIterator

Related Classes of it.unimi.dsi.mg4j.search.ConsecutiveDocumentIterator$ConsecutiveIndexIntervalIterator