Package it.unimi.dsi.mg4j.search

Source Code of it.unimi.dsi.mg4j.search.LowPassDocumentIterator$LowPassIntervalIterator

package it.unimi.dsi.mg4j.search;

/*    
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2003-2010 Paolo Boldi and Sebastiano Vigna
*
*  This library is free software; you can redistribute it and/or modify it
*  under the terms of the GNU Lesser General Public License as published by the Free
*  Software Foundation; either version 3 of the License, or (at your option)
*  any later version.
*
*  This library is distributed in the hope that it will be useful, but
*  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
*  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
*  for more details.
*
*  You should have received a copy of the GNU Lesser General Public License
*  along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/

import it.unimi.dsi.fastutil.ints.IntSet;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceArrayMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;
import it.unimi.dsi.fastutil.objects.ReferenceSet;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.search.visitor.DocumentIteratorVisitor;
import it.unimi.dsi.util.Interval;

import java.io.IOException;
import java.util.Iterator;



/** A document iterator that filters another document iterator, returning just intervals (and containing
* documents) whose length does not exceed a given threshold.
*
* @author Paolo Boldi
* @author Sebastiano Vigna
* @since 0.9
*/

public class LowPassDocumentIterator extends AbstractDocumentIterator {

  private final static boolean DEBUG = false;
  @SuppressWarnings("unused")
  private final static boolean ASSERTS = false;

  /** The underlying iterator. */
  final private DocumentIterator documentIterator;
  /** If not <code>null</code>, the sole index involved in this iterator. */
  final private Index soleIndex;
  /** The iterator threshold. */
  final protected int threshold;
  /** A map from indices to interval iterators. */
  final private Reference2ReferenceArrayMap<Index,IntervalIterator> intervalIterators;
  /** A map from indices to the iterators returned for the current document. The key set may
   * not contain an index because the related iterator has never been requested. Moreover,
   * the iterator in this map for a given index may differ from the one in {@link #intervalIterators}
   * because it could be {@link IntervalIterators#TRUE} (in fact, in that case it may even
   * happen that {@link #intervalIterators} does not contain the index). */
  final private Reference2ReferenceArrayMap<Index,IntervalIterator> currentIterators;
  /** An unmodifiable wrapper around {@link #currentIterators}. */
  final private Reference2ReferenceMap<Index,IntervalIterator> unmodifiableCurrentIterators;


  /** Creates a new low-pass document iterator over a given iterator.
   * @param documentIterator the iterator to be filtered.
   * @param threshold the filter threshold.
   */
  protected LowPassDocumentIterator( final DocumentIterator documentIterator, final int threshold ) {
    this.documentIterator = documentIterator;
    this.threshold = threshold;
    final int n = documentIterator.indices().size();
    soleIndex = n == 1 ? indices().iterator().next() : null;
    intervalIterators = new Reference2ReferenceArrayMap<Index,IntervalIterator>( n );
    currentIterators = new Reference2ReferenceArrayMap<Index,IntervalIterator>( n );
    unmodifiableCurrentIterators = Reference2ReferenceMaps.unmodifiable( currentIterators );
  }

  /** Returns a low-pass document iterator over a given iterator.
   * @param it the iterator to be filtered.
   * @param threshold the filter threshold.
   */
  public static LowPassDocumentIterator getInstance( final DocumentIterator it, final int threshold ) {
    return new LowPassDocumentIterator( it, threshold );
  }

  public ReferenceSet<Index> indices() {
    return documentIterator.indices();
  }

  public int nextDocument() throws IOException {
    if ( next >= 0 ) {
      last = next;
      next = -1;
      return last;
    }
   
    do currentIterators.clear(); while( ( last = documentIterator.nextDocument() ) != -1 && ! isValid() );
    return last;
  }
 
  public int skipTo( final int n ) throws IOException {
    // The easy case.
    if ( last >= n ) return last;

    last = next = -1;
    currentIterators.clear();
    // We first try to get a candidate document.
    final int res = documentIterator.skipTo( n );
    // If this doesn't work, be bail out.
    if ( res == Integer.MAX_VALUE ) return Integer.MAX_VALUE;

    last = res;
    // Otherwise, we must manually check that we are on a valid document
    if ( isValid() ) return res;
    // If not, we invalidate and check whether there is another possible document.
    return nextDocument() != -1 ? last : Integer.MAX_VALUE;
  }

  private boolean isValid() throws IOException {
    /* The policy here is that a low-pass is valid is at least one of the underlying
     * interval iterators, once filtered, would return at least one interval. Note
     * that TRUE iterators are not actually filtered, so they always
     * return true on a call to hasNext(). */
   
    if ( soleIndex == null ) return intervalIterator( soleIndex ).hasNext();
   
    for( Index index: indices() ) if ( intervalIterator( index ).hasNext() ) return true;
    return false;
  }

  public Reference2ReferenceMap<Index,IntervalIterator> intervalIterators() throws IOException {
    final Iterator<Index> i = indices().iterator();
    while( i.hasNext() ) intervalIterator( i.next() );
    return unmodifiableCurrentIterators;
  }

  public IntervalIterator intervalIterator() throws IOException {
    if ( soleIndex == null ) throw new IllegalStateException();
    return intervalIterator( soleIndex );
  }

  public IntervalIterator intervalIterator( final Index index ) throws IOException {
    if ( last == -1 ) throw new IllegalStateException();
    if ( DEBUG ) System.err.println( this + ".intervalIterator(" + index + ")" );
    if ( ! documentIterator.indices().contains( index ) ) return IntervalIterators.TRUE;

    IntervalIterator intervalIterator;

    // If the iterator has been created and it's ready, we just return it.   
    if ( ( intervalIterator = currentIterators.get( index ) ) != null ) return intervalIterator;

    intervalIterator = documentIterator.intervalIterator( index );
     
    /* If the underlying iterator is TRUE or FALSE, then our constribution to the result is not relevant,
     * and we just pass this information upwards. E.g., consider the query (A OR title:B)~2 with
     * a document containing A but not B in its title. When evaluating the query for the title index,
     * the subquery before the low-pass operator evalutes to TRUE, meaning that its truth is independent
     * of the title field. This fact is not changed by the low-pass operator. */
    if ( intervalIterator != IntervalIterators.FALSE && intervalIterator != IntervalIterators.TRUE ) {
      intervalIterator = intervalIterators.get( index );
      if ( intervalIterator == null ) intervalIterators.put( index, intervalIterator = new LowPassIntervalIterator( index ) );
      intervalIterator.reset();
    }
   
    currentIterators.put( index, intervalIterator )
    return intervalIterator;
  }

  public void dispose() throws IOException {
    documentIterator.dispose();
  }
 
  public <T> T accept( final DocumentIteratorVisitor<T> visitor ) throws IOException {
    if ( ! visitor.visitPre( this ) ) return null;
    final T[] a = visitor.newArray( 1 );
    if ( a == null ) {
      if ( documentIterator.accept( visitor ) == null ) return null;
    }
    else {
      if ( ( a[ 0 ] = documentIterator.accept( visitor ) ) == null ) return null;
    }
    return visitor.visitPost( this, a );
  }

  public <T> T acceptOnTruePaths( final DocumentIteratorVisitor<T> visitor ) throws IOException {
    if ( ! visitor.visitPre( this ) ) return null;
    final T[] a = visitor.newArray( 1 );
    if ( a == null ) {
      if ( documentIterator.acceptOnTruePaths( visitor ) == null ) return null;     
    }
    else {
      if ( ( a[ 0 ] = documentIterator.acceptOnTruePaths( visitor ) ) == null ) return null;
    }
    return visitor.visitPost( this, a );
  }
 
  public String toString() {
     return this.getClass().getSimpleName() + "(" + documentIterator + ", " + threshold + ")";
  }
 
  /** An interval iterator returning just the interval shorter than {@link #threshold}. */
 
  private class LowPassIntervalIterator extends AbstractIntervalIterator implements IntervalIterator {
    /** The index of this iterator. */
    final Index index;
    /** The underlying interal iterator. */
    private IntervalIterator intervalIterator;
   
    public LowPassIntervalIterator( final Index index ) {
      this.index = index;
    }

    public void reset( ) throws IOException {
      next = null;
      intervalIterator = documentIterator.intervalIterator( index );
    }

    public void intervalTerms( final IntSet terms ) {
      // Just delegate to the filtered iterator
      intervalIterator.intervalTerms( terms );
    }
   
    public Interval nextInterval() throws IOException {
      if ( next != null ) {
        final Interval result = next;
        next = null;
        return result;
      }

      Interval result;
      while( ( result = intervalIterator.nextInterval() ) != null && result.length() > threshold );
      return result;
    }
   
    public int extent() {
      return Math.min( intervalIterator.extent(), threshold );
    }
   
    public String toString() {
       return getClass().getSimpleName() + "(" + intervalIterator + ", " + threshold + ")";
    }
  }
}
TOP

Related Classes of it.unimi.dsi.mg4j.search.LowPassDocumentIterator$LowPassIntervalIterator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.