Package it.unimi.dsi.mg4j.index

Examples of it.unimi.dsi.mg4j.index.IndexIterator
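The snippets below exercise IndexIterator from several angles: concatenating and merging posting lists through an IndexWriter, partitioning an index, unit-testing two indices against each other, and wrapping per-cluster iterators. As a common reference point, here is a minimal sketch of the basic read loop they all build on: get an IndexReader from an Index, ask it for the posting list of a term, and walk documents, counts and positions. The class and its dumpPostings helper are illustrative assumptions, not taken from the snippets, and they presume the index was built with counts and positions; the calls they use all appear in the examples below.

import java.io.IOException;

import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.index.IndexIterator;
import it.unimi.dsi.mg4j.index.IndexReader;

public class IndexIteratorSketch {
  /** Prints every posting of <code>term</code>: document pointer, count and positions.
   * A sketch only; it assumes the index stores counts and positions. */
  public static void dumpPostings( final Index index, final CharSequence term ) throws IOException {
    final IndexReader indexReader = index.getReader();
    try {
      final IndexIterator indexIterator = indexReader.documents( term );
      System.out.println( term + " occurs in " + indexIterator.frequency() + " documents" );
      while ( indexIterator.hasNext() ) {
        final int document = indexIterator.nextDocument();    // next document containing the term
        final int count = indexIterator.count();              // occurrences within that document
        final int[] position = indexIterator.positionArray(); // only the first count entries are valid
        System.out.print( document + " (" + count + "):" );
        for ( int j = 0; j < count; j++ ) System.out.print( " " + position[ j ] );
        System.out.println();
      }
    }
    finally {
      indexReader.close();
    }
  }
}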


  }
 
  public DocumentIterator visitPost( final MultiTerm node, final DocumentIterator subNode[] ) throws QueryBuilderVisitorException {
    final IndexIterator[] indexIterator = new IndexIterator[ subNode.length ];
    System.arraycopy( subNode, 0, indexIterator, 0, indexIterator.length );
    IndexIterator result;
    try {
      result = MultiTermIndexIterator.getInstance( curr.top(), indexIterator ).weight( weights.popDouble() );
    }
    catch ( IOException e ) {
      throw new QueryBuilderVisitorException( e );
    }
    result.term( node.toString() );
    return result;
  }
View Full Code Here


    if ( ! metadataOnly ) {
      int currIndex, numPrevDocs = 0, currDoc, count;
      OutputBitStream obs;
      Index i;
      IndexIterator ii;

      if ( p != 0 ) variableQuantumIndexWriter.newInvertedList(totalFrequency, p, predictedSize, predictedLengthNumBits );
      else indexWriter.newInvertedList();
      
      indexWriter.writeFrequency( totalFrequency );

      for( int k = currIndex = 0; k < numUsedIndices; k++ ) { // We can just concatenate posting lists.

        // We must update the number of previously seen documents, possibly adding those in skipped indices.
        while( currIndex < usedIndex[ k ] ) numPrevDocs += index[ currIndex++ ].numberOfDocuments;

        i = index[ currIndex ];
        ii = indexIterator[ currIndex ];

        for( int j = frequency[ currIndex ]; j-- != 0; ) {
          obs = indexWriter.newDocumentRecord();
          currDoc = ii.nextDocument() + numPrevDocs;
          indexWriter.writeDocumentPointer( obs, currDoc );

          if ( i.hasPayloads ) indexWriter.writePayload( obs, ii.payload() );

          if ( i.hasCounts ) {
            count = ii.count();
            if ( hasCounts ) indexWriter.writePositionCount( obs, count );
            if ( hasPositions ) indexWriter.writeDocumentPositions( obs, ii.positionArray(), 0, count, size != null ? size[ currDoc ] : -1 );
          }   
        }
      }
    }
   
View Full Code Here

      indexWriter.writeFrequency( totalFrequency );

      int currDoc = -1, count;
      OutputBitStream obs;
      Index i;
      IndexIterator ir;

      while( ! documentQueue.isEmpty() ) {
        // We extract the smallest document pointer, and enqueue it in the new index.
        if ( currDoc == doc[ currIndex = documentQueue.first() ] ) throw new IllegalStateException( "The indices to be merged contain document " + currDoc + " at least twice (once in index " + inputBasename[ lastIndex ] + " and once in index " + inputBasename[ currIndex ] + ")" );
        currDoc = doc[ currIndex ];

        obs = indexWriter.newDocumentRecord();
        indexWriter.writeDocumentPointer( obs, currDoc );
        i = index[ currIndex ];
        ir = indexIterator[ currIndex ];

        if ( i.hasPayloads ) indexWriter.writePayload( obs, ir.payload() );

        if ( i.hasCounts ) {
          count = ir.count();
          if ( hasCounts ) indexWriter.writePositionCount( obs, count );
          if ( hasPositions ) indexWriter.writeDocumentPositions( obs, ir.positionArray(), 0, count, size == null ? -1 : size[ currDoc ] );
        }

        // If we just wrote the last document pointer of this term in index j, we dequeue it.
        if ( --frequency[ currIndex ] == 0 ) documentQueue.dequeue();
        else {
          doc[ currIndex ] = ir.nextDocument();
          documentQueue.changed();
        }
        lastIndex = currIndex;
      }
    }
View Full Code Here

       
        if ( p0 != null && p1 != null ) return 0;
        if ( p0 != null ) return -1;
        if ( p1 != null ) return 1;
       
        final IndexIterator i0 = d0 instanceof IndexIterator ? (IndexIterator)d0 : null;
        final IndexIterator i1 = d1 instanceof IndexIterator ? (IndexIterator)d1 : null;
        if ( i0 == null && i1 == null ) return 0;
        if ( ( i0 != null ) != ( i1 != null ) ) return ( i0 != null ) ? 1 : -1;
        try {
          return i1.frequency() - i0.frequency();
        }
        catch ( IOException e ) {
          throw new RuntimeException( e );
        }
      }
View Full Code Here

     * the maximum over all indices. */
    int currIndex, prevDoc = -1, currDoc, count;
    int temp[];
    OutputBitStream obs;
    Index i;
    IndexIterator ii;
 
    // Note that the total frequency can be computed only during the merge.
    for( int k = numUsedIndices; k-- != 0; ) {
      currIndex = usedIndex[ k ];
      frequency[ currIndex ] = indexIterator[ currIndex ].frequency();
      doc[ currIndex ] = indexIterator[ currIndex ].nextDocument();
      documentQueue.enqueue( currIndex );
    }
   
    // First phase: we write the inverted list using a quick-and-dirty format in the cache.
    cacheBitStreamOut.position( 0 );
    int  totalFrequency = 0, increment, prevIndex, totalCount;
   
    while( ! documentQueue.isEmpty() ) {
      // We extract the smallest document pointer, and enqueue it in the new index.
      currDoc = doc[ currIndex = documentQueue.firstInt() ];
      totalFrequency++;
      if ( ! metadataOnly ) cacheBitStreamOut.writeDelta( currDoc - prevDoc - 1 );
     
      totalCount = prevIndex = increment = 0;
     
      do {
        if ( incremental )
          while( prevIndex < currIndex ) {
            /* Note that some virtual documents could not exist at all in some index (in which
             * case we extend the size list with zeroes). */
            if ( sizesSize[ prevIndex ] > currDoc ) increment += index[ prevIndex ].sizes.getInt( currDoc );
            prevIndex++;
          }
        i = index[ currIndex ];
        ii = indexIterator[ currIndex ];
     
        if ( ! metadataOnly && i.hasCounts ) {
          count = ii.count();
          if ( i.hasPositions ) {
            temp = ii.positionArray();
            if ( ! incremental && totalCount > 0 && temp[ 0 ] <= position[ totalCount - 1 ] ) throw new IllegalStateException( "Positions in document " + currDoc + " are not increasing; you probably need to require an incremental pasting" );
            for( int k = count; k-- != 0; ) position[ totalCount + k ] = temp[ k ] + increment;
          }
          totalCount += count;
        }
       
        // If we just wrote the last document pointer of this term in index j, we dequeue it.
        if ( --frequency[ currIndex ] == 0 ) documentQueue.dequeue();
        else {
          doc[ currIndex ] = ii.nextDocument();
          documentQueue.changed();
        }
      } while( ! documentQueue.isEmpty() && doc[ currIndex = documentQueue.firstInt() ] == currDoc );
 
      if ( totalCount > maxCount ) maxCount = totalCount;
View Full Code Here

      int left = 0; // The left extreme of the current block
      long count = 0; // Number of documents/occurrences in the current block

      final IndexReader indexReader = index.getReader();
      long blockSize = total / blockSizeDivisor++; // The approximate size of a block
      IndexIterator indexIterator;
     
      for ( int i = k = 0; i < terms; i++ ) {
        indexIterator = indexReader.nextIterator();
        frequency = indexIterator.frequency();
        if ( ! index.hasPositions ) count += frequency;
        for ( int j = frequency; j-- != 0; ) {
          indexIterator.nextDocument();
          if ( index.hasPositions ) count += indexIterator.count();
        }
       
        if ( i == terms - 1 ) i++; // To fool the next check.
        if ( count >= blockSize && k < numberOfLocalIndices - 1 || i == terms ) {
          LOGGER.info( "New term interval [" + left + ".." + i + "] (" + termMap.list().get( left ) + " -> " + ( i == terms ? "" : termMap.list().get( i ) ) + ")" );
View Full Code Here

  private static boolean textTerm = false;

  public static void testIndexIterator() throws IOException {
    IndexReader firstIndexReader = firstIndex.getReader();
    IndexReader secondIndexReader = secondIndex.getReader(1000);
    IndexIterator firstIterator = null;
    IndexIterator secondIterator = null;
   
    for(int i = 0;i<firstIndex.numberOfTerms;i++){
      try{
        System.out.println("term: " + i);
        firstIterator = firstIndexReader.documents(i);       
        secondIterator = !textTerm?secondIndexReader.documents(i):secondIndexReader.documents(firstIndex.termMap.list().get(i));       
       
        /** Compare hasNext*/
        Assert.assertEquals(firstIterator.hasNext(), secondIterator.hasNext());
       
        /** Compare frequency*/
        Assert.assertEquals(firstIterator.frequency(), secondIterator.frequency());               
       
        /** Compare positions & count*/     
        while(firstIterator.hasNext()){
          int fr = firstIterator.nextDocument();
          int sr = secondIterator.nextDocument();   
          Assert.assertEquals(fr,sr);               
          /** Compare count*/
          Assert.assertEquals(firstIterator.count(), secondIterator.count());
          int[] firstPos = new int[1000];
          int[] secondPos = new int[1000];
          int fRet = firstIterator.positions(firstPos);
          int sRet = secondIterator.positions(secondPos);
          System.out.println(fRet + "  " + sRet);
          Assert.assertTrue(fRet == sRet);
         
          for(int j = 0;j<fRet;j++)
            Assert.assertEquals(firstPos[j],secondPos[j]);       
        }
       
        /** Compare positions via int[] positionArray() */
        firstIterator = firstIndexReader.documents(i);
        secondIterator = !textTerm?secondIndexReader.documents(i):secondIndexReader.documents(firstIndex.termMap.list().get(i));
        while(firstIterator.hasNext()){
          firstIterator.next();
          secondIterator.next();
          int[] firstPos = firstIterator.positionArray();
          int[] secondPos = secondIterator.positionArray();
          Assert.assertTrue(firstPos.length == secondPos.length);
          for(int j = 0;j<firstPos.length;j++)
            Assert.assertTrue(firstPos[j] == secondPos[j]);
        }

        /** Compare IntIterator from positions() method */
        firstIterator = firstIndexReader.documents(i);
        secondIterator = !textTerm?secondIndexReader.documents(i):secondIndexReader.documents(firstIndex.termMap.list().get(i));       
        while(firstIterator.hasNext()){
          firstIterator.next();
          secondIterator.next();
 
          IntIterator firstIntIt = firstIterator.positions();
          IntIterator secondIntIt = secondIterator.positions();       
          while(firstIntIt.hasNext()){                 
            Assert.assertEquals(firstIntIt.nextInt(),secondIntIt.nextInt());
          }       
          Assert.assertEquals(firstIntIt.skip(2),secondIntIt.skip(2));       
          if(firstIntIt.hasNext()){
            Assert.assertEquals(firstIntIt.nextInt(),secondIntIt.nextInt());
          }       
          Assert.assertEquals(firstIntIt.skip(9999999),secondIntIt.skip(9999999));       
          if(firstIntIt.hasNext()){       
            Assert.assertEquals(firstIntIt.nextInt(),secondIntIt.nextInt());
          }
         
        }
       
        /** Compare IntervalIterator from Interval() method */
        firstIterator = firstIndexReader.documents(i);
        secondIterator = !textTerm?secondIndexReader.documents(i):secondIndexReader.documents(firstIndex.termMap.list().get(i));       
        while(firstIterator.hasNext()){
          firstIterator.next();
          secondIterator.next();
          /** Compare interval iterators */
          IntervalIterator firstIntervalIt = firstIterator.intervalIterator(firstIndex);
          IntervalIterator secondIntervalIt = secondIterator.intervalIterator(secondIndex);
          while(firstIntervalIt.hasNext()){             
            Interval firstIntv = firstIntervalIt.nextInterval();
            Interval secondIntv = secondIntervalIt.nextInterval();
           
            System.out.println("left:" +  firstIntv.left + "   " + "right:" + firstIntv.right);
View Full Code Here

  @Override
  public IndexIterator documents( final CharSequence prefix, final int limit ) throws IOException, TooManyTermsException {
    final ArrayList<DocumentIterator> iterators = new ArrayList<DocumentIterator>( localIndex.length );
    final IntArrayList usedIndices = new IntArrayList();

    IndexIterator documentIterator;
    for ( int i = 0; i < localIndex.length; i++ ) {
      // TODO: check for limit globally
      documentIterator = localIndex[ i ].documents( prefix, limit );
      if ( documentIterator.hasNext() ) {
        iterators.add( documentIterator );
        usedIndices.add( i );
      }
    }
    // TODO: test that this clustered multiterm does work
    final IndexIterator result = concatenated ?
        new DocumentalConcatenatedClusterIndexIterator( (DocumentalClusterIndexReader)getReader(), iterators.toArray( IndexIterators.EMPTY_ARRAY ), usedIndices.toIntArray() ) :
          new DocumentalMergedClusterIndexIterator( (DocumentalClusterIndexReader)getReader(), iterators.toArray( IndexIterators.EMPTY_ARRAY ), usedIndices.toIntArray() );
    result.term( prefix );
    return result;
   
  }
View Full Code Here

    if ( ! index.flat ) throw new UnsupportedOperationException( "Only flat documental clusters allow access by term number" );

    final IndexIterator[] iterator = new IndexIterator[ indexReader.length ];
    for ( int i = 0; i < indexReader.length; i++ ) iterator[ i ] = indexReader[ i ].documents( term );

    final IndexIterator indexIterator =
      index.concatenated ?
          new DocumentalConcatenatedClusterIndexIterator( this, iterator, index.allIndices ) :
            new DocumentalMergedClusterIndexIterator( this, iterator, index.allIndices ) ;
         
    return indexIterator;
View Full Code Here

  public IndexIterator documents( final CharSequence term ) throws IOException {
    final ArrayList<IndexIterator> iterators = new ArrayList<IndexIterator>( indexReader.length );
    final IntArrayList usedIndices = new IntArrayList();
    for ( int i = 0; i < indexReader.length; i++ ) {
      if ( index.termFilter == null || index.termFilter[ i ].contains( term ) ) {
        IndexIterator it = indexReader[ i ].documents( term );
        if ( it.hasNext() ) {
          iterators.add( it );
          usedIndices.add( i );
        }
      }
    }

    if ( DEBUG ) LOGGER.debug( "Indices used for " + term + ": " + usedIndices );

    if ( iterators.isEmpty() ) return index.getEmptyIndexIterator( term );
    final IndexIterator indexIterator =
      index.concatenated ?
          new DocumentalConcatenatedClusterIndexIterator( this, iterators.toArray( IndexIterators.EMPTY_ARRAY ), usedIndices.toIntArray() ) :
            new DocumentalMergedClusterIndexIterator( this, iterators.toArray( IndexIterators.EMPTY_ARRAY ), usedIndices.toIntArray() ) ;
         
    indexIterator.term( term );
    return indexIterator;
  }
View Full Code Here
