Examples of it.unimi.dsi.mg4j.document.DocumentSequence

it.unimi.dsi.mg4j.document.DocumentSequence
A sequence of documents.
This is the most basic class available in MG4J for representing a sequence of documents to be indexed. Its only duty is to be able to return once an iterator over the documents in sequence.
The iterator returned by {@link #iterator()} must always return the same documents in the same order, given the same external conditions (standard input, file system, etc.).
Document sequences must always return documents of the same type. This is usually accomplished by providing at construction time a {@link DocumentFactory} that will be used to build and parse documents. Of course, it is possible to create document sequences with a hardwired factory (see, e.g., {@link it.unimi.dsi.mg4j.document.ZipDocumentCollection}).
Some sequences might require invoking {@link #filename(CharSequence)} to access ancillary data. {@link AbstractDocumentSequence#load(CharSequence)} is the suggested method for deserialising sequences, as it will do it for you.

    if ( jsap.messagePrinted() ) return;


    if ( ( jsapResult.userSpecified( "builderClass" ) || jsapResult.userSpecified( "exact" ) ) && ! jsapResult.userSpecified( "buildCollection" ) )  throw new IllegalArgumentException( "To specify options about the collection building process, you must specify a basename first." );
    if ( jsapResult.userSpecified( "sequence" ) && jsapResult.userSpecified( "objectSequence" ) ) throw new IllegalArgumentException( "You cannot specify both a serialised and an parseable-object sequence" );
    
    final DocumentSequence documentSequence = jsapResult.userSpecified( "objectSequence" ) ? (DocumentSequence)jsapResult.getObject( "objectSequence" ) : Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER );


    final DocumentFactory factory = documentSequence.factory();
    final int[] indexedField = parseFieldNames( jsapResult.getStringArray( "indexedField" ), factory, jsapResult.getBoolean( "allFields" ) );
    final int batchSize = jsapResult.getInt( "batchSize" );
    final VirtualDocumentResolver[] virtualDocumentResolver = parseVirtualDocumentResolver( jsapResult.getStringArray( "virtualDocumentResolver" ), indexedField, factory );
    final int[] virtualDocumentGap = parseVirtualDocumentGap( jsapResult.getStringArray( "virtualDocumentGap" ), indexedField, factory );


    DocumentCollectionBuilder builder = null;
    if ( jsapResult.userSpecified( "buildCollection" ) ) {
      final Class<? extends DocumentCollectionBuilder> builderClass = jsapResult.getClass( "builderClass" );
      builder = builderClass != null ? builderClass.getConstructor( String.class, DocumentFactory.class, boolean.class ).newInstance( 
          jsapResult.getString( "buildCollection" ), 
          documentSequence.factory().numberOfFields() == indexedField.length ? documentSequence.factory().copy() : new SubDocumentFactory( documentSequence.factory().copy(), indexedField ), 
          Boolean.valueOf( jsapResult.getBoolean( "exact" ) ) ) : null;
    }


    run( jsapResult.getString( "basename" ), documentSequence, Completeness.valueOf( jsapResult.getString( "completeness" ) ), jsapResult.getBoolean( "downcase" ) ? DowncaseTermProcessor.getInstance() : ObjectParser.fromSpec( jsapResult
        .getString( "termProcessor" ), TermProcessor.class, MG4JClassParser.PACKAGE, new String[] { "getInstance" } ), builder, jsapResult

View Full Code Here

    });


    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;


    DocumentSequence documentSequence = Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER );


    if ( ! jsapResult.userSpecified( "uris" ) && ! jsapResult.userSpecified( "titles" ) ) 
      throw new IllegalArgumentException( "You specify either a title or a URI output file" );
    
    Util.ensureLog4JIsConfigured();


    final DocumentIterator documentIterator = documentSequence.iterator();


    Document document;
    FastBufferedOutputStream uriStream = null, titleStream = null;
    
    if ( jsapResult.userSpecified( "uris" ) ) uriStream = new FastBufferedOutputStream( new FileOutputStream( jsapResult.getString( "uris" ) ) );

View Full Code Here

    if ( jsap.messagePrinted() ) return;


    if ( ( jsapResult.userSpecified( "builderClass" ) || jsapResult.userSpecified( "exact" ) ) && ! jsapResult.userSpecified( "buildCollection" ) )  throw new IllegalArgumentException( "To specify options about the collection building process, you must specify a basename first." );
    if ( jsapResult.userSpecified( "sequence" ) && jsapResult.userSpecified( "objectSequence" ) ) throw new IllegalArgumentException( "You cannot specify both a serialised and an parseable-object sequence" );
    
    final DocumentSequence documentSequence = jsapResult.userSpecified( "objectSequence" ) ? (DocumentSequence)jsapResult.getObject( "objectSequence" ) : Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER );
    final DocumentFactory factory = documentSequence.factory();


    final int[] indexedField = Scan.parseFieldNames( jsapResult.getStringArray( "indexedField" ), factory, jsapResult.getBoolean( "allFields" ) );
    final VirtualDocumentResolver[] virtualDocumentResolver = Scan.parseVirtualDocumentResolver( jsapResult.getStringArray( "virtualDocumentResolver" ), indexedField, factory );
    final int[] virtualDocumentGap = Scan.parseVirtualDocumentGap( jsapResult.getStringArray( "virtualDocumentGap" ), indexedField, factory );


    final TermProcessor termProcessor = jsapResult.getBoolean( "downcase" ) ? DowncaseTermProcessor.getInstance() :
      ObjectParser.fromSpec( jsapResult.getString( "termProcessor" ), TermProcessor.class, MG4JClassParser.PACKAGE, new String[] { "getInstance" } ); 


    final boolean skips = ! jsapResult.getBoolean( "noSkips" );
    final boolean interleaved = jsapResult.getBoolean( "interleaved" );
    if ( ! skips && ( jsapResult.userSpecified( "quantum" ) || jsapResult.userSpecified( "height" ) ) ) throw new IllegalArgumentException( "You specified quantum or height, but you also disabled skips." );


    DocumentCollectionBuilder builder = null;
    if ( jsapResult.userSpecified( "buildCollection" ) ) {
      final Class<? extends DocumentCollectionBuilder> builderClass = jsapResult.getClass( "builderClass" );
      builder = builderClass != null ? builderClass.getConstructor( String.class, DocumentFactory.class, boolean.class ).newInstance( 
          jsapResult.getString( "buildCollection" ), 
          documentSequence.factory().numberOfFields() == indexedField.length ? documentSequence.factory().copy() : new SubDocumentFactory( documentSequence.factory().copy(), indexedField ), 
          Boolean.valueOf( jsapResult.getBoolean( "exact" ) ) ) : null;
    }


    final IndexBuilder indexBuilder = new IndexBuilder( jsapResult.getString( "basename" ), documentSequence )
    .termProcessor( termProcessor )

View Full Code Here

      });
    
    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;
    
    DocumentSequence documentSequence = it.unimi.dsi.mg4j.tool.Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER );
    
    final DocumentFactory factory = documentSequence.factory();
    final boolean stem = jsapResult.getBoolean( "stem" );
    final boolean termLists = jsapResult.getBoolean( "termLists" );
    final int[] indexedField = it.unimi.dsi.mg4j.tool.Scan.parseFieldNames( jsapResult.getStringArray( "indexedField" ), factory, jsapResult.getBoolean( "allFields" )  );
    
    LOGGER.debug( "Parsed indexed field: " + IntArrayList.wrap( indexedField ) );
    
    final String basename = jsapResult.getString( "basename" ); 
    final String permutationFile = jsapResult.getString( "renumber" );


    final boolean isVirtual = jsapResult.getBoolean( "virtual" );


    int i, t = 0;


    final ProgressLogger pl = new ProgressLogger( LOGGER, jsapResult.getLong( "logInterval" ), "ints" );
    final Index[] index = stem ? new Index[ indexedField.length ] : new Index[ 1 ];
    final int numberOfTerms[] = new int[ indexedField.length ];
    final ObjectArrayList<MutableString>[] terms = new ObjectArrayList[ indexedField.length ];
    final IndexReader[] indexReader = new IndexReader[ index.length ];
    final InputBitStream[] frequencies = new InputBitStream[ index.length ];
    final int[][] count = new int[ index.length ][];
    final int[] permutation = permutationFile != null ? BinIO.loadInts( permutationFile ) : null;
    final int[][] occ = new int[ index.length ][];
    final int[][] wordInPos = new int[ index.length ][];
    final Int2IntMap[] termsInDoc = new Int2IntOpenHashMap[ index.length ];
    int totalTerms = 0;
    
    boolean allBitStreamIndices = true;
    
    for( i = 0; i < index.length; i++ ) {
      final String basenameField = basename + (stem ? "-" + factory.fieldName( indexedField[ i ] ) : "" );
      index[ i ] = Index.getInstance( basenameField );
      if ( ! ( index[ i ] instanceof BitStreamIndex ) ) allBitStreamIndices = false;
      
      if ( termLists ) {
        terms[ i ] = new ObjectArrayList<MutableString>( new FileLinesCollection( basenameField + DiskBasedIndex.TERMS_EXTENSION, "UTF-8" ).allLines() );
        numberOfTerms[ i ] = terms[ i ].size();
      }
      else numberOfTerms[ i ] = index[ i ].numberOfTerms;
      totalTerms += numberOfTerms[ i ];
      
      // This will be matched with the number of occurrences per document
      count[ i ] = new int[ index[ i ].numberOfDocuments ];


      occ[ i ] = index[ i ].maxCount > 0 ? new int[ index[ i ].maxCount ] : IntArrays.EMPTY_ARRAY;
      wordInPos[ i ] = new int[ Math.max( 0, index[ i ].properties.getInt( Index.PropertyKeys.MAXDOCSIZE ) ) ];
      indexReader[ i ] = index[ i ].getReader();
      
      if ( new File( basenameField + DiskBasedIndex.FREQUENCIES_EXTENSION ).exists() ) frequencies[ i ] = new InputBitStream( basenameField + DiskBasedIndex.FREQUENCIES_EXTENSION );
      termsInDoc[ i ] = new Int2IntOpenHashMap();
    }




    int currDoc = 0,
    // Term position in the current document.
    pos = 0, f = 0, p;


    pl.itemsName = "lists";
    pl.expectedUpdates = totalTerms;
    
    int indexFrequency = -1;
    
    // Sequential scan
    if ( !jsapResult.getBoolean( "noSeq" ) ) {
      try {
        for ( i = 0; i < index.length; i++ ) {
          int numberOfPostings = 0;
          pl.expectedUpdates = numberOfTerms[ i ];
          pl.start( "Verifying sequentially index " + index[ i ] + "..." );


          if ( allBitStreamIndices ) {
            for ( t = 0; t < numberOfTerms[ i ]; t++ ) {
              pl.update();
              IndexIterator indexIterator = indexReader[ i ].nextIterator();
              indexFrequency = indexIterator.frequency();
              numberOfPostings += indexFrequency;
              if ( frequencies[ i ] != null && indexFrequency != ( f = frequencies[ i ].readGamma() ) ) {
                System.err.println( "Error in frequency for term " + t + ": expected " + f + " documents, found " + indexFrequency );
                return;
              }


              while ( indexFrequency-- != 0 ) {
                p = indexIterator.nextDocument();
                if (index[i].hasCounts) count[i][p] += indexIterator.count();
                if (index[i].hasPositions) indexIterator.positionArray(); // Just to force reading in high-performance indices
              }
              if ( indexIterator.nextDocument() != -1 ) throw new AssertionError( "nextDocument() is not -1 after exhaustive iteration" );
            }
            
            // Check document sizes
            if ( ! isVirtual && ( (BitStreamIndex) index[ i ] ).sizes != null && index[ i ].hasCounts )
              for ( p = 0; p < index[ i ].numberOfDocuments; p++ )
                if ( index[ i ].sizes.getInt( p ) != count[ i ][ p ] )
                  System.err.println( "Document " + p + " has size " + ( (BitStreamIndex) index[ i ] ).sizes.getInt( p ) + " but " + count[ i ][ p ] + " occurrences have been stored." );
            
          }
          else { // Non-bitstream indices
            for (t = 0; t < numberOfTerms[ i ]; t++) {
              pl.update();
              IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
              indexFrequency = indexIterator.frequency();
              numberOfPostings += indexFrequency;
              if (frequencies[i] != null && indexFrequency != (f = frequencies[i].readGamma())) {
                System.err.println("Error in frequency for term " + t
                    + ": expected " + f + " documents, found "
                    + indexFrequency);
                return;
              }
              
              int prevp = -1;
              while (indexFrequency-- != 0) {
                p = indexIterator.nextDocument();
                if ( prevp >= p ) throw new AssertionError( "previous pointer: " + prevp + "; current pointer: " + p );
                prevp = p;
                if (index[i].hasCounts) count[i][p] += indexIterator.count();
              }
            }
          }
          pl.done();
          
          if ( ! isVirtual && numberOfPostings != index[ i ].numberOfPostings ) System.err.println( "Index declares " + index[ i ].numberOfPostings + " postings, but we found " + numberOfPostings );
          long numberOfOccurrences = 0;
          if ( index[ i ].hasCounts ) {
            for ( p = 0; p < index[ i ].numberOfDocuments; p++ ) numberOfOccurrences += count[ i ][ p ];
            if ( numberOfOccurrences != index[ i ].numberOfOccurrences ) System.err.println( "Index declares " + index[ i ].numberOfOccurrences + " occurrences, but we found " + numberOfOccurrences );
          }
        }
      } catch ( Exception e ) {
        System.err.println( "Exception while scanning sequentially term " + t + " of index " + index[ i ] );
        System.err.println( "Term frequency was " + f + " and position " + ( f - indexFrequency - 1 ) );
        throw e;
      }
    }
  
    IntArrayList l = new IntArrayList();
    ObjectArrayList<int[]> positions = new ObjectArrayList<int[]>();
    
    if ( ! jsapResult.getBoolean( "noSkip" ) ) {
      int start = 0, end = 0, result;
      try {
        for (i = 0; i < index.length; i++) {
          
          pl.expectedUpdates = numberOfTerms[ i ];
          pl.start("Verifying all skips in " + index[i] + "...");


          for (t = 0; t < numberOfTerms[ i ]; t++) {
            l.clear();
            positions.clear();
            IndexIterator documents = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
            int d;
            while( ( d = documents.nextDocument() ) != -1 ) {
              l.add( d );
              if ( index[ i ].hasPositions ) positions.add( ArrayUtils.subarray( documents.positionArray(), 0, documents.count() ) );
            }
            
            for( start = 0; start < l.size(); start++ ) {
              for( end = start + 1; end < l.size(); end++ ) {
                IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
                
                result = indexIterator.skipTo( l.getInt( start ) );
                if ( indexIterator.document() != l.getInt( start ) || result != l.getInt( start ) ) throw new AssertionError( "Trying to skip to document " + l.getInt( start ) + " (term " + t + ") moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );
                result = indexIterator.skipTo( l.getInt( end ) );
                if ( indexIterator.document() != l.getInt( end ) || result != l.getInt( end ) ) throw new AssertionError( "Trying to skip to document " + l.getInt( end ) + " (term " + t + ") after a skip to " + start + " moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );
                
                if ( index[ i ].hasPositions ) {
                  // This catches wrong state reconstruction after skips.
                  indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
                  indexIterator.skipTo( l.getInt( start ) );
                  if ( indexIterator.document() != l.getInt( start ) ) throw new AssertionError(indexIterator.document() + " != " + l.getInt( start ) );
                  if ( indexIterator.count() != positions.get( start ).length ) throw new AssertionError(indexIterator.count() + " != " + positions.get( start ).length );
                  if ( ! Arrays.equals( positions.get( start ), ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) )
                     ) throw new AssertionError(Arrays.toString( positions.get( start ) ) + "!=" + Arrays.toString( ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) ) );
                  indexIterator.skipTo( l.getInt( end ) );
                  if ( indexIterator.document() != l.getInt( end )  ) throw new AssertionError(indexIterator.document() + " != " + l.getInt( end ) );
                  if ( indexIterator.count() != positions.get( end ).length ) throw new AssertionError(indexIterator.count() + " != " + positions.get( end ).length );
                  if ( ! Arrays.equals( positions.get( end ), ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) )
                     ) throw new AssertionError(Arrays.toString( positions.get( end ) ) + "!=" + Arrays.toString( ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) ) );
                }
                
              }
              
              IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
              
              result = indexIterator.skipTo( l.getInt( start ) );
              if ( indexIterator.document() != l.getInt( start ) || result != l.getInt( start ) ) throw new AssertionError("Trying to skip to document " + l.getInt( start ) + " (term " + t + ") moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );
              result = indexIterator.skipTo( Integer.MAX_VALUE );
              if ( indexIterator.hasNext() || result != Integer.MAX_VALUE ) throw new AssertionError("Trying to skip beyond end of list (term " + t + ") after a skip to " + start + " returned " + result + " (hasNext()=" + indexIterator.hasNext() + ")" );
              
              
            }
            pl.update();
          }
          pl.done();
        }
      }
      catch( Throwable e  ) {
        System.err.println( "Exception during all-skip test (index=" + index[ i ] + ", term=" + t + ", start=" + start + ", end=" + end + ")" );
        throw e;
      }
     }
    


    if ( ! jsapResult.getBoolean( "noComp" ) ) {
      IndexReader additionalReader;
      IntLinkedOpenHashSet s0 = new IntLinkedOpenHashSet();
      IntOpenHashSet s1 = new IntOpenHashSet();
      IntAVLTreeSet s2 = new IntAVLTreeSet();
      IntIterator it;
      IndexIterator indexIterator, additionalIterator;
      it.unimi.dsi.mg4j.search.DocumentIterator documentIterator;
      int u = 0;
      
      try {
        for (i = 0; i < index.length; i++) {
          pl.expectedUpdates = numberOfTerms[ i ];
          pl.start("Verifying composite iterators in " + index[i] + "...");
          additionalReader = index[ i ].getReader();
          
          for (t = 0; t < numberOfTerms[ i ]; t++) {
            for (u = 0; u < numberOfTerms[ i ]; u++) {
              s0.clear();
              s1.clear();
              // TODO: in case we have positions, we should check them, too
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t ), s0 );
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( u ) ) : indexReader[ i ].documents( u ), s1 );
              s0.retainAll( s1 );
              indexIterator =  termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
              additionalIterator = termLists ? additionalReader.documents( terms[ i ].get( u ) ) : additionalReader.documents( u );
              it = s0.iterator();
              documentIterator = AndDocumentIterator.getInstance( indexIterator, additionalIterator );
              for( int j = s0.size(); j-- != 0; ) if ( it.nextInt() != documentIterator.nextDocument() ) throw new AssertionError();
              if ( documentIterator.hasNext() ) throw new AssertionError();


              s2.clear();
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t ), s2 );
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( u ) ) : indexReader[ i ].documents( u ), s2 );


              indexIterator =  termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
              additionalIterator = termLists ? additionalReader.documents( terms[ i ].get( u ) ) : additionalReader.documents( u );


              it = s2.iterator();
              documentIterator = OrDocumentIterator.getInstance( indexIterator, additionalIterator ); 
              for( int j = s2.size(); j-- != 0; ) if ( it.nextInt() != documentIterator.nextDocument() ) throw new AssertionError();
              if ( documentIterator.hasNext() ) throw new AssertionError();
            }  
            pl.update();
          }
          pl.done();
          additionalReader.close();
        }
      }
      catch( Throwable e  ) {
        System.err.println( "Exception during composite iterator test (index=" + index[ i ] + ", first term=" + t + ", second term =" + u + ")" );
        throw e;
      }  
    }
    
    if ( ! isVirtual && jsapResult.getBoolean( "random" ) ) {
      
      // Random access scan
      pl.expectedUpdates = index[ 0 ].numberOfDocuments;
      pl.itemsName = "documents";
      pl.start( "Verifying random access..." );


      if ( allBitStreamIndices ) {
        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
        
        final MutableString word = new MutableString(), nonWord = new MutableString();
        
        int docCounter = 0;
        
        while( ( document = documentIterator.nextDocument() ) != null ) {
          currDoc = permutation != null ? permutation[ docCounter ] : docCounter;


          for( i = 0; i < index.length; i++ ) {
            Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
            if ( index[ i ].hasPayloads ) {
              // TODO: write tests for the other case
              if ( allBitStreamIndices ) {
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                  Payload payload = indexIterator.payload();
                  if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload );  
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
              else {
                IndexIterator indexIterator = indexReader[ i ].documents(  0  );
                if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                  if ( ! indexIterator.payload().get().equals( content ) )
                    LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + indexIterator.payload().get() );
                } 
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
            }
            else {
              // text index
              pos = 0;
              termsInDoc[ i ].clear();
              reader = (Reader)content;
              wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
              wordReader.setReader( reader );
              while( wordReader.next( word, nonWord ) ) {
                if ( word.length() == 0 || index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) continue;
                if ( ( t = (int)( (BitStreamIndex)index[ i ] ).termMap.getLong( word ) ) == -1 ) LOGGER.error( index[ i ] + ": Could not find term " + word + " in term index" );
                else {
                  if ( index[ i ].hasCounts ) termsInDoc[ i ].put( t, termsInDoc[ i ].get( t ) + 1 );
                  if ( index[ i ].hasPositions ) wordInPos[ i ][ pos++ ] = t;
                }
              }


              if ( allBitStreamIndices ) {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();


                  IndexIterator indexIterator = indexReader[ i ].documents( t );


                  int pointer = indexIterator.skipTo( currDoc );
                  if ( pointer == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c ) 
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );


                          for( int j = 0; j < c; j++ ) 
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )  
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    } 
                  }
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t + "(skipTo returned " + pointer + ")" );
                }
              }
              else {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();
                  IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );


                  if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c ) 
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );


                          for( int j = 0; j < c; j++ ) 
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )  
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    }
                  } 
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
                }
              }
            }
          }
          docCounter++;
          document.close();
          pl.update();
        }
      }
      else {
        LOGGER.warn( "Random access tests require very slow single-term scanning as not all indices are disk based" );


        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
        
        final MutableString word = new MutableString(), nonWord = new MutableString();

View Full Code Here

TOP

Related Classes of it.unimi.dsi.mg4j.document.DocumentSequence

it.unimi.dsi.mg4j.test.Verifier

it.unimi.dsi.mg4j.tool.IndexBuilder

it.unimi.dsi.mg4j.tool.Scan

it.unimi.dsi.mg4j.tool.ScanMetadata

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.