Package it.unimi.dsi.mg4j.document

Examples of it.unimi.dsi.mg4j.document.DocumentSequence


    if ( jsap.messagePrinted() ) return;

    if ( ( jsapResult.userSpecified( "builderClass" ) || jsapResult.userSpecified( "exact" ) ) && ! jsapResult.userSpecified( "buildCollection" ) )  throw new IllegalArgumentException( "To specify options about the collection building process, you must specify a basename first." );
    if ( jsapResult.userSpecified( "sequence" ) && jsapResult.userSpecified( "objectSequence" ) ) throw new IllegalArgumentException( "You cannot specify both a serialised and an parseable-object sequence" );
   
    final DocumentSequence documentSequence = jsapResult.userSpecified( "objectSequence" ) ? (DocumentSequence)jsapResult.getObject( "objectSequence" ) : Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER );

    final DocumentFactory factory = documentSequence.factory();
    final int[] indexedField = parseFieldNames( jsapResult.getStringArray( "indexedField" ), factory, jsapResult.getBoolean( "allFields" ) );
    final int batchSize = jsapResult.getInt( "batchSize" );
    final VirtualDocumentResolver[] virtualDocumentResolver = parseVirtualDocumentResolver( jsapResult.getStringArray( "virtualDocumentResolver" ), indexedField, factory );
    final int[] virtualDocumentGap = parseVirtualDocumentGap( jsapResult.getStringArray( "virtualDocumentGap" ), indexedField, factory );

    DocumentCollectionBuilder builder = null;
    if ( jsapResult.userSpecified( "buildCollection" ) ) {
      final Class<? extends DocumentCollectionBuilder> builderClass = jsapResult.getClass( "builderClass" );
      builder = builderClass != null ? builderClass.getConstructor( String.class, DocumentFactory.class, boolean.class ).newInstance(
          jsapResult.getString( "buildCollection" ),
          documentSequence.factory().numberOfFields() == indexedField.length ? documentSequence.factory().copy() : new SubDocumentFactory( documentSequence.factory().copy(), indexedField ),
          Boolean.valueOf( jsapResult.getBoolean( "exact" ) ) ) : null;
    }

    run( jsapResult.getString( "basename" ), documentSequence, Completeness.valueOf( jsapResult.getString( "completeness" ) ), jsapResult.getBoolean( "downcase" ) ? DowncaseTermProcessor.getInstance() : ObjectParser.fromSpec( jsapResult
        .getString( "termProcessor" ), TermProcessor.class, MG4JClassParser.PACKAGE, new String[] { "getInstance" } ), builder, jsapResult
View Full Code Here


    });

    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;

    DocumentSequence documentSequence = Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER );

    if ( ! jsapResult.userSpecified( "uris" ) && ! jsapResult.userSpecified( "titles" ) )
      throw new IllegalArgumentException( "You specify either a title or a URI output file" );
   
    Util.ensureLog4JIsConfigured();

    final DocumentIterator documentIterator = documentSequence.iterator();

    Document document;
    FastBufferedOutputStream uriStream = null, titleStream = null;
   
    if ( jsapResult.userSpecified( "uris" ) ) uriStream = new FastBufferedOutputStream( new FileOutputStream( jsapResult.getString( "uris" ) ) );
View Full Code Here

    if ( jsap.messagePrinted() ) return;

    if ( ( jsapResult.userSpecified( "builderClass" ) || jsapResult.userSpecified( "exact" ) ) && ! jsapResult.userSpecified( "buildCollection" ) )  throw new IllegalArgumentException( "To specify options about the collection building process, you must specify a basename first." );
    if ( jsapResult.userSpecified( "sequence" ) && jsapResult.userSpecified( "objectSequence" ) ) throw new IllegalArgumentException( "You cannot specify both a serialised and an parseable-object sequence" );
   
    final DocumentSequence documentSequence = jsapResult.userSpecified( "objectSequence" ) ? (DocumentSequence)jsapResult.getObject( "objectSequence" ) : Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER );
    final DocumentFactory factory = documentSequence.factory();

    final int[] indexedField = Scan.parseFieldNames( jsapResult.getStringArray( "indexedField" ), factory, jsapResult.getBoolean( "allFields" ) );
    final VirtualDocumentResolver[] virtualDocumentResolver = Scan.parseVirtualDocumentResolver( jsapResult.getStringArray( "virtualDocumentResolver" ), indexedField, factory );
    final int[] virtualDocumentGap = Scan.parseVirtualDocumentGap( jsapResult.getStringArray( "virtualDocumentGap" ), indexedField, factory );

    final TermProcessor termProcessor = jsapResult.getBoolean( "downcase" ) ? DowncaseTermProcessor.getInstance() :
      ObjectParser.fromSpec( jsapResult.getString( "termProcessor" ), TermProcessor.class, MG4JClassParser.PACKAGE, new String[] { "getInstance" } );

    final boolean skips = ! jsapResult.getBoolean( "noSkips" );
    final boolean interleaved = jsapResult.getBoolean( "interleaved" );
    if ( ! skips && ( jsapResult.userSpecified( "quantum" ) || jsapResult.userSpecified( "height" ) ) ) throw new IllegalArgumentException( "You specified quantum or height, but you also disabled skips." );

    DocumentCollectionBuilder builder = null;
    if ( jsapResult.userSpecified( "buildCollection" ) ) {
      final Class<? extends DocumentCollectionBuilder> builderClass = jsapResult.getClass( "builderClass" );
      builder = builderClass != null ? builderClass.getConstructor( String.class, DocumentFactory.class, boolean.class ).newInstance(
          jsapResult.getString( "buildCollection" ),
          documentSequence.factory().numberOfFields() == indexedField.length ? documentSequence.factory().copy() : new SubDocumentFactory( documentSequence.factory().copy(), indexedField ),
          Boolean.valueOf( jsapResult.getBoolean( "exact" ) ) ) : null;
    }

    final IndexBuilder indexBuilder = new IndexBuilder( jsapResult.getString( "basename" ), documentSequence )
    .termProcessor( termProcessor )
View Full Code Here

      });
   
    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;
   
    DocumentSequence documentSequence = it.unimi.dsi.mg4j.tool.Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER );
   
    final DocumentFactory factory = documentSequence.factory();
    final boolean stem = jsapResult.getBoolean( "stem" );
    final boolean termLists = jsapResult.getBoolean( "termLists" );
    final int[] indexedField = it.unimi.dsi.mg4j.tool.Scan.parseFieldNames( jsapResult.getStringArray( "indexedField" ), factory, jsapResult.getBoolean( "allFields" )  );
   
    LOGGER.debug( "Parsed indexed field: " + IntArrayList.wrap( indexedField ) );
   
    final String basename = jsapResult.getString( "basename" );
    final String permutationFile = jsapResult.getString( "renumber" );

    final boolean isVirtual = jsapResult.getBoolean( "virtual" );

    int i, t = 0;

    final ProgressLogger pl = new ProgressLogger( LOGGER, jsapResult.getLong( "logInterval" ), "ints" );
    final Index[] index = stem ? new Index[ indexedField.length ] : new Index[ 1 ];
    final int numberOfTerms[] = new int[ indexedField.length ];
    final ObjectArrayList<MutableString>[] terms = new ObjectArrayList[ indexedField.length ];
    final IndexReader[] indexReader = new IndexReader[ index.length ];
    final InputBitStream[] frequencies = new InputBitStream[ index.length ];
    final int[][] count = new int[ index.length ][];
    final int[] permutation = permutationFile != null ? BinIO.loadInts( permutationFile ) : null;
    final int[][] occ = new int[ index.length ][];
    final int[][] wordInPos = new int[ index.length ][];
    final Int2IntMap[] termsInDoc = new Int2IntOpenHashMap[ index.length ];
    int totalTerms = 0;
   
    boolean allBitStreamIndices = true;
   
    for( i = 0; i < index.length; i++ ) {
      final String basenameField = basename + (stem ? "-" + factory.fieldName( indexedField[ i ] ) : "" );
      index[ i ] = Index.getInstance( basenameField );
      if ( ! ( index[ i ] instanceof BitStreamIndex ) ) allBitStreamIndices = false;
     
      if ( termLists ) {
        terms[ i ] = new ObjectArrayList<MutableString>( new FileLinesCollection( basenameField + DiskBasedIndex.TERMS_EXTENSION, "UTF-8" ).allLines() );
        numberOfTerms[ i ] = terms[ i ].size();
      }
      else numberOfTerms[ i ] = index[ i ].numberOfTerms;
      totalTerms += numberOfTerms[ i ];
     
      // This will be matched with the number of occurrences per document
      count[ i ] = new int[ index[ i ].numberOfDocuments ];

      occ[ i ] = index[ i ].maxCount > 0 ? new int[ index[ i ].maxCount ] : IntArrays.EMPTY_ARRAY;
      wordInPos[ i ] = new int[ Math.max( 0, index[ i ].properties.getInt( Index.PropertyKeys.MAXDOCSIZE ) ) ];
      indexReader[ i ] = index[ i ].getReader();
     
      if ( new File( basenameField + DiskBasedIndex.FREQUENCIES_EXTENSION ).exists() ) frequencies[ i ] = new InputBitStream( basenameField + DiskBasedIndex.FREQUENCIES_EXTENSION );
      termsInDoc[ i ] = new Int2IntOpenHashMap();
    }


    int currDoc = 0,
    // Term position in the current document.
    pos = 0, f = 0, p;

    pl.itemsName = "lists";
    pl.expectedUpdates = totalTerms;
   
    int indexFrequency = -1;
   
    // Sequential scan
    if ( !jsapResult.getBoolean( "noSeq" ) ) {
      try {
        for ( i = 0; i < index.length; i++ ) {
          int numberOfPostings = 0;
          pl.expectedUpdates = numberOfTerms[ i ];
          pl.start( "Verifying sequentially index " + index[ i ] + "..." );

          if ( allBitStreamIndices ) {
            for ( t = 0; t < numberOfTerms[ i ]; t++ ) {
              pl.update();
              IndexIterator indexIterator = indexReader[ i ].nextIterator();
              indexFrequency = indexIterator.frequency();
              numberOfPostings += indexFrequency;
              if ( frequencies[ i ] != null && indexFrequency != ( f = frequencies[ i ].readGamma() ) ) {
                System.err.println( "Error in frequency for term " + t + ": expected " + f + " documents, found " + indexFrequency );
                return;
              }

              while ( indexFrequency-- != 0 ) {
                p = indexIterator.nextDocument();
                if (index[i].hasCounts) count[i][p] += indexIterator.count();
                if (index[i].hasPositions) indexIterator.positionArray(); // Just to force reading in high-performance indices
              }
              if ( indexIterator.nextDocument() != -1 ) throw new AssertionError( "nextDocument() is not -1 after exhaustive iteration" );
            }
           
            // Check document sizes
            if ( ! isVirtual && ( (BitStreamIndex) index[ i ] ).sizes != null && index[ i ].hasCounts )
              for ( p = 0; p < index[ i ].numberOfDocuments; p++ )
                if ( index[ i ].sizes.getInt( p ) != count[ i ][ p ] )
                  System.err.println( "Document " + p + " has size " + ( (BitStreamIndex) index[ i ] ).sizes.getInt( p ) + " but " + count[ i ][ p ] + " occurrences have been stored." );
           
          }
          else { // Non-bitstream indices
            for (t = 0; t < numberOfTerms[ i ]; t++) {
              pl.update();
              IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
              indexFrequency = indexIterator.frequency();
              numberOfPostings += indexFrequency;
              if (frequencies[i] != null && indexFrequency != (f = frequencies[i].readGamma())) {
                System.err.println("Error in frequency for term " + t
                    + ": expected " + f + " documents, found "
                    + indexFrequency);
                return;
              }
             
              int prevp = -1;
              while (indexFrequency-- != 0) {
                p = indexIterator.nextDocument();
                if ( prevp >= p ) throw new AssertionError( "previous pointer: " + prevp + "; current pointer: " + p );
                prevp = p;
                if (index[i].hasCounts) count[i][p] += indexIterator.count();
              }
            }
          }
          pl.done();
         
          if ( ! isVirtual && numberOfPostings != index[ i ].numberOfPostings ) System.err.println( "Index declares " + index[ i ].numberOfPostings + " postings, but we found " + numberOfPostings );
          long numberOfOccurrences = 0;
          if ( index[ i ].hasCounts ) {
            for ( p = 0; p < index[ i ].numberOfDocuments; p++ ) numberOfOccurrences += count[ i ][ p ];
            if ( numberOfOccurrences != index[ i ].numberOfOccurrences ) System.err.println( "Index declares " + index[ i ].numberOfOccurrences + " occurrences, but we found " + numberOfOccurrences );
          }
        }
      } catch ( Exception e ) {
        System.err.println( "Exception while scanning sequentially term " + t + " of index " + index[ i ] );
        System.err.println( "Term frequency was " + f + " and position " + ( f - indexFrequency - 1 ) );
        throw e;
      }
    }
 
    IntArrayList l = new IntArrayList();
    ObjectArrayList<int[]> positions = new ObjectArrayList<int[]>();
   
    if ( ! jsapResult.getBoolean( "noSkip" ) ) {
      int start = 0, end = 0, result;
      try {
        for (i = 0; i < index.length; i++) {
         
          pl.expectedUpdates = numberOfTerms[ i ];
          pl.start("Verifying all skips in " + index[i] + "...");

          for (t = 0; t < numberOfTerms[ i ]; t++) {
            l.clear();
            positions.clear();
            IndexIterator documents = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
            int d;
            while( ( d = documents.nextDocument() ) != -1 ) {
              l.add( d );
              if ( index[ i ].hasPositions ) positions.add( ArrayUtils.subarray( documents.positionArray(), 0, documents.count() ) );
            }
           
            for( start = 0; start < l.size(); start++ ) {
              for( end = start + 1; end < l.size(); end++ ) {
                IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
               
                result = indexIterator.skipTo( l.getInt( start ) );
                if ( indexIterator.document() != l.getInt( start ) || result != l.getInt( start ) ) throw new AssertionError( "Trying to skip to document " + l.getInt( start ) + " (term " + t + ") moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );
                result = indexIterator.skipTo( l.getInt( end ) );
                if ( indexIterator.document() != l.getInt( end ) || result != l.getInt( end ) ) throw new AssertionError( "Trying to skip to document " + l.getInt( end ) + " (term " + t + ") after a skip to " + start + " moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );
               
                if ( index[ i ].hasPositions ) {
                  // This catches wrong state reconstruction after skips.
                  indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
                  indexIterator.skipTo( l.getInt( start ) );
                  if ( indexIterator.document() != l.getInt( start ) ) throw new AssertionError(indexIterator.document() + " != " + l.getInt( start ) );
                  if ( indexIterator.count() != positions.get( start ).length ) throw new AssertionError(indexIterator.count() + " != " + positions.get( start ).length );
                  if ( ! Arrays.equals( positions.get( start ), ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) )
                     ) throw new AssertionError(Arrays.toString( positions.get( start ) ) + "!=" + Arrays.toString( ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) ) );
                  indexIterator.skipTo( l.getInt( end ) );
                  if ( indexIterator.document() != l.getInt( end )  ) throw new AssertionError(indexIterator.document() + " != " + l.getInt( end ) );
                  if ( indexIterator.count() != positions.get( end ).length ) throw new AssertionError(indexIterator.count() + " != " + positions.get( end ).length );
                  if ( ! Arrays.equals( positions.get( end ), ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) )
                     ) throw new AssertionError(Arrays.toString( positions.get( end ) ) + "!=" + Arrays.toString( ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) ) );
                }
               
              }
             
              IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
             
              result = indexIterator.skipTo( l.getInt( start ) );
              if ( indexIterator.document() != l.getInt( start ) || result != l.getInt( start ) ) throw new AssertionError("Trying to skip to document " + l.getInt( start ) + " (term " + t + ") moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );
              result = indexIterator.skipTo( Integer.MAX_VALUE );
              if ( indexIterator.hasNext() || result != Integer.MAX_VALUE ) throw new AssertionError("Trying to skip beyond end of list (term " + t + ") after a skip to " + start + " returned " + result + " (hasNext()=" + indexIterator.hasNext() + ")" );
             
             
            }
            pl.update();
          }
          pl.done();
        }
      }
      catch( Throwable e  ) {
        System.err.println( "Exception during all-skip test (index=" + index[ i ] + ", term=" + t + ", start=" + start + ", end=" + end + ")" );
        throw e;
      }
     }
   

    if ( ! jsapResult.getBoolean( "noComp" ) ) {
      IndexReader additionalReader;
      IntLinkedOpenHashSet s0 = new IntLinkedOpenHashSet();
      IntOpenHashSet s1 = new IntOpenHashSet();
      IntAVLTreeSet s2 = new IntAVLTreeSet();
      IntIterator it;
      IndexIterator indexIterator, additionalIterator;
      it.unimi.dsi.mg4j.search.DocumentIterator documentIterator;
      int u = 0;
     
      try {
        for (i = 0; i < index.length; i++) {
          pl.expectedUpdates = numberOfTerms[ i ];
          pl.start("Verifying composite iterators in " + index[i] + "...");
          additionalReader = index[ i ].getReader();
         
          for (t = 0; t < numberOfTerms[ i ]; t++) {
            for (u = 0; u < numberOfTerms[ i ]; u++) {
              s0.clear();
              s1.clear();
              // TODO: in case we have positions, we should check them, too
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t ), s0 );
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( u ) ) : indexReader[ i ].documents( u ), s1 );
              s0.retainAll( s1 );
              indexIterator =  termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
              additionalIterator = termLists ? additionalReader.documents( terms[ i ].get( u ) ) : additionalReader.documents( u );
              it = s0.iterator();
              documentIterator = AndDocumentIterator.getInstance( indexIterator, additionalIterator );
              for( int j = s0.size(); j-- != 0; ) if ( it.nextInt() != documentIterator.nextDocument() ) throw new AssertionError();
              if ( documentIterator.hasNext() ) throw new AssertionError();

              s2.clear();
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t ), s2 );
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( u ) ) : indexReader[ i ].documents( u ), s2 );

              indexIterator =  termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
              additionalIterator = termLists ? additionalReader.documents( terms[ i ].get( u ) ) : additionalReader.documents( u );

              it = s2.iterator();
              documentIterator = OrDocumentIterator.getInstance( indexIterator, additionalIterator );
              for( int j = s2.size(); j-- != 0; ) if ( it.nextInt() != documentIterator.nextDocument() ) throw new AssertionError();
              if ( documentIterator.hasNext() ) throw new AssertionError();
           
            pl.update();
          }
          pl.done();
          additionalReader.close();
        }
      }
      catch( Throwable e  ) {
        System.err.println( "Exception during composite iterator test (index=" + index[ i ] + ", first term=" + t + ", second term =" + u + ")" );
        throw e;
     
    }
   
    if ( ! isVirtual && jsapResult.getBoolean( "random" ) ) {
     
      // Random access scan
      pl.expectedUpdates = index[ 0 ].numberOfDocuments;
      pl.itemsName = "documents";
      pl.start( "Verifying random access..." );

      if ( allBitStreamIndices ) {
        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
       
        final MutableString word = new MutableString(), nonWord = new MutableString();
       
        int docCounter = 0;
       
        while( ( document = documentIterator.nextDocument() ) != null ) {
          currDoc = permutation != null ? permutation[ docCounter ] : docCounter;

          for( i = 0; i < index.length; i++ ) {
            Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
            if ( index[ i ].hasPayloads ) {
              // TODO: write tests for the other case
              if ( allBitStreamIndices ) {
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                  Payload payload = indexIterator.payload();
                  if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload )
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
              else {
                IndexIterator indexIterator = indexReader[ i ].documents);
                if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                  if ( ! indexIterator.payload().get().equals( content ) )
                    LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + indexIterator.payload().get() );
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
            }
            else {
              // text index
              pos = 0;
              termsInDoc[ i ].clear();
              reader = (Reader)content;
              wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
              wordReader.setReader( reader );
              while( wordReader.next( word, nonWord ) ) {
                if ( word.length() == 0 || index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) continue;
                if ( ( t = (int)( (BitStreamIndex)index[ i ] ).termMap.getLong( word ) ) == -1 ) LOGGER.error( index[ i ] + ": Could not find term " + word + " in term index" );
                else {
                  if ( index[ i ].hasCounts ) termsInDoc[ i ].put( t, termsInDoc[ i ].get( t ) + 1 );
                  if ( index[ i ].hasPositions ) wordInPos[ i ][ pos++ ] = t;
                }
              }

              if ( allBitStreamIndices ) {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();

                  IndexIterator indexIterator = indexReader[ i ].documents( t );

                  int pointer = indexIterator.skipTo( currDoc );
                  if ( pointer == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c )
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );

                          for( int j = 0; j < c; j++ )
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    }
                  }
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t + "(skipTo returned " + pointer + ")" );
                }
              }
              else {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();
                  IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );

                  if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c )
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );

                          for( int j = 0; j < c; j++ )
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    }
                  }
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
                }
              }
            }
          }
          docCounter++;
          document.close();
          pl.update();
        }
      }
      else {
        LOGGER.warn( "Random access tests require very slow single-term scanning as not all indices are disk based" );

        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
       
        final MutableString word = new MutableString(), nonWord = new MutableString();
View Full Code Here

TOP

Related Classes of it.unimi.dsi.mg4j.document.DocumentSequence

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.