pl.itemsName = "documents";
pl.start( "Verifying random access..." );
if ( allBitStreamIndices ) {
    /* Random-access check: for every document of the sequence we recompute, from the
     * document itself, what each index should contain, then jump straight to that
     * document with skipTo() and compare.
     *
     * In this branch terms are resolved to term numbers through the term map of each
     * index (see the cast to BitStreamIndex below), and inverted lists are opened by number.
     *
     * The original code re-tested allBitStreamIndices inside this branch; since the flag
     * is not assigned anywhere in the loop, those tests were always true and their
     * else arms could never run, so they have been removed. */
    it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
    Document document;
    Reader reader;
    WordReader wordReader;
    final MutableString word = new MutableString(), nonWord = new MutableString();
    int docCounter = 0;
    while( ( document = documentIterator.nextDocument() ) != null ) {
        // The document number searched for in the index: remapped if a permutation is given.
        currDoc = permutation != null ? permutation[ docCounter ] : docCounter;
        for( i = 0; i < index.length; i++ ) {
            Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
            if ( index[ i ].hasPayloads ) {
                // TODO: write tests for the other case
                // Payload index: the list read is that of term 0, and the payload stored
                // for currDoc must equal the content of the field.
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                    Payload payload = indexIterator.payload();
                    if ( ! payload.get().equals( content ) ) {
                        LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload );
                    }
                }
                else {
                    // The list just read is that of term 0; the shared variable t holds
                    // whatever term a previous text index left there, so it is not reported.
                    LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + 0 );
                }
            }
            else {
                // text index
                pos = 0;
                termsInDoc[ i ].clear();
                reader = (Reader)content;
                wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
                wordReader.setReader( reader );

                // Pass 1: rescan the text, accumulating per-term counts and the term found at each position.
                while( wordReader.next( word, nonWord ) ) {
                    // Skip empty words and words rejected by the term processor (when there is one).
                    if ( word.length() == 0 || ( index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) ) continue;
                    if ( ( t = (int)( (BitStreamIndex)index[ i ] ).termMap.getLong( word ) ) == -1 ) {
                        LOGGER.error( index[ i ] + ": Could not find term " + word + " in term index" );
                    }
                    else {
                        if ( index[ i ].hasCounts ) termsInDoc[ i ].put( t, termsInDoc[ i ].get( t ) + 1 );
                        if ( index[ i ].hasPositions ) wordInPos[ i ][ pos++ ] = t;
                    }
                }

                // Pass 2: for each term recorded above, jump to currDoc in its inverted list and compare.
                // NOTE(review): termsInDoc is filled only when the index has counts, so an index
                // without counts gets no per-term check here; confirm this is intended.
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                    t = x.nextInt();
                    IndexIterator indexIterator = indexReader[ i ].documents( t );
                    int pointer = indexIterator.skipTo( currDoc );
                    if ( pointer != currDoc ) {
                        LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t + "(skipTo returned " + pointer + ")" );
                        continue;
                    }
                    if ( ! index[ i ].hasCounts ) continue;

                    int c = indexIterator.count();
                    if ( termsInDoc[ i ].get( t ) != c ) {
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                    }
                    else if ( index[ i ].hasPositions ) {
                        // Positions are checked only when the count matches, so exactly c entries of occ[ i ] are read.
                        indexIterator.positions( occ[ i ] );
                        for( int j = 0; j < c; j++ ) {
                            final int position = occ[ i ][ j ];
                            if ( wordInPos[ i ][ position ] != t ) {
                                // Report the occurrence index j and the actual position; the original
                                // message concatenated the int[] itself and the index number i.
                                LOGGER.error( index[ i ] + ": The occurrence of index " + j + " of term " + t + " (position " + position + ") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ position ] );
                            }
                        }
                    }
                }
            }
        }
        docCounter++;
        document.close();
        pl.update();
    }
}
else {
    /* Fallback used when allBitStreamIndices is false: inverted lists are opened by term
     * (a character sequence) rather than by term number, and every single occurrence
     * of every word triggers its own lookup and skipTo().
     *
     * The original code re-tested allBitStreamIndices inside this branch; the flag is
     * necessarily false here and is not assigned in the loop, so that arm was
     * unreachable and has been removed. */
    LOGGER.warn( "Random access tests require very slow single-term scanning as not all indices are disk based" );
    it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
    Document document;
    Reader reader;
    WordReader wordReader;
    final MutableString word = new MutableString(), nonWord = new MutableString();
    int docCounter = 0;
    while( ( document = documentIterator.nextDocument() ) != null ) {
        // The document number searched for in the index: remapped if a permutation is given.
        currDoc = permutation != null ? permutation[ docCounter ] : docCounter;
        for( i = 0; i < index.length; i++ ) {
            Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
            if ( index[ i ].hasPayloads ) {
                // Payload index: the list read is that of the term "#", and the payload
                // stored for currDoc must equal the content of the field.
                IndexIterator indexIterator = indexReader[ i ].documents( "#" );
                if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                    Payload payload = indexIterator.payload();
                    if ( ! payload.get().equals( content ) ) {
                        LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload.get() );
                    }
                }
                else {
                    // The list just read is that of "#"; the shared variable t is never
                    // assigned in this branch, so reporting it would be misleading.
                    LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + "#" );
                }
            }
            else {
                // Text index: pos counts the words that were actually looked up so far in this field.
                pos = 0;
                reader = (Reader)content;
                wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
                wordReader.setReader( reader );
                while( wordReader.next( word, nonWord ) ) {
                    // Skip empty words and words rejected by the term processor (when there is one);
                    // skipped words do not advance pos.
                    if ( word.length() == 0 || ( index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) ) continue;
                    IndexIterator indexIterator = indexReader[ i ].documents( word );
                    if ( currDoc != indexIterator.skipTo( currDoc ) ) {
                        LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + word );
                    }
                    else if ( index[ i ].hasPositions ) {
                        // The current position must be among the first count() positions returned for this term.
                        indexIterator.positions( occ[ i ] );
                        if ( IntArrayList.wrap( occ[ i ], indexIterator.count() ).indexOf( pos ) == -1 ) {
                            LOGGER.error( index[ i ] + ": Position " + pos + " does not appear in the position list of term " + word + " in document " + currDoc );
                        }
                    }
                    pos++;
                }
            }
        }
        document.close();
        pl.update();
        docCounter++;
    }
}