// Parse the command line; stop here if the parser already printed a message.
JSAPResult jsapResult = jsap.parse( arg );
if ( jsap.messagePrinted() ) return;

// Resolve the document sequence and the factory describing its fields.
DocumentSequence documentSequence = it.unimi.dsi.mg4j.tool.Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER );
final DocumentFactory factory = documentSequence.factory();

// With "stem", basename is a prefix and one index per indexed field is opened;
// otherwise basename names a single index.
final boolean stem = jsapResult.getBoolean( "stem" );
// With "termLists", terms are read from the terms file and lists are accessed by term rather than by number.
final boolean termLists = jsapResult.getBoolean( "termLists" );
final int[] indexedField = it.unimi.dsi.mg4j.tool.Scan.parseFieldNames( jsapResult.getStringArray( "indexedField" ), factory, jsapResult.getBoolean( "allFields" ) );
LOGGER.debug( "Parsed indexed field: " + IntArrayList.wrap( indexedField ) );
final String basename = jsapResult.getString( "basename" );
final String permutationFile = jsapResult.getString( "renumber" );
final boolean isVirtual = jsapResult.getBoolean( "virtual" );

// Index counter (i) and term counter (t); both are shared with the diagnostics printed by the catch blocks.
int i, t = 0;
final ProgressLogger pl = new ProgressLogger( LOGGER, jsapResult.getLong( "logInterval" ), "ints" );
final Index[] index = stem ? new Index[ indexedField.length ] : new Index[ 1 ];

// Every array below has one slot per opened index. They are all sized on index.length:
// when stem is false there is exactly one index, whatever the number of parsed fields
// (sizing on indexedField.length would overflow at slot 0 if no field was specified).
final int[] numberOfTerms = new int[ index.length ];
final ObjectArrayList<MutableString>[] terms = new ObjectArrayList[ index.length ];
final IndexReader[] indexReader = new IndexReader[ index.length ];
// Optional per-index stream of frequencies; a slot stays null when the file does not exist.
final InputBitStream[] frequencies = new InputBitStream[ index.length ];
// Per-index, per-document sum of the counts found while scanning sequentially.
final int[][] count = new int[ index.length ][];
// Optional document renumbering, loaded only when a permutation file was specified.
final int[] permutation = permutationFile != null ? BinIO.loadInts( permutationFile ) : null;
// Per-index buffer receiving the positions of a term in a document.
final int[][] occ = new int[ index.length ][];
// Per-index map from position to term number for the document being checked.
final int[][] wordInPos = new int[ index.length ][];
// Per-index map from term number to number of occurrences in the document being checked.
final Int2IntMap[] termsInDoc = new Int2IntOpenHashMap[ index.length ];
int totalTerms = 0;
// Becomes false as soon as one opened index is not a BitStreamIndex.
boolean allBitStreamIndices = true;
// Open every index and allocate the per-index work areas used by the checks below.
for( i = 0; i < index.length; i++ ) {
	// In stem mode each index is named after its field; otherwise the basename is used verbatim.
	final String fieldSuffix = stem ? "-" + factory.fieldName( indexedField[ i ] ) : "";
	final String basenameField = basename + fieldSuffix;

	final Index currentIndex = Index.getInstance( basenameField );
	index[ i ] = currentIndex;
	allBitStreamIndices &= currentIndex instanceof BitStreamIndex;

	if ( termLists ) {
		// Load the whole term list; the number of terms is the number of lines in the terms file.
		final String termsFile = basenameField + DiskBasedIndex.TERMS_EXTENSION;
		terms[ i ] = new ObjectArrayList<MutableString>( new FileLinesCollection( termsFile, "UTF-8" ).allLines() );
		numberOfTerms[ i ] = terms[ i ].size();
	}
	else {
		numberOfTerms[ i ] = currentIndex.numberOfTerms;
	}
	totalTerms += numberOfTerms[ i ];

	// This will be matched with the number of occurrences per document
	count[ i ] = new int[ currentIndex.numberOfDocuments ];

	if ( currentIndex.maxCount > 0 ) {
		occ[ i ] = new int[ currentIndex.maxCount ];
	}
	else {
		occ[ i ] = IntArrays.EMPTY_ARRAY;
	}

	// A negative maximum document size is treated as zero.
	final int maxDocSize = currentIndex.properties.getInt( Index.PropertyKeys.MAXDOCSIZE );
	wordInPos[ i ] = new int[ Math.max( 0, maxDocSize ) ];

	indexReader[ i ] = currentIndex.getReader();

	// The frequencies file is optional: the slot is left null when it is missing.
	final String frequenciesFile = basenameField + DiskBasedIndex.FREQUENCIES_EXTENSION;
	if ( new File( frequenciesFile ).exists() ) {
		frequencies[ i ] = new InputBitStream( frequenciesFile );
	}

	termsInDoc[ i ] = new Int2IntOpenHashMap();
}
// Pointer of the document currently being verified.
int currDoc = 0;
// Term position in the current document.
int pos = 0;
// Frequency last read from the frequencies stream; also reported in diagnostics.
int f = 0;
// Scratch document pointer.
int p;
// Frequency of the list being scanned, used as a countdown while reading postings; -1 before any list is read.
int indexFrequency = -1;

pl.itemsName = "lists";
pl.expectedUpdates = totalTerms;
// Sequential scan: reads every posting list of every index in order, checking
// frequencies against the frequencies file (when present), accumulating per-document
// counts, and finally comparing the totals with the statistics declared by the index.
if ( !jsapResult.getBoolean( "noSeq" ) ) {
	try {
		for ( i = 0; i < index.length; i++ ) {
			// Accumulated in a long (like numberOfOccurrences below): the sum of all frequencies
			// can exceed Integer.MAX_VALUE, and a wrapped int would cause a spurious mismatch.
			long numberOfPostings = 0;
			pl.expectedUpdates = numberOfTerms[ i ];
			pl.start( "Verifying sequentially index " + index[ i ] + "..." );

			if ( allBitStreamIndices ) {
				for ( t = 0; t < numberOfTerms[ i ]; t++ ) {
					pl.update();
					IndexIterator indexIterator = indexReader[ i ].nextIterator();
					indexFrequency = indexIterator.frequency();
					numberOfPostings += indexFrequency;
					// The frequency stored in the list must match the one in the frequencies file.
					if ( frequencies[ i ] != null && indexFrequency != ( f = frequencies[ i ].readGamma() ) ) {
						System.err.println( "Error in frequency for term " + t + ": expected " + f + " documents, found " + indexFrequency );
						return;
					}
					// indexFrequency doubles as a countdown of the postings still to be read.
					while ( indexFrequency-- != 0 ) {
						p = indexIterator.nextDocument();
						if ( index[ i ].hasCounts ) count[ i ][ p ] += indexIterator.count();
						if ( index[ i ].hasPositions ) indexIterator.positionArray(); // Just to force reading in high-performance indices
					}
					if ( indexIterator.nextDocument() != -1 ) throw new AssertionError( "nextDocument() is not -1 after exhaustive iteration" );
				}

				// Check document sizes
				if ( ! isVirtual && ( (BitStreamIndex) index[ i ] ).sizes != null && index[ i ].hasCounts ) {
					for ( p = 0; p < index[ i ].numberOfDocuments; p++ ) {
						if ( index[ i ].sizes.getInt( p ) != count[ i ][ p ] ) {
							System.err.println( "Document " + p + " has size " + ( (BitStreamIndex) index[ i ] ).sizes.getInt( p ) + " but " + count[ i ][ p ] + " occurrences have been stored." );
						}
					}
				}
			}
			else { // Non-bitstream indices
				for ( t = 0; t < numberOfTerms[ i ]; t++ ) {
					pl.update();
					IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
					indexFrequency = indexIterator.frequency();
					numberOfPostings += indexFrequency;
					if ( frequencies[ i ] != null && indexFrequency != ( f = frequencies[ i ].readGamma() ) ) {
						System.err.println( "Error in frequency for term " + t + ": expected " + f + " documents, found " + indexFrequency );
						return;
					}
					// Document pointers must be strictly increasing within a list.
					int prevp = -1;
					while ( indexFrequency-- != 0 ) {
						p = indexIterator.nextDocument();
						if ( prevp >= p ) throw new AssertionError( "previous pointer: " + prevp + "; current pointer: " + p );
						prevp = p;
						if ( index[ i ].hasCounts ) count[ i ][ p ] += indexIterator.count();
					}
				}
			}
			pl.done();

			// Compare what was found with the global statistics declared by the index.
			if ( ! isVirtual && numberOfPostings != index[ i ].numberOfPostings ) System.err.println( "Index declares " + index[ i ].numberOfPostings + " postings, but we found " + numberOfPostings );
			long numberOfOccurrences = 0;
			if ( index[ i ].hasCounts ) {
				for ( p = 0; p < index[ i ].numberOfDocuments; p++ ) numberOfOccurrences += count[ i ][ p ];
				if ( numberOfOccurrences != index[ i ].numberOfOccurrences ) System.err.println( "Index declares " + index[ i ].numberOfOccurrences + " occurrences, but we found " + numberOfOccurrences );
			}
		}
	} catch ( Exception e ) {
		// Report where the scan was before propagating the failure.
		System.err.println( "Exception while scanning sequentially term " + t + " of index " + index[ i ] );
		System.err.println( "Term frequency was " + f + " and position " + ( f - indexFrequency - 1 ) );
		throw e;
	}
}
// Scratch lists reused for every term: the document pointers of the list and,
// when the index has positions, the positions of the term in each of those documents.
IntArrayList l = new IntArrayList();
ObjectArrayList<int[]> positions = new ObjectArrayList<int[]>();

// All-skips test: for every term and every pair start < end of entries in its list,
// a fresh iterator must reach both pointers through skipTo(), in that order.
if ( ! jsapResult.getBoolean( "noSkip" ) ) {
	int start = 0, end = 0, result;
	try {
		for ( i = 0; i < index.length; i++ ) {
			pl.expectedUpdates = numberOfTerms[ i ];
			pl.start( "Verifying all skips in " + index[ i ] + "..." );
			for ( t = 0; t < numberOfTerms[ i ]; t++ ) {
				l.clear();
				positions.clear();

				// Collect the reference content of the list with a plain sequential read.
				IndexIterator documents = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
				int d;
				while( ( d = documents.nextDocument() ) != -1 ) {
					l.add( d );
					if ( index[ i ].hasPositions ) positions.add( ArrayUtils.subarray( documents.positionArray(), 0, documents.count() ) );
				}

				for( start = 0; start < l.size(); start++ ) {
					for( end = start + 1; end < l.size(); end++ ) {
						IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
						result = indexIterator.skipTo( l.getInt( start ) );
						if ( indexIterator.document() != l.getInt( start ) || result != l.getInt( start ) ) throw new AssertionError( "Trying to skip to document " + l.getInt( start ) + " (term " + t + ") moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );
						result = indexIterator.skipTo( l.getInt( end ) );
						// The message reports the document previously skipped to, not its index in the list.
						if ( indexIterator.document() != l.getInt( end ) || result != l.getInt( end ) ) throw new AssertionError( "Trying to skip to document " + l.getInt( end ) + " (term " + t + ") after a skip to " + l.getInt( start ) + " moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );

						if ( index[ i ].hasPositions ) {
							// This catches wrong state reconstruction after skips.
							indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
							indexIterator.skipTo( l.getInt( start ) );
							if ( indexIterator.document() != l.getInt( start ) ) throw new AssertionError( indexIterator.document() + " != " + l.getInt( start ) );
							if ( indexIterator.count() != positions.get( start ).length ) throw new AssertionError( indexIterator.count() + " != " + positions.get( start ).length );
							if ( ! Arrays.equals( positions.get( start ), ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) )
								) throw new AssertionError( Arrays.toString( positions.get( start ) ) + "!=" + Arrays.toString( ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) ) );
							indexIterator.skipTo( l.getInt( end ) );
							if ( indexIterator.document() != l.getInt( end ) ) throw new AssertionError( indexIterator.document() + " != " + l.getInt( end ) );
							if ( indexIterator.count() != positions.get( end ).length ) throw new AssertionError( indexIterator.count() + " != " + positions.get( end ).length );
							if ( ! Arrays.equals( positions.get( end ), ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) )
								) throw new AssertionError( Arrays.toString( positions.get( end ) ) + "!=" + Arrays.toString( ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) ) );
						}
					}

					// Finally, skip from the start pointer beyond the end of the list.
					IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
					result = indexIterator.skipTo( l.getInt( start ) );
					if ( indexIterator.document() != l.getInt( start ) || result != l.getInt( start ) ) throw new AssertionError( "Trying to skip to document " + l.getInt( start ) + " (term " + t + ") moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );
					result = indexIterator.skipTo( Integer.MAX_VALUE );
					if ( indexIterator.hasNext() || result != Integer.MAX_VALUE ) throw new AssertionError( "Trying to skip beyond end of list (term " + t + ") after a skip to " + l.getInt( start ) + " returned " + result + " (hasNext()=" + indexIterator.hasNext() + ")" );
				}
				pl.update();
			}
			pl.done();
		}
	}
	catch( Throwable e ) {
		// start and end are indices into the pointer list of the current term.
		System.err.println( "Exception during all-skip test (index=" + index[ i ] + ", term=" + t + ", start=" + start + ", end=" + end + ")" );
		throw e;
	}
}
// Composite iterator test: for every ordered pair of terms (t, u) of each index, the
// documents returned by AND/OR document iterators are compared with the intersection
// and union computed explicitly with sets.
if ( ! jsapResult.getBoolean( "noComp" ) ) {
	final IntLinkedOpenHashSet intersection = new IntLinkedOpenHashSet();
	final IntOpenHashSet secondTermDocuments = new IntOpenHashSet();
	final IntAVLTreeSet union = new IntAVLTreeSet();
	// Second term of the pair; declared here so that the catch block can report it.
	int u = 0;
	try {
		for ( i = 0; i < index.length; i++ ) {
			pl.expectedUpdates = numberOfTerms[ i ];
			pl.start( "Verifying composite iterators in " + index[ i ] + "..." );
			// A second reader on the same index supplies the list of the second term.
			final IndexReader additionalReader = index[ i ].getReader();

			for ( t = 0; t < numberOfTerms[ i ]; t++ ) {
				for ( u = 0; u < numberOfTerms[ i ]; u++ ) {
					// Expected conjunction: documents of t that also appear in the list of u.
					intersection.clear();
					secondTermDocuments.clear();
					// TODO: in case we have positions, we should check them, too
					IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t ), intersection );
					IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( u ) ) : indexReader[ i ].documents( u ), secondTermDocuments );
					intersection.retainAll( secondTermDocuments );

					IndexIterator firstList = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
					IndexIterator secondList = termLists ? additionalReader.documents( terms[ i ].get( u ) ) : additionalReader.documents( u );
					IntIterator expected = intersection.iterator();
					it.unimi.dsi.mg4j.search.DocumentIterator composite = AndDocumentIterator.getInstance( firstList, secondList );
					while ( expected.hasNext() ) {
						if ( expected.nextInt() != composite.nextDocument() ) throw new AssertionError();
					}
					if ( composite.hasNext() ) throw new AssertionError();

					// Expected disjunction: documents of both lists, in increasing order.
					union.clear();
					IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t ), union );
					IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( u ) ) : indexReader[ i ].documents( u ), union );

					firstList = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
					secondList = termLists ? additionalReader.documents( terms[ i ].get( u ) ) : additionalReader.documents( u );
					expected = union.iterator();
					composite = OrDocumentIterator.getInstance( firstList, secondList );
					while ( expected.hasNext() ) {
						if ( expected.nextInt() != composite.nextDocument() ) throw new AssertionError();
					}
					if ( composite.hasNext() ) throw new AssertionError();
				}
				pl.update();
			}

			pl.done();
			additionalReader.close();
		}
	}
	catch( Throwable e ) {
		System.err.println( "Exception during composite iterator test (index=" + index[ i ] + ", first term=" + t + ", second term =" + u + ")" );
		throw e;
	}
}
if ( ! isVirtual && jsapResult.getBoolean( "random" ) ) {
// Random access scan
pl.expectedUpdates = index[ 0 ].numberOfDocuments;
pl.itemsName = "documents";
pl.start( "Verifying random access..." );
if ( allBitStreamIndices ) {
it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
Document document;
Reader reader;
WordReader wordReader;
final MutableString word = new MutableString(), nonWord = new MutableString();
int docCounter = 0;
while( ( document = documentIterator.nextDocument() ) != null ) {
currDoc = permutation != null ? permutation[ docCounter ] : docCounter;
for( i = 0; i < index.length; i++ ) {
Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
if ( index[ i ].hasPayloads ) {
// TODO: write tests for the other case
if ( allBitStreamIndices ) {
IndexIterator indexIterator = indexReader[ i ].documents( 0 );
int pointer = indexIterator.skipTo( currDoc );
if ( pointer == currDoc ) {
Payload payload = indexIterator.payload();
if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload );
}
else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
}
else {
IndexIterator indexIterator = indexReader[ i ].documents( 0 );
if ( indexIterator.skipTo( currDoc ) == currDoc ) {
if ( ! indexIterator.payload().get().equals( content ) )
LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + indexIterator.payload().get() );
}
else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
}
}
else {
// text index
pos = 0;
termsInDoc[ i ].clear();
reader = (Reader)content;
wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
wordReader.setReader( reader );
while( wordReader.next( word, nonWord ) ) {
if ( word.length() == 0 || index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) continue;
if ( ( t = (int)( (BitStreamIndex)index[ i ] ).termMap.getLong( word ) ) == -1 ) LOGGER.error( index[ i ] + ": Could not find term " + word + " in term index" );
else {
if ( index[ i ].hasCounts ) termsInDoc[ i ].put( t, termsInDoc[ i ].get( t ) + 1 );
if ( index[ i ].hasPositions ) wordInPos[ i ][ pos++ ] = t;
}
}
if ( allBitStreamIndices ) {
for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
t = x.nextInt();
IndexIterator indexIterator = indexReader[ i ].documents( t );
int pointer = indexIterator.skipTo( currDoc );
if ( pointer == currDoc ) {
if ( index[ i ].hasCounts ) {
int c = indexIterator.count();
if ( termsInDoc[ i ].get( t ) != c )
LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
else {
if ( index[ i ].hasPositions ) {
indexIterator.positions( occ[ i ] );
for( int j = 0; j < c; j++ )
if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )
LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
}
}
}
}
else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t + "(skipTo returned " + pointer + ")" );
}
}
else {
for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
t = x.nextInt();
IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
if ( indexIterator.skipTo( currDoc ) == currDoc ) {
if ( index[ i ].hasCounts ) {
int c = indexIterator.count();
if ( termsInDoc[ i ].get( t ) != c )
LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
else {
if ( index[ i ].hasPositions ) {
indexIterator.positions( occ[ i ] );
for( int j = 0; j < c; j++ )
if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )
LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
}
}
}
}
else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
}
}
}
}
docCounter++;
document.close();
pl.update();
}
}
else {
LOGGER.warn( "Random access tests require very slow single-term scanning as not all indices are disk based" );
it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
Document document;
Reader reader;
WordReader wordReader;
final MutableString word = new MutableString(), nonWord = new MutableString();
int docCounter = 0;
while( ( document = documentIterator.nextDocument() ) != null ) {
currDoc = permutation != null ? permutation[ docCounter ] : docCounter;
for( i = 0; i < index.length; i++ ) {
Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
if ( index[ i ].hasPayloads ) {
if ( allBitStreamIndices ) {
IndexIterator indexIterator = indexReader[ i ].documents( 0 );
int pointer = indexIterator.skipTo( currDoc );
if ( pointer == currDoc ) {
Payload payload = indexIterator.payload();
if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload );
}
else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
}
else {
IndexIterator indexIterator = indexReader[ i ].documents( "#" );
if ( indexIterator.skipTo( currDoc ) == currDoc ) {
if ( ! indexIterator.payload().get().equals( content ) )
LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + indexIterator.payload().get() );
}
else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
}
}
else {
pos = 0;
reader = (Reader)content;
wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
wordReader.setReader( reader );
while( wordReader.next( word, nonWord ) ) {
if ( word.length() == 0 || index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) continue;
IndexIterator indexIterator = indexReader[ i ].documents( word );
if ( currDoc != indexIterator.skipTo( currDoc ) )