// First pass over the document stream: count how many times each term index
// (and, for exact collections, each non-term index) occurs.
final boolean exact = collection.exact;
final DocumentFactory factory = collection.factory;
final InputBitStream documentsIbs = collection.documentsInputBitStream;
final MutableString s = new MutableString();
final long[] termFrequency = new long[ (int)collection.terms ];
// Non-term counts are allocated only for exact collections; otherwise this stays null.
final long[] nonTermFrequency = exact ? new long[ (int)collection.nonTerms ] : null;

documentsIbs.position( 0 );
for( int doc = 0, docs = (int)collection.documents; doc != docs; doc++ ) {
	// The URI and the title are read into the scratch string only to advance the stream.
	readSelfDelimitedUtf8String( documentsIbs, s );
	readSelfDelimitedUtf8String( documentsIbs, s );
	// The loop body runs numberOfFields() - 1 times per document.
	int fieldsLeft = factory.numberOfFields() - 1;
	while( fieldsLeft-- != 0 ) {
		// Each field starts with its length, followed by that many entries: a term index
		// and, when exact, a non-term index right after it.
		for( int remaining = documentsIbs.readDelta(); remaining != 0; remaining-- ) {
			termFrequency[ documentsIbs.readDelta() ]++;
			if ( exact ) nonTermFrequency[ documentsIbs.readDelta() ]++;
		}
	}
}
// termPerm lists the term indices by decreasing frequency: termPerm[ rank ] = old index.
int[] termPerm = new int[ termFrequency.length ];
for( int k = 0; k < termPerm.length; k++ ) termPerm[ k ] = k;
IntArrays.quickSort( termPerm, 0, termPerm.length, new AbstractIntComparator() {
	public int compare( int arg0, int arg1 ) {
		// Counts start at zero and are only incremented, so comparing them directly
		// gives the same ordering as testing the sign of their difference.
		final long first = termFrequency[ arg0 ], second = termFrequency[ arg1 ];
		if ( first > second ) return -1;
		return first == second ? 0 : 1;
	}
});
// invTermPerm is the inverse mapping: invTermPerm[ old index ] = rank.
int[] invTermPerm = new int[ termFrequency.length ];
for( int rank = 0; rank < invTermPerm.length; rank++ ) invTermPerm[ termPerm[ rank ] ] = rank;
// For exact collections, build the same pair of permutations for non-terms:
// nonTermPerm[ rank ] = old non-term index (by decreasing frequency) and
// invNonTermPerm[ old non-term index ] = rank. Both stay null otherwise.
int[] nonTermPerm = null, invNonTermPerm = null;
if ( exact ) {
	// Size and order by the non-term counts: using the term counts here would sort
	// non-terms by unrelated frequencies and, whenever the number of terms and
	// non-terms differ, either overrun invNonTermPerm or leave part of it unset.
	nonTermPerm = new int[ nonTermFrequency.length ];
	for( int i = nonTermPerm.length; i-- != 0; ) nonTermPerm[ i ] = i;
	IntArrays.quickSort( nonTermPerm, 0, nonTermPerm.length, new AbstractIntComparator() {
		public int compare( int arg0, int arg1 ) {
			final long first = nonTermFrequency[ arg0 ], second = nonTermFrequency[ arg1 ];
			return first > second ? -1 : first == second ? 0 : 1;
		}
	});
	invNonTermPerm = new int[ nonTermFrequency.length ];
	for( int i = invNonTermPerm.length; i-- != 0; ) invNonTermPerm[ nonTermPerm[ i ] ] = i;
}
// Second pass: copy the document stream to a temporary file in the same directory
// as the collection, replacing every term (and non-term) index with its new rank.
File newDocumentsFile = File.createTempFile( SimpleCompressedDocumentCollection.class.getSimpleName(), "temp", new File( basename.toString() ).getParentFile() );
OutputBitStream newDocumentsObs = new OutputBitStream( newDocumentsFile );
try {
	documentsIbs.position( 0 );
	for( int i = (int)collection.documents; i-- != 0; ) {
		// URI and title are copied through unchanged.
		readSelfDelimitedUtf8String( documentsIbs, s );
		SimpleCompressedDocumentCollectionBuilder.writeSelfDelimitedUtf8String( newDocumentsObs, s );
		readSelfDelimitedUtf8String( documentsIbs, s );
		SimpleCompressedDocumentCollectionBuilder.writeSelfDelimitedUtf8String( newDocumentsObs, s );
		for( int f = factory.numberOfFields() - 1; f-- !=0; ) {
			int len = documentsIbs.readDelta();
			newDocumentsObs.writeDelta( len );
			while( len-- != 0 ) {
				newDocumentsObs.writeDelta( invTermPerm[ documentsIbs.readDelta() ] );
				if ( exact ) newDocumentsObs.writeDelta( invNonTermPerm[ documentsIbs.readDelta() ] );
			}
		}
	}
}
finally {
	// Close the stream even if the copy fails, so the descriptor is not leaked.
	newDocumentsObs.close();
}
// Swap the rewritten file in place of the old one. The old file is deleted first;
// a failed rename is reported instead of being silently ignored, because continuing
// would leave the old documents file in place.
final File documentsFile = new File( basename + DOCUMENTS_EXTENSION );
documentsFile.delete();
if ( ! newDocumentsFile.renameTo( documentsFile ) ) throw new java.io.IOException( "Cannot rename " + newDocumentsFile + " to " + documentsFile );
// Drop references that are no longer needed.
newDocumentsObs = null;
invTermPerm = invNonTermPerm = null;
// Load every term into memory, in the order currently stored in the terms file.
FastBufferedInputStream termsStream = new FastBufferedInputStream( new FileInputStream( basename + TERMS_EXTENSION ) ) ;
MutableString term[] = new MutableString[ (int)collection.terms ];
try {
	for( int i = 0; i < term.length; i++ ) term[ i ] = new MutableString().readSelfDelimUTF8( termsStream );
}
finally {
	termsStream.close();
}
// Opening the output stream truncates the terms file, so the terms must be written
// back. They are emitted in the new order (position i holds the term whose old index
// is termPerm[ i ]), which matches the indices written to the documents file.
// NOTE(review): for exact collections the non-terms file presumably needs the same
// treatment with nonTermPerm, and any offset files derived from these streams would
// need regenerating; neither is visible here, so confirm against the rest of the class.
final FastBufferedOutputStream newTermsStream = new FastBufferedOutputStream( new FileOutputStream( basename + TERMS_EXTENSION ) );
try {
	for( int i = 0; i < term.length; i++ ) term[ termPerm[ i ] ].writeSelfDelimUTF8( newTermsStream );
}
finally {
	newTermsStream.close();
}
}