int iteration = 0;
try {
Iterator<DocumentIndexEntry> diis = (Iterator<DocumentIndexEntry>) index.getIndexStructureInputStream("document");
final String offsetsFilename = index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "."+destinationStructure+".offsets";
final DataOutputStream offsetsTmpFile = new DataOutputStream(Files.writeFileStream(offsetsFilename));
final BitOut bos = new BitOutputStream(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "."+destinationStructure+ BitIn.USUAL_EXTENSION);
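//Overview: the inverted file can only be read sequentially, so it is
//traversed once per pass of the loop below. Each pass buffers, in memory,
//the direct postings for a contiguous range of docids, flushes them to the
//bit file opened above, and records each document's (byte, bit) offset in
//the temporary .offsets file. (processTokens is assumed to be the per-pass
//token budget that bounds how many documents one pass can cover.)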
do //for each pass of the inverted file
{
iteration++;
//logger.info("Iteration "+iteration + iterationSuffix);
//get a copy of the inverted index
final InvertedIndexInputStream iiis = (InvertedIndexInputStream) index.getIndexStructureInputStream(sourceStructure);
//work out how many documents we can scan for
lastDocid = firstDocid + scanDocumentIndexForTokens(processTokens, diis);
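//(scanDocumentIndexForTokens is assumed to advance diis until roughly
//processTokens tokens have been counted, returning the span of docids
//this pass will handle)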
//logger.info("Generating postings for documents with ids "+firstDocid + " to " + lastDocid);
//get a set of posting objects to save the compressed postings for each of the documents to
final Posting[] postings = getPostings(lastDocid - firstDocid + 1);
//get postings for these documents
numberOfTokensFound += traverseInvertedFile(iiis, firstDocid, lastDocid, postings);
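//(traverseInvertedFile is assumed to read the whole inverted structure,
//appending an entry to postings[d - firstDocid] for each pointer whose
//docid d falls in [firstDocid, lastDocid], and to return the number of
//tokens it encountered)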
//logger.info("Writing the postings to disk");
int id = firstDocid;
for (Posting p : postings) //for each document
{
//logger.debug("Document " + id + " length="+ p.getDocF());
id++;
//get the offsets
long endByte = bos.getByteOffset();
byte endBit = bos.getBitOffset();
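//the offsets are captured before anything is written for this document,
//so an empty document still gets a valid position (with 0 entries)
//recorded in the offsets file below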
//if the document is non-empty
if (p.getDocF() > 0)
{
//obtain the compressed memory posting list
//obtain the compressed in-memory posting list
final MemorySBOS docs = p.getDocs();
//There is an obscure problem when reading from memory rather than disk:
//padding the posting list with some non-zero bytes works around it.
//Thanks to Roicho for working this one out.
docs.writeGamma(1);
docs.writeGamma(1);
docs.pad();
//use a PostingInRun to decompress the postings stored in memory
final PostingInRun pir = getPostingReader();
pir.setDf(p.getDocF());
pir.setTF(p.getTF());
pir.setPostingSource(new BitInputStream(new ByteArrayInputStream(
docs.getMOS().getBuffer())));
//System.err.println("temp compressed buffer size="+docs.getMOS().getPos() + " length="+docs.getMOS().getBuffer().length);
//decompress the memory postings and write out to the direct file
pir.append(bos, -1);
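//(the -1 passed to append is assumed to mean "no previous id written",
//i.e. the whole buffered list is re-encoded from the start onto bos)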
}
//take note of the offset for this document in the direct file
offsetsTmpFile.writeLong(endByte);
offsetsTmpFile.writeByte(endBit);
offsetsTmpFile.writeInt(p.getDocF());
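//each document thus contributes a fixed 13-byte record to the temporary
//file: a long (8 bytes) byte offset, a byte (1) bit offset, and an int (4)
//giving the number of entries in its posting list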
} //end for: document postings
firstDocid = lastDocid +1;
} while (firstDocid < index.getCollectionStatistics().getNumberOfDocuments() - 1);
if (numberOfTokensFound != totalTokens)
{
logger.warn("Number of tokens found while scanning "+sourceStructure+" structure does not match expected. Expected "
+ totalTokens + ", found " + numberOfTokensFound);
}
//logger.info("Finishing up: rewriting document index");
offsetsTmpFile.close();
//write the offsets to the DocumentIndex
final DataInputStream dis = new DataInputStream(Files.openFileStream(offsetsFilename));
final DocumentIndexBuilder dios = new DocumentIndexBuilder(index, "document-df");
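//the rebuilt document index is written under the temporary name
//"document-df"; once every entry has its direct-file offset attached, it
//is renamed to "document" at the end of this method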
final Iterator<DocumentIndexEntry> docidInput = (Iterator<DocumentIndexEntry>)index.getIndexStructureInputStream("document");
DocumentIndexEntry die = null;
int docid = 0;
while (docidInput.hasNext())
{
DocumentIndexEntry old = docidInput.next();
if (fieldCount == 0)
{
die = new BasicDocumentIndexEntry(old);
}
else
{
die = old;
}
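//(when fields are in use, the existing entry is assumed to be reused so
//that per-field length information is preserved; otherwise a fresh
//BasicDocumentIndexEntry copy of the old entry suffices)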
die.setOffset(dis.readLong(), dis.readByte());
die.setNumberOfEntries(dis.readInt());
dios.addEntryToBuffer(die);
docid++;
}
IndexUtil.close(docidInput);
bos.close();
IndexUtil.close(diis);
dis.close();
Files.delete(offsetsFilename);
dios.close();
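//swap the temporary structure in: after the rename, the "document"
//structure is the rebuilt index carrying the new offsets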
IndexUtil.renameIndexStructure(index, "document-df", "document");