Examples of BitOut


Examples of org.terrier.compression.BitOut

    int iteration = 0;
    try{
      Iterator<DocumentIndexEntry> diis =  (Iterator<DocumentIndexEntry>) index.getIndexStructureInputStream("document");
      final String offsetsFilename = index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "."+destinationStructure+".offsets";
      final DataOutputStream offsetsTmpFile = new DataOutputStream(Files.writeFileStream(offsetsFilename));
      final BitOut bos = new BitOutputStream(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "."+destinationStructure+ BitIn.USUAL_EXTENSION);
      do//for each pass of the inverted file
      {
        iteration++;
        //logger.info("Iteration "+iteration  + iterationSuffix);
        //get a copy of the inverted index
        final InvertedIndexInputStream iiis = (InvertedIndexInputStream) index.getIndexStructureInputStream(sourceStructure);
        //work out how many document we can scan for
        lastDocid = firstDocid + scanDocumentIndexForTokens(processTokens, diis);
        //logger.info("Generating postings for documents with ids "+firstDocid + " to " + lastDocid);
        //get a set of posting objects to save the compressed postings for each of the documents to
        final Posting[] postings = getPostings(lastDocid - firstDocid +1 );
        //get postings for these documents
        numberOfTokensFound += traverseInvertedFile(iiis, firstDocid, lastDocid, postings);
        //logger.info("Writing the postings to disk");
        int id = firstDocid;
        for (Posting p : postings) //for each document
       
          //logger.debug("Document " + id  + " length="+ p.getDocF());
          id++;
         
          //get the offsets
          long endByte = bos.getByteOffset();
          byte endBit = bos.getBitOffset();
         
          //if the document is non-empty
          if (p.getDocF() > 0)
          {         
            //obtain the compressed memory posting list
            final MemorySBOS Docs = p.getDocs();
            //some obscure problem when reading from memory rather than disk.
            //by padding the posting list with some non zero bytes the problem
            //is solved. Thanks to Roicho for working this one out.
            Docs.writeGamma(1);
            Docs.writeGamma(1);
            Docs.pad();
         
            //use a PostingInRun to decompress the postings stored in memory
            final PostingInRun pir = getPostingReader();
            pir.setDf(p.getDocF());
            pir.setTF(p.getTF());
            pir.setPostingSource(new BitInputStream(new ByteArrayInputStream(
              Docs.getMOS().getBuffer())));
            //System.err.println("temp compressed buffer size="+Docs.getMOS().getPos() + " length="+Docs.getMOS().getBuffer().length);
            //decompress the memory postings and write out to the direct file
            pir.append(bos, -1);
          }

          //take note of the offset for this document in the df
          offsetsTmpFile.writeLong(endByte);
          offsetsTmpFile.writeByte(endBit);
          offsetsTmpFile.writeInt(p.getDocF());
        }// /for document postings
        firstDocid = lastDocid +1;
      } while(firstDocid <  -1 + index.getCollectionStatistics().getNumberOfDocuments());

      if (numberOfTokensFound != totalTokens)
      {
        //logger.warn("Number of tokens found while scanning "+sourceStructure+" structure does not match expected. Expected "
//          +index.getCollectionStatistics().getNumberOfTokens()+ ", found " + numberOfTokensFound);
      }
      //logger.info("Finishing up: rewriting document index"); 
      offsetsTmpFile.close();
      //write the offsets to the DocumentIndex
      final DataInputStream dis = new DataInputStream(Files.openFileStream(offsetsFilename));
      final DocumentIndexBuilder dios = new DocumentIndexBuilder(index, "document-df");
      final Iterator<DocumentIndexEntry> docidInput = (Iterator<DocumentIndexEntry>)index.getIndexStructureInputStream("document");
     
      DocumentIndexEntry die = null;
      int docid = 0;
        while (docidInput.hasNext())
      {
          DocumentIndexEntry old = docidInput.next();
          if (fieldCount == 0)
          {
            die = new BasicDocumentIndexEntry(old);
          }
          else
          {
            die = old;
          }
          die.setOffset(dis.readLong(), dis.readByte());
        die.setNumberOfEntries(dis.readInt());
        dios.addEntryToBuffer(die);
        docid++;
        }
        IndexUtil.close(docidInput);
      bos.close();
      IndexUtil.close(diis);
      dis.close();
      Files.delete(offsetsFilename);
      dios.close();
      IndexUtil.renameIndexStructure(index, "document-df", "document");
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.