Package org.terrier.compression

Examples of org.terrier.compression.BitOutputStream
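As orientation for the extracted snippets below, here is a minimal, self-contained sketch of the class's basic lifecycle: open a bit-compressed file, note the bit-level offsets, write some compressed integers, close. The file name and the values written are illustrative assumptions; getByteOffset, getBitOffset and close are exercised by the examples on this page, while writeGamma and writeUnary are assumed to follow the standard BitOut coding interface.

import java.io.IOException;
import org.terrier.compression.BitOut;
import org.terrier.compression.BitOutputStream;

public class BitOutputStreamSketch {
  public static void main(String[] args) throws IOException {
    // "example.bf" is an illustrative file name, not one Terrier uses itself
    final BitOut out = new BitOutputStream("example.bf");
    // remember where this posting starts, as the merging code below does
    final long startByte = out.getByteOffset();
    final byte startBit = out.getBitOffset();
    out.writeGamma(42);  // an Elias gamma-coded integer (value is arbitrary)
    out.writeUnary(3);   // a unary-coded integer (value is arbitrary)
    System.out.println("posting started at byte " + startByte + ", bit " + startBit);
    out.close();         // flushes the last partially-filled byte to disk
  }
}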


   * @param size number of runs on disk.
   * @param fileName String with the file name of the final inverted file.
   * @throws Exception if an I/O error occurs while creating the output stream.
   */
  protected void init(int size, String fileName) throws Exception {
    this.init(size, new BitOutputStream(fileName));
  }


    int iteration = 0;
    try{
      Iterator<DocumentIndexEntry> diis =  (Iterator<DocumentIndexEntry>) index.getIndexStructureInputStream("document");
      final String offsetsFilename = index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "."+destinationStructure+".offsets";
      final DataOutputStream offsetsTmpFile = new DataOutputStream(Files.writeFileStream(offsetsFilename));
      final BitOut bos = new BitOutputStream(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "."+destinationStructure+ BitIn.USUAL_EXTENSION);
      do //for each pass of the inverted file
      {
        iteration++;
        //logger.info("Iteration "+iteration  + iterationSuffix);
        //get a copy of the inverted index
        final InvertedIndexInputStream iiis = (InvertedIndexInputStream) index.getIndexStructureInputStream(sourceStructure);
        //work out how many documents we can scan for
        lastDocid = firstDocid + scanDocumentIndexForTokens(processTokens, diis);
        //logger.info("Generating postings for documents with ids "+firstDocid + " to " + lastDocid);
        //get a set of posting objects to save the compressed postings for each of the documents to
        final Posting[] postings = getPostings(lastDocid - firstDocid + 1);
        //get postings for these documents
        numberOfTokensFound += traverseInvertedFile(iiis, firstDocid, lastDocid, postings);
        //logger.info("Writing the postings to disk");
        int id = firstDocid;
        for (Posting p : postings) //for each document
        {
          //logger.debug("Document " + id + " length=" + p.getDocF());
          id++;

          //get the offsets
          long endByte = bos.getByteOffset();
          byte endBit = bos.getBitOffset();
         
          //if the document is non-empty
          if (p.getDocF() > 0)
          {
            //obtain the compressed memory posting list
            final MemorySBOS Docs = p.getDocs();
            //some obscure problem when reading from memory rather than disk.
            //by padding the posting list with some non-zero bytes the problem
            //is solved. Thanks to Roicho for working this one out.
            Docs.writeGamma(1);
            Docs.writeGamma(1);
            Docs.pad();
         
            //use a PostingInRun to decompress the postings stored in memory
            final PostingInRun pir = getPostingReader();
            pir.setDf(p.getDocF());
            pir.setTF(p.getTF());
            pir.setPostingSource(new BitInputStream(new ByteArrayInputStream(
              Docs.getMOS().getBuffer())));
            //System.err.println("temp compressed buffer size="+Docs.getMOS().getPos() + " length="+Docs.getMOS().getBuffer().length);
            //decompress the memory postings and write out to the direct file
            pir.append(bos, -1);
          }

          //take note of the offset for this document in the direct file
          offsetsTmpFile.writeLong(endByte);
          offsetsTmpFile.writeByte(endBit);
          offsetsTmpFile.writeInt(p.getDocF());
        } //end for each document's postings
        firstDocid = lastDocid + 1;
      } while (firstDocid < index.getCollectionStatistics().getNumberOfDocuments() - 1);

      if (numberOfTokensFound != totalTokens)
      {
        logger.warn("Number of tokens found while scanning " + sourceStructure + " structure does not match expected. Expected "
            + index.getCollectionStatistics().getNumberOfTokens() + ", found " + numberOfTokensFound);
      }
      //logger.info("Finishing up: rewriting document index"); 
      offsetsTmpFile.close();
      //write the offsets to the DocumentIndex
      final DataInputStream dis = new DataInputStream(Files.openFileStream(offsetsFilename));
      final DocumentIndexBuilder dios = new DocumentIndexBuilder(index, "document-df");
      final Iterator<DocumentIndexEntry> docidInput = (Iterator<DocumentIndexEntry>)index.getIndexStructureInputStream("document");
     
      DocumentIndexEntry die = null;
      int docid = 0;
      while (docidInput.hasNext())
      {
        DocumentIndexEntry old = docidInput.next();
        if (fieldCount == 0)
        {
          die = new BasicDocumentIndexEntry(old);
        }
        else
        {
          die = old;
        }
        die.setOffset(dis.readLong(), dis.readByte());
        die.setNumberOfEntries(dis.readInt());
        dios.addEntryToBuffer(die);
        docid++;
      }
      IndexUtil.close(docidInput);
      bos.close();
      IndexUtil.close(diis);
      dis.close();
      Files.delete(offsetsFilename);
      dios.close();
      IndexUtil.renameIndexStructure(index, "document-df", "document");
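Note the record format of the temporary .offsets file in the fragment above: for each document it appends a long byte offset, a byte bit offset and an int document frequency, which are later re-read in exactly that order. A minimal sketch of reading such a file back, assuming a hypothetical path (the real name is derived from the index path, prefix and destination structure):

import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;

public class OffsetsFileSketch {
  public static void main(String[] args) throws IOException {
    // "data.direct.offsets" is a hypothetical path, for illustration only
    try (DataInputStream dis = new DataInputStream(new FileInputStream("data.direct.offsets"))) {
      while (dis.available() > 0) {
        final long endByte = dis.readLong(); // byte offset where the document's postings begin
        final byte endBit = dis.readByte();  // bit offset within that byte
        final int docF = dis.readInt();      // number of postings for the document
        System.out.printf("offset %d:%d, %d entries%n", endByte, endBit, docF);
      }
    }
  }
}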

   * @param fileName name of the file to write the posting lists data.
   * @param termsFile name of the file to write the terms.
   * @throws IOException if an I/O error occurs.
   */
  public RunWriter(String fileName, String termsFile) throws IOException {
    bos = new BitOutputStream(fileName);
    stringDos = new DataOutputStream(Files.writeFileStream(termsFile));
    this.info = "RunWriter(" + fileName + ")";
  }
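The constructor above pairs a bit-compressed stream for the posting data with a plain DataOutputStream for the term strings. A hypothetical instantiation (both file names are illustrative, not Terrier's actual naming scheme):

RunWriter writer = new RunWriter("run0.data", "run0.terms");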

  {
    this.index = i;
    this.structureName = _structureName;
   
    try{
      file = new BitOutputStream(index.getPath() + "/" + index.getPrefix() + "." + structureName + BitIn.USUAL_EXTENSION);
    } catch (IOException ioe) {
      logger.error("creating BitOutputStream for writing the inverted file : ", ioe);
    }
    lexiconOutputStream = LexiconOutputStream.class;
  }

          ? FieldPostingInRun.class
          : SimplePostingInRun.class),
        super.numFields);
    HadoopRunsMerger tempRM = new HadoopRunsMerger(runIteratorF);
    try{
      tempRM.setBos(new BitOutputStream(
          currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR
          + currentIndex.getPrefix() + ".inverted" + BitIn.USUAL_EXTENSION));
    } catch (IOException ioe) {
      ioe.printStackTrace();
    }

          ? BlockFieldPostingInRun.class
          : BlockPostingInRun.class),
        super.numFields);
    HadoopRunsMerger tempRM = new HadoopRunsMerger(runIteratorF);
    try{
      tempRM.setBos(new BitOutputStream(
          currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR
          + currentIndex.getPrefix() + ".inverted.bf"));
    } catch (IOException ioe) {
      ioe.printStackTrace();
    }
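The two HadoopRunsMerger fragments above differ only in the PostingInRun class they select and in the output file: the first merges plain (optionally field-aware) runs into an ".inverted" file named with BitIn.USUAL_EXTENSION, while the second handles block (position-aware) runs and writes to an ".inverted.bf" file whose extension is spelled out literally.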

    * for fields must also be specified.
    * @param filename Location of the file to write to
    * @throws IOException if an I/O error occurs creating the file
    */
  public DirectInvertedOutputStream(String filename) throws IOException
  {
    this.output = new BitOutputStream(filename);
  }
