Package org.sindice.siren.index.codecs.siren10

Source Code of org.sindice.siren.index.codecs.siren10.DocsFreqBlockIndexOutput

/**
* Copyright 2014 National University of Ireland, Galway.
*
* This file is part of the SIREn project. Project and contact information:
*
*  https://github.com/rdelbru/SIREn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*  http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.sindice.siren.index.codecs.siren10;

import java.io.IOException;

import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.sindice.siren.index.codecs.block.BlockCompressor;
import org.sindice.siren.index.codecs.block.BlockIndexOutput;
import org.sindice.siren.index.codecs.siren10.NodBlockIndexOutput.NodBlockWriter;
import org.sindice.siren.index.codecs.siren10.PosBlockIndexOutput.PosBlockWriter;

/**
* Implementation of the {@link BlockIndexOutput} for the .doc file of the SIREn
* postings format.
*/
public class DocsFreqBlockIndexOutput extends BlockIndexOutput {

  private final int maxBlockSize;

  private final BlockCompressor docCompressor;
  private final BlockCompressor freqCompressor;

  public DocsFreqBlockIndexOutput(final IndexOutput out, final int maxBlockSize,
                                  final BlockCompressor docCompressor,
                                  final BlockCompressor freqCompressor)
  throws IOException {
    super(out);
    this.docCompressor = docCompressor;
    this.freqCompressor = freqCompressor;
    this.maxBlockSize = maxBlockSize;
  }

  @Override
  public DocsFreqBlockWriter getBlockWriter() {
    return new DocsFreqBlockWriter();
  }

  /**
   * Implementation of the {@link BlockWriter} for the .doc file.
   *
   * <p>
   *
   * Encode and write blocks containing the document identifiers and the node
   * frequencies. It also encodes the pointers of the associated blocks from
   * the .nod and .pos files into the block header.
   *
   * <p>
   *
   * This class must be associated to the {@link Index}s of the
   * {@link NodBlockWriter} and {@link PosBlockWriter} using
   * {@link #setNodeBlockIndex(Index)} and {@link #setPosBlockIndex(Index)}.
   */
  public class DocsFreqBlockWriter extends BlockWriter {

    IntsRef docBuffer;
    IntsRef nodFreqBuffer;

    int firstDocId, lastDocId = 0;
    NodBlockIndexOutput.Index nodeBlockIndex;
    PosBlockIndexOutput.Index posBlockIndex;

    BytesRef docCompressedBuffer;
    BytesRef nodFreqCompressedBuffer;

    public DocsFreqBlockWriter() {
      // ensure that the input buffers has the minimum size required
      docBuffer = new IntsRef(this.getMinimumBufferSize(maxBlockSize, docCompressor.getWindowSize()));
      nodFreqBuffer = new IntsRef(this.getMinimumBufferSize(maxBlockSize, freqCompressor.getWindowSize()));

      // determine max size of compressed buffer to avoid overflow
      int size = docCompressor.maxCompressedSize(maxBlockSize);
      docCompressedBuffer = new BytesRef(size);

      size = freqCompressor.maxCompressedSize(maxBlockSize);
      nodFreqCompressedBuffer = new BytesRef(size);
    }

    public int getMaxBlockSize() {
      return maxBlockSize;
    }

    public int getFirstDocId() {
      return firstDocId;
    }

    /**
     * Set the {@link Index} of the {@link NodBlockIndexOutput}. The
     * {@link Index} is used to read the current file pointer of the
     * {@link NodBlockIndexOutput} when flushing a block.
     */
    public void setNodeBlockIndex(final NodBlockIndexOutput.Index index) throws IOException {
      this.nodeBlockIndex = index;
    }

    /**
     * Set the {@link Index} of the {@link PosBlockIndexOutput}. The
     * {@link Index} is used to read the current file pointer of the
     * {@link PosBlockIndexOutput} when flushing a block.
     */
    public void setPosBlockIndex(final PosBlockIndexOutput.Index index) throws IOException {
      this.posBlockIndex = index;
    }

    /**
     * Add a document identifier to the buffer.
     */
    public void write(final int docId) throws IOException {
      int delta;

      // compute delta - first value in the block is always 0
      if (docBuffer.offset != 0) {
        assert docId > lastDocId;
        // encode delta and decrement by one
        delta = docId - lastDocId - 1;
      }
      else {
        delta = 0;
        firstDocId = docId;
      }

      // copy delta to buffer
      docBuffer.ints[docBuffer.offset++] = delta;
      // update last doc id
      lastDocId = docId;
    }


    /**
     * Add a node frenquency to the buffer.
     */
    public void writeNodeFreq(final int nodeFreqInDoc) {
      // decrement freq by one
      nodFreqBuffer.ints[nodFreqBuffer.offset++] = nodeFreqInDoc - 1;
    }

    @Override
    public boolean isEmpty() {
      return docBuffer.offset == 0;
    }

    @Override
    public boolean isFull() {
      return docBuffer.offset >= maxBlockSize;
    }

    @Override
    protected void compress() {
      // Flip buffer before compression
      docBuffer.length = nodFreqBuffer.length = docBuffer.offset;
      docBuffer.offset = nodFreqBuffer.offset = 0;

      docCompressor.compress(docBuffer, docCompressedBuffer);
      freqCompressor.compress(nodFreqBuffer, nodFreqCompressedBuffer);
    }

    @Override
    protected void writeHeader() throws IOException {
      // logger.debug("Write DocFreq header - writer-id={}", this.hashCode());
      // logger.debug("DocFreq header start at fp={}", out.getFilePointer());

      // write block size (same for all of them)
      out.writeVInt(docBuffer.length);
      // logger.debug("blockSize: {}", docBuffer.length);

      // write size of each compressed data block
      out.writeVInt(docCompressedBuffer.length);
      // logger.debug("docCompressedBuffer.length: {}", docCompressedBuffer.length);
      out.writeVInt(nodFreqCompressedBuffer.length);
      // logger.debug("nodFreqCompressedBuffer.length: {}", nodFreqCompressedBuffer.length);

      // write first and last doc id
      out.writeVInt(firstDocId);
      out.writeVInt(lastDocId - firstDocId);
      // logger.debug("firstDocId: {}, lastDocId: {}", firstDocId, lastDocId);

      // write node and pos skip data
      // logger.debug("Write node and pos skip data");
      nodeBlockIndex.mark();
      nodeBlockIndex.write(out, true);
      posBlockIndex.mark();
      posBlockIndex.write(out, true);
    }

    @Override
    protected void writeData() throws IOException {
      out.writeBytes(docCompressedBuffer.bytes, docCompressedBuffer.length);
      out.writeBytes(nodFreqCompressedBuffer.bytes, nodFreqCompressedBuffer.length);
    }

    @Override
    protected void initBlock() {
      docBuffer.offset = 0;
      nodFreqBuffer.offset = 0;
    }

  }

}
TOP

Related Classes of org.sindice.siren.index.codecs.siren10.DocsFreqBlockIndexOutput

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.