
/**
* Copyright 2014 National University of Ireland, Galway.
*
* This file is part of the SIREn project. Project and contact information:
*
*  https://github.com/rdelbru/SIREn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*  http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.sindice.siren.index.codecs.siren10;

import java.io.IOException;

import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.sindice.siren.index.codecs.block.BlockCompressor;
import org.sindice.siren.index.codecs.block.BlockIndexOutput;
import org.sindice.siren.util.ArrayUtils;

/**
* Implementation of the {@link BlockIndexOutput} for the .nod file of the SIREn
* postings format.
*/
public class NodBlockIndexOutput extends BlockIndexOutput {

  private final int maxBlockSize;
  private final BlockCompressor nodCompressor;

  public NodBlockIndexOutput(final IndexOutput out, final int maxBlockSize,
                             final BlockCompressor nodCompressor)
  throws IOException {
    super(out);
    this.nodCompressor = nodCompressor;
    this.maxBlockSize = maxBlockSize;
  }

  @Override
  public NodBlockWriter getBlockWriter() {
    return new NodBlockWriter();
  }

  /**
   * Implementation of the {@link BlockWriter} for the .nod file.
   *
   * <p>
   *
   * Encodes and writes blocks containing the node labels and term frequencies.
   *
   * <p>
   *
   * TODO: Can we reduce the number of buffer-size checks by using the term
   * frequency information? At each new document, the NodBlockWriter would be
   * informed of the term frequency and could resize its buffers accordingly.
   */
  protected class NodBlockWriter extends BlockWriter {

    IntsRef nodLenBuffer;
    IntsRef nodBuffer;
    IntsRef termFreqBuffer;

    BytesRef nodLenCompressedBuffer;
    BytesRef nodCompressedBuffer;

    BytesRef termFreqCompressedBuffer;

    public NodBlockWriter() {
      // ensure that the input buffers have the minimum size required;
      // maxBlockSize is just used as a minimum initial capacity for the buffers
      nodLenBuffer = new IntsRef(this.getMinimumBufferSize(maxBlockSize, nodCompressor.getWindowSize()));
      nodBuffer = new IntsRef(this.getMinimumBufferSize(maxBlockSize, nodCompressor.getWindowSize()));
      termFreqBuffer = new IntsRef(this.getMinimumBufferSize(maxBlockSize, nodCompressor.getWindowSize()));

      // init of the compressed buffers
      nodLenCompressedBuffer = new BytesRef();
      nodCompressedBuffer = new BytesRef();
      termFreqCompressedBuffer = new BytesRef();
    }

    /**
     * Add a node label to the buffer.
     */
    public void write(final IntsRef node) {
      final int nodeOffset = node.offset;
      final int nodeLength = node.length;
      final int[] nodeInts = node.ints;

      assert nodeLength > 0;

      /*
       * write node
       */

      int[] nodBufferInts = nodBuffer.ints;
      final int nodBufferOffset = nodBuffer.offset;

      // increase buffers if needed
      if (nodBufferOffset + nodeLength >= nodBufferInts.length) {
        // Take the max to ensure that buffer will be large enough
        int newLength = Math.max(nodBufferOffset + nodeLength, nodBufferInts.length * 3/2);
        // ensure that the buffer is large enough to accommodate the window size
        newLength = this.getMinimumBufferSize(newLength, nodCompressor.getWindowSize());
        ArrayUtils.growAndCopy(nodBuffer, newLength);
        // update reference of the buffer's int array
        nodBufferInts = nodBuffer.ints;
      }

      // compute delta encoding and copy to buffer
      this.deltaEncodingAndCopy(nodeInts, nodeOffset, nodeLength, nodBufferInts, nodBufferOffset);
      // cache current node for next delta encoding
      this.cacheNode(nodeInts, nodeOffset, nodeLength);
      // increment node buffer offset with node length
      nodBuffer.offset += nodeLength;

      /*
       * write node length
       */

      int[] nodLenBufferInts = nodLenBuffer.ints;

      // increase node length buffer if needed
      if (nodLenBuffer.offset >= nodLenBufferInts.length) {
        // Take the max to ensure that buffer will be large enough
        int newLength = Math.max(nodLenBuffer.offset + 1, nodLenBufferInts.length * 3/2);
        // ensure that the buffer is large enough to accommodate the window size
        newLength = this.getMinimumBufferSize(newLength, nodCompressor.getWindowSize());
        ArrayUtils.growAndCopy(nodLenBuffer, newLength);
        // update reference of the buffer's int array
        nodLenBufferInts = nodLenBuffer.ints;
      }
      // store the node length decremented by one (nodeLength >= 1, see the
      // assertion above)
      nodLenBufferInts[nodLenBuffer.offset++] = nodeLength - 1;
    }
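
    /*
     * After a call to write(), the delta-encoded ids of the node occupy the
     * next nodeLength slots of nodBuffer, while nodLenBuffer records
     * nodeLength - 1; this is what allows a reader to split the flat
     * nodBuffer back into individual node labels.
     */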

    /**
     * Node cache used for computing the delta of a node label.
     */
    private IntsRef nodeCache = new IntsRef(DEFAULT_NODE_CACHE_SIZE);
    private static final int DEFAULT_NODE_CACHE_SIZE = 16;

    /**
     * Cache the given node
     */
    private final void cacheNode(final int[] nodeInts, final int nodeOffset, final int nodeLen) {
      int[] nodeCacheInts = nodeCache.ints;

      // ensure that the cache is large enough to accommodate the node array
      if (nodeLen > nodeCacheInts.length) {
        nodeCache = ArrayUtils.grow(nodeCache, nodeLen);
        // update reference of the cache's int array reference
        nodeCacheInts = nodeCache.ints;
      }

      System.arraycopy(nodeInts, nodeOffset, nodeCacheInts, 0, nodeLen);
      nodeCache.offset = 0;
      nodeCache.length = nodeLen;
    }

    /**
     * Compute the delta of the new node based on the cache node and copy
     * the delta encoding to the node buffer
     */
    private final void deltaEncodingAndCopy(final int[] nodeInts,
                                            final int nodeOffset,
                                            final int nodeLen,
                                            final int[] nodeBufferInts,
                                            final int nodeBufferOffset) {
      final int nodEnd = nodeOffset + nodeLen;
      final int[] nodeCacheInts = nodeCache.ints;
      final int nodeCacheOffset = nodeCache.offset;
      final int nodeCacheEnd = nodeCacheOffset + nodeCache.length;

      int i, j, k;

      // iterate over the node levels
      for (i = nodeCacheOffset, j = nodeOffset, k = 0;
           i < nodeCacheEnd && j < nodEnd && nodeCacheInts[i] <= nodeInts[j];
           i++, j++, k++) {

        // if the previous node id is less than the current node id, then we
        // must stop the delta encoding
        if (nodeCacheInts[i] < nodeInts[j]) {
          // compute delta
          nodeBufferInts[nodeBufferOffset + k] = nodeInts[j] - nodeCacheInts[i];
          // increment for preparing copy of remaining ids
          j += 1;
          k += 1;
          // stop iteration
          break;
        }

        // otherwise if equal, compute delta (== 0) and move to next level
        nodeBufferInts[nodeBufferOffset + k] = 0;
      }
      // copy the remaining ids
      for (; j < nodEnd; j++, k++) {
        nodeBufferInts[nodeBufferOffset + k] = nodeInts[j];
      }
    }
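
    /*
     * Worked example (illustrative, not part of the original source): with
     * the cached node [1, 3, 2] and a new node [1, 3, 5], the loop above
     * emits the deltas [0, 0, 3]. With a new node [1, 4, 1], the encoding
     * stops at the first level where the ids differ: the delta 4 - 3 = 1 is
     * written for that level, and the remaining ids are copied verbatim
     * since they are relative to a different parent node, giving [0, 1, 1].
     */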

    /**
     * Add the term frequency within the current node to the buffer
     */
    public void writeTermFreq(final int termFreq) {
      // check size of the buffer and increase it if needed
      if (termFreqBuffer.offset >= termFreqBuffer.ints.length) {
        // Take the max to ensure that buffer will be large enough
        int newLength = Math.max(termFreqBuffer.offset + 1, termFreqBuffer.ints.length * 3/2);
        // ensure that the buffer is large enough to accommodate the window size
        newLength = this.getMinimumBufferSize(newLength, nodCompressor.getWindowSize());
        ArrayUtils.growAndCopy(termFreqBuffer, newLength);
      }
      // store the term frequency decremented by one
      termFreqBuffer.ints[termFreqBuffer.offset++] = termFreq - 1;
    }
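
    /*
     * Note: storing freq - 1 here (and nodeLength - 1 in write()) keeps the
     * buffered values as small as possible, since both quantities are always
     * at least 1. Smaller values presumably let the block compressor encode
     * them with fewer bits; the corresponding reader has to add one back
     * when decoding.
     */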

    @Override
    public boolean isEmpty() {
      return nodBuffer.offset == 0;
    }

    @Override
    public boolean isFull() {
      // this implementation is never full, as it is synchronised with the
      // doc block and grows on demand
      return false;
    }

    @Override
    protected void writeHeader() throws IOException {
      // logger.debug("Write Nod header: {}", this.hashCode());
      // logger.debug("Nod header start at {}", out.getFilePointer());

      // write block sizes
      out.writeVInt(nodLenBuffer.length);
      out.writeVInt(nodBuffer.length);
      out.writeVInt(termFreqBuffer.length);
      assert nodLenBuffer.length <= nodBuffer.length;

      // write size of compressed data blocks
      out.writeVInt(nodLenCompressedBuffer.length);
      out.writeVInt(nodCompressedBuffer.length);
      out.writeVInt(termFreqCompressedBuffer.length);
    }
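
    /*
     * Resulting block layout (sketch, inferred from the writes above and in
     * writeData()): a header of six VInts, i.e. the uncompressed lengths of
     * the node-length, node and term-frequency blocks followed by the byte
     * sizes of their compressed counterparts, and then the three compressed
     * blocks in the same order.
     */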

    @Override
    protected void compress() {
      // Flip buffers before compression
      nodLenBuffer.length = nodLenBuffer.offset;
      nodLenBuffer.offset = 0;

      nodBuffer.length = nodBuffer.offset;
      nodBuffer.offset = 0;

      termFreqBuffer.length = termFreqBuffer.offset;
      termFreqBuffer.offset = 0;
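
      // Note: the buffers use their offset as a write cursor while ints are
      // appended; the flip above mirrors java.nio.Buffer#flip(), turning the
      // write position into the readable length and resetting the offset so
      // that the compressor consumes the ints in [0, length).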

      // determine max size of compressed buffer to avoid overflow
      int size = nodCompressor.maxCompressedSize(nodLenBuffer.length);
      nodLenCompressedBuffer = ArrayUtils.grow(nodLenCompressedBuffer, size);

      size = nodCompressor.maxCompressedSize(nodBuffer.length);
      nodCompressedBuffer = ArrayUtils.grow(nodCompressedBuffer, size);

      size = nodCompressor.maxCompressedSize(termFreqBuffer.length);
      termFreqCompressedBuffer = ArrayUtils.grow(termFreqCompressedBuffer, size);

      // compress
      nodCompressor.compress(nodLenBuffer, nodLenCompressedBuffer);
      nodCompressor.compress(nodBuffer, nodCompressedBuffer);
      nodCompressor.compress(termFreqBuffer, termFreqCompressedBuffer);
    }

    @Override
    protected void writeData() throws IOException {
      // logger.debug("Write Node data: {}", this.hashCode());
      // logger.debug("Write Node Length at {}", out.getFilePointer());
      out.writeBytes(nodLenCompressedBuffer.bytes, nodLenCompressedBuffer.length);
      // logger.debug("Write Node at {}", out.getFilePointer());
      out.writeBytes(nodCompressedBuffer.bytes, nodCompressedBuffer.length);
      // logger.debug("Write Term Freq in Node at {}", out.getFilePointer());
      out.writeBytes(termFreqCompressedBuffer.bytes, termFreqCompressedBuffer.length);
    }

    @Override
    protected void initBlock() {
      nodLenBuffer.offset = nodLenBuffer.length = 0;
      nodBuffer.offset = nodBuffer.length = 0;
      termFreqBuffer.offset = termFreqBuffer.length = 0;
      this.resetCurrentNode();
    }

    protected void resetCurrentNode() {
      nodeCache.offset = 0;
      nodeCache.length = 0;
    }

  }

}
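
/*
 * Usage sketch (illustrative; the exact flush protocol is defined by the
 * BlockIndexOutput/BlockWriter base classes): a postings writer obtains a
 * NodBlockWriter via getBlockWriter() and, for each document, appends one
 * node label per call to write(IntsRef) and one frequency per call to
 * writeTermFreq(int). When the doc block it is synchronised with is flushed,
 * the writer compresses its buffers (compress()), writes the block header
 * (writeHeader()) followed by the compressed data (writeData()), and
 * initBlock() prepares it for the next block.
 */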