// Package ivory.core.data.document
//
// Source code of ivory.core.data.document.LazyIntDocVector$Reader

/*
* Ivory: A Hadoop toolkit for web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package ivory.core.data.document;

import ivory.core.compression.BitInputStream;
import ivory.core.compression.BitOutputStream;
import ivory.core.data.dictionary.DefaultFrequencySortedDictionary;
import ivory.core.data.index.TermPositions;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;

import org.apache.hadoop.io.WritableUtils;

/**
* Implementation of {@link IntDocVector} that lazily decodes term and
* positional information on demand.
*
* @author Tamer Elsayed
* @author Jimmy Lin
*/
public class LazyIntDocVector implements IntDocVector {
  private SortedMap<Integer, int[]> termPositionsMap = null;
  private byte[] bytes = null;
  private int numTerms;

  private transient ByteArrayOutputStream bytesOut = null;
  private transient BitOutputStream bitsOut = null;

  public LazyIntDocVector() {}

  public LazyIntDocVector(SortedMap<Integer, int[]> termPositionsMap) {
    this.termPositionsMap = termPositionsMap;
  }

  public void setTermPositionsMap(SortedMap<Integer, int[]> termPositionsMap) {
    this.termPositionsMap = termPositionsMap;
  }

  @Override
  public void write(DataOutput out) throws IOException {
    if (bytes != null) {
      // This would happen if we're reading in an already-encoded
      // doc vector; if that's the case, simply write out the byte array
      writeRawBytes(out);
    } else if (termPositionsMap != null) {
      writeTermPositionsMap(out);
    } else {
      throw new RuntimeException("Unable to write LazyIntDocVector!");
    }
  }

  private void writeRawBytes(DataOutput out) {
    try {
      WritableUtils.writeVInt(out, bytes.length);
      out.write(bytes);
    } catch (IOException e) {
      throw new RuntimeException("Error writing LazyIntDocVector raw bytes");
    }
  }

  private void writeTermPositionsMap(DataOutput out) {
    try {
      numTerms = termPositionsMap.size();

      // Write # of terms.
      WritableUtils.writeVInt(out, numTerms);
      if (numTerms == 0)
        return;

      bytesOut = new ByteArrayOutputStream();
      bitsOut = new BitOutputStream(bytesOut);

      Iterator<Map.Entry<Integer, int[]>> it = termPositionsMap.entrySet().iterator();
      Map.Entry<Integer, int[]> posting = it.next();
      int[] positions = posting.getValue();
      TermPositions tp = new TermPositions();
      // Write out the first termid.
      int lastTerm = posting.getKey().intValue();
      bitsOut.writeBinary(32, lastTerm);
      // Write out the tf value.
      bitsOut.writeGamma((short) positions.length);
      tp.set(positions, (short) positions.length);
      // Write out the positions.
      writePositions(bitsOut, tp);

      int curTerm;
      while (it.hasNext()) {
        posting = it.next();
        curTerm = posting.getKey().intValue();
        positions = posting.getValue();
        int tgap = curTerm - lastTerm;
        if (tgap <= 0) {
          throw new RuntimeException("Error: encountered invalid t-gap. termid=" + curTerm);
        }
        // Write out the gap.
        bitsOut.writeGamma(tgap);
        tp.set(positions, (short) positions.length);
        // Write out the tf value.
        bitsOut.writeGamma((short) positions.length);
        // Write out the positions.
        writePositions(bitsOut, tp);
        lastTerm = curTerm;
      }

      bitsOut.padAndFlush();
      bitsOut.close();
      byte[] bytes = bytesOut.toByteArray();
      WritableUtils.writeVInt(out, bytes.length);
      out.write(bytes);
    } catch (IOException e) {
      throw new RuntimeException("Error writing LazyIntDocVector term positions map", e);
    } catch (ArithmeticException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    numTerms = WritableUtils.readVInt(in);
    if (numTerms == 0) {
      bytes = null;
      return;
    }
    bytes = new byte[WritableUtils.readVInt(in)];
    in.readFully(bytes);
  }

  // Passing in docno and tf basically for error checking purposes.
  protected static void writePositions(BitOutputStream t, TermPositions p) throws IOException {
    int[] pos = p.getPositions();

    if (p.getTf() == 1) {
      // If tf=1, just write out the single term position.
      t.writeGamma(pos[0]);
    } else {
      // If tf > 1, write out skip information if we want to bypass the
      // positional information during decoding.
      t.writeGamma(p.getEncodedSize());

      // Keep track of where we are in the stream.
      int skip1 = (int) t.getByteOffset() * 8 + t.getBitOffset();

      // Write out first position.
      t.writeGamma(pos[0]);
      // Write out rest of positions using p-gaps (first order positional differences).
      for (int c = 1; c < p.getTf(); c++) {
        int pgap = pos[c] - pos[c - 1];
        if (pos[c] <= 0 || pgap == 0) {
          throw new RuntimeException("Error: invalid term positions. positions=" + p.toString());
        }
        t.writeGamma(pgap);
      }

      // Find out where we are in the stream now.
      int skip2 = (int) t.getByteOffset() * 8 + t.getBitOffset();

      // Verify that the skip information is indeed valid.
      if (skip1 + p.getEncodedSize() != skip2) {
        throw new RuntimeException("Ivalid skip information: skip_pos1=" + skip1
            + ", skip_pos2=" + skip2 + ", size=" + p.getEncodedSize());
      }
    }
  }

  @Override
  public String toString() {
    StringBuffer s = new StringBuffer("[");
    try {
      Reader r = this.getReader();
      while (r.hasMoreTerms()) {
        int id = r.nextTerm();
        TermPositions pos = new TermPositions();
        r.getPositions(pos);
        s.append("(" + id + ", " + pos.getTf() + ", " + pos + ")");
      }
      s.append("]");
    } catch (Exception e) {
      e.printStackTrace();
    }
    return s.toString();
  }

  public String toStringWithTerms(DefaultFrequencySortedDictionary map) {
    StringBuffer s = new StringBuffer("");
    try {
      Reader r = this.getReader();
      while (r.hasMoreTerms()) {
        int id = r.nextTerm();
        TermPositions pos = new TermPositions();
        r.getPositions(pos);
        s.append(String.format("(%d, %d, %s)", map.getTerm(id), pos.getTf(), pos));
      }
      s.append("]");
    } catch (Exception e) {
      e.printStackTrace();
    }
    return s.toString();
  }

  @Override
  public Reader getReader() throws IOException {
    return new Reader(bytes, numTerms);
  }

  public static class Reader implements IntDocVector.Reader {
    private ByteArrayInputStream bytesIn;
    private BitInputStream bitsIn;
    private int p = -1;
    private int prevTermID = -1;
    private short prevTf = -1;
    private int termCnt;
    private boolean needToReadPositions = false;

    public Reader(byte[] bytes, int n) throws IOException {
      this.termCnt = n;
      if (termCnt > 0) {
        bytesIn = new ByteArrayInputStream(bytes);
        bitsIn = new BitInputStream(bytesIn);
      }
    }

    @Override
    public int getNumberOfTerms() {
      return termCnt;
    }

    @Override
    public short getTf() {
      return prevTf;
    }

    @Override
    public void reset() {
      try {
        bytesIn.reset();
        bitsIn = new BitInputStream(bytesIn);
        p = -1;
        prevTf = -1;
        needToReadPositions = false;
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    @Override
    public int nextTerm() {
      int id = -1;
      try {
        p++;
        if (needToReadPositions) {
          skipPositions(prevTf);
        }
        needToReadPositions = true;
        if (p == 0) {
          prevTermID = bitsIn.readBinary(32);
          prevTf = (short) bitsIn.readGamma();
          return prevTermID;
        } else {
          if (p > termCnt - 1) {
            return -1;
          }
          id = bitsIn.readGamma() + prevTermID;
          prevTermID = id;
          prevTf = (short) bitsIn.readGamma();
          return id;
        }
      } catch (IOException e) {
        e.printStackTrace();
        throw new RuntimeException();
      }
    }

    @Override
    public int[] getPositions() {
      int[] pos = null;
      try {
        if (prevTf == 1) {
          pos = new int[1];
          pos[0] = bitsIn.readGamma();
        } else {
          bitsIn.readGamma();
          pos = new int[prevTf];
          pos[0] = bitsIn.readGamma();
          for (int i = 1; i < prevTf; i++) {
            pos[i] = (pos[i - 1] + bitsIn.readGamma());
          }
        }
      } catch (IOException e) {
        throw new RuntimeException("Error reading bits:", e);
      }
      needToReadPositions = false;

      return pos;
    }

    @Override
    public boolean getPositions(TermPositions tp) {
      int[] pos = getPositions();

      if (pos == null) {
        return false;
      }

      tp.set(pos, (short) pos.length);

      return true;
    }

    @Override
    public boolean hasMoreTerms() {
      return !(p >= termCnt - 1);
    }

    private void skipPositions(int tf) throws IOException {
      if (tf == 1) {
        bitsIn.readGamma();
      } else {
        bitsIn.skipBits(bitsIn.readGamma());
      }
    }
  }
}
// TOP
//
// Related Classes of ivory.core.data.document.LazyIntDocVector$Reader
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.