Package ivory.core.data.document

Source Code of ivory.core.data.document.LazyTermDocVector$Reader

/*
* Ivory: A Hadoop toolkit for web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package ivory.core.data.document;

import ivory.core.compression.BitInputStream;
import ivory.core.compression.BitOutputStream;
import ivory.core.data.index.TermPositions;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Map;

import org.apache.hadoop.io.WritableUtils;

import edu.umd.cloud9.util.array.ArrayListOfInts;

/**
* Implementation of {@link TermDocVector} that lazily decodes term and
* positional information on demand.
*
* @author Tamer Elsayed
* @author Jimmy Lin
*/
public class LazyTermDocVector implements TermDocVector {
  private Map<String, ArrayListOfInts> termPositionsMap = null;
  private byte[] rawBytes = null;
  private String[] terms = null;
  private int numTerms;
  private static boolean read = false;

  transient private ByteArrayOutputStream bytesOut = null;
  transient private BitOutputStream bitsOut = null;

  public LazyTermDocVector() {}

  public LazyTermDocVector(Map<String, ArrayListOfInts> termPositionsMap) {
    this.termPositionsMap = termPositionsMap;
    read = false;
  }

  public void setTermPositionsMap(Map<String, ArrayListOfInts> termPositionsMap) {
    this.termPositionsMap = termPositionsMap;
    read = false;
  }

  @Override
  public void write(DataOutput out) throws IOException {
    if (!read) {
      numTerms = termPositionsMap.size();
      // write # of terms
      WritableUtils.writeVInt(out, numTerms);
      if (numTerms == 0)
        return;

      try {
        bytesOut = new ByteArrayOutputStream();
        bitsOut = new BitOutputStream(bytesOut);

        ArrayListOfInts positions;
        TermPositions tp = new TermPositions();
        String term;

        for (Map.Entry<String, ArrayListOfInts> posting : termPositionsMap.entrySet()) {
          term = posting.getKey();
          positions = posting.getValue();
          tp.set(positions.getArray(), (short) positions.size());

          // Write the term.
          out.writeUTF(term);
          // Write out the tf value.
          bitsOut.writeGamma((short) positions.size());
          // Write out the positions.
          LazyIntDocVector.writePositions(bitsOut, tp);
        }
        bitsOut.padAndFlush();
        bitsOut.close();
        byte[] bytes = bytesOut.toByteArray();
        WritableUtils.writeVInt(out, bytes.length);
        out.write(bytes);
      } catch (IOException e) {
        e.printStackTrace();
        throw new RuntimeException("Error adding postings.");
      } catch (ArithmeticException e) {
        e.printStackTrace();
        throw new RuntimeException(e.getMessage());
      }

    } else {
      WritableUtils.writeVInt(out, numTerms);
      if (numTerms == 0)
        return;

      for (int i = 0; i < numTerms; i++)
        out.writeUTF(terms[i]);

      WritableUtils.writeVInt(out, rawBytes.length);
      out.write(rawBytes);
    }
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    read = true;
    numTerms = WritableUtils.readVInt(in);
    if (numTerms == 0) {
      rawBytes = null;
      terms = null;
      return;
    }
    terms = new String[numTerms];
    for (int i = 0; i < numTerms; i++) {
      terms[i] = in.readUTF();
    }
    rawBytes = new byte[WritableUtils.readVInt(in)];
    in.readFully(rawBytes);
  }

  @Override
  public String toString() {
    StringBuffer s = new StringBuffer(this.getClass().getName() + "," + numTerms + ","
        + rawBytes + "," + terms + "\n" + "[");
    try {
      Reader r = this.getReader();
      while (r.hasMoreTerms()) {
        String id = r.nextTerm();
        TermPositions pos = new TermPositions();
        r.getPositions(pos);
        s.append("(" + id + ", " + pos.getTf() + ", " + pos + ")");
      }
      s.append("]");
    } catch (Exception e) {
      e.printStackTrace();
    }
    return s.toString();
  }

  @Override
  public Reader getReader() throws IOException {
    return new Reader(numTerms, rawBytes, terms);
  }

  public static class Reader implements TermDocVector.Reader {
    private ByteArrayInputStream bytesIn;
    private BitInputStream bitsIn;
    private int p = -1;
    private short prevTf = -1;
    private int n;
    private boolean needToReadPositions = false;
    private String[] innerTerms = null;

    public Reader(int nTerms, byte[] bytes, String[] terms) throws IOException {
      this.n = nTerms;
      if (nTerms > 0) {
        bytesIn = new ByteArrayInputStream(bytes);
        bitsIn = new BitInputStream(bytesIn);
        innerTerms = terms;
      }
    }

    @Override
    public int getNumberOfTerms() {
      return n;
    }

    @Override
    public short getTf() {
      return prevTf;
    }

    @Override
    public void reset() {
      try {
        bytesIn.reset();
        bitsIn = new BitInputStream(bytesIn);
        p = -1;
        prevTf = -1;
        needToReadPositions = false;
      } catch (IOException e) {
        e.printStackTrace();
        throw new RuntimeException("Error resetting postings.");
      }
    }

    @Override
    public String nextTerm() {
      try {
        p++;
        if (needToReadPositions) {
          skipPositions(prevTf);
        }
        needToReadPositions = true;
        if (p <= n - 1) {
          prevTf = (short) bitsIn.readGamma();
          return innerTerms[p];
        } else {
          return null;
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    @Override
    public int[] getPositions() {
      int[] pos = null;
      try {
        if (prevTf == 1) {
          pos = new int[1];
          pos[0] = bitsIn.readGamma();
        } else {
          bitsIn.readGamma();
          pos = new int[prevTf];
          pos[0] = bitsIn.readGamma();
          for (int i = 1; i < prevTf; i++) {
            pos[i] = (pos[i - 1] + bitsIn.readGamma());
          }
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      needToReadPositions = false;

      return pos;
    }

    @Override
    public boolean getPositions(TermPositions tp) {
      int[] pos = getPositions();

      if (pos == null) {
        return false;
      }

      tp.set(pos, (short) pos.length);
      return true;
    }

    @Override
    public boolean hasMoreTerms() {
      return !(p >= n - 1);
    }

    private void skipPositions(int tf) throws IOException {
      if (tf == 1) {
        bitsIn.readGamma();
      } else {
        bitsIn.skipBits(bitsIn.readGamma());
      }
    }
  }
}
TOP

Related Classes of ivory.core.data.document.LazyTermDocVector$Reader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.