Package cc.mrlda

Source Code of cc.mrlda.Document

package cc.mrlda;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.Serializable;
import java.util.Iterator;

import org.apache.hadoop.io.Writable;

import edu.umd.cloud9.util.map.HMapII;
import edu.umd.cloud9.util.map.MapII;

public class Document implements Writable, Cloneable, Serializable {
  /**
   *
   */
  private static final long serialVersionUID = 752244298258266755L;

  /**
   *
   */
  private HMapII content = null;

  /**
   * @deprecated
   */
  private double[] gamma = null;

  /**
   * Define the total number of words in this document, not necessarily distinct.
   */
  private int numberOfTokens = 0;

  /**
   * Creates a <code>LDADocument</code> object from a byte array.
   *
   * @param bytes raw serialized representation
   * @return a newly-created <code>LDADocument</code> object
   * @throws IOException
   */
  public static Document create(byte[] bytes) throws IOException {
    return create(new DataInputStream(new ByteArrayInputStream(bytes)));
  }

  /**
   * Creates a <code>LDADocument</code> object from a <code>DataInput</code>.
   *
   * @param in source for reading the serialized representation
   * @return a newly-created <code>LDADocument</code> object
   * @throws IOException
   */
  public static Document create(DataInput in) throws IOException {
    Document m = new Document();
    m.readFields(in);

    return m;
  }

  public Document() {
  }

  public Document(HMapII document) {
    this.content = document;
    if (document != null) {
      Iterator<Integer> itr = this.content.values().iterator();
      while (itr.hasNext()) {
        numberOfTokens += itr.next();
      }
    }
  }

  /**
   * @deprecated
   * @param document
   * @param gamma
   */
  public Document(HMapII document, double[] gamma) {
    this(document);
    this.gamma = gamma;
  }

  /**
   * @deprecated
   * @param document
   * @param numberOfTopics
   */
  public Document(HMapII document, int numberOfTopics) {
    this(document, new double[numberOfTopics]);
  }

  public HMapII getContent() {
    return this.content;
  }

  /**
   * @deprecated
   * @return
   */
  public double[] getGamma() {
    return gamma;
  }

  /**
   * @deprecated
   * @return
   */
  public int getNumberOfTopics() {
    if (gamma == null) {
      return 0;
    } else {
      return gamma.length;
    }
  }

  /**
   * Get the total number of distinct types in this document.
   *
   * @return the total number of unique types in this document.
   */
  public int getNumberOfTypes() {
    if (content == null) {
      return 0;
    } else {
      return content.size();
    }
  }

  /**
   * Get the total number of words in this document, not necessarily distinct.
   *
   * @return the total number of words in this document, not necessarily distinct.
   */
  public int getNumberOfTokens() {
    return numberOfTokens;
  }

  /**
   * Deserializes the LDADocument.
   *
   * @param in source for raw byte representation
   */
  public void readFields(DataInput in) throws IOException {
    numberOfTokens = 0;

    int numEntries = in.readInt();
    if (numEntries <= 0) {
      content = null;
    } else {
      content = new HMapII();
      for (int i = 0; i < numEntries; i++) {
        int id = in.readInt();
        int count = in.readInt();
        content.put(id, count);
        numberOfTokens += count;
      }
    }

    int numTopics = in.readInt();
    if (numTopics <= 0) {
      gamma = null;
    } else {
      gamma = new double[numTopics];
      for (int i = 0; i < numTopics; i++) {
        gamma[i] = in.readDouble();
      }
    }
  }

  /**
   * Returns the serialized representation of this object as a byte array.
   *
   * @return byte array representing the serialized representation of this object
   * @throws IOException
   */
  public byte[] serialize() throws IOException {
    ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
    DataOutputStream dataOut = new DataOutputStream(bytesOut);
    write(dataOut);

    return bytesOut.toByteArray();
  }

  public void setDocument(HMapII document) {
    this.content = document;
    numberOfTokens = 0;

    if (document != null) {
      Iterator<Integer> itr = this.content.values().iterator();
      while (itr.hasNext()) {
        numberOfTokens += itr.next();
      }
    }
  }

  /**
   * @deprecated
   * @param gamma
   */
  public void setGamma(double[] gamma) {
    this.gamma = gamma;
  }

  @Override
  public String toString() {
    StringBuilder document = new StringBuilder("content:\t");
    if (content == null) {
      document.append("null");
    } else {
      Iterator<Integer> itr = this.content.keySet().iterator();
      while (itr.hasNext()) {
        int id = itr.next();
        document.append(id);
        document.append(":");
        document.append(content.get(id));
        document.append(" ");
      }
    }
    document.append("\ngamma:\t");
    if (gamma == null) {
      document.append("null");
    } else {
      for (double value : gamma) {
        document.append(value);
        document.append(" ");
      }
    }

    return document.toString();
  }

  /**
   * Serializes the map.
   *
   * @param out where to write the raw byte representation
   */
  public void write(DataOutput out) throws IOException {
    // Write out the number of entries in the map.
    if (content == null) {
      out.writeInt(0);
    } else {
      out.writeInt(content.size());
      for (MapII.Entry e : content.entrySet()) {
        out.writeInt(e.getKey());
        out.writeInt(e.getValue());
      }
    }

    // Write out the gamma values for this document.
    if (gamma == null) {
      out.writeInt(0);
    } else {
      out.writeInt(gamma.length);
      for (double value : gamma) {
        // TODO: change it to double and also in read method
        out.writeDouble(value);
      }
    }
  }
}
TOP

Related Classes of cc.mrlda.Document

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.