Package cc.mrlda.polylda

Source Code of cc.mrlda.polylda.Document

package cc.mrlda.polylda;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.Serializable;
import java.util.Iterator;

import org.apache.hadoop.io.Writable;

import edu.umd.cloud9.util.map.HMapII;
import edu.umd.cloud9.util.map.MapII;

public class Document implements Writable, Cloneable, Serializable {
  /**
   *
   */
  private HMapII[] content = null;

  /**
   * @deprecated
   */
  private double[] gamma = null;

  /**
   * Define the total number of words in this document for every language, not necessarily distinct.
   */
  private int numberOfWords[] = null;

  /**
   * Define the total number of words in this document aggregated over all languages.
   */
  private int totalNumberOfWords = 0;

  public Document() {
  }

  public Document(HMapII[] document, double[] gamma) {
    this(document);
    this.gamma = gamma;
  }

  public Document(HMapII[] document, int numberOfTopics) {
    this(document, new double[numberOfTopics]);
  }

  public Document(HMapII[] document) {
    this.content = document;
    if (document != null) {
      this.numberOfWords = new int[document.length];

      for (int i = 0; i < this.content.length; i++) {
        if (this.content[i] != null) {
          Iterator<Integer> itr = this.content[i].values().iterator();
          while (itr.hasNext()) {
            numberOfWords[i] += itr.next();
          }
        }
        totalNumberOfWords += numberOfWords[i];
      }
    }
  }

  /**
   * Get the total number of distinct types in this document for this language.
   *
   * @return the total number of unique types in this document for this language.
   */
  public int getNumberOfTypes(int languageIndex) throws ArrayIndexOutOfBoundsException {
    if (content == null || content[languageIndex] == null) {
      return 0;
    } else {
      return content[languageIndex].size();
    }
  }

  public int getNumberOfLanguages() {
    if (content == null) {
      return 0;
    }
    return content.length;
  }

  public int[] getNumberOfTypes() {
    if (content == null) {
      return null;
    }

    int[] numberOfTypes = new int[content.length];
    for (int i = 0; i < content.length; i++) {
      if (content[i] == null) {
        numberOfTypes[i] = 0;
      } else {
        numberOfTypes[i] = content[i].size();
      }
    }

    return numberOfTypes;
  }

  /**
   * Get the total number of words in this document, not necessarily distinct, for all languages.
   *
   * @return the total number of words in this document, not necessarily distinct, for all
   *         languages.
   */
  public int[] getNumberOfWords() {
    return numberOfWords;
  }

  public int getTotalNumberOfWords() {
    return totalNumberOfWords;
  }

  public int getNumberOfWords(int languageIndex) throws ArrayIndexOutOfBoundsException {
    if (numberOfWords == null) {
      return 0;
    }
    return numberOfWords[languageIndex];
  }

  public HMapII[] getContent() {
    return this.content;
  }

  public HMapII getContent(int languageIndex) {
    if (this.content == null) {
      return null;
    }
    return this.content[languageIndex];
  }

  /**
   * @deprecated
   */
  public void resetGamma() {
    this.gamma = null;
  }

  /**
   * @deprecated
   * @return
   */
  public int getNumberOfTopics() {
    if (gamma == null) {
      return 0;
    } else {
      return gamma.length;
    }
  }

  /**
   * @deprecated
   * @return
   */
  public double[] getGamma() {
    return gamma;
  }

  /**
   * @deprecated
   */
  public void setGamma(double[] gamma) {
    this.gamma = gamma;
  }

  /**
   * Deserializes the LDADocument.
   *
   * @param in source for raw byte representation
   */
  public void readFields(DataInput in) throws IOException {
    int numLanguages = in.readInt();
    if (numLanguages <= 0) {
      content = null;
      numberOfWords = null;
      totalNumberOfWords = 0;
    } else {
      numberOfWords = new int[numLanguages];
      content = new HMapII[numLanguages];
      totalNumberOfWords = 0;

      for (int i = 0; i < numLanguages; i++) {
        int numEntries = in.readInt();

        if (numEntries <= 0) {
          content[i] = null;
        } else {
          content[i] = new HMapII();
          for (int j = 0; j < numEntries; j++) {
            int id = in.readInt();
            int count = in.readInt();
            content[i].put(id, count);
            numberOfWords[i] += count;
            totalNumberOfWords += count;
          }
        }
      }
    }

    int numTopics = in.readInt();
    if (numTopics <= 0) {
      gamma = null;
    } else {
      gamma = new double[numTopics];
      for (int i = 0; i < numTopics; i++) {
        gamma[i] = in.readDouble();
      }
    }
  }

  /**
   * Returns the serialized representation of this object as a byte array.
   *
   * @return byte array representing the serialized representation of this object
   * @throws IOException
   */
  public byte[] serialize() throws IOException {
    ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
    DataOutputStream dataOut = new DataOutputStream(bytesOut);
    write(dataOut);

    return bytesOut.toByteArray();
  }

  public void setDocument(HMapII[] document) {
    this.content = document;
    if (this.content == null) {
      this.numberOfWords = null;
      this.totalNumberOfWords = 0;
      return;
    }

    this.numberOfWords = new int[content.length];
    this.totalNumberOfWords = 0;
    Iterator<Integer> itr = null;
    for (int i = 0; i < numberOfWords.length; i++) {
      if (this.content[i] == null) {
        numberOfWords[i] = 0;
      } else {
        itr = this.content[i].values().iterator();
        while (itr.hasNext()) {
          numberOfWords[i] += itr.next();
        }
      }
      this.totalNumberOfWords += numberOfWords[i];
    }
  }

  /**
   * Serializes the map.
   *
   * @param out where to write the raw byte representation
   */
  public void write(DataOutput out) throws IOException {
    // Write out the number of entries in the map.
    if (content == null) {
      out.writeInt(0);
    } else {
      out.writeInt(content.length);
      for (HMapII hmapii : content) {
        if (hmapii == null) {
          out.writeInt(0);
        } else {
          out.writeInt(hmapii.size());
          for (MapII.Entry e : hmapii.entrySet()) {
            out.writeInt(e.getKey());
            out.writeInt(e.getValue());
          }
        }
      }
    }

    // Write out the gamma values for this document.
    if (gamma == null) {
      out.writeInt(0);
    } else {
      out.writeInt(gamma.length);
      for (double value : gamma) {
        out.writeDouble(value);
      }
    }
  }

  /**
   * Creates a <code>LDADocument</code> object from a byte array.
   *
   * @param bytes raw serialized representation
   * @return a newly-created <code>LDADocument</code> object
   * @throws IOException
   */
  public static Document create(byte[] bytes) throws IOException {
    return create(new DataInputStream(new ByteArrayInputStream(bytes)));
  }

  /**
   * Creates a <code>LDADocument</code> object from a <code>DataInput</code>.
   *
   * @param in source for reading the serialized representation
   * @return a newly-created <code>LDADocument</code> object
   * @throws IOException
   */
  public static Document create(DataInput in) throws IOException {
    Document m = new Document();
    m.readFields(in);

    return m;
  }

  @Override
  public String toString() {
    StringBuilder document = new StringBuilder("content:\n");
    if (content == null) {
      document.append("null");
    } else {
      for (int i = 0; i < content.length; i++) {
        document.append("language " + i + "\t");

        if (this.content[i] == null) {
          document.append("null\n");
        } else {
          Iterator<Integer> itr = this.content[i].keySet().iterator();
          while (itr.hasNext()) {
            int id = itr.next();
            document.append(id);
            document.append(":");
            document.append(content[i].get(id));
            document.append(" ");
          }
          document.append("\n");
        }
      }
    }

    document.append("gamma:\t");
    if (gamma == null) {
      document.append("null");
    } else {
      for (double value : gamma) {
        document.append(value);
        document.append(" ");
      }
    }

    return document.toString();
  }
}
TOP

Related Classes of cc.mrlda.polylda.Document

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.