Package edu.umd.hooka.alignment

Source Code of edu.umd.hooka.alignment.IndexedFloatArray

package edu.umd.hooka.alignment;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.PriorityQueue;

import org.apache.hadoop.io.Writable;

import edu.umd.cloud9.io.pair.PairOfFloatInt;
import edu.umd.cloud9.util.array.ArrayListOfInts;


/*
* Represents a sparse float array.  That is, some indices don't exist.
*
* TODO: performance enhancement, when _data.length > |V|/2 it becomes both more
* memory efficient and run-time efficient to store a non-sparse array.
*/
public final class IndexedFloatArray implements Writable, Cloneable {
 
  /**
   * If the sparse array exceeds this threshold, make the array non-sparse.
   */
  public static final float NO_BINSEARCH_THRESHOLD = 0.90f;
 
  /**
   * Don't make arrays sparse unless they exceed this length.
   */
  public static final int MIN_LENGTH_FOR_NONSPARSE_ARRAY = 5;
 
  public float[] _data;
  public int[] _indices;
  public boolean _useBinSearch;

  public void readFields(DataInput in) throws IOException {
    int bbLen = in.readInt();
    if (bbLen == 0) { _data = null; _indices = null; return; }
    ByteBuffer bb=ByteBuffer.allocate(bbLen);
    _useBinSearch = in.readBoolean();
    if (_useBinSearch) {
      in.readFully(bb.array());
      _indices = new int[bbLen/4];
      IntBuffer ib = bb.asIntBuffer();
      ib.get(_indices);
      bb=ByteBuffer.allocate(bbLen);
    }
    in.readFully(bb.array());
    FloatBuffer fb = bb.asFloatBuffer();
    _data = new float[bbLen/4];
    fb.get(_data);
  }

  public void write(DataOutput out) throws IOException {
    if (_data == null) {
      out.writeInt(0);
    } else {
      int bbLen = _data.length * 4;
      out.writeInt(bbLen);
      out.writeBoolean(_useBinSearch);
      ByteBuffer bb=ByteBuffer.allocate(bbLen);
      if (_useBinSearch) {
        IntBuffer ib = bb.asIntBuffer();
        ib.put(_indices);
        out.write(bb.array());
        bb=ByteBuffer.allocate(bbLen);
      }
      FloatBuffer fb = bb.asFloatBuffer();
      fb.put(_data);
      out.write(bb.array());
    }
  }
 
  public Object clone() {
    IndexedFloatArray res = new IndexedFloatArray();
    if (_data == null) { return res; }
    res._data = _data.clone();
    res._useBinSearch = _useBinSearch;
    if (_useBinSearch)
      res._indices = _indices.clone();
    return res;
  }
 
  public int maxKey() {
    if (_useBinSearch)
      return _indices[_indices.length - 1];
    else
      return _data.length - 1;
  }
 
  private void optimizeMemory(float[] data, int max) {
    if (_useBinSearch) return;
    int nzc = 0;
    for (int c = 0; c < max; c++)
      if (data[c] != 0.0f) nzc++;
    if (nzc == 0) {
      _data = null; _indices = null;
      return;
    }
    float[] nd = new float[nzc];
    int[]   ni = new int[nzc];
    int ci = 0;
    for (int c = 0; c < max; c++) {
      float v = data[c];
      if (v != 0.0f) {
        nd[ci] = v;
        ni[ci] = c;
        ci++;
      }
    }
    _data = nd;
    _indices = ni;
    _useBinSearch = true;
  }

  /**
   * If sparse array meets the load criteria, optimize it so that it no longer uses
   * a bin search.
   */
  public void optimizeSpeed() {
    if (_indices == null || _indices.length < MIN_LENGTH_FOR_NONSPARSE_ARRAY) return;
    int maxIndex = _indices[_indices.length - 1];
    float load = ((float)_data.length)/((float)maxIndex);
    if (load > NO_BINSEARCH_THRESHOLD) {
      System.err.println("Optimizing IFA: len=" + _indices.length + ", load="
          + load +", newMax=" + maxIndex);
      float[] nd = new float[maxIndex+1];
      for (int i = 0; i < _indices.length; i++)
        nd[_indices[i]] = _data[i];
      _data = nd;
      _indices = null;
      _useBinSearch = false;
    }
  }
 
  public void copyTo(float[] dest, int destPos) {
    System.arraycopy(_data, 0, dest, destPos, _data.length);
  }
 
  public void copyFrom(IndexedFloatArray rhs) {
    System.arraycopy(rhs._data, 0, _data, 0, _data.length);
  }
  public void addTo(float[] dest) {
    if (_useBinSearch) {
      for (int i = 0; i < _data.length; i++)
        dest[_indices[i]] += _data[i];
    } else {
      for (int i = 0; i < _data.length; i++)
        dest[i] += _data[i];
    }
  }

  public IndexedFloatArray() {}
  public IndexedFloatArray(int[] indices, float[] values) {
    _indices = indices;
    _data = values;
    _useBinSearch = true;
    optimizeSpeed();
  }
  public IndexedFloatArray(int[] indices, float[] values, boolean isOptimize) {
    _indices = indices;
    _data = values;
    _useBinSearch = true;
    if(isOptimize)
      optimizeSpeed();
  }
  public IndexedFloatArray(float[] values, int size) {
    _useBinSearch = false;
    int nzc = 0;
    for (int i=0; i<values.length; i++)
      if (values[i] != 0.0f) nzc++;
    if (nzc == 0) { _data = null; _indices = null; return; }
    float load = ((float)nzc)/((float)size);
    if (size < MIN_LENGTH_FOR_NONSPARSE_ARRAY ||
        load <= NO_BINSEARCH_THRESHOLD) {
      optimizeMemory(values, size);
    } else {
      _indices = null;
      _data = new float[size];
      System.arraycopy(values, 0, _data, 0, size);
    }
  }
  public IndexedFloatArray(int[] indices) {
    _indices = indices.clone();
    _data = new float[_indices.length];
    _useBinSearch = true;
  }
  // TODO: in this case, make this a single lookup type data structure,
  // ie, skip the bin search. Normally would use polymorphism for this,
  // but hadoop's SequenceFiles don't like that kind of polymorphism
  public IndexedFloatArray(int n) {
    _indices = null;
    _useBinSearch = false;
    _data = new float[n];
  }

  final int binSearch(int n) {
    if (!_useBinSearch) return n;
    int min = 0;
    int max = _indices.length - 1;
    while (min <= max) {
      int mid = (min + max) / 2;
      if (_indices[mid] > n)
        max = mid - 1;
      else if (_indices[mid] < n)
        min = mid + 1;
      else
        return mid;
    }
    throw new RuntimeException("IFA: Couldn't find " + n);
  }

  public int size() {
    if (_data != null) return _data.length;
    else return 0;
  }

 
  public int getWord(int loc){
    return _indices[loc];
  }
 
  public float getProb(int loc){
    return _data[loc];
  }
 
  //Ferhan: i don't know what this is doing. the behavior tends to be dependent on _useBinSearch value
  public final float get(int n) {
    if (_data == null) return 0.0f;
    if (!_useBinSearch) if (n >= _data.length) return 0.0f; else return _data[n];
    int min = 0;
    int max = _indices.length - 1;
    while (min <= max) {
      int mid = (min + max) / 2;
      if (_indices[mid] > n)
        max = mid - 1;
      else if (_indices[mid] < n)
        min = mid + 1;
      else
        return _data[mid];
    }
    return 0.0f;
  }
 
  public final float getLazy(int n) {
    if (_data == null) return 0.0f;
    for(int i=0; i<_indices.length; i++){
      if(_indices[i] == n){
        return _data[i];
      }
    }
    return 0.0f;
  }
 
  public int[] getTranslations(float probThreshold){
    ArrayListOfInts words = new ArrayListOfInts();
    if (_useBinSearch) {
      for (int i=0; i < _data.length; i++) {
        if (_data[i] > probThreshold) {
          words.add(_indices[i]);
        }
      }
    }else{
      for (int i=0; i < _data.length; i++) {
        if (_data[i] > probThreshold) {
          words.add(i);
        }
      }
    }
    words.trimToSize();
    return words.getArray();
  }
 
  public PriorityQueue<PairOfFloatInt> getTranslationsWithProbs(float probThreshold){
    PriorityQueue<PairOfFloatInt> q = new PriorityQueue<PairOfFloatInt>(_data.length, Collections.reverseOrder());
    if (_useBinSearch) {
      for (int i=0; i < _data.length; i++) {
        if (_data[i] > probThreshold) {
          q.add(new PairOfFloatInt(_data[i],_indices[i]));
        }
      }
    }else{
      for (int i=0; i < _data.length; i++) {
        if (_data[i] > probThreshold) {
          q.add(new PairOfFloatInt(_data[i],i));
        }
      }
    }
    return q;
  }
 
  public List<PairOfFloatInt> getTranslationsWithProbsAsList(float probThreshold){
    List<PairOfFloatInt> l = new ArrayList<PairOfFloatInt>();
    if (_useBinSearch) {
      for(int i=0; i < _data.length; i++){
        if (_data[i] > probThreshold) {
          l.add(new PairOfFloatInt(_data[i],_indices[i]));
        }
      }
    }else{
      for (int i=0; i < _data.length; i++) {
        if (_data[i] > probThreshold) {
          l.add(new PairOfFloatInt(_data[i],i));
        }
      }
    }
    return l;
  }
 
  public final void set(int index, float value) { _data[binSearch(index)] = value; }
  public final void add(int index, float delta) { _data[binSearch(index)]+= delta; }
 
  /**
   * @param index
   *     the index of the searched term
   * @return
   *     the location of the term in the array
   */
  public int getAddr(int index) { return binSearch(index); }
  public void clear() {
    int l = size();
    for (int i=0; i<l; i++)
      _data[i]=0.0f;
  }
  public void plusEqualsMismatchSize(IndexedFloatArray rhs) {
    if (this._data == null) {
      if (rhs._data == null) return;
      this._data = rhs._data.clone();
      if (rhs._indices != null)
        this._indices = rhs._indices.clone();
      this._useBinSearch = rhs._useBinSearch;
      return;
    }
    this.optimizeMemory(_data, _data.length);
    rhs.optimizeMemory(rhs._data, rhs._data.length);
    float[] tv = new float[_data.length + rhs._data.length];
    int[] tk = new int[_data.length + rhs._data.length];
    int cl = 0;
    int cr = 0;
    int c = 0;
    while(cl < _data.length && cr < rhs._data.length) {
      int il = _indices[cl];
      int ir = rhs._indices[cr];
      if (il == ir) {
        tk[c] = ir;
        tv[c] = _data[cl] + rhs._data[cr];
        cr++; cl++;
      } else if (il < ir) {
        tk[c] = il;
        tv[c] = _data[cl];
        cl++;
      } else {
        tk[c] = ir;
        tv[c] = rhs._data[cr];
        cr++;
      }
      c++;
    }
    if (cl < _data.length) {
      int dif = _data.length - cl;
      System.arraycopy(_data, cl, tv, c, dif);
      System.arraycopy(_indices, cl, tk, c, dif);
      c += dif;
    } else if (cr < rhs._data.length) {
      int dif = rhs._data.length - cr;
      System.arraycopy(rhs._data, cr, tv, c, dif);
      System.arraycopy(rhs._indices, cr, tk, c, dif);
      c += dif;       
    }
    if (c == tv.length) {
      _data = tv;
      _indices = tk;
    } else {
      int[] ni = new int[c];
      float[] nv = new float[c];
      System.arraycopy(tk, 0, ni, 0, c);
      System.arraycopy(tv, 0, nv, 0, c);
      _data = nv;
      _indices = ni;
      this.optimizeSpeed();
    }
  }
  public void plusEquals(IndexedFloatArray rhs) {
    if (size() != rhs.size())
      throw new RuntimeException("Size mismatch");
    if (size() == 0) return;
    for (int i=0; i<_data.length; i++)
      _data[i] += rhs._data[i];
  }
  public void minusEquals(IndexedFloatArray rhs) {
    if (size() != rhs.size())
      throw new RuntimeException("Size mismatch");
    if (size() == 0) return;
    for (int i=0; i<_data.length; i++)
      _data[i] -= rhs._data[i];
  }
  public void timesEquals(float rhs) {
    if (size() == 0) return;
    for (int i=0; i<_data.length; i++)
      _data[i] *= rhs;
  }
  public void normalize() {
    normalize(0.0f);
  }
  public void normalize(float alpha) {
    if (size() == 0) return;
    float total = 0.0f;
    for (float f: _data)
      total += (f  + alpha);
    if (total == 0.0f) {
      float v = 1.0f / (float)size();
      for (int i=0; i<_data.length; i++)
        _data[i] = v;
    } else {
      for (int i=0; i<_data.length; i++)
        _data[i] = (_data[i] + alpha) / total;
    }
  }
  public void normalize_variationalBayes(float alpha) {
    if (size() == 0) return;
    float total = 0.0f;
    for (float f: _data)
      total += (f + alpha);
    if (total == 0.0f) {
      if (true) throw new RuntimeException("Sum=0: shouldn't happen " + this);
      float v = 1.0f / (float)size();
      for (int i=0; i<_data.length; i++)
        _data[i] = v;
    } else {
      for (int i=0; i<_data.length; i++)
        _data[i] = (float)Math.exp(Digamma.digamma(_data[i] + alpha) - Digamma.digamma(total));
    }   
  }
  public float innerProduct(IndexedFloatArray rhs) {
    if (size() != rhs.size())
      throw new RuntimeException("Size mismatch");
    if (size() == 0) return 0.0f;
    float res = 0.0f;
    for (int i=0; i<_data.length; i++)
      res += _data[i] * rhs._data[i];
    return res;
  }
  public String toString(boolean brackets) {
    StringBuffer sb = new StringBuffer();

    if (brackets) sb.append('<');
    if (_data == null)
      sb.append("null");
    else {
      if (_useBinSearch) {
        if (size() > 0) {
          for (int i=0; i<_data.length; i++) {
            if (i != 0) sb.append(' ');
            sb.append(_indices[i]+":"+_data[i]);
          }
        }
      } else {
        for (int i=0; i<_data.length; i++) {
          if (i != 0) sb.append(' ');
          sb.append(i+":"+_data[i]);
        }
      }
    }
    if (brackets) sb.append('>');
    return sb.toString();   
  }
  public String toString() {
    return toString(true);
  }



}
TOP

Related Classes of edu.umd.hooka.alignment.IndexedFloatArray

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.