package ch.akuhn.linalg;
import ch.akuhn.edu.mit.tedlab.SMat;
import ch.akuhn.edu.mit.tedlab.SVDRec;
import ch.akuhn.edu.mit.tedlab.Svdlib;
import ch.akuhn.matrix.SparseMatrix;
/** Singular value decomposition of matrix A.
*<P>
* Given a term-document matrix A, with dimension m on n, the decomposition consists of
* S = a diagonal matrix with the k largest singular values,
* U = m left singular vectors of length k, one for each document, and
* V = n right singular vectors of length k, one for each term.
* The matrix A' = U * S * V^T is the closest rank k approximation of of matrix A
* [Eckard and Young].
*<P>
* Instances of this class are immutable.
*
* @author Adrian Kuhn, 2008-2009
*
*/
public class SVD {
public final double[] s;
public final double[][] U; // terms
public final double[][] V; // documents
public SVD(double[] s, double[][] U, double[][] V) {
this.s = s;
this.U = U;
this.V = V;
assert invariant();
}
private boolean invariant() {
if (s == null || U == null || V == null ) return false;
if (s.length > U.length || s.length > V.length) return false;
for (int n = 0; n < U.length; n++) if (s.length != U[n].length) return false;
for (int n = 0; n < V.length; n++) if (s.length != V[n].length) return false;
return true;
}
public SVD(SparseMatrix matrix, int dimensions) {
if (matrix.rowCount() == 0 || matrix.columnCount() == 0) {
s = new double[0];
U = new double[matrix.rowCount()][0];
V = new double[matrix.columnCount()][0];
}
else {
SMat input = makeSMat(matrix);
SVDRec r = new Svdlib().svdLAS2(input, dimensions, 0, new double[] { -1e-30, 1e-30 }, 1e-6);
s = r.S;
U = transpose(r.Ut.value);
V = transpose(r.Vt.value);
}
assert invariant();
}
/*default*/ static final double[][] append(double[][] data, double[] values) {
assert data.length == 0 || data[0].length == values.length;
double[][] result = new double[data.length + 1][];
System.arraycopy(data, 0, result, 0, data.length);
result[data.length] = values;
return result;
}
/*default*/ static final double[][] replace(double[][] data, int index, double[] values) {
assert 0 <= index && index < data.length;
assert data[index].length == values.length;
double[][] result = new double[data.length][];
System.arraycopy(data, 0, result, 0, data.length);
result[index] = values;
return result;
}
/*default*/ static final double[][] remove(double[][] data, int index) {
assert 0 <= index && index < data.length;
double[][] result = new double[data.length - 1][];
System.arraycopy(data, 0, result, 0, index);
System.arraycopy(data, index + 1, result, index, data.length - index - 1);
return result;
}
/** Returns a copy with additional term.
*
* @param u new term vector, length k.
* @return new copy of this instance.
*/
public SVD withAppendU(double[] u) {
assert u.length == s.length;
return new SVD(s, append(U, u), V);
}
/** Returns a copy with additional document.
*
* @param v new document vector, length k.
* @return new copy of this instance.
*/
public SVD withAppendV(double[] v) {
assert v.length == s.length;
return new SVD(s, U, append(V, v));
}
/** Returns the rank of this singular value decomposition.
*
* @return a positive integer.
*/
public int getRank() {
return s.length;
}
private double[] project(double[][] data, double[] weightings) {
if (s.length == 0) return new double[] { };
assert weightings.length == data.length;
double[] result = new double[s.length];
for (int n = 0; n < weightings.length; n++) {
double weight = weightings[n];
if (weight == 0) continue;
double[] dn = data[n];
for (int k = 0; k < s.length; k++) {
result[k] += weight * dn[k];
}
}
for (int k = 0; k < s.length; k++) {
result[k] /= s[k];
}
return result;
}
/** Places a term at the weighted sum of its documents.
*
* @param weightings document weight vector, length = m.
* @return pseudo term vector.
*/
public double[] makePseudoU(double[] weightings) {
return project(V, weightings);
}
/** Places a document at the weighted sum of its terms.
*
* @param weightings
* @return
*/
public double[] makePseudoV(double[] weightings) {
return project(U, weightings);
}
/*default*/ static final SMat makeSMat(SparseMatrix matrix) {
SMat S = new SMat(matrix.rowCount(), matrix.columnCount(), matrix.used());
for (int j = 0, n = 0; j < matrix.columnCount(); j++) {
S.pointr[j] = n;
for (int i = 0; i < matrix.rowCount(); i++)
if (matrix.get(i, j) != 0) {
S.rowind[n] = i;
S.value[n] = matrix.get(i, j);
n++;
}
}
S.pointr[S.cols] = S.vals;
return S;
}
private double similaritySqrt(double[] a, double[] b) {
double sim = 0;
double suma = 0;
double sumb = 0;
for (int k = 0; k < b.length; k++) {
double s2 = s[k];
sim += a[k] * b[k] * s2;
suma += a[k] * a[k] * s2;
sumb += b[k] * b[k] * s2;
}
return (sim / (Math.sqrt(suma) * Math.sqrt(sumb)));
}
private double similarity(double[] a, double[] b) {
double sim = 0;
double suma = 0;
double sumb = 0;
for (int k = 0; k < b.length; k++) {
double s2 = s[k] * s[k];
sim += a[k] * b[k] * s2;
suma += a[k] * a[k] * s2;
sumb += b[k] * b[k] * s2;
}
return (sim / (Math.sqrt(suma) * Math.sqrt(sumb)));
}
/** Computes the similarity between i-th term and pseudo-term t.
*
* @param i index of a term, range 0..m.
* @param t a pseudo-term vector, length k
* @return a cosine value, typically greater then zero.
*/
public double similarityU(int i, double[] t) {
if (s.length == 0) return Double.NaN;
return similarity(U[i], t);
}
/** Computes the similarity between a-th and b-th term.
*
* @param a index of a term, range 0..m.
* @param b index of a term, range 0..m.
* @return a cosine value, typically greater then zero.
*/
public double similarityUU(int a, int b) {
if (s.length == 0) return Double.NaN;
return similarity(U[a], U[b]);
}
/** Computes the similarity between i-th term and j-th document.
*
* @param i index of a term, range 0..m.
* @param j index of a document, range 0..n.
* @return a cosine value, typically greater than zero.
*/
public double similarityUV(int i, int j) {
if (s.length == 0) return Double.NaN;
return similaritySqrt(U[i], V[j]);
}
/** Computes the similarity between j-th document and pseudo-document d.
*
* @param j index of a document, range 0..n.
* @param d a pseudo-document vector, length k.
* @return a cosine value, typically greater than zero.
*/
public double similarityV(int j, double[] d) {
if (s.length == 0) return Double.NaN;
return similarity(V[j], d);
}
/** Computes the similarity between a-th and b-th document.
*
* @param a index of a document, range 0..n.
* @param b index of a document, range 0..n.
* @return a cosine value, typically greater then zero.
*/
public double similarityVV(int a, int b) {
if (s.length == 0) return Double.NaN;
return similarity(V[a], V[b]);
}
public double euclidianVV(int a, int b) {
if (s.length == 0) return Double.NaN;
double[] aa = V[a];
double[] bb = V[b];
double sum = 0;
for (int k = 0; k < bb.length; k++) {
sum += (aa[k] - bb[k]) * (aa[k] - bb[k]);
}
return Math.sqrt(sum);
}
/*default*/ static final double[][] transpose(double[][] array) {
if (array.length == 0) return new double[][] {{}};
int width = array[0].length, height = array.length;
double[][] t = new double[width][height];
for (int n = 0; n < width; n++) {
for (int m = 0; m < height; m++) {
t[n][m] = array[m][n];
}
}
return t;
}
/** Returns the decomposition of matrix A^T.
*
* @return new copy of this instance.
*/
public SVD transposed() {
return new SVD(s, V, U); // swap U and V
}
/** Returns a copy, where one term is updated.
*
* @param index index of a term, range 0..m.
* @param values new term vector, length k.
* @return new copy of this instance.
*/
public SVD withReplaceU(int index, double[] values) {
assert values.length == s.length;
return new SVD(s, replace(U, index, values), V);
}
/** Returns a copy, where one document is updated.
*
* @param index index of a document, range 0..n.
* @param values new document vector, length k.
* @return new copy of this instance.
*/
public SVD withReplaceV(int index, double[] values) {
assert values.length == s.length;
return new SVD(s, U, replace(V, index, values));
}
/** Returns the number of right singular vectors in V,
* that is the number of columns of A (m x n).
* Given a term-document matrix, this is the number of documents.
*
* @return the value of n.
*/
public int columnCount() {
return V.length;
}
/** Returns the number of left singular vectors in U,
* that is the number of rows of A (m x n).
* Given a term-document matrix, this is the number of terms.
*
* @return the value of m.
*/
public int rowCount() {
return U.length;
}
/** Returns a copy, with selected documents only.
*
* @param indices list of indices of documents, range 0..n.
* @return a new copy of this instance.
*
*/
public SVD withSelectV(int[] indices) {
double[][] selectV = new double[indices.length][s.length];
for (int n = 0; n < indices.length; n++) {
for (int k = 0; k < s.length; k++) {
selectV[n][k] = V[indices[n]][k];
}
}
return new SVD(s, U, selectV);
}
/** Returns copy without given document.
*
* @param doc index of a document, range 0..n.
* @return new copy of this instance.
*/
public SVD withoutV(int doc) {
return new SVD(s, U, remove(V, doc));
}
}