/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.mahout.math.hadoop.stochasticsvd;

import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.mahout.common.IOUtils;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.QRDecomposition;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.SingularValueDecomposition;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.function.DoubleFunction;
import org.apache.mahout.math.function.Functions;
import org.apache.mahout.math.function.VectorFunction;
import org.junit.Test;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.Deque;
import java.util.Iterator;
import java.util.Random;

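/**
 * Local (in-process) tests for {@link SSVDSolver} running in PCA mode over
 * sparse, named-vector input: verifies the multithreaded Omega'-right-multiply
 * against a materialized dense control, compares SSVD singular values with a
 * direct {@link SingularValueDecomposition} of the mean-centered input, checks
 * orthonormality of the Q blocks, and asserts that vector names propagate into
 * the U*Sigma output.
 */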
public class LocalSSVDPCASparseTest extends MahoutTestCase {

  private static final double s_epsilon = 1.0E-10d;

  @Test
  public void testOmegaTRightMultiply() {
    final Random rnd = RandomUtils.getRandom();
    final long seed = rnd.nextLong();
    final int n = 2000;

    final int kp = 100;

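    // Omega is SSVD's random projection matrix; its entries are generated
    // deterministically from (seed, row, column) and never stored.
    // Materialize it into a dense matrix to obtain a control for the test.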
    final Omega omega = new Omega(seed, kp);
    final Matrix materializedOmega = new DenseMatrix(n, kp);
    for (int i = 0; i < n; i++) {
      for (int j = 0; j < kp; j++) {
        materializedOmega.setQuick(i, j, omega.getQuick(i, j));
      }
    }
    Vector xi = new DenseVector(n);
    xi.assign(new DoubleFunction() {
      @Override
      public double apply(double x) {
        return rnd.nextDouble() * 100;
      }
    });

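    // Note: "mutlithreadedTRightMultiply" [sic] is the method's actual
    // (misspelled) name in Mahout's Omega API; it computes Omega' * xi
    // on multiple threads.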
    Vector s_o = omega.mutlithreadedTRightMultiply(xi);

    Matrix xiVector = new DenseMatrix(n, 1);
    xiVector.assignColumn(0, xi);

    Vector s_o_control = materializedOmega.transpose().times(xiVector).viewColumn(0);

    assertEquals(0, s_o.minus(s_o_control).aggregate(Functions.PLUS, Functions.ABS), 1e-10);

    System.out.printf("s_omega=\n%s\n", s_o);
    System.out.printf("s_omega_control=\n%s\n", s_o_control);
  }

  @Test
  public void runPCATest1() throws IOException {
    runSSVDSolver(1);
  }

//  @Test
  public void runPCATest0() throws IOException {
    runSSVDSolver(0);
  }

  public void runSSVDSolver(int q) throws IOException {

    Configuration conf = new Configuration();
    conf.set("mapred.job.tracker", "local");
    conf.set("fs.default.name", "file:///");

    // conf.set("mapred.job.tracker","localhost:11011");
    // conf.set("fs.default.name","hdfs://localhost:11010/");

    Deque<Closeable> closeables = Lists.newLinkedList();
    try {
      Random rnd = RandomUtils.getRandom();

      File tmpDir = getTestTempDir("svdtmp");
      conf.set("hadoop.tmp.dir", tmpDir.getAbsolutePath());

      Path aLocPath = new Path(getTestTempDirPath("svdtmp/A"), "A.seq");

      // create distributed row matrix-like struct
      SequenceFile.Writer w =
        SequenceFile.createWriter(FileSystem.getLocal(conf),
                                  conf,
                                  aLocPath,
                                  Text.class,
                                  VectorWritable.class,
                                  CompressionType.BLOCK,
                                  new DefaultCodec());
      closeables.addFirst(w);

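      // Generate a sparse m x n input: each row gets roughly `percent`% of its
      // cells filled with values whose mean is above zero, so the PCA mean
      // vector is nontrivial.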
      int n = 100;
      int m = 2000;
      double percent = 5;

      VectorWritable vw = new VectorWritable();
      Text rkey = new Text();

      Vector xi = new DenseVector(n);

      double muAmplitude = 50.0;
      for (int i = 0; i < m; i++) {
        Vector dv = new SequentialAccessSparseVector(n);
        String rowname = "row-" + i;
        NamedVector namedRow = new NamedVector(dv, rowname);
        for (int j = 0; j < n * percent / 100; j++) {
          dv.setQuick(rnd.nextInt(n), muAmplitude * (rnd.nextDouble() - 0.25));
        }
        rkey.set(rowname);
        vw.set(namedRow);
        w.append(rkey, vw);
        xi.assign(dv, Functions.PLUS);
      }
      closeables.remove(w);
      Closeables.close(w, false);

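      // xi accumulated the column sums of A in the loop above; dividing by m
      // turns it into the column means, i.e. the PCA mean row.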
      xi.assign(Functions.mult(1.0 / m));

      FileSystem fs = FileSystem.get(conf);

      Path tempDirPath = getTestTempDirPath("svd-proc");
      Path aPath = new Path(tempDirPath, "A/A.seq");
      fs.copyFromLocalFile(aLocPath, aPath);
      Path xiPath = new Path(tempDirPath, "xi/xi.seq");
      SSVDHelper.saveVector(xi, xiPath, conf);

      Path svdOutPath = new Path(tempDirPath, "SSVD-out");

      // make sure we wipe out previous test results, just a convenience
      fs.delete(svdOutPath, true);

      // Solver starts here:
      System.out.println("Input prepared, starting solver...");

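      // ablockRows = block height for splitting A; k = requested decomposition
      // rank; p = oversampling (the random projection uses k + p columns);
      // q = number of power iterations.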
      int ablockRows = 867;
      int p = 60;
      int k = 40;
      SSVDSolver ssvd =
        new SSVDSolver(conf,
                       new Path[]{aPath},
                       svdOutPath,
                       ablockRows,
                       k,
                       p,
                       3);
      ssvd.setOuterBlockHeight(500);
      ssvd.setAbtBlockHeight(251);
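      // Supplying the mean vector switches the solver into PCA mode: it
      // decomposes (A - 1 * xi') without forming the dense centered matrix.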
      ssvd.setPcaMeanPath(xiPath);

      /*
       * Removing the V and U jobs from this test to reduce running time; I will
       * keep them in the dense test, though.
       *
       * For the PCA test we also want to request the U*Sigma output and check
       * it for named-vector propagation.
       */
      ssvd.setComputeU(false);
      ssvd.setComputeV(false);
      ssvd.setcUSigma(true);

      ssvd.setOverwrite(true);
      ssvd.setQ(q);
      ssvd.setBroadcast(true);
      ssvd.run();

      Vector stochasticSValues = ssvd.getSingularValues();

      // try to run the same thing without stochastic algo
      Matrix a = SSVDHelper.drmLoadAsDense(fs, aPath, conf);

      verifyInternals(svdOutPath, a, new Omega(ssvd.getOmegaSeed(), k + p), k + p, q);

      // subtract pseudo pca mean
      for (int i = 0; i < m; i++) {
        a.viewRow(i).assign(xi, Functions.MINUS);
      }

      SingularValueDecomposition svd2 =
        new SingularValueDecomposition(a);

      Vector svalues2 = new DenseVector(svd2.getSingularValues());

      System.out.println("--SSVD solver singular values:");
      LocalSSVDSolverSparseSequentialTest.dumpSv(stochasticSValues);
      System.out.println("--SVD solver singular values:");
      LocalSSVDSolverSparseSequentialTest.dumpSv(svalues2);

      for (int i = 0; i < k + p; i++) {
        assertTrue(Math.abs(svalues2.getQuick(i) - stochasticSValues.getQuick(i)) <= s_epsilon);
      }

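      // Q blocks produced by the Bt job must have orthonormal columns.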
      DenseMatrix mQ =
        SSVDHelper.drmLoadAsDense(fs, new Path(svdOutPath, "Bt-job/"
          + BtJob.OUTPUT_Q + "-*"), conf);

      SSVDCommonTest.assertOrthonormality(mQ,
                                          false,
                                          s_epsilon);

      // assert name propagation
      for (Iterator<Pair<Writable, Vector>> iter = SSVDHelper.drmIterator(fs,
                                                                          new Path(ssvd.getuSigmaPath()+"/*"),
                                                                          conf,
                                                                          closeables); iter.hasNext(); ) {
        Pair<Writable, Vector> pair = iter.next();
        Writable key = pair.getFirst();
        Vector v = pair.getSecond();

        assertTrue(v instanceof NamedVector);
        assertTrue(key instanceof Text);
      }

    } finally {
      IOUtils.close(closeables);
    }
  }

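  /**
   * Recomputes the PCA-mode intermediates (s_o = Omega' * xi, the column sums
   * s_q of Q, and s_b = B * xi) from the materialized input and prints them
   * for manual comparison with the solver's internals; this helper has no
   * assertions of its own.
   */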
  private void verifyInternals(Path tempDir, Matrix a, Omega omega, int kp, int q) {
    int m = a.numRows();
    int n = a.numCols();

    Vector xi = a.aggregateColumns(new VectorFunction() {
      @Override
      public double apply(Vector v) {
        return v.zSum() / v.size();
      }
    });

    // materialize omega
    Matrix momega = new DenseMatrix(n, kp);
    for (int i = 0; i < n; i++) {
      for (int j = 0; j < kp; j++) {
        momega.setQuick(i, j, omega.getQuick(i, j));
      }
    }

    Vector s_o = omega.mutlithreadedTRightMultiply(xi);

    System.out.printf("s_omega=\n%s\n", s_o);

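    // Y = (A - 1 * xi') * Omega, computed as A * Omega with s_o = Omega' * xi
    // subtracted from every row, so the centered matrix is never materialized.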
    Matrix y = a.times(momega);
    for (int i = 0; i < m; i++) {
      y.viewRow(i).assign(s_o, Functions.MINUS);
    }

    QRDecomposition qr = new QRDecomposition(y);
    Matrix qm = qr.getQ();

    Vector s_q = qm.aggregateColumns(new VectorFunction() {
      @Override
      public double apply(Vector v) {
        return v.zSum();
      }
    });

    System.out.printf("s_q=\n%s\n", s_q);

    Matrix b = qm.transpose().times(a);

    Vector s_b = b.times(xi);

    System.out.printf("s_b=\n%s\n", s_b);
  }

}