Package org.apache.mahout.math.hadoop.decomposer

Source Code of org.apache.mahout.math.hadoop.decomposer.TestDistributedLanczosSolverCLI

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.mahout.math.hadoop.decomposer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;
import org.apache.mahout.math.hadoop.TestDistributedRowMatrix;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Arrays;

public final class TestDistributedLanczosSolverCLI extends MahoutTestCase {
  private static final Logger log = LoggerFactory.getLogger(TestDistributedLanczosSolverCLI.class);

  @Test
  public void testDistributedLanczosSolverCLI() throws Exception {
    Path testData = getTestTempDirPath("testdata");
    DistributedRowMatrix corpus =
        new TestDistributedRowMatrix().randomDenseHierarchicalDistributedMatrix(50, 45, false,
            testData.toString());
    corpus.setConf(new Configuration());
    Path output = getTestTempDirPath("output");
    Path tmp = getTestTempDirPath("tmp");
    Path workingDir = getTestTempDirPath("working");
    String[] args = {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "50",
        "--numCols", "45",
        "--rank", "30",
        "--symmetric", "false",
        "--workingDir", workingDir.toString()
    };
    new DistributedLanczosSolver().new DistributedLanczosSolverJob().run(args);

    output = getTestTempDirPath("output2");
    tmp = getTestTempDirPath("tmp2");
    args = new String[] {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "50",
        "--numCols", "45",
        "--rank", "35",
        "--symmetric", "false",
        "--workingDir", workingDir.toString()
    };
    new DistributedLanczosSolver().new DistributedLanczosSolverJob().run(args);

    Path rawEigenvectors = new Path(output, DistributedLanczosSolver.RAW_EIGENVECTORS);
    Matrix eigenVectors = new DenseMatrix(35, corpus.numCols());
    Configuration conf = new Configuration();

    int i = 0;
    for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(rawEigenvectors, conf)) {
      Vector v = value.get();
      eigenVectors.assignRow(i, v);
      i++;
    }
    assertEquals("number of eigenvectors", 35, i);
  }

  @Test
  public void testDistributedLanczosSolverEVJCLI() throws Exception {
    Path testData = getTestTempDirPath("testdata");
    DistributedRowMatrix corpus = new TestDistributedRowMatrix()
        .randomDenseHierarchicalDistributedMatrix(50, 45, false, testData.toString());
    corpus.setConf(new Configuration());
    Path output = getTestTempDirPath("output");
    Path tmp = getTestTempDirPath("tmp");
    String[] args = {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "50",
        "--numCols", "45",
        "--rank", "30",
        "--symmetric", "false",
        "--cleansvd", "true"
    };
    new DistributedLanczosSolver().new DistributedLanczosSolverJob().run(args);
 
    Path cleanEigenvectors = new Path(output, EigenVerificationJob.CLEAN_EIGENVECTORS);
    Matrix eigenVectors = new DenseMatrix(30, corpus.numCols());
    Collection<Double> eigenvalues = new ArrayList<Double>();

    output = getTestTempDirPath("output2");
    tmp = getTestTempDirPath("tmp2");
    args = new String[] {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "50",
        "--numCols", "45",
        "--rank", "35",
        "--symmetric", "false",
        "--cleansvd", "true"
    };
    new DistributedLanczosSolver().new DistributedLanczosSolverJob().run(args);
    Path cleanEigenvectors2 = new Path(output, EigenVerificationJob.CLEAN_EIGENVECTORS);
    Matrix eigenVectors2 = new DenseMatrix(35, corpus.numCols());
    Configuration conf = new Configuration();
    Collection<Double> newEigenValues = new ArrayList<Double>();

    int i = 0;
    for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(cleanEigenvectors, conf)) {
      NamedVector v = (NamedVector) value.get();
      eigenVectors.assignRow(i, v);
      log.info(v.getName());
      if(EigenVector.getCosAngleError(v.getName()) < 1e-3) {
        eigenvalues.add(EigenVector.getEigenValue(v.getName()));
      }
      i++;
    }
    assertEquals("number of clean eigenvectors", 23, i);

    i = 0;
    for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(cleanEigenvectors2, conf)) {
      NamedVector v = (NamedVector) value.get();
      log.info(v.getName());
      eigenVectors2.assignRow(i, v);
      newEigenValues.add(EigenVector.getEigenValue(v.getName()));
      i++;
    }

    Collection<Integer> oldEigensFound = new ArrayList<Integer>();
    for(int row = 0; row < eigenVectors.numRows(); row++) {
      Vector oldEigen = eigenVectors.getRow(row);
      if(oldEigen == null) {
        break;
      }
      for(int newRow = 0; newRow < eigenVectors2.numRows(); newRow++) {
        Vector newEigen = eigenVectors2.getRow(newRow);
        if(newEigen != null) {
          if(oldEigen.dot(newEigen) > 0.9) {
            oldEigensFound.add(row);
            break;
          }
        }
      }
    }
    assertEquals("the number of new eigenvectors", 30, i);

    Collection<Double> oldEigenValuesNotFound = new ArrayList<Double>();
    for(double d : eigenvalues) {
      boolean found = false;
      for(double newD : newEigenValues) {
        if(Math.abs((d - newD)/d) < 0.1) {
          found = true;
        }
      }
      if(!found) {
        oldEigenValuesNotFound.add(d);
      }
    }
    assertEquals("number of old eigenvalues not found: "
                 + Arrays.toString(oldEigenValuesNotFound.toArray(new Double[oldEigenValuesNotFound.size()])),
                0, oldEigenValuesNotFound.size());
    assertEquals("did not find enough old eigenvectors", 16, oldEigensFound.size());
  }

}
TOP

Related Classes of org.apache.mahout.math.hadoop.decomposer.TestDistributedLanczosSolverCLI

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.