// (similar to the style of syntheticcontrol.canopy.InputMapper)
// Convert the raw text input into affinity-matrix entries stored as sequence files.
Path affSeqFiles = new Path(outputCalc, "seqfile");
AffinityMatrixInputJob.runJob(input, affSeqFiles, numDims, numDims);
// Construct the affinity matrix using the newly-created sequence files
DistributedRowMatrix A = new DistributedRowMatrix(affSeqFiles, new Path(outputTmp, "afftmp"), numDims, numDims);
// Defensive copy of the caller's Configuration so downstream jobs cannot
// mutate settings in the original conf object.
Configuration depConf = new Configuration(conf);
A.setConf(depConf);
// Construct the diagonal matrix D (represented as a vector)
// — presumably the degree (row sum) of each point in A; verify against MatrixDiagonalizeJob.
Vector D = MatrixDiagonalizeJob.runJob(affSeqFiles, numDims);
// Calculate the normalized Laplacian of the form: L = D^(-0.5)AD^(-0.5)
// NOTE(review): the last argument resolves outputCalc relative to itself
// (new Path(outputCalc, outputCalc)) — looks like a possible typo for a named
// tmp subdirectory; confirm this tmp-path choice is intentional before changing.
DistributedRowMatrix L = VectorMatrixMultiplicationJob.runJob(affSeqFiles, D, new Path(outputCalc, "laplacian"),
new Path(outputCalc, outputCalc));
L.setConf(depConf);
// Path to the matrix whose rows will be clustered; assigned by whichever
// eigen-decomposition branch runs below.
Path data;
if (ssvd) {
// SSVD requires an array of Paths to function. So we pass in an array of length one
Path[] LPath = new Path[1];
LPath[0] = L.getRowPath();
Path SSVDout = new Path(outputCalc, "SSVD");
SSVDSolver solveIt = new SSVDSolver(depConf, LPath, SSVDout, blockHeight, clusters, oversampling, numReducers);
// Only U is consumed below (data = getUPath()); V is skipped to save work.
solveIt.setComputeV(false);
solveIt.setComputeU(true);
solveIt.setOverwrite(true);
// Number of power iterations for the stochastic SVD — higher q trades time
// for accuracy (presumably; confirm against SSVDSolver.setQ docs).
solveIt.setQ(poweriters);
// solveIt.setBroadcast(false);
solveIt.run();
data = new Path(solveIt.getUPath());
} else {
// Perform eigen-decomposition using LanczosSolver
// since some of the eigen-output is spurious and will be eliminated
// upon verification, we have to aim to overshoot and then discard
// unnecessary vectors later
// (capped at numDims — the matrix cannot yield more eigenvectors than its order)
int overshoot = Math.min((int) (clusters * OVERSHOOTMULTIPLIER), numDims);
DistributedLanczosSolver solver = new DistributedLanczosSolver();
LanczosState state = new LanczosState(L, overshoot, DistributedLanczosSolver.getInitialVector(L));
Path lanczosSeqFiles = new Path(outputCalc, "eigenvectors");
// NOTE(review): the 'true' flag presumably marks L as symmetric — confirm against
// DistributedLanczosSolver.runJob's signature.
solver.runJob(conf, state, overshoot, true, lanczosSeqFiles.toString());
// perform a verification
// Drops spurious/duplicate eigenvectors, keeping at most 'clusters' of them;
// the 1.0 argument appears to be the max-error tolerance — verify parameter
// meaning against EigenVerificationJob.runJob.
EigenVerificationJob verifier = new EigenVerificationJob();
Path verifiedEigensPath = new Path(outputCalc, "eigenverifier");
verifier.runJob(conf, lanczosSeqFiles, L.getRowPath(), verifiedEigensPath, true, 1.0, clusters);
Path cleanedEigens = verifier.getCleanedEigensPath();
// W: one row per retained eigenvector (declared clusters rows x numDims cols).
DistributedRowMatrix W = new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), clusters,
numDims);
W.setConf(depConf);
// Transpose so each row corresponds to an input point rather than an eigenvector.
DistributedRowMatrix Wtrans = W.transpose();
data = Wtrans.getRowPath();
}
// Normalize the rows of Wt to unit length
// normalize is important because it reduces the occurrence of two unique clusters combining into one
Path unitVectors = new Path(outputCalc, "unitvectors");
UnitVectorizerJob.runJob(data, unitVectors);
// NOTE(review): dimensions are declared as (clusters rows x numDims cols), but in the
// Lanczos branch 'data' is the TRANSPOSED eigenvector matrix (points x eigenvectors,
// i.e. numDims x clusters) — the counts look swapped. Only getRowPath() is consumed
// below, so this may be inert metadata; confirm before changing.
DistributedRowMatrix Wt = new DistributedRowMatrix(unitVectors, new Path(unitVectors, "tmp"), clusters, numDims);
Wt.setConf(depConf);
data = Wt.getRowPath();
// Generate initial clusters using EigenSeedGenerator which picks rows as centroids if that row contains max
// eigen value in that column
Path initialclusters = EigenSeedGenerator.buildFromEigens(conf, data,
new Path(output, Cluster.INITIAL_CLUSTERS_DIR), clusters, measure);