// Spectral k-means pipeline: affinity matrix -> normalized Laplacian ->
// Lanczos eigen-decomposition -> eigenvector cleanup/normalization -> k-means.
//
// NOTE(review): (System.nanoTime() & 0xFF) yields only 256 distinct suffixes,
// so re-runs against the same outputCalc/outputTmp dirs can collide with stale
// data from a previous run — consider a wider salt or a per-run directory.
Path affSeqFiles = new Path(outputCalc, "seqfile-" + (System.nanoTime() & 0xFF));
// Stage 1: materialize the pairwise affinities as sequence files
// (numDims x numDims, i.e. one row per input point).
AffinityMatrixInputJob.runJob(input, affSeqFiles, numDims, numDims);
// Next step: construct the affinity matrix A using the newly-created
// sequence files.
DistributedRowMatrix A = new DistributedRowMatrix(affSeqFiles,
new Path(outputTmp, "afftmp-" + (System.nanoTime() & 0xFF)),
numDims,
numDims);
// Share one JobConf across all DistributedRowMatrix instances in this pipeline.
JobConf depConf = new JobConf(conf);
A.configure(depConf);
// Next step: construct the diagonal degree matrix D (represented as a vector
// of row sums of A) and calculate the normalized Laplacian of the form:
// L = D^(-0.5) A D^(-0.5)
Vector D = MatrixDiagonalizeJob.runJob(affSeqFiles, numDims);
DistributedRowMatrix L =
VectorMatrixMultiplicationJob.runJob(affSeqFiles, D,
new Path(outputCalc, "laplacian-" + (System.nanoTime() & 0xFF)));
L.configure(depConf);
// Next step: perform eigen-decomposition using LanczosSolver.
// Since some of the eigen-output is spurious and will be eliminated
// upon verification, we have to aim to overshoot and then discard
// unnecessary vectors later.
int overshoot = (int) ((double) clusters * OVERSHOOT_MULTIPLIER);
List<Double> eigenValues = new ArrayList<Double>(overshoot);
Matrix eigenVectors = new DenseMatrix(overshoot, numDims);
DistributedLanczosSolver solver = new DistributedLanczosSolver();
Path lanczosSeqFiles = new Path(outputCalc, "eigenvectors-" + (System.nanoTime() & 0xFF));
// Raw (unverified) eigenvectors/eigenvalues land in lanczosSeqFiles.
// The 'true' flag presumably requests symmetric treatment of L — TODO confirm
// against DistributedLanczosSolver.runJob's parameter order.
solver.runJob(conf,
L.getRowPath(),
new Path(outputTmp, "lanczos-" + (System.nanoTime() & 0xFF)),
L.numRows(),
L.numCols(),
true,
overshoot,
eigenVectors,
eigenValues,
lanczosSeqFiles.toString());
// Perform a verification pass to discard spurious eigenvectors, keeping
// only the top 'clusters' verified ones.
// NOTE(review): the hard-coded 1.0 / 0.0 look like maxError / minEigenvalue
// thresholds — confirm against EigenVerificationJob.runJob's signature.
EigenVerificationJob verifier = new EigenVerificationJob();
Path verifiedEigensPath = new Path(outputCalc, "eigenverifier");
verifier.runJob(conf, lanczosSeqFiles, L.getRowPath(), verifiedEigensPath, true, 1.0, 0.0, clusters);
Path cleanedEigens = verifier.getCleanedEigensPath();
// W holds the cleaned eigenvectors as rows: clusters rows x numDims columns.
DistributedRowMatrix W = new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), clusters, numDims);
W.configure(depConf);
// Transpose so each row of Wtrans corresponds to one original data point
// embedded in eigen-space.
DistributedRowMatrix Wtrans = W.transpose();
// Next step: normalize the rows of the transposed matrix to unit length.
Path unitVectors = new Path(outputCalc, "unitvectors-" + (System.nanoTime() & 0xFF));
UnitVectorizerJob.runJob(Wtrans.getRowPath(), unitVectors);
// NOTE(review): Wt is declared clusters x numDims, the same dimensions as W,
// even though it wraps the transposed, normalized data — verify these
// dimension arguments are intentional.
DistributedRowMatrix Wt = new DistributedRowMatrix(unitVectors, new Path(unitVectors, "tmp"), clusters, numDims);
Wt.configure(depConf);
// Finally, perform k-means clustering on the unit-normalized eigen-space rows.
// Generate random initial clusters seeded from Wt's rows.
Path initialclusters = RandomSeedGenerator.buildRandom(Wt.getRowPath(),
new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
clusters,
measure);
// runClustering=true, runSequential=false (i.e. run as MapReduce) —
// TODO confirm the trailing boolean flags against KMeansDriver.run's signature.
KMeansDriver.run(conf, Wt.getRowPath(), initialclusters, output, measure, convergenceDelta, maxIterations, true, false);
// Read through the cluster assignments produced by k-means.
Path clusteredPointsPath = new Path(output, "clusteredPoints");
FileSystem fs = FileSystem.get(conf);
// NOTE(review): reads only the single "part-m-00000" shard — multi-reducer
// output would be missed. The reader is not closed within this span;
// presumably it is closed later in the method — verify, else this leaks.
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(clusteredPointsPath, "part-m-00000"), conf);