Package org.apache.mahout.math.hadoop

Examples of org.apache.mahout.math.hadoop.DistributedRowMatrix


                             int numRows,
                             int numCols,
                             boolean isSymmetric,
                             int desiredRank,
                             String outputEigenVectorPathString) throws IOException {
    DistributedRowMatrix matrix = new DistributedRowMatrix(inputPath, outputTmpPath, numRows, numCols);
    matrix.setConf(new Configuration(originalConfig));
    LanczosState state = new LanczosState(matrix, desiredRank, getInitialVector(matrix));
    return runJob(originalConfig, state, desiredRank, isSymmetric, outputEigenVectorPathString);
  }
View Full Code Here


                 Path workingDirPath,
                 int numRows,
                 int numCols,
                 boolean isSymmetric,
                 int desiredRank) throws Exception {
    DistributedRowMatrix matrix = new DistributedRowMatrix(inputPath, outputTmpPath, numRows, numCols);
    matrix.setConf(new Configuration(getConf() != null ? getConf() : new Configuration()));

    LanczosState state;
    if (workingDirPath == null) {
      state = new LanczosState(matrix, desiredRank, getInitialVector(matrix));
    } else {
      HdfsBackedLanczosState hState =
          new HdfsBackedLanczosState(matrix, desiredRank, getInitialVector(matrix), workingDirPath);
      hState.setConf(matrix.getConf());
      state = hState;
    }
    solve(state, desiredRank, isSymmetric);

    Path outputEigenVectorPath = new Path(outputPath, RAW_EIGENVECTORS);
View Full Code Here

    this.minEigenValue = minEigenValue;

    if (eigenInput != null && eigensToVerify == null) {
      prepareEigens(conf, eigenInput, inMemory);
    }
    DistributedRowMatrix c = new DistributedRowMatrix(corpusInput, tempOut, 1, 1);
    c.setConf(conf);
    corpus = c;

    // set up eigenverifier and orthoverifier TODO: allow multithreaded execution

    eigenVerifier = new SimpleEigenVerifier();
View Full Code Here

    }
    return eigenMetaData;
  }

  private void prepareEigens(Configuration conf, Path eigenInput, boolean inMemory) {
    DistributedRowMatrix eigens = new DistributedRowMatrix(eigenInput, tmpOut, 1, 1);
    eigens.setConf(conf);
    if (inMemory) {
      List<Vector> eigenVectors = Lists.newArrayList();
      for (MatrixSlice slice : eigens) {
        eigenVectors.add(slice.vector());
      }
View Full Code Here

    this.maxError = maxError;
    if (eigenInput != null && eigensToVerify == null) {
      prepareEigens(new Configuration(conf), eigenInput, inMemory);
    }

    DistributedRowMatrix c = new DistributedRowMatrix(corpusInput, tmpOut, 1, 1);
    c.setConf(new Configuration(conf));
    corpus = c;

    eigenVerifier = new SimpleEigenVerifier();

    Map<MatrixSlice,EigenStatus> eigenMetaData = verifyEigens();
View Full Code Here

    if (!succeeded) {
      throw new IllegalStateException("Job failed!");
    }

    // build the resulting DRM from the results
    return new DistributedRowMatrix(outputPath, tmpPath,
        diag.size(), diag.size());
  }
View Full Code Here

   */
  public static DistributedRowMatrix runJob(Path input, Path output, int dimensions)
    throws IOException, InterruptedException, ClassNotFoundException {
    Path seqFiles = new Path(output, "seqfiles-" + (System.nanoTime() & 0xFF));
    runJob(input, seqFiles, dimensions, dimensions);
    DistributedRowMatrix a = new DistributedRowMatrix(seqFiles,
        new Path(seqFiles, "seqtmp-" + (System.nanoTime() & 0xFF)),
        dimensions, dimensions);
    a.setConf(new Configuration());
    return a;
  }
View Full Code Here

    // (similar to the style of syntheticcontrol.canopy.InputMapper)
    Path affSeqFiles = new Path(outputCalc, "seqfile");
    AffinityMatrixInputJob.runJob(input, affSeqFiles, numDims, numDims);

    // Construct the affinity matrix using the newly-created sequence files
    DistributedRowMatrix A = new DistributedRowMatrix(affSeqFiles, new Path(outputTmp, "afftmp"), numDims, numDims);

    Configuration depConf = new Configuration(conf);
    A.setConf(depConf);

    // Construct the diagonal matrix D (represented as a vector)
    Vector D = MatrixDiagonalizeJob.runJob(affSeqFiles, numDims);

    // Calculate the normalized Laplacian of the form: L = D^(-0.5)AD^(-0.5)
    DistributedRowMatrix L = VectorMatrixMultiplicationJob.runJob(affSeqFiles, D, new Path(outputCalc, "laplacian"),
        new Path(outputCalc, outputCalc));
    L.setConf(depConf);

    Path data;

    if (ssvd) {
      // SSVD requires an array of Paths to function. So we pass in an array of length one
      Path[] LPath = new Path[1];
      LPath[0] = L.getRowPath();

      Path SSVDout = new Path(outputCalc, "SSVD");

      SSVDSolver solveIt = new SSVDSolver(depConf, LPath, SSVDout, blockHeight, clusters, oversampling, numReducers);

      solveIt.setComputeV(false);
      solveIt.setComputeU(true);
      solveIt.setOverwrite(true);
      solveIt.setQ(poweriters);
      // solveIt.setBroadcast(false);
      solveIt.run();
      data = new Path(solveIt.getUPath());
    } else {
      // Perform eigen-decomposition using LanczosSolver
      // since some of the eigen-output is spurious and will be eliminated
      // upon verification, we have to aim to overshoot and then discard
      // unnecessary vectors later
      int overshoot = Math.min((int) (clusters * OVERSHOOTMULTIPLIER), numDims);
      DistributedLanczosSolver solver = new DistributedLanczosSolver();
      LanczosState state = new LanczosState(L, overshoot, DistributedLanczosSolver.getInitialVector(L));
      Path lanczosSeqFiles = new Path(outputCalc, "eigenvectors");

      solver.runJob(conf, state, overshoot, true, lanczosSeqFiles.toString());

      // perform a verification
      EigenVerificationJob verifier = new EigenVerificationJob();
      Path verifiedEigensPath = new Path(outputCalc, "eigenverifier");
      verifier.runJob(conf, lanczosSeqFiles, L.getRowPath(), verifiedEigensPath, true, 1.0, clusters);

      Path cleanedEigens = verifier.getCleanedEigensPath();
      DistributedRowMatrix W = new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), clusters,
          numDims);
      W.setConf(depConf);
      DistributedRowMatrix Wtrans = W.transpose();
      data = Wtrans.getRowPath();
    }

    // Normalize the rows of Wt to unit length
    // normalize is important because it reduces the occurrence of two unique clusters combining into one
    Path unitVectors = new Path(outputCalc, "unitvectors");

    UnitVectorizerJob.runJob(data, unitVectors);

    DistributedRowMatrix Wt = new DistributedRowMatrix(unitVectors, new Path(unitVectors, "tmp"), clusters, numDims);
    Wt.setConf(depConf);
    data = Wt.getRowPath();

    // Generate initial clusters using EigenSeedGenerator which picks rows as centroids if that row contains max
    // eigen value in that column
    Path initialclusters = EigenSeedGenerator.buildFromEigens(conf, data,
        new Path(output, Cluster.INITIAL_CLUSTERS_DIR), clusters, measure);
View Full Code Here

   
    Path cleanEigenvectors = new Path(output,
        EigenVerificationJob.CLEAN_EIGENVECTORS);
   
    // now multiply the testdata matrix and the eigenvector matrix
    DistributedRowMatrix svdT = new DistributedRowMatrix(cleanEigenvectors,
        tmp, desiredRank, sampleDimension);
    Configuration conf = new Configuration(config);
    svdT.setConf(conf);
    DistributedRowMatrix a = new DistributedRowMatrix(testData, tmp,
        sampleData.size(), sampleDimension);
    a.setConf(conf);
    DistributedRowMatrix sData = a.transpose().times(svdT.transpose());
    sData.setConf(conf);
   
    // now run the Canopy job to prime kMeans canopies
    CanopyDriver.run(conf, sData.getRowPath(), output, measure, 8, 4, false,
        0.0, true);
    // now run the KMeans job
    Path kmeansOutput = new Path(output, "kmeans");
  KMeansDriver.run(sData.getRowPath(), new Path(output, "clusters-0"),
        kmeansOutput, measure, 0.001, 10, true, 0.0, true);
    // run ClusterDumper
    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
        kmeansOutput, 10), new Path(kmeansOutput, "clusteredPoints"));
    clusterDumper.printClusters(termDictionary);
View Full Code Here

        0.0, true, conf);
    Path cleanEigenvectors = new Path(output,
        EigenVerificationJob.CLEAN_EIGENVECTORS);
   
    // now multiply the testdata matrix and the eigenvector matrix
    DistributedRowMatrix svdT = new DistributedRowMatrix(cleanEigenvectors,
        tmp, desiredRank, sampleDimension);
    svdT.setConf(conf);
    DistributedRowMatrix a = new DistributedRowMatrix(testData, tmp,
        sampleData.size(), sampleDimension);
    a.setConf(conf);
    DistributedRowMatrix sData = a.transpose().times(svdT.transpose());
    sData.setConf(conf);
   
    // now run the Canopy job to prime kMeans canopies
    CanopyDriver.run(conf, sData.getRowPath(), output, measure, 8, 4, false,
        0.0, true);
    // now run the KMeans job
    Path kmeansOutput = new Path(output, "kmeans");
  KMeansDriver.run(sData.getRowPath(), new Path(output, "clusters-0"),
        kmeansOutput, measure, 0.001, 10, true, 0.0, true);
    // run ClusterDumper
    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
        kmeansOutput, 10), new Path(kmeansOutput, "clusteredPoints"));
    clusterDumper.printClusters(termDictionary);
View Full Code Here

TOP

Related Classes of org.apache.mahout.math.hadoop.DistributedRowMatrix

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.