Package org.apache.mahout.math.hadoop

Examples of org.apache.mahout.math.hadoop.DistributedRowMatrix


  private static final Logger log = LoggerFactory.getLogger(TestDistributedLanczosSolverCLI.class);

  @Test
  public void testDistributedLanczosSolverCLI() throws Exception {
    Path testData = getTestTempDirPath("testdata");
    DistributedRowMatrix corpus =
        new TestDistributedRowMatrix().randomDenseHierarchicalDistributedMatrix(10, 9, false,
            testData.toString());
    corpus.setConf(getConfiguration());
    Path output = getTestTempDirPath("output");
    Path tmp = getTestTempDirPath("tmp");
    Path workingDir = getTestTempDirPath("working");
    String[] args = {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "10",
        "--numCols", "9",
        "--rank", "6",
        "--symmetric", "false",
        "--workingDir", workingDir.toString()
    };
    ToolRunner.run(getConfiguration(), new DistributedLanczosSolver().new DistributedLanczosSolverJob(), args);

    output = getTestTempDirPath("output2");
    tmp = getTestTempDirPath("tmp2");
    args = new String[] {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "10",
        "--numCols", "9",
        "--rank", "7",
        "--symmetric", "false",
        "--workingDir", workingDir.toString()
    };
    ToolRunner.run(getConfiguration(), new DistributedLanczosSolver().new DistributedLanczosSolverJob(), args);

    Path rawEigenvectors = new Path(output, DistributedLanczosSolver.RAW_EIGENVECTORS);
    Matrix eigenVectors = new DenseMatrix(7, corpus.numCols());
    Configuration conf = getConfiguration();

    int i = 0;
    for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(rawEigenvectors, conf)) {
      Vector v = value.get();
View Full Code Here


  }

  @Test
  public void testDistributedLanczosSolverEVJCLI() throws Exception {
    Path testData = getTestTempDirPath("testdata");
    DistributedRowMatrix corpus = new TestDistributedRowMatrix()
        .randomDenseHierarchicalDistributedMatrix(10, 9, false, testData.toString());
    corpus.setConf(getConfiguration());
    Path output = getTestTempDirPath("output");
    Path tmp = getTestTempDirPath("tmp");
    String[] args = {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "10",
        "--numCols", "9",
        "--rank", "6",
        "--symmetric", "false",
        "--cleansvd", "true"
    };
    ToolRunner.run(getConfiguration(), new DistributedLanczosSolver().new DistributedLanczosSolverJob(), args);
 
    Path cleanEigenvectors = new Path(output, EigenVerificationJob.CLEAN_EIGENVECTORS);
    Matrix eigenVectors = new DenseMatrix(6, corpus.numCols());
    Collection<Double> eigenvalues = Lists.newArrayList();

    output = getTestTempDirPath("output2");
    tmp = getTestTempDirPath("tmp2");
    args = new String[] {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "10",
        "--numCols", "9",
        "--rank", "7",
        "--symmetric", "false",
        "--cleansvd", "true"
    };
    ToolRunner.run(getConfiguration(), new DistributedLanczosSolver().new DistributedLanczosSolverJob(), args);
    Path cleanEigenvectors2 = new Path(output, EigenVerificationJob.CLEAN_EIGENVECTORS);
    Matrix eigenVectors2 = new DenseMatrix(7, corpus.numCols());
    Configuration conf = getConfiguration();
    Collection<Double> newEigenValues = Lists.newArrayList();

    int i = 0;
    for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(cleanEigenvectors, conf)) {
View Full Code Here

   */

  private LanczosState doTestDistributedLanczosSolver(boolean symmetric,
      int desiredRank, boolean hdfsBackedState)
      throws IOException {
    DistributedRowMatrix corpus = getCorpus(symmetric);
    Configuration conf = getConfiguration();
    corpus.setConf(conf);
    DistributedLanczosSolver solver = new DistributedLanczosSolver();
    Vector intitialVector = DistributedLanczosSolver.getInitialVector(corpus);
    LanczosState state;
    if (hdfsBackedState) {
      HdfsBackedLanczosState hState = new HdfsBackedLanczosState(corpus,
View Full Code Here

    counter++;
    return state;
  }

  public void doTestResumeIteration(boolean symmetric) throws IOException {
    DistributedRowMatrix corpus = getCorpus(symmetric);
    Configuration conf = getConfiguration();
    corpus.setConf(conf);
    DistributedLanczosSolver solver = new DistributedLanczosSolver();
    int rank = 10;
    Vector intitialVector = DistributedLanczosSolver.getInitialVector(corpus);
    HdfsBackedLanczosState state = new HdfsBackedLanczosState(corpus, rank,
        intitialVector, new Path(getTestTempDirPath(), "lanczosStateDir" + suf(symmetric) + counter));
View Full Code Here

                             int numRows,
                             int numCols,
                             boolean isSymmetric,
                             int desiredRank,
                             String outputEigenVectorPathString) throws IOException {
    DistributedRowMatrix matrix = new DistributedRowMatrix(inputPath, outputTmpPath, numRows, numCols);
    matrix.setConf(new Configuration(originalConfig));
    LanczosState state = new LanczosState(matrix, numCols, desiredRank, getInitialVector(matrix));
    return runJob(originalConfig, state, desiredRank, isSymmetric, outputEigenVectorPathString);
  }
View Full Code Here

                 Path workingDirPath,
                 int numRows,
                 int numCols,
                 boolean isSymmetric,
                 int desiredRank) throws Exception {
    DistributedRowMatrix matrix = new DistributedRowMatrix(inputPath, outputTmpPath, numRows, numCols);
    matrix.setConf(new Configuration(getConf() != null ? getConf() : new Configuration()));

    LanczosState state;
    if(workingDirPath == null) {
      state = new LanczosState(matrix, numCols, desiredRank, getInitialVector(matrix));
    } else {
      HdfsBackedLanczosState hState =
          new HdfsBackedLanczosState(matrix, numCols, desiredRank, getInitialVector(matrix),
              workingDirPath);
      hState.setConf(matrix.getConf());
      state = hState;
    }
    solve(state, desiredRank, isSymmetric);

    Path outputEigenVectorPath = new Path(outputPath, RAW_EIGENVECTORS);
View Full Code Here

        false, desiredRank, 0.5, 0.0, false);

    Path cleanEigenvectors = new Path(output, EigenVerificationJob.CLEAN_EIGENVECTORS);

    // now multiply the testdata matrix and the eigenvector matrix
    DistributedRowMatrix svdT = new DistributedRowMatrix(cleanEigenvectors, tmp,
        desiredRank, sampleDimension);
    Configuration conf = new Configuration(config);
    svdT.setConf(conf);
    DistributedRowMatrix a = new DistributedRowMatrix(testData, tmp, sampleData.size(), sampleDimension);
    a.setConf(conf);
    DistributedRowMatrix sData = a.transpose().times(svdT.transpose());
    sData.setConf(conf);

    // now run the Canopy job to prime kMeans canopies
    CanopyDriver.run(conf, sData.getRowPath(), output, measure, 8, 4, false, true);
    // now run the KMeans job
    KMeansDriver.run(sData.getRowPath(), new Path(output, "clusters-0"), output, measure, 0.001, 10, true, true);
    // run ClusterDumper
    ClusterDumper clusterDumper =
        new ClusterDumper(finalClusterPath(conf, output, 10), new Path(output, "clusteredPoints"));
    clusterDumper.printClusters(termDictionary);
  }
View Full Code Here

    Configuration conf = new Configuration(config);
    new EigenVerificationJob().run(testData, rawEigenvectors, output, tmp, 0.5, 0.0, true, conf);
    Path cleanEigenvectors = new Path(output, EigenVerificationJob.CLEAN_EIGENVECTORS);

    // now multiply the testdata matrix and the eigenvector matrix
    DistributedRowMatrix svdT = new DistributedRowMatrix(cleanEigenvectors, tmp, desiredRank,
        sampleDimension);
    svdT.setConf(conf);
    DistributedRowMatrix a = new DistributedRowMatrix(testData, tmp, sampleData.size(),
        sampleDimension);
    a.setConf(conf);
    DistributedRowMatrix sData = a.transpose().times(svdT.transpose());
    sData.setConf(conf);

    // now run the Canopy job to prime kMeans canopies
    CanopyDriver.run(conf, sData.getRowPath(), output, measure, 8, 4, false, true);
    // now run the KMeans job
    KMeansDriver.run(sData.getRowPath(), new Path(output, "clusters-0"), output, measure,
        0.001, 10, true, true);
    // run ClusterDumper
    ClusterDumper clusterDumper =
        new ClusterDumper(finalClusterPath(conf, output, 10), new Path(output, "clusteredPoints"));
    clusterDumper.printClusters(termDictionary);
View Full Code Here

  private static final Logger log = LoggerFactory.getLogger(TestDistributedLanczosSolverCLI.class);

  @Test
  public void testDistributedLanczosSolverCLI() throws Exception {
    Path testData = getTestTempDirPath("testdata");
    DistributedRowMatrix corpus =
        new TestDistributedRowMatrix().randomDenseHierarchicalDistributedMatrix(50, 45, false,
            testData.toString());
    corpus.setConf(new Configuration());
    Path output = getTestTempDirPath("output");
    Path tmp = getTestTempDirPath("tmp");
    Path workingDir = getTestTempDirPath("working");
    String[] args = {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "50",
        "--numCols", "45",
        "--rank", "30",
        "--symmetric", "false",
        "--workingDir", workingDir.toString()
    };
    new DistributedLanczosSolver().new DistributedLanczosSolverJob().run(args);

    output = getTestTempDirPath("output2");
    tmp = getTestTempDirPath("tmp2");
    args = new String[] {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "50",
        "--numCols", "45",
        "--rank", "35",
        "--symmetric", "false",
        "--workingDir", workingDir.toString()
    };
    new DistributedLanczosSolver().new DistributedLanczosSolverJob().run(args);

    Path rawEigenvectors = new Path(output, DistributedLanczosSolver.RAW_EIGENVECTORS);
    Matrix eigenVectors = new DenseMatrix(35, corpus.numCols());
    Configuration conf = new Configuration();

    int i = 0;
    for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(rawEigenvectors, conf)) {
      Vector v = value.get();
View Full Code Here

  }

  @Test
  public void testDistributedLanczosSolverEVJCLI() throws Exception {
    Path testData = getTestTempDirPath("testdata");
    DistributedRowMatrix corpus = new TestDistributedRowMatrix()
        .randomDenseHierarchicalDistributedMatrix(50, 45, false, testData.toString());
    corpus.setConf(new Configuration());
    Path output = getTestTempDirPath("output");
    Path tmp = getTestTempDirPath("tmp");
    String[] args = {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "50",
        "--numCols", "45",
        "--rank", "30",
        "--symmetric", "false",
        "--cleansvd", "true"
    };
    new DistributedLanczosSolver().new DistributedLanczosSolverJob().run(args);
 
    Path cleanEigenvectors = new Path(output, EigenVerificationJob.CLEAN_EIGENVECTORS);
    Matrix eigenVectors = new DenseMatrix(30, corpus.numCols());
    Collection<Double> eigenvalues = new ArrayList<Double>();

    output = getTestTempDirPath("output2");
    tmp = getTestTempDirPath("tmp2");
    args = new String[] {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "50",
        "--numCols", "45",
        "--rank", "35",
        "--symmetric", "false",
        "--cleansvd", "true"
    };
    new DistributedLanczosSolver().new DistributedLanczosSolverJob().run(args);
    Path cleanEigenvectors2 = new Path(output, EigenVerificationJob.CLEAN_EIGENVECTORS);
    Matrix eigenVectors2 = new DenseMatrix(35, corpus.numCols());
    Configuration conf = new Configuration();
    Collection<Double> newEigenValues = new ArrayList<Double>();

    int i = 0;
    for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(cleanEigenvectors, conf)) {
View Full Code Here

TOP

Related Classes of org.apache.mahout.math.hadoop.DistributedRowMatrix

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.