// (similar to the style of syntheticcontrol.canopy.InputMapper)
// Convert the raw text input into affinity-matrix entries stored as sequence files.
Path affSeqFiles = new Path(outputCalc, "seqfile");
AffinityMatrixInputJob.runJob(input, affSeqFiles, numDims, numDims);
// Construct the affinity matrix using the newly-created sequence files
DistributedRowMatrix A = new DistributedRowMatrix(affSeqFiles, new Path(outputTmp, "afftmp"), numDims, numDims);
// Defensive copy of the caller's Configuration so downstream jobs cannot
// mutate settings in the original conf object.
Configuration depConf = new Configuration(conf);
A.setConf(depConf);
// Construct the diagonal matrix D (represented as a vector)
// — presumably the degree (row sum) of each point in A; verify against MatrixDiagonalizeJob.
Vector D = MatrixDiagonalizeJob.runJob(affSeqFiles, numDims);
// Calculate the normalized Laplacian of the form: L = D^(-0.5)AD^(-0.5)
// NOTE(review): the last argument resolves outputCalc relative to itself
// (new Path(outputCalc, outputCalc)) — looks like a possible typo for a named
// tmp subdirectory; confirm this tmp-path choice is intentional before changing.
DistributedRowMatrix L = VectorMatrixMultiplicationJob.runJob(affSeqFiles, D, new Path(outputCalc, "laplacian"),
new Path(outputCalc, outputCalc));
L.setConf(depConf);
// Path to the matrix whose rows will be clustered; assigned by whichever
// eigen-decomposition branch runs below.
Path data;
if (ssvd) {
// SSVD requires an array of Paths to function. So we pass in an array of length one
Path[] LPath = new Path[1];
LPath[0] = L.getRowPath();
Path SSVDout = new Path(outputCalc, "SSVD");
SSVDSolver solveIt = new SSVDSolver(depConf, LPath, SSVDout, blockHeight, clusters, oversampling, numReducers);
// Only U is consumed below (data = getUPath()); V is skipped to save work.
solveIt.setComputeV(false);
solveIt.setComputeU(true);
solveIt.setOverwrite(true);
// Number of power iterations for the stochastic SVD — higher q trades time
// for accuracy (presumably; confirm against SSVDSolver.setQ docs).
solveIt.setQ(poweriters);
// solveIt.setBroadcast(false);
solveIt.run();
data = new Path(solveIt.getUPath());
} else {
// Perform eigen-decomposition using LanczosSolver
// since some of the eigen-output is spurious and will be eliminated
// upon verification, we have to aim to overshoot and then discard
// unnecessary vectors later
// (capped at numDims — the matrix cannot yield more eigenvectors than its order)
int overshoot = Math.min((int) (clusters * OVERSHOOTMULTIPLIER), numDims);
DistributedLanczosSolver solver = new DistributedLanczosSolver();
LanczosState state = new LanczosState(L, overshoot, DistributedLanczosSolver.getInitialVector(L));
Path lanczosSeqFiles = new Path(outputCalc, "eigenvectors");
// NOTE(review): the 'true' flag presumably marks L as symmetric — confirm against
// DistributedLanczosSolver.runJob's signature.
solver.runJob(conf, state, overshoot, true, lanczosSeqFiles.toString());
// perform a verification
// Drops spurious/duplicate eigenvectors, keeping at most 'clusters' of them;
// the 1.0 argument appears to be the max-error tolerance — verify parameter
// meaning against EigenVerificationJob.runJob.
EigenVerificationJob verifier = new EigenVerificationJob();
Path verifiedEigensPath = new Path(outputCalc, "eigenverifier");
verifier.runJob(conf, lanczosSeqFiles, L.getRowPath(), verifiedEigensPath, true, 1.0, clusters);
Path cleanedEigens = verifier.getCleanedEigensPath();
// W: one row per retained eigenvector (declared clusters rows x numDims cols).
DistributedRowMatrix W = new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), clusters,
numDims);
W.setConf(depConf);
// Transpose so each row corresponds to an input point rather than an eigenvector.
DistributedRowMatrix Wtrans = W.transpose();
data = Wtrans.getRowPath();
}
// Normalize the rows of Wt to unit length
// normalize is important because it reduces the occurrence of two unique clusters combining into one
Path unitVectors = new Path(outputCalc, "unitvectors");
UnitVectorizerJob.runJob(data, unitVectors);
// NOTE(review): dimensions are declared as (clusters rows x numDims cols), but in the
// Lanczos branch 'data' is the TRANSPOSED eigenvector matrix (points x eigenvectors,
// i.e. numDims x clusters) — the counts look swapped. Only getRowPath() is consumed
// below, so this may be inert metadata; confirm before changing.
DistributedRowMatrix Wt = new DistributedRowMatrix(unitVectors, new Path(unitVectors, "tmp"), clusters, numDims);
Wt.setConf(depConf);
data = Wt.getRowPath();
// Generate initial clusters using EigenSeedGenerator which picks rows as centroids if that row contains max
// eigen value in that column
Path initialclusters = EigenSeedGenerator.buildFromEigens(conf, data,
new Path(output, Cluster.INITIAL_CLUSTERS_DIR), clusters, measure);