// Step 1: convert the raw affinity input into sequence files keyed for
// DistributedRowMatrix consumption.
// NOTE(review): (System.nanoTime() & 0xFF) yields only 256 distinct suffixes,
// so concurrent/repeated runs sharing outputCalc can collide — confirm paths
// are unique per invocation, or use a stronger unique id.
Path affSeqFiles = new Path(outputCalc, "seqfile-" + (System.nanoTime() & 0xFF));
AffinityMatrixInputJob.runJob(input, affSeqFiles, numDims, numDims);
// Next step: construct the affinity matrix using the newly-created
// sequence files
// A is the (numDims x numDims) affinity matrix backed by affSeqFiles.
DistributedRowMatrix A = new DistributedRowMatrix(affSeqFiles,
new Path(outputTmp, "afftmp-" + (System.nanoTime() & 0xFF)),
numDims,
numDims);
// Copy the caller's conf so later jobs can't perturb it; shared by every
// DistributedRowMatrix in this pipeline.
Configuration depConf = new Configuration(conf);
A.setConf(depConf);
// Next step: construct the diagonal matrix D (represented as a vector)
// and calculate the normalized Laplacian of the form:
// L = D^(-0.5)AD^(-0.5)
// D holds the row sums (degrees) of A as a dense vector of length numDims.
Vector D = MatrixDiagonalizeJob.runJob(affSeqFiles, numDims);
DistributedRowMatrix L =
VectorMatrixMultiplicationJob.runJob(affSeqFiles, D,
new Path(outputCalc, "laplacian-" + (System.nanoTime() & 0xFF)), new Path(outputCalc, "laplacian-tmp-" + (System.nanoTime() & 0xFF)));
L.setConf(depConf);
// Next step: perform eigen-decomposition using LanczosSolver
// since some of the eigen-output is spurious and will be eliminated
// upon verification, we have to aim to overshoot and then discard
// unnecessary vectors later
int overshoot = (int) ((double) clusters * OVERSHOOT_MULTIPLIER);
DistributedLanczosSolver solver = new DistributedLanczosSolver();
LanczosState state = new LanczosState(L, numDims, solver.getInitialVector(L));
Path lanczosSeqFiles = new Path(outputCalc, "eigenvectors-" + (System.nanoTime() & 0xFF));
// Run Lanczos for `overshoot` iterations; the boolean presumably flags L as
// symmetric (the normalized Laplacian is) — TODO confirm against the
// DistributedLanczosSolver.runJob signature for this Mahout version.
solver.runJob(conf,
state,
overshoot,
true,
lanczosSeqFiles.toString());
// perform a verification
// Drop spurious/duplicate eigenvectors, keeping at most `clusters` of them;
// 1.0 is the max-error tolerance and `true` presumably selects in-memory
// verification — TODO confirm parameter meanings for EigenVerificationJob.
EigenVerificationJob verifier = new EigenVerificationJob();
Path verifiedEigensPath = new Path(outputCalc, "eigenverifier");
verifier.runJob(conf, lanczosSeqFiles, L.getRowPath(), verifiedEigensPath, true, 1.0, clusters);
Path cleanedEigens = verifier.getCleanedEigensPath();
// W: the surviving eigenvectors as rows — declared clusters x numDims, which
// assumes verification retained exactly `clusters` vectors.
DistributedRowMatrix W = new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), clusters, numDims);
W.setConf(depConf);
DistributedRowMatrix Wtrans = W.transpose();
// NOTE(review): dead commented-out duplicate of the line above — candidate
// for removal.
// DistributedRowMatrix Wt = W.transpose();
// next step: normalize the rows of Wt to unit length
Path unitVectors = new Path(outputCalc, "unitvectors-" + (System.nanoTime() & 0xFF));
UnitVectorizerJob.runJob(Wtrans.getRowPath(), unitVectors);
// NOTE(review): Wtrans is the transpose of a clusters x numDims matrix, yet
// Wt is declared clusters x numDims again — looks like a dimension mismatch
// (expected numDims x clusters); verify against DistributedRowMatrix usage.
DistributedRowMatrix Wt = new DistributedRowMatrix(unitVectors, new Path(unitVectors, "tmp"), clusters, numDims);
Wt.setConf(depConf);
// Finally, perform k-means clustering on the rows of L (or W)
// generate random initial clusters
Path initialclusters = RandomSeedGenerator.buildRandom(conf,
Wt.getRowPath(),
new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
clusters,
measure);
// The output format is the same as the K-means output format.
// TODO: Perhaps a conversion of the output format from points and clusters
// in eigenspace to the original dataset. Currently, the user has to perform
// the association step after this job finishes on their own.
KMeansDriver.run(conf,
Wt.getRowPath(),
initialclusters,
output,
measure,
convergenceDelta,
maxIterations,