// Spectral k-means pipeline: affinity matrix -> normalized Laplacian ->
// Lanczos eigen-decomposition -> eigenvector cleanup/normalization -> k-means.
//
// NOTE(review): (System.nanoTime() & 0xFF) yields only 256 distinct suffixes,
// so re-runs against the same outputCalc/outputTmp dirs can collide with stale
// data from a previous run — consider a wider salt or a per-run directory.
Path affSeqFiles = new Path(outputCalc, "seqfile-" + (System.nanoTime() & 0xFF));
// Stage 1: materialize the pairwise affinities as sequence files
// (numDims x numDims, i.e. one row per input point).
AffinityMatrixInputJob.runJob(input, affSeqFiles, numDims, numDims);
// Next step: construct the affinity matrix A using the newly-created
// sequence files.
DistributedRowMatrix A = new DistributedRowMatrix(affSeqFiles,
new Path(outputTmp, "afftmp-" + (System.nanoTime() & 0xFF)),
numDims,
numDims);
// Share one JobConf across all DistributedRowMatrix instances in this pipeline.
JobConf depConf = new JobConf(conf);
A.configure(depConf);
// Next step: construct the diagonal degree matrix D (represented as a vector
// of row sums of A) and calculate the normalized Laplacian of the form:
// L = D^(-0.5) A D^(-0.5)
Vector D = MatrixDiagonalizeJob.runJob(affSeqFiles, numDims);
DistributedRowMatrix L =
VectorMatrixMultiplicationJob.runJob(affSeqFiles, D,
new Path(outputCalc, "laplacian-" + (System.nanoTime() & 0xFF)));
L.configure(depConf);
// Next step: perform eigen-decomposition using LanczosSolver.
// Since some of the eigen-output is spurious and will be eliminated
// upon verification, we have to aim to overshoot and then discard
// unnecessary vectors later.
int overshoot = (int) ((double) clusters * OVERSHOOT_MULTIPLIER);
List<Double> eigenValues = new ArrayList<Double>(overshoot);
Matrix eigenVectors = new DenseMatrix(overshoot, numDims);
DistributedLanczosSolver solver = new DistributedLanczosSolver();
Path lanczosSeqFiles = new Path(outputCalc, "eigenvectors-" + (System.nanoTime() & 0xFF));
// Raw (unverified) eigenvectors/eigenvalues land in lanczosSeqFiles.
// The 'true' flag presumably requests symmetric treatment of L — TODO confirm
// against DistributedLanczosSolver.runJob's parameter order.
solver.runJob(conf,
L.getRowPath(),
new Path(outputTmp, "lanczos-" + (System.nanoTime() & 0xFF)),
L.numRows(),
L.numCols(),
true,
overshoot,
eigenVectors,
eigenValues,
lanczosSeqFiles.toString());
// Perform a verification pass to discard spurious eigenvectors, keeping
// only the top 'clusters' verified ones.
// NOTE(review): the hard-coded 1.0 / 0.0 look like maxError / minEigenvalue
// thresholds — confirm against EigenVerificationJob.runJob's signature.
EigenVerificationJob verifier = new EigenVerificationJob();
Path verifiedEigensPath = new Path(outputCalc, "eigenverifier");
verifier.runJob(conf, lanczosSeqFiles, L.getRowPath(), verifiedEigensPath, true, 1.0, 0.0, clusters);
Path cleanedEigens = verifier.getCleanedEigensPath();
// W holds the cleaned eigenvectors as rows: clusters rows x numDims columns.
DistributedRowMatrix W = new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), clusters, numDims);
W.configure(depConf);
// Transpose so each row of Wtrans corresponds to one original data point
// embedded in eigen-space.
DistributedRowMatrix Wtrans = W.transpose();
// Next step: normalize the rows of the transposed matrix to unit length.
Path unitVectors = new Path(outputCalc, "unitvectors-" + (System.nanoTime() & 0xFF));
UnitVectorizerJob.runJob(Wtrans.getRowPath(), unitVectors);
// NOTE(review): Wt is declared clusters x numDims, the same dimensions as W,
// even though it wraps the transposed, normalized data — verify these
// dimension arguments are intentional.
DistributedRowMatrix Wt = new DistributedRowMatrix(unitVectors, new Path(unitVectors, "tmp"), clusters, numDims);
Wt.configure(depConf);
// Finally, perform k-means clustering on the unit-normalized eigen-space rows.
// Generate random initial clusters seeded from Wt's rows.
Path initialclusters = RandomSeedGenerator.buildRandom(Wt.getRowPath(),
new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
clusters,
measure);
// runClustering=true, runSequential=false (i.e. run as MapReduce) —
// TODO confirm the trailing boolean flags against KMeansDriver.run's signature.
KMeansDriver.run(conf, Wt.getRowPath(), initialclusters, output, measure, convergenceDelta, maxIterations, true, false);
// Read through the cluster assignments produced by k-means.
Path clusteredPointsPath = new Path(output, "clusteredPoints");
FileSystem fs = FileSystem.get(conf);
// NOTE(review): reads only the single "part-m-00000" shard — multi-reducer
// output would be missed. The reader is not closed within this span;
// presumably it is closed later in the method — verify, else this leaks.
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(clusteredPointsPath, "part-m-00000"), conf);