// Source Code of org.apache.mahout.clustering.kmeans.TestKmeansClustering

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.mahout.clustering.kmeans;




import junit.framework.TestCase;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.matrix.AbstractVector;
import org.apache.mahout.matrix.DenseVector;
import org.apache.mahout.matrix.SparseVector;
import org.apache.mahout.matrix.Vector;
import org.apache.mahout.utils.DistanceMeasure;
import org.apache.mahout.utils.DummyOutputCollector;
import org.apache.mahout.utils.EuclideanDistanceMeasure;
import org.apache.mahout.utils.ManhattanDistanceMeasure;


import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.nio.charset.Charset;


public class TestKmeansClustering extends TestCase {


  /**
   * Nine two-dimensional sample points, given as raw {x, y} coordinate pairs.
   * The tests in this class convert them to vectors with
   * {@link #getPoints(double[][])} and use them as the input data set.
   */
  public static final double[][] reference = { { 1, 1 }, { 2, 1 }, { 1, 2 },
      { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 }, { 5, 5 } };


  /**
   * Expected cluster sizes. Row {@code k} holds one entry per cluster for a
   * run that starts with {@code k + 1} clusters; entry {@code [k][c]} is the
   * number of points cluster {@code c} is expected to contain (compared
   * against {@code Cluster.getNumPoints()} in the reference test).
   */
  public static final int[][] expectedNumPoints = { { 9 }, { 4, 5 },
      { 4, 5, 0 }, { 1, 2, 1, 5 }, { 1, 1, 1, 2, 4 }, { 1, 1, 1, 1, 1, 4 },
      { 1, 1, 1, 1, 1, 2, 2 }, { 1, 1, 1, 1, 1, 1, 2, 1 },
      { 1, 1, 1, 1, 1, 1, 1, 1, 1 } };


  private void rmr(String path) throws Exception {
    File f = new File(path);
    if (f.exists()) {
      if (f.isDirectory()) {
        String[] contents = f.list();
        for (int i = 0; i < contents.length; i++)
          rmr(f.toString() + File.separator + contents[i]);
      }
      f.delete();
    }
  }


  protected void setUp() throws Exception {
    super.setUp();
    rmr("output");
    rmr("testdata");
  }


  /**
   * This is the reference k-means implementation. Given its inputs it iterates
   * over the points and clusters until their centers converge or until the
   * maximum number of iterations is exceeded.
   * 
   * @param points the input List<Vector> of points
   * @param clusters the initial List<Cluster> of clusters
   * @param measure the DistanceMeasure to use
   * @param maxIter the maximum number of iterations
   */
  private void referenceKmeans(List<Vector> points, List<Cluster> clusters,
      DistanceMeasure measure, int maxIter) {
    boolean converged = false;
    int iteration = 0;
    while (!converged && iteration++ < maxIter) {
      converged = iterateReference(points, clusters, measure);
    }
  }


  /**
   * Perform a single iteration over the points and clusters, assigning points
   * to clusters and returning if the iterations are completed.
   * 
   * @param points the List<Vector> having the input points
   * @param clusters the List<Cluster> clusters
   * @param measure a DistanceMeasure to use
   * @return
   */
  private boolean iterateReference(List<Vector> points, List<Cluster> clusters,
      DistanceMeasure measure) {
    boolean converged;
    converged = true;
    // iterate through all points, assigning each to the nearest cluster
    for (Vector point : points) {
      Cluster closestCluster = null;
      double closestDistance = Double.MAX_VALUE;
      for (Cluster cluster : clusters) {
        double distance = measure.distance(cluster.getCenter(), point);
        if (closestCluster == null || closestDistance > distance) {
          closestCluster = cluster;
          closestDistance = distance;
        }
      }
      closestCluster.addPoint(point);
    }
    // test for convergence
    for (Cluster cluster : clusters) {
      if (!cluster.computeConvergence())
        converged = false;
    }
    // update the cluster centers
    if (!converged)
      for (Cluster cluster : clusters)
        cluster.recomputeCenter();
    return converged;
  }


  public static List<Vector> getPoints(double[][] raw) {
    List<Vector> points = new ArrayList<Vector>();
    for (int i = 0; i < raw.length; i++) {
      double[] fr = raw[i];
      Vector vec = new SparseVector(fr.length);
      vec.assign(fr);
      points.add(vec);
    }
    return points;
  }


  /**
   * Story: Test the reference implementation
   * 
   * @throws Exception
   */
  public void testReferenceImplementation() throws Exception {
    List<Vector> points = getPoints(reference);
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    Cluster.config(measure, 0.001);
    // try all possible values of k
    for (int k = 0; k < points.size(); k++) {
      System.out.println("Test k=" + (k + 1) + ':');
      // pick k initial cluster centers at random
      List<Cluster> clusters = new ArrayList<Cluster>();
      for (int i = 0; i < k + 1; i++) {
        Vector vec = points.get(i);
        clusters.add(new VisibleCluster(vec));
      }
      // iterate clusters until they converge
      int maxIter = 10;
      referenceKmeans(points, clusters, measure, maxIter);
      for (int c = 0; c < clusters.size(); c++) {
        Cluster cluster = clusters.get(c);
        assertEquals("Cluster " + c + " test " + k, expectedNumPoints[k][c],
            cluster.getNumPoints());
        System.out.println(cluster.toString());
      }
    }
  }


  private Map<String, Cluster> loadClusterMap(List<Cluster> clusters) {
    Map<String, Cluster> clusterMap = new HashMap<String, Cluster>();


    for (Cluster cluster : clusters) {
      clusterMap.put(cluster.getIdentifier(), cluster);
    }
    return clusterMap;
  }


  /**
   * Story: test that the mapper will map input points to the nearest cluster
   * 
   * @throws Exception
   */
  public void testKMeansMapper() throws Exception {
    KMeansMapper mapper = new KMeansMapper();
    EuclideanDistanceMeasure euclideanDistanceMeasure = new EuclideanDistanceMeasure();
    Cluster.config(euclideanDistanceMeasure, 0.001);
    List<Vector> points = getPoints(reference);
    for (int k = 0; k < points.size(); k++) {
      // pick k initial cluster centers at random
      DummyOutputCollector<Text, Text> collector = new DummyOutputCollector<Text, Text>();
      List<Cluster> clusters = new ArrayList<Cluster>();


      for (int i = 0; i < k + 1; i++) {
        Cluster cluster = new Cluster(points.get(i));
        // add the center so the centroid will be correct upon output
        cluster.addPoint(cluster.getCenter());
        clusters.add(cluster);
      }


      Map<String, Cluster> clusterMap = loadClusterMap(clusters);
      mapper.config(clusters);
      // map the data
      for (Vector point : points) {
        mapper.map(new Text(), new Text(point.asFormatString()), collector,
            null);
      }
      assertEquals("Number of map results", k + 1, collector.getData().size());
      // now verify that all points are correctly allocated
      for (String key : collector.getKeys()) {
        Cluster cluster = clusterMap.get(key);
        List<Text> values = collector.getValue(key);
        for (Writable value : values) {
          String[] pointInfo = value.toString().split("\t");


          Vector point = AbstractVector.decodeVector(pointInfo[1]);
          double distance = euclideanDistanceMeasure.distance(cluster
              .getCenter(), point);
          for (Cluster c : clusters)
            assertTrue("distance error", distance <= euclideanDistanceMeasure
                .distance(point, c.getCenter()));
        }
      }
    }
  }


  /**
   * Story: test that the combiner will produce partial cluster totals for all
   * of the clusters and points that it sees
   * 
   * @throws Exception
   */
  public void testKMeansCombiner() throws Exception {
    KMeansMapper mapper = new KMeansMapper();
    EuclideanDistanceMeasure euclideanDistanceMeasure = new EuclideanDistanceMeasure();
    Cluster.config(euclideanDistanceMeasure, 0.001);
    List<Vector> points = getPoints(reference);
    for (int k = 0; k < points.size(); k++) {
      // pick k initial cluster centers at random
      DummyOutputCollector<Text, Text> collector = new DummyOutputCollector<Text, Text>();
      List<Cluster> clusters = new ArrayList<Cluster>();
      for (int i = 0; i < k + 1; i++) {
        Vector vec = points.get(i);


        Cluster cluster = new Cluster(vec);
        // add the center so the centroid will be correct upon output
        cluster.addPoint(cluster.getCenter());
        clusters.add(cluster);
      }
      mapper.config(clusters);
      // map the data
      for (Vector point : points) {
        mapper.map(new Text(), new Text(point.asFormatString()), collector,
            null);
      }
      // now combine the data
      KMeansCombiner combiner = new KMeansCombiner();
      DummyOutputCollector<Text, Text> collector2 = new DummyOutputCollector<Text, Text>();
      for (String key : collector.getKeys())
        combiner.reduce(new Text(key), collector.getValue(key).iterator(),
            collector2, null);


      assertEquals("Number of map results", k + 1, collector2.getData().size());
      // now verify that all points are accounted for
      int count = 0;
      Vector total = new DenseVector(2);
      for (String key : collector2.getKeys()) {
        List<Text> values = collector2.getValue(key);
        assertEquals("too many values", 1, values.size());
        String value = values.get(0).toString();


        String[] pointInfo = value.split("\t");
        count += Integer.parseInt(pointInfo[0]);
        total = total.plus(AbstractVector.decodeVector(pointInfo[1]));
      }
      assertEquals("total points", 9, count);
      assertEquals("point total[0]", 27, (int) total.get(0));
      assertEquals("point total[1]", 27, (int) total.get(1));
    }
  }


  /**
   * Story: test that the reducer will sum the partial cluster totals for all of
   * the clusters and points that it sees
   * 
   * @throws Exception
   */
  public void testKMeansReducer() throws Exception {
    KMeansMapper mapper = new KMeansMapper();
    EuclideanDistanceMeasure euclideanDistanceMeasure = new EuclideanDistanceMeasure();
    Cluster.config(euclideanDistanceMeasure, 0.001);
    List<Vector> points = getPoints(reference);
    for (int k = 0; k < points.size(); k++) {
      System.out.println("K = " + k);
      // pick k initial cluster centers at random
      DummyOutputCollector<Text, Text> collector = new DummyOutputCollector<Text, Text>();
      List<Cluster> clusters = new ArrayList<Cluster>();
      for (int i = 0; i < k + 1; i++) {
        Vector vec = points.get(i);
        Cluster cluster = new Cluster(vec, i);
        // add the center so the centroid will be correct upon output
        // cluster.addPoint(cluster.getCenter());
        clusters.add(cluster);
      }
      mapper.config(clusters);
      // map the data
      for (Vector point : points) {
        mapper.map(new Text(), new Text(point.asFormatString()), collector,
            null);
      }
      // now combine the data
      KMeansCombiner combiner = new KMeansCombiner();
      DummyOutputCollector<Text, Text> collector2 = new DummyOutputCollector<Text, Text>();
      for (String key : collector.getKeys())
        combiner.reduce(new Text(key), collector.getValue(key).iterator(),
            collector2, null);


      // now reduce the data
      KMeansReducer reducer = new KMeansReducer();
      reducer.config(clusters);
      DummyOutputCollector<Text, Text> collector3 = new DummyOutputCollector<Text, Text>();
      for (String key : collector2.getKeys())
        reducer.reduce(new Text(key), collector2.getValue(key).iterator(),
            collector3, null);


      assertEquals("Number of map results", k + 1, collector3.getData().size());


      // compute the reference result after one iteration and compare
      List<Cluster> reference = new ArrayList<Cluster>();
      for (int i = 0; i < k + 1; i++) {
        Vector vec = points.get(i);
        reference.add(new Cluster(vec, i));
      }
      boolean converged = iterateReference(points, reference,
          euclideanDistanceMeasure);
      if (k == 8)
        assertTrue("not converged? " + k, converged);
      else
        assertFalse("converged? " + k, converged);


      // now verify that all clusters have correct centers
      converged = true;
      for (int i = 0; i < reference.size(); i++) {
        Cluster ref = reference.get(i);
        String key = ref.getIdentifier();
        List<Text> values = collector3.getValue(key);
        String value = values.get(0).toString();
        Cluster cluster = Cluster.decodeCluster(value);
        converged = converged && cluster.isConverged();
        System.out.println("ref= " + ref.toString() + " cluster= "
            + cluster.toString());
        assertEquals(k + " center[" + key + "][0]", ref.getCenter().get(0),
            cluster.getCenter().get(0));
        assertEquals(k + " center[" + key + "][1]", ref.getCenter().get(1),
            cluster.getCenter().get(1));
      }
      if (k == 8)
        assertTrue("not converged? " + k, converged);
      else
        assertFalse("converged? " + k, converged);
    }
  }


  /**
   * Story: User wishes to run kmeans job on reference data
   * 
   * @throws Exception
   */
  public void testKMeansMRJob() throws Exception {
    List<Vector> points = getPoints(reference);
    File testData = new File("testdata");
    if (!testData.exists())
      testData.mkdir();
    testData = new File("testdata/points");
    if (!testData.exists())
      testData.mkdir();


    writePointsToFile(points, "testdata/points/file1");
    writePointsToFile(points, "testdata/points/file2");
    for (int k = 1; k < points.size(); k++) {
      System.out.println("testKMeansMRJob k= " + k);
      // pick k initial cluster centers at random
      JobConf job = new JobConf(KMeansDriver.class);
      FileSystem fs = FileSystem.get(job);
      Path path = new Path("testdata/clusters/part-00000");
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path,
          Text.class, Text.class);


      for (int i = 0; i < k + 1; i++) {
        Vector vec = points.get(i);


        Cluster cluster = new Cluster(vec, i);
        // add the center so the centroid will be correct upon output
        cluster.addPoint(cluster.getCenter());
        writer.append(new Text(cluster.getIdentifier()), new Text(Cluster
            .formatCluster(cluster)));
      }
      writer.close();
      // now run the Job
      KMeansJob.runJob("testdata/points", "testdata/clusters", "output",
          EuclideanDistanceMeasure.class.getName(), 0.001, 10, k + 1);
      // now compare the expected clusters with actual
      File outDir = new File("output/points");
      assertTrue("output dir exists?", outDir.exists());
      String[] outFiles = outDir.list();
      // assertEquals("output dir files?", 4, outFiles.length);
      BufferedReader reader = new BufferedReader(new InputStreamReader(
          new FileInputStream("output/points/part-00000"), Charset
              .forName("UTF-8")));
      int[] expect = expectedNumPoints[k];
      DummyOutputCollector<Text, Text> collector = new DummyOutputCollector<Text, Text>();
      while (reader.ready()) {
        String line = reader.readLine();
        String[] lineParts = line.split("\t");
        assertEquals("line parts", 2, lineParts.length);
        // String cl = line.substring(0, line.indexOf(':'));
        collector.collect(new Text(lineParts[1]), new Text(lineParts[0]));
      }
      reader.close();
      if (k == 2)
        // cluster 3 is empty so won't appear in output
        assertEquals("clusters[" + k + "]", expect.length - 1, collector
            .getKeys().size());
      else
        assertEquals("clusters[" + k + "]", expect.length, collector.getKeys()
            .size());
    }
  }


  /**
   * Story: User wants to use canopy clustering to input the initial clusters
   * for kmeans job.
   * 
   * @throws Exception
   */
  public void textKMeansWithCanopyClusterInput() throws Exception {
    List<Vector> points = getPoints(reference);
    File testData = new File("testdata");
    if (!testData.exists())
      testData.mkdir();
    testData = new File("testdata/points");
    if (!testData.exists())
      testData.mkdir();
    writePointsToFile(points, "testdata/points/file1");
    writePointsToFile(points, "testdata/points/file2");


    // now run the Canopy job
    CanopyDriver.runJob("testdata/points", "testdata/canopies",
        ManhattanDistanceMeasure.class.getName(), 3.1, 2.1);


    // now run the KMeans job
    KMeansJob.runJob("testdata/points", "testdata/canopies", "output",
        EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1);


    // now compare the expected clusters with actual
    File outDir = new File("output/points");
    assertTrue("output dir exists?", outDir.exists());
    String[] outFiles = outDir.list();
    assertEquals("output dir files?", 4, outFiles.length);
    BufferedReader reader = new BufferedReader(new InputStreamReader(
        new FileInputStream("output/points/part-00000"), Charset
            .forName("UTF-8")));
    DummyOutputCollector<Text, Text> collector = new DummyOutputCollector<Text, Text>();
    while (reader.ready()) {
      String line = reader.readLine();
      String[] lineParts = line.split("\t");
      assertEquals("line parts", 2, lineParts.length);
      String cl = line.substring(0, line.indexOf(':'));
      collector.collect(new Text(cl), new Text(lineParts[1]));
    }
    reader.close();
    assertEquals("num points[V0]", 4, collector.getValue("V0").size());
    assertEquals("num points[V1]", 5, collector.getValue("V1").size());
  }


  public static void writePointsToFileWithPayload(List<Vector> points,
      String fileName, String payload) throws IOException {
    BufferedWriter output = new BufferedWriter(new OutputStreamWriter(
        new FileOutputStream(fileName), Charset.forName("UTF-8")));
    for (Vector point : points) {
      output.write(point.asFormatString());
      output.write(payload);
      output.write('\n');
    }
    output.flush();
    output.close();
  }


  /**
   * Writes the points to the named file, one point per line, by delegating to
   * {@link #writePointsToFileWithPayload(List, String, String)} with an empty
   * payload.
   *
   * @param points the points to write
   * @param fileName the path of the file to create
   * @throws IOException if the file cannot be written
   */
  public static void writePointsToFile(List<Vector> points, String fileName)
      throws IOException {
    writePointsToFileWithPayload(points, fileName, "");
  }
}
// Source Code of org.apache.mahout.clustering.kmeans.TestKmeansClustering

// Related Classes of org.apache.mahout.clustering.kmeans.TestKmeansClustering