Examples of org.apache.mahout.common.distance.DistanceMeasure

Package org.apache.mahout.common.distance

Examples of org.apache.mahout.common.distance.DistanceMeasure

org.apache.mahout.common.distance.DistanceMeasure
This interface is used for objects which can determine a distance metric between two points.


  @Test
  public void testEpsilon() {
    final int dataSize = 10000;
    final int querySize = 30;
    final DistanceMeasure metric = new EuclideanDistanceMeasure();


    // these determine the dimension for the test. Each scale is multiplied by each multiplier
    final List<Integer> scales = ImmutableList.of(10);
    final List<Integer> multipliers = ImmutableList.of(1, 2, 3, 5);

View Full Code Here

      clusters.add(cluster);
      value = (Writable) reader.getValueClass()
          .newInstance();
    }
    
    DistanceMeasure measure = new CosineDistanceMeasure();
    double max = 0;
    double min = Double.MAX_VALUE;
    double sum = 0;
    int count = 0;
    for (int i = 0; i < clusters.size(); i++) {
      for (int j = i + 1; j < clusters.size(); j++) {
        double d = measure.distance(clusters.get(i)
            .getCenter(), clusters.get(j).getCenter());
        min = Math.min(d, min);
        max = Math.max(d, max);
        sum += d;
        count++;

View Full Code Here

      }
      List<Vector> pointsVectors = new ArrayList<Vector>();
      for(VectorWritable point : points)
        pointsVectors.add(point.get());
      
      DistanceMeasure measure = new EuclideanDistanceMeasure();
      FuzzyKMeansClusterer clusterer = new FuzzyKMeansClusterer(measure, 0.001, 2);
      FuzzyKMeansClusterer.runFuzzyKMeansIteration(pointsVectors, reference, clusterer);
      
      for (SoftCluster key : reference) {
        String clusterId = key.getIdentifier();

View Full Code Here

  }
  
  /** Story: Test the reference implementation */
  public void testReferenceImplementation() throws Exception {
    List<Vector> points = getPoints(reference);
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    // try all possible values of k
    for (int k = 0; k < points.size(); k++) {
      System.out.println("Test k=" + (k + 1) + ':');
      // pick k initial cluster centers at random
      List<Cluster> clusters = new ArrayList<Cluster>();

View Full Code Here

    RandomUtils.useTestSeed();
    DisplayDirichlet.generateSamples();
    List<Vector> points = new ArrayList<Vector>();
    for (VectorWritable sample : sampleData)
      points.add(sample.get());
    DistanceMeasure measure = new ManhattanDistanceMeasure();
    List<SoftCluster> initialClusters = new ArrayList<SoftCluster>();
    
    k = 3;
    int i = 0;
    for (Vector point : points) {

View Full Code Here

    RandomUtils.useTestSeed();
    DisplayDirichlet.generateSamples();
    List<Vector> points = new ArrayList<Vector>();
    for (VectorWritable sample : sampleData)
      points.add(sample.get());
    DistanceMeasure measure = new ManhattanDistanceMeasure();
    List<Cluster> initialClusters = new ArrayList<Cluster>();
    k = 3;
    int i = 0;
    for (Vector point : points) {
      if (initialClusters.size() < Math.min(k, points.size())) {

View Full Code Here

      center.update(row);
    }
    // Given the centroid, we can compute \Delta_1^2(X), the total squared distance for the datapoints
    // this accelerates seed selection.
    double radius = 0;
    DistanceMeasure l2 = new SquaredEuclideanDistanceMeasure();
    for (WeightedVector row : datapoints) {
      radius += l2.distance(row, center);
    }


    // Find the first seed c_1 (and conceptually the second, c_2) as might be done in the 2-means clustering so that
    // the probability of selecting c_1 and c_2 is proportional to || c_1 - c_2 ||^2.  This is done
    // by first selecting c_1 with probability:
    //
    // p(c_1) = sum_{c_2} || c_1 - c_2 ||^2 \over sum_{c_1, c_2} || c_1 - c_2 ||^2
    //
    // This can be simplified to:
    //
    // p(c_1) = (\Delta_1^2(X) + n || c_1 - c ||^2) / (2 n \Delta_1^2(X))
    //
    // where c = \sum x / n and \Delta_1^2(X) = sum || x - c ||^2
    //
    // All subsequent seeds c_i (including c_2) can then be selected from the remaining points with probability
    // proportional to Pr(c_i == x_j) = min_{m < i} || c_m - x_j ||^2.


    // Multinomial distribution of vector indices for the selection seeds. These correspond to
    // the indices of the vectors in the original datapoints list.
    Multinomial<Integer> seedSelector = new Multinomial<Integer>();
    for (int i = 0; i < datapoints.size(); ++i) {
      double selectionProbability =
          radius + datapoints.size() * l2.distance(datapoints.get(i), center);
      seedSelector.add(i, selectionProbability);
    }


    Centroid c_1 = new Centroid((WeightedVector)datapoints.get(seedSelector.sample()).clone());
    c_1.setIndex(0);
    // Construct a set of weighted things which can be used for random selection.  Initial weights are
    // set to the squared distance from c_1
    for (int i = 0; i < datapoints.size(); ++i) {
      WeightedVector row = datapoints.get(i);
      final double w = l2.distance(c_1, row) * row.getWeight();
      seedSelector.set(i, w);
    }


    // From here, seeds are selected with probability proportional to:
    //
    // r_i = min_{c_j} || x_i - c_j ||^2
    //
    // when we only have c_1, we have already set these distances and as we select each new
    // seed, we update the minimum distances.
    centroids.add(c_1);
    int clusterIndex = 1;
    while (centroids.size() < numClusters) {
      // Select according to weights.
      int seedIndex = seedSelector.sample();
      Centroid nextSeed = new Centroid((WeightedVector)datapoints.get(seedIndex).clone());
      nextSeed.setIndex(clusterIndex++);
      centroids.add(nextSeed);
      // Don't select this one again.
      seedSelector.set(seedIndex, 0);
      // Re-weight everything according to the minimum distance to a seed.
      for (int currSeedIndex : seedSelector) {
        WeightedVector curr = datapoints.get(currSeedIndex);
        double newWeight = nextSeed.getWeight() * l2.distance(nextSeed, curr);
        if (newWeight < seedSelector.getWeight(currSeedIndex)) {
          seedSelector.set(currSeedIndex, newWeight);
        }
      }
    }

View Full Code Here

   * optimal k-means solution (given good starting points).
   *
   * @param datapoints          Rows containing WeightedVectors
   */
  private void iterativeAssignment(List<? extends WeightedVector> datapoints) {
    DistanceMeasure l2 = new EuclideanDistanceMeasure();
    // closestClusterDistances.get(i) is the distance from the i'th cluster to its closest
    // neighboring cluster.
    List<Double> closestClusterDistances = Lists.newArrayListWithExpectedSize(numClusters);
    // clusterAssignments[i] == j means that the i'th point is assigned to the j'th cluster. When
    // these don't change, we are done.
    List<Integer> clusterAssignments = Lists.newArrayListWithExpectedSize(datapoints.size());
    // Each point is assigned to the invalid "-1" cluster initially.
    for (int i = 0; i < datapoints.size(); ++i) {
      clusterAssignments.add(-1);
    }


    boolean changed = true;
    for (int i = 0; changed && i < maxNumIterations; i++) {
      // We compute what the distance between each cluster and its closest neighbor is to set a
      // proportional distance threshold for points that should be involved in calculating the
      // centroid.
      closestClusterDistances.clear();
      for (Vector center : centroids) {
        Vector closestOtherCluster = centroids.search(center, 2).get(1).getValue();
        closestClusterDistances.add(l2.distance(center, closestOtherCluster));
      }


      // Copies the current cluster centroids to newCentroids and sets their weights to 0. This is
      // so we calculate the new centroids as we go through the datapoints.
      List<Centroid> newCentroids = Lists.newArrayList();

View Full Code Here

   * @throws IOException 
   */
  @Test
  public void testAllSameValueCluster() throws IOException {
    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    initData(1, 0.25, measure);
    Canopy cluster = new Canopy(new DenseVector(new double[] { 0, 0 }), 19, measure);
    clusters.add(cluster);
    List<VectorWritable> points = new ArrayList<VectorWritable>();
    points.add(new VectorWritable(cluster.getCenter()));

View Full Code Here

  }


  @Test
  public void testCanopy() throws Exception {
    ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf);
    DistanceMeasure measure = new EuclideanDistanceMeasure();
    Configuration conf = new Configuration();
    CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, true);
    int numIterations = 10;
    Path clustersIn = new Path(output, "clusters-0");
    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure, numIterations, true);

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.mahout.common.distance.DistanceMeasure

mia.clustering.ch10.InterClusterDistances

org.apache.mahout.clustering.canopy.CanopyConfigKeys

org.apache.mahout.clustering.canopy.CanopyDriver

org.apache.mahout.clustering.cdbw.TestCDbwEvaluator

org.apache.mahout.clustering.classify.ClusterClassificationDriver

org.apache.mahout.clustering.classify.ClusterClassificationMapper

org.apache.mahout.clustering.ClusteringUtils

org.apache.mahout.clustering.dirichlet.DirichletDriver

org.apache.mahout.clustering.dirichlet.models.DistributionDescription

org.apache.mahout.clustering.display.DisplayFuzzyKMeans

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.