Examples of org.apache.mahout.math.Centroid

org.apache.mahout.math.Centroid
A centroid is a weighted vector. We have it delegate to the vector itself for lots of operations to make it easy to use vector search classes and such.

   */
  private void initializeSeeds(List<? extends WeightedVector> datapoints) {
    Preconditions.checkArgument(datapoints.size() > 1, "Must have at least two datapoints points to cluster " +
        "sensibly");
    // Compute the centroid of all of the datapoints.  This is then used to compute the squared radius of the datapoints.
    Centroid center = new Centroid(datapoints.iterator().next());
    for (WeightedVector row : Iterables.skip(datapoints, 1)) {
      center.update(row);
    }
    // Given the centroid, we can compute \Delta_1^2(X), the total squared distance for the datapoints
    // this accelerates seed selection.
    double radius = 0;
    DistanceMeasure l2 = new SquaredEuclideanDistanceMeasure();
    for (WeightedVector row : datapoints) {
      radius += l2.distance(row, center);
    }


    // Find the first seed c_1 (and conceptually the second, c_2) as might be done in the 2-means clustering so that
    // the probability of selecting c_1 and c_2 is proportional to || c_1 - c_2 ||^2.  This is done
    // by first selecting c_1 with probability:
    //
    // p(c_1) = sum_{c_1} || c_1 - c_2 ||^2 \over sum_{c_1, c_2} || c_1 - c_2 ||^2
    //
    // This can be simplified to:
    //
    // p(c_1) = \Delta_1^2(X) + n || c_1 - c ||^2 / (2 n \Delta_1^2(X))
    //
    // where c = \sum x / n and \Delta_1^2(X) = sum || x - c ||^2
    //
    // All subsequent seeds c_i (including c_2) can then be selected from the remaining points with probability
    // proportional to Pr(c_i == x_j) = min_{m < i} || c_m - x_j ||^2.


    // Multinomial distribution of vector indices for the selection seeds. These correspond to
    // the indices of the vectors in the original datapoints list.
    Multinomial<Integer> seedSelector = new Multinomial<Integer>();
    for (int i = 0; i < datapoints.size(); ++i) {
      double selectionProbability =
          radius + datapoints.size() * l2.distance(datapoints.get(i), center);
      seedSelector.add(i, selectionProbability);
    }


    Centroid c_1 = new Centroid((WeightedVector)datapoints.get(seedSelector.sample()).clone());
    c_1.setIndex(0);
    // Construct a set of weighted things which can be used for random selection.  Initial weights are
    // set to the squared distance from c_1
    for (int i = 0; i < datapoints.size(); ++i) {
      WeightedVector row = datapoints.get(i);
      final double w = l2.distance(c_1, row) * row.getWeight();
      seedSelector.set(i, w);
    }


    // From here, seeds are selected with probablity proportional to:
    //
    // r_i = min_{c_j} || x_i - c_j ||^2
    //
    // when we only have c_1, we have already set these distances and as we select each new
    // seed, we update the minimum distances.
    centroids.add(c_1);
    int clusterIndex = 1;
    while (centroids.size() < numClusters) {
      // Select according to weights.
      int seedIndex = seedSelector.sample();
      Centroid nextSeed = new Centroid((WeightedVector)datapoints.get(seedIndex).clone());
      nextSeed.setIndex(clusterIndex++);
      centroids.add(nextSeed);
      // Don't select this one again.
      seedSelector.set(seedIndex, 0);
      // Re-weight everything according to the minimum distance to a seed.
      for (int currSeedIndex : seedSelector) {
        WeightedVector curr = datapoints.get(currSeedIndex);
        double newWeight = nextSeed.getWeight() * l2.distance(nextSeed, curr);
        if (newWeight < seedSelector.getWeight(currSeedIndex)) {
          seedSelector.set(currSeedIndex, newWeight);
        }
      }
    }

View Full Code Here

      // Copies the current cluster centroids to newClusters and sets their weights to 0. This is
      // so we calculate the new centroids as we go through the datapoints.
      List<Centroid> newCentroids = Lists.newArrayList();
      for (Vector centroid : centroids) {
        // need a deep copy because we will mutate these values
        Centroid newCentroid = (Centroid)centroid.clone();
        newCentroid.setWeight(0);
        newCentroids.add(newCentroid);
      }


      // Pass over the datapoints computing new centroids.
      for (int j = 0; j < datapoints.size(); ++j) {

View Full Code Here

      int pow2J = 1 << (numDimensions - 1);
      for (int j = 0; j < numDimensions; ++j) {
        v.set(j, 1.0 / pow2J * (i & pow2J));
        pow2J >>= 1;
      }
      mean.add(new Centroid(i, v, 1));
      rowSamplers.add(new MultiNormal(distributionRadius, v));
    }


    // Sample the requested number of data points.
    List<Centroid> data = Lists.newArrayListWithCapacity(numDatapoints);
    for (int i = 0; i < numDatapoints; ++i) {
      data.add(new Centroid(i, rowSamplers.get(i % pow2N).sample(), 1));
    }
    return new Pair<List<Centroid>, List<Centroid>>(data, mean);
  }

View Full Code Here


  public static Centroid read(DataInput dataInput) throws IOException {
    int index = dataInput.readInt();
    double weight = dataInput.readDouble();
    Vector v = VectorWritable.readVector(dataInput);
    return new Centroid(index, v, weight);
  }

View Full Code Here

    estimateDistanceCutoff = false;
  }


  @Override
  public void map(Writable key, VectorWritable point, Context context) {
    Centroid centroid = new Centroid(numPoints++, point.get(), 1);
    if (estimateDistanceCutoff) {
      if (numPoints < NUM_ESTIMATE_POINTS) {
        estimatePoints.add(centroid);
      } else if (numPoints == NUM_ESTIMATE_POINTS) {
        clusterEstimatePoints();

View Full Code Here

    return Iterables.transform(inputIterable, new Function<VectorWritable, Centroid>() {
      private int numVectors = 0;
      @Override
      public Centroid apply(VectorWritable input) {
        Preconditions.checkNotNull(input);
        return new Centroid(numVectors++, new RandomAccessSparseVector(input.get()), 1);
      }
    });
  }

View Full Code Here

      public Centroid apply(Vector input) {
        Preconditions.checkNotNull(input);
        if (input instanceof Centroid) {
          return (Centroid) input;
        } else {
          return new Centroid(numVectors++, input, 1);
        }
      }
    });
  }

View Full Code Here

    SequenceFile.Writer writer = SequenceFile.createWriter(FileSystem.get(conf), conf, output, IntWritable.class,
        CentroidWritable.class);
    int numCentroids = 0;
    // Run BallKMeans on the intermediate centroids.
    for (Vector finalVector : StreamingKMeansReducer.getBestCentroids(intermediateCentroids, conf)) {
      Centroid finalCentroid = (Centroid)finalVector;
      writer.append(new IntWritable(numCentroids++), new CentroidWritable(finalCentroid));
    }
    writer.close();
    long end = System.currentTimeMillis();
    log.info("Finished BallKMeans. Took {}.", (end - start) / 1000.0);

View Full Code Here

      seedSelector.add(i, datapoints.get(i).getWeight() / totalWeight);
    }
    for (int i = 0; i < numClusters; ++i) {
      int sample = seedSelector.sample();
      seedSelector.delete(sample);
      Centroid centroid = new Centroid(datapoints.get(sample));
      centroid.setIndex(i);
      centroids.add(centroid);
    }
  }

View Full Code Here

    Preconditions.checkArgument(datapoints.size() > 1, "Must have at least two datapoints points to cluster " +
        "sensibly");
    Preconditions.checkArgument(datapoints.size() >= numClusters,
        String.format("Must have more datapoints [%d] than clusters [%d]", datapoints.size(), numClusters));
    // Compute the centroid of all of the datapoints.  This is then used to compute the squared radius of the datapoints.
    Centroid center = new Centroid(datapoints.iterator().next());
    for (WeightedVector row : Iterables.skip(datapoints, 1)) {
      center.update(row);
    }


    // Given the centroid, we can compute \Delta_1^2(X), the total squared distance for the datapoints
    // this accelerates seed selection.
    double deltaX = 0;
    DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
    for (WeightedVector row : datapoints) {
      deltaX += distanceMeasure.distance(row, center);
    }


    // Find the first seed c_1 (and conceptually the second, c_2) as might be done in the 2-means clustering so that
    // the probability of selecting c_1 and c_2 is proportional to || c_1 - c_2 ||^2.  This is done
    // by first selecting c_1 with probability:
    //
    // p(c_1) = sum_{c_1} || c_1 - c_2 ||^2 \over sum_{c_1, c_2} || c_1 - c_2 ||^2
    //
    // This can be simplified to:
    //
    // p(c_1) = \Delta_1^2(X) + n || c_1 - c ||^2 / (2 n \Delta_1^2(X))
    //
    // where c = \sum x / n and \Delta_1^2(X) = sum || x - c ||^2
    //
    // All subsequent seeds c_i (including c_2) can then be selected from the remaining points with probability
    // proportional to Pr(c_i == x_j) = min_{m < i} || c_m - x_j ||^2.


    // Multinomial distribution of vector indices for the selection seeds. These correspond to
    // the indices of the vectors in the original datapoints list.
    Multinomial<Integer> seedSelector = new Multinomial<Integer>();
    for (int i = 0; i < datapoints.size(); ++i) {
      double selectionProbability =
          deltaX + datapoints.size() * distanceMeasure.distance(datapoints.get(i), center);
      seedSelector.add(i, selectionProbability);
    }


    int selected = random.nextInt(datapoints.size());
    Centroid c_1 = new Centroid(datapoints.get(selected).clone());
    c_1.setIndex(0);
    // Construct a set of weighted things which can be used for random selection.  Initial weights are
    // set to the squared distance from c_1
    for (int i = 0; i < datapoints.size(); ++i) {
      WeightedVector row = datapoints.get(i);
      double w = distanceMeasure.distance(c_1, row) * 2 * Math.log(1 + row.getWeight());
      seedSelector.set(i, w);
    }


    // From here, seeds are selected with probability proportional to:
    //
    // r_i = min_{c_j} || x_i - c_j ||^2
    //
    // when we only have c_1, we have already set these distances and as we select each new
    // seed, we update the minimum distances.
    centroids.add(c_1);
    int clusterIndex = 1;
    while (centroids.size() < numClusters) {
      // Select according to weights.
      int seedIndex = seedSelector.sample();
      Centroid nextSeed = new Centroid(datapoints.get(seedIndex));
      nextSeed.setIndex(clusterIndex++);
      centroids.add(nextSeed);
      // Don't select this one again.
      seedSelector.delete(seedIndex);
      // Re-weight everything according to the minimum distance to a seed.
      for (int currSeedIndex : seedSelector) {
        WeightedVector curr = datapoints.get(currSeedIndex);
        double newWeight = nextSeed.getWeight() * distanceMeasure.distance(nextSeed, curr);
        if (newWeight < seedSelector.getWeight(currSeedIndex)) {
          seedSelector.set(currSeedIndex, newWeight);
        }
      }
    }

View Full Code Here

0 1 2

TOP

Related Classes of org.apache.mahout.math.Centroid

org.apache.mahout.clustering.ClusteringUtils

org.apache.mahout.clustering.streaming.cluster.BallKMeans

org.apache.mahout.clustering.streaming.cluster.DataUtils

org.apache.mahout.clustering.streaming.cluster.StreamingKMeans

org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable

org.apache.mahout.clustering.streaming.mapreduce.StreamingKMeansDriver

org.apache.mahout.clustering.streaming.mapreduce.StreamingKMeansMapper

org.apache.mahout.clustering.streaming.mapreduce.StreamingKMeansThread

org.apache.mahout.clustering.streaming.mapreduce.StreamingKMeansUtilsMR

org.apache.mahout.clustering.streaming.tools.IOUtils

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.