Examples of org.apache.mahout.math.Centroid

org.apache.mahout.math.Centroid
A centroid is a weighted vector. We have it delegate to the vector itself for lots of operations to make it easy to use vector search classes and such.

      // Copies the current cluster centroids to newClusters and sets their weights to 0. This is
      // so we calculate the new centroids as we go through the datapoints.
      List<Centroid> newCentroids = Lists.newArrayList();
      for (Vector centroid : centroids) {
        // need a deep copy because we will mutate these values
        Centroid newCentroid = (Centroid)centroid.clone();
        newCentroid.setWeight(0);
        newCentroids.add(newCentroid);
      }


      // Pass over the datapoints computing new centroids.
      for (int j = 0; j < datapoints.size(); ++j) {
        WeightedVector datapoint = datapoints.get(j);
        // Get the closest cluster this point belongs to.
        WeightedThing<Vector> closestPair = centroids.searchFirst(datapoint, false);
        int closestIndex = ((WeightedVector) closestPair.getValue()).getIndex();
        double closestDistance = closestPair.getWeight();
        // Update its cluster assignment if necessary.
        if (closestIndex != clusterAssignments.get(j)) {
          changed = true;
          clusterAssignments.set(j, closestIndex);
        }
        // Only update if the datapoints point is near enough. What this means is that the weight
        // of outliers is NOT taken into account and the final weights of the centroids will
        // reflect this (it will be less or equal to the initial sum of the weights).
        if (closestDistance < trimFraction * closestClusterDistances.get(closestIndex)) {
          newCentroids.get(closestIndex).update(datapoint);
        }
      }
      // Add the new centers back into searcher.
      centroids.clear();
      centroids.addAll(newCentroids);
    }


    if (correctWeights) {
      for (Vector v : centroids) {
        ((Centroid)v).setWeight(0);
      }
      for (WeightedVector datapoint : datapoints) {
        Centroid closestCentroid = (Centroid) centroids.searchFirst(datapoint, false).getValue();
        closestCentroid.setWeight(closestCentroid.getWeight() + datapoint.getWeight());
      }
    }
  }

View Full Code Here

    }


    // To cluster, we scan the data and either add each point to the nearest group or create a new group.
    // when we get too many groups, we need to increase the threshold and rescan our current groups
    while (datapointsIterator.hasNext()) {
      Centroid row = datapointsIterator.next();
      // Get the closest vector and its weight as a WeightedThing<Vector>.
      // The weight of the WeightedThing is the distance to the query and the value is a
      // reference to one of the vectors we added to the searcher previously.
      WeightedThing<Vector> closestPair = centroids.searchFirst(row, false);


      // We get a uniformly distributed random number between 0 and 1 and compare it with the
      // distance to the closest cluster divided by the distanceCutoff.
      // This is so that if the closest cluster is further than distanceCutoff,
      // closestPair.getWeight() / distanceCutoff > 1 which will trigger the creation of a new
      // cluster anyway.
      // However, if the ratio is less than 1, we want to create a new cluster with probability
      // proportional to the distance to the closest cluster.
      double sample = random.nextDouble();
      if (sample < row.getWeight() * closestPair.getWeight() / distanceCutoff) {
        // Add new centroid, note that the vector is copied because we may mutate it later.
        centroids.add(row.clone());
      } else {
        // Merge the new point with the existing centroid. This will update the centroid's actual
        // position.
        // We know that all the points we inserted in the centroids searcher are (or extend)
        // WeightedVector, so the cast will always succeed.
        Centroid centroid = (Centroid) closestPair.getValue();


        // We will update the centroid by removing it from the searcher and reinserting it to
        // ensure consistency.
        if (!centroids.remove(centroid, Constants.EPSILON)) {
          throw new RuntimeException("Unable to remove centroid");
        }
        centroid.update(row);
        centroids.add(centroid);


      }
      ++numProcessedDatapoints;

View Full Code Here

    }
    for (int i = 0; i < searcher.size(); ++i) {
      summarizers.add(new OnlineSummarizer());
    }
    for (Vector v : datapoints) {
      Centroid closest = (Centroid)searcher.search(v,  1).get(0).getValue();
      OnlineSummarizer summarizer = summarizers.get(closest.getIndex());
      summarizer.add(distanceMeasure.distance(v, closest));
    }
    return summarizers;
  }

View Full Code Here

      int pow2J = 1 << (numDimensions - 1);
      for (int j = 0; j < numDimensions; ++j) {
        v.set(j, 1.0 / pow2J * (i & pow2J));
        pow2J >>= 1;
      }
      mean.add(new Centroid(i, v, 1));
      rowSamplers.add(new MultiNormal(distributionRadius, v));
    }


    // Sample the requested number of data points.
    List<Centroid> data = Lists.newArrayListWithCapacity(numDatapoints);
    for (int i = 0; i < numDatapoints; ++i) {
      data.add(new Centroid(i, rowSamplers.get(i % pow2N).sample(), 1));
    }
    return new Pair<List<Centroid>, List<Centroid>>(data, mean);
  }

View Full Code Here

    return Iterables.transform(dirIterable, new Function<ClusterWritable, Centroid>() {
      int numClusters = 0;
      @Override
      public Centroid apply(ClusterWritable input) {
        Preconditions.checkNotNull(input);
        return new Centroid(numClusters++, input.getValue().getCenter().clone(),
            input.getValue().getTotalObservations());
      }
    });
  }

View Full Code Here

    Iterator<Centroid> dataPointsIterator = dataPoints.iterator();


    if (estimateDistanceCutoff == StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF) {
      List<Centroid> estimatePoints = Lists.newArrayListWithExpectedSize(NUM_ESTIMATE_POINTS);
      while (dataPointsIterator.hasNext() && estimatePoints.size() < NUM_ESTIMATE_POINTS) {
        Centroid centroid = dataPointsIterator.next();
        estimatePoints.add(centroid);
      }


      if (log.isInfoEnabled()) {
        log.info("Estimated Points: {}", estimatePoints.size());

View Full Code Here

    estimateDistanceCutoff = false;
  }


  @Override
  public void map(Writable key, VectorWritable point, Context context) {
    Centroid centroid = new Centroid(numPoints++, point.get(), 1);
    if (estimateDistanceCutoff) {
      if (numPoints < NUM_ESTIMATE_POINTS) {
        estimatePoints.add(centroid);
      } else if (numPoints == NUM_ESTIMATE_POINTS) {
        clusterEstimatePoints();

View Full Code Here

    SequenceFile.Writer writer = SequenceFile.createWriter(FileSystem.get(conf), conf, new Path(output, "part-r-00000"), IntWritable.class,
        CentroidWritable.class);
    int numCentroids = 0;
    // Run BallKMeans on the intermediate centroids.
    for (Vector finalVector : StreamingKMeansReducer.getBestCentroids(intermediateCentroids, conf)) {
      Centroid finalCentroid = (Centroid)finalVector;
      writer.append(new IntWritable(numCentroids++), new CentroidWritable(finalCentroid));
    }
    writer.close();
    long end = System.currentTimeMillis();
    log.info("Finished BallKMeans. Took {}.", (end - start) / 1000.0);

View Full Code Here

    return Iterables.transform(dirIterable, new Function<ClusterWritable, Centroid>() {
      int numClusters = 0;
      @Override
      public Centroid apply(ClusterWritable input) {
        Preconditions.checkNotNull(input);
        return new Centroid(numClusters++, input.getValue().getCenter().clone(),
            input.getValue().getTotalObservations());
      }
    });
  }

View Full Code Here

    }
    for (int i = 0; i < searcher.size(); ++i) {
      summarizers.add(new OnlineSummarizer());
    }
    for (Vector v : datapoints) {
      Centroid closest = (Centroid)searcher.search(v,  1).get(0).getValue();
      OnlineSummarizer summarizer = summarizers.get(closest.getIndex());
      summarizer.add(distanceMeasure.distance(v, closest));
    }
    return summarizers;
  }

View Full Code Here

0 1 2

TOP

Related Classes of org.apache.mahout.math.Centroid

org.apache.mahout.clustering.ClusteringUtils

org.apache.mahout.clustering.streaming.cluster.BallKMeans

org.apache.mahout.clustering.streaming.cluster.DataUtils

org.apache.mahout.clustering.streaming.cluster.StreamingKMeans

org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable

org.apache.mahout.clustering.streaming.mapreduce.StreamingKMeansDriver

org.apache.mahout.clustering.streaming.mapreduce.StreamingKMeansMapper

org.apache.mahout.clustering.streaming.mapreduce.StreamingKMeansThread

org.apache.mahout.clustering.streaming.mapreduce.StreamingKMeansUtilsMR

org.apache.mahout.clustering.streaming.tools.IOUtils

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.