Package org.apache.mahout.math.neighborhood

Examples of org.apache.mahout.math.neighborhood.BruteSearch
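BruteSearch is the simplest Searcher implementation in Mahout's neighborhood package: it keeps every indexed vector in a list and answers queries by scanning all of them with a pluggable DistanceMeasure. The excerpts below show it used both directly and as the backing searcher for BallKMeans and StreamingKMeans. As orientation, here is a minimal sketch of the API as it appears in these excerpts (assuming only that the relevant Mahout artifacts are on the classpath; each result's weight is its distance to the query):

import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.neighborhood.BruteSearch;
import org.apache.mahout.math.random.WeightedThing;

public class BruteSearchDemo {
  public static void main(String[] args) {
    // Index a few reference vectors; BruteSearch compares queries against all of them.
    BruteSearch searcher = new BruteSearch(new EuclideanDistanceMeasure());
    searcher.add(new DenseVector(new double[] {0, 0}));
    searcher.add(new DenseVector(new double[] {1, 1}));
    searcher.add(new DenseVector(new double[] {5, 5}));

    // search(query, limit) returns the 'limit' nearest vectors, closest first.
    Vector query = new DenseVector(new double[] {0.9, 1.2});
    for (WeightedThing<Vector> hit : searcher.search(query, 2)) {
      System.out.printf("distance %.3f to %s%n", hit.getWeight(), hit.getValue());
    }
  }
}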


  /**
   * Estimates a distance cutoff: the smallest distance between any point in the
   * data and its nearest distinct neighbor.
   * @param data the points to scan
   * @param distanceMeasure the distance measure used to compute the distance between two points
   * @return the minimum nearest-neighbor distance over all points in {@code data}
   * @see org.apache.mahout.clustering.streaming.cluster.StreamingKMeans#clusterInternal(Iterable, boolean)
   */
  public static double estimateDistanceCutoff(List<? extends Vector> data, DistanceMeasure distanceMeasure) {
    BruteSearch searcher = new BruteSearch(distanceMeasure);
    searcher.addAll(data);
    double minDistance = Double.POSITIVE_INFINITY;
    for (Vector vector : data) {
      // Find the closest point other than the vector itself (the 'true' flag skips
      // results identical to the query); the weight of the result is the distance.
      double closest = searcher.searchFirst(vector, true).getWeight();
      if (minDistance > 0 && closest < minDistance) {
        minDistance = closest;
      }
      searcher.add(vector);
    }
    return minDistance;
  }
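A hedged usage sketch for the helper above. In the Mahout tree this method lives in ClusteringUtils (assumed import below); the returned value is the kind of cutoff StreamingKMeans takes as its initial distanceCutoff:

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

import org.apache.mahout.clustering.ClusteringUtils;  // assumption: the helper's home class
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public class DistanceCutoffDemo {
  public static void main(String[] args) {
    // A small Gaussian blob; the estimated cutoff is the smallest
    // nearest-neighbor distance within it.
    Random random = new Random(42);
    List<Vector> sample = new ArrayList<Vector>();
    for (int i = 0; i < 100; i++) {
      sample.add(new DenseVector(new double[] {random.nextGaussian(), random.nextGaussian()}));
    }
    double cutoff = ClusteringUtils.estimateDistanceCutoff(sample, new EuclideanDistanceMeasure());
    System.out.printf("estimated cutoff: %f%n", cutoff);
  }
}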


  /**
   * Computes a confusion-like matrix between two clusterings: entry (i, j)
   * accumulates the weight of the datapoints closest to row centroid i and
   * column centroid j.
   * @param rowCentroids the centroids indexing the rows
   * @param columnCentroids the centroids indexing the columns
   * @param datapoints the points to assign to both clusterings
   * @param distanceMeasure distance measure to use
   * @return the confusion matrix
   */
  public static Matrix getConfusionMatrix(List<? extends Vector> rowCentroids, List<? extends Vector> columnCentroids,
                                          Iterable<? extends Vector> datapoints, DistanceMeasure distanceMeasure) {
    Searcher rowSearcher = new BruteSearch(distanceMeasure);
    rowSearcher.addAll(rowCentroids);
    Searcher columnSearcher = new BruteSearch(distanceMeasure);
    columnSearcher.addAll(columnCentroids);

    int numRows = rowCentroids.size();
    int numCols = columnCentroids.size();
    Matrix confusionMatrix = new DenseMatrix(numRows, numCols);

    for (Vector vector : datapoints) {
      WeightedThing<Vector> closestRowCentroid = rowSearcher.search(vector, 1).get(0);
      WeightedThing<Vector> closestColumnCentroid = columnSearcher.search(vector, 1).get(0);
      int row = ((Centroid) closestRowCentroid.getValue()).getIndex();
      int column = ((Centroid) closestColumnCentroid.getValue()).getIndex();
      double vectorWeight;
      if (vector instanceof WeightedVector) {
        vectorWeight = ((WeightedVector) vector).getWeight();
      } else {
        // Tail reconstructed from context: unweighted points count as 1.
        vectorWeight = 1;
      }
      confusionMatrix.set(row, column, confusionMatrix.get(row, column) + vectorWeight);
    }
    return confusionMatrix;
  }
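A sketch of calling this helper (again assuming ClusteringUtils as its home class). The method casts each nearest neighbor to Centroid to read its index, so both centroid lists must hold Centroid instances; the (key, vector, weight) constructor arguments below follow Mahout's Centroid API:

import java.util.ArrayList;
import java.util.List;

import org.apache.mahout.clustering.ClusteringUtils;  // assumption: the helper's home class
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.Centroid;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.Vector;

public class ConfusionMatrixDemo {
  public static void main(String[] args) {
    // Two alternative clusterings of the same 1-D data, as indexed Centroids.
    List<Centroid> rows = new ArrayList<Centroid>();
    rows.add(new Centroid(0, new DenseVector(new double[] {0}), 1));
    rows.add(new Centroid(1, new DenseVector(new double[] {10}), 1));
    List<Centroid> columns = new ArrayList<Centroid>();
    columns.add(new Centroid(0, new DenseVector(new double[] {1}), 1));
    columns.add(new Centroid(1, new DenseVector(new double[] {9}), 1));

    List<Vector> datapoints = new ArrayList<Vector>();
    for (double x : new double[] {0.1, 0.2, 9.8, 10.3}) {
      datapoints.add(new DenseVector(new double[] {x}));
    }

    // A near-diagonal matrix means the two clusterings largely agree.
    Matrix confusion = ClusteringUtils.getConfusionMatrix(
        rows, columns, datapoints, new EuclideanDistanceMeasure());
    System.out.println(confusion);
  }
}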

    // (The opening of this test is truncated in the original excerpt; the line below
    // reads the searcher class configured on the MRUnit map driver.)
    String searcherClass =
        mapDriver.getConfiguration().get(StreamingKMeansDriver.SEARCHER_CLASS_OPTION);
    for (Centroid datapoint : syntheticData.getFirst()) {
      mapDriver.addInput(new IntWritable(0), new VectorWritable(datapoint));
    }
    List<org.apache.hadoop.mrunit.types.Pair<IntWritable,CentroidWritable>> results = mapDriver.run();
    BruteSearch resultSearcher = new BruteSearch(new SquaredEuclideanDistanceMeasure());
    for (org.apache.hadoop.mrunit.types.Pair<IntWritable, CentroidWritable> result : results) {
      resultSearcher.add(result.getSecond().getCentroid());
    }
    System.out.printf("Clustered the data into %d clusters\n", results.size());
    for (Vector mean : syntheticData.getSecond()) {
      WeightedThing<Vector> closest = resultSearcher.search(mean, 1).get(0);
      assertTrue("Weight " + closest.getWeight() + " not less than 0.5", closest.getWeight() < 0.5);
    }
  }
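Stripped of the MRUnit scaffolding, the verification pattern above is reusable: index the computed centroids in a BruteSearch and check that every true cluster mean has a centroid nearby. A self-contained sketch of that pattern (the class and method names here are illustrative; the 0.5 threshold mirrors the assertion in the test):

import java.util.List;

import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.neighborhood.BruteSearch;
import org.apache.mahout.math.random.WeightedThing;

public class ClusterCoverageCheck {
  /** Returns true if every true mean has a computed centroid within maxDistance. */
  public static boolean allMeansCovered(Iterable<? extends Vector> computedCentroids,
                                        List<? extends Vector> trueMeans, double maxDistance) {
    BruteSearch searcher = new BruteSearch(new SquaredEuclideanDistanceMeasure());
    searcher.addAll(computedCentroids);
    for (Vector mean : trueMeans) {
      // The single nearest computed centroid; its weight is the distance.
      WeightedThing<Vector> closest = searcher.search(mean, 1).get(0);
      if (closest.getWeight() >= maxDistance) {
        return false;
      }
    }
    return true;
  }
}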

  private static final int K1 = 100;

  @Test
  public void testClusteringMultipleRuns() {
    // i is the number of random restarts; the fourth constructor argument chooses
    // k-means++ (true) vs. uniform random (false) seeding.
    for (int i = 1; i <= 10; ++i) {
      BallKMeans clusterer = new BallKMeans(new BruteSearch(new SquaredEuclideanDistanceMeasure()),
          1 << NUM_DIMENSIONS, NUM_ITERATIONS, true, i);
      clusterer.cluster(syntheticData.getFirst());
      double costKMeansPlusPlus = ClusteringUtils.totalClusterCost(syntheticData.getFirst(), clusterer);

      clusterer = new BallKMeans(new BruteSearch(new SquaredEuclideanDistanceMeasure()),
          1 << NUM_DIMENSIONS, NUM_ITERATIONS, false, i);
      clusterer.cluster(syntheticData.getFirst());
      double costKMeansRandom = ClusteringUtils.totalClusterCost(syntheticData.getFirst(), clusterer);

      System.out.printf("%d runs; kmeans++: %f; random: %f\n", i, costKMeansPlusPlus, costKMeansRandom);
    }
  }
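The five-argument BallKMeans constructor used in this test selects the seeding strategy and the number of restarts. A hedged sketch with the arguments named (order inferred from the test: searcher, number of clusters, maximum iterations, k-means++ seeding flag, number of runs):

import java.util.List;

import org.apache.mahout.clustering.ClusteringUtils;               // assumption
import org.apache.mahout.clustering.streaming.cluster.BallKMeans;  // assumption: BallKMeans package
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.math.WeightedVector;
import org.apache.mahout.math.neighborhood.BruteSearch;

public class SeedingCostDemo {
  /** Clusters the data and returns the total cost, so seeding strategies can be compared. */
  public static double clusterCost(List<? extends WeightedVector> data, int numClusters,
                                   boolean kMeansPlusPlusInit, int numRuns) {
    BallKMeans clusterer = new BallKMeans(new BruteSearch(new SquaredEuclideanDistanceMeasure()),
        numClusters, 100, kMeansPlusPlusInit, numRuns);
    clusterer.cluster(data);
    // BallKMeans is iterable over its centroids, which is what totalClusterCost expects.
    return ClusteringUtils.totalClusterCost(data, clusterer);
  }
}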

  @Test
  public void testClustering() {
    UpdatableSearcher searcher = new BruteSearch(new SquaredEuclideanDistanceMeasure());
    BallKMeans clusterer = new BallKMeans(searcher, 1 << NUM_DIMENSIONS, NUM_ITERATIONS);

    long startTime = System.currentTimeMillis();
    clusterer.cluster(syntheticData.getFirst());
    long endTime = System.currentTimeMillis();

    assertEquals("Total weight not preserved", totalWeight(syntheticData.getFirst()), totalWeight(clusterer), 1.0e-9);

    // Verify that each corner of the cube has a centroid very nearby.
    // This is probably FALSE for large-dimensional spaces!
    OnlineSummarizer summarizer = new OnlineSummarizer();
    for (Vector mean : syntheticData.getSecond()) {
      WeightedThing<Vector> v = searcher.search(mean, 1).get(0);
      summarizer.add(v.getWeight());
    }
    assertTrue(String.format("Median weight [%f] too large [>%f]", summarizer.getMedian(),
        DISTRIBUTION_RADIUS), summarizer.getMedian() < DISTRIBUTION_RADIUS);

    double clusterTime = (endTime - startTime) / 1000.0;
    System.out.printf("%s\n%.2f for clustering\n%.1f us per row\n\n",
        searcher.getClass().getName(), clusterTime,
        clusterTime / syntheticData.getFirst().size() * 1.0e6);

    // Verify that the total weight of the centroids near each corner is correct.
    double[] cornerWeights = new double[1 << NUM_DIMENSIONS];
    Searcher trueFinder = new BruteSearch(new EuclideanDistanceMeasure());
    for (Vector trueCluster : syntheticData.getSecond()) {
      trueFinder.add(trueCluster);
    }
    for (Centroid centroid : clusterer) {
      WeightedThing<Vector> closest = trueFinder.search(centroid, 1).get(0);
      cornerWeights[((Centroid)closest.getValue()).getIndex()] += centroid.getWeight();
    }
    int expectedNumPoints = NUM_DATA_POINTS / (1 << NUM_DIMENSIONS);
    for (double v : cornerWeights) {
      System.out.printf("%f ", v);
    }
    System.out.println();
    // Tail reconstructed from context: each corner should have collected the
    // expected number of unit-weight points.
    for (double v : cornerWeights) {
      assertEquals(expectedNumPoints, v, 0);
    }
  }

  @Test
  public void testInitialization() {
    // Start with super clusterable data.
    List<? extends WeightedVector> data = cubishTestData(0.01);

    // Just do initialization of ball k-means. This should drop a point into each of the clusters.
    BallKMeans r = new BallKMeans(new BruteSearch(new SquaredEuclideanDistanceMeasure()), 6, 20);
    r.cluster(data);

    // Put the centroids into a matrix.
    Matrix x = new DenseMatrix(6, 5);
    int row = 0;
    // (The excerpt ends here; the full test copies each centroid into a row of x
    // and verifies that initialization found all six clusters.)
  }
