Package org.apache.mahout.math.neighborhood

Examples of org.apache.mahout.math.neighborhood.UpdatableSearcher


    this.conf = conf;
  }

  @Override
  public Iterable<Centroid> call() {
    UpdatableSearcher searcher = StreamingKMeansUtilsMR.searcherFromConfiguration(conf);
    int numClusters = conf.getInt(StreamingKMeansDriver.ESTIMATED_NUM_MAP_CLUSTERS, 1);
    double estimateDistanceCutoff = conf.getFloat(StreamingKMeansDriver.ESTIMATED_DISTANCE_CUTOFF,
        StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF);

    Iterator<Centroid> datapointsIterator = datapoints.iterator();
    if (estimateDistanceCutoff == StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF) {
      List<Centroid> estimatePoints = Lists.newArrayListWithExpectedSize(NUM_ESTIMATE_POINTS);
      while (datapointsIterator.hasNext() && estimatePoints.size() < NUM_ESTIMATE_POINTS) {
        estimatePoints.add(datapointsIterator.next());
      }
      estimateDistanceCutoff = ClusteringUtils.estimateDistanceCutoff(estimatePoints, searcher.getDistanceMeasure());
    }

    StreamingKMeans clusterer = new StreamingKMeans(searcher, numClusters, estimateDistanceCutoff);
    while (datapointsIterator.hasNext()) {
      clusterer.cluster(datapointsIterator.next());
View Full Code Here


  @Override
  public void setup(Context context) {
    // At this point the configuration received from the Driver is assumed to be valid.
    // No other checks are made.
    Configuration conf = context.getConfiguration();
    UpdatableSearcher searcher = StreamingKMeansUtilsMR.searcherFromConfiguration(conf);
    int numClusters = conf.getInt(StreamingKMeansDriver.ESTIMATED_NUM_MAP_CLUSTERS, 1);
    double estimatedDistanceCutoff = conf.getFloat(StreamingKMeansDriver.ESTIMATED_DISTANCE_CUTOFF,
        StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF);
    if (estimatedDistanceCutoff == StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF) {
      estimateDistanceCutoff = true;
View Full Code Here

   * index is i.
   */
  public static List<OnlineSummarizer> summarizeClusterDistances(Iterable<? extends Vector> datapoints,
                                                                 Iterable<? extends Vector> centroids,
                                                                 DistanceMeasure distanceMeasure) {
    UpdatableSearcher searcher = new ProjectionSearch(distanceMeasure, 3, 1);
    searcher.addAll(centroids);
    List<OnlineSummarizer> summarizers = Lists.newArrayList();
    if (searcher.size() == 0) {
      return summarizers;
    }
    for (int i = 0; i < searcher.size(); ++i) {
      summarizers.add(new OnlineSummarizer());
    }
    for (Vector v : datapoints) {
      Centroid closest = (Centroid)searcher.search(v,  1).get(0).getValue();
      OnlineSummarizer summarizer = summarizers.get(closest.getIndex());
      summarizer.add(distanceMeasure.distance(v, closest));
    }
    return summarizers;
  }
View Full Code Here

   * @param centroids iterable of Centroids.
   * @return the total cost described above.
   */
  public static double totalClusterCost(Iterable<? extends Vector> datapoints, Iterable<? extends Vector> centroids) {
    DistanceMeasure distanceMeasure = new EuclideanDistanceMeasure();
    UpdatableSearcher searcher = new ProjectionSearch(distanceMeasure, 3, 1);
    searcher.addAll(centroids);
    return totalClusterCost(datapoints, searcher);
  }
View Full Code Here

    }
  }

  @Test
  public void testClustering() {
    UpdatableSearcher searcher = new BruteSearch(new SquaredEuclideanDistanceMeasure());
    BallKMeans clusterer = new BallKMeans(searcher, 1 << NUM_DIMENSIONS, NUM_ITERATIONS);

    long startTime = System.currentTimeMillis();
    clusterer.cluster(syntheticData.getFirst());
    long endTime = System.currentTimeMillis();

    assertEquals("Total weight not preserved", totalWeight(syntheticData.getFirst()), totalWeight(clusterer), 1.0e-9);

    // Verify that each corner of the cube has a centroid very nearby.
    // This is probably FALSE for large-dimensional spaces!
    OnlineSummarizer summarizer = new OnlineSummarizer();
    for (Vector mean : syntheticData.getSecond()) {
      WeightedThing<Vector> v = searcher.search(mean, 1).get(0);
      summarizer.add(v.getWeight());
    }
    assertTrue(String.format("Median weight [%f] too large [>%f]", summarizer.getMedian(),
        DISTRIBUTION_RADIUS), summarizer.getMedian() < DISTRIBUTION_RADIUS);

    double clusterTime = (endTime - startTime) / 1000.0;
    System.out.printf("%s\n%.2f for clustering\n%.1f us per row\n\n",
        searcher.getClass().getName(), clusterTime,
        clusterTime / syntheticData.getFirst().size() * 1.0e6);

    // Verify that the total weight of the centroids near each corner is correct.
    double[] cornerWeights = new double[1 << NUM_DIMENSIONS];
    Searcher trueFinder = new BruteSearch(new EuclideanDistanceMeasure());
View Full Code Here

    this.conf = conf;
  }

  @Override
  public Iterable<Centroid> call() {
    UpdatableSearcher searcher = StreamingKMeansUtilsMR.searcherFromConfiguration(conf);
    int numClusters = conf.getInt(StreamingKMeansDriver.ESTIMATED_NUM_MAP_CLUSTERS, 1);
    double estimateDistanceCutoff = conf.getFloat(StreamingKMeansDriver.ESTIMATED_DISTANCE_CUTOFF,
        StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF);

    Iterator<Centroid> dataPointsIterator = dataPoints.iterator();

    if (estimateDistanceCutoff == StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF) {
      List<Centroid> estimatePoints = Lists.newArrayListWithExpectedSize(NUM_ESTIMATE_POINTS);
      while (dataPointsIterator.hasNext() && estimatePoints.size() < NUM_ESTIMATE_POINTS) {
        Centroid centroid = dataPointsIterator.next();
        estimatePoints.add(centroid);
      }

      if (log.isInfoEnabled()) {
        log.info("Estimated Points: {}", estimatePoints.size());
      }
      estimateDistanceCutoff = ClusteringUtils.estimateDistanceCutoff(estimatePoints, searcher.getDistanceMeasure());
    }

    StreamingKMeans streamingKMeans = new StreamingKMeans(searcher, numClusters, estimateDistanceCutoff);

    // datapointsIterator could be empty if no estimate distance was initially provided
View Full Code Here

  @Override
  public void setup(Context context) {
    // At this point the configuration received from the Driver is assumed to be valid.
    // No other checks are made.
    Configuration conf = context.getConfiguration();
    UpdatableSearcher searcher = StreamingKMeansUtilsMR.searcherFromConfiguration(conf);
    int numClusters = conf.getInt(StreamingKMeansDriver.ESTIMATED_NUM_MAP_CLUSTERS, 1);
    double estimatedDistanceCutoff = conf.getFloat(StreamingKMeansDriver.ESTIMATED_DISTANCE_CUTOFF,
        StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF);
    if (estimatedDistanceCutoff == StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF) {
      estimateDistanceCutoff = true;
View Full Code Here

   * index is i.
   */
  public static List<OnlineSummarizer> summarizeClusterDistances(Iterable<? extends Vector> datapoints,
                                                                 Iterable<? extends Vector> centroids,
                                                                 DistanceMeasure distanceMeasure) {
    UpdatableSearcher searcher = new ProjectionSearch(distanceMeasure, 3, 1);
    searcher.addAll(centroids);
    List<OnlineSummarizer> summarizers = Lists.newArrayList();
    if (searcher.size() == 0) {
      return summarizers;
    }
    for (int i = 0; i < searcher.size(); ++i) {
      summarizers.add(new OnlineSummarizer());
    }
    for (Vector v : datapoints) {
      Centroid closest = (Centroid)searcher.search(v,  1).get(0).getValue();
      OnlineSummarizer summarizer = summarizers.get(closest.getIndex());
      summarizer.add(distanceMeasure.distance(v, closest));
    }
    return summarizers;
  }
View Full Code Here

   * @param centroids iterable of Centroids.
   * @return the total cost described above.
   */
  public static double totalClusterCost(Iterable<? extends Vector> datapoints, Iterable<? extends Vector> centroids) {
    DistanceMeasure distanceMeasure = new EuclideanDistanceMeasure();
    UpdatableSearcher searcher = new ProjectionSearch(distanceMeasure, 3, 1);
    searcher.addAll(centroids);
    return totalClusterCost(datapoints, searcher);
  }
View Full Code Here

    }
  }

  @Test
  public void testClustering() {
    UpdatableSearcher searcher = new BruteSearch(new SquaredEuclideanDistanceMeasure());
    BallKMeans clusterer = new BallKMeans(searcher, 1 << NUM_DIMENSIONS, NUM_ITERATIONS);

    long startTime = System.currentTimeMillis();
    Pair<List<Centroid>, List<Centroid>> data = syntheticData;
    clusterer.cluster(data.getFirst());
    long endTime = System.currentTimeMillis();

    long hash = 0;
    for (Centroid centroid : data.getFirst()) {
      for (Vector.Element element : centroid.all()) {
        hash = 31 * hash + 17 * element.index() + Double.toHexString(element.get()).hashCode();
      }
    }
    System.out.printf("Hash = %08x\n", hash);

    assertEquals("Total weight not preserved", totalWeight(syntheticData.getFirst()), totalWeight(clusterer), 1.0e-9);

    // Verify that each corner of the cube has a centroid very nearby.
    // This is probably FALSE for large-dimensional spaces!
    OnlineSummarizer summarizer = new OnlineSummarizer();
    for (Vector mean : syntheticData.getSecond()) {
      WeightedThing<Vector> v = searcher.search(mean, 1).get(0);
      summarizer.add(v.getWeight());
    }
    assertTrue(String.format("Median weight [%f] too large [>%f]", summarizer.getMedian(),
        DISTRIBUTION_RADIUS), summarizer.getMedian() < DISTRIBUTION_RADIUS);

    double clusterTime = (endTime - startTime) / 1000.0;
    System.out.printf("%s\n%.2f for clustering\n%.1f us per row\n\n",
        searcher.getClass().getName(), clusterTime,
        clusterTime / syntheticData.getFirst().size() * 1.0e6);

    // Verify that the total weight of the centroids near each corner is correct.
    double[] cornerWeights = new double[1 << NUM_DIMENSIONS];
    Searcher trueFinder = new BruteSearch(new EuclideanDistanceMeasure());
View Full Code Here

TOP

Related Classes of org.apache.mahout.math.neighborhood.UpdatableSearcher

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.