Package de.jungblut.clustering

Source Code of de.jungblut.clustering.OnePassExclusiveClustering

package de.jungblut.clustering;

import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;

import de.jungblut.datastructure.KDTree;
import de.jungblut.datastructure.KDTree.VectorDistanceTuple;
import de.jungblut.distance.EuclidianDistance;
import de.jungblut.math.DoubleVector;

/**
* A one pass exclusive clustering algorithm. As the name suggests, the
* clustering algorithm will iterate once over a constructed kd-tree and find
* nearest neighbours inside a distance threshold. The found neighbours are
* going to be put into a bitset and will be omitted from search in the
* following kd-tree searches. Found clusters are checked against a minimum size
* and maybe discarded when not reaching the configured threshold. This is
* considered a very fast algorithm, it can be used instead of
* {@link CanopyClustering}.
*
* @author thomas.jungblut
*
*/
public final class OnePassExclusiveClustering {

  private int minSize;
  private int k;
  private double t1;
  private boolean mergeOverlaps;

  /**
   * Constructs a one pass clustering algorithm. With unlimited maximum number
   * of neighbours retrieved and a minimum cluster size of 2.
   *
   * @param t1 the maximum distance of neighbourhood.
   */
  public OnePassExclusiveClustering(double t1) {
    this(t1, Integer.MAX_VALUE, 2, false);
  }

  /**
   * Constructs a one pass clustering algorithm.
   *
   * @param t1 the maximum distance of neighbourhood.
   * @param k the maximum number of neighbours to retrieve inside the t1
   *          threshold.
   * @param minSize the minimum size of a cluster.
   * @param mergeOverlaps if true, overlapping found centers by t1 distance will
   *          be merged.
   */
  public OnePassExclusiveClustering(double t1, int k, int minSize,
      boolean mergeOverlaps) {
    this.t1 = t1;
    this.k = k;
    this.minSize = minSize;
  }

  /**
   * Cluster the given items.
   *
   * @param values the vectors to cluster.
   * @param verbose if true, outputs progress to STDOUT.
   * @return a list of centers that describe the given vectors.
   */
  public List<DoubleVector> cluster(List<DoubleVector> values, boolean verbose) {
    ArrayList<DoubleVector> centers = new ArrayList<>();
    KDTree<Integer> tree = new KDTree<>();
    int index = 0;
    for (DoubleVector value : values) {
      tree.add(value, index++);
    }
    tree.balanceBySort();

    BitSet set = new BitSet(values.size());
    int items = 0;
    for (int i = 0; i < values.size(); i++) {
      if (!set.get(i)) {
        DoubleVector v = values.get(i);
        DoubleVector center = v.deepCopy();
        List<VectorDistanceTuple<Integer>> nns = tree.getNearestNeighbours(v,
            k, t1);
        int sum = 1;
        for (VectorDistanceTuple<Integer> nn : nns) {
          if (nn.getDistance() < t1 && !set.get(nn.getValue())) {
            sum++;
            set.set(nn.getValue());
            center = center.add(nn.getVector());
          }
        }
        // ignore clusters violating the threshold.
        if (sum >= minSize) {
          DoubleVector newCenter = center.divide(sum);
          if (mergeOverlaps) {
            boolean noOverlap = true;
            // merge overlapping clusters within our t1
            for (int x = 0; x < centers.size(); x++) {
              DoubleVector ref = centers.get(x);
              double dist = EuclidianDistance.get().measureDistance(ref,
                  newCenter);
              if (dist < t1) {
                // average both centers
                centers.set(x, ref.add(newCenter).divide(2d));
                noOverlap = false;
                break;
              }
            }
            if (noOverlap) {
              centers.add(newCenter);
            }
          } else {
            centers.add(newCenter);
          }
        }
        set.set(i);
      }
      items++;
      if (verbose && items % 1000 == 0) {
        String progressString = NumberFormat.getPercentInstance().format(
            items / (double) tree.size());
        System.out.format("Processed %d/%d = %s. Centers found: %d.\n", items,
            tree.size(), progressString, centers.size());
      }
    }
    return centers;
  }
}
TOP

Related Classes of de.jungblut.clustering.OnePassExclusiveClustering

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.