Package de.jungblut.clustering

Source Code of de.jungblut.clustering.DBSCAN

package de.jungblut.clustering;

import gnu.trove.iterator.TIntObjectIterator;
import gnu.trove.map.hash.TIntObjectHashMap;
import gnu.trove.set.hash.TIntHashSet;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;

import de.jungblut.datastructure.ArrayUtils;
import de.jungblut.distance.DistanceMeasurer;
import de.jungblut.math.DoubleMatrix;
import de.jungblut.math.DoubleVector;
import de.jungblut.math.dense.DenseDoubleMatrix;

/**
* Sequential version of DBSCAN to evaluate if this algorithm is suitable for
* arbitrary parallelization paradigms that can crunch graphs. <br/>
* <br/>
* PLAN: <br/>
*
* 1. compute distance matrix between the points <br/>
* 2. extract adjacent points via threshold epsilon and minpoints s <br/>
* 3. run connected components (here BFS)<br/>
* 4. PROFIT!
*/
public final class DBSCAN {

  private List<DoubleVector> noise;
  private ArrayList<DoubleVector>[] connectedComponents;

  /**
   * Clusters the points.
   *
   * @param measurer the distance measurer to use.
   * @param minPoints the minimum points in a cluster.
   * @param epsilon the radius of a point to detect other points.
   */
  public ArrayList<DoubleVector>[] cluster(List<DoubleVector> points,
      DistanceMeasurer measurer, int minPoints, double epsilon) {
    // compute the distance matrix
    DoubleMatrix distanceMatrix = generateDistanceMatrix(measurer, points);
    // generate adjacency list
    TIntObjectHashMap<int[]> adjacencyMatrix = generateAdjacencyMatrix(
        distanceMatrix, points, minPoints, epsilon);
    connectedComponents = findConnectedComponents(points, adjacencyMatrix);
    noise = findNoise(points);
    return connectedComponents;
  }

  /**
   * @return the found noise as list of vectors.
   */
  public List<DoubleVector> getNoise() {
    return this.noise;
  }

  /**
   * A distance matrix (NxN) based on n given points and a distance measurer.
   */
  private DoubleMatrix generateDistanceMatrix(DistanceMeasurer measurer,
      List<DoubleVector> pointList) {

    final int n = pointList.size();
    DenseDoubleMatrix matrix = new DenseDoubleMatrix(n, n);

    for (int i = 0; i < n; i++) {
      for (int j = 0; j < n; j++) {
        final double distance = measurer.measureDistance(pointList.get(i),
            pointList.get(j));
        matrix.set(i, j, distance);
      }
    }

    return matrix;
  }

  /**
   * Generates an adjacency matrix from the distance matrix, based on min-points
   * and epsilon (maximum distance between two points). <br/>
   * At this point you can see that never assigned points are possible noise.
   */
  private TIntObjectHashMap<int[]> generateAdjacencyMatrix(
      DoubleMatrix distanceMatrix, List<DoubleVector> points, int minPoints,
      double epsilon) {

    TIntObjectHashMap<int[]> adjacencyList = new TIntObjectHashMap<>();
    for (int col = 0; col < distanceMatrix.getColumnCount(); col++) {
      List<Integer> possibleNeighbours = new ArrayList<>();
      for (int row = 0; row < distanceMatrix.getRowCount(); row++) {
        // don't include the same point
        if (row != col) {
          final double distance = distanceMatrix.get(row, col);
          if (distance < epsilon) {
            possibleNeighbours.add(row);
          }
        }
      }
      // if our range scan found at least minPoints, add them to the adjacency
      // list.
      if (possibleNeighbours.size() >= minPoints) {
        adjacencyList.put(col, ArrayUtils.toPrimitiveArray(possibleNeighbours));
      }
    }

    return adjacencyList;
  }

  /**
   * Returns a mapping between a cluster ID and its associated points.
   */
  private ArrayList<DoubleVector>[] findConnectedComponents(
      List<DoubleVector> points, TIntObjectHashMap<int[]> adjacencyMatrix) {
    TIntObjectHashMap<int[]> connectedComponents = new TIntObjectHashMap<>();
    TIntHashSet globallyVisitedVertices = new TIntHashSet();
    int clusterId = 0;
    // loop over all known points
    final int size = points.size();
    for (int i = 0; i < size; i++) {
      if (!globallyVisitedVertices.contains(i)) {
        globallyVisitedVertices.add(i);
        TIntHashSet set = new TIntHashSet();
        set = bfs(set, i, adjacencyMatrix);
        if (!set.isEmpty()) {
          connectedComponents.put(clusterId++, set.toArray());
          globallyVisitedVertices.addAll(set);
        }
      }
    }
    // translate the adjacents back to the points
    @SuppressWarnings("unchecked")
    ArrayList<DoubleVector>[] array = new ArrayList[connectedComponents.size()];

    TIntObjectIterator<int[]> iterator = connectedComponents.iterator();
    while (iterator.hasNext()) {
      iterator.advance();
      int[] values = iterator.value();
      ArrayList<DoubleVector> list = new ArrayList<>(values.length);
      for (int val : values) {
        list.add(points.get(val));
      }
      array[iterator.key()] = list;
    }

    return array;
  }

  /**
   * Find the noise in the given connected components, by taking a set
   * difference.
   *
   * @return a list of points that are classified as noise.
   */
  private List<DoubleVector> findNoise(List<DoubleVector> points) {
    List<DoubleVector> noise = new ArrayList<>();
    HashSet<DoubleVector> set = new HashSet<>();
    for (List<DoubleVector> component : connectedComponents) {
      set.addAll(component);
    }

    for (DoubleVector point : points) {
      if (!set.contains(point)) {
        noise.add(point);
      }
    }
    return noise;
  }

  /**
   * Simple BFS to find out the connected components.
   */
  private TIntHashSet bfs(TIntHashSet set, int start,
      TIntObjectHashMap<int[]> adjacencyMatrix) {
    final Deque<Integer> vertexDeque = new ArrayDeque<>();
    vertexDeque.add(start);
    while (!vertexDeque.isEmpty()) {
      start = vertexDeque.poll();
      int[] is = adjacencyMatrix.get(start);
      // check for null,because not all points may be included
      if (is != null) {
        set.add(start);
        for (int i : is) {
          if (!set.contains(i)) {
            set.add(i);
            vertexDeque.add(i);
          }
        }
      }
    }
    return set;
  }

}
TOP

Related Classes of de.jungblut.clustering.DBSCAN

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.