package org.apache.mahout.clustering.dirichlet;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.List;
import org.apache.mahout.clustering.dirichlet.models.Model;
import org.apache.mahout.clustering.dirichlet.models.ModelDistribution;
import org.apache.mahout.matrix.DenseVector;
import org.apache.mahout.matrix.TimesFunction;
import org.apache.mahout.matrix.Vector;
/**
* Performs Bayesian mixture modeling.
* <p/>
* The idea is that we use a probabilistic mixture of a number of models that we use to explain some observed data.
* The idea here is that each observed data point is assumed to have come from one of the models in the mixture, but
* we don't know which. The way we deal with that is to use a so-called latent parameter which specifies which model
* each data point came from.
* <p/>
* In addition, since this is a Bayesian clustering algorithm, we don't want to actually commit to any single explanation,
* but rather to sample from the distribution of models and latent assignments of data points to models given the
* observed data and the prior distributions of model parameters.
* <p/>
* This sampling process is initialized by taking models at random from the prior distribution for models.
* <p/>
* Then, we iteratively assign points to the different models using the mixture probabilities and the degree
* of fit between the point and each model expressed as a probability that the point was generated by that model.
* <p/>
* After points are assigned, new parameters for each model are sampled from the posterior distribution for the model
* parameters considering all of the observed data points that were assigned to the model. Models without any
* data points are also sampled, but since they have no points assigned, the new samples are effectively taken from
* the prior distribution for model parameters.
* <p/>
* The result is a number of samples that represent mixing probabilities, models and assignment of points to models.
* If the total number of possible models is substantially larger than the number that ever have points assigned to
* them, then this algorithm provides a (nearly) non-parametric clustering algorithm.
* <p/>
* These samples can give us interesting information that is lacking from a normal clustering that consists of a
* single assignment of points to clusters. Firstly, by examining the number of models in each sample that actually
* have any points assigned to them, we can get information about how many models (clusters) the data supports.
* <p/>
* Moreover, by examining how often two points are assigned to the same model, we can get an approximate measure
* of how likely these points are to be explained by the same model. Such soft membership information is difficult
* to come by with conventional clustering methods.
* <p/>
* Finally, we can get an idea of the stability of how the data can be described. Typically, aspects of the data
* with lots of data available wind up with stable descriptions, while at the edges there are phenomena
* that we can't really commit to a solid description of, even though it is still clear that the well-supported
* explanations are insufficient to account for these additional aspects.
* <p/>
* One thing that can be difficult about these samples is that we can't always assign a correlation between the models
* in the different samples. Probably the best way to do this is to look for overlap in the assignments of data
* observations to the different models.
* <p/>
* <pre>
* \theta_i ~ prior()
* \lambda_i ~ Dirichlet(\alpha_0)
* z_j ~ Multinomial( \lambda )
* x_j ~ model(\theta_i)
* </pre>
*/
public class DirichletClusterer<Observation> {

  // the observed data points to be clustered
  private final List<Observation> sampleData;

  // the ModelDistribution used to sample new posterior models each iteration
  private final ModelDistribution<Observation> modelFactory;

  // the evolving state (models + mixture weights) of the clustering process
  private final DirichletState<Observation> state;

  // thinning interval: a model sample is recorded every 'thin' iterations
  private final int thin;

  // number of initial iterations to discard before recording samples
  private final int burnin;

  // the fixed number of candidate models (clusters) maintained per iteration
  private final int numClusters;

  // the recorded model samples; public so callers can read results directly
  public final List<Model<Observation>[]> clusterSamples = new ArrayList<Model<Observation>[]>();

  /**
   * Create a new instance on the sample data with the given additional parameters
   *
   * @param sampleData the observed data to be clustered
   * @param modelFactory the ModelDistribution to use
   * @param alpha_0 the double value for the beta distributions
   * @param numClusters the int number of clusters
   * @param thin the int thinning interval, used to report every n iterations; must be positive
   * @param burnin the int burnin interval, used to suppress early iterations
   */
  public DirichletClusterer(List<Observation> sampleData,
      ModelDistribution<Observation> modelFactory, double alpha_0,
      int numClusters, int thin, int burnin) {
    this.sampleData = sampleData;
    this.modelFactory = modelFactory;
    this.thin = thin;
    this.burnin = burnin;
    this.numClusters = numClusters;
    state = new DirichletState<Observation>(modelFactory, numClusters, alpha_0,
        thin, burnin);
  }

  /**
   * Iterate over the sample data, obtaining cluster samples periodically
   * and returning them.
   *
   * @param numIterations the int number of iterations to perform
   * @return the List of Model<Observation>[] samples recorded after burn-in,
   *         one every 'thin' iterations
   */
  public List<Model<Observation>[]> cluster(int numIterations) {
    for (int iteration = 0; iteration < numIterations; iteration++) {
      iterate(iteration, state);
    }
    return clusterSamples;
  }

  /**
   * Perform one iteration of the clustering process, iterating over the samples to build
   * a new array of models, then updating the state for the next iteration
   *
   * @param iteration the int index of this iteration, used for burn-in and thinning
   * @param state the DirichletState<Observation> of this iteration
   */
  private void iterate(int iteration, DirichletState<Observation> state) {
    // create new posterior models
    Model<Observation>[] newModels = modelFactory.sampleFromPosterior(state
        .getModels());
    // iterate over the samples, assigning each to a model
    for (Observation x : sampleData) {
      // compute normalized vector of probabilities that x is described by each model
      Vector pi = normalizedProbabilities(state, x);
      // then pick one cluster by sampling a Multinomial distribution based upon them
      // see: http://en.wikipedia.org/wiki/Multinomial_distribution
      int k = UncommonDistributions.rMultinom(pi);
      // ask the selected model to observe the datum
      newModels[k].observe(x);
    }
    // periodically add models to the cluster samples after the burn-in period
    // NOTE(review): thin == 0 throws ArithmeticException here — presumably callers always
    // supply a positive thinning interval; confirm at the call sites.
    if ((iteration >= burnin) && (iteration % thin == 0)) {
      clusterSamples.add(newModels);
    }
    // update the state from the new models
    state.update(newModels);
  }

  /**
   * Compute a normalized vector of probabilities that x is described
   * by each model using the mixture and the model pdfs
   *
   * @param state the DirichletState<Observation> of this iteration
   * @param x an Observation
   * @return the Vector of probabilities
   */
  private Vector normalizedProbabilities(DirichletState<Observation> state,
      Observation x) {
    Vector pi = new DenseVector(numClusters);
    double max = 0;
    for (int k = 0; k < numClusters; k++) {
      double p = state.adjustedProbability(x, k);
      pi.set(k, p);
      if (max < p) {
        max = p;
      }
    }
    // normalize the probabilities by the largest observed value; guard against division
    // by zero when every model assigns zero probability to x (the unguarded 1.0/max
    // previously scaled the vector by Infinity, yielding NaN entries)
    if (max > 0) {
      pi.assign(new TimesFunction(), 1.0 / max);
    }
    return pi;
  }
}