Package org.apache.mahout.clustering.canopy

Source Code of org.apache.mahout.clustering.canopy.Canopy

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.mahout.clustering.canopy;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.matrix.AbstractVector;
import org.apache.mahout.matrix.SparseVector;
import org.apache.mahout.matrix.Vector;
import org.apache.mahout.utils.DistanceMeasure;

import java.io.IOException;
import java.util.List;

/**
* This class models a canopy as a center point, the number of points that are
* contained within it according to the application of some distance metric, and
* a point total which is the sum of all the points and is used to compute the
* centroid when needed.
*/
public class Canopy {

  // keys used by Driver, Mapper, Combiner & Reducer
  public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.canopy.measure";

  public static final String T1_KEY = "org.apache.mahout.clustering.canopy.t1";

  public static final String T2_KEY = "org.apache.mahout.clustering.canopy.t2";

  public static final String CANOPY_PATH_KEY = "org.apache.mahout.clustering.canopy.path";

  // the next canopyId to be allocated
  private static int nextCanopyId = 0;

  // the T1 distance threshold
  private static double t1;

  // the T2 distance threshold
  private static double t2;

  // the distance measure
  private static DistanceMeasure measure;

  // this canopy's canopyId
  private final int canopyId;

  // the current center
  private Vector center = new SparseVector(0);

  // the number of points in the canopy
  private int numPoints = 0;

  // the total of all points added to the canopy
  private Vector pointTotal = null;

  /**
   * Create a new Canopy containing the given point
   *
   * @param point
   *            a point in vector space
   */
  public Canopy(Vector point) {
    this.canopyId = nextCanopyId++;
    this.center = point;
    this.pointTotal = point.copy();
    this.numPoints = 1;
  }

  /**
   * Create a new Canopy containing the given point and canopyId
   *
   * @param point
   *            a point in vector space
   * @param canopyId
   *            an int identifying the canopy local to this process only
   */
  public Canopy(Vector point, int canopyId) {
    this.canopyId = canopyId;
    this.center = point;
    this.pointTotal = point.copy();
    this.numPoints = 1;
  }

  /**
   * Configure the Canopy and its distance measure
   *
   * @param job
   *            the JobConf for this job
   */
  public static void configure(JobConf job) {
    try {
      ClassLoader ccl = Thread.currentThread().getContextClassLoader();
      Class<?> cl = ccl.loadClass(job.get(DISTANCE_MEASURE_KEY));
      measure = (DistanceMeasure) cl.newInstance();
      measure.configure(job);
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    } catch (IllegalAccessException e) {
      throw new RuntimeException(e);
    } catch (InstantiationException e) {
      throw new RuntimeException(e);
    }
    nextCanopyId = 0;
    t1 = Double.parseDouble(job.get(T1_KEY));
    t2 = Double.parseDouble(job.get(T2_KEY));
  }

  /**
   * Configure the Canopy for unit tests
   *
   * @param aMeasure
   * @param aT1
   * @param aT2
   */
  public static void config(DistanceMeasure aMeasure, double aT1, double aT2) {
    nextCanopyId = 0;
    measure = aMeasure;
    t1 = aT1;
    t2 = aT2;
  }

  /**
   * This is the same algorithm as the reference but inverted to iterate over
   * existing canopies instead of the points. Because of this it does not need
   * to actually store the points, instead storing a total points vector and the
   * number of points. From this a centroid can be computed. <p/> This method is
   * used by the CanopyReducer.
   *
   * @param point
   *            the point to be added
   * @param canopies
   *            the List<Canopy> to be appended
   */
  public static void addPointToCanopies(Vector point, List<Canopy> canopies) {
    boolean pointStronglyBound = false;
    for (Canopy canopy : canopies) {
      double dist = measure.distance(canopy.getCenter(), point);
      if (dist < t1)
        canopy.addPoint(point);
      pointStronglyBound = pointStronglyBound || (dist < t2);
    }
    if (!pointStronglyBound)
      canopies.add(new Canopy(point));
  }

  /**
   * This method is used by the CanopyMapper to perform canopy inclusion tests
   * and to emit the point and its covering canopies to the output. The
   * CanopyCombiner will then sum the canopy points and produce the centroids.
   *
   * @param point
   *            the point to be added
   * @param canopies
   *            the List<Canopy> to be appended
   * @param collector
   *            an OutputCollector in which to emit the point
   */
  public static void emitPointToNewCanopies(Vector point,
      List<Canopy> canopies, OutputCollector<Text, Text> collector)
      throws IOException {
    boolean pointStronglyBound = false;
    for (Canopy canopy : canopies) {
      double dist = measure.distance(canopy.getCenter(), point);
      if (dist < t1)
        canopy.emitPoint(point, collector);
      pointStronglyBound = pointStronglyBound || (dist < t2);
    }
    if (!pointStronglyBound) {
      Canopy canopy = new Canopy(point);
      canopies.add(canopy);
      canopy.emitPoint(point, collector);
    }
  }

  /**
   * This method is used by the CanopyMapper to perform canopy inclusion tests
   * and to emit the point keyed by its covering canopies to the output. if the
   * point is not covered by any canopies (due to canopy centroid clustering),
   * emit the point to the closest covering canopy.
   *
   * @param point
   *            the point to be added
   * @param canopies
   *            the List<Canopy> to be appended
   * @param writable
   *            the original Writable from the input, may include arbitrary
   *            payload information after the point [...]<payload>
   * @param collector
   *            an OutputCollector in which to emit the point
   */
  public static void emitPointToExistingCanopies(Vector point,
      List<Canopy> canopies, Text writable,
      OutputCollector<Text, Text> collector) throws IOException {
    double minDist = Double.MAX_VALUE;
    Canopy closest = null;
    boolean isCovered = false;
    for (Canopy canopy : canopies) {
      double dist = measure.distance(canopy.getCenter(), point);
      if (dist < t1) {
        isCovered = true;
        collector.collect(new Text(formatCanopy(canopy)), writable);
      } else if (dist < minDist) {
        minDist = dist;
        closest = canopy;
      }
    }
    // if the point is not contained in any canopies (due to canopy centroid
    // clustering), emit the point to the closest covering canopy.
    if (!isCovered)
      collector.collect(new Text(formatCanopy(closest)), writable);
  }

  /**
   * Format the canopy for output
   *
   * @param canopy
   */
  public static String formatCanopy(Canopy canopy) {
    return "C" + canopy.canopyId + ": "
        + canopy.computeCentroid().asFormatString();
  }

  /**
   * Decodes and returns a Canopy from the formattedString
   *
   * @param formattedString
   *            a String prouced by formatCanopy
   * @return a new Canopy
   */
  public static Canopy decodeCanopy(String formattedString) {
    int beginIndex = formattedString.indexOf('[');
    String id = formattedString.substring(0, beginIndex);
    String centroid = formattedString.substring(beginIndex);
    if (id.charAt(0) == 'C') {
      int canopyId = Integer.parseInt(formattedString.substring(1, beginIndex - 2));
      Vector canopyCentroid = AbstractVector.decodeVector(centroid);
      return new Canopy(canopyCentroid, canopyId);
    }
    return null;
  }

  /**
   * Add a point to the canopy
   *
   * @param point
   *            some point to add
   */
  public void addPoint(Vector point) {
    numPoints++;
    for (int i = 0; i < point.cardinality(); i++)
      pointTotal.set(i, point.get(i) + pointTotal.get(i));
  }

  /**
   * Emit the point to the collector, keyed by the canopy's formatted
   * representation
   *
   * @param point
   *            a point to emit.
   */
  public void emitPoint(Vector point, OutputCollector<Text, Text> collector)
      throws IOException {
    collector.collect(new Text(this.getIdentifier()), new Text(point
        .asFormatString()));
  }

  @Override
  public String toString() {
    return getIdentifier() + " - " + getCenter().asFormatString();
  }

  public String getIdentifier() {
    return "C" + canopyId;
  }

  public int getCanopyId() {
    return canopyId;
  }

  /**
   * Return the center point
   *
   * @return the center of the Canopy
   */
  public Vector getCenter() {
    return center;
  }

  /**
   * Return the number of points in the Canopy
   *
   * @return the number of points in the canopy.
   */
  public int getNumPoints() {
    return numPoints;
  }

  /**
   * Compute the centroid by averaging the pointTotals
   *
   * @return a point which is the new centroid
   */
  public Vector computeCentroid() {
    Vector result = new SparseVector(pointTotal.cardinality());
    for (int i = 0; i < pointTotal.cardinality(); i++)
      result.set(i, pointTotal.get(i) / numPoints);
    return result;
  }

  /**
   * Return if the point is covered by this canopy
   *
   * @param point
   *            a point
   * @return if the point is covered
   */
  public boolean covers(Vector point) {
    return measure.distance(center, point) < t1;
  }
}
TOP

Related Classes of org.apache.mahout.clustering.canopy.Canopy

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.