package image.processing.algorithms.clustering.kmeans;
/* This file is copyright (c) 2008-2012 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
/* Algorithm modified by Alexander Cooke, University of Manchester
*
*
*
*
*/
import image.processing.ImageManipulation;
import image.processing.ImageSorter;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import image.processing.patterns.cluster.Cluster;
import image.processing.patterns.cluster.DoubleArray;
import image.processing.tools.MemoryLogger;
import image.processing.FolderSearch;
/**
* This is a Hierarchical Clustering with a constant "threshold" that indicate
* the maximal distance between two clusters to group them. The algorithm stops
* when no cluster can be merged.
*
* The distance between two clusters is calculated as the distance between the
* medians of the two clusters.
*
* @author Philippe Fournier-Viger
*/
public class AlgoHierarchicalClustering {
// parameter
private double maxDistance =0; // maximum distance allowed for merging two clusters
// list of clusters
List<Cluster> clusters = null;
// for statistics
private long startTimestamp; // start time of latest execution
private long endTimestamp; // end time of latest execution
private long iterationCount; // number of iterations performed
private int vectorCount; //number of vector being processed
/**
* Default constructor
*/
public AlgoHierarchicalClustering() {
}
/**
* Run the algorithm.
* @param inputFile an input file containing vectors of doubles
* @param maxDistance the maximum distance allowed for merging two clusters
* @return a list of Clusters
* @throws IOException exception if error while reading the file
*/
public List<Cluster> runAlgorithm(String inputFile, double maxDistance) throws NumberFormatException, IOException {
// record start time
startTimestamp = System.currentTimeMillis();
// initialise vector count to 0
vectorCount = 0;
// save the parameter
this.maxDistance = maxDistance;
System.out.println("Max Distance on this run: " + maxDistance);
// create an empty list of clusters
clusters = new ArrayList<Cluster>();
// Read the vectors from the input file
// and add each vector to an individual cluster.
BufferedReader reader = new BufferedReader(new FileReader(inputFile));
System.out.println("FILE READING FROM:" + inputFile);
String line;
// for each line until the end of file
while (((line = reader.readLine()) != null)) {
// if the line is a comment, is empty or is a
// kind of metadata
if (line.isEmpty() == true ||
line.charAt(0) == '#' || line.charAt(0) == '%'
|| line.charAt(0) == '@') {
continue;
}
// split the line by spaces
String[] lineSplited = line.split(" ");
// convert the values to double values and put them in
// a vector of doubles
double [] vector = new double[lineSplited.length];
for (int i=0; i< lineSplited.length; i++) {
double value = Double.parseDouble(lineSplited[i]);
vector[i] = value;
// System.out.println("val");
}
// create a DoubleArray object with the vector
System.out.println("Image Number: " + vectorCount);
System.out.println(line);
DoubleArray theVector = new DoubleArray(vector, vectorCount);
vectorCount++;
// Initiallly we create a cluster for each vector
Cluster cluster = new Cluster(vector.length);
cluster.addVector(theVector);
cluster.setMean(theVector.clone());
clusters.add(cluster);
}
reader.close(); // close the input file
// (2) Loop to combine the two closest clusters into a bigger cluster
// until no clusters can be combined.
boolean changed = false;
do {
// merge the two closest clusters
changed = mergeTheClosestCluster();
// record memory usage
MemoryLogger.getInstance().checkMemory();
} while (changed);
// record end time
endTimestamp = System.currentTimeMillis();
// return the clusters
return clusters;
}
/**
* Merge the two closest clusters in terms of distance.
* @return true if a merge was done, otherwise false.
*/
private boolean mergeTheClosestCluster() {
// These variables will contain the two closest clusters that
// can be merged
Cluster clusterToMerge1 = null;
Cluster clusterToMerge2 = null;
double minClusterDistance = Integer.MAX_VALUE;
// find the two closest clusters with distance > threshold
// by comparing all pairs of clusters i and j
for (int i = 0; i < clusters.size(); i++) {
for (int j = i + 1; j < clusters.size(); j++) {
// calculate the distance between i and j
double distance = euclideanDistance(clusters.get(i).getmean(), clusters.get(j).getmean());
// if the distance is less than the max distance allowed
// and if it is the smallest distance until now
if (distance < minClusterDistance && distance <= maxDistance) {
// record this pair of clusters
minClusterDistance = distance;
clusterToMerge1 = clusters.get(i);
clusterToMerge2 = clusters.get(j);
}
}
}
// if no close clusters were found, return false
if (clusterToMerge1 == null) {
return false;
}
// else, merge the two closest clusters
for(DoubleArray vector : clusterToMerge2.getVectors()){
clusterToMerge1.addVector(vector);
}
// after mergint, we need to recompute the mean of the resulting cluster
clusterToMerge1.recomputeClusterMean();
// we delete the cluster that was merged
clusters.remove(clusterToMerge2);
// increase iteration count for statistics
iterationCount++;
return true;
}
/**
* Calculate the euclidian distance between two vectors of doubles.
* @param vector1 the first vector
* @param vector2 the second vector
* @return the distance
*/
private double euclideanDistance(DoubleArray vector1, DoubleArray vector2) {
double sum =0;
for(int i=0; i< vector1.data.length; i++){
sum += Math.pow(vector1.data[i] - vector2.data[i], 2);
}
return Math.sqrt(sum);
}
/**
* Save the clusters to an output file
* @param output the output file path
* @throws IOException exception if there is some writing error.
*/
public void saveToColFile(String output) throws IOException {
BufferedWriter writer = new BufferedWriter(new FileWriter(output));
int[] tempRow;
ImageSorter.colClusterOrder = new ArrayList<int[]>();
//Interface.colourClusterArea.setText(null);
//Interface.colourClusterArea.append("HIERARCHICAL COLOUR CLUSTERING \n\n");
ImageManipulation.initColResultsImages();
// for each cluster
for(int i=0; i< clusters.size(); i++){
//TESTING
System.out.println("max I: " + clusters.size());
//initilaise tempRow
tempRow = new int[clusters.get(i).getVectors().size()];
// if the cluster is not empty
if(clusters.get(i).getVectors().size() >= 1){
// write the cluster
writer.write("Cluster"+ i + ": " + clusters.get(i).toString());
//Interface.colourClusterArea.append("Cluster " + (i+1) + ":\n");
for(int j = 0; j < clusters.get(i).getVectors().size(); j++)
{
//TESTING
System.out.println("i = " + i);
System.out.println("j = " + j);
System.out.println("max J: " + clusters.get(i).getVectors().size());
System.out.println("Real I " + clusters.get(i).getVectors().get(j).getNum());
System.out.println("Number of images: " + ImageSorter.photoSet.length );
ImageManipulation.updateColResultsImages(clusters.get(i).getVectors().get(j).getNum(), i);
System.out.println("");
//Add current image number to temp array
tempRow[j] = clusters.get(i).getVectors().get(j).getNum();
}
// if not the last cluster, add a line return
if(i < clusters.size()-1){
writer.newLine();
}
}
else{
//Interface.colourClusterArea.append("Cluster " + i + ": EMPTY \n");
}
//Add new vector order for cluster to reference array
ImageSorter.colClusterOrder.add(tempRow);
}
ImageManipulation.finaliseColResultsImages();
// close the file
writer.close();
}
/**
* Save the clusters to an output file
* @param output the output file path
* @throws IOException exception if there is some writing error.
*/
public void saveToFeatFile(String output) throws IOException {
BufferedWriter writer = new BufferedWriter(new FileWriter(output));
int[] tempRow;
ImageSorter.featClusterOrder = new ArrayList<int[]>();
//Interface.featureClusterArea.setText(null);
//Interface.featureClusterArea.append("HIERARCHICAL FEATURE CLUSTERING \n\n");
ImageManipulation.initFeatResultsImages();
// for each cluster
for(int i=0; i< clusters.size(); i++){
//initilaise tempRow
tempRow = new int[clusters.get(i).getVectors().size()];
// if the cluster is not empty
if(clusters.get(i).getVectors().size() >= 1){
// write the cluster
writer.write("Cluster"+ i + ": " + clusters.get(i).toString());
//Interface.featureClusterArea.append("Cluster " + (i+1) + ":\n");
for(int j = 0; j < clusters.get(i).getVectors().size(); j++)
{
//Interface.featureClusterArea.append(" -contains:\t"
// + ImageSorter.photoSet[clusters.get(i).getVectors().get(j).getNum()].getShortName()
// + "\n");
ImageManipulation.updateFeatResultsImages(clusters.get(i).getVectors().get(j).getNum(), i);
System.out.println("i = " + i);
//Add current image number to temp array
tempRow[j] = clusters.get(i).getVectors().get(j).getNum();
}
// if not the last cluster, add a line return
if(i < clusters.size()-1){
writer.newLine();
}
}
else{
//Interface.featureClusterArea.append("Cluster " + i + ": EMPTY \n");
}
//Add new vector order for cluster to reference array
ImageSorter.featClusterOrder.add(tempRow);
}
ImageManipulation.finaliseFeatResultsImages();
// close the file
writer.close();
}
/**
* Save the clusters to an output file
* @param output the output file path
* @throws IOException exception if there is some writing error.
*/
public void saveToCombFile(String output) throws IOException {
BufferedWriter writer = new BufferedWriter(new FileWriter(output));
int[] tempRow;
ImageSorter.combClusterOrder = new ArrayList<int[]>();
//Interface.featureClusterArea.setText(null);
//Interface.featureClusterArea.append("HIERARCHICAL COMBINED CLUSTERING \n\n");
ImageManipulation.initCombResultsImages();
// for each cluster
for(int i=0; i< clusters.size(); i++){
//initilaise tempRow
tempRow = new int[clusters.get(i).getVectors().size()];
// if the cluster is not empty
if(clusters.get(i).getVectors().size() >= 1){
// write the cluster
writer.write("Cluster"+ i + ": " + clusters.get(i).toString());
//Interface.combinedClusterArea.append("Cluster " + (i+1) + ":\n");
for(int j = 0; j < clusters.get(i).getVectors().size(); j++)
{
//Interface.combinedClusterArea.append(" -contains:\t"
// + ImageSorter.photoSet[clusters.get(i).getVectors().get(j).getNum()].getShortName()
// + "\n");
ImageManipulation.updateCombResultsImages(clusters.get(i).getVectors().get(j).getNum(), i);
System.out.println("i = " + i);
//Add current image number to temp array
tempRow[j] = clusters.get(i).getVectors().get(j).getNum();
}
// if not the last cluster, add a line return
if(i < clusters.size()-1){
writer.newLine();
}
}
else{
//Interface.combinedClusterArea.append("Cluster " + i + ": EMPTY \n");
}
//Add new vector order for cluster to reference array
ImageSorter.combClusterOrder.add(tempRow);
}
ImageManipulation.finaliseCombResultsImages();
// close the file
writer.close();
}
/**
* Print statistics about the latest execution to System.out.
*/
public void printStatistics() {
System.out.println(" ");
System.out.println("========== HIERARCHICAL CLUSTERING - STATS ============");
System.out.println(" Total time ~: " + (endTimestamp - startTimestamp)
+ " ms");
System.out.println(" Max memory:" + MemoryLogger.getInstance().getMaxMemory() + " mb ");
System.out.println(" Iteration count: " + iterationCount);
System.out.println("=====================================");
}
}