package com.chine.kmeans.mapreduce.kmeansiter;
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.text.NumberFormat;
import java.text.DecimalFormat;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import com.chine.kmeans.models.Movie;
import com.chine.kmeans.mapreduce.ConfiguredKmeans;
/**
 * Mapper for one k-means iteration: assigns each input movie to one of the
 * current k-means centers, considering only centers that share a canopy
 * with the movie.
 *
 * <p>During {@link #setup(Context)} the canopy centers and the current
 * k-means centers are read from sequence files and grouped into a map from
 * canopy center id to the k-means centers that fall into that canopy.
 *
 * <p>Each input value is a colon-separated record of the form
 * {@code canopyId[:canopyId...]:movieId:data}. The output key is the id of
 * the chosen k-means center and the output value is {@code movieId:data}.
 */
public class KmeansIterMapper extends Mapper<Text, Text, Text, Text> {

    /** Name of the single part file the canopy centers are read from. */
    private static final String CANOPY_PART_FILE = "part-r-00000";

    /** Prefix of the k-means part files; a five-digit index is appended. */
    private static final String KMEANS_PART_PREFIX = "part-r-";

    /** Centers whose rating map has this many entries or fewer are ignored. */
    private static final int MIN_CENTER_MAP_SIZE = 5;

    /** Canopy center id -> k-means centers that belong to that canopy. */
    private static HashMap<Integer, ArrayList<Movie>> canopyKmeansCenters =
            new HashMap<Integer, ArrayList<Movie>>();

    /** Guards against loading the centers twice on the same mapper instance. */
    private boolean hasLoadCenters = false;

    /** Formats a part index as the five-digit suffix of a part file name. */
    private NumberFormat nf = new DecimalFormat("00000");

    /**
     * Loads the canopy centers and k-means centers named in the job
     * configuration and builds the canopy-to-k-means-centers lookup.
     *
     * @param context task context providing the job configuration
     * @throws IOException if a centers file cannot be opened or read
     */
    @Override
    public void setup(Context context) throws IOException {
        if (hasLoadCenters) {
            return;
        }
        hasLoadCenters = true;

        Configuration conf = context.getConfiguration();
        FileSystem fs = FileSystem.get(conf);

        String canopyPath = withTrailingSlash(
                conf.get(ConfiguredKmeans.CANOPY_CENTERS_OUTPUT_KEY)) + CANOPY_PART_FILE;
        String kmeansPath = withTrailingSlash(
                conf.get(ConfiguredKmeans.KMEANS_OUTPUT_KEY)) + KMEANS_PART_PREFIX;
        int kmeansPartCount = Integer.parseInt(
                conf.get(ConfiguredKmeans.KMEANS_OUTPUT_NUM_KEY));

        List<Movie> canopyCenters = new ArrayList<Movie>();
        List<Movie> kmeansCenters = new ArrayList<Movie>();

        loadCanopyCenters(canopyPath, conf, fs, canopyCenters);
        for (int i = 0; i < kmeansPartCount; i++) {
            loadKmeansCenters(kmeansPath + nf.format(i), conf, fs, kmeansCenters);
        }

        // Build into a fresh map and swap the reference afterwards, so lists
        // left in the static field by an earlier mapper instance in the same
        // JVM are never appended to.
        HashMap<Integer, ArrayList<Movie>> grouped =
                new HashMap<Integer, ArrayList<Movie>>();
        for (Movie cMovie : canopyCenters) {
            for (Movie kMovie : kmeansCenters) {
                if (cMovie.getMatchCount(kMovie) >= ConfiguredKmeans.T2) {
                    // Look the list up by the canopy's id (the map key type).
                    // Checking with the Movie object itself never matches and
                    // would replace the list on every hit.
                    ArrayList<Movie> members = grouped.get(cMovie.getMovieId());
                    if (members == null) {
                        members = new ArrayList<Movie>();
                        grouped.put(cMovie.getMovieId(), members);
                    }
                    members.add(kMovie);
                }
            }
        }
        canopyKmeansCenters = grouped;
    }

    /** Returns {@code dir} with a single "/" appended if it does not already end with one. */
    private static String withTrailingSlash(String dir) {
        return dir.endsWith("/") ? dir : dir + "/";
    }

    /**
     * Reads canopy centers from the sequence file at {@code input} and
     * appends them to {@code canopyCenters}.
     *
     * @throws IOException if the file cannot be opened or read
     */
    private void loadCanopyCenters(
            String input,
            Configuration conf,
            FileSystem fs,
            List<Movie> canopyCenters) throws IOException {
        loadCenters(input, conf, fs, canopyCenters);
    }

    /**
     * Reads k-means centers from the sequence file at {@code input} and
     * appends them to {@code kmeansCenters}.
     *
     * @throws IOException if the file cannot be opened or read
     */
    private void loadKmeansCenters(
            String input,
            Configuration conf,
            FileSystem fs,
            List<Movie> kmeansCenters) throws IOException {
        loadCenters(input, conf, fs, kmeansCenters);
    }

    /**
     * Reads every (movie id, data) record from a sequence file and appends
     * the movies whose map has more than {@link #MIN_CENTER_MAP_SIZE}
     * entries to {@code centers}. The reader is always closed.
     *
     * @param input   path of the sequence file to read
     * @param conf    job configuration used to open the file
     * @param fs      file system the path lives on
     * @param centers destination list, appended to in file order
     * @throws IOException if the file cannot be opened or read
     */
    private void loadCenters(
            String input,
            Configuration conf,
            FileSystem fs,
            List<Movie> centers) throws IOException {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(input), conf);
        try {
            Text key = new Text();
            Text value = new Text();
            while (reader.next(key, value)) {
                Movie currentMovie = new Movie(
                        Integer.valueOf(key.toString()), value.toString());
                if (currentMovie.getMap().size() > MIN_CENTER_MAP_SIZE) {
                    centers.add(currentMovie);
                }
            }
        } finally {
            // Release the underlying stream even when reading fails.
            reader.close();
        }
    }

    /**
     * Picks, among the k-means centers of the canopies listed in the record,
     * the one with the largest {@code getComplexDistance} value for the
     * record's movie, and emits {@code (centerId, "movieId:data")}. Nothing
     * is emitted when none of the listed canopies has any k-means center.
     *
     * @param key     input key (unused)
     * @param value   record of the form {@code canopyId[:canopyId...]:movieId:data}
     * @param context sink for the output pair
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if writing the output is interrupted
     */
    @Override
    public void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] splits = value.toString().split(":");
        // The last two fields are the movie id and its data; everything
        // before them is a canopy id.
        int movieId = Integer.parseInt(splits[splits.length - 2]);
        String data = splits[splits.length - 1];
        Movie currentMovie = new Movie(movieId, data);

        // A k-means center can belong to several canopies; score it once.
        HashSet<Movie> hasCalcCenters = new HashSet<Movie>();
        double maxDst = -1;
        Movie maxMovie = null;

        for (int i = 0; i < splits.length - 2; i++) {
            Integer cId = Integer.valueOf(splits[i]);
            ArrayList<Movie> movies = canopyKmeansCenters.get(cId);
            if (movies == null) {
                continue;
            }
            for (Movie movie : movies) {
                // add() returns false when the center was already scored.
                if (!hasCalcCenters.add(movie)) {
                    continue;
                }
                // NOTE(review): the largest value wins, so this is presumably
                // a similarity score rather than a distance - confirm in Movie.
                double dst = movie.getComplexDistance(currentMovie);
                if (dst > maxDst) {
                    maxDst = dst;
                    maxMovie = movie;
                }
            }
        }

        if (maxDst > -1) {
            context.write(new Text(String.valueOf(maxMovie.getMovieId())),
                    new Text(String.valueOf(movieId) + ":" + data));
        }
    }
}