package com.chine.kmeans.mapreduce.canopydata;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import com.chine.kmeans.mapreduce.ConfiguredKmeans;
import com.chine.kmeans.models.Movie;
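
/**
 * Assigns each input movie to the canopy centers it falls within.
 *
 * The mapper loads the canopy centers produced by the canopy-selection job,
 * then, for every input movie, collects the ids of all centers whose match
 * count reaches the T2 threshold. The movie is emitted together with that
 * canopy list so that later k-means passes can restrict comparisons to
 * movies sharing at least one canopy.
 */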
public class CanopyDataMapper extends Mapper<Text, Text, Text, Text> {

    // Guards against loading the canopy centers more than once per task.
    private boolean hasLoadCanopyCenters = false;

    // Canopy centers read from the canopy-selection job's output.
    private List<Movie> canopyMovieCenters = new ArrayList<Movie>();
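
    /**
     * Loads the canopy centers once per mapper task from the single
     * sequence file (part-r-00000) written by the canopy-selection job.
     */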
    @Override
    public void setup(Context context) throws IOException, InterruptedException {
        // Load the centers only once, even if setup is invoked again.
        if (hasLoadCanopyCenters) {
            return;
        }
        hasLoadCanopyCenters = true;

        Configuration conf = context.getConfiguration();
        FileSystem fs = FileSystem.get(conf);

        // The canopy-selection job writes its centers through a single reducer.
        String folderPath = conf.get(ConfiguredKmeans.CANOPY_CENTERS_OUTPUT_KEY);
        String filePath = folderPath.endsWith("/") ? folderPath + "part-r-00000"
                : folderPath + "/part-r-00000";
        Path path = new Path(filePath);

        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        Text key = new Text();
        Text value = new Text();
        try {
            // Each record is a canopy center: movie id -> serialized rating data.
            while (reader.next(key, value)) {
                Movie movie = new Movie(Integer.valueOf(key.toString()), value.toString());
                this.canopyMovieCenters.add(movie);
            }
        } finally {
            reader.close();
        }
    }
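
    /**
     * Emits a movie only if it matches at least one canopy center.
     * The output value has the form "canopyId[:canopyId...]:movieId:data".
     */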
    @Override
    public void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        int movieId = Integer.valueOf(key.toString());
        String data = value.toString();
        Movie currentMovie = new Movie(movieId, data);

        boolean emit = false;
        StringBuilder sb = new StringBuilder();

        // Collect the ids of every canopy center this movie falls within.
        for (Movie canopyMovie : canopyMovieCenters) {
            if (currentMovie.getMatchCount(canopyMovie) >= ConfiguredKmeans.T2) {
                emit = true;
                if (sb.length() > 0) {
                    sb.append(":");
                }
                // Append the matching canopy center's id, not the current movie's id.
                sb.append(canopyMovie.getMovieId());
            }
        }

        // Only movies that belong to at least one canopy are emitted.
        if (emit) {
            sb.append(":");
            sb.append(movieId);
            sb.append(":");
            sb.append(data);
            context.write(key, new Text(sb.toString()));
        }
    }
}