Package com.chine.kmeans.mapreduce.kmeansiter

Source Code of com.chine.kmeans.mapreduce.kmeansiter.KmeansIterMapper

package com.chine.kmeans.mapreduce.kmeansiter;

import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.text.NumberFormat;
import java.text.DecimalFormat;

import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;

import com.chine.kmeans.models.Movie;
import com.chine.kmeans.mapreduce.ConfiguredKmeans;

public class KmeansIterMapper extends Mapper<Text, Text, Text, Text> {
 
  private static HashMap<Integer, ArrayList<Movie>> canopyKmeansCenters =
    new HashMap<Integer, ArrayList<Movie>>();
 
  private boolean hasLoadCenters = false;
  private NumberFormat nf = new DecimalFormat("00000");
 
  @Override
  public void setup(Context context) throws IOException {
    if(hasLoadCenters) return;
    else
      hasLoadCenters = true;
   
    List<Movie> canopyCenters = new ArrayList<Movie>();
    List<Movie> kmeansCenters = new ArrayList<Movie>();
   
    Configuration conf = context.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
   
    String canopyPath = conf.get(ConfiguredKmeans.CANOPY_CENTERS_OUTPUT_KEY);
    canopyPath = canopyPath.endsWith("/") ? canopyPath + "part-r-00000":
      canopyPath + "/part-r-00000";
    String kmeansPath = conf.get(ConfiguredKmeans.KMEANS_OUTPUT_KEY);
    kmeansPath = kmeansPath.endsWith("/") ? kmeansPath + "part-r-":
      kmeansPath + "/part-r-";
    String kmeansPartPath = "";
    int kmeansPartCount = Integer.valueOf(conf.get(
        ConfiguredKmeans.KMEANS_OUTPUT_NUM_KEY));
   
    loadCanopyCenters(canopyPath, conf, fs, canopyCenters);
    for(int i=0; i<kmeansPartCount; i++) {
      kmeansPartPath = kmeansPath + nf.format(i);   
      loadKmeansCenters(kmeansPartPath, conf, fs, kmeansCenters);
    }
   
    for(Movie cMovie: canopyCenters) {
      for(Movie kMovie: kmeansCenters) {
        if(cMovie.getMatchCount(kMovie) >= ConfiguredKmeans.T2) {
          if(!canopyKmeansCenters.containsKey(cMovie))
            canopyKmeansCenters.put(cMovie.getMovieId(),
                new ArrayList<Movie>());
          canopyKmeansCenters.get(cMovie.getMovieId()).add(kMovie);
        }
      }
    }
  }
 
  private void loadCanopyCenters(
      String input,
      Configuration conf,
      FileSystem fs,
      List<Movie> canopyCenters) throws IOException {
   
    Path path = new Path(input);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
   
    Text key = new Text();
    Text value = new Text();
    while(reader.next(key, value)) {
      Movie currentMovie = new Movie(Integer.valueOf(key.toString())
          , value.toString());
      if(currentMovie.getMap().size() > 5)
        canopyCenters.add(currentMovie);
    }
  }
 
  private void loadKmeansCenters(
      String input,
      Configuration conf,
      FileSystem fs,
      List<Movie> kmeansCenters) throws IOException {
   
    Path path = new Path(input);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
   
    Text key = new Text();
    Text value = new Text();
    while(reader.next(key, value)) {
      Movie currentMovie = new Movie(Integer.valueOf(key.toString())
          , value.toString());
      if(currentMovie.getMap().size() > 5)
        kmeansCenters.add(currentMovie);
    }
  }
 
  @Override
  public void map(Text key, Text value, Context context)
    throws IOException, InterruptedException {
   
    String[] splits = value.toString().split(":");
    int movieId = Integer.valueOf(splits[splits.length-2]);
    String data = splits[splits.length-1];
    Movie currentMovie = new Movie(movieId, data);
   
    HashSet<Movie> hasCalcCenters = new HashSet<Movie>();
    double maxDst = -1;
    Movie maxMovie = null;
   
    for(int i=0; i<splits.length-2; i++) {
      Integer cId = Integer.valueOf(splits[i]);
      ArrayList<Movie> movies = canopyKmeansCenters.get(cId);
      if(movies == null)
        continue;
     
      for(Movie movie: movies) {
        if(hasCalcCenters.contains(movie))
          continue;
        hasCalcCenters.add(movie);
       
        double dst = movie.getComplexDistance(currentMovie);
        if(dst > maxDst) {
          maxDst = dst;
          maxMovie = movie;
        }
      }
     
    }
   
    if(maxDst > -1) {
      context.write(new Text(String.valueOf(maxMovie.getMovieId())),
          new Text(String.valueOf(movieId)+":"+data));
    }
   
  }
 
}
TOP

Related Classes of com.chine.kmeans.mapreduce.kmeansiter.KmeansIterMapper

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.