package mia.clustering.ch12.lastfm;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.GenericsUtil;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.VectorWritable;
public class VectorMapper extends
Mapper<LongWritable,Text,Text,VectorWritable> {
private Pattern splitter;
private VectorWritable writer;
private Map<String,Integer> dictionary = new HashMap<String,Integer>();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException,
InterruptedException {
String[] fields = splitter.split(value.toString());
if (fields.length < 4) {
context.getCounter("Map", "LinesWithErrors").increment(1);
return;
}
String artist = fields[1];
String tag = fields[2];
double weight = Double.parseDouble(fields[3]);
NamedVector vector = new NamedVector(
new SequentialAccessSparseVector(dictionary.size()), tag);
vector.set(dictionary.get(artist), weight);
writer.set(vector);
context.write(new Text(tag), writer);
}
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
super.setup(context);
Configuration conf = context.getConfiguration();
DefaultStringifier<Map<String,Integer>> mapStringifier
= new DefaultStringifier<Map<String,Integer>>(
conf, GenericsUtil.getClass(dictionary));
dictionary = mapStringifier.fromString(conf.get("dictionary"));
splitter = Pattern.compile("<sep>");
writer = new VectorWritable();
}
}