package finderbots.recommenders.hadoop;
import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.pipe.joiner.InnerJoin;
import cascading.scheme.hadoop.TextDelimited;
import cascading.scheme.hadoop.WritableSequenceFile;
import cascading.tap.MultiSourceTap;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
import org.apache.mahout.math.VectorWritable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * User: pat
 * Date: 8/8/13
 * Time: 12:36 PM
 *
 * This class joins two DRMs by their row ids. It writes each joined data set to a CSV with the field names passed in.
 * Examples: [B'B] and [B'A] similarity matrices joined by row id, where row id = item id; B and A joined by
 * row id, where row id = user id.
 * The constructor sets up the fields and field names used by Cascading and for column headers in the CSVs.
 * The main entry point is joinDRMsWriteToSolr(Path iDIndexPath, Path itemIndexPath, Path dRM1InputPath, Path dRM2InputPath, Path groupedCSVOutputPath);
 * the indexes may be identical.
 */
public class WriteDRMsToSolr {
private static Logger LOGGER = Logger.getRootLogger();
FileSystem fs;
static String iDFieldName;
String iD2FieldName;
static String dRM1FieldName;
static String dRM2FieldName;
Fields inFieldsDRM1;
Fields inFieldsDRM2;
Fields common;
Fields grouped;
Fields joinedOutFields;
Fields simpleOutFields;
WriteDRMsToSolr(Map<String, String> fields) throws IOException {
Configuration conf = new JobConf();
fs = FileSystem.get(conf);
iDFieldName = fields.get("iD1");
dRM1FieldName = fields.get("dRM1FieldName");
inFieldsDRM1 = new Fields(iDFieldName, dRM1FieldName);
simpleOutFields = new Fields(iDFieldName, dRM1FieldName);
if(fields.containsKey("dRM2FieldName")){//joining DRMs so defined needed fields
iD2FieldName = iDFieldName+"2";//just to uniqueify it from the other id field name
dRM2FieldName = fields.get("dRM2FieldName");
inFieldsDRM2 = new Fields(iDFieldName, dRM2FieldName);
common = new Fields(iDFieldName);
grouped = new Fields(iDFieldName, dRM1FieldName, iD2FieldName, dRM2FieldName);
joinedOutFields = new Fields(iDFieldName, dRM1FieldName, dRM2FieldName);
void joinDRMsWriteToSolr(Path iDIndexPath, Path itemIndexPath, Path dRM1InputPath , Path dRM2InputPath, Path groupedCSVOutputPath) throws IOException {
MultiSourceTap dRM1Source = getTaps(dRM1InputPath, inFieldsDRM1);
MultiSourceTap dRM2Source = getTaps(dRM2InputPath, inFieldsDRM2);
Pipe lhs = new Pipe("DRM1");
Pipe rhs = new Pipe("DRM2");
Pipe groupByItemIDPipe = new CoGroup(lhs, common, rhs, common, grouped, new InnerJoin());
groupByItemIDPipe = new Each(groupByItemIDPipe, new VectorsToCSVFunction(joinedOutFields));
//the DRMs (Mahout Distributed Row Matrices) have row and items indexes the two dictionary BiHashMaps
//pass these to the output function so the strings from the indexes can be written instead of the
//binary values of the Keys and Vectors in the DRMs
groupByItemIDPipe.getStepConfigDef().setProperty("itemIndexPath", itemIndexPath.toString());
// for these matrices the group by key is the id from the Mahout row key
groupByItemIDPipe.getStepConfigDef().setProperty("rowIndexPath", iDIndexPath.toString());
groupByItemIDPipe.getStepConfigDef().setProperty("joining", "true");
Tap groupedOutputSink = new Hfs(new TextDelimited(true,","), groupedCSVOutputPath.toString());
FlowDef flowDef = new FlowDef()
.addSource(lhs, dRM1Source)
.addSource(rhs, dRM2Source)
.addTailSink(groupByItemIDPipe, groupedOutputSink);
Flow flow = new HadoopFlowConnector().connect(flowDef);
//todo: not sure if it matters but may need to rename the part files to .csv
void writeDRMToSolr(Path iDIndexPath, Path itemIndexPath, Path dRM1InputPath, Path cSVOutputPath) throws IOException {
MultiSourceTap dRM1Source = getTaps(dRM1InputPath, inFieldsDRM1);
Pipe dRM1 = new Pipe("DRM1");
dRM1 = new Each(dRM1, new VectorsToCSVFunction(simpleOutFields));
//the DRM (Mahout Distributed Row Matrix) has row and items indexes the two dictionary BiHashMaps
//pass these to the output function so the strings from the indexes can be written instead of the
//binary values of the Keys and Vectors in the DRMs
dRM1.getStepConfigDef().setProperty("itemIndexPath", itemIndexPath.toString());
dRM1.getStepConfigDef().setProperty("rowIndexPath", iDIndexPath.toString());
dRM1.getStepConfigDef().setProperty("joining", "false");
Tap outputSink = new Hfs(new TextDelimited(true,","), cSVOutputPath.toString());
FlowDef flowDef = new FlowDef()
.addSource(dRM1, dRM1Source)
.addTailSink(dRM1, outputSink);
Flow flow = new HadoopFlowConnector().connect(flowDef);
//todo: not sure if it matters but may need to rename the part files to .csv
MultiSourceTap getTaps(Path p, Fields f) throws IOException {
FileStatus[] stats = fs.listStatus(p);
ArrayList<Tap> all = new ArrayList<Tap>();
if(stats != null){
for( FileStatus s : stats ){
//if(s.getPath().toString().contains("part-")){//found a part-xxxxx file
if(s.getPath().getName().matches("^part.*")){//found a part-xxxxx file
Path filePath = new Path(s.getPath().toString());
//Tap t = new Hfs( inFields, filePath.toString());
//Tap t = new Hfs(new TextLine(), filePath.toString(), true);
Tap t = new Hfs( new WritableSequenceFile( f, LongWritable.class, VectorWritable.class ), filePath.toString() );
if( s.getLen() != 0 ){// then part file is not empty
Tap[] sourceTaps = all.toArray(new Tap[all.size()]);
if(sourceTaps.length == 0 ) throw new IOException("No Source files found at: "+p.toString());
MultiSourceTap source = new MultiSourceTap(sourceTaps);
return source;
public static String getiDFieldName() {
return iDFieldName;
public static String getdRM1FieldName() {
return dRM1FieldName;
public static String getDRM2FieldName() {
return dRM2FieldName;