joinOp.setupRightPipeline(rightPipelinePlan);
rightMROpr.requestedParallelism = 1; // we need exactly one reducer for indexing job.
// At this point, we must be operating on the map plan of the right input, and it would contain nothing other than a POLoad.
POLoad rightLoader = (POLoad)rightMROpr.mapPlan.getRoots().get(0);
LoadFunc rightLoadFunc = (LoadFunc) PigContext.instantiateFuncFromSpec(rightLoader.getLFile().getFuncSpec());
joinOp.setSignature(rightLoader.getSignature());
if(rightLoadFunc instanceof IndexableLoadFunc) {
joinOp.setRightLoaderFuncSpec(rightLoader.getLFile().getFuncSpec());
joinOp.setRightInputFileName(rightLoader.getLFile().getFileName());
// we don't need the right MROper since
// the right loader is an IndexableLoadFunc which can handle the index
// itself
MRPlan.remove(rightMROpr);
if(rightMROpr == compiledInputs[0]) {
compiledInputs[0] = null;
} else if(rightMROpr == compiledInputs[1]) {
compiledInputs[1] = null;
}
rightMROpr = null;
// validate that the join keys in merge join are only
// simple column projections or '*' and not expressions - expressions
// cannot be handled when the index is built by the storage layer on the sorted
// data at the time the sorted data (and corresponding index) is written.
// So merge join is restricted to not have expressions as
// join keys
int numInputs = mPlan.getPredecessors(joinOp).size(); // should be 2
for(int i = 0; i < numInputs; i++) {
List<PhysicalPlan> keyPlans = joinOp.getInnerPlansOf(i);
for (PhysicalPlan keyPlan : keyPlans) {
for(PhysicalOperator op : keyPlan) {
if(!(op instanceof POProject)) {
int errCode = 1106;
String errMsg = "Merge join is possible only for simple column or '*' join keys when using " +
rightLoader.getLFile().getFuncSpec() + " as the loader";
throw new MRCompilerException(errMsg, errCode, PigException.INPUT);
}
}
}
}
} else {
// Replace POLoad with indexer.
String[] indexerArgs = new String[3];
FileSpec origRightLoaderFileSpec = rightLoader.getLFile();
indexerArgs[0] = origRightLoaderFileSpec.getFuncSpec().toString();
if (! (PigContext.instantiateFuncFromSpec(indexerArgs[0]) instanceof SamplableLoader)){
int errCode = 1104;
String errMsg = "Right input of merge-join must implement SamplableLoader interface. The specified loader " + indexerArgs[0] + " doesn't implement it";
throw new MRCompilerException(errMsg,errCode);
}
List<PhysicalPlan> rightInpPlans = joinOp.getInnerPlansOf(1);
indexerArgs[1] = ObjectSerializer.serialize((Serializable)rightInpPlans);
indexerArgs[2] = ObjectSerializer.serialize(rightPipelinePlan);
FileSpec lFile = new FileSpec(rightLoader.getLFile().getFileName(),new FuncSpec(MergeJoinIndexer.class.getName(), indexerArgs));
rightLoader.setLFile(lFile);
// Loader of mro will return a tuple of form (key1, key2, ..,filename, offset)
// Now set up a POLocalRearrange which has "all" as the key and tuple fetched
// by loader as the "value" of POLocalRearrange
// Sorting of index can possibly be achieved by using Hadoop sorting between map and reduce instead of Pig doing sort. If that is so,