flat1.add(true);
}
}
// This foreach will pick the sort key columns from the RandomSampleLoader output
POForEach nfe1 = new POForEach(new OperatorKey(scope,nig.getNextNodeId(scope)),-1,eps1,flat1);
mro.mapPlan.addAsLeaf(nfe1);
// Now set up a POLocalRearrange which has "all" as the key and the output of the
// foreach will be the "value" out of POLocalRearrange
PhysicalPlan ep1 = new PhysicalPlan();
ConstantExpression ce = new ConstantExpression(new OperatorKey(scope,nig.getNextNodeId(scope)));
ce.setValue("all");
ce.setResultType(DataType.CHARARRAY);
ep1.add(ce);
List<PhysicalPlan> eps = new ArrayList<PhysicalPlan>();
eps.add(ep1);
POLocalRearrange lr = new POLocalRearrange(new OperatorKey(scope,nig.getNextNodeId(scope)));
try {
lr.setIndex(0);
} catch (ExecException e) {
int errCode = 2058;
String msg = "Unable to set index on newly created POLocalRearrange.";
throw new PlanException(msg, errCode, PigException.BUG, e);
}
lr.setKeyType(DataType.CHARARRAY);
lr.setPlans(eps);
lr.setResultType(DataType.TUPLE);
mro.mapPlan.add(lr);
mro.mapPlan.connect(nfe1, lr);
mro.setMapDone(true);
POPackage pkg = new POPackage(new OperatorKey(scope,nig.getNextNodeId(scope)));
pkg.setKeyType(DataType.CHARARRAY);
pkg.setNumInps(1);
boolean[] inner = {false};
pkg.setInner(inner);
mro.reducePlan.add(pkg);
// Let's start building the plan which will contain the sort
// for the foreach
PhysicalPlan fe2Plan = new PhysicalPlan();
// Top level project which just projects the tuple which is coming
// from the foreach after the package
POProject topPrj = new POProject(new OperatorKey(scope,nig.getNextNodeId(scope)));
topPrj.setColumn(1);
topPrj.setResultType(DataType.TUPLE);
topPrj.setOverloaded(true);
fe2Plan.add(topPrj);
// the projections which will form sort plans
List<PhysicalPlan> nesSortPlanLst = new ArrayList<PhysicalPlan>();
if (sortKeyPlans != null) {
for(int i=0; i<sortKeyPlans.size(); i++) {
nesSortPlanLst.add(sortKeyPlans.get(i));
}
}else{
Pair<Integer,Byte>[] fields = null;
try{
fields = getSortCols(sort.getSortPlans());
}catch(Exception e) {
throw new RuntimeException(e);
}
// Set up the projections of the key columns
if (fields == null) {
PhysicalPlan ep = new PhysicalPlan();
POProject prj = new POProject(new OperatorKey(scope,
nig.getNextNodeId(scope)));
prj.setStar(true);
prj.setOverloaded(false);
prj.setResultType(DataType.TUPLE);
ep.add(prj);
nesSortPlanLst.add(ep);
} else {
for (int i=0; i<fields.length; i++) {
PhysicalPlan ep = new PhysicalPlan();
POProject prj = new POProject(new OperatorKey(scope,nig.getNextNodeId(scope)));
prj.setColumn(i);
prj.setOverloaded(false);
prj.setResultType(fields[i].second);
ep.add(prj);
nesSortPlanLst.add(ep);
}
}
}
sort.setSortPlans(nesSortPlanLst);
sort.setResultType(DataType.BAG);
fe2Plan.add(sort);
fe2Plan.connect(topPrj, sort);
// The plan which will have a constant representing the
// degree of parallelism for the final order by map-reduce job.
// This will either come from an "order by parallel x" in the script
// or, if "parallel x" is not used in the script, will be derived from
// the default number of reducers for the cluster (0.9 times that
// number, rounded, and never less than 1)
PhysicalPlan rpep = new PhysicalPlan();
ConstantExpression rpce = new ConstantExpression(new OperatorKey(scope,nig.getNextNodeId(scope)));
rpce.setRequestedParallelism(rp);
int val = rp;
if(val<=0){
ExecutionEngine eng = pigContext.getExecutionEngine();
if(eng instanceof HExecutionEngine){
try {
val = Math.round(0.9f * ((HExecutionEngine)eng).getJobClient().getDefaultReduces());
if(val<=0)
val = 1;
} catch (IOException e) {
int errCode = 6015;
String msg = "Problem getting the default number of reduces from the Job Client.";
throw new MRCompilerException(msg, errCode, PigException.REMOTE_ENVIRONMENT, e);
}
} else {
val = 1; // local mode, set it to 1
}
}
int parallelismForSort = (rp <= 0 ? val : rp);
rpce.setValue(parallelismForSort);
rpce.setResultType(DataType.INTEGER);
rpep.add(rpce);
List<PhysicalPlan> genEps = new ArrayList<PhysicalPlan>();
genEps.add(rpep);
genEps.add(fe2Plan);
List<Boolean> flattened2 = new ArrayList<Boolean>();
flattened2.add(false);
flattened2.add(false);
POForEach nfe2 = new POForEach(new OperatorKey(scope,nig.getNextNodeId(scope)),-1, genEps, flattened2);
mro.reducePlan.add(nfe2);
mro.reducePlan.connect(pkg, nfe2);
// Let's connect the output from the foreach containing
// number of quantiles and the sorted bag of samples to
// another foreach with the FindQuantiles udf. The input
// to the FindQuantiles udf is a project(*) which takes the
// foreach input and gives it to the udf
PhysicalPlan ep4 = new PhysicalPlan();
POProject prjStar4 = new POProject(new OperatorKey(scope,nig.getNextNodeId(scope)));
prjStar4.setResultType(DataType.TUPLE);
prjStar4.setStar(true);
ep4.add(prjStar4);
List<PhysicalOperator> ufInps = new ArrayList<PhysicalOperator>();
ufInps.add(prjStar4);
POUserFunc uf = new POUserFunc(new OperatorKey(scope,nig.getNextNodeId(scope)), -1, ufInps,
new FuncSpec(udfClassName, udfArgs));
ep4.add(uf);
ep4.connect(prjStar4, uf);
List<PhysicalPlan> ep4s = new ArrayList<PhysicalPlan>();
ep4s.add(ep4);
List<Boolean> flattened3 = new ArrayList<Boolean>();
flattened3.add(false);
POForEach nfe3 = new POForEach(new OperatorKey(scope,nig.getNextNodeId(scope)), -1, ep4s, flattened3);
mro.reducePlan.add(nfe3);
mro.reducePlan.connect(nfe2, nfe3);
POStore str = getStore();