// first job uses a single reducer for the sampling
Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());
// Simulate the first job having run so estimation kicks in.
MapReduceOper sort = mrPlan.getLeaves().get(0);
jcc.updateMROpPlan(jobControl.getReadyJobs());
FileLocalizer.create(sort.getQuantFile(), pc);
jobControl = jcc.compile(mrPlan, query);
sort = mrPlan.getLeaves().get(0);
long reducer=Math.min((long)Math.ceil(new File("test/org/apache/pig/test/data/passwd").length()/100.0), 10);
assertEquals(reducer, sort.getRequestedParallelism());
// the second job estimates reducers
Util.assertParallelValues(-1, -1, reducer, reducer, jobControl.getWaitingJobs().get(0).getJobConf());
// use the PARALLEL keyword; it overrides the estimated number of reducers
query = "a = load '/passwd';" + "b = order a by $0 PARALLEL 2;" +
"store b into 'output';";
pp = Util.buildPp(ps, query);
mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
assertEquals(2, mrPlan.size());
sort = mrPlan.getLeaves().get(0);
assertEquals(2, sort.getRequestedParallelism());
// the estimation won't take effect when the input is not on DFS or the files don't exist, e.g. HBase
query = "a = load 'hbase://passwd' using org.apache.pig.backend.hadoop.hbase.HBaseStorage('c:f1 c:f2');" +
"b = order a by $0 ;" +
"store b into 'output';";
pp = Util.buildPp(ps, query);
mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
assertEquals(2, mrPlan.size());
sort = mrPlan.getLeaves().get(0);
// the requested parallelism will be -1 if users set neither default_parallel nor PARALLEL
// and the estimation doesn't take effect. The MR framework will finally set it to 1.
assertEquals(-1, sort.getRequestedParallelism());
// test order by with three jobs (after optimization)
query = "a = load '/passwd';" +
"b = foreach a generate $0, $1, $2;" +
"c = order b by $0;" +
"store c into 'output';";
pp = Util.buildPp(ps, query);
mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
assertEquals(3, mrPlan.size());
// Simulate the first 2 jobs having run so estimation kicks in.
sort = mrPlan.getLeaves().get(0);
FileLocalizer.create(sort.getQuantFile(), pc);
jobControl = jcc.compile(mrPlan, query);
Util.copyFromLocalToCluster(cluster, "test/org/apache/pig/test/data/passwd", ((POLoad) sort.mapPlan.getRoots().get(0)).getLFile().getFileName());
//First job is just foreach with projection, mapper-only job, so estimate gets ignored
Util.assertParallelValues(-1, -1, reducer, 0, jobControl.getWaitingJobs().get(0).getJobConf());
jcc.updateMROpPlan(jobControl.getReadyJobs());
jobControl = jcc.compile(mrPlan, query);
jcc.updateMROpPlan(jobControl.getReadyJobs());
//Second job is a sampler, which requests and gets 1 reducer
Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());
jobControl = jcc.compile(mrPlan, query);
sort = mrPlan.getLeaves().get(0);
assertEquals(reducer, sort.getRequestedParallelism());
//Third job is the order, which uses the estimated number of reducers
Util.assertParallelValues(-1, -1, reducer, reducer, jobControl.getWaitingJobs().get(0).getJobConf());
}