PhysicalOperator po = pos.get(0);
if (!(po instanceof POLoad)) {
log.debug("Root operator of map is not load.");
return; // Huh?
POLoad load = (POLoad)po;
String loadFunc = load.getLFile().getFuncName();
String loadFile = load.getLFile().getFileName();
if (!("org.apache.pig.impl.builtin.RandomSampleLoader".equals(loadFunc)) && !("org.apache.pig.impl.builtin.PoissonSampleLoader".equals(loadFunc))) {
log.debug("Not a sampling job.");
if (loadFile == null) {
log.debug("No load file");
// Get this job's predecessor. There should be exactly one.
List<MapReduceOper> preds = mPlan.getPredecessors(mr);
if (preds.size() != 1) {
log.debug("Too many predecessors to sampling job.");
MapReduceOper pred = preds.get(0);
// The predecessor should be a root.
List<MapReduceOper> predPreds = mPlan.getPredecessors(pred);
if (predPreds != null && predPreds.size() > 0) {
log.debug("Predecessor should be a root of the plan");
// The predecessor should have just a load and store in the map, and nothing
// in the combine or reduce.
if ( !(pred.reducePlan.isEmpty() && pred.combinePlan.isEmpty())) {
log.debug("Predecessor has a combine or reduce plan");
if (pred.mapPlan == null || pred.mapPlan.size() != 2) {
log.debug("Predecessor has more than just load+store in the map");
List<PhysicalOperator> loads = pred.mapPlan.getRoots();
if (loads.size() != 1) {
log.debug("Predecessor plan has more than one root.");
PhysicalOperator r = loads.get(0);
if (!(r instanceof POLoad)) { // Huh?
log.debug("Predecessor's map plan root is not a load.");
POLoad predLoad = (POLoad)r;
LoadFunc lf = (LoadFunc)PigContext.instantiateFuncFromSpec(predLoad.getLFile().getFuncSpec());
if (!(lf instanceof SamplableLoader)) {
log.debug("Predecessor's loader does not implement SamplableLoader");
// The MR job should have one successor.
List<MapReduceOper> succs = mPlan.getSuccessors(mr);
if (succs.size() != 1) {
log.debug("Job has more than one successor.");
MapReduceOper succ = succs.get(0);
// Find the load that correlates with the file the sampler is loading, and
// check that it is using BinStorage.
if (succ.mapPlan == null) { // Huh?
log.debug("Successor has no map plan.");
loads = succ.mapPlan.getRoots();
POLoad succLoad = null;
for (PhysicalOperator root : loads) {
if (!(root instanceof POLoad)) { // Huh?
log.debug("Successor's roots are not loads");
POLoad sl = (POLoad)root;
if (loadFile.equals(sl.getLFile().getFileName()) &&
"org.apache.pig.builtin.BinStorage".equals(sl.getLFile().getFuncName())) {
succLoad = sl;
if (succLoad == null) {
log.debug("Could not find load that matched file we are sampling.");
// Okay, we're on.
// First, replace our sample loader (RandomSampleLoader or PoissonSampleLoader,
// whichever loadFunc names) with one of the same class that uses the load
// function from our predecessor.
String[] rslargs = new String[2];
FileSpec predFs = predLoad.getLFile();
// First argument is FuncSpec of loader function to subsume, this we want to set for
// ourselves.
rslargs[0] = predFs.getFuncSpec().toString();
// Second argument is the number of samples per block, read this from the original.
rslargs[1] = load.getLFile().getFuncSpec().getCtorArgs()[1];
FileSpec fs = new FileSpec(predFs.getFileName(),new FuncSpec(loadFunc, rslargs));
POLoad newLoad = new POLoad(load.getOperatorKey(),load.getRequestedParallelism(), fs, load.isSplittable());
try {
mr.mapPlan.replace(load, newLoad);
// check if it has PartitionSkewedKeys
List<PhysicalOperator> ls = mr.reducePlan.getLeaves();
for(PhysicalOperator op: ls) {
scan(mr, op, fs.getFileName());
} catch (PlanException e) {
throw new VisitorException(e);
// Second, replace the loader in our successor with whatever the originally used loader was.
fs = new FileSpec(predFs.getFileName(), predFs.getFuncSpec());
newLoad = new POLoad(succLoad.getOperatorKey(), succLoad.getRequestedParallelism(), fs, succLoad.isSplittable());
try {
succ.mapPlan.replace(succLoad, newLoad);
} catch (PlanException e) {
throw new VisitorException(e);