nwJob.setOutputFormatClass(PigOutputFormat.class);
if (mapStores.size() + reduceStores.size() == 1) { // single store case
log.info("Setting up single store job");
POStore st;
if (reduceStores.isEmpty()) {
st = mapStores.get(0);
if(!pigContext.inIllustrator)
mro.mapPlan.remove(st);
}
else {
st = reduceStores.get(0);
if(!pigContext.inIllustrator)
mro.reducePlan.remove(st);
}
// set the output filespec
String outputPathString = st.getSFile().getFileName();
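// Streaming stderr/log output goes under the store's output path when
// it is on HDFS (or has no scheme); for any other filesystem a
// temporary HDFS location is used for the logs instead.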
if (!outputPathString.contains("://") || outputPathString.startsWith("hdfs://")) {
conf.set("pig.streaming.log.dir",
new Path(outputPathString, LOG_DIR).toString());
} else {
String tmpLocationStr = FileLocalizer
.getTemporaryPath(pigContext).toString();
tmpLocation = new Path(tmpLocationStr);
conf.set("pig.streaming.log.dir",
new Path(tmpLocation, LOG_DIR).toString());
}
conf.set("pig.streaming.task.output.dir", outputPathString);
}
else if (mapStores.size() + reduceStores.size() > 0) { // multi store case
log.info("Setting up multi store job");
String tmpLocationStr = FileLocalizer
.getTemporaryPath(pigContext).toString();
tmpLocation = new Path(tmpLocationStr);
nwJob.setOutputFormatClass(PigOutputFormat.class);
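// In the multi-store case all stores write beneath the shared temporary
// location set above; each POStore is flagged as part of a multi-store
// job and given a distinct index so the outputs can be separated when
// the results are moved to their final locations.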
int idx = 0;
for (POStore sto: storeLocations) {
sto.setMultiStore(true);
sto.setIndex(idx++);
}
conf.set("pig.streaming.log.dir",
new Path(tmpLocation, LOG_DIR).toString());
conf.set("pig.streaming.task.output.dir", tmpLocation.toString());
}
// store map key type
// this is needed when the key is null to create
// an appropriate NullableXXXWritable object
conf.set("pig.map.keytype", ObjectSerializer.serialize(new byte[] { mro.mapKeyType }));
// set parent plan in all operators in map and reduce plans
// currently the parent plan is really used only when POStream is present in the plan
new PhyPlanSetter(mro.mapPlan).visit();
new PhyPlanSetter(mro.reducePlan).visit();
// this call modifies the ReplFiles names of POFRJoin operators
// within the MR plans, must be called before the plans are
// serialized
setupDistributedCacheForJoin(mro, pigContext, conf);
// Search to see if we have any UDFs that need to pack things into the
// distributed cache.
setupDistributedCacheForUdfs(mro, pigContext, conf);
POPackage pack = null;
if(mro.reducePlan.isEmpty()){
//MapOnly Job
nwJob.setMapperClass(PigMapOnly.Map.class);
nwJob.setNumReduceTasks(0);
if(!pigContext.inIllustrator)
conf.set("pig.mapPlan", ObjectSerializer.serialize(mro.mapPlan));
if(mro.isEndOfAllInputSetInMap()) {
// this is used in Map.close() to decide whether the
// pipeline needs to be rerun one more time in the close()
// The pipeline is rerun only if there was a stream or a POMergeJoin
conf.set(END_OF_INP_IN_MAP, "true");
}
}
else{
//Map Reduce Job
//Process the POPackage operator and remove it from the reduce plan
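// If the optimizer produced a combine plan, run it through Hadoop's
// combiner; the POPackage at its root is removed and serialized
// separately ("pig.combine.package") because PigCombiner.Combine
// applies it directly to the grouped input.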
if(!mro.combinePlan.isEmpty()){
POPackage combPack = (POPackage)mro.combinePlan.getRoots().get(0);
mro.combinePlan.remove(combPack);
nwJob.setCombinerClass(PigCombiner.Combine.class);
conf.set("pig.combinePlan", ObjectSerializer.serialize(mro.combinePlan));
conf.set("pig.combine.package", ObjectSerializer.serialize(combPack));
} else if (mro.needsDistinctCombiner()) {
nwJob.setCombinerClass(DistinctCombiner.Combine.class);
log.info("Setting identity combiner class.");
}
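// The POPackage at the root of the reduce plan is likewise removed and
// shipped on its own ("pig.reduce.package"): the reducer applies it to
// the grouped key/values before running the rest of the reduce plan.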
pack = (POPackage)mro.reducePlan.getRoots().get(0);
if(!pigContext.inIllustrator)
mro.reducePlan.remove(pack);
nwJob.setMapperClass(PigMapReduce.Map.class);
nwJob.setReducerClass(PigMapReduce.Reduce.class);
// first check the PARALLEL clause in the query, then the defaultParallel in PigContext, and finally fall back to estimation
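// e.g. "b = GROUP a BY f PARALLEL 10;" or "SET default_parallel 20"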
if (mro.requestedParallelism > 0)
nwJob.setNumReduceTasks(mro.requestedParallelism);
else if (pigContext.defaultParallel > 0)
conf.set("mapred.reduce.tasks", ""+pigContext.defaultParallel);
else
estimateNumberOfReducers(conf,lds);
if (mro.customPartitioner != null)
nwJob.setPartitionerClass(PigContext.resolveClassName(mro.customPartitioner));
if(!pigContext.inIllustrator)
conf.set("pig.mapPlan", ObjectSerializer.serialize(mro.mapPlan));
if(mro.isEndOfAllInputSetInMap()) {
// this is used in Map.close() to decide whether the
// pipeline needs to be rerun one more time in the close()
// The pipeline is rerun only if there was a stream or merge-join.
conf.set(END_OF_INP_IN_MAP, "true");
}
if(!pigContext.inIllustrator)
conf.set("pig.reducePlan", ObjectSerializer.serialize(mro.reducePlan));
if(mro.isEndOfAllInputSetInReduce()) {
// this is used in Reduce.close() to decide whether the
// pipeline needs to be rerun one more time in the close()
// The pipeline is rerun only if there was a stream
conf.set("pig.stream.in.reduce", "true");
}
if (!pigContext.inIllustrator)
conf.set("pig.reduce.package", ObjectSerializer.serialize(pack));
conf.set("pig.reduce.key.type", Byte.toString(pack.getKeyType()));
if (mro.getUseSecondaryKey()) {
nwJob.setGroupingComparatorClass(PigSecondaryKeyGroupComparator.class);
nwJob.setPartitionerClass(SecondaryKeyPartitioner.class);
nwJob.setSortComparatorClass(PigSecondaryKeyComparator.class);
nwJob.setOutputKeyClass(NullableTuple.class);
conf.set("pig.secondarySortOrder",
ObjectSerializer.serialize(mro.getSecondarySortOrder()));
}
else
{
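// No secondary key: the output key class is the NullableXXXWritable
// matching the Pig key type, with a comparator selected to match.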
Class<? extends WritableComparable> keyClass = HDataType.getWritableComparableTypes(pack.getKeyType()).getClass();
nwJob.setOutputKeyClass(keyClass);
selectComparator(mro, pack.getKeyType(), nwJob);
}
nwJob.setOutputValueClass(NullableTuple.class);
}
if(mro.isGlobalSort() || mro.isLimitAfterSort()){
// Only set the quantiles file and sort partitioner if we're a
// global sort, not for limit after sort.
if (mro.isGlobalSort()) {
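// Ship the quantiles file produced by the sampling job through the
// distributed cache; WeightedRangePartitioner reads it to assign keys
// to balanced, ordered ranges.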
String symlink = addSingleFileToDistributedCache(
pigContext, conf, mro.getQuantFile(), "pigsample");
conf.set("pig.quantilesFile", symlink);
nwJob.setPartitionerClass(WeightedRangePartitioner.class);
}
if (mro.isUDFComparatorUsed) {
boolean usercomparator = false;
for (String compFuncSpec : mro.UDFs) {
Class comparator = PigContext.resolveClassName(compFuncSpec);
if(ComparisonFunc.class.isAssignableFrom(comparator)) {
nwJob.setMapperClass(PigMapReduce.MapWithComparator.class);
nwJob.setReducerClass(PigMapReduce.ReduceWithComparator.class);
conf.set("pig.reduce.package", ObjectSerializer.serialize(pack));
conf.set("pig.usercomparator", "true");
nwJob.setOutputKeyClass(NullableTuple.class);
nwJob.setSortComparatorClass(comparator);
usercomparator = true;
break;
}
}
if (!usercomparator) {
String msg = "Internal error. Can't find the UDF comparator";
throw new IOException(msg);
}
} else {
conf.set("pig.sortOrder",
ObjectSerializer.serialize(mro.getSortOrder()));
}
}
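// Skewed join: the key-distribution file sampled from the larger input
// is shipped via the distributed cache, and SkewedPartitioner uses it
// to spread over-represented keys across multiple reducers (hence the
// partition index carried in NullablePartitionWritable).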
if (mro.isSkewedJoin()) {
String symlink = addSingleFileToDistributedCache(pigContext,
conf, mro.getSkewedJoinPartitionFile(), "pigdistkey");
conf.set("pig.keyDistFile", symlink);
nwJob.setPartitionerClass(SkewedPartitioner.class);
nwJob.setMapperClass(PigMapReduce.MapWithPartitionIndex.class);
nwJob.setMapOutputKeyClass(NullablePartitionWritable.class);
nwJob.setGroupingComparatorClass(PigGroupingPartitionWritableComparator.class);
}
if (!pigContext.inIllustrator)
{
// unset inputs for POStore; otherwise the map/reduce plan will be unnecessarily deserialized
for (POStore st: mapStores) { st.setInputs(null); st.setParentPlan(null);}
for (POStore st: reduceStores) { st.setInputs(null); st.setParentPlan(null);}
conf.set(PIG_MAP_STORES, ObjectSerializer.serialize(mapStores));
conf.set(PIG_REDUCE_STORES, ObjectSerializer.serialize(reduceStores));
}
// tmp file compression setups