.serialize(new byte[] { combRearrange.getKeyType() }));
}
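
    /**
     * Creates the Tez vertex for the given TezOperator: serializes the
     * operator's physical plan and related settings into a JobConf, wraps the
     * conf as the processor's user payload, and sizes the vertex from the
     * operator's parallelism and MR-derived resource settings.
     */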
private Vertex newVertex(TezOperator tezOp) throws IOException,
ClassNotFoundException, InterruptedException {
ProcessorDescriptor procDesc = ProcessorDescriptor.create(
tezOp.getProcessorName());
// Pass physical plans to vertex as user payload.
JobConf payloadConf = new JobConf(ConfigurationUtil.toConfiguration(pc.getProperties(), false));
        // We do this so that dag.getCredentials(), job.getCredentials(), and
        // job.getConfiguration().getCredentials() all reference the same
        // Credentials object. Unfortunately there is no setCredentials() on Job.
payloadConf.setCredentials(dag.getCredentials());
        // We won't actually use this job, but we need it to talk to the load/store funcs
@SuppressWarnings("deprecation")
Job job = new Job(payloadConf);
payloadConf = (JobConf) job.getConfiguration();
if (tezOp.getSampleOperator() != null) {
payloadConf.set(PigProcessor.SAMPLE_VERTEX, tezOp.getSampleOperator().getOperatorKey().toString());
}
if (tezOp.getSortOperator() != null) {
// Required by Sample Aggregation job for estimating quantiles
payloadConf.set(PigProcessor.SORT_VERTEX, tezOp.getSortOperator().getOperatorKey().toString());
            // PIG-4162: Order by/skew join in an intermediate stage.
            // Increasing order-by parallelism may not be required, as it is
            // usually followed by a limit rather than a store. But it would
            // benefit cases like a skewed join followed by a group by.
if (tezOp.getSortOperator().getEstimatedParallelism() != -1
&& TezCompilerUtil.isIntermediateReducer(tezOp.getSortOperator())) {
payloadConf.setLong(
InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
intermediateTaskInputSize);
}
}
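        // Serialize the load-side metadata (inputs, signatures and limits) so
        // the loaders can be reconstructed in the vertex tasks at runtime.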
payloadConf.set("pig.inputs", ObjectSerializer.serialize(tezOp.getLoaderInfo().getInp()));
payloadConf.set("pig.inpSignatures", ObjectSerializer.serialize(tezOp.getLoaderInfo().getInpSignatureLists()));
payloadConf.set("pig.inpLimits", ObjectSerializer.serialize(tezOp.getLoaderInfo().getInpLimits()));
// Process stores
LinkedList<POStore> stores = processStores(tezOp, payloadConf, job);
payloadConf.set("pig.pigContext", ObjectSerializer.serialize(pc));
payloadConf.set("udf.import.list",
ObjectSerializer.serialize(PigContext.getPackageImportList()));
payloadConf.set("exectype", "TEZ");
payloadConf.setBoolean(MRConfiguration.MAPPER_NEW_API, true);
payloadConf.setClass(MRConfiguration.INPUTFORMAT_CLASS,
PigInputFormat.class, InputFormat.class);
// Set parent plan for all operators in the Tez plan.
new PhyPlanSetter(tezOp.plan).visit();
// Set the endOfAllInput flag on the physical plan if certain operators that
// use this property (such as STREAM) are present in the plan.
EndOfAllInputSetter.EndOfAllInputChecker checker =
new EndOfAllInputSetter.EndOfAllInputChecker(tezOp.plan);
checker.visit();
if (checker.isEndOfAllInputPresent()) {
payloadConf.set(JobControlCompiler.END_OF_INP_IN_MAP, "true");
}
// Configure the classes for incoming shuffles to this TezOp
// TODO: Refactor out resetting input keys, PIG-3957
List<PhysicalOperator> roots = tezOp.plan.getRoots();
if (roots.size() == 1 && roots.get(0) instanceof POPackage) {
POPackage pack = (POPackage) roots.get(0);
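            // Copy the successor list before the POPackage is removed below,
            // since removing it from the plan mutates the underlying list.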
List<PhysicalOperator> succsList = tezOp.plan.getSuccessors(pack);
if (succsList != null) {
succsList = new ArrayList<PhysicalOperator>(succsList);
}
byte keyType = pack.getPkgr().getKeyType();
tezOp.plan.remove(pack);
payloadConf.set("pig.reduce.package", ObjectSerializer.serialize(pack));
            setIntermediateInputKeyValue(keyType, payloadConf, tezOp);
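            // Replace the POPackage at the root with a POShuffleTezLoad, which
            // reads the already-shuffled records directly from the Tez inputs.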
POShuffleTezLoad newPack = new POShuffleTezLoad(pack);
if (tezOp.isSkewedJoin()) {
newPack.setSkewedJoins(true);
}
tezOp.plan.add(newPack);
// Set input keys for POShuffleTezLoad. This is used to identify
// the inputs that are attached to the POShuffleTezLoad in the
// backend.
Map<Integer, String> localRearrangeMap = new TreeMap<Integer, String>();
for (TezOperator pred : mPlan.getPredecessors(tezOp)) {
                if (tezOp.getSampleOperator() != null && tezOp.getSampleOperator() == pred) {
                    // Skip the sample vertex input
                    continue;
                }
                String inputKey = pred.getOperatorKey().toString();
                if (pred.isVertexGroup()) {
                    pred = mPlan.getOperator(pred.getVertexGroupMembers().get(0));
                }
                LinkedList<POLocalRearrangeTez> lrs =
                        PlanHelper.getPhysicalOperators(pred.plan, POLocalRearrangeTez.class);
                for (POLocalRearrangeTez lr : lrs) {
                    if (lr.isConnectedToPackage()
                            && lr.getOutputKey().equals(tezOp.getOperatorKey().toString())) {
                        localRearrangeMap.put((int) lr.getIndex(), inputKey);
                    }
                }
}
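            // The TreeMap is keyed by local rearrange index, so input keys are
            // added to the POShuffleTezLoad in index order.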
            for (String inputKey : localRearrangeMap.values()) {
                newPack.addInputKey(inputKey);
            }
            if (succsList != null) {
                for (PhysicalOperator succ : succsList) {
                    tezOp.plan.connect(newPack, succ);
                }
            }
            setIntermediateOutputKeyValue(keyType, payloadConf, tezOp);
} else if (roots.size() == 1 && roots.get(0) instanceof POIdentityInOutTez) {
POIdentityInOutTez identityInOut = (POIdentityInOutTez) roots.get(0);
// TODO Need to fix multiple input key mapping
TezOperator identityInOutPred = null;
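            // Pick the single predecessor that is not the sample aggregation vertex.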
for (TezOperator pred : mPlan.getPredecessors(tezOp)) {
if (!pred.isSampleAggregation()) {
identityInOutPred = pred;
break;
}
}
identityInOut.setInputKey(identityInOutPred.getOperatorKey().toString());
} else if (roots.size() == 1 && roots.get(0) instanceof POValueInputTez) {
POValueInputTez valueInput = (POValueInputTez) roots.get(0);
LinkedList<String> scalarInputs = new LinkedList<String>();
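            // Collect the Tez inputs consumed by scalar UDFs (ReadScalarsTez)
            // so they are not picked as the value input below.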
            for (POUserFunc userFunc : PlanHelper.getPhysicalOperators(tezOp.plan, POUserFunc.class)) {
if (userFunc.getFunc() instanceof ReadScalarsTez) {
scalarInputs.add(((ReadScalarsTez)userFunc.getFunc()).getTezInputs()[0]);
}
}
            // Pick the first predecessor that is not a scalar input
for (TezOperator pred : mPlan.getPredecessors(tezOp)) {
if (!scalarInputs.contains(pred.getOperatorKey().toString())) {
valueInput.setInputKey(pred.getOperatorKey().toString());
break;
}
}
}
setOutputFormat(job);
        // Set parent plan in all operators. Currently the parent plan is really
        // used only when POStream or POSplit is present in the plan.
new PhyPlanSetter(tezOp.plan).visit();
// Serialize the execution plan
payloadConf.set(PigProcessor.PLAN,
ObjectSerializer.serialize(tezOp.plan));
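        // Serialize the UDFContext so that UDF-specific configuration is
        // available to this vertex's tasks at runtime.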
UDFContext.getUDFContext().serialize(payloadConf);
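        // Fold the relevant MR settings from the global conf into the payload
        // conf and translate them to their Tez equivalents.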
MRToTezHelper.processMRSettings(payloadConf, globalConf);
if (!pc.inIllustrator) {
for (POStore store : stores) {
                // Unset inputs for POStore; otherwise the map/reduce plan will be unnecessarily deserialized
store.setInputs(null);
store.setParentPlan(null);
}
// We put them in the reduce because PigOutputCommitter checks the
// ID of the task to see if it's a map, and if not, calls the reduce
// committers.
payloadConf.set(JobControlCompiler.PIG_MAP_STORES,
ObjectSerializer.serialize(new ArrayList<POStore>()));
payloadConf.set(JobControlCompiler.PIG_REDUCE_STORES,
ObjectSerializer.serialize(stores));
}
if (tezOp.isNeedEstimateParallelism()) {
payloadConf.setBoolean(PigProcessor.ESTIMATE_PARALLELISM, true);
log.info("Estimate quantile for sample aggregation vertex " + tezOp.getOperatorKey().toString());
}
        // Set the various parallelism values in the job conf for later analysis (PIG-2779)
payloadConf.setInt(PigImplConstants.REDUCER_DEFAULT_PARALLELISM, pc.defaultParallel);
payloadConf.setInt(PigImplConstants.REDUCER_REQUESTED_PARALLELISM, tezOp.getRequestedParallelism());
payloadConf.setInt(PigImplConstants.REDUCER_ESTIMATED_PARALLELISM, tezOp.getEstimatedParallelism());
TezScriptState ss = TezScriptState.get();
ss.addVertexSettingsToConf(dag.getName(), tezOp, payloadConf);
// Take our assembled configuration and create a vertex
UserPayload userPayload = TezUtils.createUserPayloadFromConf(payloadConf);
procDesc.setUserPayload(userPayload);
Vertex vertex = Vertex.create(tezOp.getOperatorKey().toString(), procDesc, tezOp.getVertexParallelism(),
tezOp.isUseMRMapSettings() ? MRHelpers.getResourceForMRMapper(globalConf) : MRHelpers.getResourceForMRReducer(globalConf));
Map<String, String> taskEnv = new HashMap<String, String>();