// Take our assembled configuration and create a vertex
UserPayload userPayload = TezUtils.createUserPayloadFromConf(payloadConf);
procDesc.setUserPayload(userPayload);
Vertex vertex = Vertex.create(tezOp.getOperatorKey().toString(), procDesc, tezOp.getVertexParallelism(),
tezOp.isUseMRMapSettings() ? MRHelpers.getResourceForMRMapper(globalConf) : MRHelpers.getResourceForMRReducer(globalConf));
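// Propagate the MR map/reduce task environment settings into the Tez task environment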
Map<String, String> taskEnv = new HashMap<String, String>();
MRHelpers.updateEnvBasedOnMRTaskEnv(globalConf, taskEnv, tezOp.isUseMRMapSettings());
vertex.setTaskEnvironment(taskEnv);
// All these classes are @InterfaceAudience.Private in Hadoop. Switch to Tez methods in TEZ-1012
// set the timestamps, public/private visibility of the archives and files
ClientDistributedCacheManager
.determineTimestampsAndCacheVisibilities(globalConf);
// get DelegationToken for each cached file
ClientDistributedCacheManager.getDelegationTokens(globalConf,
job.getCredentials());
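// Populate localResources from the distributed cache entries and attach them to the vertex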
MRApps.setupDistributedCache(globalConf, localResources);
vertex.addTaskLocalFiles(localResources);
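// Java opts mirror the MR map or reduce settings, matching the container resource requested above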
vertex.setTaskLaunchCmdOpts(tezOp.isUseMRMapSettings() ? MRHelpers.getJavaOptsForMRMapper(globalConf)
: MRHelpers.getJavaOptsForMRReducer(globalConf));
log.info("For vertex - " + tezOp.getOperatorKey().toString()
+ ": parallelism=" + tezOp.getVertexParallelism()
+ ", memory=" + vertex.getTaskResource().getMemory()
+ ", java opts=" + vertex.getTaskLaunchCmdOpts()
);
// Right now there can only be one load per vertex. This will need to be
// more generic when there can be more.
for (POLoad ld : tezOp.getLoaderInfo().getLoads()) {
// TODO: These should get the globalConf, or a merged version that
// keeps settings like pig.maxCombinedSplitSize
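// Splits were already computed on the client; MRInputSplitDistributor hands the serialized
// splits out to the tasks, and the location hints come from the same InputSplitInfo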
vertex.setLocationHint(VertexLocationHint.create(tezOp.getLoaderInfo().getInputSplitInfo().getTaskLocationHints()));
vertex.addDataSource(ld.getOperatorKey().toString(),
DataSourceDescriptor.create(InputDescriptor.create(MRInput.class.getName())
.setUserPayload(UserPayload.create(MRRuntimeProtos.MRInputUserPayloadProto.newBuilder()
.setConfigurationBytes(TezUtils.createByteStringFromConf(payloadConf))
.setSplits(tezOp.getLoaderInfo().getInputSplitInfo().getSplitsProto()).build().toByteString().asReadOnlyByteBuffer())),
InputInitializerDescriptor.create(MRInputSplitDistributor.class.getName()), dag.getCredentials()));
}
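// Add an MROutput data sink for each store. A store that belongs to a vertex group is
// attached to the group's shared OutputDescriptor instead of this vertex.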
for (POStore store : stores) {
ArrayList<POStore> emptyList = new ArrayList<POStore>();
ArrayList<POStore> singleStore = new ArrayList<POStore>();
singleStore.add(store);
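// The output payload carries only this store: the map-side store list is empty and the
// reduce-side list holds the single store.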
Configuration outputPayLoad = new Configuration(payloadConf);
outputPayLoad.set(JobControlCompiler.PIG_MAP_STORES,
ObjectSerializer.serialize(emptyList));
outputPayLoad.set(JobControlCompiler.PIG_REDUCE_STORES,
ObjectSerializer.serialize(singleStore));
OutputDescriptor storeOutDescriptor = OutputDescriptor.create(
MROutput.class.getName()).setUserPayload(TezUtils
.createUserPayloadFromConf(outputPayLoad));
if (tezOp.getVertexGroupStores() != null) {
OperatorKey vertexGroupKey = tezOp.getVertexGroupStores().get(store.getOperatorKey());
if (vertexGroupKey != null) {
getPlan().getOperator(vertexGroupKey).getVertexGroupInfo()
.setStoreOutputDescriptor(storeOutDescriptor);
continue;
}
}
vertex.addDataSink(store.getOperatorKey().toString(),
DataSinkDescriptor.create(storeOutDescriptor,
OutputCommitterDescriptor.create(MROutputCommitter.class.getName()),
dag.getCredentials()));
}
// LoadFunc and StoreFunc add delegation tokens to the Job credentials in
// setLocation and setStoreLocation respectively (for example, HBaseStorage).
// InputFormat adds delegation tokens in getSplits and OutputFormat in
// checkOutputSpecs (for example, FileInputFormat and FileOutputFormat).
if (stores.size() > 0) {
new PigOutputFormat().checkOutputSpecs(job);
}
String vmPluginName = null;
Configuration vmPluginConf = null;
// Set the right VertexManagerPlugin for vertices whose parallelism is only an estimate
if (tezOp.getEstimatedParallelism() != -1) {
if (tezOp.isGlobalSort()||tezOp.isSkewedJoin()) {
// Set VertexManagerPlugin to PartitionerDefinedVertexManager, which is able
// to decrease/increase parallelism of sorting vertex dynamically
// based on the numQuantiles calculated by sample aggregation vertex
vmPluginName = PartitionerDefinedVertexManager.class.getName();
log.info("Set VertexManagerPlugin to PartitionerDefinedParallelismVertexManager for vertex " + tezOp.getOperatorKey().toString());
} else {
boolean containScatterGather = false;
boolean containCustomPartitioner = false;
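// Scan the incoming edges: auto parallelism is only applied when there is a
// scatter-gather edge and no custom partitioner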
for (TezEdgeDescriptor edge : tezOp.inEdges.values()) {
if (edge.dataMovementType == DataMovementType.SCATTER_GATHER) {
containScatterGather = true;
}
if (edge.partitionerClass!=null) {
containCustomPartitioner = true;
}
}
if (containScatterGather && !containCustomPartitioner) {
// Use auto-parallelism feature of ShuffleVertexManager to dynamically
// reduce the parallelism of the vertex
vmPluginName = ShuffleVertexManager.class.getName();
vmPluginConf = (vmPluginConf == null) ? ConfigurationUtil.toConfiguration(pc.getProperties(), false) : vmPluginConf;
vmPluginConf.setBoolean(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_ENABLE_AUTO_PARALLEL, true);
if (stores.size() <= 0) {
// Intermediate reduce. Set the bytes per reducer to be block size.
vmPluginConf.setLong(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_DESIRED_TASK_INPUT_SIZE,
intermediateTaskInputSize);
} else if (vmPluginConf.getLong(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER) !=
InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER) {
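// Bytes-per-reducer was explicitly overridden, so use it as the desired task input size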
vmPluginConf.setLong(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_DESIRED_TASK_INPUT_SIZE,
vmPluginConf.getLong(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER));
}
log.info("Set auto parallelism for vertex " + tezOp.getOperatorKey().toString());
}
}
}
/* TODO: Uncomment after TEZ-1590 is fixed
if (tezOp.isLimit() && (vmPluginName == null || vmPluginName.equals(ShuffleVertexManager.class.getName()))) {
if (tezOp.inEdges.values().iterator().next().inputClassName.equals(UnorderedKVInput.class.getName())) {
// Setting SRC_FRACTION to 0.00001 so that even if there are 100K source tasks,
// the limit vertex starts when 1 source task finishes.
// If the limit is part of a group by or join, their parallelism is 1,
// so the configuration is left at the defaults.
vmPluginName = ShuffleVertexManager.class.getName();
vmPluginConf = (vmPluginConf == null) ? ConfigurationUtil.toConfiguration(pc.getProperties(), false) : vmPluginConf;
vmPluginConf.set(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_MIN_SRC_FRACTION, "0.00001");
vmPluginConf.set(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_MAX_SRC_FRACTION, "0.00001");
log.info("Set " + ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_MIN_SRC_FRACTION + " to 0.00001 for limit vertex " + tezOp.getOperatorKey().toString());
}
}
*/
// else if (tezOp.isLimitAfterSort())
// TODO: PIG-4049 A standalone Limit needs a new VertexManager or a new input
// instead of ShuffledMergedInput. For the limit that is part of the sort itself
// (order by parallel 1), PartitionerDefinedVertexManager needs to be enhanced.
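// Apply the chosen VertexManagerPlugin, passing its configuration along as a user payload if one was built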
if (vmPluginName != null) {
VertexManagerPluginDescriptor vmPluginDescriptor = VertexManagerPluginDescriptor.create(vmPluginName);
if (vmPluginConf != null) {
vmPluginDescriptor.setUserPayload(TezUtils.createUserPayloadFromConf(vmPluginConf));
}
vertex.setVertexManagerPlugin(vmPluginDescriptor);
}
// Reset the UDFContext jobconf. It is not supposed to be set in the frontend.
UDFContext.getUDFContext().addJobConf(null);
return vertex;
}