// Working state for wiring up the map vertex's input. The input format is read
// from the job conf (falling back to InputFormat); everything else starts out as
// "no Tez split grouping, no AM split generator, no client-side splits, -1 tasks".
Class inputFormatClass = conf.getClass("mapred.input.format.class", InputFormat.class);
Class amSplitGeneratorClass = null;
InputSplitInfo inputSplitInfo = null;
int numTasks = -1;
boolean useTezGroupedSplits = false;

// The vertex has custom input when at least one parent of this map work is
// connected to it through a CUSTOM_EDGE. Without a TezWork there is nothing to scan.
boolean vertexHasCustomInput = false;
if (tezWork != null) {
  for (BaseWork parent : tezWork.getParents(mapWork)) {
    vertexHasCustomInput |= (tezWork.getEdgeType(parent, mapWork) == EdgeType.CUSTOM_EDGE);
  }
}
if (vertexHasCustomInput) {
  // Grouping happens in the execution phase for custom input. Switching the class
  // to TezGroupedSplitsInputFormat here would group prematurely, which would be
  // incorrect, so pin the input format to HiveInputFormat instead.
  useTezGroupedSplits = false;
  inputFormatClass = HiveInputFormat.class;
  // mapreduce.tez.input.initializer.serialize.event.payload should be false when
  // using this plug-in, to avoid getting a serialized event at run-time.
  conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
  conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
} else if (inputFormatClass == HiveInputFormat.class) {
  // Have Tez combine splits for us, but if and only if the configured
  // input format is HiveInputFormat.
  useTezGroupedSplits = true;
  conf.setClass("mapred.input.format.class",
      TezGroupedSplitsInputFormat.class, InputFormat.class);
}
if (!HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)) {
  // Client-side split generation: the splits have to be computed right now,
  // under a per-work directory in tezDir (spaces in the work name become '_').
  // The generated split info also supplies the task count.
  String splitDirName = "split_" + mapWork.getName().replaceAll(" ", "_");
  inputSplitInfo = MRHelpers.generateInputSplits(conf, new Path(tezDir, splitDirName));
  numTasks = inputSplitInfo.getNumTasks();
} else {
  // Splits are generated in the AM, so all that is needed here is
  // to select the correct plugin.
  amSplitGeneratorClass = MRInputAMSplitGenerator.class;
}
// Turn the job conf into the user payload handed to the map processor. This runs
// after the conf.setClass/conf.setBoolean adjustments above; presumably the payload
// is a snapshot of conf at this point, so it must not be moved earlier — TODO confirm.
byte[] serializedConf = MRHelpers.createUserPayloadFromConf(conf);
// Create the vertex, named after the map work and running MapTezProcessor.
// numTasks is still -1 here unless the splits were generated on the client side.
map = new Vertex(mapWork.getName(),
new ProcessorDescriptor(MapTezProcessor.class.getName()).
setUserPayload(serializedConf), numTasks, getContainerResource(conf));
// Task environment is filled in by MRHelpers from conf.
// NOTE(review): the trailing 'true' presumably marks this as a map task — confirm
// against MRHelpers.updateEnvironmentForMRTasks.
Map<String, String> environment = new HashMap<String, String>();
MRHelpers.updateEnvironmentForMRTasks(conf, environment, true);
map.setTaskEnvironment(environment);
// JVM options for the task containers, derived from conf.
map.setJavaOpts(getContainerJavaOpts(conf));
// The map work is expected to carry exactly one alias; that alias names the vertex input.
assert mapWork.getAliasToWork().keySet().size() == 1;
String alias = mapWork.getAliasToWork().keySet().iterator().next();

// Build the MRInput payload from the serialized conf. With Tez split grouping the
// payload names HiveInputFormat as the format to group over; otherwise the plain
// payload is built with null as its second argument.
byte[] mrInput = useTezGroupedSplits
    ? MRHelpers.createMRInputPayloadWithGrouping(serializedConf,
        HiveInputFormat.class.getName())
    : MRHelpers.createMRInputPayload(serializedConf, null);

// Register the input under the alias. amSplitGeneratorClass is null unless
// split generation was delegated to the AM.
map.addInput(
    alias,
    new InputDescriptor(MRInputLegacy.class.getName()).setUserPayload(mrInput),
    amSplitGeneratorClass);
// Gather the local resources for the map tasks, keyed by getBaseName(...):
// the application jar first, then every additional resource.
Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
localResources.put(getBaseName(appJarLr), appJarLr);
for (LocalResource extraResource : additionalLr) {
  localResources.put(getBaseName(extraResource), extraResource);
}

// inputSplitInfo is only set for client-side split generation. In that case pass
// the task location hints to the vertex and let MRHelpers update the local
// resources for the generated input splits.
if (inputSplitInfo != null) {
  map.setTaskLocationsHint(inputSplitInfo.getTaskLocationHints());
  FileSystem splitFs = FileSystem.get(conf);
  MRHelpers.updateLocalResourcesForInputSplits(splitFs, inputSplitInfo, localResources);
}

map.setTaskLocalResources(localResources);