// Planning state for the map vertex. Start out assuming tez can combine splits for us.
boolean useTezGroupedSplits = true;
// stays at -1 unless the splits are generated on the client side
int numTasks = -1;
Class amSplitGeneratorClass = null;
InputSplitInfo inputSplitInfo = null;
// input format currently configured for the job, with InputFormat as the default
Class inputFormatClass = conf.getClass("mapred.input.format.class", InputFormat.class);

// the vertex has custom input as soon as one edge from a parent is a CUSTOM_EDGE
boolean vertexHasCustomInput = false;
if (tezWork != null) {
  for (BaseWork parentWork : tezWork.getParents(mapWork)) {
    vertexHasCustomInput |= (tezWork.getEdgeType(parentWork, mapWork) == EdgeType.CUSTOM_EDGE);
  }
}
// Splits can currently only be grouped when every partition shares the same input format and
// the same deserializer, similar to the checks in CombineHiveInputFormat. The opList check is
// not needed because different opLists are not processed at this time. Long term fix would be
// a custom input format that groups only the splits sharing an input format.
Class<?> previousInputFormatClass = null;
Class<?> previousDeserializerClass = null;
for (PartitionDesc partition : mapWork.getPathToPartitionInfo().values()) {
  Class<?> serdeClass = partition.getDeserializer(conf).getClass();
  Class<?> formatClass = partition.getInputFileFormatClass();

  // the first classes seen become the reference every later partition is compared against
  if (previousInputFormatClass == null) {
    previousInputFormatClass = formatClass;
  }
  if (previousDeserializerClass == null) {
    previousDeserializerClass = serdeClass;
  }

  if (LOG.isDebugEnabled()) {
    LOG.debug("Current input format class = "+formatClass+", previous input format class = "
        + previousInputFormatClass + ", verifying " + " current deserializer class = "
        + serdeClass + " previous deserializer class = " + previousDeserializerClass);
  }

  boolean sameInputFormat = (formatClass == previousInputFormatClass);
  boolean sameDeserializer = (serdeClass == previousDeserializerClass);
  if (!(sameInputFormat && sameDeserializer)) {
    // a single mismatch is enough to rule out grouping, no need to look further
    useTezGroupedSplits = false;
    break;
  }
}
// Decide which input format the job uses and whether split grouping is requested from the
// input initializer (useTezGroupedSplits) or left to the custom vertex (GROUP_SPLITS).
if (vertexHasCustomInput) {
  // If different partitions use different input formats (or deserializers) we cannot group
  // in the custom vertex for now, so hand it the outcome of the uniformity check. Long term,
  // this can be improved to group the buckets that share the same input format.
  conf.setBoolean(CustomPartitionVertex.GROUP_SPLITS, useTezGroupedSplits);
  // Grouping for this vertex happens in the execution phase. useTezGroupedSplits is what later
  // selects an MRInput payload with grouping enabled, so it has to be switched off here;
  // otherwise grouping would also be requested up front, which is the premature grouping
  // this branch must avoid.
  useTezGroupedSplits = false;
  // For the same reason the input format stays HiveInputFormat: setting the class to
  // TezGroupedSplitsInputFormat here would cause premature grouping, which would be incorrect.
  inputFormatClass = HiveInputFormat.class;
  conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
  // mapreduce.tez.input.initializer.serialize.event.payload should be set to false when using
  // this plug-in to avoid getting a serialized event at run-time.
  conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
} else if (useTezGroupedSplits) {
  // we'll set up tez to combine splits for us iff the input format
  // is HiveInputFormat
  if (inputFormatClass == HiveInputFormat.class) {
    conf.setClass("mapred.input.format.class", TezGroupedSplitsInputFormat.class, InputFormat.class);
  } else {
    conf.setClass("mapred.input.format.class", CombineHiveInputFormat.class, InputFormat.class);
    useTezGroupedSplits = false;
  }
} else {
  // tez grouping was ruled out by the partition check; use CombineHiveInputFormat instead
  conf.setClass("mapred.input.format.class", CombineHiveInputFormat.class, InputFormat.class);
}
if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)) {
// if we're generating the splits in the AM, we just need to set
// the correct plugin.
amSplitGeneratorClass = MRInputAMSplitGenerator.class;
} else {
// client side split generation means we have to compute them now
inputSplitInfo = MRHelpers.generateInputSplits(conf,
new Path(tezDir, "split_"+mapWork.getName().replaceAll(" ", "_")));
numTasks = inputSplitInfo.getNumTasks();
}
// The configuration is serialized once; the same bytes feed the processor descriptor and
// the MRInput payload below.
byte[] serializedConf = MRHelpers.createUserPayloadFromConf(conf);

map = new Vertex(
    mapWork.getName(),
    new ProcessorDescriptor(MapTezProcessor.class.getName()).setUserPayload(serializedConf),
    numTasks,
    getContainerResource(conf));

// task environment and JVM options for the vertex
Map<String, String> environment = new HashMap<String, String>();
MRHelpers.updateEnvironmentForMRTasks(conf, environment, true);
map.setTaskEnvironment(environment);
map.setJavaOpts(getContainerJavaOpts(conf));

// the map work is expected to carry exactly one alias; it names the vertex input
assert mapWork.getAliasToWork().keySet().size() == 1;
String alias = mapWork.getAliasToWork().keySet().iterator().next();

// the payload enables grouping only when tez is the one combining the splits
byte[] mrInput = useTezGroupedSplits
    ? MRHelpers.createMRInputPayloadWithGrouping(serializedConf, HiveInputFormat.class.getName())
    : MRHelpers.createMRInputPayload(serializedConf, null);

map.addInput(
    alias,
    new InputDescriptor(MRInputLegacy.class.getName()).setUserPayload(mrInput),
    amSplitGeneratorClass);
// Local resources for the vertex tasks, keyed by base name: the application jar first,
// then every additional resource.
Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
localResources.put(getBaseName(appJarLr), appJarLr);
for (LocalResource extraResource : additionalLr) {
  localResources.put(getBaseName(extraResource), extraResource);
}

// split info is only set when the splits were generated on the client side
boolean splitsGeneratedOnClient = (inputSplitInfo != null);
if (splitsGeneratedOnClient) {
  map.setTaskLocationsHint(inputSplitInfo.getTaskLocationHints());
  MRHelpers.updateLocalResourcesForInputSplits(
      FileSystem.get(conf), inputSplitInfo, localResources);
}
map.setTaskLocalResources(localResources);