Vertex map = null;
// use tez to combine splits
boolean groupSplitsInInputInitializer;
DataSourceDescriptor dataSource;
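// -1 lets the vertex parallelism be decided by AM-side split generation; it is set explicitly
// below when splits are generated on the client.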
int numTasks = -1;
@SuppressWarnings("rawtypes")
Class inputFormatClass = conf.getClass("mapred.input.format.class",
    InputFormat.class);
boolean vertexHasCustomInput = VertexType.isCustomInputType(vertexType);
LOG.info("Vertex has custom input? " + vertexHasCustomInput);
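// A custom-input vertex (e.g. bucket map join / SMB join) receives multiple splits per task and
// groups them at runtime in its custom vertex manager, so grouping is skipped in the initializer.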
if (vertexHasCustomInput) {
  groupSplitsInInputInitializer = false;
  // Grouping happens at execution time; the input payload should not enable grouping here,
  // it will be enabled by the custom vertex instead.
  inputFormatClass = HiveInputFormat.class;
  conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
  // mapreduce.tez.input.initializer.serialize.event.payload must be set to false when using
  // this plug-in, to avoid getting a serialized event at run-time.
  conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
} else {
  // we'll let tez combine splits for us iff the input format is HiveInputFormat
  groupSplitsInInputInitializer = (inputFormatClass == HiveInputFormat.class);
}
if (mapWork instanceof MergeFileWork) {
  Path outputPath = ((MergeFileWork) mapWork).getOutputDir();
  // prepare the tmp output directory; it must exist before jobClose
  // (which renames it after job completion)
  Path tempOutPath = Utilities.toTempPath(outputPath);
  try {
    if (!fs.exists(tempOutPath)) {
      fs.mkdirs(tempOutPath);
    }
  } catch (IOException e) {
    // keep the underlying cause so the failure is diagnosable
    throw new RuntimeException("Can't make path " + outputPath + ": " + e.getMessage(), e);
  }
}
// remember mapping of plan to input, so the MapWork for this input can be
// looked up by name at runtime
conf.set(Utilities.INPUT_NAME, mapWork.getName());
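// HIVE_AM_SPLIT_GENERATION (hive.compute.splits.in.am): generate splits in the Tez AM via an
// input initializer; otherwise fall back to client-side split generation below.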
if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)
    && !mapWork.isUseOneNullRowInputFormat()) {
  // set up the operator plan. (before setting up splits on the AM)
  Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
  // if we're generating the splits in the AM, we just need to set
  // the correct plugin.
  if (groupSplitsInInputInitializer) {
    // Not setting a payload, since the MRInput payload is the same and can be accessed.
    InputInitializerDescriptor descriptor =
        InputInitializerDescriptor.create(HiveSplitGenerator.class.getName());
    dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(true)
        .setCustomInitializerDescriptor(descriptor).build();
  } else {
    // Not HiveInputFormat, or a custom VertexManager will take care of grouping splits
    if (vertexHasCustomInput) {
      dataSource =
          MultiMRInput.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
    } else {
      dataSource =
          MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
    }
  }
} else {
  // Set up client-side split generation.
  dataSource = MRInputHelpers.configureMRInputWithLegacySplitGeneration(conf,
      new Path(tezDir, "split_" + mapWork.getName().replaceAll(" ", "_")), true);
  numTasks = dataSource.getNumberOfShards();
  // set up the operator plan. (after generating splits - that changes configs)
  Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
}
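// At this point dataSource describes how the vertex reads its input; numTasks stays -1 when the
// AM generates the splits and equals the number of generated splits otherwise.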