perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
init(job);
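// Resolve the plan's path -> aliases and alias -> operator mappings; they are used
// below to decide which paths may be grouped into the same combine pool.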
Map<String, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
Map<String, Operator<? extends OperatorDesc>> aliasToWork =
mrwork.getAliasToWork();
CombineFileInputFormatShim combine = ShimLoader.getHadoopShims()
.getCombineFileInputFormat();
// On Tez, avoid duplicating the path info since it will go over RPC.
if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
try {
List<Path> dirs = Utilities.getInputPathsTez(job, mrwork);
Utilities.setInputPaths(job, dirs);
} catch (Exception e) {
throw new IOException("Could not create input paths", e);
}
}
InputSplit[] splits = null;
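// If the Hadoop shim does not provide a CombineFileInputFormat, fall back to the
// plain HiveInputFormat split computation.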
if (combine == null) {
splits = super.getSplits(job, numSplits);
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
return splits;
}
if (combine.getInputPathsShim(job).length == 0) {
throw new IOException("No input paths specified in job");
}
ArrayList<InputSplit> result = new ArrayList<InputSplit>();
// Combine splits only from the same table and the same partition. Do not combine splits
// from multiple tables or multiple partitions.
Path[] paths = combine.getInputPathsShim(job);
List<Path> inpDirs = new ArrayList<Path>();
List<Path> inpFiles = new ArrayList<Path>();
Map<CombinePathInputFormat, CombineFilter> poolMap =
new HashMap<CombinePathInputFormat, CombineFilter>();
Set<Path> poolSet = new HashSet<Path>();
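// Examine each input path: bail out to plain HiveInputFormat splits for non-native tables,
// non-splittable compressed text, and symlink input formats; otherwise group the path into
// a combine pool.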
for (Path path : paths) {
PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
TableDesc tableDesc = part.getTableDesc();
if ((tableDesc != null) && tableDesc.isNonNative()) {
return super.getSplits(job, numSplits);
}
// Use HiveInputFormat if any of the paths is not splittable
Class inputFormatClass = part.getInputFileFormatClass();
String inputFormatClassName = inputFormatClass.getName();
InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
String deserializerClassName = null;
try {
deserializerClassName = part.getDeserializer(job).getClass().getName();
} catch (Exception e) {
// ignore; the deserializer class name is only used as part of the pool key below
}
FileSystem inpFs = path.getFileSystem(job);
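// ACID tables cannot be read through CombineHiveInputFormat; fail fast with a hint to
// switch to HiveInputFormat.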
if (inputFormatClass.isAssignableFrom(OrcInputFormat.class)) {
if (inpFs.exists(new Path(path, OrcRecordUpdater.ACID_FORMAT))) {
throw new IOException("CombineHiveInputFormat is incompatible " +
" with ACID tables. Please set hive.input.format=" +
"org.apache.hadoop.hive.ql.io.HiveInputFormat");
}
}
// Since there is no easy way of knowing whether MAPREDUCE-1597 is present in the tree or not,
// we use a configuration variable instead.
if (this.mrwork != null && !this.mrwork.getHadoopSupportsSplittable()) {
// The following code should be removed once
// https://issues.apache.org/jira/browse/MAPREDUCE-1597 is fixed.
// Hadoop does not handle non-splittable files correctly for CombineFileInputFormat,
// so don't use CombineFileInputFormat for non-splittable files;
// i.e., don't combine if the input format is a TextInputFormat and compression is turned on.
if (inputFormat instanceof TextInputFormat) {
Queue<Path> dirs = new LinkedList<Path>();
FileStatus fStats = inpFs.getFileStatus(path);
// If path is a directory
if (fStats.isDir()) {
dirs.offer(path);
} else if ((new CompressionCodecFactory(job)).getCodec(path) != null) {
// if a compression codec is set, use HiveInputFormat.getSplits (don't combine)
splits = super.getSplits(job, numSplits);
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
return splits;
}
while (dirs.peek() != null) {
Path tstPath = dirs.remove();
FileStatus[] fStatus = inpFs.listStatus(tstPath);
for (int idx = 0; idx < fStatus.length; idx++) {
if (fStatus[idx].isDir()) {
dirs.offer(fStatus[idx].getPath());
} else if ((new CompressionCodecFactory(job)).getCodec(
fStatus[idx].getPath()) != null) {
// if a compression codec is set, use HiveInputFormat.getSplits (don't combine)
splits = super.getSplits(job, numSplits);
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
return splits;
}
}
}
}
}
// don't combine if the input format is a SymlinkTextInputFormat
if (inputFormat instanceof SymlinkTextInputFormat) {
splits = super.getSplits(job, numSplits);
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
return splits;
}
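// filterPath is the path registered with the combine pool; when the mapper cannot span
// partitions and the path is a file (e.g. tablesample), its parent directory is used instead.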
Path filterPath = path;
// Does a pool already exist for this path?
CombineFilter f = null;
List<Operator<? extends OperatorDesc>> opList = null;
if (!mrwork.isMapperCannotSpanPartns()) {
// If the mapper can span partitions, make sure a split does not contain multiple
// (opList, inputFormatClassName, deserializerClassName) combinations.
// This is done using the map from CombinePathInputFormat to PathFilter.
opList = HiveFileFormatUtils.doGetWorksFromPath(
pathToAliases, aliasToWork, filterPath);
CombinePathInputFormat combinePathInputFormat =
new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
f = poolMap.get(combinePathInputFormat);
if (f == null) {
f = new CombineFilter(filterPath);
LOG.info("CombineHiveInputSplit creating pool for " + path +
"; using filter path " + filterPath);
combine.createPool(job, f);
poolMap.put(combinePathInputFormat, f);
} else {
LOG.info("CombineHiveInputSplit: pool is already created for " + path +
"; using filter path " + filterPath);
f.addPath(filterPath);
}
} else {
// In the case of tablesample, the input paths point to files rather than directories.
// We need to use the parent directory as the filtering path so that all files in the same
// parent directory are grouped into one pool, but not files from different parent
// directories. This guarantees that a split will combine all files in the same partition
// but won't cross multiple partitions when the user has asked for that.
if (!path.getFileSystem(job).getFileStatus(path).isDir()) { // path is not directory
filterPath = path.getParent();
inpFiles.add(path);
poolSet.add(filterPath);
} else {
inpDirs.add(path);
}
}
}
// Process the gathered directories and files into combined splits
List<InputSplitShim> iss = new ArrayList<InputSplitShim>();
if (!mrwork.isMapperCannotSpanPartns()) {
// mapper can span partitions:
// combine into as few as one split, subject to the PathFilters set
// using combine.createPool.
iss = Arrays.asList(combine.getSplits(job, 1));
} else {
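// mapper cannot span partitions: process each directory separately, then create one pool
// per distinct parent directory of the sampled files before processing those files.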
for (Path path : inpDirs) {
processPaths(job, combine, iss, path);
}
if (inpFiles.size() > 0) {
// Processing files
for (Path filterPath : poolSet) {
combine.createPool(job, new CombineFilter(filterPath));
}
processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
}
}