/**
 * Create Hive splits based on CombineFileSplit.
 *
 * Paths are grouped into pools so that a combined split never mixes files
 * that need different input formats, deserializers, or operator pipelines,
 * and the method falls back to plain HiveInputFormat splits whenever
 * combining is known to be unsafe (non-native tables, non-splittable
 * compressed text, symlink inputs).
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
PerfLogger perfLogger = PerfLogger.getPerfLogger();
perfLogger.PerfLogBegin(LOG, PerfLogger.GET_SPLITS);
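// GET_SPLITS is bracketed by PerfLogBegin/PerfLogEnd; every early-return
// path below must end the measurement before handing back fallback splits.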
init(job);
Map<String, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
Map<String, Operator<? extends OperatorDesc>> aliasToWork =
mrwork.getAliasToWork();
CombineFileInputFormatShim combine = ShimLoader.getHadoopShims()
.getCombineFileInputFormat();
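// The shim may return null on Hadoop versions without CombineFileInputFormat
// support, in which case we fall back to the ordinary HiveInputFormat split
// computation below.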
InputSplit[] splits = null;
if (combine == null) {
splits = super.getSplits(job, numSplits);
perfLogger.PerfLogEnd(LOG, PerfLogger.GET_SPLITS);
return splits;
}
if (combine.getInputPathsShim(job).length == 0) {
throw new IOException("No input paths specified in job");
}
ArrayList<InputSplit> result = new ArrayList<InputSplit>();
// Combine splits only from the same table and the same partition; never
// combine splits across multiple tables or multiple partitions.
Path[] paths = combine.getInputPathsShim(job);
List<Path> inpDirs = new ArrayList<Path>();
List<Path> inpFiles = new ArrayList<Path>();
Map<CombinePathInputFormat, CombineFilter> poolMap =
new HashMap<CombinePathInputFormat, CombineFilter>();
Set<Path> poolSet = new HashSet<Path>();
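// poolMap keys each combine pool by (operator pipeline, input format,
// deserializer): paths may only be combined into one split when all three
// match. poolSet collects the parent directories used as pools in the
// tablesample (mapperCannotSpanPartns) case handled further down.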
for (Path path : paths) {
PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
TableDesc tableDesc = part.getTableDesc();
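// Non-native tables (e.g. storage-handler backed tables) provide their own
// splits and cannot be combined; hand off to HiveInputFormat directly.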
if ((tableDesc != null) && tableDesc.isNonNative()) {
perfLogger.PerfLogEnd(LOG, PerfLogger.GET_SPLITS);
return super.getSplits(job, numSplits);
}
// Use HiveInputFormat if any of the paths is not splittable
Class inputFormatClass = part.getInputFileFormatClass();
String inputFormatClassName = inputFormatClass.getName();
InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
String deserializerClassName = part.getDeserializerClass() == null ? null
: part.getDeserializerClass().getName();
// There is no easy way to tell whether MAPREDUCE-1597 is present in this
// Hadoop tree, so a configuration variable controls the behavior instead.
if (this.mrwork != null && !this.mrwork.getHadoopSupportsSplittable()) {
// The following code should be removed once
// https://issues.apache.org/jira/browse/MAPREDUCE-1597 is fixed.
// Hadoop's CombineFileInputFormat does not handle non-splittable files
// correctly, so don't use it for non-splittable files.
FileSystem inpFs = path.getFileSystem(job);
if (inputFormat instanceof TextInputFormat) {
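// Compressed text is not splittable: check the file (or every file under
// the directory, via the queue below) and fall back to HiveInputFormat
// splits as soon as any compression codec matches.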
Queue<Path> dirs = new LinkedList<Path>();
FileStatus fStats = inpFs.getFileStatus(path);
// If path is a directory
if (fStats.isDir()) {
dirs.offer(path);
} else if ((new CompressionCodecFactory(job)).getCodec(path) != null) {
splits = super.getSplits(job, numSplits);
perfLogger.PerfLogEnd(LOG, PerfLogger.GET_SPLITS);
return splits;
}
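// Breadth-first walk: directories are queued and their children examined
// in turn, so arbitrarily nested input layouts are covered.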
while (dirs.peek() != null) {
Path tstPath = dirs.remove();
FileStatus[] fStatus = inpFs.listStatus(tstPath);
for (int idx = 0; idx < fStatus.length; idx++) {
if (fStatus[idx].isDir()) {
dirs.offer(fStatus[idx].getPath());
} else if ((new CompressionCodecFactory(job)).getCodec(
fStatus[idx].getPath()) != null) {
splits = super.getSplits(job, numSplits);
perfLogger.PerfLogEnd(LOG, PerfLogger.GET_SPLITS);
return splits;
}
}
}
}
}
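// SymlinkTextInputFormat paths point at link files that list the real data
// locations, so combining the link paths themselves would be wrong; fall
// back to HiveInputFormat, which resolves them.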
if (inputFormat instanceof SymlinkTextInputFormat) {
splits = super.getSplits(job, numSplits);
perfLogger.PerfLogEnd(LOG, PerfLogger.GET_SPLITS);
return splits;
}
Path filterPath = path;
// Does a pool already exist for this path?
CombineFilter f = null;
List<Operator<? extends OperatorDesc>> opList = null;
if (!mrwork.isMapperCannotSpanPartns()) {
opList = HiveFileFormatUtils.doGetWorksFromPath(
pathToAliases, aliasToWork, filterPath);
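// Look up (or create) the pool for this (operators, input format,
// deserializer) combination: a new pool registers its filter with the
// underlying CombineFileInputFormat; an existing one just gains this path.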
CombinePathInputFormat combinePathInputFormat =
new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
f = poolMap.get(combinePathInputFormat);
if (f == null) {
f = new CombineFilter(filterPath);
LOG.info("CombineHiveInputSplit creating pool for " + path +
"; using filter path " + filterPath);
combine.createPool(job, f);
poolMap.put(combinePathInputFormat, f);
} else {
LOG.info("CombineHiveInputSplit: pool is already created for " + path +
"; using filter path " + filterPath);
f.addPath(filterPath);
}
} else {
// In the case of tablesample, the input paths point to files rather than
// directories. Use the parent directory as the filtering path so that all
// files in the same parent directory are grouped into one pool, while
// files from different parent directories are not. This guarantees that a
// split combines all files in the same partition but never crosses
// partitions when the user has requested that behavior.
if (!path.getFileSystem(job).getFileStatus(path).isDir()) { // path is not directory
filterPath = path.getParent();
inpFiles.add(path);
poolSet.add(filterPath);
} else {
inpDirs.add(path);
}
}
}
// Generate the combined splits. When mappers may span partitions, a single
// getSplits call over all registered pools suffices; otherwise process each
// directory (and each pool of sampled files) separately so that no split
// crosses a partition boundary.
List<InputSplitShim> iss = new ArrayList<InputSplitShim>();
if (!mrwork.isMapperCannotSpanPartns()) {
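// The numSplits hint is effectively ignored here: CombineFileInputFormat
// sizes splits from the configured min/max split sizes instead.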
iss = Arrays.asList(combine.getSplits(job, 1));
} else {
for (Path path : inpDirs) {
processPaths(job, combine, iss, path);
}
if (!inpFiles.isEmpty()) {
// Processing files
for (Path filterPath : poolSet) {
combine.createPool(job, new CombineFilter(filterPath));
}
processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
}
}
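// When the query carries split samples (nameToSplitSample), sampleSplits
// trims the combined splits to honor the requested sampling.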
if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
iss = sampleSplits(iss);
}
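// Wrap each shim split as a CombineHiveInputSplit, which also records the
// input format class name for the map-side record readers.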
for (InputSplitShim is : iss) {
CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is);
result.add(csplit);
}
LOG.info("number of splits " + result.size());
perfLogger.PerfLogEnd(LOG, PerfLogger.GET_SPLITS);
return result.toArray(new CombineHiveInputSplit[result.size()]);
}