Package org.apache.hadoop.hive.shims.HadoopShims

Examples of org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim

  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    CombineFileInputFormatShim combine = ShimLoader.getHadoopShims()

    if (combine == null) {
      return super.getSplits(job, numSplits);

    if (combine.getInputPathsShim(job).length == 0) {
      throw new IOException("No input paths specified in job");
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    // combine splits only from same tables and same partitions. Do not combine splits from multiple
    // tables or multiple partitions.
    Path[] paths = combine.getInputPathsShim(job);
    Set<Path> poolSet = new HashSet<Path>();
    for (Path path : paths) {

      PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
          pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
      TableDesc tableDesc = part.getTableDesc();
      if ((tableDesc != null) && tableDesc.isNonNative()) {
        return super.getSplits(job, numSplits);

      // Use HiveInputFormat if any of the paths is not splittable
      Class inputFormatClass = part.getInputFileFormatClass();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);

      // Since there is no easy way of knowing whether MAPREDUCE-1597 is present in the tree or not,
      // we use a configuration variable for the same
      if (this.mrwork != null && !this.mrwork.getHadoopSupportsSplittable()) {
        // The following code should be removed, once MAPREDUCE-1597
        // is fixed.
        // Hadoop does not handle non-splittable files correctly for CombineFileInputFormat,
        // so don't use CombineFileInputFormat for non-splittable files
        FileSystem inpFs = path.getFileSystem(job);

        if (inputFormat instanceof TextInputFormat) {
          Queue<Path> dirs = new LinkedList<Path>();
          FileStatus fStats = inpFs.getFileStatus(path);

          // If path is a directory
          if (fStats.isDir()) {
          else if ((new CompressionCodecFactory(job)).getCodec(path) != null) {
            return super.getSplits(job, numSplits);

          while (dirs.peek() != null) {
            Path tstPath = dirs.remove();
            FileStatus[] fStatus = inpFs.listStatus(tstPath);
            for (int idx = 0; idx < fStatus.length; idx++) {
              if (fStatus[idx].isDir()) {
              else if ((new CompressionCodecFactory(job)).getCodec(fStatus[idx].getPath()) != null) {
                return super.getSplits(job, numSplits);

      if (inputFormat instanceof SymlinkTextInputFormat) {
        return super.getSplits(job, numSplits);

      // In the case of tablesample, the input paths are pointing to files rather than directories.
      // We need to get the parent directory as the filtering path so that all files in the same
      // parent directory will be grouped into one pool but not files from different parent
      // directories. This guarantees that a split will combine all files in the same partition
      // but won't cross multiple partitions.
      Path filterPath = path;
      if (!path.getFileSystem(job).getFileStatus(path).isDir()) { // path is not directory
        filterPath = path.getParent();
      if (!poolSet.contains(filterPath)) {"CombineHiveInputSplit creating pool for " + path +
            "; using filter path " + filterPath);
        combine.createPool(job, new CombineFilter(filterPath));
      } else {"CombineHiveInputSplit: pool is already created for " + path +
            "; using filter path " + filterPath);
    InputSplitShim[] iss = combine.getSplits(job, 1);
    for (InputSplitShim is : iss) {
      CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is);

View Full Code Here

    perfLogger.PerfLogBegin(LOG, PerfLogger.GET_SPLITS);
    Map<String, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
    Map<String, Operator<? extends OperatorDesc>> aliasToWork =
    CombineFileInputFormatShim combine = ShimLoader.getHadoopShims()

    InputSplit[] splits = null;
    if (combine == null) {
      splits = super.getSplits(job, numSplits);
      perfLogger.PerfLogEnd(LOG, PerfLogger.GET_SPLITS);
      return splits;

    if (combine.getInputPathsShim(job).length == 0) {
      throw new IOException("No input paths specified in job");
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    // combine splits only from same tables and same partitions. Do not combine splits from multiple
    // tables or multiple partitions.
    Path[] paths = combine.getInputPathsShim(job);

    List<Path> inpDirs = new ArrayList<Path>();
    List<Path> inpFiles = new ArrayList<Path>();
    Map<CombinePathInputFormat, CombineFilter> poolMap =
      new HashMap<CombinePathInputFormat, CombineFilter>();
    Set<Path> poolSet = new HashSet<Path>();

    for (Path path : paths) {

      PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
          pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
      TableDesc tableDesc = part.getTableDesc();
      if ((tableDesc != null) && tableDesc.isNonNative()) {
        return super.getSplits(job, numSplits);

      // Use HiveInputFormat if any of the paths is not splittable
      Class inputFormatClass = part.getInputFileFormatClass();
      String inputFormatClassName = inputFormatClass.getName();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
      String deserializerClassName = part.getDeserializerClass() == null ? null
          : part.getDeserializerClass().getName();

      // Since there is no easy way of knowing whether MAPREDUCE-1597 is present in the tree or not,
      // we use a configuration variable for the same
      if (this.mrwork != null && !this.mrwork.getHadoopSupportsSplittable()) {
        // The following code should be removed, once MAPREDUCE-1597
        // is fixed.
        // Hadoop does not handle non-splittable files correctly for CombineFileInputFormat,
        // so don't use CombineFileInputFormat for non-splittable files
        FileSystem inpFs = path.getFileSystem(job);

        if (inputFormat instanceof TextInputFormat) {
          Queue<Path> dirs = new LinkedList<Path>();
          FileStatus fStats = inpFs.getFileStatus(path);

          // If path is a directory
          if (fStats.isDir()) {
          } else if ((new CompressionCodecFactory(job)).getCodec(path) != null) {
            splits = super.getSplits(job, numSplits);
            perfLogger.PerfLogEnd(LOG, PerfLogger.GET_SPLITS);
            return splits;

          while (dirs.peek() != null) {
            Path tstPath = dirs.remove();
            FileStatus[] fStatus = inpFs.listStatus(tstPath);
            for (int idx = 0; idx < fStatus.length; idx++) {
              if (fStatus[idx].isDir()) {
              } else if ((new CompressionCodecFactory(job)).getCodec(
                  fStatus[idx].getPath()) != null) {
                splits = super.getSplits(job, numSplits);
                perfLogger.PerfLogEnd(LOG, PerfLogger.GET_SPLITS);
                return splits;

      if (inputFormat instanceof SymlinkTextInputFormat) {
        splits = super.getSplits(job, numSplits);
        perfLogger.PerfLogEnd(LOG, PerfLogger.GET_SPLITS);
        return splits;

      Path filterPath = path;

      // Does a pool exist for this path already
      CombineFilter f = null;
      List<Operator<? extends OperatorDesc>> opList = null;

      if (!mrwork.isMapperCannotSpanPartns()) {
        opList = HiveFileFormatUtils.doGetWorksFromPath(
                   pathToAliases, aliasToWork, filterPath);
        CombinePathInputFormat combinePathInputFormat =
            new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
        f = poolMap.get(combinePathInputFormat);
        if (f == null) {
          f = new CombineFilter(filterPath);
"CombineHiveInputSplit creating pool for " + path +
                   "; using filter path " + filterPath);
          combine.createPool(job, f);
          poolMap.put(combinePathInputFormat, f);
        } else {
"CombineHiveInputSplit: pool is already created for " + path +
                   "; using filter path " + filterPath);
      } else {
        // In the case of tablesample, the input paths are pointing to files rather than directories.
        // We need to get the parent directory as the filtering path so that all files in the same
        // parent directory will be grouped into one pool but not files from different parent
        // directories. This guarantees that a split will combine all files in the same partition
        // but won't cross multiple partitions if the user has asked so.
        if (!path.getFileSystem(job).getFileStatus(path).isDir()) { // path is not directory
          filterPath = path.getParent();
        } else {

    // Processing directories
    List<InputSplitShim> iss = new ArrayList<InputSplitShim>();
    if (!mrwork.isMapperCannotSpanPartns()) {
      iss = Arrays.asList(combine.getSplits(job, 1));
    } else {
      for (Path path : inpDirs) {
        processPaths(job, combine, iss, path);

      if (inpFiles.size() > 0) {
        // Processing files
        for (Path filterPath : poolSet) {
          combine.createPool(job, new CombineFilter(filterPath));
        processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
View Full Code Here

  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Map<String, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
    Map<String, Operator<? extends Serializable>> aliasToWork =
    CombineFileInputFormatShim combine = ShimLoader.getHadoopShims()

    if (combine == null) {
      return super.getSplits(job, numSplits);

    if (combine.getInputPathsShim(job).length == 0) {
      throw new IOException("No input paths specified in job");
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    // combine splits only from same tables and same partitions. Do not combine splits from multiple
    // tables or multiple partitions.
    Path[] paths = combine.getInputPathsShim(job);
    Map<CombinePathInputFormat, CombineFilter> poolMap =
      new HashMap<CombinePathInputFormat, CombineFilter>();
    Set<Path> poolSet = new HashSet<Path>();

    for (Path path : paths) {

      PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
          pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
      TableDesc tableDesc = part.getTableDesc();
      if ((tableDesc != null) && tableDesc.isNonNative()) {
        return super.getSplits(job, numSplits);

      // Use HiveInputFormat if any of the paths is not splittable
      Class inputFormatClass = part.getInputFileFormatClass();
      String inputFormatClassName = inputFormatClass.getName();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);

      // Since there is no easy way of knowing whether MAPREDUCE-1597 is present in the tree or not,
      // we use a configuration variable for the same
      if (this.mrwork != null && !this.mrwork.getHadoopSupportsSplittable()) {
        // The following code should be removed, once MAPREDUCE-1597
        // is fixed.
        // Hadoop does not handle non-splittable files correctly for CombineFileInputFormat,
        // so don't use CombineFileInputFormat for non-splittable files
        FileSystem inpFs = path.getFileSystem(job);

        if (inputFormat instanceof TextInputFormat) {
          Queue<Path> dirs = new LinkedList<Path>();
          FileStatus fStats = inpFs.getFileStatus(path);

          // If path is a directory
          if (fStats.isDir()) {
          else if ((new CompressionCodecFactory(job)).getCodec(path) != null) {
            return super.getSplits(job, numSplits);

          while (dirs.peek() != null) {
            Path tstPath = dirs.remove();
            FileStatus[] fStatus = inpFs.listStatus(tstPath);
            for (int idx = 0; idx < fStatus.length; idx++) {
              if (fStatus[idx].isDir()) {
              else if ((new CompressionCodecFactory(job)).getCodec(fStatus[idx].getPath()) != null) {
                return super.getSplits(job, numSplits);

      if (inputFormat instanceof SymlinkTextInputFormat) {
        return super.getSplits(job, numSplits);

      Path filterPath = path;

      // In the case of tablesample, the input paths are pointing to files rather than directories.
      // We need to get the parent directory as the filtering path so that all files in the same
      // parent directory will be grouped into one pool but not files from different parent
      // directories. This guarantees that a split will combine all files in the same partition
      // but won't cross multiple partitions if the user has asked so.
      if (mrwork.isMapperCannotSpanPartns() &&
          !path.getFileSystem(job).getFileStatus(path).isDir()) { // path is not directory
        filterPath = path.getParent();

      // Does a pool exist for this path already
      CombineFilter f = null;
      List<Operator<? extends Serializable>> opList = null;
      boolean done = false;

      if (!mrwork.isMapperCannotSpanPartns()) {
        opList = HiveFileFormatUtils.doGetWorksFromPath(
                   pathToAliases, aliasToWork, filterPath);
        f = poolMap.get(new CombinePathInputFormat(opList, inputFormatClassName));
      else {
        if (poolSet.contains(filterPath)) {
"CombineHiveInputSplit: pool is already created for " + path +
                   "; using filter path " + filterPath);
          done = true;

      if (!done) {
        if (f == null) {
          f = new CombineFilter(filterPath);
"CombineHiveInputSplit creating pool for " + path +
                   "; using filter path " + filterPath);
          combine.createPool(job, f);
          if (!mrwork.isMapperCannotSpanPartns()) {
            poolMap.put(new CombinePathInputFormat(opList, inputFormatClassName), f);
        } else {
"CombineHiveInputSplit: pool is already created for " + path +
                   "; using filter path " + filterPath);

    InputSplitShim[] iss = combine.getSplits(job, 1);

    if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
      iss = sampleSplits(iss);
View Full Code Here

   * Create Hive splits based on CombineFileSplit
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    CombineFileInputFormatShim combine = ShimLoader.getHadoopShims().getCombineFileInputFormat();

    if (combine.getInputPathsShim(job).length == 0) {
      throw new IOException("No input paths specified in job");
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    // combine splits only from same tables. Do not combine splits from multiple tables.
    Path[] paths = combine.getInputPathsShim(job);
    for (int i = 0; i < paths.length; i++) {"CombineHiveInputSplit creating pool for " + paths[i]);
      combine.createPool(job, new CombineFilter(paths[i]));

    InputSplitShim[] iss = (InputSplitShim[])combine.getSplits(job, 1);
    for (InputSplitShim is: iss) {
      CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is);
View Full Code Here

  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Map<String, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
    Map<String, Operator<? extends OperatorDesc>> aliasToWork =
    CombineFileInputFormatShim combine = ShimLoader.getHadoopShims()

    if (combine == null) {
      return super.getSplits(job, numSplits);

    if (combine.getInputPathsShim(job).length == 0) {
      throw new IOException("No input paths specified in job");
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    // combine splits only from same tables and same partitions. Do not combine splits from multiple
    // tables or multiple partitions.
    Path[] paths = combine.getInputPathsShim(job);

    List<Path> inpDirs = new ArrayList<Path>();
    List<Path> inpFiles = new ArrayList<Path>();
    Map<CombinePathInputFormat, CombineFilter> poolMap =
      new HashMap<CombinePathInputFormat, CombineFilter>();
    Set<Path> poolSet = new HashSet<Path>();

    for (Path path : paths) {

      PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
          pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
      TableDesc tableDesc = part.getTableDesc();
      if ((tableDesc != null) && tableDesc.isNonNative()) {
        return super.getSplits(job, numSplits);

      // Use HiveInputFormat if any of the paths is not splittable
      Class inputFormatClass = part.getInputFileFormatClass();
      String inputFormatClassName = inputFormatClass.getName();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);

      // Since there is no easy way of knowing whether MAPREDUCE-1597 is present in the tree or not,
      // we use a configuration variable for the same
      if (this.mrwork != null && !this.mrwork.getHadoopSupportsSplittable()) {
        // The following code should be removed, once MAPREDUCE-1597
        // is fixed.
        // Hadoop does not handle non-splittable files correctly for CombineFileInputFormat,
        // so don't use CombineFileInputFormat for non-splittable files
        FileSystem inpFs = path.getFileSystem(job);

        if (inputFormat instanceof TextInputFormat) {
          Queue<Path> dirs = new LinkedList<Path>();
          FileStatus fStats = inpFs.getFileStatus(path);

          // If path is a directory
          if (fStats.isDir()) {
          else if ((new CompressionCodecFactory(job)).getCodec(path) != null) {
            return super.getSplits(job, numSplits);

          while (dirs.peek() != null) {
            Path tstPath = dirs.remove();
            FileStatus[] fStatus = inpFs.listStatus(tstPath);
            for (int idx = 0; idx < fStatus.length; idx++) {
              if (fStatus[idx].isDir()) {
              else if ((new CompressionCodecFactory(job)).getCodec(fStatus[idx].getPath()) != null) {
                return super.getSplits(job, numSplits);

      if (inputFormat instanceof SymlinkTextInputFormat) {
        return super.getSplits(job, numSplits);

      Path filterPath = path;

      // Does a pool exist for this path already
      CombineFilter f = null;
      List<Operator<? extends OperatorDesc>> opList = null;
      boolean done = false;

      if (!mrwork.isMapperCannotSpanPartns()) {
        opList = HiveFileFormatUtils.doGetWorksFromPath(
                   pathToAliases, aliasToWork, filterPath);
        f = poolMap.get(new CombinePathInputFormat(opList, inputFormatClassName));
      } else {
        // In the case of tablesample, the input paths are pointing to files rather than directories.
        // We need to get the parent directory as the filtering path so that all files in the same
        // parent directory will be grouped into one pool but not files from different parent
        // directories. This guarantees that a split will combine all files in the same partition
        // but won't cross multiple partitions if the user has asked so.
        if (!path.getFileSystem(job).getFileStatus(path).isDir()) { // path is not directory
          filterPath = path.getParent();
        } else {
        done = true;

      if (!done) {
        if (f == null) {
          f = new CombineFilter(filterPath);
"CombineHiveInputSplit creating pool for " + path +
                   "; using filter path " + filterPath);
          combine.createPool(job, f);
          if (!mrwork.isMapperCannotSpanPartns()) {
            poolMap.put(new CombinePathInputFormat(opList, inputFormatClassName), f);
        } else {
"CombineHiveInputSplit: pool is already created for " + path +
                   "; using filter path " + filterPath);

    // Processing directories
    List<InputSplitShim> iss = new ArrayList<InputSplitShim>();
    if (!mrwork.isMapperCannotSpanPartns()) {
      iss = Arrays.asList(combine.getSplits(job, 1));
    } else {
      for (Path path : inpDirs) {
        processPaths(job, combine, iss, path);

      if (inpFiles.size() > 0) {
        // Processing files
        for (Path filterPath : poolSet) {
          combine.createPool(job, new CombineFilter(filterPath));
        processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
View Full Code Here

  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Map<String, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
    Map<String, Operator<? extends Serializable>> aliasToWork =
    CombineFileInputFormatShim combine = ShimLoader.getHadoopShims()

    if (combine == null) {
      return super.getSplits(job, numSplits);

    if (combine.getInputPathsShim(job).length == 0) {
      throw new IOException("No input paths specified in job");
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    // combine splits only from same tables and same partitions. Do not combine splits from multiple
    // tables or multiple partitions.
    Path[] paths = combine.getInputPathsShim(job);

    List<Path> inpDirs = new ArrayList<Path>();
    List<Path> inpFiles = new ArrayList<Path>();
    Map<CombinePathInputFormat, CombineFilter> poolMap =
      new HashMap<CombinePathInputFormat, CombineFilter>();
    Set<Path> poolSet = new HashSet<Path>();

    for (Path path : paths) {

      PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
          pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
      TableDesc tableDesc = part.getTableDesc();
      if ((tableDesc != null) && tableDesc.isNonNative()) {
        return super.getSplits(job, numSplits);

      // Use HiveInputFormat if any of the paths is not splittable
      Class inputFormatClass = part.getInputFileFormatClass();
      String inputFormatClassName = inputFormatClass.getName();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);

      // Since there is no easy way of knowing whether MAPREDUCE-1597 is present in the tree or not,
      // we use a configuration variable for the same
      if (this.mrwork != null && !this.mrwork.getHadoopSupportsSplittable()) {
        // The following code should be removed, once MAPREDUCE-1597
        // is fixed.
        // Hadoop does not handle non-splittable files correctly for CombineFileInputFormat,
        // so don't use CombineFileInputFormat for non-splittable files
        FileSystem inpFs = path.getFileSystem(job);

        if (inputFormat instanceof TextInputFormat) {
          Queue<Path> dirs = new LinkedList<Path>();
          FileStatus fStats = inpFs.getFileStatus(path);

          // If path is a directory
          if (fStats.isDir()) {
          else if ((new CompressionCodecFactory(job)).getCodec(path) != null) {
            return super.getSplits(job, numSplits);

          while (dirs.peek() != null) {
            Path tstPath = dirs.remove();
            FileStatus[] fStatus = inpFs.listStatus(tstPath);
            for (int idx = 0; idx < fStatus.length; idx++) {
              if (fStatus[idx].isDir()) {
              else if ((new CompressionCodecFactory(job)).getCodec(fStatus[idx].getPath()) != null) {
                return super.getSplits(job, numSplits);

      if (inputFormat instanceof SymlinkTextInputFormat) {
        return super.getSplits(job, numSplits);

      Path filterPath = path;

      // Does a pool exist for this path already
      CombineFilter f = null;
      List<Operator<? extends Serializable>> opList = null;
      boolean done = false;

      if (!mrwork.isMapperCannotSpanPartns()) {
        opList = HiveFileFormatUtils.doGetWorksFromPath(
                   pathToAliases, aliasToWork, filterPath);
        f = poolMap.get(new CombinePathInputFormat(opList, inputFormatClassName));
      } else {
        // In the case of tablesample, the input paths are pointing to files rather than directories.
        // We need to get the parent directory as the filtering path so that all files in the same
        // parent directory will be grouped into one pool but not files from different parent
        // directories. This guarantees that a split will combine all files in the same partition
        // but won't cross multiple partitions if the user has asked so.
        if (!path.getFileSystem(job).getFileStatus(path).isDir()) { // path is not directory
          filterPath = path.getParent();
        } else {
        done = true;

      if (!done) {
        if (f == null) {
          f = new CombineFilter(filterPath);
"CombineHiveInputSplit creating pool for " + path +
                   "; using filter path " + filterPath);
          combine.createPool(job, f);
          if (!mrwork.isMapperCannotSpanPartns()) {
            poolMap.put(new CombinePathInputFormat(opList, inputFormatClassName), f);
        } else {
"CombineHiveInputSplit: pool is already created for " + path +
                   "; using filter path " + filterPath);

    // Processing directories
    List<InputSplitShim> iss = new ArrayList<InputSplitShim>();
    if (!mrwork.isMapperCannotSpanPartns()) {
      iss = Arrays.asList(combine.getSplits(job, 1));
    } else {
      for (Path path : inpDirs) {
        processPaths(job, combine, iss, path);

      if (inpFiles.size() > 0) {
        // Processing files
        for (Path filterPath : poolSet) {
          combine.createPool(job, new CombineFilter(filterPath));
        processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
View Full Code Here

  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Map<String, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
    Map<String, Operator<? extends Serializable>> aliasToWork =
    CombineFileInputFormatShim combine = ShimLoader.getHadoopShims()

    if (combine == null) {
      return super.getSplits(job, numSplits);

    if (combine.getInputPathsShim(job).length == 0) {
      throw new IOException("No input paths specified in job");
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    // combine splits only from same tables and same partitions. Do not combine splits from multiple
    // tables or multiple partitions.
    Path[] paths = combine.getInputPathsShim(job);

    List<Path> inpDirs = new ArrayList<Path>();
    List<Path> inpFiles = new ArrayList<Path>();
    Map<CombinePathInputFormat, CombineFilter> poolMap =
      new HashMap<CombinePathInputFormat, CombineFilter>();
    Set<Path> poolSet = new HashSet<Path>();

    for (Path path : paths) {

      PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
          pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
      TableDesc tableDesc = part.getTableDesc();
      if ((tableDesc != null) && tableDesc.isNonNative()) {
        return super.getSplits(job, numSplits);

      // Use HiveInputFormat if any of the paths is not splittable
      Class inputFormatClass = part.getInputFileFormatClass();
      String inputFormatClassName = inputFormatClass.getName();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);

      // Since there is no easy way of knowing whether MAPREDUCE-1597 is present in the tree or not,
      // we use a configuration variable for the same
      if (this.mrwork != null && !this.mrwork.getHadoopSupportsSplittable()) {
        // The following code should be removed, once MAPREDUCE-1597
        // is fixed.
        // Hadoop does not handle non-splittable files correctly for CombineFileInputFormat,
        // so don't use CombineFileInputFormat for non-splittable files
        FileSystem inpFs = path.getFileSystem(job);

        if (inputFormat instanceof TextInputFormat) {
          Queue<Path> dirs = new LinkedList<Path>();
          FileStatus fStats = inpFs.getFileStatus(path);

          // If path is a directory
          if (fStats.isDir()) {
          else if ((new CompressionCodecFactory(job)).getCodec(path) != null) {
            return super.getSplits(job, numSplits);

          while (dirs.peek() != null) {
            Path tstPath = dirs.remove();
            FileStatus[] fStatus = inpFs.listStatus(tstPath);
            for (int idx = 0; idx < fStatus.length; idx++) {
              if (fStatus[idx].isDir()) {
              else if ((new CompressionCodecFactory(job)).getCodec(fStatus[idx].getPath()) != null) {
                return super.getSplits(job, numSplits);

      if (inputFormat instanceof SymlinkTextInputFormat) {
        return super.getSplits(job, numSplits);

      Path filterPath = path;

      // Does a pool exist for this path already
      CombineFilter f = null;
      List<Operator<? extends Serializable>> opList = null;
      boolean done = false;

      if (!mrwork.isMapperCannotSpanPartns()) {
        opList = HiveFileFormatUtils.doGetWorksFromPath(
                   pathToAliases, aliasToWork, filterPath);
        f = poolMap.get(new CombinePathInputFormat(opList, inputFormatClassName));
      } else {
        // In the case of tablesample, the input paths are pointing to files rather than directories.
        // We need to get the parent directory as the filtering path so that all files in the same
        // parent directory will be grouped into one pool but not files from different parent
        // directories. This guarantees that a split will combine all files in the same partition
        // but won't cross multiple partitions if the user has asked so.
        if (!path.getFileSystem(job).getFileStatus(path).isDir()) { // path is not directory
          filterPath = path.getParent();
        } else {
        done = true;

      if (!done) {
        if (f == null) {
          f = new CombineFilter(filterPath);
"CombineHiveInputSplit creating pool for " + path +
                   "; using filter path " + filterPath);
          combine.createPool(job, f);
          if (!mrwork.isMapperCannotSpanPartns()) {
            poolMap.put(new CombinePathInputFormat(opList, inputFormatClassName), f);
        } else {
"CombineHiveInputSplit: pool is already created for " + path +
                   "; using filter path " + filterPath);

    // Processing directories
    List<InputSplitShim> iss = new ArrayList<InputSplitShim>();
    if (!mrwork.isMapperCannotSpanPartns()) {
      iss = Arrays.asList(combine.getSplits(job, 1));
    } else {
      for (Path path : inpDirs) {
        processPaths(job, combine, iss, path);

      if (inpFiles.size() > 0) {
        // Processing files
        for (Path filterPath : poolSet) {
          combine.createPool(job, new CombineFilter(filterPath));
        processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
View Full Code Here

    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
    Map<String, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
    Map<String, Operator<? extends OperatorDesc>> aliasToWork =
    CombineFileInputFormatShim combine = ShimLoader.getHadoopShims()
    // on tez we're avoiding duplicating path info since the info will go over
    // rpc
    if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
      try {
        List<Path> dirs = Utilities.getInputPathsTez(job, mrwork);
        Utilities.setInputPaths(job, dirs);
      } catch (Exception e) {
        throw new IOException("Could not create input paths", e);

    InputSplit[] splits = null;
    if (combine == null) {
      splits = super.getSplits(job, numSplits);
      perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
      return splits;

    if (combine.getInputPathsShim(job).length == 0) {
      throw new IOException("No input paths specified in job");
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    // combine splits only from same tables and same partitions. Do not combine splits from multiple
    // tables or multiple partitions.
    Path[] paths = combine.getInputPathsShim(job);

    List<Path> inpDirs = new ArrayList<Path>();
    List<Path> inpFiles = new ArrayList<Path>();
    Map<CombinePathInputFormat, CombineFilter> poolMap =
      new HashMap<CombinePathInputFormat, CombineFilter>();
    Set<Path> poolSet = new HashSet<Path>();

    for (Path path : paths) {
      PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
          pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
      TableDesc tableDesc = part.getTableDesc();
      if ((tableDesc != null) && tableDesc.isNonNative()) {
        return super.getSplits(job, numSplits);

      // Use HiveInputFormat if any of the paths is not splittable
      Class inputFormatClass = part.getInputFileFormatClass();
      String inputFormatClassName = inputFormatClass.getName();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
      String deserializerClassName = null;
      try {
        deserializerClassName = part.getDeserializer(job).getClass().getName();
      } catch (Exception e) {
        // ignore
      FileSystem inpFs = path.getFileSystem(job);
      if (inputFormatClass.isAssignableFrom(OrcInputFormat.class)) {
        if (inpFs.exists(new Path(path, OrcRecordUpdater.ACID_FORMAT))) {
          throw new IOException("CombineHiveInputFormat is incompatible " +
            " with ACID tables. Please set hive.input.format=" +

      // Since there is no easy way of knowing whether MAPREDUCE-1597 is present in the tree or not,
      // we use a configuration variable for the same
      if (this.mrwork != null && !this.mrwork.getHadoopSupportsSplittable()) {
        // The following code should be removed once MAPREDUCE-1597
        // is fixed.
        // Hadoop does not handle non-splittable files correctly for CombineFileInputFormat,
        // so don't use CombineFileInputFormat for non-splittable files

        // i.e., don't combine if inputformat is a TextInputFormat and has compression turned on

        if (inputFormat instanceof TextInputFormat) {
          Queue<Path> dirs = new LinkedList<Path>();
          FileStatus fStats = inpFs.getFileStatus(path);

          // If path is a directory
          if (fStats.isDir()) {
          } else if ((new CompressionCodecFactory(job)).getCodec(path) != null) {
            // if compression codec is set, use HiveInputFormat.getSplits (don't combine)
            splits = super.getSplits(job, numSplits);
            perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
            return splits;

          while (dirs.peek() != null) {
            Path tstPath = dirs.remove();
            FileStatus[] fStatus = inpFs.listStatus(tstPath);
            for (int idx = 0; idx < fStatus.length; idx++) {
              if (fStatus[idx].isDir()) {
              } else if ((new CompressionCodecFactory(job)).getCodec(
                  fStatus[idx].getPath()) != null) {
                // if compression codec is set, use HiveInputFormat.getSplits (don't combine)
                splits = super.getSplits(job, numSplits);
                perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
                return splits;
      //don't combine if inputformat is a SymlinkTextInputFormat
      if (inputFormat instanceof SymlinkTextInputFormat) {
        splits = super.getSplits(job, numSplits);
        perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
        return splits;

      Path filterPath = path;

      // Does a pool exist for this path already
      CombineFilter f = null;
      List<Operator<? extends OperatorDesc>> opList = null;

      if (!mrwork.isMapperCannotSpanPartns()) {
        // if mapper can span partitions, make sure a split does not contain multiple
        // opList + inputFormatClassName + deserializerClassName combination
        // This is done using the Map of CombinePathInputFormat to PathFilter

        opList = HiveFileFormatUtils.doGetWorksFromPath(
                   pathToAliases, aliasToWork, filterPath);
        CombinePathInputFormat combinePathInputFormat =
            new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
        f = poolMap.get(combinePathInputFormat);
        if (f == null) {
          f = new CombineFilter(filterPath);
"CombineHiveInputSplit creating pool for " + path +
                   "; using filter path " + filterPath);
          combine.createPool(job, f);
          poolMap.put(combinePathInputFormat, f);
        } else {
"CombineHiveInputSplit: pool is already created for " + path +
                   "; using filter path " + filterPath);
      } else {
        // In the case of tablesample, the input paths are pointing to files rather than directories.
        // We need to get the parent directory as the filtering path so that all files in the same
        // parent directory will be grouped into one pool but not files from different parent
        // directories. This guarantees that a split will combine all files in the same partition
        // but won't cross multiple partitions if the user has asked so.
        if (!path.getFileSystem(job).getFileStatus(path).isDir()) { // path is not directory
          filterPath = path.getParent();
        } else {

    // Processing directories
    List<InputSplitShim> iss = new ArrayList<InputSplitShim>();
    if (!mrwork.isMapperCannotSpanPartns()) {
      //mapper can span partitions
      //combine into as few as one split, subject to the PathFilters set
      // using combine.createPool.
      iss = Arrays.asList(combine.getSplits(job, 1));
    } else {
      for (Path path : inpDirs) {
        processPaths(job, combine, iss, path);

      if (inpFiles.size() > 0) {
        // Processing files
        for (Path filterPath : poolSet) {
          combine.createPool(job, new CombineFilter(filterPath));
        processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
View Full Code Here

    Map<String, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
    Map<String, Operator<? extends Serializable>> aliasToWork =
    CombineFileInputFormatShim combine = ShimLoader.getHadoopShims()

    if (combine == null) {
      return super.getSplits(job, numSplits);

    if (combine.getInputPathsShim(job).length == 0) {
      throw new IOException("No input paths specified in job");
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    // combine splits only from same tables and same partitions. Do not combine splits from multiple
    // tables or multiple partitions.
    Path[] paths = combine.getInputPathsShim(job);
    Map<CombinePathInputFormat, CombineFilter> poolMap =
      new HashMap<CombinePathInputFormat, CombineFilter>();
    Set<Path> poolSet = new HashSet<Path>();

    for (Path path : paths) {

      PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
          pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
      TableDesc tableDesc = part.getTableDesc();
      if ((tableDesc != null) && tableDesc.isNonNative()) {
        return super.getSplits(job, numSplits);

      // Use HiveInputFormat if any of the paths is not splittable
      Class inputFormatClass = part.getInputFileFormatClass();
      String inputFormatClassName = inputFormatClass.getName();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);

      // Since there is no easy way of knowing whether MAPREDUCE-1597 is present in the tree or not,
      // we use a configuration variable for the same
      if (this.mrwork != null && !this.mrwork.getHadoopSupportsSplittable()) {
        // The following code should be removed once MAPREDUCE-1597
        // is fixed.
        // Hadoop does not handle non-splittable files correctly for CombineFileInputFormat,
        // so don't use CombineFileInputFormat for non-splittable files
        FileSystem inpFs = path.getFileSystem(job);

        if (inputFormat instanceof TextInputFormat) {
          Queue<Path> dirs = new LinkedList<Path>();
          FileStatus fStats = inpFs.getFileStatus(path);

          // If path is a directory
          if (fStats.isDir()) {
          else if ((new CompressionCodecFactory(job)).getCodec(path) != null) {
            return super.getSplits(job, numSplits);

          while (dirs.peek() != null) {
            Path tstPath = dirs.remove();
            FileStatus[] fStatus = inpFs.listStatus(tstPath);
            for (int idx = 0; idx < fStatus.length; idx++) {
              if (fStatus[idx].isDir()) {
              else if ((new CompressionCodecFactory(job)).getCodec(fStatus[idx].getPath()) != null) {
                return super.getSplits(job, numSplits);

      if (inputFormat instanceof SymlinkTextInputFormat) {
        return super.getSplits(job, numSplits);

      Path filterPath = path;

      // In the case of tablesample, the input paths are pointing to files rather than directories.
      // We need to get the parent directory as the filtering path so that all files in the same
      // parent directory will be grouped into one pool but not files from different parent
      // directories. This guarantees that a split will combine all files in the same partition
      // but won't cross multiple partitions if the user has asked so.
      if (mrwork.isMapperCannotSpanPartns() &&
          !path.getFileSystem(job).getFileStatus(path).isDir()) { // path is not directory
        filterPath = path.getParent();

      // Does a pool exist for this path already
      CombineFilter f = null;
      List<Operator<? extends Serializable>> opList = null;
      boolean done = false;

      if (!mrwork.isMapperCannotSpanPartns()) {
        opList = HiveFileFormatUtils.doGetAliasesFromPath(
                   pathToAliases, aliasToWork, filterPath);
        f = poolMap.get(new CombinePathInputFormat(opList, inputFormatClassName));
      else {
        if (poolSet.contains(filterPath)) {
"CombineHiveInputSplit: pool is already created for " + path +
                   "; using filter path " + filterPath);
          done = true;

      if (!done) {
        if (f == null) {
          f = new CombineFilter(filterPath);
"CombineHiveInputSplit creating pool for " + path +
                   "; using filter path " + filterPath);
          combine.createPool(job, f);
          if (!mrwork.isMapperCannotSpanPartns()) {
            poolMap.put(new CombinePathInputFormat(opList, inputFormatClassName), f);
        } else {
"CombineHiveInputSplit: pool is already created for " + path +
                   "; using filter path " + filterPath);

    InputSplitShim[] iss = combine.getSplits(job, 1);
    for (InputSplitShim is : iss) {
      CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is);
View Full Code Here

    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
    Map<String, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
    Map<String, Operator<? extends OperatorDesc>> aliasToWork =
    CombineFileInputFormatShim combine = ShimLoader.getHadoopShims()

    InputSplit[] splits = null;
    if (combine == null) {
      splits = super.getSplits(job, numSplits);
      perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
      return splits;

    if (combine.getInputPathsShim(job).length == 0) {
      throw new IOException("No input paths specified in job");
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    // combine splits only from same tables and same partitions. Do not combine splits from multiple
    // tables or multiple partitions.
    Path[] paths = combine.getInputPathsShim(job);

    List<Path> inpDirs = new ArrayList<Path>();
    List<Path> inpFiles = new ArrayList<Path>();
    Map<CombinePathInputFormat, CombineFilter> poolMap =
      new HashMap<CombinePathInputFormat, CombineFilter>();
    Set<Path> poolSet = new HashSet<Path>();

    for (Path path : paths) {
      PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
          pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
      TableDesc tableDesc = part.getTableDesc();
      if ((tableDesc != null) && tableDesc.isNonNative()) {
        return super.getSplits(job, numSplits);

      // Use HiveInputFormat if any of the paths is not splittable
      Class inputFormatClass = part.getInputFileFormatClass();
      String inputFormatClassName = inputFormatClass.getName();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
      String deserializerClassName = null;
      try {
        deserializerClassName = part.getDeserializer(job).getClass().getName();
      } catch (Exception e) {
        // ignore
      FileSystem inpFs = path.getFileSystem(job);

      // Since there is no easy way of knowing whether MAPREDUCE-1597 is present in the tree or not,
      // we use a configuration variable for the same
      if (this.mrwork != null && !this.mrwork.getHadoopSupportsSplittable()) {
        // The following code should be removed once MAPREDUCE-1597
        // is fixed.
        // Hadoop does not handle non-splittable files correctly for CombineFileInputFormat,
        // so don't use CombineFileInputFormat for non-splittable files

        // i.e., don't combine if inputformat is a TextInputFormat and has compression turned on

        if (inputFormat instanceof TextInputFormat) {
          Queue<Path> dirs = new LinkedList<Path>();
          FileStatus fStats = inpFs.getFileStatus(path);

          // If path is a directory
          if (fStats.isDir()) {
          } else if ((new CompressionCodecFactory(job)).getCodec(path) != null) {
            // if compression codec is set, use HiveInputFormat.getSplits (don't combine)
            splits = super.getSplits(job, numSplits);
            perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
            return splits;

          while (dirs.peek() != null) {
            Path tstPath = dirs.remove();
            FileStatus[] fStatus = inpFs.listStatus(tstPath);
            for (int idx = 0; idx < fStatus.length; idx++) {
              if (fStatus[idx].isDir()) {
              } else if ((new CompressionCodecFactory(job)).getCodec(
                  fStatus[idx].getPath()) != null) {
                // if compression codec is set, use HiveInputFormat.getSplits (don't combine)
                splits = super.getSplits(job, numSplits);
                perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
                return splits;
      //don't combine if inputformat is a SymlinkTextInputFormat
      if (inputFormat instanceof SymlinkTextInputFormat) {
        splits = super.getSplits(job, numSplits);
        perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
        return splits;

      Path filterPath = path;

      // Does a pool exist for this path already
      CombineFilter f = null;
      List<Operator<? extends OperatorDesc>> opList = null;

      if (!mrwork.isMapperCannotSpanPartns()) {
        // if mapper can span partitions, make sure a split does not contain multiple
        // opList + inputFormatClassName + deserializerClassName combination
        // This is done using the Map of CombinePathInputFormat to PathFilter

        opList = HiveFileFormatUtils.doGetWorksFromPath(
                   pathToAliases, aliasToWork, filterPath);
        CombinePathInputFormat combinePathInputFormat =
            new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
        f = poolMap.get(combinePathInputFormat);
        if (f == null) {
          f = new CombineFilter(filterPath);
"CombineHiveInputSplit creating pool for " + path +
                   "; using filter path " + filterPath);
          combine.createPool(job, f);
          poolMap.put(combinePathInputFormat, f);
        } else {
"CombineHiveInputSplit: pool is already created for " + path +
                   "; using filter path " + filterPath);
      } else {
        // In the case of tablesample, the input paths are pointing to files rather than directories.
        // We need to get the parent directory as the filtering path so that all files in the same
        // parent directory will be grouped into one pool but not files from different parent
        // directories. This guarantees that a split will combine all files in the same partition
        // but won't cross multiple partitions if the user has asked so.
        if (!path.getFileSystem(job).getFileStatus(path).isDir()) { // path is not directory
          filterPath = path.getParent();
        } else {

    // Processing directories
    List<InputSplitShim> iss = new ArrayList<InputSplitShim>();
    if (!mrwork.isMapperCannotSpanPartns()) {
      //mapper can span partitions
      //combine into as few as one split, subject to the PathFilters set
      // using combine.createPool.
      iss = Arrays.asList(combine.getSplits(job, 1));
    } else {
      for (Path path : inpDirs) {
        processPaths(job, combine, iss, path);

      if (inpFiles.size() > 0) {
        // Processing files
        for (Path filterPath : poolSet) {
          combine.createPool(job, new CombineFilter(filterPath));
        processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
View Full Code Here


Related Classes of org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact