Examples of org.apache.tez.mapreduce.hadoop.InputSplitInfoMem

org.apache.tez.mapreduce.hadoop.InputSplitInfoMem
Represents InputSplitInfo for splits generated to memory.
Since splits are generated in memory, the getSplitsMetaInfoFile and getSplitsFile are not supported.


    // Read all credentials into the credentials instance stored in JobConf.
    JobConf jobConf = new JobConf(conf);
    jobConf.getCredentials().mergeAll(UserGroupInformation.getCurrentUser().getCredentials());


    InputSplitInfoMem inputSplitInfo = null;
    boolean groupSplits = userPayloadProto.getGroupingEnabled();
    if (groupSplits) {
      LOG.info("Grouping input splits");
      inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(jobConf, true, numTasks);
    } else {
      inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(jobConf, false, 0);
    }
    if (LOG.isDebugEnabled()) {
      sw.stop();
      LOG.debug("Time to create splits to mem: " + sw.elapsedMillis());
    }


    List<Event> events = Lists.newArrayListWithCapacity(inputSplitInfo
        .getNumTasks() + 1);
    
    InputConfigureVertexTasksEvent configureVertexEvent = InputConfigureVertexTasksEvent.create(
        inputSplitInfo.getNumTasks(),
        VertexLocationHint.create(inputSplitInfo.getTaskLocationHints()),
        InputSpecUpdate.getDefaultSinglePhysicalInputSpecUpdate());
    events.add(configureVertexEvent);


    if (sendSerializedEvents) {
      MRSplitsProto splitsProto = inputSplitInfo.getSplitsProto();
      int count = 0;
      for (MRSplitProto mrSplit : splitsProto.getSplitsList()) {
        // Unnecessary array copy, can be avoided by using ByteBuffer instead of a raw array.
        InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(
            count++,
            mrSplit.toByteString().asReadOnlyByteBuffer());
        events.add(diEvent);
      }
    } else {
      int count = 0;
      if (inputSplitInfo.holdsNewFormatSplits()) {
        for (org.apache.hadoop.mapreduce.InputSplit split : inputSplitInfo.getNewFormatSplits()) {
          InputDataInformationEvent diEvent = InputDataInformationEvent.createWithObjectPayload(
              count++, split);
          events.add(diEvent);
        }
      } else {
        for (org.apache.hadoop.mapred.InputSplit split : inputSplitInfo.getOldFormatSplits()) {
          InputDataInformationEvent diEvent = InputDataInformationEvent.createWithObjectPayload(
              count++, split);
          events.add(diEvent);
        }
      }

View Full Code Here

    LOG.info("Input " + rootInputContext.getInputName() + " asking for " + numTasks
        + " tasks. Headroom: " + totalResource + " Task Resource: "
        + taskResource + " waves: " + waves);




    InputSplitInfoMem inputSplitInfo = null;
    String realInputFormatName = userPayloadProto.getInputFormatName(); 
    if ( realInputFormatName != null && !realInputFormatName.isEmpty()) {
      // split grouping on the AM
      JobConf jobConf = new JobConf(conf);
      if (jobConf.getUseNewMapper()) {
        LOG.info("Grouping mapreduce api input splits");
        Job job = Job.getInstance(conf);
        org.apache.hadoop.mapreduce.InputSplit[] splits = MRHelpers
            .generateNewSplits(job, realInputFormatName, numTasks);


        // Move all this into a function
        List<TaskLocationHint> locationHints = Lists
            .newArrayListWithCapacity(splits.length);
        for (org.apache.hadoop.mapreduce.InputSplit split : splits) {
          String rack = 
              ((org.apache.hadoop.mapreduce.split.TezGroupedSplit) split).getRack();
          if (rack == null) {
            if (split.getLocations() != null) {
              locationHints.add(new TaskLocationHint(new HashSet<String>(Arrays
                  .asList(split.getLocations())), null));
            } else {
              locationHints.add(new TaskLocationHint(null, null));
            }
          } else {
            locationHints.add(new TaskLocationHint(null, 
                Collections.singleton(rack)));
          }
        }
        inputSplitInfo = new InputSplitInfoMem(splits, locationHints, splits.length, null, conf);
      } else {
        LOG.info("Grouping mapred api input splits");
        org.apache.hadoop.mapred.InputSplit[] splits = MRHelpers
            .generateOldSplits(jobConf, realInputFormatName, numTasks);
        List<TaskLocationHint> locationHints = Lists
            .newArrayListWithCapacity(splits.length);
        for (org.apache.hadoop.mapred.InputSplit split : splits) {
          String rack = 
              ((org.apache.hadoop.mapred.split.TezGroupedSplit) split).getRack();
          if (rack == null) {
            if (split.getLocations() != null) {
              locationHints.add(new TaskLocationHint(new HashSet<String>(Arrays
                  .asList(split.getLocations())), null));
            } else {
              locationHints.add(new TaskLocationHint(null, null));
            }
          } else {
            locationHints.add(new TaskLocationHint(null, 
                Collections.singleton(rack)));
          }
        }
        inputSplitInfo = new InputSplitInfoMem(splits, locationHints, splits.length, null, conf);
      }
    } else {
      inputSplitInfo = MRHelpers.generateInputSplitsToMem(conf);
    }
    if (LOG.isDebugEnabled()) {
      sw.stop();
      LOG.debug("Time to create splits to mem: " + sw.elapsedMillis());
    }


    List<Event> events = Lists.newArrayListWithCapacity(inputSplitInfo
        .getNumTasks() + 1);
    
    RootInputConfigureVertexTasksEvent configureVertexEvent = new RootInputConfigureVertexTasksEvent(
        inputSplitInfo.getNumTasks(), inputSplitInfo.getTaskLocationHints());
    events.add(configureVertexEvent);


    if (sendSerializedEvents) {
      MRSplitsProto splitsProto = inputSplitInfo.getSplitsProto();
      int count = 0;
      for (MRSplitProto mrSplit : splitsProto.getSplitsList()) {
        // Unnecessary array copy, can be avoided by using ByteBuffer instead of a raw array.
        RootInputDataInformationEvent diEvent = new RootInputDataInformationEvent(count++,
            mrSplit.toByteArray());
        events.add(diEvent);
      }
    } else {
      int count = 0;
      if (inputSplitInfo.holdsNewFormatSplits()) {
        for (org.apache.hadoop.mapreduce.InputSplit split : inputSplitInfo.getNewFormatSplits()) {
          RootInputDataInformationEvent diEvent = new RootInputDataInformationEvent(count++, split);
          events.add(diEvent);
        }
      } else {
        for (org.apache.hadoop.mapred.InputSplit split : inputSplitInfo.getOldFormatSplits()) {
          RootInputDataInformationEvent diEvent = new RootInputDataInformationEvent(count++, split);
          events.add(diEvent);
        }
      }
    }

View Full Code Here


    if (LOG.isDebugEnabled()) {
      sw.reset().start();
    }


    InputSplitInfoMem inputSplitInfo = null;
    String realInputFormatName = userPayloadProto.getInputFormatName(); 
    if ( realInputFormatName != null && !realInputFormatName.isEmpty()) {
      // split grouping on the AM
      JobConf jobConf = new JobConf(conf);
      if (jobConf.getUseNewMapper()) {
        LOG.info("Grouping mapreduce api input splits");
        Job job = Job.getInstance(conf);
        org.apache.hadoop.mapreduce.InputSplit[] splits = MRHelpers
            .generateNewSplits(job, realInputFormatName,
                rootInputContext.getNumTasks());
        SerializationFactory serializationFactory = new SerializationFactory(
            job.getConfiguration());


        MRSplitsProto.Builder splitsBuilder = MRSplitsProto.newBuilder();


        List<TaskLocationHint> locationHints = Lists
            .newArrayListWithCapacity(splits.length);
        for (org.apache.hadoop.mapreduce.InputSplit split : splits) {
          splitsBuilder.addSplits(MRHelpers.createSplitProto(split,
              serializationFactory));
          String rack = 
              ((org.apache.hadoop.mapreduce.split.TezGroupedSplit) split).getRack();
          if (rack == null) {
            if (split.getLocations() != null) {
              locationHints.add(new TaskLocationHint(new HashSet<String>(Arrays
                  .asList(split.getLocations())), null));
            } else {
              locationHints.add(new TaskLocationHint(null, null));
            }
          } else {
            locationHints.add(new TaskLocationHint(null, 
                Collections.singleton(rack)));
          }
        }
        inputSplitInfo = new InputSplitInfoMem(splitsBuilder.build(),
            locationHints, splits.length);
      } else {
        LOG.info("Grouping mapred api input splits");
        org.apache.hadoop.mapred.InputSplit[] splits = MRHelpers
            .generateOldSplits(jobConf, realInputFormatName,
                rootInputContext.getNumTasks());
        List<TaskLocationHint> locationHints = Lists
            .newArrayListWithCapacity(splits.length);
        MRSplitsProto.Builder splitsBuilder = MRSplitsProto.newBuilder();
        for (org.apache.hadoop.mapred.InputSplit split : splits) {
          splitsBuilder.addSplits(MRHelpers.createSplitProto(split));
          String rack = 
              ((org.apache.hadoop.mapred.split.TezGroupedSplit) split).getRack();
          if (rack == null) {
            if (split.getLocations() != null) {
              locationHints.add(new TaskLocationHint(new HashSet<String>(Arrays
                  .asList(split.getLocations())), null));
            } else {
              locationHints.add(new TaskLocationHint(null, null));
            }
          } else {
            locationHints.add(new TaskLocationHint(null, 
                Collections.singleton(rack)));
          }
        }
        inputSplitInfo = new InputSplitInfoMem(splitsBuilder.build(),
            locationHints, splits.length);
      }
    } else {
      inputSplitInfo = MRHelpers.generateInputSplitsToMem(conf);
    }
    if (LOG.isDebugEnabled()) {
      sw.stop();
      LOG.debug("Time to create splits to mem: " + sw.elapsedMillis());
    }


    List<Event> events = Lists.newArrayListWithCapacity(inputSplitInfo
        .getNumTasks() + 1);
    
    RootInputConfigureVertexTasksEvent configureVertexEvent = new RootInputConfigureVertexTasksEvent(
        inputSplitInfo.getNumTasks(), inputSplitInfo.getTaskLocationHints());
    events.add(configureVertexEvent);


    MRSplitsProto splitsProto = inputSplitInfo.getSplitsProto();


    int count = 0;
    for (MRSplitProto mrSplit : splitsProto.getSplitsList()) {
      // Unnecessary array copy, can be avoided by using ByteBuffer instead of a
      // raw array.

View Full Code Here


    // Read all credentials into the credentials instance stored in JobConf.
    JobConf jobConf = new JobConf(conf);
    jobConf.getCredentials().mergeAll(UserGroupInformation.getCurrentUser().getCredentials());


    InputSplitInfoMem inputSplitInfo = null;
    String realInputFormatName = userPayloadProto.getInputFormatName(); 
    if ( realInputFormatName != null && !realInputFormatName.isEmpty()) {
      // split grouping on the AM
      if (jobConf.getUseNewMapper()) {
        LOG.info("Grouping mapreduce api input splits");
        Job job = Job.getInstance(jobConf);
        org.apache.hadoop.mapreduce.InputSplit[] splits = MRHelpers
            .generateNewSplits(job, realInputFormatName, numTasks);


        // Move all this into a function
        List<TaskLocationHint> locationHints = Lists
            .newArrayListWithCapacity(splits.length);
        for (org.apache.hadoop.mapreduce.InputSplit split : splits) {
          String rack = 
              ((org.apache.hadoop.mapreduce.split.TezGroupedSplit) split).getRack();
          if (rack == null) {
            if (split.getLocations() != null) {
              locationHints.add(new TaskLocationHint(new HashSet<String>(Arrays
                  .asList(split.getLocations())), null));
            } else {
              locationHints.add(new TaskLocationHint(null, null));
            }
          } else {
            locationHints.add(new TaskLocationHint(null, 
                Collections.singleton(rack)));
          }
        }
        inputSplitInfo = new InputSplitInfoMem(splits, locationHints, splits.length, null, jobConf);
      } else {
        LOG.info("Grouping mapred api input splits");
        org.apache.hadoop.mapred.InputSplit[] splits = MRHelpers
            .generateOldSplits(jobConf, realInputFormatName, numTasks);
        List<TaskLocationHint> locationHints = Lists
            .newArrayListWithCapacity(splits.length);
        for (org.apache.hadoop.mapred.InputSplit split : splits) {
          String rack = 
              ((org.apache.hadoop.mapred.split.TezGroupedSplit) split).getRack();
          if (rack == null) {
            if (split.getLocations() != null) {
              locationHints.add(new TaskLocationHint(new HashSet<String>(Arrays
                  .asList(split.getLocations())), null));
            } else {
              locationHints.add(new TaskLocationHint(null, null));
            }
          } else {
            locationHints.add(new TaskLocationHint(null, 
                Collections.singleton(rack)));
          }
        }
        inputSplitInfo = new InputSplitInfoMem(splits, locationHints, splits.length, null, jobConf);
      }
    } else {
      inputSplitInfo = MRHelpers.generateInputSplitsToMem(jobConf);
    }
    if (LOG.isDebugEnabled()) {
      sw.stop();
      LOG.debug("Time to create splits to mem: " + sw.elapsedMillis());
    }


    List<Event> events = Lists.newArrayListWithCapacity(inputSplitInfo
        .getNumTasks() + 1);
    
    RootInputConfigureVertexTasksEvent configureVertexEvent = new RootInputConfigureVertexTasksEvent(
        inputSplitInfo.getNumTasks(), inputSplitInfo.getTaskLocationHints());
    events.add(configureVertexEvent);


    if (sendSerializedEvents) {
      MRSplitsProto splitsProto = inputSplitInfo.getSplitsProto();
      int count = 0;
      for (MRSplitProto mrSplit : splitsProto.getSplitsList()) {
        // Unnecessary array copy, can be avoided by using ByteBuffer instead of a raw array.
        RootInputDataInformationEvent diEvent = new RootInputDataInformationEvent(count++,
            mrSplit.toByteArray());
        events.add(diEvent);
      }
    } else {
      int count = 0;
      if (inputSplitInfo.holdsNewFormatSplits()) {
        for (org.apache.hadoop.mapreduce.InputSplit split : inputSplitInfo.getNewFormatSplits()) {
          RootInputDataInformationEvent diEvent = new RootInputDataInformationEvent(count++, split);
          events.add(diEvent);
        }
      } else {
        for (org.apache.hadoop.mapred.InputSplit split : inputSplitInfo.getOldFormatSplits()) {
          RootInputDataInformationEvent diEvent = new RootInputDataInformationEvent(count++, split);
          events.add(diEvent);
        }
      }
    }

View Full Code Here

    MapWork work = Utilities.getMapWork(jobConf);


    // perform dynamic partition pruning
    pruner.prune(work, jobConf, context);


    InputSplitInfoMem inputSplitInfo = null;
    String realInputFormatName = conf.get("mapred.input.format.class");
    boolean groupingEnabled = userPayloadProto.getGroupingEnabled();
    if (groupingEnabled) {
      // Need to instantiate the realInputFormat
      InputFormat<?, ?> inputFormat =
          (InputFormat<?, ?>) ReflectionUtils.newInstance(Class.forName(realInputFormatName),
              jobConf);


      int totalResource = rootInputContext.getTotalAvailableResource().getMemory();
      int taskResource = rootInputContext.getVertexTaskResource().getMemory();
      int availableSlots = totalResource / taskResource;


      // Create the un-grouped splits
      float waves =
          conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES,
              TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);


      InputSplit[] splits = inputFormat.getSplits(jobConf, (int) (availableSlots * waves));
      LOG.info("Number of input splits: " + splits.length + ". " + availableSlots
          + " available slots, " + waves + " waves. Input format is: " + realInputFormatName);


      Multimap<Integer, InputSplit> groupedSplits =
          generateGroupedSplits(jobConf, conf, splits, waves, availableSlots);
      // And finally return them in a flat array
      InputSplit[] flatSplits = groupedSplits.values().toArray(new InputSplit[0]);
      LOG.info("Number of grouped splits: " + flatSplits.length);


      List<TaskLocationHint> locationHints = grouper.createTaskLocationHints(flatSplits);


      Utilities.clearWork(jobConf);


      inputSplitInfo =
          new InputSplitInfoMem(flatSplits, locationHints, flatSplits.length, null, jobConf);
    } else {
      // no need for grouping and the target #of tasks.
      // This code path should never be triggered at the moment. If grouping is disabled,
      // DAGUtils uses MRInputAMSplitGenerator.
      // If this is used in the future - make sure to disable grouping in the payload, if it isn't already disabled

View Full Code Here

TOP

Related Classes of org.apache.tez.mapreduce.hadoop.InputSplitInfoMem

org.apache.hadoop.hive.ql.exec.tez.HiveSplitGenerator

org.apache.hadoop.io.serializer.SerializationFactory

org.apache.tez.mapreduce.common.MRInputAMSplitGenerator

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.