Vertex map = null;
// use tez to combine splits
boolean groupSplitsInInputInitializer;
DataSourceDescriptor dataSource;
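// -1 lets the vertex parallelism be decided by AM-side split generation; it is set explicitly
// below when splits are generated on the client.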
int numTasks = -1;
@SuppressWarnings("rawtypes")
Class inputFormatClass = conf.getClass("mapred.input.format.class",
    InputFormat.class);
boolean vertexHasCustomInput = VertexType.isCustomInputType(vertexType);
LOG.info("Vertex has custom input? " + vertexHasCustomInput);
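// A custom-input vertex (e.g. bucket map join / SMB join) receives multiple splits per task and
// groups them at runtime in its custom vertex manager, so grouping is skipped in the initializer.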
if (vertexHasCustomInput) {
  groupSplitsInInputInitializer = false;
  // Grouping happens at execution time; the input payload should not enable grouping here,
  // it will be enabled by the custom vertex instead.
  inputFormatClass = HiveInputFormat.class;
  conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
  // mapreduce.tez.input.initializer.serialize.event.payload must be set to false when using
  // this plug-in, to avoid getting a serialized event at run-time.
  conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
} else {
  // we'll let tez combine splits for us iff the input format is HiveInputFormat
  groupSplitsInInputInitializer = (inputFormatClass == HiveInputFormat.class);
}
if (mapWork instanceof MergeFileWork) {
  Path outputPath = ((MergeFileWork) mapWork).getOutputDir();
  // prepare the tmp output directory; it must exist before jobClose
  // (which renames it after job completion)
  Path tempOutPath = Utilities.toTempPath(outputPath);
  try {
    if (!fs.exists(tempOutPath)) {
      fs.mkdirs(tempOutPath);
    }
  } catch (IOException e) {
    // keep the underlying cause so the failure is diagnosable
    throw new RuntimeException("Can't make path " + outputPath + ": " + e.getMessage(), e);
  }
}
// remember mapping of plan to input, so the MapWork for this input can be
// looked up by name at runtime
conf.set(Utilities.INPUT_NAME, mapWork.getName());
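// HIVE_AM_SPLIT_GENERATION (hive.compute.splits.in.am): generate splits in the Tez AM via an
// input initializer; otherwise fall back to client-side split generation below.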
if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)
    && !mapWork.isUseOneNullRowInputFormat()) {
  // set up the operator plan. (before setting up splits on the AM)
  Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
  // if we're generating the splits in the AM, we just need to set
  // the correct plugin.
  if (groupSplitsInInputInitializer) {
    // Not setting a payload, since the MRInput payload is the same and can be accessed.
    InputInitializerDescriptor descriptor =
        InputInitializerDescriptor.create(HiveSplitGenerator.class.getName());
    dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(true)
        .setCustomInitializerDescriptor(descriptor).build();
  } else {
    // Not HiveInputFormat, or a custom VertexManager will take care of grouping splits
    if (vertexHasCustomInput) {
      dataSource =
          MultiMRInput.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
    } else {
      dataSource =
          MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
    }
  }
} else {
  // Set up client-side split generation.
  dataSource = MRInputHelpers.configureMRInputWithLegacySplitGeneration(conf,
      new Path(tezDir, "split_" + mapWork.getName().replaceAll(" ", "_")), true);
  numTasks = dataSource.getNumberOfShards();
  // set up the operator plan. (after generating splits - that changes configs)
  Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
}
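// At this point dataSource describes how the vertex reads its input; numTasks stays -1 when the
// AM generates the splits and equals the number of generated splits otherwise.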