Package org.apache.tez.dag.api

Examples of org.apache.tez.dag.api.Vertex


        TextOutputFormat.class, outputPath).build();

    // Create a vertex that reads the data from the data source and tokenizes it using the
    // TokenProcessor. The number of tasks that will do the work for this vertex will be decided
    // using the information provided by the data source descriptor.
    Vertex tokenizerVertex = Vertex.create(TOKENIZER, ProcessorDescriptor.create(
        TokenProcessor.class.getName())).addDataSource(INPUT, dataSource);

    // Create the edge that represents the movement and semantics of data between the producer
    // Tokenizer vertex and the consumer Summation vertex. In order to perform the summation in
    // parallel the tokenized data will be partitioned by word such that a given word goes to the
    // same partition. The counts for the words should be grouped together per word. To achieve this
    // we can use an edge that contains an input/output pair that handles partitioning and grouping
    // of key value data. We use the helper OrderedPartitionedKVEdgeConfig to create such an
    // edge. Internally, it sets up matching Tez inputs and outputs that can perform this logic.
    // We specify the key, value and partitioner type. Here the key type is Text (for word), the
    // value type is IntWritable (for count) and we using a hash based partitioner. This is a helper
    // object. The edge can be configured by configuring the input, output etc individually without
    // using this helper.
    OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
        .newBuilder(Text.class.getName(), IntWritable.class.getName(),
            HashPartitioner.class.getName()).build();

    // Create a vertex that reads the tokenized data and calculates the sum using the SumProcessor.
    // The number of tasks that do the work of this vertex depends on the number of partitions used
    // to distribute the sum processing. In this case, its been made configurable via the
    // numPartitions parameter.
    Vertex summationVertex = Vertex.create(SUMMATION,
        ProcessorDescriptor.create(SumProcessor.class.getName()), numPartitions)
        .addDataSink(OUTPUT, dataSink);

    // No need to add jar containing this class as assumed to be part of the Tez jars. Otherwise
    // we would have to add the jars for this code as local files to the vertices.
View Full Code Here


    /**
     * This vertex represents the side of the join that will be accumulated in a hash
     * table in order to join it against the other side. It reads text data using the
     * TextInputFormat. ForwardingProcessor simply forwards the data downstream as is.
     */
    Vertex hashFileVertex = Vertex.create(hashSide, ProcessorDescriptor.create(
        ForwardingProcessor.class.getName())).addDataSource(
        inputFile,
        MRInput
            .createConfigBuilder(new Configuration(tezConf), TextInputFormat.class,
                hashPath.toUri().toString()).groupSplits(false).build());

    /**
     * This vertex represents that side of the data that will be streamed and joined
     * against the other side that has been accumulated into a hash table. It reads
     * text data using the TextInputFormat. ForwardingProcessor simply forwards the data
     * downstream as is.
     */
    Vertex streamFileVertex = Vertex.create(streamingSide, ProcessorDescriptor.create(
        ForwardingProcessor.class.getName())).addDataSource(
        inputFile,
        MRInput
            .createConfigBuilder(new Configuration(tezConf), TextInputFormat.class,
                streamPath.toUri().toString()).groupSplits(false).build());

    /**
     * This vertex represents the join operation. It writes the join output as text using
     * the TextOutputFormat. The JoinProcessor is going to perform the join of the
     * streaming side and the hash side. It is load balanced across numPartitions
     */
    Vertex joinVertex = Vertex.create(joiner, ProcessorDescriptor.create(
        JoinProcessor.class.getName()), numPartitions).addDataSink(joinOutput,
        MROutput.createConfigBuilder(new Configuration(tezConf),
            TextOutputFormat.class, outPath.toUri().toString()).build());

    /**
 
View Full Code Here

        TextInputFormat.class, inputPath).build();

    DataSinkDescriptor dataSink = MROutput.createConfigBuilder(new Configuration(tezConf),
        TextOutputFormat.class, outputPath).build();

    Vertex tokenizerVertex = Vertex.create(TOKENIZER, ProcessorDescriptor.create(
        TokenProcessor.class.getName()));
    tokenizerVertex.addDataSource(INPUT, dataSource);

    // Use Text key and IntWritable value to bring counts for each word in the same partition
    OrderedPartitionedKVEdgeConfig summationEdgeConf = OrderedPartitionedKVEdgeConfig
        .newBuilder(Text.class.getName(), IntWritable.class.getName(),
            HashPartitioner.class.getName()).build();

    // This vertex will be reading intermediate data via an input edge and writing intermediate data
    // via an output edge.
    Vertex summationVertex = Vertex.create(SUMMATION, ProcessorDescriptor.create(
        SumProcessor.class.getName()), numPartitions);
   
    // Use IntWritable key and Text value to bring all words with the same count in the same
    // partition. The data will be ordered by count and words grouped by count.
    OrderedPartitionedKVEdgeConfig sorterEdgeConf = OrderedPartitionedKVEdgeConfig
        .newBuilder(IntWritable.class.getName(), Text.class.getName(),
            HashPartitioner.class.getName()).build();

    // Use 1 task to bring all the data in one place for global sorted order. Essentially the number
    // of partitions is 1. So the NoOpSorter can be used to produce the globally ordered output
    Vertex sorterVertex = Vertex.create(SORTER, ProcessorDescriptor.create(
        NoOpSorter.class.getName()), 1);
    sorterVertex.addDataSink(OUTPUT, dataSink);

    // No need to add jar containing this class as assumed to be part of the tez jars.
   
    DAG dag = DAG.create(dagName);
    dag.addVertex(tokenizerVertex)
View Full Code Here

    long largeOutSizePerTask = largeOutSize / numTasks;
    long smallOutSizePerTask = smallOutSize / numTasks;

    DAG dag = DAG.create("JoinDataGen");

    Vertex genDataVertex = Vertex.create("datagen", ProcessorDescriptor.create(
        GenDataProcessor.class.getName()).setUserPayload(
        UserPayload.create(ByteBuffer.wrap(GenDataProcessor.createConfiguration(largeOutSizePerTask,
            smallOutSizePerTask)))), numTasks);
    genDataVertex.addDataSink(STREAM_OUTPUT_NAME,
        MROutput.createConfigBuilder(new Configuration(tezConf),
            TextOutputFormat.class, largeOutPath.toUri().toString()).build());
    genDataVertex.addDataSink(HASH_OUTPUT_NAME,
        MROutput.createConfigBuilder(new Configuration(tezConf),
            TextOutputFormat.class, smallOutPath.toUri().toString()).build());
    genDataVertex.addDataSink(EXPECTED_OUTPUT_NAME,
        MROutput.createConfigBuilder(new Configuration(tezConf),
            TextOutputFormat.class, expectedOutputPath.toUri().toString()).build());

    dag.addVertex(genDataVertex);
View Full Code Here

  }
 
  @Test (timeout=60000)
  public void testBasicSuccessBroadcast() throws Exception {
    DAG dag = DAG.create("testBasicSuccessBroadcast");
    Vertex v1 =
        Vertex.create("v1", TestProcessor.getProcDesc(null), 2, SimpleTestDAG.defaultResource);
    Vertex v2 =
        Vertex.create("v2", TestProcessor.getProcDesc(null), 2, SimpleTestDAG.defaultResource);
    dag.addVertex(v1).addVertex(v2).addEdge(Edge.create(v1, v2,
        EdgeProperty.create(DataMovementType.BROADCAST,
            DataSourceType.PERSISTED,
            SchedulingType.SEQUENTIAL,
View Full Code Here

   
    String mockLR1Name = "LR1";
    Map<String, LocalResource> lrDAG = Collections.singletonMap(mockLR1Name, LocalResource
        .newInstance(URL.newInstance("file:///", "localhost", 0, "test"), LocalResourceType.FILE,
            LocalResourceVisibility.PUBLIC, 1, 1));
    Vertex vertex = Vertex.create("Vertex", ProcessorDescriptor.create("P"), 1,
        Resource.newInstance(1, 1));
    DAG dag = DAG.create("DAG").addVertex(vertex).addTaskLocalFiles(lrDAG);
    DAGClient dagClient = client.submitDAG(dag);
   
    // verify that both DAG and TezClient localResources are added to the vertex
    Map<String, LocalResource> vertexLR = vertex.getTaskLocalFiles();
    Assert.assertTrue(vertexLR.containsKey(mockLR1Name));
   
    Assert.assertTrue(dagClient.getExecutionContext().contains(client.mockAppId.toString()));
   
    if (isSession) {
View Full Code Here

      dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(mapStageConf, stagingDir, true);
    } else {
      dsd = MRInputLegacy.createConfigBuilder(mapStageConf, TextInputFormat.class, inputPath).build();
    }

    Vertex mapVertex = Vertex.create("initialmap", ProcessorDescriptor.create(
        MapProcessor.class.getName()).setUserPayload(
        TezUtils.createUserPayloadFromConf(mapStageConf))
        .setHistoryText(mapStageHistoryText)).addTaskLocalFiles(commonLocalResources);
    mapVertex.addDataSource("MRInput", dsd);
    vertices.add(mapVertex);

    ByteArrayOutputStream iROutputStream = new ByteArrayOutputStream(4096);
    iReduceStageConf.writeXml(iROutputStream);
    String iReduceStageHistoryText = new String(iROutputStream.toByteArray(), "UTF-8");
    Vertex ivertex = Vertex.create("intermediate_reducer", ProcessorDescriptor.create(
        ReduceProcessor.class.getName())
        .setUserPayload(TezUtils.createUserPayloadFromConf(iReduceStageConf))
        .setHistoryText(iReduceStageHistoryText), 2);
    ivertex.addTaskLocalFiles(commonLocalResources);
    vertices.add(ivertex);

    ByteArrayOutputStream finalReduceOutputStream = new ByteArrayOutputStream(4096);
    finalReduceConf.writeXml(finalReduceOutputStream);
    String finalReduceStageHistoryText = new String(finalReduceOutputStream.toByteArray(), "UTF-8");
    UserPayload finalReducePayload = TezUtils.createUserPayloadFromConf(finalReduceConf);
    Vertex finalReduceVertex = Vertex.create("finalreduce",
        ProcessorDescriptor.create(
            ReduceProcessor.class.getName())
            .setUserPayload(finalReducePayload)
            .setHistoryText(finalReduceStageHistoryText), 1);
    finalReduceVertex.addTaskLocalFiles(commonLocalResources);
    finalReduceVertex.addDataSink("MROutput",
        MROutputLegacy.createConfigBuilder(finalReduceConf, TextOutputFormat.class, outputPath)
            .build());
    vertices.add(finalReduceVertex);

    DAG dag = DAG.create("OrderedWordCount" + dagIndex);
View Full Code Here

   
    UserPayload mapUserPayload = TezUtils.createUserPayloadFromConf(mapStageConf);
    int numTasks = generateSplitsInAM ? -1 : numMapper;

    Vertex mapVertex = Vertex.create("map", ProcessorDescriptor.create(
        MapProcessor.class.getName()).setUserPayload(mapUserPayload), numTasks)
        .addTaskLocalFiles(commonLocalResources);
    mapVertex.addDataSource("MRInput", dataSource);
    vertices.add(mapVertex);

    if (iReduceStagesCount > 0
        && numIReducer > 0) {
      for (int i = 0; i < iReduceStagesCount; ++i) {
        Configuration iconf =
            intermediateReduceStageConfs[i];
        UserPayload iReduceUserPayload = TezUtils.createUserPayloadFromConf(iconf);
        Vertex ivertex = Vertex.create("ireduce" + (i + 1),
            ProcessorDescriptor.create(ReduceProcessor.class.getName()).
                setUserPayload(iReduceUserPayload), numIReducer);
        ivertex.addTaskLocalFiles(commonLocalResources);
        vertices.add(ivertex);
      }
    }

    Vertex finalReduceVertex = null;
    if (numReducer > 0) {
      UserPayload reducePayload = TezUtils.createUserPayloadFromConf(finalReduceConf);
      finalReduceVertex = Vertex.create("reduce", ProcessorDescriptor.create(
          ReduceProcessor.class.getName()).setUserPayload(reducePayload), numReducer);
      finalReduceVertex.addTaskLocalFiles(commonLocalResources);
      finalReduceVertex.addDataSink("MROutput", MROutputLegacy.createConfigBuilder(finalReduceConf,
          NullOutputFormat.class).build());
      vertices.add(finalReduceVertex);
    } else {
      // Map only job
      mapVertex.addDataSink("MROutput",
View Full Code Here

    stage2Conf.set(FileOutputFormat.OUTDIR, outputPath);
    stage2Conf.setBoolean("mapred.mapper.new-api", false);

    UserPayload stage1Payload = TezUtils.createUserPayloadFromConf(stage1Conf);
    // Setup stage1 Vertex
    Vertex stage1Vertex = Vertex.create("stage1", ProcessorDescriptor.create(
        FilterByWordInputProcessor.class.getName()).setUserPayload(stage1Payload))
        .addTaskLocalFiles(commonLocalResources);

    DataSourceDescriptor dsd;
    if (generateSplitsInClient) {
      // TODO TEZ-1406. Dont' use MRInputLegacy
      stage1Conf.set(FileInputFormat.INPUT_DIR, inputPath);
      stage1Conf.setBoolean("mapred.mapper.new-api", false);
      dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(stage1Conf, stagingDir, true);
    } else {
      dsd = MRInputLegacy.createConfigBuilder(stage1Conf, TextInputFormat.class, inputPath)
          .groupSplits(false).build();
    }
    stage1Vertex.addDataSource("MRInput", dsd);

    // Setup stage2 Vertex
    Vertex stage2Vertex = Vertex.create("stage2", ProcessorDescriptor.create(
        FilterByWordOutputProcessor.class.getName()).setUserPayload(TezUtils
        .createUserPayloadFromConf(stage2Conf)), dsd.getNumberOfShards());
    stage2Vertex.addTaskLocalFiles(commonLocalResources);

    // Configure the Output for stage2
    stage2Vertex.addDataSink(
        "MROutput",
        new DataSinkDescriptor(OutputDescriptor.create(MROutput.class.getName())
            .setUserPayload(TezUtils.createUserPayloadFromConf(stage2Conf)),
            OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null));
View Full Code Here

    inputConf.set("mapred.input.format.class", TextInputFormat.class.getName());
    inputConf.set(FileInputFormat.INPUT_DIR, inputPath);
    MRInput.MRInputConfigBuilder configurer = MRInput.createConfigBuilder(inputConf, null);
    DataSourceDescriptor dataSource = configurer.generateSplitsInAM(false).build();

    Vertex mapVertex1 = Vertex.create("map1", ProcessorDescriptor.create(
        TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

    Vertex mapVertex2 = Vertex.create("map2", ProcessorDescriptor.create(
        TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

    Vertex mapVertex3 = Vertex.create("map3", ProcessorDescriptor.create(
        TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

    Vertex checkerVertex = Vertex.create("checker", ProcessorDescriptor.create(
        UnionProcessor.class.getName()), 1);

    Configuration outputConf = new Configuration(tezConf);
    outputConf.setBoolean("mapred.reducer.new-api", false);
    outputConf.set("mapred.output.format.class", TextOutputFormat.class.getName());
    outputConf.set(FileOutputFormat.OUTDIR, outputPath);
    DataSinkDescriptor od = MROutput.createConfigBuilder(outputConf, null).build();
    checkerVertex.addDataSink("union", od);
   

    Configuration allPartsConf = new Configuration(tezConf);
    DataSinkDescriptor od2 = MROutput.createConfigBuilder(allPartsConf,
        TextOutputFormat.class, outputPath + "-all-parts").build();
    checkerVertex.addDataSink("all-parts", od2);

    Configuration partsConf = new Configuration(tezConf);   
    DataSinkDescriptor od1 = MROutput.createConfigBuilder(partsConf,
        TextOutputFormat.class, outputPath + "-parts").build();
    VertexGroup unionVertex = dag.createVertexGroup("union", mapVertex1, mapVertex2);
View Full Code Here

TOP

Related Classes of org.apache.tez.dag.api.Vertex

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.