}
/**
 * Builds the join DAG: two input vertices (one hashed, one streamed) feeding a
 * single join vertex whose output is written as text.
 *
 * @param tezConf base configuration cloned for each input/output descriptor
 * @param streamPath input path for the streamed (probe) side
 * @param hashPath input path for the hashed (build) side
 * @param outPath output path for the join result
 * @param numPartitions parallelism of the join vertex
 * @param doBroadcast when true, broadcast the entire hash side to every join
 *        task instead of hash-partitioning it
 * @return the assembled, un-submitted DAG
 * @throws IOException if an input/output descriptor cannot be created
 */
private DAG createDag(TezConfiguration tezConf, Path streamPath, Path hashPath, Path outPath,
    int numPartitions, boolean doBroadcast) throws IOException {
  DAG dag = DAG.create(doBroadcast ? "JoinExample-WithBroadcast" : "JoinExample");

  /*
   * Build side of the join: its records are accumulated into a hash table by
   * the join tasks. Text data is read through TextInputFormat and
   * ForwardingProcessor simply passes each record downstream unchanged.
   */
  Vertex buildSideVertex = Vertex.create(hashSide,
      ProcessorDescriptor.create(ForwardingProcessor.class.getName()));
  buildSideVertex.addDataSource(inputFile,
      MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class,
          hashPath.toUri().toString()).groupSplits(false).build());

  /*
   * Probe side of the join: its records are streamed past the hash table built
   * from the other side. Same TextInputFormat + ForwardingProcessor setup.
   */
  Vertex probeSideVertex = Vertex.create(streamingSide,
      ProcessorDescriptor.create(ForwardingProcessor.class.getName()));
  probeSideVertex.addDataSource(inputFile,
      MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class,
          streamPath.toUri().toString()).groupSplits(false).build());

  /*
   * The join itself: JoinProcessor matches the streamed side against the
   * hashed side, load-balanced across numPartitions tasks, and emits the
   * result as text via TextOutputFormat.
   */
  Vertex joinVertex = Vertex.create(joiner,
      ProcessorDescriptor.create(JoinProcessor.class.getName()), numPartitions);
  joinVertex.addDataSink(joinOutput,
      MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class,
          outPath.toUri().toString()).build());

  /*
   * Hash-partition the streamed side so that equal keys land in the same
   * fragment. The joined datum is the key itself, hence a null value type.
   * The fragment count is inferred from the join vertex's task count, one
   * fragment per task.
   */
  UnorderedPartitionedKVEdgeConfig partitionedEdgeConf = UnorderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), NullWritable.class.getName(),
          HashPartitioner.class.getName())
      .build();

  // Stream side always reaches the join vertex via the partitioned edge.
  Edge streamEdge =
      Edge.create(probeSideVertex, joinVertex, partitionedEdgeConf.createDefaultEdgeProperty());

  EdgeProperty hashSideEdgeProperty;
  if (doBroadcast) {
    /*
     * Broadcast mode, suitable when the hash side is small: ship the complete
     * hash-side output to every fragment of the streamed side, avoiding a
     * repartition and its network transfer. (The stream side is still
     * partitioned in both modes for brevity of this example.) An unpartitioned
     * edge carries key-only records, so the value type is again null.
     */
    UnorderedKVEdgeConfig broadcastEdgeConf = UnorderedKVEdgeConfig
        .newBuilder(Text.class.getName(), NullWritable.class.getName())
        .build();
    hashSideEdgeProperty = broadcastEdgeConf.createDefaultBroadcastEdgeProperty();
  } else {
    /*
     * Partitioned mode: hash-partition the hash side with the same scheme as
     * the stream side, so each join task receives matching fragments from both
     * inputs and can join them locally.
     */
    hashSideEdgeProperty = partitionedEdgeConf.createDefaultEdgeProperty();
  }

  // The join vertex has two upstream producers: the stream and hash sides.
  Edge hashEdge = Edge.create(buildSideVertex, joinVertex, hashSideEdgeProperty);

  // Wire everything into the DAG.
  dag.addVertex(probeSideVertex)
      .addVertex(buildSideVertex)
      .addVertex(joinVertex)
      .addEdge(streamEdge)
      .addEdge(hashEdge);
  return dag;
}