Package org.apache.tez.dag.api

Examples of org.apache.tez.dag.api.Vertex
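The snippets below are collected from Hive, Pig, and Tez's own tests. As a minimal, self-contained sketch of the pattern they all share (assuming the Tez 0.5+ API; the processor class names are placeholders, while the shuffle input/output classes come from the Tez runtime library), a Vertex pairs a ProcessorDescriptor with a parallelism and a task Resource, and an Edge wires two vertices into a DAG:

import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.Edge;
import org.apache.tez.dag.api.EdgeProperty;
import org.apache.tez.dag.api.EdgeProperty.DataMovementType;
import org.apache.tez.dag.api.EdgeProperty.DataSourceType;
import org.apache.tez.dag.api.EdgeProperty.SchedulingType;
import org.apache.tez.dag.api.InputDescriptor;
import org.apache.tez.dag.api.OutputDescriptor;
import org.apache.tez.dag.api.ProcessorDescriptor;
import org.apache.tez.dag.api.Vertex;

public class SimpleVertexExample {
  // "com.example.MyMapProcessor" / "com.example.MyReduceProcessor" are hypothetical
  // processor class names used only for illustration.
  public static DAG buildDag() {
    Resource taskResource = Resource.newInstance(1024, 1);
    Vertex producer = Vertex.create("producer",
        ProcessorDescriptor.create("com.example.MyMapProcessor"), 2, taskResource);
    Vertex consumer = Vertex.create("consumer",
        ProcessorDescriptor.create("com.example.MyReduceProcessor"), 1, taskResource);

    // A scatter-gather edge: producer output is partitioned and shuffled to the consumer.
    EdgeProperty shuffle = EdgeProperty.create(
        DataMovementType.SCATTER_GATHER,
        DataSourceType.PERSISTED,
        SchedulingType.SEQUENTIAL,
        OutputDescriptor.create("org.apache.tez.runtime.library.output.OrderedPartitionedKVOutput"),
        InputDescriptor.create("org.apache.tez.runtime.library.input.OrderedGroupedKVInput"));

    return DAG.create("simple-dag")
        .addVertex(producer)
        .addVertex(consumer)
        .addEdge(Edge.create(producer, consumer, shuffle));
  }
}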


          dag.addEdge(e);
        }
      } else {
        // Regular vertices
        JobConf wxConf = utils.initializeVertexConf(conf, w);
        Vertex wx = utils.createVertex(wxConf, w, scratchDir, appJarLr,
          additionalLr, fs, ctx, !isFinal, work);
        dag.addVertex(wx);
        utils.addCredentials(w, dag);
        perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_CREATE_VERTEX + w.getName());
        workToVertex.put(w, wx);
View Full Code Here


    // Tez asks us to call this even if there's no preceding vertex
    MultiStageMRConfToTezTranslator.translateVertexConfToTez(conf, null);

    // finally create the vertex
    Vertex map = null;

    // use tez to combine splits
    boolean useTezGroupedSplits = false;

    int numTasks = -1;
    Class amSplitGeneratorClass = null;
    InputSplitInfo inputSplitInfo = null;
    Class inputFormatClass = conf.getClass("mapred.input.format.class",
        InputFormat.class);

    boolean vertexHasCustomInput = false;
    if (tezWork != null) {
      for (BaseWork baseWork : tezWork.getParents(mapWork)) {
        if (tezWork.getEdgeType(baseWork, mapWork) == EdgeType.CUSTOM_EDGE) {
          vertexHasCustomInput = true;
        }
      }
    }
    if (vertexHasCustomInput) {
      useTezGroupedSplits = false;
      // grouping happens in the execution phase. Setting the class to TezGroupedSplitsInputFormat
      // here would cause premature grouping, which would be incorrect.
      inputFormatClass = HiveInputFormat.class;
      conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
      // mapreduce.tez.input.initializer.serialize.event.payload should be set to false when using
      // this plug-in to avoid getting a serialized event at run-time.
      conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
    } else {
      // we'll set up tez to combine splits for us iff the input format
      // is HiveInputFormat
      if (inputFormatClass == HiveInputFormat.class) {
        useTezGroupedSplits = true;
        conf.setClass("mapred.input.format.class", TezGroupedSplitsInputFormat.class, InputFormat.class);
      }
    }

    if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)) {
      // if we're generating the splits in the AM, we just need to set
      // the correct plugin.
      amSplitGeneratorClass = MRInputAMSplitGenerator.class;
    } else {
      // client side split generation means we have to compute them now
      inputSplitInfo = MRHelpers.generateInputSplits(conf,
          new Path(tezDir, "split_"+mapWork.getName().replaceAll(" ", "_")));
      numTasks = inputSplitInfo.getNumTasks();
    }

    byte[] serializedConf = MRHelpers.createUserPayloadFromConf(conf);
    map = new Vertex(mapWork.getName(),
        new ProcessorDescriptor(MapTezProcessor.class.getName()).
        setUserPayload(serializedConf), numTasks, getContainerResource(conf));
    Map<String, String> environment = new HashMap<String, String>();
    MRHelpers.updateEnvironmentForMRTasks(conf, environment, true);
    map.setTaskEnvironment(environment);
    map.setJavaOpts(getContainerJavaOpts(conf));

    assert mapWork.getAliasToWork().keySet().size() == 1;

    String alias = mapWork.getAliasToWork().keySet().iterator().next();

    byte[] mrInput = null;
    if (useTezGroupedSplits) {
      mrInput = MRHelpers.createMRInputPayloadWithGrouping(serializedConf,
          HiveInputFormat.class.getName());
    } else {
      mrInput = MRHelpers.createMRInputPayload(serializedConf, null);
    }
    map.addInput(alias,
        new InputDescriptor(MRInputLegacy.class.getName()).
        setUserPayload(mrInput), amSplitGeneratorClass);

    Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
    localResources.put(getBaseName(appJarLr), appJarLr);
    for (LocalResource lr: additionalLr) {
      localResources.put(getBaseName(lr), lr);
    }

    if (inputSplitInfo != null) {
      // only relevant for client-side split generation
      map.setTaskLocationsHint(inputSplitInfo.getTaskLocationHints());
      MRHelpers.updateLocalResourcesForInputSplits(FileSystem.get(conf), inputSplitInfo,
          localResources);
    }

    map.setTaskLocalResources(localResources);
    return map;
  }
View Full Code Here

    // Call once here, will be updated when we find edges
    MultiStageMRConfToTezTranslator.translateVertexConfToTez(conf, null);

    // create the vertex
    Vertex reducer = new Vertex(reduceWork.getName(),
        new ProcessorDescriptor(ReduceTezProcessor.class.getName()).
        setUserPayload(MRHelpers.createUserPayloadFromConf(conf)),
        reduceWork.getNumReduceTasks(), getContainerResource(conf));

    Map<String, String> environment = new HashMap<String, String>();

    MRHelpers.updateEnvironmentForMRTasks(conf, environment, false);
    reducer.setTaskEnvironment(environment);

    reducer.setJavaOpts(getContainerJavaOpts(conf));

    Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
    localResources.put(getBaseName(appJarLr), appJarLr);
    for (LocalResource lr: additionalLr) {
      localResources.put(getBaseName(lr), lr);
    }
    reducer.setTaskLocalResources(localResources);

    return reducer;
  }
View Full Code Here
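The two Hive snippets above target the pre-0.5 Tez API (the Vertex constructor, setJavaOpts, setTaskLocalResources). A rough sketch of the same setup against the 0.5+ fluent API used by the later snippets on this page, with a placeholder processor name, heap option, environment entry, and resource size:

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.tez.dag.api.ProcessorDescriptor;
import org.apache.tez.dag.api.UserPayload;
import org.apache.tez.dag.api.Vertex;

public class FluentVertexSketch {
  public static Vertex newReduceVertex(UserPayload payload, int numReducers,
      Map<String, LocalResource> localResources) {
    Map<String, String> environment = new HashMap<String, String>();
    environment.put("EXAMPLE_ENV", "value"); // placeholder environment entry

    // Vertex.create replaces the old constructor; the setters can be chained.
    return Vertex.create("reducer",
            ProcessorDescriptor.create("com.example.MyReduceProcessor").setUserPayload(payload),
            numReducers, Resource.newInstance(1024, 1))
        .setTaskEnvironment(environment)
        .setTaskLaunchCmdOpts("-Xmx819m")    // replaces setJavaOpts
        .addTaskLocalFiles(localResources);  // replaces setTaskLocalResources
  }
}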

  public Vertex createVertex(JobConf conf, BaseWork work,
      Path scratchDir, LocalResource appJarLr,
      List<LocalResource> additionalLr,
      FileSystem fileSystem, Context ctx, boolean hasChildren, TezWork tezWork) throws Exception {

    Vertex v = null;
    // simply dispatch the call to the right method for the actual (sub-) type of
    // BaseWork.
    if (work instanceof MapWork) {
      v = createVertex(conf, (MapWork) work, appJarLr,
          additionalLr, fileSystem, scratchDir, ctx, tezWork);
    } else if (work instanceof ReduceWork) {
      v = createVertex(conf, (ReduceWork) work, appJarLr,
          additionalLr, fileSystem, scratchDir, ctx);
    } else {
      // something is seriously wrong if this is happening
      throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
    }

    // initialize stats publisher if necessary
    if (work.isGatheringStats()) {
      StatsPublisher statsPublisher;
      StatsFactory factory = StatsFactory.newFactory(conf);
      if (factory != null) {
        statsPublisher = factory.getStatsPublisher();
        if (!statsPublisher.init(conf)) { // creating stats table if not exists
          if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
            throw
              new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
          }
        }
      }
    }


    // final vertices need to have at least one output
    if (!hasChildren) {
      v.addOutput("out_"+work.getName(),
          new OutputDescriptor(MROutput.class.getName())
          .setUserPayload(MRHelpers.createUserPayloadFromConf(conf)));
    }

    return v;
View Full Code Here
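The snippet above gives a leaf vertex its MROutput through the older addOutput call. With the 0.5+ API shown further down the page, inputs and outputs are wired through addDataSource/addDataSink; a hedged sketch using the MRInput/MROutput config builders from tez-mapreduce, with placeholder paths and the standard text formats:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.tez.dag.api.DataSinkDescriptor;
import org.apache.tez.dag.api.DataSourceDescriptor;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.mapreduce.input.MRInput;
import org.apache.tez.mapreduce.output.MROutput;

import java.io.IOException;

public class MRInputOutputSketch {
  public static void wireIO(Vertex root, Vertex leaf, Configuration conf) throws IOException {
    // Placeholder paths; in real code these come from the job configuration.
    DataSourceDescriptor in = MRInput.createConfigBuilder(conf, TextInputFormat.class,
        "/tmp/example/input").build();
    DataSinkDescriptor out = MROutput.createConfigBuilder(conf, TextOutputFormat.class,
        "/tmp/example/output").build();
    root.addDataSource("MRInput", in);  // initial vertex reads through MRInput
    leaf.addDataSink("MROutput", out);  // final vertex writes through MROutput
  }
}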

    public void visitTezOp(TezOperator tezOp) throws VisitorException {
        TezOperPlan tezPlan = getPlan();
        List<TezOperator> predecessors = tezPlan.getPredecessors(tezOp);

        // Construct vertex for the current Tez operator
        Vertex to = null;
        try {
            if (!tezOp.isVertexGroup()) {
                to = newVertex(tezOp);
                dag.addVertex(to);
            } else {
                // For union, we construct VertexGroup after iterating the
                // predecessors.
            }
        } catch (Exception e) {
            throw new VisitorException("Cannot create vertex for "
                    + tezOp.name(), e);
        }

        // Connect the new vertex with predecessor vertices
        if (predecessors != null) {
            Vertex[] groupMembers = new Vertex[predecessors.size()];

            for (int i = 0; i < predecessors.size(); i++) {
                // Since this is a dependency order walker, predecessor vertices
                // must have already been created.
                TezOperator pred = predecessors.get(i);
                try {
                    if (pred.isVertexGroup()) {
                        VertexGroup from = pred.getVertexGroupInfo().getVertexGroup();
                        // The plan of a vertex group is empty, so we create the Edge based on
                        // the plan of one of the vertex group members.
                        // Both the vertex group and its members reference the same
                        // EdgeDescriptor object to the successor.
                        GroupInputEdge edge = newGroupInputEdge(
                                getPlan().getOperator(pred.getVertexGroupMembers().get(0)), tezOp, from, to);
                        dag.addEdge(edge);
                    } else {
                        Vertex from = dag.getVertex(pred.getOperatorKey().toString());
                        if (tezOp.isVertexGroup()) {
                            groupMembers[i] = from;
                        } else {
                            EdgeProperty prop = newEdge(pred, tezOp);
                            Edge edge = Edge.create(from, to, prop);
View Full Code Here
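For the union case above, the predecessor is a VertexGroup rather than a plain Vertex, and the connection to the successor is a GroupInputEdge instead of an ordinary Edge. A minimal sketch, assuming the member vertices and the EdgeProperty are built elsewhere and using ConcatenatedMergedKeyValueInput from the Tez runtime library as the merged input:

import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.EdgeProperty;
import org.apache.tez.dag.api.GroupInputEdge;
import org.apache.tez.dag.api.InputDescriptor;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.dag.api.VertexGroup;

public class VertexGroupSketch {
  // member1/member2/successor and edgeProperty are assumed to be built already,
  // e.g. as in the earlier examples on this page.
  public static void connectUnion(DAG dag, Vertex member1, Vertex member2,
      Vertex successor, EdgeProperty edgeProperty) {
    VertexGroup unionGroup = dag.createVertexGroup("union_group", member1, member2);
    GroupInputEdge groupEdge = GroupInputEdge.create(unionGroup, successor, edgeProperty,
        InputDescriptor.create(
            "org.apache.tez.runtime.library.input.ConcatenatedMergedKeyValueInput"));
    dag.addEdge(groupEdge);
  }
}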

        // Take our assembled configuration and create a vertex
        UserPayload userPayload = TezUtils.createUserPayloadFromConf(payloadConf);
        procDesc.setUserPayload(userPayload);

        Vertex vertex = Vertex.create(tezOp.getOperatorKey().toString(), procDesc, tezOp.getVertexParallelism(),
                tezOp.isUseMRMapSettings() ? MRHelpers.getResourceForMRMapper(globalConf) : MRHelpers.getResourceForMRReducer(globalConf));

        Map<String, String> taskEnv = new HashMap<String, String>();
        MRHelpers.updateEnvBasedOnMRTaskEnv(globalConf, taskEnv, tezOp.isUseMRMapSettings());
        vertex.setTaskEnvironment(taskEnv);

        // All these classes are @InterfaceAudience.Private in Hadoop. Switch to Tez methods in TEZ-1012
        // set the timestamps, public/private visibility of the archives and files
        ClientDistributedCacheManager
                .determineTimestampsAndCacheVisibilities(globalConf);
        // get DelegationToken for each cached file
        ClientDistributedCacheManager.getDelegationTokens(globalConf,
                job.getCredentials());
        MRApps.setupDistributedCache(globalConf, localResources);
        vertex.addTaskLocalFiles(localResources);

        vertex.setTaskLaunchCmdOpts(tezOp.isUseMRMapSettings() ? MRHelpers.getJavaOptsForMRMapper(globalConf)
                : MRHelpers.getJavaOptsForMRReducer(globalConf));

        log.info("For vertex - " + tezOp.getOperatorKey().toString()
                + ": parallelism=" + tezOp.getVertexParallelism()
                + ", memory=" + vertex.getTaskResource().getMemory()
                + ", java opts=" + vertex.getTaskLaunchCmdOpts()
                );

        // Right now there can only be one of each of these. Will need to be
        // more generic when there can be more.
        for (POLoad ld : tezOp.getLoaderInfo().getLoads()) {

            // TODO: These should get the globalConf, or a merged version that
            // keeps settings like pig.maxCombinedSplitSize
            vertex.setLocationHint(VertexLocationHint.create(tezOp.getLoaderInfo().getInputSplitInfo().getTaskLocationHints()));
            vertex.addDataSource(ld.getOperatorKey().toString(),
                    DataSourceDescriptor.create(InputDescriptor.create(MRInput.class.getName())
                          .setUserPayload(UserPayload.create(MRRuntimeProtos.MRInputUserPayloadProto.newBuilder()
                          .setConfigurationBytes(TezUtils.createByteStringFromConf(payloadConf))
                          .setSplits(tezOp.getLoaderInfo().getInputSplitInfo().getSplitsProto()).build().toByteString().asReadOnlyByteBuffer())),
                    InputInitializerDescriptor.create(MRInputSplitDistributor.class.getName()), dag.getCredentials()));
        }

        for (POStore store : stores) {

            ArrayList<POStore> emptyList = new ArrayList<POStore>();
            ArrayList<POStore> singleStore = new ArrayList<POStore>();
            singleStore.add(store);

            Configuration outputPayLoad = new Configuration(payloadConf);
            outputPayLoad.set(JobControlCompiler.PIG_MAP_STORES,
                    ObjectSerializer.serialize(emptyList));
            outputPayLoad.set(JobControlCompiler.PIG_REDUCE_STORES,
                    ObjectSerializer.serialize(singleStore));

            OutputDescriptor storeOutDescriptor = OutputDescriptor.create(
                    MROutput.class.getName()).setUserPayload(TezUtils
                    .createUserPayloadFromConf(outputPayLoad));
            if (tezOp.getVertexGroupStores() != null) {
                OperatorKey vertexGroupKey = tezOp.getVertexGroupStores().get(store.getOperatorKey());
                if (vertexGroupKey != null) {
                    getPlan().getOperator(vertexGroupKey).getVertexGroupInfo()
                            .setStoreOutputDescriptor(storeOutDescriptor);
                    continue;
                }
            }
            vertex.addDataSink(store.getOperatorKey().toString(),
                    new DataSinkDescriptor(storeOutDescriptor,
                    OutputCommitterDescriptor.create(MROutputCommitter.class.getName()),
                    dag.getCredentials()));
        }

        // LoadFunc and StoreFunc add delegation tokens to the Job Credentials in
        // setLocation and setStoreLocation respectively (e.g. HBaseStorage).
        // InputFormats add delegation tokens in getSplits and OutputFormats in
        // checkOutputSpecs (e.g. FileInputFormat and FileOutputFormat).
        if (stores.size() > 0) {
            new PigOutputFormat().checkOutputSpecs(job);
        }

        String vmPluginName = null;
        Configuration vmPluginConf = null;

        // Set the right VertexManagerPlugin
        if (tezOp.getEstimatedParallelism() != -1) {
            if (tezOp.isGlobalSort()||tezOp.isSkewedJoin()) {
                // Set VertexManagerPlugin to PartitionerDefinedVertexManager, which can
                // dynamically decrease/increase the parallelism of the sorting vertex
                // based on the numQuantiles calculated by the sample aggregation vertex
                vmPluginName = PartitionerDefinedVertexManager.class.getName();
                log.info("Set VertexManagerPlugin to PartitionerDefinedVertexManager for vertex " + tezOp.getOperatorKey().toString());
            } else {
                boolean containScatterGather = false;
                boolean containCustomPartitioner = false;
                for (TezEdgeDescriptor edge : tezOp.inEdges.values()) {
                    if (edge.dataMovementType == DataMovementType.SCATTER_GATHER) {
                        containScatterGather = true;
                    }
                    if (edge.partitionerClass!=null) {
                        containCustomPartitioner = true;
                    }
                }
                if (containScatterGather && !containCustomPartitioner) {
                    // Use auto-parallelism feature of ShuffleVertexManager to dynamically
                    // reduce the parallelism of the vertex
                    vmPluginName = ShuffleVertexManager.class.getName();
                    vmPluginConf = (vmPluginConf == null) ? ConfigurationUtil.toConfiguration(pc.getProperties(), false) : vmPluginConf;
                    vmPluginConf.setBoolean(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_ENABLE_AUTO_PARALLEL, true);
                    if (stores.size() <= 0) {
                        // Intermediate reduce. Set the bytes per reducer to be block size.
                        vmPluginConf.setLong(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_DESIRED_TASK_INPUT_SIZE,
                                        intermediateTaskInputSize);
                    } else if (vmPluginConf.getLong(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
                                    InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER) !=
                                    InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER) {
                        vmPluginConf.setLong(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_DESIRED_TASK_INPUT_SIZE,
                                vmPluginConf.getLong(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
                                        InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER));
                    }
                    log.info("Set auto parallelism for vertex " + tezOp.getOperatorKey().toString());
                }
            }
        }
/* TODO: Uncomment after TEZ-1590 is fixed
        if (tezOp.isLimit() && (vmPluginName == null || vmPluginName.equals(ShuffleVertexManager.class.getName()))) {
            if (tezOp.inEdges.values().iterator().next().inputClassName.equals(UnorderedKVInput.class.getName())) {
                // Setting SRC_FRACTION to 0.00001 so that even if there are 100K source tasks,
                // the limit job starts as soon as one source task finishes.
                // If the limit is part of a group by or join, their parallelism is 1,
                // so the configuration should be left at the defaults.
                vmPluginName = ShuffleVertexManager.class.getName();
                vmPluginConf = (vmPluginConf == null) ? ConfigurationUtil.toConfiguration(pc.getProperties(), false) : vmPluginConf;
                vmPluginConf.set(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_MIN_SRC_FRACTION, "0.00001");
                vmPluginConf.set(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_MAX_SRC_FRACTION, "0.00001");
                log.info("Set " + ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_MIN_SRC_FRACTION + " to 0.00001 for limit vertex " + tezOp.getOperatorKey().toString());
            }
        }
*/
        // else if(tezOp.isLimitAfterSort())
        // TODO: PIG-4049 If standalone Limit we need a new VertexManager or new input
        // instead of ShuffledMergedInput. For limit part of the sort (order by parallel 1) itself
        // need to enhance PartitionerDefinedVertexManager

        if (vmPluginName != null) {
            VertexManagerPluginDescriptor vmPluginDescriptor = VertexManagerPluginDescriptor.create(vmPluginName);
            if (vmPluginConf != null) {
                vmPluginDescriptor.setUserPayload(TezUtils.createUserPayloadFromConf(vmPluginConf));
            }
            vertex.setVertexManagerPlugin(vmPluginDescriptor);
        }
        // Reset udfcontext jobconf. It is not supposed to be set in the front end
        UDFContext.getUDFContext().addJobConf(null);
        return vertex;
    }
View Full Code Here
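The snippet above attaches a VertexManagerPlugin by wrapping the plugin configuration into a user payload. A condensed sketch of the two approaches that appear on this page: building the descriptor by hand with a payload, and using ShuffleVertexManager's config builder (the auto-parallelism flag is the one the Pig code sets):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.tez.common.TezUtils;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.dag.api.VertexManagerPluginDescriptor;
import org.apache.tez.dag.library.vertexmanager.ShuffleVertexManager;

public class VertexManagerSketch {
  // Variant 1: descriptor built by hand; the payload carries the manager's configuration.
  public static void attachWithPayload(Vertex vertex, Configuration conf) throws IOException {
    conf.setBoolean(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_ENABLE_AUTO_PARALLEL, true);
    VertexManagerPluginDescriptor desc =
        VertexManagerPluginDescriptor.create(ShuffleVertexManager.class.getName())
            .setUserPayload(TezUtils.createUserPayloadFromConf(conf));
    vertex.setVertexManagerPlugin(desc);
  }

  // Variant 2: the config-builder shortcut used later on this page.
  public static void attachWithBuilder(Vertex vertex, Configuration conf) {
    vertex.setVertexManagerPlugin(ShuffleVertexManager.createConfigBuilder(conf).build());
  }
}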

        fileBytesWritten = fsGrp.findCounter("FILE_BYTES_WRITTEN").getValue();
        hdfsBytesRead = fsGrp.findCounter("HDFS_BYTES_READ").getValue();
        hdfsBytesWritten = fsGrp.findCounter("HDFS_BYTES_WRITTEN").getValue();

        for (Entry<String, TezVertexStats> entry : tezVertexStatsMap.entrySet()) {
            Vertex v = dag.getVertex(entry.getKey());
            if (v != null && tezVertexStatsMap.containsKey(v.getName())) {
                TezVertexStats vertexStats = entry.getValue();
                UserPayload payload = v.getProcessorDescriptor().getUserPayload();
                Configuration conf = TezUtils.createConfFromUserPayload(payload);
                vertexStats.setConf(conf);

                VertexStatus status = tezJob.getVertexStatus(v.getName());
                vertexStats.accumulateStats(status, v.getParallelism());
                if(vertexStats.getInputs() != null && !vertexStats.getInputs().isEmpty()) {
                    inputs.addAll(vertexStats.getInputs());
                }
                if(vertexStats.getOutputs() != null  && !vertexStats.getOutputs().isEmpty()) {
                    outputs.addAll(vertexStats.getOutputs());
View Full Code Here

        : MRHelpers.getResourceForMRReducer(stageConf);
   
    stageConf.set(MRJobConfig.MROUTPUT_FILE_NAME_PREFIX, "part");
   
    UserPayload vertexUserPayload = TezUtils.createUserPayloadFromConf(stageConf);
    Vertex vertex = Vertex.create(vertexName,
        ProcessorDescriptor.create(processorName).setUserPayload(vertexUserPayload),
        numTasks, taskResource);
    if (isMap) {
      vertex.addDataSource("MRInput",
          configureMRInputWithLegacySplitsGenerated(stageConf, true));
    }
    // The last stage (this also covers map-only jobs) gets the MROutput sink.
    if (stageNum == totalStages -1) {
      OutputDescriptor od = OutputDescriptor.create(MROutputLegacy.class.getName())
          .setUserPayload(vertexUserPayload);
      vertex.addDataSink("MROutput", new DataSinkDescriptor(od,
          OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null));
    }

    Map<String, String> taskEnv = new HashMap<String, String>();
    setupMapReduceEnv(stageConf, taskEnv, isMap);

    Map<String, LocalResource> taskLocalResources =
        new TreeMap<String, LocalResource>();
    // PRECOMMIT Remove split localization for reduce tasks if it's being set
    // here
    taskLocalResources.putAll(jobLocalResources);

    String taskJavaOpts = isMap ? MRHelpers.getJavaOptsForMRMapper(stageConf)
        : MRHelpers.getJavaOptsForMRReducer(stageConf);

    vertex.setTaskEnvironment(taskEnv)
        .addTaskLocalFiles(taskLocalResources)
        .setLocationHint(VertexLocationHint.create(locations))
        .setTaskLaunchCmdOpts(taskJavaOpts);
   
    if (!isMap) {
      vertex.setVertexManagerPlugin((ShuffleVertexManager.createConfigBuilder(stageConf).build()));
    }

    if (LOG.isDebugEnabled()) {
      LOG.debug("Adding vertex to DAG" + ", vertexName="
          + vertex.getName() + ", processor="
          + vertex.getProcessorDescriptor().getClassName() + ", parallelism="
          + vertex.getParallelism() + ", javaOpts=" + vertex.getTaskLaunchCmdOpts()
          + ", resources=" + vertex.getTaskResource()
      // TODO Add localResources and Environment
      );
    }

    return vertex;
View Full Code Here

    if (conf != null) {
      taskCount = conf.getInt(TEZ_SIMPLE_V_DAG_NUM_TASKS, TEZ_SIMPLE_V_DAG_NUM_TASKS_DEFAULT);
      payload = TezUtils.createUserPayloadFromConf(conf);
    }
    DAG dag = DAG.create(name);
    Vertex v1 = Vertex.create("v1", TestProcessor.getProcDesc(payload), taskCount, defaultResource);
    Vertex v2 = Vertex.create("v2", TestProcessor.getProcDesc(payload), taskCount, defaultResource);
    Vertex v3 = Vertex.create("v3", TestProcessor.getProcDesc(payload), taskCount, defaultResource);
    dag.addVertex(v1).addVertex(v2).addVertex(v3);
    dag.addEdge(Edge.create(v1, v3,
        EdgeProperty.create(DataMovementType.SCATTER_GATHER,
            DataSourceType.PERSISTED,
            SchedulingType.SEQUENTIAL,
View Full Code Here

    if (conf != null) {
      taskCount = conf.getInt(TEZ_SIMPLE_DAG_NUM_TASKS, TEZ_SIMPLE_DAG_NUM_TASKS_DEFAULT);
      payload = TezUtils.createUserPayloadFromConf(conf);
    }
    DAG dag = DAG.create(name);
    Vertex v1 = Vertex.create("v1", TestProcessor.getProcDesc(payload), taskCount, defaultResource);
    Vertex v2 = Vertex.create("v2", TestProcessor.getProcDesc(payload), taskCount, defaultResource);
    dag.addVertex(v1).addVertex(v2).addEdge(Edge.create(v1, v2,
        EdgeProperty.create(DataMovementType.SCATTER_GATHER,
            DataSourceType.PERSISTED,
            SchedulingType.SEQUENTIAL,
            TestOutput.getOutputDesc(payload),
View Full Code Here
