/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.hive.ql.optimizer;

import java.util.List;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Stack;
import java.io.Serializable;

import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMRMapJoinCtx;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.ConditionalTask;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.MoveTask;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.OperatorFactory;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.exec.UnionOperator;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.mapredWork;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
import org.apache.hadoop.hive.ql.parse.RowResolver;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.TypeCheckProcFactory;
import org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles;
import org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles.ConditionalResolverMergeFilesCtx;
import org.apache.hadoop.hive.ql.plan.ConditionalWork;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.exprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.exprNodeDesc;
import org.apache.hadoop.hive.ql.plan.extractDesc;
import org.apache.hadoop.hive.ql.plan.fileSinkDesc;
import org.apache.hadoop.hive.ql.plan.loadFileDesc;
import org.apache.hadoop.hive.ql.plan.moveWork;
import org.apache.hadoop.hive.ql.plan.reduceSinkDesc;
import org.apache.hadoop.hive.ql.plan.tableDesc;
import org.apache.hadoop.hive.ql.plan.tableScanDesc;
import org.apache.hadoop.hive.ql.plan.partitionDesc;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.conf.HiveConf;

/**
 * Processor for the rule - file sink operator encountered. It hooks the file
 * sink into the current map-reduce task, wires up the dependent move task and,
 * when file merging is enabled, arranges a conditional merge job for the
 * produced files.
 */
public class GenMRFileSink1 implements NodeProcessor {

  public GenMRFileSink1() {
  }

  /**
   * File sink operator encountered.
   *
   * @param nd the file sink operator encountered
   * @param stack the stack of operators visited so far
   * @param opProcCtx processing context
   */
  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    GenMRProcContext ctx = (GenMRProcContext)opProcCtx;
    ParseContext parseCtx = ctx.getParseCtx();
    boolean chDir = false;
    Task<? extends Serializable> currTask = ctx.getCurrTask();

    // Check whether the user has enabled merging of output files for map-only jobs or for all jobs
    if ((ctx.getMvTask() != null) && (!ctx.getMvTask().isEmpty()))
    {
      List<Task<? extends Serializable>> mvTasks = ctx.getMvTask();

      // In case of unions or map-joins, it is possible that the file has already been seen.
      // So, no need to attempt to merge the files again.
      if ((ctx.getSeenFileSinkOps() == null) ||
          (!ctx.getSeenFileSinkOps().contains((FileSinkOperator)nd)))  {
       
        // no need to merge if the move is to the local file system
        MoveTask mvTask = (MoveTask)findMoveTask(mvTasks, (FileSinkOperator)nd);
        if ((mvTask != null) && !mvTask.isLocal())
        {
          // There are separate configuration parameters to control whether to merge for a map-only job
          // or for a map-reduce job
          if ((parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPFILES) &&
              (((mapredWork)currTask.getWork()).getReducer() == null)) ||
              parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPREDFILES))
            chDir = true;
        }
      }
    }

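    // perform the common file sink processing; if chDir is set, the file sink is
    // redirected to a temporary directory and finalName is the original destination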
    String finalName = processFS(nd, stack, opProcCtx, chDir);
   
    // If merging was requested, insert a new task to concatenate the files produced by this job
    if (chDir && (finalName != null)) {
      createMergeJob((FileSinkOperator)nd, ctx, finalName);
    }
   
    return null;
  }
 
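  /**
   * Create a map-reduce job that merges the files produced by the original file
   * sink: a dummy table scan reads the temporary directory, a reduce sink spreads
   * the rows across reducers, and an extract operator followed by a new file sink
   * writes the merged result to the final destination. The job is attached to the
   * query plan as one branch of a conditional task.
   */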
  private void createMergeJob(FileSinkOperator fsOp, GenMRProcContext ctx, String finalName) {
    Task<? extends Serializable> currTask = ctx.getCurrTask();
    RowSchema fsRS = fsOp.getSchema();
   
    // create a reduce sink operator - no key columns are used, so rows are spread randomly across reducers
    ArrayList<exprNodeDesc> keyCols = new ArrayList<exprNodeDesc>();
    keyCols.add(TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("rand"));

    ArrayList<exprNodeDesc> valueCols = new ArrayList<exprNodeDesc>();
    for (ColumnInfo ci : fsRS.getSignature()) {
      valueCols.add(new exprNodeColumnDesc(ci.getType(), ci.getInternalName(), ci.getTabAlias(),
          ci.getIsPartitionCol()));
    }

    // create a dummy tableScan operator
    Operator<? extends Serializable> ts_op =
      OperatorFactory.get(tableScanDesc.class, fsRS);

    ArrayList<String> outputColumns = new ArrayList<String>();
    for (int i = 0; i < valueCols.size(); i++)
      outputColumns.add(SemanticAnalyzer.getColumnInternalName(i));
   
    reduceSinkDesc rsDesc = PlanUtils.getReduceSinkDesc(new ArrayList<exprNodeDesc>(), valueCols,
                                                        outputColumns, false, -1, -1, -1);
    ReduceSinkOperator rsOp = (ReduceSinkOperator)OperatorFactory.getAndMakeChild(rsDesc, fsRS, ts_op);
    mapredWork cplan = GenMapRedUtils.getMapRedWork();
    ParseContext parseCtx = ctx.getParseCtx();

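    // the task that runs the merge plan; it becomes one branch of the conditional task assembled below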
    Task<? extends Serializable> mergeTask = TaskFactory.get(cplan, parseCtx.getConf());
    fileSinkDesc fsConf = fsOp.getConf();
   
    // Add the extract operator to get the value fields
    RowResolver out_rwsch = new RowResolver();
    RowResolver interim_rwsch = ctx.getParseCtx().getOpParseCtx().get(fsOp).getRR();
    Integer pos = Integer.valueOf(0);
    for(ColumnInfo colInfo: interim_rwsch.getColumnInfos()) {
      String [] info = interim_rwsch.reverseLookup(colInfo.getInternalName());
      out_rwsch.put(info[0], info[1],
                    new ColumnInfo(pos.toString(), colInfo.getType(), info[0],
                          colInfo.getIsPartitionCol()));
      pos = Integer.valueOf(pos.intValue() + 1);
    }

    Operator extract =
      OperatorFactory.getAndMakeChild(
        new extractDesc(new exprNodeColumnDesc(TypeInfoFactory.stringTypeInfo,
                                Utilities.ReduceField.VALUE.toString(), "", false)),
        new RowSchema(out_rwsch.getColumnInfos()));
   
    tableDesc ts = (tableDesc)fsConf.getTableInfo().clone();
    fsConf.getTableInfo().getProperties().remove(org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS);
    FileSinkOperator newOutput =
      (FileSinkOperator)OperatorFactory.getAndMakeChild(
         new fileSinkDesc(finalName, ts,
                          parseCtx.getConf().getBoolVar(HiveConf.ConfVars.COMPRESSRESULT)),
         fsRS, extract);

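    // the merge plan reads the file sink's temporary directory through the dummy
    // table scan and funnels all rows through the extract reducer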
    cplan.setReducer(extract);
    ArrayList<String> aliases = new ArrayList<String>();
    aliases.add(fsConf.getDirName());
    cplan.getPathToAliases().put(fsConf.getDirName(), aliases);
    cplan.getAliasToWork().put(fsConf.getDirName(), ts_op);   
    cplan.getPathToPartitionInfo().put(fsConf.getDirName(), new partitionDesc(fsConf.getTableInfo(), null));
    cplan.setNumReduceTasks(-1);
   
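    // Build a conditional task with two branches: a plain move of the temporary
    // files (dummyMergeTask) and the full merge job (mergeTask). The
    // ConditionalResolverMergeFiles resolver decides at run time which branch to
    // execute, based on the files actually produced in the temporary directory.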
    moveWork dummyMv = new moveWork(null, new loadFileDesc(fsOp.getConf().getDirName(), finalName, true, null, null), false);
    Task<? extends Serializable> dummyMergeTask = TaskFactory.get(dummyMv, ctx.getConf());
    List<Serializable> listWorks = new ArrayList<Serializable>();
    listWorks.add(dummyMv);
    listWorks.add(mergeTask.getWork());
    ConditionalWork cndWork = new ConditionalWork(listWorks);
   
    ConditionalTask cndTsk = (ConditionalTask)TaskFactory.get(cndWork, ctx.getConf());
    List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
    listTasks.add(dummyMergeTask);
    listTasks.add(mergeTask);
    cndTsk.setListTasks(listTasks);
   
    cndTsk.setResolver(new ConditionalResolverMergeFiles());
    cndTsk.setResolverCtx(new ConditionalResolverMergeFilesCtx(listTasks, fsOp.getConf().getDirName()));
   
    currTask.addDependentTask(cndTsk);
   
    List<Task<? extends Serializable>> mvTasks = ctx.getMvTask();
    Task<? extends Serializable> mvTask = findMoveTask(mvTasks, newOutput);
   
    if (mvTask != null)
      cndTsk.addDependentTask(mvTask);
  }
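
  /**
   * Find the move task whose source directory matches the output directory of
   * the given file sink operator, or return null if no such task exists.
   */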
  private Task<? extends Serializable> findMoveTask(List<Task<? extends Serializable>> mvTasks, FileSinkOperator fsOp) {
    // find the move task
    for (Task<? extends Serializable> mvTsk : mvTasks) {
      moveWork mvWork = (moveWork)mvTsk.getWork();
      String srcDir = null;
      if (mvWork.getLoadFileWork() != null)
        srcDir = mvWork.getLoadFileWork().getSourceDir();
      else if (mvWork.getLoadTableWork() != null)
        srcDir = mvWork.getLoadTableWork().getSourceDir();
     
      if ((srcDir != null) && (srcDir.equalsIgnoreCase(fsOp.getConf().getDirName())))
        return mvTsk;
    }
    
    return null;
  }
 
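  /**
   * Common processing for a file sink operator: record it as seen, optionally
   * redirect its output to a temporary directory (when a merge job will follow),
   * hook up the dependent move task, and attach the map-side plan to the current
   * task. Returns the original destination directory when the directory was
   * changed, and null otherwise.
   */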
  private String processFS(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, boolean chDir)
    throws SemanticException {
   
    // Is this the dummy file sink created after a map join? If so, there is nothing to do here.
    FileSinkOperator fsOp = (FileSinkOperator)nd;
    if ((fsOp.getParentOperators().size() == 1) && (fsOp.getParentOperators().get(0) instanceof MapJoinOperator))
      return null;

    GenMRProcContext ctx = (GenMRProcContext)opProcCtx;
    List<FileSinkOperator> seenFSOps = ctx.getSeenFileSinkOps();
    if (seenFSOps == null)
      seenFSOps = new ArrayList<FileSinkOperator>();
    if (!seenFSOps.contains(fsOp))
      seenFSOps.add(fsOp);
    ctx.setSeenFileSinkOps(seenFSOps);

    Task<? extends Serializable> currTask = ctx.getCurrTask();
   
    // If the output directory needs to be changed, remember the original destination and redirect the file sink to a temporary directory
    String dest = null;

    if (chDir) {
      dest = fsOp.getConf().getDirName();

      // generate a temporary directory for the job's output
      ParseContext parseCtx = ctx.getParseCtx();
      Context baseCtx = parseCtx.getContext();
      String tmpDir = baseCtx.getMRTmpFileURI();
     
      fsOp.getConf().setDirName(tmpDir);
    }
   
    boolean ret = false;
    Task<? extends Serializable> mvTask = null;
   
    if (!chDir)
      mvTask = findMoveTask(ctx.getMvTask(), fsOp);
   
    Operator<? extends Serializable> currTopOp = ctx.getCurrTopOp();
    String currAliasId = ctx.getCurrAliasId();
    HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap = ctx.getOpTaskMap();
    List<Operator<? extends Serializable>> seenOps = ctx.getSeenOps();
    List<Task<? extends Serializable>>  rootTasks = ctx.getRootTasks();

    // Set the move task to be dependent on the current task
    if (mvTask != null)
      ret = currTask.addDependentTask(mvTask);
   
    // For a multi-table insert, the path-to-alias mapping is needed for all the sources.
    // Since there is no reducer, treat it as a map-only plan (a plan with a null reducer)
    // and make sure the current task is set up to process it.
    if (currTopOp != null) {
      Task<? extends Serializable> mapTask = opTaskMap.get(null);
      if (mapTask == null) {
        assert (!seenOps.contains(currTopOp));
        seenOps.add(currTopOp);
        GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, (mapredWork) currTask.getWork(), false, ctx);
        opTaskMap.put(null, currTask);
        rootTasks.add(currTask);
      }
      else {
        if (!seenOps.contains(currTopOp)) {
          seenOps.add(currTopOp);
          GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, (mapredWork) mapTask.getWork(), false, ctx);
        }
        if (ret)
          currTask.removeDependentTask(mvTask);
      }

      return dest;

    }

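    // the file sink sits below a union: initialize the union plan on the current task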
    UnionOperator currUnionOp = ctx.getCurrUnionOp();
   
    if  (currUnionOp != null) {
      opTaskMap.put(null, currTask);
      GenMapRedUtils.initUnionPlan(ctx, currTask, false);
      return dest;
    }
   
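    // the file sink follows a map join: register the map join's intermediate
    // directory as an input of the current plan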
    MapJoinOperator currMapJoinOp = ctx.getCurrMapJoinOp();
   
    if  (currMapJoinOp != null) {
      opTaskMap.put(null, currTask);
      GenMRMapJoinCtx mjCtx = ctx.getMapJoinCtx(currMapJoinOp);
      mapredWork plan = (mapredWork) currTask.getWork();

      String taskTmpDir = mjCtx.getTaskTmpDir();
      tableDesc tt_desc = mjCtx.getTTDesc();
      assert plan.getPathToAliases().get(taskTmpDir) == null;
      plan.getPathToAliases().put(taskTmpDir, new ArrayList<String>());
      plan.getPathToAliases().get(taskTmpDir).add(taskTmpDir);
      plan.getPathToPartitionInfo().put(taskTmpDir, new partitionDesc(tt_desc, null));
      plan.getAliasToWork().put(taskTmpDir, mjCtx.getRootMapJoinOp());
      return dest;
    }
   
    return dest;
  }
}