Package org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators

Examples of org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach
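The snippets below are excerpts from Pig's MapReduce compilation and optimizer code. POForEach is the physical operator behind FOREACH ... GENERATE: it evaluates one inner PhysicalPlan per generated expression and pairs each inner plan with a flatten flag. As a minimal sketch of the construction pattern that recurs in the snippets (the class and method names here are invented for illustration; the Pig calls mirror those used below):

import java.util.ArrayList;
import java.util.List;

import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POProject;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.plan.NodeIdGenerator;
import org.apache.pig.impl.plan.OperatorKey;

public class POForEachSketch {
    // Builds a foreach that projects column 1 (a bag) and flattens it,
    // mirroring the small helper methods shown further down this page.
    static POForEach projectAndFlattenColumnOne(String scope) {
        NodeIdGenerator nig = NodeIdGenerator.getGenerator();

        // Inner plan: project column 1 of the input tuple.
        PhysicalPlan ep = new PhysicalPlan();
        POProject prj = new POProject(new OperatorKey(scope, nig.getNextNodeId(scope)));
        prj.setColumn(1);
        prj.setResultType(DataType.BAG);
        prj.setOverloaded(true);
        ep.add(prj);

        // One inner plan and one flatten flag, kept in parallel lists.
        List<PhysicalPlan> inputPlans = new ArrayList<PhysicalPlan>();
        inputPlans.add(ep);
        List<Boolean> toBeFlattened = new ArrayList<Boolean>();
        toBeFlattened.add(true);

        // -1 means no requested parallelism for this operator.
        POForEach fe = new POForEach(new OperatorKey(scope, nig.getNextNodeId(scope)),
                -1, inputPlans, toBeFlattened);
        fe.setResultType(DataType.BAG);
        return fe;
    }
}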


                flat1.add(true);
            }
        }

        // This foreach will pick the sort key columns from the RandomSampleLoader output
        POForEach nfe1 = new POForEach(new OperatorKey(scope,nig.getNextNodeId(scope)),-1,eps1,flat1);
        mro.mapPlan.addAsLeaf(nfe1);

        // Now set up a POLocalRearrange which has "all" as the key and the output of the
        // foreach will be the "value" out of POLocalRearrange
        PhysicalPlan ep1 = new PhysicalPlan();
        ConstantExpression ce = new ConstantExpression(new OperatorKey(scope,nig.getNextNodeId(scope)));
        ce.setValue("all");
        ce.setResultType(DataType.CHARARRAY);
        ep1.add(ce);

        List<PhysicalPlan> eps = new ArrayList<PhysicalPlan>();
        eps.add(ep1);

        POLocalRearrange lr = new POLocalRearrange(new OperatorKey(scope,nig.getNextNodeId(scope)));
        try {
            lr.setIndex(0);
        } catch (ExecException e) {
            int errCode = 2058;
            String msg = "Unable to set index on newly created POLocalRearrange.";
            throw new PlanException(msg, errCode, PigException.BUG, e);
        }
        lr.setKeyType(DataType.CHARARRAY);
        lr.setPlans(eps);
        lr.setResultType(DataType.TUPLE);
        lr.addOriginalLocation(sort.getAlias(), sort.getOriginalLocations());
        mro.mapPlan.add(lr);
        mro.mapPlan.connect(nfe1, lr);

        mro.setMapDone(true);

        POPackage pkg = new POPackage(new OperatorKey(scope,nig.getNextNodeId(scope)));
        Packager pkgr = new Packager();
        pkg.setPkgr(pkgr);
        pkgr.setKeyType(DataType.CHARARRAY);
        pkg.setNumInps(1);
        boolean[] inner = {false};
        pkgr.setInner(inner);
        mro.reducePlan.add(pkg);

        // Let's start building the plan which will have the sort
        // for the foreach
        PhysicalPlan fe2Plan = new PhysicalPlan();
        // Top level project which just projects the tuple which is coming
        // from the foreach after the package
        POProject topPrj = new POProject(new OperatorKey(scope,nig.getNextNodeId(scope)));
        topPrj.setColumn(1);
        topPrj.setResultType(DataType.BAG);
        topPrj.setOverloaded(true);
        fe2Plan.add(topPrj);

        // the projections which will form sort plans
        List<PhysicalPlan> nesSortPlanLst = new ArrayList<PhysicalPlan>();
        if (sortKeyPlans != null) {
            for(int i=0; i<sortKeyPlans.size(); i++) {
                nesSortPlanLst.add(sortKeyPlans.get(i));
            }
        }else{
            Pair<POProject, Byte>[] sortProjs = null;
            try{
                sortProjs = getSortCols(sort.getSortPlans());
            }catch(Exception e) {
                throw new RuntimeException(e);
            }
            // Set up the projections of the key columns
            if (sortProjs == null) {
                PhysicalPlan ep = new PhysicalPlan();
                POProject prj = new POProject(new OperatorKey(scope,
                    nig.getNextNodeId(scope)));
                prj.setStar(true);
                prj.setOverloaded(false);
                prj.setResultType(DataType.TUPLE);
                ep.add(prj);
                nesSortPlanLst.add(ep);
            } else {
                for (int i=0; i<sortProjs.length; i++) {
                    POProject prj =
                        new POProject(new OperatorKey(scope,nig.getNextNodeId(scope)));

                    prj.setResultType(sortProjs[i].second);
                    if(sortProjs[i].first != null && sortProjs[i].first.isProjectToEnd()){
                        if(i != sortProjs.length -1){
                            //project to end has to be the last sort column
                            throw new AssertionError("Project-range to end (x..)" +
                            " is supported in order-by only as last sort column");
                        }
                        prj.setProjectToEnd(i);
                        break;
                    }
                    else{
                        prj.setColumn(i);
                    }
                    prj.setOverloaded(false);

                    PhysicalPlan ep = new PhysicalPlan();
                    ep.add(prj);
                    nesSortPlanLst.add(ep);
                }
            }
        }

        sort.setSortPlans(nesSortPlanLst);
        sort.setResultType(DataType.BAG);
        fe2Plan.add(sort);
        fe2Plan.connect(topPrj, sort);

        // The plan which will have a constant representing the
        // degree of parallelism for the final order-by map-reduce job.
        // This will either come from an "order by ... parallel x" in the script
        // or will be the default number of reducers for the cluster if
        // "parallel x" is not used in the script.
        PhysicalPlan rpep = new PhysicalPlan();
        ConstantExpression rpce = new ConstantExpression(new OperatorKey(scope,nig.getNextNodeId(scope)));
        rpce.setRequestedParallelism(rp);

        // We temporarily set it to rp and will adjust it at runtime, because the final degree of parallelism
        // is unknown until we are ready to submit it. See PIG-2779.
        rpce.setValue(rp);

        rpce.setResultType(DataType.INTEGER);
        rpep.add(rpce);

        List<PhysicalPlan> genEps = new ArrayList<PhysicalPlan>();
        genEps.add(rpep);
        genEps.add(fe2Plan);

        List<Boolean> flattened2 = new ArrayList<Boolean>();
        flattened2.add(false);
        flattened2.add(false);

        POForEach nfe2 = new POForEach(new OperatorKey(scope,nig.getNextNodeId(scope)),-1, genEps, flattened2);
        mro.reducePlan.add(nfe2);
        mro.reducePlan.connect(pkg, nfe2);

        // Let's connect the output from the foreach containing
        // number of quantiles and the sorted bag of samples to
        // another foreach with the FindQuantiles udf. The input
        // to the FindQuantiles udf is a project(*) which takes the
        // foreach input and gives it to the udf
        PhysicalPlan ep4 = new PhysicalPlan();
        POProject prjStar4 = new POProject(new OperatorKey(scope,nig.getNextNodeId(scope)));
        prjStar4.setResultType(DataType.TUPLE);
        prjStar4.setStar(true);
        ep4.add(prjStar4);

        List<PhysicalOperator> ufInps = new ArrayList<PhysicalOperator>();
        ufInps.add(prjStar4);

        POUserFunc uf = new POUserFunc(new OperatorKey(scope,nig.getNextNodeId(scope)), -1, ufInps,
            new FuncSpec(udfClassName, udfArgs));
        ep4.add(uf);
        ep4.connect(prjStar4, uf);

        List<PhysicalPlan> ep4s = new ArrayList<PhysicalPlan>();
        ep4s.add(ep4);
        List<Boolean> flattened3 = new ArrayList<Boolean>();
        flattened3.add(false);
        POForEach nfe3 = new POForEach(new OperatorKey(scope,nig.getNextNodeId(scope)), -1, ep4s, flattened3);

        mro.reducePlan.add(nfe3);
        mro.reducePlan.connect(nfe2, nfe3);

        POStore str = getStore();
View Full Code Here


            op = sucs.get(0);
            boolean lastInputFlattened = true;
            boolean allSimple = true;
            if (op instanceof POForEach)
            {
                POForEach forEach = (POForEach)op;
                List<PhysicalPlan> planList = forEach.getInputPlans();
                List<Boolean> flatten = forEach.getToBeFlattened();
                POProject projOfLastInput = null;
                int i = 0;
                // check all nested foreach plans
                // 1. If it is simple projection
                // 2. If last input is all flattened
View Full Code Here
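The check above inspects a candidate POForEach before deciding whether it can be merged: getInputPlans() returns the inner plans and getToBeFlattened() returns the matching flatten flags, with the two lists index-aligned (one flag per inner plan). A stripped-down illustration of that relationship (helper class and method names invented):

import java.util.List;

import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach;

class ForEachInspectionSketch {
    // Reports whether the last inner plan of the foreach is flagged for
    // flattening, in the spirit of the "lastInputFlattened" check above.
    static boolean lastInputFlattened(POForEach forEach) {
        List<PhysicalPlan> plans = forEach.getInputPlans();
        List<Boolean> flatten = forEach.getToBeFlattened();
        if (plans.isEmpty()) {
            return false;
        }
        // the flatten list is parallel to the inner-plan list
        return flatten.get(plans.size() - 1);
    }
}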

        prj1.setColumn(1);
        prj1.setOverloaded(true);
        ep1.add(prj1);
        eps1.add(ep1);
        flat1.add(true);
        POForEach fe = new POForEach(new OperatorKey(scope, nig
                .getNextNodeId(scope)), -1, eps1, flat1);
        fe.setResultType(DataType.BAG);
        return fe;
    }
View Full Code Here


            prj1.setColumn(0);
            prj1.setOverloaded(false);
            ep1.add(prj1);
            eps1.add(ep1);
            flat1.add(true);
            POForEach nfe1 = new POForEach(new OperatorKey(scope, nig
                    .getNextNodeId(scope)), op.getRequestedParallelism(), eps1,
                    flat1);
            nfe1.setResultType(DataType.BAG);
            curMROp.reducePlan.addAsLeaf(nfe1);
            curMROp.setNeedsDistinctCombiner(true);
            phyToMROpMap.put(op, curMROp);
            curMROp.phyToMRMap.put(op, nfe1);
        }catch(Exception e){
View Full Code Here

              CompilerUtils.addEmptyBagOuterJoin(ep, op.getSchema(i));
          }
          flat.add(true);
      }

      POForEach fe = new POForEach(new OperatorKey(scope,nig.getNextNodeId(scope)), -1, eps, flat);
      fe.setResultType(DataType.TUPLE);
     
      fe.visit(this);
     
      curMROp.setSkewedJoinPartitionFile(partitionFile.getFileName());
      phyToMROpMap.put(op, curMROp);
        }catch(PlanException e) {
            int errCode = 2034;
View Full Code Here

            prj_c1.setOverloaded(false);
            prj_c1.setResultType(DataType.BAG);
            ep_c1.add(prj_c1);
            eps_c1.add(ep_c1);
            flat_c1.add(true);
            POForEach fe_c1 = new POForEach(new OperatorKey(scope,nig.getNextNodeId(scope)),
                -1, eps_c1, flat_c1);
            fe_c1.setResultType(DataType.TUPLE);
            mro.combinePlan.addAsLeaf(fe_c1);
           
            POLimit pLimit = new POLimit(new OperatorKey(scope,nig.getNextNodeId(scope)));
            pLimit.setLimit(limit);
            mro.combinePlan.addAsLeaf(pLimit);

            List<PhysicalPlan> eps_c2 = new ArrayList<PhysicalPlan>();
            eps_c2.addAll(sort.getSortPlans());

            POLocalRearrange lr_c2 = new POLocalRearrange(new OperatorKey(scope,nig.getNextNodeId(scope)));
            try {
                lr_c2.setIndex(0);
            } catch (ExecException e) {
                int errCode = 2058;
                String msg = "Unable to set index on newly created POLocalRearrange.";
                throw new PlanException(msg, errCode, PigException.BUG, e);
            }
            lr_c2.setKeyType((fields.length > 1) ? DataType.TUPLE : keyType);
            lr_c2.setPlans(eps_c2);
            lr_c2.setResultType(DataType.TUPLE);
            mro.combinePlan.addAsLeaf(lr_c2);
        }

        POPackage pkg = new POPackage(new OperatorKey(scope,
                nig.getNextNodeId(scope)));
        LitePackager pkgr = new LitePackager();
        pkgr.setKeyType((fields == null || fields.length > 1) ? DataType.TUPLE
                : keyType);
        pkg.setPkgr(pkgr);
        pkg.setNumInps(1);
        mro.reducePlan.add(pkg);
       
        PhysicalPlan ep = new PhysicalPlan();
        POProject prj = new POProject(new OperatorKey(scope,nig.getNextNodeId(scope)));
        prj.setColumn(1);
        prj.setOverloaded(false);
        prj.setResultType(DataType.BAG);
        ep.add(prj);
        List<PhysicalPlan> eps2 = new ArrayList<PhysicalPlan>();
        eps2.add(ep);
        List<Boolean> flattened = new ArrayList<Boolean>();
        flattened.add(true);
        POForEach nfe1 = new POForEach(new OperatorKey(scope,nig.getNextNodeId(scope)),-1,eps2,flattened);
        mro.reducePlan.add(nfe1);
        mro.reducePlan.connect(pkg, nfe1);
        mro.phyToMRMap.put(sort, nfe1);
        if (limit!=-1)
        {
View Full Code Here


                successor = limitSucs.get(0);
            }

        }
        if (successor instanceof POForEach) {
            POForEach foreach = (POForEach)successor;
            List<PhysicalPlan> feInners = foreach.getInputPlans();

            // find algebraic operators and also check if the foreach statement
            // is suitable for combiner use
            List<Pair<PhysicalOperator, PhysicalPlan>> algebraicOps =
                findAlgebraicOps(feInners);
            if(algebraicOps == null || algebraicOps.size() == 0){
                // the plan is not combinable or there is nothing to combine;
                // we're done
                return;
            }
            if (mr.combinePlan.getRoots().size() != 0) {
                messageCollector.collect("Wasn't expecting to find anything already "
                        + "in the combiner!", MessageType.Warning, PigWarning.NON_EMPTY_COMBINE_PLAN);
                return;
            }

            log.info("Choosing to move algebraic foreach to combiner");

            try {


                // replace PODistinct->Project[*] with distinct udf (which is Algebraic)
                for(Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps ){
                    if(! (op2plan.first instanceof PODistinct))
                        continue;
                    DistinctPatcher distinctPatcher = new DistinctPatcher(op2plan.second);
                    distinctPatcher.visit();
                    if(distinctPatcher.getDistinct() == null){
                        int errCode = 2073;
                        String msg = "Problem with replacing distinct operator with distinct built-in function.";
                        throw new PlanException(msg, errCode, PigException.BUG);
                    }
                    op2plan.first = distinctPatcher.getDistinct();
                }

                //create new map foreach
                POForEach mfe = createForEachWithGrpProj(foreach, rearrange.getKeyType());               
                Map<PhysicalOperator, Integer> op2newpos =
                    new HashMap<PhysicalOperator, Integer>();
                Integer pos = 1;
                //create plan for each algebraic udf and add as inner plan in map-foreach
                for(Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps ){
                    PhysicalPlan udfPlan = createPlanWithPredecessors(op2plan.first, op2plan.second);
                    mfe.addInputPlan(udfPlan, false);
                    op2newpos.put(op2plan.first, pos++);
                }
                changeFunc(mfe, POUserFunc.INITIAL);

                // since we will only be creating SingleTupleBag as input to
                // the map foreach, we should flag the POProjects in the map
                // foreach inner plans to also use SingleTupleBag
                for (PhysicalPlan mpl : mfe.getInputPlans()) {
                    try {
                        new fixMapProjects(mpl).visit();
                    } catch (VisitorException e) {
                        int errCode = 2089;
                        String msg = "Unable to flag project operator to use single tuple bag.";
                        throw new PlanException(msg, errCode, PigException.BUG, e);
                    }
                }

                //create new combine foreach
                POForEach cfe = createForEachWithGrpProj(foreach, rearrange.getKeyType());
                //add algebraic functions with appropriate projection
                addAlgebraicFuncToCombineFE(cfe, op2newpos);
                changeFunc(cfe, POUserFunc.INTERMEDIATE);

                // fix projection and function type for algebraic functions in reduce foreach
                for(Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps ){
                    setProjectInput(op2plan.first, op2plan.second, op2newpos.get(op2plan.first));
                    ((POUserFunc)op2plan.first).setAlgebraicFunction(POUserFunc.FINAL);
                }


                // we have modified the foreach inner plans - so set them
                // again for the foreach so that foreach can do any re-initialization
                // around them.
                // FIXME - this is a necessary evil right now because the leaves are explicitly
                // stored in the POForeach as a list rather than computed each time at
                // run time from the plans for optimization. Do we want to have the Foreach
                // compute the leaves each time and have Java optimize it (will Java optimize?)?
                mfe.setInputPlans(mfe.getInputPlans());
                cfe.setInputPlans(cfe.getInputPlans());
                foreach.setInputPlans(foreach.getInputPlans());

                // tell POCombinerPackage which fields need to be projected and
                // which should be placed in bags. The first field is a simple project;
                // the rest need to go into bags
View Full Code Here

     * @param keyType type for group-by key
     * @return new POForEach
     */
    private POForEach createForEachWithGrpProj(POForEach foreach, byte keyType) {
        String scope = foreach.getOperatorKey().scope;
        POForEach newFE = new POForEach(createOperatorKey(scope), new ArrayList<PhysicalPlan>());
        newFE.addOriginalLocation(foreach.getAlias(), foreach.getOriginalLocations());
        newFE.setResultType(foreach.getResultType());
        //create plan that projects the group column
        PhysicalPlan grpProjPlan = new PhysicalPlan();
        //group by column is the first column
        POProject proj = new POProject(createOperatorKey(scope), 1, 0);
        proj.setResultType(keyType);
        grpProjPlan.add(proj);

        newFE.addInputPlan(grpProjPlan, false);
        return newFE;
    }
View Full Code Here
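The helper above seeds the new foreach with a single inner plan that projects the group key (column 0 of the packaged tuple). In the combiner rewrite shown earlier, one additional inner plan per algebraic UDF is then appended with addInputPlan(plan, false) so that none of those outputs is flattened. A hypothetical sketch of that step (class and method names invented):

import java.util.List;

import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach;

class CombineForEachSketch {
    // Appends one non-flattened inner plan per algebraic UDF to a foreach
    // whose first inner plan already projects the group key.
    static void appendUdfPlans(POForEach fe, List<PhysicalPlan> udfPlans) {
        for (PhysicalPlan udfPlan : udfPlans) {
            fe.addInputPlan(udfPlan, false); // false: do not flatten the UDF output
        }
    }
}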
