Package org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators

Examples of org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLoad

    private NativeMapReduceOper getNativeMROp(String mrJar, String[] parameters) {
        return new NativeMapReduceOper(new OperatorKey(scope,nig.getNextNodeId(scope)), mrJar, parameters);
    private POLoad getLoad(){
        POLoad ld = new POLoad(new OperatorKey(scope,nig.getNextNodeId(scope)));
        return ld;
            leaf = leaves.get(0);

        for (MapReduceOper mmro : mergedPlans) {
            FileSpec fileSpec = getTempFileSpec();
            POLoad ld = getLoad();
            POStore str = getStore();
     * @return
     * @throws IOException
     * @throws PlanException
    private MapReduceOper startNew(FileSpec fSpec, MapReduceOper old) throws PlanException{
        POLoad ld = getLoad();
        MapReduceOper ret = getMROp();
        MRPlan.connect(old, ret);
        return ret;
        int numFiles = 0;
        boolean ret = false;
        try {
            for (PhysicalOperator root : roots) {
                POLoad ld = (POLoad) root;
                String fileName = ld.getLFile().getFileName();
                    // Only if the input is an hdfs file, this optimization is
                    // useful (to reduce load on namenode)
                    //separate out locations separated by comma
                    String [] locations = LoadFunc.getPathStrings(fileName);
                    for(String location : locations){
                        Path path = new Path(location);
                        FileSystem fs = path.getFileSystem(conf);
                        if (fs.exists(path)) {
                            LoadFunc loader = (LoadFunc) PigContext
                            Job job = new Job(conf);
                            loader.setLocation(location, job);
                            InputFormat inf = loader.getInputFormat();
                            List<InputSplit> splits = inf.getSplits(HadoopShims.cloneJobContext(job));
                            List<List<InputSplit>> results = MapRedUtil
                    int errCode = 2172;
                    String errMsg = "Expected physical operator at root to be POLoad. Found : "+rootPOOp.getClass().getCanonicalName();
                    throw new MRCompilerException(errMsg,errCode);
                POLoad sideLoader = (POLoad)rootPOOp;
                FileSpec loadFileSpec = sideLoader.getLFile();
                FuncSpec funcSpec = loadFileSpec.getFuncSpec();
                LoadFunc loadfunc = sideLoader.getLoadFunc();
                if(i == 0){
                      int errCode = 2252;
                        throw new MRCompilerException("Base loader in Cogroup must implement CollectableLoadFunc.", errCode);
                    int errCode = 2253;
                    throw new MRCompilerException("Side loaders in cogroup must implement IndexableLoadFunc.", errCode);
            final MapReduceOper baseMROp, final List<PhysicalPlan> mapperLRInnerPlans)
        throws MRCompilerException, PlanException, ExecException, IOException, CloneNotSupportedException {
        // First replace loader with  MergeJoinIndexer.
        PhysicalPlan baseMapPlan = baseMROp.mapPlan;
        POLoad baseLoader = (POLoad)baseMapPlan.getRoots().get(0);                           
        FileSpec origLoaderFileSpec = baseLoader.getLFile();
        FuncSpec funcSpec = origLoaderFileSpec.getFuncSpec();
        LoadFunc loadFunc = baseLoader.getLoadFunc();
        if (! (OrderedLoadFunc.class.isAssignableFrom(loadFunc.getClass()))){
            int errCode = 1104;
            String errMsg = "Base relation of merge-coGroup must implement " +
            "OrderedLoadFunc interface. The specified loader "
            + funcSpec + " doesn't implement it";
            throw new MRCompilerException(errMsg,errCode);
        String[] indexerArgs = new String[6];
        indexerArgs[0] = funcSpec.toString();
        indexerArgs[1] = ObjectSerializer.serialize((Serializable)mapperLRInnerPlans);
        indexerArgs[3] = baseLoader.getSignature();
        indexerArgs[4] = baseLoader.getOperatorKey().scope;
        indexerArgs[5] = Boolean.toString(false); // we care for nulls.
        PhysicalPlan phyPlan;
        if (baseMapPlan.getSuccessors(baseLoader) == null
                || baseMapPlan.getSuccessors(baseLoader).isEmpty()){
         // Load-Load-Cogroup case.
            phyPlan = null;
        else{ // We got something. Yank it and set it as inner plan.
            phyPlan = baseMapPlan.clone();
            PhysicalOperator root = phyPlan.getRoots().get(0);
            phyPlan.disconnect(root, phyPlan.getSuccessors(root).get(0));

        indexerArgs[2] = ObjectSerializer.serialize(phyPlan);

        POLoad idxJobLoader = getLoad();
        idxJobLoader.setLFile(new FileSpec(origLoaderFileSpec.getFileName(),
                new FuncSpec(MergeJoinIndexer.class.getName(), indexerArgs)));
        // Loader of mro will return a tuple of form -
            rightMROpr.requestedParallelism = 1; // we need exactly one reducer for indexing job.       
            // At this point, we must be operating on map plan of right input and it would contain nothing else other than a POLoad.
            POLoad rightLoader = (POLoad)rightMROpr.mapPlan.getRoots().get(0);
            LoadFunc rightLoadFunc = rightLoader.getLoadFunc();
            List<String> udfs = new ArrayList<String>();
            if(IndexableLoadFunc.class.isAssignableFrom(rightLoadFunc.getClass())) {
                // we don't need the right MROper since
                // the right loader is an IndexableLoadFunc which can handle the index
                // itself
                if(rightMROpr == compiledInputs[0]) {
                    compiledInputs[0] = null;
                } else if(rightMROpr == compiledInputs[1]) {
                    compiledInputs[1] = null;
                rightMROpr = null;
                // validate that the join keys in merge join are only                                                                                                                                                                             
                // simple column projections or '*' and not expression - expressions                                                                                                                                                              
                // cannot be handled when the index is built by the storage layer on the sorted                                                                                                                                                   
                // data when the sorted data (and corresponding index) is written.                                                                                                                                                                
                // So merge join will be restricted to not have expressions as
                // join keys     
                int numInputs = mPlan.getPredecessors(joinOp).size(); // should be 2
                for(int i = 0; i < numInputs; i++) {
                    List<PhysicalPlan> keyPlans = joinOp.getInnerPlansOf(i);
                    for (PhysicalPlan keyPlan : keyPlans) {
                        for(PhysicalOperator op : keyPlan) {
                            if(!(op instanceof POProject)) {
                                int errCode = 1106;
                                String errMsg = "Merge join is possible only for simple column or '*' join keys when using " +
                                rightLoader.getLFile().getFuncSpec() + " as the loader";
                                throw new MRCompilerException(errMsg, errCode, PigException.INPUT);
            } else {
                LoadFunc loadFunc = rightLoader.getLoadFunc();
                //Replacing POLoad with indexer is disabled for 'merge-sparse' joins.  While
                //this feature would be useful, the current implementation of DefaultIndexableLoader
                //is not designed to handle multiple calls to seekNear.  Specifically, it rereads the entire index
                //for each call.  Some refactoring of this class is required - and then the check below could be removed.
    if (joinOp.getJoinType() == LOJoin.JOINTYPE.MERGESPARSE) {
                    int errCode = 1104;
                    String errMsg = "Right input of merge-join must implement IndexableLoadFunc. " +
                    "The specified loader " + loadFunc + " doesn't implement it";
                    throw new MRCompilerException(errMsg,errCode);
                // Replace POLoad with  indexer.

                if (! (OrderedLoadFunc.class.isAssignableFrom(loadFunc.getClass()))){
                    int errCode = 1104;
                    String errMsg = "Right input of merge-join must implement " +
                    "OrderedLoadFunc interface. The specified loader "
                    + loadFunc + " doesn't implement it";
                    throw new MRCompilerException(errMsg,errCode);

                String[] indexerArgs = new String[6];
                List<PhysicalPlan> rightInpPlans = joinOp.getInnerPlansOf(1);
                FileSpec origRightLoaderFileSpec = rightLoader.getLFile();

                indexerArgs[0] = origRightLoaderFileSpec.getFuncSpec().toString();
                indexerArgs[1] = ObjectSerializer.serialize((Serializable)rightInpPlans);
                indexerArgs[2] = ObjectSerializer.serialize(rightPipelinePlan);
                indexerArgs[3] = rightLoader.getSignature();
                indexerArgs[4] = rightLoader.getOperatorKey().scope;
                indexerArgs[5] = Boolean.toString(true);
                FileSpec lFile = new FileSpec(rightLoader.getLFile().getFileName(),new FuncSpec(MergeJoinIndexer.class.getName(), indexerArgs));
                // Loader of mro will return a tuple of form -
                // (keyFirst1, keyFirst2, .. , position, splitIndex) See MergeJoinIndexer

                MRUtil.simpleConnectMapToReduce(rightMROpr, scope, nig);
        PhysicalOperator po = pos.get(0);
        if (!(po instanceof POLoad)) {
            log.debug("Root operator of map is not load.");
            return; // Huh?
        POLoad load = (POLoad)po;
        String loadFunc = load.getLFile().getFuncName();
        String loadFile = load.getLFile().getFileName();
        if (!("org.apache.pig.impl.builtin.RandomSampleLoader".equals(loadFunc)) && !("org.apache.pig.impl.builtin.PoissonSampleLoader".equals(loadFunc))) {
            log.debug("Not a sampling job.");
        if (loadFile == null) {
            log.debug("No load file");

        // Get this job's predecessor.  There should be exactly one.
        List<MapReduceOper> preds = mPlan.getPredecessors(mr);
        if (preds.size() != 1) {
            log.debug("Too many predecessors to sampling job.");
        MapReduceOper pred = preds.get(0);

        // The predecessor should be a root.
        List<MapReduceOper> predPreds = mPlan.getPredecessors(pred);
        if (predPreds != null && predPreds.size() > 0) {
            log.debug("Predecessor should be a root of the plan");

        // The predecessor should have just a load and store in the map, and nothing
        // in the combine or reduce.
        if ( !(pred.reducePlan.isEmpty() && pred.combinePlan.isEmpty())) {
            log.debug("Predecessor has a combine or reduce plan");

        // The MR job should have one successor.
        List<MapReduceOper> succs = mPlan.getSuccessors(mr);
        if (succs.size() != 1) {
            log.debug("Job has more than one successor.");
        MapReduceOper succ = succs.get(0);
        if (pred.mapPlan == null || pred.mapPlan.size() != 2) {
            log.debug("Predecessor has more than just load+store in the map");

        List<PhysicalOperator> loads = pred.mapPlan.getRoots();
        if (loads.size() != 1) {
            log.debug("Predecessor plan has more than one root.");
        PhysicalOperator r = loads.get(0);
        if (!(r instanceof POLoad)) { // Huh?
            log.debug("Predecessor's map plan root is not a load.");
        POLoad predLoad = (POLoad)r;

        // Find the load that correlates with the file the sampler is loading, and
        // check that it is using the temp file storage format.
        if (succ.mapPlan == null) { // Huh?
            log.debug("Successor has no map plan.");
        loads = succ.mapPlan.getRoots();
        POLoad succLoad = null;
        for (PhysicalOperator root : loads) {
            if (!(root instanceof POLoad)) { // Huh?
                log.debug("Successor's roots are not loads");
            POLoad sl = (POLoad)root;
            if (loadFile.equals(sl.getLFile().getFileName()) &&
                    Utils.getTmpFileCompressorName(pigContext).equals(sl.getLFile().getFuncName())) {
                succLoad = sl;

        if (succLoad == null) {
            log.debug("Could not find load that matched file we are sampling.");

        // Okay, we're on.
        // First, replace our RandomSampleLoader with a RandomSampleLoader that uses
        // the load function from our predecessor.
        String[] rslargs = new String[2];
        FileSpec predFs = predLoad.getLFile();
        // First argument is FuncSpec of loader function to subsume, this we want to set for
        // ourselves.
        rslargs[0] = predFs.getFuncSpec().toString();
        // Add the loader's funcspec to the list of udf's associated with this mr operator
        // Second argument is the number of samples per block, read this from the original.
        rslargs[1] = load.getLFile().getFuncSpec().getCtorArgs()[1];
        FileSpec fs = new FileSpec(predFs.getFileName(),new FuncSpec(loadFunc, rslargs));
        POLoad newLoad = new POLoad(load.getOperatorKey(),load.getRequestedParallelism(), fs);
        try {
            mr.mapPlan.replace(load, newLoad);
            // check if it has PartitionSkewedKeys
            List<PhysicalOperator> ls = mr.reducePlan.getLeaves();
            for(PhysicalOperator op: ls) {
              scan(mr, op, fs.getFileName());
        } catch (PlanException e) {
            throw new VisitorException(e);

        // Second, replace the loader in our successor with whatever the originally used loader was.
        fs = new FileSpec(predFs.getFileName(), predFs.getFuncSpec());
        newLoad = new POLoad(succLoad.getOperatorKey(), succLoad.getRequestedParallelism(), fs);
        try {
            succ.mapPlan.replace(succLoad, newLoad);
            // Add the loader's funcspec to the list of udf's associated with this mr operator
        } catch (PlanException e) {
            throw new VisitorException(e);

        // Cannot delete the pred right now, because we are still traversing the graph. So, mark the pred and remove it from the
        // the keys are sent in a tuple. If there is really only
        // 1 join key, it would be the first field of the tuple. If
        // there are multiple Join keys, the tuple itself represents
        // the join key
        Object firstLeftKey = (keys.size() == 1 ? keys.get(0): keys);
        POLoad ld = new POLoad(genKey(), new FileSpec(indexFile, new FuncSpec(indexFileLoadFuncSpec)));
        Properties props = ConfigurationUtil.getLocalFSProperties();
        PigContext pc = new PigContext(ExecType.LOCAL, props);
        index = new LinkedList<Tuple>();
        for(Result res=ld.getNextTuple();res.returnStatus!=POStatus.STATUS_EOP;res=ld.getNextTuple())
            index.offer((Tuple) res.result);  

        Tuple prevIdxEntry = null;
        Tuple matchedEntry;
    public void visit(LOLoad loLoad) throws FrontendException {
        String scope = DEFAULT_SCOPE;
        // The last parameter here is set to true as we assume all files are
        // splittable due to LoadStore Refactor
        POLoad load = new POLoad(new OperatorKey(scope, nodeGen
                .getNextNodeId(scope)), loLoad.getLoadFunc());
        load.addOriginalLocation(loLoad.getAlias(), loLoad.getLocation());

        logToPhyMap.put(loLoad, load);

        // Load is typically a root operator, but in the multiquery
