/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapreduce.Job;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.pig.backend.datastorage.ContainerDescriptor;
import org.apache.pig.backend.datastorage.DataStorage;
import org.apache.pig.backend.datastorage.ElementDescriptor;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.executionengine.ExecJob;
import org.apache.pig.backend.executionengine.ExecJob.JOB_STATUS;
import org.apache.pig.backend.hadoop.executionengine.HJob;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore;
import org.apache.pig.builtin.PigStorage;
import org.apache.pig.classification.InterfaceAudience;
import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.FileLocalizer;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.io.InterStorage;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.LOConst;
import org.apache.pig.impl.logicalLayer.LODefine;
import org.apache.pig.impl.logicalLayer.LOForEach;
import org.apache.pig.impl.logicalLayer.LOLimit;
import org.apache.pig.impl.logicalLayer.LOLoad;
import org.apache.pig.impl.logicalLayer.LOSort;
import org.apache.pig.impl.logicalLayer.LOSplit;
import org.apache.pig.impl.logicalLayer.LOSplitOutput;
import org.apache.pig.impl.logicalLayer.LOStore;
import org.apache.pig.impl.logicalLayer.LOUserFunc;
import org.apache.pig.impl.logicalLayer.LOVisitor;
import org.apache.pig.impl.logicalLayer.LogicalOperator;
import org.apache.pig.impl.logicalLayer.LogicalPlan;
import org.apache.pig.impl.logicalLayer.LogicalPlanBuilder;
import org.apache.pig.impl.logicalLayer.PlanSetter;
import org.apache.pig.impl.logicalLayer.ScalarFinder;
import org.apache.pig.impl.logicalLayer.UnionOnSchemaSetter;
import org.apache.pig.impl.logicalLayer.optimizer.LogicalOptimizer;
import org.apache.pig.impl.logicalLayer.parser.ParseException;
import org.apache.pig.impl.logicalLayer.parser.QueryParser;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.validators.LogicalPlanValidationExecutor;
import org.apache.pig.impl.plan.CompilationMessageCollector;
import org.apache.pig.impl.plan.DependencyOrderWalker;
import org.apache.pig.impl.plan.NodeIdGenerator;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.PlanException;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.plan.CompilationMessageCollector.MessageType;
import org.apache.pig.impl.streaming.StreamingCommand;
import org.apache.pig.impl.util.LogUtils;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.impl.util.Pair;
import org.apache.pig.impl.util.PropertiesUtil;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;
import org.apache.pig.newplan.logical.LogicalPlanMigrationVistor;
import org.apache.pig.newplan.logical.optimizer.LogicalPlanOptimizer;
import org.apache.pig.pen.ExampleGenerator;
import org.apache.pig.scripting.ScriptEngine;
import org.apache.pig.tools.grunt.GruntParser;
import org.apache.pig.tools.parameters.ParameterSubstitutionPreprocessor;
import org.apache.pig.tools.pigstats.JobStats;
import org.apache.pig.tools.pigstats.OutputStats;
import org.apache.pig.tools.pigstats.PigStats;
import org.apache.pig.tools.pigstats.PigStatsUtil;
import org.apache.pig.tools.pigstats.ScriptState;
import org.apache.pig.tools.pigstats.PigStats.JobGraph;


/**
*
* A class for Java programs to connect to Pig. Typically a program will create a PigServer
* instance. The programmer then registers queries using registerQuery() and
* retrieves results using openIterator() or store(). After doing so, the
* shutdown() method should be called to free any resources used by the current
* PigServer instance. Not doing so could result in a memory leak.
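*
* A minimal usage sketch (the input file name is illustrative):
* <pre>
* PigServer pigServer = new PigServer(ExecType.LOCAL);
* pigServer.registerQuery("A = load 'input.txt';");
* Iterator&lt;Tuple&gt; it = pigServer.openIterator("A");
* while (it.hasNext()) {
*     System.out.println(it.next());
* }
* pigServer.shutdown();
* </pre>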
*
*/
@InterfaceAudience.Public
@InterfaceStability.Stable
public class PigServer {
   
    private final Log log = LogFactory.getLog(getClass());
   
    /**
     * Given a string, determine the exec type.
     * @param str accepted values are 'local', 'mapreduce', 'mapred', 'pig', and 'pigbody'
     * @return exectype as ExecType
     */
    public static ExecType parseExecType(String str) throws IOException {
        String normStr = str.toLowerCase();
       
        if (normStr.equals("local")) return ExecType.LOCAL;
        if (normStr.equals("mapreduce")) return ExecType.MAPREDUCE;
        if (normStr.equals("mapred")) return ExecType.MAPREDUCE;
        if (normStr.equals("pig")) return ExecType.PIG;
        if (normStr.equals("pigbody")) return ExecType.PIG;
  
        int errCode = 2040;
        String msg = "Unknown exec type: " + str;
        throw new PigException(msg, errCode, PigException.BUG);
    }

    /*
     * The data structure to support grunt shell operations.
     * The grunt shell can only work on one graph at a time.
     * If a script is contained inside another script, the grunt
     * shell first saves the current graph on the stack and works
     * on a new graph. After the nested script is done, the grunt
     * shell pops the saved graph off the stack and continues working on it.
     */
    private Stack<Graph> graphs = new Stack<Graph>();
   
    /*
     * The current Graph the grunt shell is working on.
     */
    private Graph currDAG;
    private PigContext pigContext;
   
    private static int scopeCounter = 0;
    private String scope = constructScope();

    private boolean aggregateWarning = true;
    private boolean isMultiQuery = true;
   
    private String constructScope() {
        // scope serves for now as a session id
       
        // String user = System.getProperty("user.name", "DEFAULT_USER_ID");
        // String date = (new Date()).toString();

        // scope is not really used in the system right now. It will
        // however make your explain statements look lengthy if set to
        // username-date. For now let's simplify the scope, if a real
        // scope is needed again, we might need to update all the
        // operators to not include scope in their name().
        return ""+(++scopeCounter);
    }
   
    /**
     * @param execTypeString can be 'mapreduce', 'mapred', or 'local'.  Local mode will
     * use Hadoop's local job runner to execute the job on the local machine.
     * Mapreduce mode will connect to a cluster to execute the job.
     * @throws ExecException
     * @throws IOException
     */
    public PigServer(String execTypeString) throws ExecException, IOException {
        this(parseExecType(execTypeString));
    }
   
    /**
     * @param execType execution type to start the engine.  Local mode will
     * use Hadoop's local job runner to execute the job on the local machine.
     * Mapreduce mode will connect to a cluster to execute the job.
     * @throws ExecException
     */
    public PigServer(ExecType execType) throws ExecException {
        this(execType, PropertiesUtil.loadDefaultProperties());
    }

    public PigServer(ExecType execType, Properties properties) throws ExecException {
        this(new PigContext(execType, properties));
    }
 
    public PigServer(PigContext context) throws ExecException {
        this(context, true);
    }
   
    public PigServer(PigContext context, boolean connect) throws ExecException {
        this.pigContext = context;
        currDAG = new Graph(false);
       
        aggregateWarning = "true".equalsIgnoreCase(pigContext.getProperties().getProperty("aggregate.warning"));
        isMultiQuery = "true".equalsIgnoreCase(pigContext.getProperties().getProperty("opt.multiquery","true"));

        if (connect) {
            pigContext.connect();
        }
       
        if( "true".equals( pigContext.getProperties().getProperty( "mapred.output.compress" ) ) ) {
            pigContext.getProperties().setProperty( "output.compression.enabled", "true" );
            String codec = pigContext.getProperties().getProperty( "mapred.output.compression.codec" );
            if( codec == null ) {
                throw new RuntimeException( "'mapred.output.compress' is set but no value is specified for 'mapred.output.compression.codec'." );
            } else {
                pigContext.getProperties().setProperty( "output.compression.codec", codec );
            }
        }
       
        addJarsFromProperties();
    }
   
    private void addJarsFromProperties() throws ExecException {
        //add jars from properties to extraJars
        String jar_str = pigContext.getProperties().getProperty("pig.additional.jars");
        if(jar_str != null){
            for(String jar : jar_str.split(":")){
                try {
                    registerJar(jar);
                } catch (IOException e) {
                    int errCode = 4010;
                    String msg =
                        "Failed to register jar :" + jar + ". Caught exception.";
                    throw new ExecException(
                            msg,
                            errCode,
                            PigException.USER_ENVIRONMENT,
                            e
                    );
                }
            }
        }
    }

    public PigContext getPigContext(){
        return pigContext;
    }
   
    /**
     * Set the logging level to DEBUG.
     */
    public void debugOn() {
        Logger.getLogger("org.apache.pig").setLevel(Level.DEBUG);
        pigContext.getLog4jProperties().setProperty("log4j.logger.org.apache.pig", Level.DEBUG.toString());
    }
   
    /**
     * Set the logging level to the default.
     */
    public void debugOff() {
        Logger.getLogger("org.apache.pig").setLevel(pigContext.getDefaultLogLevel());
        pigContext.getLog4jProperties().setProperty("log4j.logger.org.apache.pig", pigContext.getDefaultLogLevel().toString());
    }
   
    /**
     * Set the default parallelism for this job
     * @param p default number of reducers to use for this job.
     */
    public void setDefaultParallel(int p) {
        pigContext.defaultParallel = p;
    }
    /**
     * Starts batch execution mode.
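     * <p>
     * In batch mode, stores are deferred until {@link #executeBatch()} is
     * called. A minimal sketch (paths are illustrative):
     * <pre>
     * pigServer.setBatchOn();
     * pigServer.registerQuery("A = load 'input';");
     * pigServer.registerQuery("store A into 'output';");
     * List&lt;ExecJob&gt; jobs = pigServer.executeBatch();
     * </pre>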
     */
    public void setBatchOn() {
        log.debug("Create a new graph.");
       
        if (currDAG != null) {
            graphs.push(currDAG);
        }
        currDAG = new Graph(isMultiQuery);
    }

    /**
     * Retrieve the current execution mode.
     *
     * @return true if the execution mode is batch; false otherwise.
     */
    public boolean isBatchOn() {
        // Batch is on when there is at least one saved graph on the
        // stack. That gives the right response even if multiquery was
        // turned off.
        return graphs.size() > 0;
    }

    /**
     * Returns whether there is anything to process in the current batch.
     * @throws FrontendException
     * @return true if there are no stores to process in the current
     * batch, false otherwise.
     */
    public boolean isBatchEmpty() throws FrontendException {
        if (currDAG == null) {
            int errCode = 1083;
            String msg = "setBatchOn() must be called first.";
            throw new FrontendException(msg, errCode, PigException.INPUT);
        }

        return currDAG.isBatchEmpty();
    }

    /**
     * Submits a batch of Pig commands for execution.
     *
     * @return list of jobs being executed
     * @throws FrontendException
     * @throws ExecException
     */
    public List<ExecJob> executeBatch() throws FrontendException, ExecException {
        PigStats stats = executeBatchEx();
        LinkedList<ExecJob> jobs = new LinkedList<ExecJob>();
        JobGraph jGraph = stats.getJobGraph();
        Iterator<JobStats> iter = jGraph.iterator();
        while (iter.hasNext()) {
            JobStats js = iter.next();
            for (OutputStats output : js.getOutputs()) {
                if (js.isSuccessful()) {               
                    jobs.add(new HJob(HJob.JOB_STATUS.COMPLETED, pigContext, output
                            .getPOStore(), output.getAlias(), stats));
                } else {
                    HJob hjob = new HJob(HJob.JOB_STATUS.FAILED, pigContext, output
                            .getPOStore(), output.getAlias(), stats);
                    hjob.setException(js.getException());
                    jobs.add(hjob);
                }
            }
        }
        return jobs;
    }

    private PigStats executeBatchEx() throws FrontendException, ExecException {
        if (!isMultiQuery) {
            // ignore if multiquery is off
            return PigStats.get();
        }

        if (currDAG == null || !isBatchOn()) {
            int errCode = 1083;
            String msg = "setBatchOn() must be called first.";
            throw new FrontendException(msg, errCode, PigException.INPUT);
        }
       
        return currDAG.execute();
    }
   
    /**
     * Discards a batch of Pig commands.
     *
     * @throws FrontendException
     */
    public void discardBatch() throws FrontendException {
        if (currDAG == null || !isBatchOn()) {
            int errCode = 1083;
            String msg = "setBatchOn() must be called first.";
            throw new FrontendException(msg, errCode, PigException.INPUT);
        }
       
        currDAG = graphs.pop();
    }
      
    /**
     * Add a path to be skipped while automatically shipping binaries for
     * streaming.
     * 
     * @param path path to be skipped
     */
    public void addPathToSkip(String path) {
        pigContext.addPathToSkip(path);
    }
   
    /**
     * Defines an alias for the given function spec. This
     * is useful for functions that require arguments to the
     * constructor.
     *
     * @param function - the new function alias to define.
     * @param functionSpec - the name of the function and any arguments.
     * It should have the form: classname('arg1', 'arg2', ...)
     * @deprecated Use {@link #registerFunction(String, FuncSpec)}
     */
    public void registerFunction(String function, String functionSpec) {
        registerFunction(function, new FuncSpec(functionSpec));
    }
   
    /**
     * Defines an alias for the given function spec. This
     * is useful for functions that require arguments to the
     * constructor.
     *
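     * For example (the UDF class name is illustrative):
     * <pre>
     * pigServer.registerFunction("myLookup",
     *         new FuncSpec("com.example.Lookup('dict.txt')"));
     * </pre>
     *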
     * @param function - the new function alias to define.
     * @param funcSpec - the FuncSpec object representing the name of
     * the function class and any arguments to constructor.
     */
    public void registerFunction(String function, FuncSpec funcSpec) {
        pigContext.registerFunction(function, funcSpec);
    }
   
    /**
     * Defines an alias for the given streaming command.
     *
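     * For example (assuming StreamingCommand's (PigContext, String[])
     * constructor; the command is illustrative):
     * <pre>
     * pigServer.registerStreamingCommand("myCmd",
     *         new StreamingCommand(pigServer.getPigContext(),
     *                 new String[] { "perl", "script.pl" }));
     * </pre>
     *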
     * @param commandAlias - the new command alias to define
     * @param command - streaming command to be executed
     */
    public void registerStreamingCommand(String commandAlias, StreamingCommand command) {
        pigContext.registerStreamCmd(commandAlias, command);
    }

    private URL locateJarFromResources(String jarName) throws IOException {
        Enumeration<URL> urls = ClassLoader.getSystemResources(jarName);
        URL resourceLocation = null;
       
        if (urls.hasMoreElements()) {
            resourceLocation = urls.nextElement();
        }
       
        if (urls.hasMoreElements()) {
            StringBuffer sb = new StringBuffer("Found multiple resources that match ");
            sb.append(jarName);
            sb.append(": ");
            sb.append(resourceLocation);
           
            while (urls.hasMoreElements()) {
                sb.append(urls.nextElement());
                sb.append("; ");
            }
           
            log.debug(sb.toString());
        }
   
        return resourceLocation;
    }
   
    /**
     * Registers a jar file. Name of the jar file can be an absolute or
     * relative path.
     *
     * If multiple resources are found with the specified name, the
     * first one is registered as returned by getSystemResources.
     * A message listing all matches is logged at debug level.
     *
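     * For example (the jar path is illustrative):
     * <pre>
     * pigServer.registerJar("/opt/udfs/my-udfs.jar");
     * </pre>
     *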
     * @param name of the jar file to register
     * @throws IOException
     */
    public void registerJar(String name) throws IOException {
        // first try to locate jar via system resources
        // if this fails, try by using "name" as File (this preserves
        // compatibility with case when user passes absolute path or path
        // relative to current working directory.)       
        if (name != null) {
            URL resource = locateJarFromResources(name);

            if (resource == null) {
                File f = FileLocalizer.fetchFile(pigContext.getProperties(), name).file;
               
                if (!f.canRead()) {
                    int errCode = 4002;
                    String msg = "Can't read jar file: " + name;
                    throw new FrontendException(msg, errCode, PigException.USER_ENVIRONMENT);
                }
               
                resource = f.toURI().toURL();
            }

            pigContext.addJar(resource);       
        }
    }
   
    /**
     * Universal Scripting Language Support, see PIG-928
     *
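     * For example, to register functions from a Jython script under a
     * namespace (the script path is illustrative):
     * <pre>
     * pigServer.registerCode("scripts/myfuncs.py", "jython", "myfuncs");
     * </pre>
     *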
     * @param path path of the script file
     * @param scriptingLang language keyword or scriptingEngine used to interpret the script
     * @param namespace namespace defined for functions of this script
     * @throws IOException
     */
    public void registerCode(String path, String scriptingLang, String namespace)
    throws IOException {
        File f = new File(path);

        if (!f.canRead()) {
            int errCode = 4002;
            String msg = "Can't read file: " + path;
            throw new FrontendException(msg, errCode,
                    PigException.USER_ENVIRONMENT);
        }
        if(scriptingLang != null) {
            ScriptEngine se = ScriptEngine.getInstance(scriptingLang);
            se.registerFunctions(path, namespace, pigContext);
        }
        pigContext.addScriptFile(path);
    }
   
    /**
     * Register a query with the Pig runtime. The query is parsed and registered, but it is not
     * executed until it is needed.
     *
     * @param query
     *            a Pig Latin expression to be evaluated.
     * @param startLine
     *            line number of the query within the whole script
     * @throws IOException
     */   
    public void registerQuery(String query, int startLine) throws IOException {           
        currDAG.registerQuery(query, startLine);
    }
    public Graph getClonedGraph() throws IOException {
        Graph graph = currDAG.clone();

        if (graph == null) {
            int errCode = 2127;
            String msg = "Cloning of plan failed.";
            throw new FrontendException(msg, errCode, PigException.BUG);
        }
        return graph;
    }
   
    /**
     * Register a query with the Pig runtime. The query is parsed and registered, but it is not
     * executed until it is needed.  Equivalent to calling {@link #registerQuery(String, int)}
     * with startLine set to 1.
     *
     * @param query
     *            a Pig Latin expression to be evaluated.
     * @throws IOException
     */   
    public void registerQuery(String query) throws IOException {
        registerQuery(query, 1);
    }
   
    /**
     * Register a query with the Pig runtime.  The query will be read from the indicated file.
     * @param fileName file to read query from.
     * @throws IOException
     */
    public void registerScript(String fileName) throws IOException {
        registerScript(fileName, null, null);
    }
   
    /**
     * Register a pig script file.  The parameters in the file will be substituted with the values in params
     * @param fileName  pig script file
     * @param params  the key is the parameter name, and the value is the parameter value
     * @throws IOException
     */
    public void registerScript(String fileName, Map<String,String> params) throws IOException {
        registerScript(fileName, params, null);
    }

    /**
     * Register a pig script file.  The parameters in the file will be substituted with the values in the parameter files
     * @param fileName pig script file
     * @param paramsFiles  files which have the parameter setting
     * @throws IOException
     */
    public void registerScript(String fileName, List<String> paramsFiles) throws IOException {
        registerScript(fileName, null, paramsFiles);
    }
   
    /**
     * Register a pig script file.  The parameters in the file will be substituted with the values in the map and the parameter files.
     * The values in the params map will override the values in a parameter file if they define the same parameter.
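     * For example (file and parameter names are illustrative):
     * <pre>
     * Map&lt;String, String&gt; params = new HashMap&lt;String, String&gt;();
     * params.put("date", "20100101");
     * List&lt;String&gt; paramFiles = Collections.singletonList("defaults.param");
     * pigServer.registerScript("daily.pig", params, paramFiles);
     * </pre>
     *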
     * @param fileName  pig script
     * @param params  the key is the parameter name, and the value is the parameter value
     * @param paramsFiles   files which have the parameter setting
     * @throws IOException
     */
    public void registerScript(String fileName, Map<String,String> params,List<String> paramsFiles) throws IOException {
        try {
            // transform the map into the list form accepted by ParameterSubstitutionPreprocessor
            List<String> paramList = new ArrayList<String>();
            if (params!=null){
                for (Map.Entry<String, String> entry:params.entrySet()){
                    paramList.add(entry.getKey()+"="+entry.getValue());
                }
            }
           
            // do parameter substitution
            ParameterSubstitutionPreprocessor psp = new ParameterSubstitutionPreprocessor(50);
            StringWriter writer = new StringWriter();
            psp.genSubstitutedFile(new BufferedReader(new InputStreamReader(new FileInputStream(fileName))),
                                   writer, 
                                   paramList.size() > 0 ? paramList.toArray(new String[0]) : null,
                                   paramsFiles!=null ? paramsFiles.toArray(new String[0]) : null);
           
            GruntParser grunt = new GruntParser(new StringReader(writer.toString()));
            grunt.setInteractive(false);
            grunt.setParams(this);
            grunt.parseStopOnError(true);
        } catch (FileNotFoundException e) {
            log.error(e.getLocalizedMessage());
            throw new IOException(e.getCause());
        } catch (org.apache.pig.tools.pigscript.parser.ParseException e) {
            log.error(e.getLocalizedMessage());
            throw new IOException(e.getCause());
        } catch (org.apache.pig.tools.parameters.ParseException e) {
            log.error(e.getLocalizedMessage());
            throw new IOException(e.getCause());
        }
    }
    /**
     * Intended to be used by unit tests only.
     * Print a list of all aliases in the current Pig Latin script.  Output is written to
     * System.out.
     * @throws FrontendException
     */
    public void printAliases () throws FrontendException {
        System.out.println("aliases: " + currDAG.getAliasOp().keySet());
    }

    /**
     * Write the schema for an alias to System.out.
     * @param alias Alias whose schema will be written out
     * @return Schema of alias dumped
     * @throws IOException
     */
    public Schema dumpSchema(String alias) throws IOException{
        try {
            LogicalPlan lp = getPlanFromAlias(alias, "describe");
            lp = compileLp(alias, false);
            Schema schema = null;
            for(LogicalOperator lo : lp.getLeaves()){
                if(lo.getAlias().equals(alias)){
                    schema = lo.getSchema();
                    break;
                }
            }
            if (schema != null) System.out.println(alias + ": " + schema.toString());   
            else System.out.println("Schema for " + alias + " unknown.");
            return schema;
        } catch (FrontendException fee) {
            int errCode = 1001;
            String msg = "Unable to describe schema for alias " + alias;
            throw new FrontendException (msg, errCode, PigException.INPUT, false, null, fee);
        }
    }
   
    /**
     * Write the schema for a nestedAlias to System.out. Denoted by alias::nestedAlias.
     * @param alias Alias whose schema has nestedAlias
     * @param nestedAlias Alias whose schema will be written out
     * @return Schema of alias dumped
     * @throws IOException
     */
    public Schema dumpSchemaNested(String alias, String nestedAlias) throws IOException{
        LogicalPlan lp = getPlanFromAlias(alias, "describe");
        lp = compileLp(alias, false);
        LogicalOperator op = lp.getLeaves().get(0);
        if(op instanceof LOForEach) {
            return ((LOForEach)op).dumpNestedSchema(alias, nestedAlias);
        }
        else {
            int errCode = 1001;
            String msg = "Unable to describe schema for " + alias + "::" + nestedAlias;
            throw new FrontendException (msg, errCode, PigException.INPUT, false, null);
        }
    }

    /**
     * Set the name of the job.  This name will get translated to mapred.job.name.
     * @param name of job
     */
    public void setJobName(String name){
        currDAG.setJobName(name);
    }
   
    /**
     * Set Hadoop job priority.  This value will get translated to mapred.job.priority.
     * @param priority valid values are found in {@link org.apache.hadoop.mapred.JobPriority}
     */
    public void setJobPriority(String priority){
        currDAG.setJobPriority(priority);
    }

    /**
     * Executes a Pig Latin script up to and including the indicated alias.  That is, if a user does:
     * <pre>
     * PigServer server = new PigServer();
     * server.registerQuery("A = load 'foo';");
     * server.registerQuery("B = filter A by $0 &gt; 0;");
     * server.registerQuery("C = order B by $1;");
     * </pre>
     * Then
     * <pre>
     * server.openIterator("B");
     * </pre>
     * filtered but unsorted data will be returned.  If instead a user does
     * <pre>
     * server.openIterator("C");
     * </pre>
     * filtered and sorted data will be returned.
     * @param id Alias to open iterator for
     * @return iterator of tuples returned from the script
     * @throws IOException
     */
    public Iterator<Tuple> openIterator(String id) throws IOException {
        try {
            LogicalOperator op = currDAG.getAliasOp().get(id);
            if(null == op) {
                int errCode = 1003;
                String msg = "Unable to find an operator for alias " + id;
                throw new FrontendException(msg, errCode, PigException.INPUT);
            }

            if (currDAG.isBatchOn()) {
                currDAG.execute();
            }
           
            ExecJob job = store(id, FileLocalizer.getTemporaryPath(pigContext)
                    .toString(), Utils.getTmpFileCompressorName(pigContext) + "()");
           
            // invocation of "execute" is synchronous!

            if (job.getStatus() == JOB_STATUS.COMPLETED) {
                return job.getResults();
            } else if (job.getStatus() == JOB_STATUS.FAILED
                       && job.getException() != null) {
                // throw the backend exception in the failed case
                Exception e = job.getException();
                int errCode = 1066;
                String msg = "Unable to open iterator for alias " + id +
                ". Backend error : " + e.getMessage();
                throw new FrontendException(msg, errCode, PigException.INPUT, e);
            } else {
                throw new IOException("Job terminated with anomalous status "
                    + job.getStatus().toString());
            }
        }
        catch(FrontendException e){
            throw e;
        }
        catch (Exception e) {
            int errCode = 1066;
            String msg = "Unable to open iterator for alias " + id ;
            throw new FrontendException(msg, errCode, PigException.INPUT, e);
        }
    }
   
    /**
     * Executes a Pig Latin script up to and including the indicated alias and stores the resulting
     * records into a file.  That is, if a user does:
     * <pre>
     * PigServer server = new PigServer();
     * server.registerQuery("A = load 'foo';");
     * server.registerQuery("B = filter A by $0 &gt; 0;");
     * server.registerQuery("C = order B by $1;");
     * </pre>
     * Then
     * <pre>
     * server.store("B", "bar");
     * </pre>
     * filtered but unsorted data will be stored to the file <tt>bar</tt>.  If instead a user does
     * <pre>
     * server.store("C", "bar");
     * </pre>
     * filtered and sorted data will be stored to the file <tt>bar</tt>.
     * Equivalent to calling {@link #store(String, String, String)} with
     * <tt>org.apache.pig.PigStorage</tt> as the store function.
     * @param id The alias to store
     * @param filename The file to which to store to
     * @return {@link ExecJob} containing information about this job
     * @throws IOException
     */
    public ExecJob store(String id, String filename) throws IOException {
        return store(id, filename, PigStorage.class.getName() + "()");   // PigStorage is the default store function
    }
       
    /**
     * Executes a Pig Latin script up to and including the indicated alias and stores the resulting
     * records into a file.  That is, if a user does:
     * <pre>
     * PigServer server = new PigServer();
     * server.registerQuery("A = load 'foo';");
     * server.registerQuery("B = filter A by $0 &gt; 0;");
     * server.registerQuery("C = order B by $1;");
     * </pre>
     * Then
     * <pre>
     * server.store("B", "bar", "mystorefunc");
     * </pre>
     * filtered but unsorted data will be stored to the file <tt>bar</tt> using
     * <tt>mystorefunc</tt>.  If instead a user does
     * <pre>
     * server.store("C", "bar", "mystorefunc");
     * </pre>
     * filtered and sorted data will be stored to the file <tt>bar</tt> using
     * <tt>mystorefunc</tt>.
     * <p>
     * @param id The alias to store
     * @param filename The file to which to store to
     * @param func store function to use
     * @return {@link ExecJob} containing information about this job
     * @throws IOException
     */
    public ExecJob store(String id, String filename, String func)
            throws IOException {
        PigStats stats = storeEx(id, filename, func);
        if (stats.getOutputStats().size() < 1) {
            throw new IOException("Couldn't retrieve job.");
        }
        OutputStats output = stats.getOutputStats().get(0);

        if(stats.isSuccessful()){
            return  new HJob(JOB_STATUS.COMPLETED, pigContext, output
                    .getPOStore(), output.getAlias(), stats);
        }else{
            HJob job = new HJob(JOB_STATUS.FAILED, pigContext,
                    output.getPOStore(), output.getAlias(), stats);
           
            //check for exception
            Exception ex = null;
            for(JobStats js : stats.getJobGraph()){
                if(js.getException() != null)
                    ex = js.getException();
            }
            job.setException(ex);
            return job;
        }

    }
      
    private PigStats storeEx(
            String id,
            String filename,
            String func) throws IOException {
        if (!currDAG.getAliasOp().containsKey(id)) {
            throw new IOException("Invalid alias: " + id);
        }

        try {
            Graph g = getClonedGraph();
            LogicalPlan lp = g.getPlan(id);

            // MRCompiler needs a store to be the leaf - hence
            // add a store to the plan
           
            // figure out the leaf to which the store needs to be added
            List<LogicalOperator> leaves = lp.getLeaves();
            LogicalOperator leaf = null;
            if(leaves.size() == 1) {
                leaf = leaves.get(0);
            } else {
                for (Iterator<LogicalOperator> it = leaves.iterator(); it.hasNext();) {
                    LogicalOperator leafOp = it.next();
                    if(leafOp.getAlias().equals(id))
                        leaf = leafOp;
                }
            }
           
            LogicalPlan unCompiledstorePlan = QueryParser.generateStorePlan(
                    scope, lp, filename, func, leaf, leaf.getAlias(),
                    pigContext);
            LogicalPlan storePlan = compileLp(unCompiledstorePlan, g, true);
           
            return executeCompiledLogicalPlan(storePlan);
        } catch (PigException e) {
            int errCode = 1002;
            String msg = "Unable to store alias " + id;
            throw new PigException(msg, errCode, PigException.INPUT, e);
        }  
    }
   
    /**
     * Provide information on how a pig query will be executed.  For now
     * this information is very developer-focused, and probably not very
     * useful to the average user.
     * @param alias Name of alias to explain.
     * @param stream PrintStream to write explanation to.
     * @throws IOException if the requested alias cannot be found.
     */
    public void explain(String alias,
                        PrintStream stream) throws IOException {
        explain(alias, "text", true, false, stream, stream, stream);
    }

    /**
     * Provide information on how a pig query will be executed.
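     * For example, to write all three plans in DOT format to one stream:
     * <pre>
     * server.explain("C", "dot", true, false, System.out, System.out, System.out);
     * </pre>
     *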
     * @param alias Name of alias to explain.
     * @param format Format in which the explain should be printed.  If text, then the plan will
     * be printed in plain text.  Otherwise, the execution plan will be printed in
     * <a href="http://en.wikipedia.org/wiki/DOT_language">DOT</a> format.
     * @param verbose Controls the amount of information printed
     * @param markAsExecute When set, the explain is treated like a
     * call to execute in the respect that all the pending stores are
     * marked as complete.
     * @param lps Stream to print the logical tree
     * @param pps Stream to print the physical tree
     * @param eps Stream to print the execution tree
     * @throws IOException if the requested alias cannot be found.
     */
    @SuppressWarnings("unchecked")
    public void explain(String alias,
                        String format,
                        boolean verbose,
                        boolean markAsExecute,
                        PrintStream lps,
                        PrintStream pps,
                        PrintStream eps) throws IOException {
        try {
            pigContext.inExplain = true;
            LogicalPlan lp = getStorePlan(alias);
            if (lp.size() == 0) {
                lps.println("Logical plan is empty.");
                pps.println("Physical plan is empty.");
                eps.println("Execution plan is empty.");
                return;
            }
            PhysicalPlan pp = compilePp(lp);
            lp.explain(lps, format, verbose);
            if( pigContext.getProperties().getProperty("pig.usenewlogicalplan", "true").equals("true") ) {
                LogicalPlanMigrationVistor migrator = new LogicalPlanMigrationVistor(lp);
                migrator.visit();
                org.apache.pig.newplan.logical.relational.LogicalPlan newPlan = migrator.getNewLogicalPlan();
               
                HashSet<String> optimizerRules = null;
                try {
                    optimizerRules = (HashSet<String>) ObjectSerializer
                            .deserialize(pigContext.getProperties().getProperty(
                                    "pig.optimizer.rules"));
                } catch (IOException ioe) {
                    int errCode = 2110;
                    String msg = "Unable to deserialize optimizer rules.";
                    throw new FrontendException(msg, errCode, PigException.BUG, ioe);
                }
               
                LogicalPlanOptimizer optimizer = new LogicalPlanOptimizer(newPlan, 3, optimizerRules);
                optimizer.optimize();               
               
                newPlan.explain(lps, format, verbose);
            }
            pp.explain(pps, format, verbose);
            pigContext.getExecutionEngine().explain(pp, eps, format, verbose);
            if (markAsExecute) {
                currDAG.markAsExecuted();
            }
        } catch (Exception e) {
            int errCode = 1067;
            String msg = "Unable to explain alias " + alias;
            throw new FrontendException(msg, errCode, PigException.INPUT, e);
        } finally {
            pigContext.inExplain = false;
        }
    }

    /**
     * Returns the unused byte capacity of an HDFS filesystem. This value does
     * not take into account a replication factor, as that can vary from file
     * to file. Thus if you are using this to determine whether your data set will fit
     * in HDFS, you need to divide the result of this call by your specific replication
     * setting.
     * @return unused byte capacity of the file system.
     * @throws IOException
     */
    public long capacity() throws IOException {
        if (pigContext.getExecType() == ExecType.LOCAL) {
            throw new IOException("capacity only supported for non-local execution");
        }
        else {
            DataStorage dds = pigContext.getDfs();
           
            Map<String, Object> stats = dds.getStatistics();

            String rawCapacityStr = (String) stats.get(DataStorage.RAW_CAPACITY_KEY);
            String rawUsedStr = (String) stats.get(DataStorage.RAW_USED_KEY);
           
            if ((rawCapacityStr == null) || (rawUsedStr == null)) {
                throw new IOException("Failed to retrieve capacity stats");
            }
           
            long rawCapacityBytes = Long.parseLong(rawCapacityStr);
            long rawUsedBytes = Long.parseLong(rawUsedStr);
           
            return rawCapacityBytes - rawUsedBytes;
        }
    }

    /**
     * Returns the length in bytes of a file that exists in HDFS, multiplied by its replication factor.
     * @param filename
     * @return length of the file in bytes
     * @throws IOException
     */
    public long fileSize(String filename) throws IOException {
        DataStorage dfs = pigContext.getDfs();
        ElementDescriptor elem = dfs.asElement(filename);
        Map<String, Object> stats = elem.getStatistics();
        long length = (Long) stats.get(ElementDescriptor.LENGTH_KEY);
        int replication = (Short) stats
                .get(ElementDescriptor.BLOCK_REPLICATION_KEY);

        return length * replication;
    }
   
    /**
     * Test whether a file exists.
     * @param filename to test
     * @return true if file exists, false otherwise
     * @throws IOException
     */
    public boolean existsFile(String filename) throws IOException {
        ElementDescriptor elem = pigContext.getDfs().asElement(filename);
        return elem.exists();
    }
   
    /**
     * Delete a file.
     * @param filename to delete
     * @return true
     * @throws IOException
     */
    public boolean deleteFile(String filename) throws IOException {
        ElementDescriptor elem = pigContext.getDfs().asElement(filename);
        elem.delete();
        return true;
    }
   
    /**
     * Rename a file.
     * @param source file to rename
     * @param target new file name
     * @return true
     * @throws IOException
     */
    public boolean renameFile(String source, String target) throws IOException {
        pigContext.rename(source, target);
        return true;
    }
   
    /**
     * Make a directory.
     * @param dirs directory to make
     * @return true
     * @throws IOException
     */
    public boolean mkdirs(String dirs) throws IOException {
        ContainerDescriptor container = pigContext.getDfs().asContainer(dirs);
        container.create();
        return true;
    }
   
    /**
     * List the contents of a directory.
     * @param dir name of directory to list
     * @return array of strings, one for each file name
     * @throws IOException
     */
    public String[] listPaths(String dir) throws IOException {
        Collection<String> allPaths = new ArrayList<String>();
        ContainerDescriptor container = pigContext.getDfs().asContainer(dir);
        Iterator<ElementDescriptor> iter = container.iterator();
           
        while (iter.hasNext()) {
            ElementDescriptor elem = iter.next();
            allPaths.add(elem.toString());
        }
           
        return allPaths.toArray(new String[allPaths.size()]);
    }
   
    /**
     * Does not work at the moment.
     */
    public long totalHadoopTimeSpent() {
//      TODO FIX Need to uncomment this with the right logic
//        return MapReduceLauncher.totalHadoopTimeSpent;
        return 0L;
    }
 
    /**
     * Return a map containing the logical plan associated with each alias.
     * @return map
     */
    public Map<String, LogicalPlan> getAliases() {
        Map<String, LogicalPlan> aliasPlans = new HashMap<String, LogicalPlan>();
        for(LogicalOperator op:  currDAG.getAliases().keySet()) {
            String alias = op.getAlias();
            if(null != alias) {
                aliasPlans.put(alias, currDAG.getAliases().get(op));
            }
        }
        return aliasPlans;
    }

    /**
     * Reclaims resources used by this instance of PigServer. This method
     * deletes all temporary files generated by the current thread while
     * executing Pig commands.
     */
    public void shutdown() {
        // clean-up activities
        // TODO: reclaim scope to free up resources. Currently
        // this is not implemented and throws an exception
        // hence, for now, we won't call it.
        //
        // pigContext.getExecutionEngine().reclaimScope(this.scope);

        FileLocalizer.deleteTempFiles();
    }

    /**
     * Get the set of all current aliases.
     * @return set
     */
    public Set<String> getAliasKeySet() {
        return currDAG.getAliasOp().keySet();
    }

    public Map<LogicalOperator, DataBag> getExamples(String alias) {
        LogicalPlan plan = null;

        try {       
            if (currDAG.isBatchOn()) {
                currDAG.execute();
            }
           
            plan = getClonedGraph().getPlan(alias);
        } catch (IOException e) {
            //Since the original script is parsed anyway, there should not be an
            //error in this parsing. The only reason there can be an error is when
            //the files referenced by the load statements don't exist anymore.
            e.printStackTrace();
        }
        ExampleGenerator exgen = new ExampleGenerator(plan, pigContext);
        return exgen.getExamples();
    }

    private LogicalPlan getStorePlan(String alias) throws IOException {
        Graph g = getClonedGraph();
        LogicalPlan lp = g.getPlan(alias);
       
        if (!isBatchOn() || alias != null) {
            // MRCompiler needs a store to be the leaf - hence
            // add a store to the plan to explain
           
            // figure out the leaves to which stores need to be added
            List<LogicalOperator> leaves = lp.getLeaves();
            LogicalOperator leaf = null;
            if(leaves.size() == 1) {
                leaf = leaves.get(0);
            } else {
                for (Iterator<LogicalOperator> it = leaves.iterator(); it.hasNext();) {
                    LogicalOperator leafOp = it.next();
                    if(leafOp.getAlias().equals(alias))
                        leaf = leafOp;
                }
            }
           
            lp = QueryParser.generateStorePlan(scope, lp, "fakefile",
                                               PigStorage.class.getName(), leaf, "fake", pigContext);
        }
       
        compileLp(lp, g, true);
       
        return lp;
    }
   
    private PigStats execute(String alias) throws FrontendException, ExecException {
        LogicalPlan typeCheckedLp = compileLp(alias);

        if (typeCheckedLp.size() == 0) {
            return PigStatsUtil.getEmptyPigStats();
        }

        LogicalOperator op = typeCheckedLp.getLeaves().get(0);
        if (op instanceof LODefine) {
            log.info("Skip execution of DEFINE only logical plan.");
            return PigStatsUtil.getEmptyPigStats();
        }

        return executeCompiledLogicalPlan(typeCheckedLp);
    }
   
    private PigStats executeCompiledLogicalPlan(LogicalPlan compiledLp) throws ExecException, FrontendException {
        // discover pig features used in this script
        ScriptState.get().setScriptFeatures(compiledLp);
        PhysicalPlan pp = compilePp(compiledLp);
        // execute using appropriate engine
        List<ExecJob> jobs = pigContext.getExecutionEngine().execute(pp, "job_pigexec_");
        PigStats stats = null;
        if (jobs.size() > 0) {
            stats = jobs.get(0).getStatistics();
        } else {
            stats = PigStatsUtil.getEmptyPigStats();
        }
        for (OutputStats output : stats.getOutputStats()) {
            if (!output.isSuccessful()) {
                POStore store = output.getPOStore();
                try {
                    store.getStoreFunc().cleanupOnFailure(store.getSFile().getFileName(),
                            new Job(output.getConf()));
                } catch (IOException e) {
                    throw new ExecException(e);
                }
            }
        }
        return stats;
    }

    private LogicalPlan compileLp(
            String alias) throws FrontendException {
        return compileLp(alias, true);
    }

    private LogicalPlan compileLp(
            String alias,
            boolean optimize) throws FrontendException {
       
        // create a clone of the logical plan and give it
        // to the operations below
        LogicalPlan lpClone;
        Graph g;
        try {
            g = getClonedGraph();
            lpClone = g.getPlan(alias);
        } catch (IOException e) {
            int errCode = 2001;
            String msg = "Unable to clone plan before compiling";
            throw new FrontendException(msg, errCode, PigException.BUG, e);
        }
        return compileLp(lpClone, g, optimize);
    }
   
    private void mergeScalars(LogicalPlan lp, Graph g) throws FrontendException {
        // When we start processing a store, we look for scalar references so
        // that we can add stores to the referenced logical plans and record
        // the temporary files in the scalar attributes. We also need to detect
        // duplicates so that we do not add two stores for the same plan.
        ScalarFinder scalarFinder = new ScalarFinder(lp);
        scalarFinder.visit();

        Map<LOUserFunc, Pair<LogicalPlan, LogicalOperator>> scalarMap = scalarFinder.getScalarMap();

        try {
            for(Map.Entry<LOUserFunc, Pair<LogicalPlan, LogicalOperator>> scalarEntry: scalarMap.entrySet()) {
                FileSpec fileSpec;
                String alias = scalarEntry.getKey().getImplicitReferencedOperator().getAlias();
                LogicalOperator store;

                LogicalPlan referredPlan = g.getAliases().get(g.getAliasOp().get(alias));

                // If referredPlan already has a store,
                // we just use it instead of adding one from our pocket
                store = referredPlan.getLeaves().get(0);
                if(store instanceof LOStore
                        &&
                        ((LOStore)store).getOutputFile().getFuncName().equals(
                                InterStorage.class.getName())                                           
                ) {
                        // use this store
                        fileSpec = ((LOStore)store).getOutputFile();
                }
                else {
                    // add new store
                    FuncSpec funcSpec = new FuncSpec(InterStorage.class.getName());
                    fileSpec = new FileSpec(FileLocalizer.getTemporaryPath(pigContext).toString(), funcSpec);
                    store = new LOStore(referredPlan, new OperatorKey(scope, NodeIdGenerator.getGenerator().getNextNodeId(scope)),
                            fileSpec, alias);
                    referredPlan.addAsLeaf(store);
                    ((LOStore)store).setTmpStore(true);
                    scalarEntry.getKey().setImplicitReferencedOperator(store);
                }
                lp.mergeSharedPlan(referredPlan);

                // Attach a constant operator to the ReadScalar func
                LogicalPlan innerPlan = scalarEntry.getValue().first;
                LOConst rconst = new LOConst(innerPlan, new OperatorKey(scope, NodeIdGenerator.getGenerator().getNextNodeId(scope)), fileSpec.getFileName());
                rconst.setType(DataType.CHARARRAY);

                innerPlan.add(rconst);
                innerPlan.connect(rconst, scalarEntry.getKey());
               
                if (lp.getSoftLinkSuccessors(store)==null || !lp.getSoftLinkSuccessors(store).contains(scalarEntry.getValue().second))
                    lp.createSoftLink(store, scalarEntry.getValue().second);
            }
        } catch (IOException ioe) {
            int errCode = 2219;
            String msg = "Unable to process scalar in the plan";
            throw new FrontendException(msg, errCode, PigException.BUG, ioe);
        }
    }
   
    private LogicalPlan compileLp(LogicalPlan lp, Graph g, boolean optimize) throws FrontendException {
        mergeScalars(lp, g);
       
        return compileLp(lp, optimize);
    }
   
    @SuppressWarnings("unchecked")
    private LogicalPlan compileLp(LogicalPlan lp, boolean optimize) throws
    FrontendException {
        // Set the logical plan values correctly in all the operators
        PlanSetter ps = new PlanSetter(lp);
        ps.visit();
       
        UnionOnSchemaSetter setUnionOnSchema = new UnionOnSchemaSetter(lp, pigContext);
        setUnionOnSchema.visit();
       
        // run through validator
        CompilationMessageCollector collector = new CompilationMessageCollector() ;
        boolean isBeforeOptimizer = true;
        validate(lp, collector, isBeforeOptimizer);
       
        // optimize
        if (optimize && pigContext.getProperties().getProperty("pig.usenewlogicalplan", "true").equals("false")) {
            HashSet<String> optimizerRules = null;
            try {
                optimizerRules = (HashSet<String>) ObjectSerializer
                        .deserialize(pigContext.getProperties().getProperty(
                                "pig.optimizer.rules"));
            } catch (IOException ioe) {
                int errCode = 2110;
                String msg = "Unable to deserialize optimizer rules.";
                throw new FrontendException(msg, errCode, PigException.BUG, ioe);
            }

            LogicalOptimizer optimizer = new LogicalOptimizer(lp, pigContext.getExecType(), optimizerRules);
            optimizer.optimize();
           
            // compute whether output data is sorted or not
            SortInfoSetter sortInfoSetter = new SortInfoSetter(lp);
            sortInfoSetter.visit();
           
            // run validations to be done after optimization
            isBeforeOptimizer = false;
            validate(lp, collector, isBeforeOptimizer);
        }
       
        return lp;
    }

    private PhysicalPlan compilePp(LogicalPlan lp) throws FrontendException {
        // translate lp to physical plan
        PhysicalPlan pp = pigContext.getExecutionEngine().compile(lp, null);

        // TODO optimize

        return pp;
    }

    private void validate(LogicalPlan lp, CompilationMessageCollector collector,
            boolean isBeforeOptimizer) throws FrontendException {
        FrontendException caught = null;
        try {
            LogicalPlanValidationExecutor validator =
                new LogicalPlanValidationExecutor(lp, pigContext, isBeforeOptimizer);
            validator.validate(lp, collector);
        } catch (FrontendException fe) {
            // Need to go through and see what the collector has in it.  But
            // remember what we've caught so we can wrap it into what we
            // throw.
            caught = fe;           
        }
       
        if(aggregateWarning) {
            CompilationMessageCollector.logMessages(collector, MessageType.Warning, aggregateWarning, log);
        } else {
            CompilationMessageCollector.logAllMessages(collector, log);
        }
       
        if (caught != null) {
            throw caught;
        }
    }
    private LogicalPlan getPlanFromAlias(
            String alias,
            String operation) throws FrontendException {
        LogicalOperator lo = currDAG.getAliasOp().get(alias);
        if (lo == null) {
            int errCode = 1004;
            String msg = "No alias " + alias + " to " + operation;
            throw new FrontendException(msg, errCode, PigException.INPUT, false, null);
        }
        LogicalPlan lp = currDAG.getAliases().get(lo);
        if (lp == null) {
            int errCode = 1005;
            String msg = "No plan for " + alias + " to " + operation;
            throw new FrontendException(msg, errCode, PigException.INPUT, false, null);
        }       
        return lp;
    }
   
    public static class SortInfoSetter extends LOVisitor{

        public SortInfoSetter(LogicalPlan plan) {
            super(plan, new DependencyOrderWalker<LogicalOperator, LogicalPlan>(plan));
        }

        @Override
        protected void visit(LOStore store) throws VisitorException {
           
            LogicalOperator storePred = store.getPlan().getPredecessors(store).get(0);
            if(storePred == null){
                int errCode = 2051;
                String msg = "Did not find a predecessor for Store." ;
                throw new VisitorException(msg, errCode, PigException.BUG);   
            }
           
            SortInfo sortInfo = null;
            if(storePred instanceof LOLimit) {
                storePred = store.getPlan().getPredecessors(storePred).get(0);
            } else if (storePred instanceof LOSplitOutput) {
                LOSplitOutput splitOutput = (LOSplitOutput)storePred;
                // We assume this is the LOSplitOutput we injected for this case:
                // b = order a by $0; store b into '1'; store b into '2';
                // In this case, we should mark both '1' and '2' as sorted
                LogicalPlan conditionPlan = splitOutput.getConditionPlan();
                if (conditionPlan.getRoots().size()==1) {
                    LogicalOperator root = conditionPlan.getRoots().get(0);
                    if (root instanceof LOConst) {
                        Object value = ((LOConst)root).getValue();
                        if (value instanceof Boolean && (Boolean)value==true) {
                            LogicalOperator split = splitOutput.getPlan().getPredecessors(splitOutput).get(0);
                            if (split instanceof LOSplit)
                                storePred = store.getPlan().getPredecessors(split).get(0);
                        }
                    }
                }
            }
            // if this predecessor is a sort, get
            // the sort info.
            if(storePred instanceof LOSort) {
                try {
                    sortInfo = ((LOSort)storePred).getSortInfo();
                } catch (FrontendException e) {
                    throw new VisitorException(e);
                }
            }
            store.setSortInfo(sortInfo);
        }
    }
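
    // A minimal usage sketch for SortInfoSetter (the "plan" variable is
    // hypothetical): walk a compiled LogicalPlan and annotate each LOStore
    // with the SortInfo of an upstream LOSort, if any:
    //
    //     new SortInfoSetter(plan).visit();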

    /**
     * This class holds the internal state of a grunt shell session.
     */
    private class Graph {
       
        private Map<LogicalOperator, LogicalPlan> aliases = new HashMap<LogicalOperator, LogicalPlan>();
       
        private Map<OperatorKey, LogicalOperator> opTable = new HashMap<OperatorKey, LogicalOperator>();
       
        private Map<String, LogicalOperator> aliasOp = new HashMap<String, LogicalOperator>();
      
        private List<String> scriptCache = new ArrayList<String>();

        // fileNameMap maps each filename to its canonical filename. We keep
        // these mappings so that, when the cached script is re-parsed for
        // cloning, the original translation is remembered (the current
        // directory might only be correct during the first parse).
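        // For example (illustrative values only):
        //     "students.txt" -> "file:/home/user/students.txt"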
        private Map<String, String> fileNameMap = new HashMap<String, String>();
   
        private Map<LOStore, LogicalPlan> storeOpTable = new HashMap<LOStore, LogicalPlan>();
       
        private Set<LOLoad> loadOps = new HashSet<LOLoad>();

        private String jobName;
       
        private String jobPriority;

        private boolean batchMode;

        private int processedStores;

        private int ignoreNumStores;
       
        private LogicalPlan lp;
       
        Graph(boolean batchMode) {
            this.batchMode = batchMode;
            this.processedStores = 0;
            this.ignoreNumStores = 0;
            this.jobName = pigContext.getProperties().getProperty(PigContext.JOB_NAME,
                                                                  PigContext.JOB_NAME_PREFIX+":DefaultJobName");
            this.lp = new LogicalPlan();
        }
       
        Map<LogicalOperator, LogicalPlan> getAliases() { return aliases; }
       
        Map<OperatorKey, LogicalOperator> getOpTable() { return opTable; }
       
        Map<String, LogicalOperator> getAliasOp() { return aliasOp; }
       
        List<String> getScriptCache() { return scriptCache; }
       
        boolean isBatchOn() { return batchMode; }

        boolean isBatchEmpty() { return processedStores == storeOpTable.keySet().size(); }
       
        PigStats execute() throws ExecException, FrontendException {
            pigContext.getProperties().setProperty(PigContext.JOB_NAME, jobName);
            if (jobPriority != null) {
              pigContext.getProperties().setProperty(PigContext.JOB_PRIORITY, jobPriority);
            }
           
            PigStats stats = PigServer.this.execute(null);
            processedStores = storeOpTable.keySet().size();
            return stats;
        }

        void markAsExecuted() {
            processedStores = storeOpTable.keySet().size();
        }

        void setJobName(String name) {
            jobName = PigContext.JOB_NAME_PREFIX+":"+name;
        }

        public void setJobPriority(String priority){
            jobPriority = priority;
        }

        LogicalPlan getPlan(String alias) throws IOException {
            LogicalPlan plan = lp;
               
            if (alias != null) {
                LogicalOperator op = aliasOp.get(alias);
                if(op == null) {
                    int errCode = 1003;
                    String msg = "Unable to find an operator for alias " + alias;
                    throw new FrontendException(msg, errCode, PigException.INPUT);
                }
                plan = aliases.get(op);
            }
            return plan;
        }

        void registerQuery(String query, int startLine) throws IOException {
           
            LogicalPlan tmpLp = parseQuery(query, startLine);
           
            // store away the query for use in cloning later
            scriptCache.add(query);
            if (tmpLp.getLeaves().size() == 1) {
                LogicalOperator op = tmpLp.getSingleLeafPlanOutputOp();
               
                // Check if we just processed a LOStore i.e. STORE
                if (op instanceof LOStore) {

                    if (!batchMode) {
                        lp = tmpLp;
                        try {
                            execute();
                        } catch (Exception e) {
                            int errCode = 1002;
                            String msg = "Unable to store alias "
                                    + op.getOperatorKey().getId();
                            throw new FrontendException(msg, errCode,
                                    PigException.INPUT, e);
                        }
                    } else {
                        if (0 == ignoreNumStores) {
                            storeOpTable.put((LOStore)op, tmpLp);
                            lp.mergeSharedPlan(tmpLp);
                            List<LogicalOperator> roots = tmpLp.getRoots();
                            for (LogicalOperator root : roots) {
                                if (root instanceof LOLoad) {
                                    loadOps.add((LOLoad)root);
                                }
                            }

                        } else {
                            --ignoreNumStores;
                        }
                    }
                }
            }
        }       
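
        // Illustrative behavior of registerQuery: with batch mode on (e.g.
        // after setBatchOn()), a script such as
        //     a = load 'in'; store a into 'out1'; store a into 'out2';
        // accumulates both stores into the shared plan, to be run by a single
        // execute() call, whereas interactively each STORE triggers execute()
        // as soon as it is registered.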
   
        LogicalPlan parseQuery(String query, int startLine) throws IOException {       
            if (query == null || query.length() == 0) {
                int errCode = 1084;
                String msg = "Invalid Query: Query is null or of size 0";
                throw new FrontendException(msg, errCode, PigException.INPUT);
            }

            query = query.trim();
       
            try {
                return new LogicalPlanBuilder(PigServer.this.pigContext).parse(scope, query,
                                              aliases, opTable, aliasOp, startLine, fileNameMap);
            } catch (ParseException e) {
                PigException pe = LogUtils.getPigException(e);
                int errCode = 1000;
                String msg = "Error during parsing. " + (pe == null? e.getMessage() : pe.getMessage());
                throw new FrontendException(msg, errCode, PigException.INPUT, false, null, e);
            }
        }

        @Override
        protected Graph clone() {
            // There are two choices on how we clone the logical plan
            // 1 - we really clone each operator and connect up the cloned operators
            // 2 - we cache away the script till the point we need to clone
            // and then simply re-parse the script.
            // The latter approach is used here
            // FIXME: There is one open issue with this now:
            // Consider the following script:
            // A = load 'file:/somefile';
            // B = filter A by $0 > 10;
            // store B into 'bla';
            // rm 'file:/somefile';
            // A = load 'file:/someotherfile'
            // When we clone, we re-parse from the beginning, and the parser
            // currently checks that a file referenced in a LOAD exists when
            // it is a local one, i.e. has a file: prefix. So the re-parse of
            // the script above fails once the file has been removed. This is
            // a known issue for now and will need to be revisited later.
           
            // parse each line of the cached script
            int lineNumber = 1;
           
            // create data structures needed for parsing       
            Graph graph = new Graph(isBatchOn());
            graph.ignoreNumStores = processedStores;
            graph.processedStores = processedStores;
            graph.fileNameMap = fileNameMap;
           
            //reset udf properties
            UDFContext.getUDFContext().reset();

            try {
                for (Iterator<String> it = getScriptCache().iterator(); it.hasNext(); lineNumber++) {
                    if (isBatchOn()) {
                        graph.registerQuery(it.next(), lineNumber);
                    } else {
                        graph.lp = graph.parseQuery(it.next(), lineNumber);
                    }
                }
                graph.postProcess();
            } catch (IOException ioe) {
                // Log the failure instead of printing the stack trace to stderr.
                log.error("Failed to clone the plan by re-parsing the cached script", ioe);
                graph = null;
            }
            return graph;
        }
      
        private void postProcess() throws IOException {
           
            // Set the logical plan values correctly in all the operators
            PlanSetter ps = new PlanSetter(lp);
            ps.visit();
           
            // The following code deals with store/load combinations over
            // intermediate files. In this case we will replace the load operator
            // with an (implicit) split operator, iff the load/store
            // func is reversible (because that's when we can safely
            // skip the load and keep going with the split output). If
            // the load/store func is not reversible (or they are
            // different functions), we connect the store and the load
            // to remember the dependency, as in the sketch below.
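            // For example (illustrative script only):
            //     store A into 'intermediate';
            //     B = load 'intermediate';
            // Here the load reads what the store writes, so we connect
            // store -> load in the plan to record that the store must run first.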
            for (LOLoad load : loadOps) {
                for (LOStore store : storeOpTable.keySet()) {
                    String ifile = load.getInputFile().getFileName();
                    String ofile = store.getOutputFile().getFileName();
                    if (ofile.compareTo(ifile) == 0) {
                        try {
                            // if there is no path from the load to the store,
                            // then connect the store to the load to create the
                            // dependency of the store on the load. If there is
                            // a path from the load to the store, then we should
                            // not connect the store to the load and create a cycle
                            if(!store.getPlan().pathExists(load, store)) {
                                store.getPlan().connect(store, load);
                            }
                        } catch (PlanException ex) {
                            int errCode = 2128;
                            String msg = "Failed to connect store with dependent load.";
                            throw new FrontendException(msg, errCode, ex);
                        }
                       

                        
                        //TODO
                        //If the load has a schema, then the type cast inserter has to introduce
                        //casts to get the right types. Since the type cast inserter runs later,
                        //removing the load could create problems: for example, if the storage
                        //function does not preserve the required type information and the MR
                        //compiler subsequently introduces a load at a job boundary, the type
                        //cast insertion will be missing.
                        //As a result, check whether the store function preserves types. For now,
                        //the only storage that preserves types internally is BinStorage.
                        //In the future, storage functions should support a method for inquiring
                        //whether type information is preserved, and load functions should
                        //support a similar interface. With these interfaces in place, the code
                        //below can be used to optimize the store/load combination.
                           

                        /*                        
                        LoadFunc lFunc = (LoadFunc) pigContext.instantiateFuncFromSpec(load.getInputFile().getFuncSpec());
                        StoreFunc sFunc = (StoreFunc) pigContext.instantiateFuncFromSpec(store.getOutputFile().getFuncSpec());
                        if (lFunc.getClass() == sFunc.getClass() && lFunc instanceof ReversibleLoadStoreFunc) {
                           
                            log.info("Removing unnecessary load operation from location: "+ifile);
                           
                            // In this case we remember the input file
                            // spec in the store. We might have to use it
                            // in the MR compiler to recreate the load, if
                            // the store happens on a job boundary.
                            store.setInputSpec(load.getInputFile());

                            LogicalOperator storePred = lp.getPredecessors(store).get(0);
                           
                            Schema storePredSchema = storePred.getSchema();
                            if(storePredSchema != null) {
                                load.setSchema(storePredSchema);
                                TypeCastInserter typeCastInserter = new TypeCastInserter(lp, LOLoad.class.getName());                               
                                List<LogicalOperator> loadList = new ArrayList<LogicalOperator>();
                                loadList.add(load);
                                //the following needs a change to TypeCastInserter and LogicalTransformer
                                typeCastInserter.doTransform(loadList, false);
                            }
                           
                            lp.disconnect(store, load);
                            lp.connect(storePred, load);
                            lp.removeAndReconnectMultiSucc(load);
                           
                            List<LogicalOperator> succs = lp.getSuccessors(load);
                        } else {
                            try {
                                store.getPlan().connect(store, load);
                            } catch (PlanException ex) {
                                int errCode = 2128;
                                String msg = "Failed to connect store with dependent load.";
                                throw new FrontendException(msg, errCode, ex);
                            }   
                        }
                        */
                    }
                }
            }
        }
    }
}
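
// A minimal, illustrative usage sketch for PigServer (not part of the source
// above); the input path, aliases, and output path are hypothetical, and
// error handling is omitted:
//
//     PigServer pigServer = new PigServer(ExecType.LOCAL);
//     pigServer.setBatchOn();
//     pigServer.registerQuery("A = load 'input.txt' using PigStorage(',');");
//     pigServer.registerQuery("B = filter A by $0 > 10;");
//     pigServer.registerQuery("store B into 'output';");
//     pigServer.executeBatch();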