/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.backend.hadoop.streaming;

import java.io.File;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.Date;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.MapContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRConfiguration;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapReduce;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStream;
import org.apache.pig.impl.streaming.ExecutableManager;
import org.apache.pig.impl.streaming.StreamingCommand.Handle;
import org.apache.pig.impl.streaming.StreamingCommand.HandleSpec;

/**
* {@link HadoopExecutableManager} is a specialization of
* {@link ExecutableManager} that adds HDFS-specific support: it copies the
* secondary outputs of the managed process to HDFS and also persists the
* task's stderr logs there.
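*
* <p>An illustrative lifecycle sketch; the manager is driven by the Pig
* runtime while executing a <code>STREAM</code> operator, and
* <code>poStream</code> below stands for the {@link POStream} operator
* being executed:</p>
*
* <pre>{@code
* HadoopExecutableManager manager = new HadoopExecutableManager();
* manager.configure(poStream); // chmod the executable, cache job settings
* // The runtime then starts the external process via exec(), which also
* // opens the task's stderr file on HDFS; once the stream finishes,
* // close() copies the secondary outputs to HDFS and closes the log.
* }</pre>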
*/
public class HadoopExecutableManager extends ExecutableManager {
// The part-<partition> file name, similar to Hadoop's outputs
private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
static {
NUMBER_FORMAT.setMinimumIntegerDigits(5);
NUMBER_FORMAT.setGroupingUsed(false);
}
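
/**
* Generate the Hadoop-style name of an output file, e.g.
* <code>part-00003</code> for partition 3.
*
* @param partition partition number of the task
* @return name of the output file for the given partition
*/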
static String getOutputName(int partition) {
return "part-" + NUMBER_FORMAT.format(partition);
}

Configuration job;               // configuration of the running job
String scriptOutputDir;          // HDFS directory for the Pig script's outputs
String scriptLogDir;             // HDFS directory for the tasks' stderr logs
String taskId;                   // id of the current task-attempt
FSDataOutputStream errorStream;  // stream to this task's stderr file on HDFS
public HadoopExecutableManager() {}
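
/**
* Configure the manager for the given <code>STREAM</code> operator: set the
* execute permission on the executable if it was shipped with the job, and
* cache the job {@link Configuration}, the script's output and log
* directories and the task-id.
*
* @param stream the {@link POStream} operator to be executed
* @throws IOException if the configuration cannot be completed
* @throws ExecException if the executable's permissions cannot be changed
*/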
@Override
public void configure(POStream stream)
throws IOException, ExecException {
super.configure(stream);
// Chmod +x the executable. An absolute path points at a pre-installed
// executable we don't own; assume it already has execute permission.
File executable = new File(command.getExecutable());
if (!executable.isAbsolute()) {
try {
FileUtil.chmod(executable.toString(), "a+x");
} catch (InterruptedException ie) {
int errCode = 6013;
String msg = "Unable to chmod " + executable + ". Thread interrupted.";
throw new ExecException(msg, errCode, PigException.REMOTE_ENVIRONMENT, ie);
}
}
// Save a reference to the job's Configuration
job = PigMapReduce.sJobConfInternal.get();
// Save the output directory for the Pig Script
scriptOutputDir = job.get("pig.streaming.task.output.dir");
scriptLogDir = job.get("pig.streaming.log.dir", "_logs");
// Save the taskid
// TODO Get an equivalent property in Tez mode (currently this returns null)
taskId = job.get(MRConfiguration.TASK_ID);
}
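
/**
* Start the managed process. If the stderr of this task is to be
* persisted, first create the stderr file on HDFS and write the
* task-information header to it.
*
* @throws IOException if the process cannot be started
*/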
@Override
protected void exec() throws IOException {
// Create the HDFS file for the stderr of the task, if necessary
if (writeErrorToHDFS(command.getLogFilesLimit(), taskId)) {
try {
Path errorFile =
new Path(new Path(scriptLogDir, command.getLogDir()), taskId);
errorStream =
errorFile.getFileSystem(job).create(errorFile);
} catch (IOException ie) {
// Don't fail the task if we couldn't save its stderr on HDFS
System.err.println("Failed to create stderr file of task: " +
taskId + " in HDFS at " + scriptLogDir +
" with " + ie);
errorStream = null;
}
}
// Header for stderr file of the task
writeDebugHeader();
// Exec the command ...
super.exec();
}
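
/**
* Close the managed process, copy its secondary outputs (if any) to the
* script's output directory on HDFS and write the task-information footer
* to the stderr log.
*
* @throws IOException if a secondary output cannot be saved on HDFS
*/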
@Override
public void close() throws IOException {
try {
super.close();
// Copy the secondary outputs of the task to HDFS
if (this.scriptOutputDir == null) {
return;
}
Path scriptOutputDir = new Path(this.scriptOutputDir);
FileSystem fs = scriptOutputDir.getFileSystem(job);
List<HandleSpec> outputSpecs = command.getHandleSpecs(Handle.OUTPUT);
if (outputSpecs != null) {
// The spec at index 0 is the primary output (stdout) handled by the
// streaming pipeline itself; only the secondary outputs are copied.
for (int i = 1; i < outputSpecs.size(); ++i) {
String fileName = outputSpecs.get(i).getName();
try {
int partition = job.getInt(MRConfiguration.TASK_PARTITION, -1);
Path dst = new Path(new Path(scriptOutputDir, fileName), getOutputName(partition));
fs.copyFromLocalFile(false, true, new Path(fileName), dst);
fs.setReplication(dst, (short)job.getInt(MRConfiguration.SUMIT_REPLICATION, 3));
} catch (IOException ioe) {
int errCode = 6014;
String msg = "Failed to save secondary output '" +
fileName + "' of task: " + taskId;
throw new ExecException(msg, errCode, PigException.REMOTE_ENVIRONMENT, ioe);
}
}
}
} finally {
// Footer for stderr file of the task
writeDebugFooter();
// Close the stderr file on HDFS
if (errorStream != null) {
errorStream.close();
}
}
}
/**
* Should the stderr data of this task be persisted on HDFS?
*
* @param limit maximum number of tasks whose stderr log-files are persisted
* @param taskId id of the task
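*
* <p>For example, with a limit of 100 only tasks with ids 0 through 99 have
* their stderr persisted on HDFS.</p>
*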
* @return <code>true</code> if stderr data of task should be persisted on
* HDFS, <code>false</code> otherwise
*/
private boolean writeErrorToHDFS(int limit, String taskId) {
if (command.getPersistStderr() && taskId != null) {
// Persist stderr only for the first 'limit' tasks of the job
int tipId = TaskAttemptID.forName(taskId).getTaskID().getId();
return tipId < limit;
}
return false;
}
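
/**
* Handle an error (stderr) line from the managed process: let the
* superclass process it and, if a stderr file was opened on HDFS, persist
* the line there too.
*
* @param error error message from the managed process
*/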
@Override
protected void processError(String error) {
super.processError(error);
try {
if (errorStream != null) {
errorStream.writeBytes(error);
}
} catch (IOException ioe) {
super.processError("Failed to save error logs to HDFS with: " +
ioe);
}
}
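
/**
* Write the task-information header (command, start time and, for map
* tasks, the input splits) to the stderr stream of the task.
*/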
private void writeDebugHeader() {
processError("===== Task Information Header =====" );
processError("\nCommand: " + command);
processError("\nStart time: " + new Date(System.currentTimeMillis()));
if (job.getBoolean(MRConfiguration.TASK_IS_MAP, false)) {
// For map tasks, also log the input split(s) being processed
MapContext context = (MapContext)PigMapReduce.sJobContext;
PigSplit pigSplit = (PigSplit)context.getInputSplit();
int numPaths = pigSplit.getNumPaths();
processError("\nPigSplit contains " + numPaths + " wrappedSplits.");
StringBuilder sb = new StringBuilder();
for(int i = 0; i < numPaths; i++) {
InputSplit wrappedSplit = pigSplit.getWrappedSplit(i);
if (wrappedSplit instanceof FileSplit) {
FileSplit mapInputFileSplit = (FileSplit)wrappedSplit;
sb.append("\nInput-split: file=");
sb.append(mapInputFileSplit.getPath());
sb.append(" start-offset=");
sb.append(Long.toString(mapInputFileSplit.getStart()));
sb.append(" length=");
sb.append(Long.toString(mapInputFileSplit.getLength()));
processError(sb.toString());
sb.setLength(0);
}
}
}
processError("\n===== * * * =====\n");
}
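
/**
* Write the task-information footer (end time, exit code and the
* input/output statistics of the managed process) to the stderr stream of
* the task.
*/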
private void writeDebugFooter() {
processError("===== Task Information Footer =====");
processError("\nEnd time: " + new Date(System.currentTimeMillis()));
processError("\nExit code: " + exitCode);
List<HandleSpec> inputSpecs = command.getHandleSpecs(Handle.INPUT);
HandleSpec inputSpec =
(inputSpecs != null) ? inputSpecs.get(0) : null;
if (inputSpec == null ||
!inputSpec.getSpec().contains("BinaryStorage")) {
processError("\nInput records: " + inputRecords);
}
processError("\nInput bytes: " + inputBytes + " bytes " +
((inputSpec != null) ?
"(" + inputSpec.getName() + " using " +
inputSpec.getSpec() + ")"
: ""));
List<HandleSpec> outputSpecs = command.getHandleSpecs(Handle.OUTPUT);
HandleSpec outputSpec =
(outputSpecs != null) ? outputSpecs.get(0) : null;
if (outputSpec == null ||
!outputSpec.getSpec().contains("BinaryStorage")) {
processError("\nOutput records: " + outputRecords);
}
processError("\nOutput bytes: " + outputBytes + " bytes " +
((outputSpec != null) ?
"(" + outputSpec.getName() + " using " +
outputSpec.getSpec() + ")"
: ""));
if (outputSpecs != null) {
// Report the on-disk size of each secondary output
for (int i = 1; i < outputSpecs.size(); ++i) {
HandleSpec spec = outputSpecs.get(i);
processError("\n " + new File(spec.getName()).length()
+ " bytes using " + spec.getSpec());
}
}
processError("\n===== * * * =====\n");
}
}