
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.backend.hadoop.streaming;

import java.io.File;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.Date;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.MapContext;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRConfiguration;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapReduce;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStream;
import org.apache.pig.impl.streaming.ExecutableManager;
import org.apache.pig.impl.streaming.StreamingCommand.Handle;
import org.apache.pig.impl.streaming.StreamingCommand.HandleSpec;

/**
 * {@link HadoopExecutableManager} is a specialization of
 * {@link ExecutableManager} that adds HDFS-specific support for secondary
 * outputs, task logs, etc.
 *
 * <code>HadoopExecutableManager</code> copies the secondary outputs of the
 * managed process to HDFS and also persists the logs of the tasks there.
 */
public class HadoopExecutableManager extends ExecutableManager {
    // The part-<partition> file name, similar to Hadoop's outputs
    private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
    static {
      NUMBER_FORMAT.setMinimumIntegerDigits(5);
      NUMBER_FORMAT.setGroupingUsed(false);
    }

    static String getOutputName(int partition) {
      return "part-" + NUMBER_FORMAT.format(partition);
    }
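    // Example: getOutputName(5) returns "part-00005"; NUMBER_FORMAT pads the
    // partition number to five digits, mirroring Hadoop's part-file naming.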

    Configuration job;
   
    String scriptOutputDir;
    String scriptLogDir;
    String taskId;
   
    FSDataOutputStream errorStream;
   
    public HadoopExecutableManager() {}
   
    public void configure(POStream stream)
    throws IOException, ExecException {
        super.configure(stream);
       
        // Chmod +x the executable if it was shipped with the job (a relative
        // path); an absolute path points at a pre-installed binary we don't
        // own and hope is already executable.
        File executable = new File(command.getExecutable());
        if (!executable.isAbsolute()) {
            try {
                FileUtil.chmod(executable.toString(), "a+x");
            } catch (InterruptedException ie) {
                int errCode = 6013;
                String msg = "Unable to chmod " + executable + ". Thread interrupted.";
                throw new ExecException(msg, errCode, PigException.REMOTE_ENVIRONMENT, ie);
            }
        }
       
        // Save a copy of the JobConf
        job = PigMapReduce.sJobConfInternal.get();
       
        // Save the output directory for the Pig Script
        scriptOutputDir = job.get("pig.streaming.task.output.dir");
        scriptLogDir = job.get("pig.streaming.log.dir", "_logs");
       
        // Save the taskid
        // TODO Get an equivalent property in Tez mode (currently this returns null)
        taskId = job.get(MRConfiguration.TASK_ID);
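        // Illustrative (assumption): in MR mode this is a task-attempt id of
        // the form "attempt_<jobid>_m_<task>_<attempt>", e.g.
        // "attempt_201801010000_0001_m_000005_0".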
    }
   
    protected void exec() throws IOException {
        // Create the HDFS file for the stderr of the task, if necessary
        if (writeErrorToHDFS(command.getLogFilesLimit(), taskId)) {
            try {
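                // Illustrative: the stderr file lands at
                // <scriptLogDir>/<command log dir>/<taskId>, e.g.
                // "_logs/mycmd/attempt_201801010000_0001_m_000005_0"
                // (hypothetical command log dir "mycmd").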
                Path errorFile =
                    new Path(new Path(scriptLogDir, command.getLogDir()), taskId);
                errorStream =
                    errorFile.getFileSystem(job).create(errorFile);
            } catch (IOException ie) {
                // Don't fail the task if we couldn't save its stderr on HDFS
                System.err.println("Failed to create stderr file of task: " +
                                   taskId + " in HDFS at " + scriptLogDir +
                                   " with " + ie);
                errorStream = null;
            }
        }
       
        // Header for stderr file of the task
        writeDebugHeader();

        // Exec the command ...
        super.exec();
    }

    public void close() throws IOException {
        try {
            super.close();

            // Copy the secondary outputs of the task to HDFS
            if (this.scriptOutputDir == null) {
                return;
            }
            Path scriptOutputDir = new Path(this.scriptOutputDir);
            FileSystem fs = scriptOutputDir.getFileSystem(job);
            List<HandleSpec> outputSpecs = command.getHandleSpecs(Handle.OUTPUT);
            if (outputSpecs != null) {
                // Index 0 is the primary output (stdout), handled by the
                // streaming pipeline itself; only the secondary outputs are
                // copied here.
                for (int i = 1; i < outputSpecs.size(); ++i) {
                    String fileName = outputSpecs.get(i).getName();
                    try {
                        int partition = job.getInt(MRConfiguration.TASK_PARTITION, -1);
                        Path dst = new Path(new Path(scriptOutputDir, fileName),
                                            getOutputName(partition));
                        fs.copyFromLocalFile(false, true, new Path(fileName), dst);
                        // (sic) SUMIT_REPLICATION is the constant's actual
                        // name in Pig's MRConfiguration.
                        fs.setReplication(dst,
                            (short) job.getInt(MRConfiguration.SUMIT_REPLICATION, 3));
                    } catch (IOException ioe) {
                        int errCode = 6014;
                        String msg = "Failed to save secondary output '" +
                                     fileName + "' of task: " + taskId;
                        throw new ExecException(msg, errCode, PigException.REMOTE_ENVIRONMENT, ioe);
                    }
                }
            }
        } finally {
            // Footer for stderr file of the task
            writeDebugFooter();

            // Close the stderr file on HDFS
            if (errorStream != null) {
                errorStream.close();
            }
        }
    }
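    // Illustrative layout (hypothetical values): a secondary output handle
    // named "side.out", written by task partition 5, ends up at
    //   <scriptOutputDir>/side.out/part-00005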

    /**
     * Should the stderr data of this task be persisted on HDFS?
     *
     * @param limit maximum number of tasks whose stderr log-files are persisted
     * @param taskId id of the task
     * @return <code>true</code> if stderr data of task should be persisted on
     *         HDFS, <code>false</code> otherwise
     */
    private boolean writeErrorToHDFS(int limit, String taskId) {
        if (command.getPersistStderr() && taskId != null) {
            int tipId = TaskAttemptID.forName(taskId).getTaskID().getId();
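            // Illustrative: for taskId "attempt_201801010000_0001_m_000005_0"
            // the task index is 5, so stderr is persisted only for the first
            // <limit> tasks.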
            return tipId < limit;
        }
        return false;
    }
   
    protected void processError(String error) {
        super.processError(error);
       
        try {
            if (errorStream != null) {
                errorStream.writeBytes(error);
            }
        } catch (IOException ioe) {
            super.processError("Failed to save error logs to HDFS with: " +
                               ioe);
        }
    }

    private void writeDebugHeader() {
        processError("===== Task Information Header =====" );

        processError("\nCommand: " + command);
        processError("\nStart time: " + new Date(System.currentTimeMillis()));
        if (job.getBoolean(MRConfiguration.TASK_IS_MAP, false)) {
            MapContext context = (MapContext)PigMapReduce.sJobContext;
            PigSplit pigSplit = (PigSplit)context.getInputSplit();
            int numPaths = pigSplit.getNumPaths();
            processError("\nPigSplit contains " + numPaths + " wrappedSplits.");

            StringBuilder sb = new StringBuilder();
            for(int i = 0; i < numPaths; i++) {
              InputSplit wrappedSplit = pigSplit.getWrappedSplit(i);
              if (wrappedSplit instanceof FileSplit) {
                  FileSplit mapInputFileSplit = (FileSplit)wrappedSplit;
                  sb.append("\nInput-split: file=");
                  sb.append(mapInputFileSplit.getPath());
                  sb.append(" start-offset=");
                  sb.append(Long.toString(mapInputFileSplit.getStart()));
                  sb.append(" length=");
                  sb.append(Long.toString(mapInputFileSplit.getLength()));
                  processError(sb.toString());
                  sb.setLength(0);
              }
            }
        }
        processError("\n=====          * * *          =====\n");
    }
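    // Illustrative header as written to the task's stderr (hypothetical
    // values):
    //   ===== Task Information Header =====
    //   Command: perl mycmd.pl
    //   Start time: Mon Jan 01 00:00:00 UTC 2018
    //   PigSplit contains 1 wrappedSplits.
    //   Input-split: file=hdfs://nn:8020/in/part-0 start-offset=0 length=1024
    //   =====          * * *          =====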
   
    private void writeDebugFooter() {
        processError("===== Task Information Footer =====");

        processError("\nEnd time: " + new Date(System.currentTimeMillis()));
        processError("\nExit code: " + exitCode);

        List<HandleSpec> inputSpecs = command.getHandleSpecs(Handle.INPUT);
        HandleSpec inputSpec =
            (inputSpecs != null) ? inputSpecs.get(0) : null;
        if (inputSpec == null ||
            !inputSpec.getSpec().contains("BinaryStorage")) {
            processError("\nInput records: " + inputRecords);
        }
        processError("\nInput bytes: " + inputBytes + " bytes " +
                    ((inputSpec != null) ?
                            "(" + inputSpec.getName() + " using " +
                                inputSpec.getSpec() + ")"
                            : ""));

        List<HandleSpec> outputSpecs = command.getHandleSpecs(Handle.OUTPUT);
        HandleSpec outputSpec =
            (outputSpecs != null) ? outputSpecs.get(0) : null;
        if (outputSpec == null ||
            !outputSpec.getSpec().contains("BinaryStorage")) {
            processError("\nOutput records: " + outputRecords);
        }
        processError("\nOutput bytes: " + outputBytes + " bytes " +
                     ((outputSpec != null) ?
                         "(" + outputSpec.getName() + " using " +
                             outputSpec.getSpec() + ")"
                         : ""));
        if (outputSpecs != null) {
            for (int i=1; i < outputSpecs.size(); ++i) {
                HandleSpec spec = outputSpecs.get(i);
                processError("\n           " + new File(spec.getName()).length()
                             + " bytes using " + spec.getSpec());
            }
        }

        processError("\n=====          * * *          =====\n");
    }
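    // Illustrative footer (hypothetical values):
    //   ===== Task Information Footer =====
    //   End time: Mon Jan 01 00:01:00 UTC 2018
    //   Exit code: 0
    //   Input records: 1000
    //   Input bytes: 65536 bytes (stdin using PigStreaming)
    //   Output records: 1000
    //   Output bytes: 65536 bytes (stdout using PigStreaming)
    //   =====          * * *          =====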
}

    