Package org.apache.hadoop.mapred

Source Code of org.apache.hadoop.mapred.RemoteJTProxy

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.corona.ResourceGrant;
import org.apache.hadoop.corona.ResourceRequest;
import org.apache.hadoop.corona.SessionDriver;
import org.apache.hadoop.corona.Utilities;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.ipc.ProtocolSignature;
import org.apache.hadoop.ipc.RPC;

/**
* The Proxy used by the CoronaJobTracker in the client to communicate
* with the CoronaJobTracker running on the TaskTracker in case of a
* remote CoronaJobTracker
*/
@SuppressWarnings("deprecation")
public class RemoteJTProxy implements InterCoronaJobTrackerProtocol,
    JobSubmissionProtocol {
  /** Logger */
  public static final Log LOG = LogFactory.getLog(CoronaJobTracker.class);
  /** Amount of time to wait for remote JT to launch. */
  public static final String REMOTE_JT_TIMEOUT_SEC_CONF =
      "mapred.coronajobtracker.remotejobtracker.wait";
  /** Default amount of time to wait for remote JT to launch. */
  public static final int REMOTE_JT_TIMEOUT_SEC_DEFAULT = 60;
  /** The proxy object to the CoronaJobTracker running in the cluster */
  private JobSubmissionProtocol client;
  /** The task id for the current attempt of running CJT */
  private TaskAttemptID currentAttemptId;
  /** The number of the current attempt */
  private int attempt;
  /** Job configuration */
  private final JobConf conf;
  /** Parent JobTracker */
  private final CoronaJobTracker jt;
  /** The id of the job */
  private final JobID jobId;
  /** The session id for the job tracker running in the cluster */
  private String remoteSessionId;

  /**
   * Construct a proxy for the remote job tracker
   * @param jt parent job tracker
   * @param jobId id of the job the proxy is created for
   * @param conf job configuration
   */
  @SuppressWarnings("deprecation")
  RemoteJTProxy(CoronaJobTracker jt, JobID jobId, JobConf conf) {
    this.conf = conf;
    this.jt = jt;
    this.jobId = jobId;
    attempt = 0;
    int partitionId = conf.getNumMapTasks() + 100000;
    currentAttemptId = new TaskAttemptID(new TaskID(jobId, true, partitionId),
        attempt);
  }

  public String getRemoteSessionId() {
    return remoteSessionId;
  }

  // ///////////////////////////////////////////////////////////////////////////
  // InterCoronaJobTrackerProtocol
  // ///////////////////////////////////////////////////////////////////////////
  @Override
  public void reportRemoteCoronaJobTracker(
      String attempt,
      String host,
      int port,
      String sessionId) throws IOException {
    TaskAttemptID attemptId = TaskAttemptID.forName(attempt);
    synchronized (this) {
      if (!attemptId.equals(currentAttemptId)) {
        throw new IOException("Attempt " + attempt +
            " does not match current attempt " + currentAttemptId);
      }
      initializeClientUnprotected(host, port, sessionId);
      this.notifyAll();
    }
  }

  @Override
  public ProtocolSignature getProtocolSignature(String protocol,
      long clientVersion, int clientMethodsHash) throws IOException {
    return ProtocolSignature.getProtocolSignature(
        this, protocol, clientVersion, clientMethodsHash);
  }

  @Override
  public long getProtocolVersion(String protocol, long clientVersion)
    throws IOException {
    if (protocol.equals(InterCoronaJobTrackerProtocol.class.getName())) {
      return InterCoronaJobTrackerProtocol.versionID;
    } else {
      throw new IOException("Unknown protocol " + protocol);
    }
  }


  /**
   * Increment the attempt number for launching a remote corona job tracker.
   * Must be called only when holding the object lock.
   */
  private void incrementAttemptUnprotected() {
    attempt++;
    currentAttemptId = new TaskAttemptID(currentAttemptId.getTaskID(), attempt);
  }

  /**
   * Create the RPC client to the remote corona job tracker.
   * @param host The host running the remote corona job tracker.
   * @param port The port of the remote corona job tracker.
   * @param sessionId The session for the remote corona job tracker.
   * @throws IOException
   */
  void initializeClientUnprotected(String host, int port, String sessionId)
    throws IOException {
    if (client != null) {
      return;
    }
    LOG.info("Creating JT client to " + host + ":" + port);
    client = RPC.waitForProxy(JobSubmissionProtocol.class,
        JobSubmissionProtocol.versionID, new InetSocketAddress(host, port),
        conf);
    remoteSessionId = sessionId;
  }

  /**
   * Wait for the remote corona job tracker to be ready.
   * This involves
   *    - getting a JOBTRACKER resource from the cluster manager.
   *    - starting the remote job tracker by connecting to the corona task
   *      tracker on the machine.
   *    - waiting for the remote job tracker to report its port back to this
   *      process.
   * @param jobConf The job configuration to use.
   * @throws IOException
   */
  public void waitForJTStart(JobConf jobConf) throws IOException {
    int maxJTAttempts = jobConf.getInt(
        "mapred.coronajobtracker.remotejobtracker.attempts", 4);
    ResourceTracker resourceTracker = jt.getResourceTracker();
    SessionDriver sessionDriver = jt.getSessionDriver();
    List<ResourceGrant> excludeGrants = new ArrayList<ResourceGrant>();
    for (int i = 0; i < maxJTAttempts; i++) {
      try {
        ResourceGrant jtGrant = waitForJTGrant(resourceTracker, sessionDriver,
            excludeGrants);
        boolean success = startRemoteJT(jobConf, jtGrant);
        if (success) {
          return;
        } else {
          excludeGrants.add(jtGrant);
          resourceTracker.releaseResource(jtGrant.getId());
          List<ResourceRequest> released =
            resourceTracker.getResourcesToRelease();
          sessionDriver.releaseResources(released);
        }
      } catch (InterruptedException e) {
        throw new IOException(e);
      }

    }
    throw new IOException("Could not start remote JT after " + maxJTAttempts +
      " attempts");
  }

  /**
   * Wait for a JOBTRACKER grant.
   * @param resourceTracker The resource tracker object for getting the grant
   * @param sessionDriver The session driver for getting the grant
   * @param previousGrants Previous grants that could not be used successfully.
   * @return A new JOBTRACKER grant.
   * @throws IOException
   * @throws InterruptedException
   */
  private ResourceGrant waitForJTGrant(
      ResourceTracker resourceTracker,
      SessionDriver sessionDriver,
      List<ResourceGrant> previousGrants)
    throws IOException, InterruptedException {
    LOG.info("Waiting for JT grant for " + jobId);
    ResourceRequest req = resourceTracker.newJobTrackerRequest();
    for (ResourceGrant prev: previousGrants) {
      LOG.info("Adding " + prev.getNodeName() + " to excluded hosts");
      req.addToExcludeHosts(prev.getAddress().getHost());
    }
    resourceTracker.recordRequest(req);
    List<ResourceRequest> newRequests = resourceTracker.getWantedResources();
    sessionDriver.requestResources(newRequests);
    final List<ResourceGrant> grants = new ArrayList<ResourceGrant>();
    ResourceTracker.ResourceProcessor proc =
      new ResourceTracker.ResourceProcessor() {
        @Override
        public boolean processAvailableResource(ResourceGrant resource) {
          grants.add(resource);
          final boolean consumed = true;
          return consumed;
        }
      };
    while (true) {
      // Try to get JT grant while periodically checking for session driver
      // exceptions.
      long timeout = 60 * 1000; // 1 min.
      resourceTracker.processAvailableGrants(proc, 1, timeout);
      IOException e = sessionDriver.getFailed();
      if (e != null) {
        throw e;
      }
      if (!grants.isEmpty()) {
        return grants.get(0);
      }
    }
  }

  /**
   * Start corona job tracker on the machine provided by using the corona
   * task tracker API.
   * @param jobConf The job configuration.
   * @param grant The grant that specifies the remote machine.
   * @return A boolean indicating success.
   * @throws InterruptedException
   */
  private boolean startRemoteJT(
    JobConf jobConf,
    ResourceGrant grant) throws InterruptedException {
    org.apache.hadoop.corona.InetAddress ttAddr =
      Utilities.appInfoToAddress(grant.appInfo);
    CoronaTaskTrackerProtocol coronaTT = null;
    try {
      coronaTT = jt.getTaskTrackerClient(ttAddr.getHost(), ttAddr.getPort());
    } catch (IOException e) {
      LOG.error("Error while trying to connect to TT at " + ttAddr.getHost() +
        ":" + ttAddr.getPort(), e);
      return false;
    }
    LOG.info("Starting remote JT for " + jobId + " on " + ttAddr.getHost());

    // Get a special map id for the JT task.
    Path systemDir = new Path(jt.getSystemDir());
    String jobFile = CoronaJobInProgress.getJobFile(systemDir, jobId)
        .toString();
    String splitClass = JobClient.RawSplit.class.getName();
    BytesWritable split = new BytesWritable();
    Task jobTask = new MapTask(
        jobFile, currentAttemptId, currentAttemptId.getTaskID().getId(),
        splitClass, split, 1, jobConf.getUser());
    CoronaSessionInfo info = new CoronaSessionInfo(jt.getSessionId(),
        jt.getJobTrackerAddress());
    synchronized (this) {
      try {
        coronaTT.startCoronaJobTracker(jobTask, info);
      } catch (IOException e) {
        // Increment the attempt so that the older attempt will get an error
        // in reportRemoteCoronaJobTracker().
        incrementAttemptUnprotected();
        LOG.error("Error while performing RPC to TT at " + ttAddr.getHost() +
          ":" + ttAddr.getPort(), e);
        return false;
      }
    }

    // Now wait for the remote CJT to report its address.
    final long waitStart = System.currentTimeMillis();
    final long timeout = RemoteJTProxy.getRemotJTTimeout(jobConf);
    synchronized (this) {
      while (client == null) {
        LOG.info("Waiting for remote JT to start on " + ttAddr.getHost());
        this.wait(1000);
        if (System.currentTimeMillis() - waitStart > timeout) {
          // Increment the attempt so that the older attempt will get an error
          // in reportRemoteCoronaJobTracker().
          incrementAttemptUnprotected();
          LOG.warn("Could not start remote JT on " + ttAddr.getHost());
          return false;
        }
      }
    }
    return true;
  }

  // ///////////////////////////////////////////////////////////////////////////
  // JobSubmissionProtocol
  // ///////////////////////////////////////////////////////////////////////////
  @Override
  public JobID getNewJobId() throws IOException {
    throw new UnsupportedOperationException(
        "getNewJobId not supported by proxy");
  }

  @Override
  public JobStatus submitJob(final JobID jobId) throws IOException {
    return (new Caller<JobStatus>() {
      @Override
      JobStatus call() throws IOException {
        return client.submitJob(jobId);
      }
    }).makeCall();
  }

  @Override
  public ClusterStatus getClusterStatus(boolean detailed) throws IOException {
    throw new UnsupportedOperationException(
        "getClusterStatus is not supported by proxy");
  }

  @Override
  public void killJob(final JobID jobId) throws IOException {
    (new Caller<JobID>() {
      @Override
      JobID call() throws IOException {
        client.killJob(jobId);
        return jobId;
      }
    }).makeCall();
  }

  @Override
  public void setJobPriority(JobID jobId, String priority) throws IOException {
    throw new UnsupportedOperationException(
        "setJobPriority is not supported by proxy");
  }

  @Override
  public boolean killTask(final TaskAttemptID taskId, final boolean shouldFail)
    throws IOException {
    return (new Caller<Boolean>() {
      @Override
      Boolean call() throws IOException {
        return client.killTask(taskId, shouldFail);
      }
    }).makeCall();
  }

  @Override
  public JobProfile getJobProfile(final JobID jobId) throws IOException {
    return (new Caller<JobProfile>() {
      @Override
      JobProfile call() throws IOException {
        return client.getJobProfile(jobId);
      }
    }).makeCall();
  }

  @Override
  public JobStatus getJobStatus(final JobID jobId) throws IOException {
    return (new Caller<JobStatus>() {
      @Override
      JobStatus call() throws IOException {
        return client.getJobStatus(jobId);
      }
    }).makeCall();
  }

  @Override
  public Counters getJobCounters(final JobID jobId) throws IOException {
    return (new Caller<Counters>() {
      @Override
      Counters call() throws IOException {
        return client.getJobCounters(jobId);
      }
    }).makeCall();
  }

  @Override
  public TaskReport[] getMapTaskReports(final JobID jobId) throws IOException {
    return (new Caller<TaskReport[]>() {
      @Override
      TaskReport[] call() throws IOException {
        return client.getMapTaskReports(jobId);
      }
    }).makeCall();
  }

  @Override
  public TaskReport[] getReduceTaskReports(final JobID jobId)
    throws IOException {
    return (new Caller<TaskReport[]>() {
      @Override
      TaskReport[] call() throws IOException {
        return client.getReduceTaskReports(jobId);
      }
    }).makeCall();
  }

  @Override
  public TaskReport[] getCleanupTaskReports(final JobID jobId)
    throws IOException {
    return (new Caller<TaskReport[]>() {
      @Override
      TaskReport[] call() throws IOException {
        return client.getCleanupTaskReports(jobId);
      }
    }).makeCall();
  }

  @Override
  public TaskReport[] getSetupTaskReports(final JobID jobId)
    throws IOException {
    return (new Caller<TaskReport[]>() {
      @Override
      TaskReport[] call() throws IOException {
        return client.getSetupTaskReports(jobId);
      }
    }).makeCall();
  }

  @Override
  public String getFilesystemName() throws IOException {
    throw new UnsupportedOperationException(
        "getFilesystemName is not supported by proxy");
  }

  @Override
  public JobStatus[] jobsToComplete() {
    throw new UnsupportedOperationException(
        "jobsToComplete is not supported by proxy");
  }

  @Override
  public JobStatus[] getAllJobs() {
    throw new UnsupportedOperationException(
        "getAllJobs is not supported by proxy");
  }

  @Override
  public TaskCompletionEvent[] getTaskCompletionEvents(final JobID jobid,
      final int fromEventId, final int maxEvents) throws IOException {
    return (new Caller<TaskCompletionEvent[]>() {
      @Override
      TaskCompletionEvent[] call() throws IOException {
        return client.getTaskCompletionEvents(jobid, fromEventId, maxEvents);
      }
    }).makeCall();
  }

  @Override
  public String[] getTaskDiagnostics(final TaskAttemptID taskId)
    throws IOException {
    return (new Caller<String[]>() {
      @Override
      String[] call() throws IOException {
        return client.getTaskDiagnostics(taskId);
      }
    }).makeCall();
  }

  @Override
  public String getSystemDir() {
    throw new UnsupportedOperationException(
        "getSystemDir not supported by proxy.");
  }

  @Override
  public JobQueueInfo[] getQueues() {
    throw new UnsupportedOperationException("getQueues method is " +
        "not supported by proxy.");
  }

  @Override
  public JobQueueInfo getQueueInfo(String queue) {
    throw new UnsupportedOperationException(
        "getQueueInfo not supported by proxy.");
  }

  @Override
  public JobStatus[] getJobsFromQueue(String queue) {
    throw new UnsupportedOperationException(
        "getJobsFromQueue not supported by proxy.");
  }

  @Override
  public QueueAclsInfo[] getQueueAclsForCurrentUser() throws IOException {
    throw new UnsupportedOperationException(
        "getQueueAclsForCurrentUser not supported by proxy.");
  }

  public void close() {
    synchronized (this) {
      if (client != null) {
        RPC.stopProxy(client);
      }
    }
  }

  /**
   * Generic caller interface.
   */
  private abstract class Caller<T> {
    /**
     * Perform the call. Must be overridden by a sub-class.
     * @return The generic return value.
     * @throws IOException
     */
    abstract T call() throws IOException;

    /**
     * Template function to make the call.
     * @return The generic return value.
     * @throws IOException
     */
    public T makeCall() throws IOException {
      try {
        checkClient();
        return call();
      } catch (IOException e) {
        LOG.error("Error on remote call ", e);
        handleCallFailure();
        throw e;
      }
    }
  }

  /**
   * Handle failures while making calls to the remote corona job tracker.
   * We need to close the local job tracker.
   * @throws IOException
   */
  private void handleCallFailure() throws IOException {
    try {
      jt.close(false, true);
    } catch (InterruptedException e) {
      throw new IOException(e);
    }
  }

  /**
   * Check if the RPC client to the remote job tracker is ready, and wait if
   * not.
   * @throws IOException
   */
  private void checkClient() throws IOException {
    synchronized (this) {
      if (client == null) {
        try {
          this.wait();
        } catch (InterruptedException e) {
          throw new IOException(e);
        }
      }
    }
  }

  /**
   * Returns the timeout in milliseconds after which we timeout the remote job
   * tracker.
   *
   * @param conf
   *          The configuration
   * @return The timeout in milliseconds.
   */
  public static long getRemotJTTimeout(Configuration conf) {
    return conf.getInt(RemoteJTProxy.REMOTE_JT_TIMEOUT_SEC_CONF,
        RemoteJTProxy.REMOTE_JT_TIMEOUT_SEC_DEFAULT) * 1000;
  }
}
TOP

Related Classes of org.apache.hadoop.mapred.RemoteJTProxy

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.