Package org.apache.hadoop.corona

Source Code of org.apache.hadoop.corona.ClusterManagerMetrics

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.corona;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.metrics.MetricsContext;
import org.apache.hadoop.metrics.MetricsRecord;
import org.apache.hadoop.metrics.MetricsUtil;
import org.apache.hadoop.metrics.Updater;
import org.apache.hadoop.metrics.util.MetricsBase;
import org.apache.hadoop.metrics.util.MetricsIntValue;
import org.apache.hadoop.metrics.util.MetricsLongValue;
import org.apache.hadoop.metrics.util.MetricsRegistry;
import org.apache.hadoop.metrics.util.MetricsTimeVaryingInt;
import org.apache.hadoop.metrics.util.MetricsTimeVaryingLong;

/**
* Metrics for the Corona Cluster Manager.
*/
class ClusterManagerMetrics implements Updater {
  /** Metrics context name. */
  static final String CONTEXT_NAME = "clustermanager";
  /** The list of possible end states for a session */
  private static final List<SessionStatus> SESSION_END_STATES = getEndStates();
  /** Metrics context. */
  private final MetricsContext context;
  /** Metrics record. */
  private final MetricsRecord metricsRecord;
  /** Metrics registry. */
  private final MetricsRegistry registry = new MetricsRegistry();
  /** Number of requested resources by resource type. */
  private final Map<ResourceType, MetricsTimeVaryingLong>
  typeToResourceRequested;
  /** Number of granted resources by resource type. */
  private final Map<ResourceType, MetricsTimeVaryingLong> typeToResourceGranted;
  /** Number of revokes issued (preemption) by resource type. */
  private final Map<ResourceType, MetricsTimeVaryingLong> typeToResourceRevoked;
  /** Number of resources released by sessions, by resource type. */
  private final Map<ResourceType, MetricsTimeVaryingLong>
  typeToResourceReleased;
  /** Number of pending resource requests by resource type. */
  private final Map<ResourceType, MetricsIntValue> typeToPendingCount;
  /**
   * Number of running resource requests by resource type. This should be same
   * as the number of granted resources.
   */
  private final Map<ResourceType, MetricsIntValue> typeToRunningCount;
  /** Number of total slots by resource type. */
  private final Map<ResourceType, MetricsIntValue> typeToTotalSlots;
  /** Number of free slots by resource type. */
  private final Map<ResourceType, MetricsIntValue> typeToFreeSlots;
  /** Scheduler run time by resource type. */
  private final Map<ResourceType, MetricsIntValue> typeToSchedulerRunTime;
  /** The start time of the current run of the scheduler by resource type.
   * Contains a non 0 value if the cycle is in progress and 0 otherwise */
  private final Map<ResourceType, Long> typeToSchedulerCurrentCycleStart;
  /** Number of alive nodes. */
  private final MetricsIntValue aliveNodes;
  /** Number of dead nodes. */
  private final MetricsIntValue deadNodes;
  /** Number of blacklisted nodes. */
  private final MetricsIntValue blacklistedNodes;
  /** Breakdown of session by session status. */
  private final Map<SessionStatus, MetricsTimeVaryingInt>
  sessionStatusToMetrics;
  /** Number of running sessions. */
  private final MetricsIntValue numRunningSessions;
  /** Number of sessions since start of cluster manager. */
  private final MetricsTimeVaryingInt totalSessionCount;
  /** Number of pending calls to sessions. */
  private final MetricsIntValue pendingCallsCount;
  /** Number of CoronaJobTracker failures. */
  private final MetricsTimeVaryingInt numCJTFailures;
  /** Cluster manager scheduler for metrics */
  private Scheduler scheduler;
  /** Cluster manager session notifier for metrics. */
  private SessionNotifier sessionNotifier;
  /** Number of task tracker get restarted */
  private final MetricsIntValue numTaskTrackerRestarted;
  /** Number of remote job tracker timedout */
  private final MetricsIntValue numRemoteJTTimedout;

  /**
   * Constructor.
   * @param types The available resource types.
   */
  public ClusterManagerMetrics(Collection<ResourceType> types) {
    context = MetricsUtil.getContext(CONTEXT_NAME);
    metricsRecord = MetricsUtil.createRecord(context, CONTEXT_NAME);
    typeToResourceRequested = createTypeToResourceCountMap(types, "requested");
    typeToResourceGranted = createTypeToResourceCountMap(types, "granted");
    typeToResourceRevoked = createTypeToResourceCountMap(types, "revoked");
    typeToResourceReleased = createTypeToResourceCountMap(types, "released");
    typeToPendingCount = createTypeToCountMap(types, "pending");
    typeToRunningCount = createTypeToCountMap(types, "running");
    typeToTotalSlots = createTypeToCountMap(types, "total");
    typeToFreeSlots = createTypeToCountMap(types, "free");
    typeToSchedulerRunTime = createTypeToCountMap(types, "scheduler_runtime");
    typeToSchedulerCurrentCycleStart =
        new ConcurrentHashMap<ResourceType, Long>();
    sessionStatusToMetrics = createSessionStatusToMetricsMap();
    aliveNodes = new MetricsIntValue("alive_nodes", registry);
    deadNodes = new MetricsIntValue("dead_nodes", registry);
    blacklistedNodes = new MetricsIntValue("blacklisted_nodes", registry);
    numRunningSessions = new MetricsIntValue("num_running_sessions", registry);
    totalSessionCount = new MetricsTimeVaryingInt("total_sessions", registry);
    pendingCallsCount = new MetricsIntValue("num_pending_calls", registry);
    numCJTFailures = new MetricsTimeVaryingInt("num_cjt_failures", registry);
    numTaskTrackerRestarted = new MetricsIntValue("num_task_tracker_restarted", registry);
    numRemoteJTTimedout = new MetricsIntValue("num_remotejt_timedout", registry);
  }

  /**
   * Set the number of pending requests.
   * @param resourceType The resource type.
   * @param pending The number of pending requests.
   */
  public void setPendingRequestCount(ResourceType resourceType, int pending) {
    typeToPendingCount.get(resourceType).set(pending);
  }

  /**
   * Set the number of running resources.
   * @param resourceType The resource type.
   * @param running The number of running resources.
   */
  public void setRunningRequestCount(ResourceType resourceType, int running) {
    typeToRunningCount.get(resourceType).set(running);
  }

  /**
   * Set the number of total slots.
   * @param resourceType The resource type.
   * @param totalSlots The total number of slots.
   */
  public void setTotalSlots(ResourceType resourceType, int totalSlots) {
    typeToTotalSlots.get(resourceType).set(totalSlots);
  }

  /**
   * Set the number of free slots.
   * @param resourceType The resource type.
   * @param freeSlots The number of free slots.
   */
  public void setFreeSlots(ResourceType resourceType, int freeSlots) {
    typeToFreeSlots.get(resourceType).set(freeSlots);
  }

  public void setSchedulerRunTime(ResourceType resourceType, int runtime) {
    // This is called when the scheduling cycle is complete
    setSchedulerCurrentCycleStartTime(resourceType, 0);
    typeToSchedulerRunTime.get(resourceType).set(runtime);
  }

  public void setSchedulerCurrentCycleStartTime(ResourceType resourceType, long tstamp) {
    typeToSchedulerCurrentCycleStart.put(resourceType, tstamp);
  }

  /**
   * Record the request of a resource.
   * @param type The resource type.
   */
  public void requestResource(ResourceType type) {
    typeToResourceRequested.get(type).inc();
  }

  /**
   * Record the release of a resource.
   * @param type The resource type.
   */
  public void releaseResource(ResourceType type) {
    typeToResourceReleased.get(type).inc();
  }

  /**
   * Record the grant of a resource.
   * @param type The resource type.
   */
  public void grantResource(ResourceType type) {
    typeToResourceGranted.get(type).inc();
  }

  /**
   * Record the revoke of a resource.
   * @param type The resource type.
   */
  public void revokeResource(ResourceType type) {
    typeToResourceRevoked.get(type).inc();
  }

  /**
   * Set the number of alive nodes.
   * @param numAlive The number of alive nodes.
   */
  public void setAliveNodes(int numAlive) {
    aliveNodes.set(numAlive);
  }
 
  /**
   * num of task trackers get restarted
   * @param num The number of task trackers get restarted
   */
  public void restartTaskTracker(int num) {
    this.numTaskTrackerRestarted.inc(num);
  }

  /**
  /**
   * num of remote JT timedout
   * @param num The number of remote JT timedout
   */
  public void timeoutRemoteJT(int num) {
    this.numRemoteJTTimedout.inc(num);
  }

  /**
   * Set the number of dead nodes.
   * @param numDead The number of dead nodes.
   */
  public void setDeadNodes(int numDead) {
    deadNodes.set(numDead);
  }

  /**
   * Set the number of blacklisted nodes.
   * @param numBlacklisted The number of blacklisted nodes.
   */
  public void setBlacklistedNodes(int numBlacklisted) {
    blacklistedNodes.set(numBlacklisted);
  }

  /**
   * Set the number of running sessions.
   * @param num The number of running sessions.
   */
  public void setNumRunningSessions(int num) {
    numRunningSessions.set(num);
  }

  /**
   * Increment the number of sessions since the start of the cluster manager.
   */
  public void sessionStart() {
    totalSessionCount.inc();
  }

  /**
   * Record the end of a session.
   * @param finishState The state that the session finished in.
   */
  public void sessionEnd(SessionStatus finishState) {
    if (sessionStatusToMetrics.containsKey(finishState)) {
      sessionStatusToMetrics.get(finishState).inc();
    } else {
      throw new IllegalArgumentException("Invalid end state " + finishState);
    }
  }

  /**
   * Update the metric pending calls metric
   * @param numPendingCalls the number of calls in the queue
   * waiting to be sent out
   */
  public void setNumPendingCalls(int numPendingCalls) {
    pendingCallsCount.set(numPendingCalls);
  }

  /**
   * Records CoronaJobTracker failure.
   */
  public void recordCJTFailure() {
    numCJTFailures.inc();
  }

  /**
   * Set the scheduler and start the updating.  The metrics won't be reported
   * until this is called.
   *
   * @param scheduler Scheduler for this cluster manager.
   * @param sessionNotifier Session Notifier for this cluster manager.
   */
  public void registerUpdater(Scheduler scheduler,
      SessionNotifier sessionNotifier) {
    this.scheduler = scheduler;
    this.sessionNotifier = sessionNotifier;
    context.registerUpdater(this);
  }

  /**
   * Get all the possible end states (non running) of the session
   * @return The list of session status that are the non-running state
   */
  private static List<SessionStatus> getEndStates() {
    List<SessionStatus> endStatesRet = new ArrayList<SessionStatus>();
    for (SessionStatus s : SessionStatus.values()) {
      if (s != SessionStatus.RUNNING) {
        endStatesRet.add(s);
      }
    }
    return endStatesRet;
  }

  /**
   * Create a map of session status -> metrics.
   * @return the map.
   */
  private Map<SessionStatus, MetricsTimeVaryingInt>
  createSessionStatusToMetricsMap() {
    Map<SessionStatus, MetricsTimeVaryingInt> m =
      new HashMap<SessionStatus, MetricsTimeVaryingInt>();
    for (SessionStatus endState : SESSION_END_STATES) {
      String name = endState.toString().toLowerCase() + "_sessions";
      m.put(endState, new MetricsTimeVaryingInt(name, registry));
    }
    return m;
  }

  /**
   * Create a map of resource type -> current count.
   * @param resourceTypes The resource types.
   * @param actionType A string indicating pending, running etc.
   * @return The map.
   */
  private Map<ResourceType, MetricsIntValue> createTypeToCountMap(
      Collection<ResourceType> resourceTypes, String actionType) {
    Map<ResourceType, MetricsIntValue> m =
        new HashMap<ResourceType, MetricsIntValue>();
    for (ResourceType t : resourceTypes) {
      String name = (actionType + "_" + t).toLowerCase();
      MetricsIntValue value = new MetricsIntValue(name, registry);
      m.put(t, value);
    }
    return m;
  }

  /**
   * Create a map of resource type -> cumulative counts.
   * @param resourceTypes The resource types.
   * @param actionType A string indicating granted, revoked, etc.
   * @return The map.
   */
  private Map<ResourceType, MetricsTimeVaryingLong>
  createTypeToResourceCountMap(
      Collection<ResourceType> resourceTypes, String actionType) {
    Map<ResourceType, MetricsTimeVaryingLong> m =
        new HashMap<ResourceType, MetricsTimeVaryingLong>();
    for (ResourceType t : resourceTypes) {
      String name = (actionType + "_" + t).toLowerCase();
      MetricsTimeVaryingLong value = new MetricsTimeVaryingLong(name, registry);
      m.put(t, value);
    }
    return m;
  }

  @Override
  public void doUpdates(MetricsContext context) {
    // Get the fair scheduler metrics
    if (scheduler != null) {
      scheduler.submitMetrics(metricsRecord);
    }

    for (Map.Entry<ResourceType, Long> currStart :
        typeToSchedulerCurrentCycleStart.entrySet()) {
      long start = currStart.getValue();
      if (start > 0) {
        // This means that there's a scheduling cycle in progress.
        int currCycleRun = (int)(System.currentTimeMillis() - start);
        typeToSchedulerRunTime.get(currStart.getKey()).set(currCycleRun);
      }
    }

    // Get the number of pending calls.
    setNumPendingCalls(sessionNotifier.getNumPendingCalls());

    // Not synchronized on the ClusterManagerMetrics object.
    // The list of metrics in the registry is modified only in the constructor.
    // And pushMetrics() is thread-safe.
    for (MetricsBase m : registry.getMetricsList()) {
      m.pushMetric(metricsRecord);
    }

    metricsRecord.update();
  }

  public MetricsContext getContext() {
    return context;
  }
}
TOP

Related Classes of org.apache.hadoop.corona.ClusterManagerMetrics

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.