/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.ducc.rm.scheduler;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.uima.ducc.common.Node;
import org.apache.uima.ducc.common.NodeConfiguration;
import org.apache.uima.ducc.common.NodeIdentity;
import org.apache.uima.ducc.common.Pair;
import org.apache.uima.ducc.common.utils.DuccLogger;
import org.apache.uima.ducc.common.utils.DuccProperties;
import org.apache.uima.ducc.common.utils.DuccPropertiesResolver;
import org.apache.uima.ducc.common.utils.SystemPropertyResolver;
import org.apache.uima.ducc.common.utils.Version;
import org.apache.uima.ducc.common.utils.id.DuccId;
import org.apache.uima.ducc.common.utils.id.DuccIdFactory;
/**
* This process orchestrates scheduling.
* - Receives requests from clients ( job manager, service manager, etc ) for resources
* - Forwards requests and current state to pluggable scheduling implementation
* - Receives a schedule, updates state, sends responses to requestors
* - Maintains state as needed (work item life cycle etc)
*/
public class Scheduler
// extends Thread
implements ISchedulerMain,
SchedConstants
{
IJobManager jobManager;                       // downstream consumer of scheduling decisions (set via setJobManager)
static DuccLogger logger = DuccLogger.getLogger(Scheduler.class, COMPONENT_NAME);
boolean done = false;                         // set true by shutdown()
// Boolean force_epoch = false;
String ducc_home;                             // from the DUCC_HOME system property, set in init()
// Integer epoch = 5; // scheduling epoch, seconds
NodePool[] nodepools;                         // one per top-level nodepool, built in initClasses()
int max_order = 0;                            // largest share order of any machine seen so far
//
// Fair-share and fixed-share use shares only, not machines
//
Map<DuccId, Share> busyShares = new HashMap<DuccId, Share>(); // Running "fair" share jobs
// incoming reports of machines that are now free
Map<DuccId, Pair<IRmJob, Share>> vacatedShares= new HashMap<DuccId, Pair<IRmJob, Share>>();
// boolean growthOccurred = false; // don't care which grew, just that something grew
List<IRmJob> incomingJobs = new ArrayList<IRmJob>(); // coming in from external world but not added our queues yet
List<IRmJob> recoveredJobs = new ArrayList<IRmJob>(); // coming in from external world but we don't know about them, (hopefully
// because we crashed and not for more nefarious reasons)
List<IRmJob> completedJobs = new ArrayList<IRmJob>(); // signaled complete from outside but not yet dealt with
List<IRmJob> initializedJobs = new ArrayList<IRmJob>(); // Init is complete so we can begin full (un)fair share allocation
//HashMap<Node, Node> incomingNodes = new HashMap<Node, Node>(); // node updates
Map<Node, Node> deadNodes = new HashMap<Node, Node>(); // missed too many heartbeats
// HashMap<Node, Node> allNodes = new HashMap<Node, Node>(); // the guys we know
Map<String, NodePool> nodepoolsByNode = new HashMap<String, NodePool>(); // all nodes, and their associated pool
Map<String, User> users = new HashMap<String, User>(); // Active users - has a job in the system
//HashMap<DuccId, IRmJob> runningJobs = new HashMap<DuccId, IRmJob>();
Map<DuccId, IRmJob> allJobs = new HashMap<DuccId, IRmJob>();
Map<ResourceClass, ResourceClass> resourceClasses = new HashMap<ResourceClass, ResourceClass>();
Map<String, ResourceClass> resourceClassesByName = new HashMap<String, ResourceClass>();
String defaultFairShareName = null;           // name of the default fair-share class, from the class config
String defaultReserveName = null;             // name of the default reserve class, from the class config
// defaults for jobs that don't specify them; overridden from ducc properties in init()
int defaultNThreads = 1;
int defaultNTasks = 10;
int defaultMemory = 16;                       // GB
// these two are initialized in constructor
String schedImplName;
IScheduler[] schedulers;                      // one scheduler instance per top-level nodepool
long share_quantum = 16; // smallest share size; configured in GB, converted to KB in init()
long share_free_dram = 0; // minimum memory left free after shares are allocated; GB, converted to KB in init()
long dramOverride = 0; // if > 0, use this instead of amount reported by agents (modeling and testing)
EvictionPolicy evictionPolicy = EvictionPolicy.SHRINK_BY_MACHINE;
// int nodeMetricsUpdateRate = 30000;
// int startupCountdown = 0; // update each epoch. only schedule when it's > nodeStability
int nodeStability = 3;                        // node-metrics updates to wait for before trusting a node
boolean stability = false;                    // set true by start(); gates schedule() via ready()
private static DuccIdFactory idFactory;
// static boolean expandByDoubling = true;
// static int initializationCap = 2; // Max allocation until we know initialization works in
// units of *processes*, not shares (i.e.N-shares).
//
// Version
// 0 - major version
// 6 - minor version
// 3 - ptf - forced eviction under fragmentation.
// 4 - defrag code complete
// beta - not yet "real"!
//
// Bring up to speed with rest of ducc version. 2013-03-06 jrc
//
final static int rmversion_major = 1;
final static int rmversion_minor = 0;
final static int rmversion_ptf = 0;
final static String rmversion_string = null;
boolean initialized = false; // we refuse nodeupdates until this is true
/**
 * No-argument constructor.  All real setup happens in {@link #init()}, which must be
 * called before the scheduler is used.
 */
public Scheduler()
{
}
/**
 * One-time initialization: read configuration from DUCC system properties, load the
 * class/nodepool definition file via initClasses(), and log the effective configuration.
 * Sets 'initialized' true on success so node updates will be accepted.
 *
 * @throws Exception if the class definition file cannot be read or parsed.
 */
public synchronized void init()
    throws Exception
{
    String methodName = "init";
    //setName("Scheduler");
    DuccLogger.setUnthreaded();
    String ep = SystemPropertyResolver.getStringProperty("ducc.rm.eviction.policy", "SHRINK_BY_MACHINE");
    evictionPolicy = EvictionPolicy.valueOf(ep);
    // nodepool = new NodePool(null, evictionPolicy, 0); // global nodepool
    share_quantum    = SystemPropertyResolver.getLongProperty("ducc.rm.share.quantum", share_quantum) * 1024 * 1024;        // GB -> KB
    share_free_dram  = SystemPropertyResolver.getLongProperty("ducc.rm.reserved.dram", share_free_dram) * 1024 * 1024;      // GB -> KB
    ducc_home        = SystemPropertyResolver.getStringProperty("DUCC_HOME");
    // some defaults, for jobs that don't specify them
    defaultNTasks    = SystemPropertyResolver.getIntProperty("ducc.rm.default.tasks", 10);
    defaultNThreads  = SystemPropertyResolver.getIntProperty("ducc.rm.default.threads", 1);
    defaultMemory    = SystemPropertyResolver.getIntProperty("ducc.rm.default.memory", 16);                                 // in GB
    // expandByDoubling = RmUtil.getBooleanProperty("ducc.rm.expand.by.doubling", true);
    nodeStability    = SystemPropertyResolver.getIntProperty("ducc.rm.node.stability", 3);                                  // number of node metrics updates to wait for before scheduling
                                                                                                                            // 0 means, just jump right in and don't wait
    dramOverride     = SystemPropertyResolver.getLongProperty("ducc.rm.override.dram", 0);
    if ( dramOverride > 0 ) {
        dramOverride = dramOverride * (1024 * 1024); // convert to KB
    }
    idFactory = new DuccIdFactory(1);
    // try {
    //     schedImplName = SystemPropertyResolver.getStringProperty("ducc.rm.scheduler", "org.apache.uima.ducc.rm.ClassBasedScheduler");
    //     @SuppressWarnings("unchecked")
    //     Class<IScheduler> cl = (Class<IScheduler>) Class.forName(schedImplName);
    //     scheduler = (IScheduler) cl.newInstance();
    // } catch (ClassNotFoundException e) {
    //     throw new SchedulingException(null, "Cannot find class " + schedImplName);
    // } catch (InstantiationException e) {
    //     throw new SchedulingException(null, "Cannot instantiate class " + schedImplName);
    // } catch (IllegalAccessException e) {
    //     throw new SchedulingException(null, "Cannot instantiate class " + schedImplName + ": can't access constructor.");
    // }
    // Resolve the class definition file relative to DUCC_HOME/resources.
    String class_definitions = SystemPropertyResolver
        .getStringProperty(DuccPropertiesResolver
                           .ducc_rm_class_definitions, "scheduler.classes");
    class_definitions = System.getProperty("DUCC_HOME") + "/resources/" + class_definitions;
    try {
        initClasses(class_definitions);
    } catch ( Exception e ) {
        logger.error(methodName, null, e);
        throw e;
    }
    // we share most of the state with the actual scheduling code - no need to keep passing this around
    // TODO: Make sure these are all initialized correctly
    // scheduler.setEvictionPolicy(evictionPolicy);
    // scheduler.setClasses(resourceClasses);
    // scheduler.setNodePool(nodepools[0]);
    // Log the effective configuration, one line per setting.
    logger.info(methodName, null, "Scheduler running with share quantum   : ", (share_quantum / (1024*1024)), " GB");
    logger.info(methodName, null, "                    reserved DRAM      : ", (share_free_dram / (1024*1024)), " GB");
    logger.info(methodName, null, "                    DRAM override      : ", (dramOverride / (1024*1024)), " GB");
    logger.info(methodName, null, "                    scheduler          : ", schedImplName);
    logger.info(methodName, null, "                    default threads    : ", defaultNThreads);
    logger.info(methodName, null, "                    default tasks      : ", defaultNTasks);
    logger.info(methodName, null, "                    default memory     : ", defaultMemory);
    logger.info(methodName, null, "                    default fairshare class  : ", defaultFairShareName);
    logger.info(methodName, null, "                    default reserve    : ", defaultReserveName);
    logger.info(methodName, null, "                    class definition file    : ", class_definitions);
    logger.info(methodName, null, "                    eviction policy    : ", evictionPolicy);
    logger.info(methodName, null, "                    use prediction     : ", SystemPropertyResolver.getBooleanProperty("ducc.rm.prediction", true));
    logger.info(methodName, null, "                    prediction fudge factor  : ", SystemPropertyResolver.getIntProperty("ducc.rm.prediction.fudge", 10000));
    logger.info(methodName, null, "                    node stability     : ", nodeStability);
    logger.info(methodName, null, "                    init stability     : ", SystemPropertyResolver.getIntProperty("ducc.rm.init.stability"));
    logger.info(methodName, null, "                    fast recovery      : ", SystemPropertyResolver.getBooleanProperty("ducc.rm.fast.recovery", true));
    logger.info(methodName, null, "                    RM publish rate    : ", SystemPropertyResolver.getIntProperty("ducc.rm.state.publish.rate", 60));
    logger.info(methodName, null, "                    metrics update rate: ", SystemPropertyResolver.getIntProperty("ducc.agent.node.metrics.publish.rate",
                                                                                                                     DEFAULT_NODE_METRICS_RATE));
    logger.info(methodName, null, "                    initialization cap : ", SystemPropertyResolver.getIntProperty("ducc.rm.initialization.cap"));
    logger.info(methodName, null, "                    expand by doubling : ", SystemPropertyResolver.getBooleanProperty("ducc.rm.expand.by.doubling", true));
    logger.info(methodName, null, "                    fragmentation threshold  : ", SystemPropertyResolver.getIntProperty("ducc.rm.fragmentation.threshold", 2));
    logger.info(methodName, null, "                    do defragmentation : ", SystemPropertyResolver.getBooleanProperty("ducc.rm.defragmentation", true));
    logger.info(methodName, null, "                    DUCC home          : ", System.getProperty("DUCC_HOME"));
    logger.info(methodName, null, "                    ActiveMQ URL       : ", SystemPropertyResolver.getStringProperty("ducc.broker.url"));
    logger.info(methodName, null, "                    JVM                : ", System.getProperty("java.vendor") +
                                                                               " "+ System.getProperty("java.version"));
    logger.info(methodName, null, "                    JAVA_HOME          : ", System.getProperty("java.home"));
    logger.info(methodName, null, "                    JVM Path           : ", System.getProperty("ducc.jvm"));
    logger.info(methodName, null, "                    JMX URL            : ", System.getProperty("ducc.jmx.url"));
    logger.info(methodName, null, "                    OS Architecture    : ", System.getProperty("os.arch"));
    logger.info(methodName, null, "                    OS Name            : ", System.getProperty("os.name"));
    logger.info(methodName, null, "                    DUCC Version       : ", Version.version());
    logger.info(methodName, null, "                    RM Version         : ", ""+ rmversion_major + "."
                                                                                  + rmversion_minor + "."
                                                                                  + rmversion_ptf);
    // From here on node updates are accepted.
    initialized = true;
}
/**
 * Indicates whether {@link #init()} has completed; node updates are refused until then.
 *
 * @return true once initialization is complete.
 */
public synchronized boolean isInitialized()
{
    return this.initialized;
}
/**
 * Look up the Machine record for a node, delegating to the NodeIdentity overload.
 *
 * @param n the node whose machine record is wanted.
 * @return the Machine for this node, as known to its nodepool.
 */
public Machine getMachine(Node n)
{
    NodeIdentity identity = n.getNodeIdentity();
    return getMachine(identity);
}
/**
 * Look up the Machine record for a node identity in the nodepool it maps to.
 *
 * @param ni the identity of the node.
 * @return the Machine for this identity, as known to its nodepool.
 */
public Machine getMachine(NodeIdentity ni)
{
    return getNodepoolByName(ni).getMachine(ni);
}
/**
 * Wire in the job manager that will receive scheduling decisions.
 *
 * @param jobmanager the job manager instance.
 */
public void setJobManager(IJobManager jobmanager)
{
    // 'this.' is required: the parameter shadows the field.
    this.jobManager = jobmanager;
}
/**
 * @return the name of the default fair-share class from the class configuration,
 *         or null if none was configured.
 */
public String getDefaultFairShareName()
{
    return this.defaultFairShareName;
}
/**
 * @return the name of the default reservation class from the class configuration,
 *         or null if none was configured.
 */
public String getDefaultReserveName()
{
    return this.defaultReserveName;
}
/**
 * @return the default thread count for jobs that don't specify one.
 */
public int getDefaultNThreads()
{
    return this.defaultNThreads;
}
/**
 * @return the default task count for jobs that don't specify one.
 */
public int getDefaultNTasks()
{
    return this.defaultNTasks;
}
/**
 * @return the default memory, in GB, for jobs that don't specify one.
 */
public int getDefaultMemory()
{
    return this.defaultMemory;
}
/**
 * Look up a resource class by its configured name.
 *
 * @param name the class name as it appears in the class definition file.
 * @return the ResourceClass, or null if no class by that name exists.
 */
public ResourceClass getResourceClass(String name)
{
    return this.resourceClassesByName.get(name);
}
/**
 * Look up a job by its DUCC id.
 *
 * @param id the job's id.
 * @return the job, or null if it is unknown to the scheduler.
 */
public IRmJob getJob(DuccId id)
{
    return this.allJobs.get(id);
}
/**
 * Look up a busy (allocated) share by its id.
 *
 * @param id the share's id.
 * @return the Share, or null if no busy share has that id.
 */
public Share getShare(DuccId id)
{
    return this.busyShares.get(id);
}
// public static int getInitializationCap()
// {
// return initializationCap;
// }
//
// public static boolean isExpandByDoubling()
// {
// return expandByDoubling;
// }
/**
* Calculate share order, given some memory size in GB (as in from a job spec)
*/
/**
 * Calculate share order, given some memory size in GB (as in from a job spec).
 * The request is rounded UP to a whole number of share quanta.
 *
 * @param mem requested memory, in GB.
 * @return the number of quanta (share order) needed to cover the request.
 */
int calcShareOrder(long mem)
{
    long memKB = mem * 1024 * 1024;                    // GB -> KB, matching share_quantum units
    int order = (int) (memKB / share_quantum);
    if ( (memKB % share_quantum) > 0 ) {               // any partial quantum costs a whole share
        order++;
    }
    return order;
}
// /**
// * Use the NodeIdentity to infer my the domain name.
// *
// * Itertate through the possible names - if one of them has a '.'
// * the we have to assume the following stuff is the domain name.
// * We only get one such name, so we give up the search if we find
// * it.
// */
// static String cached_domain = null;
// private String getDomainName()
// {
// String methodName = "getDomainName";
// String answer = System.getProperty("SIM_RM_DOMAIN"); // for the simulation wrapper, to replay logs from other domains correctly
// if ( answer != null ) {
// return answer;
// }
// if ( cached_domain != null ) return cached_domain;
// try {
// NodeIdentity ni = new NodeIdentity();
// for ( IIdentity id : ni.getNodeIdentities()) {
// String n = id.getName();
// int ndx = n.indexOf(".");
// if ( ndx > 0 ) {
// cached_domain = n.substring(ndx + 1);
// return cached_domain;
// }
// }
// } catch (Exception e) {
// // TODO Auto-generated catch block
// logger.warn(methodName, null, "Cannot create my own node identity:", e);
// }
// return null; // crappy config if this happens, some stuff may not match nodepools and
// // nothing to do about it.
// }
// Map<String, String> readNodepoolFile(String npfile)
// {
// String methodName = "readNodepoolFile";
// String my_domain = getDomainName();
// String ducc_home = System.getProperty("DUCC_HOME");
// npfile = ducc_home + "/resources/" + npfile;
// logger.info(methodName, null, "Domain name:", my_domain);
// Map<String, String> response = new HashMap<String, String>();
// try {
// BufferedReader br = new BufferedReader(new FileReader(npfile));
// String node = "";
// while ( (node = br.readLine()) != null ) {
// int ndx = node.indexOf("#");
// if ( ndx >= 0 ) {
// node = node.substring(0, ndx);
// }
// node = node.trim();
// if (node.equals("") ) {
// continue;
// }
// if ( node.startsWith("import") ) {
// String[] tmp = node.split("\\s");
// response.putAll(readNodepoolFile(tmp[1]));
// continue;
// }
// response.put(node, node);
// // include fully and non-fully qualified names to allow sloppiness of config
// ndx = node.indexOf(".");
// String dnode;
// if ( ndx >= 0 ) {
// dnode = node.substring(0, ndx);
// response.put(dnode, dnode);
// } else if ( my_domain != null ) {
// dnode = node + "." + my_domain;
// response.put(dnode, dnode);
// }
// }
// br.close();
// } catch (FileNotFoundException e) {
// throw new SchedulingException(null, "Cannot open NodePool file \"" + npfile + "\": file not found.");
// } catch (IOException e) {
// throw new SchedulingException(null, "Cannot read NodePool file \"" + npfile + "\": I/O Error.");
// }
// return response;
// }
/**
* Collect all the classes served by the indicated nodepool (property set). This fills
* in the 'ret' map from the parameter 'dp' and recursive calls to the children in dp.
* @param dp This is the properties object from the configurator for a top-level
* nodepool.
* @param ret This is the map to be filled in by this routine.
*/
/**
 * Collect all the classes served by the indicated nodepool (property set).  This fills
 * in the 'ret' map from the parameter 'dp' and recursive calls to the children in dp.
 *
 * @param dp  the properties object from the configurator for a nodepool.
 * @param ret the map to be filled in by this routine.
 */
void getClassesForNodepool(DuccProperties dp, Map<ResourceClass, ResourceClass> ret)
{
    @SuppressWarnings("unchecked")
    List<DuccProperties> classProps = (List<DuccProperties>) dp.get("classes");
    if ( classProps != null ) {
        // Resolve each configured class name to its ResourceClass object.
        for ( DuccProperties cp : classProps ) {
            ResourceClass rc = resourceClassesByName.get(cp.getStringProperty("name"));
            ret.put(rc, rc);
        }
    }
    @SuppressWarnings("unchecked")
    List<DuccProperties> kids = (List<DuccProperties>) dp.get("children");
    if ( kids != null ) {
        // Recurse into each child nodepool.
        for ( DuccProperties kid : kids ) {
            getClassesForNodepool(kid, ret);
        }
    }
}
/**
* Map each node by name into the nodepool it belongs to
*/
/**
 * Map each node, by name, into the nodepool it belongs to.
 *
 * @param nodes the node names for this pool (may be null, in which case nothing is done).
 * @param pool  the nodepool these nodes belong to.
 */
void mapNodesToNodepool(Map<String, String> nodes, NodePool pool)
{
    if ( nodes == null ) {
        return;
    }
    for ( Map.Entry<String, String> e : nodes.entrySet() ) {
        nodepoolsByNode.put(e.getKey(), pool);
    }
}
/**
* (Recursively) build up the heirarchy under the parent nodepool.
*/
/**
 * (Recursively) build up the hierarchy under the parent nodepool.
 *
 * @param parent   the nodepool under which the children are created.
 * @param children the configured child nodepool property sets; may be null.
 */
void createSubpools(NodePool parent, List<DuccProperties> children)
{
    if ( children == null ) {
        return;
    }
    for ( DuccProperties childProps : children ) {
        String poolName = childProps.getStringProperty("name");
        @SuppressWarnings("unchecked")
        Map<String, String> poolNodes = (Map<String, String>) childProps.get("nodes");
        // Create the subpool, record which nodes it owns, then recurse into its children.
        NodePool subpool = parent.createSubpool(poolName, poolNodes, 0);
        mapNodesToNodepool(poolNodes, subpool);
        @SuppressWarnings("unchecked")
        List<DuccProperties> grandkids = (List<DuccProperties>) childProps.get("children");
        createSubpools(subpool, grandkids);
    }
}
/**
 * Read the class/nodepool configuration file and build the scheduler's static structure:
 * ResourceClass objects, one IScheduler and one top-level NodePool per configured
 * top-level nodepool, the node-to-nodepool map, and the default class names.
 *
 * Exits the process (System.exit) if the configuration cannot be read.
 *
 * @param filename fully-resolved path to the class definition file.
 */
void initClasses(String filename)
{
    String methodName = "initClasses";
    String me = Scheduler.class.getName() + ".Config";
    DuccLogger initLogger = new DuccLogger(me, COMPONENT_NAME);
    NodeConfiguration nc = new NodeConfiguration(filename, initLogger);
    try {
        nc.readConfiguration();
    } catch (Throwable e) {
        // Fatal: without a configuration there is nothing to schedule against.
        logger.error(methodName, null, e);
        logger.error(methodName, null, "Scheduler exits: unable to read configuration.");
        System.exit(1);
    }
    nc.printConfiguration();
    DuccProperties[] nps = nc.getToplevelNodepools();
    Map<String, DuccProperties> cls = nc.getClasses();
    nodepools  = new NodePool[nps.length];           // top-level nodepools
    schedulers = new IScheduler[nps.length];         // a scheduler for each top-level nodepool
    // Here build up the ResourceClass definitions
    logger.info(methodName, null, "Classes:");
    logger.info(methodName, null, ResourceClass.getHeader());
    logger.info(methodName, null, ResourceClass.getDashes());
    for ( DuccProperties props : cls.values() ) {
        ResourceClass rc = new ResourceClass(props);
        resourceClasses.put(rc, rc);
        resourceClassesByName.put(rc.getName(), rc);
        logger.info(methodName, null, rc.toString());
    }
    // Remember the configured default classes, if any.
    DuccProperties dc = nc.getDefaultFairShareClass();
    if ( dc != null ) {
        defaultFairShareName = dc.getProperty("name");
    }
    dc = nc.getDefaultReserveClass();
    if ( dc != null ) {
        defaultReserveName = dc.getProperty("name");
    }
    // Instantiate one scheduler per top-level nodepool
    try {
        schedImplName = SystemPropertyResolver.getStringProperty("ducc.rm.scheduler", "org.apache.uima.ducc.rm.ClassBasedScheduler");
        @SuppressWarnings("unchecked")
        Class<IScheduler> cl = (Class<IScheduler>) Class.forName(schedImplName);
        for ( int i = 0; i < nps.length; i++ ) {
            schedulers[i] = (IScheduler) cl.newInstance();
            schedulers[i].setEvictionPolicy(evictionPolicy);
        }
    } catch (ClassNotFoundException e) {
        throw new SchedulingException(null, "Cannot find class " + schedImplName);
    } catch (InstantiationException e) {
        throw new SchedulingException(null, "Cannot instantiate class " + schedImplName);
    } catch (IllegalAccessException e) {
        throw new SchedulingException(null, "Cannot instantiate class " + schedImplName + ": can't access constructor.");
    }
    // Here create the nodepool configuration
    for ( int i = 0; i < nps.length; i++ ) {
        DuccProperties np = nps[i];
        String id = np.getStringProperty("name");
        @SuppressWarnings("unchecked")
        Map<String, String> nodes = (Map<String, String>) np.get("nodes");
        nodepools[i] = new NodePool(null, id, nodes, evictionPolicy, 0, 0);
        schedulers[i].setNodePool(nodepools[i]);     // set its top-level nodepool
        mapNodesToNodepool(nodes, nodepools[i]);
        logger.info(methodName, null, "Created top-level nodepool", id);
        @SuppressWarnings("unchecked")
        List<DuccProperties> children = (List<DuccProperties>) np.get("children");
        createSubpools(nodepools[i], children);
        // all classes served by this hierarchy - fills in classesForNp
        Map<ResourceClass, ResourceClass> classesForNp = new HashMap<ResourceClass, ResourceClass>();
        getClassesForNodepool(np, classesForNp);
        schedulers[i].setClasses(classesForNp);
    }
}
/**
* Called only from schedule, under the 'this' monitor.
*
* We then take the SchedulingUpdate from the IScheduler and dispatches orders to
* the world to make it happen.
*
* For jobs that lose resources, job manager is asked to stop execution in specific shares.
* For jobs that gain resources, job manager is asked to start execution in specific shares.
* Jobs that don't change are leftovers. If they're not running at all, they're in the pending
* list; they might also be in the running list but had no allocation changes in the current epoch.
*/
/**
 * Called only from schedule, under the 'this' monitor.
 *
 * Takes the SchedulingUpdate from the IScheduler and dispatches orders to
 * the world to make it happen.
 *
 * For jobs that lose resources, job manager is asked to stop execution in specific shares.
 * For jobs that gain resources, job manager is asked to start execution in specific shares.
 * Jobs that don't change are leftovers.  If they're not running at all, they're in the pending
 * list; they might also be in the running list but had no allocation changes in the current epoch.
 *
 * @param upd the scheduling decisions produced by the per-nodepool schedulers.
 * @param jmu the update being assembled for the job manager; filled in here.
 * @return the same jmu, for convenience.
 */
private JobManagerUpdate dispatch(SchedulingUpdate upd, JobManagerUpdate jmu)
{
    String methodName = "dispatch";
    HashMap<IRmJob, IRmJob> jobs;
    // Go through shrunken jobs - if they are shrunken to 0, move to dormant
    jobs = upd.getShrunkenJobs();
    for (IRmJob j : jobs.values()) {
        logger.trace(methodName, j.getId(), ">>>>>>>>>> SHRINK");
        HashMap<Share, Share> sharesE = j.getAssignedShares();
        HashMap<Share, Share> sharesR = j.getPendingRemoves();
        logger.trace(methodName, j.getId(), "removing", sharesR.size(), "of existing", sharesE.size(), "shares.");
        for ( Share s : sharesE.values() ) {
            logger.trace(methodName, j.getId(), "   current", s.toString());
        }
        for ( Share s : sharesR.values() ) {
            logger.trace(methodName, j.getId(), "   remove ", s.toString());
        }
        logger.trace(methodName, j.getId(), ">>>>>>>>>>");
        jmu.removeShares(j, sharesR);
        // jobManager.stopJob(j, shares);        // stops job on everything on the pendingRemoves list
        // j.clearPendingRemoves();
    }
    // Go through expanded jobs - if they are dormant, remove from dormant
    //    then add to running.
    //    Tell the server it needs to start some machines for the job
    jobs = upd.getExpandedJobs();
    for (IRmJob j : jobs.values() ) {
        HashMap<Share, Share> sharesE = j.getAssignedShares();
        HashMap<Share, Share> sharesN = j.getPendingShares();
        logger.trace(methodName, j.getId(), "<<<<<<<<<< EXPAND");
        logger.trace(methodName, j.getId(), "adding", sharesN.size(), "new shares to existing", sharesE.size(), "shares.");
        for ( Share s : sharesE.values()) {
            logger.trace(methodName, j.getId(), "   existing ", s.toString());
        }
        for ( Share s : sharesN.values()) {
            logger.trace(methodName, j.getId(), "   expanding", s.toString());
        }
        logger.trace(methodName, j.getId(), "<<<<<<<<<<");
        // Move the pending shares to 'assigned' before telling the job manager about them.
        sharesN = j.promoteShares();
        if ( sharesN.size() == 0 ) {
            // internal error - should not be marked expanded if no machines
            throw new SchedulingException(j.getId(), "Trying to execute expanded job but no pending machines.");
        }
        for ( Share s : sharesN.values()) {       // update machine books
            // Sanity checks on the bookkeeping
            busyShares.put(s.getId(), s);
        }
        // DuccId id = j.getId();              // pull from dormant, maybe
        // if ( dormantJobs .containsKey(id) ) {
        //     dormantJobs .remove(id);
        // }
        //runningJobs.put(id, j);
        jmu.addShares(j, sharesN);
        // jobManager.executeJob(j, shares);    // will update job's pending lists
    }
    jobs = upd.getStableJobs();                  // squirrel these away to try next time
    for (IRmJob j: jobs.values()) {
        if ( j.countNShares() < 0 ) {
            throw new SchedulingException(j.getId(), "Share count went negative " + j.countNShares());
        }
        logger.trace(methodName, j.getId(), ".......... STABLE with ", j.countNShares(), " shares.");
    }
    jobs = upd.getDormantJobs();                 // squirrel these away to try next time
    for (IRmJob j: jobs.values()) {
        logger.trace(methodName, j.getId(), ".......... DORMANT");
        // dormantJobs .put(j.getId(), j);
    }
    jobs = upd.getReservedJobs();
    for (IRmJob j: jobs.values()) {
        logger.trace(methodName, j.getId(), "<<<<<<<<<< RESERVE");
        HashMap<Share, Share> sharesE = j.getAssignedShares();
        HashMap<Share, Share> sharesN = j.getPendingShares();
        if ( sharesE.size() == j.countInstances() ) {
            logger.trace(methodName, j.getId(), "reserve_stable", sharesE.size(), "machines");
        } else if ( sharesN.size() == j.countInstances() ) {    // reservation is complete but not yet confirmed?
            logger.trace(methodName, j.getId(), "reserve_adding", sharesN.size(), "machines");
            for ( Share s : sharesN.values()) {
                logger.trace(methodName, j.getId(), "   reserve_expanding ", s.toString());
            }
            jmu.addShares(j, sharesN);
            j.promoteShares();
        } else {
            // Neither fully assigned nor fully pending: still waiting for machines.
            logger.trace(methodName, j.getId(), "reserve_pending", j.countInstances(), "machines");
        }
        logger.trace(methodName, j.getId(), "<<<<<<<<<<");
    }
    // NOTE(review): relies on allJobs being a HashMap (it is constructed as one above);
    // the cast would fail if the field were ever re-assigned to a different Map type.
    jmu.setAllJobs((HashMap<DuccId, IRmJob>)allJobs);
    jobs = upd.getRefusedJobs();
    Iterator<IRmJob> iter = jobs.values().iterator();
    while ( iter.hasNext() ) {
        IRmJob j = iter.next();
        logger.trace(methodName, j.getId(), ".......... REFUSED");
    }
    return jmu;
}
/**
* We don't accept new work or even Orchestrator state updates until "ready". We do
* want machines, but be sure the internal structures are protected.
*/
/**
 * We don't accept new work or even Orchestrator state updates until "ready".  We do
 * want machines, but be sure the internal structures are protected.
 *
 * @return true once start() has been called.
 */
public synchronized boolean ready()
{
    return this.stability;
}
/**
 * Mark the scheduler ready: from now on ready() returns true and scheduling
 * epochs are allowed to run.
 */
public synchronized void start()
{
    this.stability = true;
}
/**
 * Purge machines that have missed too many heartbeats.
 *
 * The dead-node set is drained under its own lock (so the node-death callback never
 * blocks on a scheduling cycle), then the purge is done under the 'this' monitor.
 */
protected void handleDeadNodes()
{
    String methodName = "handleDeadNodes";
    if ( ! isInitialized() ) {
        return;
    }
    // Snapshot and clear the shared deadNodes map under its own lock.
    HashMap<Node, Node> nodeUpdates = new HashMap<Node, Node>();
    synchronized(deadNodes) {
        nodeUpdates.putAll(deadNodes);
        deadNodes.clear();
    }
    synchronized(this) {
        for ( Node n : nodeUpdates.values() ) {
            Machine m = getMachine(n);
            if ( m == null ) {
                // must have been removed because of earlier missed hb
                continue;
            }
            logger.warn(methodName, null, "***Purging machine***", m.getId(), "due to missed heartbeats. THreshold:", nodeStability);
            NodePool np = m.getNodepool();
            np.nodeLeaves(m);
        }
    }
}
/**
* We first accept any changes and requests from the outside world and place them where they
* can be acted on in this epoch.
*
* We then pass all relevent requests and resources to the IScheduler. This returns a
* SchedulingUpdate which is passed to the dispatcher to be acted upon.
*/
/**
 * Run one scheduling epoch.
 *
 * We first accept any changes and requests from the outside world (recovered, new,
 * completed jobs and vacated shares), draining each incoming queue under its own lock.
 * We then pass all relevant requests and resources to the IScheduler(s) - one per
 * top-level nodepool - and hand the resulting SchedulingUpdate to dispatch() to be
 * turned into orders for the job manager.
 *
 * @return the JobManagerUpdate describing this epoch's share changes, or null if the
 *         scheduler is not yet ready() or isInitialized().
 */
public JobManagerUpdate schedule()
{
    String methodName = "schedule";
    if ( ! ready() ) {             // waiting for node stability
        return null;
    }
    if ( ! isInitialized() ) {     // init() has not completed
        return null;
    }
    // tracking the OR hang problem - are topics being delivered?
    logger.info("nodeArrives", null, "Total arrivals:", total_arrivals);
    handleDeadNodes();
    resetNodepools();
    SchedulingUpdate  upd = new SchedulingUpdate();    // state from internal scheduler
    JobManagerUpdate  jmu = new JobManagerUpdate();    // state we forward to job manager
    // Drain each incoming queue under its own lock so producers never block for the
    // duration of the scheduling cycle.
    ArrayList<IRmJob> jobsToRecover = new ArrayList<IRmJob>();
    synchronized(recoveredJobs) {
        jobsToRecover.addAll(recoveredJobs);
        recoveredJobs.clear();
    }
    // New jobs arriving from the outside world.
    ArrayList<IRmJob> newJobs = new ArrayList<IRmJob>();
    synchronized(incomingJobs) {
        newJobs.addAll(incomingJobs);
        incomingJobs.clear();
    }
    // Jobs completed since the last epoch; clearing them frees their resources for reuse.
    ArrayList<IRmJob> doneJobs = new ArrayList<IRmJob>();
    synchronized(completedJobs) {
        doneJobs.addAll(completedJobs);
        completedJobs.clear();
    }
    // Shares vacated since the last epoch.
    ArrayList<Pair<IRmJob, Share>> doneShares = new ArrayList<Pair<IRmJob, Share>>();
    synchronized(vacatedShares) {
        doneShares.addAll(vacatedShares.values());
        vacatedShares.clear();
    }
    synchronized(this) {
        // before looking at jobs, insure we're updated after a crash
        for ( IRmJob j : jobsToRecover ) {
            processRecovery(j);
        }
        // process these next to free up resources for the scheduling cycle
        for ( Pair<IRmJob, Share> p : doneShares ) {
            processCompletion(p.first(), p.second());
        }
        for ( IRmJob j : doneJobs ) {
            processCompletion(j);
        }
        // update user records, "check in" new jobs
        if ( newJobs.size() > 0 ) {
            logger.info(methodName, null, "Jobs arrive:");
            logger.info(methodName, null, "submit", RmJob.getHeader());
        }
        for ( IRmJob j : newJobs ) {
            if ( j.isRefused() ) {          // the JobManagerConverter has already refused it
                logger.info(methodName, j.getId(), "Bypassing previously refused job.");
                upd.refuse(j, j.getRefusalReason());
                // BUGFIX: actually bypass the job.  Previously control fell through and
                // the refused job was added to the user, allJobs, and its resource class,
                // so it could still be scheduled despite the refusal.
                continue;
            }
            String user = j.getUserName();
            User u = users.get(user);
            if ( u == null ) {
                u = new User(user);
                users.put(user, u);
            }
            j.setUser(u);
            // Calculate its share order
            int share_order = calcShareOrder(j.getMemory());
            j.setShareOrder(share_order);
            // Assign it to its priority class
            String clid = j.getClassName();
            ResourceClass prclass = resourceClassesByName.get(clid);
            u.addJob(j);
            allJobs.put(j.getId(), j);
            if ( prclass == null ) {
                upd.refuse(j, "Cannot find priority class " + clid + " for job");
                continue;
            }
            if ( share_order > max_order ) {
                upd.refuse(j, "Memory requested " + j.getMemory() + "GB exceeds the capacity of any machine in the cluster.");
                continue;
            }
            // A normal job may be submitted to a reservation class, but a reservation
            // must not be submitted to a class that is neither RESERVE nor FIXED_SHARE.
            if ( ((prclass.getPolicy() != Policy.RESERVE ) && (prclass.getPolicy() != Policy.FIXED_SHARE)) && ( j.isReservation() ) ) {
                upd.refuse(j, "Class " + prclass.getName() + " is policy " +
                           prclass.getPolicy() + " but the work is submitted as a reservation.");
                continue;
            }
            prclass.addJob(j);
            j.setResourceClass(prclass);
            logger.info(methodName, j.getId(), "submit", j.toString());
        }
        logger.info(methodName, null, "Scheduling " + newJobs.size(), " new jobs.  Existing jobs: " + allJobs.size());
        // Run one scheduler per top-level nodepool; each fills in its part of 'upd'.
        for ( int i = 0; i < schedulers.length; i++ ) {
            logger.info(methodName, null, "Run scheduler", i, "with top-level nodepool", nodepools[i].getId());
            schedulers[i].schedule(upd);
        }
        logger.info(methodName, null, "--------------- Scheduler returns ---------------");
        logger.info(methodName, null, "\n", upd.toString());
        logger.info(methodName, null, "------------------------------------------------");
        dispatch(upd, jmu);        // my own job lists get updated by this
        return jmu;
    }
}
/**
 * Signal the scheduler to stop; sets the 'done' flag observed by the (commented-out)
 * run loop.
 */
synchronized public void shutdown()
{
    this.done = true;
}
// public void run()
// {
// String methodName = "run";
// while ( ! done ) {
// try { sleep(epoch); } catch (InterruptedException e) { }
// logger.info(methodName, null, "========================== Epoch starts ===========================");
// try {
// schedule();
// } catch ( SchedulingException e ) {
// logger.info(methodName, e.jobid, e);
// }
// logger.info(methodName, null, "========================== Epoch ends ===========================");
// }
// }
//
// Return a nodepool by Node. If the node can't be associated with a nodepool, return the
// default nodepool, which is always the first one defined in the config file.
//
/**
 * Return a nodepool by node identity.  Tries the node's name, then its IP.  If the
 * node can't be associated with a nodepool, it is assigned to the default nodepool,
 * which is always the first one defined in the config file.
 *
 * @param ni the identity of the node to look up.
 * @return the nodepool for this node, never null.
 */
NodePool getNodepoolByName(NodeIdentity ni)
{
    NodePool pool = nodepoolsByNode.get(ni.getName());
    if ( pool != null ) {
        return pool;
    }
    pool = nodepoolsByNode.get(ni.getIp());
    if ( pool != null ) {
        return pool;
    }
    // Unknown node: assign it to the default pool and remember the assignment.
    pool = nodepools[0];
    nodepoolsByNode.put(ni.getName(), pool);
    return pool;
}
private int total_arrivals = 0;    // count of node heartbeats, reported each schedule() epoch
/**
 * Node heartbeat: record the node with its nodepool, computing its share order from
 * reported memory the first time it is seen.
 *
 * @param node the node as reported by the agent.
 */
public void nodeArrives(Node node)
{
    // String methodName = "nodeArrives";
    // The first block insures the node is in the scheduler's records as soon as possible
    total_arrivals++;       // report these in the main schedule loop
    synchronized(this) {
        // the amount of memory available for shares, adjusted with configured overhead
        NodePool np = getNodepoolByName(node.getNodeIdentity());
        Machine m = np.getMachine(node);
        int share_order = 0;
        if ( m == null ) {
            // Never seen this node: derive its share order from its reported memory.
            // allNodes.put(node, node);
            long allocatable_mem = node.getNodeMetrics().getNodeMemory().getMemTotal() - share_free_dram;
            if ( dramOverride > 0 ) {
                allocatable_mem = dramOverride;
            }
            share_order = (int) (allocatable_mem / share_quantum);      // conservative - rounds down (this will always cast ok)
        } else {
            // Known node: reuse the order already computed for it.
            share_order = m.getShareOrder();
        }
        max_order = Math.max(share_order, max_order);
        m = np.nodeArrives(node, share_order);        // announce to the nodepools
    }
}
/**
 * Callback: the given nodes are considered dead.  They are only recorded here;
 * actual cleanup happens later, from the scheduling cycle, to avoid doing
 * heavy work on the caller's thread.
 *
 * @param nodes map of dead nodes (keyed by themselves)
 */
public void nodeDeath(Map<Node, Node> nodes)
{
    synchronized(deadNodes) {
        deadNodes.putAll(nodes);
    }
}
/**
 * Callback from job manager, need shares for a new fair-share job.
 * The job is only queued here; it is picked up by the next scheduling epoch.
 *
 * @param job the newly-submitted job
 */
public void signalNewWork(IRmJob job)
{
    // We'll synchronize only on the incoming job list
    synchronized(incomingJobs) {
        incomingJobs.add(job);
    }
}
// public void signalForceEpoch()
// {
// synchronized( force_epoch ) {
// force_epoch = true;
// }
// }
/**
 * Callback: the given job has completed initialization.  Queued here and
 * processed by the next scheduling epoch.
 *
 * @param job the job that initialized
 */
public void signalInitialized(IRmJob job)
{
    // We'll synchronize only on the incoming job list
    synchronized(initializedJobs) {
        initializedJobs.add(job);
    }
}
/**
 * Callback: the given job was discovered during recovery.  Queued here and
 * processed by the next scheduling epoch (see processRecovery).
 *
 * @param job the recovered job
 */
public void signalRecovery(IRmJob job)
{
    synchronized(recoveredJobs) {
        recoveredJobs.add(job);
    }
}
/**
 * Callback: the given job was cancelled.  Currently a no-op stub.
 *
 * @param id id of the cancelled job
 */
public void jobCancelled(DuccId id)
{
    // TODO Fill this in.
}
/**
 * Callback from job manager when a job completes. We just believe him, no sanity checks or other such stuff.
 *
 * @param id id of the completed job
 */
public void signalCompletion(DuccId id)
{
    String methodName = "signalCompletion";

    synchronized(completedJobs) {
        try {
            IRmJob finished = allJobs.get(id);
            if ( finished != null ) {
                logger.info(methodName, id, "Job completion signal.");
                completedJobs.add(finished);
            } else {
                // canceled or terminated very soon - nothing of ours to clean up
                logger.warn(methodName, id, "Job completion signal: early termination; nothing to complete.");
            }
        } catch (Throwable t) {
            logger.warn(methodName, id, t);
        }
    }
}
/**
 * Callback from job manager when a specific share exits but the job is still alive.
 * The vacated share is only recorded here; actual cleanup happens in the
 * scheduling cycle (see processCompletion(IRmJob, Share)).
 *
 * @param job   the job that still owns other shares
 * @param share the share that exited
 */
public void signalCompletion(IRmJob job, Share share)
{
    String methodName = "signalCompletion";
    synchronized(vacatedShares) {
        logger.info(methodName, job.getId(), "Job vacate signal share: ", share.toString());
        vacatedShares.put(share.getId(), new Pair<IRmJob, Share>(job, share));
    }
}
/**
* Callback from job manager when a specific share gets a process associated.
*/
// public void signalGrowth(DuccId jobid, Share share)
// {
// String methodName = "signalGrowth";
// synchronized(vacatedShares) {
// logger.info(methodName, jobid, "Job growth signal share: ", share.toString());
// growthOccurred = true;
// }
// }
/**
 * Called in scheduling cycle, to actually complete the job - avoids deadlock.
 *
 * Removes the job from the global job list, from its owning user (dropping
 * the user record when this was the user's last job), from its resource
 * class, and finally releases every share the job still holds.
 *
 * @param job the completing job
 * @throws SchedInternalError if a non-refused job has no resource class
 */
private synchronized void processCompletion(IRmJob job)
{
    String methodName = "processCompletion";
    logger.info(methodName, job.getId(), "Job completes.");

    // -- clean up the running jobs list
    IRmJob j = allJobs.remove(job.getId());
    if ( j == null ) {
        logger.info(methodName, job.getId(), "Job is not in run list!"); // can happen if job is refused very early
        return;
    }

    j.markComplete();

    // -- clean up user list.  Guard against a missing user record instead of
    //    risking an NPE; work consistently with 'j', the tracked instance.
    User user = users.get(j.getUserName());
    if ( user == null ) {
        logger.warn(methodName, j.getId(), "No user record found for", j.getUserName());
    } else if ( user.remove(j) == 0 ) {
        users.remove(user.getName());       // last job for this user
    }

    // -- detach from the resource class
    ResourceClass rc = j.getResourceClass();
    if ( rc != null ) {
        rc.removeJob(j); // also clears it if it's a reservation
    } else if ( !j.isRefused() ) {
        throw new SchedInternalError(j.getId(), "Job exits from class " + j.getClassName() + " but we cannot find the priority class definition.");
    }

    // -- clean up machine lists: release every share the job still holds
    HashMap<Share, Share> shares = j.getAssignedShares();
    for ( Share s : shares.values() ) {
        purgeShare(s, j);
    }
    j.removeAllShares();
}
/**
 * Called from scheduling cycle - a specific share has run out of work for the give job (but the
 * job is not done yet).  Detaches the share from the job and purges it from
 * the busy-share and machine bookkeeping.
 *
 * @param job   the still-running job
 * @param share the share being vacated
 */
private synchronized void processCompletion(IRmJob job, Share share)
{
    String methodName = "processCompletion";
    logger.debug(methodName, job.getId(), "Job vacates share ", share.toString());
    //share.removeJob();
    job.removeShare(share);
    purgeShare(share, job);
}
/**
 * Log following / reconstruction, needed to init before recovery: reset
 * every nodepool to the current maximum share order.
 */
public void resetNodepools()
{
    for ( int i = 0; i < nodepools.length; i++ ) {
        nodepools[i].reset(NodePool.getMaxOrder());
    }
}
/**
 * Make this public for log following.
 *
 * Reconstructs scheduler state for a job found during recovery: recomputes
 * its share order from its declared memory, reattaches it to its resource
 * class and user, and reconnects each recovered share to its machine's
 * nodepool, marking the shares busy.
 *
 * @param j the job being recovered
 */
public synchronized void processRecovery(IRmJob j)
{
    String methodName = "processRecovery";

    int share_order = calcShareOrder(j.getMemory());
    j.setShareOrder(share_order);

    // NOTE(review): if the job's class is no longer configured, rc is null and
    //               the loop below will NPE, as the original code also did.
    ResourceClass rc = resourceClassesByName.get(j.getClassName());
    j.setResourceClass(rc);

    HashMap<Share, Share> shares = j.getRecoveredShares();
    StringBuilder sharenames = new StringBuilder();  // local only; no need for StringBuffer's locking
    for ( Share s : shares.values() ) {
        sharenames.append(s.toString());
        sharenames.append(" ");

        switch ( rc.getPolicy() ) {
            case FAIR_SHARE:
                s.setShareOrder(share_order);
                break;
            case FIXED_SHARE:
                logger.info(methodName, j.getId(), "Set fixed bit for FIXED job");
                s.setShareOrder(share_order);
                s.setFixed();
                break;
            case RESERVE:
                // For RESERVE the share order was already set from the machine
                // when the job arrived, so only the fixed bit is set here.
                logger.info(methodName, j.getId(), "Set fixed bit for RESERVE job");
                s.setFixed();
                break;
        }

        // Reconnect the share to its machine's nodepool and mark it busy.
        Machine m = s.getMachine();
        NodePool np = m.getNodepool();
        np.connectShare(s, m, j, s.getShareOrder());
        busyShares.put(s.getId(), s);
    }

    // Attach the job to its user, creating the user record on first sight.
    String username = j.getUserName();
    User user = users.get(username);
    if ( user == null ) {
        user = new User(username);
        users.put(username, user);
        logger.info(methodName, j.getId(), "&&&&&&&&&&&&&&&& new user", user.toString(), "-------------------");
    }
    j.setUser(user);
    user.addJob(j);

    j.promoteShares(); // NOT expanded, just recovered, promote them right away
    j.clearRecoveredShares();

    // Register the job globally and with its class.  (The original code looked
    // the class up a second time and re-set it on the job; 'rc' came from the
    // same map with the same key, so a single registration suffices.)
    allJobs.put(j.getId(), j);
    rc.addJob(j);

    logger.info(methodName, j.getId(), "Recovered job:", j.toString());
    logger.info(methodName, j.getId(), "Recovered shares:", sharenames.toString());
}
/**
 * The share is gone, purge from our structures: the busy-share map and the
 * machine that hosted it.  The job itself is not touched here; callers do that.
 *
 * @param s the departed share
 * @param j the owning job (currently unused here, kept for interface stability)
 */
private void purgeShare(Share s, IRmJob j)
{
    busyShares.remove(s.getId());   // so long, and thanks for all the fish
    s.getMachine().removeShare(s);
}
/**
 * Allocate the next DuccId from the shared factory.  Synchronized because the
 * factory is shared static state.
 *
 * @return a freshly-allocated id
 */
public synchronized static DuccId newId()
{
    return idFactory.next();
}
/**
 * Allocate a DuccId seeded with the given numeric id (used when the id is
 * already known, e.g. during recovery).  Synchronized because the factory is
 * shared static state.
 *
 * @param id the numeric id to use
 * @return the corresponding DuccId
 */
public synchronized static DuccId newId(long id)
{
    return idFactory.next(id);
}
/**
 * Ask every configured nodepool to query its machines.
 */
public void queryMachines()
{
    for ( int i = 0; i < nodepools.length; i++ ) {
        nodepools[i].queryMachines();
    }
}
/**
 * Orders machines by ascending share order; ties are broken by machine id so
 * the ordering is total and deterministic.
 */
class MachineByOrderSorter
    implements Comparator<Machine>
{
    @Override
    public int compare(Machine m1, Machine m2)
    {
        if ( m1.equals(m2) ) return 0;
        if ( m1.getShareOrder() == m2.getShareOrder() ) {
            // Same order: fall back to the id for a stable sort.
            return m1.getId().compareTo(m2.getId());
        }
        // Integer.compare avoids the overflow risk of subtracting the orders.
        return Integer.compare(m1.getShareOrder(), m2.getShareOrder());
    }
}
}