}
if (!isClusterProvisionedByBDE(clusterDef)) {
throw SoftwareManagementPluginException.START_CLUSTER_FAILED_NOT_PROV_BY_BDE(clusterName);
}
ClusterReport clusterReport = clusterDef.getCurrentReport();
clusterReport.setAction("Ambari is starting services");
clusterReport.setProgress(ProgressSplit.OPERATION_BEGIN.getProgress());
reportStatus(clusterReport, reports);
boolean success = false;
// When starting services, some tasks fail with the error message "Host Role in invalid state".
// The failing tasks are random (NodeManager, ResourceManager and NAGIOS have all been seen to fail), and the
// root cause is not clear yet. Every retry so far has succeeded, so retry logic is added here as
// a temporary fix.
//TODO(qjin): find out the root cause of the failure when starting services
Exception resultException = null;
try {
ReflectionUtils.getPreStartServicesHook().preStartServices(clusterName, 120);
for (int i = 0; i < getRequestMaxRetryTimes(); i++) {
ApiRequest apiRequestSummary;
try {
apiRequestSummary = apiManager.startAllServicesInCluster(clusterName);
//Reaching here means the command itself succeeded. If ApiRequestInfo is null, the command has already
//finished successfully; otherwise we need to wait for it using doSoftwareOperation
if (apiRequestSummary == null || apiRequestSummary.getApiRequestInfo() == null) {
success = true;
return true;
}
success = doSoftwareOperation(clusterBlueprint.getName(), apiRequestSummary, clusterReport, reports);
} catch (Exception e) {
resultException = e;
logger.warn("Failed to start cluster services, retrying after 5 seconds...", e);
try {
Thread.sleep(5000);
} catch (InterruptedException interrupt) {
logger.info("interrupted when sleeping, trying to start cluster services immediately");
}
}
}
} finally {
if (!success) {
logger.error("Failed to start all services: ", resultException);
throw SoftwareManagementPluginException.START_CLUSTER_FAILED(resultException, Constants.AMBARI_PLUGIN_NAME, clusterName);
}
clusterReport.setClusterAndNodesServiceStatus(ServiceStatus.STARTED);
clusterReport.setClusterAndNodesAction("");
clusterReport.clearAllNodesErrorMsg();
reportStatus(clusterReport.clone(), reports);
return true;
}
}