Package com.taobao.zeus.socket.master

Source Code of com.taobao.zeus.socket.master.Master

package com.taobao.zeus.socket.master;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import org.jboss.netty.channel.Channel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.taobao.zeus.broadcast.alarm.MailAlarm;
import com.taobao.zeus.broadcast.alarm.SMSAlarm;
import com.taobao.zeus.client.ZeusException;
import com.taobao.zeus.model.DebugHistory;
import com.taobao.zeus.model.FileDescriptor;
import com.taobao.zeus.model.JobDescriptor;
import com.taobao.zeus.model.JobHistory;
import com.taobao.zeus.model.JobStatus;
import com.taobao.zeus.model.JobStatus.TriggerType;
import com.taobao.zeus.model.Profile;
import com.taobao.zeus.schedule.mvc.AddJobListener;
import com.taobao.zeus.schedule.mvc.DebugInfoLog;
import com.taobao.zeus.schedule.mvc.DebugListener;
import com.taobao.zeus.schedule.mvc.JobController;
import com.taobao.zeus.schedule.mvc.JobFailListener;
import com.taobao.zeus.schedule.mvc.JobSuccessListener;
import com.taobao.zeus.schedule.mvc.ScheduleInfoLog;
import com.taobao.zeus.schedule.mvc.StopScheduleJobListener;
import com.taobao.zeus.schedule.mvc.ZeusJobException;
import com.taobao.zeus.schedule.mvc.event.DebugFailEvent;
import com.taobao.zeus.schedule.mvc.event.DebugSuccessEvent;
import com.taobao.zeus.schedule.mvc.event.Events;
import com.taobao.zeus.schedule.mvc.event.JobFailedEvent;
import com.taobao.zeus.schedule.mvc.event.JobSuccessEvent;
import com.taobao.zeus.socket.SocketLog;
import com.taobao.zeus.socket.master.MasterWorkerHolder.HeartBeatInfo;
import com.taobao.zeus.socket.master.reqresp.MasterExecuteJob;
import com.taobao.zeus.socket.protocol.Protocol.ExecuteKind;
import com.taobao.zeus.socket.protocol.Protocol.Response;
import com.taobao.zeus.socket.protocol.Protocol.Status;
import com.taobao.zeus.store.GroupBean;
import com.taobao.zeus.store.JobBean;
import com.taobao.zeus.util.Environment;
import com.taobao.zeus.util.Tuple;

public class Master {

  private MasterContext context;
  private static Logger log = LoggerFactory.getLogger(Master.class);

  public Master(final MasterContext context) {
    this.context = context;
    GroupBean root = context.getGroupManager().getGlobeGroupBean();

    if (Environment.isPrePub()) {
      // 如果是预发环境,添加stop listener,阻止自动调度执行
      context.getDispatcher().addDispatcherListener(
          new StopScheduleJobListener());
    }
    context.getDispatcher().addDispatcherListener(
        new AddJobListener(context, this));
    context.getDispatcher().addDispatcherListener(
        new JobFailListener(context));
    context.getDispatcher().addDispatcherListener(
        new DebugListener(context));
    context.getDispatcher().addDispatcherListener(
        new JobSuccessListener(context));
    Map<String, JobBean> allJobBeans = root.getAllSubJobBeans();
    for (String id : allJobBeans.keySet()) {
      context.getDispatcher().addController(
          new JobController(context, this, id));
    }

    // 初始化
    context.getDispatcher().forwardEvent(Events.Initialize);
    context.setMaster(this);

    // 定时扫描等待队列
    context.getSchedulePool().scheduleAtFixedRate(new Runnable() {
      @Override
      public void run() {
        try {
          scan();
        } catch (Exception e) {
        }
      }
    }, 0, 3, TimeUnit.SECONDS);
    // 定时扫描worker channel,心跳超过1分钟没有连接就主动断掉
    context.getSchedulePool().scheduleAtFixedRate(new Runnable() {
      @Override
      public void run() {
        Date now = new Date();
        for (MasterWorkerHolder holder : new ArrayList<MasterWorkerHolder>(
            context.getWorkers().values())) {
          if (holder.getHeart().timestamp == null
              || (now.getTime() - holder.getHeart().timestamp
                  .getTime()) > 1000 * 60) {
            holder.getChannel().close();
          }
        }
      }
    }, 30, 30, TimeUnit.SECONDS);
  }

  private MasterWorkerHolder getRunableWorker() {
    MasterWorkerHolder selectWorker = null;
    Float selectMemRate = null;
    for (MasterWorkerHolder worker : context.getWorkers().values()) {
      HeartBeatInfo heart = worker.getHeart();
      if (heart != null && heart.memRate != null && heart.memRate < 0.8) {
        if (selectWorker == null) {
          selectWorker = worker;
          selectMemRate = heart.memRate;
        } else if (selectMemRate > heart.memRate) {
          selectWorker = worker;
          selectMemRate = heart.memRate;
        }
      }
    }
    return selectWorker;
  }

  private void scan() {
    if (!context.getQueue().isEmpty()
        || !context.getManualQueue().isEmpty()
        || !context.getDebugQueue().isEmpty()) {
      MasterWorkerHolder selectWorker = getRunableWorker();
      if (selectWorker != null) {
        if (!context.getQueue().isEmpty()) {// 调度任务
          runScheduleJob(selectWorker);
        } else if (!context.getManualQueue().isEmpty()) {
          runManualJob(selectWorker);
        } else if (!context.getDebugQueue().isEmpty()) {// 调试任务
          runDebugJob(selectWorker);
        }
      }
    }
    // 检测任务超时
    checkTimeOver();
  }

  private void runDebugJob(MasterWorkerHolder selectWorker) {
    final MasterWorkerHolder w = selectWorker;
    final String debugId = context.getDebugQueue().poll();
    SocketLog.info("master scan and poll debugId=" + debugId + " and run!");

    new Thread() {
      @Override
      public void run() {
        DebugHistory history = context.getDebugHistoryManager()
            .findDebugHistory(debugId);
        history.getLog().appendZeus(
            new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
                .format(new Date()) + " 开始运行");
        context.getDebugHistoryManager().updateDebugHistoryLog(debugId,
            history.getLog().getContent());
        Exception exception = null;
        Response resp = null;
        try {
          Future<Response> f = new MasterExecuteJob().executeJob(
              context, w, ExecuteKind.DebugKind, debugId);
          resp = f.get();
        } catch (Exception e) {
          exception = e;
          DebugInfoLog.error(
              String.format("debugId:%s run failed", debugId), e);
        }
        boolean success = resp.getStatus() == Status.OK ? true : false;

        if (!success) {
          // 运行失败,更新失败状态,发出失败消息
          if (exception != null) {
            exception = new ZeusException(String.format(
                "fileId:%s run failed ", history.getFileId()),
                exception);
          } else {
            exception = new ZeusException(String.format(
                "fileId:%s run failed ", history.getFileId()));
          }
          DebugInfoLog.info("debugId:" + debugId + " run fail ");
          history = context.getDebugHistoryManager()
              .findDebugHistory(debugId);
          DebugFailEvent jfe = new DebugFailEvent(
              history.getFileId(), history, exception);
          context.getDispatcher().forwardEvent(jfe);
        } else {
          // 运行成功,发出成功消息
          DebugInfoLog.info("debugId:" + debugId + " run success");
          DebugSuccessEvent dse = new DebugSuccessEvent(
              history.getFileId(), history);
          context.getDispatcher().forwardEvent(dse);
        }
      }
    }.start();
  }

  private void runManualJob(MasterWorkerHolder selectWorker) {
    final MasterWorkerHolder w = selectWorker;
    final String historyId = context.getManualQueue().poll();
    SocketLog.info("master scan and poll historyId=" + historyId
        + " and run!");
    new Thread() {
      @Override
      public void run() {
        JobHistory history = context.getJobHistoryManager()
            .findJobHistory(historyId);
        history.getLog().appendZeus(
            new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
                .format(new Date()) + " 开始运行");
        context.getJobHistoryManager().updateJobHistoryLog(historyId,
            history.getLog().getContent());
        Exception exception = null;
        Response resp = null;
        try {
          Future<Response> f = new MasterExecuteJob().executeJob(
              context, w, ExecuteKind.ManualKind, historyId);
          resp = f.get();
        } catch (Exception e) {
          exception = e;
          ScheduleInfoLog.error("JobId:" + history.getJobId()
              + " run failed", e);
        }
        boolean success = resp.getStatus() == Status.OK ? true : false;

        if (!success) {
          // 运行失败,更新失败状态,发出失败消息
          ZeusJobException jobException = null;
          if (exception != null) {
            jobException = new ZeusJobException(history.getJobId(),
                String.format("JobId:%s run failed ",
                    history.getJobId()), exception);
          } else {
            jobException = new ZeusJobException(history.getJobId(),
                String.format("JobId:%s run failed ",
                    history.getJobId()));
          }
          ScheduleInfoLog.info("jobId:" + history.getJobId()
              + " run fail ");
          history = context.getJobHistoryManager().findJobHistory(
              historyId);
          JobFailedEvent jfe = new JobFailedEvent(history.getJobId(),
              history.getTriggerType(), history, jobException);
          context.getDispatcher().forwardEvent(jfe);
        } else {
          // 运行成功,发出成功消息
          ScheduleInfoLog.info("manual jobId::" + history.getJobId()
              + " run success");
          JobSuccessEvent jse = new JobSuccessEvent(
              history.getJobId(), history.getTriggerType(),
              historyId);
          context.getDispatcher().forwardEvent(jse);
        }
      };
    }.start();
  }

  private void runScheduleJob(MasterWorkerHolder selectWorker) {
    final MasterWorkerHolder w = selectWorker;
    final String jobId = context.getQueue().poll();
    SocketLog.info("master scan and poll jobId=" + jobId + " and run!");
    new Thread() {
      @Override
      public void run() {
        JobHistory his = context.getJobHistoryManager().findJobHistory(
            context.getGroupManager().getJobStatus(jobId)
                .getHistoryId());
        TriggerType type = his.getTriggerType();
        ScheduleInfoLog.info("JobId:" + jobId + " run start");
        his.getLog().appendZeus(
            new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
                .format(new Date()) + " 开始运行");
        context.getJobHistoryManager().updateJobHistoryLog(his.getId(),
            his.getLog().getContent());
        Exception exception = null;
        Response resp = null;
        try {
          Future<Response> f = new MasterExecuteJob().executeJob(
              context, w, ExecuteKind.ScheduleKind, his.getId());
          resp = f.get();
        } catch (Exception e) {
          exception = e;
          ScheduleInfoLog.error(
              String.format("JobId:%s run failed", jobId), e);
        }
        boolean success = resp.getStatus() == Status.OK ? true : false;

        JobStatus jobstatus = context.getGroupManager().getJobStatus(
            jobId);
        jobstatus
            .setStatus(com.taobao.zeus.model.JobStatus.Status.WAIT);
        if (success
            && (his.getTriggerType() == TriggerType.SCHEDULE || his
                .getTriggerType() == TriggerType.MANUAL_RECOVER)) {
          ScheduleInfoLog.info("JobId:" + jobId
              + " clear ready dependency");
          jobstatus.setReadyDependency(new HashMap<String, String>());
        }
        context.getGroupManager().updateJobStatus(jobstatus);

        if (!success) {
          // 运行失败,更新失败状态,发出失败消息
          ZeusJobException jobException = null;
          if (exception != null) {
            jobException = new ZeusJobException(jobId,
                String.format("JobId:%s run failed ", jobId),
                exception);
          } else {
            jobException = new ZeusJobException(jobId,
                String.format("JobId:%s run failed ", jobId));
          }
          ScheduleInfoLog.info("JobId:" + jobId
              + " run fail and dispatch the fail event");
          JobFailedEvent jfe = new JobFailedEvent(jobId, type,
              context.getJobHistoryManager().findJobHistory(
                  his.getId()), jobException);
          context.getDispatcher().forwardEvent(jfe);
        } else {
          // 运行成功,发出成功消息
          ScheduleInfoLog.info("JobId:" + jobId
              + " run success and dispatch the success event");
          JobSuccessEvent jse = new JobSuccessEvent(jobId,
              his.getTriggerType(), his.getId());
          context.getDispatcher().forwardEvent(jse);
        }
      }
    }.start();
  }

  /**
   * 检查任务超时
   */
  private void checkTimeOver() {
    for (MasterWorkerHolder w : context.getWorkers().values()) {
      checkScheduleTimeOver(w);
      checkManualTimeOver(w);
      checkDebugTimeOver(w);
    }
  }

  private void checkDebugTimeOver(MasterWorkerHolder w) {
    for (Map.Entry<String, Boolean> entry : w.getDebugRunnings().entrySet()) {
      if (entry.getValue() != null && entry.getValue()) {
        continue;
      }
      String historyId = entry.getKey();
      DebugHistory his = context.getDebugHistoryManager()
          .findDebugHistory(historyId);
      long maxTime;
      FileDescriptor fd;
      try {
        fd = context.getFileManager().getFile(his.getFileId());
        Profile pf = context.getProfileManager().findByUid(
            fd.getOwner());
        String maxTimeString = pf.getHadoopConf().get(
            "zeus.job.maxtime");
        if (maxTimeString == null || maxTimeString.trim().isEmpty()) {
          continue;
        }
        maxTime = Long.parseLong(maxTimeString);

        if (maxTime < 0) {
          continue;
        }
      } catch (Exception e) {
        continue;
      }
      long runTime = (System.currentTimeMillis() - his.getStartTime()
          .getTime()) / 1000 / 60;
      if (runTime > maxTime) {
        if (timeOverAlarm(null, fd, runTime, maxTime, 2)) {
          w.getDebugRunnings().replace(historyId, false, true);
        }
      }
    }
  }

  private void checkManualTimeOver(MasterWorkerHolder w) {
    for (Map.Entry<String, Boolean> entry : w.getManualRunnings()
        .entrySet()) {
      if (entry.getValue() != null && entry.getValue()) {
        continue;
      }
      String historyId = entry.getKey();
      JobHistory his = context.getJobHistoryManager().findJobHistory(
          historyId);
      long maxTime;
      try {
        JobDescriptor jd = context.getGroupManager()
            .getJobDescriptor(his.getJobId()).getX();
        String maxTimeString = jd.getProperties().get(
            "zeus.job.maxtime");
        if (maxTimeString == null || maxTimeString.trim().isEmpty()) {
          continue;
        }
        maxTime = Long.parseLong(maxTimeString);

        if (maxTime < 0) {
          continue;
        }
      } catch (Exception e) {
        continue;
      }
      long runTime = (System.currentTimeMillis() - his.getStartTime()
          .getTime()) / 1000 / 60;
      if (runTime > maxTime) {
        if (timeOverAlarm(his, null, runTime, maxTime, 1)) {
          w.getManualRunnings().replace(historyId, false, true);
        }
      }
    }
  }

  private void checkScheduleTimeOver(MasterWorkerHolder w) {
    for (Map.Entry<String, Boolean> entry : w.getRunnings().entrySet()) {
      if (entry.getValue() != null && entry.getValue()) {
        continue;
      }
      String jobId = entry.getKey();
      JobDescriptor jd = context.getGroupManager()
          .getJobDescriptor(jobId).getX();
      String maxTimeString = jd.getProperties().get("zeus.job.maxtime");
      long maxTime;
      try {
        if (maxTimeString == null || maxTimeString.trim().isEmpty()) {
          continue;
        }
        maxTime = Long.parseLong(maxTimeString);

        if (maxTime < 0) {
          continue;
        }
      } catch (Exception e) {
        continue;
      }

      JobHistory his = context.getJobHistoryManager().findJobHistory(
          context.getGroupManager().getJobStatus(jobId)
              .getHistoryId());
      long runTime = (System.currentTimeMillis() - his.getStartTime()
          .getTime()) / 1000 / 60;
      if (runTime > maxTime) {
        if (timeOverAlarm(his, null, runTime, maxTime, 0)) {
          w.getRunnings().replace(jobId, false, true);
        }
      }
    }
  }

  private boolean timeOverAlarm(final JobHistory his, FileDescriptor fd,
      long runTime, long maxTime, int type) {
    final MailAlarm mailAlarm = (MailAlarm) context.getApplicationContext()
        .getBean("mailAlarm");
    SMSAlarm smsAlarm = (SMSAlarm) context.getApplicationContext().getBean(
        "smsAlarm");

    final StringBuffer title = new StringBuffer("宙斯任务超时[");
    switch (type) {
    case 0:
      title.append("自动调度").append("] jobID=").append(his.getJobId());
      break;
    case 1:
      title.append("手动调度").append("] jobID=").append(his.getJobId());
      break;
    case 2:
      title.append("调试任务").append("] 脚本名称:").append(fd.getName());
    }
    final StringBuffer content = new StringBuffer(title);
    content.append("\n已经运行时间:").append(runTime).append("分钟")
        .append("\n设置最大运行时间:").append(maxTime).append("分钟")
        .append("\n详情请登录zeus系统查看:http://zeus.taobao.com:9999");
    try {
      if (type == 2) {
        //此处可以发送IM消息
      } else {
        //此处可以发送IM消息
        new Thread() {
          @Override
          public void run() {
            try {
              Thread.sleep(6000);
              mailAlarm.alarm(his.getId(), title.toString(),
                  content.toString().replace("\n","<br/>").replace("http://zeus.taobao.com:9999", "<a href='http://zeus.taobao.com:9999'>http://zeus.taobao.com:9999</a>"));
            } catch (Exception e) {
              log.error("send run timeover mail alarm failed", e);
            }
          }
        }.start();
        if (type == 0) {
          Calendar now = Calendar.getInstance();
          int hour = now.get(Calendar.HOUR_OF_DAY);
          int day = now.get(Calendar.DAY_OF_WEEK);
          if (day == Calendar.SATURDAY || day == Calendar.SUNDAY
              || hour < 9 || hour > 18) {
            smsAlarm.alarm(his.getId(), title.toString(),
                content.toString(), null);
          }
        }
      }
      return true;
    } catch (Exception e) {
      log.error("send run timeover alarm failed", e);
      return false;
    }
  }

  public void workerDisconnectProcess(Channel channel) {
    MasterWorkerHolder holder = context.getWorkers().get(channel);
    if (holder != null) {
      context.getWorkers().remove(channel);
      final List<JobHistory> hiss = new ArrayList<JobHistory>();
      Map<String, Tuple<JobDescriptor, JobStatus>> map = context
          .getGroupManager().getJobDescriptor(
              holder.getRunnings().keySet());
      for (String key : map.keySet()) {
        JobStatus js = map.get(key).getY();
        if (js.getHistoryId() != null) {
          hiss.add(context.getJobHistoryManager().findJobHistory(
              js.getHistoryId()));
        }
        js.setStatus(com.taobao.zeus.model.JobStatus.Status.FAILED);
        context.getGroupManager().updateJobStatus(js);
      }
      new Thread() {
        @Override
        public void run() {
          try {
            Thread.sleep(30000);
          } catch (InterruptedException e) {
          }
          for (JobHistory his : hiss) {
            String jobId = his.getJobId();
            JobHistory history = new JobHistory();
            history.setJobId(jobId);
            history.setTriggerType(his.getTriggerType());
            history.setIllustrate("worker断线,重新跑任务");
            history.setOperator(his.getOperator());
            context.getJobHistoryManager().addJobHistory(history);
            Master.this.run(history);
          }
        };
      }.start();

    }
  }

  public void debug(DebugHistory debug) {
    debug.setStatus(com.taobao.zeus.model.JobStatus.Status.RUNNING);
    debug.setStartTime(new Date());
    context.getDebugHistoryManager().updateDebugHistory(debug);
    debug.getLog().appendZeus(
        new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date())
            + " 进入任务队列");
    context.getDebugHistoryManager().updateDebugHistoryLog(debug.getId(),
        debug.getLog().getContent());
    context.getDebugQueue().offer(debug.getId());
  }

  public JobHistory run(JobHistory history) {
    history.setStatus(com.taobao.zeus.model.JobStatus.Status.RUNNING);
    String jobId = history.getJobId();
    if (history.getTriggerType() == TriggerType.MANUAL_RECOVER) {
      for (String job : new ArrayList<String>(context.getQueue())) {
        if (job.equals(jobId)) {
          history.getLog().appendZeus("已经在队列中,无法再次运行");
          history.setStartTime(new Date());
          history.setEndTime(new Date());
          history.setStatus(com.taobao.zeus.model.JobStatus.Status.FAILED);
          break;
        }
      }
      for (Channel key : context.getWorkers().keySet()) {
        MasterWorkerHolder worker = context.getWorkers().get(key);
        if (worker.getRunnings().containsKey(jobId)) {
          history.getLog().appendZeus("已经在运行中,无法再次运行");
          history.setStartTime(new Date());
          history.setEndTime(new Date());
          history.setStatus(com.taobao.zeus.model.JobStatus.Status.FAILED);
          break;
        }
      }
    }

    if (history.getStatus() == com.taobao.zeus.model.JobStatus.Status.RUNNING) {
      history.getLog().appendZeus(
          new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
              .format(new Date()) + " 进入任务队列");
      context.getJobHistoryManager().updateJobHistoryLog(history.getId(),
          history.getLog().getContent());
      if (history.getTriggerType() == TriggerType.MANUAL) {
        context.getManualQueue().offer(history.getId());
      } else {
        JobStatus js = context.getGroupManager().getJobStatus(
            history.getJobId());
        js.setStatus(com.taobao.zeus.model.JobStatus.Status.RUNNING);
        js.setHistoryId(history.getId());
        context.getGroupManager().updateJobStatus(js);
        context.getQueue().offer(jobId);
      }
    }
    context.getJobHistoryManager().updateJobHistory(history);
    context.getJobHistoryManager().updateJobHistoryLog(history.getId(),
        history.getLog().getContent());
    return history;
  }
}
TOP

Related Classes of com.taobao.zeus.socket.master.Master

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.