Package com.alibaba.otter.node.etl.select

Source Code of com.alibaba.otter.node.etl.select.SelectTask

/*
* Copyright (C) 2010-2101 Alibaba Group Holding Limited.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.alibaba.otter.node.etl.select;

import java.util.Date;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.LockSupport;

import org.slf4j.MDC;
import org.springframework.util.CollectionUtils;

import com.alibaba.otter.canal.common.CanalException;
import com.alibaba.otter.node.common.statistics.StatisticsClientService;
import com.alibaba.otter.node.etl.OtterConstants;
import com.alibaba.otter.node.etl.common.jmx.StageAggregation.AggregationItem;
import com.alibaba.otter.node.etl.common.pipe.PipeKey;
import com.alibaba.otter.node.etl.common.task.GlobalTask;
import com.alibaba.otter.node.etl.extract.SetlFuture;
import com.alibaba.otter.node.etl.select.exceptions.SelectException;
import com.alibaba.otter.node.etl.select.selector.Message;
import com.alibaba.otter.node.etl.select.selector.OtterSelector;
import com.alibaba.otter.node.etl.select.selector.OtterSelectorFactory;
import com.alibaba.otter.shared.arbitrate.model.EtlEventData;
import com.alibaba.otter.shared.arbitrate.model.TerminEventData;
import com.alibaba.otter.shared.common.model.config.channel.Channel;
import com.alibaba.otter.shared.common.model.config.enums.StageType;
import com.alibaba.otter.shared.common.model.statistics.delay.DelayCount;
import com.alibaba.otter.shared.common.utils.lock.BooleanMutex;
import com.alibaba.otter.shared.etl.model.DbBatch;
import com.alibaba.otter.shared.etl.model.EventData;
import com.alibaba.otter.shared.etl.model.Identity;
import com.alibaba.otter.shared.etl.model.RowBatch;

/**
* select流处理模式的实现版本
*
* <pre>
* 调度模型:
* 1. 正常运行调度流程
* 假如并行度为3
* ----------------------------------------------------------->时间轴
* | ProcessSelect
* --> 1
*      --> 2
*           -->3
*               --> 1
* | ProcessTermin
*     ---> 1 ack
*          ----> 2ack
*                 ---> 3ack
* a. ProcessSelect拿到数据后,丢入pool池进行异步处理,并通知ProcessTermin顺序接收termin信号
* b. 同一时间在s/e/t/l流水线上的数据受并行度控制,满了就会阻塞ProcessSelect,避免取过多的数据,只会多取一份,等待其中一个s/e/t/l完成
* c. ProcessTermin接受到termin信号
*    i. 会严格按照发出去的batchId/processId进行对比,发现不匹配,发起rollback操作.
*    ii. 会根据terminType判断这一批数据是否处理成功,如果发现不成功,发起rollback操作
*
* 2. 异常调度流程
* 假如并行度为3
* |-->1 --> 2 -->3(ing)
* a. 当第1份数据,ProcessTermin发现需要rollback,此时需要回滚2,3份数据的批次. (可能第2,3份数据还未提交到s/e/t/l调度中)
*    i. 如果2批次数据已经提交,等待2批次termin信号的返回,此时需要阻塞ProcessSelect,避免再取新数据
*    ii. 如果第2批次数据未提交,直接rollback数据,不再进入s/e/t/l调度流程
* b. 当所有批次都已经处理完成,再通知ProcessSelect启动 (注意:这里会避免rollback和get并发操作,会造成数据不一致)
*
* 3. 热备机制
* a. Select主线程会一直监听mainstem的信号,一旦抢占成功,则启动ProcessSelect/ProcessTermin线程
* b. ProcessSelect/ProcessTermin在处理过程中,会检查一下当前节点是否为抢占mainstem成功的节点,如果发现不是,立马停止,继续监听mainstem
* c. ProcessSelect进行get数据之前,会等到ProcessTermin会读取未被处理过termin信号,对上一次的selector进行ack/rollback处理
*      i. 注意:ProcessSelect进行get数据时,需要保证batch/termin/get操作状态保持一致,必须都处于同一个数据点上
* </pre>
*
* @author jianghang 2012-7-31 下午05:39:06
* @version 4.1.0
*/
public class SelectTask extends GlobalTask {

    // 运行调度控制
    private volatile boolean           isStart          = false;
    // 运行
    private StatisticsClientService    statisticsClientService;
    private OtterSelectorFactory       otterSelectorFactory;
    private OtterSelector<Message>     otterSelector;
    private ExecutorService            executor;
    private BlockingQueue<BatchTermin> batchBuffer      = new LinkedBlockingQueue<BatchTermin>(50); // 设置有界队列,避免小batch处理太多
    private boolean                    needCheck        = false;
    private BooleanMutex               canStartSelector = new BooleanMutex(false);                 // 非常轻量的一个阻塞式实现,调用成本低
    private AtomicInteger              rversion         = new AtomicInteger(0);
    private long                       lastResetTime    = new Date().getTime();

    public SelectTask(Long pipelineId){
        super(pipelineId);
    }

    public void run() {
        MDC.put(OtterConstants.splitPipelineLogFileKey, String.valueOf(pipelineId));
        try {
            while (running) {
                try {
                    if (isStart) {
                        boolean working = arbitrateEventService.mainStemEvent().check(pipelineId);
                        if (!working) {
                            stopup(false);
                        }

                        LockSupport.parkNanos(5 * 1000 * 1000L * 1000L); // 5秒钟检查一次
                    } else {
                        startup();
                    }
                } catch (Throwable e) {
                    if (isInterrupt(e)) {
                        logger.info("INFO ## this node is interrupt", e);
                        return;
                    } else {
                        logger.warn("WARN ## this node is crashed.", e);
                        sendRollbackTermin(pipelineId, e);

                        // sleep 10秒再进行重试
                        try {
                            Thread.sleep(10 * 1000);
                        } catch (InterruptedException e1) {
                        }
                    }
                }
            }
        } finally {
            arbitrateEventService.mainStemEvent().release(pipelineId);
        }
    }

    /**
     * 尝试启动,需要进行mainstem竞争,拿到锁之后才可以进行下一步
     *
     * @throws InterruptedException
     */
    private void startup() throws InterruptedException {
        try {
            arbitrateEventService.mainStemEvent().await(pipelineId);
        } catch (Throwable e) {
            if (isInterrupt(e)) {
                logger.info("INFO ## this node is interrupt", e);
            } else {
                logger.warn("WARN ## this node is crashed.", e);
            }
            arbitrateEventService.mainStemEvent().release(pipelineId);
            return;
        }

        executor = Executors.newFixedThreadPool(2); // 启动两个线程
        // 启动selector
        otterSelector = otterSelectorFactory.getSelector(pipelineId); // 获取对应的selector
        otterSelector.start();

        canStartSelector.set(false);// 初始化为false
        startProcessTermin();
        startProcessSelect();

        isStart = true;
    }

    private synchronized void stopup(boolean needInterrut) throws InterruptedException {
        if (isStart) {
            if (executor != null) {
                executor.shutdownNow();
            }

            if (otterSelector != null && otterSelector.isStart()) {
                otterSelector.stop();
            }

            if (needInterrut) {
                throw new InterruptedException();// 抛异常,退出自己
            }

            isStart = false;
        }
    }

    /**
     * 执行数据分发工作
     */
    private void startProcessSelect() {
        executor.submit(new Runnable() {

            public void run() {
                MDC.put(OtterConstants.splitPipelineLogFileKey, String.valueOf(pipelineId));
                String currentName = Thread.currentThread().getName();
                Thread.currentThread().setName(createTaskName(pipelineId, "ProcessSelect"));
                try {
                    processSelect();
                } finally {
                    Thread.currentThread().setName(currentName);
                    MDC.remove(OtterConstants.splitPipelineLogFileKey);
                }
            }
        });

    }

    private void processSelect() {
        while (running) {
            try {
                // 等待ProcessTermin exhaust,会阻塞
                // ProcessTermin发现出现rollback,会立即通知暂停,比分布式permit及时性高
                canStartSelector.get();

                // 判断当前是否为工作节点,S模块不能出现双节点工作,selector容易出现数据错乱
                if (needCheck) {
                    checkContinueWork();
                }

                // 出现阻塞挂起时,等待mananger处理完成,解挂开启同步
                arbitrateEventService.toolEvent().waitForPermit(pipelineId);// 出现rollback后能及时停住

                // 使用startVersion要解决的一个问题:出现rollback时,尽可能判断取出来的数据是rollback前还是rollback后,想办法丢弃rollback前的数据。
                // (因为出现rollback,之前取出去的几个批次的数据其实是没有执行成功,get取出来的数据会是其后一批数据,如果不丢弃的话,会出现后面的数据先执行,然后又回到出错的点,再执行一遍)
                // int startVersion = rversion.get();
                Message gotMessage = otterSelector.selector();

                // modify by ljh at 2012-09-10,startVersion获取操作应该放在拿到数据之后
                // 放在前面 : (遇到一个并发bug)
                // // a.
                // 先拿startVersion,再获取数据,在拿数据过程中rollback开始并完成了,导致selector返回时数据已经取到了末尾
                // // b. 在进行version判断时发现已经有变化,导致又触发一次拿数据的过程,此时的get
                // cursor已经到队列的末尾,拿不出任何数据,所以出现死等情况
                // 放在后面 : (一点点瑕疵)
                // // a.
                // 并发操作rollback和selector时,针对拿到rollback前的老数据,此时startVersion还未初始化,导致判断不出出现过rollback操作,后面的变更数据会提前同步
                // (概率性会比较高,取决于selector和初始化startVersion的时间间隔)
                int startVersion = rversion.get();

                if (canStartSelector.state() == false) { // 是否出现异常
                    // 回滚在出现异常的瞬间,拿出来的数据,因为otterSelector.selector()会循环,可能出现了rollback,其还未感知到
                    rollback(gotMessage.getId());
                    continue;
                }

                if (CollectionUtils.isEmpty(gotMessage.getDatas())) {// 处理下空数据,也得更新下游标,可能是回环数据被过滤掉
                    // 添加到待响应的buffer列表,不需要await termin信号,因为没启动过s/e/t/l流程
                    batchBuffer.put(new BatchTermin(gotMessage.getId(), false));
                    continue;
                }

                final EtlEventData etlEventData = arbitrateEventService.selectEvent().await(pipelineId);
                if (rversion.get() != startVersion) {// 说明存在过变化,中间出现过rollback,需要丢弃该数据
                    logger.warn("rollback happend , should skip this data and get new message.");
                    canStartSelector.get();// 确认一下rollback是否完成
                    gotMessage = otterSelector.selector();// 这时不管有没有数据,都需要执行一次s/e/t/l
                }

                final Message message = gotMessage;
                final BatchTermin batchTermin = new BatchTermin(message.getId(), etlEventData.getProcessId());
                batchBuffer.put(batchTermin); // 添加到待响应的buffer列表
                Runnable task = new Runnable() {

                    public void run() {
                        // 设置profiling信息
                        boolean profiling = isProfiling();
                        Long profilingStartTime = null;
                        if (profiling) {
                            profilingStartTime = System.currentTimeMillis();
                        }

                        MDC.put(OtterConstants.splitPipelineLogFileKey, String.valueOf(pipelineId));
                        String currentName = Thread.currentThread().getName();
                        Thread.currentThread().setName(createTaskName(pipelineId, "SelectWorker"));
                        try {
                            pipeline = configClientService.findPipeline(pipelineId);
                            List<EventData> eventData = message.getDatas();
                            long startTime = etlEventData.getStartTime();
                            if (!CollectionUtils.isEmpty(eventData)) {
                                startTime = eventData.get(0).getExecuteTime();
                            }

                            Channel channel = configClientService.findChannelByPipelineId(pipelineId);
                            RowBatch rowBatch = new RowBatch();
                            // 构造唯一标识
                            Identity identity = new Identity();
                            identity.setChannelId(channel.getId());
                            identity.setPipelineId(pipelineId);
                            identity.setProcessId(etlEventData.getProcessId());
                            rowBatch.setIdentity(identity);
                            // 进行数据合并
                            for (EventData data : eventData) {
                                rowBatch.merge(data);
                            }

                            long nextNodeId = etlEventData.getNextNid();
                            List<PipeKey> pipeKeys = rowDataPipeDelegate.put(new DbBatch(rowBatch), nextNodeId);
                            etlEventData.setDesc(pipeKeys);
                            etlEventData.setNumber((long) eventData.size());
                            etlEventData.setFirstTime(startTime); // 使用原始数据的第一条
                            etlEventData.setBatchId(message.getId());

                            if (profiling) {
                                Long profilingEndTime = System.currentTimeMillis();
                                stageAggregationCollector.push(pipelineId,
                                                               StageType.SELECT,
                                                               new AggregationItem(profilingStartTime, profilingEndTime));
                            }
                            arbitrateEventService.selectEvent().single(etlEventData);
                        } catch (Throwable e) {
                            if (!isInterrupt(e)) {
                                logger.error(String.format("[%s] selectwork executor is error! data:%s", pipelineId,
                                                           etlEventData), e);
                                sendRollbackTermin(pipelineId, e);
                            } else {
                                logger.info(String.format("[%s] selectwork executor is interrrupt! data:%s",
                                                          pipelineId, etlEventData), e);
                            }
                        } finally {
                            Thread.currentThread().setName(currentName);
                            MDC.remove(OtterConstants.splitPipelineLogFileKey);
                        }
                    }
                };

                // 构造pending任务,可在关闭线程时退出任务
                SetlFuture extractFuture = new SetlFuture(StageType.SELECT, etlEventData.getProcessId(), pendingFuture,
                                                          task);
                executorService.execute(extractFuture);

            } catch (Throwable e) {
                if (!isInterrupt(e)) {
                    logger.error(String.format("[%s] selectTask is error!", pipelineId), e);
                    sendRollbackTermin(pipelineId, e);
                } else {
                    logger.info(String.format("[%s] selectTask is interrrupt!", pipelineId), e);
                    return;
                }
            }
        }
    }

    private void startProcessTermin() {
        executor.submit(new Runnable() {

            public void run() {
                MDC.put(OtterConstants.splitPipelineLogFileKey, String.valueOf(pipelineId));
                String currentName = Thread.currentThread().getName();
                Thread.currentThread().setName(createTaskName(pipelineId, "ProcessTermin"));
                try {
                    boolean lastStatus = true;
                    while (running) {
                        try {
                            // 处理下上一次非正常退出后,未被处理的termin信息
                            lastStatus = true;
                            // 消费掉所有的termin,不管成功还是失败,都回滚canal中的数据,会有重复
                            arbitrateEventService.terminEvent().exhaust(pipelineId);

                            batchBuffer.clear();// 清空上一次的待处理的batch记录,因为所有的batch都会被rollback掉

                            // 开始处理新的termin数据
                            while (running) {
                                if (batchBuffer.size() == 0) {
                                    // termin任务已经处理完成,可以通知selector重新开始拉数据
                                    if (canStartSelector.state() == false) {
                                        otterSelector.rollback();// rollback一下所有节点,确保所有的节点都被ack,包括被预取出来的数据
                                    }

                                    lastStatus = true;
                                    canStartSelector.set(true);
                                }

                                BatchTermin batch = batchBuffer.take();
                                logger.info("start process termin : {}", batch.toString());
                                if (batch.isNeedWait()) {
                                    lastStatus = processTermin(lastStatus, batch.getBatchId(), batch.getProcessId());
                                } else {
                                    // 不需要wait的批次,直接以上一个batch的结果决定是否ack
                                    if (lastStatus) {
                                        ack(batch.getBatchId());
                                        sendDelayReset(pipelineId);
                                    } else {
                                        rollback(batch.getBatchId());// 会阻塞selector等待所有batch的rollback操作完成
                                    }
                                }

                                logger.info("end process termin : {}  result : {}", batch.toString(), lastStatus);
                            }
                        } catch (CanalException e) {// 捕获可处理的异常,进行retry,基本可自行恢复
                            logger.info(String.format("[%s] ProcessTermin has an error! retry...", pipelineId), e);
                            notifyRollback();
                        } catch (SelectException e) {// 捕获可处理的异常,进行retry,基本可自行恢复
                            logger.info(String.format("[%s] ProcessTermin has an error! retry...", pipelineId), e);
                            notifyRollback();
                        } catch (Throwable e) {
                            if (isInterrupt(e)) {
                                logger.info(String.format("[%s] ProcessTermin is interrupted!", pipelineId), e);
                                return;
                            } else {
                                logger.error(String.format("[%s] ProcessTermin is error!", pipelineId), e);
                                notifyRollback();
                                sendRollbackTermin(pipelineId, e);
                            }
                        }

                        try {
                            Thread.sleep(30000); // sleep 30,等待termin信号都ready
                        } catch (InterruptedException e) {
                        }
                    }
                } finally {
                    Thread.currentThread().setName(currentName);
                    MDC.remove(OtterConstants.splitPipelineLogFileKey);
                }
            }

        });
    }

    private boolean processTermin(boolean lastStatus, Long batchId, Long processId) throws InterruptedException {
        int retry = 0;
        SelectException exception = null;
        TerminEventData terminData = null;
        while (retry++ < 30) {
            // 因为存在网络因素,而且在Load进行termin处理时,因为是异步处理,有一定的概率会出现termin不按顺序过来
            terminData = arbitrateEventService.terminEvent().await(pipelineId);
            Long terminBatchId = terminData.getBatchId();
            Long terminProcessId = terminData.getProcessId();

            if (terminBatchId == null && processId != -1L && !processId.equals(terminProcessId)) {
                // 针对manager发起rollback,terminBatchId可能为null,需要特殊处理下
                exception = new SelectException("unmatched processId, SelectTask batchId = " + batchId
                                                + " processId = " + processId + " and Termin Event: "
                                                + terminData.toString());
                Thread.sleep(1000); // sleep 1秒,等新的数据包
            } else if (terminBatchId != null && batchId != -1L && !batchId.equals(terminBatchId)) {
                exception = new SelectException("unmatched terminId, SelectTask batchId = " + batchId + " processId = "
                                                + processId + " and Termin Event: " + terminData.toString());
                Thread.sleep(1000); // sleep 1秒,等新的数据包
            } else {
                exception = null; // batchId/processId对上了,退出
                break;
            }
        }

        if (exception != null) {
            throw exception;
        }

        if (needCheck) {
            checkContinueWork();
        }

        boolean status = terminData.getType().isNormal();
        if (lastStatus == false && status == true) {
            // 上一批失败,这一批成功,说明调度有问题
            throw new SelectException(
                                      String.format("last status is rollback , but now [batchId:%d , processId:%d] is ack",
                                                    batchId, terminData.getProcessId()));
        }

        if (terminData.getType().isNormal()) {
            ack(batchId);
            sendDelayStat(pipelineId, terminData.getEndTime(), terminData.getFirstTime());
        } else {
            rollback(batchId);
        }

        arbitrateEventService.terminEvent().ack(terminData); // 先发送对应的数据
        return status;
    }

    private void rollback(Long batchId) {
        notifyRollback();
        // otterSelector.rollback(batchId);
        otterSelector.rollback();// 一旦出错,rollback所有的mark,避免拿出后面的数据进行同步
    }

    private void ack(Long batchId) {
        canStartSelector.set(true);
        otterSelector.ack(batchId);
    }

    private void notifyRollback() {
        canStartSelector.set(false);
        rversion.incrementAndGet();// 变更一下版本
    }

    /**
     * 检查一下是否需要继续工作,因为mainstem代表一个节点是否为工作节点,一旦出现断网,另一个节点就会启用。此时会出现双节点同时工作, 所以需要做一个检查
     */
    private void checkContinueWork() throws InterruptedException {
        boolean working = arbitrateEventService.mainStemEvent().check(pipelineId);
        if (!working) {
            logger.warn("mainstem is not run in this node");
            stopup(true);
        }

    }

    public void shutdown() {
        super.shutdown();

        if (executor != null) {
            executor.shutdownNow();
        }

        if (otterSelector != null && otterSelector.isStart()) {
            otterSelector.stop();
        }
    }

    public static class BatchTermin {

        private Long    batchId   = -1L;
        private Long    processId = -1L;
        private boolean needWait  = true;

        public BatchTermin(Long batchId, Long processId){
            this(batchId, processId, true);
        }

        public BatchTermin(Long batchId, boolean needWait){
            this(batchId, -1L, needWait);
        }

        public BatchTermin(Long batchId, Long processId, boolean needWait){
            this.batchId = batchId;
            this.processId = processId;
            this.needWait = needWait;
        }

        public Long getBatchId() {
            return batchId;
        }

        public void setBatchId(Long batchId) {
            this.batchId = batchId;
        }

        public Long getProcessId() {
            return processId;
        }

        public void setProcessId(Long processId) {
            this.processId = processId;
        }

        public boolean isNeedWait() {
            return needWait;
        }

        public void setNeedWait(boolean needWait) {
            this.needWait = needWait;
        }

        @Override
        public String toString() {
            return "BatchTermin [batchId=" + batchId + ", needWait=" + needWait + ", processId=" + processId + "]";
        }

    }

    private void sendDelayStat(long pipelineId, Long endTime, Long startTime) {
        DelayCount delayCount = new DelayCount();
        delayCount.setPipelineId(pipelineId);
        delayCount.setNumber(0L);// 不再统计delayNumber
        if (startTime != null && endTime != null) {
            delayCount.setTime(endTime - startTime);// 以后改造成获取数据库的sysdate/now()
        }

        statisticsClientService.sendResetDelayCount(delayCount);
    }

    private void sendDelayReset(long pipelineId) {
        long currentTime = System.currentTimeMillis();
        if (currentTime - lastResetTime > 60 * 1000) {
            // 60秒向manager推送一次配置
            lastResetTime = currentTime;
            DelayCount delayCount = new DelayCount();
            delayCount.setPipelineId(pipelineId);
            delayCount.setNumber(0L);
            long delayTime = (new Date().getTime() - otterSelector.lastEntryTime() + 500) / 1000;// 4舍5入
            delayCount.setTime(delayTime);
            statisticsClientService.sendResetDelayCount(delayCount);
        }
    }

    // ======================= setter / getter ===================

    public void setOtterSelectorFactory(OtterSelectorFactory otterSelectorFactory) {
        this.otterSelectorFactory = otterSelectorFactory;
    }

    public void setStatisticsClientService(StatisticsClientService statisticsClientService) {
        this.statisticsClientService = statisticsClientService;
    }

}
TOP

Related Classes of com.alibaba.otter.node.etl.select.SelectTask

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.