Package com.cloudera.flume.agent.diskfailover

Source Code of com.cloudera.flume.agent.diskfailover.TestDiskFailoverAgent

/**
* Licensed to Cloudera, Inc. under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  Cloudera, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.flume.agent.diskfailover;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.cloudera.flume.agent.DirectMasterRPC;
import com.cloudera.flume.agent.FlumeNode;
import com.cloudera.flume.agent.LivenessManager;
import com.cloudera.flume.agent.MasterRPC;
import com.cloudera.flume.conf.FlumeConfiguration;
import com.cloudera.flume.master.FlumeMaster;
import com.cloudera.util.NetUtils;

public class TestDiskFailoverAgent {
  public static final Logger LOG = LoggerFactory
      .getLogger(TestDiskFailoverAgent.class);

  FlumeMaster master = null;

  FlumeConfiguration cfg;

  @Before
  public void setCfg() throws IOException {
    // Isolate tests by only using simple cfg store
    cfg = FlumeConfiguration.createTestableConfiguration();
    cfg.set(FlumeConfiguration.MASTER_STORE, "memory");
    cfg.set(FlumeConfiguration.WEBAPPS_PATH, "build/webapps");
  }

  @After
  public void shutdownMaster() {
    if (master != null) {
      master.shutdown();
      master = null;
    }
  }

  /**
   * This test starts a DFO agent that attempts to go to a port that shouldn't
   * be open. This triggers the error recovery mechanism. We then wait for 10s
   * and then simulate the node doing a heartbeat to a master which tells that
   * that node is decomissioned. As part of decomissioning, the act of closing
   * the dfo agent should not hang the node.
   */
  @Test
  public void testActiveDFOClose() throws InterruptedException {
    final FlumeMaster master = new FlumeMaster(cfg);
    MasterRPC rpc = new DirectMasterRPC(master);

    final FlumeNode node = new FlumeNode(rpc, false, false);
    // should have nothing.
    assertEquals(0, node.getLogicalNodeManager().getNodes().size());

    final CountDownLatch done = new CountDownLatch(1);
    new Thread("ActiveDFOClose") {
      public void run() {
        LivenessManager liveMan = node.getLivenessManager();
        try {
          // update config node to something that will be interrupted.
          LOG.info("setting to invalid dfo host");
          master.getSpecMan().setConfig("node1", "flow", "asciisynth(0)",
              "agentDFOSink(\"localhost\", 12345)");
          master.getSpecMan().addLogicalNode(NetUtils.localhost(), "node1");
          liveMan.heartbeatChecks();
          Thread.sleep(10000);

          // update config node to something that will be interrupted.
          LOG.info("!!! decommissioning node on master");
          master.getSpecMan().removeLogicalNode("node1");

          // as node do heartbeat and update due to decommission
          liveMan.heartbeatChecks();
          LOG.info("!!! node should be decommissioning on node");

        } catch (Exception e) {
          LOG.info("closed caused an error out: " + e.getMessage(), e);
          // Right now it takes about 10 seconds for the dfo deco to error out.
          done.countDown();
          return; // expected fail on purpose.
        }

        LOG.info("Clean close.");
        done.countDown();
      }
    }.start();

    // false means timeout, takes about 10 seconds to shutdown.
    assertTrue("close call hung the heartbeat", done
        .await(45, TimeUnit.SECONDS));
    assertEquals(1, node.getLogicalNodeManager().getNodes().size());

  }

  /**
   * This test starts a DFO agent that attempts to go to a port that shouldn't
   * be open. This triggers the error recovery mechanism. We then wait for 10s
   * and then simulate the node doing a heartbeat to a master which tells that
   * that node is decommissioned. As part of decommissioning, the act of closing
   * the dfo agent should not hang the node.
   *
   * This test differs from the previous by having an bad dns name/request that
   * will eventually fail (ubuntu/java1.6 takes about 10s)
   */
  @Test
  public void testActiveDFOCloseBadDNS() throws InterruptedException {
    final FlumeMaster master = new FlumeMaster(cfg);
    MasterRPC rpc = new DirectMasterRPC(master);

    final FlumeNode node = new FlumeNode(rpc, false, false);
    // should have nothing.
    assertEquals(0, node.getLogicalNodeManager().getNodes().size());

    final CountDownLatch done = new CountDownLatch(1);
    new Thread() {
      public void run() {
        LivenessManager liveMan = node.getLivenessManager();
        try {
          // update config node to something that will be interrupted.
          LOG.info("setting to invalid dfo host");
          master.getSpecMan().setConfig("node1", "flow", "asciisynth(0)",
              "agentDFOSink(\"invalid\", 12345)");
          master.getSpecMan().addLogicalNode(NetUtils.localhost(), "node1");
          liveMan.heartbeatChecks();
          Thread.sleep(20000); // Takes 10s for dns to fail

          // update config node to something that will be interrupted.
          LOG.info("!!! decommissioning node on master");
          master.getSpecMan().removeLogicalNode("node1");
          liveMan.heartbeatChecks();
          LOG.info("!!! logical node should be decommissioning on node");

        } catch (Exception e) {
          LOG.error("closed caused an error out: " + e.getMessage(), e);
          // Right now it takes about 10 seconds for the dfo deco to error out.
          done.countDown();
          return; // fail
        }

        LOG.info("Clean close.");
        done.countDown();

      }
    }.start();

    // false means timeout, takes about 10 seconds to shutdown.
    assertTrue("close call hung the heartbeat", done
        .await(45, TimeUnit.SECONDS));
    assertEquals(1, node.getLogicalNodeManager().getNodes().size());

  }

  /**
   * This test starts a E2E agent that attempts to go to a port that shouldn't
   * be open. This triggers the error recovery mechanism. We then wait for 10s
   * and then simulate the node doing a heartbeat to a master which tells that
   * that node is decommissioned. As part of decommissioning, the act of closing
   * the E2E agent should not hang the node.
   */
  @Test
  public void testActiveE2EClose() throws InterruptedException {
    final FlumeMaster master = new FlumeMaster(cfg);
    MasterRPC rpc = new DirectMasterRPC(master);

    final FlumeNode node = new FlumeNode(rpc, false, false);
    // should have nothing.
    assertEquals(0, node.getLogicalNodeManager().getNodes().size());

    final CountDownLatch done = new CountDownLatch(1);
    new Thread("TestDiskFailoverAgent") {
      public void run() {
        LivenessManager liveMan = node.getLivenessManager();
        try {
          // update config node to something that will be interrupted.
          LOG.info("setting to invalid dfo host");
          master.getSpecMan().setConfig("node1", "flow", "asciisynth(0)",
              "agentE2ESink(\"localhost\", 12345)");
          master.getSpecMan().addLogicalNode(NetUtils.localhost(), "node1");
          liveMan.heartbeatChecks();
          Thread.sleep(10000);

          // update config node to something that will be interrupted.
          LOG.info("!!! decommissioning node on master");
          master.getSpecMan().removeLogicalNode("node1");
          liveMan.heartbeatChecks();
          LOG.info("!!! node should be decommissioning on node");

        } catch (Exception e) {
          LOG.error("closed caused an error out: " + e.getMessage(), e);
          // Right now it takes about 10 seconds for the dfo deco to error out.
          done.countDown();
          return; // fail
        }

        LOG.info("Did not expect clean close!?");

      }
    }.start();

    // false means timeout, takes about 10 seconds to shutdown.
    assertTrue("close call hung the heartbeat", done
        .await(60, TimeUnit.SECONDS));

  }

  /**
   * This test starts a E2E agent that attempts to go to a port that shouldn't
   * be open. This triggers the error recovery mechanism. We then wait for 10s
   * and then simulate the node doing a heartbeat to a master which tells that
   * that node is decommissioned. As part of decommissioning, the act of closing
   * the E2E agent should not hang the node.
   *
   * This test differs from the previous by having an bad dns name/request that
   * will eventually fail (ubuntu/java1.6 takes about 10s)
   */
  @Test
  public void testActiveE2ECloseBadDNS() throws InterruptedException {
    final FlumeMaster master = new FlumeMaster(cfg);
    MasterRPC rpc = new DirectMasterRPC(master);

    final FlumeNode node = new FlumeNode(rpc, false, false);
    // should have nothing.
    assertEquals(0, node.getLogicalNodeManager().getNodes().size());

    final CountDownLatch done = new CountDownLatch(1);
    new Thread() {
      public void run() {
        LivenessManager liveMan = node.getLivenessManager();
        try {
          // update config node to something that will be interrupted.
          LOG.info("setting to invalid dfo host");
          master.getSpecMan().setConfig("node1", "flow", "asciisynth(0)",
              "agentE2ESink(\"localhost\", 12345)");
          master.getSpecMan().addLogicalNode(NetUtils.localhost(), "node1");
          liveMan.heartbeatChecks();
          Thread.sleep(15000); // Takes 10s for dns to fail

          // update config node to something that will be interrupted.
          LOG.info("!!! decommissioning node on master");
          master.getSpecMan().removeLogicalNode("node1");
          liveMan.heartbeatChecks();
          LOG.info("!!! node should be decommissioning on node");

        } catch (Exception e) {
          LOG.error("closed caused an error out: " + e.getMessage(), e);
          // Right now it takes about 10 seconds for the dfo deco to error out.
          done.countDown();
          return; // fail
        }

        LOG.info("Did not expect clean close? ");

      }
    }.start();

    // false means timeout, takes about 10 seconds to shutdown.
    assertTrue("close call hung the heartbeat", done.await(120,
        TimeUnit.SECONDS));

  }

}
TOP

Related Classes of com.cloudera.flume.agent.diskfailover.TestDiskFailoverAgent

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.