Package com.cloudera.flume.agent

Source Code of com.cloudera.flume.agent.TestEndToEndAckFailureRecovery

/**
* Licensed to Cloudera, Inc. under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  Cloudera, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.flume.agent;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;

import java.io.File;
import java.io.IOException;
import java.util.Set;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import com.cloudera.flume.agent.durability.NaiveFileWALDeco;
import com.cloudera.flume.conf.Context;
import com.cloudera.flume.conf.FlumeBuilder;
import com.cloudera.flume.conf.FlumeConfiguration;
import com.cloudera.flume.conf.FlumeSpecException;
import com.cloudera.flume.conf.ReportTestingContext;
import com.cloudera.flume.core.Event;
import com.cloudera.flume.core.EventImpl;
import com.cloudera.flume.core.EventSink;
import com.cloudera.flume.handlers.rolling.ProcessTagger;
import com.cloudera.flume.handlers.rolling.TimeTrigger;
import com.cloudera.flume.master.FlumeMaster;
import com.cloudera.flume.reporter.ReportManager;
import com.cloudera.flume.reporter.aggregator.CounterSink;
import com.cloudera.util.Clock;
import com.cloudera.util.FileUtil;

/**
* This tests end to end transmission and acks and recovery when there are
* problems with the transmission.
*
* Generally these tests are setup to trigger manually after 5 messages. Acking
* prefixes and post fixes each batch with a ack begin and ack end message that
* has info required for the checksum to add up. Thus 7 messages are sent per
* chunk.
*
* In the cases where there are retries, these are done using forceRetry
* (instead of the time-dependent checkRetry)
*/
public class TestEndToEndAckFailureRecovery {

  public static final Logger LOG = Logger
      .getLogger(TestEndToEndAckFailureRecovery.class);

  DirectMasterRPC mock;
  FlumeNode node;
  FlumeMaster master;

  /**
   * Setup mock clock, mock rpc master and a node that writes to a temp
   * directory.
   */
  @Before
  public void setUp() {
    System.out.println("=====");
    // Logger log = Logger.getLogger(NaiveFileWALManager.class.getName());
    Logger log = Logger.getRootLogger();
    log.setLevel(Level.DEBUG);

    File tmpdir = null;
    try {
      tmpdir = FileUtil.mktempdir();
    } catch (IOException e) {
      fail("unable to create temp dir");
    }
    FlumeConfiguration.get().set(FlumeConfiguration.AGENT_LOG_DIR_NEW,
        tmpdir.getAbsolutePath());
    System.out.println("Writing out tempdir: " + tmpdir.getAbsolutePath());

    // This will register the FlumeNode with a MockMasterRPC so it doesn't go
    // across the network
    master = new FlumeMaster(FlumeConfiguration.get());
    mock = new DirectMasterRPC(master);
    node = new FlumeNode(mock, false /* starthttp */, false /* onshot */);

  }

  @After
  public void tearDown() {
    try {
      FileUtil.rmr(new File(FlumeConfiguration.get().get(
          FlumeConfiguration.AGENT_LOG_DIR_NEW)));
    } catch (IOException e) {
      LOG.error("Not supposed to happen: " + e.getMessage());
    }
  }

  /**
   * This case does the end to end ack successfully
   */
  @Test
  public void testAckSuccess() throws FlumeSpecException, IOException,
      InterruptedException {

    int count = 20;
    String rpt = "foo";
    String snk = "{ ackChecker => [ console, counter(\"" + rpt + "\") ] }";
    Context ctx = new ReportTestingContext();
    EventSink es = FlumeBuilder.buildSink(ctx, snk);
    // Excessively large timeout because I am manually triggering it.
    NaiveFileWALDeco wal = new NaiveFileWALDeco(ctx, es, node.getWalManager(),
        new TimeTrigger(new ProcessTagger(), 100000), node.getAckChecker()
            .getAgentAckQueuer(), 1000);

    // open and send data.
    wal.open();
    for (int i = 0; i < count; i++) {
      Event e = new EventImpl(("test message " + i).getBytes());
      wal.append(e);
      if (i % 5 == 4) {
        wal.rotate();
      }
    }

    CounterSink ctr = (CounterSink) ReportManager.get().getReportable(rpt);
    LOG.info("Last batch should be missing. " + count + " > " + ctr.getCount()
        + " sent");

    wal.close();
    LOG.info("All should make it through here. " + count + " =="
        + ctr.getCount() + " sent");
    assertEquals(20, ctr.getCount());

    // check master state -- there should be 5 pending acks.
    assertEquals(5, master.getAckMan().getPending().size());

    // after checking, there shuld be 0 acks left.
    node.getAckChecker().checkAcks();
    master.getAckMan().dumpLog();
    Set<String> pending2 = master.getAckMan().getPending();
    assertEquals(0, pending2.size());
  }

  /**
   * This case drops an ack section initializer
   */
  @Test
  public void testAckBeginFail() throws FlumeSpecException, IOException,
      InterruptedException {

    int count = 20;
    String rpt = "foo";

    // Phase I (20 events arrive, 3 groups make it)
    // Beg .. .. .. .. ..End
    // 00 01 02 03 04 05 06
    // 07 08 09 10 11 12 13
    // -- 15 16 17 18 19 20 (begin is missing)
    // 21 22 23 24 25 26 27

    // Phase II - (after a forced retry, 4 more sent for 24 total, but still no
    // good)
    // 14 XX 16 17 18 19 20 (retry fails, because of missing message)

    // Phase III - (another forced retry, 5 make it for 29 total and it is good
    // this time)
    // 14 15 16 17 18 19 20 (2nd retry attempt succeeds)

    // close

    String snk = "  { intervalDroppyAppend(15)  => { ackChecker => [ console, counter(\""
        + rpt + "\") ] } }  ";
    Context ctx = new ReportTestingContext();
    EventSink es = FlumeBuilder.buildSink(ctx, snk);

    NaiveFileWALDeco wal = new NaiveFileWALDeco(ctx, es, node.getWalManager(),
        new TimeTrigger(new ProcessTagger(), 100000), node.getAckChecker()
            .getAgentAckQueuer(), 1000);
    wal.open();

    // // Phase I - (20 events arrive, 3 groups make it)
    for (int i = 0; i < count; i++) {
      Event e = new EventImpl(("test message " + i).getBytes());
      wal.append(e);
      if (i % 5 == 4) {
        wal.rotate();
      }
    }

    // arbitrary sleep for now.
    Clock.sleep(1000);

    // three blocks should have successfully made it.
    assertEquals(3, master.getAckMan().getPending().size()); // (g0, g1, g3)
    CounterSink ctr = (CounterSink) ReportManager.get().getReportable(rpt);
    LOG.info(" Ack begin dropped, but all true messages went through. 20 == "
        + ctr.getCount() + " sent");
    // We dropped an ack begin message. Messages still make it through and there
    // should be some sort of ack warning on the master.
    assertEquals(20, ctr.getCount());

    // check master state
    master.getAckMan().dumpLog();
    Set<String> pending = master.getAckMan().getPending();
    assertEquals(3, pending.size());

    // check to make sure wal files are gone.
    node.getAckChecker().checkAcks();
    node.getAckChecker().checkRetry();
    master.getAckMan().dumpLog();
    pending = master.getAckMan().getPending();
    LOG.info("Number of pending acks (retry had failure)" + pending.size());
    assertEquals(0, pending.size());
    LOG.info("Event count 20  (one message missing/lost) ==" + ctr.getCount());
    assertEquals(20, ctr.getCount());

    Clock.sleep(1000); // somehow force other thread to go.

    // // Phase II - force message resent retry but one is dropped. (only 4 make
    // it)
    node.getAckChecker().checkAcks();
    node.getAckChecker().forceRetry();

    Clock.sleep(500); // somehow force other thread to go.

    LOG.info("Was stuff resent? " + " 24 == " + ctr.getCount() + " sent");
    // This retry succeeds sending 4 more messages
    assertEquals(24, ctr.getCount());
    assertEquals(0, master.getAckMan().getPending().size());

    // // Phase III - force another retry
    node.getAckChecker().checkAcks();
    node.getAckChecker().forceRetry();
    LOG
        .info("After another check. " + count + " < " + ctr.getCount()
            + " sent");
    Clock.sleep(500);

    // suceeded sending 5 more messages.
    assertEquals(29, ctr.getCount());
    assertEquals(1, master.getAckMan().getPending().size());

    // check the ack.
    node.getAckChecker().checkAcks();

    // we are clean.
    assertEquals(29, ctr.getCount());
    assertEquals(0, master.getAckMan().getPending().size());

    wal.close();
  }

  /**
   * This adds an interval Droppy between the ack writer and the ack checker.
   * This means a batch should fail and be retried.
   *
   * All except for the internal wal writer to wal reader is serilaized. We
   * trigger the reader to read written files manaully (
   */
  @Test
  public void testMsgDropBehavior() throws FlumeSpecException,
      InterruptedException, IOException {

    int count = 20;
    String rpt = "foo";

    // // Phase I (19 message arrive, 3 groups ok).
    // Beg .. .. .. .. ..End
    // 00 01 02 03 04 05 06
    // 07 08 09 10 11 12 13
    // 14 -- 16 17 18 19 20 (drop a regular event message)
    // 21 22 23 24 25 26 27

    // // Phase II (4 more messages, 23 total, still only 3 groups)
    // 14 15 16 -- 18 19 20 (retry fails, because of missing message)

    // // Phase III (5 more message, 28 total, finally all 4 groups ok)
    // 14 15 16 17 18 19 20 (2nd retry attempt succeeds)

    // close

    String snk = "  { intervalDroppyAppend(16)  => { ackChecker => [ console, counter(\""
        + rpt + "\") ] } }  ";
    Context ctx = new ReportTestingContext();
    EventSink es = FlumeBuilder.buildSink(ctx, snk);

    NaiveFileWALDeco wal = new NaiveFileWALDeco(ctx, es, node.getWalManager(),
        new TimeTrigger(new ProcessTagger(), 100000), node.getAckChecker()
            .getAgentAckQueuer(), 1000);
    wal.open();

    // // Phase I - (19 events arrive, 3 groups make it)
    for (int i = 0; i < count; i++) {
      Event e = new EventImpl(("test message " + i).getBytes());
      wal.append(e);
      if (i % 5 == 4) {
        wal.rotate();
      }
    }

    Clock.sleep(1000); // need other thread to make progress
    // 3 blocks should have successfully made it.
    assertEquals(3, master.getAckMan().getPending().size()); // (g0, g1, g3)

    CounterSink ctr = (CounterSink) ReportManager.get().getReportable(rpt);
    LOG.info(" Ack begin dropped, but all true message went through. 20 == "
        + ctr.getCount() + " sent");
    // We dropped a event message, so we should be down one, 19 of 20 events.
    assertEquals(19, ctr.getCount());

    // check master state
    master.getAckMan().dumpLog();
    Set<String> pending = master.getAckMan().getPending();
    assertEquals(3, pending.size());

    // // Phase II (4 more messages, 23 total, still only 3 groups)
    // check to make sure wal files are gone.
    node.getAckChecker().checkAcks();
    node.getAckChecker().forceRetry();
    Clock.sleep(1000);
    master.getAckMan().dumpLog();
    pending = master.getAckMan().getPending();
    LOG.info("Number of pending acks (retry had failure)" + pending.size());
    assertEquals(0, pending.size());
    LOG.info("Event count 23  (one message missing/lost) ==" + ctr.getCount());
    assertEquals(23, ctr.getCount());

    // //Phase III (5 more messages, 28 total, 4 groups good)
    node.getAckChecker().checkAcks();
    node.getAckChecker().forceRetry();
    Clock.sleep(1000); // somehow force other thread to go.

    LOG.info("Was stuff resent? " + " 28 == " + ctr.getCount() + " sent");
    // This retry succeeds sending 5 more messages
    assertEquals(28, ctr.getCount());
    Clock.sleep(1000); // somehow force other thread to go.
    node.getAckChecker().checkAcks();
    node.getAckChecker().forceRetry();
    LOG.info("After another check. " + " 28 == " + ctr.getCount() + " sent");

    // Nothing happened
    assertEquals(28, ctr.getCount());
    assertEquals(0, master.getAckMan().getPending().size());
    wal.close();

  }

  @Test
  public void testAckEndFail() throws FlumeSpecException, IOException,
      InterruptedException {

    int count = 20;
    String rpt = "foo";

    // Phase I (20 message sent, 3 groups good, 1 group bad)
    // Beg .. .. .. .. ..End
    // 00 01 02 03 04 05 06
    // 07 08 09 10 11 12 13
    // 14 15 16 17 18 19 -- (drop an end ack group event message)
    // 21 22 23 24 25 26 27

    // Phase II (5 more messeage, retry succeeds).
    // 14 15 16 17 18 19 20 (retry succeeeds)

    // close

    String snk = "  { intervalDroppyAppend(21)  => { ackChecker => [ console, counter(\""
        + rpt + "\") ] } }  ";
    Context ctx = new ReportTestingContext();
    EventSink es = FlumeBuilder.buildSink(ctx, snk);

    // Big delay values so that test has to force different actions.
    NaiveFileWALDeco wal = new NaiveFileWALDeco(ctx, es, node.getWalManager(),
        new TimeTrigger(new ProcessTagger(), 100000), node.getAckChecker()
            .getAgentAckQueuer(), 1000);
    wal.open();

    // Phase I (20 message sent, 3 groups good, 1 group bad)
    for (int i = 0; i < count; i++) {
      Event e = new EventImpl(("test message " + i).getBytes());
      wal.append(e);
      if (i % 5 == 4) {
        wal.rotate();
      }
    }

    Clock.sleep(1000); // need other thread to make progress
    // three blocks should have successfully made it.
    assertEquals(3, master.getAckMan().getPending().size()); // (g0, g1, g3)

    CounterSink ctr = (CounterSink) ReportManager.get().getReportable(rpt);
    LOG.info(" Ack begin dropped, but all true message went through. 20 == "
        + ctr.getCount() + " sent");
    // We dropped a event message, so we should be down one.
    assertEquals(20, ctr.getCount());

    // check master state
    Clock.sleep(1000); // somehow force other thread to go.
    master.getAckMan().dumpLog();
    Set<String> pending = master.getAckMan().getPending();
    assertEquals(3, pending.size());

    // Phase II (5 more messeage, retry succeeds).
    node.getAckChecker().checkAcks();
    node.getAckChecker().forceRetry();
    Clock.sleep(1000);
    master.getAckMan().dumpLog();
    pending = master.getAckMan().getPending();
    LOG.info("Number of pending acks (retry is success)" + pending.size());
    assertEquals(1, pending.size());
    LOG.info("Event count 25 (one message missing/lost) ==" + ctr.getCount());
    assertEquals(25, ctr.getCount());

    node.getAckChecker().checkAcks();
    Clock.sleep(1000); // somehow force other thread to go.

    LOG.info("Nothing new happened " + " 25 == " + ctr.getCount() + " sent");
    // This retry succeeds sending 5 more messages
    assertEquals(25, ctr.getCount());
    pending = master.getAckMan().getPending();
    assertEquals(0, pending.size());

    wal.close();
  }

  /**
   * This simulates if the collector fails to send an ack to the master, or if
   * the agent somehow fails to receive an ack sent to the master. The remedy
   * for both situations is the same -- resend the ack set that failed
   */
  @Test
  public void testMasterAckMissing() throws FlumeSpecException, IOException,
      InterruptedException {

    int count = 20;
    String rpt = "foo";
    String snk = "{ ackChecker => [ console, counter(\"" + rpt + "\") ] }";
    Context ctx = new ReportTestingContext();
    EventSink es = FlumeBuilder.buildSink(ctx, snk);
    NaiveFileWALDeco wal = new NaiveFileWALDeco(ctx, es, node.getWalManager(),
        new TimeTrigger(new ProcessTagger(), 100), node.getAckChecker()
            .getAgentAckQueuer(), 10000);

    // open and send data.
    wal.open();
    for (int i = 0; i < count; i++) {
      Event e = new EventImpl(("test message " + i).getBytes());
      wal.append(e);
      if (i % 5 == 4) {
        wal.rotate();
      }
    }

    Clock.sleep(1000); // somehow force other thread to go.
    CounterSink ctr = (CounterSink) ReportManager.get().getReportable(rpt);
    LOG.info(" All should make it through here. " + count + " =="
        + ctr.getCount() + " sent");
    assertEquals(20, ctr.getCount());

    // check master state
    Clock.sleep(1000);
    master.getAckMan().dumpLog();

    Set<String> pending = master.getAckMan().getPending();
    assertEquals(4, pending.size());
    // acks should be at the master, but not check by the agent yet, thus 4
    // acks remaining.

    // To simulate the master not getting an ack message, we clear the tag on
    // the master. This will eventually force the child to retry
    // Artificially ack one so the agent doesn't get the ack.
    master.getAckMan().check(pending.iterator().next());

    node.getAckChecker().checkAcks();
    master.getAckMan().dumpLog();
    Set<String> pending2 = master.getAckMan().getPending();
    Clock.sleep(1000);
    // acks are checked so masters can drop state information
    assertEquals(0, pending2.size());
    assertEquals(1, node.getAckChecker().pending.size());

    // Do retry stuff.
    node.getAckChecker().forceRetry();
    Clock.sleep(1000);
    LOG.info("Expected 1 ack pending on master but instead had "
        + master.getAckMan().getPending().size());
    assertEquals(1, master.getAckMan().getPending().size());
    assertEquals(1, node.getAckChecker().pending.size());
    node.getAckChecker().checkAcks();
    assertEquals(0, master.getAckMan().getPending().size());
    assertEquals(0, node.getAckChecker().pending.size());
    wal.close();
  }

}
TOP

Related Classes of com.cloudera.flume.agent.TestEndToEndAckFailureRecovery

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.