/**
* Licensed to Cloudera, Inc. under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Cloudera, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.flume.agent;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
import java.io.File;
import java.io.IOException;
import java.util.Set;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import com.cloudera.flume.agent.durability.NaiveFileWALDeco;
import com.cloudera.flume.conf.Context;
import com.cloudera.flume.conf.FlumeBuilder;
import com.cloudera.flume.conf.FlumeConfiguration;
import com.cloudera.flume.conf.FlumeSpecException;
import com.cloudera.flume.conf.ReportTestingContext;
import com.cloudera.flume.core.Event;
import com.cloudera.flume.core.EventImpl;
import com.cloudera.flume.core.EventSink;
import com.cloudera.flume.handlers.rolling.ProcessTagger;
import com.cloudera.flume.handlers.rolling.TimeTrigger;
import com.cloudera.flume.master.FlumeMaster;
import com.cloudera.flume.reporter.ReportManager;
import com.cloudera.flume.reporter.aggregator.CounterSink;
import com.cloudera.util.Clock;
import com.cloudera.util.FileUtil;
/**
* This tests end to end transmission and acks and recovery when there are
* problems with the transmission.
*
* Generally these tests are setup to trigger manually after 5 messages. Acking
* prefixes and post fixes each batch with a ack begin and ack end message that
* has info required for the checksum to add up. Thus 7 messages are sent per
* chunk.
*
* In the cases where there are retries, these are done using forceRetry
* (instead of the time-dependent checkRetry)
*/
public class TestEndToEndAckFailureRecovery {
public static final Logger LOG = Logger
.getLogger(TestEndToEndAckFailureRecovery.class);
DirectMasterRPC mock;
FlumeNode node;
FlumeMaster master;
/**
* Setup mock clock, mock rpc master and a node that writes to a temp
* directory.
*/
@Before
public void setUp() {
System.out.println("=====");
// Logger log = Logger.getLogger(NaiveFileWALManager.class.getName());
Logger log = Logger.getRootLogger();
log.setLevel(Level.DEBUG);
File tmpdir = null;
try {
tmpdir = FileUtil.mktempdir();
} catch (IOException e) {
fail("unable to create temp dir");
}
FlumeConfiguration.get().set(FlumeConfiguration.AGENT_LOG_DIR_NEW,
tmpdir.getAbsolutePath());
System.out.println("Writing out tempdir: " + tmpdir.getAbsolutePath());
// This will register the FlumeNode with a MockMasterRPC so it doesn't go
// across the network
master = new FlumeMaster(FlumeConfiguration.get());
mock = new DirectMasterRPC(master);
node = new FlumeNode(mock, false /* starthttp */, false /* onshot */);
}
@After
public void tearDown() {
try {
FileUtil.rmr(new File(FlumeConfiguration.get().get(
FlumeConfiguration.AGENT_LOG_DIR_NEW)));
} catch (IOException e) {
LOG.error("Not supposed to happen: " + e.getMessage());
}
}
/**
* This case does the end to end ack successfully
*/
@Test
public void testAckSuccess() throws FlumeSpecException, IOException,
InterruptedException {
int count = 20;
String rpt = "foo";
String snk = "{ ackChecker => [ console, counter(\"" + rpt + "\") ] }";
Context ctx = new ReportTestingContext();
EventSink es = FlumeBuilder.buildSink(ctx, snk);
// Excessively large timeout because I am manually triggering it.
NaiveFileWALDeco<EventSink> wal = new NaiveFileWALDeco<EventSink>(ctx, es,
node.getWalManager(), new TimeTrigger(new ProcessTagger(), 100000),
node.getAckChecker().getAgentAckQueuer(), 1000);
// open and send data.
wal.open();
for (int i = 0; i < count; i++) {
Event e = new EventImpl(("test message " + i).getBytes());
wal.append(e);
if (i % 5 == 4) {
wal.rotate();
}
}
CounterSink ctr = (CounterSink) ReportManager.get().getReportable(rpt);
LOG.info("Last batch should be missing. " + count + " > " + ctr.getCount()
+ " sent");
wal.close();
LOG.info("All should make it through here. " + count + " =="
+ ctr.getCount() + " sent");
assertEquals(20, ctr.getCount());
// check master state -- there should be 5 pending acks.
assertEquals(5, master.getAckMan().getPending().size());
// after checking, there shuld be 0 acks left.
node.getAckChecker().checkAcks();
master.getAckMan().dumpLog();
Set<String> pending2 = master.getAckMan().getPending();
assertEquals(0, pending2.size());
}
/**
* This case drops an ack section initializer
*/
@Test
public void testAckBeginFail() throws FlumeSpecException, IOException,
InterruptedException {
int count = 20;
String rpt = "foo";
// Phase I (20 events arrive, 3 groups make it)
// Beg .. .. .. .. ..End
// 00 01 02 03 04 05 06
// 07 08 09 10 11 12 13
// -- 15 16 17 18 19 20 (begin is missing)
// 21 22 23 24 25 26 27
// Phase II - (after a forced retry, 4 more sent for 24 total, but still no
// good)
// 14 XX 16 17 18 19 20 (retry fails, because of missing message)
// Phase III - (another forced retry, 5 make it for 29 total and it is good
// this time)
// 14 15 16 17 18 19 20 (2nd retry attempt succeeds)
// close
String snk = " { intervalDroppyAppend(15) => { ackChecker => [ console, counter(\""
+ rpt + "\") ] } } ";
Context ctx = new ReportTestingContext();
EventSink es = FlumeBuilder.buildSink(ctx, snk);
NaiveFileWALDeco<EventSink> wal = new NaiveFileWALDeco<EventSink>(ctx, es,
node.getWalManager(), new TimeTrigger(new ProcessTagger(), 100000),
node.getAckChecker().getAgentAckQueuer(), 1000);
wal.open();
// // Phase I - (20 events arrive, 3 groups make it)
for (int i = 0; i < count; i++) {
Event e = new EventImpl(("test message " + i).getBytes());
wal.append(e);
if (i % 5 == 4) {
wal.rotate();
}
}
// arbitrary sleep for now.
Clock.sleep(1000);
// three blocks should have successfully made it.
assertEquals(3, master.getAckMan().getPending().size()); // (g0, g1, g3)
CounterSink ctr = (CounterSink) ReportManager.get().getReportable(rpt);
LOG.info(" Ack begin dropped, but all true messages went through. 20 == "
+ ctr.getCount() + " sent");
// We dropped an ack begin message. Messages still make it through and there
// should be some sort of ack warning on the master.
assertEquals(20, ctr.getCount());
// check master state
master.getAckMan().dumpLog();
Set<String> pending = master.getAckMan().getPending();
assertEquals(3, pending.size());
// check to make sure wal files are gone.
node.getAckChecker().checkAcks();
node.getAckChecker().checkRetry();
master.getAckMan().dumpLog();
pending = master.getAckMan().getPending();
LOG.info("Number of pending acks (retry had failure)" + pending.size());
assertEquals(0, pending.size());
LOG.info("Event count 20 (one message missing/lost) ==" + ctr.getCount());
assertEquals(20, ctr.getCount());
Clock.sleep(1000); // somehow force other thread to go.
// // Phase II - force message resent retry but one is dropped. (only 4 make
// it)
node.getAckChecker().checkAcks();
node.getAckChecker().forceRetry();
Clock.sleep(500); // somehow force other thread to go.
LOG.info("Was stuff resent? " + " 24 == " + ctr.getCount() + " sent");
// This retry succeeds sending 4 more messages
assertEquals(24, ctr.getCount());
assertEquals(0, master.getAckMan().getPending().size());
// // Phase III - force another retry
node.getAckChecker().checkAcks();
node.getAckChecker().forceRetry();
LOG
.info("After another check. " + count + " < " + ctr.getCount()
+ " sent");
Clock.sleep(500);
// suceeded sending 5 more messages.
assertEquals(29, ctr.getCount());
assertEquals(1, master.getAckMan().getPending().size());
// check the ack.
node.getAckChecker().checkAcks();
// we are clean.
assertEquals(29, ctr.getCount());
assertEquals(0, master.getAckMan().getPending().size());
wal.close();
}
/**
* This adds an interval Droppy between the ack writer and the ack checker.
* This means a batch should fail and be retried.
*
* All except for the internal wal writer to wal reader is serilaized. We
* trigger the reader to read written files manaully (
*/
@Test
public void testMsgDropBehavior() throws FlumeSpecException,
InterruptedException, IOException {
int count = 20;
String rpt = "foo";
// // Phase I (19 message arrive, 3 groups ok).
// Beg .. .. .. .. ..End
// 00 01 02 03 04 05 06
// 07 08 09 10 11 12 13
// 14 -- 16 17 18 19 20 (drop a regular event message)
// 21 22 23 24 25 26 27
// // Phase II (4 more messages, 23 total, still only 3 groups)
// 14 15 16 -- 18 19 20 (retry fails, because of missing message)
// // Phase III (5 more message, 28 total, finally all 4 groups ok)
// 14 15 16 17 18 19 20 (2nd retry attempt succeeds)
// close
String snk = " { intervalDroppyAppend(16) => { ackChecker => [ console, counter(\""
+ rpt + "\") ] } } ";
Context ctx = new ReportTestingContext();
EventSink es = FlumeBuilder.buildSink(ctx, snk);
NaiveFileWALDeco<EventSink> wal = new NaiveFileWALDeco<EventSink>(ctx, es,
node.getWalManager(), new TimeTrigger(new ProcessTagger(), 100000),
node.getAckChecker().getAgentAckQueuer(), 1000);
wal.open();
// // Phase I - (19 events arrive, 3 groups make it)
for (int i = 0; i < count; i++) {
Event e = new EventImpl(("test message " + i).getBytes());
wal.append(e);
if (i % 5 == 4) {
wal.rotate();
}
}
Clock.sleep(1000); // need other thread to make progress
// 3 blocks should have successfully made it.
assertEquals(3, master.getAckMan().getPending().size()); // (g0, g1, g3)
CounterSink ctr = (CounterSink) ReportManager.get().getReportable(rpt);
LOG.info(" Ack begin dropped, but all true message went through. 20 == "
+ ctr.getCount() + " sent");
// We dropped a event message, so we should be down one, 19 of 20 events.
assertEquals(19, ctr.getCount());
// check master state
master.getAckMan().dumpLog();
Set<String> pending = master.getAckMan().getPending();
assertEquals(3, pending.size());
// // Phase II (4 more messages, 23 total, still only 3 groups)
// check to make sure wal files are gone.
node.getAckChecker().checkAcks();
node.getAckChecker().forceRetry();
Clock.sleep(1000);
master.getAckMan().dumpLog();
pending = master.getAckMan().getPending();
LOG.info("Number of pending acks (retry had failure)" + pending.size());
assertEquals(0, pending.size());
LOG.info("Event count 23 (one message missing/lost) ==" + ctr.getCount());
assertEquals(23, ctr.getCount());
// //Phase III (5 more messages, 28 total, 4 groups good)
node.getAckChecker().checkAcks();
node.getAckChecker().forceRetry();
Clock.sleep(1000); // somehow force other thread to go.
LOG.info("Was stuff resent? " + " 28 == " + ctr.getCount() + " sent");
// This retry succeeds sending 5 more messages
assertEquals(28, ctr.getCount());
Clock.sleep(1000); // somehow force other thread to go.
node.getAckChecker().checkAcks();
node.getAckChecker().forceRetry();
LOG.info("After another check. " + " 28 == " + ctr.getCount() + " sent");
// Nothing happened
assertEquals(28, ctr.getCount());
assertEquals(0, master.getAckMan().getPending().size());
wal.close();
}
@Test
public void testAckEndFail() throws FlumeSpecException, IOException,
InterruptedException {
int count = 20;
String rpt = "foo";
// Phase I (20 message sent, 3 groups good, 1 group bad)
// Beg .. .. .. .. ..End
// 00 01 02 03 04 05 06
// 07 08 09 10 11 12 13
// 14 15 16 17 18 19 -- (drop an end ack group event message)
// 21 22 23 24 25 26 27
// Phase II (5 more messeage, retry succeeds).
// 14 15 16 17 18 19 20 (retry succeeeds)
// close
String snk = " { intervalDroppyAppend(21) => { ackChecker => [ console, counter(\""
+ rpt + "\") ] } } ";
Context ctx = new ReportTestingContext();
EventSink es = FlumeBuilder.buildSink(ctx, snk);
// Big delay values so that test has to force different actions.
NaiveFileWALDeco<EventSink> wal = new NaiveFileWALDeco<EventSink>(ctx, es,
node.getWalManager(), new TimeTrigger(new ProcessTagger(), 100000),
node.getAckChecker().getAgentAckQueuer(), 1000);
wal.open();
// Phase I (20 message sent, 3 groups good, 1 group bad)
for (int i = 0; i < count; i++) {
Event e = new EventImpl(("test message " + i).getBytes());
wal.append(e);
if (i % 5 == 4) {
wal.rotate();
}
}
Clock.sleep(1000); // need other thread to make progress
// three blocks should have successfully made it.
assertEquals(3, master.getAckMan().getPending().size()); // (g0, g1, g3)
CounterSink ctr = (CounterSink) ReportManager.get().getReportable(rpt);
LOG.info(" Ack begin dropped, but all true message went through. 20 == "
+ ctr.getCount() + " sent");
// We dropped a event message, so we should be down one.
assertEquals(20, ctr.getCount());
// check master state
Clock.sleep(1000); // somehow force other thread to go.
master.getAckMan().dumpLog();
Set<String> pending = master.getAckMan().getPending();
assertEquals(3, pending.size());
// Phase II (5 more messeage, retry succeeds).
node.getAckChecker().checkAcks();
node.getAckChecker().forceRetry();
Clock.sleep(1000);
master.getAckMan().dumpLog();
pending = master.getAckMan().getPending();
LOG.info("Number of pending acks (retry is success)" + pending.size());
assertEquals(1, pending.size());
LOG.info("Event count 25 (one message missing/lost) ==" + ctr.getCount());
assertEquals(25, ctr.getCount());
node.getAckChecker().checkAcks();
Clock.sleep(1000); // somehow force other thread to go.
LOG.info("Nothing new happened " + " 25 == " + ctr.getCount() + " sent");
// This retry succeeds sending 5 more messages
assertEquals(25, ctr.getCount());
pending = master.getAckMan().getPending();
assertEquals(0, pending.size());
wal.close();
}
/**
* This simulates if the collector fails to send an ack to the master, or if
* the agent somehow fails to receive an ack sent to the master. The remedy
* for both situations is the same -- resend the ack set that failed
*/
@Test
public void testMasterAckMissing() throws FlumeSpecException, IOException,
InterruptedException {
int count = 20;
String rpt = "foo";
String snk = "{ ackChecker => [ console, counter(\"" + rpt + "\") ] }";
Context ctx = new ReportTestingContext();
EventSink es = FlumeBuilder.buildSink(ctx, snk);
NaiveFileWALDeco<EventSink> wal = new NaiveFileWALDeco<EventSink>(ctx, es,
node.getWalManager(), new TimeTrigger(new ProcessTagger(), 100), node
.getAckChecker().getAgentAckQueuer(), 10000);
// open and send data.
wal.open();
for (int i = 0; i < count; i++) {
Event e = new EventImpl(("test message " + i).getBytes());
wal.append(e);
if (i % 5 == 4) {
wal.rotate();
}
}
Clock.sleep(1000); // somehow force other thread to go.
CounterSink ctr = (CounterSink) ReportManager.get().getReportable(rpt);
LOG.info(" All should make it through here. " + count + " =="
+ ctr.getCount() + " sent");
assertEquals(20, ctr.getCount());
// check master state
Clock.sleep(1000);
master.getAckMan().dumpLog();
Set<String> pending = master.getAckMan().getPending();
assertEquals(4, pending.size());
// acks should be at the master, but not check by the agent yet, thus 4
// acks remaining.
// To simulate the master not getting an ack message, we clear the tag on
// the master. This will eventually force the child to retry
// Artificially ack one so the agent doesn't get the ack.
master.getAckMan().check(pending.iterator().next());
node.getAckChecker().checkAcks();
master.getAckMan().dumpLog();
Set<String> pending2 = master.getAckMan().getPending();
Clock.sleep(1000);
// acks are checked so masters can drop state information
assertEquals(0, pending2.size());
assertEquals(1, node.getAckChecker().pending.size());
// Do retry stuff.
node.getAckChecker().forceRetry();
Clock.sleep(1000);
LOG.info("Expected 1 ack pending on master but instead had "
+ master.getAckMan().getPending().size());
assertEquals(1, master.getAckMan().getPending().size());
assertEquals(1, node.getAckChecker().pending.size());
node.getAckChecker().checkAcks();
assertEquals(0, master.getAckMan().getPending().size());
assertEquals(0, node.getAckChecker().pending.size());
wal.close();
}
}