summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRainer Gerhards <rgerhards@adiscon.com>2010-03-17 18:11:03 +0100
committerRainer Gerhards <rgerhards@adiscon.com>2010-03-17 18:11:03 +0100
commit5c7225b77636b5f8231ccb0edc61fb7d68eb752b (patch)
treef2f964db77dbb16d219135e38acf736301780f69
parent679434312c20d9651faff26b8b3a5ab203be7ff3 (diff)
downloadrsyslog-5c7225b77636b5f8231ccb0edc61fb7d68eb752b.tar.gz
rsyslog-5c7225b77636b5f8231ccb0edc61fb7d68eb752b.tar.xz
rsyslog-5c7225b77636b5f8231ccb0edc61fb7d68eb752b.zip
added test for deadlock with dynafile writing and cache exhaustion
Note that this introduces a test case that breaks the current code. This commit does NOT yet provide the necessary code patch. Thus "make check" does currently hang.
-rw-r--r--tests/Makefile.am3
-rwxr-xr-xtests/asynwr_deadlock2.sh64
-rw-r--r--tests/testsuites/asynwr_deadlock2.conf15
3 files changed, 82 insertions, 0 deletions
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 3c2223d9..6f345b51 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -15,6 +15,7 @@ TESTS = $(TESTRUNS) cfg.sh \
asynwr_tinybuf.sh \
asynwr_large.sh \
asynwr_deadlock.sh \
+ asynwr_deadlock2.sh \
gzipwr_large.sh \
gzipwr_large_dynfile.sh \
complex1.sh \
@@ -157,6 +158,8 @@ EXTRA_DIST= 1.rstest 2.rstest 3.rstest err1.rstest \
testsuites/asynwr_large.conf \
asynwr_deadlock.sh \
testsuites/asynwr_deadlock.conf \
+ asynwr_deadlock2.sh \
+ testsuites/asynwr_deadlock2.conf \
gzipwr_large.sh \
testsuites/gzipwr_large.conf \
gzipwr_large_dynfile.sh \
diff --git a/tests/asynwr_deadlock2.sh b/tests/asynwr_deadlock2.sh
new file mode 100755
index 00000000..3b429b9a
--- /dev/null
+++ b/tests/asynwr_deadlock2.sh
@@ -0,0 +1,64 @@
+# This is test case from practice, with the version we introduced it, it
+# caused a deadlock during processing (when the a stream was purged from the
+# dynafile cache).
+# We added this as a standard test in the hopes that iw will help
+# detect such things in the future.
+#
+# The problem that originally caused this test to fail was:
+# We write to files asynchronously (with the async writer thread). There is
+# no signaling done when the file stream is closed. That can lead to the writer
+# process hanging in memory, that in turn leads to the main thread waiting on a
+# condition that never occurs (because it would need to be signalled by the
+# async writer). Even worse, in that case, the async writer was signalled invalid
+# in such a way that when it received a wakeup, it thought it shall not terminate,
+# but received a spurios wakeup due to timeout and no data to write. In that case
+# it (correctly) concluded that it would not need to timeout until a new buffer write
+# was done (in which case it would receive a wakeup). As such, it went into an eternal
+# wait. However, the invalid signaling did not take into account that it did not
+# signal the async writer to shut down. So the main thread went into a condition
+# wait - and thus we had a deadlock. That situation occured only under very specific
+# cirumstances. As far as the analysis goes, the following need to happen:
+# 1. buffers on that file are being flushed
+# 2. no new data arrives
+# 3. the inactivity timeout has not yet happend
+# 4. *then* (and only then) the stream is closed or destructed
+# In that, 1 to 3 are prequisites for the deadlock which will happen in 4. In order
+# to create this case reliably, I have used the "$OMFileFlushOnTXEnd on" directive
+# inside my test case. It makes sure that #1 above happens. The test uses a dynafile
+# cache size of 4, and the load generator generates data for 5 different dynafiles.
+# So over time, we will hit a spot where 4 dynafiles are open and the 5th file name
+# is generated. As such, one file needs to be discarded. Thanks to FlushOnTXEnd, we
+# now likely have #2 in place and thanks to the load pattern generated, we most
+# probably have #3 in place. During the dynafile cache displacement of the oldest
+# entry, #4 is generated. At this point, we have the deadlock we are testing for.
+# Note that this deadlock does not necessarily lead to a total lockup of rsyslogd.
+# Parts of it continue to operate. But in our test setup, this means data is
+# received and placed into the main queue. Once it's high water mark is hit, data
+# is still being enqueued, but at a slow rate. So if one is patient enough, the load
+# generator will be able to finish. However, rsyslogd will never process the data
+# it received because it is locked in the deadlock caused by #4 above.
+# Note that "$OMFileFlushOnTXEnd on" is not causing this behaviour. We just use it
+# to (quite) reliably cause the failure condition. The failure described above
+# (in version 4.6.1) was also present when the setting was set to "off", but its
+# occurence was very much less probable - because the perquisites are then much
+# harder to hit. without it, the test may need to run for several hours before
+# we hit all failure conditions.
+#
+# added 2010-03-17 by Rgerhards
+# This file is part of the rsyslog project, released under GPLv3
+echo =================================================================================
+echo TEST: \[asynwr_deadlock2.sh\]: a case known to have caused a deadlock in the past
+source $srcdir/diag.sh init
+# uncomment for debugging support:
+export RSYSLOG_DEBUG="debug nostdout noprintmutexaction"
+export RSYSLOG_DEBUGLOG="log"
+source $srcdir/diag.sh startup asynwr_deadlock2.conf
+# send 20000 messages, each close to 2K (non-randomized!), so that we can fill
+# the buffers and hopefully run into the "deadlock".
+source $srcdir/diag.sh tcpflood -m20000 -d1800 -P129 -i1 -f5
+# sleep is important! need to make sure the instance is inactive
+source $srcdir/diag.sh shutdown-when-empty # shut down rsyslogd when done processing messages
+source $srcdir/diag.sh wait-shutdown # and wait for it to terminate
+zcat rsyslog.out.*.log > rsyslog.out.log
+source $srcdir/diag.sh seq-check 1 20000 -E
+source $srcdir/diag.sh exit
diff --git a/tests/testsuites/asynwr_deadlock2.conf b/tests/testsuites/asynwr_deadlock2.conf
new file mode 100644
index 00000000..c3eecae3
--- /dev/null
+++ b/tests/testsuites/asynwr_deadlock2.conf
@@ -0,0 +1,15 @@
+# rgerhards, 2010-03-17
+$IncludeConfig diag-common.conf
+
+$ModLoad ../plugins/imtcp/.libs/imtcp
+$MainMsgQueueTimeoutShutdown 10000
+$InputTCPServerRun 13514
+
+$template outfmt,"%msg:F,58:3%,%msg:F,58:4%,%msg:F,58:5%\n"
+$template dynfile,"rsyslog.out.%msg:F,58:2%.log" # use multiple dynafiles
+
+$OMFileFlushOnTXEnd on
+$OMFileFlushInterval 10
+$OMFileIOBufferSize 10k
+$DynaFileCacheSize 4
+local0.* ?dynfile;outfmt