summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRainer Gerhards <rgerhards@adiscon.com>2010-04-26 15:55:51 +0200
committerRainer Gerhards <rgerhards@adiscon.com>2010-04-26 15:55:51 +0200
commited7f89a49c6f820bc65f87514eb0effafcdeaacc (patch)
tree86ae5d3cddc18a0efa4330da85fafe8af331a29d
parent59497bd057f6ee1bcb626d2ce30bf3eb8ef6ebd0 (diff)
parent149bab9cd1eb77c0708369d0a56903db90ab9d5f (diff)
downloadrsyslog-ed7f89a49c6f820bc65f87514eb0effafcdeaacc.tar.gz
rsyslog-ed7f89a49c6f820bc65f87514eb0effafcdeaacc.tar.xz
rsyslog-ed7f89a49c6f820bc65f87514eb0effafcdeaacc.zip
Merge branch 'v5-stable'
-rw-r--r--ChangeLog19
-rw-r--r--action.c27
-rw-r--r--action.h3
-rw-r--r--tests/Makefile.am3
-rwxr-xr-xtests/pipe_noreader.sh9
5 files changed, 49 insertions, 12 deletions
diff --git a/ChangeLog b/ChangeLog
index ee45417e..380bea50 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -65,12 +65,13 @@ Version 5.5.0 [DEVEL] (rgerhards), 2009-11-18
- doc: improved description of what loadable modules can do
---------------------------------------------------------------------------
Version 5.4.2 [v5-stable] (rgerhards), 2010-03-??
-- bugfix(minor): status variable was uninitialized
- However, this would have caused harm only if NO parser modules at
- all were loaded, which would lead to a defunctional configuration
- at all. And, even more important, this is impossible as two parser
- modules are built-in and thus can not be "not loaded", so we always
- have a minimum of two.
+- bugfix(kind of): output plugin retry behaviour could cause engine to loop
+ The rsyslog engine did not guard itself against output modules that do
+ not properly convey back the tryResume() behaviour. This then leads to
+ what looks like an endless loop. I consider this to be a bug of the
+ engine not only because it should be hardened against plugin misbehaviour,
+ but also because plugins may not be totally able to avoid this situation
+ (depending on the type of and processing done by the plugin).
- bugfix: testbench failed when not executed in UTC+1 timezone
accidently, the time zone information was kept inside some
to-be-checked-for responses
@@ -78,6 +79,12 @@ Version 5.4.2 [v5-stable] (rgerhards), 2010-03-??
message-induced off-by-one error (potential segfault) (see 4.6.2)
The analysis has been completed and a better fix been crafted and
integrated.
+- bugfix(minor): status variable was uninitialized
+ However, this would have caused harm only if NO parser modules at
+ all were loaded, which would lead to a defunctional configuration
+ at all. And, even more important, this is impossible as two parser
+ modules are built-in and thus can not be "not loaded", so we always
+ have a minimum of two.
---------------------------------------------------------------------------
Version 5.4.1 [v5-stable] (rgerhards), 2010-03-??
- added new property replacer option "date-rfc3164-buggyday" primarily
diff --git a/action.c b/action.c
index 59d984a5..7751fe0a 100644
--- a/action.c
+++ b/action.c
@@ -445,6 +445,7 @@ static void actionCommitted(action_t *pThis)
static void actionRetry(action_t *pThis)
{
actionSetState(pThis, ACT_STATE_RTRY);
+ pThis->iResumeOKinRow++;
}
@@ -480,13 +481,23 @@ static inline void actionSuspend(action_t *pThis, time_t ttNow)
/* actually do retry processing. Note that the function receives a timestamp so
* that we do not need to call the (expensive) time() API.
* Note that we do the full retry processing here, doing the configured number of
- * iterations.
- * rgerhards, 2009-05-07
+ * iterations. -- rgerhards, 2009-05-07
+ * We need to guard against module which always return RS_RET_OK from their tryResume()
+ * entry point. This is invalid, but has harsh consequences: it will cause the rsyslog
+ * engine to go into a tight loop. That obviously is not acceptable. As such, we track the
+ * count of iterations that a tryResume returning RS_RET_OK is immediately followed by
+ * an unsuccessful call to doAction(). If that happens more than 1,000 times, we assume
+ * the return acutally is a RS_RET_SUSPENDED. In order to go through the various
+ * resumption stages, we do this for every 1000 requests. This magic number 1000 may
+ * not be the most appropriate, but it should be thought of a "if nothing else helps"
+ * kind of facility: in the first place, the module should return a proper indication
+ * of its inability to recover. -- rgerhards, 2010-04-26.
*/
static rsRetVal actionDoRetry(action_t *pThis, time_t ttNow)
{
int iRetries;
int iSleepPeriod;
+ int bTreatOKasSusp;
DEFiRet;
ASSERT(pThis != NULL);
@@ -494,9 +505,14 @@ static rsRetVal actionDoRetry(action_t *pThis, time_t ttNow)
iRetries = 0;
while(pThis->eState == ACT_STATE_RTRY) {
iRet = pThis->pMod->tryResume(pThis->pModData);
- if(iRet == RS_RET_OK) {
+ if((pThis->iResumeOKinRow > 999) && (pThis->iResumeOKinRow % 1000 == 0)) {
+ bTreatOKasSusp = 1;
+ } else {
+ bTreatOKasSusp = 0;
+ }
+ if((iRet == RS_RET_OK) && (!bTreatOKasSusp)) {
actionSetState(pThis, ACT_STATE_RDY);
- } else if(iRet == RS_RET_SUSPENDED) {
+ } else if(iRet == RS_RET_SUSPENDED || bTreatOKasSusp) {
/* max retries reached? */
if((pThis->iResumeRetryCount != -1 && iRetries >= pThis->iResumeRetryCount)) {
actionSuspend(pThis, ttNow);
@@ -715,13 +731,16 @@ actionCallDoAction(action_t *pThis, msg_t *pMsg)
switch(iRet) {
case RS_RET_OK:
actionCommitted(pThis);
+ pThis->iResumeOKinRow = 0; /* we had a successful call! */
break;
case RS_RET_DEFER_COMMIT:
+ pThis->iResumeOKinRow = 0; /* we had a successful call! */
/* we are done, action state remains the same */
break;
case RS_RET_PREVIOUS_COMMITTED:
/* action state remains the same, but we had a commit. */
pThis->bHadAutoCommit = 1;
+ pThis->iResumeOKinRow = 0; /* we had a successful call! */
break;
case RS_RET_SUSPENDED:
actionRetry(pThis);
diff --git a/action.h b/action.h
index d1805075..66b7316d 100644
--- a/action.h
+++ b/action.h
@@ -56,8 +56,9 @@ struct action_s {
sbool bWriteAllMarkMsgs;/* should all mark msgs be written (not matter how recent the action was executed)? */
int iSecsExecOnceInterval; /* if non-zero, minimum seconds to wait until action is executed again */
action_state_t eState; /* current state of action */
- int bHadAutoCommit; /* did an auto-commit happen during doAction()? */
+ bool bHadAutoCommit; /* did an auto-commit happen during doAction()? */
time_t ttResumeRtry; /* when is it time to retry the resume? */
+ int iResumeOKinRow; /* number of times in a row that resume said OK with an immediate failure following */
int iResumeInterval;/* resume interval for this action */
int iResumeRetryCount;/* how often shall we retry a suspended action? (-1 --> eternal) */
int iNbrResRtry; /* number of retries since last suspend */
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 29a2ce14..32765ae4 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -34,6 +34,7 @@ TESTS = $(TESTRUNS) cfg.sh \
complex1.sh \
queue-persist.sh \
pipeaction.sh \
+ pipe_noreader.sh \
execonlyonce.sh \
dircreate_dflt.sh \
dircreate_off.sh \
@@ -217,6 +218,8 @@ EXTRA_DIST= 1.rstest 2.rstest 3.rstest err1.rstest \
testsuites/sndrcv_gzip_rcvr.conf \
pipeaction.sh \
testsuites/pipeaction.conf \
+ pipe_noreader.sh \
+ testsuites/pipe_noreader.conf \
asynwr_simple.sh \
testsuites/asynwr_simple.conf \
asynwr_timeout.sh \
diff --git a/tests/pipe_noreader.sh b/tests/pipe_noreader.sh
index c80aaf3f..8dea0818 100755
--- a/tests/pipe_noreader.sh
+++ b/tests/pipe_noreader.sh
@@ -4,6 +4,12 @@
# as a permanent testcase. For some details, please see bug tracker
# http://bugzilla.adiscon.com/show_bug.cgi?id=186
#
+# IMPORTANT: we do NOT check any result message set. The whole point in
+# this test is to verify that we do NOT run into an eternal loop. As such,
+# the test is "PASS", if rsyslogd terminates. If it does not terminate, we
+# obviously do not cause "FAIL", but processing will hang, which should be
+# a good-enough indication of failure.
+#
# added 2010-04-26 by Rgerhards
# This file is part of the rsyslog project, released under GPLv3
echo ===============================================================================
@@ -18,5 +24,6 @@ source $srcdir/diag.sh startup pipe_noreader.conf
source $srcdir/diag.sh tcpflood -m1000 -d500
source $srcdir/diag.sh shutdown-when-empty # shut down rsyslogd when done processing messages
source $srcdir/diag.sh wait-shutdown # and wait for it to terminate
-source $srcdir/diag.sh seq-check 0 999
+# NO need to check seqno -- see header comment
+echo we did not loop, so the test is sucessfull
source $srcdir/diag.sh exit