summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRainer Gerhards <rgerhards@adiscon.com>2010-04-26 15:19:13 +0200
committerRainer Gerhards <rgerhards@adiscon.com>2010-04-26 15:19:13 +0200
commiteec894fbc5eb263e6def1f2e35f9882967c95a88 (patch)
treeb6be4e3076f499e4c8023f88e4ddbfac55f60ea0
parent270e455d6d9c6e7e580d6784435bf57ddc71bd85 (diff)
downloadrsyslog-eec894fbc5eb263e6def1f2e35f9882967c95a88.tar.gz
rsyslog-eec894fbc5eb263e6def1f2e35f9882967c95a88.tar.xz
rsyslog-eec894fbc5eb263e6def1f2e35f9882967c95a88.zip
bugfix(kind of): output plugin retry behaviour could cause engine to loop
The rsyslog engine did not guard itself against output modules that do not properly convey back the tryResume() behaviour. This then leads to what looks like an endless loop. I consider this to be a bug of the engine not only because it should be hardened against plugin misbehaviour, but also because plugins may not be totally able to avoid this situation (depending on the type of and processing done by the plugin).
-rw-r--r--ChangeLog19
-rw-r--r--action.c28
-rw-r--r--action.h3
-rwxr-xr-xtests/pipe_noreader.sh9
4 files changed, 47 insertions, 12 deletions
diff --git a/ChangeLog b/ChangeLog
index 98ca6bbd..a24a18ba 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,11 +1,12 @@
---------------------------------------------------------------------------
Version 5.4.2 [v5-stable] (rgerhards), 2010-03-??
-- bugfix(minor): status variable was uninitialized
- However, this would have caused harm only if NO parser modules at
- all were loaded, which would lead to a defunctional configuration
- at all. And, even more important, this is impossible as two parser
- modules are built-in and thus can not be "not loaded", so we always
- have a minimum of two.
+- bugfix(kind of): output plugin retry behaviour could cause engine to loop
+ The rsyslog engine did not guard itself against output modules that do
+ not properly convey back the tryResume() behaviour. This then leads to
+ what looks like an endless loop. I consider this to be a bug of the
+ engine not only because it should be hardened against plugin misbehaviour,
+ but also because plugins may not be totally able to avoid this situation
+ (depending on the type of and processing done by the plugin).
- bugfix: testbench failed when not executed in UTC+1 timezone
accidently, the time zone information was kept inside some
to-be-checked-for responses
@@ -13,6 +14,12 @@ Version 5.4.2 [v5-stable] (rgerhards), 2010-03-??
message-induced off-by-one error (potential segfault) (see 4.6.2)
The analysis has been completed and a better fix been crafted and
integrated.
+- bugfix(minor): status variable was uninitialized
+ However, this would have caused harm only if NO parser modules at
+ all were loaded, which would lead to a defunctional configuration
+ at all. And, even more important, this is impossible as two parser
+ modules are built-in and thus can not be "not loaded", so we always
+ have a minimum of two.
---------------------------------------------------------------------------
Version 5.4.1 [v5-stable] (rgerhards), 2010-03-??
- added new property replacer option "date-rfc3164-buggyday" primarily
diff --git a/action.c b/action.c
index 65465785..cff2680b 100644
--- a/action.c
+++ b/action.c
@@ -445,6 +445,7 @@ static void actionCommitted(action_t *pThis)
static void actionRetry(action_t *pThis)
{
actionSetState(pThis, ACT_STATE_RTRY);
+ pThis->iResumeOKinRow++;
}
@@ -480,23 +481,39 @@ static inline void actionSuspend(action_t *pThis, time_t ttNow)
/* actually do retry processing. Note that the function receives a timestamp so
* that we do not need to call the (expensive) time() API.
* Note that we do the full retry processing here, doing the configured number of
- * iterations.
- * rgerhards, 2009-05-07
+ * iterations. -- rgerhards, 2009-05-07
+ * We need to guard against module which always return RS_RET_OK from their tryResume()
+ * entry point. This is invalid, but has harsh consequences: it will cause the rsyslog
+ * engine to go into a tight loop. That obviously is not acceptable. As such, we track the
+ * count of iterations that a tryResume returning RS_RET_OK is immediately followed by
+ * an unsuccessful call to doAction(). If that happens more than 1,000 times, we assume
+ * the return acutally is a RS_RET_SUSPENDED. In order to go through the various
+ * resumption stages, we do this for every 1000 requests. This magic number 1000 may
+ * not be the most appropriate, but it should be thought of a "if nothing else helps"
+ * kind of facility: in the first place, the module should return a proper indication
+ * of its inability to recover. -- rgerhards, 2010-04-26.
*/
static rsRetVal actionDoRetry(action_t *pThis, time_t ttNow)
{
int iRetries;
int iSleepPeriod;
+ int bTreatOKasSusp;
DEFiRet;
ASSERT(pThis != NULL);
iRetries = 0;
while(pThis->eState == ACT_STATE_RTRY) {
+dbgprintf("YYY: resume in row %d\n", pThis->iResumeOKinRow);
iRet = pThis->pMod->tryResume(pThis->pModData);
- if(iRet == RS_RET_OK) {
+ if((pThis->iResumeOKinRow > 999) && (pThis->iResumeOKinRow % 1000 == 0)) {
+ bTreatOKasSusp = 1;
+ } else {
+ bTreatOKasSusp = 0;
+ }
+ if((iRet == RS_RET_OK) && (!bTreatOKasSusp)) {
actionSetState(pThis, ACT_STATE_RDY);
- } else if(iRet == RS_RET_SUSPENDED) {
+ } else if(iRet == RS_RET_SUSPENDED || bTreatOKasSusp) {
/* max retries reached? */
if((pThis->iResumeRetryCount != -1 && iRetries >= pThis->iResumeRetryCount)) {
actionSuspend(pThis, ttNow);
@@ -715,13 +732,16 @@ actionCallDoAction(action_t *pThis, msg_t *pMsg)
switch(iRet) {
case RS_RET_OK:
actionCommitted(pThis);
+ pThis->iResumeOKinRow = 0; /* we had a successful call! */
break;
case RS_RET_DEFER_COMMIT:
+ pThis->iResumeOKinRow = 0; /* we had a successful call! */
/* we are done, action state remains the same */
break;
case RS_RET_PREVIOUS_COMMITTED:
/* action state remains the same, but we had a commit. */
pThis->bHadAutoCommit = 1;
+ pThis->iResumeOKinRow = 0; /* we had a successful call! */
break;
case RS_RET_SUSPENDED:
actionRetry(pThis);
diff --git a/action.h b/action.h
index 6cc4df5c..4a6c3c88 100644
--- a/action.h
+++ b/action.h
@@ -56,8 +56,9 @@ struct action_s {
bool bWriteAllMarkMsgs;/* should all mark msgs be written (not matter how recent the action was executed)? */
int iSecsExecOnceInterval; /* if non-zero, minimum seconds to wait until action is executed again */
action_state_t eState; /* current state of action */
- int bHadAutoCommit; /* did an auto-commit happen during doAction()? */
+ bool bHadAutoCommit; /* did an auto-commit happen during doAction()? */
time_t ttResumeRtry; /* when is it time to retry the resume? */
+ int iResumeOKinRow; /* number of times in a row that resume said OK with an immediate failure following */
int iResumeInterval;/* resume interval for this action */
int iResumeRetryCount;/* how often shall we retry a suspended action? (-1 --> eternal) */
int iNbrResRtry; /* number of retries since last suspend */
diff --git a/tests/pipe_noreader.sh b/tests/pipe_noreader.sh
index c80aaf3f..8dea0818 100755
--- a/tests/pipe_noreader.sh
+++ b/tests/pipe_noreader.sh
@@ -4,6 +4,12 @@
# as a permanent testcase. For some details, please see bug tracker
# http://bugzilla.adiscon.com/show_bug.cgi?id=186
#
+# IMPORTANT: we do NOT check any result message set. The whole point in
+# this test is to verify that we do NOT run into an eternal loop. As such,
+# the test is "PASS", if rsyslogd terminates. If it does not terminate, we
+# obviously do not cause "FAIL", but processing will hang, which should be
+# a good-enough indication of failure.
+#
# added 2010-04-26 by Rgerhards
# This file is part of the rsyslog project, released under GPLv3
echo ===============================================================================
@@ -18,5 +24,6 @@ source $srcdir/diag.sh startup pipe_noreader.conf
source $srcdir/diag.sh tcpflood -m1000 -d500
source $srcdir/diag.sh shutdown-when-empty # shut down rsyslogd when done processing messages
source $srcdir/diag.sh wait-shutdown # and wait for it to terminate
-source $srcdir/diag.sh seq-check 0 999
+# NO need to check seqno -- see header comment
+echo we did not loop, so the test is sucessfull
source $srcdir/diag.sh exit