diff options
-rw-r--r-- | ChangeLog | 19 | ||||
-rw-r--r-- | action.c | 27 | ||||
-rw-r--r-- | action.h | 3 | ||||
-rw-r--r-- | tests/Makefile.am | 3 | ||||
-rwxr-xr-x | tests/pipe_noreader.sh | 9 |
5 files changed, 49 insertions, 12 deletions
@@ -65,12 +65,13 @@ Version 5.5.0 [DEVEL] (rgerhards), 2009-11-18 - doc: improved description of what loadable modules can do --------------------------------------------------------------------------- Version 5.4.2 [v5-stable] (rgerhards), 2010-03-?? -- bugfix(minor): status variable was uninitialized - However, this would have caused harm only if NO parser modules at - all were loaded, which would lead to a defunctional configuration - at all. And, even more important, this is impossible as two parser - modules are built-in and thus can not be "not loaded", so we always - have a minimum of two. +- bugfix(kind of): output plugin retry behaviour could cause engine to loop + The rsyslog engine did not guard itself against output modules that do + not properly convey back the tryResume() behaviour. This then leads to + what looks like an endless loop. I consider this to be a bug of the + engine not only because it should be hardened against plugin misbehaviour, + but also because plugins may not be totally able to avoid this situation + (depending on the type of and processing done by the plugin). - bugfix: testbench failed when not executed in UTC+1 timezone accidently, the time zone information was kept inside some to-be-checked-for responses @@ -78,6 +79,12 @@ Version 5.4.2 [v5-stable] (rgerhards), 2010-03-?? message-induced off-by-one error (potential segfault) (see 4.6.2) The analysis has been completed and a better fix been crafted and integrated. +- bugfix(minor): status variable was uninitialized + However, this would have caused harm only if NO parser modules at + all were loaded, which would lead to a defunctional configuration + at all. And, even more important, this is impossible as two parser + modules are built-in and thus can not be "not loaded", so we always + have a minimum of two. --------------------------------------------------------------------------- Version 5.4.1 [v5-stable] (rgerhards), 2010-03-?? - added new property replacer option "date-rfc3164-buggyday" primarily @@ -445,6 +445,7 @@ static void actionCommitted(action_t *pThis) static void actionRetry(action_t *pThis) { actionSetState(pThis, ACT_STATE_RTRY); + pThis->iResumeOKinRow++; } @@ -480,13 +481,23 @@ static inline void actionSuspend(action_t *pThis, time_t ttNow) /* actually do retry processing. Note that the function receives a timestamp so * that we do not need to call the (expensive) time() API. * Note that we do the full retry processing here, doing the configured number of - * iterations. - * rgerhards, 2009-05-07 + * iterations. -- rgerhards, 2009-05-07 + * We need to guard against module which always return RS_RET_OK from their tryResume() + * entry point. This is invalid, but has harsh consequences: it will cause the rsyslog + * engine to go into a tight loop. That obviously is not acceptable. As such, we track the + * count of iterations that a tryResume returning RS_RET_OK is immediately followed by + * an unsuccessful call to doAction(). If that happens more than 1,000 times, we assume + * the return acutally is a RS_RET_SUSPENDED. In order to go through the various + * resumption stages, we do this for every 1000 requests. This magic number 1000 may + * not be the most appropriate, but it should be thought of a "if nothing else helps" + * kind of facility: in the first place, the module should return a proper indication + * of its inability to recover. -- rgerhards, 2010-04-26. */ static rsRetVal actionDoRetry(action_t *pThis, time_t ttNow) { int iRetries; int iSleepPeriod; + int bTreatOKasSusp; DEFiRet; ASSERT(pThis != NULL); @@ -494,9 +505,14 @@ static rsRetVal actionDoRetry(action_t *pThis, time_t ttNow) iRetries = 0; while(pThis->eState == ACT_STATE_RTRY) { iRet = pThis->pMod->tryResume(pThis->pModData); - if(iRet == RS_RET_OK) { + if((pThis->iResumeOKinRow > 999) && (pThis->iResumeOKinRow % 1000 == 0)) { + bTreatOKasSusp = 1; + } else { + bTreatOKasSusp = 0; + } + if((iRet == RS_RET_OK) && (!bTreatOKasSusp)) { actionSetState(pThis, ACT_STATE_RDY); - } else if(iRet == RS_RET_SUSPENDED) { + } else if(iRet == RS_RET_SUSPENDED || bTreatOKasSusp) { /* max retries reached? */ if((pThis->iResumeRetryCount != -1 && iRetries >= pThis->iResumeRetryCount)) { actionSuspend(pThis, ttNow); @@ -715,13 +731,16 @@ actionCallDoAction(action_t *pThis, msg_t *pMsg) switch(iRet) { case RS_RET_OK: actionCommitted(pThis); + pThis->iResumeOKinRow = 0; /* we had a successful call! */ break; case RS_RET_DEFER_COMMIT: + pThis->iResumeOKinRow = 0; /* we had a successful call! */ /* we are done, action state remains the same */ break; case RS_RET_PREVIOUS_COMMITTED: /* action state remains the same, but we had a commit. */ pThis->bHadAutoCommit = 1; + pThis->iResumeOKinRow = 0; /* we had a successful call! */ break; case RS_RET_SUSPENDED: actionRetry(pThis); @@ -56,8 +56,9 @@ struct action_s { sbool bWriteAllMarkMsgs;/* should all mark msgs be written (not matter how recent the action was executed)? */ int iSecsExecOnceInterval; /* if non-zero, minimum seconds to wait until action is executed again */ action_state_t eState; /* current state of action */ - int bHadAutoCommit; /* did an auto-commit happen during doAction()? */ + bool bHadAutoCommit; /* did an auto-commit happen during doAction()? */ time_t ttResumeRtry; /* when is it time to retry the resume? */ + int iResumeOKinRow; /* number of times in a row that resume said OK with an immediate failure following */ int iResumeInterval;/* resume interval for this action */ int iResumeRetryCount;/* how often shall we retry a suspended action? (-1 --> eternal) */ int iNbrResRtry; /* number of retries since last suspend */ diff --git a/tests/Makefile.am b/tests/Makefile.am index 29a2ce14..32765ae4 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -34,6 +34,7 @@ TESTS = $(TESTRUNS) cfg.sh \ complex1.sh \ queue-persist.sh \ pipeaction.sh \ + pipe_noreader.sh \ execonlyonce.sh \ dircreate_dflt.sh \ dircreate_off.sh \ @@ -217,6 +218,8 @@ EXTRA_DIST= 1.rstest 2.rstest 3.rstest err1.rstest \ testsuites/sndrcv_gzip_rcvr.conf \ pipeaction.sh \ testsuites/pipeaction.conf \ + pipe_noreader.sh \ + testsuites/pipe_noreader.conf \ asynwr_simple.sh \ testsuites/asynwr_simple.conf \ asynwr_timeout.sh \ diff --git a/tests/pipe_noreader.sh b/tests/pipe_noreader.sh index c80aaf3f..8dea0818 100755 --- a/tests/pipe_noreader.sh +++ b/tests/pipe_noreader.sh @@ -4,6 +4,12 @@ # as a permanent testcase. For some details, please see bug tracker # http://bugzilla.adiscon.com/show_bug.cgi?id=186 # +# IMPORTANT: we do NOT check any result message set. The whole point in +# this test is to verify that we do NOT run into an eternal loop. As such, +# the test is "PASS", if rsyslogd terminates. If it does not terminate, we +# obviously do not cause "FAIL", but processing will hang, which should be +# a good-enough indication of failure. +# # added 2010-04-26 by Rgerhards # This file is part of the rsyslog project, released under GPLv3 echo =============================================================================== @@ -18,5 +24,6 @@ source $srcdir/diag.sh startup pipe_noreader.conf source $srcdir/diag.sh tcpflood -m1000 -d500 source $srcdir/diag.sh shutdown-when-empty # shut down rsyslogd when done processing messages source $srcdir/diag.sh wait-shutdown # and wait for it to terminate -source $srcdir/diag.sh seq-check 0 999 +# NO need to check seqno -- see header comment +echo we did not loop, so the test is sucessfull source $srcdir/diag.sh exit |