5 files changed, 49 insertions, 12 deletions
diff --git a/ChangeLog b/ChangeLog
index ee45417e..380bea50 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -65,12 +65,13 @@ Version 5.5.0  [DEVEL] (rgerhards), 2009-11-18
 - doc: improved description of what loadable modules can do
 ---------------------------------------------------------------------------
 Version 5.4.2  [v5-stable] (rgerhards), 2010-03-??
-- bugfix(minor): status variable was uninitialized
-  However, this would have caused harm only if NO parser modules at
-  all were loaded, which would lead to a defunctional configuration
-  at all. And, even more important, this is impossible as two parser
-  modules are built-in and thus can not be "not loaded", so we always
-  have a minimum of two.
+- bugfix(kind of): output plugin retry behaviour could cause engine to loop
+  The rsyslog engine did not guard itself against output modules that do
+  not properly convey back the tryResume() behaviour. This then leads to
+  what looks like an endless loop. I consider this to be a bug of the 
+  engine not only because it should be hardened against plugin misbehaviour,
+  but also because plugins may not be totally able to avoid this situation
+  (depending on the type of and processing done by the plugin).
 - bugfix: testbench failed when not executed in UTC+1 timezone
   accidently, the time zone information was kept inside some
   to-be-checked-for responses
@@ -78,6 +79,12 @@ Version 5.4.2  [v5-stable] (rgerhards), 2010-03-??
   message-induced off-by-one error (potential segfault) (see 4.6.2)
   The analysis has been completed and a better fix been crafted and 
   integrated.
+- bugfix(minor): status variable was uninitialized
+  However, this would have caused harm only if NO parser modules at
+  all were loaded, which would lead to a defunctional configuration
+  at all. And, even more important, this is impossible as two parser
+  modules are built-in and thus can not be "not loaded", so we always
+  have a minimum of two.
 ---------------------------------------------------------------------------
 Version 5.4.1  [v5-stable] (rgerhards), 2010-03-??
 - added new property replacer option "date-rfc3164-buggyday" primarily
diff --git a/action.c b/action.c
index 59d984a5..7751fe0a 100644
--- a/action.c
+++ b/action.c
@@ -445,6 +445,7 @@ static void actionCommitted(action_t *pThis)
 static void actionRetry(action_t *pThis)
 {
 	actionSetState(pThis, ACT_STATE_RTRY);
+	pThis->iResumeOKinRow++;
 }
 
 
@@ -480,13 +481,23 @@ static inline void actionSuspend(action_t *pThis, time_t ttNow)
 /* actually do retry processing. Note that the function receives a timestamp so
  * that we do not need to call the (expensive) time() API.
  * Note that we do the full retry processing here, doing the configured number of
- * iterations.
- * rgerhards, 2009-05-07
+ * iterations. -- rgerhards, 2009-05-07
+ * We need to guard against module which always return RS_RET_OK from their tryResume()
+ * entry point. This is invalid, but has harsh consequences: it will cause the rsyslog
+ * engine to go into a tight loop. That obviously is not acceptable. As such, we track the
+ * count of iterations that a tryResume returning RS_RET_OK is immediately followed by
+ * an unsuccessful call to doAction(). If that happens more than 1,000 times, we assume 
+ * the return acutally is a RS_RET_SUSPENDED. In order to go through the various 
+ * resumption stages, we do this for every 1000 requests. This magic number 1000 may
+ * not be the most appropriate, but it should be thought of a "if nothing else helps"
+ * kind of facility: in the first place, the module should return a proper indication
+ * of its inability to recover. -- rgerhards, 2010-04-26.
  */
 static rsRetVal actionDoRetry(action_t *pThis, time_t ttNow)
 {
 	int iRetries;
 	int iSleepPeriod;
+	int bTreatOKasSusp;
 	DEFiRet;
 
 	ASSERT(pThis != NULL);
@@ -494,9 +505,14 @@ static rsRetVal actionDoRetry(action_t *pThis, time_t ttNow)
 	iRetries = 0;
 	while(pThis->eState == ACT_STATE_RTRY) {
 		iRet = pThis->pMod->tryResume(pThis->pModData);
-		if(iRet == RS_RET_OK) {
+		if((pThis->iResumeOKinRow > 999) && (pThis->iResumeOKinRow % 1000 == 0)) {
+			bTreatOKasSusp = 1;
+		} else {
+			bTreatOKasSusp = 0;
+		}
+		if((iRet == RS_RET_OK) && (!bTreatOKasSusp)) {
 			actionSetState(pThis, ACT_STATE_RDY);
-		} else if(iRet == RS_RET_SUSPENDED) {
+		} else if(iRet == RS_RET_SUSPENDED || bTreatOKasSusp) {
 			/* max retries reached? */
 			if((pThis->iResumeRetryCount != -1 && iRetries >= pThis->iResumeRetryCount)) {
 				actionSuspend(pThis, ttNow);
@@ -715,13 +731,16 @@ actionCallDoAction(action_t *pThis, msg_t *pMsg)
 	switch(iRet) {
 		case RS_RET_OK:
 			actionCommitted(pThis);
+			pThis->iResumeOKinRow = 0; /* we had a successful call! */
 			break;
 		case RS_RET_DEFER_COMMIT:
+			pThis->iResumeOKinRow = 0; /* we had a successful call! */
 			/* we are done, action state remains the same */
 			break;
 		case RS_RET_PREVIOUS_COMMITTED:
 			/* action state remains the same, but we had a commit. */
 			pThis->bHadAutoCommit = 1;
+			pThis->iResumeOKinRow = 0; /* we had a successful call! */
 			break;
 		case RS_RET_SUSPENDED:
 			actionRetry(pThis);
diff --git a/action.h b/action.h
index d1805075..66b7316d 100644
--- a/action.h
+++ b/action.h
@@ -56,8 +56,9 @@ struct action_s {
 	sbool	bWriteAllMarkMsgs;/* should all mark msgs be written (not matter how recent the action was executed)? */
 	int	iSecsExecOnceInterval; /* if non-zero, minimum seconds to wait until action is executed again */
 	action_state_t eState;	/* current state of action */
-	int	bHadAutoCommit;	/* did an auto-commit happen during doAction()? */
+	bool	bHadAutoCommit;	/* did an auto-commit happen during doAction()? */
 	time_t	ttResumeRtry;	/* when is it time to retry the resume? */
+	int	iResumeOKinRow;	/* number of times in a row that resume said OK with an immediate failure following */
 	int	iResumeInterval;/* resume interval for this action */
 	int	iResumeRetryCount;/* how often shall we retry a suspended action? (-1 --> eternal) */
 	int	iNbrResRtry;	/* number of retries since last suspend */
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 29a2ce14..32765ae4 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -34,6 +34,7 @@ TESTS = $(TESTRUNS) cfg.sh \
 	complex1.sh \
 	queue-persist.sh \
 	pipeaction.sh \
+	pipe_noreader.sh \
 	execonlyonce.sh \
 	dircreate_dflt.sh \
 	dircreate_off.sh \
@@ -217,6 +218,8 @@ EXTRA_DIST= 1.rstest 2.rstest 3.rstest err1.rstest \
 	   testsuites/sndrcv_gzip_rcvr.conf \
 	   pipeaction.sh \
 	   testsuites/pipeaction.conf \
+	   pipe_noreader.sh \
+	   testsuites/pipe_noreader.conf \
 	   asynwr_simple.sh \
 	   testsuites/asynwr_simple.conf \
 	   asynwr_timeout.sh \
diff --git a/tests/pipe_noreader.sh b/tests/pipe_noreader.sh
index c80aaf3f..8dea0818 100755
--- a/tests/pipe_noreader.sh
+++ b/tests/pipe_noreader.sh
@@ -4,6 +4,12 @@
 # as a permanent testcase. For some details, please see bug tracker
 # http://bugzilla.adiscon.com/show_bug.cgi?id=186
 #
+# IMPORTANT: we do NOT check any result message set. The whole point in
+# this test is to verify that we do NOT run into an eternal loop. As such,
+# the test is "PASS", if rsyslogd terminates. If it does not terminate, we
+# obviously do not cause "FAIL", but processing will hang, which should be
+# a good-enough indication of failure.
+#
 # added 2010-04-26 by Rgerhards
 # This file is part of the rsyslog project, released  under GPLv3
 echo ===============================================================================
@@ -18,5 +24,6 @@ source $srcdir/diag.sh startup pipe_noreader.conf
 source $srcdir/diag.sh tcpflood -m1000 -d500
 source $srcdir/diag.sh shutdown-when-empty # shut down rsyslogd when done processing messages
 source $srcdir/diag.sh wait-shutdown       # and wait for it to terminate
-source $srcdir/diag.sh seq-check 0 999
+# NO need to check seqno -- see header comment
+echo we did not loop, so the test is sucessfull
 source $srcdir/diag.sh exit