From a201de795ad630ddf81fea9f7bbb13ad79bdfff4 Mon Sep 17 00:00:00 2001 From: Rainer Gerhards Date: Wed, 15 Jun 2011 12:02:39 +0200 Subject: doc: clarified requirements for $WorkDirectory directive --- doc/rsyslog_conf_global.html | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/rsyslog_conf_global.html b/doc/rsyslog_conf_global.html index ce46bac2..9e85ae1e 100644 --- a/doc/rsyslog_conf_global.html +++ b/doc/rsyslog_conf_global.html @@ -249,7 +249,8 @@ default may change as uniprocessor systems become less common. [available since
  • $PreserveFQDN [on/off) - if set to off (legacy default to remain compatible to sysklogd), the domain part from a name that is within the same domain as the receiving system is stripped. If set to on, full names are always used.
  • -
  • $WorkDirectory <name> (directory for spool and other work files)
  • +
  • $WorkDirectory <name> (directory for spool and other work files. +Do not use trailing slashes)
  • $UDPServerAddress <IP> (imudp) -- local IP address (or name) the UDP listens should bind to
  • $UDPServerRun <port> (imudp) -- former -- cgit From 41d8672524b93aaf7b7835336ac9b92428b90fed Mon Sep 17 00:00:00 2001 From: Rainer Gerhards Date: Wed, 15 Jun 2011 12:20:12 +0200 Subject: bugfix/improvement:$WorkDirectory now gracefully handles trailing slashes --- ChangeLog | 1 + runtime/glbl.c | 23 ++++++++++++++++++++++- runtime/rsyslog.h | 1 + 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 8867e04e..042aeae3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -3,6 +3,7 @@ Version 5.8.2 [V5-stable] (rgerhards), 2011-06-?? - bugfix: memory leak in imtcp & subsystems under some circumstances This leak is tied to error conditions which lead to incorrect cleanup of some data structures. [backport from v6] +- bugfix/improvement:$WorkDirectory now gracefully handles trailing slashes --------------------------------------------------------------------------- Version 5.8.1 [V5-stable] (rgerhards), 2011-05-19 - bugfix: invalid processing in QUEUE_FULL condition diff --git a/runtime/glbl.c b/runtime/glbl.c index ec4992cf..dea5a17b 100644 --- a/runtime/glbl.c +++ b/runtime/glbl.c @@ -159,8 +159,29 @@ static void SetGlobalInputTermination(void) */ static rsRetVal setWorkDir(void __attribute__((unused)) *pVal, uchar *pNewVal) { - DEFiRet; + size_t lenDir; + int i; struct stat sb; + DEFiRet; + + /* remove trailing slashes */ + lenDir = ustrlen(pNewVal); + i = lenDir - 1; + while(i > 0 && pNewVal[i] == '/') { + --i; + } + + if(i < 0) { + errmsg.LogError(0, RS_RET_ERR_WRKDIR, "$WorkDirectory: empty value " + "- directive ignored"); + ABORT_FINALIZE(RS_RET_ERR_WRKDIR); + } + + if(i != (int) lenDir - 1) { + pNewVal[i+1] = '\0'; + errmsg.LogError(0, RS_RET_WRN_WRKDIR, "$WorkDirectory: trailing slashes " + "removed, new value is '%s'", pNewVal); + } if(stat((char*) pNewVal, &sb) != 0) { errmsg.LogError(0, RS_RET_ERR_WRKDIR, "$WorkDirectory: %s can not be " diff --git a/runtime/rsyslog.h b/runtime/rsyslog.h index d63dbe4f..52b29ac4 100644 --- a/runtime/rsyslog.h +++ b/runtime/rsyslog.h @@ -342,6 +342,7 @@ enum rsRetVal_ /** return value. All methods return this if not specified oth RS_RET_ERR_HDFS_OPEN = -2179, /**< error during hdfsOpen (e.g. file does not exist) */ RS_RET_FILE_NOT_SPECIFIED = -2180, /**< file name not configured where this was required */ RS_RET_ERR_WRKDIR = -2181, /**< problems with the rsyslog working directory */ + RS_RET_WRN_WRKDIR = -2182, /**< correctable problems with the rsyslog working directory */ /* RainerScript error messages (range 1000.. 1999) */ RS_RET_SYSVAR_NOT_FOUND = 1001, /**< system variable could not be found (maybe misspelled) */ -- cgit From 90f8c7300495da9b61e4706be652c612fccc084f Mon Sep 17 00:00:00 2001 From: Rainer Gerhards Date: Thu, 16 Jun 2011 15:17:48 +0200 Subject: bugfix: problems in failover action handling closes: http://bugzilla.adiscon.com/show_bug.cgi?id=270 (not yet confirmed!) --- ChangeLog | 2 + action.c | 140 +++++++++++++++++++++++++++++++++++++------------------- action.h | 2 +- runtime/rule.c | 12 +++-- tools/syslogd.c | 2 +- 5 files changed, 106 insertions(+), 52 deletions(-) diff --git a/ChangeLog b/ChangeLog index 042aeae3..0d7bf02a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,7 @@ --------------------------------------------------------------------------- Version 5.8.2 [V5-stable] (rgerhards), 2011-06-?? +- bugfix: problems in failover action handling + closes: http://bugzilla.adiscon.com/show_bug.cgi?id=270 (not yet confirmed!) - bugfix: memory leak in imtcp & subsystems under some circumstances This leak is tied to error conditions which lead to incorrect cleanup of some data structures. [backport from v6] diff --git a/action.c b/action.c index c5bd03cb..b1986c38 100644 --- a/action.c +++ b/action.c @@ -39,7 +39,35 @@ * - processAction * - submitBatch * - tryDoAction - * - + * - ... + * + * MORE ON PROCESSING, QUEUES and FILTERING + * All filtering needs to be done BEFORE messages are enqueued to an + * action. In previous code, part of the filtering was done at the + * "remote end" of the action queue, which lead to problems in + * non-direct mode (because then things run asynchronously). In order + * to solve this problem once and for all, I have changed the code so + * that all filtering is done before enq, and processing on the + * dequeue side of action processing now always executes whatever is + * enqueued. This is the only way to handle things consistently and + * (as much as possible) in a queue-type agnostic way. However, it is + * a rather radical change, which I unfortunately needed to make from + * stable version 5.8.1 to 5.8.2. If new problems pop up, you now know + * what may be their cause. In any case, the way it is done now is the + * only correct one. + * A problem is that, under fortunate conditions, we use the current + * batch for the output system as well. This is very good from a performance + * point of view, but makes the distinction between enq and deq side of + * the queue a bit hard. The current idea is that the filter condition + * alone is checked at the deq side of the queue (seems to be unavoidable + * to do it that way), but all other complex conditons (like failover + * handling) go into the computation of the filter condition. For + * non-direct queues, we still enqueue only what is acutally necessary. + * Note that in this case the rest of the code must ensure that the filter + * is set to "true". While this is not perfect and not as simple as + * we would like to see it, it looks like the best way to tackle that + * beast. + * rgerhards, 2011-06-15 * * Copyright 2007-2011 Rainer Gerhards and Adiscon GmbH. * @@ -611,8 +639,8 @@ static rsRetVal actionTryResume(action_t *pThis, int *pbShutdownImmediate) } if(Debug && (pThis->eState == ACT_STATE_RTRY ||pThis->eState == ACT_STATE_SUSP)) { - DBGPRINTF("actionTryResume: action state: %s, next retry (if applicable): %u [now %u]\n", - getActStateName(pThis), (unsigned) pThis->ttResumeRtry, (unsigned) ttNow); + DBGPRINTF("actionTryResume: action %p state: %s, next retry (if applicable): %u [now %u]\n", + pThis, getActStateName(pThis), (unsigned) pThis->ttResumeRtry, (unsigned) ttNow); } finalize_it: @@ -932,16 +960,19 @@ tryDoAction(action_t *pAction, batch_t *pBatch, int *pnElem) i = pBatch->iDoneUpTo; /* all messages below that index are processed */ iElemProcessed = 0; iCommittedUpTo = i; +dbgprintf("XXXXX: tryDoAction %p, pnElem %d, nElem %d\n", pAction, *pnElem, pBatch->nElem); while(iElemProcessed <= *pnElem && i < pBatch->nElem) { if(*(pBatch->pbShutdownImmediate)) ABORT_FINALIZE(RS_RET_FORCE_TERM); + /* NOTE: do NOT extend the filter below! Anything else must be done on the + * enq side of the queue (see file header comment)! -- rgerhards, 2011-06-15 + */ if( pBatch->pElem[i].bFilterOK - && pBatch->pElem[i].state != BATCH_STATE_DISC//) { - && ((pAction->bExecWhenPrevSusp == 0) || pBatch->pElem[i].bPrevWasSuspended) ) { + && pBatch->pElem[i].state != BATCH_STATE_DISC) { pMsg = (msg_t*) pBatch->pElem[i].pUsrp; localRet = actionProcessMessage(pAction, pMsg, pBatch->pElem[i].staticActParams, - pBatch->pbShutdownImmediate); - DBGPRINTF("action call returned %d\n", localRet); + pBatch->pbShutdownImmediate); + DBGPRINTF("action %p call returned %d\n", pAction, localRet); /* Note: we directly modify the batch object state, because we know that * wo do not overwrite BATCH_STATE_DISC indicators! */ @@ -1035,6 +1066,8 @@ submitBatch(action_t *pAction, batch_t *pBatch, int nElem) bDone = 1; } else { /* retry with half as much. Depth is log_2 batchsize, so recursion is not too deep */ + DBGPRINTF("submitBatch recursing trying to find and exclude the culprit " + "for iRet %d\n", localRet); submitBatch(pAction, pBatch, nElem / 2); submitBatch(pAction, pBatch, nElem - (nElem / 2)); bDone = 1; @@ -1224,11 +1257,13 @@ doSubmitToActionQ(action_t *pAction, msg_t *pMsg) /* This function builds up a batch of messages to be (later) * submitted to the action queue. - * Note: this function is also called from syslogd itself as part of its - * flush processing. If so, pBatch will be NULL and idxBtch undefined. + * Important: this function MUST not be called with messages that are to + * be discarded due to their "prevWasSuspended" state. It will not check for + * this and submit all messages to the queue for execution. So these must + * be filtered out before calling us (what is done currently!). */ rsRetVal -actionWriteToAction(action_t *pAction, batch_t *pBatch, int idxBtch) +actionWriteToAction(action_t *pAction) { msg_t *pMsgSave; /* to save current message pointer, necessary to restore it in case it needs to be updated (e.g. repeated msgs) */ @@ -1325,35 +1360,7 @@ actionWriteToAction(action_t *pAction, batch_t *pBatch, int idxBtch) /* When we reach this point, we have a valid, non-disabled action. * So let's enqueue our message for execution. -- rgerhards, 2007-07-24 */ - if( pBatch != NULL - && (pAction->bExecWhenPrevSusp == 1 && pBatch->pElem[idxBtch].bPrevWasSuspended)) { - /* in that case, we need to create a special batch which reflects the - * suspended state. Otherwise, that information would be dropped inside - * the queue engine. TODO: in later releases (v6?) create a better - * solution than what we do here. However, for v5 this sounds much too - * intrusive. -- rgerhardsm, 2011-03-16 - * (Code is copied over from queue.c and slightly modified) - */ - batch_t singleBatch; - batch_obj_t batchObj; - int i; - memset(&batchObj, 0, sizeof(batch_obj_t)); - memset(&singleBatch, 0, sizeof(batch_t)); - batchObj.state = BATCH_STATE_RDY; - batchObj.pUsrp = (obj_t*) pAction->f_pMsg; - batchObj.bPrevWasSuspended = 1; - batchObj.bFilterOK = 1; - singleBatch.nElem = 1; /* there always is only one in direct mode */ - singleBatch.pElem = &batchObj; - - iRet = qqueueEnqObjDirectBatch(pAction->pQueue, &singleBatch); - - for(i = 0 ; i < CONF_OMOD_NUMSTRINGS_MAXSIZE ; ++i) { - free(batchObj.staticActStrings[i]); - } - } else { /* standard case, just submit */ - iRet = doSubmitToActionQ(pAction, pAction->f_pMsg); - } + iRet = doSubmitToActionQ(pAction, pAction->f_pMsg); if(iRet == RS_RET_OK) pAction->f_prevcount = 0; /* message processed, so we start a new cycle */ @@ -1413,7 +1420,7 @@ doActionCallAction(action_t *pAction, batch_t *pBatch, int idxBtch) * isolated messages), but back off so we'll flush less often in the future. */ if(getActNow(pAction) > REPEATTIME(pAction)) { - iRet = actionWriteToAction(pAction, pBatch, idxBtch); + iRet = actionWriteToAction(pAction); BACKOFF(pAction); } } else {/* new message, save it */ @@ -1422,7 +1429,7 @@ doActionCallAction(action_t *pAction, batch_t *pBatch, int idxBtch) */ if(pAction->f_pMsg != NULL) { if(pAction->f_prevcount > 0) - actionWriteToAction(pAction, pBatch, idxBtch); + actionWriteToAction(pAction); /* we do not care about iRet above - I think it's right but if we have * some troubles, you know where to look at ;) -- rgerhards, 2007-08-01 */ @@ -1430,7 +1437,7 @@ doActionCallAction(action_t *pAction, batch_t *pBatch, int idxBtch) } pAction->f_pMsg = MsgAddRef(pMsg); /* call the output driver */ - iRet = actionWriteToAction(pAction, pBatch, idxBtch); + iRet = actionWriteToAction(pAction); } finalize_it: @@ -1528,16 +1535,51 @@ static rsRetVal doSubmitToActionQBatch(action_t *pAction, batch_t *pBatch) { int i; + sbool bNeedSubmit; DEFiRet; + /* TODO + ich arbeite an dieser funktion, es müssen die verscheidenen modi geprüft werden. ausserdem + muss geschaut werden, in welche anderen funktionen die neue Funktionalität noch eingebaut + werden muss, bzw. ob man das an zentralerer stelle machen kann. Am besten die gesamte + filter evaluation nochmal druchgehen (also das füllen des arrays). + */ + DBGPRINTF("Called action(Batch), logging to %s\n", module.GetStateName(pAction->pMod)); - if(pAction->pQueue->qType == QUEUETYPE_DIRECT) - iRet = qqueueEnqObjDirectBatch(pAction->pQueue, pBatch); - else { /* in this case, we do single submits to the queue. + /* if necessary, take care of failover cases. We do this by simply + * changing the filter setting, which is perfectly legal. + */ + if(pAction->pQueue->qType == QUEUETYPE_DIRECT) { + /* note: for direct mode, we need to adjust the filter property. For non-direct + * this is not necessary, because in that case we enqueue only what actually needs + * to be processed. + */ + if(pAction->bExecWhenPrevSusp) { + bNeedSubmit = 0; + for(i = 0 ; i < batchNumMsgs(pBatch) && !*(pBatch->pbShutdownImmediate) ; ++i) { + if(!pBatch->pElem[i].bPrevWasSuspended) { + DBGPRINTF("action enq stage: change bFilterOK to 0 due to " + "failover case in elem %d\n", i); + pBatch->pElem[i].bFilterOK = 0; + } + if(pBatch->pElem[i].bFilterOK) + bNeedSubmit = 1; + } + if(bNeedSubmit) { + iRet = qqueueEnqObjDirectBatch(pAction->pQueue, pBatch); + } else { + DBGPRINTF("no need to submit batch, all bFilterOK==0\n"); + } + } else { + iRet = qqueueEnqObjDirectBatch(pAction->pQueue, pBatch); + } + } else { /* in this case, we do single submits to the queue. * TODO: optimize this, we may do at least a multi-submit! */ for(i = 0 ; i < batchNumMsgs(pBatch) && !*(pBatch->pbShutdownImmediate) ; ++i) { - if(pBatch->pElem[i].bFilterOK) { + if( pBatch->pElem[i].bFilterOK + && pBatch->pElem[i].state != BATCH_STATE_DISC + && (pAction->bExecWhenPrevSusp == 0 || pBatch->pElem[i].bPrevWasSuspended == 1)) { doSubmitToActionQ(pAction, (msg_t*)(pBatch->pElem[i].pUsrp)); } } @@ -1558,8 +1600,12 @@ helperSubmitToActionQComplexBatch(action_t *pAction, batch_t *pBatch) int i; DEFiRet; - DBGPRINTF("Called action(complex case), logging to %s\n", module.GetStateName(pAction->pMod)); + DBGPRINTF("Called action %p (complex case), logging to %s\n", + pAction, module.GetStateName(pAction->pMod)); for(i = 0 ; i < batchNumMsgs(pBatch) && !*(pBatch->pbShutdownImmediate) ; ++i) { + DBGPRINTF("action %p: filterOK:%d state:%d execWhenPrev:%d prevWasSusp:%d\n", + pAction, pBatch->pElem[i].bFilterOK, pBatch->pElem[i].state, + pAction->bExecWhenPrevSusp, pBatch->pElem[i].bPrevWasSuspended); if( pBatch->pElem[i].bFilterOK && pBatch->pElem[i].state != BATCH_STATE_DISC && ((pAction->bExecWhenPrevSusp == 0) || pBatch->pElem[i].bPrevWasSuspended) ) { diff --git a/action.h b/action.h index 0ab8062a..bae64d31 100644 --- a/action.h +++ b/action.h @@ -100,7 +100,7 @@ rsRetVal actionDestruct(action_t *pThis); rsRetVal actionDbgPrint(action_t *pThis); rsRetVal actionSetGlobalResumeInterval(int iNewVal); rsRetVal actionDoAction(action_t *pAction); -rsRetVal actionWriteToAction(action_t *pAction, batch_t *pBatch, int idxBtch); +rsRetVal actionWriteToAction(action_t *pAction); rsRetVal actionCallHUPHdlr(action_t *pAction); rsRetVal actionClassInit(void); rsRetVal addAction(action_t **ppAction, modInfo_t *pMod, void *pModData, omodStringRequest_t *pOMSR, int bSuspended); diff --git a/runtime/rule.c b/runtime/rule.c index d5f18e71..19239d61 100644 --- a/runtime/rule.c +++ b/runtime/rule.c @@ -266,6 +266,7 @@ static rsRetVal processBatch(rule_t *pThis, batch_t *pBatch) { int i; + rsRetVal localRet; DEFiRet; ISOBJ_TYPE_assert(pThis, rule); @@ -273,9 +274,14 @@ processBatch(rule_t *pThis, batch_t *pBatch) /* first check the filters and reset status variables */ for(i = 0 ; i < batchNumMsgs(pBatch) && !*(pBatch->pbShutdownImmediate) ; ++i) { - CHKiRet(shouldProcessThisMessage(pThis, (msg_t*)(pBatch->pElem[i].pUsrp), - &(pBatch->pElem[i].bFilterOK))); - // TODO: really abort on error? 2010-06-10 + localRet = shouldProcessThisMessage(pThis, (msg_t*)(pBatch->pElem[i].pUsrp), + &(pBatch->pElem[i].bFilterOK)); + if(localRet != RS_RET_OK) { + DBGPRINTF("processBatch: iRet %d returned from shouldProcessThisMessage, " + "ignoring message\n", localRet); + + pBatch->pElem[i].bFilterOK = 0; + } if(pBatch->pElem[i].bFilterOK) { /* re-init only when actually needed (cache write cost!) */ pBatch->pElem[i].bPrevWasSuspended = 0; diff --git a/tools/syslogd.c b/tools/syslogd.c index 487ab364..096f9309 100644 --- a/tools/syslogd.c +++ b/tools/syslogd.c @@ -799,7 +799,7 @@ DEFFUNC_llExecFunc(flushRptdMsgsActions) DBGPRINTF("flush %s: repeated %d times, %d sec.\n", module.GetStateName(pAction->pMod), pAction->f_prevcount, repeatinterval[pAction->f_repeatcount]); - actionWriteToAction(pAction, NULL, 0); + actionWriteToAction(pAction); BACKOFF(pAction); } UnlockObj(pAction); -- cgit From da52cbae520e747568162ad558bf01d40658c745 Mon Sep 17 00:00:00 2001 From: Rainer Gerhards Date: Thu, 16 Jun 2011 17:35:45 +0200 Subject: bugfix: timestamp was incorrectly calculated for timezones with minute offset closes: http://bugzilla.adiscon.com/show_bug.cgi?id=271 --- ChangeLog | 3 +++ runtime/datetime.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index a50c4ee0..b547f0ea 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ --------------------------------------------------------------------------- Version 3.22.4 [v3-stable] (rgerhards), 2010-??-?? +- bugfix: timestamp was incorrectly calculated for timezones with minute + offset + closes: http://bugzilla.adiscon.com/show_bug.cgi?id=271 - improved some code based on clang static analyzer results --------------------------------------------------------------------------- Version 3.22.3 [v3-stable] (rgerhards), 2010-11-24 diff --git a/runtime/datetime.c b/runtime/datetime.c index 20ca6191..bed33127 100644 --- a/runtime/datetime.c +++ b/runtime/datetime.c @@ -112,7 +112,7 @@ static void getCurrTime(struct syslogTime *t) else t->OffsetMode = '+'; t->OffsetHour = lBias / 3600; - t->OffsetMinute = lBias % 3600; + t->OffsetMinute = (lBias % 3600) / 60; } -- cgit From 257b06aac8222cdea231a95cbbe659679a2d417e Mon Sep 17 00:00:00 2001 From: Rainer Gerhards Date: Thu, 16 Jun 2011 17:52:16 +0200 Subject: failover problem was not totally solved, now (hopefully ;)) I overlooked a border case, which came up on a larger testbench run. --- action.c | 86 ++++++++++++++++++++++++++++++-------------- tests/manytcp-too-few-tls.sh | 6 +++- 2 files changed, 64 insertions(+), 28 deletions(-) diff --git a/action.c b/action.c index b1986c38..7909d8e3 100644 --- a/action.c +++ b/action.c @@ -1526,6 +1526,63 @@ finalize_it: } +/* enqueue a batch in direct mode. We have put this into its own function just to avoid + * cluttering the actual submit function. + * rgerhards, 2011-06-16 + */ +static inline rsRetVal +doQueueEnqObjDirectBatch(action_t *pAction, batch_t *pBatch) +{ + sbool FilterSave[1024]; + sbool *pFilterSave; + sbool bNeedSubmit; + sbool bModifiedFilter; + int i; + DEFiRet; + + if(batchNumMsgs(pBatch) <= (int) (sizeof(FilterSave)/sizeof(sbool))) { + pFilterSave = FilterSave; + } else { + CHKmalloc(pFilterSave = malloc(batchNumMsgs(pBatch) * sizeof(sbool))); + } + + /* note: for direct mode, we need to adjust the filter property. For non-direct + * this is not necessary, because in that case we enqueue only what actually needs + * to be processed. + */ + if(pAction->bExecWhenPrevSusp) { + bNeedSubmit = 0; + bModifiedFilter = 0; + for(i = 0 ; i < batchNumMsgs(pBatch) && !*(pBatch->pbShutdownImmediate) ; ++i) { + pFilterSave[i] = pBatch->pElem[i].bFilterOK; + if(!pBatch->pElem[i].bPrevWasSuspended) { + DBGPRINTF("action enq stage: change bFilterOK to 0 due to " + "failover case in elem %d\n", i); + pBatch->pElem[i].bFilterOK = 0; + bModifiedFilter = 1; + } + if(pBatch->pElem[i].bFilterOK) + bNeedSubmit = 1; + } + if(bNeedSubmit) { + iRet = qqueueEnqObjDirectBatch(pAction->pQueue, pBatch); + } else { + DBGPRINTF("no need to submit batch, all bFilterOK==0\n"); + } + if(bModifiedFilter) { + for(i = 0 ; i < batchNumMsgs(pBatch) ; ++i) { + /* note: clang static code analyzer reports a false positive below */ + pBatch->pElem[i].bFilterOK = pFilterSave[i]; + } + } + } else { + iRet = qqueueEnqObjDirectBatch(pAction->pQueue, pBatch); + } + +finalize_it: + RETiRet; +} + /* This submits the message to the action queue in case we do NOT need to handle repeat * message processing. That case permits us to gain lots of freedom during processing * and thus speed. @@ -1535,7 +1592,6 @@ static rsRetVal doSubmitToActionQBatch(action_t *pAction, batch_t *pBatch) { int i; - sbool bNeedSubmit; DEFiRet; /* TODO @@ -1546,33 +1602,9 @@ doSubmitToActionQBatch(action_t *pAction, batch_t *pBatch) */ DBGPRINTF("Called action(Batch), logging to %s\n", module.GetStateName(pAction->pMod)); - /* if necessary, take care of failover cases. We do this by simply - * changing the filter setting, which is perfectly legal. - */ + if(pAction->pQueue->qType == QUEUETYPE_DIRECT) { - /* note: for direct mode, we need to adjust the filter property. For non-direct - * this is not necessary, because in that case we enqueue only what actually needs - * to be processed. - */ - if(pAction->bExecWhenPrevSusp) { - bNeedSubmit = 0; - for(i = 0 ; i < batchNumMsgs(pBatch) && !*(pBatch->pbShutdownImmediate) ; ++i) { - if(!pBatch->pElem[i].bPrevWasSuspended) { - DBGPRINTF("action enq stage: change bFilterOK to 0 due to " - "failover case in elem %d\n", i); - pBatch->pElem[i].bFilterOK = 0; - } - if(pBatch->pElem[i].bFilterOK) - bNeedSubmit = 1; - } - if(bNeedSubmit) { - iRet = qqueueEnqObjDirectBatch(pAction->pQueue, pBatch); - } else { - DBGPRINTF("no need to submit batch, all bFilterOK==0\n"); - } - } else { - iRet = qqueueEnqObjDirectBatch(pAction->pQueue, pBatch); - } + iRet = doQueueEnqObjDirectBatch(pAction, pBatch); } else { /* in this case, we do single submits to the queue. * TODO: optimize this, we may do at least a multi-submit! */ diff --git a/tests/manytcp-too-few-tls.sh b/tests/manytcp-too-few-tls.sh index 899a87dc..970a572d 100755 --- a/tests/manytcp-too-few-tls.sh +++ b/tests/manytcp-too-few-tls.sh @@ -11,5 +11,9 @@ sleep 1 source $srcdir/diag.sh shutdown-when-empty # shut down rsyslogd when done processing messages source $srcdir/diag.sh wait-shutdown-vg # we need to wait until rsyslogd is finished! source $srcdir/diag.sh check-exit-vg -source $srcdir/diag.sh seq-check 0 39999 +# we do not do a seq check, as of the design of this test some messages +# will be lost. So there is no point in checking if all were received. The +# point is that we look at the valgrind result, to make sure we do not +# have a mem leak in those error cases (we had in the past, thus the test +# to prevent that in the future). source $srcdir/diag.sh exit -- cgit From 656740b66307af24d1318d389c3ef0e7095460bc Mon Sep 17 00:00:00 2001 From: Rainer Gerhards Date: Mon, 20 Jun 2011 15:01:11 +0200 Subject: bugfix: problems in failover action handling closes: http://bugzilla.adiscon.com/show_bug.cgi?id=254 --- ChangeLog | 3 ++- action.c | 20 ++++++++++++-------- runtime/rule.c | 1 - 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4d1fe09f..6cdfb670 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,8 @@ --------------------------------------------------------------------------- Version 5.8.2 [V5-stable] (rgerhards), 2011-06-?? - bugfix: problems in failover action handling - closes: http://bugzilla.adiscon.com/show_bug.cgi?id=270 (not yet confirmed!) + closes: http://bugzilla.adiscon.com/show_bug.cgi?id=270 + closes: http://bugzilla.adiscon.com/show_bug.cgi?id=254 - bugfix: memory leak in imtcp & subsystems under some circumstances This leak is tied to error conditions which lead to incorrect cleanup of some data structures. [backport from v6] diff --git a/action.c b/action.c index 7909d8e3..54a05fc3 100644 --- a/action.c +++ b/action.c @@ -979,11 +979,13 @@ dbgprintf("XXXXX: tryDoAction %p, pnElem %d, nElem %d\n", pAction, *pnElem, if(localRet == RS_RET_OK) { /* mark messages as committed */ while(iCommittedUpTo <= i) { + pBatch->pElem[iCommittedUpTo].bPrevWasSuspended = 0; /* we had success! */ pBatch->pElem[iCommittedUpTo++].state = BATCH_STATE_COMM; } } else if(localRet == RS_RET_PREVIOUS_COMMITTED) { /* mark messages as committed */ while(iCommittedUpTo < i) { + pBatch->pElem[iCommittedUpTo].bPrevWasSuspended = 0; /* we had success! */ pBatch->pElem[iCommittedUpTo++].state = BATCH_STATE_COMM; } pBatch->pElem[i].state = BATCH_STATE_SUB; @@ -1563,6 +1565,9 @@ doQueueEnqObjDirectBatch(action_t *pAction, batch_t *pBatch) } if(pBatch->pElem[i].bFilterOK) bNeedSubmit = 1; + DBGPRINTF("action %p[%d]: filterOK:%d state:%d execWhenPrev:%d prevWasSusp:%d\n", + pAction, i, pBatch->pElem[i].bFilterOK, pBatch->pElem[i].state, + pAction->bExecWhenPrevSusp, pBatch->pElem[i].bPrevWasSuspended); } if(bNeedSubmit) { iRet = qqueueEnqObjDirectBatch(pAction->pQueue, pBatch); @@ -1571,6 +1576,9 @@ doQueueEnqObjDirectBatch(action_t *pAction, batch_t *pBatch) } if(bModifiedFilter) { for(i = 0 ; i < batchNumMsgs(pBatch) ; ++i) { + DBGPRINTF("action %p: filterOK:%d state:%d execWhenPrev:%d prevWasSusp:%d\n", + pAction, pBatch->pElem[i].bFilterOK, pBatch->pElem[i].state, + pAction->bExecWhenPrevSusp, pBatch->pElem[i].bPrevWasSuspended); /* note: clang static code analyzer reports a false positive below */ pBatch->pElem[i].bFilterOK = pFilterSave[i]; } @@ -1594,21 +1602,17 @@ doSubmitToActionQBatch(action_t *pAction, batch_t *pBatch) int i; DEFiRet; - /* TODO - ich arbeite an dieser funktion, es müssen die verscheidenen modi geprüft werden. ausserdem - muss geschaut werden, in welche anderen funktionen die neue Funktionalität noch eingebaut - werden muss, bzw. ob man das an zentralerer stelle machen kann. Am besten die gesamte - filter evaluation nochmal druchgehen (also das füllen des arrays). - */ - DBGPRINTF("Called action(Batch), logging to %s\n", module.GetStateName(pAction->pMod)); if(pAction->pQueue->qType == QUEUETYPE_DIRECT) { iRet = doQueueEnqObjDirectBatch(pAction, pBatch); - } else { /* in this case, we do single submits to the queue. + } else {/* in this case, we do single submits to the queue. * TODO: optimize this, we may do at least a multi-submit! */ for(i = 0 ; i < batchNumMsgs(pBatch) && !*(pBatch->pbShutdownImmediate) ; ++i) { + DBGPRINTF("action %p: filterOK:%d state:%d execWhenPrev:%d prevWasSusp:%d\n", + pAction, pBatch->pElem[i].bFilterOK, pBatch->pElem[i].state, + pAction->bExecWhenPrevSusp, pBatch->pElem[i].bPrevWasSuspended); if( pBatch->pElem[i].bFilterOK && pBatch->pElem[i].state != BATCH_STATE_DISC && (pAction->bExecWhenPrevSusp == 0 || pBatch->pElem[i].bPrevWasSuspended == 1)) { diff --git a/runtime/rule.c b/runtime/rule.c index 19239d61..0776e2dc 100644 --- a/runtime/rule.c +++ b/runtime/rule.c @@ -279,7 +279,6 @@ processBatch(rule_t *pThis, batch_t *pBatch) if(localRet != RS_RET_OK) { DBGPRINTF("processBatch: iRet %d returned from shouldProcessThisMessage, " "ignoring message\n", localRet); - pBatch->pElem[i].bFilterOK = 0; } if(pBatch->pElem[i].bFilterOK) { -- cgit From 9fcc6b7285c316b97d4178c9e264d6de432f88b6 Mon Sep 17 00:00:00 2001 From: Rainer Gerhards Date: Mon, 20 Jun 2011 15:57:15 +0200 Subject: testbench: added new tests for failover cases --- tests/Makefile.am | 11 ++++++ tests/diag.sh | 4 +-- tests/failover-async.sh | 2 +- tests/failover-double.sh | 12 +++++++ tests/sndrcv_drvr.sh | 50 +--------------------------- tests/sndrcv_drvr_noexit.sh | 49 +++++++++++++++++++++++++++ tests/sndrcv_failover.sh | 21 ++++++++++++ tests/testsuites/failover-double.conf | 9 +++++ tests/testsuites/sndrcv_failover_rcvr.conf | 11 ++++++ tests/testsuites/sndrcv_failover_sender.conf | 13 ++++++++ 10 files changed, 130 insertions(+), 52 deletions(-) create mode 100755 tests/failover-double.sh create mode 100755 tests/sndrcv_drvr_noexit.sh create mode 100755 tests/sndrcv_failover.sh create mode 100644 tests/testsuites/failover-double.conf create mode 100644 tests/testsuites/sndrcv_failover_rcvr.conf create mode 100644 tests/testsuites/sndrcv_failover_sender.conf diff --git a/tests/Makefile.am b/tests/Makefile.am index 3ee69413..eec7b16b 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -18,6 +18,7 @@ TESTS += \ imtcp_conndrop.sh \ imtcp_addtlframedelim.sh \ sndrcv.sh \ + sndrcv_failover.sh \ sndrcv_gzip.sh \ sndrcv_udp.sh \ sndrcv_udp_nonstdpt.sh \ @@ -52,6 +53,8 @@ TESTS += \ discard-rptdmsg.sh \ discard-allmark.sh \ discard.sh \ + failover-async.sh \ + failover-double.sh \ failover-basic.sh \ failover-rptd.sh \ failover-no-rptd.sh \ @@ -285,6 +288,10 @@ EXTRA_DIST= 1.rstest 2.rstest 3.rstest err1.rstest \ failover-basic.sh \ failover-basic-vg.sh \ testsuites/failover-basic.conf \ + failover-async.sh \ + testsuites/failover-async.conf \ + failover-double.sh \ + testsuites/failover-double.conf \ discard-rptdmsg.sh \ discard-rptdmsg-vg.sh \ testsuites/discard-rptdmsg.conf \ @@ -307,6 +314,10 @@ EXTRA_DIST= 1.rstest 2.rstest 3.rstest err1.rstest \ threadingmqaq.sh \ testsuites/threadingmqaq.conf \ sndrcv_drvr.sh \ + sndrcv_drvr_noexit.sh \ + sndrcv_failover.sh \ + testsuites/sndrcv_failover_sender.conf \ + testsuites/sndrcv_failover_rcvr.conf \ sndrcv.sh \ testsuites/sndrcv_sender.conf \ testsuites/sndrcv_rcvr.conf \ diff --git a/tests/diag.sh b/tests/diag.sh index d1242fb1..7651b1de 100755 --- a/tests/diag.sh +++ b/tests/diag.sh @@ -22,7 +22,7 @@ case $1 in rm -f work rsyslog.out.log rsyslog2.out.log rsyslog.out.log.save # common work files rm -rf test-spool test-logdir rm -f rsyslog.out.*.log work-presort rsyslog.pipe - rm -f rsyslog.input + rm -f rsyslog.input rsyslog.empty rm -f core.* vgcore.* mkdir test-spool ;; @@ -31,7 +31,7 @@ case $1 in rm -f work rsyslog.out.log rsyslog2.out.log rsyslog.out.log.save # common work files rm -rf test-spool test-logdir rm -f rsyslog.out.*.log rsyslog.random.data work-presort rsyslog.pipe - rm -f rsyslog.input stat-file1 + rm -f rsyslog.input stat-file1 #rsyslog.empty echo ------------------------------------------------------------------------------- ;; 'startup') # start rsyslogd with default params. $2 is the config file name to use diff --git a/tests/failover-async.sh b/tests/failover-async.sh index f17bc0ff..5fc393de 100755 --- a/tests/failover-async.sh +++ b/tests/failover-async.sh @@ -3,7 +3,7 @@ echo =========================================================================== echo \[failover-async.sh\]: async test for failover functionality source $srcdir/diag.sh init source $srcdir/diag.sh startup failover-async.conf -source $srcdir/diag.sh injectmsg 0 5 +source $srcdir/diag.sh injectmsg 0 5000 echo doing shutdown source $srcdir/diag.sh shutdown-when-empty echo wait on shutdown diff --git a/tests/failover-double.sh b/tests/failover-double.sh new file mode 100755 index 00000000..172b91de --- /dev/null +++ b/tests/failover-double.sh @@ -0,0 +1,12 @@ +# This file is part of the rsyslog project, released under GPLv3 +echo =============================================================================== +echo \[failover-double.sh\]: test for double failover functionality +source $srcdir/diag.sh init +source $srcdir/diag.sh startup failover-double.conf +source $srcdir/diag.sh injectmsg 0 5000 +echo doing shutdown +source $srcdir/diag.sh shutdown-when-empty +echo wait on shutdown +source $srcdir/diag.sh wait-shutdown +source $srcdir/diag.sh seq-check 0 4999 +source $srcdir/diag.sh exit diff --git a/tests/sndrcv_drvr.sh b/tests/sndrcv_drvr.sh index 3d613069..f9092647 100755 --- a/tests/sndrcv_drvr.sh +++ b/tests/sndrcv_drvr.sh @@ -1,50 +1,2 @@ -# This is test driver for testing two rsyslog instances. It can be -# utilized by any test that just needs two instances with different -# config files, where messages are injected in instance TWO and -# (with whatever rsyslog mechanism) being relayed over to instance ONE, -# where they are written to the log file. After the run, the completeness -# of that log file is checked. -# The code is almost the same, but the config files differ (probably greatly) -# for different test cases. As such, this driver needs to be called with the -# config file name ($2). From that name, the sender and receiver config file -# names are automatically generated. -# So: $1 config file name, $2 number of messages -# -# A note on TLS testing: the current testsuite (in git!) already contains -# TLS test cases. However, getting these test cases correct is not simple. -# That's not a problem with the code itself, but rater a problem with -# synchronization in the test environment. So I have deciced to keep the -# TLS tests in, but not yet actually utilize them. This is most probably -# left as an excercise for future (devel) releases. -- rgerhards, 2009-11-11 -# -# added 2009-11-11 by Rgerhards -# This file is part of the rsyslog project, released under GPLv3 -# uncomment for debugging support: -source $srcdir/diag.sh init -# start up the instances -#export RSYSLOG_DEBUG="debug nostdout noprintmutexaction" -#export RSYSLOG_DEBUGLOG="log" -source $srcdir/diag.sh startup $1_rcvr.conf -source $srcdir/diag.sh wait-startup -#export RSYSLOG_DEBUGLOG="log2" -#valgrind="valgrind" -source $srcdir/diag.sh startup $1_sender.conf 2 -source $srcdir/diag.sh wait-startup 2 -# may be needed by TLS (once we do it): sleep 30 - -# now inject the messages into instance 2. It will connect to instance 1, -# and that instance will record the data. -source $srcdir/diag.sh tcpflood -m$2 -i1 -sleep 2 # make sure all data is received in input buffers -# shut down sender when everything is sent, receiver continues to run concurrently -# may be needed by TLS (once we do it): sleep 60 -source $srcdir/diag.sh shutdown-when-empty 2 -source $srcdir/diag.sh wait-shutdown 2 -# now it is time to stop the receiver as well -source $srcdir/diag.sh shutdown-when-empty -source $srcdir/diag.sh wait-shutdown - -# may be needed by TLS (once we do it): sleep 60 -# do the final check -source $srcdir/diag.sh seq-check 1 $2 +source $srcdir/sndrcv_drvr_noexit.sh $1 $2 source $srcdir/diag.sh exit diff --git a/tests/sndrcv_drvr_noexit.sh b/tests/sndrcv_drvr_noexit.sh new file mode 100755 index 00000000..899eace3 --- /dev/null +++ b/tests/sndrcv_drvr_noexit.sh @@ -0,0 +1,49 @@ +# This is test driver for testing two rsyslog instances. It can be +# utilized by any test that just needs two instances with different +# config files, where messages are injected in instance TWO and +# (with whatever rsyslog mechanism) being relayed over to instance ONE, +# where they are written to the log file. After the run, the completeness +# of that log file is checked. +# The code is almost the same, but the config files differ (probably greatly) +# for different test cases. As such, this driver needs to be called with the +# config file name ($2). From that name, the sender and receiver config file +# names are automatically generated. +# So: $1 config file name, $2 number of messages +# +# A note on TLS testing: the current testsuite (in git!) already contains +# TLS test cases. However, getting these test cases correct is not simple. +# That's not a problem with the code itself, but rater a problem with +# synchronization in the test environment. So I have deciced to keep the +# TLS tests in, but not yet actually utilize them. This is most probably +# left as an excercise for future (devel) releases. -- rgerhards, 2009-11-11 +# +# added 2009-11-11 by Rgerhards +# This file is part of the rsyslog project, released under GPLv3 +# uncomment for debugging support: +source $srcdir/diag.sh init +# start up the instances +#export RSYSLOG_DEBUG="debug nostdout noprintmutexaction" +#export RSYSLOG_DEBUGLOG="log" +source $srcdir/diag.sh startup $1_rcvr.conf +source $srcdir/diag.sh wait-startup +#export RSYSLOG_DEBUGLOG="log2" +#valgrind="valgrind" +source $srcdir/diag.sh startup $1_sender.conf 2 +source $srcdir/diag.sh wait-startup 2 +# may be needed by TLS (once we do it): sleep 30 + +# now inject the messages into instance 2. It will connect to instance 1, +# and that instance will record the data. +source $srcdir/diag.sh tcpflood -m$2 -i1 +sleep 2 # make sure all data is received in input buffers +# shut down sender when everything is sent, receiver continues to run concurrently +# may be needed by TLS (once we do it): sleep 60 +source $srcdir/diag.sh shutdown-when-empty 2 +source $srcdir/diag.sh wait-shutdown 2 +# now it is time to stop the receiver as well +source $srcdir/diag.sh shutdown-when-empty +source $srcdir/diag.sh wait-shutdown + +# may be needed by TLS (once we do it): sleep 60 +# do the final check +source $srcdir/diag.sh seq-check 1 $2 diff --git a/tests/sndrcv_failover.sh b/tests/sndrcv_failover.sh new file mode 100755 index 00000000..4c5e1831 --- /dev/null +++ b/tests/sndrcv_failover.sh @@ -0,0 +1,21 @@ +# This tests failover capabilities. Data is sent to local port 13516, where +# no process shall listen. Then it fails over to a second instance, then to +# a file. The second instance is started. So all data should be received +# there and none be logged to the file. +# This builds on the basic sndrcv.sh test, but adds a first, failing, +# location to the conf file. +# added 2011-06-20 by Rgerhards +# This file is part of the rsyslog project, released under GPLv3 +echo =============================================================================== +echo \[sndrcv_failover.sh\]: testing failover capabilities for tcp sending +source $srcdir/sndrcv_drvr_noexit.sh sndrcv_failover 50000 +ls -l rsyslog.empty +if [[ -s rsyslog.empty ]] ; then + echo "FAIL: rsyslog.empty has data. Failover handling failed. Data is written" + echo " even though the previous action (in a failover chain!) properly" + echo " worked." + exit 1 +else + echo "rsyslog.empty is empty - OK" +fi ; +source $srcdir/diag.sh exit diff --git a/tests/testsuites/failover-double.conf b/tests/testsuites/failover-double.conf new file mode 100644 index 00000000..a9991321 --- /dev/null +++ b/tests/testsuites/failover-double.conf @@ -0,0 +1,9 @@ +$IncludeConfig diag-common.conf + +$template outfmt,"%msg:F,58:2%\n" + +:msg, contains, "msgnum:" @@127.0.0.1:13516 +$ActionExecOnlyWhenPreviousIsSuspended on +& @@127.0.0.1:1234 +& ./rsyslog.out.log;outfmt +$ActionExecOnlyWhenPreviousIsSuspended off diff --git a/tests/testsuites/sndrcv_failover_rcvr.conf b/tests/testsuites/sndrcv_failover_rcvr.conf new file mode 100644 index 00000000..6f7ce34b --- /dev/null +++ b/tests/testsuites/sndrcv_failover_rcvr.conf @@ -0,0 +1,11 @@ +# see equally-named shell file for details +# rgerhards, 2009-11-11 +$IncludeConfig diag-common.conf + +$ModLoad ../plugins/imtcp/.libs/imtcp +# then SENDER sends to this port (not tcpflood!) +$InputTCPServerRun 13515 + +$template outfmt,"%msg:F,58:2%\n" +$template dynfile,"rsyslog.out.log" # trick to use relative path names! +:msg, contains, "msgnum:" ?dynfile;outfmt diff --git a/tests/testsuites/sndrcv_failover_sender.conf b/tests/testsuites/sndrcv_failover_sender.conf new file mode 100644 index 00000000..b8e7c186 --- /dev/null +++ b/tests/testsuites/sndrcv_failover_sender.conf @@ -0,0 +1,13 @@ +# see tcpsndrcv.sh for details +# rgerhards, 2009-11-11 +$IncludeConfig diag-common2.conf + +$ModLoad ../plugins/imtcp/.libs/imtcp +# this listener is for message generation by the test framework! +$InputTCPServerRun 13514 + +*.* @@127.0.0.1:13516 # this must be DEAD +$ActionExecOnlyWhenPreviousIsSuspended on +& @@127.0.0.1:13515 +& ./rsyslog.empty +$ActionExecOnlyWhenPreviousIsSuspended off -- cgit From de33ec026e505658d613b372ea9d32656ebb86d8 Mon Sep 17 00:00:00 2001 From: Rainer Gerhards Date: Tue, 21 Jun 2011 10:03:47 +0200 Subject: preparing for 5.8.2 --- ChangeLog | 2 +- configure.ac | 2 +- doc/manual.html | 2 +- tests/Makefile.am | 3 ++- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 6cdfb670..9cef03e6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,5 @@ --------------------------------------------------------------------------- -Version 5.8.2 [V5-stable] (rgerhards), 2011-06-?? +Version 5.8.2 [V5-stable] (rgerhards), 2011-06-21 - bugfix: problems in failover action handling closes: http://bugzilla.adiscon.com/show_bug.cgi?id=270 closes: http://bugzilla.adiscon.com/show_bug.cgi?id=254 diff --git a/configure.ac b/configure.ac index 1faaff89..5873c661 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ(2.61) -AC_INIT([rsyslog],[5.8.1],[rsyslog@lists.adiscon.com]) +AC_INIT([rsyslog],[5.8.2],[rsyslog@lists.adiscon.com]) AM_INIT_AUTOMAKE m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) diff --git a/doc/manual.html b/doc/manual.html index 5c656752..db9eb907 100644 --- a/doc/manual.html +++ b/doc/manual.html @@ -19,7 +19,7 @@ rsyslog support available directly from the source!

    Please visit the rsyslog sponsor's page to honor the project sponsors or become one yourself! We are very grateful for any help towards the project goals.

    -

    This documentation is for version 5.8.1 (stable branch) of rsyslog. +

    This documentation is for version 5.8.2 (stable branch) of rsyslog. Visit the rsyslog status page to obtain current version information and project status.

    If you like rsyslog, you might diff --git a/tests/Makefile.am b/tests/Makefile.am index eec7b16b..77381d2f 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -101,7 +101,8 @@ endif if ENABLE_GNUTLS if HAVE_VALGRIND -TESTS += manytcp-too-few-tls.sh +# This test does not work on v5 as we keep DH params +#TESTS += manytcp-too-few-tls.sh endif endif -- cgit From 86225089f2d0e82deb368e1688464e8ba84d24cf Mon Sep 17 00:00:00 2001 From: Rainer Gerhards Date: Tue, 21 Jun 2011 12:35:14 +0200 Subject: bugfix: mutex was invalidly left unlocked during action processing At least one case where this can occur is during thread shutdown, which may be initiated by lower activity. In most cases, this is quite unlikely to happen. However, if it does, data structures may be corrupted which could lead to fatal failure and segfault. I detected this via a testbench test, not a user report. But I assume that some users may have had unreproducable aborts that were cause by this bug. --- ChangeLog | 7 +++++++ runtime/queue.c | 12 ++++++++---- tests/diag.sh | 2 +- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/ChangeLog b/ChangeLog index 9cef03e6..e438ac61 100644 --- a/ChangeLog +++ b/ChangeLog @@ -3,6 +3,13 @@ Version 5.8.2 [V5-stable] (rgerhards), 2011-06-21 - bugfix: problems in failover action handling closes: http://bugzilla.adiscon.com/show_bug.cgi?id=270 closes: http://bugzilla.adiscon.com/show_bug.cgi?id=254 +- bugfix: mutex was invalidly left unlocked during action processing + At least one case where this can occur is during thread shutdown, which + may be initiated by lower activity. In most cases, this is quite + unlikely to happen. However, if it does, data structures may be + corrupted which could lead to fatal failure and segfault. I detected + this via a testbench test, not a user report. But I assume that some + users may have had unreproducable aborts that were cause by this bug. - bugfix: memory leak in imtcp & subsystems under some circumstances This leak is tied to error conditions which lead to incorrect cleanup of some data structures. [backport from v6] diff --git a/runtime/queue.c b/runtime/queue.c index 88e01a7a..00eb76c7 100644 --- a/runtime/queue.c +++ b/runtime/queue.c @@ -1678,6 +1678,7 @@ static rsRetVal ConsumerReg(qqueue_t *pThis, wti_t *pWti) { int iCancelStateSave; + int bNeedReLock = 0; /**< do we need to lock the mutex again? */ DEFiRet; ISOBJ_TYPE_assert(pThis, qqueue); @@ -1687,6 +1688,7 @@ ConsumerReg(qqueue_t *pThis, wti_t *pWti) /* we now have a non-idle batch of work, so we can release the queue mutex and process it */ d_pthread_mutex_unlock(pThis->mut); + bNeedReLock = 1; /* at this spot, we may be cancelled */ pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &iCancelStateSave); @@ -1706,12 +1708,14 @@ ConsumerReg(qqueue_t *pThis, wti_t *pWti) /* but now cancellation is no longer permitted */ pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave); - /* now we are done, but need to re-aquire the mutex */ - d_pthread_mutex_lock(pThis->mut); - finalize_it: - dbgprintf("regular consumer finished, iret=%d, szlog %d sz phys %d\n", iRet, + DBGPRINTF("regular consumer finished, iret=%d, szlog %d sz phys %d\n", iRet, getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis)); + + /* now we are done, but potentially need to re-aquire the mutex */ + if(bNeedReLock) + d_pthread_mutex_lock(pThis->mut); + RETiRet; } diff --git a/tests/diag.sh b/tests/diag.sh index 7651b1de..617118ba 100755 --- a/tests/diag.sh +++ b/tests/diag.sh @@ -10,7 +10,7 @@ #valgrind="valgrind --tool=helgrind --log-fd=1" #valgrind="valgrind --tool=exp-ptrcheck --log-fd=1" #set -o xtrace -#export RSYSLOG_DEBUG="debug nologfuncflow nostdout" +#export RSYSLOG_DEBUG="debug nologfuncflow noprintmutexaction stdout" #export RSYSLOG_DEBUGLOG="log" case $1 in 'init') $srcdir/killrsyslog.sh # kill rsyslogd if it runs for some reason -- cgit