85 files changed, 4769 insertions, 1770 deletions
@@ -1,19 +1,14 @@ -Rainer Gerhards <rgerhards@adiscon.com>, Adiscon GmbH -Michael Meckelein <mmeckelein@hq.adiscon.com>, Adiscon GmbH +Thankfully, we have had so many contributions that maintaining the +AUTHORS file would be a big task in itself. On the other hand, we +now use git and I make sure that each author receives proper credit +for patches I receive. -Contributors -Michael Biebl - - helped continously with autotools - - provided numerous advise on how to do things under Linux - - provided excellent advise in many, many cases -Andres Riancho (andres-dot-riancho-at-gmail-dot-com) - (alias APR in code files) - - supplied regexp functionality for the property replacer - a great feature. - thanks! -Bjoern Kalkbrenner - - provided code for the "execute shell script" action -Peter Vrabec - - provided IPv6-enabling code -varmojfekoj - - helped with a variety of things and, most importantly, contributed - the gssapi functionality +So rather than trying to reproduce the git author log here (and +often making mistakes in that), I invite you to check the git logs. +You can also do this online at + +http://git.adiscon.com/?p=rsyslog.git;a=summary + +Rainer Gerhards +<rgerhards@adiscon.com> +lead rsyslog developer @@ -1,4 +1,133 @@ --------------------------------------------------------------------------- +Version 5.1.6 [v5-beta] (rgerhards), 2009-10-15 +- feature imports from v4.5.6 +- bugfix: potential race condition when queue worker threads were + terminated +- bugfix: solved potential (temporary) stall of messages when the queue was + almost empty and few new data added (caused testbench to sometimes hang!) +- fixed some race condition in testbench +- added more elaborate diagnostics to parts of the testbench +- bugfixes imported from 4.5.4: + * bugfix: potential segfault in stream writer on destruction + * bugfix: potential race in object loader (obj.c) during use/release + * bugfixes: potential problems in out file zip writer +- included some important fixes from 4.4.2: + * bugfix: invalid handling of zero-sized messages + * bugfix: zero-sized UDP messages are no longer processed + * bugfix: random data could be appended to message + * bugfix: reverse lookup reduction logic in imudp do DNS queries too often +--------------------------------------------------------------------------- +Version 5.1.5 [v5-beta] (rgerhards), 2009-09-11 +- added new config option $ActionWriteAllMarkMessages + this option permites to process mark messages under all circumstances, + even if an action was recently called. This can be useful to use mark + messages as a kind of heartbeat. +- bugfix: hostnames with dashes in them were incorrectly treated as + malformed, thus causing them to be treated as TAG (this was a regression + introduced from the "rfc3164 strict" change in 4.5.0). Testbench has been + updated to include a smaple message with a hostname containing a dash. +- bugfix: strings improperly reused, resulting in some message properties + be populated with strings from previous messages. This was caused by + an improper predicate check. +--------------------------------------------------------------------------- +Version 5.1.4 [DEVEL] (rgerhards), 2009-08-20 +- legacy syslog parser changed so that it now accepts date stamps in + wrong case. Some devices seem to create them and I do not see any harm + in supporting that. +- added $InputTCPMaxListeners directive - permits to specify how many + TCP servers shall be possible (default is 20). +- bugfix: memory leak with some input modules. 
Those inputs that + use parseAndSubmitMsg() leak two small memory blocks with every message. + Typically, those process only relatively few messages, so the issue + does most probably not have any effect in practice. +- bugfix: if tcp listen port could not be created, no error message was + emitted +- bugfix: discard action did not work (did not discard messages) +- bugfix: discard action caused segfault +- bugfix: potential segfault in output file writer (omfile) + In async write mode, we use modular arithmetic to index the output + buffer array. However, the counter variables accidently were signed, + thus resulting in negative indizes after integer overflow. That in turn + could lead to segfaults, but was depending on the memory layout of + the instance in question (which in turn depended on a number of + variables, like compile settings but also configuration). The counters + are now unsigned (as they always should have been) and so the dangling + mis-indexing does no longer happen. This bug potentially affected all + installations, even if only some may actually have seen a segfault. +--------------------------------------------------------------------------- +Version 5.1.3 [DEVEL] (rgerhards), 2009-07-28 +- architecture change: queue now always has at least one worker thread + if not running in direct mode. Previous versions could run without + any active workers. This simplifies the code at a very small expense. + See v5 compatibility note document for more in-depth discussion. +- enhance: UDP spoofing supported via new output module omudpspoof + See the omudpspoof documentation for details and samples +- bugfix: message could be truncated after TAG, often when forwarding + This was a result of an internal processing error if maximum field + sizes had been specified in the property replacer. +- bugfix: minor static memory leak while reading configuration + did NOT leak based on message volume +- internal: added ability to terminate input modules not via pthread_cancel + but an alternate approach via pthread_kill. This is somewhat safer as we + do not need to think about the cancel-safeness of all libraries we use. + However, not all inputs can easily supported, so this now is a feature + that can be requested by the input module (the most important ones + request it). +--------------------------------------------------------------------------- +Version 5.1.2 [DEVEL] (rgerhards), 2009-07-08 +- bugfix: properties inputname, fromhost, fromhost-ip, msg were lost when + working with disk queues +- some performance enhancements +- bugfix: abort condition when RecvFrom was not set and message reduction + was on. Happend e.g. with imuxsock. +- added $klogConsoleLogLevel directive which permits to set a new + console log level while rsyslog is active +- some internal code cleanup +--------------------------------------------------------------------------- +Version 5.1.1 [DEVEL] (rgerhards), 2009-07-03 +- bugfix: huge memory leak in queue engine (made rsyslogd unusable in + production). 
Occured if at least one queue was in direct mode + (the default for action queues) +- imported many performance optimizations from v4-devel (4.5.0) +- bugfix: subtle (and usually irrelevant) issue in timout processing + timeout could be one second too early if nanoseconds wrapped +- set a more sensible timeout for shutdow, now 1.5 seconds to complete + processing (this also removes those cases where the shutdown message + was not written because the termination happened before it) +--------------------------------------------------------------------------- +Version 5.1.0 [DEVEL] (rgerhards), 2009-05-29 + +*********************************** NOTE ********************************** +The v5 versions of rsyslog feature a greatly redesigned queue engine. The +major theme for the v5 release is twofold: + +a) greatly improved performance +b) enable audit-grade processing + +Here, audit-grade processing means that rsyslog, if used together with +audit-grade transports and configured correctly, will never lose messages +that already have been acknowledged, not even in fatal failure cases like +sudden loss of power. + +Note that large parts of rsyslog's important core components have been +restructured to support these design goals. As such, early versions of +the engine will probably be less stable than the v3/v4 engine. + +Also note that the initial versions do not cover all and everything. As +usual, the code will evolve toward the final goal as version numbers +increase. +*********************************** NOTE ********************************** + +- redesigned queue engine so that it supports ultra-reliable operations + This resulted in a rewrite of large parts. The new capability can be + used to build audit-grade systems on the basis of rsyslog. +- added $MainMsgQueueDequeueBatchSize and $ActionQueueDequeueBatchSize + configuration directives +- implemented a new transactional output module interface which provides + superior performance (for databases potentially far superior performance) +- increased ompgsql performance by adapting to new transactional + output module interface +--------------------------------------------------------------------------- Version 4.5.6 [v4-beta] (rgerhards), 2009-09-?? - bugfix(minor): diag function returned wrong queue memeber count for the main queue if an active DA queue existed. This had no relevance diff --git a/Makefile.am b/Makefile.am index a050e95e..7ab48455 100644 --- a/Makefile.am +++ b/Makefile.am @@ -95,6 +95,10 @@ if ENABLE_OMSTDOUT SUBDIRS += plugins/omstdout endif +if ENABLE_OMUDPSPOOF +SUBDIRS += plugins/omudpspoof +endif + if ENABLE_OMTEMPLATE SUBDIRS += plugins/omtemplate endif @@ -42,11 +42,15 @@ #include "cfsysline.h" #include "srUtils.h" #include "errmsg.h" +#include "batch.h" +#include "wti.h" #include "datetime.h" #include "unicode-helper.h" +#define NO_TIME_PROVIDED 0 /* indicate we do not provide any cached time */ + /* forward definitions */ -rsRetVal actionCallDoAction(action_t *pAction, msg_t *pMsg); +static rsRetVal processBatchMain(action_t *pAction, batch_t *pBatch); /* object static data (once for all instances) */ /* TODO: make this an object! DEFobjStaticHelpers -- rgerhards, 2008-03-05 */ @@ -61,10 +65,12 @@ static int glbliActionResumeInterval = 30; int glbliActionResumeRetryCount = 0; /* how often should suspended actions be retried? */ static int bActionRepMsgHasMsg = 0; /* last messsage repeated... 
has msg fragment in it */ +static int bActionWriteAllMarkMsgs = FALSE; /* should all mark messages be unconditionally written? */ static uchar *pszActionName; /* short name for the action */ -/* main message queue and its configuration parameters */ +/* action queue and its configuration parameters */ static queueType_t ActionQueType = QUEUETYPE_DIRECT; /* type of the main message queue above */ static int iActionQueueSize = 1000; /* size of the main message queue above */ +static int iActionQueueDeqBatchSize = 16; /* batch size for action queues */ static int iActionQHighWtrMark = 800; /* high water mark for disk-assisted queues */ static int iActionQLowWtrMark = 200; /* low water mark for disk-assisted queues */ static int iActionQDiscardMark = 9800; /* begin to discard messages */ @@ -146,6 +152,7 @@ actionResetQueueParams(void) ActionQueType = QUEUETYPE_DIRECT; /* type of the main message queue above */ iActionQueueSize = 1000; /* size of the main message queue above */ + iActionQueueDeqBatchSize = 16; /* default batch size */ iActionQHighWtrMark = 800; /* high water mark for disk-assisted queues */ iActionQLowWtrMark = 200; /* low water mark for disk-assisted queues */ iActionQDiscardMark = 9800; /* begin to discard messages */ @@ -283,7 +290,8 @@ actionConstructFinalize(action_t *pThis) * to be run on multiple threads. So far, this is forbidden by the interface * spec. -- rgerhards, 2008-01-30 */ - CHKiRet(qqueueConstruct(&pThis->pQueue, ActionQueType, 1, iActionQueueSize, (rsRetVal (*)(void*,void*))actionCallDoAction)); + CHKiRet(qqueueConstruct(&pThis->pQueue, ActionQueType, 1, iActionQueueSize, + (rsRetVal (*)(void*, batch_t*))processBatchMain)); obj.SetName((obj_t*) pThis->pQueue, pszQName); /* ... set some properties ... */ @@ -298,6 +306,7 @@ actionConstructFinalize(action_t *pThis) qqueueSetpUsr(pThis->pQueue, pThis); setQPROP(qqueueSetsizeOnDiskMax, "$ActionQueueMaxDiskSpace", iActionQueMaxDiskSpace); + setQPROP(qqueueSetiDeqBatchSize, "$ActionQueueDequeueBatchSize", iActionQueueDeqBatchSize); setQPROP(qqueueSetMaxFileSize, "$ActionQueueFileSize", iActionQueMaxFileSize); setQPROPstr(qqueueSetFilePrefix, "$ActionQueueFileName", pszActionQFName); setQPROP(qqueueSetiPersistUpdCnt, "$ActionQueueCheckpointInterval", iActionQPersistUpdCnt); @@ -334,87 +343,239 @@ finalize_it: } -/* set an action back to active state -- rgerhards, 2007-08-02 + +/* set the global resume interval + */ +rsRetVal actionSetGlobalResumeInterval(int iNewVal) +{ + glbliActionResumeInterval = iNewVal; + return RS_RET_OK; +} + + +/* returns the action state name in human-readable form + * returned string must not be modified. 
+ * rgerhards, 2009-05-07 + */ +static uchar *getActStateName(action_t *pThis) +{ + switch(pThis->eState) { + case ACT_STATE_RDY: + return (uchar*) "rdy"; + case ACT_STATE_ITX: + return (uchar*) "itx"; + case ACT_STATE_RTRY: + return (uchar*) "rtry"; + case ACT_STATE_SUSP: + return (uchar*) "susp"; + case ACT_STATE_DIED: + return (uchar*) "died"; + case ACT_STATE_COMM: + return (uchar*) "comm"; + default: + return (uchar*) "ERROR/UNKNWON"; + } +} + + +/* returns a suitable return code based on action state + * rgerhards, 2009-05-07 */ -static rsRetVal actionResume(action_t *pThis) +static rsRetVal getReturnCode(action_t *pThis) { DEFiRet; ASSERT(pThis != NULL); - pThis->bSuspended = 0; + switch(pThis->eState) { + case ACT_STATE_RDY: + iRet = RS_RET_OK; + break; + case ACT_STATE_ITX: + if(pThis->bHadAutoCommit) { + pThis->bHadAutoCommit = 0; /* auto-reset */ + iRet = RS_RET_PREVIOUS_COMMITTED; + } else { + iRet = RS_RET_DEFER_COMMIT; + } + break; + case ACT_STATE_RTRY: + iRet = RS_RET_SUSPENDED; + break; + case ACT_STATE_SUSP: + case ACT_STATE_DIED: + iRet = RS_RET_ACTION_FAILED; + break; + default: + DBGPRINTF("Invalid action engine state %d, program error\n", + (int) pThis->eState); + iRet = RS_RET_ERR; + break; + } RETiRet; } -/* set the global resume interval +/* set the action to a new state + * rgerhards, 2007-08-02 */ -rsRetVal actionSetGlobalResumeInterval(int iNewVal) +static inline void actionSetState(action_t *pThis, action_state_t newState) { - glbliActionResumeInterval = iNewVal; - return RS_RET_OK; + pThis->eState = newState; + DBGPRINTF("Action %p transitioned to state: %s\n", pThis, getActStateName(pThis)); +} + +/* Handles the transient commit state. So far, this is + * mostly a dummy... + * rgerhards, 2007-08-02 + */ +static void actionCommitted(action_t *pThis) +{ + actionSetState(pThis, ACT_STATE_RDY); +} + + +/* set action to "rtry" state. + * rgerhards, 2007-08-02 + */ +static void actionRetry(action_t *pThis) +{ + actionSetState(pThis, ACT_STATE_RTRY); +} + + +/* Disable action, this means it will never again be usable + * until rsyslog is reloaded. Use only as a last resort, but + * depends on output module. + * rgerhards, 2007-08-02 + */ +static void actionDisable(action_t *pThis) +{ + actionSetState(pThis, ACT_STATE_DIED); +} + + +/* Suspend action, this involves changing the acton state as well + * as setting the next retry time. + * if we have more than 10 retries, we prolong the + * retry interval. If something is really stalled, it will + * get re-tried only very, very seldom - but that saves + * CPU time. TODO: maybe a config option for that? + * rgerhards, 2007-08-02 + */ +static inline void actionSuspend(action_t *pThis, time_t ttNow) +{ + if(ttNow == NO_TIME_PROVIDED) + time(&ttNow); + pThis->ttResumeRtry = ttNow + pThis->iResumeInterval * (pThis->iNbrResRtry / 10 + 1); + actionSetState(pThis, ACT_STATE_SUSP); + DBGPRINTF("earliest retry=%d\n", (int) pThis->ttResumeRtry); } -/* suspend an action -- rgerhards, 2007-08-02 +/* actually do retry processing. Note that the function receives a timestamp so + * that we do not need to call the (expensive) time() API. + * Note that we do the full retry processing here, doing the configured number of + * iterations. 
+ * rgerhards, 2009-05-07 */ -static rsRetVal actionSuspend(action_t *pThis, time_t tNow) +static rsRetVal actionDoRetry(action_t *pThis, time_t ttNow) { + int iRetries; + int iSleepPeriod; DEFiRet; ASSERT(pThis != NULL); - pThis->bSuspended = 1; - pThis->ttResumeRtry = tNow + pThis->iResumeInterval; - pThis->iNbrResRtry = 0; /* tell that we did not yet retry to resume */ + + iRetries = 0; + while(pThis->eState == ACT_STATE_RTRY) { + iRet = pThis->pMod->tryResume(pThis->pModData); + if(iRet == RS_RET_OK) { + actionSetState(pThis, ACT_STATE_RDY); + } else if(iRet == RS_RET_SUSPENDED) { + /* max retries reached? */ + if((pThis->iResumeRetryCount != -1 && iRetries >= pThis->iResumeRetryCount)) { + actionSuspend(pThis, ttNow); + } else { + ++pThis->iNbrResRtry; + ++iRetries; + iSleepPeriod = pThis->iResumeInterval; + ttNow += iSleepPeriod; /* not truly exact, but sufficiently... */ + srSleep(iSleepPeriod, 0); + } + } else if(iRet == RS_RET_DISABLE_ACTION) { + actionDisable(pThis); + } + } + + if(pThis->eState == ACT_STATE_RDY) { + pThis->iNbrResRtry = 0; + } RETiRet; } /* try to resume an action -- rgerhards, 2007-08-02 - * returns RS_RET_OK if resumption worked, RS_RET_SUSPEND if the - * action is still suspended. + * changed to new action state engine -- rgerhards, 2009-05-07 */ static rsRetVal actionTryResume(action_t *pThis) { DEFiRet; - time_t ttNow; + time_t ttNow = NO_TIME_PROVIDED; ASSERT(pThis != NULL); - /* for resume handling, we must always obtain a fresh timestamp. We used - * to use the action timestamp, but in this case we will never reach a - * point where a resumption is actually tried, because the action timestamp - * is always in the past. So we can not avoid doing a fresh time() call - * here. -- rgerhards, 2009-03-18 - */ - time(&ttNow); /* cache "now" */ - - /* first check if it is time for a re-try */ - if(ttNow > pThis->ttResumeRtry) { - iRet = pThis->pMod->tryResume(pThis->pModData); - if(iRet == RS_RET_SUSPENDED) { - /* set new tryResume time */ - ++pThis->iNbrResRtry; - /* if we have more than 10 retries, we prolong the - * retry interval. If something is really stalled, it will - * get re-tried only very, very seldom - but that saves - * CPU time. TODO: maybe a config option for that? - * rgerhards, 2007-08-02 - */ - pThis->ttResumeRtry = ttNow + pThis->iResumeInterval * (pThis->iNbrResRtry / 10 + 1); + if(pThis->eState == ACT_STATE_SUSP) { + /* if we are suspended, we need to check if the timeout expired. + * for this handling, we must always obtain a fresh timestamp. We used + * to use the action timestamp, but in this case we will never reach a + * point where a resumption is actually tried, because the action timestamp + * is always in the past. So we can not avoid doing a fresh time() call + * here. 
-- rgerhards, 2009-03-18 + */ + time(&ttNow); /* cache "now" */ + if(ttNow > pThis->ttResumeRtry) { + actionSetState(pThis, ACT_STATE_RTRY); /* back to retries */ } - } else { - /* it's too early, we are still suspended --> indicate this */ - iRet = RS_RET_SUSPENDED; } - if(iRet == RS_RET_OK) - actionResume(pThis); + if(pThis->eState == ACT_STATE_RTRY) { + if(ttNow == NO_TIME_PROVIDED) /* use cached result if we have it */ + time(&ttNow); + CHKiRet(actionDoRetry(pThis, ttNow)); + } + + if(Debug && (pThis->eState == ACT_STATE_RTRY ||pThis->eState == ACT_STATE_SUSP)) { + DBGPRINTF("actionTryResume: action state: %s, next retry (if applicable): %u [now %u]\n", + getActStateName(pThis), (unsigned) pThis->ttResumeRtry, (unsigned) ttNow); + } + +finalize_it: + RETiRet; +} + - DBGPRINTF("actionTryResume: iRet: %d, next retry (if applicable): %u [now %u]\n", - iRet, (unsigned) pThis->ttResumeRtry, (unsigned) ttNow); +/* prepare an action for performing work. This involves trying to recover it, + * depending on its current state. + * rgerhards, 2009-05-07 + */ +static rsRetVal actionPrepare(action_t *pThis) +{ + DEFiRet; + + assert(pThis != NULL); + CHKiRet(actionTryResume(pThis)); + + /* if we are now ready, we initialize the transaction and advance + * action state accordingly + */ + if(pThis->eState == ACT_STATE_RDY) { + CHKiRet(pThis->pMod->mod.om.beginTransaction(pThis->pModData)); + actionSetState(pThis, ACT_STATE_ITX); + } +finalize_it: RETiRet; } @@ -431,12 +592,11 @@ rsRetVal actionDbgPrint(action_t *pThis) dbgprintf("\n\tInstance data: 0x%lx\n", (unsigned long) pThis->pModData); dbgprintf("\tRepeatedMsgReduction: %d\n", pThis->f_ReduceRepeated); dbgprintf("\tResume Interval: %d\n", pThis->iResumeInterval); - dbgprintf("\tSuspended: %d", pThis->bSuspended); - if(pThis->bSuspended) { - dbgprintf(" next retry: %u, number retries: %d", (unsigned) pThis->ttResumeRtry, pThis->iNbrResRtry); + if(pThis->eState == ACT_STATE_SUSP) { + dbgprintf("\tresume next retry: %u, number retries: %d", + (unsigned) pThis->ttResumeRtry, pThis->iNbrResRtry); } - dbgprintf("\n"); - dbgprintf("\tDisabled: %d\n", !pThis->bEnabled); + dbgprintf("\tState: %s\n", getActStateName(pThis)); dbgprintf("\tExec only when previous is suspended: %d\n", pThis->bExecWhenPrevSusp); dbgprintf("\n"); @@ -444,33 +604,16 @@ rsRetVal actionDbgPrint(action_t *pThis) } -/* call the DoAction output plugin entry point - * rgerhards, 2008-01-28 +/* prepare the calling parameters for doAction() + * rgerhards, 2009-05-07 */ -#pragma GCC diagnostic ignored "-Wempty-body" -rsRetVal -actionCallDoAction(action_t *pAction, msg_t *pMsg) +static rsRetVal prepareDoActionParams(action_t *pAction, msg_t *pMsg) { - DEFiRet; - int iRetries; int i; - int iArr; - int iSleepPeriod; - int bCallAction; - int iCancelStateSave; + DEFiRet; ASSERT(pAction != NULL); - /* We now must guard the output module against execution by multiple threads. The - * plugin interface specifies that output modules must not be thread-safe (except - * if they notify us they are - functionality not yet implemented...). 
- * rgerhards, 2008-01-30 - */ - pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave); - d_pthread_mutex_lock(&pAction->mutActExec); - pthread_cleanup_push(mutexCancelCleanup, &pAction->mutActExec); - pthread_setcancelstate(iCancelStateSave, NULL); - /* here we must loop to process all requested strings */ for(i = 0 ; i < pAction->iNumTpls ; ++i) { switch(pAction->eParamPassing) { @@ -484,47 +627,21 @@ actionCallDoAction(action_t *pAction, msg_t *pMsg) } } - iRetries = 0; - do { - /* on first invocation, this if should never be true. We just put it at the top - * of the loop so that processing (and code) is simplified. This code is actually - * triggered on the 2nd+ invocation. -- rgerhards, 2008-01-30 - */ - if(iRet == RS_RET_SUSPENDED) { - /* ok, this calls for our retry logic... */ - ++iRetries; - iSleepPeriod = pAction->iResumeInterval; - srSleep(iSleepPeriod, 0); - } - /* first check if we are suspended and, if so, retry */ - if(actionIsSuspended(pAction)) { - iRet = actionTryResume(pAction); - if(iRet == RS_RET_OK) - bCallAction = 1; - else - bCallAction = 0; - } else { - bCallAction = 1; - } - - if(bCallAction) { - /* call configured action */ - iRet = pAction->pMod->mod.om.doAction(pAction->ppMsgs, pMsg->msgFlags, pAction->pModData); - if(iRet == RS_RET_SUSPENDED) { - DBGPRINTF("Action requested to be suspended, done that.\n"); - actionSuspend(pAction, getActNow(pAction)); - } - } +finalize_it: + RETiRet; +} - } while(iRet == RS_RET_SUSPENDED && (pAction->iResumeRetryCount == -1 || iRetries < pAction->iResumeRetryCount)); /* do...while! */ - if(iRet == RS_RET_DISABLE_ACTION) { - DBGPRINTF("Action requested to be disabled, done that.\n"); - pAction->bEnabled = 0; /* that's it... */ - } +/* cleanup doAction calling parameters + * rgerhards, 2009-05-07 + */ +static rsRetVal cleanupDoActionParams(action_t *pAction) +{ + int i; + int iArr; + DEFiRet; -finalize_it: - /* cleanup */ + ASSERT(pAction != NULL); for(i = 0 ; i < pAction->iNumTpls ; ++i) { if(pAction->ppMsgs[i] != NULL) { switch(pAction->eParamPassing) { @@ -545,9 +662,312 @@ finalize_it: } } + RETiRet; +} + + +/* call the DoAction output plugin entry point + * Performance note: we build the action parameters here in this function. That + * means we do it while we hold the action look, potentially reducing concurrency + * (especially if the action queue is run in DIRECT mode). As an alternative, we + * may generate all params for the batch as whole before aquiring the action. However, + * that requires more memory, for large batches potentially a lot of memory. So for the + * time being, I am doing it here - the performance hit should be very minor and may even + * not be a hit because we may gain CPU cache locality gains with the "fewer memory" + * approach (I'd say that is rater likely). + * rgerhards, 2008-01-28 + */ +rsRetVal +actionCallDoAction(action_t *pThis, msg_t *pMsg) +{ + DEFiRet; + + ASSERT(pThis != NULL); + ISOBJ_TYPE_assert(pMsg, msg); + + DBGPRINTF("entering actionCalldoAction(), state: %s\n", getActStateName(pThis)); + CHKiRet(prepareDoActionParams(pThis, pMsg)); + + pThis->bHadAutoCommit = 0; + iRet = pThis->pMod->mod.om.doAction(pThis->ppMsgs, pMsg->msgFlags, pThis->pModData); + switch(iRet) { + case RS_RET_OK: + actionCommitted(pThis); + break; + case RS_RET_DEFER_COMMIT: + /* we are done, action state remains the same */ + break; + case RS_RET_PREVIOUS_COMMITTED: + /* action state remains the same, but we had a commit. 
*/ + pThis->bHadAutoCommit = 1; + break; + case RS_RET_SUSPENDED: + actionRetry(pThis); + break; + case RS_RET_DISABLE_ACTION: + actionDisable(pThis); + break; + default:/* permanent failure of this message - no sense in retrying. This is + * not yet handled (but easy TODO) + */ + FINALIZE; + } + iRet = getReturnCode(pThis); + +finalize_it: + cleanupDoActionParams(pThis); /* iRet ignored! */ + + RETiRet; +} + + +/* process a message + * this readies the action and then calls doAction() + * rgerhards, 2008-01-28 + */ +rsRetVal +actionProcessMessage(action_t *pThis, msg_t *pMsg) +{ + DEFiRet; + + ASSERT(pThis != NULL); + ISOBJ_TYPE_assert(pMsg, msg); + +RUNLOG_STR("inside actionProcessMsg()"); + CHKiRet(actionPrepare(pThis)); + if(pThis->eState == ACT_STATE_ITX) + CHKiRet(actionCallDoAction(pThis, pMsg)); + + iRet = getReturnCode(pThis); +finalize_it: + RETiRet; +} + + +/* finish processing a batch. Most importantly, that means we commit if we + * need to do so. + * rgerhards, 2008-01-28 + */ +static rsRetVal +finishBatch(action_t *pThis) +{ + DEFiRet; + + ASSERT(pThis != NULL); + + if(pThis->eState == ACT_STATE_RDY) + FINALIZE; /* nothing to do */ + + CHKiRet(actionPrepare(pThis)); + if(pThis->eState == ACT_STATE_ITX) { + iRet = pThis->pMod->mod.om.endTransaction(pThis->pModData); + switch(iRet) { + case RS_RET_OK: + actionCommitted(pThis); + break; + case RS_RET_SUSPENDED: + actionRetry(pThis); + break; + case RS_RET_DISABLE_ACTION: + actionDisable(pThis); + break; + case RS_RET_DEFER_COMMIT: + DBGPRINTF("output plugin error: endTransaction() returns RS_RET_DEFER_COMMIT " + "- ignored\n"); + actionCommitted(pThis); + break; + case RS_RET_PREVIOUS_COMMITTED: + DBGPRINTF("output plugin error: endTransaction() returns RS_RET_PREVIOUS_COMMITTED " + "- ignored\n"); + actionCommitted(pThis); + break; + default:/* permanent failure of this message - no sense in retrying. This is + * not yet handled (but easy TODO) + */ + FINALIZE; + } + } + iRet = getReturnCode(pThis); + +finalize_it: + RETiRet; +} + + +/* try to submit a partial batch of elements. + * rgerhards, 2009-05-12 + */ +static rsRetVal +tryDoAction(action_t *pAction, batch_t *pBatch, int *pnElem) +{ + int i; + int iElemProcessed; + int iCommittedUpTo; + msg_t *pMsg; + rsRetVal localRet; + DEFiRet; + + assert(pBatch != NULL); + assert(pnElem != NULL); + + i = pBatch->iDoneUpTo; /* all messages below that index are processed */ + iElemProcessed = 0; + iCommittedUpTo = i; + while(iElemProcessed <= *pnElem && i < pBatch->nElem) { + pMsg = (msg_t*) pBatch->pElem[i].pUsrp; + DBGPRINTF("submitBatch: i:%d, batch size %d, to process %d, pMsg: %p, state %d\n", i, pBatch->nElem, *pnElem, pMsg, pBatch->pElem[i].state);//remove later! 
+ if(pBatch->pElem[i].state != BATCH_STATE_DISC) { + localRet = actionProcessMessage(pAction, pMsg); + DBGPRINTF("action call returned %d\n", localRet); + if(localRet == RS_RET_OK) { + /* mark messages as committed */ + while(iCommittedUpTo < i) { + pBatch->pElem[iCommittedUpTo++].state = BATCH_STATE_COMM; + } + } else if(localRet == RS_RET_PREVIOUS_COMMITTED) { + /* mark messages as committed */ + while(iCommittedUpTo < i - 1) { + pBatch->pElem[iCommittedUpTo++].state = BATCH_STATE_COMM; + } + pBatch->pElem[i].state = BATCH_STATE_SUB; + } else if(localRet == RS_RET_PREVIOUS_COMMITTED) { + pBatch->pElem[i].state = BATCH_STATE_SUB; + } else if(localRet == RS_RET_DISCARDMSG) { + pBatch->pElem[i].state = BATCH_STATE_DISC; + } else { + iRet = localRet; + FINALIZE; + } + } + ++i; + ++iElemProcessed; + } + +finalize_it: + if(pBatch->nElem == 1 && pBatch->pElem[0].state == BATCH_STATE_DISC) { + iRet = RS_RET_DISCARDMSG; + } else if(pBatch->iDoneUpTo != iCommittedUpTo) { + *pnElem += iCommittedUpTo - pBatch->iDoneUpTo; + pBatch->iDoneUpTo = iCommittedUpTo; + } + RETiRet; +} + + +/* submit a batch for actual action processing. + * The first nElem elements are processed. This function calls itself + * recursively if it needs to handle errors. + * rgerhards, 2009-05-12 + */ +static rsRetVal +submitBatch(action_t *pAction, batch_t *pBatch, int nElem) +{ + int i; + int bDone; + rsRetVal localRet; + DEFiRet; + + assert(pBatch != NULL); + + bDone = 0; + do { + localRet = tryDoAction(pAction, pBatch, &nElem); + if( localRet == RS_RET_OK + || localRet == RS_RET_PREVIOUS_COMMITTED + || localRet == RS_RET_DEFER_COMMIT) { + /* try commit transaction, once done, we can simply do so as if + * that return state was returned from tryDoAction(). + */ + localRet = finishBatch(pAction); + } + + if( localRet == RS_RET_OK + || localRet == RS_RET_PREVIOUS_COMMITTED + || localRet == RS_RET_DEFER_COMMIT) { + bDone = 1; + } else if(localRet == RS_RET_DISCARDMSG) { + iRet = RS_RET_DISCARDMSG; /* TODO: verify this sequence -- rgerhards, 2009-07-30 */ + bDone = 1; + } else if(localRet == RS_RET_SUSPENDED) { + ; /* do nothing, this will retry the full batch */ + } else if(localRet == RS_RET_ACTION_FAILED) { + /* in this case, the whole batch can not be processed */ + for(i = 0 ; i < nElem ; ++i) { + pBatch->pElem[++pBatch->iDoneUpTo].state = BATCH_STATE_BAD; + } + bDone = 1; + } else { + if(nElem == 1) { + pBatch->pElem[++pBatch->iDoneUpTo].state = BATCH_STATE_BAD; + bDone = 1; + } else { + /* retry with half as much. Depth is log_2 batchsize, so recursion is not too deep */ + submitBatch(pAction, pBatch, nElem / 2); + submitBatch(pAction, pBatch, nElem - (nElem / 2)); + bDone = 1; + } + } + } while(!bDone); /* do .. while()! */ + + RETiRet; +} + + +/* receive a batch and process it. This includes retry handling. + * rgerhards, 2009-05-12 + */ +static rsRetVal +processAction(action_t *pAction, batch_t *pBatch) +{ + int i; + msg_t *pMsg; + rsRetVal localRet; + DEFiRet; + + assert(pBatch != NULL); + + pBatch->iDoneUpTo = 0; + /* TODO: think about action batches, must be handled at upper layer! 
+ * MULTIQUEUE + */ + localRet = submitBatch(pAction, pBatch, pBatch->nElem); + CHKiRet(localRet); + + /* this must be moved away - up into the dequeue part of the queue, I guess, but that's for another day */ + for(i = 0 ; i < pBatch->nElem ; i++) { + pMsg = (msg_t*) pBatch->pElem[i].pUsrp; + } + iRet = finishBatch(pAction); + +finalize_it: + RETiRet; +} + + +#pragma GCC diagnostic ignored "-Wempty-body" +/* receive an array of to-process user pointers and submit them + * for processing. + * rgerhards, 2009-04-22 + */ +static rsRetVal +processBatchMain(action_t *pAction, batch_t *pBatch) +{ + DEFiRet; + + assert(pBatch != NULL); + + /* We now must guard the output module against execution by multiple threads. The + * plugin interface specifies that output modules must not be thread-safe (except + * if they notify us they are - functionality not yet implemented...). + * rgerhards, 2008-01-30 + */ + d_pthread_mutex_lock(&pAction->mutActExec); + pthread_cleanup_push(mutexCancelCleanup, &pAction->mutActExec); + + iRet = processAction(pAction, pBatch); + pthread_cleanup_pop(1); /* unlock mutex */ - msgDestruct(&pMsg); /* we are now finished with the message */ RETiRet; } #pragma GCC diagnostic warning "-Wempty-body" @@ -563,7 +983,6 @@ rsRetVal actionCallHUPHdlr(action_t *pAction) { DEFiRet; - int iCancelStateSave; ASSERT(pAction != NULL); DBGPRINTF("Action %p checks HUP hdlr: %p\n", pAction, pAction->pMod->doHUP); @@ -572,10 +991,8 @@ actionCallHUPHdlr(action_t *pAction) FINALIZE; /* no HUP handler, so we are done ;) */ } - pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave); d_pthread_mutex_lock(&pAction->mutActExec); pthread_cleanup_push(mutexCancelCleanup, &pAction->mutActExec); - pthread_setcancelstate(iCancelStateSave, NULL); CHKiRet(pAction->pMod->doHUP(pAction->pModData)); pthread_cleanup_pop(1); /* unlock mutex */ @@ -757,24 +1174,12 @@ static rsRetVal doActionCallAction(action_t *pAction, msg_t *pMsg) { DEFiRet; - /* first, we need to check if this is a disabled - * entry. If so, we must not further process it. - * rgerhards 2005-09-26 - * In the future, disabled modules may be re-probed from time - * to time. They are in a perfectly legal state, except that the - * doAction method indicated that it wanted to be disabled - but - * we do not consider this is a solution for eternity... So we - * should check from time to time if affairs have improved. - * rgerhards, 2007-07-24 - */ - if(pAction->bEnabled == 0) { - ABORT_FINALIZE(RS_RET_OK); - } pAction->tActNow = -1; /* we do not yet know our current time (clear prev. value) */ /* don't output marks to recently written outputs */ - if((pMsg->msgFlags & MARK) && (getActNow(pAction) - pAction->f_time) < MarkInterval / 2) { + if(pAction->bWriteAllMarkMsgs == FALSE + && (pMsg->msgFlags & MARK) && (getActNow(pAction) - pAction->f_time) < MarkInterval / 2) { ABORT_FINALIZE(RS_RET_OK); } @@ -820,6 +1225,7 @@ finalize_it: RETiRet; } + /* call the configured action. Does all necessary housekeeping. 
* rgerhards, 2007-08-01 * FYI: currently, this function is only called from the queue @@ -832,20 +1238,22 @@ rsRetVal actionCallAction(action_t *pAction, msg_t *pMsg) { DEFiRet; - int iCancelStateSave; ISOBJ_TYPE_assert(pMsg, msg); ASSERT(pAction != NULL); - pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave); - LockObj(pAction); - pthread_cleanup_push(mutexCancelCleanup, pAction->Sync_mut); - pthread_setcancelstate(iCancelStateSave, NULL); - iRet = doActionCallAction(pAction, pMsg); - pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave); - UnlockObj(pAction); - pthread_cleanup_pop(0); /* remove mutex cleanup handler */ - pthread_setcancelstate(iCancelStateSave, NULL); + /* We need to lock the mutex only for repeated line processing. + * rgerhards, 2009-06-19 + */ + //if(pAction->f_ReduceRepeated == 1) { + LockObj(pAction); + pthread_cleanup_push(mutexCancelCleanup, pAction->Sync_mut); + iRet = doActionCallAction(pAction, pMsg); + UnlockObj(pAction); + pthread_cleanup_pop(0); /* remove mutex cleanup handler */ + //} else { + //iRet = doActionCallAction(pAction, pMsg); + //} RETiRet; } @@ -863,6 +1271,8 @@ actionAddCfSysLineHdrl(void) CHKiRet(regCfSysLineHdlr((uchar *)"actionname", 0, eCmdHdlrGetWord, NULL, &pszActionName, NULL)); CHKiRet(regCfSysLineHdlr((uchar *)"actionqueuefilename", 0, eCmdHdlrGetWord, NULL, &pszActionQFName, NULL)); CHKiRet(regCfSysLineHdlr((uchar *)"actionqueuesize", 0, eCmdHdlrInt, NULL, &iActionQueueSize, NULL)); + CHKiRet(regCfSysLineHdlr((uchar *)"actionwriteallmarkmessages", 0, eCmdHdlrBinary, NULL, &bActionWriteAllMarkMsgs, NULL)); + CHKiRet(regCfSysLineHdlr((uchar *)"actionqueuedequeuebatchsize", 0, eCmdHdlrInt, NULL, &iActionQueueDeqBatchSize, NULL)); CHKiRet(regCfSysLineHdlr((uchar *)"actionqueuemaxdiskspace", 0, eCmdHdlrSize, NULL, &iActionQueMaxDiskSpace, NULL)); CHKiRet(regCfSysLineHdlr((uchar *)"actionqueuehighwatermark", 0, eCmdHdlrInt, NULL, &iActionQHighWtrMark, NULL)); CHKiRet(regCfSysLineHdlr((uchar *)"actionqueuelowwatermark", 0, eCmdHdlrInt, NULL, &iActionQLowWtrMark, NULL)); @@ -916,6 +1326,8 @@ addAction(action_t **ppAction, modInfo_t *pMod, void *pModData, omodStringReques pAction->pModData = pModData; pAction->pszName = pszActionName; pszActionName = NULL; /* free again! */ + pAction->bWriteAllMarkMsgs = bActionWriteAllMarkMsgs; + bActionWriteAllMarkMsgs = FALSE; /* reset */ pAction->bExecWhenPrevSusp = bActExecWhenPrevSusp; pAction->iSecsExecOnceInterval = iActExecOnceInterval; pAction->iExecEveryNthOccur = iActExecEveryNthOccur; @@ -980,7 +1392,7 @@ addAction(action_t **ppAction, modInfo_t *pMod, void *pModData, omodStringReques DBGPRINTF("module is incompatible with RepeatedMsgReduction - turned off\n"); pAction->f_ReduceRepeated = 0; } - pAction->bEnabled = 1; /* action is enabled */ + pAction->eState = ACT_STATE_RDY; /* action is enabled */ if(bSuspended) actionSuspend(pAction, time(NULL)); /* "good" time call, only during init and unavoidable */ @@ -36,6 +36,15 @@ extern int glbliActionResumeRetryCount; +typedef enum { + ACT_STATE_DIED = 0, /* action permanently failed and now disabled - MUST BE ZEO! 
*/ + ACT_STATE_RDY = 1, /* action ready, waiting for new transaction */ + ACT_STATE_ITX = 2, /* transaction active, waiting for new data or commit */ + ACT_STATE_COMM = 3, /* transaction finished (a transient state) */ + ACT_STATE_RTRY = 4, /* failure occured, trying to restablish ready state */ + ACT_STATE_SUSP = 5 /* suspended due to failure (return fail until timeout expired) */ +} action_state_t; + /* the following struct defines the action object data structure */ struct action_s { @@ -44,9 +53,10 @@ struct action_s { populated on an as-needed basis. This is a performance optimization. */ time_t tLastExec; /* time this action was last executed */ bool bExecWhenPrevSusp;/* execute only when previous action is suspended? */ + bool bWriteAllMarkMsgs;/* should all mark msgs be written (not matter how recent the action was executed)? */ int iSecsExecOnceInterval; /* if non-zero, minimum seconds to wait until action is executed again */ - short bEnabled; /* is the related action enabled (1) or disabled (0)? */ - bool bSuspended; /* is the related action temporarily suspended? */ + action_state_t eState; /* current state of action */ + int bHadAutoCommit; /* did an auto-commit happen during doAction()? */ time_t ttResumeRtry; /* when is it time to retry the resume? */ int iResumeInterval;/* resume interval for this action */ int iResumeRetryCount;/* how often shall we retry a suspended action? (-1 --> eternal) */ diff --git a/configure.ac b/configure.ac index 39e78d24..5bbc5e55 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ(2.61) -AC_INIT([rsyslog],[4.5.6],[rsyslog@lists.adiscon.com]) +AC_INIT([rsyslog],[5.1.6],[rsyslog@lists.adiscon.com]) AM_INIT_AUTOMAKE AC_CONFIG_SRCDIR([ChangeLog]) AC_CONFIG_MACRO_DIR([m4]) @@ -182,6 +182,7 @@ if test "$enable_regexp" = "yes"; then fi + # zlib compression AC_ARG_ENABLE(zlib, [AS_HELP_STRING([--enable-zlib],[Enable zlib compression support @<:@default=yes@:>@])], @@ -239,7 +240,7 @@ AC_ARG_ENABLE(pthreads, ) if test "x$enable_pthreads" = "xno"; then - AC_MSG_ERROR(rsyslog v3 does no longer support single threading mode -- use a previous version for that); + AC_MSG_ERROR(rsyslog v3+ does no longer support single threading mode -- use a previous version for that); fi if test "x$enable_pthreads" != "xno"; then @@ -730,6 +731,36 @@ AC_ARG_ENABLE(omprog, AM_CONDITIONAL(ENABLE_OMPROG, test x$enable_omprog = xyes) +# settings for omudpspoof +AC_ARG_ENABLE(omudpspoof, + [AS_HELP_STRING([--enable-omudpspoof],[Compiles omudpspoof module @<:@default=no@:>@])], + [case "${enableval}" in + yes) enable_omudpspoof="yes" ;; + no) enable_omudpspoof="no" ;; + *) AC_MSG_ERROR(bad value ${enableval} for --enable-omudpspoof) ;; + esac], + [enable_omudpspoof=no] +) + +if test "x$enable_omudpspoof" = "xyes"; then + AC_CHECK_HEADERS( + [libnet.h],, + [AC_MSG_FAILURE([libnet is missing])] + ) + AC_CHECK_LIB( + [net], + [libnet_init], + [UDPSPOOF_CFLAGS="" + UDPSPOOF_LIBS="-lnet" + ], + [AC_MSG_FAILURE([libnet is missing])] + ) +fi +AM_CONDITIONAL(ENABLE_OMUDPSPOOF, test x$enable_omudpspoof = xyes) +AC_SUBST(UDPSPOOF_CFLAGS) +AC_SUBST(UDPSPOOF_LIBS) + + # settings for omstdout AC_ARG_ENABLE(omstdout, [AS_HELP_STRING([--enable-omstdout],[Compiles stdout module @<:@default=no@:>@])], @@ -829,6 +860,7 @@ AC_CONFIG_FILES([Makefile \ plugins/ommail/Makefile \ plugins/omsnmp/Makefile \ plugins/omoracle/Makefile \ + plugins/omudpspoof/Makefile \ plugins/cust1/Makefile \ tests/Makefile]) AC_OUTPUT @@ 
-836,7 +868,6 @@ AC_OUTPUT echo "****************************************************" echo "rsyslog will be compiled with the following settings:" echo -echo " Multithreading support enabled: $enable_pthreads" echo " Large file support enabled: $enable_largefile" echo " Networking support enabled: $enable_inet" echo " Regular expressions support enabled: $enable_regexp" @@ -855,6 +886,7 @@ echo "---{ output plugins }---" echo " Mail support enabled: $enable_mail" echo " omprog module will be compiled: $enable_omprog" echo " omstdout module will be compiled: $enable_omstdout" +echo " omudpspoof module will be compiled: $enable_omudpspoof" echo " output template module will be compiled: $enable_omtemplate" echo echo "---{ database support }---" diff --git a/doc/Makefile.am b/doc/Makefile.am index 3dfc8d3a..a447cddf 100644 --- a/doc/Makefile.am +++ b/doc/Makefile.am @@ -30,6 +30,8 @@ html_files = \ version_naming.html \ contributors.html \ dev_queue.html \ + omstdout.html \ + omudpspoof.html \ omsnmp.html \ ommysql.html \ omoracle.html \ @@ -91,6 +93,7 @@ html_files = \ rsconf1_umask.html \ v3compatibility.html \ v4compatibility.html \ + v5compatibility.html \ im3195.html \ netstream.html \ ns_gtls.html \ diff --git a/doc/action-call.dot b/doc/action-call.dot new file mode 100644 index 00000000..86c6834d --- /dev/null +++ b/doc/action-call.dot @@ -0,0 +1,33 @@ +// This file is part of rsyslog. +// +// rsyslog action call state diagram +// +// see http://www.graphviz.org for how to obtain the graphviz processor +// which is used to build the actual graph. +// +// generate the graph with +// $ dot action-call.dot -Tpng >action-call.png + +digraph G { + label="\n\nrsyslog message states during action processing\nhttp://www.rsyslog.com"; + //fontsize=20; + + ok [label="ready for processing" color="green"]; + mpf [label="message permanent failure" color="red"]; + tf [label="temporary failure"] + cPen [label="commit pending"]; + com [label="committed" color="red"]; + + tf -> tf [label="retry fails, i < n"]; + tf -> mpf [label="retry fails, i = n"]; + tf -> ok [label="retry succeeds"]; + ok -> com [label="doAction RS_RET_OK"]; + ok -> cPen [label="doAction COMMIT_PENDING"]; + ok -> tf [label="doAction RS_RET_SUSPENDED"]; + ok -> mpf [label="doAction RS_RET_DISABLED"]; + cPen -> com [label="endTransaction RS_RET_OK"]; + cPen -> tf [label="endTransaction _SUSPENDED"]; + + //{rank=same; tf cPen} + {rank=same; com mpf} +} diff --git a/doc/action_state.dot b/doc/action_state.dot new file mode 100644 index 00000000..d56d9da0 --- /dev/null +++ b/doc/action_state.dot @@ -0,0 +1,33 @@ +// This file is part of rsyslog. +// +// rsyslog message state diagram +// +// see http://www.graphviz.org for how to obtain the graphviz processor +// which is used to build the actual graph. 
+// +// generate the graph with +// $ dot file.dot -Tpng >file.png + +digraph msgState { + compound=true; nodesep=1.0 + //label="\n\nrsyslog action transaction states\nhttp://www.rsyslog.com"; + //fontsize=20; + + rdy [label="ready" group="main"]; + itx [label="in Tx" group="main"]; + comm [label="commit"] + rtry [label="retry"] + susp [label="suspended"] + + rdy -> itx [label="transaction begins"] + itx -> itx [label="success"] + itx -> comm [label="commit\n(caller or auto)"] + itx -> rtry [label="error"] + comm -> rdy [label="success"] + comm -> rtry [label="error"] + rtry -> rdy [label="recovered"] + rtry -> susp [label="could not\nrecover"] + susp -> rtry [label="timeout expired"] + + {rank=same; comm rtry} +} diff --git a/doc/batch_state.dot b/doc/batch_state.dot new file mode 100644 index 00000000..0dd48b47 --- /dev/null +++ b/doc/batch_state.dot @@ -0,0 +1,28 @@ +// This file is part of rsyslog. +// +// rsyslog batch state diagram +// +// see http://www.graphviz.org for how to obtain the graphviz processor +// which is used to build the actual graph. +// +// generate the graph with +// $ dot file.dot -Tpng >file.png + +digraph msgState { + compound=true; nodesep=1.0 + //label="\n\nrsyslog batch states\nhttp://www.rsyslog.com"; + rankdir=LR + + rdy [label="ready"]; + bad [label="message-caused\nfailure"]; + sub [label="submitted"] + disc [label="discarded" color="red"] + + rdy -> sub [label="submitted to action"] + rdy -> bad [label="permanent fail"] + rdy -> disc [label="action requests discarding"] + sub -> rdy [label="next action or\naction-caused failure"] + bad -> rdy [label="next action"] + + //{rank=same; comm rtry } +} diff --git a/doc/design.tex b/doc/design.tex new file mode 100644 index 00000000..53d25313 --- /dev/null +++ b/doc/design.tex @@ -0,0 +1,859 @@ +\documentclass[a4paper,10pt]{article} +\usepackage{amsmath} +\usepackage{amsfonts} +\usepackage{amssymb} +\usepackage{graphicx} +\usepackage{listings} +\usepackage{algorithm,algorithmic} +\usepackage{float} + +\pagestyle{headings} + +\newcommand{\IN}{\mathbb{N}} +\newcommand{\MM}{\mathcal{M}} +\newcommand{\QQ}{\mathcal{Q}} +\newcommand{\AAA}{\mathcal{A}} +\title{Rsyslog Design and Internals} +\author{Rainer Gerhards\\ +rgerhards@adiscon.com} + +\begin{document} + +\maketitle + +\begin{abstract} +This paper describes rsyslog design and internals. It is created to facilitate a discussion about the implementation of "batched queue processing". As such, it does not describe the full design of rsyslog but rather those elements that are relevant to queues. However, the document may be expanded in the future. +\end{abstract} + +\tableofcontents + +\section{Preliminaries} +\subsection{On the Use of English} +\begin{quotation} +\begin{flushright} +I ventured to write this book in English because ... \\ +it will be more easily read in poor English, \\ +than in good German by 90\% of my intended readers. \\ +--- HANS J. STETTER, Analysis of Discretization Methods for \\ +Ordinary Differential Equations (1973) +\end{flushright} +\end{quotation} + +There is not much I could add to Mr. Stetter's thought, except, maybe, that the number to quote probably tends more to 99\% in this case than to the 90\% Mr. Stetter notes. So please pardon those errors in language use that I have not yet been able to fix or even see. Suggestions for corrections and improvements are always welcome. 
+
+\subsection{Notational Conventions}
+In general, in rsyslog there exist single objects $o$, which are used to build larger sets $O$, which form a superset $\mathcal{O}$ of all those objects that exist at a given time inside a running instance of rsyslog. As seen above, single objects are always described by lower case letters ($o$), larger sets by upper case letters ($O$) and the ``all-sets'' in calligraphic letters ($\mathcal{O}$). Often, objects $O_i, i \in \IN, i \le |\mathcal{O}|$ partition $\mathcal{O}$, but this is not necessarily the case.
+
+\subsection{Definitions}
+\subsubsection{Sudden Fatal Failure}
+A sudden fatal failure is one that occurs at some instant and causes complete loss of processing capabilities. The two major cases are a sudden power loss or a ``kill -9'' of the process. There are more exotic cases, too, like disasters.
+
+One may argue that it is possible to protect against many sudden fatal failure cases. For example, using an uninterruptible power supply (UPS) will prevent a sudden power loss. While this is true in most cases, it does not hold if looked at very closely: in the case of the UPS, for example, a failure in the UPS itself may cause a sudden power loss, which can not be mitigated. Well, actually there can be several layers of mitigation, but always one more potential failure scenario remains. So it is not possible to totally solve the issue.
+
+The concept of ``sudden fatal failure'' now covers all the residual risks that result in termination of rsyslogd without the ability to execute any code before this happens. This is a very important concept in regard to audit-gradeness.
+
+\subsubsection{Audit Grade}
+In the context of this document, ``audit grade'' means that a subsystem never loses a message that it has taken responsibility for, not even in cases of sudden fatal failures. The only limit to this guarantee is that a subsystem cannot ensure message survival if the subsystem at large is being destroyed (e.g. during a disaster) or some of its components are not of audit grade. This places a subtle limit on the audit grade of a subsystem.
+
+For example, the rsyslog queue subsystem receives messages and acknowledges them to the submitter (e.g. an input) when they have been enqueued in the storage system. If the queue system is configured to provide audit-grade operation\footnote{Audit-grade queue operation is considerably slower than regular operations, as such this mode is not enabled by default. Most installations will never need a completely audit-grade queue.}, the queue relies on the storage subsystem to work properly. If, for example, a disk read error occurs, the message may no longer be readable from the disk and as such is lost. The root cause here is that the disk subsystem was not of audit grade, because it otherwise would not have lost the message. So in this case the queue code is of audit grade, but one of its components, the disk subsystem, was not. So the overall system is not of audit grade.
+
+To simplify talking about the audit-gradeness of several subsystems, we assume that all of their subsystems are also of audit grade. In an actual deployment, however, this means that the system designer must carefully select audit-grade subsystems. Overlooking a single non-audit-grade component will keep the whole system from being of audit-grade quality.
+
+Please note that it can be rather tricky to ensure a complete system is of audit grade. A border case is main memory integrity.
Even with error-correcting memory, situations may arise where a memory error occurs (probably due to a very unlikely series of well-hitting cosmic rays) that is unrecoverable. At this point, system integrity is at risk. The only real solution is to immediately shut down the system and restart it (without giving any process a chance to execute). Note, however, that in an extreme view, an operating system routine that does so can also be considered dangerous, as memory in use by this routine might be affected by the malfunction. We could extend this scenario and further complicate it, but that goes beyond the scope of this paper. The example was primarily meant to show how subtle audit-grade reliability is.
+
+In rsyslog, we currently use a slightly \marginpar{duplication\\permitted}relaxed consistency condition for message integrity inside an audit-grade subsystem. While we do not accept message loss, we permit slight message \emph{duplication}, but only in exceptional cases. This is permitted because, with proper message generation, the duplication problem can be easily fixed at the end-to-end layer. For example, the original sender can include a UUID, which can be used to sort out duplicates at the final destination. Insisting on not allowing duplication complicates matters and is often impossible with today's logging protocols. So, for the time being, we aim at this relaxed criterion, which is hard enough to achieve. After we have achieved that goal, we may further try to solve the duplication problem. Some hooks already exist. But we do not guarantee such an effort will be made any time soon.
+
+\section{Overall Design}
+From a high-level perspective, rsyslogd is ``just'' a high-performance message router. It accepts messages from various sources, applies user-configured filters to them, and routes potentially transformed messages to destinations based on these filters.
+\section{Objects}
+\subsection{Plugins}
+Plugins provide code potentially written by a third party to extend rsyslog.
+
+Conceptually, a plugin is a tuple of callable functions $(\phi_1, \phi_2, \ldots)$ which implement an interface. There are three different types of plugins: input, output and library. The plugin type denotes the primary interface implemented by the plugin. Additional interfaces may be implemented\footnote{This is not yet done in plugins, but is possible and assumed to be done at a later point in time}.
+
+In the context of this paper, the output plugin interface is most important. It implements three entry points:
+
+\paragraph{doAction()}
+is used to submit messages to the output plugin. The entry point may or may not commit the messages to their ultimate destination.
+
+\paragraph{beginTransaction()}
+is used to inform the plugin that a new transaction begins. It must prepare for processing.
+
+\paragraph{endTransaction()}
+is used to indicate that the upper layer \emph{needs} to close the transaction. If there is any uncommitted data left, it must be committed or rolled back.
+
+Every instance of an output plugin is guaranteed \emph{not} to be called concurrently by multiple threads. Further, no context switch will happen between calls to $doAction()$ and $endTransaction()$.
+
+\subsection{State Sets}
+Several objects have associated state based on a specific state set. These state sets are described together with the objects.
+
+As a general rule, individual state is associated with all instances $o$ of a class of objects.
This state is called the object's \marginpar{state component} \emph{state component} $s$. If we want to obtain an object's state, we write $S(o)$. Please note that $S(o)$ is only defined for those objects that have a state component.
+
+\subsection{Messages}
+A message $m$ represents a single syslog message inside the system. It is a tuple of attributes. Some of these attributes directly originate from the message content, some others are meta-information taken from the context. For example, there is a meta-attribute ``time of reception'' which conveys when the message was received by rsyslog's input subsystem. We do not list attributes here, as there are many and it is not of importance which exactly they are.
+
+The set $\MM$ is composed of all messages that exist at a given time inside rsyslog.
+
+\subsection{Queue}
+A queue
+$$Q = (C, \Phi, M)$$
+is a triplet of a set of configuration parameters $C$, a set of callbacks $\Phi$ and a set of messages $M \subseteq \MM$.
+
+If we need to obtain the set of messages from a queue, we write $M(Q)$. The elements of the set of configuration parameters are written as $C_{param}$ where $param$ is an abbreviation of the parameter's meaning. To obtain a specific parameter from a queue, we write $C_{param}(Q)$. The most important elements of $C$ are:
+
+\paragraph{$C_{type}$} which denotes the queue implementation type. Most importantly, this selects from a set of queue drivers (for example disk-only or in-memory driver), which affects the basic operation of the queue instance.
+
+\paragraph{$C_{mMsg}$} which denotes the upper bound on the cardinality of $M$.
+
+\paragraph{$C_{mBatch}$} which denotes the upper bound of the cardinality of message batches created for this queue.
+
+Let $\QQ = \{Q_m, Q_1, Q_2, \ldots, Q_{|\AAA|}\}$ be the set of all queues that exist inside rsyslog after the configuration file has been processed, with $|\QQ| = |\AAA| + 1$.
+
+Then
+$$M_0 = \MM \setminus \bigcup_{i=1}^{|\QQ|} M(Q_i)$$
+\marginpar{at-risk-set}is the set of non-queued messages. The messages have either never been enqueued or have been dequeued but not finally been processed. This set represents the messages that may potentially be lost during an unclean shutdown of rsyslogd. This is why I call this set the ``\emph{at-risk-set}''.
+
+
+\subsection{Batches}
+A batch represents multiple processable messages. It is a unit of processing inside rsyslog's output system. Batches are used to dequeue a number of messages from a queue and then submit them to the lower action layer. Batches are natural \emph{transaction boundaries}, in the sense that multiple output transactions may be done on the messages inside a batch, but each transaction must end at the end of the batch. A batch is always associated with a specific queue $Q$.
+
+A batch
+$$B = (b_1, b_2, \ldots, b_n )$$
+is an $n$-tuple of \marginpar{processable\\message}processable messages
+$$b = (m, s)$$
+each of which is an ordered pair of a message $m$ and an associated processing state $s$. To denote the $n$-th message inside the batch, we write $m(b_n)$; to denote the state component of the $n$-th message, we write $S(b_n)$.
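Editor's note: the C counterpart of this definition can be seen in the action code earlier in this diff, which iterates over pBatch->pElem[i] and inspects the per-element state. The following is an illustrative sketch only; field and constant names are inferred from how tryDoAction() and processAction() use them, the authoritative definitions live in batch.h (not shown in this excerpt) and may differ, and BATCH_STATE_RDY in particular is assumed from the "rdy" state named below.

    /* Illustrative sketch of a batch structure consistent with the way
     * tryDoAction()/processAction() above use it. Not the real batch.h. */
    typedef enum {
        BATCH_STATE_RDY,  /* ready for processing */
        BATCH_STATE_BAD,  /* message-caused failure, must not be resubmitted to this action */
        BATCH_STATE_SUB,  /* submitted to the action, result not yet known */
        BATCH_STATE_COMM, /* successfully committed */
        BATCH_STATE_DISC  /* discarded, must not be submitted to further actions */
    } batch_state_t;

    typedef struct batch_obj_s {
        void *pUsrp;          /* user pointer, here the msg_t* being processed */
        batch_state_t state;  /* processing state of this batch element */
    } batch_obj_t;

    typedef struct batch_s {
        int nElem;            /* number of elements in pElem[] */
        int iDoneUpTo;        /* all elements below this index are fully processed */
        batch_obj_t *pElem;   /* array of batch elements b_1 ... b_n */
    } batch_t;

In these terms, the formal batch $B$ corresponds to pElem[0..nElem-1] and $S(b_i)$ to pElem[i].state.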
+ +\begin{figure} +\begin{center} +\includegraphics[scale=0.4]{batch_state.jpeg} +\end{center} +\caption{batch message processing states} +\label{fig_batchmsg_states} +\end{figure} + +The state set for the processing states is defined as follows: +$$ +S_B = \{ rdy, bad, sub, disc \} +$$ + +With the semantics of the various states being the following: + +\begin{center} +\begin{tabular}{|l|l|} \hline + State & Semantics \\\hline + rdy & ready for processing\\ + bad & this message triggered an unrecoverable failure in action\\ + & processing and must not be resubmitted to this action\\ + sub & message submitted for processsing, result yet unknown \\ + disc & action sucessfully processed, but must not be submitted \\ + & to any further action in action unit \\\hline +\end{tabular} +\end{center} +The associated state diagram is shown in figure \ref{fig_batchmsg_states} on page \pageref{fig_batchmsg_states}. + +Batch sizes vary. The actual cardinality is a function of the cardinality of $M(Q)$ at the time of batch creation and the queue configuration: + +$$1 \leq |B| \leq \max(C_{mBatch}(Q), |M(Q)|)$$ + +\subsection{Action Unit} +An action unit +$$u = (f, a_1, \ldots, a_n), a_i \in \AAA \text{ for } i \in \IN, i \le n$$ +is a tuple consisting of a filter function $f$ and $n \in \IN$ actions. \emph{Does rsyslog still support nonsense action units with $n=0$? - check!} + +\subsection{Action} +An action +$$a = (a_C, a_\psi)$$ +is an ordered pair of a tuple of configuration attributes $a_C$, and a tuple of processing functions $a_\psi$. Be the set $\AAA$ composed of all actions that exist in rsyslog after the configuration file has been processed. + + +\section{Processing} +\subsection{Object States} +Various objects keep state. Some of these objects, like messages, batches and actions seem to share state. However, thinking about shared state leads to very complex setup. As such, state is modelled for each object $o$ individually. Instead, the state function $S_O(o)$ can be used to obtain an obtain an individual objects state. That state can be used to modify the state diagrams of the other objects with which relationships exist. + +\subsubsection{Actions} +Actions are provided by output plugins. An action enables the engine to write messages to some destination. It is important to note that ``destination'' is a very broad abstraction. A destination may be a file inside a local or remote file system, a database table or a remote syslog server in another network. + +Actions are transactional in the following sense: more than one message can be submitted to an action. The action does not necessarily process the submitted messages unless the caller ends the transaction. However, the action itself may also end the transaction and notify the caller. This is \emph{not} considered an error condition and \emph{must} be handled gracefully by the caller. If a transaction aborts, the caller \emph{must} assume that none of the elements submitted since the begin of transaction have been processed. The action will try to backout anything that was already processed at the time the transaction failed. However, not all outputs work on actually transactional destination. As such, an action is permitted not to backout incomplete interim results. As such, after a transaction abort, some message duplication may occur. We call this the \emph{relaxed integrity condition} for actions. 
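The transactional behaviour described above translates into a rather small plugin-side skeleton. The following is a hedged sketch of how an output plugin might implement the three entry points, buffering messages until a commit is requested or its internal buffer fills up. It is an illustration only, not actual rsyslog plugin code; the \emph{destination} list merely stands in for the real sink.

\lstset{language=python}
\begin{lstlisting}
# Hypothetical output plugin skeleton; illustration only.
class ExampleOutput:
    def __init__(self, max_buffer=100):
        self.destination = []         # stands in for the real sink
        self.buffer = []
        self.max_buffer = max_buffer

    def beginTransaction(self):
        self.buffer = []              # prepare for processing

    def doAction(self, msg):
        self.buffer.append(msg)
        if len(self.buffer) >= self.max_buffer:
            self.commit()             # action-initiated early commit

    def endTransaction(self):
        self.commit()                 # caller-initiated commit

    def commit(self):
        # if this step fails, none of the buffered messages may be
        # considered processed (relaxed integrity condition)
        self.destination.extend(self.buffer)
        self.buffer = []
\end{lstlisting}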
An output transaction is started by calling \emph{beginTransaction()}, either explicitly or implicitly by a call to \emph{doAction()} without a prior \emph{beginTransaction()}. Then, one or more calls to \emph{doAction()} follow. When the caller intends to finish the transaction, it calls \emph{endTransaction()}. However, the transaction may also be terminated by the action itself in response to a \emph{doAction()} call.

Mathematically, an action transaction builds a totally ordered set of uncommitted messages $M_u$. The order relation is defined over the sequence in which messages are provided to \emph{doAction()}. Whenever a commit is attempted, the full set $M_u$ is committed and may either succeed completely or not at all (in the sense of the relaxed integrity condition described above).

A commit is attempted when
\begin{enumerate}
\item the caller decides to call \emph{endTransaction()}
\item or earlier, if the action decides it needs to commit now (e.g. because its buffers are filling up).
\end{enumerate}

In the second case, the action may decide to commit all messages but the current one, or all of them including the current one (depending on action logic). So if the action decides to commit a transaction before the caller calls \emph{endTransaction()}, a set of committed messages $M_c$ is built and $M_u$ is modified. Let $n$ denote the $n$-th iterated \emph{doAction()} call and $m_n$ the current message of this call; then the sets are built as follows:

\begin{algorithm}
%\caption{}
\begin{algorithmic}
\IF{action commits $m_n$}
	\STATE $M_c = M_u \cup \{m_n\}$
	\STATE $M_u = \emptyset$
\ELSE
	\STATE $M_c = M_u$
	\STATE $M_u = \{ m_n\}$
\ENDIF
\end{algorithmic}
\end{algorithm}

In other words, if anything is committed early, it is always the full set $M_u$, with or without the current message. The caller needs to know which messages are already committed. As \emph{doAction()} finishes one transaction and starts a new one in a single call, we cannot use the action state to let the caller know this happened. So we use the above finding and simply convey back whether the transaction is still continuing, or whether it was committed including the current message or only up to the previous one. The caller must then act accordingly. Please note that when an error happens, the whole transaction must still be considered failed. As such, ``partial commit'' states need not be mixed with failure states.

Please note that the above method leaves a small potential issue unaddressed: if the action does an early commit of $M_u \setminus \{m_n\}$ and an error happens while adding $m_n$ to the new $M_u$ (like running out of resources), the action would need to convey back both the successful commit and the failure state. This is not possible with the current interface. We could use callbacks to provide such notification, but this complicates the code. So, if that situation arises, the action must temporarily buffer the error condition and convey it as part of either the next \emph{doAction()} call or during \emph{endTransaction()} processing. This can be done, for example, by advancing its internal state accordingly.
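The set bookkeeping described above can be written down compactly. The sketch below mirrors the algorithm: on an early commit inside \emph{doAction()}, either the current message $m_n$ is included in the committed set or it opens the next transaction. The function and variable names are hypothetical; the \emph{destination} list stands in for the real commit target.

\lstset{language=python}
\begin{lstlisting}
# Sketch of the M_u / M_c bookkeeping on an early commit.
def early_commit(M_u, m_n, commits_m_n, destination):
    if commits_m_n:
        M_c = M_u + [m_n]    # M_c = M_u united with {m_n}
        M_u = []             # nothing uncommitted remains
    else:
        M_c = M_u            # commit everything before m_n
        M_u = [m_n]          # m_n starts the next transaction
    destination.extend(M_c)  # commit succeeds completely or fails
    return M_u
\end{lstlisting}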
The state set for actions is defined as follows:
$$
S_A = \{ rdy, itx, comm, rtry, susp, died \}
$$

With the semantics of the various states being the following:

\begin{center}
\begin{tabular}{|l|l|} \hline
	State	& Semantics \\\hline
	rdy	& ready, waiting for transaction begin\\
	itx	& in transaction, accepts more data \\
	comm 	& transaction finished \\
	rtry	& action failed but may be able to recover \\
	susp	& action currently non-functional until timeout expires \\
	died	& unrecoverable error condition occurred, no longer usable \\\hline
\end{tabular}
\end{center}

In the associated state diagram in figure \ref{fig_action_states}, we do not include the \emph{died} state, because it is entered whenever a totally unrecoverable error occurs. This is a very exceptional incident (which most output plugins do not even support), so we have kept the diagram simple.

\begin{figure}
\begin{center}
\includegraphics[scale=0.5]{action_state.jpeg}
\end{center}
\caption{Action State Diagram}
\label{fig_action_states}
\end{figure}

\emph{Note well} that the state diagram describes the action state. It does \emph{not} describe the transaction state. While action and transaction state are closely related to each other, they are different entities.

The return code of \emph{doAction()} and \emph{endTransaction()} is used to convey the transaction state. As such, it is a function of the action's current state after processing the request. The mapping is as shown below:

\begin{center}
\begin{tabular}{|l|l|} \hline
	State	& Return Code (RS\_RET\_\ldots)\\\hline
	rdy	& OK \\
	itx	& COMMITTED (if there was an auto-commit without $m_n$)\\
	        & DEFER\_COMMIT (if there was no auto-commit)\\
	comm 	& internal state, not to be exposed to upper layer \\
	rtry	& SUSPENDED \emph{(new code needed)} \\
	susp	& SUSPENDED \\
	died	& DISABLED \\\hline
\end{tabular}
\end{center}

For the rest of this document, let's assume there is a function \emph{getReturnCode()} that implements this mapping.

It is important to think about how retries are handled. There is a user-configured per-action upper bound on the number of retries $C_r$ and a retry interval $C_i$. In \emph{rsyslog v3}, there is no concept of output transactions. As such, only single messages are processed. When a temporary action failure occurs, the action is re-tried $C_r$ times, with the action processing thread waiting $C_i$ seconds between attempts in a \emph{sleep()} operating system API call\footnote{a suitable API is used, not \emph{sleep()} itself}. If the action succeeds during the retry processing, everything continues as usual. If it does not succeed, two things happen:
\begin{itemize}
\item the message is flagged as ``action permanent failure'' (which may trigger backup processing)
\item the action is actually suspended for $C_i$ seconds
\end{itemize}
If a new message is then sent to the action and $C_i$ seconds have not yet elapsed, the action is flagged as having failed without being re-tried again\footnote{During the analysis for this paper, it was seen that actually $C_r$ retries are attempted in v3, but each of them will never actually re-try the action. This is a software bug, which does not cause any harm and thus will not be fixed in v3. The new implementation in v4 will obviously not inherit this problem}. This is done in an effort to reduce resource utilization and prevent the system from slowing down, e.g. by too many retries to a remote server that went offline.
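Before turning to the transactional v4 design, here is a minimal sketch of the \emph{getReturnCode()} mapping assumed above. It simply translates the action state (and, for the \emph{itx} state, whether an auto-commit without $m_n$ took place) into the return codes from the table; the code is illustrative only.

\lstset{language=python}
\begin{lstlisting}
# Illustrative state-to-return-code mapping (see table above).
def getReturnCode(state, auto_committed_without_current=False):
    if state == "rdy":
        return "RS_RET_OK"
    if state == "itx":
        if auto_committed_without_current:
            return "RS_RET_COMMITTED"
        return "RS_RET_DEFER_COMMIT"
    if state in ("rtry", "susp"):
        return "RS_RET_SUSPENDED"
    if state == "died":
        return "RS_RET_DISABLED"
    # "comm" is an internal state, never exposed to the upper layer
    raise ValueError("state must not be exposed: " + state)
\end{lstlisting}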
+ +With transactional output mode in \emph{rsyslog v4}, the logic above can no longer work. First of all, retrying single actions does not help, because all of the current transaction needs to be resubmitted. As such, the upper layers need to be notified of failure. Then, they need to resubmit the batch. In that design, the lower layer needs to return immediately after detecting the failure. Recovery handling is now to be done when the next transaction is started. However, we must make sure that we do not do excessive retries. So retry processing is only to be carried out if it was not tried less than $C_i$ seconds ago. + +The required functionality can be implemeted by a \emph{prepareAction} function that readies the action for processing if there is need to do so. That function is then called in all entry points before anything else is done. Then, actual processing is carried out and the resulting action state be used to generate the return code for the upper-layer caller. Find below a rough pseudocode to do so: + +\lstset{language=python} +\begin{lstlisting} +def prepareAction(): + if state == rtry: + try recovery (adjust state accordingly) + if state == rdy: + beginTransaction() [output plugin] + +def processMessage(message): + prepareAction() + if state == itx + doAction(message) [output plugin] + return getReturnCode() + +def doEndTransaction(): + prepareAction() + if state == itx + endTransaction(); [output plugin] + return getReturnCode() +\end{lstlisting} + +\subsection{Output Subsystem Layers} +The rsyslog engine is organized in layers, where each layer is represented by the dominating object: + +\begin{figure} +\includegraphics[scale=0.75]{rsyslog_output_layers.jpeg} +\label{rsyslog output layers} +\end{figure} + +If looking at the data flow, a queue dequeues batches of messages, which are than run through a generic action system and put into output plugins. Note that on the batch layer, only batches are supported as units of work, whereas the action layer is message-oriented but supports transactions of multiple messages. This is done by indicating when a transaction necessarily needs to end (that point being the end of batch from the batch layer). + +The plugins can be written by third parties and are roughly comparable to minidrivers. The generic action system provides all complexity of action processing wheras the output plugin provides a limited set of callbacks that enable the generic framework to talk to the actual destination system. As such, writing outputs is a very simple task. However, rsyslog does not limit the creation of very complex outputs, which may be able to offer superior performance for some destinations. + +\subsection{Output Failure} +\subsubsection{Cases} +When an output action is called, it may encounter a failure condition. In general, there are two different cases: +\begin{enumerate} +\item action caused failures +\item message-content caused failures +\end{enumerate}. + +Failures rooted in the action are things like broken network connections, file systems run out of space or database servers that are down. Most importantly, the failure is not related to message content. As such, it is appropriate to retry the action with the same message until it finally succeeds (assuming that someone restores the system in question to proper operation). We can not expect that the problem is cleared just by discarding the current message and re-trying with the next one. + +In my view, action caused failures are the far majority of all failures. 
For rsyslog versions 3 and below, all rsyslog-provided plugins consider failures to be action-caused and thus potentially recoverable by simple retry, the only exception being fatal error conditions that render the whole action unusable.

David Lang pointed out that there may also exist error conditions that are not caused by the action (or the subsystem it talks to) itself, but rather by message data. He provided the following samples where message content can cause permanent issues with action execution:

\begin{itemize}
\item unicode text causing grief
\item dynafile hits a read-only file
\item basically data-driven things that trigger bugs in the message delivery
mechanism in some form.
\end{itemize}

As David Lang said: ``In an ideal world these would never happen, but for most output types I can think of some form of corrupt input that could cause that message to fail.''
So this class of failure conditions actually exists. No matter how often the action retry mechanism is called, it will never succeed (one may argue that the read-only dynafile is fixable, but we could replace that sample with an invalidly generated filename). The proper cure for these cases is to find the offending message and discard it.

In conclusion, actions need to return different error states for these two different types of failures. Traditionally, RS\_RET\_SUSPENDED is returned when an action-specific failure is hit. Most existing plugins also do this if a message-related failure occurred, simply because they did not yet know that this situation exists. However, plugins also return other error codes, and at least these can be treated as meaning message-permanent failures. To support this, a change to plugins is still required, because many simply return SUSPENDED state if anything went wrong (replacing the real error condition with SUSPENDED). A dedicated PROBABLE\_INVALID\_MSG return state is probably useful so that an output plugin can convey back that it considers the message to be bad. On the other hand, this implies that the plugin must try to detect such cases, which means that the developer must think about all potential message-caused problems. That approach can be considered unreliable, and as such it may be better not to provide such a dedicated state.

\subsubsection{Handling of Failures}
The two different failure cases require different handling. The action-based failure cases can and must be handled on the action level. As transactions abort when a failure occurs, support from the upper ``batch layer'' is necessary in order to handle resending batches of messages.

For message-caused failure cases, the offending message must be found and then discarded. A complexity here is that while a failure-causing message is being searched for, an action-based failure might occur. In that case, the action-based failure condition must first be resolved before the search for the problem message can continue.

One approach might be that when the action layer conveys back an action-caused failure (SUSPENDED), the batch layer knows that it simply needs to restart the full transaction (but not start an ``invalid message search''). If a message-based error condition is conveyed back, the batch system cannot restart the full batch. Instead, it needs to enter search mode, where it creates partitions of the original batch and calls itself recursively (at least in theory) on each of the subsets.
+ +Then, the same handling applies until either a failing message has been found or all messages have been successfully processed. Note that in the recursive step, action-based failures are recovered by full batch resubmits. This solves the above-mentioned complexity in a consistent way. + +If a binary-search-like method is used to detect failing records\footnote{This was originally suggested by David Lang.}, recursion may not really be an issue, as the recursion depth is limited to $\log_2 |B|$ where $B$ is the message batch. + +A message-caused failure can be rooted in one or more messages. One important question is if it is expected that the failure is caused by a single or multiple messages. Both is possible, so it is a question of probability. If we assume that it is more probable that a single messages causes the problems, it is useful to immediately return back to full batch submission of transactions once a problem-causing message has been identified. But then, if there are multiple problem-causing messages inside the batch, we may need many more iterations. + +If, on the other hand, we assume that it is more probable that multiple messages cause problems, it may make sense to keep resubmitting only subsets of the batch. However, then the performance is suboptimal if actually only one message was problematic. A solution might be to pick a compromise, e.g. first assume that a single message is problematic, but assume the opposite as soon as a second message with problems has been found. + +A potential algorithm for processing $n \le |B|$ messages from batch $B$ is described below. In the pseudocode, a ``processable'' message is one that neither is already committed nor had a permanent failure with this action. The term ``mpf'' means ``message permanent failure'' for this action (this will later be described in a batch state set). + +\begin{small} +\lstset{language=python} +\begin{lstlisting} +def submitBatch(B, n): + foreach processable message in + (first [at most] n messages of batch): + call processMessage + if action-caused failure: + retry full batch + if action-caused permanent failure: + mark all n messages as mpf + return + if auto-commit: + mark commited messages in batch as committed + if message-caused failure: + if n == 1: + mark message as mpf + return + else: + call submitBatch(B, n/2) + call submitBatch(B, n/2) +\end{lstlisting} +\end{small} + +After submitBatch() has completed, all messages are either committed or in mpf state. + +Note that an action-caused permanent failure occurs if an action-caused failure can not be resolved with the operator-configured number of retries. It will never occur if the user configured infinite retries. While an action is suspended, all calls will result in an action-caused permanent failure. Please keep in mind that these will be resubmitted to any backup actions inside the action unit, so the action's ability to cause permanent failure states is vital for a number of use cases (backup syslog server, to name just one). 
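The divide-and-conquer behaviour of \emph{submitBatch()} can be illustrated with a small, self-contained simulation. It is deliberately simplified (no action-caused failures, no partial commits) and only shows how a single problematic message inside a batch of $n$ messages is isolated after at most $\log_2 n$ splitting levels.

\lstset{language=python}
\begin{lstlisting}
# Simplified simulation of the bisection search for a
# message-caused failure; not actual rsyslog code.
def submitBatch(msgs, is_bad, mpf):
    if not any(is_bad(m) for m in msgs):
        return                    # whole slice committed
    if len(msgs) == 1:
        mpf.add(msgs[0])          # offender found, mark as mpf
        return
    half = len(msgs) // 2
    submitBatch(msgs[:half], is_bad, mpf)
    submitBatch(msgs[half:], is_bad, mpf)

mpf = set()
submitBatch(list(range(8)), lambda m: m == 5, mpf)
assert mpf == {5}                 # only message 5 is marked as mpf
\end{lstlisting}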
Batch processing inside an action unit can thus follow this structure:

\begin{algorithm}
\caption{processBatch(B)}
\begin{algorithmic}
\FORALL{action $a$ in action unit}
	\IF{execute action only on messages that failed before}
		\STATE $n = |\text{messages in batch in mpf state}|$
		\STATE change mpf state back to ready
	\ELSE
		\STATE $n = |B \setminus \text{msgs with state discard}|$
		\STATE change all message states $\ne$ discard to ready
	\ENDIF
	\IF{$n >0$ }
		\STATE call submitBatch(B, n) for action $a$
	\ENDIF
\ENDFOR
\end{algorithmic}
\end{algorithm}

\paragraph{Why is it Important to Differentiate the Failure Cases?}
This text originates from the mailing list and must be merged in. I provide it in the form it is in, so that it will not be forgotten (plus, it conveys the information).

One may think that it is not necessary to differentiate between action-caused and message-caused failures. However, not doing so introduces subtle issues, because then you either

A) do not need the batch logic at all (because the action is configured for
infinite retries)

Or

B) lose many messages if the action is not configured for infinite
retries and you have a longer-duration outage, e.g. on a database server.
Let's say it is offline for a couple of hours; then you lose almost
everything in that period.

To prevent this, you need two different retry methods.

One may argue that it is hard to differentiate between the two failure cases. This is correct. But I think it mostly depends on the quality of the output module.

First of all, ``mostly'' implies that there may be some other cases, where it
really is impossible to differentiate between the two. In that case, I would
treat the issue as an action-caused failure. There are two reasons for this:

1) rsyslog v3 currently always does this and not even a single person has
complained about it so far. This is an empirical argument, and it does not
prove that no problems were caused. But it carries the connotation that this
seems not to be too bad.

2) If we treated it as a message-caused failure, we would no longer be able
to handle extended outages of destination systems, which I consider a vitally
important feature.

When weighing the two, I know of lots of people who rely on 2), in sharp
contrast to knowing no one who has problems with 1). So my conclusion is that it is
less problematic to define an otherwise unclassifiable failure cause to be
action-caused. Even more so as I assume this problem only exists in a
minority of cases.

Now back to the quality of the output module: thinking about databases, their
API is usually very good at conveying back whether there was a SQL error or a
connection abort. So while a SQL error may also be an indication of a
configuration problem, I would strongly tend to treat it as being
message-caused. This is under the assumption that any reasonably responsive
admin will hopefully test his configuration at least once before turning it
into production. And config-related SQL errors should manifest immediately, so I
expect these to be fixed before a configuration runs in production. So it is
the duty of the output module to interpret the return code it received from
the API call and decide whether the failure is more likely action-caused or
message-caused. For database outputs, I would assume that it is always easy
to classify failures that must be action-caused, especially in the
dominating cases of failed network connections or failed servers.
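The kind of classification just described for database outputs might, inside an output plugin, look roughly like the sketch below. The exception types and the dedicated message-failure code are hypothetical (whether such a dedicated code is introduced at all is the open question discussed earlier); when in doubt, the failure is classified as action-caused, as argued above.

\lstset{language=python}
\begin{lstlisting}
# Hypothetical failure classification inside an output plugin.
def classify_failure(exc):
    if isinstance(exc, (ConnectionError, TimeoutError)):
        # destination unreachable: action-caused, retry may help
        return "RS_RET_SUSPENDED"
    if isinstance(exc, ValueError):
        # data the destination rejects: likely message-caused
        return "RS_RET_PROBABLE_INVALID_MSG"
    # undecidable: treat as action-caused
    return "RS_RET_SUSPENDED"
\end{lstlisting}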
+ +For other outputs it may not be as easy. But, for example, all stream network +outputs can detect a broken connection, so this also is a sure fit. + +For dynafiles, it really depends on how hard the output module is tries to differentiate +between the two failure cases. But I think you can go great length here, too. +Especially if you do not only look at the create() return code, but, iff a +failure occurs, you do more API calls to find out the cause. + +So I think the remaining problem is small enough to cause not too much issues +(and if so, they are unavoidable in any case). In conclusion, the two failure states are not only necessary, but can sufficiently sure enough be detected. + +\subsection{Random Topics} +I have begun to gather material from the mailing list in this section, because I feel it may be useful for others as well. Right now, the information is well hidden in the mailing list archives and there may be value in combining it all in one place. + +Due to the nature of this material, there is no specific organization between the subchapters and also formatting and language doesn't deny its rooting in the mailing list. + +\subsection{Reliability of Message Dequeueing} +A batch is actually dequeued when it is taken off a queue. So if at that point we +have a system power failure (for whatever reason), the messages are lost. +While the rsyslog engine intends to be very reliable, it is not a complete +transactional system. A slight risk remains. For this, you need to understand +what happens when the batch is processed. I assume that we have no sudden, +untrappable process termination. Then, if a batch cannot be processed, it is +returned back to the top of queue. This is not yet implemented, but is how +single messages (which you can think of an abstraction of a batch in the +current code) are handled. If, for example, the engine shuts down, but an +action takes longer than the configured shutdown timeout, the action is +cancelled and the queue engine reclaims the unprocessed messages. They go +into a special area inside the .qi file and are placed on top of the queue +once the engine restarts. + +The only case where this not work is sudden process termination. I see two +cases: + +a) a fatal software bug +We cannot really address this. Even if the messages were remaining in the +queue until finally processed, a software bug (maybe an invalid pointer) may +affect the queue structures at large, possibly even at the risk of total loss +of all data inside that queue. So this is an inevitable risk. + +b) sudden power fail +... which can and should be mitigated at another level + +One may argue that there also is + +c) admin error +e.g, kill -9 rsyslogd +Here a fully transactional queue will probably help. + +However, I do not think that the risk involved justifies a far more complex +fully transactional implementation of the queue object. Some risk always +remains (what in the disaster case, even with a fully transactional queue?). + +And it is so complex to let the messages stay in queue because it is complex +to work with such messages and disk queues. It would also cost a lot of +performance, especially when done reliably (need to sync). We would then need +to touch each element at least four times, twice as much as currently. Also, +the hybrid disk/memory queues become very, very complex. There are more +complexities around this, I just wanted to tell the most obvious. 
+ +So, all in all, the idea is that messages are dequeued, processed and put +back to the queue (think: ungetc()) when something goes wrong. Reasonable +(but not more) effort is made to prevent message loss while the messages are +in unprocessed state outside of the queue. + +\paragraph{More reliable can actually be less reliable} +On the rsyslog mailing list, we had a discussion about how reliable rsyslog should be. It circles about a small potential window of message loss in the case of sudden fatal failure. Rsyslog can be configured to put all messages into a disk queue (instead of main memory), so these messages survive such a powerfail condition. However, messages dequeued and scheduled for processing during the power outage may be lost. + +I now consider a case where we have bursty UDP traffic and rsyslog is configured to use a disk-only queue (which obviously is much slower than an in-memory queue). Looking at processing speeds, the max burst rate is limited by using an ultra-reliable queue. To avoid using UDP messages, a second instance could be run that uses an in-memory queue and forwards received messages to the one in ultra-reliable mode (that is with the disk-only queue). So that second instance queues in memory until the (slower) reliable rsyslogd can now accept the message and put it into the reliable queue. Let's say that you have a burst of $r$ messages and that from these burst only $r/2$ can be enqueued (because the ultra reliable queue is so slow). So you lose $r/2$ messages. + +Now consider the case that you run rsyslog with just a reliable queue, one that is kept in memory but not able to cover the power failure scenario. Obviously, all messages in that queue are lost when power fails (or almost all to be precise). However, that system has a much broader bandwidth. So with it, there would never have been r messages inside the queue, because that system has a much higher sustained message rate (and thus the burst causes much less of trouble). Let's say the system is just twice as fast in this setup (I guess it usually would be *much* faster). Than, it would be able to process all r records. + +In that scenario, the ultra-reliable system loses $r/2$ messages, whereas the somewhat more "unreliable" system loses none - by virtue of being able to process messages as they arrive. + +Now extend that picture to messages residing inside the OS buffers or even those that are still queued in their sources because a stream transport blocked sending them. + +I know that each detail of this picture can be argued at length about. + +However, my opinion is that there is no "ultra-reliable" system in life, only various probabilities in losing messages. These probabilities often depend on each other, what makes calculating them very hard to impossible. Still, the probability of message loss in the system at large is just the product of the probabilities in each of its components. And reliability is just the inverse of that probability. + +This is where *I* conclude that it can make sense to permit a system to lose some messages under certain circumstances, if that influences the overall probability calculation towards the desired end result. In that sense, I tend to think that a fast, memory-queuing rsyslogd instance can be much more reliable compared to one that is configured as being ultra-reliable, where the rest of the system at large is badly influenced by this (the scenario above). 
However, I also know that for regulatory requirements, you often seem to need to prove that a system may not lose messages once it has received them, even at the cost of an overall increased probability of message loss.

My view of reliability is much the same as my view of security: there is no such thing as ``being totally secure'', you can just reduce the probability that something bad happens. The worst thing in security is someone who thinks he is ``totally secure'' and as such is no longer actively looking at potential issues.

I see the same for reliability. There is no such thing as ``being totally reliable'' and it is a really bad idea to think you could ever be. Knowing this, one may begin to think about how to decrease the overall probability of message loss AND think about what rate of loss is acceptable (and what to do with these cases, e.g. ``how can they hurt'').

\paragraph{Different Use Cases}
As David Lang pointed out, there exist different use cases for different levels of reliability. Most importantly, there exist use cases that do not demand very high throughput but rather ultra-reliability of the queue system. Here, ultra-reliability is just another word for the queue being ``audit-grade''. Even if the queue is audit-grade, the overall system is only audit-grade when all other components - most notably the transport protocols spoken by the inputs and outputs - are also audit-grade. Most importantly, this means that an audit-grade system purely based on the IETF syslog protocol series cannot be built.

Used together with truly reliable protocols \emph{and} senders that block processing until a final acknowledgement has been received, an audit-grade system can potentially be built based on rsyslog. To do so, an audit-grade queue subsystem is required, which is not present in releases less than 4.1.? (most importantly, v2 and v3 do not provide this capability).

\subsection{Audit-Grade Queue Operations}
\subsubsection{Prerequisites}
Audit-grade queue operations require certain prerequisites:
\begin{itemize}
\item rsyslog engine is of version 4.1.? or greater
\item disk-only queue type
\item checkpoint interval set to 1
\item queue is configured not to permit losing any messages\footnote{The queue has several settings that can be used to fine-tune situations in which it may discard messages intentionally. All of these must be turned off. Most importantly, that means the producer is blocked for an infinite time if the queue is full.}
\item queue consumer must also be audit-grade
\end{itemize}
Only when these prerequisites are met can queue operation be considered audit-grade. Note that when message loss in case of sudden fatal failure and similar incidents is acceptable, neither disk-only queues nor a checkpoint interval of 1 is necessary. Such a configuration can also be built with rsyslog v3, which is up to that level.

Note that in the sections below we describe the implementation in broader terms. Most importantly, we do not restrict ourselves to disk-only queue storage drivers. This is important, because it simplifies the design and opens the capability to introduce new, possibly faster-performing, queue storage drivers in the future.

But it is important to keep in mind that a concrete queue is only audit-grade if it matches all the prerequisites given here, most importantly the right configuration.
+ +\subsubsection{Implementation Alternatives} +Messages, or more precisely objects\footnote{While rsyslog deals with messages, the queue is designed to handle any type of thing that is represented as an rsyslog object. This is considered useful as queues may at some time contain other things than just messages, so we keep it generic.}, are enqueued by the queue producer (either an input module or the main message queue's consumer). The enqueue operation is completed only when the message has been successfully accepted by the queue storage driver. Then and only then the producer is permitted to remove the object from its own storage system. A rough sketch is given in algorithm \ref{alg_q_enq}. + +\begin{algorithm} +\caption{enqueueObject($o$)} +\begin{algorithmic} +\label{alg_q_enq} +\STATE lock queue mutex +\WHILE{queue is not ready for enqueue} + \STATE wait on queue to become ready +\ENDWHILE +\STATE call queue store driver to add $o$ +\STATE unlock queue mutex +\end{algorithmic} +\end{algorithm} + +The dequeue-operation is more complex. We must ensure that each object stays in the queue until it is finally processed. Hereby, an object is finally processed, when processing of it has been completed. Remember that to enhance performance, objects are dequeued in batches of many. So at any given time, multiple messages may be processed, but not necessarily have finally completed doing so. If another worker thread then tries to obtain a new batch for processing, those ``in-process'' message must not be handed out a second time. Also, if a sudden fatal failure occurs during processing, queue operation must restart at the point of last commit. This means that all ``in-process'' messages need to be changed back to ``no processed'' state and be restarted again. In those cases the (acceptable) slight message duplication can occur. + +In our design, we differentiate between ``logical'' and ``physical'' dequeuing of batches. If a batch is generated for processing, it is logically dequeued --- in the sense that no other batch generating request will be able to receive another copy of these messages. If no exceptional situation happens, those messages will be processed and thus can be considered consumed under normal circumstances. + +However, actual deletion from the physical queue storage happens only after the batch is fully processed. At this point, all objects have been acknowledged by their destinations, which now have the responsibility for the object's survival. Consequently, we can delete them from the queue store. This process is considered the ``physical'' dequeue of the object. + +In order to find some simpler terms, we will call the logical dequeue operation just ``dequeue'' and the physical dequeue operation ``delete''. This is consistent with all previous work on rsyslog and thus probably leads to the least surprise when reading older source code and documentation. + +A first idea for a deletion is given in algorithm \ref{alg_pdeq_batch_1} (remember that $O(b)$ contains all objects within the given batch $b$, this is \emph{not} $O$-notation and should probably in the future be replaced by something else). 
+ +\begin{algorithm} +\caption{deleteBatch($b$), first approach} +\begin{algorithmic} +\label{alg_pdeq_batch_1} +\STATE lock queue mutex +\FORALL{$o \in O(b)$} + \STATE find $o$ in queue storage + \STATE remove $o$ and keep queue structures intact +\ENDFOR +\STATE unlock queue mutex +\end{algorithmic} +\end{algorithm} + +This algorithm is simple, but requires searching the queue store for the object to be deleted -- a potentially lengthy operation. However, we can improve the searching process if we know more about the inner structure of batch objects. It seems appropriate to dequeue objects in queue-sequential order. A drawback of doing so is that we must prevent other worker threads from trying to dequeue concurrently. This is not really a drawback. We need to guard dequeue operations by a mutex in any case, because otherwise internal structures can not be kept consistent. Practical experience and testing have shown that many small dequeue operations cause a lot of locking contention and as such badly affect performance. So it actually is a welcome enhancement to aquire the queue lock only once for the whole batch dequeue operation. As dequeing is a comperatively fast operation, the lock is not held for extended periods of time. + +A first approach to this functionality is shown in algorithm \ref{alg_ldeq_batch_1}. Note that $C_{mBatch}$ is the configured maximum number of elements inside a batch, $i$ is an index to address the objects inside the batch. + +\begin{figure}[h] +\begin{center} +\includegraphics[scale=0.6]{rsyslog_queue_pointers.jpeg} +\end{center} +\caption{\textbf{Queue Store Pointers}: boxes represent queue entries, colored boxes entries with objects. Objects in green are unprocessed, in blue are dequeued but not deleted and those in gray have already been deleted. White indicates not yet used entries. Gray objects may be overwritten at any time. Their entries are actually free, we have used the gray color primarily to indicate there once existed objects. Each queue pointer points to the next entry to process.} +\label{fig_queue_ptr} +\end{figure} + +\begin{algorithm} +\caption{dequeueBatch($b$)} +\begin{algorithmic} +\label{alg_ldeq_batch_1} +\STATE lock queue mutex +\STATE $0 \to i$ +\WHILE{queue non-empty and $i < C_{mBatch}$} + \STATE obtain next obj $o$ from queue store + \STATE advance logical dequeue position + \STATE put $o$ into batch +\ENDWHILE +\STATE unlock queue mutex +\end{algorithmic} +\end{algorithm} + +A key concept is somewhat hidden in \marginpar{queue pointers} \emph{advance logical dequeue position}. Each queue store is purely sequential, with objects being enqueued at one ``end'' of the store and dequeued at the other. Of course, each queue store has only finite capacity, but we ignore this to explain the overall picture. A queue can be implemented by two pointers: one that points to the tail of the queue, where new messages are enqueued and one that points to the head of it, where new messages are dequeued. The idea is now to duplicate the dequeue pointer and split it into one for (logical) dequeue and one for deletion. Figure \ref{fig_queue_ptr} shows this three-pointer approach. Now, we can simple advance either the dequeue or deletion pointer, depending on operation, and do not need to find the first dequeue position inside the queue store. The dequeue pointer always points at it. 
This mode can be implemented with all currently existing queue storage drivers (but the sequential disk driver may need to use a second file handle or stream object instead of two pointers). + +This makes an efficient implementation of algorithm \ref{alg_ldeq_batch_1} possible: when it logically dequeues, it just needs to advance the dequeue pointer. So the algorithm executes in $O(n)$ time where $n$ specifies the number of elements to dequeue with an upper bound of $C_{mBatch}$. + +\begin{figure}[h] +\begin{center} +\includegraphics[scale=0.6]{rsyslog_queue_pointers2.jpeg} +\end{center} +\caption{\textbf{Physically Dequeueing Messages}: In this sample, we have two batches. With multiple workers, they may be deleted in any order.} +\label{fig_queue_ptr_deq} +\end{figure} + +Furthermore, we can also improve algorithm \ref{alg_pdeq_batch_1}: Consider that each batch is logically dequeued as an atomic operation. That means all batch objects form a sequential subset of the queue. Figure \ref{fig_queue_ptr_deq} shows the situation when two batches have been dequeued. So the costly ``find'' operation now needs to be carried out only once at the beginning of the batch. As all other objects are sequential, once we have found the batch begin inside the queue, we can simply delete the $|b|$ elements in queue-sequential order after it. So the cost of the find operation can be reduced from $O(|b|)$ to $O(1)$. + +We can even reduce the remaining cost of the find operation. If the batch to be deleted is right at the queue's head (as is ``B1'' in the figure), the ``find'' immediately terminates with the first element and incurs no cost at all. The situation is different if the batch is not at the queue head, ``B2'' is an example for that (assuming that ``B1'' has not yet been dequeued). We would now still need to search over the objects that are not part of the batch and can then finally get to the object at the head of the batch in question. For queue storage drivers that support random access to queue elements, storing a simple pointer to the batches' queue head element further improves the situation and enables $O(1)$ access to the queue element. This is indicated by the dotted lines in figure \ref{fig_queue_ptr_deq}. Once the head of the queue has been found, two things can happen (depending on the capabilities of the queue storage driver): + +\begin{enumerate} +\item the head element can be flagged as ``this and next $n$ elements are deleted'' +\item all elements are actually deleted +\end{enumerate} + +Note that a mixed form is also possible (and probably useful for our \emph{singly} linked list storage driver: there, some $n'$ elements be actually deleted and the head element is flagged as ``this and next $n - n'$ elements are deleted''. Note that in the linked-list case, all but the first elements can be deleted with ease\footnote{It can be considered to change from a singly-linked list to a doubly-linked list, if the benefit outweighs the extra effort required.}, so probably just the head would stay inside the queue. Note that removing elements off the queue, where possible, is useful because it frees resources. On a busy system, freeing messages as soon as possible can prevent message loss (in non-audit-grade setup) or system slowdown. So it should be done when possible. + +If we have a purely sequential queue storage driver (currently the sequential disk driver), finding and updating the head element is not an option. 
Even in this case, we can observe that the batch at the actual deletion pointer will eventually be submitted for deletion. So a route to take is to create a list of elements that can be deleted as soon as the physical dequeue pointer reaches any of these elements. We call this the \marginpar{to-delete list}``to-delete list''. To facilitate processing, this list must be ordered in sequence of dequeuing. This information may not be available from the storage subsystem itself, but it can easily be generated. To do so, a strictly monotonically increasing counter is kept with each logical dequeue operation and stored as part of the batch.\footnote{As this must be done via the usual computer-implemented modular arithmetic, we must be careful that we do not see repetition of values because of overflows. Each day has $60 \cdot 60 \cdot 24 = 86,400$ seconds (ignoring the subtleties of UTC). Now let's assume that we have a moderately busy system with 1,000 messages per second. We further assume, to be on the safe side, that each message is processed inside its own batch. So we have $86,400,000$ batches per day. If we now use a typical $32$-bit integer for generating the batch IDs, the unique range will be used up after
$$\frac{2^{32}}{86400000} \approx 50 \text{ days}$$
of uninterrupted rsyslog operation. That is hardly sufficient, and it goes down to approximately one day if messages are submitted at a rate of 50,000 messages per second (which is high, but not unheard of). So it is strongly advised to use 64 bits, which we consider safe, because for our 1,000 messages per second the range would be exhausted only after
$$\frac{2^{64}}{86400000} \approx 2.135 \cdot 10^{11} \text{ days}$$
which equals approximately $584,500,000$ \emph{years}. So even at a rate of one million messages per second, the range would be sufficient for over 500,000 years of continuous operation -- that should be far sufficient.}
An example: let us assume that ``B2'' was submitted for deletion first. Then, the head of ``B2'' is not at the queue's delete pointer. As such, no action can be carried out immediately. So the batch head pointer is stored into a ``to be deleted'' list. Processing continues. Some time later, batch ``B1'' is submitted for deletion. Now, the head of ``B1'' is at the queue's delete pointer, and as such all of its batch elements are deleted. Then, the ``to be deleted'' list is checked, and ``B2'' is found in it. Now, ``B2'' is at the head of the (new) deletion pointer and can also be removed. So, ultimately, all messages are physically dequeued. This is more formally described in algorithm \ref{alg_phys_deq_seq_store}. In that pseudocode, we made a simplification by always putting the to-be-deleted batch into the ``to-delete'' list, which then enables us to use somewhat more generic code to carry out the work.

Note that there is a price to pay for deletions via the ``to-delete'' list: if a sudden fatal failure happens during processing, the set of duplicate messages is increased. For example, if a fatal failure happens after ``B2'' has been fully processed and scheduled for deletion, but \emph{before ``B1'' is also submitted for deletion}, ``B2'' will be reprocessed after recovery. This would not happen if ``B2'' had been removed from the queue immediately.
\begin{algorithm}
\caption{deleteBatch($b$)}
\begin{algorithmic}
\label{alg_phys_deq_seq_store}
\REQUIRE queue mutex is locked by caller
\STATE enqueue $b.head, |b|$ in ``to-delete'' list $D$
\COMMENT ``to-delete'' list must be in order of logical dequeue
\WHILE{$D.head = Q.deletePtr$}
	\FOR{$|b|$ elements}
		\STATE delete element at queue head
		\STATE move $q.deletePtr$
	\ENDFOR
	\STATE remove head of ``to-delete'' list
\ENDWHILE
\end{algorithmic}
\end{algorithm}

\paragraph{Wrap-Up of Queue Delete Operations}
When evaluating which route to take, the ``to-delete'' list approach looks elegant for all cases. The negative side effect of potentially increased message duplication currently does not even exist: today, the sequential disk queue storage driver permits only a single worker thread, and thus there will always be only one batch outstanding at a time. Even if we remove that limitation, message duplication could not be avoided, as stated in the algorithm description above. What remains are the other queue storage drivers. However, they operate in-memory, so message duplication will not happen simply because all messages will be lost on sudden fatal failure. The advantage of limited message duplication only exists in the so-far hypothetical case of a random-access, audit-grade disk queue storage driver. Thus, the decision could be postponed until that case materializes (if it ever does).

From a code complexity point of view, the ``to-delete'' list approach is definitely advantageous, not only because of the reduced number of algorithms required: we also do not need to maintain unique batch IDs and all the logic associated with them.

The other aspect to look at is memory consumption. Assuming that we delete the actual objects, just not their containers inside the queue, the extra memory consumption is not really that bad. More importantly, currently only the linked-list queue storage driver could benefit at all, because it is the only driver capable of deleting queue entries in mid-queue. All others, including the array memory driver, do not have this capability.

From a performance point of view, the ``to-delete'' list approach looks approximately as good as the others, with some storage drivers performing mildly better under a non-``to-delete''-list approach. This can be mitigated, especially if the potentially somewhat costly maintenance of the ``to-delete'' list is slightly optimized and the algorithm actually checks whether the to-be-deleted batch is right at the queue's delete pointer position. The improved code simplicity, together with current CPUs' code caching, may even result in an otherwise unexpected speedup.

In conclusion, we will implement the ``to-delete'' list approach on the queue layer (above the queue storage drivers). However, we will leave the window open to permit overriding it with queue storage driver specific functionality. How to do this will not be specified now, as there is currently no need and we do not even know if there ever will be. However, we retain the discussion of the various modes as well as the relevant algorithmic discussions and data structures inside this paper, so that it is readily available should the need arise. We also think this is important so that everybody later knows that the decision was made based on good arguments and not by accident (we consider this useful in another design enhancement attempt).
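A compact sketch of the chosen approach, maintained above the queue storage drivers, is shown below. It mirrors algorithm \ref{alg_phys_deq_seq_store}: a batch is always appended to the ``to-delete'' list, and physical deletion proceeds only while the list head is the batch sitting right at the queue's delete pointer. The queue and batch objects are assumed to provide \emph{head}, \emph{size}, \emph{delete\_ptr} and \emph{delete\_head\_element()}; these names are hypothetical.

\lstset{language=python}
\begin{lstlisting}
# Illustrative "to-delete" list kept above the storage drivers.
class ToDeleteList:
    def __init__(self):
        self.pending = []    # FIFO, in logical dequeue order

    def delete_batch(self, queue, batch):
        self.pending.append(batch)
        # delete as long as the head batch sits right at the
        # queue's delete pointer
        while self.pending and self.pending[0].head == queue.delete_ptr:
            head_batch = self.pending.pop(0)
            for _ in range(head_batch.size):
                queue.delete_head_element()  # advances delete_ptr
\end{lstlisting}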
+ +\paragraph{Processing Sequence} Looking at the processing sequence, we notice that always objects are dequeued, then processed and then deleted. Then, the whole process starts again. In particular, this meanss that after the previous batch has been deleted, the next batch will be dequeued. Now consider that we need to have exclusive access to the queue for both of these operations. As such it seems natural to combine this into a single step, further reducing potential locking contention. + +Note that a side-effect of this approach is that messages can be deleted only when a new batch is dequeued. With current design, this means that at least one message must reside inside the queue. Otherwise, the last batch will not be deleted. However, this something that can (and must!) be solved on the queue worker layer, in that it deletes a batch when the queue is empty. + +This leads us to the implementation of dequeueBatch() and deleteBatch() shown in algorithms \ref{alg_deq_batch_final} and \ref{alg_del_batch_final}. Note that $l$ is a flag variable that indicates if the queue is already locked. + +\begin{algorithm} +\caption{dequeueBatch($b$): final version} +\begin{algorithmic} +\label{alg_deq_batch_final} +\STATE lock queue mutex +\STATE call deleteBatch(b, 1) +\STATE $0 \to i$ +\WHILE{queue non-empty and $i < C_{mBatch}$} + \STATE obtain next obj $o$ from queue store + \STATE advance dequeue position + \STATE put $o$ into batch +\ENDWHILE +\STATE commit queue changes to storage system (if needed, e.g. fsync()) +\STATE unlock queue mutex +\end{algorithmic} +\end{algorithm} + + +\begin{algorithm} +\caption{deleteBatch($b, l$): final version} +\begin{algorithmic} +\label{alg_del_batch_final} +\IF{queue not yet locked (test via $l$)} + \STATE lock queue mutex +\ENDIF +\FORALL{objects $o$ in $b$} + \STATE destruct $o$ +\ENDFOR +\STATE enqueue $b.head, |b|$ in ``to-delete'' list $D$ +\COMMENT ``to-delete'' list must be in order of logical dequeue +\WHILE{$D.head = Q.deletePtr$} + \FOR{$|b|$ elements} + \STATE delete element at queue head + \STATE move $q.deletePtr$ + \ENDFOR + \STATE remove head of ``to-delete'' list +\ENDWHILE +\STATE commit queue changes to storage system (if needed, e.g. fsync()) +\IF{queue not yet locked (test via $l$)} + \STATE unlock queue mutex +\ENDIF +\end{algorithmic} +\end{algorithm} + +\subsubsection{Queue Stores} +Currently, rsyslog supports three different types of queue store drivers: + +\begin{itemize} +\item memory array +\item memory linked list +\item disk sequential file +\end{itemize} + +They all provide an abstracted sequential queue store as shown in figure \ref{fig_queue_ptr} on page \pageref{fig_queue_ptr}. + +Obviously, some differences exist. Most importantly, the disk sequential file driver does \emph{not} support more than one queue worker thread (in order to prevent excessive disk activity and the subtle issues with rewriting parts of sequential files). 
So if this driver is used, the queue automatically limits itself to a maximum of one worker thread (even if the user configuration settings specify more).

Different queue store drivers have different properties:

\begin{tabular}{|l||l|l|l|}\hline
	         & array & linked list & sequential file \\ \hline
pointer type & integer index & memory address & file number and \\
             &               &                & offset within file \\ \hline
physical access & random & random & sequential \\ \hline
remove middle & no & yes & no \\
elements      &    &     &    \\ \hline
access to $n$-th& $O(1)$, index:& $O(n)$, follow & not supported \\
element      & $n \mod C_{mMsg}$ & pointer links & \\ \hline
speed        & fastest & fast & slow \\\hline
mem overhead & large & some & almost none \\\hline
reliability  & reliable & reliable & audit-grade\footnote{if configured correctly}\\
\hline
\end{tabular}

\subsubsection{Implementation}
The actual implementation will be based on algorithms \ref{alg_deq_batch_final} and \ref{alg_del_batch_final}. The rsyslog v3 queue storage driver will be extended by one additional method, which permits non-destructive dequeuing of elements. As such, the driver now has the $qAdd()$, $qDeq()$, and $qDel()$ entry points (together with the usual construction and destruction entry points). The queue drivers must support the three pointers for enqueue, dequeue and delete. The ``to-delete'' list will be maintained on the upper queue layer (and not the queue driver layer). This functionality will be optimized so that if a batch to delete is right at the queue's delete pointer, it will immediately be deleted and not be sent to the ``to-delete'' list. This is especially important with the sequential disk driver, as the condition is always true there (and thus the driver can pretend this in the relevant API without even comparing any pointers, which would otherwise be quite complicated in this driver).

The full queue store driver interface is:

\paragraph{qConstruct} Initializes the queue store.

\paragraph{qDestruct} Destructs the queue store, including all messages that may still be present in it.

\paragraph{qAdd} Enqueues a new object into the queue. Note that this entry point must only be called when the queue is non-full.

\paragraph{qDeq} Non-destructive dequeue of the object at the queue head. The dequeue pointer is advanced.

\paragraph{qDel} Deletes the object at the queue head. The delete pointer is advanced.

Disk queue store drivers may support additional internal functions. However, these should not be exposed to the rest of the queue subsystem.

\begin{figure}
\begin{center}
\includegraphics[scale=0.4]{queue_msg_state.jpeg}
\end{center}
\caption{Logical Message States during Queue Processing}
\label{fig_queue_msg_state}
\end{figure}

Figure \ref{fig_queue_msg_state} shows a logical message state diagram during queue processing. There is no actual state variable; rather, the processing flow implies these states. Note that the state transition from ``dequeued'' to ``queued'' only happens after a fatal failure and a successful system recovery, so this is a rather exceptional case.

Another subtle issue is that we now need two different queue size counters: one for seeing when the queue is physically full and one for detecting when there are no more messages to be dequeued.

As a simplification, support for ungetting objects can be removed (as objects never leave the queue), which also means that cancel processing is probably less complex.
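To make the extended driver interface more tangible, the following sketch shows a minimal in-memory store with the three pointers and the $qAdd()$/$qDeq()$/$qDel()$ entry points. It is a simplified illustration, not the actual rsyslog driver code.

\lstset{language=python}
\begin{lstlisting}
# Minimal illustrative in-memory queue store with separate
# enqueue (tail), dequeue and delete positions.
class ListQueueStore:
    def __init__(self):
        self.items = []       # qAdd() appends at the tail
        self.deq_pos = 0      # logical dequeue pointer
        self.del_pos = 0      # physical delete pointer

    def qAdd(self, obj):
        # caller must ensure the queue is non-full
        self.items.append(obj)

    def qDeq(self):
        # non-destructive dequeue of the object at the queue head
        obj = self.items[self.deq_pos]
        self.deq_pos += 1
        return obj

    def qDel(self):
        # delete the object at the queue head
        self.items[self.del_pos] = None   # free the entry
        self.del_pos += 1
\end{lstlisting}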
+ +\paragraph{Sequential Disk Queue Store Driver} +The enequeue, deqeueue and delete pointers must be implemented via three stream objects. Most importantly, the dequeue stream must be configured not to delete files when it closes them. A side-effect of this implementation is that data is actually read twice, once to actually obtain it and a second time to delete it. This could only be avoided by an overall redesign on how the disk queue works. + +\subsubsection{Checkmarks} +The following things need to be verified in the actual implementation. + +\paragraph{Queue Full} +Is it possible to set an infinte timeout on queue full condition during enqueue? If not, we must provide it. + +\paragraph{Termination the Queue} +If we cancel a worker, we need to start from the physical dequeue pointer and pull everything that is not scheduled for deletion - NOT from the logical dequeue pointer. + +\paragraph{Failed Messages} +If a message fails on a detached action queue, no backup processing is available (because we detect the failure at a point where the message is already considered processed from the main queue's point of view. We need address this and have two options: + + +I see two approaches at handling this: + +a) we enable an action to configure a backup file that shall receive all +message permanent failures. This is simple (not only to implement but to +configure and understand) + +b) we push the failed message back to the main queue, but with an indication +that it failed in an action. This is harder to implement and most importantly +harder to understand/configure, but more flexible + +\section{Future Development} +This section covers topics that can not currently be developed, but where important thoughts came up in discussions. For obvious reasons, the section has brainstorming character. + +\subsection{Audit-Grade High Performance Queue Storage Driver} +An audit grade driver must ensure that no message is lost, but should also be able to handle large workloads. The sequential disk driver does not support the later. + +An additional disk driver is envisioned with the properties like the linked list driver, but a reliable on-disk store. In particular, random access to queue elements is desired, which requires an addressing capability. + +A potential implementation requires a pre-formatted file. That file is organized in pages of $n$ bytes (e.g. 1K). The page index is used to address a queue item. If an item fits into 1K, it uses one page. If it is larger than 1K, consequtive pages are used to store the element. A page header must be present to indicate how many pages a single element is made up of. + +It may be noted that we could even improve performance by keeping part of the data in-memory. For audit-gradeness, it is required that upon enqueue the message is written to disk and only after final processing it needs to be removed. However, it is not forbidden to keep the same message in main memory. That way, the logical dequeue operation could be done one the in-memory representation. Only the physical dequeue would need to write to disk again. As such, we save one disk read out of three writes and one read otherwise required (so one can roughly say that we save one third of disk operations. + +Note that due to potential multi-pages messages we can not directly address individual elements, but we can reliably and quikly address elements whom's address we know (learned, for example, during logical dequeue). This is similar to the organization of the in-memory linked list. 
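+
+As a rough illustration of the proposed page addressing, the following C sketch shows how an element's page count and file offset could be derived. The 1K page size, the header fields and the function names are assumptions made for this paper, not an existing implementation.
+
+\begin{verbatim}
+#include <stdint.h>
+#include <sys/types.h>
+
+#define PAGE_SIZE 1024           /* the n bytes per page from above */
+
+typedef struct {
+    uint32_t nPages;             /* consecutive pages used by this element */
+    uint32_t lenData;            /* payload length in bytes */
+} page_hdr_t;
+
+/* number of pages needed for lenData bytes plus the page header */
+static uint32_t pagesNeeded(uint32_t lenData)
+{
+    uint32_t total = lenData + sizeof(page_hdr_t);
+    return (total + PAGE_SIZE - 1) / PAGE_SIZE;   /* round up */
+}
+
+/* an element is addressed by its first page index; with the file
+ * layout sketched below, the byte offset is simply index * page size */
+static off_t pageOffset(uint32_t pageIdx)
+{
+    return (off_t) pageIdx * PAGE_SIZE;
+}
+\end{verbatim}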
+
+To further improve speed, object representation could be zipped before being written to a page.
+
+\paragraph{File Layout}
+\begin{itemize}
+\item Page 0: control structures (most importantly the queue pointers). It can make sense to store
+this page in a separate file, which could be moved to a dedicated disk subsystem -- this can
+potentially greatly reduce disk seek times.
+\item Pages 1 to $n$: actual object storage
+\end{itemize}
+
+Algorithms \ref{alg_AuditGradeStoreEnqueue} and \ref{alg_AuditGradeStoreDelete} show how records are enqueued and deleted. Note that the delete part does not even need to read back the record. If we keep at least some records in-memory, the performance cost of ultra-reliable mode can actually be comparatively low. Note that we may not even really need to commit data to the storage system in ``AuditGradeStoreDelete()'', because if a fatal failure occurs at this point, at worst message duplication may happen, which we have considered to be acceptable.
+
+\begin{algorithm}
+\caption{AuditGradeStoreEnqueue($o$)}
+\begin{algorithmic}
+\label{alg_AuditGradeStoreEnqueue}
+\REQUIRE queue mutex is locked by caller
+\STATE write $o$ to current enqueue location
+\STATE update \& write queue structures [page 0]
+\STATE sync all files touched
+\STATE store $o$ in an in-memory structure (or a cache)
+\end{algorithmic}
+\end{algorithm}
+
+\begin{algorithm}
+\caption{AuditGradeStoreDelete($o$)}
+\begin{algorithmic}
+\label{alg_AuditGradeStoreDelete}
+\REQUIRE queue mutex is locked by caller
+\STATE update queue dequeue pointer \& write queue structures [page 0]
+\STATE sync all files touched
+\end{algorithmic}
+\end{algorithm}
+
+
+\end{document}
diff --git a/doc/dev_oplugins.html b/doc/dev_oplugins.html
index cc2f7f38..63c186a3 100644
--- a/doc/dev_oplugins.html
+++ b/doc/dev_oplugins.html
@@ -144,19 +144,172 @@ array-passing capability not blindly be used.</b> In such cases, we can not guar
plugin from segfaulting and if the plugin (as currently always) is run within rsyslog's
process space, that results in a segfault for rsyslog. So do not do this.
<h3>Batching of Messages</h3>
-<p>With the current plugin interface, each message is passed via a separate call to the plugin.
-This is annoying and costs performance in some uses cases (primarily for database outputs).
-However, that's the way it (currently) is, no easy way around it. There are some ideas
-to implement batching capabilities inside the rsyslog core, but without that the only
-resort is to do it inside your plugin yourself. You are not prohibited from doing so.
-There are some consequences, though: most importantly, the rsyslog core is no longer
-intersted in messages that it passed to a plugin. As such, it will not try to make sure
-the message is not lost before it was ultimately processed (because rsyslog, due to
-doAction() returning successfully, thinks the message *was* ultimately processed).
-<p>When the rsyslog core receives batching capabilities, this will be implemented in
-a way that is fully compatible to the existing plugin interface. While we have not yet
-thought about the implementation, that will probably mean that some new interfaces
-or options be used to turn on batching capabilities.
+<p>Starting with rsyslog 4.3.x, batching of output messages is supported. Previously, only
+a single-message interface was supported.
+<p>With the <b>single message</b> plugin interface, each message is passed via a separate call to the plugin.
+Most importantly, the rsyslog engine assumes that each call to the plugin is a complete transaction
+and as such assumes that messages are properly committed after the plugin returns to the engine.
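+<p>For orientation, here is a minimal sketch of a single-message <code>doAction()</code> entry point,
+written with the usual module-template macros; the <code>sendToMyDestination()</code> helper is a
+placeholder for whatever the plugin actually does with the message, not part of rsyslog:
+<p><pre><code>
+BEGINdoAction
+CODESTARTdoAction
+	/* ppString[0] holds the message as rendered by the action's template */
+	if(sendToMyDestination(ppString[0]) != 0)
+		iRet = RS_RET_SUSPENDED;  /* ask the core to retry later */
+	/* returning RS_RET_OK means: this message is fully processed */
+ENDdoAction
+</code></pre>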
+<p>With the <b>batching</b> interface, rsyslog employs something along the lines of
+"transactions". Obviously, the rsyslog core can not make non-transactional outputs
+fully transactional. But what it can do is support that the output tells the core which
+messages have been committed by the output and which not yet. The core can then take care
+of those uncommitted messages when problems occur. For example, if a plugin has received
+50 messages but not yet told the core that it committed them, and then returns an error state, the
+core assumes that all these 50 messages were <b>not</b> written to the output. The core then
+requeues all 50 messages and does the usual retry processing. Once the output plugin tells the
+core that it is ready again to accept messages, the rsyslog core will provide it with these 50
+not yet committed messages again (actually, at this point, the rsyslog core no longer knows that
+it is re-submitting the messages). If, on the contrary, the plugin had told rsyslog that 40 of these 50
+messages were committed (before it failed), then only 10 would have been requeued and resubmitted.
+<p>In order to provide an efficient implementation, there are some (mild) constraints in that
+transactional model: first of all, rsyslog itself specifies the ultimate transaction boundaries.
+That is, it tells the plugin when a transaction begins and when it must finish. The plugin
+is free to commit messages in between, but it <b>must</b> commit all work done when the core
+tells it that the transaction ends. All messages passed in between a begin and end transaction
+notification are called a batch of messages. They are passed in one by one, just as without
+transaction support. Note that batch sizes are variable within the range of 1 to a user-configured
+maximum limit. Most importantly, that means that plugins may receive batches of single messages,
+so they are required to commit each message individually. If the plugin tries to be "smarter"
+than the rsyslog engine and does not commit messages in those cases (for example), the plugin
+puts message stream integrity at risk: once rsyslog has notified the plugin of transaction end,
+it discards all messages as it considers them committed and safe. If now something goes wrong,
+the rsyslog core does not try to recover lost messages (and keep in mind that "goes wrong"
+includes such uncontrollable things as connection loss to a database server). So it is
+highly recommended to fully abide by the plugin interface details, even though you may
+think you can do it better. The second reason for that is that the core engine will
+have configuration settings that enable the user to tune the commit rate to their use-case
+specific needs. And, as a relief: why would rsyslog ever decide to use batches of one?
+There is a trivial case and that is when we have very low activity so that no queue of
+messages builds up, in which case it makes sense to commit work as it arrives.
+(As a side-note, there are some valid cases where a timeout-based commit feature makes sense.
+This is also under evaluation and, once decided, the core will offer an interface plus a way
+to preserve message stream integrity for properly-crafted plugins).
+<p>The second restriction is that if a plugin makes commits in between (which is perfectly
+legal), those commits must be in-order. So if a commit is made for message ten out of 50,
+this means that messages one to nine are also committed. It would be possible to remove
+this restriction, but we have decided to deliberately introduce it to simplify things.
+<h3>Output Plugin Transaction Interface</h3>
+<p>In order to stay compatible with existing output plugins (and because it introduces
+no complexity), the transactional plugin interface is built on the traditional
+non-transactional one. Well... actually the traditional interface was transactional
+since its introduction, in the sense that each message was processed in its own
+transaction.
+<p>So the current <code>doAction()</code> entry point can be considered to have this
+structure (from the transactional interface point of view):
+<p><pre><code>
+doAction()
+	{
+	beginTransaction()
+	ProcessMessage()
+	endTransaction()
+	}
+	</code></pre>
+<p>For the <b>transactional interface</b>, we now move these implicit <code>beginTransaction()</code>
+and <code>endTransaction()</code> calls out of the message processing body, resulting in such
+a structure:
+<p><pre><code>
+beginTransaction()
+	{
+	/* prepare for transaction */
+	}
+
+doAction()
+	{
+	ProcessMessage()
+	/* maybe do partial commits */
+	}
+
+endTransaction()
+	{
+	/* commit (rest of) batch */
+	}
+</code></pre>
+<p>And this calling structure actually is the transactional interface! It is as simple as this.
+For the new interface, the core calls a <code>beginTransaction()</code> entry point inside the
+plugin at the start of the batch. Similarly, the core calls <code>endTransaction()</code> at the
+end of the batch. The plugin must implement these entry points according to its needs.
+<p>But how does the core know when to use the old or the new calling interface? This is rather
+easy: when loading a plugin, the core queries the plugin for the <code>beginTransaction()</code>
+and <code>endTransaction()</code> entry points. If the plugin supports these, the new interface is
+used. If the plugin does not support them, the old interface is used and rsyslog implies that
+a commit is done after each message. Note that there is no special "downlevel" handling
+necessary to support this. In the case of the non-transactional interface, rsyslog considers
+each completed call to <code>doAction</code> as a partial commit up to the current message.
+So implementation inside the core is very straightforward.
+<p>Actually, <b>we recommend that the transactional entry points only be defined by those
+plugins that actually need them</b>. All others should not define them, in which case
+the default commit behaviour inside rsyslog will apply (thus removing complexity from the
+plugin).
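+<p>For plugins that do need them, the additional entry points follow the usual module-template
+pattern. The sketch below follows the pattern used by the ompgsql module in this release; the
+<code>startMyTransaction()</code> and <code>commitMyTransaction()</code> helpers are placeholders
+for the plugin's own logic:
+<p><pre><code>
+BEGINbeginTransaction
+CODESTARTbeginTransaction
+	/* e.g. open a database transaction */
+	iRet = startMyTransaction(pData);
+ENDbeginTransaction
+
+BEGINendTransaction
+CODESTARTendTransaction
+	/* commit everything passed in via doAction() since beginTransaction() */
+	iRet = commitMyTransaction(pData);
+ENDendTransaction
+
+BEGINqueryEtryPt
+CODESTARTqueryEtryPt
+CODEqueryEtryPt_STD_OMOD_QUERIES
+CODEqueryEtryPt_TXIF_OMOD_QUERIES /* announce the transactional entry points */
+ENDqueryEtryPt
+</code></pre>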
+<p>In order to support partial commits, special return codes must be defined for
+<code>doAction</code>. All those return codes mean that processing completed successfully.
+But they convey additional information about the commit status as follows:
+<p>
+<table border="0">
+<tr>
+<td valign="top"><i>RS_RET_OK</i></td>
+<td>The record and all previous ones inside the batch have been committed.
+<i>Note:</i> this definition is what makes integrating plugins without the
+transaction begin/end calls so easy - this is the traditional "success" return
+state and if every call returns it, there is no need for actually calling
+<code>endTransaction()</code>, because there is no transaction open.</td>
+</tr>
+<tr>
+<td valign="top"><i>RS_RET_DEFER_COMMIT</i></td>
+<td>The record has been processed, but is not yet committed. This is the
+expected state for transaction-aware plugins.</td>
+</tr>
+<tr>
+<td valign="top"><i>RS_RET_PREVIOUS_COMMITTED</i></td>
+<td>The <b>previous</b> record inside the batch has been committed, but the
+current one not yet. This state is introduced to support sources that fill up
+buffers and commit once a buffer is completely filled. That may occur halfway
+in the next record, so it may be important to be able to tell the
+engine that everything up to the previous record is committed.</td>
+</tr>
+</table>
+<p>Note that the typical <b>calling cycle</b> is <code>beginTransaction()</code>,
+followed by <i>n</i> times
+<code>doAction()</code> followed by <code>endTransaction()</code>. However, if either
+<code>beginTransaction()</code> or <code>doAction()</code> returns an error state
+(including RS_RET_SUSPENDED), then the transaction is considered aborted. As a result, the
+remaining calls in this cycle (e.g. <code>endTransaction()</code>) are never made and a
+new cycle (starting with <code>beginTransaction()</code>) is begun when processing resumes.
+So an output plugin must expect and handle those partial cycles gracefully.
+<p><b>The question remains: how can a plugin know if the core supports batching?</b>
+First of all, even if the engine did not know it, the plugin would return RS_RET_DEFER_COMMIT,
+which would then be treated as an error by the engine. This would effectively disable the
+output, but cause no further harm (but may be harm enough in itself).
+<p>The real solution is to enable the plugin to query the rsyslog core whether this feature is
+supported or not. At the time of the introduction of batching, no such query interface
+exists. So we introduce it with that release. What this means is: if an rsyslog core can
+not provide this query interface, it is a core that was built before batching support
+was available. So the absence of a query interface indicates that the transactional
+interface is not available. One might now be tempted to think there is no need to do
+the actual check, but it is recommended to ask the rsyslog engine explicitly if
+the transactional interface is present and will be honored. This enables us to
+create versions in the future which have, for whatever reason we do not yet know, no
+support for this interface.
+<p>The logic to do these checks is contained in the <code>INITChkCoreFeature</code> macro,
+which can be used as follows:
+<p><pre><code>
+INITChkCoreFeature(bCoreSupportsBatching, CORE_FEATURE_BATCHING);
+</code></pre>
+<p>Here, bCoreSupportsBatching is a plugin-defined integer which after execution is
+1 if batching (and thus the transactional interface) is supported and 0 otherwise.
+CORE_FEATURE_BATCHING is the feature we are interested in. Future versions of rsyslog
+may contain additional feature test macros (you can see all of them in
+./runtime/rsyslog.h).
+<p>Note that the ompgsql output plugin supports transactional mode in a hybrid way and
+thus can be considered good example code.
+
+<h2>Open Issues</h2>
+<ul>
+<li>Processing error handling
+<li>reliable re-queue during error handling and queue termination
+</ul>
+
+
+
<h3>Licensing</h3>
<p>From the rsyslog point of view, plugins constitute separate
projects. As such, we think plugins are not required to be compatible with GPLv3.
However, this is diff --git a/doc/manual.html b/doc/manual.html index e6935d50..90af6940 100644 --- a/doc/manual.html +++ b/doc/manual.html @@ -19,7 +19,7 @@ rsyslog support</a> available directly from the source!</p> <p><b>Please visit the <a href="http://www.rsyslog.com/sponsors">rsyslog sponsor's page</a> to honor the project sponsors or become one yourself!</b> We are very grateful for any help towards the project goals.</p> -<p><b>This documentation is for version 4.5.6 (v4-beta branch) of rsyslog.</b> +<p><b>This documentation is for version 5.1.6 (devel branch) of rsyslog.</b> Visit the <i><a href="http://www.rsyslog.com/doc-status.html">rsyslog status page</a></i></b> to obtain current version information and project status. </p><p><b>If you like rsyslog, you might @@ -28,9 +28,11 @@ time - even a single mouse click helps. Learn <a href="how2help.html">how to hel Due to popular demand, there is now a <a href="rsyslog_ng_comparison.html">side-by-side comparison between rsyslog and syslog-ng</a>.</p> <p>If you are upgrading from rsyslog v2 or stock sysklogd, -<a href="v3compatibility.html">be sure to read the rsyslog v3 compatibility document</a>, +<a href="v3compatibility.html">be sure to read the rsyslog v3 compatibility notes</a>, and if you are upgrading from v3, read the -<a href="v4compatibility.html">rsyslog v4 compatibility document</a>. +<a href="v4compatibility.html">rsyslog v4 compatibility notes</a> and +if you upgrade from v4, read the +<a href="v5compatibility.html">rsyslog v5 compatibility notes</a>. <p>Rsyslog will work even if you do not read the doc, but doing so will definitely improve your experience.</p> <p><b>Follow the links below for the</b></p> @@ -42,8 +44,7 @@ if you do not read the doc, but doing so will definitely improve your experience <li>a commented <a href="sample.conf.html">sample rsyslog.conf</a> </li> <li><a href="bugs.html">rsyslog bug list</a></li> <li><a href="rsyslog_packages.html"> rsyslog packages</a></li> -<li><a href="generic_design.html">backgrounder on -generic syslog application design</a> +<li><a href="generic_design.html">backgrounder on generic syslog application design</a></li> <li><a href="modules.html">description of rsyslog modules</a></li> </ul> <p><b>We have some in-depth papers on</b></p> diff --git a/doc/omudpspoof.html b/doc/omudpspoof.html new file mode 100644 index 00000000..e5f963c7 --- /dev/null +++ b/doc/omudpspoof.html @@ -0,0 +1,77 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> +<html><head> +<title>UDP spoofing output module (omudpspoof)</title> +</head> +<body> +<a href="rsyslog_conf_modules.html">rsyslog module reference</a> + +<h1>UDP spoofing output module (omudpspoof)</h1> +<p><b>Module Name: omstdout</b></p> +<p><b>Author: </b>David Lang <david@lang.hm> and Rainer Gerhards +<rgerhards@adiscon.com></p> +<p><b>Available Since</b>: 5.1.3</p> +<p><b>Description</b>:</p> +<p>This module is similar to the regular UDP forwarder, but permits to +spoof the sender address. Also, it enables to circle through a number of +source ports. +<p><b>Configuration Directives</b>:</p> +<ul> +<li><b>$ActionUDPSpoofSourceNameTemplate</b> <templatename><br> +This MUST be specified. It is the name of the template that contains a +numerical IP address that is to be used as the source system IP address. +While it may often be a constant value, it can be generated as usual via the +property replacer, as long as it is a valid IPv4 address. 
+<li><b>$ActionUDPSpoofTargetHost</b> <hostname><br>
+Host that the messages shall be sent to.
+<li><b>$ActionUDPSpoofTargetPort</b> <port><br>
+Remote port that the messages shall be sent to.
+<li><b>$ActionUDPSpoofDefaultTemplate</b> <templatename><br>
+This setting instructs omudpspoof to use a template different from the
+default template for all of its actions that do not have a template specified
+explicitly.
+<li><b>$ActionUDPSpoofSourcePortStart</b> <number><br>
+Specifies the start value for cycling through the source ports. Must be less than or
+equal to the end value. Default is 32000.
+<li><b>$ActionUDPSpoofSourcePortEnd</b> <number><br>
+Specifies the ending value for cycling through the source ports. Must be greater than or
+equal to the start value. Default is 42000.
+</ul>
+<b>Caveats/Known Bugs:</b>
+<ul>
+<li><b>IPv6</b> is currently not supported. If you need this capability, please let us
+know via the rsyslog mailing list.
+</ul>
+<p><b>Sample:</b></p>
+<p>The following sample forwards all syslog messages in unmodified form to the
+remote server server.example.com. The sender address 192.0.2.1 with the fixed
+source port 514 is used.
+</p>
+<textarea rows="8" cols="80">$ModLoad omudpspoof
+$template spoofaddr,"192.0.2.1"
+$template spooftemplate,"%rawmsg%"
+$ActionUDPSpoofSourceNameTemplate spoofaddr
+$ActionUDPSpoofTargetHost server.example.com
+$ActionUDPSpoofSourcePortStart 514
+$ActionUDPSpoofSourcePortEnd 514
+*.* :omudpspoof:;spooftemplate
+</textarea>
+<p>The following sample is similar to the first, but uses as many defaults as possible.
+In that sample, a source port in the range 32000..42000 is used. The message is formatted
+according to rsyslog's canned default forwarding format. Note that if any parameters
+have been changed, the previously set defaults will be used!
+</p>
+<textarea rows="5" cols="80">$ModLoad omudpspoof
+$template spoofaddr,"192.0.2.1"
+$ActionUDPSpoofSourceNameTemplate spoofaddr
+$ActionUDPSpoofTargetHost server.example.com
+*.* :omudpspoof:
+</textarea>
+<p>[<a href="rsyslog_conf.html">rsyslog.conf overview</a>]
+[<a href="manual.html">manual index</a>] [<a href="http://www.rsyslog.com/">rsyslog site</a>]</p>
+<p><font size="2">This documentation is part of the
+<a href="http://www.rsyslog.com/">rsyslog</a>
+project.<br>
+Copyright © 2009 by <a href="http://www.gerhards.net/rainer">Rainer Gerhards</a> and
+<a href="http://www.adiscon.com/">Adiscon</a>.
+Released under the GNU GPL version 3 or higher.</font></p>
+</body></html>
diff --git a/doc/queue_msg_state.dot b/doc/queue_msg_state.dot
new file mode 100644
index 00000000..bfef2657
--- /dev/null
+++ b/doc/queue_msg_state.dot
@@ -0,0 +1,25 @@
+// This file is part of rsyslog.
+//
+// rsyslog message state in queue processing
+//
+// see http://www.graphviz.org for how to obtain the graphviz processor
+// which is used to build the actual graph.
+// +// generate the graph with +// $ dot file.dot -Tpng >file.png + +digraph msgState { + rankdir=LR + + prod [label="producer" style="dotted" shape="box"] + que [label="queued"] + deq [label="dequeued"] + del [label="deleted"] + + prod -> que [label="qEnq()" style="dotted"] + que -> deq [label="qDeq()"] + deq -> del [label="qDel()"] + deq -> que [label="fatal failure\n& restart"] + + //{rank=same; del apf pdn } +} diff --git a/doc/queue_msg_state.jpeg b/doc/queue_msg_state.jpeg Binary files differnew file mode 100644 index 00000000..a215f000 --- /dev/null +++ b/doc/queue_msg_state.jpeg diff --git a/doc/queues.html b/doc/queues.html index 45ce1bd1..75b70fbf 100644 --- a/doc/queues.html +++ b/doc/queues.html @@ -336,6 +336,33 @@ in this regard - it was just not requested so far. So if you need more fine-grained control, let us know and we'll probably implement it. There are two configuration directives, both should be used together or results are unpredictable:" <i>$<object>QueueDequeueTimeBegin <hour></i>" and "<i>$<object>QueueDequeueTimeEnd <hour></i>". The hour parameter must be specified in 24-hour format (so 10pm is 22). A use case for this parameter can be found in the <a href="http://wiki.rsyslog.com/index.php/OffPeakHours">rsyslog wiki</a>. </p> +<h2>Performance</h2> +<p>The locking involved with maintaining the queue has a potentially large +performance impact. How large this is, and if it exists at all, depends much on +the configuration and actual use case. However, the queue is able to work on +so-called "batches" when dequeueing data elements. With batches, +multiple data elements are dequeued at once (with a single locking call). +The queue dequeues all available elements up to a configured upper +limit (<i><object>DequeueBatchSize <number></i>). It is important +to note that the actual upper limit is dictated by availability. The queue engine +will never wait for a batch to fill. So even if a high upper limit is configured, +batches may consist of fewer elements, even just one, if there are no more elements +waiting in the queue. +<p>Batching +can improve performance considerably. Note, however, that it affects the +order in which messages are passed to the queue worker threads, as each worker +now receive as batch of messages. Also, the larger the batch size and the higher +the maximum number of permitted worker threads, the more main memory is needed. +For a busy server, large batch sizes (around 1,000 or even more elements) may be useful. +Please note that with batching, the main memory must hold BatchSize * NumOfWorkers +objects in memory (worst-case scenario), even if running in disk-only mode. So if you +use the default 5 workers at the main message queue and set the batch size to 1,000, you need +to be prepared that the main message queue holds up to 5,000 messages in main memory +<b>in addition</b> to the configured queue size limits! +<p>The queue object's default maximum batch size +is eight, but there exists different defaults for the actual parts of +rsyslog processing that utilize queues. So you need to check these object's +defaults. <h2>Terminating Queues</h2> <p>Terminating a process sounds easy, but can be complex. 
Terminating a running queue is in fact the most complex operation a queue diff --git a/doc/rsyslog_conf_global.html b/doc/rsyslog_conf_global.html index 50d83a50..45eeabe6 100644 --- a/doc/rsyslog_conf_global.html +++ b/doc/rsyslog_conf_global.html @@ -58,6 +58,7 @@ default template for UDP and plain TCP forwarding action</li> <li>$ActionGSSForwardDefaultTemplate [templateName] - sets a new default template for GSS-API forwarding action</li> <li>$ActionQueueCheckpointInterval <number></li> +<li>$ActionQueueDequeueBatchSize <number> [default 16]</li> <li>$ActionQueueDequeueSlowdown <number> [number is timeout in <i> micro</i>seconds (1000000us is 1sec!), default 0 (no delay). Simple rate-limiting!]</li> @@ -109,6 +110,14 @@ that it should be not be much more often than once per second).</li> <li><b>$ActionSendUDPRebindInterval</b> nbr</a>- [available since 4.3.2] - instructs the UDP send action to rebind the send socket every nbr of messages sent. Zero, the default, means that no rebind is done. This directive is useful for use with load-balancers.</li> +<li><b>$ActionWriteAllMarkMessages</b> [on/<b>off</b>]- [available since 5.1.5] - normally, mark messages +are written to actions only if the action was not recently executed (by default, recently means within the +past 20 minutes). If this setting is switched to "on", mark messages are always sent to actions, +no matter how recently they have been executed. In this mode, mark messages can be used as a kind of +heartbeat. Note that this option auto-resets to "off", so if you intend to use it with multiple +actions, it must be specified in front off <b>all</b> selector lines that should provide this +functionality. +</li> <li><a href="rsconf1_allowedsender.html">$AllowedSender</a></li> <li><a href="rsconf1_controlcharacterescapeprefix.html">$ControlCharacterEscapePrefix</a></li> <li><a href="rsconf1_debugprintcfsyslinehandlerlist.html">$DebugPrintCFSyslineHandlerList</a></li> @@ -149,6 +158,7 @@ Usually that should not be a big issue, as the restart-type HUP can easily be re something along the lines of "/etc/init.d/rsyslog restart". </li> <li><a href="rsconf1_includeconfig.html">$IncludeConfig</a></li><li>MainMsgQueueCheckpointInterval <number></li> +<li>$MainMsgQueueDequeueBatchSize <number> [default 32]</li> <li>$MainMsgQueueDequeueSlowdown <number> [number is timeout in <i> micro</i>seconds (1000000us is 1sec!), default 0 (no delay). Simple rate-limiting!]</li> diff --git a/doc/rsyslog_conf_modules.html b/doc/rsyslog_conf_modules.html index df9abeea..f9bdad4a 100644 --- a/doc/rsyslog_conf_modules.html +++ b/doc/rsyslog_conf_modules.html @@ -20,6 +20,7 @@ SQLLite, Ingres, Oracle, mSQL)</li> <li><a href="ommail.html">ommail</a> - permits rsyslog to alert folks by mail if something important happens</li> <li><a href="omoracle.html">omoracle</a> - output module for Oracle (native OCI interface)</li> +<li><a href="omudpspoof.html">omudpspoof</a> - output module sending UDP syslog messages with a spoofed address</li> <li><a href="imfile.html">imfile</a> - input module for text files</li> <li><a href="imrelp.html">imrelp</a> - RELP diff --git a/doc/rsyslog_ng_comparison.html b/doc/rsyslog_ng_comparison.html index 8e121a8d..7d12a4a7 100644 --- a/doc/rsyslog_ng_comparison.html +++ b/doc/rsyslog_ng_comparison.html @@ -5,6 +5,10 @@ <h1>rsyslog vs. 
syslog-ng</h1> <p><small><i>Written by <a href="http://www.gerhards.net/rainer">Rainer Gerhards</a> (2008-05-06)</i></small></p> +<p><i>Warning</i>: this comparison is a little outdated, take it with a grain +of salt and be sure to check the links at the bottom (both syslog-ng as well as +rsyslog features are missing, but our priority is on creating great software not +continously updating this comparison ;)). <p>We have often been asked about a comparison sheet between rsyslog and syslog-ng. Unfortunately, I do not know much about syslog-ng, I did not even use it once. Also, there seems to be no @@ -81,9 +85,10 @@ optional input</td> </tr> <tr> <td valign="top">Windows Event Log</td> -<td valign="top">via <a href="http://www.eventreporter.com">EventReporter</a> +<td valign="top">via a Windows event logging software such as +<a href="http://www.eventreporter.com">EventReporter</a> or <a href="http://www.mwagent.com">MonitorWare Agent</a> -(both commercial software)</td> +(both commercial software, both fund rsyslog development)</td> <td valign="top">via separate Windows agent, paid edition only</td> </tr> diff --git a/doc/rsyslog_queue_pointers.jpeg b/doc/rsyslog_queue_pointers.jpeg Binary files differnew file mode 100644 index 00000000..809dd446 --- /dev/null +++ b/doc/rsyslog_queue_pointers.jpeg diff --git a/doc/rsyslog_queue_pointers2.jpeg b/doc/rsyslog_queue_pointers2.jpeg Binary files differnew file mode 100644 index 00000000..2ad60113 --- /dev/null +++ b/doc/rsyslog_queue_pointers2.jpeg diff --git a/doc/src/rsyslog_queue_pointers.dia b/doc/src/rsyslog_queue_pointers.dia Binary files differnew file mode 100644 index 00000000..2ad4cacb --- /dev/null +++ b/doc/src/rsyslog_queue_pointers.dia diff --git a/doc/src/rsyslog_queue_pointers2.dia b/doc/src/rsyslog_queue_pointers2.dia Binary files differnew file mode 100644 index 00000000..6a35c664 --- /dev/null +++ b/doc/src/rsyslog_queue_pointers2.dia diff --git a/doc/status.html b/doc/status.html index 4e8f1a5f..02cc0d70 100644 --- a/doc/status.html +++ b/doc/status.html @@ -2,23 +2,34 @@ <html><head><title>rsyslog status page</title></head> <body> <h2>rsyslog status page</h2> -<p>This page reflects the status as of 2009-05-25.</p> +<p>This page reflects the status as of 2009-08-21.</p> <h2>Current Releases</h2> -<p><b>development:</b> 4.3.1 [2009-05-25] - -<a href="http://www.rsyslog.com/Article372.phtml">change log</a> - -<a href="http://www.rsyslog.com/Downloads-req-viewdownloaddetails-lid-159.phtml">download</a> +<p><b>v5 development:</b> 5.1.4 [2009-08-20] - +<a href="http://www.rsyslog.com/Article392.phtml">change log</a> - +<a href="http://www.rsyslog.com/Downloads-req-viewdownloaddetails-lid-170.phtml">download</a> +<br> +<!-- not at the moment! 
+<b>v4 development:</b> 4.5.1 [2009-07-15] - +<a href="http://www.rsyslog.com/Article388.phtml">change log</a> - +<a href="http://www.rsyslog.com/Downloads-req-viewdownloaddetails-lid-167.phtml">download</a></p> +--> -<br><b>beta:</b> 3.21.11 [2009-04-03] - -<a href="http://www.rsyslog.com/Article358.phtml">change log</a> - -<a href="http://www.rsyslog.com/Downloads-req-viewdownloaddetails-lid-152.phtml">download</a></p> +<br><b>v4-beta:</b> 4.5.2 [2009-08-21] - +<a href="http://www.rsyslog.com/Article395.phtml">change log</a> - +<a href="http://www.rsyslog.com/Downloads-req-viewdownloaddetails-lid-172.phtml">download</a></p> -<p><b>v3 stable:</b> 3.22.0 [2009-04-21] - <a href="http://www.rsyslog.com/Article368.phtml">change log</a> - -<a href="http://www.rsyslog.com/Downloads-req-viewdownloaddetails-lid-157.phtml">download</a> +<p><b>v4 stable:</b> 4.4.0 [2009-08-21] - +<a href="http://www.rsyslog.com/Article394.phtml">change log</a> - +<a href="http://www.rsyslog.com/Downloads-req-viewdownloaddetails-lid-171.phtml">download</a> -<br><b>v2 stable:</b> 2.0.7 [2009-04-14] - <a href="http://www.rsyslog.com/Article362.phtml">change log</a> - +<br><b>v3 stable:</b> 3.22.1 [2009-07-02] - +<a href="http://www.rsyslog.com/Article381.phtml">change log</a> - +<a href="http://www.rsyslog.com/Downloads-req-viewdownloaddetails-lid-163.phtml">download</a> + +<br>v2 stable: 2.0.7 [2009-04-14] - <a href="http://www.rsyslog.com/Article362.phtml">change log</a> - <a href="http://www.rsyslog.com/Downloads-req-viewdownloaddetails-lid-154.phtml">download</a> -<br>v0 and v1 are deprecated and no longer supported. If you absolutely do not like to +<br>v0 to v2 are deprecated and no longer supported. If you absolutely do not like to upgrade, you may consider purchasing a <a href="professional_support.html">commercial rsyslog support package</a>. Just let us point out that it is really not a good idea to still run a v0 version. diff --git a/doc/v5compatibility.html b/doc/v5compatibility.html new file mode 100644 index 00000000..6d60062f --- /dev/null +++ b/doc/v5compatibility.html @@ -0,0 +1,30 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> +<html><head><title>Compatibility notes for rsyslog v5</title> +</head> +<body> +<h1>Compatibility Notes for rsyslog v5</h1> +<p><small><i>Written by <a href="http://www.gerhards.net/rainer">Rainer Gerhards</a> +(2009-07-15)</i></small></p> +<p>The changes introduced in rsyslog v5 are numerous, but not very intrusive. +This document describes things to keep in mind when moving from v4 to v5. It +does not list enhancements nor does it talk about compatibility concerns introduced +by earlier versions (for this, see their respective compatibility documents). +<h2>HUP processing</h2> +<p>The $HUPisRestart directive is supported by some early v5 versions, but has been removed +in 5.1.3 and above. That means that restart-type HUP processing is no longer +available. This processing was redundant and had a lot a drawbacks. +For details, please see the +<a href="v4compatibility.html">rsyslog v4 compatibility notes</a> which elaborate +on the reasons and the (few) things you may need to change. +<h2>Queue Worker Thread Shutdown</h2> +<p>Previous rsyslog versions had the capability to "run" on zero queue worker +if no work was required. This was done to save a very limited number of resources. However, +it came at the price of great complexity. In v5, we have decided to let a minium of one +worker run all the time. 
The additional resource consumption is probably not noticable at +all, however, this enabled us to do some important code cleanups, resulting in faster +and more reliable code (complex code is hard to maintain and error-prone). From the +regular user's point of view, this change should be barely noticable. I am including the +note for expert users, who will notice it in rsyslog debug output and other analysis tools. +So it is no error if each queue in non-direct mode now always runs at least one worker +thread. +</body></html> diff --git a/plugins/imdiag/imdiag.c b/plugins/imdiag/imdiag.c index bf972191..9602f50d 100644 --- a/plugins/imdiag/imdiag.c +++ b/plugins/imdiag/imdiag.c @@ -246,7 +246,7 @@ injectMsg(uchar *pszCmd, tcps_sess_t *pSess) doInjectMsg(i + iFrom); } - CHKiRet(sendResponse(pSess, "messages injected\n")); + CHKiRet(sendResponse(pSess, "%d messages injected\n", nMsgs)); finalize_it: RETiRet; @@ -259,10 +259,13 @@ static rsRetVal waitMainQEmpty(tcps_sess_t *pSess) { int iMsgQueueSize; + int iPrint = 0; DEFiRet; CHKiRet(diagGetMainMsgQSize(&iMsgQueueSize)); while(iMsgQueueSize > 0) { + if(iPrint++ % 500 == 0) + dbgprintf("imdiag sleeping, wait mainq drain, curr size %d\n", iMsgQueueSize); srSleep(0,2); /* wait a little bit */ CHKiRet(diagGetMainMsgQSize(&iMsgQueueSize)); } @@ -297,6 +300,7 @@ OnMsgReceived(tcps_sess_t *pSess, uchar *pRcv, int iLenMsg) getFirstWord(&pszMsg, cmdBuf, sizeof(cmdBuf)/sizeof(uchar), TO_LOWERCASE); + dbgprintf("imdiag received command '%s'\n", cmdBuf); if(!ustrcmp(cmdBuf, UCHAR_CONSTANT("getmainmsgqueuesize"))) { CHKiRet(diagGetMainMsgQSize(&iMsgQueueSize)); CHKiRet(sendResponse(pSess, "%d\n", iMsgQueueSize)); @@ -443,10 +447,17 @@ resetConfigVariables(uchar __attribute__((unused)) *pp, void __attribute__((unus } +BEGINisCompatibleWithFeature +CODESTARTisCompatibleWithFeature + if(eFeat == sFEATURENonCancelInputTermination) + iRet = RS_RET_OK; +ENDisCompatibleWithFeature + BEGINqueryEtryPt CODESTARTqueryEtryPt CODEqueryEtryPt_STD_IMOD_QUERIES +CODEqueryEtryPt_IsCompatibleWithFeature_IF_OMOD_QUERIES ENDqueryEtryPt diff --git a/plugins/imgssapi/imgssapi.c b/plugins/imgssapi/imgssapi.c index d8791880..f9c1f86b 100644 --- a/plugins/imgssapi/imgssapi.c +++ b/plugins/imgssapi/imgssapi.c @@ -674,9 +674,17 @@ CODESTARTafterRun ENDafterRun +BEGINisCompatibleWithFeature +CODESTARTisCompatibleWithFeature + if(eFeat == sFEATURENonCancelInputTermination) + iRet = RS_RET_OK; +ENDisCompatibleWithFeature + + BEGINqueryEtryPt CODESTARTqueryEtryPt CODEqueryEtryPt_STD_IMOD_QUERIES +CODEqueryEtryPt_IsCompatibleWithFeature_IF_OMOD_QUERIES ENDqueryEtryPt diff --git a/plugins/imtcp/imtcp.c b/plugins/imtcp/imtcp.c index d122e976..c56593f2 100644 --- a/plugins/imtcp/imtcp.c +++ b/plugins/imtcp/imtcp.c @@ -254,6 +254,13 @@ CODESTARTafterRun ENDafterRun +BEGINisCompatibleWithFeature +CODESTARTisCompatibleWithFeature + if(eFeat == sFEATURENonCancelInputTermination) + iRet = RS_RET_OK; +ENDisCompatibleWithFeature + + BEGINmodExit CODESTARTmodExit if(pOurTcpsrv != NULL) @@ -293,6 +300,7 @@ resetConfigVariables(uchar __attribute__((unused)) *pp, void __attribute__((unus BEGINqueryEtryPt CODESTARTqueryEtryPt CODEqueryEtryPt_STD_IMOD_QUERIES +CODEqueryEtryPt_IsCompatibleWithFeature_IF_OMOD_QUERIES ENDqueryEtryPt diff --git a/plugins/imudp/imudp.c b/plugins/imudp/imudp.c index 0970259d..a393cf96 100644 --- a/plugins/imudp/imudp.c +++ b/plugins/imudp/imudp.c @@ -256,6 +256,7 @@ processSocket(int fd, struct sockaddr_storage *frominetPrev, int *pbIsPermitted, pMsg->bParseHOSTNAME = 1; 
MsgSetRcvFromStr(pMsg, fromHost, ustrlen(fromHost), &propFromHost); CHKiRet(MsgSetRcvFromIPStr(pMsg, fromHostIP, ustrlen(fromHostIP), &propFromHostIP)); +dbgprintf("XXX: submitting msg to queue\n"); CHKiRet(submitMsg(pMsg)); } } @@ -332,6 +333,8 @@ CODESTARTrunInput /* wait for io to become ready */ nfds = select(maxfds+1, (fd_set *) &readfds, NULL, NULL, NULL); + if(glbl.GetGlobalInputTermState() == 1) + break; /* terminate input! */ for(i = 0; nfds && i < *udpLstnSocks; i++) { if(FD_ISSET(udpLstnSocks[i+1], &readfds)) { @@ -398,9 +401,17 @@ CODESTARTmodExit ENDmodExit +BEGINisCompatibleWithFeature +CODESTARTisCompatibleWithFeature + if(eFeat == sFEATURENonCancelInputTermination) + iRet = RS_RET_OK; +ENDisCompatibleWithFeature + + BEGINqueryEtryPt CODESTARTqueryEtryPt CODEqueryEtryPt_STD_IMOD_QUERIES +CODEqueryEtryPt_IsCompatibleWithFeature_IF_OMOD_QUERIES ENDqueryEtryPt static rsRetVal resetConfigVariables(uchar __attribute__((unused)) *pp, void __attribute__((unused)) *pVal) diff --git a/plugins/imuxsock/imuxsock.c b/plugins/imuxsock/imuxsock.c index 424d0904..c099be56 100644 --- a/plugins/imuxsock/imuxsock.c +++ b/plugins/imuxsock/imuxsock.c @@ -282,6 +282,8 @@ CODESTARTrunInput /* wait for io to become ready */ nfds = select(maxfds+1, (fd_set *) &readfds, NULL, NULL, NULL); + if(glbl.GetGlobalInputTermState() == 1) + break; /* terminate input! */ for (i = 0; i < nfunix && nfds > 0; i++) { if ((fd = funix[i]) != -1 && FD_ISSET(fd, &readfds)) { @@ -333,11 +335,8 @@ CODESTARTafterRun if (funixn[i] && funix[i] != -1) unlink((char*) funixn[i]); /* free no longer needed string */ - if(pLogSockName != NULL) - free(pLogSockName); - if(pLogHostName != NULL) { - free(pLogHostName); - } + free(pLogSockName); + free(pLogHostName); discardFunixn(); nfunix = 1; @@ -355,9 +354,17 @@ CODESTARTmodExit ENDmodExit +BEGINisCompatibleWithFeature +CODESTARTisCompatibleWithFeature + if(eFeat == sFEATURENonCancelInputTermination) + iRet = RS_RET_OK; +ENDisCompatibleWithFeature + + BEGINqueryEtryPt CODESTARTqueryEtryPt CODEqueryEtryPt_STD_IMOD_QUERIES +CODEqueryEtryPt_IsCompatibleWithFeature_IF_OMOD_QUERIES ENDqueryEtryPt static rsRetVal resetConfigVariables(uchar __attribute__((unused)) *pp, void __attribute__((unused)) *pVal) diff --git a/plugins/ompgsql/ompgsql.c b/plugins/ompgsql/ompgsql.c index eb774835..cb6b6a4d 100644 --- a/plugins/ompgsql/ompgsql.c +++ b/plugins/ompgsql/ompgsql.c @@ -170,6 +170,9 @@ tryExec(uchar *pszCmd, instanceData *pData) int bHadError = 0; /* try insert */ +BEGINfunc +RUNLOG_VAR("%p", pData->f_hpgsql); +RUNLOG_VAR("%s", pszCmd); pgRet = PQexec(pData->f_hpgsql, (char*)pszCmd); execState = PQresultStatus(pgRet); if(execState != PGRES_COMMAND_OK && execState != PGRES_TUPLES_OK) { @@ -178,6 +181,7 @@ tryExec(uchar *pszCmd, instanceData *pData) } PQclear(pgRet); +ENDfunc return(bHadError); } @@ -230,6 +234,14 @@ CODESTARTtryResume } ENDtryResume + +BEGINbeginTransaction +CODESTARTbeginTransaction +dbgprintf("ompgsql: beginTransaction\n"); + iRet = writePgSQL((uchar*) "begin", pData); /* TODO: make user-configurable */ +ENDbeginTransaction + + BEGINdoAction CODESTARTdoAction dbgprintf("\n"); @@ -237,6 +249,13 @@ CODESTARTdoAction ENDdoAction +BEGINendTransaction +CODESTARTendTransaction + iRet = writePgSQL((uchar*) "commit;", pData); /* TODO: make user-configurable */ +dbgprintf("ompgsql: endTransaction\n"); +ENDendTransaction + + BEGINparseSelectorAct int iPgSQLPropErr = 0; CODESTARTparseSelectorAct @@ -314,6 +333,7 @@ ENDmodExit BEGINqueryEtryPt CODESTARTqueryEtryPt 
CODEqueryEtryPt_STD_OMOD_QUERIES +CODEqueryEtryPt_TXIF_OMOD_QUERIES /* we support the transactional interface! */ ENDqueryEtryPt @@ -322,6 +342,8 @@ CODESTARTmodInit *ipIFVersProvided = CURR_MOD_IF_VERSION; /* we only support the current interface specification */ CODEmodInit_QueryRegCFSLineHdlr CHKiRet(objUse(errmsg, CORE_COMPONENT)); + INITChkCoreFeature(bCoreSupportsBatching, CORE_FEATURE_BATCHING); + DBGPRINTF("ompgsql: %susing transactional output interface.\n", bCoreSupportsBatching ? "" : "not "); ENDmodInit /* vi:set ai: */ diff --git a/plugins/omtesting/omtesting.c b/plugins/omtesting/omtesting.c index 411bcf88..8f6cdbe5 100644 --- a/plugins/omtesting/omtesting.c +++ b/plugins/omtesting/omtesting.c @@ -22,7 +22,7 @@ * NOTE: read comments in module-template.h to understand how this file * works! * - * Copyright 2007 Rainer Gerhards and Adiscon GmbH. + * Copyright 2007, 2009 Rainer Gerhards and Adiscon GmbH. * * This file is part of rsyslog. * @@ -46,12 +46,14 @@ #include <stdio.h> #include <stdarg.h> #include <stdlib.h> +#include <time.h> #include <string.h> #include <ctype.h> #include <assert.h> #include "dirty.h" #include "syslogd-types.h" #include "module-template.h" +#include "cfsysline.h" MODULE_TYPE_OUTPUT @@ -59,9 +61,18 @@ MODULE_TYPE_OUTPUT */ DEF_OMOD_STATIC_DATA +static int bEchoStdout = 0; /* echo non-failed messages to stdout */ + typedef struct _instanceData { + enum { MD_SLEEP, MD_FAIL, MD_RANDFAIL, MD_ALWAYS_SUSPEND } + mode; + int bEchoStdout; int iWaitSeconds; int iWaitUSeconds; /* milli-seconds (one million of a second, just to make sure...) */ + int iCurrCallNbr; + int iFailFrequency; + int iResumeAfter; + int iCurrRetries; } instanceData; BEGINcreateInstance @@ -85,19 +96,106 @@ CODESTARTisCompatibleWithFeature ENDisCompatibleWithFeature -BEGINtryResume -CODESTARTtryResume -ENDtryResume +/* implement "fail" command in retry processing */ +static rsRetVal doFailOnResume(instanceData *pData) +{ + DEFiRet; -BEGINdoAction -CODESTARTdoAction + dbgprintf("fail retry curr %d, max %d\n", pData->iCurrRetries, pData->iResumeAfter); + if(++pData->iCurrRetries == pData->iResumeAfter) { + iRet = RS_RET_OK; + } else { + iRet = RS_RET_SUSPENDED; + } + + RETiRet; +} + + +/* implement "fail" command */ +static rsRetVal doFail(instanceData *pData) +{ + DEFiRet; + + dbgprintf("fail curr %d, frquency %d\n", pData->iCurrCallNbr, pData->iFailFrequency); + if(pData->iCurrCallNbr++ % pData->iFailFrequency == 0) { + pData->iCurrRetries = 0; + iRet = RS_RET_SUSPENDED; + } + + RETiRet; +} + + +/* implement "sleep" command */ +static rsRetVal doSleep(instanceData *pData) +{ + DEFiRet; struct timeval tvSelectTimeout; dbgprintf("sleep(%d, %d)\n", pData->iWaitSeconds, pData->iWaitUSeconds); tvSelectTimeout.tv_sec = pData->iWaitSeconds; tvSelectTimeout.tv_usec = pData->iWaitUSeconds; /* milli seconds */ select(0, NULL, NULL, NULL, &tvSelectTimeout); - //dbgprintf(":omtesting: end doAction(), iRet %d\n", iRet); + RETiRet; +} + + +/* implement "randomfail" command */ +static rsRetVal doRandFail(void) +{ + DEFiRet; + if((rand() >> 4) < (RAND_MAX >> 5)) { /* rougly same probability */ + iRet = RS_RET_OK; + dbgprintf("omtesting randfail: succeeded this time\n"); + } else { + iRet = RS_RET_SUSPENDED; + dbgprintf("omtesting randfail: failed this time\n"); + } + RETiRet; +} + + +BEGINtryResume +CODESTARTtryResume + dbgprintf("omtesting tryResume() called\n"); + switch(pData->mode) { + case MD_SLEEP: + break; + case MD_FAIL: + iRet = doFailOnResume(pData); + break; + case MD_RANDFAIL: + iRet = 
doRandFail(); + break; + case MD_ALWAYS_SUSPEND: + iRet = RS_RET_SUSPENDED; + } + dbgprintf("omtesting tryResume() returns iRet %d\n", iRet); +ENDtryResume + + +BEGINdoAction +CODESTARTdoAction + dbgprintf("omtesting received msg '%s'\n", ppString[0]); + switch(pData->mode) { + case MD_SLEEP: + iRet = doSleep(pData); + break; + case MD_FAIL: + iRet = doFail(pData); + break; + case MD_RANDFAIL: + iRet = doRandFail(); + case MD_ALWAYS_SUSPEND: + iRet = RS_RET_SUSPENDED; + } + + if(iRet == RS_RET_OK && pData->bEchoStdout) { + fprintf(stdout, "%s", ppString[0]); + fflush(stdout); + } + dbgprintf(":omtesting: end doAction(), iRet %d\n", iRet); ENDdoAction @@ -113,7 +211,7 @@ BEGINparseSelectorAct int i; uchar szBuf[1024]; CODESTARTparseSelectorAct -CODE_STD_STRING_REQUESTparseSelectorAct(0) +CODE_STD_STRING_REQUESTparseSelectorAct(1) /* code here is quick and dirty - if you like, clean it up. But keep * in mind it is just a testing aid ;) -- rgerhards, 2007-12-31 */ @@ -135,6 +233,7 @@ CODE_STD_STRING_REQUESTparseSelectorAct(0) if(isspace(*p)) ++p; + dbgprintf("omtesting command: '%s'\n", szBuf); if(!strcmp((char*) szBuf, "sleep")) { /* parse seconds */ for(i = 0 ; *p && !isspace(*p) && ((unsigned) i < sizeof(szBuf) - 1) ; ++i) { @@ -152,12 +251,43 @@ CODE_STD_STRING_REQUESTparseSelectorAct(0) if(isspace(*p)) ++p; pData->iWaitUSeconds = atoi((char*) szBuf); - } - /* once there are other modes, here is the spot to add it! */ - else { + pData->mode = MD_SLEEP; + } else if(!strcmp((char*) szBuf, "fail")) { + /* "fail fail-freqency resume-after" + * fail-frequency specifies how often doAction() fails + * resume-after speicifes how fast tryResume() should come back with success + * all numbers being "times called" + */ + /* parse fail-frequence */ + for(i = 0 ; *p && !isspace(*p) && ((unsigned) i < sizeof(szBuf) - 1) ; ++i) { + szBuf[i] = *p++; + } + szBuf[i] = '\0'; + if(isspace(*p)) + ++p; + pData->iFailFrequency = atoi((char*) szBuf); + /* parse resume-after */ + for(i = 0 ; *p && !isspace(*p) && ((unsigned) i < sizeof(szBuf) - 1) ; ++i) { + szBuf[i] = *p++; + } + szBuf[i] = '\0'; + if(isspace(*p)) + ++p; + pData->iResumeAfter = atoi((char*) szBuf); + pData->iCurrCallNbr = 1; + pData->mode = MD_FAIL; + } else if(!strcmp((char*) szBuf, "randfail")) { + pData->mode = MD_RANDFAIL; + } else if(!strcmp((char*) szBuf, "always_suspend")) { + pData->mode = MD_ALWAYS_SUSPEND; + } else { dbgprintf("invalid mode '%s', doing 'sleep 1 0' - fix your config\n", szBuf); } + pData->bEchoStdout = bEchoStdout; + CHKiRet(cflineParseTemplateName(&p, *ppOMSR, 0, OMSR_NO_RQD_TPL_OPTS, + (uchar*)"RSYSLOG_TraditionalForwardFormat")); + CODE_STD_FINALIZERparseSelectorAct ENDparseSelectorAct @@ -177,6 +307,10 @@ BEGINmodInit() CODESTARTmodInit *ipIFVersProvided = CURR_MOD_IF_VERSION; /* we only support the current interface specification */ CODEmodInit_QueryRegCFSLineHdlr + CHKiRet(omsdRegCFSLineHdlr((uchar *)"actionomtestingechostdout", 0, eCmdHdlrBinary, NULL, + &bEchoStdout, STD_LOADABLE_MODULE_ID)); + /* we seed the random-number generator in any case... 
*/ + srand(time(NULL)); ENDmodInit /* * vi:set ai: diff --git a/plugins/omudpspoof/Makefile.am b/plugins/omudpspoof/Makefile.am new file mode 100644 index 00000000..79c495a0 --- /dev/null +++ b/plugins/omudpspoof/Makefile.am @@ -0,0 +1,8 @@ +pkglib_LTLIBRARIES = omudpspoof.la + +omudpspoof_la_SOURCES = omudpspoof.c +omudpspoof_la_CPPFLAGS = $(RSRT_CFLAGS) $(PTHREADS_CFLAGS) $(UDPSPOOF_CFLAGS) +omudpspoof_la_LDFLAGS = -module -avoid-version +omudpspoof_la_LIBADD = $(UDPSPOOF_LIBS) + +EXTRA_DIST = diff --git a/plugins/omudpspoof/omudpspoof.c b/plugins/omudpspoof/omudpspoof.c new file mode 100644 index 00000000..8eb63c73 --- /dev/null +++ b/plugins/omudpspoof/omudpspoof.c @@ -0,0 +1,500 @@ +/* omudpspoof.c + * + * This is a udp-based output module that support spoofing. + * + * This file builds on UDP spoofing code contributed by + * David Lang <david@lang.hm>. I then created a "real" rsyslog module + * out of that code and omfwd. I decided to make it a separate module because + * omfwd already mixes up too many things (TCP & UDP & a differnt modes, + * this has historic reasons), it would not be a good idea to also add + * spoofing to it. And, looking at the requirements, there is little in + * common between omfwd and this module. + * + * Note: I have briefly checked libnet source code and I somewhat have the feeling + * that under some circumstances we may get into trouble with the lib. For + * example, it registers an atexit() handler, which should not play nicely + * with our dynamically loaded modules. Anyhow, I refrain from looking deeper + * at libnet code, especially as testing does not show any real issues. If some + * occur, it may be easier to modify libnet for dynamic load environments than + * using a work-around (as a side not, libnet looks somewhat unmaintained, the CVS + * I can see on sourceforge dates has no updates done less than 7 years ago). + * On the other hand, it looks like libnet is thread safe (at least is appropriately + * compiled, which I hope the standard packages are). So I do not guard calls to + * it with my own mutex calls. + * rgerhards, 2009-07-10 + * + * Copyright 2009 David Lang (spoofing code) + * Copyright 2009 Rainer Gerhards and Adiscon GmbH. + * + * This file is part of rsyslog. + * + * Rsyslog is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Rsyslog is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Rsyslog. If not, see <http://www.gnu.org/licenses/>. + * + * A copy of the GPL can be found in the file "COPYING" in this distribution. 
+ */ +#include "config.h" +#include "rsyslog.h" +#include <stdio.h> +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <netinet/in.h> +#include <netdb.h> +#include <fnmatch.h> +#include <assert.h> +#include <errno.h> +#include <ctype.h> +#include <unistd.h> +#ifdef USE_NETZIP +#include <zlib.h> +#endif +#include "conf.h" +#include "syslogd-types.h" +#include "srUtils.h" +#include "net.h" +#include "template.h" +#include "msg.h" +#include "cfsysline.h" +#include "module-template.h" +#include "glbl.h" +#include "errmsg.h" +#include "dirty.h" +#include "unicode-helper.h" + + +#include <libnet.h> +#define _BSD_SOURCE 1 +#define __BSD_SOURCE 1 +#define __FAVOR_BSD 1 + + +MODULE_TYPE_OUTPUT + +/* internal structures + */ +DEF_OMOD_STATIC_DATA +DEFobjCurrIf(errmsg) +DEFobjCurrIf(glbl) +DEFobjCurrIf(net) + +typedef struct _instanceData { + uchar *host; + uchar *port; + int *pSockArray; /* sockets to use for UDP */ + int compressionLevel; /* 0 - no compression, else level for zlib */ + struct addrinfo *f_addr; + u_short sourcePort; + u_short sourcePortStart; /* for sorce port iteration */ + u_short sourcePortEnd; +} instanceData; + +#define DFLT_SOURCE_PORT_START 32000 +#define DFLT_SOURCE_PORT_END 42000 + +/* config data */ +static uchar *pszTplName = NULL; /* name of the default template to use */ +static uchar *pszSourceNameTemplate = NULL; /* name of the template containing the spoofing address */ +static uchar *pszTargetHost = NULL; +static uchar *pszTargetPort = NULL; +static int iCompressionLevel = 0; /* zlib compressionlevel, the usual values */ +static int iSourcePortStart = DFLT_SOURCE_PORT_START; +static int iSourcePortEnd = DFLT_SOURCE_PORT_END; + + +/* add some variables needed for libnet */ +libnet_t *libnet_handle; +char errbuf[LIBNET_ERRBUF_SIZE]; + +/* forward definitions */ +static rsRetVal doTryResume(instanceData *pData); + + +/* Close the UDP sockets. + * rgerhards, 2009-05-29 + */ +static rsRetVal +closeUDPSockets(instanceData *pData) +{ + DEFiRet; + assert(pData != NULL); + if(pData->pSockArray != NULL) { + net.closeUDPListenSockets(pData->pSockArray); + pData->pSockArray = NULL; + freeaddrinfo(pData->f_addr); + pData->f_addr = NULL; + } + RETiRet; +} + + +/* get the syslog forward port + * We may change the implementation to try to lookup the port + * if it is unspecified. So far, we use the IANA default auf 514. + * rgerhards, 2007-06-28 + */ +static inline uchar *getFwdPt(instanceData *pData) +{ + return (pData->port == NULL) ? 
UCHAR_CONSTANT("514") : pData->port; +} + + +BEGINcreateInstance +CODESTARTcreateInstance +ENDcreateInstance + + +BEGINisCompatibleWithFeature +CODESTARTisCompatibleWithFeature + if(eFeat == sFEATURERepeatedMsgReduction) + iRet = RS_RET_OK; +ENDisCompatibleWithFeature + + +BEGINfreeInstance +CODESTARTfreeInstance + /* final cleanup */ + closeUDPSockets(pData); + free(pData->port); + free(pData->host); +ENDfreeInstance + + +BEGINdbgPrintInstInfo +CODESTARTdbgPrintInstInfo + DBGPRINTF("%s", pData->host); +ENDdbgPrintInstInfo + + +/* Send a message via UDP + * rgehards, 2007-12-20 + */ +static rsRetVal UDPSend(instanceData *pData, uchar *pszSourcename, char *msg, size_t len) +{ + struct addrinfo *r; + int lsent = 0; + int bSendSuccess; + int j, build_ip; + u_char opt[20]; + struct sockaddr_in *tempaddr,source_ip; + libnet_ptag_t ip, ipo; + libnet_ptag_t udp; + DEFiRet; + + if(pData->pSockArray == NULL) { + CHKiRet(doTryResume(pData)); + } + + ip = ipo = udp = 0; + if(pData->sourcePort++ >= pData->sourcePortEnd){ + pData->sourcePort = pData->sourcePortStart; + } + + inet_pton(AF_INET, (char*)pszSourcename, &(source_ip.sin_addr)); + + bSendSuccess = FALSE; + for (r = pData->f_addr; r; r = r->ai_next) { + tempaddr = (struct sockaddr_in *)r->ai_addr; + libnet_clear_packet(libnet_handle); + udp = libnet_build_udp( + pData->sourcePort, /* source port */ + tempaddr->sin_port, /* destination port */ + LIBNET_UDP_H + len, /* packet length */ + 0, /* checksum */ + (u_char*)msg, /* payload */ + len, /* payload size */ + libnet_handle, /* libnet handle */ + udp); /* libnet id */ + if (udp == -1) { + DBGPRINTF("Can't build UDP header: %s\n", libnet_geterror(libnet_handle)); + } + + build_ip = 0; + /* this is not a legal options string */ + for (j = 0; j < 20; j++) { + opt[j] = libnet_get_prand(LIBNET_PR2); + } + ipo = libnet_build_ipv4_options(opt, 20, libnet_handle, ipo); + if (ipo == -1) { + DBGPRINTF("Can't build IP options: %s\n", libnet_geterror(libnet_handle)); + } + ip = libnet_build_ipv4( + LIBNET_IPV4_H + 20 + len + LIBNET_UDP_H, /* length */ + 0, /* TOS */ + 242, /* IP ID */ + 0, /* IP Frag */ + 64, /* TTL */ + IPPROTO_UDP, /* protocol */ + 0, /* checksum */ + source_ip.sin_addr.s_addr, + tempaddr->sin_addr.s_addr, + NULL, /* payload */ + 0, /* payload size */ + libnet_handle, /* libnet handle */ + ip); /* libnet id */ + if (ip == -1) { + DBGPRINTF("Can't build IP header: %s\n", libnet_geterror(libnet_handle)); + } + + /* Write it to the wire. 
*/ + lsent = libnet_write(libnet_handle); + if (lsent == -1) { + DBGPRINTF("Write error: %s\n", libnet_geterror(libnet_handle)); + } else { + bSendSuccess = TRUE; + break; + } + } + /* finished looping */ + if (bSendSuccess == FALSE) { + DBGPRINTF("error forwarding via udp, suspending\n"); + iRet = RS_RET_SUSPENDED; + } + +finalize_it: + RETiRet; +} + + +/* try to resume connection if it is not ready + * rgerhards, 2007-08-02 + */ +static rsRetVal doTryResume(instanceData *pData) +{ + int iErr; + struct addrinfo *res; + struct addrinfo hints; + DEFiRet; + + if(pData->pSockArray != NULL) + FINALIZE; + + /* The remote address is not yet known and needs to be obtained */ + DBGPRINTF(" %s\n", pData->host); + memset(&hints, 0, sizeof(hints)); + /* port must be numeric, because config file syntax requires this */ + hints.ai_flags = AI_NUMERICSERV; + hints.ai_family = glbl.GetDefPFFamily(); + hints.ai_socktype = SOCK_DGRAM; + if((iErr = (getaddrinfo((char*)pData->host, (char*)getFwdPt(pData), &hints, &res))) != 0) { + DBGPRINTF("could not get addrinfo for hostname '%s':'%s': %d%s\n", + pData->host, getFwdPt(pData), iErr, gai_strerror(iErr)); + ABORT_FINALIZE(RS_RET_SUSPENDED); + } + DBGPRINTF("%s found, resuming.\n", pData->host); + pData->f_addr = res; + pData->pSockArray = net.create_udp_socket((uchar*)pData->host, NULL, 0); + +finalize_it: + if(iRet != RS_RET_OK) { + if(pData->f_addr != NULL) { + freeaddrinfo(pData->f_addr); + pData->f_addr = NULL; + } + iRet = RS_RET_SUSPENDED; + } + + RETiRet; +} + + +BEGINtryResume +CODESTARTtryResume + iRet = doTryResume(pData); +ENDtryResume + +BEGINdoAction + char *psz; /* temporary buffering */ + register unsigned l; + int iMaxLine; +CODESTARTdoAction + CHKiRet(doTryResume(pData)); + + iMaxLine = glbl.GetMaxLine(); + + DBGPRINTF(" %s:%s/udpspoofs\n", pData->host, getFwdPt(pData)); + + psz = (char*) ppString[0]; + l = strlen((char*) psz); + if((int) l > iMaxLine) + l = iMaxLine; + +# ifdef USE_NETZIP + /* Check if we should compress and, if so, do it. We also + * check if the message is large enough to justify compression. + * The smaller the message, the less likely is a gain in compression. + * To save CPU cycles, we do not try to compress very small messages. + * What "very small" means needs to be configured. Currently, it is + * hard-coded but this may be changed to a config parameter. + * rgerhards, 2006-11-30 + */ + if(pData->compressionLevel && (l > MIN_SIZE_FOR_COMPRESS)) { + Bytef *out; + uLongf destLen = iMaxLine + iMaxLine/100 +12; /* recommended value from zlib doc */ + uLong srcLen = l; + int ret; + /* TODO: optimize malloc sequence? -- rgerhards, 2008-09-02 */ + CHKmalloc(out = (Bytef*) malloc(destLen)); + out[0] = 'z'; + out[1] = '\0'; + ret = compress2((Bytef*) out+1, &destLen, (Bytef*) psz, + srcLen, pData->compressionLevel); + DBGPRINTF("Compressing message, length was %d now %d, return state %d.\n", + l, (int) destLen, ret); + if(ret != Z_OK) { + /* if we fail, we complain, but only in debug mode + * Otherwise, we are silent. In any case, we ignore the + * failed compression and just sent the uncompressed + * data, which is still valid. So this is probably the + * best course of action. + * rgerhards, 2006-11-30 + */ + DBGPRINTF("Compression failed, sending uncompressed message\n"); + } else if(destLen+1 < l) { + /* only use compression if there is a gain in using it! */ + DBGPRINTF("there is gain in compression, so we do it\n"); + psz = (char*) out; + l = destLen + 1; /* take care for the "z" at message start! 
*/ + } + ++destLen; + } +# endif + + CHKiRet(UDPSend(pData, ppString[1], psz, l)); + +finalize_it: +ENDdoAction + + +BEGINparseSelectorAct +CODESTARTparseSelectorAct +CODE_STD_STRING_REQUESTparseSelectorAct(2) + /* first check if this config line is actually for us */ + if(strncmp((char*) p, ":omudpspoof:", sizeof(":omudpspoof:") - 1)) { + ABORT_FINALIZE(RS_RET_CONFLINE_UNPROCESSED); + } + + /* ok, if we reach this point, we have something for us */ + p += sizeof(":omudpspoof:") - 1; /* eat indicator sequence (-1 because of '\0'!) */ + CHKiRet(createInstance(&pData)); + + if(pszSourceNameTemplate == NULL) { + errmsg.LogError(0, NO_ERRCODE, "No $ActionOMUDPSpoofSourceNameTemplate given, can not continue with this action."); + ABORT_FINALIZE(RS_RET_NO_SRCNAME_TPL); + } + + if(pszTargetHost == NULL) { + errmsg.LogError(0, NO_ERRCODE, "No $ActionOMUDPSpoofTargetHost given, can not continue with this action."); + ABORT_FINALIZE(RS_RET_HOST_NOT_SPECIFIED); + } + + /* fill instance properties */ + CHKmalloc(pData->host = ustrdup(pszTargetHost)); + if(pszTargetPort == NULL) + pData->port = NULL; + else + CHKmalloc(pData->port = ustrdup(pszTargetPort)); + CHKiRet(OMSRsetEntry(*ppOMSR, 1, ustrdup(pszSourceNameTemplate), OMSR_NO_RQD_TPL_OPTS)); + pData->compressionLevel = iCompressionLevel; + pData->sourcePort = pData->sourcePortStart = iSourcePortStart; + pData->sourcePortEnd = iSourcePortEnd; + + /* process template */ + CHKiRet(cflineParseTemplateName(&p, *ppOMSR, 0, OMSR_NO_RQD_TPL_OPTS, + (pszTplName == NULL) ? (uchar*)"RSYSLOG_TraditionalForwardFormat" : pszTplName)); + +CODE_STD_FINALIZERparseSelectorAct +ENDparseSelectorAct + + +/* a common function to free our configuration variables - used both on exit + * and on $ResetConfig processing. -- rgerhards, 2008-05-16 + */ +static void +freeConfigVars(void) +{ + free(pszTplName); + pszTplName = NULL; + free(pszTargetHost); + pszTargetHost = NULL; + free(pszTargetPort); + pszTargetPort = NULL; +} + + +BEGINmodExit +CODESTARTmodExit + /* destroy the libnet state needed for forged UDP sources */ + libnet_destroy(libnet_handle); + /* release what we no longer need */ + objRelease(errmsg, CORE_COMPONENT); + objRelease(glbl, CORE_COMPONENT); + objRelease(net, LM_NET_FILENAME); + freeConfigVars(); +ENDmodExit + + +BEGINqueryEtryPt +CODESTARTqueryEtryPt +CODEqueryEtryPt_STD_OMOD_QUERIES +ENDqueryEtryPt + + +/* Reset config variables for this module to default values. + * rgerhards, 2008-03-28 + */ +static rsRetVal resetConfigVariables(uchar __attribute__((unused)) *pp, void __attribute__((unused)) *pVal) +{ + freeConfigVars(); + /* we now must reset all non-string values */ + iCompressionLevel = 0; + iSourcePortStart = DFLT_SOURCE_PORT_START; + iSourcePortEnd = DFLT_SOURCE_PORT_END; + return RS_RET_OK; +} + + +BEGINmodInit() +CODESTARTmodInit + *ipIFVersProvided = CURR_MOD_IF_VERSION; /* we only support the current interface specification */ +CODEmodInit_QueryRegCFSLineHdlr + CHKiRet(objUse(glbl, CORE_COMPONENT)); + CHKiRet(objUse(errmsg, CORE_COMPONENT)); + CHKiRet(objUse(net,LM_NET_FILENAME)); + + /* Initialize the libnet library. Root priviledges are required. + * this initializes a IPv4 socket to use for forging UDP packets. 
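
On the wire, the compression path above produces datagrams whose first byte is 'z', followed by a zlib stream; small or incompressible messages are sent as-is. A minimal sketch of how a receiver could tell the two apart and unpack the former (assuming zlib; buffer sizing and error handling are simplified, and this is not code from the patch):

#include <string.h>
#include <zlib.h>

/* If the first byte is 'z', the rest of the datagram is a zlib stream
 * (as produced by compress2() above); otherwise it is the plain syslog
 * message. Returns the decoded length, or -1 on error.
 */
static int decode_datagram(const unsigned char *pkt, size_t pktLen,
                           unsigned char *out, size_t outSize)
{
	uLongf destLen = outSize;

	if(pktLen > 0 && pkt[0] == 'z') {
		if(uncompress((Bytef*) out, &destLen,
		              (const Bytef*) pkt + 1, pktLen - 1) != Z_OK)
			return -1;
		return (int) destLen;
	}
	if(pktLen > outSize)
		return -1;
	memcpy(out, pkt, pktLen);
	return (int) pktLen;
}
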
+ */ + libnet_handle = libnet_init( + LIBNET_RAW4, /* injection type */ + NULL, /* network interface */ + errbuf); /* errbuf */ + + if(libnet_handle == NULL) { + errmsg.LogError(0, NO_ERRCODE, "Error initializing libnet, can not continue "); + ABORT_FINALIZE(RS_RET_ERR_LIBNET_INIT); + } + + CHKiRet(regCfSysLineHdlr((uchar *)"actionomudpspoofdefaulttemplate", 0, eCmdHdlrGetWord, NULL, &pszTplName, NULL)); + CHKiRet(regCfSysLineHdlr((uchar *)"actionomudpspoofsourcenametemplate", 0, eCmdHdlrGetWord, NULL, &pszSourceNameTemplate, NULL)); + CHKiRet(regCfSysLineHdlr((uchar *)"actionomudpspooftargethost", 0, eCmdHdlrGetWord, NULL, &pszTargetHost, NULL)); + CHKiRet(regCfSysLineHdlr((uchar *)"actionomudpspooftargetport", 0, eCmdHdlrGetWord, NULL, &pszTargetPort, NULL)); + CHKiRet(regCfSysLineHdlr((uchar *)"actionomudpspoofsourceportstart", 0, eCmdHdlrInt, NULL, &iSourcePortStart, NULL)); + CHKiRet(regCfSysLineHdlr((uchar *)"actionomudpspoofsourceportend", 0, eCmdHdlrInt, NULL, &iSourcePortEnd, NULL)); + CHKiRet(regCfSysLineHdlr((uchar *)"actionomudpcompressionlevel", 0, eCmdHdlrInt, NULL, &iCompressionLevel, NULL)); + CHKiRet(omsdRegCFSLineHdlr((uchar *)"resetconfigvariables", 1, eCmdHdlrCustomHandler, resetConfigVariables, NULL, STD_LOADABLE_MODULE_ID)); +ENDmodInit + +/* vim:set ai: + */ diff --git a/runtime/Makefile.am b/runtime/Makefile.am index 14abe722..caf7c5ca 100644 --- a/runtime/Makefile.am +++ b/runtime/Makefile.am @@ -9,6 +9,7 @@ librsyslog_la_SOURCES = \ rsyslog.h \ unicode-helper.h \ atomic.h \ + batch.h \ syslogd-types.h \ module-template.h \ obj-types.h \ diff --git a/runtime/apc.c b/runtime/apc.c index bc330e39..c2f61266 100644 --- a/runtime/apc.c +++ b/runtime/apc.c @@ -249,12 +249,11 @@ execScheduled(void) apc_list_t *pExecList; apc_list_t *pCurr; apc_list_t *pNext; - DEFVARS_mutexProtection_uncond; DEFiRet; - BEGIN_MTX_PROTECTED_OPERATIONS_UNCOND(&listMutex); + d_pthread_mutex_lock(&listMutex); iRet = unlistCurrent(&pExecList); - END_MTX_PROTECTED_OPERATIONS_UNCOND(&listMutex); + d_pthread_mutex_unlock(&listMutex); CHKiRet(iRet); if(pExecList != NULL) { @@ -290,14 +289,12 @@ ENDobjConstruct(apc) static rsRetVal apcConstructFinalize(apc_t *pThis, apc_id_t *pID) { - DEFVARS_mutexProtection_uncond; DEFiRet; ISOBJ_TYPE_assert(pThis, apc); assert(pID != NULL); - BEGIN_MTX_PROTECTED_OPERATIONS_UNCOND(&listMutex); + d_pthread_mutex_lock(&listMutex); insertApc(pThis, pID); - END_MTX_PROTECTED_OPERATIONS_UNCOND(&listMutex); -RUNLOG_STR("apcConstructFinalize post mutex unlock\n"); + d_pthread_mutex_unlock(&listMutex); RETiRet; } @@ -333,12 +330,10 @@ SetParam2(apc_t *pThis, void *param2) static rsRetVal CancelApc(apc_id_t id) { - DEFVARS_mutexProtection_uncond; - BEGINfunc - BEGIN_MTX_PROTECTED_OPERATIONS_UNCOND(&listMutex); + d_pthread_mutex_lock(&listMutex); deleteApc(id); - END_MTX_PROTECTED_OPERATIONS_UNCOND(&listMutex); + d_pthread_mutex_unlock(&listMutex); ENDfunc return RS_RET_OK; } diff --git a/runtime/atomic.h b/runtime/atomic.h index d5aaf56b..b507b769 100644 --- a/runtime/atomic.h +++ b/runtime/atomic.h @@ -41,6 +41,8 @@ * They simply came in too late. 
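
Taken together, the directives registered in modInit() above could be used roughly like this (a hypothetical rsyslog.conf fragment: the directive names and the :omudpspoof: selector come from the code above, while the $ModLoad line, the host name and the template contents are illustrative assumptions):

$ModLoad omudpspoof

# template that must resolve to the IPv4 source address to forge
$template spoofaddr,"%fromhost-ip%"

$ActionOMUDPSpoofSourceNameTemplate spoofaddr
$ActionOMUDPSpoofTargetHost central.example.net
$ActionOMUDPSpoofTargetPort 514
*.*	:omudpspoof:
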
-- rgerhards, 2008-04-02 */ #ifdef HAVE_ATOMIC_BUILTINS +# define ATOMIC_SUB(data, val) __sync_fetch_and_sub(&(data), val) +# define ATOMIC_ADD(data, val) __sync_fetch_and_add(&(data), val) # define ATOMIC_INC(data) ((void) __sync_fetch_and_add(&(data), 1)) # define ATOMIC_INC_AND_FETCH(data) __sync_fetch_and_add(&(data), 1) # define ATOMIC_DEC(data) ((void) __sync_sub_and_fetch(&(data), 1)) @@ -49,7 +51,6 @@ # define ATOMIC_STORE_1_TO_32BIT(data) __sync_lock_test_and_set(&(data), 1) # define ATOMIC_STORE_0_TO_INT(data) __sync_fetch_and_and(&(data), 0) # define ATOMIC_STORE_1_TO_INT(data) __sync_fetch_and_or(&(data), 1) -# define ATOMIC_STORE_INT_TO_INT(data, val) __sync_fetch_and_or(&(data), (val)) # define ATOMIC_CAS(data, oldVal, newVal) __sync_bool_compare_and_swap(&(data), (oldVal), (newVal)); # define ATOMIC_CAS_VAL(data, oldVal, newVal) __sync_val_compare_and_swap(&(data), (oldVal), (newVal)); #else diff --git a/runtime/batch.h b/runtime/batch.h new file mode 100644 index 00000000..031718a7 --- /dev/null +++ b/runtime/batch.h @@ -0,0 +1,72 @@ +/* Definition of the batch_t data structure. + * I am not sure yet if this will become a full-blown object. For now, this header just + * includes the object definition and is not accompanied by code. + * + * Copyright 2009 by Rainer Gerhards and Adiscon GmbH. + * + * This file is part of the rsyslog runtime library. + * + * The rsyslog runtime library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The rsyslog runtime library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with the rsyslog runtime library. If not, see <http://www.gnu.org/licenses/>. + * + * A copy of the GPL can be found in the file "COPYING" in this distribution. + * A copy of the LGPL can be found in the file "COPYING.LESSER" in this distribution. + */ + +#ifndef BATCH_H_INCLUDED +#define BATCH_H_INCLUDED + +/* enum for batch states. Actually, we violate a layer here, in that we assume that a batch is used + * for action processing. So far, this seems acceptable, the status is simply ignored inside the + * main message queue. But over time, it could potentially be useful to split the two. + * rgerhad, 2009-05-12 + */ +typedef enum { + BATCH_STATE_RDY = 0, /* object ready for processing */ + BATCH_STATE_BAD = 1, /* unrecoverable failure while processing, do NOT resubmit to same action */ + BATCH_STATE_SUB = 2, /* message submitted for processing, outcome yet unkonwn */ + BATCH_STATE_COMM = 3, /* message successfully commited */ + BATCH_STATE_DISC = 4, /* discarded - processed OK, but do not submit to any other action */ +} batch_state_t; + + +/* an object inside a batch, including any information (state!) needed for it to "life". + */ +struct batch_obj_s { + obj_t *pUsrp; /* pointer to user object (most often message) */ + batch_state_t state; /* associated state */ +}; + +/* the batch + * This object is used to dequeue multiple user pointers which are than handed over + * to processing. 
The size of elements is fixed after queue creation, but may be + * modified by config variables (better said: queue properties). + * Note that a "user pointer" in rsyslog context so far always is a message + * object. We stick to the more generic term because queues may potentially hold + * other types of objects, too. + * rgerhards, 2009-05-12 + * Note that nElem is not necessarily equal to nElemDeq. This is the case when we + * discard some elements (because of configuration) during dequeue processing. As + * all Elements are only deleted when the batch is processed, we can not immediately + * delete them. So we need to keep their number that we can delete them when the batch + * is completed (else, the whole process does not work correctly). + */ +struct batch_s { + int nElem; /* actual number of element in this entry */ + int nElemDeq; /* actual number of elements dequeued (and thus to be deleted) - see comment above! */ + int iDoneUpTo; /* all messages below this index have state other than RDY */ + qDeqID deqID; /* ID of dequeue operation that generated this batch */ + batch_obj_t *pElem; /* batch elements */ +}; + +#endif /* #ifndef BATCH_H_INCLUDED */ diff --git a/runtime/conf.c b/runtime/conf.c index 83ed2e9b..2e37edf2 100644 --- a/runtime/conf.c +++ b/runtime/conf.c @@ -93,7 +93,7 @@ DEFobjCurrIf(net) DEFobjCurrIf(rule) DEFobjCurrIf(ruleset) -static int iNbrActions; /* number of actions the running config has. Needs to be init on ReInitConf() */ +static int iNbrActions = 0; /* number of currently defined actions */ /* The following global variables are used for building * tag and host selector lines during startup and config reload. @@ -1102,7 +1102,7 @@ static rsRetVal cflineDoAction(uchar **p, action_t **ppAction) dbgprintf("module is incompatible with RepeatedMsgReduction - turned off\n"); pAction->f_ReduceRepeated = 0; } - pAction->bEnabled = 1; /* action is enabled */ + pAction->eState = ACT_STATE_RDY; /* action is enabled */ iNbrActions++; /* one more active action! */ } break; @@ -1204,21 +1204,6 @@ cfline(uchar *line, rule_t **pfCurr) } -/* Reinitialize the configuration subsystem. This is a "work-around" to the fact - * that we do not yet have actual config objects. This method is to be called - * whenever a totally new config is started (which means on startup and HUP). - * Note that it MUST NOT be called for an included config file. - * rgerhards, 2008-07-28 - */ -static rsRetVal -ReInitConf(void) -{ - DEFiRet; - iNbrActions = 0; /* this is what we created the function for ;) - action count is reset */ - RETiRet; -} - - /* return the current number of active actions * rgerhards, 2008-07-28 */ @@ -1252,7 +1237,6 @@ CODESTARTobjQueryInterface(conf) pIf->doIncludeLine = doIncludeLine; pIf->cfline = cfline; pIf->processConfFile = processConfFile; - pIf->ReInitConf = ReInitConf; pIf->GetNbrActActions = GetNbrActActions; finalize_it: diff --git a/runtime/conf.h b/runtime/conf.h index 25b887be..6db1623e 100644 --- a/runtime/conf.h +++ b/runtime/conf.h @@ -37,10 +37,12 @@ BEGINinterface(conf) /* name must also be changed in ENDinterface macro! */ rsRetVal (*doIncludeLine)(uchar **pp, __attribute__((unused)) void* pVal); rsRetVal (*cfline)(uchar *line, rule_t **pfCurr); rsRetVal (*processConfFile)(uchar *pConfFile); - rsRetVal (*ReInitConf)(void); rsRetVal (*GetNbrActActions)(int *); ENDinterface(conf) -#define confCURR_IF_VERSION 2 /* increment whenever you change the interface structure! 
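
To illustrate how the batch structures defined above are meant to be used: a consumer walks pElem[0..nElem-1] and records the per-message outcome in the state field. A hypothetical sketch (submitToAction() is a placeholder, not a real function; only the fields declared above are relied upon):

/* Walk one batch and mark the outcome of each element. pUsrp is in
 * practice a msg_t*, as noted in the comments above.
 */
static void processBatch(struct batch_s *pBatch)
{
	int i;

	for(i = 0 ; i < pBatch->nElem ; ++i) {
		if(pBatch->pElem[i].state != BATCH_STATE_RDY)
			continue;	/* e.g. BATCH_STATE_DISC: skip, but keep for deletion */
		pBatch->pElem[i].state = BATCH_STATE_SUB;
		if(submitToAction(pBatch->pElem[i].pUsrp) == 0)
			pBatch->pElem[i].state = BATCH_STATE_COMM;
		else
			pBatch->pElem[i].state = BATCH_STATE_BAD;	/* do NOT resubmit */
	}
}
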
*/ +#define confCURR_IF_VERSION 3 /* increment whenever you change the interface structure! */ +/* in Version 3, entry point "ReInitConf()" was removed, as we do not longer need + * to support restart-type HUP -- rgerhards, 2009-07-15 + */ /* prototypes */ diff --git a/runtime/debug.c b/runtime/debug.c index 1b592ef3..959d56a3 100644 --- a/runtime/debug.c +++ b/runtime/debug.c @@ -830,13 +830,12 @@ sigsegvHdlr(int signum) abort(); } -#if 1 -#pragma GCC diagnostic ignored "-Wempty-body" -/* write the debug message. This is a helper to dbgprintf and dbgoprint which - * contains common code. added 2008-09-26 rgerhards +/* actually write the debug message. This is a separate fuction because the cleanup_push/_pop + * interface otherwise is unsafe to use (generates compiler warnings at least). + * 2009-05-20 rgerhards */ -static void -dbgprint(obj_t *pObj, char *pszMsg, size_t lenMsg) +static inline void +do_dbgprint(uchar *pszObjName, char *pszMsg, size_t lenMsg) { static pthread_t ptLastThrdID = 0; static int bWasNL = 0; @@ -844,20 +843,6 @@ dbgprint(obj_t *pObj, char *pszMsg, size_t lenMsg) char pszWriteBuf[32*1024]; size_t lenWriteBuf; struct timespec t; - uchar *pszObjName = NULL; - - /* we must get the object name before we lock the mutex, because the object - * potentially calls back into us. If we locked the mutex, we would deadlock - * ourselfs. On the other hand, the GetName call needs not to be protected, as - * this thread has a valid reference. If such an object is deleted by another - * thread, we are in much more trouble than just for dbgprint(). -- rgerhards, 2008-09-26 - */ - if(pObj != NULL) { - pszObjName = obj.GetName(pObj); - } - - pthread_mutex_lock(&mutdbgprint); - pthread_cleanup_push(dbgMutexCancelCleanupHdlr, &mutdbgprint); /* The bWasNL handler does not really work. It works if no thread * switching occurs during non-NL messages. Else, things are messed @@ -905,11 +890,35 @@ dbgprint(obj_t *pObj, char *pszMsg, size_t lenMsg) if(altdbg != -1) write(altdbg, pszMsg, lenMsg); bWasNL = (pszMsg[lenMsg - 1] == '\n') ? 1 : 0; +} + +#pragma GCC diagnostic ignored "-Wempty-body" +/* write the debug message. This is a helper to dbgprintf and dbgoprint which + * contains common code. added 2008-09-26 rgerhards + */ +static void +dbgprint(obj_t *pObj, char *pszMsg, size_t lenMsg) +{ + uchar *pszObjName = NULL; + + /* we must get the object name before we lock the mutex, because the object + * potentially calls back into us. If we locked the mutex, we would deadlock + * ourselfs. On the other hand, the GetName call needs not to be protected, as + * this thread has a valid reference. If such an object is deleted by another + * thread, we are in much more trouble than just for dbgprint(). -- rgerhards, 2008-09-26 + */ + if(pObj != NULL) { + pszObjName = obj.GetName(pObj); + } + + pthread_mutex_lock(&mutdbgprint); + pthread_cleanup_push(dbgMutexCancelCleanupHdlr, &mutdbgprint); + + do_dbgprint(pszObjName, pszMsg, lenMsg); pthread_cleanup_pop(1); } #pragma GCC diagnostic warning "-Wempty-body" -#endif /* print some debug output when an object is given * This is mostly a copy of dbgprintf, but I do not know how to combine it @@ -1052,7 +1061,9 @@ int dbgEntrFunc(dbgFuncDB_t **ppFuncDB, const char *file, const char *func, int /* when we reach this point, we have a fully-initialized FuncDB! 
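
The reason the actual write logic was split out into do_dbgprint() is a property of the POSIX cleanup handlers themselves: pthread_cleanup_push() and pthread_cleanup_pop() are macros that open and close a block, so they must be paired in the same lexical scope and the code between them is best kept trivial. A generic illustration of the pattern (not taken from the patch):

#include <pthread.h>

static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER;

static void unlockCB(void *arg)
{
	pthread_mutex_unlock((pthread_mutex_t*) arg);
}

static void lockedWork(void (*work)(void))
{
	pthread_mutex_lock(&mut);
	pthread_cleanup_push(unlockCB, &mut);	/* expands to '{ ...' */
	work();					/* may contain cancellation points */
	pthread_cleanup_pop(1);			/* '... }'; 1 => run unlockCB now */
}
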
*/ ATOMIC_INC(pFuncDB->nTimesCalled); if(bLogFuncFlow && dbgPrintNameIsInList((const uchar*)pFuncDB->file, printNameFileRoot)) - dbgprintf("%s:%d: %s: enter\n", pFuncDB->file, pFuncDB->line, pFuncDB->func); + if(strcmp(pFuncDB->file, "stringbuf.c")) { /* TODO: make configurable */ + dbgprintf("%s:%d: %s: enter\n", pFuncDB->file, pFuncDB->line, pFuncDB->func); + } if(pThrd->stackPtr >= (int) (sizeof(pThrd->callStack) / sizeof(dbgFuncDB_t*))) { dbgprintf("%s:%d: %s: debug module: call stack for this thread full, suspending call tracking\n", pFuncDB->file, pFuncDB->line, pFuncDB->func); @@ -1082,10 +1093,12 @@ void dbgExitFunc(dbgFuncDB_t *pFuncDB, int iStackPtrRestore, int iRet) dbgFuncDBPrintActiveMutexes(pFuncDB, "WARNING: mutex still owned by us as we exit function, mutex: ", pthread_self()); if(bLogFuncFlow && dbgPrintNameIsInList((const uchar*)pFuncDB->file, printNameFileRoot)) { - if(iRet == RS_RET_NO_IRET) - dbgprintf("%s:%d: %s: exit: (no iRet)\n", pFuncDB->file, pFuncDB->line, pFuncDB->func); - else - dbgprintf("%s:%d: %s: exit: %d\n", pFuncDB->file, pFuncDB->line, pFuncDB->func, iRet); + if(strcmp(pFuncDB->file, "stringbuf.c")) { /* TODO: make configurable */ + if(iRet == RS_RET_NO_IRET) + dbgprintf("%s:%d: %s: exit: (no iRet)\n", pFuncDB->file, pFuncDB->line, pFuncDB->func); + else + dbgprintf("%s:%d: %s: exit: %d\n", pFuncDB->file, pFuncDB->line, pFuncDB->func, iRet); + } } pThrd->stackPtr = iStackPtrRestore; if(pThrd->stackPtr < 0) { diff --git a/runtime/glbl.c b/runtime/glbl.c index 7fa61963..f27b8e73 100644 --- a/runtime/glbl.c +++ b/runtime/glbl.c @@ -39,6 +39,7 @@ #include "cfsysline.h" #include "glbl.h" #include "prop.h" +#include "atomic.h" /* some defaults */ #ifndef DFLT_NETSTRM_DRVR @@ -55,7 +56,6 @@ DEFobjCurrIf(prop) */ static uchar *pszWorkDir = NULL; static int bOptimizeUniProc = 1; /* enable uniprocessor optimizations */ -static int bHUPisRestart = 0; /* should SIGHUP cause a full system restart? */ static int bPreserveFQDN = 0; /* should FQDNs always be preserved? */ static int iMaxLine = 2048; /* maximum length of a syslog message */ static int iDefPFFamily = PF_UNSPEC; /* protocol family (IPv4, IPv6 or both) */ @@ -72,6 +72,7 @@ static uchar *pszDfltNetstrmDrvr = NULL; /* module name of default netstream dri static uchar *pszDfltNetstrmDrvrCAF = NULL; /* default CA file for the netstrm driver */ static uchar *pszDfltNetstrmDrvrKeyFile = NULL; /* default key file for the netstrm driver (server) */ static uchar *pszDfltNetstrmDrvrCertFile = NULL; /* default cert file for the netstrm driver (server) */ +static int bTerminateInputs = 0; /* global switch that inputs shall terminate ASAP (1=> terminate) */ /* define a macro for the simple properties' set and get functions @@ -95,7 +96,6 @@ static dataType Get##nameFunc(void) \ SIMP_PROP(OptimizeUniProc, bOptimizeUniProc, int) SIMP_PROP(PreserveFQDN, bPreserveFQDN, int) -SIMP_PROP(HUPisRestart, bHUPisRestart, int) SIMP_PROP(MaxLine, iMaxLine, int) SIMP_PROP(DefPFFamily, iDefPFFamily, int) /* note that in the future we may check the family argument */ SIMP_PROP(DropMalPTRMsgs, bDropMalPTRMsgs, int) @@ -117,6 +117,24 @@ SIMP_PROP_SET(DfltNetstrmDrvrCertFile, pszDfltNetstrmDrvrCertFile, uchar*) /* TO #undef SIMP_PROP_GET +/* return global input termination status + * rgerhards, 2009-07-20 + */ +static int GetGlobalInputTermState(void) +{ + return ATOMIC_FETCH_32BIT(bTerminateInputs); +} + + +/* set global termiantion state to "terminate". Note that this is a + * "once in a lifetime" action which can not be undone. 
-- gerhards, 2009-07-20 + */ +static void SetGlobalInputTermination(void) +{ + ATOMIC_STORE_1_TO_INT(bTerminateInputs); +} + + /* return our local hostname. if it is not set, "[localhost]" is returned */ static uchar* @@ -241,13 +259,14 @@ CODESTARTobjQueryInterface(glbl) pIf->GetWorkDir = GetWorkDir; pIf->GenerateLocalHostNameProperty = GenerateLocalHostNameProperty; pIf->GetLocalHostNameProp = GetLocalHostNameProp; + pIf->SetGlobalInputTermination = SetGlobalInputTermination; + pIf->GetGlobalInputTermState = GetGlobalInputTermState; #define SIMP_PROP(name) \ pIf->Get##name = Get##name; \ pIf->Set##name = Set##name; SIMP_PROP(MaxLine); SIMP_PROP(OptimizeUniProc); SIMP_PROP(PreserveFQDN); - SIMP_PROP(HUPisRestart); SIMP_PROP(DefPFFamily); SIMP_PROP(DropMalPTRMsgs); SIMP_PROP(Option_DisallowWarning); @@ -293,7 +312,6 @@ static rsRetVal resetConfigVariables(uchar __attribute__((unused)) *pp, void __a } bDropMalPTRMsgs = 0; bOptimizeUniProc = 1; - bHUPisRestart = 0; bPreserveFQDN = 0; return RS_RET_OK; } @@ -316,7 +334,6 @@ BEGINAbstractObjClassInit(glbl, 1, OBJ_IS_CORE_MODULE) /* class, version */ CHKiRet(regCfSysLineHdlr((uchar *)"defaultnetstreamdriverkeyfile", 0, eCmdHdlrGetWord, NULL, &pszDfltNetstrmDrvrKeyFile, NULL)); CHKiRet(regCfSysLineHdlr((uchar *)"defaultnetstreamdrivercertfile", 0, eCmdHdlrGetWord, NULL, &pszDfltNetstrmDrvrCertFile, NULL)); CHKiRet(regCfSysLineHdlr((uchar *)"optimizeforuniprocessor", 0, eCmdHdlrBinary, NULL, &bOptimizeUniProc, NULL)); - CHKiRet(regCfSysLineHdlr((uchar *)"hupisrestart", 0, eCmdHdlrBinary, NULL, &bHUPisRestart, NULL)); CHKiRet(regCfSysLineHdlr((uchar *)"preservefqdn", 0, eCmdHdlrBinary, NULL, &bPreserveFQDN, NULL)); CHKiRet(regCfSysLineHdlr((uchar *)"resetconfigvariables", 1, eCmdHdlrCustomHandler, resetConfigVariables, NULL, NULL)); ENDObjClassInit(glbl) diff --git a/runtime/glbl.h b/runtime/glbl.h index dcfb6d5f..0d0c8210 100644 --- a/runtime/glbl.h +++ b/runtime/glbl.h @@ -8,7 +8,7 @@ * Please note that there currently is no glbl.c file as we do not yet * have any implementations. * - * Copyright 2008 Rainer Gerhards and Adiscon GmbH. + * Copyright 2008, 2009 Rainer Gerhards and Adiscon GmbH. * * This file is part of the rsyslog runtime library. * @@ -44,7 +44,6 @@ BEGINinterface(glbl) /* name must also be changed in ENDinterface macro! */ rsRetVal (*Set##name)(dataType); SIMP_PROP(MaxLine, int) SIMP_PROP(OptimizeUniProc, int) - SIMP_PROP(HUPisRestart, int) SIMP_PROP(PreserveFQDN, int) SIMP_PROP(DefPFFamily, int) SIMP_PROP(DropMalPTRMsgs, int) @@ -62,9 +61,12 @@ BEGINinterface(glbl) /* name must also be changed in ENDinterface macro! */ /* added v3, 2009-06-30 */ rsRetVal (*GenerateLocalHostNameProperty)(void); prop_t* (*GetLocalHostNameProp)(void); + /* added v4, 2009-07-20 */ + int (*GetGlobalInputTermState)(void); + void (*SetGlobalInputTermination)(void); #undef SIMP_PROP ENDinterface(glbl) -#define glblCURR_IF_VERSION 3 /* increment whenever you change the interface structure! */ +#define glblCURR_IF_VERSION 4 /* increment whenever you change the interface structure! 
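
The intended consumers of this flag are the input modules: instead of being restarted on HUP, they are expected to poll GetGlobalInputTermState() in their receive loop and return once it flips to 1. A hypothetical sketch of such a loop (the macros and the glbl interface are used as elsewhere in the code base; the loop body is illustrative):

/* hypothetical input module run loop */
static rsRetVal runInput(void)
{
	DEFiRet;

	while(glbl.GetGlobalInputTermState() == 0) {
		/* wait briefly for data, then parse and submit messages;
		 * long blocking calls should use a timeout so that the
		 * flag is re-checked regularly.
		 */
	}
	RETiRet;
}
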
*/ /* version 2 had PreserveFQDN added - rgerhards, 2008-12-08 */ /* the remaining prototypes */ diff --git a/runtime/module-template.h b/runtime/module-template.h index 3e963199..d49da2c9 100644 --- a/runtime/module-template.h +++ b/runtime/module-template.h @@ -368,6 +368,17 @@ static rsRetVal queryEtryPt(uchar *name, rsRetVal (**pEtryPoint)())\ *pEtryPoint = endTransaction;\ } + +/* the following definition is a queryEtryPt block that must be added + * if a non-output module supports "isCompatibleWithFeature". + * rgerhards, 2009-07-20 + */ +#define CODEqueryEtryPt_IsCompatibleWithFeature_IF_OMOD_QUERIES \ + else if(!strcmp((char*) name, "isCompatibleWithFeature")) {\ + *pEtryPoint = isCompatibleWithFeature;\ + } + + /* the following definition is the standard block for queryEtryPt for INPUT * modules. This can be used if no specific handling (e.g. to cover version * differences) is needed. diff --git a/runtime/modules.c b/runtime/modules.c index 871f356a..bdb15e7f 100644 --- a/runtime/modules.c +++ b/runtime/modules.c @@ -77,6 +77,27 @@ static modInfo_t *pLoadedModulesLast = NULL; /* tail-pointer */ uchar *pModDir = NULL; /* read-only after startup */ +/* we provide a set of dummy functions for modules that do not support the + * some interfaces. + * On the commit feature: As the modules do not support it, they commit each message they + * receive, and as such the dummies can always return RS_RET_OK without causing + * harm. This simplifies things as in action processing we do not need to check + * if the transactional entry points exist. + */ +static rsRetVal dummyBeginTransaction() +{ + return RS_RET_OK; +} +static rsRetVal dummyEndTransaction() +{ + return RS_RET_OK; +} +static rsRetVal dummyIsCompatibleWithFeature() +{ +dbgprintf("XXX: dummy isCompatibleWithFeature called!\n"); + return RS_RET_INCOMPATIBLE; +} + #ifdef DEBUG /* we add some home-grown support to track our users (and detect who does not free us). In * the long term, this should probably be migrated into debug.c (TODO). -- rgerhards, 2008-03-11 @@ -216,19 +237,38 @@ static void moduleDestruct(modInfo_t *pThis) } +/* This enables a module to query the core for specific features. + * rgerhards, 2009-04-22 + */ +static rsRetVal queryCoreFeatureSupport(int *pBool, unsigned uFeat) +{ + DEFiRet; + + if((pBool == NULL)) + ABORT_FINALIZE(RS_RET_PARAM_ERROR); + + *pBool = (uFeat & CORE_FEATURE_BATCHING) ? 1 : 0; + +finalize_it: + RETiRet; +} + + /* The following function is the queryEntryPoint for host-based entry points. * Modules may call it to get access to core interface functions. Please note * that utility functions can be accessed via shared libraries - at least this * is my current shool of thinking. * Please note that the implementation as a query interface allows to take * care of plug-in interface version differences. -- rgerhards, 2007-07-31 + * ... but often it better not to use a new interface. So we now add core + * functions here that a plugin may request. 
-- rgerhards, 2009-04-22 */ static rsRetVal queryHostEtryPt(uchar *name, rsRetVal (**pEtryPoint)()) { DEFiRet; if((name == NULL) || (pEtryPoint == NULL)) - return RS_RET_PARAM_ERROR; + ABORT_FINALIZE(RS_RET_PARAM_ERROR); if(!strcmp((char*) name, "regCfSysLineHdlr")) { *pEtryPoint = regCfSysLineHdlr; @@ -236,6 +276,8 @@ static rsRetVal queryHostEtryPt(uchar *name, rsRetVal (**pEtryPoint)()) *pEtryPoint = objGetObjInterface; } else if(!strcmp((char*) name, "OMSRgetSupportedTplOpts")) { *pEtryPoint = OMSRgetSupportedTplOpts; + } else if(!strcmp((char*) name, "queryCoreFeatureSupport")) { + *pEtryPoint = queryCoreFeatureSupport; } else { *pEtryPoint = NULL; /* to be on the safe side */ ABORT_FINALIZE(RS_RET_ENTRY_POINT_NOT_FOUND); @@ -392,6 +434,11 @@ doModInit(rsRetVal (*modInit)(int, int*, rsRetVal(**)(), rsRetVal(*)(), modInfo_ */ CHKiRet((*pNew->modQueryEtryPt)((uchar*)"modGetID", &pNew->modGetID)); CHKiRet((*pNew->modQueryEtryPt)((uchar*)"modExit", &pNew->modExit)); + localRet = (*pNew->modQueryEtryPt)((uchar*)"isCompatibleWithFeature", &pNew->isCompatibleWithFeature); + if(localRet == RS_RET_MODULE_ENTRY_POINT_NOT_FOUND) + pNew->isCompatibleWithFeature = dummyIsCompatibleWithFeature; + else if(localRet != RS_RET_OK) + ABORT_FINALIZE(localRet); /* ... and now the module-specific interfaces */ switch(pNew->eType) { @@ -406,12 +453,25 @@ doModInit(rsRetVal (*modInit)(int, int*, rsRetVal(**)(), rsRetVal(*)(), modInfo_ CHKiRet((*pNew->modQueryEtryPt)((uchar*)"dbgPrintInstInfo", &pNew->dbgPrintInstInfo)); CHKiRet((*pNew->modQueryEtryPt)((uchar*)"doAction", &pNew->mod.om.doAction)); CHKiRet((*pNew->modQueryEtryPt)((uchar*)"parseSelectorAct", &pNew->mod.om.parseSelectorAct)); - CHKiRet((*pNew->modQueryEtryPt)((uchar*)"isCompatibleWithFeature", &pNew->isCompatibleWithFeature)); CHKiRet((*pNew->modQueryEtryPt)((uchar*)"tryResume", &pNew->tryResume)); /* try load optional interfaces */ localRet = (*pNew->modQueryEtryPt)((uchar*)"doHUP", &pNew->doHUP); if(localRet != RS_RET_OK && localRet != RS_RET_MODULE_ENTRY_POINT_NOT_FOUND) ABORT_FINALIZE(localRet); + + localRet = (*pNew->modQueryEtryPt)((uchar*)"beginTransaction", &pNew->mod.om.beginTransaction); + if(localRet == RS_RET_MODULE_ENTRY_POINT_NOT_FOUND) + pNew->mod.om.beginTransaction = dummyBeginTransaction; + else if(localRet != RS_RET_OK) + ABORT_FINALIZE(localRet); + + localRet = (*pNew->modQueryEtryPt)((uchar*)"endTransaction", &pNew->mod.om.endTransaction); + if(localRet == RS_RET_MODULE_ENTRY_POINT_NOT_FOUND) { + pNew->mod.om.endTransaction = dummyEndTransaction; + //pNew->mod.om.beginTransaction = dummyEndTransaction; + } else if(localRet != RS_RET_OK) { + ABORT_FINALIZE(localRet); + } break; case eMOD_LIB: break; diff --git a/runtime/modules.h b/runtime/modules.h index 4d874019..71e3199c 100644 --- a/runtime/modules.h +++ b/runtime/modules.h @@ -111,7 +111,9 @@ typedef struct modInfo_s { struct {/* data for output modules */ /* below: perform the configured action */ + rsRetVal (*beginTransaction)(void*); rsRetVal (*doAction)(uchar**, unsigned, void*); + rsRetVal (*endTransaction)(void*); rsRetVal (*parseSelectorAct)(uchar**, void**,omodStringRequest_t**); } om; struct { /* data for library modules */ diff --git a/runtime/msg.c b/runtime/msg.c index 208ea77a..d28ee350 100644 --- a/runtime/msg.c +++ b/runtime/msg.c @@ -1136,15 +1136,21 @@ char *getProtocolVersionString(msg_t *pM) } -static char *getRawMsg(msg_t *pM) +static inline void +getRawMsg(msg_t *pM, uchar **pBuf, int *piLen) { - if(pM == NULL) - return ""; - else - if(pM->pszRawMsg 
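
With the dummies above in place, beginTransaction and endTransaction are guaranteed to be non-NULL, so the caller can drive every output module through one uniform sequence. A rough sketch of that calling contract (illustrative only; the real batching logic lives in the action and queue code, and parameter marshalling is omitted):

/* Submit a set of already-formatted messages to one output module
 * instance. For non-transactional modules, begin/end are the dummies
 * registered in doModInit() and simply return RS_RET_OK.
 */
static rsRetVal submitToModule(modInfo_t *pMod, void *pModData,
                               uchar ***ppMsgs, unsigned nParams, int nMsgs)
{
	int i;
	DEFiRet;

	CHKiRet(pMod->mod.om.beginTransaction(pModData));
	for(i = 0 ; i < nMsgs ; ++i) {
		CHKiRet(pMod->mod.om.doAction(ppMsgs[i], nParams, pModData));
	}
	CHKiRet(pMod->mod.om.endTransaction(pModData));

finalize_it:
	RETiRet;
}
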
== NULL) - return ""; - else - return (char*)pM->pszRawMsg; + if(pM == NULL) { + *pBuf= UCHAR_CONSTANT(""); + *piLen = 0; + } else { + if(pM->pszRawMsg == NULL) { + *pBuf= UCHAR_CONSTANT(""); + *piLen = 0; + } else { + *pBuf = pM->pszRawMsg; + *piLen = pM->iLenRawMsg; + } + } } @@ -1171,7 +1177,7 @@ uchar *getMSG(msg_t *pM) if(pM == NULL) ret = UCHAR_CONSTANT(""); else { - if(pM->offMSG == -1) + if(pM->iLenMSG == 0) ret = UCHAR_CONSTANT(""); else ret = pM->pszRawMsg + pM->offMSG; @@ -1762,10 +1768,10 @@ int getProgramNameLen(msg_t *pM, bool bLockMutex) /* get the "programname" as sz string * rgerhards, 2005-10-19 */ -char *getProgramName(msg_t *pM, bool bLockMutex) +uchar *getProgramName(msg_t *pM, bool bLockMutex) { prepareProgramName(pM, bLockMutex); - return (pM->pCSProgName == NULL) ? "" : (char*) rsCStrGetSzStrNoNULL(pM->pCSProgName); + return (pM->pCSProgName == NULL) ? UCHAR_CONSTANT("") : rsCStrGetSzStrNoNULL(pM->pCSProgName); } @@ -1782,7 +1788,7 @@ static void tryEmulateAPPNAME(msg_t *pM) if(getProtocolVersion(pM) == 0) { /* only then it makes sense to emulate */ - MsgSetAPPNAME(pM, getProgramName(pM, MUTEX_ALREADY_LOCKED)); + MsgSetAPPNAME(pM, (char*)getProgramName(pM, MUTEX_ALREADY_LOCKED)); } } @@ -1947,12 +1953,20 @@ void MsgSetHOSTNAME(msg_t *pThis, uchar* pszHOSTNAME, int lenHOSTNAME) /* set the offset of the MSG part into the raw msg buffer + * Note that the offset may be higher than the length of the raw message + * (exactly by one). This can happen if we have a message that does not + * contain any MSG part. */ void MsgSetMSGoffs(msg_t *pMsg, short offs) { ISOBJ_TYPE_assert(pMsg, msg); - pMsg->iLenMSG = pMsg->iLenRawMsg - offs; pMsg->offMSG = offs; + if(offs > pMsg->iLenRawMsg) { + assert(offs - 1 == pMsg->iLenRawMsg); + pMsg->iLenMSG = 0; + } else { + pMsg->iLenMSG = pMsg->iLenRawMsg - offs; + } } @@ -1986,7 +2000,8 @@ rsRetVal MsgReplaceMSG(msg_t *pThis, uchar* pszMSG, int lenMSG) pThis->pszRawMsg = bufNew; } - memcpy(pThis->pszRawMsg + pThis->offMSG, pszMSG, lenMSG); + if(lenMSG > 0) + memcpy(pThis->pszRawMsg + pThis->offMSG, pszMSG, lenMSG); pThis->pszRawMsg[lenNew] = '\0'; /* this also works with truncation! 
*/ pThis->iLenRawMsg = lenNew; pThis->iLenMSG = lenMSG; @@ -2172,12 +2187,13 @@ uchar *MsgGetProp(msg_t *pMsg, struct templateEntry *pTpe, break; case PROP_HOSTNAME: pRes = (uchar*)getHOSTNAME(pMsg); + bufLen = getHOSTNAMELen(pMsg); break; case PROP_SYSLOGTAG: getTAG(pMsg, &pRes, &bufLen); break; case PROP_RAWMSG: - pRes = (uchar*)getRawMsg(pMsg); + getRawMsg(pMsg, &pRes, &bufLen); break; /* enable this, if someone actually uses UxTradMsg, delete after some time has * passed and nobody complained -- rgerhards, 2009-06-16 @@ -2209,6 +2225,7 @@ uchar *MsgGetProp(msg_t *pMsg, struct templateEntry *pTpe, break; case PROP_IUT: pRes = UCHAR_CONSTANT("1"); /* always 1 for syslog messages (a MonitorWare thing;)) */ + bufLen = 1; break; case PROP_SYSLOGFACILITY: pRes = (uchar*)getFacility(pMsg); @@ -2226,7 +2243,7 @@ uchar *MsgGetProp(msg_t *pMsg, struct templateEntry *pTpe, pRes = (uchar*)getTimeGenerated(pMsg, pTpe->data.field.eDateFormat); break; case PROP_PROGRAMNAME: - pRes = (uchar*)getProgramName(pMsg, LOCK_MUTEX); + pRes = getProgramName(pMsg, LOCK_MUTEX); break; case PROP_PROTOCOL_VERSION: pRes = (uchar*)getProtocolVersionString(pMsg); diff --git a/runtime/msg.h b/runtime/msg.h index 3a02365b..b006cbec 100644 --- a/runtime/msg.h +++ b/runtime/msg.h @@ -176,7 +176,7 @@ int getMSGLen(msg_t *pM); char *getHOSTNAME(msg_t *pM); int getHOSTNAMELen(msg_t *pM); -char *getProgramName(msg_t *pM, bool bLockMutex); +uchar *getProgramName(msg_t *pM, bool bLockMutex); int getProgramNameLen(msg_t *pM, bool bLockMutex); uchar *getRcvFrom(msg_t *pM); rsRetVal propNameToID(cstr_t *pCSPropName, propid_t *pPropID); diff --git a/runtime/queue.c b/runtime/queue.c index ddff1bcf..101052a1 100644 --- a/runtime/queue.c +++ b/runtime/queue.c @@ -8,7 +8,11 @@ * (and in the web doc set on http://www.rsyslog.com/doc). Be sure to read it * if you are getting aquainted to the object. * - * Copyright 2008 Rainer Gerhards and Adiscon GmbH. + * NOTE: as of 2009-04-22, I have begin to remove the qqueue* prefix from static + * function names - this makes it really hard to read and does not provide much + * benefit, at least I (now) think so... + * + * Copyright 2008, 2009 Rainer Gerhards and Adiscon GmbH. * * This file is part of the rsyslog runtime library. 
* @@ -51,10 +55,10 @@ #include "wti.h" #include "msg.h" #include "atomic.h" +#include "msg.h" /* TODO: remove once we remove MsgAddRef() call */ #ifdef OS_SOLARIS # include <sched.h> -# define pthread_yield() sched_yield() #endif /* static data */ @@ -63,59 +67,157 @@ DEFobjCurrIf(glbl) DEFobjCurrIf(strm) /* forward-definitions */ -static rsRetVal qqueueChkPersist(qqueue_t *pThis); -static rsRetVal qqueueSetEnqOnly(qqueue_t *pThis, int bEnqOnly, int bLockMutex); -static rsRetVal qqueueRateLimiter(qqueue_t *pThis); +static rsRetVal qqueueChkPersist(qqueue_t *pThis, int nUpdates); +static rsRetVal SetEnqOnly(qqueue_t *pThis, int bEnqOnly, int bLockMutex); +static rsRetVal RateLimiter(qqueue_t *pThis); static int qqueueChkStopWrkrDA(qqueue_t *pThis); +static rsRetVal GetDeqBatchSize(qqueue_t *pThis, int *pVal); static int qqueueIsIdleDA(qqueue_t *pThis); -static rsRetVal qqueueConsumerDA(qqueue_t *pThis, wti_t *pWti, int iCancelStateSave); -static rsRetVal qqueueConsumerCancelCleanup(void *arg1, void *arg2); -static rsRetVal qqueueUngetObj(qqueue_t *pThis, obj_t *pUsr, int bLockMutex); +static rsRetVal ConsumerDA(qqueue_t *pThis, wti_t *pWti); +static rsRetVal batchProcessed(qqueue_t *pThis, wti_t *pWti); /* some constants for queuePersist () */ #define QUEUE_CHECKPOINT 1 #define QUEUE_NO_CHECKPOINT 0 +/*********************************************************************** + * we need a private data structure, the "to-delete" list. As C does + * not provide any partly private data structures, we implement this + * structure right here inside the module. + * Note that this list must always be kept sorted based on a unique + * dequeue ID (which is monotonically increasing). + * rgerhards, 2009-05-18 + ***********************************************************************/ + +/* generate next uniqueue dequeue ID. Note that uniqueness is only required + * on a per-queue basis and while this instance runs. So a stricly monotonically + * increasing counter is sufficient (if enough bits are used). + */ +static inline qDeqID getNextDeqID(qqueue_t *pQueue) +{ + ISOBJ_TYPE_assert(pQueue, qqueue); + return pQueue->deqIDAdd++; +} + + +/* return the top element of the to-delete list or NULL, if the + * list is empty. + */ +static inline toDeleteLst_t *tdlPeek(qqueue_t *pQueue) +{ + ISOBJ_TYPE_assert(pQueue, qqueue); + return pQueue->toDeleteLst; +} + + +/* remove the top element of the to-delete list. Nothing but the + * element itself is destroyed. Must not be called when the list + * is empty. + */ +static inline rsRetVal tdlPop(qqueue_t *pQueue) +{ + toDeleteLst_t *pRemove; + DEFiRet; + + ISOBJ_TYPE_assert(pQueue, qqueue); + assert(pQueue->toDeleteLst != NULL); + + pRemove = pQueue->toDeleteLst; + pQueue->toDeleteLst = pQueue->toDeleteLst->pNext; + free(pRemove); + + RETiRet; +} + + +/* Add a new to-delete list entry. The function allocates the data + * structure, populates it with the values provided and links the new + * element into the correct place inside the list. 
+ */ +static inline rsRetVal tdlAdd(qqueue_t *pQueue, qDeqID deqID, int nElemDeq) +{ + toDeleteLst_t *pNew; + toDeleteLst_t *pPrev; + DEFiRet; + + ISOBJ_TYPE_assert(pQueue, qqueue); + assert(pQueue->toDeleteLst != NULL); + + CHKmalloc(pNew = malloc(sizeof(toDeleteLst_t))); + pNew->deqID = deqID; + pNew->nElemDeq = nElemDeq; + + /* now find right spot */ + for( pPrev = pQueue->toDeleteLst + ; pPrev != NULL && deqID > pPrev->deqID + ; pPrev = pPrev->pNext) { + /*JUST SEARCH*/; + } + + if(pPrev == NULL) { + pNew->pNext = pQueue->toDeleteLst; + pQueue->toDeleteLst = pNew; + } else { + pNew->pNext = pPrev->pNext; + pPrev->pNext = pNew; + } + +finalize_it: + RETiRet; +} + + /* methods */ -/* get the overall queue size, which includes ungotten objects. Must only be called +/* get the physical queue size. Must only be called * while mutex is locked! * rgerhards, 2008-01-29 */ static inline int -qqueueGetOverallQueueSize(qqueue_t *pThis) +getPhysicalQueueSize(qqueue_t *pThis) { -#if 0 /* leave a bit in for debugging -- rgerhards, 2008-01-30 */ -BEGINfunc -dbgoprint((obj_t*) pThis, "queue size: %d (regular %d, ungotten %d)\n", - pThis->iQueueSize + pThis->iUngottenObjs, pThis->iQueueSize, pThis->iUngottenObjs); -ENDfunc -#endif - return pThis->iQueueSize + pThis->iUngottenObjs; + return pThis->iQueueSize; } +/* get the logical queue size (that is store size minus logically dequeued elements). + * Must only be called while mutex is locked! + * rgerhards, 2009-05-19 + */ +static inline int +getLogicalQueueSize(qqueue_t *pThis) +{ + return pThis->iQueueSize - pThis->nLogDeq; +} + + + /* This function drains the queue in cases where this needs to be done. The most probable * reason is a HUP which needs to discard data (because the queue is configured to be lossy). * During a shutdown, this is typically not needed, as the OS frees up ressources and does * this much quicker than when we clean up ourselvs. -- rgerhards, 2008-10-21 * This function returns void, as it makes no sense to communicate an error back, even if * it happens. + * This functions works "around" the regular deque mechanism, because it is only used to + * clean up (in cases where message loss is acceptable). */ static inline void queueDrain(qqueue_t *pThis) { void *pUsr; - ASSERT(pThis != NULL); + BEGINfunc + DBGOPRINT((obj_t*) pThis, "queue (type %d) will lose %d messages, destroying...\n", pThis->qType, pThis->iQueueSize); /* iQueueSize is not decremented by qDel(), so we need to do it ourselves */ while(ATOMIC_DEC_AND_FETCH(pThis->iQueueSize) > 0) { - pThis->qDel(pThis, &pUsr); + pThis->qDeq(pThis, &pUsr); if(pUsr != NULL) { objDestruct(pUsr); } + pThis->qDel(pThis); } + ENDfunc } @@ -138,37 +240,17 @@ static inline rsRetVal qqueueAdviseMaxWorkers(qqueue_t *pThis) /* if we have not yet reached the high water mark, there is no need to start a * worker. 
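
A sketch of how the to-delete list is meant to be driven once a batch completes (hypothetical: the deqIDDel "next batch to delete" counter and the deleteFromStore() helper are assumptions made for illustration; only tdlPeek/tdlPop/tdlAdd and the toDeleteLst_t fields come from the code above):

/* Called when a batch has been fully processed. Store deletes must
 * happen in dequeue order, but batches may complete out of order.
 */
static void batchProcessedSketch(qqueue_t *pQueue, qDeqID deqID, int nElemDeq)
{
	if(deqID == pQueue->deqIDDel) {			/* assumed "next to delete" counter */
		deleteFromStore(pQueue, nElemDeq);	/* placeholder for the qDel calls */
		pQueue->deqIDDel++;
		/* drain any younger batches that already finished */
		while(tdlPeek(pQueue) != NULL && tdlPeek(pQueue)->deqID == pQueue->deqIDDel) {
			deleteFromStore(pQueue, tdlPeek(pQueue)->nElemDeq);
			pQueue->deqIDDel++;
			tdlPop(pQueue);
		}
	} else {
		tdlAdd(pQueue, deqID, nElemDeq);	/* remember for later, kept sorted by deqID */
	}
}
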
-- rgerhards, 2008-01-26 */ - if(qqueueGetOverallQueueSize(pThis) >= pThis->iHighWtrMrk || pThis->bQueueStarted == 0) { + if(getLogicalQueueSize(pThis) >= pThis->iHighWtrMrk || pThis->bQueueStarted == 0) { wtpAdviseMaxWorkers(pThis->pWtpDA, 1); /* disk queues have always one worker */ } + } + /* regular workers always run */ + if(pThis->qType == QUEUETYPE_DISK || pThis->iMinMsgsPerWrkr == 0) { + iMaxWorkers = 1; } else { - if(pThis->qType == QUEUETYPE_DISK || pThis->iMinMsgsPerWrkr == 0) { - iMaxWorkers = 1; - } else { - iMaxWorkers = qqueueGetOverallQueueSize(pThis) / pThis->iMinMsgsPerWrkr + 1; - } - wtpAdviseMaxWorkers(pThis->pWtpReg, iMaxWorkers); /* disk queues have always one worker */ + iMaxWorkers = getLogicalQueueSize(pThis) / pThis->iMinMsgsPerWrkr + 1; } - } - - RETiRet; -} - - -/* wait until we have a fully initialized DA queue. Sometimes, we need to - * sync with it, as we expect it for some function. - * rgerhards, 2008-02-27 - */ -static rsRetVal -qqueueWaitDAModeInitialized(qqueue_t *pThis) -{ - DEFiRet; - - ISOBJ_TYPE_assert(pThis, qqueue); - ASSERT(pThis->bRunsDA); - - while(pThis->bRunsDA != 2) { - d_pthread_cond_wait(&pThis->condDAReady, pThis->mut); + wtpAdviseMaxWorkers(pThis->pWtpReg, iMaxWorkers); /* disk queues have always one worker */ } RETiRet; @@ -186,45 +268,16 @@ qqueueWaitDAModeInitialized(qqueue_t *pThis) * rgerhards, 2008-01-15 */ static rsRetVal -qqueueTurnOffDAMode(qqueue_t *pThis) +TurnOffDAMode(qqueue_t *pThis) { DEFiRet; ISOBJ_TYPE_assert(pThis, qqueue); ASSERT(pThis->bRunsDA); - - /* at this point, we need a fully initialized DA queue. So if it isn't, we finally need - * to wait for its startup... -- rgerhards, 2008-01-25 - */ - qqueueWaitDAModeInitialized(pThis); - - /* if we need to pull any data that we still need from the (child) disk queue, - * now would be the time to do so. At present, we do not need this, but I'd like to - * keep that comment if future need arises. - */ - - /* we need to check if the DA queue is empty because the DA worker may simply have - * terminated do to no new messages arriving. That does not, however, mean that the - * DA queue is empty. If there is still data in that queue, we do nothing and leave - * that for a later incarnation of this function (it will be called multiple times - * during the lifetime of DA-mode, depending on how often the DA worker receives an - * inactivity timeout. -- rgerhards, 2008-01-25 - */ - if(pThis->pqDA->iQueueSize == 0) { + if(getLogicalQueueSize(pThis->pqDA) == 0) { pThis->bRunsDA = 0; /* tell the world we are back in non-DA mode */ - /* we destruct the queue object, which will also shutdown the queue worker. As the queue is empty, - * this will be quick. - */ - qqueueDestruct(&pThis->pqDA); /* and now we are ready to destruct the DA queue */ - dbgoprint((obj_t*) pThis, "disk-assistance has been turned off, disk queue was empty (iRet %d)\n", + DBGOPRINT((obj_t*) pThis, "disk-assistance has been turned off, disk queue was empty (iRet %d)\n", iRet); - /* now we need to check if the regular queue has some messages. This may be the case - * when it is waiting that the high water mark is reached again. If so, we need to start up - * a regular worker. 
-- rgerhards, 2008-01-26 - */ - if(qqueueGetOverallQueueSize(pThis) > 0) { - qqueueAdviseMaxWorkers(pThis); - } } RETiRet; @@ -246,9 +299,9 @@ qqueueChkIsDA(qqueue_t *pThis) ISOBJ_TYPE_assert(pThis, qqueue); if(pThis->pszFilePrefix != NULL) { pThis->bIsDA = 1; - dbgoprint((obj_t*) pThis, "is disk-assisted, disk will be used on demand\n"); + DBGOPRINT((obj_t*) pThis, "is disk-assisted, disk will be used on demand\n"); } else { - dbgoprint((obj_t*) pThis, "is NOT disk-assisted\n"); + DBGOPRINT((obj_t*) pThis, "is NOT disk-assisted\n"); } RETiRet; @@ -267,16 +320,13 @@ qqueueChkIsDA(qqueue_t *pThis) * rgerhards, 2008-01-15 */ static rsRetVal -qqueueStartDA(qqueue_t *pThis) +StartDA(qqueue_t *pThis) { DEFiRet; uchar pszDAQName[128]; ISOBJ_TYPE_assert(pThis, qqueue); - if(pThis->bRunsDA == 2) /* check if already in (fully initialized) DA mode... */ - FINALIZE; /* ... then we are already done! */ - /* create message queue */ CHKiRet(qqueueConstruct(&pThis->pqDA, QUEUETYPE_DISK , 1, 0, pThis->pConsumer)); @@ -298,11 +348,15 @@ qqueueStartDA(qqueue_t *pThis) CHKiRet(qqueueSetbSyncQueueFiles(pThis->pqDA, pThis->bSyncQueueFiles)); CHKiRet(qqueueSettoActShutdown(pThis->pqDA, pThis->toActShutdown)); CHKiRet(qqueueSettoEnq(pThis->pqDA, pThis->toEnq)); - CHKiRet(qqueueSetEnqOnly(pThis->pqDA, pThis->bDAEnqOnly, MUTEX_ALREADY_LOCKED)); + CHKiRet(SetEnqOnly(pThis->pqDA, pThis->bDAEnqOnly, MUTEX_ALREADY_LOCKED)); CHKiRet(qqueueSetiDeqtWinFromHr(pThis->pqDA, pThis->iDeqtWinFromHr)); CHKiRet(qqueueSetiDeqtWinToHr(pThis->pqDA, pThis->iDeqtWinToHr)); CHKiRet(qqueueSetiHighWtrMrk(pThis->pqDA, 0)); CHKiRet(qqueueSetiDiscardMrk(pThis->pqDA, 0)); + + // experimental: XXX + CHKiRet(qqueueSettoWrkShutdown(pThis->pqDA, 0)); + if(pThis->toQShutdown == 0) { CHKiRet(qqueueSettoQShutdown(pThis->pqDA, 0)); /* if the user really wants... */ } else { @@ -318,19 +372,9 @@ qqueueStartDA(qqueue_t *pThis) if(iRet != RS_RET_OK && iRet != RS_RET_FILE_NOT_FOUND) FINALIZE; /* something is wrong */ - /* as we are right now starting DA mode because we are so busy, it is - * extremely unlikely that any regular worker is sleeping on empty queue. HOWEVER, - * we want to be on the safe side, and so we awake anyone that is waiting - * on one. So even if the scheduler plays badly with us, things should be - * quite well. -- rgerhards, 2008-01-15 - */ - wtpWakeupWrkr(pThis->pWtpReg); /* awake all workers, but not ourselves ;) */ - - pThis->bRunsDA = 2; /* we are now in DA mode, but not fully initialized */ - pThis->bChildIsDone = 0;/* set to 1 when child's worker detect queue is finished */ - pthread_cond_broadcast(&pThis->condDAReady); /* signal we are now initialized and ready to go ;) */ + //pthread_cond_broadcast(&pThis->condDAReady); /* signal we are now initialized and ready to go ;) */ - dbgoprint((obj_t*) pThis, "is now running in disk assisted mode, disk queue 0x%lx\n", + DBGOPRINT((obj_t*) pThis, "is now running in disk assisted mode, disk queue 0x%lx\n", qqueueGetID(pThis->pqDA)); finalize_it: @@ -338,7 +382,7 @@ finalize_it: if(pThis->pqDA != NULL) { qqueueDestruct(&pThis->pqDA); } - dbgoprint((obj_t*) pThis, "error %d creating disk queue - giving up.\n", iRet); + DBGOPRINT((obj_t*) pThis, "error %d creating disk queue - giving up.\n", iRet); pThis->bIsDA = 0; } @@ -352,8 +396,8 @@ finalize_it: * If this function fails (should not happen), DA mode is not turned on. 
* rgerhards, 2008-01-16 */ -static inline rsRetVal -qqueueInitDA(qqueue_t *pThis, int bEnqOnly, int bLockMutex) +static rsRetVal +InitDA(qqueue_t *pThis, int bEnqOnly, int bLockMutex) { DEFiRet; DEFVARS_mutexProtection; @@ -366,17 +410,18 @@ qqueueInitDA(qqueue_t *pThis, int bEnqOnly, int bLockMutex) * is intentional. We assume that when we need it once, we may also need it on another * occasion. Ressources used are quite minimal when no worker is running. * rgerhards, 2008-01-24 + * NOTE: this is the DA worker *pool*, not the DA queue! */ if(pThis->pWtpDA == NULL) { - lenBuf = snprintf((char*)pszBuf, sizeof(pszBuf), "%s:DA", obj.GetName((obj_t*) pThis)); + lenBuf = snprintf((char*)pszBuf, sizeof(pszBuf), "%s:DAwpool", obj.GetName((obj_t*) pThis)); CHKiRet(wtpConstruct (&pThis->pWtpDA)); CHKiRet(wtpSetDbgHdr (pThis->pWtpDA, pszBuf, lenBuf)); CHKiRet(wtpSetpfChkStopWrkr (pThis->pWtpDA, (rsRetVal (*)(void *pUsr, int)) qqueueChkStopWrkrDA)); - CHKiRet(wtpSetpfIsIdle (pThis->pWtpDA, (rsRetVal (*)(void *pUsr, int)) qqueueIsIdleDA)); - CHKiRet(wtpSetpfDoWork (pThis->pWtpDA, (rsRetVal (*)(void *pUsr, void *pWti, int)) qqueueConsumerDA)); - CHKiRet(wtpSetpfOnWorkerCancel (pThis->pWtpDA, (rsRetVal (*)(void *pUsr, void*pWti)) qqueueConsumerCancelCleanup)); - CHKiRet(wtpSetpfOnWorkerStartup (pThis->pWtpDA, (rsRetVal (*)(void *pUsr)) qqueueStartDA)); - CHKiRet(wtpSetpfOnWorkerShutdown(pThis->pWtpDA, (rsRetVal (*)(void *pUsr)) qqueueTurnOffDAMode)); + CHKiRet(wtpSetpfGetDeqBatchSize (pThis->pWtpDA, (rsRetVal (*)(void *pUsr, int*)) GetDeqBatchSize)); + CHKiRet(wtpSetpfIsIdle (pThis->pWtpDA, (rsRetVal (*)(void *pUsr, wtp_t*)) qqueueIsIdleDA)); + CHKiRet(wtpSetpfDoWork (pThis->pWtpDA, (rsRetVal (*)(void *pUsr, void *pWti)) ConsumerDA)); + CHKiRet(wtpSetpfObjProcessed (pThis->pWtpDA, (rsRetVal (*)(void *pUsr, wti_t *pWti)) batchProcessed)); + CHKiRet(wtpSetpfOnWorkerShutdown(pThis->pWtpDA, (rsRetVal (*)(void *pUsr)) TurnOffDAMode)); CHKiRet(wtpSetpmutUsr (pThis->pWtpDA, pThis->mut)); CHKiRet(wtpSetpcondBusy (pThis->pWtpDA, &pThis->notEmpty)); CHKiRet(wtpSetiNumWorkerThreads (pThis->pWtpDA, 1)); @@ -387,14 +432,20 @@ qqueueInitDA(qqueue_t *pThis, int bEnqOnly, int bLockMutex) /* if we reach this point, we have a "good" DA worker pool */ /* indicate we now run in DA mode - this is reset by the DA worker if it fails */ - pThis->bRunsDA = 1; pThis->bDAEnqOnly = bEnqOnly; + /* now construct the actual queue (if it does not already exist) */ + if(pThis->pqDA == NULL) { + CHKiRet(StartDA(pThis)); + } + + pThis->bRunsDA = 1; + /* now we must now adivse the wtp that we need one worker. If none is yet active, * that will also start one up. If we forgot that step, everything would be stalled * until the next enqueue request. */ - wtpAdviseMaxWorkers(pThis->pWtpDA, 1); /* DA queues alsways have just one worker max */ + wtpAdviseMaxWorkers(pThis->pWtpDA, 1); /* DA queues always have just one worker max */ finalize_it: END_MTX_PROTECTED_OPERATIONS(pThis->mut); @@ -408,15 +459,15 @@ finalize_it: * complete. 
* rgerhards, 2008-01-14 */ -static inline rsRetVal -qqueueChkStrtDA(qqueue_t *pThis) +static rsRetVal +ChkStrtDA(qqueue_t *pThis) { DEFiRet; ISOBJ_TYPE_assert(pThis, qqueue); /* if we do not hit the high water mark, we have nothing to do */ - if(qqueueGetOverallQueueSize(pThis) != pThis->iHighWtrMrk) + if(getPhysicalQueueSize(pThis) != pThis->iHighWtrMrk) ABORT_FINALIZE(RS_RET_OK); if(pThis->bRunsDA) { @@ -429,16 +480,16 @@ qqueueChkStrtDA(qqueue_t *pThis) * terminated due to the inactivity timeout, thus we need to advise the pool that * we need at least one). */ - dbgoprint((obj_t*) pThis, "%d entries - passed high water mark in DA mode, send notify\n", - qqueueGetOverallQueueSize(pThis)); + DBGOPRINT((obj_t*) pThis, "%d entries - passed high water mark in DA mode, send notify\n", + getPhysicalQueueSize(pThis)); qqueueAdviseMaxWorkers(pThis); } else { /* this is the case when we are currently not running in DA mode. So it is time * to turn it back on. */ - dbgoprint((obj_t*) pThis, "%d entries - passed high water mark for disk-assisted mode, initiating...\n", - qqueueGetOverallQueueSize(pThis)); - qqueueInitDA(pThis, QUEUE_MODE_ENQDEQ, MUTEX_ALREADY_LOCKED); /* initiate DA mode */ + DBGOPRINT((obj_t*) pThis, "%d entries - passed high water mark for disk-assisted mode, initiating...\n", + getPhysicalQueueSize(pThis)); + InitDA(pThis, QUEUE_MODE_ENQDEQ, MUTEX_ALREADY_LOCKED); /* initiate DA mode */ } finalize_it: @@ -469,6 +520,7 @@ static rsRetVal qConstructFixedArray(qqueue_t *pThis) ABORT_FINALIZE(RS_RET_OUT_OF_MEMORY); } + pThis->tVars.farray.deqhead = 0; pThis->tVars.farray.head = 0; pThis->tVars.farray.tail = 0; @@ -486,9 +538,7 @@ static rsRetVal qDestructFixedArray(qqueue_t *pThis) ASSERT(pThis != NULL); queueDrain(pThis); /* discard any remaining queue entries */ - - if(pThis->tVars.farray.pBuf != NULL) - free(pThis->tVars.farray.pBuf); + free(pThis->tVars.farray.pBuf); RETiRet; } @@ -507,76 +557,57 @@ static rsRetVal qAddFixedArray(qqueue_t *pThis, void* in) RETiRet; } -static rsRetVal qDelFixedArray(qqueue_t *pThis, void **out) + +static rsRetVal qDeqFixedArray(qqueue_t *pThis, void **out) { DEFiRet; ASSERT(pThis != NULL); - *out = (void*) pThis->tVars.farray.pBuf[pThis->tVars.farray.head]; + *out = (void*) pThis->tVars.farray.pBuf[pThis->tVars.farray.deqhead]; - pThis->tVars.farray.head++; - if (pThis->tVars.farray.head == pThis->iMaxQueueSize) - pThis->tVars.farray.head = 0; + pThis->tVars.farray.deqhead++; + if (pThis->tVars.farray.deqhead == pThis->iMaxQueueSize) + pThis->tVars.farray.deqhead = 0; RETiRet; } -/* -------------------- linked list -------------------- */ - -/* first some generic functions which are also used for the unget linked list */ - -static inline rsRetVal qqueueAddLinkedList(qLinkedList_t **ppRoot, qLinkedList_t **ppLast, void* pUsr) +static rsRetVal qDelFixedArray(qqueue_t *pThis) { DEFiRet; - qLinkedList_t *pEntry; - - ASSERT(ppRoot != NULL); - ASSERT(ppLast != NULL); - - if((pEntry = (qLinkedList_t*) malloc(sizeof(qLinkedList_t))) == NULL) { - ABORT_FINALIZE(RS_RET_OUT_OF_MEMORY); - } - pEntry->pNext = NULL; - pEntry->pUsr = pUsr; + ASSERT(pThis != NULL); - if(*ppRoot == NULL) { - *ppRoot = *ppLast = pEntry; - } else { - (*ppLast)->pNext = pEntry; - *ppLast = pEntry; - } + pThis->tVars.farray.head++; + if (pThis->tVars.farray.head == pThis->iMaxQueueSize) + pThis->tVars.farray.head = 0; -finalize_it: RETiRet; } -static inline rsRetVal qqueueDelLinkedList(qLinkedList_t **ppRoot, qLinkedList_t **ppLast, obj_t **ppUsr) + +/* reset the logical dequeue 
pointer to the physical dequeue position. + * This is only needed after we cancelled workers (during queue shutdown). + */ +static rsRetVal +qUnDeqAllFixedArray(qqueue_t *pThis) { DEFiRet; - qLinkedList_t *pEntry; - ASSERT(ppRoot != NULL); - ASSERT(ppLast != NULL); - ASSERT(ppUsr != NULL); - ASSERT(*ppRoot != NULL); - - pEntry = *ppRoot; - *ppUsr = pEntry->pUsr; + ISOBJ_TYPE_assert(pThis, qqueue); - if(*ppRoot == *ppLast) { - *ppRoot = NULL; - *ppLast = NULL; - } else { - *ppRoot = pEntry->pNext; - } - free(pEntry); + DBGOPRINT((obj_t*) pThis, "resetting FixedArray deq index to %ld (was %ld), logical dequeue count %d\n", + pThis->tVars.farray.head, pThis->tVars.farray.deqhead, pThis->nLogDeq); + + pThis->tVars.farray.deqhead = pThis->tVars.farray.head; + pThis->nLogDeq = 0; RETiRet; } -/* end generic functions which are also used for the unget linked list */ + +/* -------------------- linked list -------------------- */ static rsRetVal qConstructLinkedList(qqueue_t *pThis) @@ -585,8 +616,9 @@ static rsRetVal qConstructLinkedList(qqueue_t *pThis) ASSERT(pThis != NULL); - pThis->tVars.linklist.pRoot = 0; - pThis->tVars.linklist.pLast = 0; + pThis->tVars.linklist.pDeqRoot = NULL; + pThis->tVars.linklist.pDelRoot = NULL; + pThis->tVars.linklist.pLast = NULL; qqueueChkIsDA(pThis); @@ -609,54 +641,79 @@ static rsRetVal qDestructLinkedList(qqueue_t __attribute__((unused)) *pThis) static rsRetVal qAddLinkedList(qqueue_t *pThis, void* pUsr) { - DEFiRet; - - iRet = qqueueAddLinkedList(&pThis->tVars.linklist.pRoot, &pThis->tVars.linklist.pLast, pUsr); -#if 0 qLinkedList_t *pEntry; + DEFiRet; - ASSERT(pThis != NULL); - if((pEntry = (qLinkedList_t*) malloc(sizeof(qLinkedList_t))) == NULL) { - ABORT_FINALIZE(RS_RET_OUT_OF_MEMORY); - } + CHKmalloc((pEntry = (qLinkedList_t*) malloc(sizeof(qLinkedList_t)))); pEntry->pNext = NULL; pEntry->pUsr = pUsr; - if(pThis->tVars.linklist.pRoot == NULL) { - pThis->tVars.linklist.pRoot = pThis->tVars.linklist.pLast = pEntry; + if(pThis->tVars.linklist.pDelRoot == NULL) { + pThis->tVars.linklist.pDelRoot = pThis->tVars.linklist.pDeqRoot = pThis->tVars.linklist.pLast = pEntry; } else { pThis->tVars.linklist.pLast->pNext = pEntry; pThis->tVars.linklist.pLast = pEntry; } + if(pThis->tVars.linklist.pDeqRoot == NULL) { + pThis->tVars.linklist.pDeqRoot = pEntry; + } + finalize_it: -#endif RETiRet; } -static rsRetVal qDelLinkedList(qqueue_t *pThis, obj_t **ppUsr) + +static rsRetVal qDeqLinkedList(qqueue_t *pThis, obj_t **ppUsr) { - DEFiRet; - iRet = qqueueDelLinkedList(&pThis->tVars.linklist.pRoot, &pThis->tVars.linklist.pLast, ppUsr); -#if 0 qLinkedList_t *pEntry; + DEFiRet; - ASSERT(pThis != NULL); - ASSERT(pThis->tVars.linklist.pRoot != NULL); - - pEntry = pThis->tVars.linklist.pRoot; + pEntry = pThis->tVars.linklist.pDeqRoot; + ISOBJ_TYPE_assert(pEntry->pUsr, msg); *ppUsr = pEntry->pUsr; + pThis->tVars.linklist.pDeqRoot = pEntry->pNext; + + RETiRet; +} + + +static rsRetVal qDelLinkedList(qqueue_t *pThis) +{ + qLinkedList_t *pEntry; + DEFiRet; - if(pThis->tVars.linklist.pRoot == pThis->tVars.linklist.pLast) { - pThis->tVars.linklist.pRoot = NULL; - pThis->tVars.linklist.pLast = NULL; + pEntry = pThis->tVars.linklist.pDelRoot; + + if(pThis->tVars.linklist.pDelRoot == pThis->tVars.linklist.pLast) { + pThis->tVars.linklist.pDelRoot = pThis->tVars.linklist.pDeqRoot = pThis->tVars.linklist.pLast = NULL; } else { - pThis->tVars.linklist.pRoot = pEntry->pNext; + pThis->tVars.linklist.pDelRoot = pEntry->pNext; } + free(pEntry); -#endif + RETiRet; +} + + +/* reset the logical 
dequeue pointer to the physical dequeue position. + * This is only needed after we cancelled workers (during queue shutdown). + */ +static rsRetVal +qUnDeqAllLinkedList(qqueue_t *pThis) +{ + DEFiRet; + + ASSERT(pThis != NULL); + + DBGOPRINT((obj_t*) pThis, "resetting LinkedList deq ptr to %p (was %p), logical dequeue count %d\n", + pThis->tVars.linklist.pDelRoot, pThis->tVars.linklist.pDeqRoot, pThis->nLogDeq); + + pThis->tVars.linklist.pDeqRoot = pThis->tVars.linklist.pDelRoot; + pThis->nLogDeq = 0; + RETiRet; } @@ -700,10 +757,10 @@ qqueueHaveQIF(qqueue_t *pThis) /* check if the file exists */ if(stat((char*) pszQIFNam, &stat_buf) == -1) { if(errno == ENOENT) { - dbgoprint((obj_t*) pThis, "no .qi file found\n"); + DBGOPRINT((obj_t*) pThis, "no .qi file found\n"); ABORT_FINALIZE(RS_RET_FILE_NOT_FOUND); } else { - dbgoprint((obj_t*) pThis, "error %d trying to access .qi file\n", errno); + DBGOPRINT((obj_t*) pThis, "error %d trying to access .qi file\n", errno); ABORT_FINALIZE(RS_RET_IO_ERROR); } } @@ -725,8 +782,6 @@ qqueueTryLoadPersistedInfo(qqueue_t *pThis) uchar pszQIFNam[MAXFNAME]; size_t lenQIFNam; struct stat stat_buf; - int iUngottenObjs; - obj_t *pUsr; ISOBJ_TYPE_assert(pThis, qqueue); @@ -737,10 +792,10 @@ qqueueTryLoadPersistedInfo(qqueue_t *pThis) /* check if the file exists */ if(stat((char*) pszQIFNam, &stat_buf) == -1) { if(errno == ENOENT) { - dbgoprint((obj_t*) pThis, "clean startup, no .qi file found\n"); + DBGOPRINT((obj_t*) pThis, "clean startup, no .qi file found\n"); ABORT_FINALIZE(RS_RET_FILE_NOT_FOUND); } else { - dbgoprint((obj_t*) pThis, "error %d trying to access .qi file\n", errno); + DBGOPRINT((obj_t*) pThis, "error %d trying to access .qi file\n", errno); ABORT_FINALIZE(RS_RET_IO_ERROR); } } @@ -756,25 +811,22 @@ qqueueTryLoadPersistedInfo(qqueue_t *pThis) /* first, we try to read the property bag for ourselfs */ CHKiRet(obj.DeserializePropBag((obj_t*) pThis, psQIF)); - /* then the ungotten object queue */ - iUngottenObjs = pThis->iUngottenObjs; - pThis->iUngottenObjs = 0; /* will be incremented when we add objects! */ - - while(iUngottenObjs > 0) { - /* fill the queue from disk */ - CHKiRet(obj.Deserialize((void*) &pUsr, (uchar*)"msg", psQIF, NULL, NULL)); - qqueueUngetObj(pThis, pUsr, MUTEX_ALREADY_LOCKED); - --iUngottenObjs; /* one less */ - } - - /* and now the stream objects (some order as when persisted!) */ + /* then the stream objects (same order as when persisted!) */ CHKiRet(obj.Deserialize(&pThis->tVars.disk.pWrite, (uchar*) "strm", psQIF, (rsRetVal(*)(obj_t*,void*))qqueueLoadPersStrmInfoFixup, pThis)); - CHKiRet(obj.Deserialize(&pThis->tVars.disk.pRead, (uchar*) "strm", psQIF, + CHKiRet(obj.Deserialize(&pThis->tVars.disk.pReadDel, (uchar*) "strm", psQIF, (rsRetVal(*)(obj_t*,void*))qqueueLoadPersStrmInfoFixup, pThis)); + /* create a duplicate for the read "pointer". + */ + + CHKiRet(strm.Dup(pThis->tVars.disk.pReadDel, &pThis->tVars.disk.pReadDeq)); + CHKiRet(strm.SetbDeleteOnClose(pThis->tVars.disk.pReadDeq, 0)); /* deq must NOT delete the files! */ + CHKiRet(strm.ConstructFinalize(pThis->tVars.disk.pReadDeq)); + CHKiRet(strm.SeekCurrOffs(pThis->tVars.disk.pWrite)); - CHKiRet(strm.SeekCurrOffs(pThis->tVars.disk.pRead)); + CHKiRet(strm.SeekCurrOffs(pThis->tVars.disk.pReadDel)); + CHKiRet(strm.SeekCurrOffs(pThis->tVars.disk.pReadDeq)); /* OK, we could successfully read the file, so we now can request that it be * deleted when we are done with the persisted information. 
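The reload code above now restores a single read stream from the .qi file (pReadDel) and duplicates it into a second, non-deleting cursor (pReadDeq), then seeks all streams to their saved offsets; dequeueing and deleting can therefore advance independently over the same queue files. The stand-alone sketch below is not rsyslog's strm API - it merely illustrates the two-cursor idea with plain stdio and invented names:

#include <stdio.h>

/* illustrative stand-in for a queue read stream: a file handle, its current
 * offset and a flag telling whether closing the cursor removes the file */
typedef struct {
    FILE *fp;
    long  offs;
    int   bDeleteOnClose;
} cursor_t;

/* duplicate an existing cursor: the copy starts at the same offset but never
 * deletes the backing file - the dequeue cursor may run ahead, while only the
 * delete cursor is allowed to reclaim fully processed files */
static int cursor_dup(const cursor_t *del, const char *path, cursor_t *deq)
{
    if((deq->fp = fopen(path, "rb")) == NULL)
        return -1;
    deq->offs = del->offs;
    deq->bDeleteOnClose = 0;                 /* deq must NOT delete the files */
    return fseek(deq->fp, deq->offs, SEEK_SET);
}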
@@ -786,7 +838,7 @@ finalize_it: strm.Destruct(&psQIF); if(iRet != RS_RET_OK) { - dbgoprint((obj_t*) pThis, "error %d reading .qi file - can not read persisted info (if any)\n", + DBGOPRINT((obj_t*) pThis, "error %d reading .qi file - can not read persisted info (if any)\n", iRet); } @@ -826,18 +878,26 @@ static rsRetVal qConstructDisk(qqueue_t *pThis) CHKiRet(strm.SetsType(pThis->tVars.disk.pWrite, STREAMTYPE_FILE_CIRCULAR)); CHKiRet(strm.ConstructFinalize(pThis->tVars.disk.pWrite)); - CHKiRet(strm.Construct(&pThis->tVars.disk.pRead)); - CHKiRet(strm.SetbSync(pThis->tVars.disk.pRead, pThis->bSyncQueueFiles)); - CHKiRet(strm.SetbDeleteOnClose(pThis->tVars.disk.pRead, 1)); - CHKiRet(strm.SetDir(pThis->tVars.disk.pRead, glbl.GetWorkDir(), strlen((char*)glbl.GetWorkDir()))); - CHKiRet(strm.SetiMaxFiles(pThis->tVars.disk.pRead, 10000000)); - CHKiRet(strm.SettOperationsMode(pThis->tVars.disk.pRead, STREAMMODE_READ)); - CHKiRet(strm.SetsType(pThis->tVars.disk.pRead, STREAMTYPE_FILE_CIRCULAR)); - CHKiRet(strm.ConstructFinalize(pThis->tVars.disk.pRead)); + CHKiRet(strm.Construct(&pThis->tVars.disk.pReadDeq)); + CHKiRet(strm.SetbDeleteOnClose(pThis->tVars.disk.pReadDeq, 0)); + CHKiRet(strm.SetDir(pThis->tVars.disk.pReadDeq, glbl.GetWorkDir(), strlen((char*)glbl.GetWorkDir()))); + CHKiRet(strm.SetiMaxFiles(pThis->tVars.disk.pReadDeq, 10000000)); + CHKiRet(strm.SettOperationsMode(pThis->tVars.disk.pReadDeq, STREAMMODE_READ)); + CHKiRet(strm.SetsType(pThis->tVars.disk.pReadDeq, STREAMTYPE_FILE_CIRCULAR)); + CHKiRet(strm.ConstructFinalize(pThis->tVars.disk.pReadDeq)); + CHKiRet(strm.Construct(&pThis->tVars.disk.pReadDel)); + CHKiRet(strm.SetbSync(pThis->tVars.disk.pReadDel, pThis->bSyncQueueFiles)); + CHKiRet(strm.SetbDeleteOnClose(pThis->tVars.disk.pReadDel, 1)); + CHKiRet(strm.SetDir(pThis->tVars.disk.pReadDel, glbl.GetWorkDir(), strlen((char*)glbl.GetWorkDir()))); + CHKiRet(strm.SetiMaxFiles(pThis->tVars.disk.pReadDel, 10000000)); + CHKiRet(strm.SettOperationsMode(pThis->tVars.disk.pReadDel, STREAMMODE_READ)); + CHKiRet(strm.SetsType(pThis->tVars.disk.pReadDel, STREAMTYPE_FILE_CIRCULAR)); + CHKiRet(strm.ConstructFinalize(pThis->tVars.disk.pReadDel)); - CHKiRet(strm.SetFName(pThis->tVars.disk.pWrite, pThis->pszFilePrefix, pThis->lenFilePrefix)); - CHKiRet(strm.SetFName(pThis->tVars.disk.pRead, pThis->pszFilePrefix, pThis->lenFilePrefix)); + CHKiRet(strm.SetFName(pThis->tVars.disk.pWrite, pThis->pszFilePrefix, pThis->lenFilePrefix)); + CHKiRet(strm.SetFName(pThis->tVars.disk.pReadDeq, pThis->pszFilePrefix, pThis->lenFilePrefix)); + CHKiRet(strm.SetFName(pThis->tVars.disk.pReadDel, pThis->pszFilePrefix, pThis->lenFilePrefix)); } /* now we set (and overwrite in case of a persisted restart) some parameters which @@ -846,7 +906,8 @@ static rsRetVal qConstructDisk(qqueue_t *pThis) * ability to read existing queue files. 
-- rgerhards, 2008-01-12 */ CHKiRet(strm.SetiMaxFileSize(pThis->tVars.disk.pWrite, pThis->iMaxFileSize)); - CHKiRet(strm.SetiMaxFileSize(pThis->tVars.disk.pRead, pThis->iMaxFileSize)); + CHKiRet(strm.SetiMaxFileSize(pThis->tVars.disk.pReadDeq, pThis->iMaxFileSize)); + CHKiRet(strm.SetiMaxFileSize(pThis->tVars.disk.pReadDel, pThis->iMaxFileSize)); finalize_it: RETiRet; @@ -860,7 +921,8 @@ static rsRetVal qDestructDisk(qqueue_t *pThis) ASSERT(pThis != NULL); strm.Destruct(&pThis->tVars.disk.pWrite); - strm.Destruct(&pThis->tVars.disk.pRead); + strm.Destruct(&pThis->tVars.disk.pReadDeq); + strm.Destruct(&pThis->tVars.disk.pReadDel); RETiRet; } @@ -885,23 +947,37 @@ static rsRetVal qAddDisk(qqueue_t *pThis, void* pUsr) */ objDestruct(pUsr); - dbgoprint((obj_t*) pThis, "write wrote %lld octets to disk, queue disk size now %lld octets\n", + DBGOPRINT((obj_t*) pThis, "write wrote %lld octets to disk, queue disk size now %lld octets\n", nWriteCount, pThis->tVars.disk.sizeOnDisk); finalize_it: RETiRet; } -static rsRetVal qDelDisk(qqueue_t *pThis, void **ppUsr) + +static rsRetVal qDeqDisk(qqueue_t *pThis, void **ppUsr) { DEFiRet; + CHKiRet(obj.Deserialize(ppUsr, (uchar*) "msg", pThis->tVars.disk.pReadDeq, NULL, NULL)); + +finalize_it: + RETiRet; +} + + +static rsRetVal qDelDisk(qqueue_t *pThis) +{ + obj_t *pDummyObj; /* we need to deserialize it... */ + DEFiRet; + int64 offsIn; int64 offsOut; - CHKiRet(strm.GetCurrOffset(pThis->tVars.disk.pRead, &offsIn)); - CHKiRet(obj.Deserialize(ppUsr, (uchar*) "msg", pThis->tVars.disk.pRead, NULL, NULL)); - CHKiRet(strm.GetCurrOffset(pThis->tVars.disk.pRead, &offsOut)); + CHKiRet(strm.GetCurrOffset(pThis->tVars.disk.pReadDel, &offsIn)); + CHKiRet(obj.Deserialize(&pDummyObj, (uchar*) "msg", pThis->tVars.disk.pReadDel, NULL, NULL)); + objDestruct(pDummyObj); + CHKiRet(strm.GetCurrOffset(pThis->tVars.disk.pReadDel, &offsOut)); /* This time it is a bit tricky: we free disk space only upon file deletion. So we need * to keep track of what we have read until we get an out-offset that is lower than the @@ -913,7 +989,7 @@ static rsRetVal qDelDisk(qqueue_t *pThis, void **ppUsr) } else { pThis->tVars.disk.sizeOnDisk -= pThis->tVars.disk.bytesRead; pThis->tVars.disk.bytesRead = offsOut; - dbgoprint((obj_t*) pThis, "a file has been deleted, now %lld octets disk space used\n", pThis->tVars.disk.sizeOnDisk); + DBGOPRINT((obj_t*) pThis, "a file has been deleted, now %lld octets disk space used\n", pThis->tVars.disk.sizeOnDisk); /* awake possibly waiting enq process */ pthread_cond_signal(&pThis->notFull); /* we hold the mutex while we are in here! */ } @@ -922,6 +998,17 @@ finalize_it: RETiRet; } + +/* This is a dummy function for disks - we do not need to reset anything + * because everything is already persisted... + */ +static rsRetVal +qUnDeqAllDisk(__attribute__((unused)) qqueue_t *pThis) +{ + return RS_RET_OK; +} + + /* -------------------- direct (no queueing) -------------------- */ static rsRetVal qConstructDirect(qqueue_t __attribute__((unused)) *pThis) { @@ -936,6 +1023,8 @@ static rsRetVal qDestructDirect(qqueue_t __attribute__((unused)) *pThis) static rsRetVal qAddDirect(qqueue_t *pThis, void* pUsr) { + batch_t singleBatch; + batch_obj_t batchObj; DEFiRet; ASSERT(pThis != NULL); @@ -945,70 +1034,33 @@ static rsRetVal qAddDirect(qqueue_t *pThis, void* pUsr) * mode the consumer probably has a lot to convey (which get's lost in the other modes * because they are asynchronous. But direct mode is deliberately synchronous. 
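qDelDisk above frees disk space only when a whole queue file has been deleted, which it detects when the offset after a read is lower than expected (the reader has wrapped to a new file); until then it merely tracks reading progress. The sketch below models just that accounting in simplified form; the non-deletion branch is an assumption, as that part of the code lies outside this hunk:

#include <stdint.h>

/* bookkeeping for circular queue files: disk space is reclaimed only when a
 * whole file is deleted, which shows up as the read offset jumping backwards */
typedef struct {
    int64_t sizeOnDisk;   /* total octets currently occupied on disk */
    int64_t bytesRead;    /* octets read from the current file so far */
} sk_disk_acct_t;

/* called after one record was consumed; offsIn/offsOut are the read offsets
 * before and after the deserialization of that record */
static void sk_account_delete(sk_disk_acct_t *a, int64_t offsIn, int64_t offsOut)
{
    if(offsOut > offsIn) {
        a->bytesRead += offsOut - offsIn;    /* still inside the same file (assumed) */
    } else {
        /* offset wrapped: a file was deleted, release the space it occupied */
        a->sizeOnDisk -= a->bytesRead;
        a->bytesRead = offsOut;
    }
}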
* rgerhards, 2008-02-12 + * We use our knowledge about the batch_t structure below, but without that, we + * pay a too-large performance toll... -- rgerhards, 2009-04-22 */ - iRet = pThis->pConsumer(pThis->pUsr, pUsr); + batchObj.state = BATCH_STATE_RDY; + batchObj.pUsrp = (obj_t*) pUsr; + singleBatch.nElem = 1; /* there always is only one in direct mode */ + singleBatch.pElem = &batchObj; + iRet = pThis->pConsumer(pThis->pUsr, &singleBatch); + objDestruct(pUsr); RETiRet; } -static rsRetVal qDelDirect(qqueue_t __attribute__((unused)) *pThis, __attribute__((unused)) void **out) + +static rsRetVal qDelDirect(qqueue_t __attribute__((unused)) *pThis) { return RS_RET_OK; } - -/* --------------- end type-specific handlers -------------------- */ - - -/* unget a user pointer that has been dequeued. This functionality is especially important - * for consumer cancel cleanup handlers. To support it, a short list of ungotten user pointers - * is maintened in memory. - * rgerhards, 2008-01-20 - */ static rsRetVal -qqueueUngetObj(qqueue_t *pThis, obj_t *pUsr, int bLockMutex) +qUnDeqAllDirect(__attribute__((unused)) qqueue_t *pThis) { - DEFiRet; - DEFVARS_mutexProtection; - - ISOBJ_TYPE_assert(pThis, qqueue); - ISOBJ_assert(pUsr); /* TODO: we aborted right at this place at least 3 times -- race? 2008-02-28, -03-10, -03-15 - The second time I noticed it the queue was in destruction with NO worker threads - running. The pUsr ptr was totally off and provided no clue what it may be pointing - at (except that it looked like the static data pool). Both times, the abort happend - inside an action queue */ - - dbgoprint((obj_t*) pThis, "ungetting user object %s\n", obj.GetName(pUsr)); - BEGIN_MTX_PROTECTED_OPERATIONS(pThis->mut, bLockMutex); - iRet = qqueueAddLinkedList(&pThis->pUngetRoot, &pThis->pUngetLast, pUsr); - ++pThis->iUngottenObjs; /* indicate one more */ - END_MTX_PROTECTED_OPERATIONS(pThis->mut); - - RETiRet; + return RS_RET_OK; } -/* dequeues a user pointer from the ungotten queue. Pointers from there should always be - * dequeued first. - * - * This function must only be called when the mutex is locked! - * - * rgerhards, 2008-01-29 - */ -static rsRetVal -qqueueGetUngottenObj(qqueue_t *pThis, obj_t **ppUsr) -{ - DEFiRet; - - ISOBJ_TYPE_assert(pThis, qqueue); - ASSERT(ppUsr != NULL); - - iRet = qqueueDelLinkedList(&pThis->pUngetRoot, &pThis->pUngetLast, ppUsr); - --pThis->iUngottenObjs; /* indicate one less */ - dbgoprint((obj_t*) pThis, "dequeued ungotten user object %s\n", obj.GetName(*ppUsr)); - - RETiRet; -} +/* --------------- end type-specific handlers -------------------- */ /* generic code to add a queue entry @@ -1027,7 +1079,8 @@ qqueueAdd(qqueue_t *pThis, void *pUsr) if(pThis->qType != QUEUETYPE_DIRECT) { ATOMIC_INC(pThis->iQueueSize); - dbgoprint((obj_t*) pThis, "entry added, size now %d entries\n", pThis->iQueueSize); + DBGOPRINT((obj_t*) pThis, "entry added, size now log %d, phys %d entries\n", + getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis)); } finalize_it: @@ -1035,12 +1088,10 @@ finalize_it: } -/* generic code to remove a queue entry - * rgerhards, 2008-01-29: we must first see if there is any object in the - * ungotten list and, if so, dequeue it first. +/* generic code to dequeue a queue entry */ static rsRetVal -qqueueDel(qqueue_t *pThis, void *pUsr) +qqueueDeq(qqueue_t *pThis, void **ppUsr) { DEFiRet; @@ -1051,53 +1102,36 @@ qqueueDel(qqueue_t *pThis, void *pUsr) * If we decrement, however, we may lose a message. 
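qAddDirect now builds a one-element batch on the stack and hands it to the consumer, since the consumer callback signature changed to take a batch_t. Below is a reduced stand-alone version of that wrapping; sk_batch_t and sk_batch_obj_t are simplified stand-ins for rsyslog's batch structures, which carry additional state:

/* reduced stand-ins for rsyslog's batch types (the real ones carry more state) */
typedef enum { SK_BATCH_STATE_RDY = 0 } sk_batch_state_t;
typedef struct { void *pUsrp; sk_batch_state_t state; } sk_batch_obj_t;
typedef struct { int nElem; sk_batch_obj_t *pElem; } sk_batch_t;

typedef int (*sk_consumer_f)(void *pConsumerData, sk_batch_t *pBatch);

/* direct mode: no queueing at all - wrap the single object into a one-element
 * batch and run the consumer synchronously on the caller's thread */
static int sk_add_direct(sk_consumer_f consumer, void *pConsumerData, void *pMsg)
{
    sk_batch_obj_t obj;
    sk_batch_t batch;

    obj.pUsrp = pMsg;
    obj.state = SK_BATCH_STATE_RDY;
    batch.nElem = 1;       /* direct mode always hands over exactly one element */
    batch.pElem = &obj;
    return consumer(pConsumerData, &batch);
}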
But that is better than * losing the whole process because it loops... -- rgerhards, 2008-01-03 */ - if(pThis->iUngottenObjs > 0) { - iRet = qqueueGetUngottenObj(pThis, (obj_t**) pUsr); - } else { - iRet = pThis->qDel(pThis, pUsr); - ATOMIC_DEC(pThis->iQueueSize); - } + iRet = pThis->qDeq(pThis, ppUsr); + ATOMIC_INC(pThis->nLogDeq); - dbgoprint((obj_t*) pThis, "entry deleted, state %d, size now %d entries\n", - iRet, pThis->iQueueSize); +// DBGOPRINT((obj_t*) pThis, "entry deleted, size now log %d, phys %d entries\n", +// getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis)); RETiRet; } -/* This function shuts down all worker threads and waits until they - * have terminated. If they timeout, they are cancelled. Parameters have been set - * before this function is called so that DA queues will be fully persisted to - * disk (if configured to do so). - * rgerhards, 2008-01-24 - * Please note that this function shuts down BOTH the parent AND the child queue - * in DA case. This is necessary because their timeouts are tightly coupled. Most - * importantly, the timeouts would be applied twice (or logic be extremely - * complex) if each would have its own shutdown. The function does not self check - * this condition - the caller must make sure it is not called with a parent. +/* Try to terminate queue worker threads within the regular shutdown interval. + * Both the regular and DA queue (if it exists) is waited for, but on the same timeout. + * After this function returns, the workers must either be finished or some force + * to finish them must be applied. + * This function also instructs the DA worker pool (if it exists) to terminate. This is done + * in preparation of final queue shutdown. + * rgerhards, 2009-05-27 */ -static rsRetVal qqueueShutdownWorkers(qqueue_t *pThis) +static rsRetVal +tryShutdownWorkersWithinQueueTimeout(qqueue_t *pThis) { - DEFiRet; - DEFVARS_mutexProtection; struct timespec tTimeout; rsRetVal iRetLocal; + DEFiRet; ISOBJ_TYPE_assert(pThis, qqueue); ASSERT(pThis->pqParent == NULL); /* detect invalid calling sequence */ - dbgoprint((obj_t*) pThis, "initiating worker thread shutdown sequence\n"); - - /* we reduce the low water mark in any case. This is not absolutely necessary, but - * it is useful because we enable DA mode at several spots below and so we do not need - * to think about the low water mark each time. - */ - pThis->iHighWtrMrk = 1; /* if we do not do this, the DA queue will not stop! */ - pThis->iLowWtrMrk = 0; - - /* first try to shutdown the queue within the regular shutdown period */ - BEGIN_MTX_PROTECTED_OPERATIONS(pThis->mut, LOCK_MUTEX); /* some workers may be running in parallel! */ - if(qqueueGetOverallQueueSize(pThis) > 0) { + d_pthread_mutex_lock(pThis->mut); /* some workers may be running in parallel! */ + if(getPhysicalQueueSize(pThis) > 0) { if(pThis->bRunsDA) { /* We may have waited on the low water mark. As it may have changed, we * see if we reactivate the worker. @@ -1105,7 +1139,7 @@ static rsRetVal qqueueShutdownWorkers(qqueue_t *pThis) wtpAdviseMaxWorkers(pThis->pWtpDA, 1); } } - END_MTX_PROTECTED_OPERATIONS(pThis->mut); + d_pthread_mutex_unlock(pThis->mut); /* Now wait for the queue's workers to shut down. Note that we run into the code even if we just found * out there are no active workers - that doesn't matter: the wtp knows about that and so will @@ -1124,151 +1158,212 @@ static rsRetVal qqueueShutdownWorkers(qqueue_t *pThis) * shutdown of both the regular and DA queue on *the same* timeout. 
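Both shutdown waits below reuse one absolute deadline computed by timeoutComp(), so the configured queue shutdown interval is spent only once regardless of how many worker pools are waited for. timeoutComp() itself is defined elsewhere in rsyslog; the helper below is only a plausible stand-in showing how such an absolute deadline can be derived:

#include <time.h>

/* compute an absolute deadline "ms" milliseconds from now; reusing the same
 * deadline for several waits means the total budget is consumed only once */
static void sk_deadline_from_now(struct timespec *ts, long ms)
{
    clock_gettime(CLOCK_REALTIME, ts);
    ts->tv_sec  += ms / 1000;
    ts->tv_nsec += (ms % 1000) * 1000000L;
    if(ts->tv_nsec >= 1000000000L) {   /* normalize nanoseconds */
        ts->tv_nsec -= 1000000000L;
        ts->tv_sec  += 1;
    }
}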
*/ timeoutComp(&tTimeout, pThis->toQShutdown); - dbgoprint((obj_t*) pThis, "trying shutdown of regular workers\n"); + DBGOPRINT((obj_t*) pThis, "trying shutdown of regular workers\n"); iRetLocal = wtpShutdownAll(pThis->pWtpReg, wtpState_SHUTDOWN, &tTimeout); if(iRetLocal == RS_RET_TIMED_OUT) { - dbgoprint((obj_t*) pThis, "regular shutdown timed out on primary queue (this is OK)\n"); + DBGOPRINT((obj_t*) pThis, "regular shutdown timed out on primary queue (this is OK)\n"); } else { - /* OK, the regular queue is now shut down. So we can now wait for the DA queue (if running DA) */ - dbgoprint((obj_t*) pThis, "regular queue workers shut down.\n"); - BEGIN_MTX_PROTECTED_OPERATIONS(pThis->mut, LOCK_MUTEX); /* some workers may be running in parallel! */ - if(pThis->bRunsDA) { - END_MTX_PROTECTED_OPERATIONS(pThis->mut); - dbgoprint((obj_t*) pThis, "we have a DA queue (0x%lx), requesting its shutdown.\n", - qqueueGetID(pThis->pqDA)); - /* we use the same absolute timeout as above, so we do not use more than the configured - * timeout interval! - */ - dbgoprint((obj_t*) pThis, "trying shutdown of DA workers\n"); - iRetLocal = wtpShutdownAll(pThis->pWtpDA, wtpState_SHUTDOWN, &tTimeout); - if(iRetLocal == RS_RET_TIMED_OUT) { - dbgoprint((obj_t*) pThis, "shutdown timed out on DA queue (this is OK)\n"); - } - } else { - END_MTX_PROTECTED_OPERATIONS(pThis->mut); - } + DBGOPRINT((obj_t*) pThis, "regular queue workers shut down.\n"); } - /* when we reach this point, both queues are either empty or the regular queue shutdown timeout - * has expired. Now we need to check if we are configured to not loose messages. If so, we need - * to persist the queue to disk (this is only possible if the queue is DA-enabled). We must also - * set the primary queue to SHUTDOWN_IMMEDIATE, as it shall now terminate as soon as its consumer - * is done. This is especially important as we otherwise may interfere with queue order while the - * DA consumer is running. -- rgerhards, 2008-01-27 - * Note: there was a note that we should not wait eternally on the DA worker if we run in - * enqueue-only note. I have reviewed the code and think there is no need for this check. Howerver, - * I'd like to keep this note in here should we happen to run into some related trouble. - * rgerhards, 2008-01-28 - */ - wtpSetState(pThis->pWtpReg, wtpState_SHUTDOWN_IMMEDIATE); /* set primary queue to shutdown only */ - - /* at this stage, we need to have the DA worker properly initialized and running (if there is one) */ - if(pThis->bRunsDA) - qqueueWaitDAModeInitialized(pThis); - - BEGIN_MTX_PROTECTED_OPERATIONS(pThis->mut, LOCK_MUTEX); /* some workers may be running in parallel! */ - /* optimize parameters for shutdown of DA-enabled queues */ - if(pThis->bIsDA && qqueueGetOverallQueueSize(pThis) > 0 && pThis->bSaveOnShutdown) { - /* switch to enqueue-only mode so that no more actions happen */ - if(pThis->bRunsDA == 0) { - qqueueInitDA(pThis, QUEUE_MODE_ENQONLY, MUTEX_ALREADY_LOCKED); /* switch to DA mode */ + /* OK, the worker for the regular queue is processed, on the the DA queue regular worker. */ + if(pThis->pqDA != NULL) { + DBGOPRINT((obj_t*) pThis, "we have a DA queue (0x%lx), requesting its shutdown.\n", + qqueueGetID(pThis->pqDA)); + /* we use the same absolute timeout as above, so we do not use more than the configured + * timeout interval! 
+ */ + DBGOPRINT((obj_t*) pThis, "trying shutdown of regular worker of DA queue\n"); + iRetLocal = wtpShutdownAll(pThis->pqDA->pWtpReg, wtpState_SHUTDOWN, &tTimeout); + if(iRetLocal == RS_RET_TIMED_OUT) { + DBGOPRINT((obj_t*) pThis, "shutdown timed out on DA queue worker (this is OK)\n"); } else { - /* TODO: RACE: we may reach this point when the DA worker has been initialized (state 1) - * but is not yet running (state 2). In this case, pThis->pqDA is NULL! rgerhards, 2008-02-27 - */ - qqueueSetEnqOnly(pThis->pqDA, QUEUE_MODE_ENQONLY, MUTEX_ALREADY_LOCKED); /* switch to enqueue-only mode */ + DBGOPRINT((obj_t*) pThis, "DA queue worker shut down.\n"); } - END_MTX_PROTECTED_OPERATIONS(pThis->mut); - /* make sure we do not timeout before we are done */ - dbgoprint((obj_t*) pThis, "bSaveOnShutdown configured, eternal timeout set\n"); - timeoutComp(&tTimeout, QUEUE_TIMEOUT_ETERNAL); - /* and run the primary queue's DA worker to drain the queue */ - iRetLocal = wtpShutdownAll(pThis->pWtpDA, wtpState_SHUTDOWN, &tTimeout); - if(iRetLocal != RS_RET_OK) { - dbgoprint((obj_t*) pThis, "unexpected iRet state %d after trying to shut down primary queue in disk save mode, " - "continuing, but results are unpredictable\n", iRetLocal); + /* we also instruct the DA worker pool to shutdown ASAP. If we need it for persisting + * the queue, it is restarted at a later stage. We don't care here if a timeout happens. + */ + DBGOPRINT((obj_t*) pThis, "trying shutdown of main queue DA worker pool\n"); + iRetLocal = wtpShutdownAll(pThis->pWtpDA, wtpState_SHUTDOWN_IMMEDIATE, &tTimeout); + if(iRetLocal == RS_RET_TIMED_OUT) { + DBGOPRINT((obj_t*) pThis, "shutdown timed out on main queue DA worker pool (this is OK)\n"); + } else { + DBGOPRINT((obj_t*) pThis, "main queue DA worker pool shut down on first try.\n"); } - } else { - END_MTX_PROTECTED_OPERATIONS(pThis->mut); } - /* now the primary queue is either empty, persisted to disk - or set to loose messages. So we - * can now request immediate shutdown of any remaining workers. Note that if bSaveOnShutdown was set, - * the queue is now empty. If regular workers are still running, and try to pull the next message, - * they will automatically terminate as there no longer is any message left to process. + RETiRet; +} + + +/* Try to shut down regular and DA queue workers, within the action timeout + * period. Note that the main queue DA worker is still unaffected (and may shuffle + * data to the disk queue while we terminate the other workers). Not finishing + * processing all messages is now OK (but they may be preserved later, depending + * on bSaveOnShutdown setting). + * rgerhards, 2009-05-27 + */ +static rsRetVal +tryShutdownWorkersWithinActionTimeout(qqueue_t *pThis) +{ + struct timespec tTimeout; + rsRetVal iRetLocal; + DEFiRet; + + ISOBJ_TYPE_assert(pThis, qqueue); + ASSERT(pThis->pqParent == NULL); /* detect invalid calling sequence */ + + /* instruct workers to finish ASAP, even if still work exists */ + /* note that we modify bEnqOnly directly, because going through the method would + * startup some workers again. So this is OK here. -- rgerhards, 2009-05-28 */ - BEGIN_MTX_PROTECTED_OPERATIONS(pThis->mut, LOCK_MUTEX); /* some workers may be running in parallel! 
*/ - if(qqueueGetOverallQueueSize(pThis) > 0) { - timeoutComp(&tTimeout, pThis->toActShutdown); - if(wtpGetCurNumWrkr(pThis->pWtpReg, LOCK_MUTEX) > 0) { - END_MTX_PROTECTED_OPERATIONS(pThis->mut); - dbgoprint((obj_t*) pThis, "trying immediate shutdown of regular workers\n"); - iRetLocal = wtpShutdownAll(pThis->pWtpReg, wtpState_SHUTDOWN_IMMEDIATE, &tTimeout); - if(iRetLocal == RS_RET_TIMED_OUT) { - dbgoprint((obj_t*) pThis, "immediate shutdown timed out on primary queue (this is acceptable and " - "triggers cancellation)\n"); - } else if(iRetLocal != RS_RET_OK) { - dbgoprint((obj_t*) pThis, "unexpected iRet state %d after trying immediate shutdown of the primary queue " - "in disk save mode. Continuing, but results are unpredictable\n", iRetLocal); - } - /* we need to re-aquire the mutex for the next check in this case! */ - BEGIN_MTX_PROTECTED_OPERATIONS(pThis->mut, LOCK_MUTEX); /* some workers may be running in parallel! */ + pThis->bEnqOnly = 1; + /* need to set this so that the DA queue begins shutdown in parallel! */ + if(pThis->pqDA != NULL) { + pThis->pqDA->bEnqOnly = 1; + wtpSetState(pThis->pqDA->pWtpReg, wtpState_SHUTDOWN_IMMEDIATE); + } + + /* now give the queue workers a last chance to gracefully shut down (based on action timeout setting) */ + timeoutComp(&tTimeout, pThis->toActShutdown); + DBGOPRINT((obj_t*) pThis, "trying immediate shutdown of regular workers (if any)\n"); + iRetLocal = wtpShutdownAll(pThis->pWtpReg, wtpState_SHUTDOWN_IMMEDIATE, &tTimeout); + if(iRetLocal == RS_RET_TIMED_OUT) { + DBGOPRINT((obj_t*) pThis, "immediate shutdown timed out on primary queue (this is acceptable and " + "triggers cancellation)\n"); + } else if(iRetLocal != RS_RET_OK) { + DBGOPRINT((obj_t*) pThis, "unexpected iRet state %d after trying immediate shutdown of the primary queue " + "in disk save mode. Continuing, but results are unpredictable\n", iRetLocal); + } + + if(pThis->pqDA != NULL) { + /* and now the same for the DA queue */ + DBGOPRINT((obj_t*) pThis, "trying immediate shutdown of DA queue workers\n"); + iRetLocal = wtpShutdownAll(pThis->pqDA->pWtpReg, wtpState_SHUTDOWN_IMMEDIATE, &tTimeout); + if(iRetLocal == RS_RET_TIMED_OUT) { + DBGOPRINT((obj_t*) pThis, "immediate shutdown timed out on DA queue (this is acceptable " + "and triggers cancellation)\n"); + } else if(iRetLocal != RS_RET_OK) { + DBGOPRINT((obj_t*) pThis, "unexpected iRet state %d after trying immediate shutdown of the DA " + "queue in disk save mode. Continuing, but results are unpredictable\n", iRetLocal); } - if(pThis->bIsDA && wtpGetCurNumWrkr(pThis->pWtpDA, LOCK_MUTEX) > 0) { - /* and now the same for the DA queue */ - END_MTX_PROTECTED_OPERATIONS(pThis->mut); - dbgoprint((obj_t*) pThis, "trying immediate shutdown of DA workers\n"); - iRetLocal = wtpShutdownAll(pThis->pWtpDA, wtpState_SHUTDOWN_IMMEDIATE, &tTimeout); - if(iRetLocal == RS_RET_TIMED_OUT) { - dbgoprint((obj_t*) pThis, "immediate shutdown timed out on DA queue (this is acceptable and " - "triggers cancellation)\n"); - } else if(iRetLocal != RS_RET_OK) { - dbgoprint((obj_t*) pThis, "unexpected iRet state %d after trying immediate shutdown of the DA queue " - "in disk save mode. Continuing, but results are unpredictable\n", iRetLocal); - } + /* and now we need to check the DA worker itself (the one that shuffles data to the disk). This + * is necessary because we may be in a situation where the DA queue regular worker and the + * main queue worker stopped rather quickly. In this case, there is almost no time (and + * probably no thread switch!) 
between the point where we instructed the main queue DA + * worker to shutdown and this code location. In consequence, it may not even have + * noticed that it should should down, less acutally done this. So we provide it with a + * fixed 100ms timeout to try complete its work, what usually should be sufficient. + * rgerhards, 2009-10-06 + */ + timeoutComp(&tTimeout, 100); + DBGOPRINT((obj_t*) pThis, "last try for regular shutdown of main queue DA worker pool\n"); + iRetLocal = wtpShutdownAll(pThis->pWtpDA, wtpState_SHUTDOWN_IMMEDIATE, &tTimeout); + if(iRetLocal == RS_RET_TIMED_OUT) { + DBGOPRINT((obj_t*) pThis, "shutdown timed out on main queue DA worker pool " + "(this is not good, but probably OK)\n"); } else { - END_MTX_PROTECTED_OPERATIONS(pThis->mut); + DBGOPRINT((obj_t*) pThis, "main queue DA worker pool shut down.\n"); } - } else { - END_MTX_PROTECTED_OPERATIONS(pThis->mut); } + RETiRet; +} + + +/* This function cancels all remaining regular workers for both the main and the DA + * queue. The main queue's DA worker pool continues to run (if it exists and is active). + * rgerhards, 2009-05-29 + */ +static rsRetVal +cancelWorkers(qqueue_t *pThis) +{ + rsRetVal iRetLocal; + DEFiRet; + /* Now queue workers should have terminated. If not, we need to cancel them as we have applied * all timeout setting. If any worker in any queue still executes, its consumer is possibly - * long-running and cancelling is the only way to get rid of it. Note that the - * cancellation handler will probably re-queue a user pointer, so the queue's enqueue - * function is still needed (what is no problem as we do not yet destroy the queue - but I - * thought it's a good idea to mention that fact). -- rgerhards, 2008-01-25 + * long-running and cancelling is the only way to get rid of it. */ - dbgoprint((obj_t*) pThis, "checking to see if we need to cancel any worker threads of the primary queue\n"); + DBGOPRINT((obj_t*) pThis, "checking to see if we need to cancel any worker threads of the primary queue\n"); iRetLocal = wtpCancelAll(pThis->pWtpReg); /* returns immediately if all threads already have terminated */ if(iRetLocal != RS_RET_OK) { - dbgoprint((obj_t*) pThis, "unexpected iRet state %d trying to cancel primary queue worker " + DBGOPRINT((obj_t*) pThis, "unexpected iRet state %d trying to cancel primary queue worker " "threads, continuing, but results are unpredictable\n", iRetLocal); } - - /* TODO: think: do we really need to do this here? Can't it happen on DA queue destruction? If we - * disable it, we get an assertion... I think this is OK, as we need to have a certain order and - * canceling the DA workers here ensures that order. But in any instant, we may have a look at this - * code after we have reaced the milestone. -- rgerhards, 2008-01-27 - */ /* ... 
and now the DA queue, if it exists (should always be after the primary one) */ if(pThis->pqDA != NULL) { - dbgoprint((obj_t*) pThis, "checking to see if we need to cancel any worker threads of the DA queue\n"); + DBGOPRINT((obj_t*) pThis, "checking to see if we need to cancel any worker threads of the DA queue\n"); iRetLocal = wtpCancelAll(pThis->pqDA->pWtpReg); /* returns immediately if all threads already have terminated */ if(iRetLocal != RS_RET_OK) { - dbgoprint((obj_t*) pThis, "unexpected iRet state %d trying to cancel DA queue worker " + DBGOPRINT((obj_t*) pThis, "unexpected iRet state %d trying to cancel DA queue worker " "threads, continuing, but results are unpredictable\n", iRetLocal); } + + /* finally, we cancel the main queue's DA worker pool, if it still is running. It may be + * restarted later to persist the queue. But we stop it, because otherwise we get into + * big trouble when resetting the logical dequeue pointer. This operation can only be + * done when *no* worker is running. So time for a shutdown... -- rgerhards, 2009-05-28 + */ + DBGOPRINT((obj_t*) pThis, "checking to see if we need to cancel the main queue's DA worker pool\n"); + iRetLocal = wtpCancelAll(pThis->pWtpDA); /* returns immediately if all threads already have terminated */ } + RETiRet; +} + + +/* This function shuts down all worker threads and waits until they + * have terminated. If they timeout, they are cancelled. + * rgerhards, 2008-01-24 + * Please note that this function shuts down BOTH the parent AND the child queue + * in DA case. This is necessary because their timeouts are tightly coupled. Most + * importantly, the timeouts would be applied twice (or logic be extremely + * complex) if each would have its own shutdown. The function does not self check + * this condition - the caller must make sure it is not called with a parent. + * rgerhards, 2009-05-26: we do NO longer persist the queue here if bSaveOnShutdown + * is set. This must be handled by the caller. Not doing that cleans up the queue + * shutdown considerably. Also, older engines had a potential hang condition when + * the DA queue was already started and the DA worker configured for infinite + * retries and the action was during retry processing. This was a design issue, + * which is solved as of now. Note that the shutdown now may take a little bit + * longer, because we no longer can persist the queue in parallel to waiting + * on worker timeouts. + */ +static rsRetVal +ShutdownWorkers(qqueue_t *pThis) +{ + DEFiRet; + + ISOBJ_TYPE_assert(pThis, qqueue); + ASSERT(pThis->pqParent == NULL); /* detect invalid calling sequence */ + + DBGOPRINT((obj_t*) pThis, "initiating worker thread shutdown sequence\n"); + + /* we reduce the low water mark in any case. This is not absolutely necessary, but + * it is useful because we enable DA mode at several spots below and so we do not need + * to think about the low water mark each time. + */ + pThis->iHighWtrMrk = 1; /* if we do not do this, the DA queue will not stop! */ + pThis->iLowWtrMrk = 0; + + CHKiRet(tryShutdownWorkersWithinQueueTimeout(pThis)); + + if(getPhysicalQueueSize(pThis) > 0) { + CHKiRet(tryShutdownWorkersWithinActionTimeout(pThis)); + } + + CHKiRet(cancelWorkers(pThis)); + /* ... finally ... all worker threads have terminated :-) * Well, more precisely, they *are in termination*. Some cancel cleanup handlers - * may still be running. + * may still be running. Note that the main queue's DA worker may still be running. 
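Taken together, the new helpers split the old monolithic shutdown into three phases: tryShutdownWorkersWithinQueueTimeout(), then, only if messages remain, tryShutdownWorkersWithinActionTimeout(), and finally cancelWorkers(); ShutdownWorkers() wires them up in exactly that order. Stripped of rsyslog's error macros and DA-specific branches, the control flow looks roughly like this sketch (all functions are local stubs, not real rsyslog calls):

/* stubs standing in for the real helpers; they only make the sketch compile */
struct sk_queue { int physSize; };
static int sk_phys_size(struct sk_queue *q)               { return q->physSize; }
static int sk_shutdown_queue_timeout(struct sk_queue *q)  { (void)q; return 0; }
static int sk_shutdown_action_timeout(struct sk_queue *q) { (void)q; return 0; }
static int sk_cancel_workers(struct sk_queue *q)          { (void)q; return 0; }

/* phase 1: give workers the regular queue shutdown timeout
 * phase 2: if messages remain, apply the (usually shorter) action timeout
 * phase 3: cancel whatever still runs; the DA worker pool may be restarted
 *          later if the queue must be persisted on shutdown */
static int sk_shutdown_workers(struct sk_queue *q)
{
    int r;

    if((r = sk_shutdown_queue_timeout(q)) != 0)
        return r;
    if(sk_phys_size(q) > 0 && (r = sk_shutdown_action_timeout(q)) != 0)
        return r;
    return sk_cancel_workers(q);
}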
*/ - dbgoprint((obj_t*) pThis, "worker threads terminated, remaining queue size %d.\n", qqueueGetOverallQueueSize(pThis)); + DBGOPRINT((obj_t*) pThis, "worker threads terminated, remaining queue size log %d, phys %d.\n", + getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis)); +finalize_it: RETiRet; } @@ -1280,7 +1375,7 @@ static rsRetVal qqueueShutdownWorkers(qqueue_t *pThis) * to modify some parameters before the queue is actually started. */ rsRetVal qqueueConstruct(qqueue_t **ppThis, queueType_t qType, int iWorkerThreads, - int iMaxQueueSize, rsRetVal (*pConsumer)(void*,void*)) + int iMaxQueueSize, rsRetVal (*pConsumer)(void*, batch_t*)) { DEFiRet; qqueue_t *pThis; @@ -1305,10 +1400,12 @@ rsRetVal qqueueConstruct(qqueue_t **ppThis, queueType_t qType, int iWorkerThread pThis->lenSpoolDir = strlen((char*)pThis->pszSpoolDir); pThis->iMaxFileSize = 1024 * 1024; /* default is 1 MiB */ pThis->iQueueSize = 0; + pThis->nLogDeq = 0; pThis->iMaxQueueSize = iMaxQueueSize; pThis->pConsumer = pConsumer; pThis->iNumWorkerThreads = iWorkerThreads; pThis->iDeqtWinToHr = 25; /* disable time-windowed dequeuing by default */ + pThis->iDeqBatchSize = 8; /* conservative default, should still provide good performance */ pThis->pszFilePrefix = NULL; pThis->qType = qType; @@ -1319,19 +1416,25 @@ rsRetVal qqueueConstruct(qqueue_t **ppThis, queueType_t qType, int iWorkerThread pThis->qConstruct = qConstructFixedArray; pThis->qDestruct = qDestructFixedArray; pThis->qAdd = qAddFixedArray; + pThis->qDeq = qDeqFixedArray; pThis->qDel = qDelFixedArray; + pThis->qUnDeqAll = qUnDeqAllFixedArray; break; case QUEUETYPE_LINKEDLIST: pThis->qConstruct = qConstructLinkedList; pThis->qDestruct = qDestructLinkedList; pThis->qAdd = qAddLinkedList; - pThis->qDel = (rsRetVal (*)(qqueue_t*,void**)) qDelLinkedList; + pThis->qDeq = (rsRetVal (*)(qqueue_t*,void**)) qDeqLinkedList; + pThis->qDel = (rsRetVal (*)(qqueue_t*)) qDelLinkedList; + pThis->qUnDeqAll = qUnDeqAllLinkedList; break; case QUEUETYPE_DISK: pThis->qConstruct = qConstructDisk; pThis->qDestruct = qDestructDisk; pThis->qAdd = qAddDisk; + pThis->qDeq = qDeqDisk; pThis->qDel = qDelDisk; + pThis->qUnDeqAll = qUnDeqAllDisk; /* special handling */ pThis->iNumWorkerThreads = 1; /* we need exactly one worker */ break; @@ -1340,6 +1443,7 @@ rsRetVal qqueueConstruct(qqueue_t **ppThis, queueType_t qType, int iWorkerThread pThis->qDestruct = qDestructDirect; pThis->qAdd = qAddDirect; pThis->qDel = qDelDirect; + pThis->qUnDeqAll = qUnDeqAllDirect; break; } @@ -1349,36 +1453,6 @@ finalize_it: } -/* cancellation cleanup handler for queueWorker () - * Updates admin structure and frees ressources. - * Params: - * arg1 - user pointer (in this case a qqueue_t) - * arg2 - user data pointer (in this case a queue data element, any object [queue's pUsr ptr!]) - * Note that arg2 may be NULL, in which case no dequeued but unprocessed pUsr exists! - * rgerhards, 2008-01-16 - */ -static rsRetVal -qqueueConsumerCancelCleanup(void *arg1, void *arg2) -{ - DEFiRet; - - qqueue_t *pThis = (qqueue_t*) arg1; - obj_t *pUsr = (obj_t*) arg2; - - ISOBJ_TYPE_assert(pThis, qqueue); - - if(pUsr != NULL) { - /* make sure the data element is not lost */ - dbgoprint((obj_t*) pThis, "cancelation cleanup handler consumer called, we need to unget one user data element\n"); - CHKiRet(qqueueUngetObj(pThis, pUsr, LOCK_MUTEX)); - } - -finalize_it: - RETiRet; -} - - - /* This function checks if the provided message shall be discarded and does so, if needed. 
* In DA mode, we do not discard any messages as we assume the disk subsystem is fast enough to * provide real-time creation of spool files. @@ -1404,12 +1478,12 @@ static int qqueueChkDiscardMsg(qqueue_t *pThis, int iQueueSize, int bRunsDA, voi if(pThis->iDiscardMrk > 0 && iQueueSize >= pThis->iDiscardMrk && bRunsDA == 0) { iRetLocal = objGetSeverity(pUsr, &iSeverity); if(iRetLocal == RS_RET_OK && iSeverity >= pThis->iDiscardSeverity) { - dbgoprint((obj_t*) pThis, "queue nearly full (%d entries), discarded severity %d message\n", + DBGOPRINT((obj_t*) pThis, "queue nearly full (%d entries), discarded severity %d message\n", iQueueSize, iSeverity); objDestruct(pUsr); ABORT_FINALIZE(RS_RET_QUEUE_FULL); } else { - dbgoprint((obj_t*) pThis, "queue nearly full (%d entries), but could not drop msg " + DBGOPRINT((obj_t*) pThis, "queue nearly full (%d entries), but could not drop msg " "(iRet: %d, severity %d)\n", iQueueSize, iRetLocal, iSeverity); } } @@ -1419,38 +1493,171 @@ finalize_it: } -/* dequeue the queued object for the queue consumers. - * rgerhards, 2008-10-21 +/* Finally remove n elements from the queue store. */ -static rsRetVal -qqueueDequeueConsumable(qqueue_t *pThis, wti_t *pWti, int iCancelStateSave) +static inline rsRetVal +DoDeleteBatchFromQStore(qqueue_t *pThis, int nElem) { + int i; DEFiRet; + + ISOBJ_TYPE_assert(pThis, qqueue); + + /* now send delete request to storage driver */ + for(i = 0 ; i < nElem ; ++i) { + pThis->qDel(pThis); + } + + /* iQueueSize is not decremented by qDel(), so we need to do it ourselves */ + ATOMIC_SUB(pThis->iQueueSize, nElem); + ATOMIC_SUB(pThis->nLogDeq, nElem); +dbgprintf("delete batch from store, new sizes: log %d, phys %d\n", getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis)); + ++pThis->deqIDDel; /* one more batch dequeued */ + + RETiRet; +} + + +/* remove messages from the physical queue store that are fully processed. This is + * controlled via the to-delete list. We can only delete those elements, that are + * at the current physical tail of the queue. If the batch is from another position, + * we schedule it for deletion, but actual deletion will happen at a later call + * of this function here. We always delete as much as possible, which includes + * picking up things from the to-delete list. + */ +static inline rsRetVal +DeleteBatchFromQStore(qqueue_t *pThis, batch_t *pBatch) +{ + toDeleteLst_t *pTdl; + qDeqID deqIDDel; + DEFiRet; + + ISOBJ_TYPE_assert(pThis, qqueue); + assert(pBatch != NULL); + + pTdl = tdlPeek(pThis); /* get current head element */ + if(pTdl == NULL) { /* to-delete list empty */ + DoDeleteBatchFromQStore(pThis, pBatch->nElemDeq); + } else if(pBatch->deqID == pThis->deqIDDel) { + deqIDDel = pThis->deqIDDel; + pTdl = tdlPeek(pThis); + while(pTdl != NULL && deqIDDel == pTdl->deqID) { + DoDeleteBatchFromQStore(pThis, pTdl->nElemDeq); + tdlPop(pThis); + ++deqIDDel; + pTdl = tdlPeek(pThis); + } + } else { + /* can not delete, insert into to-delete list */ + dbgprintf("not at head of to-delete list, enqueue %d\n", (int) pBatch->deqID); + CHKiRet(tdlAdd(pThis, pBatch->deqID, pBatch->nElemDeq)); + } + +finalize_it: + RETiRet; +} + + +/* Delete a batch of processed user objects from the queue, which includes + * destructing the objects themself. 
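DeleteBatchFromQStore above enforces in-order deletion: a finished batch may only be removed from the queue store when its deqID is the next one expected; otherwise it is parked on a to-delete list and drained once the older batches complete. The helpers tdlPeek/tdlAdd/tdlPop are not shown in this hunk, so the sketch below uses its own small sorted list to illustrate the same bookkeeping:

#include <stdlib.h>

typedef unsigned sk_deqid_t;

typedef struct sk_tdl {
    sk_deqid_t     deqID;   /* dequeue ID of the parked batch */
    int            nElem;   /* number of elements to delete for it */
    struct sk_tdl *next;
} sk_tdl_t;

typedef struct {
    sk_deqid_t deqIDDel;    /* next batch ID that may be deleted from the store */
    sk_tdl_t  *head;        /* parked (out-of-order) completions, sorted by deqID */
} sk_del_tracker_t;

/* called when a batch finished processing; returns how many elements may be
 * physically deleted right now (0 if the batch had to be parked) */
static int sk_batch_done(sk_del_tracker_t *t, sk_deqid_t id, int nElem)
{
    int nDel = 0;

    if(id != t->deqIDDel) {                 /* not the oldest batch: park it */
        sk_tdl_t *e = malloc(sizeof(*e));
        if(e == NULL) return 0;             /* illustration only: no error channel */
        e->deqID = id; e->nElem = nElem;
        sk_tdl_t **pp = &t->head;           /* insert sorted by deqID */
        while(*pp != NULL && (*pp)->deqID < id) pp = &(*pp)->next;
        e->next = *pp; *pp = e;
        return 0;
    }

    /* oldest outstanding batch: delete it and everything that became contiguous */
    nDel += nElem; ++t->deqIDDel;
    while(t->head != NULL && t->head->deqID == t->deqIDDel) {
        sk_tdl_t *e = t->head; t->head = e->next;
        nDel += e->nElem; ++t->deqIDDel;
        free(e);
    }
    return nDel;
}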
+ * rgerhards, 2009-05-13 + */ +static inline rsRetVal +DeleteProcessedBatch(qqueue_t *pThis, batch_t *pBatch) +{ + int i; void *pUsr; + DEFiRet; + + ISOBJ_TYPE_assert(pThis, qqueue); + assert(pBatch != NULL); + + for(i = 0 ; i < pBatch->nElem ; ++i) { + pUsr = pBatch->pElem[i].pUsrp; + objDestruct(pUsr); + } + + iRet = DeleteBatchFromQStore(pThis, pBatch); + + pBatch->nElem = pBatch->nElemDeq = 0; /* reset batch */ + + RETiRet; +} + + +/* dequeue as many user pointers as are available, until we hit the configured + * upper limit of pointers. + * This must only be called when the queue mutex is LOOKED, otherwise serious + * malfunction will happen. + */ +static inline rsRetVal +DequeueConsumableElements(qqueue_t *pThis, wti_t *pWti, int *piRemainingQueueSize) +{ + int nDequeued; + int nDiscarded; + int nDeleted; int iQueueSize; - int bRunsDA; /* cache for early mutex release */ - - /* dequeue element (still protected from mutex) */ - iRet = qqueueDel(pThis, &pUsr); - qqueueChkPersist(pThis); - iQueueSize = qqueueGetOverallQueueSize(pThis); /* cache this for after mutex release */ - bRunsDA = pThis->bRunsDA; /* cache this for after mutex release */ - - /* We now need to save the user pointer for the cancel cleanup handler, BUT ONLY - * if we could successfully obtain a user pointer. Otherwise, we would bring the - * cancel cleanup handler into big troubles (and we did ;)). Note that we can - * NOT set the variable further below, as this may lead to an object leak. We - * may get cancelled before we reach that part of the code, so the only - * solution is to do it here. -- rgerhards, 2008-02-27 - */ - if(iRet == RS_RET_OK) { - pWti->pUsrp = pUsr; + void *pUsr; + rsRetVal localRet; + DEFiRet; + + nDeleted = pWti->batch.nElemDeq; + DeleteProcessedBatch(pThis, &pWti->batch); + + nDequeued = nDiscarded = 0; + while((iQueueSize = getLogicalQueueSize(pThis)) > 0 && nDequeued < pThis->iDeqBatchSize) { +dbgprintf("DequeueConsumableElements, index %d\n", nDequeued); + CHKiRet(qqueueDeq(pThis, &pUsr)); + + /* check if we should discard this element */ + localRet = qqueueChkDiscardMsg(pThis, pThis->iQueueSize, pThis->bRunsDA, pUsr); + if(localRet == RS_RET_QUEUE_FULL) { + ++nDiscarded; + continue; + } else if(localRet != RS_RET_OK) { + ABORT_FINALIZE(localRet); + } + + /* all well, use this element */ + pWti->batch.pElem[nDequeued].pUsrp = pUsr; + pWti->batch.pElem[nDequeued].state = BATCH_STATE_RDY; + ++nDequeued; } + /* it is sufficient to persist only when the bulk of work is done */ + qqueueChkPersist(pThis, nDequeued+nDiscarded+nDeleted); + + pWti->batch.nElem = nDequeued; + pWti->batch.nElemDeq = nDequeued + nDiscarded; + pWti->batch.deqID = getNextDeqID(pThis); + *piRemainingQueueSize = iQueueSize; + +finalize_it: + RETiRet; +} + + +/* dequeue the queued object for the queue consumers. + * rgerhards, 2008-10-21 + * I made a radical change - we now dequeue multiple elements, and store these objects in + * an array of user pointers. We expect that this increases performance. + * rgerhards, 2009-04-22 + */ +static rsRetVal +DequeueConsumable(qqueue_t *pThis, wti_t *pWti) +{ + DEFiRet; + int iQueueSize = 0; /* keep the compiler happy... 
*/ + + /* dequeue element batch (still protected from mutex) */ + iRet = DequeueConsumableElements(pThis, pWti, &iQueueSize); + /* awake some flow-controlled sources if we can do this right now */ /* TODO: this could be done better from a performance point of view -- do it only if * we have someone waiting for the condition (or only when we hit the watermark right * on the nail [exact value]) -- rgerhards, 2008-03-14 + * now that we dequeue batches of pointers, this is much less an issue... + * rgerhards, 2009-04-22 */ if(iQueueSize < pThis->iFullDlyMrk / 2) { pthread_cond_broadcast(&pThis->belowFullDlyWtrMrk); @@ -1460,37 +1667,15 @@ qqueueDequeueConsumable(qqueue_t *pThis, wti_t *pWti, int iCancelStateSave) pthread_cond_broadcast(&pThis->belowLightDlyWtrMrk); } - /* rgerhards, 2008-09-30: I reversed the order of cond_signal und mutex_unlock - * as of the pthreads recommendation on predictable scheduling behaviour. I don't see - * any problems caused by this, but I add this comment in case some will be seen - * in the next time. - */ + // TODO: MULTI: check physical queue size? pthread_cond_signal(&pThis->notFull); - d_pthread_mutex_unlock(pThis->mut); - pthread_setcancelstate(iCancelStateSave, NULL); /* WE ARE NO LONGER PROTECTED BY THE MUTEX */ - /* do actual processing (the lengthy part, runs in parallel) - * If we had a problem while dequeing, we do not call the consumer, - * but we otherwise ignore it. This is in the hopes that it will be - * self-healing. However, this is really not a good thing. - * rgerhards, 2008-01-03 - */ - if(iRet != RS_RET_OK) - FINALIZE; - - /* we are running in normal, non-disk-assisted mode do a quick check if we need to drain the queue. - * In DA mode, we do not discard any messages as we assume the disk subsystem is fast enough to - * provide real-time creation of spool files. - * Note: It is OK to use the cached iQueueSize here, because it does not hurt if it is slightly wrong. - */ - CHKiRet(qqueueChkDiscardMsg(pThis, iQueueSize, bRunsDA, pUsr)); - -finalize_it: if(iRet != RS_RET_OK && iRet != RS_RET_DISCARDMSG) { - dbgoprint((obj_t*) pThis, "error %d dequeueing element - ignoring, but strange things " + DBGOPRINT((obj_t*) pThis, "error %d dequeueing element - ignoring, but strange things " "may happen\n", iRet); } + RETiRet; } @@ -1533,7 +1718,7 @@ finalize_it: * but you get the idea from the code above. */ static rsRetVal -qqueueRateLimiter(qqueue_t *pThis) +RateLimiter(qqueue_t *pThis) { DEFiRet; int iDelay; @@ -1582,7 +1767,7 @@ qqueueRateLimiter(qqueue_t *pThis) } if(iDelay > 0) { - dbgoprint((obj_t*) pThis, "outside dequeue time window, delaying %d seconds\n", iDelay); + DBGOPRINT((obj_t*) pThis, "outside dequeue time window, delaying %d seconds\n", iDelay); srSleep(iDelay, 0); } @@ -1590,37 +1775,88 @@ qqueueRateLimiter(qqueue_t *pThis) } +/* This dequeues the next batch. + * rgerhards, 2009-05-20 + */ +static inline rsRetVal +DequeueForConsumer(qqueue_t *pThis, wti_t *pWti) +{ + DEFiRet; + + ISOBJ_TYPE_assert(pThis, qqueue); + ISOBJ_TYPE_assert(pWti, wti); + +dbgprintf("YYY: deqeueu for consumer"); + CHKiRet(DequeueConsumable(pThis, pWti)); + + if(pWti->batch.nElem == 0) + ABORT_FINALIZE(RS_RET_IDLE); + + +finalize_it: + RETiRet; +} + + +/* This is called when a batch is processed and the worker does not + * ask for another batch (e.g. 
because it is to be terminated) + * rgerhards, 2009-05-27 + */ +static rsRetVal +batchProcessed(qqueue_t *pThis, wti_t *pWti) +{ + DEFiRet; + + ISOBJ_TYPE_assert(pThis, qqueue); + ISOBJ_TYPE_assert(pWti, wti); +dbgprintf("XXX: batchProcessed deletes %d records\n", pWti->batch.nElemDeq); + + DeleteProcessedBatch(pThis, &pWti->batch); + qqueueChkPersist(pThis, pWti->batch.nElemDeq); + + RETiRet; +} + /* This is the queue consumer in the regular (non-DA) case. It is * protected by the queue mutex, but MUST release it as soon as possible. * rgerhards, 2008-01-21 */ static rsRetVal -qqueueConsumerReg(qqueue_t *pThis, wti_t *pWti, int iCancelStateSave) +ConsumerReg(qqueue_t *pThis, wti_t *pWti) { DEFiRet; ISOBJ_TYPE_assert(pThis, qqueue); ISOBJ_TYPE_assert(pWti, wti); - CHKiRet(qqueueDequeueConsumable(pThis, pWti, iCancelStateSave)); - CHKiRet(pThis->pConsumer(pThis->pUsr, pWti->pUsrp)); + CHKiRet(DequeueForConsumer(pThis, pWti)); + + /* we now have a non-idle batch of work, so we can release the queue mutex and process it */ + d_pthread_mutex_unlock(pThis->mut); + + CHKiRet(pThis->pConsumer(pThis->pUsr, &pWti->batch)); /* we now need to check if we should deliberately delay processing a bit * and, if so, do that. -- rgerhards, 2008-01-30 */ +//TODO: MULTIQUEUE: the following setting is no longer correct - need to think about how to do that... if(pThis->iDeqSlowdown) { - dbgoprint((obj_t*) pThis, "sleeping %d microseconds as requested by config params\n", + DBGOPRINT((obj_t*) pThis, "sleeping %d microseconds as requested by config params\n", pThis->iDeqSlowdown); srSleep(pThis->iDeqSlowdown / 1000000, pThis->iDeqSlowdown % 1000000); } + /* now we are done, but need to re-aquire the mutex */ + d_pthread_mutex_lock(pThis->mut); + finalize_it: +dbgprintf("XXX: regular consumer finished, iret=%d, szlog %d sz phys %d\n", iRet, getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis)); RETiRet; } -/* This is a special consumer to feed the disk-queue in disk-assited mode. +/* This is a special consumer to feed the disk-queue in disk-assisted mode. * When active, our own queue more or less acts as a memory buffer to the disk. * So this consumer just needs to drain the memory queue and submit entries * to the disk queue. The disk queue will then call the actual consumer from @@ -1630,18 +1866,33 @@ finalize_it: * rgerhards, 2008-01-14 */ static rsRetVal -qqueueConsumerDA(qqueue_t *pThis, wti_t *pWti, int iCancelStateSave) +ConsumerDA(qqueue_t *pThis, wti_t *pWti) { + int i; DEFiRet; ISOBJ_TYPE_assert(pThis, qqueue); ISOBJ_TYPE_assert(pWti, wti); - CHKiRet(qqueueDequeueConsumable(pThis, pWti, iCancelStateSave)); - CHKiRet(qqueueEnqObj(pThis->pqDA, eFLOWCTL_NO_DELAY, pWti->pUsrp)); + CHKiRet(DequeueForConsumer(pThis, pWti)); + + /* we now have a non-idle batch of work, so we can release the queue mutex and process it */ + d_pthread_mutex_unlock(pThis->mut); + + /* iterate over returned results and enqueue them in DA queue */ + for(i = 0 ; i < pWti->batch.nElem ; i++) { + /* TODO: we must add a generic "addRef" mechanism, because the disk queue enqueue destructs + * the message. So far, we simply assume we always have msg_t, what currently is always the case. 
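ConsumerReg and ConsumerDA both follow the same pattern: dequeue a batch while holding the queue mutex, release the mutex for the actual, potentially long, processing, and re-acquire it before returning; ConsumerDA additionally takes an extra reference on each message before re-enqueueing it, because the DA queue's enqueue destructs its argument. The sketch below shows only the unlock/process/relock discipline, with simplified types:

#include <pthread.h>

typedef struct { int nElem; void **elem; } sk_wbatch_t;      /* simplified batch */
typedef int (*sk_consume_f)(void *ctx, sk_wbatch_t *batch);

/* caller holds *mut and has already dequeued the batch; the consumer runs
 * unlocked so other workers and enqueuers can proceed in the meantime */
static int sk_consume_batch(pthread_mutex_t *mut, sk_wbatch_t *batch,
                            sk_consume_f consumer, void *ctx)
{
    int r;

    pthread_mutex_unlock(mut);    /* long-running work happens without the lock */
    r = consumer(ctx, batch);
    pthread_mutex_lock(mut);      /* restore the locked state the caller expects */
    return r;
}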
+ * rgerhards, 2009-05-28 + */ + CHKiRet(qqueueEnqObj(pThis->pqDA, eFLOWCTL_NO_DELAY, (obj_t*)MsgAddRef((msg_t*)(pWti->batch.pElem[i].pUsrp)))); + } + + /* now we are done, but need to re-aquire the mutex */ + d_pthread_mutex_lock(pThis->mut); finalize_it: - dbgoprint((obj_t*) pThis, "DAConsumer returns with iRet %d\n", iRet); + DBGOPRINT((obj_t*) pThis, "DAConsumer returns with iRet %d\n", iRet); RETiRet; } @@ -1651,20 +1902,17 @@ finalize_it: * If we are a child, we have done our duty when the queue is empty. In that case, * we can terminate. * Version for the DA worker thread. NOTE: the pThis->bRunsDA is different from - * the DA queue + * the DA queue. + * If our queue is in destruction, we drain to the DA queue and so we shall not terminate + * until we have done so. */ -static int +static rsRetVal qqueueChkStopWrkrDA(qqueue_t *pThis) { - /* if our queue is in destruction, we drain to the DA queue and so we shall not terminate - * until we have done so. - */ - int bStopWrkr; - - BEGINfunc + DEFiRet; if(pThis->bEnqOnly) { - bStopWrkr = 1; + iRet = RS_RET_TERMINATE_WHEN_IDLE; } else { if(pThis->bRunsDA) { ASSERT(pThis->pqDA != NULL); @@ -1672,19 +1920,21 @@ qqueueChkStopWrkrDA(qqueue_t *pThis) && pThis->pqDA->sizeOnDiskMax > 0 && pThis->pqDA->tVars.disk.sizeOnDisk > pThis->pqDA->sizeOnDiskMax) { /* this queue can never grow, so we can give up... */ - bStopWrkr = 1; - } else if(qqueueGetOverallQueueSize(pThis) < pThis->iHighWtrMrk && pThis->bQueueStarted == 1) { - bStopWrkr = 1; - } else { - bStopWrkr = 0; + iRet = RS_RET_TERMINATE_NOW; + } else if(getPhysicalQueueSize(pThis) < pThis->iHighWtrMrk && pThis->bQueueStarted == 1) { +dbgprintf("XXX: terminate_NOW DA worker: queue size %d, high water mark %d\n", getPhysicalQueueSize(pThis), pThis->iHighWtrMrk); + iRet = RS_RET_TERMINATE_NOW; +RUNLOG_STR("XXX: re-start reg worker"); +qqueueAdviseMaxWorkers(pThis); +RUNLOG_STR("XXX: done re-start reg worker"); } } else { - bStopWrkr = 1; + // experimental iRet = RS_RET_TERMINATE_NOW; + ; } } - ENDfunc - return bStopWrkr; + RETiRet; } @@ -1695,38 +1945,50 @@ qqueueChkStopWrkrDA(qqueue_t *pThis) * Version for the regular worker thread. NOTE: the pThis->bRunsDA is different from * the DA queue */ -static int -qqueueChkStopWrkrReg(qqueue_t *pThis) +static rsRetVal +ChkStopWrkrReg(qqueue_t *pThis) +{ + DEFiRet; + if(pThis->bEnqOnly) { + iRet = RS_RET_TERMINATE_NOW; + } else if(pThis->pqParent != NULL) { + iRet = RS_RET_TERMINATE_WHEN_IDLE; + } + + RETiRet; +} + + +/* return the configured "deq max at once" interval + * rgerhards, 2009-04-22 + */ +static rsRetVal +GetDeqBatchSize(qqueue_t *pThis, int *pVal) { - return pThis->bEnqOnly || pThis->bRunsDA || (pThis->pqParent != NULL && qqueueGetOverallQueueSize(pThis) == 0); + DEFiRet; + assert(pVal != NULL); + *pVal = pThis->iDeqBatchSize; +if(pThis->pqParent != NULL) + *pVal = 16; + RETiRet; } /* must only be called when the queue mutex is locked, else results - * are not stable! DA queue version + * are not stable! DA worker version (pThis *is* the *main* queue, not DA!) */ static int qqueueIsIdleDA(qqueue_t *pThis) { - /* remember: iQueueSize is the DA queue size, not the main queue! 
*/ - /* TODO: I think we need just a single function for DA and non-DA mode - but I leave it for now as is */ - return(qqueueGetOverallQueueSize(pThis) == 0 || (pThis->bRunsDA && qqueueGetOverallQueueSize(pThis) <= pThis->iLowWtrMrk)); + return(getPhysicalQueueSize(pThis) <= pThis->iLowWtrMrk); } /* must only be called when the queue mutex is locked, else results - * are not stable! Regular queue version + * are not stable! Regular worker version. */ static int -qqueueIsIdleReg(qqueue_t *pThis) -{ -#if 0 /* enable for performance testing */ - int ret; - ret = qqueueGetOverallQueueSize(pThis) == 0 || (pThis->bRunsDA && qqueueGetOverallQueueSize(pThis) <= pThis->iLowWtrMrk); - if(ret) fprintf(stderr, "queue is idle\n"); - return ret; -#else - /* regular code! */ - return(qqueueGetOverallQueueSize(pThis) == 0 || (pThis->bRunsDA && qqueueGetOverallQueueSize(pThis) <= pThis->iLowWtrMrk)); -#endif +IsIdleReg(qqueue_t *pThis) +{ + return(getPhysicalQueueSize(pThis) == 0); } @@ -1744,14 +2006,13 @@ qqueueIsIdleReg(qqueue_t *pThis) * I am telling this, because I, too, always get confused by those... */ static rsRetVal -qqueueRegOnWrkrShutdown(qqueue_t *pThis) +RegOnWrkrShutdown(qqueue_t *pThis) { DEFiRet; ISOBJ_TYPE_assert(pThis, qqueue); if(pThis->pqParent != NULL) { - pThis->pqParent->bChildIsDone = 1; /* indicate we are done */ if(pThis->pqParent->pWtpDA != NULL) { /* see comment in function header from 2008-02-27 */ wtpAdviseMaxWorkers(pThis->pqParent->pWtpDA, 1); /* reactivate DA worker (always 1) */ } @@ -1761,28 +2022,11 @@ qqueueRegOnWrkrShutdown(qqueue_t *pThis) } -/* The following function is called when a regular queue worker starts up. We need this - * hook to indicate in the parent queue (if we are a child) that we are not done yet. - */ -static rsRetVal -qqueueRegOnWrkrStartup(qqueue_t *pThis) -{ - DEFiRet; - - ISOBJ_TYPE_assert(pThis, qqueue); - - if(pThis->pqParent != NULL) { - pThis->pqParent->bChildIsDone = 0; - } - - RETiRet; -} - - /* start up the queue - it must have been constructed and parameters defined * before. */ -rsRetVal qqueueStart(qqueue_t *pThis) /* this is the ConstructionFinalizer */ +rsRetVal +qqueueStart(qqueue_t *pThis) /* this is the ConstructionFinalizer */ { DEFiRet; rsRetVal iRetLocal; @@ -1804,7 +2048,7 @@ rsRetVal qqueueStart(qqueue_t *pThis) /* this is the ConstructionFinalizer */ pthread_mutex_init(pThis->mut, NULL); } else { /* child queue, we need to use parent's mutex */ - dbgoprint((obj_t*) pThis, "I am a child\n"); + DBGOPRINT((obj_t*) pThis, "I am a child\n"); pThis->mut = pThis->pqParent->mut; } @@ -1818,11 +2062,12 @@ rsRetVal qqueueStart(qqueue_t *pThis) /* this is the ConstructionFinalizer */ /* call type-specific constructor */ CHKiRet(pThis->qConstruct(pThis)); /* this also sets bIsDA */ - dbgoprint((obj_t*) pThis, "type %d, enq-only %d, disk assisted %d, maxFileSz %lld, qsize %d, child %d, " - "full delay %d, light delay %d starting\n", + DBGOPRINT((obj_t*) pThis, "type %d, enq-only %d, disk assisted %d, maxFileSz %lld, lqsize %d, pqsize %d, child %d, " + "full delay %d, light delay %d, deq batch size %d starting\n", pThis->qType, pThis->bEnqOnly, pThis->bIsDA, pThis->iMaxFileSize, - qqueueGetOverallQueueSize(pThis), pThis->pqParent == NULL ? 0 : 1, - pThis->iFullDlyMrk, pThis->iLightDlyMrk); + getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis), + pThis->pqParent == NULL ? 
0 : 1, pThis->iFullDlyMrk, pThis->iLightDlyMrk, + pThis->iDeqBatchSize); if(pThis->qType == QUEUETYPE_DIRECT) FINALIZE; /* with direct queues, we are already finished... */ @@ -1833,13 +2078,13 @@ rsRetVal qqueueStart(qqueue_t *pThis) /* this is the ConstructionFinalizer */ lenBuf = snprintf((char*)pszBuf, sizeof(pszBuf), "%s:Reg", obj.GetName((obj_t*) pThis)); CHKiRet(wtpConstruct (&pThis->pWtpReg)); CHKiRet(wtpSetDbgHdr (pThis->pWtpReg, pszBuf, lenBuf)); - CHKiRet(wtpSetpfRateLimiter (pThis->pWtpReg, (rsRetVal (*)(void *pUsr)) qqueueRateLimiter)); - CHKiRet(wtpSetpfChkStopWrkr (pThis->pWtpReg, (rsRetVal (*)(void *pUsr, int)) qqueueChkStopWrkrReg)); - CHKiRet(wtpSetpfIsIdle (pThis->pWtpReg, (rsRetVal (*)(void *pUsr, int)) qqueueIsIdleReg)); - CHKiRet(wtpSetpfDoWork (pThis->pWtpReg, (rsRetVal (*)(void *pUsr, void *pWti, int)) qqueueConsumerReg)); - CHKiRet(wtpSetpfOnWorkerCancel (pThis->pWtpReg, (rsRetVal (*)(void *pUsr, void*pWti))qqueueConsumerCancelCleanup)); - CHKiRet(wtpSetpfOnWorkerStartup (pThis->pWtpReg, (rsRetVal (*)(void *pUsr)) qqueueRegOnWrkrStartup)); - CHKiRet(wtpSetpfOnWorkerShutdown(pThis->pWtpReg, (rsRetVal (*)(void *pUsr)) qqueueRegOnWrkrShutdown)); + CHKiRet(wtpSetpfRateLimiter (pThis->pWtpReg, (rsRetVal (*)(void *pUsr)) RateLimiter)); + CHKiRet(wtpSetpfChkStopWrkr (pThis->pWtpReg, (rsRetVal (*)(void *pUsr, int)) ChkStopWrkrReg)); + CHKiRet(wtpSetpfGetDeqBatchSize (pThis->pWtpReg, (rsRetVal (*)(void *pUsr, int*)) GetDeqBatchSize)); + CHKiRet(wtpSetpfIsIdle (pThis->pWtpReg, (rsRetVal (*)(void *pUsr, wtp_t*)) IsIdleReg)); + CHKiRet(wtpSetpfDoWork (pThis->pWtpReg, (rsRetVal (*)(void *pUsr, void *pWti)) ConsumerReg)); + CHKiRet(wtpSetpfObjProcessed (pThis->pWtpReg, (rsRetVal (*)(void *pUsr, wti_t *pWti)) batchProcessed)); + CHKiRet(wtpSetpfOnWorkerShutdown(pThis->pWtpReg, (rsRetVal (*)(void *pUsr)) RegOnWrkrShutdown)); CHKiRet(wtpSetpmutUsr (pThis->pWtpReg, pThis->mut)); CHKiRet(wtpSetpcondBusy (pThis->pWtpReg, &pThis->notEmpty)); CHKiRet(wtpSetiNumWorkerThreads (pThis->pWtpReg, pThis->iNumWorkerThreads)); @@ -1854,18 +2099,18 @@ rsRetVal qqueueStart(qqueue_t *pThis) /* this is the ConstructionFinalizer */ */ iRetLocal = qqueueHaveQIF(pThis); if(iRetLocal == RS_RET_OK) { - dbgoprint((obj_t*) pThis, "on-disk queue present, needs to be reloaded\n"); - qqueueInitDA(pThis, QUEUE_MODE_ENQDEQ, LOCK_MUTEX); /* initiate DA mode */ + DBGOPRINT((obj_t*) pThis, "on-disk queue present, needs to be reloaded\n"); + InitDA(pThis, QUEUE_MODE_ENQDEQ, LOCK_MUTEX); /* initiate DA mode */ bInitialized = 1; /* we are done */ } else { /* TODO: use logerror? -- rgerhards, 2008-01-16 */ - dbgoprint((obj_t*) pThis, "error %d trying to access on-disk queue files, starting without them. " + DBGOPRINT((obj_t*) pThis, "error %d trying to access on-disk queue files, starting without them. 
" "Some data may be lost\n", iRetLocal); } } - if(!bInitialized) { - dbgoprint((obj_t*) pThis, "queue starts up without (loading) any DA disk state (this is normal for the DA " + if(Debug && !bInitialized) { + DBGOPRINT((obj_t*) pThis, "queue starts up without (loading) any DA disk state (this is normal for the DA " "queue itself!)\n"); } @@ -1893,12 +2138,11 @@ static rsRetVal qqueuePersist(qqueue_t *pThis, int bIsCheckpoint) strm_t *psQIF = NULL; /* Queue Info File */ uchar pszQIFNam[MAXFNAME]; size_t lenQIFNam; - obj_t *pUsr; ASSERT(pThis != NULL); if(pThis->qType != QUEUETYPE_DISK) { - if(qqueueGetOverallQueueSize(pThis) > 0) { + if(getPhysicalQueueSize(pThis) > 0) { /* This error code is OK, but we will probably not implement this any time * The reason is that persistence happens via DA queues. But I would like to * leave the code as is, as we so have a hook in case we need one. @@ -1909,19 +2153,19 @@ static rsRetVal qqueuePersist(qqueue_t *pThis, int bIsCheckpoint) FINALIZE; /* if the queue is empty, we are happy and done... */ } - dbgoprint((obj_t*) pThis, "persisting queue to disk, %d entries...\n", qqueueGetOverallQueueSize(pThis)); + DBGOPRINT((obj_t*) pThis, "persisting queue to disk, %d entries...\n", getPhysicalQueueSize(pThis)); /* Construct file name */ lenQIFNam = snprintf((char*)pszQIFNam, sizeof(pszQIFNam) / sizeof(uchar), "%s/%s.qi", (char*) glbl.GetWorkDir(), (char*)pThis->pszFilePrefix); - if((bIsCheckpoint != QUEUE_CHECKPOINT) && (qqueueGetOverallQueueSize(pThis) == 0)) { + if((bIsCheckpoint != QUEUE_CHECKPOINT) && (getPhysicalQueueSize(pThis) == 0)) { if(pThis->bNeedDelQIF) { unlink((char*)pszQIFNam); pThis->bNeedDelQIF = 0; } /* indicate spool file needs to be deleted */ - CHKiRet(strm.SetbDeleteOnClose(pThis->tVars.disk.pRead, 1)); + CHKiRet(strm.SetbDeleteOnClose(pThis->tVars.disk.pReadDel, 1)); FINALIZE; /* nothing left to do, so be happy */ } @@ -1940,29 +2184,19 @@ static rsRetVal qqueuePersist(qqueue_t *pThis, int bIsCheckpoint) */ CHKiRet(obj.BeginSerializePropBag(psQIF, (obj_t*) pThis)); objSerializeSCALAR(psQIF, iQueueSize, INT); - objSerializeSCALAR(psQIF, iUngottenObjs, INT); objSerializeSCALAR(psQIF, tVars.disk.sizeOnDisk, INT64); objSerializeSCALAR(psQIF, tVars.disk.bytesRead, INT64); CHKiRet(obj.EndSerialize(psQIF)); - /* now we must persist all objects on the ungotten queue - they can not go to - * to the regular files. -- rgerhards, 2008-01-29 - */ - while(pThis->iUngottenObjs > 0) { - CHKiRet(qqueueGetUngottenObj(pThis, &pUsr)); - CHKiRet((objSerialize(pUsr))(pUsr, psQIF)); - objDestruct(pUsr); - } - /* now persist the stream info */ CHKiRet(strm.Serialize(pThis->tVars.disk.pWrite, psQIF)); - CHKiRet(strm.Serialize(pThis->tVars.disk.pRead, psQIF)); + CHKiRet(strm.Serialize(pThis->tVars.disk.pReadDel, psQIF)); /* tell the input file object that it must not delete the file on close if the queue * is non-empty - but only if we are not during a simple checkpoint */ if(bIsCheckpoint != QUEUE_CHECKPOINT) { - CHKiRet(strm.SetbDeleteOnClose(pThis->tVars.disk.pRead, 0)); + CHKiRet(strm.SetbDeleteOnClose(pThis->tVars.disk.pReadDel, 0)); } /* we have persisted the queue object. So whenever it comes to an empty queue, @@ -1979,20 +2213,63 @@ finalize_it: /* check if we need to persist the current queue info. If an - * error occurs, thus should be ignored by caller (but we still + * error occurs, this should be ignored by caller (but we still * abide to our regular call interface)... 
* rgerhards, 2008-01-13 + * nUpdates is the number of updates since the last call to this function. + * It may be > 1 due to batches. -- rgerhards, 2009-05-12 */ -static rsRetVal qqueueChkPersist(qqueue_t *pThis) +static rsRetVal qqueueChkPersist(qqueue_t *pThis, int nUpdates) { + DEFiRet; ISOBJ_TYPE_assert(pThis, qqueue); + assert(nUpdates >= 0); - if(pThis->iPersistUpdCnt && ++pThis->iUpdsSincePersist >= pThis->iPersistUpdCnt) { + if(nUpdates == 0) + FINALIZE; + + pThis->iUpdsSincePersist += nUpdates; + if(pThis->iPersistUpdCnt && pThis->iUpdsSincePersist >= pThis->iPersistUpdCnt) { qqueuePersist(pThis, QUEUE_CHECKPOINT); pThis->iUpdsSincePersist = 0; } - return RS_RET_OK; +finalize_it: + RETiRet; +} + + +/* persist a queue with all data elements to disk - this is used to handle + * bSaveOnShutdown. We utilize the DA worker to do this. This must only + * be called after all workers have been shut down and if bSaveOnShutdown + * is actually set. Note that this function may potentially run long, + * depending on the queue configuration (e.g. store on remote machine). + * rgerhards, 2009-05-26 + */ +static inline rsRetVal +DoSaveOnShutdown(qqueue_t *pThis) +{ + struct timespec tTimeout; + rsRetVal iRetLocal; + DEFiRet; + + ISOBJ_TYPE_assert(pThis, qqueue); + + InitDA(pThis, QUEUE_MODE_ENQONLY, LOCK_MUTEX); /* switch to DA mode */ +dbgprintf("after InitDA, queue log %d, phys %d\n", getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis)); + /* make sure we do not timeout before we are done */ + DBGOPRINT((obj_t*) pThis, "bSaveOnShutdown configured, infinite timeout set\n"); + timeoutComp(&tTimeout, QUEUE_TIMEOUT_ETERNAL); + /* and run the primary queue's DA worker to drain the queue */ + iRetLocal = wtpShutdownAll(pThis->pWtpDA, wtpState_SHUTDOWN, &tTimeout); + DBGOPRINT((obj_t*) pThis, "end queue persistence run, iRet %d, queue size log %d, phys %d\n", + iRetLocal, getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis)); + if(iRetLocal != RS_RET_OK) { + DBGOPRINT((obj_t*) pThis, "unexpected iRet state %d after trying to shut down primary queue in disk save mode, " + "continuing, but results are unpredictable\n", iRetLocal); + } + + RETiRet; } @@ -2001,14 +2278,24 @@ BEGINobjDestruct(qqueue) /* be sure to specify the object type also in END and C CODESTARTobjDestruct(qqueue) pThis->bQueueInDestruction = 1; /* indicate we are in destruction (modifies some behaviour) */ - /* shut down all workers (handles *all* of the persistence logic) - * See function head comment of queueShutdownWorkers () on why we don't call it - * We also do not need to shutdown workers when we are in enqueue-only mode or we are a + /* shut down all workers + * We do not need to shutdown workers when we are in enqueue-only mode or we are a * direct queue - because in both cases we have none... ;) * with a child! -- rgerhards, 2008-01-28 */ if(pThis->qType != QUEUETYPE_DIRECT && !pThis->bEnqOnly && pThis->pqParent == NULL) - qqueueShutdownWorkers(pThis); + ShutdownWorkers(pThis); + + /* now all workers are terminated. Messages may exist. Also, some logically dequeued + * messages may never have been processed because their worker was terminated. So + * we need to reset the logical dequeue pointer, persist the queue if configured to do + * so and then destruct everything. 
-- rgerhards, 2009-05-26 + */ + CHKiRet(pThis->qUnDeqAll(pThis)); + + if(pThis->bIsDA && getPhysicalQueueSize(pThis) > 0 && pThis->bSaveOnShutdown) { + CHKiRet(DoSaveOnShutdown(pThis)); + } /* finally destruct our (regular) worker thread pool * Note: currently pWtpReg is never NULL, but if we optimize our logic, this may happen, @@ -2044,7 +2331,7 @@ CODESTARTobjDestruct(qqueue) * if need arises (what I doubt...) -- rgerhards, 2008-01-25 */ CHKiRet_Hdlr(qqueuePersist(pThis, QUEUE_NO_CHECKPOINT)) { - dbgoprint((obj_t*) pThis, "error %d persisting queue - data lost!\n", iRet); + DBGOPRINT((obj_t*) pThis, "error %d persisting queue - data lost!\n", iRet); } /* finally, clean up some simple things... */ @@ -2063,11 +2350,8 @@ CODESTARTobjDestruct(qqueue) /* type-specific destructor */ iRet = pThis->qDestruct(pThis); - if(pThis->pszFilePrefix != NULL) - free(pThis->pszFilePrefix); - - if(pThis->pszSpoolDir != NULL) - free(pThis->pszSpoolDir); + free(pThis->pszFilePrefix); + free(pThis->pszSpoolDir); ENDobjDestruct(qqueue) @@ -2081,8 +2365,8 @@ qqueueSetFilePrefix(qqueue_t *pThis, uchar *pszPrefix, size_t iLenPrefix) { DEFiRet; - if(pThis->pszFilePrefix != NULL) - free(pThis->pszFilePrefix); + free(pThis->pszFilePrefix); + pThis->pszFilePrefix = NULL; if(pszPrefix == NULL) /* just unset the prefix! */ ABORT_FINALIZE(RS_RET_OK); @@ -2117,111 +2401,7 @@ finalize_it: } -/* enqueue a new user data element - * Enqueues the new element and awakes worker thread. - */ -rsRetVal -qqueueEnqObj(qqueue_t *pThis, flowControl_t flowCtlType, void *pUsr) -{ - DEFiRet; - int iCancelStateSave; - struct timespec t; - - ISOBJ_TYPE_assert(pThis, qqueue); - - /* first check if we need to discard this message (which will cause CHKiRet() to exit) - * rgerhards, 2008-10-07: It is OK to do this outside of mutex protection. The iQueueSize - * and bRunsDA parameters may not reflect the correct settings here, but they are - * "good enough" in the sense that they can be used to drive the decision. Valgrind's - * threading tools may point this access to be an error, but this is done - * intentional. I do not see this causes problems to us. - */ - CHKiRet(qqueueChkDiscardMsg(pThis, pThis->iQueueSize, pThis->bRunsDA, pUsr)); - - /* Please note that this function is not cancel-safe and consequently - * sets the calling thread's cancelibility state to PTHREAD_CANCEL_DISABLE - * during its execution. If that is not done, race conditions occur if the - * thread is canceled (most important use case is input module termination). - * rgerhards, 2008-01-08 - */ - if(pThis->qType != QUEUETYPE_DIRECT) { - pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave); - d_pthread_mutex_lock(pThis->mut); - } - - /* then check if we need to add an assistance disk queue */ - if(pThis->bIsDA) - CHKiRet(qqueueChkStrtDA(pThis)); - - /* handle flow control - * There are two different flow control mechanisms: basic and advanced flow control. - * Basic flow control has always been implemented and protects the queue structures - * in that it makes sure no more data is enqueued than the queue is configured to - * support. Enhanced flow control is being added today. There are some sources which - * can easily be stopped, e.g. a file reader. This is the case because it is unlikely - * that blocking those sources will have negative effects (after all, the file is - * continued to be written). Other sources can somewhat be blocked (e.g. 
the kernel - * log reader or the local log stream reader): in general, nothing is lost if messages - * from these sources are not picked up immediately. HOWEVER, they can not block for - * an extended period of time, as this either causes message loss or - even worse - some - * other bad effects (e.g. unresponsive system in respect to the main system log socket). - * Finally, there are some (few) sources which can not be blocked at all. UDP syslog is - * a prime example. If a UDP message is not received, it is simply lost. So we can't - * do anything against UDP sockets that come in too fast. The core idea of advanced - * flow control is that we take into account the different natures of the sources and - * select flow control mechanisms that fit these needs. This also means, in the end - * result, that non-blockable sources like UDP syslog receive priority in the system. - * It's a side effect, but a good one ;) -- rgerhards, 2008-03-14 - */ - if(flowCtlType == eFLOWCTL_FULL_DELAY) { - while(pThis->iQueueSize >= pThis->iFullDlyMrk) { - dbgoprint((obj_t*) pThis, "enqueueMsg: FullDelay mark reached for full delayable message - blocking.\n"); - pthread_cond_wait(&pThis->belowFullDlyWtrMrk, pThis->mut); /* TODO error check? But what do then? */ - } - } else if(flowCtlType == eFLOWCTL_LIGHT_DELAY) { - if(pThis->iQueueSize >= pThis->iLightDlyMrk) { - dbgoprint((obj_t*) pThis, "enqueueMsg: LightDelay mark reached for light delayable message - blocking a bit.\n"); - timeoutComp(&t, 1000); /* 1000 millisconds = 1 second TODO: make configurable */ - pthread_cond_timedwait(&pThis->belowLightDlyWtrMrk, pThis->mut, &t); /* TODO error check? But what do then? */ - } - } - - /* from our regular flow control settings, we are now ready to enqueue the object. - * However, we now need to do a check if the queue permits to add more data. If that - * is not the case, basic flow control enters the field, which means we wait for - * the queue to become ready or drop the new message. -- rgerhards, 2008-03-14 - */ - while( (pThis->iMaxQueueSize > 0 && pThis->iQueueSize >= pThis->iMaxQueueSize) - || (pThis->qType == QUEUETYPE_DISK && pThis->sizeOnDiskMax != 0 - && pThis->tVars.disk.sizeOnDisk > pThis->sizeOnDiskMax)) { - dbgoprint((obj_t*) pThis, "enqueueMsg: queue FULL - waiting to drain.\n"); - timeoutComp(&t, pThis->toEnq); - if(pthread_cond_timedwait(&pThis->notFull, pThis->mut, &t) != 0) { - dbgoprint((obj_t*) pThis, "enqueueMsg: cond timeout, dropping message!\n"); - objDestruct(pUsr); - ABORT_FINALIZE(RS_RET_QUEUE_FULL); - } - } - - /* and finally enqueue the message */ - CHKiRet(qqueueAdd(pThis, pUsr)); - qqueueChkPersist(pThis); - -finalize_it: - if(pThis->qType != QUEUETYPE_DIRECT) { - /* make sure at least one worker is running. */ - qqueueAdviseMaxWorkers(pThis); - /* and release the mutex */ - d_pthread_mutex_unlock(pThis->mut); - pthread_setcancelstate(iCancelStateSave, NULL); - dbgoprint((obj_t*) pThis, "EnqueueMsg advised worker start\n"); - } - - RETiRet; -} - - -/* enqueue a single data object. This currently is a helper to qqueueMultiEnqObj. +/* enqueue a single data object. * Note that the queue mutex MUST already be locked when this function is called. 
* rgerhards, 2009-06-16 */ @@ -2237,7 +2417,7 @@ doEnqSingleObj(qqueue_t *pThis, flowControl_t flowCtlType, void *pUsr) /* then check if we need to add an assistance disk queue */ if(pThis->bIsDA) - CHKiRet(qqueueChkStrtDA(pThis)); + CHKiRet(ChkStrtDA(pThis)); /* handle flow control * There are two different flow control mechanisms: basic and advanced flow control. @@ -2261,12 +2441,12 @@ doEnqSingleObj(qqueue_t *pThis, flowControl_t flowCtlType, void *pUsr) */ if(flowCtlType == eFLOWCTL_FULL_DELAY) { while(pThis->iQueueSize >= pThis->iFullDlyMrk) { - dbgoprint((obj_t*) pThis, "enqueueMsg: FullDelay mark reached for full delayable message - blocking.\n"); + DBGOPRINT((obj_t*) pThis, "enqueueMsg: FullDelay mark reached for full delayable message - blocking.\n"); pthread_cond_wait(&pThis->belowFullDlyWtrMrk, pThis->mut); /* TODO error check? But what do then? */ } } else if(flowCtlType == eFLOWCTL_LIGHT_DELAY) { if(pThis->iQueueSize >= pThis->iLightDlyMrk) { - dbgoprint((obj_t*) pThis, "enqueueMsg: LightDelay mark reached for light delayable message - blocking a bit.\n"); + DBGOPRINT((obj_t*) pThis, "enqueueMsg: LightDelay mark reached for light delayable message - blocking a bit.\n"); timeoutComp(&t, 1000); /* 1000 millisconds = 1 second TODO: make configurable */ pthread_cond_timedwait(&pThis->belowLightDlyWtrMrk, pThis->mut, &t); /* TODO error check? But what do then? */ } @@ -2280,10 +2460,10 @@ doEnqSingleObj(qqueue_t *pThis, flowControl_t flowCtlType, void *pUsr) while( (pThis->iMaxQueueSize > 0 && pThis->iQueueSize >= pThis->iMaxQueueSize) || (pThis->qType == QUEUETYPE_DISK && pThis->sizeOnDiskMax != 0 && pThis->tVars.disk.sizeOnDisk > pThis->sizeOnDiskMax)) { - dbgoprint((obj_t*) pThis, "enqueueMsg: queue FULL - waiting to drain.\n"); + DBGOPRINT((obj_t*) pThis, "enqueueMsg: queue FULL - waiting to drain.\n"); timeoutComp(&t, pThis->toEnq); if(pthread_cond_timedwait(&pThis->notFull, pThis->mut, &t) != 0) { - dbgoprint((obj_t*) pThis, "enqueueMsg: cond timeout, dropping message!\n"); + DBGOPRINT((obj_t*) pThis, "enqueueMsg: cond timeout, dropping message!\n"); objDestruct(pUsr); ABORT_FINALIZE(RS_RET_QUEUE_FULL); } @@ -2291,7 +2471,6 @@ doEnqSingleObj(qqueue_t *pThis, flowControl_t flowCtlType, void *pUsr) /* and finally enqueue the message */ CHKiRet(qqueueAdd(pThis, pUsr)); - qqueueChkPersist(pThis); // TODO: optimize, do in outer function! (but we need parts from v5?) finalize_it: RETiRet; @@ -2325,14 +2504,51 @@ dbgprintf("queueMultiEnq: %d\n", i); CHKiRet(doEnqSingleObj(pThis, pMultiSub->ppMsgs[i]->flowCtlType, (void*)pMultiSub->ppMsgs[i])); } + qqueueChkPersist(pThis, pMultiSub->nElem); + +finalize_it: + if(pThis->qType != QUEUETYPE_DIRECT) { + /* make sure at least one worker is running. */ + qqueueAdviseMaxWorkers(pThis); + /* and release the mutex */ + d_pthread_mutex_unlock(pThis->mut); + pthread_setcancelstate(iCancelStateSave, NULL); + DBGOPRINT((obj_t*) pThis, "MultiEnqObj advised worker start\n"); + } + + RETiRet; +} + + +/* enqueue a new user data element + * Enqueues the new element and awakes worker thread. 
+ */ +rsRetVal +qqueueEnqObj(qqueue_t *pThis, flowControl_t flowCtlType, void *pUsr) +{ + DEFiRet; + int iCancelStateSave; + + ISOBJ_TYPE_assert(pThis, qqueue); + + if(pThis->qType != QUEUETYPE_DIRECT) { + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave); + d_pthread_mutex_lock(pThis->mut); + } + + CHKiRet(doEnqSingleObj(pThis, flowCtlType, pUsr)); + + qqueueChkPersist(pThis, 1); + finalize_it: if(pThis->qType != QUEUETYPE_DIRECT) { /* make sure at least one worker is running. */ qqueueAdviseMaxWorkers(pThis); +dbgprintf("YYY: call advise with mutex %p locked \n", pThis->mut); /* and release the mutex */ d_pthread_mutex_unlock(pThis->mut); pthread_setcancelstate(iCancelStateSave, NULL); - dbgoprint((obj_t*) pThis, "MultiEnqObj advised worker start\n"); + DBGOPRINT((obj_t*) pThis, "EnqueueMsg advised worker start\n"); } RETiRet; @@ -2348,7 +2564,7 @@ finalize_it: * rgerhards, 2008-01-16 */ static rsRetVal -qqueueSetEnqOnly(qqueue_t *pThis, int bEnqOnly, int bLockMutex) +SetEnqOnly(qqueue_t *pThis, int bEnqOnly, int bLockMutex) { DEFiRet; DEFVARS_mutexProtection; @@ -2370,7 +2586,7 @@ qqueueSetEnqOnly(qqueue_t *pThis, int bEnqOnly, int bLockMutex) if(bEnqOnly == 1) { /* switch to enqueue-only mode */ /* this means we need to terminate all workers - that's it... */ - dbgoprint((obj_t*) pThis, "switching to enqueue-only mode, terminating all worker threads\n"); + DBGOPRINT((obj_t*) pThis, "switching to enqueue-only mode, terminating all worker threads\n"); if(pThis->pWtpReg != NULL) wtpWakeupAllWrkr(pThis->pWtpReg); if(pThis->pWtpDA != NULL) @@ -2410,6 +2626,7 @@ DEFpropSetMeth(qqueue, iMinMsgsPerWrkr, int) DEFpropSetMeth(qqueue, bSaveOnShutdown, int) DEFpropSetMeth(qqueue, pUsr, void*) DEFpropSetMeth(qqueue, iDeqSlowdown, int) +DEFpropSetMeth(qqueue, iDeqBatchSize, int) DEFpropSetMeth(qqueue, sizeOnDiskMax, int64) @@ -2428,8 +2645,6 @@ static rsRetVal qqueueSetProperty(qqueue_t *pThis, var_t *pProp) if(isProp("iQueueSize")) { pThis->iQueueSize = pProp->val.num; - } else if(isProp("iUngottenObjs")) { - pThis->iUngottenObjs = pProp->val.num; } else if(isProp("tVars.disk.sizeOnDisk")) { pThis->tVars.disk.sizeOnDisk = pProp->val.num; } else if(isProp("tVars.disk.bytesRead")) { diff --git a/runtime/queue.h b/runtime/queue.h index 1d82d8d9..73c62b52 100644 --- a/runtime/queue.h +++ b/runtime/queue.h @@ -27,8 +27,18 @@ #include <pthread.h> #include "obj.h" #include "wtp.h" +#include "batch.h" #include "stream.h" +/* support for the toDelete list */ +typedef struct toDeleteLst_s toDeleteLst_t; +struct toDeleteLst_s { + qDeqID deqID; + int nElemDeq; /* numbe of elements that were dequeued and as such must now be discarded */ + struct toDeleteLst_s *pNext; +}; + + /* queue types */ typedef enum { QUEUETYPE_FIXED_ARRAY = 0,/* a simple queue made out of a fixed (initially malloced) array fast but memoryhog */ @@ -44,20 +54,11 @@ typedef struct qLinkedList_S { } qLinkedList_t; -typedef struct qWrkThrd_s { - pthread_t thrdID; /* thread ID */ - qWrkCmd_t tCurrCmd; /* current command to be carried out by worker */ - obj_t *pUsr; /* current user object being processed (or NULL if none) */ - struct queue_s *pQueue; /* my queue (important if only the work thread instance is passed! 
*/ - int iThrd; /* my worker thread array index */ - pthread_cond_t condInitDone; /* signaled when the thread startup is done (once per thread existance) */ - pthread_mutex_t mut; -} qWrkThrd_t; /* type for queue worker threads */ - /* the queue object */ typedef struct queue_s { BEGINobjInstance; queueType_t qType; + int nLogDeq; /* number of elements currently logically dequeued */ bool bEnqOnly; /* does queue run in enqueue-only mode (1) or not (0)? */ bool bSaveOnShutdown;/* persists everthing on shutdown (if DA!)? 1-yes, 0-no */ bool bQueueStarted; /* has queueStart() been called on this queue? 1-yes, 0-no */ @@ -83,7 +84,9 @@ typedef struct queue_s { int toQShutdown; /* timeout for regular queue shutdown in ms */ int toActShutdown; /* timeout for long-running action shutdown in ms */ int toWrkShutdown; /* timeout for idle workers in ms, -1 means indefinite (0 is immediate) */ + toDeleteLst_t *toDeleteLst;/* this queue's to-delete list */ int toEnq; /* enqueue timeout */ + int iDeqBatchSize; /* max number of elements that shall be dequeued at once */ /* rate limiting settings (will be expanded) */ int iDeqSlowdown; /* slow down dequeue by specified nbr of microseconds */ /* end rate limiting */ @@ -97,18 +100,19 @@ typedef struct queue_s { * applied to detect user configuration errors (and tell me how should we detect what * the user really wanted...). -- rgerhards, 2008-04-02 */ - /* ane dequeue time window */ - rsRetVal (*pConsumer)(void *,void*); /* user-supplied consumer function for dequeued messages */ + /* end dequeue time window */ + rsRetVal (*pConsumer)(void *,batch_t*); /* user-supplied consumer function for dequeued messages */ /* calling interface for pConsumer: arg1 is the global user pointer from this structure, arg2 is the - * user pointer that was dequeued (actual sample: for actions, arg1 is the pAction and arg2 is pointer - * to message) - * rgerhards, 2008-01-28 + * user pointer array that was dequeued (actual sample: for actions, arg1 is the pAction and arg2 + * is pointer to an array of message message pointers) */ /* type-specific handlers (set during construction) */ rsRetVal (*qConstruct)(struct queue_s *pThis); rsRetVal (*qDestruct)(struct queue_s *pThis); rsRetVal (*qAdd)(struct queue_s *pThis, void *pUsr); - rsRetVal (*qDel)(struct queue_s *pThis, void **ppUsr); + rsRetVal (*qDeq)(struct queue_s *pThis, void **ppUsr); + rsRetVal (*qDel)(struct queue_s *pThis); + rsRetVal (*qUnDeqAll)(struct queue_s *pThis); /* end type-specific handler */ /* synchronization variables */ pthread_mutex_t mutThrdMgmt; /* mutex for the queue's thread management */ @@ -117,7 +121,6 @@ typedef struct queue_s { pthread_cond_t belowFullDlyWtrMrk; /* below eFLOWCTL_FULL_DELAY watermark */ pthread_cond_t belowLightDlyWtrMrk; /* below eFLOWCTL_FULL_DELAY watermark */ pthread_cond_t condDAReady;/* signalled when the DA queue is fully initialized and ready for processing */ - int bChildIsDone; /* set to 1 when the child DA queue has finished processing, 0 otherwise */ int bThrdStateChanged; /* at least one thread state has changed if 1 */ /* end sync variables */ /* the following variables are always present, because they @@ -132,32 +135,30 @@ typedef struct queue_s { int iNumberFiles; /* how many files make up the queue? 
*/ int64 iMaxFileSize; /* max size for a single queue file */ int64 sizeOnDiskMax; /* maximum size on disk allowed */ + qDeqID deqIDAdd; /* next dequeue ID to use during add to queue store */ + qDeqID deqIDDel; /* queue store delete position */ int bIsDA; /* is this queue disk assisted? */ int bRunsDA; /* is this queue actually *running* disk assisted? */ struct queue_s *pqDA; /* queue for disk-assisted modes */ struct queue_s *pqParent;/* pointer to the parent (if this is a child queue) */ int bDAEnqOnly; /* EnqOnly setting for DA queue */ - /* some data elements for the queueUngetObj() functionality. This list should always be short - * and is always kept in memory - */ - qLinkedList_t *pUngetRoot; - qLinkedList_t *pUngetLast; - int iUngottenObjs; /* number of objects currently in the "ungotten" list */ /* now follow queueing mode specific data elements */ union { /* different data elements based on queue type (qType) */ struct { - long head, tail; + long deqhead, head, tail; void** pBuf; /* the queued user data structure */ } farray; struct { - qLinkedList_t *pRoot; + qLinkedList_t *pDeqRoot; + qLinkedList_t *pDelRoot; qLinkedList_t *pLast; } linklist; struct { int64 sizeOnDisk; /* current amount of disk space used */ int64 bytesRead; /* number of bytes read from current (undeleted!) file */ - strm_t *pWrite; /* current file to be written */ - strm_t *pRead; /* current file to be read */ + strm_t *pWrite; /* current file to be written */ + strm_t *pReadDeq; /* current file for dequeueing */ + strm_t *pReadDel; /* current file for deleting */ } disk; } tVars; } qqueue_t; @@ -184,7 +185,7 @@ rsRetVal qqueueStart(qqueue_t *pThis); rsRetVal qqueueSetMaxFileSize(qqueue_t *pThis, size_t iMaxFileSize); rsRetVal qqueueSetFilePrefix(qqueue_t *pThis, uchar *pszPrefix, size_t iLenPrefix); rsRetVal qqueueConstruct(qqueue_t **ppThis, queueType_t qType, int iWorkerThreads, - int iMaxQueueSize, rsRetVal (*pConsumer)(void*,void*)); + int iMaxQueueSize, rsRetVal (*pConsumer)(void*,batch_t*)); PROTOTYPEObjClassInit(qqueue); PROTOTYPEpropSetMeth(qqueue, iPersistUpdCnt, int); PROTOTYPEpropSetMeth(qqueue, bSyncQueueFiles, int); @@ -203,6 +204,7 @@ PROTOTYPEpropSetMeth(qqueue, bSaveOnShutdown, int); PROTOTYPEpropSetMeth(qqueue, pUsr, void*); PROTOTYPEpropSetMeth(qqueue, iDeqSlowdown, int); PROTOTYPEpropSetMeth(qqueue, sizeOnDiskMax, int64); +PROTOTYPEpropSetMeth(qqueue, iDeqBatchSize, int); #define qqueueGetID(pThis) ((unsigned long) pThis) #endif /* #ifndef QUEUE_H_INCLUDED */ diff --git a/runtime/rsyslog.h b/runtime/rsyslog.h index 27bea6bc..59e8458b 100644 --- a/runtime/rsyslog.h +++ b/runtime/rsyslog.h @@ -69,8 +69,24 @@ #endif +/* the rsyslog core provides information about present feature to plugins + * asking it. Below are feature-test macros which must be used to query + * features. Note that this must be powers of two, so that multiple queries + * can be combined. -- rgerhards, 2009-04-27 + */ +#define CORE_FEATURE_BATCHING 1 +/*#define CORE_FEATURE_whatever 2 ... and so on ... */ + +/* some universal fixed size integer defines ... */ +typedef long long int64; +typedef long long unsigned uint64; +typedef int64 number_t; /* type to use for numbers - TODO: maybe an autoconf option? */ +typedef char intTiny; /* 0..127! */ +typedef unsigned char uintTiny; /* 0..255! 
*/ + /* define some base data types */ typedef unsigned char uchar;/* get rid of the unhandy "unsigned char" */ +typedef struct aUsrp_s aUsrp_t; typedef struct thrdInfo thrdInfo_t; typedef struct obj_s obj_t; typedef struct ruleset_s ruleset_t; @@ -87,6 +103,7 @@ typedef struct nsd_gsspi_s nsd_gsspi_t; typedef struct nsd_nss_s nsd_nss_t; typedef struct nsdsel_ptcp_s nsdsel_ptcp_t; typedef struct nsdsel_gtls_s nsdsel_gtls_t; +typedef struct wti_s wti_t; typedef obj_t nsd_t; typedef obj_t nsdsel_t; typedef struct msg msg_t; @@ -102,18 +119,15 @@ typedef struct tcps_sess_s tcps_sess_t; typedef struct strmsrv_s strmsrv_t; typedef struct strms_sess_s strms_sess_t; typedef struct vmstk_s vmstk_t; +typedef struct batch_obj_s batch_obj_t; +typedef struct batch_s batch_t; +typedef struct wtp_s wtp_t; typedef rsRetVal (*prsf_t)(struct vmstk_s*, int); /* pointer to a RainerScript function */ +typedef uint64 qDeqID; /* queue Dequeue order ID. 32 bits is considered dangerously few */ typedef struct tcpLstnPortList_s tcpLstnPortList_t; // TODO: rename? typedef struct strmLstnPortList_s strmLstnPortList_t; // TODO: rename? -/* some universal 64 bit define... */ -typedef long long int64; -typedef long long unsigned uint64; -typedef int64 number_t; /* type to use for numbers - TODO: maybe an autoconf option? */ -typedef char intTiny; /* 0..127! */ -typedef uchar uintTiny; /* 0..255! */ - #ifdef __hpux typedef unsigned int u_int32_t; /* TODO: is this correct? */ typedef int socklen_t; @@ -369,15 +383,20 @@ enum rsRetVal_ /** return value. All methods return this if not specified oth RS_RET_VAR_NOT_FOUND = -2142, /**< variable not found */ RS_RET_EMPTY_MSG = -2143, /**< provided (raw) MSG is empty */ RS_RET_PEER_CLOSED_CONN = -2144, /**< remote peer closed connection (information, no error) */ + RS_RET_NO_SRCNAME_TPL = -2150, /**< sourcename template was not specified where one was needed (omudpspoof spoof addr) */ + RS_RET_HOST_NOT_SPECIFIED = -2151, /**< (target) host was not specified where it was needed */ + RS_RET_ERR_LIBNET_INIT = -2152, /**< error initializing libnet */ /* RainerScript error messages (range 1000.. 1999) */ RS_RET_SYSVAR_NOT_FOUND = 1001, /**< system variable could not be found (maybe misspelled) */ /* some generic error/status codes */ + RS_RET_OK = 0, /**< operation successful */ RS_RET_OK_DELETE_LISTENTRY = 1, /**< operation successful, but callee requested the deletion of an entry (special state) */ RS_RET_TERMINATE_NOW = 2, /**< operation successful, function is requested to terminate (mostly used with threads) */ RS_RET_NO_RUN = 3, /**< operation successful, but function does not like to be executed */ - RS_RET_OK = 0 /**< operation successful */ + RS_RET_IDLE = 4, /**< operation successful, but callee is idle (e.g. because queue is empty) */ + RS_RET_TERMINATE_WHEN_IDLE = 5 /**< operation successful, function is requested to terminate when idle */ }; /* some helpful macros to work with srRetVals. 
diff --git a/runtime/ruleset.c b/runtime/ruleset.c index d98b4217..5ac9a8fd 100644 --- a/runtime/ruleset.c +++ b/runtime/ruleset.c @@ -138,8 +138,10 @@ finalize_it: */ DEFFUNC_llExecFunc(processMsgDoRules) { + rsRetVal iRet; ISOBJ_TYPE_assert(pData, rule); - return rule.ProcessMsg((rule_t*) pData, (msg_t*) pParam); + iRet = rule.ProcessMsg((rule_t*) pData, (msg_t*) pParam); + return iRet; } @@ -159,8 +161,9 @@ processMsg(msg_t *pMsg) CHKiRet(llExecFunc(&pThis->llRules, processMsgDoRules, pMsg)); finalize_it: - if(iRet == RS_RET_DISCARDMSG) - iRet = RS_RET_OK; + + //if(iRet == RS_RET_DISCARDMSG) + //iRet = RS_RET_OK; RETiRet; } diff --git a/runtime/srUtils.h b/runtime/srUtils.h index 16766312..c4f73e16 100644 --- a/runtime/srUtils.h +++ b/runtime/srUtils.h @@ -110,30 +110,17 @@ rsRetVal getFileSize(uchar *pszName, off_t *pSize); /* some useful constants */ #define DEFVARS_mutexProtection\ - int iCancelStateSave; \ int bLockedOpIsLocked=0 #define BEGIN_MTX_PROTECTED_OPERATIONS(mut, bMustLock) \ if(bMustLock == LOCK_MUTEX) { \ - pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave); \ d_pthread_mutex_lock(mut); \ + assert(bLockedOpIsLocked == 0); \ bLockedOpIsLocked = 1; \ } #define END_MTX_PROTECTED_OPERATIONS(mut) \ if(bLockedOpIsLocked) { \ d_pthread_mutex_unlock(mut); \ - pthread_setcancelstate(iCancelStateSave, NULL); \ + bLockedOpIsLocked = 0; \ } -/* The unconditional versions of the macro always lock the mutex. They are preferred in - * complex scenarios, where the simple ones might get mixed up by multiple calls. - */ -#define DEFVARS_mutexProtection_uncond\ - int iCancelStateSave -#define BEGIN_MTX_PROTECTED_OPERATIONS_UNCOND(mut) \ - pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave); \ - d_pthread_mutex_lock(mut); -#define END_MTX_PROTECTED_OPERATIONS_UNCOND(mut) \ - d_pthread_mutex_unlock(mut); \ - pthread_setcancelstate(iCancelStateSave, NULL); - #endif diff --git a/runtime/stream.c b/runtime/stream.c index ac90df28..58f16cce 100644 --- a/runtime/stream.c +++ b/runtime/stream.c @@ -253,7 +253,7 @@ static rsRetVal strmOpenFile(strm_t *pThis) pThis->pszFName, pThis->lenFName, pThis->iCurrFNum, pThis->iFileNumDigits)); } else { if(pThis->pszDir == NULL) { - if((pThis->pszCurrFName = (uchar*) strdup((char*) pThis->pszFName)) == NULL) + if((pThis->pszCurrFName = ustrdup(pThis->pszFName)) == NULL) ABORT_FINALIZE(RS_RET_OUT_OF_MEMORY); } else { CHKiRet(genFileName(&pThis->pszCurrFName, pThis->pszDir, pThis->lenDir, @@ -1448,6 +1448,46 @@ finalize_it: } +/* duplicate a stream object excluding dynamic properties. This function is + * primarily meant to provide a duplicate that later on can be used to access + * the data. This is needed, for example, for a restart of the disk queue. + * Note that ConstructFinalize() is NOT called. So our caller may change some + * properties before finalizing things. 
+ * rgerhards, 2009-05-26 + */ +rsRetVal +strmDup(strm_t *pThis, strm_t **ppNew) +{ + strm_t *pNew = NULL; + DEFiRet; + + ISOBJ_TYPE_assert(pThis, strm); + assert(ppNew != NULL); + + CHKiRet(strmConstruct(&pNew)); + pNew->sType = pThis->sType; + pNew->iCurrFNum = pThis->iCurrFNum; + CHKmalloc(pNew->pszFName = ustrdup(pThis->pszFName)); + pNew->lenFName = pThis->lenFName; + CHKmalloc(pNew->pszDir = ustrdup(pThis->pszDir)); + pNew->lenDir = pThis->lenDir; + pNew->tOperationsMode = pThis->tOperationsMode; + pNew->tOpenMode = pThis->tOpenMode; + pNew->iMaxFileSize = pThis->iMaxFileSize; + pNew->iMaxFiles = pThis->iMaxFiles; + pNew->iFileNumDigits = pThis->iFileNumDigits; + pNew->bDeleteOnClose = pThis->bDeleteOnClose; + pNew->iCurrOffs = pThis->iCurrOffs; + + *ppNew = pNew; + pNew = NULL; + +finalize_it: + if(pNew != NULL) + strmDestruct(&pNew); + + RETiRet; +} /* set a user write-counter. This counter is initialized to zero and * receives the number of bytes written. It is accurate only after a @@ -1563,6 +1603,7 @@ CODESTARTobjQueryInterface(strm) pIf->RecordEnd = strmRecordEnd; pIf->Serialize = strmSerialize; pIf->GetCurrOffset = strmGetCurrOffset; + pIf->Dup = strmDup; pIf->SetWCntr = strmSetWCntr; /* set methods */ pIf->SetbDeleteOnClose = strmSetbDeleteOnClose; diff --git a/runtime/stream.h b/runtime/stream.h index 64ffb6e1..9577d704 100644 --- a/runtime/stream.h +++ b/runtime/stream.h @@ -169,6 +169,7 @@ BEGINinterface(strm) /* name must also be changed in ENDinterface macro! */ rsRetVal (*Serialize)(strm_t *pThis, strm_t *pStrm); rsRetVal (*GetCurrOffset)(strm_t *pThis, int64 *pOffs); rsRetVal (*SetWCntr)(strm_t *pThis, number_t *pWCnt); + rsRetVal (*Dup)(strm_t *pThis, strm_t **ppNew); INTERFACEpropSetMeth(strm, bDeleteOnClose, int); INTERFACEpropSetMeth(strm, iMaxFileSize, int); INTERFACEpropSetMeth(strm, iMaxFiles, int); @@ -183,7 +184,7 @@ BEGINinterface(strm) /* name must also be changed in ENDinterface macro! */ INTERFACEpropSetMeth(strm, iFlushInterval, int); INTERFACEpropSetMeth(strm, pszSizeLimitCmd, uchar*); ENDinterface(strm) -#define strmCURR_IF_VERSION 2 /* increment whenever you change the interface structure! */ +#define strmCURR_IF_VERSION 5 /* increment whenever you change the interface structure! */ /* prototypes */ diff --git a/runtime/syslogd-types.h b/runtime/syslogd-types.h index 4a26f993..161ee06f 100644 --- a/runtime/syslogd-types.h +++ b/runtime/syslogd-types.h @@ -56,7 +56,8 @@ * applications I do not yet envision. -- rgerhards, 2007-07-24 */ typedef enum _syslogFeature { - sFEATURERepeatedMsgReduction = 1 + sFEATURERepeatedMsgReduction = 1, + sFEATURENonCancelInputTermination = 2 } syslogFeature; /* we define our own facility and severities */ diff --git a/runtime/wti.c b/runtime/wti.c index abdf4add..53b695b0 100644 --- a/runtime/wti.c +++ b/runtime/wti.c @@ -39,9 +39,10 @@ #include <pthread.h> #include <errno.h> -#ifdef OS_SOLARIS -# include <sched.h> -#endif +/// TODO: check on solaris if this is any longer needed - I don't think so - rgerhards, 2009-09-20 +//#ifdef OS_SOLARIS +//# include <sched.h> +//#endif #include "rsyslog.h" #include "stringbuf.h" @@ -75,92 +76,50 @@ wtiGetDbgHdr(wti_t *pThis) } -/* get the current worker state. For simplicity and speed, we have - * NOT used our regular calling interface this time. I hope that won't - * bite in the long term... -- rgerhards, 2008-01-17 - * TODO: may be performance optimized by atomic operations +/* return the current worker processing state. 
For the sake of + * simplicity, we do not use the iRet interface. -- rgerhards, 2009-07-17 */ -qWrkCmd_t -wtiGetState(wti_t *pThis, int bLockMutex) +bool +wtiGetState(wti_t *pThis) { - DEFVARS_mutexProtection; - qWrkCmd_t tCmd; - - BEGINfunc - ISOBJ_TYPE_assert(pThis, wti); - - BEGIN_MTX_PROTECTED_OPERATIONS(&pThis->mut, bLockMutex); - tCmd = pThis->tCurrCmd; - END_MTX_PROTECTED_OPERATIONS(&pThis->mut); - - ENDfunc - return tCmd; + return ATOMIC_FETCH_32BIT(pThis->bIsRunning); } -/* send a command to a specific thread - * bActiveOnly specifies if the command should be sent only when the worker is - * in an active state. -- rgerhards, 2008-01-20 +/* Set this thread to "always running" state (can not be unset) + * rgerhards, 2009-07-20 */ rsRetVal -wtiSetState(wti_t *pThis, qWrkCmd_t tCmd, int bActiveOnly, int bLockMutex) +wtiSetAlwaysRunning(wti_t *pThis) { - DEFiRet; - qWrkCmd_t tCurrCmd; - DEFVARS_mutexProtection; - ISOBJ_TYPE_assert(pThis, wti); - assert(tCmd <= eWRKTHRD_SHUTDOWN_IMMEDIATE); - - BEGIN_MTX_PROTECTED_OPERATIONS(&pThis->mut, bLockMutex); - - tCurrCmd = pThis->tCurrCmd; - /* all worker states must be followed sequentially, only termination can be set in any state */ - if( (bActiveOnly && (tCurrCmd < eWRKTHRD_RUN_CREATED)) - || (tCurrCmd > tCmd && !(tCmd == eWRKTHRD_TERMINATING || tCmd == eWRKTHRD_STOPPED))) { - DBGPRINTF("%s: command %d can not be accepted in current %d processing state - ignored\n", - wtiGetDbgHdr(pThis), tCmd, tCurrCmd); - } else { - DBGPRINTF("%s: receiving command %d\n", wtiGetDbgHdr(pThis), tCmd); - /* we could replace this with a simple if, but we leave the switch in in case we need - * to add something at a later stage. -- rgerhards, 2008-09-30 - */ - switch(tCmd) { - case eWRKTHRD_TERMINATING: - /* TODO: re-enable meaningful debug msg! (via function callback?) - dbgprintf("%s: thread terminating with %d entries left in queue, %d workers running.\n", - wtiGetDbgHdr(pThis->pQueue), pThis->pQueue->iQueueSize, - pThis->pQueue->iCurNumWrkThrd); - */ - pthread_cond_signal(&pThis->condExitDone); - dbgprintf("%s: worker terminating\n", wtiGetDbgHdr(pThis)); - break; - /* these cases just to satisfy the compiler, we do (yet) not act an them: */ - case eWRKTHRD_RUNNING: - case eWRKTHRD_STOPPED: - case eWRKTHRD_RUN_CREATED: - case eWRKTHRD_RUN_INIT: - case eWRKTHRD_SHUTDOWN: - case eWRKTHRD_SHUTDOWN_IMMEDIATE: - /* DO NOTHING */ - break; - } - /* apply the new state */ - unsigned val = ATOMIC_CAS_VAL(pThis->tCurrCmd, tCurrCmd, tCmd); - if(val != tCurrCmd) { - DBGPRINTF("wtiSetState PROBLEM, tCurrCmd %d overwritten with %d, wanted to set %d\n", tCurrCmd, val, tCmd); - } - - } + pThis->bAlwaysRunning = TRUE; + return RS_RET_OK; +} - END_MTX_PROTECTED_OPERATIONS(&pThis->mut); - RETiRet; +/* Set status (thread is running or not), actually an property of + * use for wtp, but we need to have it per thread instance (thus it + * is inside wti). -- rgerhards, 2009-07-17 + */ +rsRetVal +wtiSetState(wti_t *pThis, bool bNewVal) +{ + ISOBJ_TYPE_assert(pThis, wti); + if(bNewVal) + ATOMIC_STORE_1_TO_INT(pThis->bIsRunning); + else + ATOMIC_STORE_0_TO_INT(pThis->bIsRunning); + return RS_RET_OK; } -/* Cancel the thread. If the thread is already cancelled or terminated, - * we do not again cancel it. But it is save and legal to call wtiCancelThrd() in - * such situations. +/* Cancel the thread. If the thread is not running. But it is save and legal to + * call wtiCancelThrd() in such situations. This function only returns when the + * thread has terminated. 
Else we may get race conditions all over the code... + * Note that when waiting for the thread to terminate, we do a busy wait, checking + * progress every 10ms. It is very unlikely that we will ever cancel a thread + * and, if so, it will only happen at the end of the rsyslog run. So doing this + * kind of not optimal wait is considered preferable over using condition variables. * rgerhards, 2008-02-26 */ rsRetVal @@ -170,19 +129,16 @@ wtiCancelThrd(wti_t *pThis) ISOBJ_TYPE_assert(pThis, wti); - d_pthread_mutex_lock(&pThis->mut); - - wtiProcessThrdChanges(pThis, MUTEX_ALREADY_LOCKED); /* process state change, so that we have current state vars */ - - if(pThis->tCurrCmd >= eWRKTHRD_TERMINATING) { - dbgoprint((obj_t*) pThis, "canceling worker thread, curr stat %d\n", pThis->tCurrCmd); + if(wtiGetState(pThis)) { + dbgoprint((obj_t*) pThis, "canceling worker thread\n"); pthread_cancel(pThis->thrdID); - wtiSetState(pThis, eWRKTHRD_TERMINATING, 0, MUTEX_ALREADY_LOCKED); - ATOMIC_STORE_1_TO_INT(pThis->pWtp->bThrdStateChanged); /* indicate change, so harverster will be called */ + /* now wait until the thread terminates... */ + while(wtiGetState(pThis)) { +//fprintf(stderr, "sleep loop for getState\n"); + srSleep(0, 10000); + } } - d_pthread_mutex_unlock(&pThis->mut); - RETiRet; } @@ -190,26 +146,8 @@ wtiCancelThrd(wti_t *pThis) /* Destructor */ BEGINobjDestruct(wti) /* be sure to specify the object type also in END and CODESTART macros! */ CODESTARTobjDestruct(wti) - /* if we reach this point, we must make sure the associated worker has terminated. It is - * the callers duty to make sure the worker already knows it shall terminate. - * TODO: is it *really* the caller's duty? ...mmmhhhh.... smells bad... rgerhards, 2008-01-25 - */ - wtiProcessThrdChanges(pThis, LOCK_MUTEX); /* process state change one last time */ - - d_pthread_mutex_lock(&pThis->mut); - if(wtiGetState(pThis, MUTEX_ALREADY_LOCKED) != eWRKTHRD_STOPPED) { - dbgprintf("%s: WARNING: worker %p shall be destructed but is still running (might be OK) - joining it\n", - wtiGetDbgHdr(pThis), pThis); - /* let's hope the caller actually instructed it to shutdown... */ - pthread_cond_wait(&pThis->condExitDone, &pThis->mut); - wtiJoinThrd(pThis); - } - d_pthread_mutex_unlock(&pThis->mut); - /* actual destruction */ - pthread_cond_destroy(&pThis->condExitDone); - pthread_mutex_destroy(&pThis->mut); - + free(pThis->batch.pElem); free(pThis->pszDbgHdr); ENDobjDestruct(wti) @@ -217,8 +155,6 @@ ENDobjDestruct(wti) /* Standard-Constructor for the wti object */ BEGINobjConstruct(wti) /* be sure to specify the object type also in END macro! */ - pthread_cond_init(&pThis->condExitDone, NULL); - pthread_mutex_init(&pThis->mut, NULL); ENDobjConstruct(wti) @@ -229,81 +165,28 @@ rsRetVal wtiConstructFinalize(wti_t *pThis) { DEFiRet; + int iDeqBatchSize; ISOBJ_TYPE_assert(pThis, wti); dbgprintf("%s: finalizing construction of worker instance data\n", wtiGetDbgHdr(pThis)); - /* initialize our thread instance descriptor */ - pThis->pUsrp = NULL; - pThis->tCurrCmd = eWRKTHRD_STOPPED; - - RETiRet; -} - - -/* join a specific worker thread - * we do not lock the mutex, because join will sync anyways... 
- */ -rsRetVal -wtiJoinThrd(wti_t *pThis) -{ - DEFiRet; + /* initialize our thread instance descriptor (no concurrency here) */ + pThis->bIsRunning = FALSE; - ISOBJ_TYPE_assert(pThis, wti); - dbgprintf("waiting for worker %s termination, current state %d\n", wtiGetDbgHdr(pThis), pThis->tCurrCmd); - if (pThis->thrdID == 0) { - dbgprintf("worker %s was already stopped\n", wtiGetDbgHdr(pThis)); - } else { - pthread_join(pThis->thrdID, NULL); - wtiSetState(pThis, eWRKTHRD_STOPPED, 0, MUTEX_ALREADY_LOCKED); /* back to virgin... */ - pThis->thrdID = 0; /* invalidate the thread ID so that we do not accidently find reused ones */ - dbgprintf("worker %s has stopped\n", wtiGetDbgHdr(pThis)); - } - - RETiRet; -} - -/* check if we had a worker thread changes and, if so, act - * on it. At a minimum, terminated threads are harvested (joined). - */ -rsRetVal -wtiProcessThrdChanges(wti_t *pThis, int bLockMutex) -{ - DEFiRet; - DEFVARS_mutexProtection; - - ISOBJ_TYPE_assert(pThis, wti); - - BEGIN_MTX_PROTECTED_OPERATIONS(&pThis->mut, bLockMutex); - switch(pThis->tCurrCmd) { - case eWRKTHRD_TERMINATING: - /* we need to at least temporarily release the mutex, because otherwise - * we may deadlock with the thread we intend to join (it aquires the mutex - * during termination processing). -- rgerhards, 2008-02-26 - */ - END_MTX_PROTECTED_OPERATIONS(&pThis->mut); - iRet = wtiJoinThrd(pThis); - BEGIN_MTX_PROTECTED_OPERATIONS(&pThis->mut, bLockMutex); - break; - /* these cases just to satisfy the compiler, we do not act an them: */ - case eWRKTHRD_STOPPED: - case eWRKTHRD_RUN_CREATED: - case eWRKTHRD_RUN_INIT: - case eWRKTHRD_RUNNING: - case eWRKTHRD_SHUTDOWN: - case eWRKTHRD_SHUTDOWN_IMMEDIATE: - /* DO NOTHING */ - break; - } - END_MTX_PROTECTED_OPERATIONS(&pThis->mut); + /* we now alloc the array for user pointers. We obtain the max from the queue itself. */ + CHKiRet(pThis->pWtp->pfGetDeqBatchSize(pThis->pWtp->pUsr, &iDeqBatchSize)); + CHKmalloc(pThis->batch.pElem = calloc((size_t)iDeqBatchSize, sizeof(batch_obj_t))); +finalize_it: RETiRet; } /* cancellation cleanup handler for queueWorker () * Updates admin structure and frees ressources. + * Keep in mind that cancellation is disabled if we run into + * the cancel cleanup handler (and have been cancelled). * rgerhards, 2008-01-16 */ static void @@ -311,7 +194,6 @@ wtiWorkerCancelCleanup(void *arg) { wti_t *pThis = (wti_t*) arg; wtp_t *pWtp; - int iCancelStateSave; BEGINfunc ISOBJ_TYPE_assert(pThis, wti); @@ -320,17 +202,39 @@ wtiWorkerCancelCleanup(void *arg) DBGPRINTF("%s: cancelation cleanup handler called.\n", wtiGetDbgHdr(pThis)); - /* call user supplied handler (that one e.g. requeues the element) */ - pWtp->pfOnWorkerCancel(pThis->pWtp->pUsr, pThis->pUsrp); + /* call user supplied handler */ + pWtp->pfOnWorkerCancel(pThis->pWtp->pUsr, pThis->batch.pElem[0].pUsrp); - pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave); - d_pthread_mutex_lock(&pWtp->mut); - wtiSetState(pThis, eWRKTHRD_TERMINATING, 0, MUTEX_ALREADY_LOCKED); - /* TODO: sync access? I currently think it is NOT needed -- rgerhards, 2008-01-28 */ - ATOMIC_STORE_1_TO_INT(pWtp->bThrdStateChanged); /* indicate change, so harverster will be called */ + ENDfunc +} - d_pthread_mutex_unlock(&pWtp->mut); - pthread_setcancelstate(iCancelStateSave, NULL); + +/* wait for queue to become non-empty or timeout + * helper to wtiWorker. Note the the predicate is + * re-tested by the caller, so it is OK to NOT do it here. 
+ * rgerhards, 2009-05-20 + */ +static inline void +doIdleProcessing(wti_t *pThis, wtp_t *pWtp, int *pbInactivityTOOccured) +{ + struct timespec t; + + BEGINfunc + DBGPRINTF("%s: worker IDLE, waiting for work.\n", wtiGetDbgHdr(pThis)); + + pWtp->pfOnIdle(pWtp->pUsr, MUTEX_ALREADY_LOCKED); + + if(pThis->bAlwaysRunning) { + /* never shut down any started worker */ +dbgprintf("YYY/ZZZ: wti Idle wait cond busy, mutex %p\n", pWtp->pmutUsr); + d_pthread_cond_wait(pWtp->pcondBusy, pWtp->pmutUsr); + } else { + timeoutComp(&t, pWtp->toWrkShutdown);/* get absolute timeout */ + if(d_pthread_cond_timedwait(pWtp->pcondBusy, pWtp->pmutUsr, &t) != 0) { + DBGPRINTF("%s: inactivity timeout, worker terminating...\n", wtiGetDbgHdr(pThis)); + *pbInactivityTOOccured = 1; /* indicate we had a timeout */ + } + } ENDfunc } @@ -341,82 +245,72 @@ wtiWorkerCancelCleanup(void *arg) rsRetVal wtiWorker(wti_t *pThis) { - DEFiRet; - DEFVARS_mutexProtection; - struct timespec t; wtp_t *pWtp; /* our worker thread pool */ int bInactivityTOOccured = 0; + rsRetVal localRet; + rsRetVal terminateRet; + int iCancelStateSave; + DEFiRet; ISOBJ_TYPE_assert(pThis, wti); pWtp = pThis->pWtp; /* shortcut */ ISOBJ_TYPE_assert(pWtp, wtp); dbgSetThrdName(pThis->pszDbgHdr); - pThis->pUsrp = NULL; pthread_cleanup_push(wtiWorkerCancelCleanup, pThis); - BEGIN_MTX_PROTECTED_OPERATIONS(pWtp->pmutUsr, LOCK_MUTEX); pWtp->pfOnWorkerStartup(pWtp->pUsr); - END_MTX_PROTECTED_OPERATIONS(pWtp->pmutUsr); /* now we have our identity, on to real processing */ while(1) { /* loop will be broken below - need to do mutex locks */ - /* process any pending thread requests */ - wtpProcessThrdChanges(pWtp); - - /* if we have a rate-limiter set for this worker pool, let's call it. Please - * keep in mind that the rate-limiter may hold us for an extended period - * of time. -- rgerhards, 2008-04-02 - */ - if(pWtp->pfRateLimiter != NULL) { + if(pWtp->pfRateLimiter != NULL) { /* call rate-limiter, if defined */ pWtp->pfRateLimiter(pWtp->pUsr); } - wtpSetInactivityGuard(pThis->pWtp, 0, LOCK_MUTEX); /* must be set before usr mutex is locked! 
*/ - BEGIN_MTX_PROTECTED_OPERATIONS(pWtp->pmutUsr, LOCK_MUTEX); - - if( (bInactivityTOOccured && pWtp->pfIsIdle(pWtp->pUsr, MUTEX_ALREADY_LOCKED)) - || wtpChkStopWrkr(pWtp, LOCK_MUTEX, MUTEX_ALREADY_LOCKED)) { - END_MTX_PROTECTED_OPERATIONS(pWtp->pmutUsr); - break; /* end worker thread run */ +dbgprintf("YYY/ZZZ: pre lock mutex\n"); + d_pthread_mutex_lock(pWtp->pmutUsr); + +dbgprintf("YYY/ZZZ: wti locks mutex %p\n", pWtp->pmutUsr); + /* first check if we are in shutdown process (but evaluate a bit later) */ + terminateRet = wtpChkStopWrkr(pWtp, MUTEX_ALREADY_LOCKED); + if(terminateRet == RS_RET_TERMINATE_NOW) { + /* we now need to free the old batch */ + localRet = pWtp->pfObjProcessed(pWtp->pUsr, pThis); + dbgoprint((obj_t*) pThis, "terminating worker because of TERMINATE_NOW mode, del iRet %d\n", + localRet); + d_pthread_mutex_unlock(pWtp->pmutUsr); + break; } - bInactivityTOOccured = 0; /* reset for next run */ - /* if we reach this point, we are still protected by the mutex */ - - if(pWtp->pfIsIdle(pWtp->pUsr, MUTEX_ALREADY_LOCKED)) { - DBGPRINTF("%s: worker IDLE, waiting for work.\n", wtiGetDbgHdr(pThis)); - pWtp->pfOnIdle(pWtp->pUsr, MUTEX_ALREADY_LOCKED); - - if(pWtp->toWrkShutdown == -1) { - /* never shut down any started worker */ - d_pthread_cond_wait(pWtp->pcondBusy, pWtp->pmutUsr); - } else { - timeoutComp(&t, pWtp->toWrkShutdown);/* get absolute timeout */ - if(d_pthread_cond_timedwait(pWtp->pcondBusy, pWtp->pmutUsr, &t) != 0) { - DBGPRINTF("%s: inactivity timeout, worker terminating...\n", wtiGetDbgHdr(pThis)); - bInactivityTOOccured = 1; /* indicate we had a timeout */ - } + /* try to execute and process whatever we have */ + /* Note that this function releases and re-aquires the mutex. The returned + * information on idle state must be processed before releasing the mutex again. 
+ */ + localRet = pWtp->pfDoWork(pWtp->pUsr, pThis); + +dbgprintf("YYY/ZZZ: wti loop locked mutex %p again\n", pWtp->pmutUsr); + if(localRet == RS_RET_IDLE) { + if(terminateRet == RS_RET_TERMINATE_WHEN_IDLE || bInactivityTOOccured) { + d_pthread_mutex_unlock(pWtp->pmutUsr); + break; /* end of loop */ } - END_MTX_PROTECTED_OPERATIONS(pWtp->pmutUsr); + doIdleProcessing(pThis, pWtp, &bInactivityTOOccured); + d_pthread_mutex_unlock(pWtp->pmutUsr); continue; /* request next iteration */ } - /* if we reach this point, we have a non-empty queue (and are still protected by mutex) */ - pWtp->pfDoWork(pWtp->pUsr, pThis, iCancelStateSave); + d_pthread_mutex_unlock(pWtp->pmutUsr); + + bInactivityTOOccured = 0; /* reset for next run */ } /* indicate termination */ + d_pthread_mutex_lock(pWtp->pmutUsr); pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave); - d_pthread_mutex_lock(&pThis->mut); pthread_cleanup_pop(0); /* remove cleanup handler */ - pWtp->pfOnWorkerShutdown(pWtp->pUsr); - - wtiSetState(pThis, eWRKTHRD_TERMINATING, 0, MUTEX_ALREADY_LOCKED); - ATOMIC_STORE_1_TO_INT(pWtp->bThrdStateChanged); /* indicate change, so harverster will be called */ - d_pthread_mutex_unlock(&pThis->mut); pthread_setcancelstate(iCancelStateSave, NULL); + d_pthread_mutex_unlock(pWtp->pmutUsr); RETiRet; } @@ -444,7 +338,6 @@ wtiSetDbgHdr(wti_t *pThis, uchar *pszMsg, size_t lenMsg) if(pThis->pszDbgHdr != NULL) { free(pThis->pszDbgHdr); - pThis->pszDbgHdr = NULL; } if((pThis->pszDbgHdr = malloc(sizeof(uchar) * lenMsg + 1)) == NULL) @@ -478,6 +371,5 @@ BEGINObjClassInit(wti, 1, OBJ_IS_CORE_MODULE) /* one is the object version (most CHKiRet(objUse(glbl, CORE_COMPONENT)); ENDObjClassInit(wti) -/* - * vi:set ai: +/* vi:set ai: */ diff --git a/runtime/wti.h b/runtime/wti.h index 72653b15..f466a053 100644 --- a/runtime/wti.h +++ b/runtime/wti.h @@ -1,6 +1,6 @@ /* Definition of the worker thread instance (wti) class. * - * Copyright 2008 Rainer Gerhards and Adiscon GmbH. + * Copyright 2008, 2009 by Rainer Gerhards and Adiscon GmbH. * * This file is part of the rsyslog runtime library. * @@ -27,21 +27,19 @@ #include <pthread.h> #include "wtp.h" #include "obj.h" +#include "batch.h" + /* the worker thread instance class */ -typedef struct wti_s { +struct wti_s { BEGINobjInstance; - pthread_t thrdID; /* thread ID */ - qWrkCmd_t tCurrCmd; /* current command to be carried out by worker */ - obj_t *pUsrp; /* pointer to an object meaningful for current user pointer (e.g. queue pUsr data elemt) */ + pthread_t thrdID; /* thread ID */ + int bIsRunning; /* is this thread currently running? (must be int for atomic op!) */ + bool bAlwaysRunning; /* should this thread always run? */ wtp_t *pWtp; /* my worker thread pool (important if only the work thread instance is passed! */ - pthread_cond_t condExitDone; /* signaled when the thread exit is done (once per thread existance) */ - pthread_mutex_t mut; - bool bShutdownRqtd; /* shutdown for this thread requested? 0 - no , 1 - yes */ + batch_t batch; /* pointer to an object array meaningful for current user pointer (e.g. 
queue pUsr data elemt) */ uchar *pszDbgHdr; /* header string for debug messages */ -} wti_t; - -/* some symbolic constants for easier reference */ +}; /* prototypes */ @@ -49,12 +47,11 @@ rsRetVal wtiConstruct(wti_t **ppThis); rsRetVal wtiConstructFinalize(wti_t *pThis); rsRetVal wtiDestruct(wti_t **ppThis); rsRetVal wtiWorker(wti_t *pThis); -rsRetVal wtiProcessThrdChanges(wti_t *pThis, int bLockMutex); rsRetVal wtiSetDbgHdr(wti_t *pThis, uchar *pszMsg, size_t lenMsg); -rsRetVal wtiSetState(wti_t *pThis, qWrkCmd_t tCmd, int bActiveOnly, int bLockMutex); -rsRetVal wtiJoinThrd(wti_t *pThis); rsRetVal wtiCancelThrd(wti_t *pThis); -qWrkCmd_t wtiGetState(wti_t *pThis, int bLockMutex); +rsRetVal wtiSetAlwaysRunning(wti_t *pThis); +rsRetVal wtiSetState(wti_t *pThis, bool bNew); +bool wtiGetState(wti_t *pThis); PROTOTYPEObjClassInit(wti); PROTOTYPEpropSetMeth(wti, pszDbgHdr, uchar*); PROTOTYPEpropSetMeth(wti, pWtp, wtp_t*); diff --git a/runtime/wtp.c b/runtime/wtp.c index 0c66dd11..47b99fe8 100644 --- a/runtime/wtp.c +++ b/runtime/wtp.c @@ -8,7 +8,7 @@ * (and in the web doc set on http://www.rsyslog.com/doc). Be sure to read it * if you are getting aquainted to the object. * - * Copyright 2008 Rainer Gerhards and Adiscon GmbH. + * Copyright 2008,2009 Rainer Gerhards and Adiscon GmbH. * * This file is part of the rsyslog runtime library. * @@ -44,9 +44,10 @@ # include <sys/prctl.h> #endif -#ifdef OS_SOLARIS -# include <sched.h> -#endif +/// TODO: check on solaris if this is any longer needed - I don't think so - rgerhards, 2009-09-20 +//#ifdef OS_SOLARIS +//# include <sched.h> +//#endif #include "rsyslog.h" #include "stringbuf.h" @@ -82,17 +83,20 @@ wtpGetDbgHdr(wtp_t *pThis) /* Not implemented dummy function for constructor */ -static rsRetVal NotImplementedDummy() { return RS_RET_OK; } +static rsRetVal NotImplementedDummy() { return RS_RET_NOT_IMPLEMENTED; } /* Standard-Constructor for the wtp object */ BEGINobjConstruct(wtp) /* be sure to specify the object type also in END macro! */ - pthread_mutex_init(&pThis->mut, NULL); - pthread_mutex_init(&pThis->mutThrdShutdwn, NULL); + pthread_mutex_init(&pThis->mutWtp, NULL); pthread_cond_init(&pThis->condThrdTrm, NULL); + pthread_attr_init(&pThis->attrThrd); + pthread_attr_setdetachstate(&pThis->attrThrd, PTHREAD_CREATE_DETACHED); /* set all function pointers to "not implemented" dummy so that we can safely call them */ pThis->pfChkStopWrkr = NotImplementedDummy; + pThis->pfGetDeqBatchSize = NotImplementedDummy; pThis->pfIsIdle = NotImplementedDummy; pThis->pfDoWork = NotImplementedDummy; + pThis->pfObjProcessed = NotImplementedDummy; pThis->pfOnIdle = NotImplementedDummy; pThis->pfOnWorkerCancel = NotImplementedDummy; pThis->pfOnWorkerStartup = NotImplementedDummy; @@ -114,13 +118,13 @@ wtpConstructFinalize(wtp_t *pThis) ISOBJ_TYPE_assert(pThis, wtp); - dbgprintf("%s: finalizing construction of worker thread pool\n", wtpGetDbgHdr(pThis)); + DBGPRINTF("%s: finalizing construction of worker thread pool\n", wtpGetDbgHdr(pThis)); /* alloc and construct workers - this can only be done in finalizer as we previously do * not know the max number of workers */ if((pThis->pWrkr = malloc(sizeof(wti_t*) * pThis->iNumWorkerThreads)) == NULL) ABORT_FINALIZE(RS_RET_OUT_OF_MEMORY); - + for(i = 0 ; i < pThis->iNumWorkerThreads ; ++i) { CHKiRet(wtiConstruct(&pThis->pWrkr[i])); pWti = pThis->pWrkr[i]; @@ -140,8 +144,6 @@ finalize_it: BEGINobjDestruct(wtp) /* be sure to specify the object type also in END and CODESTART macros! 
*/ int i; CODESTARTobjDestruct(wtp) - wtpProcessThrdChanges(pThis); /* process thread changes one last time */ - /* destruct workers */ for(i = 0 ; i < pThis->iNumWorkerThreads ; ++i) wtiDestruct(&pThis->pWrkr[i]); @@ -151,27 +153,13 @@ CODESTARTobjDestruct(wtp) /* actual destruction */ pthread_cond_destroy(&pThis->condThrdTrm); - pthread_mutex_destroy(&pThis->mut); - pthread_mutex_destroy(&pThis->mutThrdShutdwn); + pthread_mutex_destroy(&pThis->mutWtp); + pthread_attr_destroy(&pThis->attrThrd); free(pThis->pszDbgHdr); ENDobjDestruct(wtp) -/* wake up at least one worker thread. - * rgerhards, 2008-01-20 - */ -rsRetVal -wtpWakeupWrkr(wtp_t *pThis) -{ - DEFiRet; - - /* TODO; mutex? I think not needed, as we do not need predictable exec order -- rgerhards, 2008-01-28 */ - ISOBJ_TYPE_assert(pThis, wtp); - pthread_cond_signal(pThis->pcondBusy); - RETiRet; -} - /* wake up all worker threads. * rgerhards, 2008-01-16 */ @@ -186,99 +174,61 @@ wtpWakeupAllWrkr(wtp_t *pThis) } -/* check if we had any worker thread changes and, if so, act - * on them. At a minimum, terminated threads are harvested (joined). - * This function MUST NEVER block on the queue mutex! - */ -rsRetVal -wtpProcessThrdChanges(wtp_t *pThis) -{ - DEFiRet; - int i; - - ISOBJ_TYPE_assert(pThis, wtp); - - if(pThis->bThrdStateChanged == 0) - FINALIZE; - - if(d_pthread_mutex_trylock(&(pThis->mutThrdShutdwn)) != 0) { - /* another thread is already in the loop */ - FINALIZE; - } - - /* Note: there is a left-over potential race condition below: - * pThis->bThrdStateChanged may be re-set by another thread while - * we work on it and thus the loop may terminate too early. However, - * there are no really bad effects from that so I perfer - for this - * version - to live with the problem as is. Not a good idea to - * introduce that large change into the stable branch without very - * good reason. -- rgerhards, 2009-04-02 - */ - do { - /* reset the change marker */ - ATOMIC_STORE_0_TO_INT(pThis->bThrdStateChanged); - /* go through all threads */ - for(i = 0 ; i < pThis->iNumWorkerThreads ; ++i) { - wtiProcessThrdChanges(pThis->pWrkr[i], LOCK_MUTEX); - } - /* restart if another change occured while we were processing the changes */ - } while(pThis->bThrdStateChanged != 0); - - d_pthread_mutex_unlock(&(pThis->mutThrdShutdwn)); - -finalize_it: - RETiRet; -} - - -/* Sent a specific state for the worker thread pool. - * rgerhards, 2008-01-21 +/* Sent a specific state for the worker thread pool. -- rgerhards, 2008-01-21 + * We do not need to do atomic instructions as set operations are only + * called when terminating the pool, and then in strict sequence. So we + * can never overwrite each other. On the other hand, it also doesn't + * matter if the read operation obtains an older value, as we then simply + * do one more iteration, what is perfectly legal (during shutdown + * they are awoken in any case). -- rgerhards, 2009-07-20 */ rsRetVal wtpSetState(wtp_t *pThis, wtpState_t iNewState) { - DEFiRet; - ISOBJ_TYPE_assert(pThis, wtp); pThis->wtpState = iNewState; - /* TODO: must wakeup workers? seen to be not needed -- rgerhards, 2008-01-28 */ - - RETiRet; + return RS_RET_OK; } /* check if the worker shall shutdown (1 = yes, 0 = no) - * TODO: check if we can use atomic operations to enhance performance * Note: there may be two mutexes locked, the bLockUsrMutex is the one in our "user" * (e.g. 
the queue clas) * rgerhards, 2008-01-21 */ rsRetVal -wtpChkStopWrkr(wtp_t *pThis, int bLockMutex, int bLockUsrMutex) +wtpChkStopWrkr(wtp_t *pThis, int bLockUsrMutex) { DEFiRet; - DEFVARS_mutexProtection; + wtpState_t wtpState; ISOBJ_TYPE_assert(pThis, wtp); + /* we need a consistent value, but it doesn't really matter if it is changed + * right after the fetch - then we simply do one more iteration in the worker + */ + wtpState = ATOMIC_FETCH_32BIT(pThis->wtpState); - BEGIN_MTX_PROTECTED_OPERATIONS(&pThis->mut, bLockMutex); - if( (pThis->wtpState == wtpState_SHUTDOWN_IMMEDIATE) - || ((pThis->wtpState == wtpState_SHUTDOWN) && pThis->pfIsIdle(pThis->pUsr, bLockUsrMutex))) - iRet = RS_RET_TERMINATE_NOW; - END_MTX_PROTECTED_OPERATIONS(&pThis->mut); + if(wtpState == wtpState_SHUTDOWN_IMMEDIATE) { + ABORT_FINALIZE(RS_RET_TERMINATE_NOW); + } else if(wtpState == wtpState_SHUTDOWN) { + ABORT_FINALIZE(RS_RET_TERMINATE_WHEN_IDLE); + } /* try customer handler if one was set and we do not yet have a definite result */ - if(iRet == RS_RET_OK && pThis->pfChkStopWrkr != NULL) { + if(pThis->pfChkStopWrkr != NULL) { iRet = pThis->pfChkStopWrkr(pThis->pUsr, bLockUsrMutex); } +finalize_it: RETiRet; } #pragma GCC diagnostic ignored "-Wempty-body" /* Send a shutdown command to all workers and see if they terminate. - * A timeout may be specified. + * A timeout may be specified. This function may also be called with + * the current number of workers being 0, in which case it does not + * shut down any worker. * rgerhards, 2008-01-14 */ rsRetVal @@ -286,30 +236,22 @@ wtpShutdownAll(wtp_t *pThis, wtpState_t tShutdownCmd, struct timespec *ptTimeout { DEFiRet; int bTimedOut; - int iCancelStateSave; ISOBJ_TYPE_assert(pThis, wtp); wtpSetState(pThis, tShutdownCmd); wtpWakeupAllWrkr(pThis); - /* see if we need to harvest (join) any terminated threads (even in timeout case, - * some may have terminated... - */ - wtpProcessThrdChanges(pThis); - - /* and wait for their termination */ - pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave); - d_pthread_mutex_lock(&pThis->mut); - pthread_cleanup_push(mutexCancelCleanup, &pThis->mut); - pthread_setcancelstate(iCancelStateSave, NULL); + /* wait for worker thread termination */ + d_pthread_mutex_lock(&pThis->mutWtp); + pthread_cleanup_push(mutexCancelCleanup, &pThis->mutWtp); bTimedOut = 0; while(pThis->iCurNumWrkThrd > 0 && !bTimedOut) { - dbgprintf("%s: waiting %ldms on worker thread termination, %d still running\n", - wtpGetDbgHdr(pThis), timeoutVal(ptTimeout), pThis->iCurNumWrkThrd); + DBGPRINTF("%s: waiting %ldms on worker thread termination, %d still running\n", + wtpGetDbgHdr(pThis), timeoutVal(ptTimeout), ATOMIC_FETCH_32BIT(pThis->iCurNumWrkThrd)); - if(d_pthread_cond_timedwait(&pThis->condThrdTrm, &pThis->mut, ptTimeout) != 0) { - dbgprintf("%s: timeout waiting on worker thread termination\n", wtpGetDbgHdr(pThis)); + if(d_pthread_cond_timedwait(&pThis->condThrdTrm, &pThis->mutWtp, ptTimeout) != 0) { + DBGPRINTF("%s: timeout waiting on worker thread termination\n", wtpGetDbgHdr(pThis)); bTimedOut = 1; /* we exit the loop on timeout */ } } @@ -318,40 +260,11 @@ wtpShutdownAll(wtp_t *pThis, wtpState_t tShutdownCmd, struct timespec *ptTimeout if(bTimedOut) iRet = RS_RET_TIMED_OUT; - /* see if we need to harvest (join) any terminated threads (even in timeout case, - * some may have terminated... 
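
The comments on wtpSetState() and wtpChkStopWrkr() above argue that the pool state needs no lock: it is written only during shutdown, in strict sequence, and a stale read merely costs the worker one extra loop pass. A small C11 sketch of that reasoning, using relaxed atomics purely for illustration (rsyslog's ATOMIC_* macros are platform wrappers and differ in detail):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    static atomic_int run_state = 0;   /* 0 = run, 1 = shut down when idle */

    static void *worker(void *arg)
    {
        (void)arg;
        int iterations = 0;
        /* a stale read here is harmless: we just do one more pass and re-check */
        while(atomic_load_explicit(&run_state, memory_order_relaxed) == 0) {
            ++iterations;              /* stand-in for dequeue-and-process work */
            usleep(1000);
        }
        printf("worker exiting after %d iterations\n", iterations);
        return NULL;
    }

    int main(void)
    {
        pthread_t thrd;
        pthread_create(&thrd, NULL, worker, NULL);
        usleep(50 * 1000);
        /* single writer, set only once at shutdown - no lock required */
        atomic_store_explicit(&run_state, 1, memory_order_relaxed);
        pthread_join(thrd, NULL);
        return 0;
    }
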
- */ - wtpProcessThrdChanges(pThis); - RETiRet; } #pragma GCC diagnostic warning "-Wempty-body" -/* indicate that a thread has terminated and awake anyone waiting on it - * rgerhards, 2008-01-23 - */ -rsRetVal wtpSignalWrkrTermination(wtp_t *pThis) -{ - DEFiRet; - /* I leave the mutex code here out as it gives us deadlocks. I think it is not really - * needed and we are on the safe side. I leave this comment in if practice proves us - * wrong. The whole thing should be removed after half a year or year if we see there - * actually is no issue (or revisit it from a theoretical POV). - * rgerhards, 2008-01-28 - * revisited 2008-09-30, still a bit unclear, leave in - */ - /*TODO: mutex or not mutex, that's the question ;)DEFVARS_mutexProtection;*/ - - ISOBJ_TYPE_assert(pThis, wtp); - - /*BEGIN_MTX_PROTECTED_OPERATIONS(&pThis->mut, LOCK_MUTEX);*/ - pthread_cond_signal(&pThis->condThrdTrm); /* activate anyone waiting on thread shutdown */ - /*END_MTX_PROTECTED_OPERATIONS(&pThis->mut);*/ - RETiRet; -} - - /* Unconditionally cancel all running worker threads. * rgerhards, 2008-01-14 */ @@ -363,12 +276,8 @@ wtpCancelAll(wtp_t *pThis) ISOBJ_TYPE_assert(pThis, wtp); - /* process any pending thread requests so that we know who actually is still running */ - wtpProcessThrdChanges(pThis); - /* go through all workers and cancel those that are active */ for(i = 0 ; i < pThis->iNumWorkerThreads ; ++i) { - dbgprintf("%s: try canceling worker thread %d\n", wtpGetDbgHdr(pThis), i); wtiCancelThrd(pThis->pWrkr[i]); } @@ -376,39 +285,29 @@ wtpCancelAll(wtp_t *pThis) } - -/* Set the Inactivity Guard - * rgerhards, 2008-01-21 - */ -rsRetVal -wtpSetInactivityGuard(wtp_t *pThis, int bNewState, int bLockMutex) -{ - DEFiRet; - DEFVARS_mutexProtection; - - BEGIN_MTX_PROTECTED_OPERATIONS(&pThis->mut, bLockMutex); - pThis->bInactivityGuard = bNewState; - END_MTX_PROTECTED_OPERATIONS(&pThis->mut); - - RETiRet; -} - - -/* cancellation cleanup handler for executing worker - * decrements the worker counter - * rgerhards, 2008-01-20 +/* cancellation cleanup handler for executing worker decrements the worker counter. + * This is also called when the the worker is normally shut down. + * rgerhards, 2009-07-20 */ -void +static void wtpWrkrExecCancelCleanup(void *arg) { - wtp_t *pThis = (wtp_t*) arg; + wti_t *pWti = (wti_t*) arg; + wtp_t *pThis; BEGINfunc + ISOBJ_TYPE_assert(pWti, wti); + pThis = pWti->pWtp; ISOBJ_TYPE_assert(pThis, wtp); - pThis->iCurNumWrkThrd--; - wtpSignalWrkrTermination(pThis); - dbgprintf("%s: thread CANCELED with %d workers running.\n", wtpGetDbgHdr(pThis), pThis->iCurNumWrkThrd); + /* the order of the next two statements is important! 
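
wtpShutdownAll() above no longer harvests threads; it just waits on condThrdTrm until the worker count drops to zero or the timeout expires, paired with the broadcast in the cleanup handler that follows. A self-contained sketch of that timed wait with illustrative names; the noteworthy parts are the absolute deadline that pthread_cond_timedwait() requires and the ETIMEDOUT handling:

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>
    #include <time.h>

    static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cond_thrd_trm = PTHREAD_COND_INITIALIZER;
    static int num_workers = 2;

    /* called by each worker as its very last step (compare the cancel cleanup handler) */
    static void worker_terminated(void)
    {
        pthread_mutex_lock(&mut);
        --num_workers;
        pthread_cond_broadcast(&cond_thrd_trm);   /* wake anyone waiting for shutdown */
        pthread_mutex_unlock(&mut);
    }

    static void *worker(void *arg) { (void)arg; worker_terminated(); return NULL; }

    /* wait up to timeout_ms for all workers to report termination; 0 = ok, -1 = timed out */
    static int wait_all_terminated(long timeout_ms)
    {
        struct timespec deadline;
        int rc = 0;

        clock_gettime(CLOCK_REALTIME, &deadline);     /* timedwait needs an absolute time */
        deadline.tv_sec  += timeout_ms / 1000;
        deadline.tv_nsec += (timeout_ms % 1000) * 1000000L;
        if(deadline.tv_nsec >= 1000000000L) { deadline.tv_sec++; deadline.tv_nsec -= 1000000000L; }

        pthread_mutex_lock(&mut);
        while(num_workers > 0 && rc != ETIMEDOUT)
            rc = pthread_cond_timedwait(&cond_thrd_trm, &mut, &deadline);
        pthread_mutex_unlock(&mut);
        return (rc == ETIMEDOUT) ? -1 : 0;
    }

    int main(void)
    {
        pthread_t t1, t2;
        pthread_create(&t1, NULL, worker, NULL);
        pthread_create(&t2, NULL, worker, NULL);
        printf("shutdown %s\n", wait_all_terminated(2000) == 0 ? "clean" : "timed out");
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        return 0;
    }
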
*/ + wtiSetState(pWti, WRKTHRD_STOPPED); + ATOMIC_DEC(pThis->iCurNumWrkThrd); + + DBGPRINTF("%s: Worker thread %lx, terminated, num workers now %d\n", + wtpGetDbgHdr(pThis), (unsigned long) pWti, ATOMIC_FETCH_32BIT(pThis->iCurNumWrkThrd)); + + pthread_cond_broadcast(&pThis->condThrdTrm); /* activate anyone waiting on thread shutdown */ ENDfunc } @@ -423,12 +322,11 @@ wtpWorker(void *arg) /* the arg is actually a wti object, even though we are in { uchar *pszDbgHdr; uchar thrdName[32] = "rs:"; - DEFiRet; - DEFVARS_mutexProtection; wti_t *pWti = (wti_t*) arg; wtp_t *pThis; sigset_t sigSet; + BEGINfunc ISOBJ_TYPE_assert(pWti, wti); pThis = pWti->pWtp; ISOBJ_TYPE_assert(pThis, wtp); @@ -445,39 +343,9 @@ wtpWorker(void *arg) /* the arg is actually a wti object, even though we are in } # endif - BEGIN_MTX_PROTECTED_OPERATIONS(&pThis->mut, LOCK_MUTEX); - - /* do some late initialization */ - - pthread_cleanup_push(wtpWrkrExecCancelCleanup, pThis); - - /* finally change to RUNNING state. We need to check if we actually should still run, - * because someone may have requested us to shut down even before we got a chance to do - * our init. That would be a bad race... -- rgerhards, 2008-01-16 - */ - wtiSetState(pWti, eWRKTHRD_RUNNING, 0, MUTEX_ALREADY_LOCKED); /* we are running now! */ - - do { - END_MTX_PROTECTED_OPERATIONS(&pThis->mut); - - iRet = wtiWorker(pWti); /* just to make sure: this is NOT protected by the mutex! */ - - BEGIN_MTX_PROTECTED_OPERATIONS(&pThis->mut, LOCK_MUTEX); - } while(pThis->iCurNumWrkThrd == 1 && pThis->bInactivityGuard == 1); - /* inactivity guard prevents shutdown of all workers while one should be running due to race - * condition. It can lead to one more worker running than desired, but that is acceptable. After - * all, that worker will shutdown itself due to inactivity timeout. If, however, none were running - * when one was required, processing could come to a halt. -- rgerhards, 2008-01-21 - */ - - pthread_cleanup_pop(0); - pThis->iCurNumWrkThrd--; - wtpSignalWrkrTermination(pThis); - - dbgprintf("%s: Worker thread %lx, terminated, num workers now %d\n", - wtpGetDbgHdr(pThis), (unsigned long) pWti, pThis->iCurNumWrkThrd); - - END_MTX_PROTECTED_OPERATIONS(&pThis->mut); + pthread_cleanup_push(wtpWrkrExecCancelCleanup, pWti); + wtiWorker(pWti); + pthread_cleanup_pop(1); ENDfunc pthread_exit(0); @@ -487,27 +355,20 @@ wtpWorker(void *arg) /* the arg is actually a wti object, even though we are in /* start a new worker */ static rsRetVal -wtpStartWrkr(wtp_t *pThis, int bLockMutex) +wtpStartWrkr(wtp_t *pThis) { - DEFiRet; - DEFVARS_mutexProtection; wti_t *pWti; int i; int iState; + DEFiRet; ISOBJ_TYPE_assert(pThis, wtp); - wtpProcessThrdChanges(pThis); // TODO: Performance: this causes a lot of FUTEX calls - - BEGIN_MTX_PROTECTED_OPERATIONS(&pThis->mut, bLockMutex); - - pThis->iCurNumWrkThrd++; + d_pthread_mutex_lock(&pThis->mutWtp); - /* find free spot in thread table. If we find at least one worker that is in initialization, - * we do NOT start a new one. Let's give the other one a chance, first. - */ + /* find free spot in thread table. 
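
wtpWorker() above now funnels both cancellation and normal termination through one handler: pthread_cleanup_push() registers it and pthread_cleanup_pop(1) runs it on the regular exit path too, so the decrement-and-broadcast bookkeeping lives in exactly one place. A minimal sketch of that pairing (push and pop are macros and must stay in the same lexical scope):

    #include <pthread.h>
    #include <stdio.h>

    static int active_workers = 1;
    static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER;

    /* runs on pthread_cancel() *and*, via pthread_cleanup_pop(1), on normal return */
    static void worker_cleanup(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&mut);
        --active_workers;
        printf("cleanup ran, active workers now %d\n", active_workers);
        pthread_mutex_unlock(&mut);
    }

    static void *worker(void *arg)
    {
        (void)arg;
        pthread_cleanup_push(worker_cleanup, NULL);
        /* ... the actual worker loop would run here ... */
        pthread_cleanup_pop(1);   /* 1 = execute the handler even on the normal path */
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        pthread_create(&t, NULL, worker, NULL);
        pthread_join(t, NULL);
        return 0;
    }
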
*/ for(i = 0 ; i < pThis->iNumWorkerThreads ; ++i) { - if(wtiGetState(pThis->pWrkr[i], LOCK_MUTEX) == eWRKTHRD_STOPPED) { + if(wtiGetState(pThis->pWrkr[i]) == WRKTHRD_STOPPED) { break; } } @@ -515,17 +376,20 @@ wtpStartWrkr(wtp_t *pThis, int bLockMutex) if(i == pThis->iNumWorkerThreads) ABORT_FINALIZE(RS_RET_NO_MORE_THREADS); + if(i == 0 || pThis->toWrkShutdown == -1) { + wtiSetAlwaysRunning(pThis->pWrkr[i]); + } + pWti = pThis->pWrkr[i]; - wtiSetState(pWti, eWRKTHRD_RUN_CREATED, 0, LOCK_MUTEX); - iState = pthread_create(&(pWti->thrdID), NULL, wtpWorker, (void*) pWti); - dbgprintf("%s: started with state %d, num workers now %d\n", - wtpGetDbgHdr(pThis), iState, pThis->iCurNumWrkThrd); + wtiSetState(pWti, WRKTHRD_RUNNING); + iState = pthread_create(&(pWti->thrdID), &pThis->attrThrd, wtpWorker, (void*) pWti); + ATOMIC_INC(pThis->iCurNumWrkThrd); /* we got one more! */ - /* indicate we just started a worker and would like to see it running */ - wtpSetInactivityGuard(pThis, 1, MUTEX_ALREADY_LOCKED); + DBGPRINTF("%s: started with state %d, num workers now %d\n", + wtpGetDbgHdr(pThis), iState, ATOMIC_FETCH_32BIT(pThis->iCurNumWrkThrd)); finalize_it: - END_MTX_PROTECTED_OPERATIONS(&pThis->mut); + d_pthread_mutex_unlock(&pThis->mutWtp); RETiRet; } @@ -542,38 +406,34 @@ rsRetVal wtpAdviseMaxWorkers(wtp_t *pThis, int nMaxWrkr) { DEFiRet; - DEFVARS_mutexProtection; int nMissing; /* number workers missing to run */ int i; ISOBJ_TYPE_assert(pThis, wtp); +int nMaxWrkrTmp = nMaxWrkr; if(nMaxWrkr == 0) FINALIZE; - BEGIN_MTX_PROTECTED_OPERATIONS(&pThis->mut, LOCK_MUTEX); - if(nMaxWrkr > pThis->iNumWorkerThreads) /* limit to configured maximum */ nMaxWrkr = pThis->iNumWorkerThreads; - nMissing = nMaxWrkr - pThis->iCurNumWrkThrd; + nMissing = nMaxWrkr - ATOMIC_FETCH_32BIT(pThis->iCurNumWrkThrd); +dbgprintf("wtpAdviseMaxWorkers, nmax: %d, curr %d, missing %d\n", nMaxWrkrTmp, pThis->iNumWorkerThreads, nMissing); if(nMissing > 0) { - dbgprintf("%s: high activity - starting %d additional worker thread(s).\n", wtpGetDbgHdr(pThis), nMissing); + DBGPRINTF("%s: high activity - starting %d additional worker thread(s).\n", wtpGetDbgHdr(pThis), nMissing); /* start the rqtd nbr of workers */ for(i = 0 ; i < nMissing ; ++i) { - CHKiRet(wtpStartWrkr(pThis, MUTEX_ALREADY_LOCKED)); - } - } else { - if(nMaxWrkr > 0) { - dbgprintf("wtpAdviseMaxWorkers signals busy\n"); - wtpWakeupWrkr(pThis); + CHKiRet(wtpStartWrkr(pThis)); } + } else { +dbgprintf("YYY: adivse signal cond busy"); + pthread_cond_signal(pThis->pcondBusy); } finalize_it: - END_MTX_PROTECTED_OPERATIONS(&pThis->mut); RETiRet; } @@ -587,37 +447,16 @@ DEFpropSetMethPTR(wtp, pmutUsr, pthread_mutex_t) DEFpropSetMethPTR(wtp, pcondBusy, pthread_cond_t) DEFpropSetMethFP(wtp, pfChkStopWrkr, rsRetVal(*pVal)(void*, int)) DEFpropSetMethFP(wtp, pfRateLimiter, rsRetVal(*pVal)(void*)) -DEFpropSetMethFP(wtp, pfIsIdle, rsRetVal(*pVal)(void*, int)) -DEFpropSetMethFP(wtp, pfDoWork, rsRetVal(*pVal)(void*, void*, int)) +DEFpropSetMethFP(wtp, pfGetDeqBatchSize, rsRetVal(*pVal)(void*, int*)) +DEFpropSetMethFP(wtp, pfIsIdle, rsRetVal(*pVal)(void*, wtp_t*)) +DEFpropSetMethFP(wtp, pfDoWork, rsRetVal(*pVal)(void*, void*)) +DEFpropSetMethFP(wtp, pfObjProcessed, rsRetVal(*pVal)(void*, wti_t*)) DEFpropSetMethFP(wtp, pfOnIdle, rsRetVal(*pVal)(void*, int)) DEFpropSetMethFP(wtp, pfOnWorkerCancel, rsRetVal(*pVal)(void*, void*)) DEFpropSetMethFP(wtp, pfOnWorkerStartup, rsRetVal(*pVal)(void*)) DEFpropSetMethFP(wtp, pfOnWorkerShutdown, rsRetVal(*pVal)(void*)) -/* return the current number of worker 
threads. - * TODO: atomic operation would bring a nice performance - * enhancemcent - * rgerhards, 2008-01-27 - */ -int -wtpGetCurNumWrkr(wtp_t *pThis, int bLockMutex) -{ - DEFVARS_mutexProtection; - int iNumWrkr; - - BEGINfunc - ISOBJ_TYPE_assert(pThis, wtp); - - BEGIN_MTX_PROTECTED_OPERATIONS(&pThis->mut, bLockMutex); - iNumWrkr = pThis->iCurNumWrkThrd; - END_MTX_PROTECTED_OPERATIONS(&pThis->mut); - - ENDfunc - return iNumWrkr; -} - - /* set the debug header message * The passed-in string is duplicated. So if the caller does not need * it any longer, it must free it. Must be called only before object is finalized. @@ -669,6 +508,5 @@ BEGINObjClassInit(wtp, 1, OBJ_IS_CORE_MODULE) CHKiRet(objUse(glbl, CORE_COMPONENT)); ENDObjClassInit(wtp) -/* - * vi:set ai: +/* vi:set ai: */ diff --git a/runtime/wtp.h b/runtime/wtp.h index 1ce171cc..0505b91c 100644 --- a/runtime/wtp.h +++ b/runtime/wtp.h @@ -27,18 +27,9 @@ #include <pthread.h> #include "obj.h" -/* commands and states for worker threads. */ -typedef enum { - eWRKTHRD_STOPPED = 0, /* worker thread is not running (either actually never ran or was shut down) */ - eWRKTHRD_TERMINATING = 1,/* worker thread has shut down, but some finalzing is still needed */ - /* ALL active states MUST be numerically higher than eWRKTHRD_TERMINATED and NONE must be lower! */ - eWRKTHRD_RUN_CREATED = 2,/* worker thread has been created, but not yet begun initialization (prob. not yet scheduled) */ - eWRKTHRD_RUN_INIT = 3, /* worker thread is initializing, but not yet fully running */ - eWRKTHRD_RUNNING = 4, /* worker thread is up and running and shall continue to do so */ - eWRKTHRD_SHUTDOWN = 5, /* worker thread is running but shall terminate when wtp is empty */ - eWRKTHRD_SHUTDOWN_IMMEDIATE = 6/* worker thread is running but shall terminate even if wtp is full */ - /* SHUTDOWN_IMMEDIATE MUST alsways be the numerically highest state! */ -} qWrkCmd_t; +/* states for worker threads. 
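
wtpAdviseMaxWorkers() above clamps the advised count to the configured maximum, starts whatever workers are missing, and otherwise just signals pcondBusy so an already-running worker picks up the load. A compact sketch of that decision with the thread start stubbed out; names and structure are illustrative only:

    #include <stdio.h>

    struct pool {
        int max_workers;     /* configured upper bound */
        int cur_workers;     /* currently running */
    };

    static void start_worker(struct pool *p) { ++p->cur_workers; printf("started worker #%d\n", p->cur_workers); }
    static void signal_busy(struct pool *p)  { (void)p; printf("signalled busy condition to existing workers\n"); }

    /* bring the pool up to (at most) n_advised workers */
    static void advise_max_workers(struct pool *p, int n_advised)
    {
        if(n_advised == 0)
            return;
        if(n_advised > p->max_workers)        /* never exceed the configured maximum */
            n_advised = p->max_workers;

        int missing = n_advised - p->cur_workers;
        if(missing > 0) {
            for(int i = 0; i < missing; ++i)
                start_worker(p);
        } else {
            signal_busy(p);                   /* enough workers exist - just wake one */
        }
    }

    int main(void)
    {
        struct pool p = { .max_workers = 3, .cur_workers = 1 };
        advise_max_workers(&p, 5);   /* clamped to 3: starts two more */
        advise_max_workers(&p, 2);   /* nothing missing: signals busy */
        return 0;
    }
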
*/ +#define WRKTHRD_STOPPED FALSE +#define WRKTHRD_RUNNING TRUE /* possible states of a worker thread pool */ @@ -50,36 +41,36 @@ typedef enum { /* the worker thread pool (wtp) object */ -typedef struct wtp_s { +struct wtp_s { BEGINobjInstance; wtpState_t wtpState; int iNumWorkerThreads;/* number of worker threads to use */ int iCurNumWrkThrd;/* current number of active worker threads */ struct wti_s **pWrkr;/* array with control structure for the worker thread(s) associated with this wtp */ int toWrkShutdown; /* timeout for idle workers in ms, -1 means indefinite (0 is immediate) */ - bool bInactivityGuard;/* prevents inactivity due to race condition */ rsRetVal (*pConsumer)(void *); /* user-supplied consumer function for dewtpd messages */ /* synchronization variables */ - pthread_mutex_t mutThrdShutdwn; /* mutex to guard thread shutdown processing */ - pthread_mutex_t mut; /* mutex for the wtp's thread management */ + pthread_mutex_t mutWtp; /* mutex for the wtp's thread management */ pthread_cond_t condThrdTrm;/* signalled when threads terminate */ - int bThrdStateChanged; /* at least one thread state has changed if 1 */ /* end sync variables */ /* user objects */ - void *pUsr; /* pointer to user object */ + void *pUsr; /* pointer to user object (in this case, the queue the wtp belongs to) */ + pthread_attr_t attrThrd;/* attribute for new threads (created just once and cached here) */ pthread_mutex_t *pmutUsr; pthread_cond_t *pcondBusy; /* condition the user will signal "busy again, keep runing" on (awakes worker) */ rsRetVal (*pfChkStopWrkr)(void *pUsr, int); + rsRetVal (*pfGetDeqBatchSize)(void *pUsr, int*); /* obtains max dequeue count from queue config */ + rsRetVal (*pfObjProcessed)(void *pUsr, wti_t *pWti); /* indicate user object is processed */ rsRetVal (*pfRateLimiter)(void *pUsr); - rsRetVal (*pfIsIdle)(void *pUsr, int); - rsRetVal (*pfDoWork)(void *pUsr, void *pWti, int); + rsRetVal (*pfIsIdle)(void *pUsr, wtp_t *pWtp); + rsRetVal (*pfDoWork)(void *pUsr, void *pWti); rsRetVal (*pfOnIdle)(void *pUsr, int); rsRetVal (*pfOnWorkerCancel)(void *pUsr, void*pWti); rsRetVal (*pfOnWorkerStartup)(void *pUsr); rsRetVal (*pfOnWorkerShutdown)(void *pUsr); /* end user objects */ uchar *pszDbgHdr; /* header string for debug messages */ -} wtp_t; +}; /* some symbolic constants for easier reference */ @@ -90,21 +81,19 @@ rsRetVal wtpConstructFinalize(wtp_t *pThis); rsRetVal wtpDestruct(wtp_t **ppThis); rsRetVal wtpAdviseMaxWorkers(wtp_t *pThis, int nMaxWrkr); rsRetVal wtpProcessThrdChanges(wtp_t *pThis); -rsRetVal wtpSetInactivityGuard(wtp_t *pThis, int bNewState, int bLockMutex); -rsRetVal wtpChkStopWrkr(wtp_t *pThis, int bLockMutex, int bLockUsrMutex); +rsRetVal wtpChkStopWrkr(wtp_t *pThis, int bLockUsrMutex); rsRetVal wtpSetState(wtp_t *pThis, wtpState_t iNewState); -rsRetVal wtpWakeupWrkr(wtp_t *pThis); rsRetVal wtpWakeupAllWrkr(wtp_t *pThis); rsRetVal wtpCancelAll(wtp_t *pThis); rsRetVal wtpSetDbgHdr(wtp_t *pThis, uchar *pszMsg, size_t lenMsg); -rsRetVal wtpSignalWrkrTermination(wtp_t *pWtp); rsRetVal wtpShutdownAll(wtp_t *pThis, wtpState_t tShutdownCmd, struct timespec *ptTimeout); -int wtpGetCurNumWrkr(wtp_t *pThis, int bLockMutex); PROTOTYPEObjClassInit(wtp); PROTOTYPEpropSetMethFP(wtp, pfChkStopWrkr, rsRetVal(*pVal)(void*, int)); PROTOTYPEpropSetMethFP(wtp, pfRateLimiter, rsRetVal(*pVal)(void*)); -PROTOTYPEpropSetMethFP(wtp, pfIsIdle, rsRetVal(*pVal)(void*, int)); -PROTOTYPEpropSetMethFP(wtp, pfDoWork, rsRetVal(*pVal)(void*, void*, int)); +PROTOTYPEpropSetMethFP(wtp, 
pfGetDeqBatchSize, rsRetVal(*pVal)(void*, int*)); +PROTOTYPEpropSetMethFP(wtp, pfIsIdle, rsRetVal(*pVal)(void*, wtp_t*)); +PROTOTYPEpropSetMethFP(wtp, pfDoWork, rsRetVal(*pVal)(void*, void*)); +PROTOTYPEpropSetMethFP(wtp, pfObjProcessed, rsRetVal(*pVal)(void*, wti_t*)); PROTOTYPEpropSetMethFP(wtp, pfOnIdle, rsRetVal(*pVal)(void*, int)); PROTOTYPEpropSetMethFP(wtp, pfOnWorkerCancel, rsRetVal(*pVal)(void*,void*)); PROTOTYPEpropSetMethFP(wtp, pfOnWorkerStartup, rsRetVal(*pVal)(void*)); diff --git a/tcps_sess.c b/tcps_sess.c index 8d307380..09861ab9 100644 --- a/tcps_sess.c +++ b/tcps_sess.c @@ -256,6 +256,7 @@ defaultDoSubmitMessage(tcps_sess_t *pThis, struct syslogTime *stTime, time_t ttG CHKiRet(MsgSetRcvFromIP(pMsg, pThis->fromHostIP)); MsgSetRuleset(pMsg, pThis->pLstnInfo->pRuleset); +dbgprintf("YYY: submitting msg to queue\n"); if(pMultiSub == NULL) { CHKiRet(submitMsg(pMsg)); } else { @@ -165,9 +165,9 @@ TCPSessTblInit(tcpsrv_t *pThis) ISOBJ_TYPE_assert(pThis, tcpsrv); assert(pThis->pSessions == NULL); - dbgprintf("Allocating buffer for %d TCP sessions.\n", pThis->iSessMax); + DBGPRINTF("Allocating buffer for %d TCP sessions.\n", pThis->iSessMax); if((pThis->pSessions = (tcps_sess_t **) calloc(pThis->iSessMax, sizeof(tcps_sess_t *))) == NULL) { - dbgprintf("Error: TCPSessInit() could not alloc memory for TCP session table.\n"); + DBGPRINTF("Error: TCPSessInit() could not alloc memory for TCP session table.\n"); ABORT_FINALIZE(RS_RET_OUT_OF_MEMORY); } @@ -412,7 +412,7 @@ SessAccept(tcpsrv_t *pThis, tcpLstnPortList_t *pLstnInfo, tcps_sess_t **ppSess, * rgerhards, 2005-09-26 */ if(!pThis->pIsPermittedHost((struct sockaddr*) addr, (char*) fromHostFQDN, pThis->pUsr, pSess->pUsr)) { - dbgprintf("%s is not an allowed sender\n", fromHostFQDN); + DBGPRINTF("%s is not an allowed sender\n", fromHostFQDN); if(glbl.GetOption_DisallowWarning()) { errno = 0; errmsg.LogError(0, RS_RET_HOST_NOT_PERMITTED, "TCP message from disallowed sender %s discarded", fromHostFQDN); @@ -465,6 +465,61 @@ RunCancelCleanup(void *arg) } +/* process a receive request on one of the streams + * rgerhards, 2009-07-020 + */ +static rsRetVal +doReceive(tcpsrv_t *pThis, tcps_sess_t **ppSess) +{ + char buf[128*1024]; /* reception buffer - may hold a partial or multiple messages */ + ssize_t iRcvd; + DEFiRet; + + ISOBJ_TYPE_assert(pThis, tcpsrv); + DBGPRINTF("netstream %p with new data\n", (*ppSess)->pStrm); + + /* Receive message */ + iRet = pThis->pRcvData(*ppSess, buf, sizeof(buf), &iRcvd); + switch(iRet) { + case RS_RET_CLOSED: + if(pThis->bEmitMsgOnClose) { + uchar *pszPeer; + int lenPeer; + errno = 0; + prop.GetString((*ppSess)->fromHostIP, &pszPeer, &lenPeer); + errmsg.LogError(0, RS_RET_PEER_CLOSED_CONN, "Netstream session %p closed by remote peer %s.\n", + (*ppSess)->pStrm, pszPeer); + } + pThis->pOnRegularClose(*ppSess); + tcps_sess.Destruct(ppSess); + break; + case RS_RET_RETRY: + /* we simply ignore retry - this is not an error, but we also have not received anything */ + break; + case RS_RET_OK: + /* valid data received, process it! */ + if(tcps_sess.DataRcvd(*ppSess, buf, iRcvd) != RS_RET_OK) { + /* in this case, something went awfully wrong. + * We are instructed to terminate the session. 
+ */ + errmsg.LogError(0, NO_ERRCODE, "Tearing down TCP Session - see " + "previous messages for reason(s)\n"); + pThis->pOnErrClose(*ppSess); + tcps_sess.Destruct(ppSess); + } + break; + default: + errno = 0; + errmsg.LogError(0, iRet, "netstream session %p will be closed due to error\n", + (*ppSess)->pStrm); + pThis->pOnErrClose(*ppSess); + tcps_sess.Destruct(ppSess); + break; + } + RETiRet; +} + + /* This function is called to gather input. */ #pragma GCC diagnostic ignored "-Wempty-body" static rsRetVal @@ -477,13 +532,12 @@ Run(tcpsrv_t *pThis) int bIsReady; tcps_sess_t *pNewSess; nssel_t *pSel; - ssize_t iRcvd; ISOBJ_TYPE_assert(pThis, tcpsrv); /* this is an endless loop - it is terminated by the framework canelling * this thread. Thus, we also need to instantiate a cancel cleanup handler - * to prevent us from leaking anything. -- rgerharsd, 20080-04-24 + * to prevent us from leaking anything. -- rgerhards, 20080-04-24 */ pthread_cleanup_push(RunCancelCleanup, (void*) &pSel); while(1) { @@ -507,11 +561,13 @@ Run(tcpsrv_t *pThis) /* wait for io to become ready */ CHKiRet(nssel.Wait(pSel, &nfds)); + if(glbl.GetGlobalInputTermState() == 1) + break; /* terminate input! */ for(i = 0 ; i < pThis->iLstnCurr ; ++i) { CHKiRet(nssel.IsReady(pSel, pThis->ppLstn[i], NSDSEL_RD, &bIsReady, &nfds)); if(bIsReady) { - dbgprintf("New connect on NSD %p.\n", pThis->ppLstn[i]); + DBGPRINTF("New connect on NSD %p.\n", pThis->ppLstn[i]); SessAccept(pThis, pThis->ppLstnPort[i], &pNewSess, pThis->ppLstn[i]); --nfds; /* indicate we have processed one */ } @@ -522,47 +578,7 @@ Run(tcpsrv_t *pThis) while(nfds && iTCPSess != -1) { CHKiRet(nssel.IsReady(pSel, pThis->pSessions[iTCPSess]->pStrm, NSDSEL_RD, &bIsReady, &nfds)); if(bIsReady) { - char buf[128*1024]; /* reception buffer - may hold a partial or multiple messages */ - dbgprintf("netstream %p with new data\n", pThis->pSessions[iTCPSess]->pStrm); - - /* Receive message */ - iRet = pThis->pRcvData(pThis->pSessions[iTCPSess], buf, sizeof(buf), &iRcvd); - switch(iRet) { - case RS_RET_CLOSED: - if(pThis->bEmitMsgOnClose) { - uchar *pszPeer; - int lenPeer; - errno = 0; - prop.GetString(pThis->pSessions[iTCPSess]->fromHostIP, &pszPeer, &lenPeer); - errmsg.LogError(0, RS_RET_PEER_CLOSED_CONN, "Netstream session %p closed by remote peer %s.\n", - pThis->pSessions[iTCPSess]->pStrm, pszPeer); - } - pThis->pOnRegularClose(pThis->pSessions[iTCPSess]); - tcps_sess.Destruct(&pThis->pSessions[iTCPSess]); - break; - case RS_RET_RETRY: - /* we simply ignore retry - this is not an error, but we also have not received anything */ - break; - case RS_RET_OK: - /* valid data received, process it! */ - if(tcps_sess.DataRcvd(pThis->pSessions[iTCPSess], buf, iRcvd) != RS_RET_OK) { - /* in this case, something went awfully wrong. - * We are instructed to terminate the session. 
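
The new doReceive() helper above concentrates the per-status handling that used to be inlined in Run(): peer closed, retry (nothing received, not an error), data received, and everything else treated as a fatal session error. A standalone sketch of the same dispatch shape, with the session handling reduced to stubs:

    #include <stdio.h>

    enum rcv_status { RCV_OK, RCV_CLOSED, RCV_RETRY, RCV_ERR };

    struct sess { const char *peer; };

    static void close_regular(struct sess *s) { printf("regular close of %s\n", s->peer); }
    static void close_error(struct sess *s)   { printf("error close of %s\n", s->peer); }
    static int  process_data(struct sess *s, const char *buf, int len)
    {
        (void)buf;
        printf("%s: %d bytes received\n", s->peer, len);
        return 0;                     /* non-zero would mean "tear the session down" */
    }

    static void do_receive(struct sess *s, enum rcv_status st, const char *buf, int len)
    {
        switch(st) {
        case RCV_CLOSED:              /* remote peer closed the connection */
            close_regular(s);
            break;
        case RCV_RETRY:               /* nothing received, try again later - not an error */
            break;
        case RCV_OK:
            if(process_data(s, buf, len) != 0)
                close_error(s);       /* processing failed: terminate the session */
            break;
        default:                      /* any other status is treated as fatal */
            close_error(s);
            break;
        }
    }

    int main(void)
    {
        struct sess s = { "192.0.2.1" };
        do_receive(&s, RCV_OK, "msg", 3);
        do_receive(&s, RCV_CLOSED, NULL, 0);
        return 0;
    }
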
- */ - errmsg.LogError(0, NO_ERRCODE, "Tearing down TCP Session %d - see " - "previous messages for reason(s)\n", iTCPSess); - pThis->pOnErrClose(pThis->pSessions[iTCPSess]); - tcps_sess.Destruct(&pThis->pSessions[iTCPSess]); - } - break; - default: - errno = 0; - errmsg.LogError(0, iRet, "netstream session %p will be closed due to error\n", - pThis->pSessions[iTCPSess]->pStrm); - pThis->pOnErrClose(pThis->pSessions[iTCPSess]); - tcps_sess.Destruct(&pThis->pSessions[iTCPSess]); - break; - } + doReceive(pThis, &pThis->pSessions[iTCPSess]); --nfds; /* indicate we have processed one */ } iTCPSess = TCPSessGetNxtSess(pThis, iTCPSess); @@ -578,7 +594,7 @@ finalize_it: /* this is a very special case - this time only we do not exit the } /* note that this point is usually not reached */ - pthread_cleanup_pop(0); /* remove cleanup handler */ + pthread_cleanup_pop(1); /* remove cleanup handler */ RETiRet; } @@ -566,8 +566,11 @@ static int do_Parameter(unsigned char **pp, struct template *pTpl) /* got the name */ cstrFinalize(pStrB); - if(propNameToID(pStrB, &pTpe->data.field.propid) != RS_RET_OK) + if(propNameToID(pStrB, &pTpe->data.field.propid) != RS_RET_OK) { + cstrDestruct(&pStrB); return 1; + } + cstrDestruct(&pStrB); /* Check frompos, if it has an R, then topos should be a regex */ if(*p == ':') { diff --git a/tests/Makefile.am b/tests/Makefile.am index c5deaa47..b175b414 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -2,8 +2,12 @@ if ENABLE_TESTBENCH TESTRUNS = rt_init rscript check_PROGRAMS = $(TESTRUNS) ourtail nettester tcpflood chkseq TESTS = $(TESTRUNS) cfg.sh \ + arrayqueue.sh \ + linkedlistqueue.sh \ + da-mainmsg-q.sh \ validation-run.sh \ imtcp-multiport.sh \ + daqueue-persist.sh \ diskqueue.sh \ diskqueue-fsync.sh \ manytcp.sh \ @@ -17,6 +21,7 @@ TESTS += omod-if-array.sh \ inputname.sh \ threadingmq.sh \ threadingmqaq.sh \ + discard.sh \ fieldtest.sh endif @@ -73,6 +78,7 @@ EXTRA_DIST= 1.rstest 2.rstest 3.rstest err1.rstest \ testsuites/1.parse1 \ testsuites/2.parse1 \ testsuites/3.parse1 \ + testsuites/4.parse1 \ testsuites/oversizeTag-1.parse1 \ testsuites/date1.parse1 \ testsuites/date2.parse1 \ @@ -84,6 +90,7 @@ EXTRA_DIST= 1.rstest 2.rstest 3.rstest err1.rstest \ testsuites/rfc5424-2.parse1 \ testsuites/rfc5424-3.parse1 \ testsuites/rfc5424-4.parse1 \ + testsuites/malformed1.parse1 \ testsuites/omod-if-array.conf \ testsuites/1.omod-if-array \ testsuites/1.field1 \ @@ -92,6 +99,12 @@ EXTRA_DIST= 1.rstest 2.rstest 3.rstest err1.rstest \ fieldtest.sh \ diskqueue.sh \ testsuites/diskqueue.conf \ + arrayqueue.sh \ + testsuites/arrayqueue.conf \ + linkedlistqueue.sh \ + testsuites/linkedlistqueue.conf \ + da-mainmsg-q.sh \ + testsuites/da-mainmsg-q.conf \ diskqueue-fsync.sh \ testsuites/diskqueue-fsync.conf \ imtcp-multiport.sh \ @@ -104,8 +117,12 @@ EXTRA_DIST= 1.rstest 2.rstest 3.rstest err1.rstest \ testsuites/1.inputname_imtcp_12515 \ testsuites/1.inputname_imtcp_12516 \ omod-if-array.sh \ + discard.sh \ + testsuites/discard.conf \ diag.sh \ testsuites/diag-common.conf \ + daqueue-persist.sh \ + daqueue-persist-drvr.sh \ queue-persist.sh \ queue-persist-drvr.sh \ testsuites/queue-persist.conf \ diff --git a/tests/arrayqueue.sh b/tests/arrayqueue.sh new file mode 100755 index 00000000..58fd24ae --- /dev/null +++ b/tests/arrayqueue.sh @@ -0,0 +1,17 @@ +# Test for fixedArray queue mode +# added 2009-05-20 by rgerhards +# This file is part of the rsyslog project, released under GPLv3 +echo testing queue fixedArray queue mode +source $srcdir/diag.sh init +source 
$srcdir/diag.sh startup arrayqueue.conf + +# 40000 messages should be enough +source $srcdir/diag.sh injectmsg 0 40000 + +# terminate *now* (don't wait for queue to drain!) +kill `cat rsyslog.pid` + +# now wait until rsyslog.pid is gone (and the process finished) +source $srcdir/diag.sh wait-shutdown +source $srcdir/diag.sh seq-check 0 39999 +source $srcdir/diag.sh exit diff --git a/tests/da-mainmsg-q.sh b/tests/da-mainmsg-q.sh new file mode 100755 index 00000000..d502fca3 --- /dev/null +++ b/tests/da-mainmsg-q.sh @@ -0,0 +1,31 @@ +# Test for DA mode on the main message queue +# This test checks if DA mode operates correctly. To do so, +# it uses a small in-memory queue size, so that DA mode is initiated +# rather soon, and disk spooling used. There is some uncertainty (based +# on machine speeds), but in general the test should work rather well. +# We add a few messages after the initial run, just so that we can +# check everything recovers from DA mode correctly. +# added 2009-04-22 by Rgerhards +# This file is part of the rsyslog project, released under GPLv3 +echo "[da-mainmsg-q.sh]: testing main message queue in DA mode (going to disk)" +source $srcdir/diag.sh init +source $srcdir/diag.sh startup da-mainmsg-q.conf + +# part1: send first 50 messages (in memory, only) +#source $srcdir/diag.sh tcpflood 127.0.0.1 13514 1 50 +source $srcdir/diag.sh injectmsg 0 50 +source $srcdir/diag.sh wait-queueempty # let queue drain for this test case + +# part 2: send bunch of messages. This should trigger DA mode +#source $srcdir/diag.sh injectmsg 50 20000 +source $srcdir/diag.sh injectmsg 50 2000 +ls -l test-spool # for manual review + +# send another handful +source $srcdir/diag.sh injectmsg 2050 50 +#sleep 1 # we need this so that rsyslogd can receive all outstanding messages + +# clean up and check test result +source $srcdir/diag.sh shutdown-when-empty # shut down rsyslogd when done processing messages +source $srcdir/diag.sh seq-check 2099 +source $srcdir/diag.sh exit diff --git a/tests/daqueue-persist-drvr.sh b/tests/daqueue-persist-drvr.sh new file mode 100755 index 00000000..d95991fc --- /dev/null +++ b/tests/daqueue-persist-drvr.sh @@ -0,0 +1,31 @@ +# Test for queue data persisting at shutdown. The +# plan is to start an instance, emit some data, do a relatively +# fast shutdown and then re-start the engine to process the +# remaining data. +# added 2009-05-27 by Rgerhards +# This file is part of the rsyslog project, released under GPLv3 +# uncomment for debugging support: +echo testing memory daqueue persisting to disk, mode $1 +source $srcdir/diag.sh init + +# prepare config +echo \$MainMsgQueueType $1 > work-queuemode.conf +echo "*.* :omtesting:sleep 0 1000" > work-delay.conf + +# inject 10000 msgs, so that DO hit the high watermark +source $srcdir/diag.sh startup queue-persist.conf +source $srcdir/diag.sh injectmsg 0 10000 +$srcdir/diag.sh shutdown-immediate +$srcdir/diag.sh wait-shutdown +source $srcdir/diag.sh check-mainq-spool + +#exit + +# restart engine and have rest processed +#remove delay +echo "#" > work-delay.conf +source $srcdir/diag.sh startup queue-persist.conf +source $srcdir/diag.sh shutdown-when-empty # shut down rsyslogd when done processing messages +$srcdir/diag.sh wait-shutdown +source $srcdir/diag.sh seq-check 0 4999 +source $srcdir/diag.sh exit diff --git a/tests/daqueue-persist.sh b/tests/daqueue-persist.sh new file mode 100755 index 00000000..ff81c987 --- /dev/null +++ b/tests/daqueue-persist.sh @@ -0,0 +1,12 @@ +# Test for queue data persisting at shutdown. 
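
Several of the new tests finish with a seq-check step: the output file must contain every message number of the injected range, in order and without gaps (the testbench's chkseq helper, listed in check_PROGRAMS above, does this job). A minimal standalone checker in the same spirit, assuming one decimal number per line in ascending order:

    #include <stdio.h>
    #include <stdlib.h>

    /* verify that file contains the numbers start..end, one per line, in order */
    int main(int argc, char *argv[])
    {
        if(argc != 4) {
            fprintf(stderr, "usage: %s file start end\n", argv[0]);
            return 2;
        }
        FILE *fp = fopen(argv[1], "r");
        if(fp == NULL) { perror("fopen"); return 2; }

        long expected = atol(argv[2]);
        long end = atol(argv[3]);
        long val;

        while(fscanf(fp, "%ld", &val) == 1) {
            if(val != expected) {
                fprintf(stderr, "sequence error: expected %ld, got %ld\n", expected, val);
                fclose(fp);
                return 1;
            }
            ++expected;
        }
        fclose(fp);

        if(expected != end + 1) {
            fprintf(stderr, "file ended early: next expected number was %ld, wanted up to %ld\n",
                    expected, end);
            return 1;
        }
        printf("sequence %s..%s complete\n", argv[2], argv[3]);
        return 0;
    }
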
We use the actual driver +# to carry out multiple tests with different queue modes +# added 2009-05-27 by Rgerhards +# This file is part of the rsyslog project, released under GPLv3 +echo TEST: daqueue-persist.sh +source $srcdir/daqueue-persist-drvr.sh LinkedList +source $srcdir/daqueue-persist-drvr.sh FixedArray +# the disk test should not fail, however, the config is extreme and using +# it more or less is a config error +source $srcdir/daqueue-persist-drvr.sh Disk +# we do not test Direct mode because this absolute can not work in direct mode +# (maybe we should do a fail-type of test?) diff --git a/tests/diag.sh b/tests/diag.sh index 13bb877d..d8ba43b8 100755 --- a/tests/diag.sh +++ b/tests/diag.sh @@ -9,7 +9,7 @@ #valgrind="valgrind --tool=drd --log-fd=1" #valgrind="valgrind --tool=helgrind --log-fd=1" #set -o xtrace -#export RSYSLOG_DEBUG="debug nostdout printmutexaction" +#export RSYSLOG_DEBUG="debug nostdout noprintmutexaction" #export RSYSLOG_DEBUGLOG="log" case $1 in 'init') $srcdir/killrsyslog.sh # kill rsyslogd if it runs for some reason @@ -39,6 +39,12 @@ case $1 in while test -f rsyslog.pid; do true done + if [ -e core.* ] + then + echo "ABORT! core file exists, starting interactive shell" + bash + exit 1 + fi ;; 'wait-queueempty') # wait for main message queue to be empty echo WaitMainQueueEmpty | java -classpath $abs_top_builddir DiagTalker diff --git a/tests/discard.sh b/tests/discard.sh new file mode 100755 index 00000000..0fafc7d9 --- /dev/null +++ b/tests/discard.sh @@ -0,0 +1,16 @@ +# Test for discard functionality +# This test checks if discard works. It is not a perfect test but +# will find at least segfaults and obviously not discarded messages. +# added 2009-07-30 by Rgerhards +# This file is part of the rsyslog project, released under GPLv3 +# uncomment for debugging support: +echo TEST discard.sh: testing discard functionality +source $srcdir/diag.sh init +source $srcdir/diag.sh startup discard.conf +# 20000 messages should be enough - the disk test is slow enough ;) +sleep 4 +source $srcdir/diag.sh tcpflood 127.0.0.1 13514 1 10 1 +source $srcdir/diag.sh shutdown-when-empty # shut down rsyslogd when done processing messages +source $srcdir/diag.sh wait-shutdown +source $srcdir/diag.sh seq-check 10 -s2 +source $srcdir/diag.sh exit diff --git a/tests/linkedlistqueue.sh b/tests/linkedlistqueue.sh new file mode 100755 index 00000000..72c2a403 --- /dev/null +++ b/tests/linkedlistqueue.sh @@ -0,0 +1,17 @@ +# Test for Linkedlist queue mode +# added 2009-05-20 by rgerhards +# This file is part of the rsyslog project, released under GPLv3 +echo testing queue Linkedlist queue mode +source $srcdir/diag.sh init +source $srcdir/diag.sh startup linkedlistqueue.conf + +# 40000 messages should be enough +source $srcdir/diag.sh injectmsg 0 40000 + +# terminate *now* (don't wait for queue to drain) +kill `cat rsyslog.pid` + +# now wait until rsyslog.pid is gone (and the process finished) +source $srcdir/diag.sh wait-shutdown +source $srcdir/diag.sh seq-check 0 39999 +source $srcdir/diag.sh exit diff --git a/tests/nettester.c b/tests/nettester.c index 209c2a6f..2838b919 100644 --- a/tests/nettester.c +++ b/tests/nettester.c @@ -47,6 +47,7 @@ #include <signal.h> #include <netinet/in.h> #include <getopt.h> +#include <errno.h> #define EXIT_FAILURE 1 #define INVALID_SOCKET -1 @@ -90,6 +91,7 @@ void readLine(int fd, char *ln) if(verbose) fprintf(stderr, "begin readLine\n"); lenRead = read(fd, &c, 1); + while(lenRead == 1 && c != '\n') { if(c == '\0') { *ln = c; @@ -102,6 +104,11 @@ 
void readLine(int fd, char *ln) } *ln = '\0'; + if(lenRead < 0) { + printf("read from rsyslogd returned with error '%s' - aborting test\n", strerror(errno)); + exit(1); + } + if(verbose) fprintf(stderr, "end readLine, val read '%s'\n", orig); } @@ -147,7 +154,7 @@ tcpSend(char *buf, int lenBuf) fprintf(stderr, "connect() failed\n"); return(1); } else { - usleep(100000); /* 0.1 sec, these are us! */ + usleep(100000); /* ms = 1000 us! */ } } } @@ -308,6 +315,10 @@ processTestFile(int fd, char *pszFileName) /* pull response from server and then check if it meets our expectation */ readLine(fd, buf); + if(strlen(buf) == 0) { + printf("something went wrong - read a zero-length string from rsyslogd"); + exit(1); + } if(strcmp(expected, buf)) { ++iFailed; printf("\nExpected Response:\n'%s'\nActual Response:\n'%s'\n", @@ -372,11 +383,24 @@ doTests(int fd, char *files) return(iFailed); } + +/* indicate that our child has died (where it is not permitted to!). + */ +void childDied(__attribute__((unused)) int sig) +{ + printf("ERROR: child died unexpectedly (maybe a segfault?)!\n"); + exit(1); +} + + /* cleanup */ void doAtExit(void) { int status; + /* disarm died-child handler */ + signal(SIGCHLD, SIG_IGN); + if(rsyslogdPid != 0) { kill(rsyslogdPid, SIGTERM); waitpid(rsyslogdPid, &status, 0); /* wait until instance terminates */ @@ -457,6 +481,9 @@ int main(int argc, char *argv[]) } fclose(fp); + /* arm died-child handler */ + signal(SIGCHLD, childDied); + /* start to be tested rsyslogd */ openPipe(testSuite, &rsyslogdPid, &fd); readLine(fd, buf); @@ -467,5 +494,6 @@ int main(int argc, char *argv[]) ret = 1; if(verbose) printf("End of nettester run (%d).\n", ret); + exit(ret); } diff --git a/tests/queue-persist.sh b/tests/queue-persist.sh index 999655b1..e05b3da3 100755 --- a/tests/queue-persist.sh +++ b/tests/queue-persist.sh @@ -2,6 +2,7 @@ # to carry out multiple tests with different queue modes # added 2009-05-27 by Rgerhards # This file is part of the rsyslog project, released under GPLv3 +echo TEST: queue-persist.sh source $srcdir/queue-persist-drvr.sh LinkedList source $srcdir/queue-persist-drvr.sh FixedArray # the disk test should not fail, however, the config is extreme and using diff --git a/tests/rt-init.c b/tests/rt-init.c index aaac7ed1..b9c4ce2e 100644 --- a/tests/rt-init.c +++ b/tests/rt-init.c @@ -21,10 +21,9 @@ * * A copy of the GPL can be found in the file "COPYING" in this distribution. */ -#include <stdio.h> - #include "rsyslog.h" #include "testbench.h" +#include <stdio.h> /* must be last, else we get a zlib compile error on some platforms */ MODULE_TYPE_TESTBENCH diff --git a/tests/testsuites/4.parse1 b/tests/testsuites/4.parse1 new file mode 100644 index 00000000..07e2445a --- /dev/null +++ b/tests/testsuites/4.parse1 @@ -0,0 +1,4 @@ +<29>Jul 31 21:39:21 example-b example-gw[10538]: disconnect host=/192.0.2.1 destination=192.0.2.2/11282 in=3274 out=1448 duration=0 +29,daemon,notice,Jul 31 21:39:21,example-b,example-gw,example-gw[10538]:, disconnect host=/192.0.2.1 destination=192.0.2.2/11282 in=3274 out=1448 duration=0 +# yet another real-life sample where we had some issues with - the important +# part is the dash inside the hostname! 
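
The nettester changes above arm a SIGCHLD handler while the rsyslogd under test is running, so a segfaulting child aborts the test immediately, and disarm it again right before the intentional shutdown in doAtExit(). A self-contained sketch of that arm/disarm pattern; the child here is just a placeholder process:

    #include <signal.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static void child_died(int sig)
    {
        (void)sig;
        /* only async-signal-safe calls are allowed here, hence write()/_exit() */
        static const char msg[] = "ERROR: child died unexpectedly!\n";
        write(2, msg, sizeof(msg) - 1);
        _exit(1);
    }

    int main(void)
    {
        pid_t child;

        signal(SIGCHLD, child_died);          /* arm: any child death is fatal from now on */

        child = fork();
        if(child == 0) {                      /* child: stand-in for the daemon under test */
            sleep(10);
            _exit(0);
        }

        /* ... the test would run against the child here; if it crashed,
         * child_died() would fire and abort the test ... */

        signal(SIGCHLD, SIG_DFL);             /* disarm before the intentional shutdown */
        kill(child, SIGTERM);
        waitpid(child, NULL, 0);
        puts("test finished, child shut down intentionally");
        return 0;
    }
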
diff --git a/tests/testsuites/arrayqueue.conf b/tests/testsuites/arrayqueue.conf new file mode 100644 index 00000000..c5874a83 --- /dev/null +++ b/tests/testsuites/arrayqueue.conf @@ -0,0 +1,14 @@ +# Test for queue fixedArray mode (see .sh file for details) +# rgerhards, 2009-04-17 +$IncludeConfig diag-common.conf + +$ModLoad ../plugins/imtcp/.libs/imtcp +$MainMsgQueueTimeoutShutdown 10000 +$InputTCPServerRun 13514 + +# set spool locations and switch queue to disk-only mode +$MainMsgQueueType FixedArray + +$template outfmt,"%msg:F,58:2%\n" +$template dynfile,"rsyslog.out.log" # trick to use relative path names! +:msg, contains, "msgnum:" ?dynfile;outfmt diff --git a/tests/testsuites/da-mainmsg-q.conf b/tests/testsuites/da-mainmsg-q.conf new file mode 100644 index 00000000..843a3e4f --- /dev/null +++ b/tests/testsuites/da-mainmsg-q.conf @@ -0,0 +1,21 @@ +# Test for DA mode in main message queue (see .sh file for details) +# rgerhards, 2009-04-22 +$ModLoad ../plugins/imtcp/.libs/imtcp +$MainMsgQueueTimeoutShutdown 10000 +$InputTCPServerRun 13514 + +$IncludeConfig diag-common.conf + +# set spool locations and switch queue to disk assisted mode +$WorkDirectory test-spool +$MainMsgQueueSize 200 # this *should* trigger moving on to DA mode... +# note: we must set QueueSize sufficiently high, so that 70% (light delay mark) +# is high enough above HighWatermark! +$MainMsgQueueHighWatermark 80 +$MainMsgQueueLowWatermark 40 +$MainMsgQueueFilename mainq +$MainMsgQueueType linkedlist + +$template outfmt,"%msg:F,58:2%\n" +$template dynfile,"rsyslog.out.log" # trick to use relative path names! +:msg, contains, "msgnum:" ?dynfile;outfmt diff --git a/tests/testsuites/discard.conf b/tests/testsuites/discard.conf new file mode 100644 index 00000000..bbe2fe77 --- /dev/null +++ b/tests/testsuites/discard.conf @@ -0,0 +1,13 @@ +# Test for discard functionality +# rgerhards, 2009-07-30 +$IncludeConfig diag-common.conf + +$ModLoad ../plugins/imtcp/.libs/imtcp +$MainMsgQueueTimeoutShutdown 10000 +$InputTCPServerRun 13514 + +:msg, contains, "00000001" ~ + +$template outfmt,"%msg:F,58:2%\n" +$template dynfile,"rsyslog.out.log" # trick to use relative path names! +:msg, contains, "msgnum:" ?dynfile;outfmt diff --git a/tests/testsuites/linkedlistqueue.conf b/tests/testsuites/linkedlistqueue.conf new file mode 100644 index 00000000..92a9649c --- /dev/null +++ b/tests/testsuites/linkedlistqueue.conf @@ -0,0 +1,16 @@ +# Test for queue LinkedList mode (see .sh file for details) +# rgerhards, 2009-04-17 +$IncludeConfig diag-common.conf + +$ModLoad ../plugins/imtcp/.libs/imtcp +$MainMsgQueueTimeoutShutdown 10000 +$InputTCPServerRun 13514 + +$ErrorMessagesToStderr off + +# set spool locations and switch queue to disk-only mode +$MainMsgQueueType LinkedList + +$template outfmt,"%msg:F,58:2%\n" +$template dynfile,"rsyslog.out.log" # trick to use relative path names! +:msg, contains, "msgnum:" ?dynfile;outfmt diff --git a/tests/testsuites/malformed1.parse1 b/tests/testsuites/malformed1.parse1 new file mode 100644 index 00000000..2d95170d --- /dev/null +++ b/tests/testsuites/malformed1.parse1 @@ -0,0 +1,5 @@ +<131>Oct 8 23:05:06 10.321.1.123 05",result_code=200,b +131,local0,err,Oct 8 23:05:06,10.321.1.123,05",result_code=200,b,05",result_code=200,b +# a somewhat mangeld-with real-life sample of a malformed message +# the key here is not what is being parsed, but that we do not abort! 
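
da-mainmsg-q.conf above relies on the high/low watermark behaviour of the main queue: once the in-memory queue fills up to the high watermark, disk-assisted (DA) mode starts spooling, and it is left again when the queue drains below the low watermark. A deliberately simplified sketch of that switching logic (the real queue additionally has delay marks, DA worker startup and so on, none of which is modelled here):

    #include <stdbool.h>
    #include <stdio.h>

    struct queue {
        int size, max_size;
        int high_wm, low_wm;    /* watermarks, in absolute entries */
        bool da_mode;           /* disk-assisted mode active? */
    };

    static void check_watermarks(struct queue *q)
    {
        if(!q->da_mode && q->size >= q->high_wm) {
            q->da_mode = true;                 /* start spooling to disk */
            printf("size %d >= high watermark %d: entering DA mode\n", q->size, q->high_wm);
        } else if(q->da_mode && q->size <= q->low_wm) {
            q->da_mode = false;                /* memory queue drained enough */
            printf("size %d <= low watermark %d: leaving DA mode\n", q->size, q->low_wm);
        }
    }

    int main(void)
    {
        struct queue q = { .size = 0, .max_size = 200, .high_wm = 80, .low_wm = 40, .da_mode = false };
        int sizes[] = { 50, 85, 60, 35 };      /* simulated fill levels over time */
        for(int i = 0; i < 4; ++i) {
            q.size = sizes[i];
            check_watermarks(&q);
        }
        return 0;
    }
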
+# NOTE: if a parser enhancement breaks the format, this is probably OK @@ -51,8 +51,7 @@ static rsRetVal thrdConstruct(thrdInfo_t **ppThis) assert(ppThis != NULL); - if((pThis = calloc(1, sizeof(thrdInfo_t))) == NULL) - return RS_RET_OUT_OF_MEMORY; + CHKmalloc(pThis = calloc(1, sizeof(thrdInfo_t))); /* OK, we got the element, now initialize members that should * not be zero-filled. @@ -61,6 +60,8 @@ static rsRetVal thrdConstruct(thrdInfo_t **ppThis) pthread_mutex_init (pThis->mutTermOK, NULL); *ppThis = pThis; + +finalize_it: RETiRet; } @@ -91,8 +92,20 @@ rsRetVal thrdTerminate(thrdInfo_t *pThis) DEFiRet; assert(pThis != NULL); - pthread_cancel(pThis->thrdID); - pthread_join(pThis->thrdID, NULL); /* wait for cancel to complete */ +#if 0 // TODO: somehow does not really work yet! + if(pThis->bNeedsCancel) { +#endif + DBGPRINTF("request term via canceling for input thread 0x%x\n", (unsigned) pThis->thrdID); + pthread_cancel(pThis->thrdID); +#if 0 // TODO: somehow does not really work yet! + if(pThis->bNeedsCancel) { + } else { + + DBGPRINTF("request term via SIGTTIN for input thread 0x%x\n", (unsigned) pThis->thrdID); + pthread_kill(pThis->thrdID, SIGTTIN); + } +#endif + pthread_join(pThis->thrdID, NULL); /* wait for input thread to complete */ pThis->bIsActive = 0; /* call cleanup function, if any */ @@ -132,6 +145,11 @@ static void* thrdStarter(void *arg) sigfillset(&sigSet); pthread_sigmask(SIG_BLOCK, &sigSet, NULL); + /* but ignore SIGTTN, which we (ab)use to signal the thread to shutdown -- rgerhards, 2009-07-20 */ + sigemptyset(&sigSet); + sigaddset(&sigSet, SIGTTIN); + pthread_sigmask(SIG_UNBLOCK, &sigSet, NULL); + /* setup complete, we are now ready to execute the user code. We will not * regain control until the user code is finished, in which case we terminate * the thread. @@ -147,7 +165,7 @@ static void* thrdStarter(void *arg) * executing threads. It is added at the end of the list. * rgerhards, 2007-12-14 */ -rsRetVal thrdCreate(rsRetVal (*thrdMain)(thrdInfo_t*), rsRetVal(*afterRun)(thrdInfo_t *)) +rsRetVal thrdCreate(rsRetVal (*thrdMain)(thrdInfo_t*), rsRetVal(*afterRun)(thrdInfo_t *), bool bNeedsCancel) { DEFiRet; thrdInfo_t *pThis; @@ -159,6 +177,7 @@ rsRetVal thrdCreate(rsRetVal (*thrdMain)(thrdInfo_t*), rsRetVal(*afterRun)(thrdI pThis->bIsActive = 1; pThis->pUsrThrdMain = thrdMain; pThis->pAfterRun = afterRun; + pThis->bNeedsCancel = bNeedsCancel; i = pthread_create(&pThis->thrdID, NULL, thrdStarter, pThis); CHKiRet(llAppend(&llThrds, NULL, pThis)); @@ -31,6 +31,7 @@ struct thrdInfo { rsRetVal (*pUsrThrdMain)(struct thrdInfo*); /* user thread main to be called in new thread */ rsRetVal (*pAfterRun)(struct thrdInfo*); /* cleanup function */ pthread_t thrdID; + bool bNeedsCancel; /* must input be terminated by pthread_cancel()? */ }; /* prototypes */ @@ -38,7 +39,7 @@ rsRetVal thrdExit(void); rsRetVal thrdInit(void); rsRetVal thrdTerminate(thrdInfo_t *pThis); rsRetVal thrdTerminateAll(void); -rsRetVal thrdCreate(rsRetVal (*thrdMain)(thrdInfo_t*), rsRetVal(*afterRun)(thrdInfo_t *)); +rsRetVal thrdCreate(rsRetVal (*thrdMain)(thrdInfo_t*), rsRetVal(*afterRun)(thrdInfo_t *), bool); rsRetVal thrdSleep(thrdInfo_t *pThis, int iSeconds, int iuSeconds); /* macros (replace inline functions) */ diff --git a/tools/omfwd.c b/tools/omfwd.c index fe65f515..38d9cfa6 100644 --- a/tools/omfwd.c +++ b/tools/omfwd.c @@ -10,7 +10,7 @@ * of the "old" message code without any modifications. However, it * helps to have things at the right place one we go to the meat of it. 
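
The threads.c changes above prepare a second way to stop an input thread: the starter unblocks SIGTTIN inside the thread, so termination can interrupt a blocking system call with pthread_kill() instead of pthread_cancel() for inputs that support it. A runnable sketch of the mechanism; the no-op handler is installed without SA_RESTART so the blocking read() returns EINTR and the loop can re-check its terminate flag:

    #include <errno.h>
    #include <pthread.h>
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static volatile sig_atomic_t terminate = 0;

    static void wakeup_handler(int sig) { (void)sig; }   /* no-op: only purpose is EINTR */

    static void *input_thread(void *arg)
    {
        (void)arg;
        char buf[256];

        /* block everything except SIGTTIN, which we (ab)use as the wakeup signal */
        sigset_t set;
        sigfillset(&set);
        sigdelset(&set, SIGTTIN);
        pthread_sigmask(SIG_SETMASK, &set, NULL);

        while(!terminate) {
            ssize_t n = read(STDIN_FILENO, buf, sizeof(buf));   /* stand-in for the input source */
            if(n < 0 && errno == EINTR)
                continue;              /* interrupted: loop re-checks the terminate flag */
            if(n <= 0)
                break;
        }
        puts("input thread exiting cleanly");
        return NULL;
    }

    int main(void)
    {
        struct sigaction sa;
        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = wakeup_handler;      /* deliberately no SA_RESTART */
        sigaction(SIGTTIN, &sa, NULL);

        pthread_t thrd;
        pthread_create(&thrd, NULL, input_thread, NULL);
        sleep(1);

        terminate = 1;
        pthread_kill(thrd, SIGTTIN);         /* interrupt the blocking read() */
        pthread_join(thrd, NULL);
        return 0;
    }
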
* - * Copyright 2007 Rainer Gerhards and Adiscon GmbH. + * Copyright 2007, 2009 Rainer Gerhards and Adiscon GmbH. * * This file is part of rsyslog. * diff --git a/tools/syslogd.c b/tools/syslogd.c index fb3294d5..165c751b 100644 --- a/tools/syslogd.c +++ b/tools/syslogd.c @@ -16,12 +16,9 @@ * parts of the code have been rewritten. * * This Project was intiated and is maintained by - * Rainer Gerhards <rgerhards@hq.adiscon.com>. See - * AUTHORS to learn who helped make it become a reality. + * Rainer Gerhards <rgerhards@hq.adiscon.com>. * - * If you have questions about rsyslogd in general, please email - * info@adiscon.com. To learn more about rsyslogd, please visit - * http://www.rsyslog.com. + * For further information, please see http://www.rsyslog.com * * \author Rainer Gerhards <rgerhards@adiscon.com> * \date 2003-10-17 @@ -129,12 +126,15 @@ #include "omfile.h" #include "omdiscard.h" #include "threads.h" +#include "wti.h" #include "queue.h" #include "stream.h" #include "conf.h" #include "errmsg.h" #include "datetime.h" #include "parser.h" +//#include "sysvar.h" +#include "batch.h" #include "unicode-helper.h" #include "ruleset.h" #include "rule.h" @@ -293,6 +293,7 @@ static int iMainMsgQtoWrkShutdown = 60000; /* timeout for worker thread shutdo static int iMainMsgQWrkMinMsgs = 100; /* minimum messages per worker needed to start a new one */ static int iMainMsgQDeqSlowdown = 0; /* dequeue slowdown (simple rate limiting) */ static int64 iMainMsgQueMaxDiskSpace = 0; /* max disk space allocated 0 ==> unlimited */ +static int iMainMsgQueDeqBatchSize = 32; /* dequeue batch size */ static int bMainMsgQSaveOnShutdown = 1; /* save queue on shutdown (when DA enabled)? */ static int iMainMsgQueueDeqtWinFromHr = 0; /* hour begin of time frame when queue is to be dequeued */ static int iMainMsgQueueDeqtWinToHr = 25; /* hour begin of time frame when queue is to be dequeued */ @@ -359,6 +360,7 @@ static rsRetVal resetConfigVariables(uchar __attribute__((unused)) *pp, void __a bMainMsgQSaveOnShutdown = 1; MainMsgQueType = QUEUETYPE_FIXED_ARRAY; iMainMsgQueMaxDiskSpace = 0; + iMainMsgQueDeqBatchSize = 32; glbliActionResumeRetryCount = 0; return RS_RET_OK; @@ -421,7 +423,7 @@ diagGetMainMsgQSize(int *piSize) /* rgerhards, 2005-10-24: crunch_list is called only during option processing. So - * it is never called once rsyslogd is running (not even when HUPed). This code + * it is never called once rsyslogd is running. This code * contains some exits, but they are considered safe because they only happen * during startup. Anyhow, when we review the code here, we might want to * reconsider the exit()s. @@ -921,22 +923,27 @@ finalize_it: /* The consumer of dequeued messages. This function is called by the * queue engine on dequeueing of a message. It runs on a SEPARATE - * THREAD. - * Please note: the message object is destructed by the queue itself! + * THREAD. It receives an array of pointers, which it must iterate + * over. We do not do any further batching, as this is of no benefit + * for the main queue. 
*/ static rsRetVal -msgConsumer(void __attribute__((unused)) *notNeeded, void *pUsr) +msgConsumer(void __attribute__((unused)) *notNeeded, batch_t *pBatch) { + int i; + msg_t *pMsg; DEFiRet; - msg_t *pMsg = (msg_t*) pUsr; - assert(pMsg != NULL); + assert(pBatch != NULL); - if((pMsg->msgFlags & NEEDS_PARSING) != 0) { - parseMsg(pMsg); + for(i = 0 ; i < pBatch->nElem ; i++) { + pMsg = (msg_t*) pBatch->pElem[i].pUsrp; + DBGPRINTF("msgConsumer processes msg %d/%d\n", i, pBatch->nElem); + if((pMsg->msgFlags & NEEDS_PARSING) != 0) { + parseMsg(pMsg); + } + ruleset.ProcessMsg(pMsg); } - ruleset.ProcessMsg(pMsg); - msgDestruct(&pMsg); RETiRet; } @@ -1194,8 +1201,6 @@ int parseLegacySyslogMsg(msg_t *pMsg, int flags) assert(pMsg != NULL); assert(pMsg->pszRawMsg != NULL); lenMsg = pMsg->iLenRawMsg - (pMsg->offAfterPRI + 1); -RUNLOG_VAR("%d", pMsg->offAfterPRI); -RUNLOG_VAR("%d", lenMsg); p2parse = pMsg->pszRawMsg + pMsg->offAfterPRI; /* point to start of text, after PRI */ /* Check to see if msg contains a timestamp. We start by assuming @@ -1251,16 +1256,16 @@ RUNLOG_VAR("%d", lenMsg); bTAGCharDetected = 0; if(lenMsg > 0 && flags & PARSE_HOSTNAME) { i = 0; - while(lenMsg > 0 && (isalnum(p2parse[i]) || p2parse[i] == '.' || p2parse[i] == '.' + while(i < lenMsg && (isalnum(p2parse[i]) || p2parse[i] == '.' || p2parse[i] == '.' || p2parse[i] == '_' || p2parse[i] == '-') && i < CONF_TAG_MAXSIZE) { bufParseHOSTNAME[i] = p2parse[i]; ++i; - --lenMsg; } if(i > 0 && p2parse[i] == ' ' && isalnum(p2parse[i-1])) { /* we got a hostname! */ p2parse += i + 1; /* "eat" it (including SP delimiter) */ + lenMsg -= i + 1; bufParseHOSTNAME[i] = '\0'; MsgSetHOSTNAME(pMsg, bufParseHOSTNAME, i); } @@ -1627,6 +1632,7 @@ static void doDie(int sig) # define MSG1 "DoDie called.\n" # define MSG2 "DoDie called 5 times - unconditional exit\n" static int iRetries = 0; /* debug aid */ + dbgprintf(MSG1); if(Debug) write(1, MSG1, sizeof(MSG1) - 1); if(iRetries++ == 4) { @@ -1693,6 +1699,7 @@ die(int sig) /* close the inputs */ DBGPRINTF("Terminating input threads...\n"); + glbl.SetGlobalInputTermination(); thrdTerminateAll(); /* and THEN send the termination log message (see long comment above) */ @@ -1725,8 +1732,6 @@ die(int sig) */ tplDeleteAll(); - remove_pid(PidFile); - /* de-init some modules */ modExitIminternal(); @@ -1748,16 +1753,6 @@ die(int sig) /* terminate the remaining classes */ GlobalClassExit(); - /* TODO: this would also be the right place to de-init the builtin output modules. We - * do not currently do that, because the module interface does not allow for - * it. This will come some time later (it's essential with loadable modules). - * For the time being, this is a memory leak on exit, but as the process is - * terminated, we do not really bother about it. - * rgerhards, 2007-08-03 - * I have added some code now, but all that mod init/de-init should be moved to - * init, so that modules are unloaded and reloaded on HUP to. Eventually it should go - * into destructAllActions() - but that needs to be seen. -- rgerhards, 2007-08-09 - */ module.UnloadAndDestructAll(eMOD_LINK_ALL); DBGPRINTF("Clean shutdown completed, bye\n"); @@ -1770,6 +1765,9 @@ die(int sig) */ freeAllDynMemForTermination(); /* NO CODE HERE - feeelAllDynMemForTermination() must be the last thing before exit()! 
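
msgConsumer() above now receives a whole dequeued batch and iterates over its element array; destruction of the messages is left to the queue. A minimal sketch of such a batch-shaped consumer; the struct layout mirrors only the fields visible in the hunk (nElem, pElem[].pUsrp) and is otherwise made up:

    #include <stdio.h>

    struct batch_elem { void *pUsrp; };                 /* one dequeued user object */
    struct batch      { int nElem; struct batch_elem *pElem; };

    struct msg { int needs_parsing; const char *text; };

    static void parse_msg(struct msg *m)   { m->needs_parsing = 0; }
    static void process_msg(struct msg *m) { printf("processing: %s\n", m->text); }

    /* consumer callback: walk the batch, parse where needed, then run the rules */
    static int msg_consumer(struct batch *b)
    {
        for(int i = 0; i < b->nElem; ++i) {
            struct msg *m = (struct msg*) b->pElem[i].pUsrp;
            if(m->needs_parsing)
                parse_msg(m);
            process_msg(m);
            /* note: no destruction here - the queue frees batch elements itself */
        }
        return 0;
    }

    int main(void)
    {
        struct msg m1 = { 1, "msgnum:00000001" }, m2 = { 0, "msgnum:00000002" };
        struct batch_elem elems[] = { { &m1 }, { &m2 } };
        struct batch b = { 2, elems };
        return msg_consumer(&b);
    }
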
*/ + + remove_pid(PidFile); + exit(0); /* "good" exit, this is the terminator function for rsyslog [die()] */ } @@ -2122,6 +2120,7 @@ static rsRetVal runInputModules(void) { modInfo_t *pMod; + int bNeedsCancel; BEGINfunc /* loop through all modules and activate them (brr...) */ @@ -2129,7 +2128,9 @@ runInputModules(void) while(pMod != NULL) { if(pMod->mod.im.bCanRun) { /* activate here */ - thrdCreate(pMod->mod.im.runInput, pMod->mod.im.afterRun); + bNeedsCancel = (pMod->isCompatibleWithFeature(sFEATURENonCancelInputTermination) == RS_RET_OK) ? + 0 : 1; + thrdCreate(pMod->mod.im.runInput, pMod->mod.im.afterRun, bNeedsCancel); } pMod = module.GetNxtType(pMod, eMOD_IN); } @@ -2167,13 +2168,12 @@ startInputModules(void) } -/* INIT -- Initialize syslogd from configuration table - * init() is called at initial startup AND each time syslogd is HUPed +/* INIT -- Initialize syslogd * Note that if iConfigVerify is set, only the config file is verified but nothing * else happens. -- rgerhards, 2008-07-28 */ static rsRetVal -init() +init(void) { rsRetVal localRet; int iNbrActions; @@ -2184,8 +2184,6 @@ init() struct sigaction sigAct; DEFiRet; - thrdTerminateAll(); /* stop all running input threads - TODO: reconsider location! */ - /* initialize some static variables */ pDfltHostnameCmp = NULL; pDfltProgNameCmp = NULL; @@ -2193,37 +2191,6 @@ init() DBGPRINTF("rsyslog %s - called init()\n", VERSION); - /* delete the message queue, which also flushes all messages left over */ - if(pMsgQueue != NULL) { - DBGPRINTF("deleting main message queue\n"); - qqueueDestruct(&pMsgQueue); /* delete pThis here! */ - pMsgQueue = NULL; - } - - /* Close all open log files and free log descriptor array. This also frees - * all output-modules instance data. - */ - destructAllActions(); - - /* Unload all non-static modules */ - DBGPRINTF("Unloading non-static modules.\n"); - module.UnloadAndDestructAll(eMOD_LINK_DYNAMIC_LOADED); - - DBGPRINTF("Clearing templates.\n"); - tplDeleteNew(); - - /* re-setting values to defaults (where applicable) */ - /* once we have loadable modules, we must re-visit this code. The reason is - * that config variables are not re-set, because the module is not yet loaded. On - * the other hand, that doesn't matter, because the module got unloaded and is then - * re-loaded, so the variables should be re-set via that way. And this is exactly how - * it works. Loadable module's variables are initialized on load, the rest here. - * rgerhards, 2008-04-28 - */ - conf.cfsysline((uchar*)"ResetConfigVariables"); - - conf.ReInitConf(); - /* construct the default ruleset */ ruleset.Construct(&pRuleset); ruleset.SetName(pRuleset, UCHAR_CONSTANT("RSYSLOG_DefaultRuleset")); @@ -2343,6 +2310,7 @@ init() setQPROP(qqueueSetMaxFileSize, "$MainMsgQueueFileSize", iMainMsgQueMaxFileSize); setQPROP(qqueueSetsizeOnDiskMax, "$MainMsgQueueMaxDiskSpace", iMainMsgQueMaxDiskSpace); + setQPROP(qqueueSetiDeqBatchSize, "$MainMsgQueueDequeueBatchSize", iMainMsgQueDeqBatchSize); setQPROPstr(qqueueSetFilePrefix, "$MainMsgQueueFileName", pszMainMsgQFName); setQPROP(qqueueSetiPersistUpdCnt, "$MainMsgQueueCheckpointInterval", iMainMsgQPersistUpdCnt); setQPROP(qqueueSetbSyncQueueFiles, "$MainMsgQueueSyncQueueFiles", bMainMsgQSyncQeueFiles); @@ -2387,22 +2355,22 @@ init() dbgPrintInitInfo(); } + memset(&sigAct, 0, sizeof (sigAct)); + sigemptyset(&sigAct.sa_mask); + sigAct.sa_handler = sighup_handler; + sigaction(SIGHUP, &sigAct, NULL); + + DBGPRINTF(" started.\n"); + /* we now generate the startup message. 
It now includes everything to * identify this instance. -- rgerhards, 2005-08-17 */ snprintf(bufStartUpMsg, sizeof(bufStartUpMsg)/sizeof(char), " [origin software=\"rsyslogd\" " "swVersion=\"" VERSION \ - "\" x-pid=\"%d\" x-info=\"http://www.rsyslog.com\"] (re)start", + "\" x-pid=\"%d\" x-info=\"http://www.rsyslog.com\"] start", (int) myPid); logmsgInternal(NO_ERRCODE, LOG_SYSLOG|LOG_INFO, (uchar*)bufStartUpMsg, 0); - memset(&sigAct, 0, sizeof (sigAct)); - sigemptyset(&sigAct.sa_mask); - sigAct.sa_handler = sighup_handler; - sigaction(SIGHUP, &sigAct, NULL); - - DBGPRINTF(" (re)started.\n"); - finalize_it: RETiRet; } @@ -2501,6 +2469,9 @@ void sighup_handler() sigaction(SIGHUP, &sigAct, NULL); } +void sigttin_handler() +{ +} /* this function pulls all internal messages from the buffer * and puts them into the processing engine. @@ -2545,19 +2516,11 @@ doHUP(void) snprintf(buf, sizeof(buf) / sizeof(char), " [origin software=\"rsyslogd\" " "swVersion=\"" VERSION - "\" x-pid=\"%d\" x-info=\"http://www.rsyslog.com\"] rsyslogd was HUPed, type '%s'.", - (int) myPid, glbl.GetHUPisRestart() ? "restart" : "lightweight"); + "\" x-pid=\"%d\" x-info=\"http://www.rsyslog.com\"] rsyslogd was HUPed", + (int) myPid); errno = 0; - logmsgInternal(NO_ERRCODE, LOG_SYSLOG|LOG_INFO, (uchar*)buf, 0); - if(glbl.GetHUPisRestart()) { - DBGPRINTF("Received SIGHUP, configured to be restart, reloading rsyslogd.\n"); - init(); /* main queue is stopped as part of init() */ - runInputModules(); - } else { - DBGPRINTF("Received SIGHUP, configured to be a non-restart type of HUP - notifying actions.\n"); - ruleset.IterateAllActions(doHUPActions, NULL); - } + ruleset.IterateAllActions(doHUPActions, NULL); } @@ -2689,6 +2652,7 @@ static rsRetVal loadBuildInModules(void) CHKiRet(regCfSysLineHdlr((uchar *)"mainmsgqueuedequeueslowdown", 0, eCmdHdlrInt, NULL, &iMainMsgQDeqSlowdown, NULL)); CHKiRet(regCfSysLineHdlr((uchar *)"mainmsgqueueworkerthreadminimummessages", 0, eCmdHdlrInt, NULL, &iMainMsgQWrkMinMsgs, NULL)); CHKiRet(regCfSysLineHdlr((uchar *)"mainmsgqueuemaxfilesize", 0, eCmdHdlrSize, NULL, &iMainMsgQueMaxFileSize, NULL)); + CHKiRet(regCfSysLineHdlr((uchar *)"mainmsgqueuedequeuebatchsize", 0, eCmdHdlrSize, NULL, &iMainMsgQueDeqBatchSize, NULL)); CHKiRet(regCfSysLineHdlr((uchar *)"mainmsgqueuemaxdiskspace", 0, eCmdHdlrSize, NULL, &iMainMsgQueMaxDiskSpace, NULL)); CHKiRet(regCfSysLineHdlr((uchar *)"mainmsgqueuesaveonshutdown", 0, eCmdHdlrBinary, NULL, &bMainMsgQSaveOnShutdown, NULL)); CHKiRet(regCfSysLineHdlr((uchar *)"mainmsgqueuedequeuetimebegin", 0, eCmdHdlrInt, NULL, &iMainMsgQueueDeqtWinFromHr, NULL)); @@ -2832,20 +2796,16 @@ static rsRetVal mainThread() */ if(gidDropPriv != 0) { doDropPrivGid(gidDropPriv); - glbl.SetHUPisRestart(0); /* we can not do restart-type HUPs with dropped privs */ } if(uidDropPriv != 0) { doDropPrivUid(uidDropPriv); - glbl.SetHUPisRestart(0); /* we can not do restart-type HUPs with dropped privs */ } /* finally let the inputs run... */ runInputModules(); /* END OF INTIALIZATION - * ... but keep in mind that we might do a restart and thus init() might - * be called again. -- rgerhards, 2005-10-24 */ DBGPRINTF("initialization completed, transitioning to regular run mode\n"); @@ -3119,6 +3079,8 @@ doGlblProcessInit(void) sigaction(SIGCHLD, &sigAct, NULL); sigAct.sa_handler = Debug ? 
debug_switch : SIG_IGN; sigaction(SIGUSR1, &sigAct, NULL); + sigAct.sa_handler = sigttin_handler; + sigaction(SIGTTIN, &sigAct, NULL); /* (ab)used to interrupt input threads */ sigAct.sa_handler = SIG_IGN; sigaction(SIGPIPE, &sigAct, NULL); sigaction(SIGXFSZ, &sigAct, NULL); /* do not abort if 2gig file limit is hit */ |
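
The HUP path above is reduced to the lightweight variant (notify the actions, no restart), and doGlblProcessInit() installs an empty sigttin_handler only so that SIGTTIN can interrupt input threads. Both follow the same rule: keep the signal handler minimal and do the real work in the main loop. A small sketch of that split for SIGHUP, with the action notification stubbed out:

    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static volatile sig_atomic_t hup_seen = 0;

    static void sighup_handler(int sig) { (void)sig; hup_seen = 1; }   /* just record it */

    static void do_hup_actions(void)
    {
        /* stand-in for "iterate all actions": reopen log files, etc. */
        puts("HUP: notifying actions (lightweight, no restart)");
    }

    int main(void)
    {
        struct sigaction sa;
        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = sighup_handler;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGHUP, &sa, NULL);

        for(int i = 0; i < 3; ++i) {          /* stand-in for the daemon main loop */
            if(hup_seen) {
                hup_seen = 0;
                do_hup_actions();             /* heavy work runs here, not in the handler */
            }
            sleep(1);
        }
        return 0;
    }
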