Merge branch 'v5-devel'

Conflicts: ChangeLog runtime/datetime.c
author: Rainer Gerhards <rgerhards@adiscon.com> 2009-07-03 12:05:50 +0200
committer: Rainer Gerhards <rgerhards@adiscon.com> 2009-07-03 12:05:50 +0200
commit: 22328c72aef2df73741afefe0e8b53d3ad83f15d (patch)
tree: 4a0b311508ebb28a0dc72e12993f704c2709d46a
parent: da933a7e105acf814d5e7955d39d29eab3a96613 (diff)
parent: 7c0ca7553738161c681c0d0600de99e9fabee81d (diff)
download: rsyslog-22328c72aef2df73741afefe0e8b53d3ad83f15d.tar.gz
rsyslog-22328c72aef2df73741afefe0e8b53d3ad83f15d.tar.xz
rsyslog-22328c72aef2df73741afefe0e8b53d3ad83f15d.zip
52 files changed, 3661 insertions, 858 deletions
diff --git a/ChangeLog b/ChangeLog
index 2e5e3296..456d1ad6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,48 @@
 ---------------------------------------------------------------------------
+Version 5.1.1  [DEVEL] (rgerhards), 2009-06-??
+- bugfix: huge memory leak in queue engine (made rsyslogd unusable in
+  production). Occured if at least one queue was in direct mode 
+  (the default for action queues)
+- imported many performance optimizations from v4-devel (4.5.0)
+- bugfix: subtle (and usually irrelevant) issue in timout processing
+  timeout could be one second too early if nanoseconds wrapped
+- set a more sensible timeout for shutdow, now 1.5 seconds to complete
+  processing (this also removes those cases where the shutdown message
+  was not written because the termination happened before it)
+---------------------------------------------------------------------------
+Version 5.1.0  [DEVEL] (rgerhards), 2009-05-29
+
+*********************************** NOTE **********************************
+The v5 versions of rsyslog feature a greatly redesigned queue engine. The
+major theme for the v5 release is twofold:
+
+a) greatly improved performance
+b) enable audit-grade processing
+
+Here, audit-grade processing means that rsyslog, if used together with
+audit-grade transports and configured correctly, will never lose messages
+that already have been acknowledged, not even in fatal failure cases like
+sudden loss of power.
+
+Note that large parts of rsyslog's important core components have been
+restructured to support these design goals. As such, early versions of
+the engine will probably be less stable than the v3/v4 engine.
+
+Also note that the initial versions do not cover all and everything. As
+usual, the code will evolve toward the final goal as version numbers
+increase.
+*********************************** NOTE **********************************
+
+- redesigned queue engine so that it supports ultra-reliable operations
+  This resulted in a rewrite of large parts. The new capability can be
+  used to build audit-grade systems on the basis of rsyslog.
+- added $MainMsgQueueDequeueBatchSize and $ActionQueueDequeueBatchSize 
+  configuration directives
+- implemented a new transactional output module interface which provides
+  superior performance (for databases potentially far superior performance)
+- increased ompgsql performance by adapting to new transactional
+  output module interface
+---------------------------------------------------------------------------
 Version 4.5.1  [DEVEL] (rgerhards), 2009-07-??
 - added $klogConsoleLogLevel directive which permits to set a new
   console log level while rsyslog is active
diff --git a/action.c b/action.c
index f21feea6..352f7f9c 100644
--- a/action.c
+++ b/action.c
@@ -42,10 +42,14 @@
 #include "cfsysline.h"
 #include "srUtils.h"
 #include "errmsg.h"
+#include "batch.h"
+#include "wti.h"
 #include "datetime.h"
 
+#define NO_TIME_PROVIDED 0 /* indicate we do not provide any cached time */
+
 /* forward definitions */
-rsRetVal actionCallDoAction(action_t *pAction, msg_t *pMsg);
+static rsRetVal processBatchMain(action_t *pAction, batch_t *pBatch);
 
 /* object static data (once for all instances) */
 /* TODO: make this an object! DEFobjStaticHelpers -- rgerhards, 2008-03-05 */
@@ -64,6 +68,7 @@ static uchar *pszActionName;					/* short name for the action */
 /* main message queue and its configuration parameters */
 static queueType_t ActionQueType = QUEUETYPE_DIRECT;		/* type of the main message queue above */
 static int iActionQueueSize = 1000;				/* size of the main message queue above */
+static int iActionQueueDeqBatchSize = 16;			/* batch size for action queues */
 static int iActionQHighWtrMark = 800;				/* high water mark for disk-assisted queues */
 static int iActionQLowWtrMark = 200;				/* low water mark for disk-assisted queues */
 static int iActionQDiscardMark = 9800;				/* begin to discard messages */
@@ -145,6 +150,7 @@ actionResetQueueParams(void)
 
 	ActionQueType = QUEUETYPE_DIRECT;		/* type of the main message queue above */
 	iActionQueueSize = 1000;			/* size of the main message queue above */
+	iActionQueueDeqBatchSize = 16;			/* default batch size */
 	iActionQHighWtrMark = 800;			/* high water mark for disk-assisted queues */
 	iActionQLowWtrMark = 200;			/* low water mark for disk-assisted queues */
 	iActionQDiscardMark = 9800;			/* begin to discard messages */
@@ -282,7 +288,8 @@ actionConstructFinalize(action_t *pThis)
 	 * to be run on multiple threads. So far, this is forbidden by the interface
 	 * spec. -- rgerhards, 2008-01-30
 	 */
-	CHKiRet(qqueueConstruct(&pThis->pQueue, ActionQueType, 1, iActionQueueSize, (rsRetVal (*)(void*,void*))actionCallDoAction));
+	CHKiRet(qqueueConstruct(&pThis->pQueue, ActionQueType, 1, iActionQueueSize,
+					(rsRetVal (*)(void*, batch_t*))processBatchMain));
 	obj.SetName((obj_t*) pThis->pQueue, pszQName);
 
 	/* ... set some properties ... */
@@ -297,6 +304,7 @@ actionConstructFinalize(action_t *pThis)
 
 	qqueueSetpUsr(pThis->pQueue, pThis);
 	setQPROP(qqueueSetsizeOnDiskMax, "$ActionQueueMaxDiskSpace", iActionQueMaxDiskSpace);
+	setQPROP(qqueueSetiDeqBatchSize, "$ActionQueueDequeueBatchSize", iActionQueueDeqBatchSize);
 	setQPROP(qqueueSetMaxFileSize, "$ActionQueueFileSize", iActionQueMaxFileSize);
 	setQPROPstr(qqueueSetFilePrefix, "$ActionQueueFileName", pszActionQFName);
 	setQPROP(qqueueSetiPersistUpdCnt, "$ActionQueueCheckpointInterval", iActionQPersistUpdCnt);
@@ -333,87 +341,239 @@ finalize_it:
 }
 
 
-/* set an action back to active state -- rgerhards, 2007-08-02
+
+/* set the global resume interval
+ */
+rsRetVal actionSetGlobalResumeInterval(int iNewVal)
+{
+	glbliActionResumeInterval = iNewVal;
+	return RS_RET_OK;
+}
+
+
+/* returns the action state name in human-readable form
+ * returned string must not be modified.
+ * rgerhards, 2009-05-07
+ */
+static uchar *getActStateName(action_t *pThis)
+{
+	switch(pThis->eState) {
+		case ACT_STATE_RDY:
+			return (uchar*) "rdy";
+		case ACT_STATE_ITX:
+			return (uchar*) "itx";
+		case ACT_STATE_RTRY:
+			return (uchar*) "rtry";
+		case ACT_STATE_SUSP:
+			return (uchar*) "susp";
+		case ACT_STATE_DIED:
+			return (uchar*) "died";
+		case ACT_STATE_COMM:
+			return (uchar*) "comm";
+		default:
+			return (uchar*) "ERROR/UNKNWON";
+	}
+}
+
+
+/* returns a suitable return code based on action state
+ * rgerhards, 2009-05-07
  */
-static rsRetVal actionResume(action_t *pThis)
+static rsRetVal getReturnCode(action_t *pThis)
 {
 	DEFiRet;
 
 	ASSERT(pThis != NULL);
-	pThis->bSuspended = 0;
+	switch(pThis->eState) {
+		case ACT_STATE_RDY:
+			iRet = RS_RET_OK;
+			break;
+		case ACT_STATE_ITX:
+			if(pThis->bHadAutoCommit) {
+				pThis->bHadAutoCommit = 0; /* auto-reset */
+				iRet = RS_RET_PREVIOUS_COMMITTED;
+			} else {
+				iRet = RS_RET_DEFER_COMMIT;
+			}
+			break;
+		case ACT_STATE_RTRY:
+			iRet = RS_RET_SUSPENDED;
+			break;
+		case ACT_STATE_SUSP:
+		case ACT_STATE_DIED:
+			iRet = RS_RET_ACTION_FAILED;
+			break;
+		default:
+			DBGPRINTF("Invalid action engine state %d, program error\n",
+					(int) pThis->eState);
+			iRet = RS_RET_ERR;
+			break;
+	}
 
 	RETiRet;
 }
 
 
-/* set the global resume interval
+/* set the action to a new state
+ * rgerhards, 2007-08-02
  */
-rsRetVal actionSetGlobalResumeInterval(int iNewVal)
+static inline void actionSetState(action_t *pThis, action_state_t newState)
 {
-	glbliActionResumeInterval = iNewVal;
-	return RS_RET_OK;
+	pThis->eState = newState;
+	DBGPRINTF("Action %p transitioned to state: %s\n", pThis, getActStateName(pThis));
 }
 
+/* Handles the transient commit state. So far, this is
+ * mostly a dummy...
+ * rgerhards, 2007-08-02
+ */
+static void actionCommitted(action_t *pThis)
+{
+	actionSetState(pThis, ACT_STATE_RDY);
+}
 
-/* suspend an action -- rgerhards, 2007-08-02
+
+/* set action to "rtry" state.
+ * rgerhards, 2007-08-02
  */
-static rsRetVal actionSuspend(action_t *pThis, time_t tNow)
+static void actionRetry(action_t *pThis)
 {
+	actionSetState(pThis, ACT_STATE_RTRY);
+}
+
+
+/* Disable action, this means it will never again be usable
+ * until rsyslog is reloaded. Use only as a last resort, but
+ * depends on output module.
+ * rgerhards, 2007-08-02
+ */
+static void actionDisable(action_t *pThis)
+{
+	actionSetState(pThis, ACT_STATE_DIED);
+}
+
+
+/* Suspend action, this involves changing the acton state as well
+ * as setting the next retry time.
+ * if we have more than 10 retries, we prolong the
+ * retry interval. If something is really stalled, it will
+ * get re-tried only very, very seldom - but that saves
+ * CPU time. TODO: maybe a config option for that?
+ * rgerhards, 2007-08-02
+ */
+static inline void actionSuspend(action_t *pThis, time_t ttNow)
+{
+	if(ttNow == NO_TIME_PROVIDED)
+		time(&ttNow);
+	pThis->ttResumeRtry = ttNow + pThis->iResumeInterval * (pThis->iNbrResRtry / 10 + 1);
+	actionSetState(pThis, ACT_STATE_SUSP);
+	DBGPRINTF("earliest retry=%d\n", (int) pThis->ttResumeRtry);
+}
+
+
+/* actually do retry processing. Note that the function receives a timestamp so
+ * that we do not need to call the (expensive) time() API.
+ * Note that we do the full retry processing here, doing the configured number of
+ * iterations.
+ * rgerhards, 2009-05-07
+ */
+static rsRetVal actionDoRetry(action_t *pThis, time_t ttNow)
+{
+	int iRetries;
+	int iSleepPeriod;
 	DEFiRet;
 
 	ASSERT(pThis != NULL);
-	pThis->bSuspended = 1;
-	pThis->ttResumeRtry = tNow + pThis->iResumeInterval;
-	pThis->iNbrResRtry = 0; /* tell that we did not yet retry to resume */
+
+	iRetries = 0;
+	while(pThis->eState == ACT_STATE_RTRY) {
+		iRet = pThis->pMod->tryResume(pThis->pModData);
+		if(iRet == RS_RET_OK) {
+			actionSetState(pThis, ACT_STATE_RDY);
+		} else if(iRet == RS_RET_SUSPENDED) {
+			/* max retries reached? */
+			if((pThis->iResumeRetryCount != -1 && iRetries >= pThis->iResumeRetryCount)) {
+				actionSuspend(pThis, ttNow);
+			} else {
+				++pThis->iNbrResRtry;
+				++iRetries;
+				iSleepPeriod = pThis->iResumeInterval;
+				ttNow += iSleepPeriod; /* not truly exact, but sufficiently... */
+				srSleep(iSleepPeriod, 0);
+			}
+		} else if(iRet == RS_RET_DISABLE_ACTION) {
+			actionDisable(pThis);
+		}
+	}
+
+	if(pThis->eState == ACT_STATE_RDY) {
+		pThis->iNbrResRtry = 0;
+	}
 
 	RETiRet;
 }
 
 
 /* try to resume an action -- rgerhards, 2007-08-02
- * returns RS_RET_OK if resumption worked, RS_RET_SUSPEND if the
- * action is still suspended.
+ * changed to new action state engine -- rgerhards, 2009-05-07
  */
 static rsRetVal actionTryResume(action_t *pThis)
 {
 	DEFiRet;
-	time_t ttNow;
+	time_t ttNow = NO_TIME_PROVIDED;
 
 	ASSERT(pThis != NULL);
 
-	/* for resume handling, we must always obtain a fresh timestamp. We used
-	 * to use the action timestamp, but in this case we will never reach a
-	 * point where a resumption is actually tried, because the action timestamp
-	 * is always in the past. So we can not avoid doing a fresh time() call
-	 * here. -- rgerhards, 2009-03-18
-	 */
-	time(&ttNow); /* cache "now" */
-
-	/* first check if it is time for a re-try */
-	if(ttNow > pThis->ttResumeRtry) {
-		iRet = pThis->pMod->tryResume(pThis->pModData);
-		if(iRet == RS_RET_SUSPENDED) {
-			/* set new tryResume time */
-			++pThis->iNbrResRtry;
-			/* if we have more than 10 retries, we prolong the
-			 * retry interval. If something is really stalled, it will
-			 * get re-tried only very, very seldom - but that saves
-			 * CPU time. TODO: maybe a config option for that?
-			 * rgerhards, 2007-08-02
-			 */
-			pThis->ttResumeRtry = ttNow + pThis->iResumeInterval * (pThis->iNbrResRtry / 10 + 1);
+	if(pThis->eState == ACT_STATE_SUSP) {
+		/* if we are suspended, we need to check if the timeout expired.
+		 * for this handling, we must always obtain a fresh timestamp. We used
+		 * to use the action timestamp, but in this case we will never reach a
+		 * point where a resumption is actually tried, because the action timestamp
+		 * is always in the past. So we can not avoid doing a fresh time() call
+		 * here. -- rgerhards, 2009-03-18
+		 */
+		time(&ttNow); /* cache "now" */
+		if(ttNow > pThis->ttResumeRtry) {
+			actionSetState(pThis, ACT_STATE_RTRY); /* back to retries */
 		}
-	} else {
-		/* it's too early, we are still suspended --> indicate this */
-		iRet = RS_RET_SUSPENDED;
 	}
 
-	if(iRet == RS_RET_OK)
-		actionResume(pThis);
+	if(pThis->eState == ACT_STATE_RTRY) {
+		if(ttNow == NO_TIME_PROVIDED) /* use cached result if we have it */
+			time(&ttNow);
+		CHKiRet(actionDoRetry(pThis, ttNow));
+	}
+
+	if(Debug && (pThis->eState == ACT_STATE_RTRY ||pThis->eState == ACT_STATE_SUSP)) {
+		dbgprintf("actionTryResume: action state: %s, next retry (if applicable): %u [now %u]\n",
+			getActStateName(pThis), (unsigned) pThis->ttResumeRtry, (unsigned) ttNow);
+	}
 
-	dbgprintf("actionTryResume: iRet: %d, next retry (if applicable): %u [now %u]\n",
-		iRet, (unsigned) pThis->ttResumeRtry, (unsigned) ttNow);
+finalize_it:
+	RETiRet;
+}
 
+
+/* prepare an action for performing work. This involves trying to recover it,
+ * depending on its current state.
+ * rgerhards, 2009-05-07
+ */
+static rsRetVal actionPrepare(action_t *pThis)
+{
+	DEFiRet;
+
+	assert(pThis != NULL);
+	CHKiRet(actionTryResume(pThis));
+
+	/* if we are now ready, we initialize the transaction and advance
+	 * action state accordingly
+	 */
+	if(pThis->eState == ACT_STATE_RDY) {
+		CHKiRet(pThis->pMod->mod.om.beginTransaction(pThis->pModData));
+		actionSetState(pThis, ACT_STATE_ITX);
+	}
+
+finalize_it:
 	RETiRet;
 }
 
@@ -430,12 +590,11 @@ rsRetVal actionDbgPrint(action_t *pThis)
 	dbgprintf("\n\tInstance data: 0x%lx\n", (unsigned long) pThis->pModData);
 	dbgprintf("\tRepeatedMsgReduction: %d\n", pThis->f_ReduceRepeated);
 	dbgprintf("\tResume Interval: %d\n", pThis->iResumeInterval);
-	dbgprintf("\tSuspended: %d", pThis->bSuspended);
-	if(pThis->bSuspended) {
-		dbgprintf(" next retry: %u, number retries: %d", (unsigned) pThis->ttResumeRtry, pThis->iNbrResRtry);
+	if(pThis->eState == ACT_STATE_SUSP) {
+		dbgprintf("\tresume next retry: %u, number retries: %d",
+			  (unsigned) pThis->ttResumeRtry, pThis->iNbrResRtry);
 	}
-	dbgprintf("\n");
-	dbgprintf("\tDisabled: %d\n", !pThis->bEnabled);
+	dbgprintf("\tState: %s\n", getActStateName(pThis));
 	dbgprintf("\tExec only when previous is suspended: %d\n", pThis->bExecWhenPrevSusp);
 	dbgprintf("\n");
 
@@ -443,33 +602,16 @@ rsRetVal actionDbgPrint(action_t *pThis)
 }
 
 
-/* call the DoAction output plugin entry point
- * rgerhards, 2008-01-28
+/* prepare the calling parameters for doAction()
+ * rgerhards, 2009-05-07
  */
-#pragma GCC diagnostic ignored "-Wempty-body"
-rsRetVal
-actionCallDoAction(action_t *pAction, msg_t *pMsg)
+static rsRetVal prepareDoActionParams(action_t *pAction, msg_t *pMsg)
 {
-	DEFiRet;
-	int iRetries;
 	int i;
-	int iArr;
-	int iSleepPeriod;
-	int bCallAction;
-	int iCancelStateSave;
+	DEFiRet;
 
 	ASSERT(pAction != NULL);
 
-	/* We now must guard the output module against execution by multiple threads. The
-	 * plugin interface specifies that output modules must not be thread-safe (except
-	 * if they notify us they are - functionality not yet implemented...).
-	 * rgerhards, 2008-01-30
-	 */
-	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave);
-	d_pthread_mutex_lock(&pAction->mutActExec);
-	pthread_cleanup_push(mutexCancelCleanup, &pAction->mutActExec);
-	pthread_setcancelstate(iCancelStateSave, NULL);
-
 	/* here we must loop to process all requested strings */
 	for(i = 0 ; i < pAction->iNumTpls ; ++i) {
 		switch(pAction->eParamPassing) {
@@ -483,47 +625,21 @@ actionCallDoAction(action_t *pAction, msg_t *pMsg)
 		}
 	}
 
-	iRetries = 0;
-	do {
-		/* on first invocation, this if should never be true. We just put it at the top
-		 * of the loop so that processing (and code) is simplified. This code is actually
-		 * triggered on the 2nd+ invocation. -- rgerhards, 2008-01-30
-		 */
-		if(iRet == RS_RET_SUSPENDED) {
-			/* ok, this calls for our retry logic... */
-			++iRetries;
-			iSleepPeriod = pAction->iResumeInterval;
-			srSleep(iSleepPeriod, 0);
-		}
-		/* first check if we are suspended and, if so, retry */
-		if(actionIsSuspended(pAction)) {
-			iRet = actionTryResume(pAction);
-			if(iRet == RS_RET_OK)
-				bCallAction = 1;
-			else
-				bCallAction = 0;
-		} else {
-			bCallAction = 1;
-		}
-
-		if(bCallAction) {
-			/* call configured action */
-			iRet = pAction->pMod->mod.om.doAction(pAction->ppMsgs, pMsg->msgFlags, pAction->pModData);
-			if(iRet == RS_RET_SUSPENDED) {
-				dbgprintf("Action requested to be suspended, done that.\n");
-				actionSuspend(pAction, getActNow(pAction));
-			}
-		}
+finalize_it:
+	RETiRet;
+}
 
-	} while(iRet == RS_RET_SUSPENDED && (pAction->iResumeRetryCount == -1 || iRetries < pAction->iResumeRetryCount)); /* do...while! */
 
-	if(iRet == RS_RET_DISABLE_ACTION) {
-		dbgprintf("Action requested to be disabled, done that.\n");
-		pAction->bEnabled = 0; /* that's it... */
-	}
+/* cleanup doAction calling parameters
+ * rgerhards, 2009-05-07
+ */
+static rsRetVal cleanupDoActionParams(action_t *pAction)
+{
+	int i;
+	int iArr;
+	DEFiRet;
 
-finalize_it:
-	/* cleanup */
+	ASSERT(pAction != NULL);
 	for(i = 0 ; i < pAction->iNumTpls ; ++i) {
 		if(pAction->ppMsgs[i] != NULL) {
 			switch(pAction->eParamPassing) {
@@ -544,9 +660,306 @@ finalize_it:
 		}
 	}
 
+	RETiRet;
+}
+
+
+/* call the DoAction output plugin entry point
+ * Performance note: we build the action parameters here in this function. That
+ * means we do it while we hold the action look, potentially reducing concurrency
+ * (especially if the action queue is run in DIRECT mode). As an alternative, we
+ * may generate all params for the batch as whole before aquiring the action. However,
+ * that requires more memory, for large batches potentially a lot of memory. So for the
+ * time being, I am doing it here - the performance hit should be very minor and may even
+ * not be a hit because we may gain CPU cache locality gains with the "fewer memory"
+ * approach (I'd say that is rater likely).
+ * rgerhards, 2008-01-28
+ */
+rsRetVal
+actionCallDoAction(action_t *pThis, msg_t *pMsg)
+{
+	DEFiRet;
+
+	ASSERT(pThis != NULL);
+	ISOBJ_TYPE_assert(pMsg, msg);
+
+	DBGPRINTF("entering actionCalldoAction(), state: %s\n", getActStateName(pThis));
+	CHKiRet(prepareDoActionParams(pThis, pMsg));
+
+	pThis->bHadAutoCommit = 0;
+	iRet = pThis->pMod->mod.om.doAction(pThis->ppMsgs, pMsg->msgFlags, pThis->pModData);
+	switch(iRet) {
+		case RS_RET_OK:
+			actionCommitted(pThis);
+			break;
+		case RS_RET_DEFER_COMMIT:
+			/* we are done, action state remains the same */
+			break;
+		case RS_RET_PREVIOUS_COMMITTED:
+			/* action state remains the same, but we had a commit. */
+			pThis->bHadAutoCommit = 1;
+			break;
+		case RS_RET_SUSPENDED:
+			actionRetry(pThis);
+			break;
+		case RS_RET_DISABLE_ACTION:
+			actionDisable(pThis);
+			break;
+		default:/* permanent failure of this message - no sense in retrying. This is
+			 * not yet handled (but easy TODO)
+			 */
+			FINALIZE;
+	}
+	iRet = getReturnCode(pThis);
+
+finalize_it:
+	cleanupDoActionParams(pThis); /* iRet ignored! */
+
+	RETiRet;
+}
+
+
+/* process a message
+ * this readies the action and then calls doAction()
+ * rgerhards, 2008-01-28
+ */
+rsRetVal
+actionProcessMessage(action_t *pThis, msg_t *pMsg)
+{
+	DEFiRet;
+
+	ASSERT(pThis != NULL);
+	ISOBJ_TYPE_assert(pMsg, msg);
+
+RUNLOG_STR("inside actionProcessMsg()");
+	CHKiRet(actionPrepare(pThis));
+	if(pThis->eState == ACT_STATE_ITX)
+		CHKiRet(actionCallDoAction(pThis, pMsg));
+
+	iRet = getReturnCode(pThis);
+finalize_it:
+	RETiRet;
+}
+
+
+/* finish processing a batch. Most importantly, that means we commit if we 
+ * need to do so.
+ * rgerhards, 2008-01-28
+ */
+static rsRetVal
+finishBatch(action_t *pThis)
+{
+	DEFiRet;
+
+	ASSERT(pThis != NULL);
+
+	if(pThis->eState == ACT_STATE_RDY)
+		FINALIZE; /* nothing to do */
+
+	CHKiRet(actionPrepare(pThis));
+	if(pThis->eState == ACT_STATE_ITX) {
+		iRet = pThis->pMod->mod.om.endTransaction(pThis->pModData);
+		switch(iRet) {
+			case RS_RET_OK:
+				actionCommitted(pThis);
+				break;
+			case RS_RET_SUSPENDED:
+				actionRetry(pThis);
+				break;
+			case RS_RET_DISABLE_ACTION:
+				actionDisable(pThis);
+				break;
+			case RS_RET_DEFER_COMMIT:
+				DBGPRINTF("output plugin error: endTransaction() returns RS_RET_DEFER_COMMIT "
+					  "- ignored\n");
+				actionCommitted(pThis);
+				break;
+			case RS_RET_PREVIOUS_COMMITTED:
+				DBGPRINTF("output plugin error: endTransaction() returns RS_RET_PREVIOUS_COMMITTED "
+					  "- ignored\n");
+				actionCommitted(pThis);
+				break;
+			default:/* permanent failure of this message - no sense in retrying. This is
+				 * not yet handled (but easy TODO)
+				 */
+				FINALIZE;
+		}
+	}
+	iRet = getReturnCode(pThis);
+
+finalize_it:
+	RETiRet;
+}
+
+
+/* try to submit a partial batch of elements.
+ * rgerhards, 2009-05-12
+ */
+static rsRetVal
+tryDoAction(action_t *pAction, batch_t *pBatch, int *pnElem)
+{
+	int i;
+	int iElemProcessed;
+	int iCommittedUpTo;
+	msg_t *pMsg;
+	rsRetVal localRet;
+	DEFiRet;
+
+	assert(pBatch != NULL);
+	assert(pnElem != NULL);
+
+	i = pBatch->iDoneUpTo;	/* all messages below that index are processed */
+	iElemProcessed = 0;
+	iCommittedUpTo = i;
+	while(iElemProcessed <= *pnElem && i < pBatch->nElem) {
+		pMsg = (msg_t*) pBatch->pElem[i].pUsrp;
+	dbgprintf("submitBatch: i:%d, batch size %d, to process %d, pMsg: %p\n", i, pBatch->nElem, *pnElem, pMsg);//remove later!
+		localRet = actionProcessMessage(pAction, pMsg);
+		dbgprintf("action call returned %d\n", localRet);
+		if(localRet == RS_RET_OK) {
+			/* mark messages as committed */
+			while(iCommittedUpTo < i) {
+				pBatch->pElem[iCommittedUpTo++].state = BATCH_STATE_COMM;
+			}
+		} else if(localRet == RS_RET_PREVIOUS_COMMITTED) {
+			/* mark messages as committed */
+			while(iCommittedUpTo < i - 1) {
+				pBatch->pElem[iCommittedUpTo++].state = BATCH_STATE_COMM;
+			}
+			pBatch->pElem[i].state = BATCH_STATE_SUB;
+		} else if(localRet == RS_RET_PREVIOUS_COMMITTED) {
+			pBatch->pElem[i].state = BATCH_STATE_SUB;
+		} else {
+			iRet = localRet;
+			FINALIZE;
+		}
+		++i;
+		++iElemProcessed;
+	}
+
+finalize_it:
+	if(pBatch->iDoneUpTo != iCommittedUpTo) {
+		*pnElem += iCommittedUpTo - pBatch->iDoneUpTo;
+		pBatch->iDoneUpTo = iCommittedUpTo;
+	}
+	RETiRet;
+}
+
+
+/* submit a batch for actual action processing.
+ * The first nElem elements are processed. This function calls itself
+ * recursively if it needs to handle errors.
+ * rgerhards, 2009-05-12
+ */
+static rsRetVal
+submitBatch(action_t *pAction, batch_t *pBatch, int nElem)
+{
+	int i;
+	int bDone;
+	rsRetVal localRet;
+	DEFiRet;
+
+	assert(pBatch != NULL);
+
+	bDone = 0;
+	do {
+		localRet = tryDoAction(pAction, pBatch, &nElem);
+		if(   localRet == RS_RET_OK
+		   || localRet == RS_RET_PREVIOUS_COMMITTED
+		   || localRet == RS_RET_DEFER_COMMIT) {
+			/* try commit transaction, once done, we can simply do so as if
+			 * that return state was returned from tryDoAction().
+			 */
+			localRet = finishBatch(pAction);
+		}
+
+		if(   localRet == RS_RET_OK
+		   || localRet == RS_RET_PREVIOUS_COMMITTED
+		   || localRet == RS_RET_DEFER_COMMIT) {
+			bDone = 1;
+		} else if(localRet == RS_RET_SUSPENDED) {
+			; /* do nothing, this will retry the full batch */
+		} else if(localRet == RS_RET_ACTION_FAILED) {
+			/* in this case, the whole batch can not be processed */
+			for(i = 0 ; i < nElem ; ++i) {
+				pBatch->pElem[++pBatch->iDoneUpTo].state = BATCH_STATE_BAD;
+			}
+			bDone = 1;
+		} else {
+			if(nElem == 1) {
+				pBatch->pElem[++pBatch->iDoneUpTo].state = BATCH_STATE_BAD;
+				bDone = 1;
+			} else {
+				/* retry with half as much. Depth is log_2 batchsize, so recursion is not too deep */
+				submitBatch(pAction, pBatch, nElem / 2);
+				submitBatch(pAction, pBatch, nElem - (nElem / 2));
+				bDone = 1;
+			}
+		}
+	} while(!bDone); /* do .. while()! */
+
+	RETiRet;
+}
+
+
+/* receive a batch and process it. This includes retry handling.
+ * rgerhards, 2009-05-12
+ */
+static rsRetVal
+processAction(action_t *pAction, batch_t *pBatch)
+{
+	int i;
+	msg_t *pMsg;
+	rsRetVal localRet;
+	DEFiRet;
+
+	assert(pBatch != NULL);
+
+	pBatch->iDoneUpTo = 0;
+	/* TODO: think about action batches, must be handled at upper layer!
+	 * MULTIQUEUE
+	 */
+	localRet = submitBatch(pAction, pBatch, pBatch->nElem);
+	CHKiRet(localRet);
+
+	/* this must be moved away - up into the dequeue part of the queue, I guess, but that's for another day */
+	for(i = 0 ; i < pBatch->nElem ; i++) {
+		pMsg = (msg_t*) pBatch->pElem[i].pUsrp;
+	}
+	iRet = finishBatch(pAction);
+
+finalize_it:
+	RETiRet;
+}
+
+
+#pragma GCC diagnostic ignored "-Wempty-body"
+/* receive an array of to-process user pointers and submit them
+ * for processing.
+ * rgerhards, 2009-04-22
+ */
+static rsRetVal
+processBatchMain(action_t *pAction, batch_t *pBatch)
+{
+	int iCancelStateSave;
+	DEFiRet;
+
+	assert(pBatch != NULL);
+
+	/* We now must guard the output module against execution by multiple threads. The
+	 * plugin interface specifies that output modules must not be thread-safe (except
+	 * if they notify us they are - functionality not yet implemented...).
+	 * rgerhards, 2008-01-30
+	 */
+	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave);
+	d_pthread_mutex_lock(&pAction->mutActExec);
+	pthread_cleanup_push(mutexCancelCleanup, &pAction->mutActExec);
+	pthread_setcancelstate(iCancelStateSave, NULL);
+
+	iRet = processAction(pAction, pBatch);
+
 	pthread_cleanup_pop(1); /* unlock mutex */
 
-	msgDestruct(&pMsg); /* we are now finished with the message */
 	RETiRet;
 }
 #pragma GCC diagnostic warning "-Wempty-body"
@@ -756,19 +1169,6 @@ static rsRetVal
 doActionCallAction(action_t *pAction, msg_t *pMsg)
 {
 	DEFiRet;
-	/* first, we need to check if this is a disabled
-	 * entry. If so, we must not further process it.
-	 * rgerhards 2005-09-26
-	 * In the future, disabled modules may be re-probed from time
-	 * to time. They are in a perfectly legal state, except that the
-	 * doAction method indicated that it wanted to be disabled - but
-	 * we do not consider this is a solution for eternity... So we
-	 * should check from time to time if affairs have improved.
-	 * rgerhards, 2007-07-24
-	 */
-	if(pAction->bEnabled == 0) {
-		ABORT_FINALIZE(RS_RET_OK);
-	}
 
 	pAction->tActNow = -1; /* we do not yet know our current time (clear prev. value) */
 
@@ -819,6 +1219,7 @@ finalize_it:
 	RETiRet;
 }
 
+
 /* call the configured action. Does all necessary housekeeping.
  * rgerhards, 2007-08-01
  * FYI: currently, this function is only called from the queue
@@ -836,15 +1237,22 @@ actionCallAction(action_t *pAction, msg_t *pMsg)
 	ISOBJ_TYPE_assert(pMsg, msg);
 	ASSERT(pAction != NULL);
 
-	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave);
-	LockObj(pAction);
-	pthread_cleanup_push(mutexCancelCleanup, pAction->Sync_mut);
-	pthread_setcancelstate(iCancelStateSave, NULL);
-	iRet = doActionCallAction(pAction, pMsg);
-	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave);
-	UnlockObj(pAction);
-	pthread_cleanup_pop(0); /* remove mutex cleanup handler */
-	pthread_setcancelstate(iCancelStateSave, NULL);
+	/* We need to lock the mutex only for repeated line processing. 
+	 * rgerhards, 2009-06-19
+	 */
+	//if(pAction->f_ReduceRepeated == 1) {
+		pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave);
+		LockObj(pAction);
+		pthread_cleanup_push(mutexCancelCleanup, pAction->Sync_mut);
+		pthread_setcancelstate(iCancelStateSave, NULL);
+		iRet = doActionCallAction(pAction, pMsg);
+		pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave);
+		UnlockObj(pAction);
+		pthread_cleanup_pop(0); /* remove mutex cleanup handler */
+		pthread_setcancelstate(iCancelStateSave, NULL);
+	//} else {
+		//iRet = doActionCallAction(pAction, pMsg);
+	//}
 
 	RETiRet;
 }
@@ -862,6 +1270,7 @@ actionAddCfSysLineHdrl(void)
 	CHKiRet(regCfSysLineHdlr((uchar *)"actionname", 0, eCmdHdlrGetWord, NULL, &pszActionName, NULL));
 	CHKiRet(regCfSysLineHdlr((uchar *)"actionqueuefilename", 0, eCmdHdlrGetWord, NULL, &pszActionQFName, NULL));
 	CHKiRet(regCfSysLineHdlr((uchar *)"actionqueuesize", 0, eCmdHdlrInt, NULL, &iActionQueueSize, NULL));
+	CHKiRet(regCfSysLineHdlr((uchar *)"actionqueuedequeuebatchsize", 0, eCmdHdlrInt, NULL, &iActionQueueDeqBatchSize, NULL));
 	CHKiRet(regCfSysLineHdlr((uchar *)"actionqueuemaxdiskspace", 0, eCmdHdlrSize, NULL, &iActionQueMaxDiskSpace, NULL));
 	CHKiRet(regCfSysLineHdlr((uchar *)"actionqueuehighwatermark", 0, eCmdHdlrInt, NULL, &iActionQHighWtrMark, NULL));
 	CHKiRet(regCfSysLineHdlr((uchar *)"actionqueuelowwatermark", 0, eCmdHdlrInt, NULL, &iActionQLowWtrMark, NULL));
@@ -979,7 +1388,7 @@ addAction(action_t **ppAction, modInfo_t *pMod, void *pModData, omodStringReques
 		dbgprintf("module is incompatible with RepeatedMsgReduction - turned off\n");
 		pAction->f_ReduceRepeated = 0;
 	}
-	pAction->bEnabled = 1; /* action is enabled */
+	pAction->eState = ACT_STATE_RDY; /* action is enabled */
 
 	if(bSuspended)
 		actionSuspend(pAction, time(NULL)); /* "good" time call, only during init and unavoidable */
diff --git a/action.h b/action.h
index 579a1215..a29936e5 100644
--- a/action.h
+++ b/action.h
@@ -36,6 +36,15 @@
 extern int glbliActionResumeRetryCount;
 
 
+typedef enum {
+	ACT_STATE_DIED = 0,	/* action permanently failed and now disabled  - MUST BE ZEO! */
+	ACT_STATE_RDY  = 1,	/* action ready, waiting for new transaction */
+	ACT_STATE_ITX  = 2,	/* transaction active, waiting for new data or commit */
+	ACT_STATE_COMM = 3, 	/* transaction finished (a transient state) */
+	ACT_STATE_RTRY = 4,	/* failure occured, trying to restablish ready state */
+	ACT_STATE_SUSP = 5	/* suspended due to failure (return fail until timeout expired) */
+} action_state_t;
+
 /* the following struct defines the action object data structure
  */
 struct action_s {
@@ -45,8 +54,8 @@ struct action_s {
 	time_t	tLastExec;	/* time this action was last executed */
 	bool	bExecWhenPrevSusp;/* execute only when previous action is suspended? */
 	int	iSecsExecOnceInterval; /* if non-zero, minimum seconds to wait until action is executed again */
-	short	bEnabled;	/* is the related action enabled (1) or disabled (0)? */
-	bool	bSuspended;	/* is the related action temporarily suspended? */
+	action_state_t eState;	/* current state of action */
+	int	bHadAutoCommit;	/* did an auto-commit happen during doAction()? */
 	time_t	ttResumeRtry;	/* when is it time to retry the resume? */
 	int	iResumeInterval;/* resume interval for this action */
 	int	iResumeRetryCount;/* how often shall we retry a suspended action? (-1 --> eternal) */
diff --git a/configure.ac b/configure.ac
index 598e4fa3..78556be6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ(2.61)
-AC_INIT([rsyslog],[4.5.0],[rsyslog@lists.adiscon.com])
+AC_INIT([rsyslog],[5.1.1],[rsyslog@lists.adiscon.com])
 AM_INIT_AUTOMAKE
 AC_CONFIG_SRCDIR([ChangeLog])
 AC_CONFIG_MACRO_DIR([m4])
diff --git a/doc/action-call.dot b/doc/action-call.dot
new file mode 100644
index 00000000..86c6834d
--- /dev/null
+++ b/doc/action-call.dot
@@ -0,0 +1,33 @@
+// This file is part of rsyslog.
+// 
+// rsyslog action call state diagram
+//
+// see http://www.graphviz.org for how to obtain the graphviz processor
+// which is used to build the actual graph.
+//
+// generate the graph with
+// $ dot action-call.dot -Tpng >action-call.png
+
+digraph G {
+	label="\n\nrsyslog message states during action processing\nhttp://www.rsyslog.com";
+	//fontsize=20;
+
+	ok	[label="ready for processing" color="green"];
+	mpf	[label="message permanent failure" color="red"];
+	tf	[label="temporary failure"]
+	cPen	[label="commit pending"];
+	com	[label="committed" color="red"];
+
+	tf -> tf	[label="retry fails, i < n"];
+	tf -> mpf	[label="retry fails, i = n"];
+	tf -> ok	[label="retry succeeds"];
+	ok -> com	[label="doAction RS_RET_OK"];
+	ok -> cPen	[label="doAction COMMIT_PENDING"];
+	ok -> tf	[label="doAction RS_RET_SUSPENDED"];
+	ok -> mpf	[label="doAction RS_RET_DISABLED"];
+	cPen -> com	[label="endTransaction RS_RET_OK"];
+	cPen -> tf	[label="endTransaction _SUSPENDED"];
+
+	//{rank=same; tf cPen}
+	{rank=same; com mpf}
+}
diff --git a/doc/action_state.dot b/doc/action_state.dot
new file mode 100644
index 00000000..d56d9da0
--- /dev/null
+++ b/doc/action_state.dot
@@ -0,0 +1,33 @@
+// This file is part of rsyslog.
+// 
+// rsyslog message state diagram
+//
+// see http://www.graphviz.org for how to obtain the graphviz processor
+// which is used to build the actual graph.
+//
+// generate the graph with
+// $ dot file.dot -Tpng >file.png
+
+digraph msgState {
+	compound=true; nodesep=1.0
+	//label="\n\nrsyslog action transaction states\nhttp://www.rsyslog.com";
+	//fontsize=20;
+
+	rdy	[label="ready" group="main"];
+	itx	[label="in Tx" group="main"];
+	comm	[label="commit"]
+	rtry	[label="retry"]
+	susp	[label="suspended"]
+
+	rdy  -> itx	[label="transaction begins"]
+	itx  -> itx	[label="success"]
+	itx  -> comm	[label="commit\n(caller or auto)"]
+	itx  -> rtry	[label="error"]
+	comm -> rdy	[label="success"]
+	comm -> rtry	[label="error"]
+	rtry -> rdy	[label="recovered"]
+	rtry -> susp	[label="could not\nrecover"]
+	susp -> rtry	[label="timeout expired"]
+
+	{rank=same; comm rtry}
+}
diff --git a/doc/batch_state.dot b/doc/batch_state.dot
new file mode 100644
index 00000000..0dd48b47
--- /dev/null
+++ b/doc/batch_state.dot
@@ -0,0 +1,28 @@
+// This file is part of rsyslog.
+// 
+// rsyslog batch state diagram
+//
+// see http://www.graphviz.org for how to obtain the graphviz processor
+// which is used to build the actual graph.
+//
+// generate the graph with
+// $ dot file.dot -Tpng >file.png
+
+digraph msgState {
+	compound=true; nodesep=1.0
+	//label="\n\nrsyslog batch states\nhttp://www.rsyslog.com";
+	rankdir=LR
+
+	rdy	[label="ready"];
+	bad	[label="message-caused\nfailure"];
+	sub	[label="submitted"]
+	disc	[label="discarded" color="red"]
+
+	rdy  -> sub	[label="submitted to action"]
+	rdy  -> bad	[label="permanent fail"]
+	rdy  -> disc	[label="action requests discarding"]
+	sub  -> rdy	[label="next action or\naction-caused failure"]
+	bad  -> rdy	[label="next action"]
+
+	//{rank=same; comm rtry }
+}
diff --git a/doc/design.tex b/doc/design.tex
new file mode 100644
index 00000000..53d25313
--- /dev/null
+++ b/doc/design.tex
@@ -0,0 +1,859 @@
+\documentclass[a4paper,10pt]{article}
+\usepackage{amsmath}
+\usepackage{amsfonts}
+\usepackage{amssymb}
+\usepackage{graphicx}
+\usepackage{listings}
+\usepackage{algorithm,algorithmic}
+\usepackage{float}
+
+\pagestyle{headings}
+
+\newcommand{\IN}{\mathbb{N}}
+\newcommand{\MM}{\mathcal{M}}
+\newcommand{\QQ}{\mathcal{Q}}
+\newcommand{\AAA}{\mathcal{A}}
+\title{Rsyslog Design and Internals}
+\author{Rainer Gerhards\\
+rgerhards@adiscon.com}
+
+\begin{document}
+
+\maketitle
+
+\begin{abstract}
+This paper describes rsyslog design and internals. It is created to facilitate a discussion about the implementation of "batched queue processing". As such, it does not describe the full design of rsyslog but rather those elements that are relevant to queues. However, the document may be expanded in the future.
+\end{abstract}
+
+\tableofcontents
+
+\section{Preliminaries}
+\subsection{On the Use of English}
+\begin{quotation}
+\begin{flushright}
+I ventured to write this book in English because ... \\
+it will be more easily read in poor English, \\
+than in good German by 90\% of my intended readers. \\
+--- HANS J. STETTER, Analysis of Discretization Methods for \\
+Ordinary Differential Equations (1973)
+\end{flushright}
+\end{quotation}
+
+There is not much I could add to Mr. Stetter's thought, except, maybe, that the number to quote probably tends more to 99\% in this case than to the 90\% Mr. Stetter notes. So please pardon those errors in language use that I have not yet been able to fix or even see. Suggestions for corrections and improvements are always welcome.
+\subsection{Notational Conventions}
+In general, in rsyslog there exists single objects $o$, which are used to build larger sets $O$, which form a superset $\mathcal{O}$ of all those objects that exist at a given time inside a running instance of rsyslog. As seen above, single objects are always described by lower case letters ($o$), larger sets by upper case letters ($O$) and the ``all-sets'' in caligraphic letters ($\mathcal{O}$). Often, objects $O_i, i \in \IN, i \le |\mathcal{O}|$ partition $\mathcal{O}$, but this is not necessarily the case.
+
+\subsection{Definitions}
+\subsubsection{Sudden Fatal Failure}
+As sudden fatal failure is one that occurs at some instant and causes Complete loss of processing capabilities. The two major cases are a sudden power loss or a ``kill -9'' of the process. There are more exotic cases, too, like disasters.
+
+One may argue that it is possible to protect against many sudden fatal failure cases. For example, using an uninterruptable power supply (UPS) will prevent a sudden power loss. While this is true in most cases, it does not hold if looked very closely: in the case of the UPS, for example, a failure in the UPS itself may cause a sudden power loss, which can not be mitigated. Well, actually there can be several layers of mitigation, but always one more potential failure scenario remains. So it is not possible to totally solve the issue.
+
+The concept of ``sudden fatal failure'' now covers all these rest risk that result in termiantion of rsyslogd without the ability execute any code before this happens. This is a very important concept in regard to audit-gradeness.
+
+\subsubsection{Audit Grade}
+In the context of this document, ``audit grade'' means that a subsystem never loses a message that it has taken responsibility for, not even in cases of sudden fatal failures. The only limit in this restriction is that a subsystem does not guarantee message survival if the subsytem at large is being destroyed (e.g. during a disaster) or some of its components are not of audit-grade. This draws a fine limitation on the audit-grade of a subsystem.
+
+For example, the rsyslog queue subsystem receives messages and acknowledges them to the submitter (e.g. an input), when they have been enqueued in the storage system. If the queue system is configured to provide audit-grade operation\footnote{Audit-grade queue operation is considerably slower than regular operations, as such this mode is not enabled by default. Most installations will never need a completely audit-grade queue}, the queue relies on the storage subsystem to work properly. If, for example, a disk read error occurs, the message may no longer be readable from the disk and as such is lost. The root cause here is that the disk subsystem was not of audit grade, because it otherwise would not have lost the message. So in this case the queue code is of audit grade, but the one of its components, the disk subsytem, was not. So the overall system is not of audit grade.
+
+To simplify talking about the audit-gradness of several subsytems, we assume that all of their subsystems are also of audit grade. In an actual deployment, however, this means the the system designer must carefully select audit-grade subsystems. Overlooking a single non-audit-grade component will make the whole system of not audit grade quality.
+
+Please note that it can be rather tricky to ensure a complete system is of audit grade. A border case is main memory integrity. Even with error-correcting memory, there may situations arise where a memory error occurs (probably due to a very unlikely series of well-hitting cosmic rays) that is unrecoverable. At this point, system integrity is at risk. The only real solution is to immediately shut down the system and restart it (without giving any process a chance to execute). Note, however, that in an extreme view, an operating system routine that does so can also be considered dangerous, as memory in use by this routine might be affected by the malfunction. We could extend this scenario and further complicate it, but that goes beyond the scope of this paper. The example was primarily meant to show how subtle audit-grade reliability is.
+
+In rsyslog, we currently use a slightly \marginpar{duplication\\permitted}relaxed consistency condition for message integrity inside an audit-grade subsystem. While we do not accept message loss, we permit slight message \emph{duplication}, but only in exceptional cases. This is permitted because, with proper message generation, the dulication problem can be easily fixed at the end-to-end layer. For example, the original sender can include a UUID, which can be used to sort out duplicates at the final destination. Insisting on not allowing duplication complicates matters and is often impossible with today's logging protocols. So, for the time being, we aim at this relaxed criteria, which is hard enough to achive. After we have achieved that goal, we may further try to solve the duplicaton problem. Some hooks already exist. But we do not guarantee such an effort will be made any time soon.
+
+\section{Overall Design}
+From a high-level prespective, rsyslogd is ``just'' a high-performance message router. It accepts messages from various sources, applies user-configured filters to them, and routes potentially transformed messages to destinations based on these filters.
+\section{Objects}
+\subsection{Plugins}
+Plugins provide code potentially written by a third party to extend rsyslog.
+
+Conceptually, a plugin is a tuple of callable functions $(\phi_1, \phi_2, \ldots)$ which implement an interface. There are three different types of plugins: input, output and library. The plugin type denotes the primary interface implemented by the plugin. Additional interfaces may be implemented\footnote{This is not yet done in plugins, but is possible and assumed to be done at a later point in time}.
+
+In the context of this paper, the output plugin interface is most important. It implements three entry points:
+
+\paragraph{doAction()}
+is used to submit messages to the output plugin. The entry point may or may not commit the messages to their ultimate destination.
+
+\paragraph{beginTransaction()}
+is used to inform the plugin that a new transaction begins. It must prepare for processing.
+
+\paragraph{endTransaction()}
+is indicated that the upper layer \emph{needs} to close the transaction. If there is any uncommited data left, it must be commited or rolled back.
+
+Every instance of an output plugin is guaranteed \emph{not} to be called concurrently by multiple threads. Further, no context switch will happen between calls to $doAction()$ and $endTransaction()$.
+
+\subsection{State Sets}
+Several object have associated state based on a specific state set. These state sets are described together with the objects.
+
+As a general rule, individual state is associated with all instances $o$ of a class of objects. This state is called the object's \marginpar{state component} \emph{state component} $s$. If we want to obtain an object's state, we write $S(o)$. Please note that $S(o)$ is only defined for those objects that have a state component.
+
+\subsection{Messages} 
+A message $m$ represents a a single syslog message inside the system. It is a tuple of attributes. Some of these attributes directly orginate from the message content, some others are meta-information taken from the context. For example, there is an meta-attribute ``time of reception'' which conveys when the message was received by rsyslog's input subsystem. We do not list attributes here, as there are many and it is not of importance which exactly they are.
+
+The set $\MM$ is composed of all messages that exist at a given time inside rsyslog.
+
+\subsection{Queue}
+A queue
+$$Q = (C, \Phi, M)$$
+is a triplet of a set of configuration parameters $C$, a set of callbacks $\Phi$ and a set of messages $M \subseteq \MM$.
+
+If we need to obtain the set of message from a queue, we write $M(Q)$. The elements of the set of configuration parameters are written as $C_{param}$ where $param$ is an abbreviation of the parameter's meaning. To obtain a specific parameter from a queue, we write $C_{param}(Q)$. The most important elements of $C$ are:
+
+\paragraph{$C_{type}$} which denotes the queue implementation type. Most importantly, this selects from a set of queue drivers (for example disk-only or in-memory driver), which affects the basic operation of the queue instance.
+
+\paragraph{$C_{mMsg}$} which denotes the upper bound on the cardinality of $M$.
+
+\paragraph{$C_{mBatch}$} which denotes the upper bound of the cardinality of message batches created for this queue.
+
+Be $\QQ = \{Q_m, Q_1, Q_2, \ldots, Q_{|\AAA|}\}$ the set of all queues that exist inside rsyslog after the configuration file has been processed, with $|\QQ| = |\AAA| + 1$.
+
+Then
+$$M_0 = \MM \setminus \bigcup_{i=1}^{|\QQ|} Q_i(M)$$
+\marginpar{at-risk-set}is the set of non-queued messages. The messages have either never been enqueued or have been dequeued but not finally been processed. This set represents the messages that may potentially be lost during an unclean shutdown of rsyslogd. This is why I call this set the ``\emph{at-risk-set}''.
+
+
+\subsection{Batches}
+A batch represents multiple processable messages. It is a unit of processing inside rsyslog's output system. Batches are used to dequeue a number of messages from a queue and then submit them to the lower action layer. Batches are natural \emph{transaction boundaries}, in the sense that multiple output transactions may be done on the messages inside a batch, but each transaction must end at the end of the batch. A batch is always associated to a specific queue $Q$.
+
+A batch
+$$B = (b_1, b_2, \ldots, b_n )$$
+is a $n$-tuple of \marginpar{processable\\message}processable messages
+$$b = (m, s)$$
+which are an ordered pair of a message $m$ and an associated processing state $s$. To denote the $n$-th message inside the batch, we write $m(b_n)$, to denote the status component of the $n$-th message, we write $S(b_n)$.
+
+\begin{figure}
+\begin{center}
+\includegraphics[scale=0.4]{batch_state.jpeg}
+\end{center}
+\caption{batch message processing states}
+\label{fig_batchmsg_states}
+\end{figure}
+
+The state set for the processing states is defined as follows:
+$$
+S_B = \{ rdy, bad, sub, disc \}
+$$
+
+With the semantics of the various states being the following:
+
+\begin{center}
+\begin{tabular}{|l|l|} \hline
+	State		& Semantics \\\hline
+	rdy 		& ready for processing\\
+	bad		& this message triggered an unrecoverable failure in action\\
+			& processing and must not be resubmitted to this action\\
+	sub		& message submitted for processsing, result yet unknown \\
+	disc		& action sucessfully processed, but must not be submitted \\
+			& to any further action in action unit \\\hline
+\end{tabular}
+\end{center}
+The associated state diagram is shown in figure \ref{fig_batchmsg_states} on page \pageref{fig_batchmsg_states}.
+
+Batch sizes vary. The actual cardinality is a function of the cardinality of $M(Q)$ at the time of batch creation and the queue configuration:
+
+$$1 \leq |B| \leq \max(C_{mBatch}(Q), |M(Q)|)$$
+
+\subsection{Action Unit}
+An action unit 
+$$u = (f, a_1, \ldots, a_n), a_i \in \AAA \text{ for } i \in \IN, i \le n$$
+is a tuple consisting of a filter function $f$ and $n \in \IN$ actions. \emph{Does rsyslog still support nonsense action units with $n=0$? - check!}
+
+\subsection{Action}
+An action
+$$a = (a_C, a_\psi)$$
+is an ordered pair of a tuple of configuration attributes $a_C$, and a tuple of processing functions $a_\psi$. Be the set $\AAA$ composed of all actions that exist in rsyslog after the configuration file has been processed.
+
+
+\section{Processing}
+\subsection{Object States}
+Various objects keep state. Some of these objects, like messages, batches and actions seem to share state. However, thinking about shared state leads to very complex setup. As such, state is modelled for each object $o$ individually. Instead, the state function $S_O(o)$ can be used to obtain an obtain an individual objects state. That state can be used to modify the state diagrams of the other objects with which relationships exist.
+
+\subsubsection{Actions}
+Actions are provided by output plugins. An action enables the engine to write messages to some destination. It is important to note that ``destination'' is a very broad abstraction. A destination may be a file inside a local or remote file system, a database table or a remote syslog server in another network.
+
+Actions are transactional in the following sense: more than one message can be submitted to an action. The action does not necessarily process the submitted messages unless the caller ends the transaction. However, the action itself may also end the transaction and notify the caller. This is \emph{not} considered an error condition and \emph{must} be handled gracefully by the caller. If a transaction aborts, the caller \emph{must} assume that none of the elements submitted since the begin of transaction have been processed. The action will try to backout anything that was already processed at the time the transaction failed. However, not all outputs work on actually transactional destination. As such, an action is permitted not to backout incomplete interim results. As such, after a transaction abort, some message duplication may occur. We call this the \emph{relaxed integrity condition} for actions.
+
+An output transaction is started by calling \emph{beginTransaction()} either explicitely or implicitely by a call to \emph{doAction()} without calling \emph{beginTransaction()} before. Then, one or more calls to \emph{doAction()} follow. When the caller intends to finish the transaction, it calls \emph{endTransaction()}. However, the transaction may also be terminated from the action itself in response to a \emph{doAction()} call.
+
+Mathematically, an action transaction builds a totally ordered set of uncommitted messages $M_u$. The order relation is defined over the sequence in which messages are being provided to \emph{doAction()}. At any time a commit is attempted, the full set $M_u$ is committed and may either succeeed completely or not at all (in the sense of the relaxed integrity condition described above). 
+
+A commit is attempted when 
+\begin{enumerate}
+\item the caller decides to call \emph{endTransaction()} 
+\item or earlier if the action decides it needs to commit now (e.g. because of buffers filling up).
+\end{enumerate}
+
+In the seconds case, the action may decide to commit all message but the current one or all (this is depending on action logic). So if the action decideds to commit a transaction before the caller calls \emph{endTransaction()}, a set of commited messages $M_c$ is build and $M_u$ is modified. Be $n$ the $n$-th iterated \emph{doAction()} call and $m_n$ the current message of this call, then the sets are build as follows:
+
+\begin{algorithm}
+%\caption{}
+\begin{algorithmic}
+\IF{action commits $m_n$}
+	\STATE $M_c = M_u \cup m_n$
+	\STATE $M_u = \emptyset$
+\ELSE
+	\STATE $M_c = M_u$
+	\STATE $M_u = \{ m_n\}$
+\ENDIF
+\end{algorithmic}
+\end{algorithm}
+
+In other words, if anything is committed early, it is always the full set $M_u$, with or without the current message. The caller needs to know which messages are already commited. As \emph{doAction()} finishes one transaction and starts a new one in a single call, we can not use action state the let the caller know this happened. So we use our above finding and just convey back if the transacton is still continuing or the current message or all others before it were committed. The caller must then act accordingly. Please note that when an error happens, the whole transaction must still be considered failed. As such, ``partial commit'' states need not to be mixed with failure states.
+
+Please note that the above method leaves a small potential issue unaddressed: if the action does an early commit of $M_u \setminus m_n$, an error happens when adding $m_n$ to the new $M_u$ (like running out of resources), the action would need to convey both the successful transaction as well as the failure state. This is not possible with the current interface. We could use callbacks to provide such notification, but this complicates the code. So, if that situaton arises, the action must temporarily buffer the error condition and convey it as part of either the next \emph{doAction()} call or during \emph{endTransation()} processing. This can be done, for example, by advancing its internal state accordingly.
+
+The state set for a actions is defined as follows:
+$$
+S_A = \{ rdy, itx, comm, rtry, susp, died \}
+$$
+
+With the semantics of the various states being the following:
+
+\begin{center}
+\begin{tabular}{|l|l|} \hline
+	State		& Semantics \\\hline
+	rdy 		& ready, waiting for transaction begin\\
+	itx		& in transaction, accept more data \\
+	comm		& transaction finished \\
+	rtry		& action failed but may be able to recover \\
+	susp		& action currently defunctional until timeout expires \\
+	died		& unrecoverable error condition occured, no longer usable \\\hline
+\end{tabular}
+\end{center}
+
+In the associated state diagram in figure \ref{fig_action_states}, we do not include the \emph{died} state, because it is entered whenever a totally unrecoverable error state may occur. This is a very exceptional incident (which most output plugins do not even support), so we have kept the diagram simple.
+
+\begin{figure}
+\begin{center}
+\includegraphics[scale=0.5]{action_state.jpeg}
+\end{center}
+\caption{Action State Diagram}
+\label{fig_action_states}
+\end{figure}
+
+\emph{Note well} that the state diagram describes the action state. It does \emph{not} describe the transaction state. While action- and transaction state are closely related to each other, they are different entities.
+
+The return code of \emph{doAction()} and \emph{endTransaction()} is used to convey the transaction state. As such, it is a function of the actions's current state after processing the request. The mapping is as shown below:
+
+\begin{center}
+\begin{tabular}{|l|l|} \hline
+	State		& Return Code (RS\_RET\_\ldots)\\\hline
+	rdy 		& OK \\
+	itx		& COMMITTED (if there was an auto-commit without $m_n$)\\
+			& DEFER\_COMMIT (if there was no auto-commit)\\
+	comm		& internal state, not to be exposed to upper layer \\
+	rtry		& SUSPENDED \emph{(new code needed)} \\
+	susp		& SUSPENDED \\
+	died		& DISABLED \\\hline
+\end{tabular}
+\end{center}
+
+For the rest of this document, let's assume there is a function \emph{getReturnCode()} that implements this mapping.
+
+It is important to think about how retries are handled. There is a user-configured per-action upper number of retries $C_r$ and retry interval $C_i$. In \emph{rsyslog v3}, there is no concept of output transactions. As such, only single messages are processed. When a temporary action failure occurs, the action is re-tried $C_r$ times, where the action processing thread is waiting in a \emph{sleep()} $C_i$ operating system API call\footnote{a suitable API is used, not \emph{sleep()} itself}. If the action succeeds during the retry processing, everything continues as usual. If it does not succeed, two things happen:
+\begin{itemize}
+\item the message is flagged as ``action permanent failure'' (what may trigger backup processing)
+\item the action is actually suspended for $C_i$ seconds
+\end{itemize}
+If then a new message is sent to the action, and $C_i$ seconds have not yet elapsed, the action is flagged as having failed without being re-tried again\footnote{During the analysis for this paper, it was seen that actually $C_r$ retries are attempted in v3, but each of them will never actually re-try the action. This is a software bug, which does not cause any harm and thus will not be fixed in v3. The new implementation in v4 will obviously not inherit this problem}. This is done in an effort to reduce resource utilization and prevent the system from slowing down e.g. by too-many retries to a remote server that went offline.
+
+With transactional output mode in \emph{rsyslog v4}, the logic above can no longer work. First of all, retrying single actions does not help, because all of the current transaction needs to be resubmitted. As such, the upper layers need to be notified of failure. Then, they need to resubmit the batch. In that design, the lower layer needs to return immediately after detecting the failure. Recovery handling is now to be done when the next transaction is started. However, we must make sure that we do not do excessive retries. So retry processing is only to be carried out if it was not tried less than $C_i$ seconds ago.
+
+The required functionality can be implemeted by a \emph{prepareAction} function that readies the action for processing if there is need to do so. That function is then called in all entry points before anything else is done. Then, actual processing is carried out and the resulting action state be used to generate the return code for the upper-layer caller. Find below a rough pseudocode to do so:
+
+\lstset{language=python}
+\begin{lstlisting}
+def prepareAction():
+   if state == rtry:
+      try recovery (adjust state accordingly)
+   if state == rdy:
+      beginTransaction() [output plugin]
+    
+def processMessage(message):
+   prepareAction()
+   if state == itx
+      doAction(message) [output plugin]
+   return getReturnCode()
+
+def doEndTransaction():
+   prepareAction()
+   if state == itx
+      endTransaction(); [output plugin]
+   return getReturnCode()
+\end{lstlisting}
+
+\subsection{Output Subsystem Layers}
+The rsyslog engine is organized in layers, where each layer is represented by the dominating object:
+
+\begin{figure}
+\includegraphics[scale=0.75]{rsyslog_output_layers.jpeg}
+\label{rsyslog output layers}
+\end{figure}
+
+If looking at the data flow, a queue dequeues batches of messages, which are than run through a generic action system and put into output plugins. Note that on the batch layer, only batches are supported as units of work, whereas the action layer is message-oriented but supports transactions of multiple messages. This is done by indicating when a transaction necessarily needs to end (that point being the end of batch from the batch layer).
+
+The plugins can be written by third parties and are roughly comparable to minidrivers. The generic action system provides all complexity of action processing wheras the output plugin provides a limited set of callbacks that enable the generic framework to talk to the actual destination system. As such, writing outputs is a very simple task. However, rsyslog does not limit the creation of very complex outputs, which may be able to offer superior performance for some destinations.
+
+\subsection{Output Failure}
+\subsubsection{Cases}
+When an output action is called, it may encounter a failure condition. In general, there are two different cases:
+\begin{enumerate}
+\item action caused failures
+\item message-content caused failures
+\end{enumerate}.
+
+Failures rooted in the action are things like broken network connections, file systems run out of space or database servers that are down. Most importantly, the failure is not related to message content. As such, it is appropriate to retry the action with the same message until it finally succeeds (assuming that someone restores the system in question to proper operation). We can not expect that the problem is cleared just by discarding the current message and re-trying with the next one.
+
+In my view, action caused failures are the far majority of all failures. For rsyslog versions 3 and below, all rsyslog-provided plugins consider failures to be action-caused and thus potentially recoverable by simple retry. With the only exception being fatal error conditions that render the whole action unusable.
+
+David Lang pointed out, that there may also exist error conditions that are not caused by the action (or the subsystem it talks to) itself, but rather by message data. He provided the following samples where message content can cause permanent issues with action execution:
+
+\begin{itemize}
+\item unicode text causing grief
+\item dynafile hits a read-only file
+\item basicly data-driven things that trigger bugs in the message delivery 
+mechanism in some form.
+\end{itemize}
+
+As David Lang said ``In an ideal world these would never happen, but for most output types I can think of some form of corrupt input that could cause that message to fail.''.
+So this class of failure conditions actually exists. No matter how often the action retry mechanism is called, it will never succeeds (one may argue that the read-only dynafile is fixable, but we could replace that sample with an invalidly generated filename). The proper cure for these actions is to find the offending one and discard it.
+
+In conclusion, actions need to return different error states for these two different types of failures. Traditionally, RS\_RET\_SUSPENDED is returned when an action specific failure is hit. Most existing plugins also do this if a message-related failure occured, simply because they did not yet know that this situation exists. However, plugins also return different error codes, and at least these can be treated to mean message-permanent failures. To support this, a change to plugins is still required, because many simple return SUSPENDED state if anything went wrong (replacing the real error condition with SUSPENDED). A dedicated PROBABLE\_INVALID\_MSG return state is probably useful so that an output plugin can convey back that it consideres the message to be bad. On the other hand, this implies that the plugin must try to detect those, what means that the developer must think about all potential message-causes problems. That approach can be considered unreliable and as such it may be better not to provide such a dedicted state.
+
+\subsubsection{Handling of Failures}
+In spite of the two different failure cases, different handling is needed for them. The action-based failure cases can and must be handled on the action level. As transactions abort when a failure occurs, support from the upper ``batch layer'' is necessary in order to handle resending batches of messages.
+
+For message-caused failure cases, the offending message must be found and then be discarded. A complexity here is that while a failure-causing message is being searched for, an action-based failure might occur. In that case, first the action-based failure condition must be solved, before the search for the problem message can continue.
+
+One approach might be that when the action-layer conveys back an action-caused failure (SUSPENDED), the batch layer knows that it simply needs to restart the full transaction (but not start an ``invalid message search''). If a message-based error condition is conveyed back, the batch system can not restart the full batch. Instead, it needs to enter search mode, where it creates partitions of the original batch, and calls itself recursively (at least in theory) on each of the subsets.
+
+Then, the same handling applies until either a failing message has been found or all messages have been successfully processed. Note that in the recursive step, action-based failures are recovered by full batch resubmits. This solves the above-mentioned complexity in a consistent way.
+
+If a binary-search-like method is used to detect failing records\footnote{This was originally suggested by David Lang.}, recursion may not really be an issue, as the recursion depth is limited to $\log_2 |B|$ where $B$ is the message batch.
+
+A message-caused failure can be rooted in one or more messages. One important question is if it is expected that the failure is caused by a single or multiple messages. Both is possible, so it is a question of probability. If we assume that it is more probable that a single messages causes the problems, it is useful to immediately return back to full batch submission of transactions once a problem-causing message has been identified. But then, if there are multiple problem-causing messages inside the batch, we may need many more iterations.
+
+If, on the other hand, we assume that it is more probable that multiple messages cause problems, it may make sense to keep resubmitting only subsets of the batch. However, then the performance is suboptimal if actually only one message was problematic. A solution might be to pick a compromise, e.g. first assume that a single message is problematic, but assume the opposite as soon as a second message with problems has been found.
+
+A potential algorithm for processing $n \le |B|$ messages from batch $B$ is described below. In the pseudocode, a ``processable'' message is one that neither is already committed nor had a permanent failure with this action. The term ``mpf'' means ``message permanent failure'' for this action (this will later be described in a batch state set).
+
+\begin{small}
+\lstset{language=python}
+\begin{lstlisting}
+def submitBatch(B, n):
+   foreach processable message in 
+           (first [at most] n messages of batch):
+      call processMessage
+      if action-caused failure:
+         retry full batch
+      if action-caused permanent failure:
+         mark all n messages as mpf
+         return
+      if auto-commit:
+         mark commited messages in batch as committed
+      if message-caused failure:
+         if n == 1:
+            mark message as mpf
+            return
+         else:
+            call submitBatch(B, n/2)
+            call submitBatch(B, n/2)
+\end{lstlisting}
+\end{small}
+
+After submitBatch() has completed, all messages are either committed or in mpf state.
+
+Note that an action-caused permanent failure occurs if an action-caused failure can not be resolved with the operator-configured number of retries. It will never occur if the user configured infinite retries. While an action is suspended, all calls will result in an action-caused permanent failure. Please keep in mind that these will be resubmitted to any backup actions inside the action unit, so the action's ability to cause permanent failure states is vital for a number of use cases (backup syslog server, to name just one).
+
+Batch processing inside an action unit thus can follow these strucuture:
+
+\begin{algorithm}
+\caption{processBatch(B)}
+\begin{algorithmic}
+\FORALL{action $a$ in action unit}
+	\IF{execute action only on messages that failed before}
+		\STATE $n = |\text{messages in batch in mpf state}|$
+		\STATE change mpf state back to ready
+	\ELSE
+		\STATE $n = |B \setminus \text{msgs with state discard}|$
+		\STATE change all message states $\ne$ discard to ready 
+	\ENDIF
+	\IF{$n >0$ }
+		\STATE call submitBatch(B, n) for action $a$
+	\ENDIF
+\ENDFOR
+\end{algorithmic}
+\end{algorithm}
+
+\paragraph{Why is it Important to differentiate the failure cases?}
+This text originates from the mailing list and must be merged in. I provide it in the form it is, so it will not be forgotten (plus, it conveys the information).
+
+One may think that it is not necessary to differentiate between action-caused and message-caused failures. However, not doing so introduces subtle issues, because 
+then you either 
+
+A) do not need the batch logic at all (because the action is configured for
+infinite retries)
+
+Or
+
+B) you loose many messages if the action is not configured for infinite
+retries and you have a longer-duration outage e.g. on a database server.
+Let's say it is offline for a couple of hours, then you lose almost
+everything in that period
+
+To prevent this, you need two different retry methods.
+
+One may argue that it is hard to differentiate between the two failure cases. This is correct. Buit I think it mostly depends on the quality of the output module. 
+
+First of all, ``mostly'' implies that there may be some other cases, where it
+really is impossible to differentiate between the two. In that case, I would
+treat the issue as an action-caused failure. There are two reasons for this:
+
+1) rsyslog v3 currently does this always and not even a single person
+complained about that so far. This is an empiric argument, and it does not
+mean it caused problems. But it carries the co-notation that this seems not
+to be too bad.
+
+2) If we would treat it as message-caused failure, we would no longer be able
+to handle extended outages of destination systems, which I consider a vitally
+important feature.
+
+When weighing the two, I know of lots of people who rely on 2), in sharp
+contrast to knowig noone having problems with 1). So my conclusion is that it is
+less problematic to define an otherwise undefinable failure reason to be
+action-caused. Even more so as I assume this problem only exists in the
+minority of cases.
+
+Now back to the quality of the output module: thinking about databases, their
+API is usually very good at conveying back if there was a SQL error or a
+connection abort. So while a SQL error may also be an indication of a
+configuration problem, I would strongly tend to treat it is a being
+message-caused. This is under the assumption that any reasonable responsive
+admin will hopefully test his configuration at least once before turning it
+into production. And config SQL errors should manifest immediately, so I
+expect these to be fixed before a configuration runs in production. So it is
+the duty of the output module to interpret the return code it received from
+the API call and decide whether the failure is more likely action-caused or
+message-caused. For database outputs, I would assume that it is always easy
+to classify failures that must be action-caused, especially in the
+dominating cases of failed network connections or failed servers.
+
+For other outputs it may not be as easy. But, for example, all stream network
+outputs can detect a broken connection, so this also is a sure fit.
+
+For dynafiles, it really depends on how hard the output module is tries to differentiate
+between the two failure cases. But I think you can go great length here, too.
+Especially if you do not only look at the create() return code, but, iff a
+failure occurs, you do more API calls to find out the cause.
+
+So I think the remaining problem is small enough to cause not too much issues
+(and if so, they are unavoidable in any case). In conclusion, the two failure states are not only necessary, but can sufficiently sure enough be detected.
+
+\subsection{Random Topics}
+I have begun to gather material from the mailing list in this section, because I feel it may be useful for others as well. Right now, the information is well hidden in the mailing list archives and there may be value in combining it all in one place.
+
+Due to the nature of this material, there is no specific organization between the subchapters and also formatting and language doesn't deny its rooting in the mailing list.
+
+\subsection{Reliability of Message Dequeueing}
+A batch is actually dequeued when it is taken off a queue. So if at that point we
+have a system power failure (for whatever reason), the messages are lost.
+While the rsyslog engine intends to be very reliable, it is not a complete
+transactional system. A slight risk remains. For this, you need to understand
+what happens when the batch is processed. I assume that we have no sudden,
+untrappable process termination. Then, if a batch cannot be processed, it is
+returned back to the top of queue. This is not yet implemented, but is how
+single messages (which you can think of an abstraction of a batch in the
+current code) are handled. If, for example, the engine shuts down, but an
+action takes longer than the configured shutdown timeout, the action is
+cancelled and the queue engine reclaims the unprocessed messages. They go
+into a special area inside the .qi file and are placed on top of the queue
+once the engine restarts.
+
+The only case where this not work is sudden process termination. I see two
+cases:
+
+a) a fatal software bug
+We cannot really address this. Even if the messages were remaining in the
+queue until finally processed, a software bug (maybe an invalid pointer) may
+affect the queue structures at large, possibly even at the risk of total loss
+of all data inside that queue. So this is an inevitable risk.
+
+b) sudden power fail
+... which can and should be mitigated at another level
+
+One may argue that there also is
+
+c) admin error
+e.g, kill -9 rsyslogd
+Here a fully transactional queue will probably help.
+
+However, I do not think that the risk involved justifies a far more complex
+fully transactional implementation of the queue object. Some risk always
+remains (what in the disaster case, even with a fully transactional queue?).
+
+And it is so complex to let the messages stay in queue because it is complex
+to work with such messages and disk queues. It would also cost a lot of
+performance, especially when done reliably (need to sync). We would then need
+to touch each element at least four times, twice as much as currently. Also,
+the hybrid disk/memory queues become very, very complex. There are more
+complexities around this, I just wanted to tell the most obvious.
+
+So, all in all, the idea is that messages are dequeued, processed and put
+back to the queue (think: ungetc()) when something goes wrong. Reasonable
+(but not more) effort is made to prevent message loss while the messages are
+in unprocessed state outside of the queue.
+
+\paragraph{More reliable can actually be less reliable}
+On the rsyslog mailing list, we had a discussion about how reliable rsyslog should be. It circles about a small potential window of message loss in the case of sudden fatal failure. Rsyslog can be configured to put all messages into a disk queue (instead of main memory), so these messages survive such a powerfail condition. However, messages dequeued and scheduled for processing during the power outage may be lost. 
+
+I now consider a case where we have bursty UDP traffic and rsyslog is configured to use a disk-only queue (which obviously is much slower than an in-memory queue). Looking at processing speeds, the max burst rate is limited by using an ultra-reliable queue. To avoid using UDP messages, a second instance could be run that uses an in-memory queue and forwards received messages to the one in ultra-reliable mode (that is with the disk-only queue). So that second instance queues in memory until the (slower) reliable rsyslogd can now accept the message and put it into the reliable queue. Let's say that you have a burst of $r$ messages and that from these burst only $r/2$ can be enqueued (because the ultra reliable queue is so slow). So you lose $r/2$ messages.
+
+Now consider the case that you run rsyslog with just a reliable queue, one that is kept in memory but not able to cover the power failure scenario. Obviously, all messages in that queue are lost when power fails (or almost all to be precise). However, that system has a much broader bandwidth. So with it, there would never have been r messages inside the queue, because that system has a much higher sustained message rate (and thus the burst causes much less of trouble). Let's say the system is just twice as fast in this setup (I guess it usually would be *much* faster). Than, it would be able to process all r records.
+
+In that scenario, the ultra-reliable system loses $r/2$ messages, whereas the somewhat more "unreliable" system loses none - by virtue of being able to process messages as they arrive. 
+
+Now extend that picture to messages residing inside the OS buffers or even those that are still queued in their sources because a stream transport blocked sending them.
+
+I know that each detail of this picture can be argued at length about.
+
+However, my opinion is that there is no "ultra-reliable" system in life, only various probabilities in losing messages. These probabilities  often depend on each other, what makes calculating them very hard to impossible. Still, the probability of message loss in the system at large is just the product of the probabilities in each of its  components. And reliability is just the inverse of that probability.
+
+This is where *I* conclude that it can make sense to permit a system to lose some messages under certain circumstances, if that influences the overall probability calculation towards the desired end result. In that sense, I tend to think that a fast, memory-queuing rsyslogd instance can be much more reliable compared to one that is configured as being ultra-reliable, where the rest of the system at large is badly influenced by this (the scenario above).
+
+However, I also know that for regulatory requirements, you often seem to need to prove that a system may not lose messages once it has received them, even at the cost of an overall increased probability of message loss.
+
+My view of reliability is much the same as my view of security: there is no such thing as "being totally secure", you can just reduce the probability that something bad happens. The worst thing in security is someone who thinks he is "totally secure" and as such is no longer actively looking at potential issues.
+
+The same I see for reliability. There is no thing like "being totally reliable" and it is a really bad idea to think you could ever be. Knowing this, one may begin to think about how to decrease the overall probability of message loss AND think about what rate is acceptable (and what to do with these cases, e.g. "how can they hurt").
+
+\paragraph{Different Use Cases}
+As David Lang pointed out, there exist different use cases for different levels of reliability. Most importantly, there exist use cases that do not demand very high throughput but rather ultra-realiability of the queue system. Here, ultra-reliability is just another word for the queue being of ``audit-grade''. Even if the queue provides audit-grade, the overall system is only then of audit-grade when all other components - most notably the transport protocols spoken by the inputs and outputs - are also of audit-grade. Most importantly, this means that an audit-grade system purely based on the IETF syslog protocol series can not be build.
+
+Used together with truly reliable protocols \emph{and} senders that block processing until a final acknowledgement has been received, an audit-grade system can potentially build based on rsyslog. To do so, an audit-grade queue subsystem is required, which is not present in releases less than 4.1.? (most importantly, v2 and v3 do not provide this capability).
+
+\subsection{Audit-Grade Queue Operations}
+\subsubsection{Perquisites}
+Audit-grade queue operations certain perquisites:
+\begin{itemize}
+\item rsyslog engine is of version 4.1.? or greater
+\item disk-only queue type
+\item checkpoint interval set to 1
+\item queue is configured to not permit losing any messages\footnote{The queue has several settings that can be used to fine-tune situations in which it may discard messages intentionally. All of these must be turned off. Most importantly, that means the producer is blocked for an infinite time if the queue is full.}
+\item queue consumer must also be of audit-grade
+\end{itemize}
+Only when these prequisites are met, queue operation can be considered of being audit-grade. Note that when message loss in case of sudden fatal failure and similar incidents is acceptable, neither disk-only queues nore a checkpoint interval of 1 is necessary. Such a configuration can also be build with rsyslog v3, which is up to that level.
+
+Note that in the sections below we describe the implementation in broader terms. Most importantly, we do not restrict ourselves to disk-only queue storage drivers. This is important, because it simplifies design and opens the capability to introduce new, possibly faster-performing, queue storage drivers in the future.
+
+But it is important to keep in mind that a concrete queue is only of audit-grade if it matches all the perquisites given here, most importantly with the right configuration.
+
+\subsubsection{Implementation Alternatives}
+Messages, or more precisely objects\footnote{While rsyslog deals with messages, the queue is designed to handle any type of thing that is represented as an rsyslog object. This is considered useful as queues may at some time contain other things than just messages, so we keep it generic.}, are enqueued by the queue producer (either an input module or the main message queue's consumer). The enqueue operation is completed only when the message has been successfully accepted by the queue storage driver. Then and only then the producer is permitted to remove the object from its own storage system. A rough sketch is given in algorithm \ref{alg_q_enq}.
+
+\begin{algorithm}
+\caption{enqueueObject($o$)}
+\begin{algorithmic}
+\label{alg_q_enq}
+\STATE lock queue mutex
+\WHILE{queue is not ready for enqueue}
+	\STATE wait on queue to become ready
+\ENDWHILE
+\STATE call queue store driver to add $o$
+\STATE unlock queue mutex
+\end{algorithmic}
+\end{algorithm}
+
+The dequeue-operation is more complex. We must ensure that each object stays in the queue until it is finally processed. Hereby, an object is finally processed, when processing of it has been completed. Remember that to enhance performance, objects are dequeued in batches of many. So at any given time, multiple messages may be processed, but not necessarily have finally completed doing so. If another worker thread then tries to obtain a new batch for processing, those ``in-process'' message must not be handed out a second time. Also, if a sudden fatal failure occurs during processing, queue operation must restart at the point of last commit. This means that all ``in-process'' messages need to be changed back to ``no processed'' state and be restarted again. In those cases the (acceptable) slight message duplication can occur.
+
+In our design, we differentiate between ``logical'' and ``physical'' dequeuing of batches. If a batch is generated for processing, it is logically dequeued --- in the sense that no other batch generating request will be able to receive another copy of these messages. If no exceptional situation happens, those messages will be processed and thus can be considered consumed under normal circumstances.
+
+However, actual deletion from the physical queue storage happens only after the batch is fully processed. At this point, all objects have been acknowledged by their destinations, which now have the responsibility for the object's survival. Consequently, we can delete them from the queue store. This process is considered the ``physical'' dequeue of the object.
+
+In order to find some simpler terms, we will call the logical dequeue operation just ``dequeue'' and the physical dequeue operation ``delete''. This is consistent with all previous work on rsyslog and thus probably leads to the least surprise when reading older source code and documentation.
+
+A first idea for a deletion is given in algorithm \ref{alg_pdeq_batch_1} (remember that $O(b)$ contains all objects within the given batch $b$, this is \emph{not} $O$-notation and should probably in the future be replaced by something else).
+
+\begin{algorithm}
+\caption{deleteBatch($b$), first approach}
+\begin{algorithmic}
+\label{alg_pdeq_batch_1}
+\STATE lock queue mutex
+\FORALL{$o \in O(b)$}
+	\STATE find $o$ in queue storage
+	\STATE remove $o$ and keep queue structures intact
+\ENDFOR
+\STATE unlock queue mutex
+\end{algorithmic}
+\end{algorithm}
+
+This algorithm is simple, but requires searching the queue store for the object to be deleted -- a potentially lengthy operation. However, we can improve the searching process if we know more about the inner structure of batch objects. It seems appropriate to dequeue objects in queue-sequential order. A drawback of doing so is that we must prevent other worker threads from trying to dequeue concurrently. This is not really a drawback. We need to guard dequeue operations by a mutex in any case, because otherwise internal structures can not be kept consistent. Practical experience and testing have shown that many small dequeue operations cause a lot of locking contention and as such badly affect performance. So it actually is a welcome enhancement to aquire the queue lock only once for the whole batch dequeue operation. As dequeing is a comperatively fast operation, the lock is not held for extended periods of time.
+
+A first approach to this functionality is shown in algorithm \ref{alg_ldeq_batch_1}. Note that $C_{mBatch}$ is the configured maximum number of elements inside a batch, $i$ is an index to address the objects inside the batch.
+
+\begin{figure}[h]
+\begin{center}
+\includegraphics[scale=0.6]{rsyslog_queue_pointers.jpeg}
+\end{center}
+\caption{\textbf{Queue Store Pointers}: boxes represent queue entries, colored boxes entries with objects. Objects in green are unprocessed, in blue are dequeued but not deleted and those in gray have already been deleted. White indicates not yet used entries. Gray objects may be overwritten at any time. Their entries are actually free, we have used the gray color primarily to indicate there once existed objects. Each queue pointer points to the next entry to process.}
+\label{fig_queue_ptr}
+\end{figure}
+
+\begin{algorithm}
+\caption{dequeueBatch($b$)}
+\begin{algorithmic}
+\label{alg_ldeq_batch_1}
+\STATE lock queue mutex
+\STATE $0 \to i$
+\WHILE{queue non-empty and $i < C_{mBatch}$}
+	\STATE obtain next obj $o$ from queue store 
+	\STATE advance logical dequeue position
+	\STATE put $o$ into batch
+\ENDWHILE
+\STATE unlock queue mutex
+\end{algorithmic}
+\end{algorithm}
+
+A key concept is somewhat hidden in \marginpar{queue pointers} \emph{advance logical dequeue position}. Each queue store is purely sequential, with objects being enqueued at one ``end'' of the store and dequeued at the other. Of course, each queue store has only finite capacity, but we ignore this to explain the overall picture. A queue can be implemented by two pointers: one that points to the tail of the queue, where new messages are enqueued and one that points to the head of it, where new messages are dequeued. The idea is now to duplicate the dequeue pointer and split it into one for (logical) dequeue and one for deletion. Figure \ref{fig_queue_ptr} shows this three-pointer approach. Now, we can simple advance either the dequeue or deletion pointer, depending on operation, and do not need to find the first dequeue position inside the queue store. The dequeue pointer always points at it. This mode can be implemented with all currently existing queue storage drivers (but the sequential disk driver may need to use a second file handle or stream object instead of two pointers).
+
+This makes an efficient implementation of algorithm \ref{alg_ldeq_batch_1} possible: when it logically dequeues, it just needs to advance the dequeue pointer. So the algorithm executes in $O(n)$ time where $n$ specifies the number of elements to dequeue with an upper bound of $C_{mBatch}$.
+
+\begin{figure}[h]
+\begin{center}
+\includegraphics[scale=0.6]{rsyslog_queue_pointers2.jpeg}
+\end{center}
+\caption{\textbf{Physically Dequeueing Messages}: In this sample, we have two batches. With multiple workers, they may be deleted in any order.}
+\label{fig_queue_ptr_deq}
+\end{figure}
+
+Furthermore, we can also improve algorithm \ref{alg_pdeq_batch_1}: Consider that each batch is logically dequeued as an atomic operation. That means all batch objects form a sequential subset of the queue. Figure \ref{fig_queue_ptr_deq} shows the situation when two batches have been dequeued.  So the costly ``find'' operation now needs to be carried out only once at the beginning of the batch. As all other objects are sequential, once we have found the batch begin inside the queue, we can simply delete the $|b|$ elements in queue-sequential order after it. So the cost of the find operation can be reduced from $O(|b|)$ to $O(1)$.
+
+We can even reduce the remaining cost of the find operation. If the batch to be deleted is right at the queue's head (as is ``B1'' in the figure), the ``find'' immediately terminates with the first element and incurs no cost at all. The situation is different if the batch is not at the queue head, ``B2'' is an example for that (assuming that ``B1'' has not yet been dequeued). We would now still need to search over the objects that are not part of the batch and can then finally get to the object at the head of the batch in question. For queue storage drivers that support random access to queue elements, storing a simple pointer to the batches' queue head element further improves the situation and enables $O(1)$ access to the queue element. This is indicated by the dotted lines in figure \ref{fig_queue_ptr_deq}. Once the head of the queue has been found, two things can happen (depending on the capabilities of the queue storage driver):
+
+\begin{enumerate}
+\item the head element can be flagged as ``this and next $n$ elements are deleted''
+\item all elements are actually deleted
+\end{enumerate}
+
+Note that a mixed form is also possible (and probably useful for our \emph{singly} linked list storage driver: there, some $n'$ elements be actually deleted and the head element is flagged as ``this and next $n - n'$ elements are deleted''. Note that in the linked-list case, all but the first elements can be deleted with ease\footnote{It can be considered to change from a singly-linked list to a doubly-linked list, if the benefit outweighs the extra effort required.}, so probably just the head would stay inside the queue. Note that removing elements off the queue, where possible, is useful because it frees resources. On a busy system, freeing messages as soon as possible can prevent message loss (in non-audit-grade setup) or system slowdown. So it should be done when possible.
+
+If we have a purely sequential queue storage driver (currently the sequential disk driver), finding and updating the head element is not an option. Even in this case, we can observe that the batch at the actual deletion pointer will eventually be submitted for deletion. So a route to take is to create a list of elements that can be deleted as soon as the physical dequeue pointer reaches any of these elements. We call this the \marginpar{to-delete list}``to-delete list''. To facilitate processing, this list must be ordered in sequence of dequeing. This information may not be available from the storage subsystem itself, but it can easily be generated. To do so, a strictly monotonically increasing counter is kept with each logical dequeue operation and stored as part of the batch\footnote{As this must be done via the usual computer-implemented modular arithmetic, we must be careful that we do not see repetion of values because of overflows. Each day has $60 \cdot 60 \cot 24 = 86,400$ seconds (ignoring the subleties of UTC). Now let's assume that we have a moderately-busy system with 1,000 messages per second. We further assume, to be on the save side, that each message is processed inside its own batch. So we have $86,400,000$ batches per day. If we now use a typical $32$-bit integer for generating the batch IDs, we the unique range will be used up after
+$$\frac{2^{32}}{8640000} \approx 497 \text{ days}$$
+days of uninterrupted rsyslog operation. While this sounds somewhat save, it goes down to approximately 10 days of messages are submitted at rate of 50,000 messages per second (which is high, but not unheared of). So it is strongly advised to use 64 bits, which we consider to be save, because for our 1,000 messages per second the range would be exhausted only after
+$$\frac{2^{64}}{8640000} \approx 2.135 \cdot 10^{11} \text{ days}$$
+which equals approximately $584,500,000$ \emph{years}. So even at a rate of one million messages per second, the range would be sufficient for over 500,000 years of continuos operations -- that should be far sufficient.}
+An example: let us assume that ``B2'' was submitted for deletion first. Then, the head of ``B2'' is not at the queue's delete pointer. As such, no action can be carried out immediately. So the batch head pointer is stored into a ``to be deleted'' list. Processing continues. Some time later, batch ``B1'' is submitted for deletion. Now, the head pointer is at the head of the delete list, as such all batch elements are dequeued. Then, the ``to be deleted'' list is checked, and ``B2'' is found in it. Now, ``B2'' is at the head of the (new) deletion pointer and can also be removed. So, ultimately, all messages are physically dequeued. This is more formally describe in algorithm \ref{alg_phys_deq_seq_store}. In that pseudocode, we made a simplification by always putting the to be deleted batch in the ``to-delete'' list, which then enables us to use somewhat more generic code to carry out the work.
+
+Note that there is a price to pay for deletions via the ``to-delete'' list: if a sudden fatal failure happens during processing, the set of duplicate messages is increased. For example, if a fatal failure happens after ``B2'' has been fully processed and scheduled for deletion, but \emph{before ``B1'' is also submitted for deletion}, ``B2'' will be reprocessed after recovery. This would not happen if ``B2'' would have been removed from the queue.
+
+\begin{algorithm}
+\caption{deleteBatch($b$)}
+\begin{algorithmic}
+\label{alg_phys_deq_seq_store}
+\REQUIRE queue mutex is locked by caller
+\STATE enqueue $b.head, |b|$ in ``to-delete'' list $D$
+\COMMENT ``to-delete'' list must be in order of logical dequeue
+\WHILE{$D.head = Q.deletePtr$}
+	\FOR{$|b|$ elements}
+		\STATE delete element at queue head
+		\STATE move $q.deletePtr$
+	\ENDFOR
+	\STATE remove head of ``to-delete'' list
+\ENDWHILE 
+\end{algorithmic}
+\end{algorithm}
+
+\paragraph{Warp-Up of Queue Delete Operations}
+When evaluating which route to take, the ``to-delete'' list approach looks elegant for all cases. The negative side effect of potentially increased message duplication currently does not even exist: today, the sequential disk queue storage driver permits only a single worker thread and thus there always will be only one thread at a time. Even if we remove that limitation, message duplication could not be avoided, as stated in the algorithm description above. What remains are the other queue storage drivers. However, they operate in-memory, so message duplication will not happen simply because all messages will be lost on sudden fatal failure. The advantage of limited message duplication only exists in the so-far hypothetical case of a random-access, audit-grade disk queue storage driver. Thus, the decision could be postponed unless that happens (if it ever does).
+
+From a code complexity point of view, the ``to-delete'' list approch is definitely advantagous. Not only because of the reduced number of algorithms required. We also do not need to maintain unique batch IDs and all the logic associated with them.
+
+The other aspect to look at is memory consumption. Assuming that we delete the actual objects, just not their containers inside the queue, extra memory consumption is not really that worse. More importantly, currently only the linked-list queue storage driver can benefit at all, because it is the only driver capable of deleting queue entries in mid-queue. All others, including the array memory driver, do not have this capability.
+
+From a performance point of view, the ``to delete'' list approach looks approximately as good as the others, with some mild better performance for some storage drivers for a non-``to delete'' list approach. This can be mitigated, especially if the potentially somewhat-costly maintenance of the ``to-delete'' list is slightly optimized and the algorithm actually checks if the to be deleted batch is right at the queue's delete pointer position. The improved code simplicity, together with current CPU's code caching, may even result in an otherwise not expected speedup.
+
+In conclusion, we will implement the ``to-delete'' list approach on the queue layer (above the queue storage drivers). However, we will leave the window open to permit overwriting it with queue storage driver specific functionality. How to do this will not be specified now, as there is currently no need and we do not even know if there ever will be. However, we retain the discussion on the various modes as well as the relevant algorithmic discussions and data structurs inside this paper so that it is readily available should need arise. We also think this is important so that everybody later knows that the decision was made based on good argument and not by accident (we consider this useful in another design enhancement attempt).
+
+\paragraph{Processing Sequence} Looking at the processing sequence, we notice that always objects are dequeued, then processed and then deleted. Then, the whole process starts again. In particular, this meanss that after the previous batch has been deleted, the next batch will be dequeued. Now consider that we need to have exclusive access to the queue for both of these operations. As such it seems natural to combine this into a single step, further reducing potential locking contention.
+
+Note that a side-effect of this approach is that messages can be deleted only when a new batch is dequeued. With current design, this means that at least one message must reside inside the queue. Otherwise, the last batch will not be deleted. However, this something that can (and must!) be solved on the queue worker layer, in that it deletes a batch when the queue is empty.
+
+This leads us to the implementation of dequeueBatch() and deleteBatch() shown in algorithms \ref{alg_deq_batch_final} and \ref{alg_del_batch_final}. Note that $l$ is a flag variable that indicates if the queue is already locked.
+
+\begin{algorithm}
+\caption{dequeueBatch($b$): final version}
+\begin{algorithmic}
+\label{alg_deq_batch_final}
+\STATE lock queue mutex
+\STATE call deleteBatch(b, 1)
+\STATE $0 \to i$
+\WHILE{queue non-empty and $i < C_{mBatch}$}
+	\STATE obtain next obj $o$ from queue store 
+	\STATE advance dequeue position
+	\STATE put $o$ into batch
+\ENDWHILE
+\STATE commit queue changes to storage system (if needed, e.g. fsync())
+\STATE unlock queue mutex
+\end{algorithmic}
+\end{algorithm}
+
+
+\begin{algorithm}
+\caption{deleteBatch($b, l$): final version}
+\begin{algorithmic}
+\label{alg_del_batch_final}
+\IF{queue not yet locked (test via $l$)}
+	\STATE lock queue mutex
+\ENDIF
+\FORALL{objects $o$ in $b$}
+	\STATE destruct $o$
+\ENDFOR
+\STATE enqueue $b.head, |b|$ in ``to-delete'' list $D$
+\COMMENT ``to-delete'' list must be in order of logical dequeue
+\WHILE{$D.head = Q.deletePtr$}
+	\FOR{$|b|$ elements}
+		\STATE delete element at queue head
+		\STATE move $q.deletePtr$
+	\ENDFOR
+	\STATE remove head of ``to-delete'' list
+\ENDWHILE 
+\STATE commit queue changes to storage system (if needed, e.g. fsync())
+\IF{queue not yet locked (test via $l$)}
+	\STATE unlock queue mutex
+\ENDIF
+\end{algorithmic}
+\end{algorithm}
+
+\subsubsection{Queue Stores}
+Currently, rsyslog supports three different types of queue store drivers:
+
+\begin{itemize}
+\item memory array
+\item memory linked list
+\item disk sequential file
+\end{itemize}
+
+They all provide an abstracted sequential queue store as shown in figure \ref{fig_queue_ptr} on page \pageref{fig_queue_ptr}.
+
+Obviously, some differences exist. Most importantly, the disk sequential file driver does \emph{not} support more than one queue worker thread (in order to prevent excessive disk activity and the subtle issues with rewriting parts of sequential files). So if this driver is used, the queue automatically limits itself to a maximum of one worker thread (even if user configuration settings 
+
+Different queue store drivers have different properties:
+
+\begin{tabular}{|l||l|l|l|}\hline
+		&	array 	& linked list		& seqential file \\ \hline
+pointer type	& integer index	& memory address	& file number and \\
+		&		&			& offset within file \\ \hline
+physical access	& random	& random		& sequential \\ \hline
+remove middle	& no		& yes			& no \\
+elements	&		&			& \\ \hline
+access to $n$-th& $O(1)$, index:& $O(n)$, follow	& not supported \\
+element		& $n \mod C_{mMsg}$	& pointer links	& \\ \hline
+speed		& fastest	& fast			& slow \\\hline
+mem overhead	& large		& some			& almost none \\\hline
+reliability	& reliable	& reliable		& audit-grade\footnote{if configured correctly}\\
+\hline
+\end{tabular}
+
+\subsubsection{Implementation}
+The actual implementation will be based on algorithms \ref{alg_deq_batch_final} and \ref{alg_del_batch_final}. The rsyslog v3 queue storage driver will be extended one additional method, which permits non-destructive dequeueing of elements. As such, the driver now has the $qAdd()$, $qDeq()$, and $qDel()$ entry points (together with the usual construction and destruction entry points). The queue drivers must support the three pointers for enqueue, dequeue and delete. The ``to-delete'' list will be maintained on the upper queue layer (and not the queue driver layer). This functionality will be optimized so that if a batch to delete is right at the queue's delete pointer, it will immediatly be deleted and not be sent to the ``to-delete'' list. This is especially important with the sequential disk driver, as the condition here always is true (and thus the driver can pretend this in the relevant API without even comparing any pointers -- what would otherwise quite complicated in this driver.
+
+The full list of the queue store driver interface is:
+
+\paragraph{qConstruct} Initializes the queue store.
+
+\paragraph{qDestruct} Destructs the queue store, including all messages that may still be present in it.
+
+\paragraph{qAdd} Enqueue a new object into the queue. Note that this entry point must only be called when the queue is non-full.
+
+\paragraph{qDeq} Non-destructive dequeue of the object at queue head. Dequeue pointer is advanced.
+
+\paragraph{qDel} Delete the object at queue head. Delete pointer is advanced.
+
+Disk queue store drivers may support additional internal functions. However, they should not be exposed to the rest of the queue subsystem.
+
+\begin{figure}
+\begin{center}
+\includegraphics[scale=0.4]{queue_msg_state.jpeg}
+\end{center}
+\caption{Logical Message States during Queue Processing}
+\label{fig_queue_msg_state}
+\end{figure}
+
+Figure \ref{fig_queue_msg_state} shows a logical message state diagram during queue processing. There is no actual state variable, but rather the processing flow demands these state. Note that the state transition from ``dequeued'' to ``queued'' only happens after a fatal failure and a successful system recovery. So this is a rather exceptional case.
+
+Another subtle issue is that we now need two different queue size counters: one for seeing when the queue is physically full and one for detecting when there are no more messages to be dequeued.
+
+As a simplification, support for ungetting objects can be removed (as objects never leave the queue), what also means that cancel-processing is probably less complex.
+
+\paragraph{Sequential Disk Queue Store Driver}
+The enequeue, deqeueue and delete pointers must be implemented via three stream objects. Most importantly, the dequeue stream must be configured not to delete files when it closes them. A side-effect of this implementation is that data is actually read twice, once to actually obtain it and a second time to delete it. This could only be avoided by an overall redesign on how the disk queue works.
+
+\subsubsection{Checkmarks}
+The following things need to be verified in the actual implementation.
+
+\paragraph{Queue Full}
+Is it possible to set an infinte timeout on queue full condition during enqueue? If not, we must provide it.
+
+\paragraph{Termination the Queue}
+If we cancel a worker, we need to start from the physical dequeue pointer and pull everything that is not scheduled for deletion - NOT from the logical dequeue pointer.
+
+\paragraph{Failed Messages}
+If a message fails on a detached action queue, no backup processing is available (because we detect the failure at a point where the message is already considered processed from the main queue's point of view. We need address this and have two options:
+
+
+I see two approaches at handling this:
+
+a) we enable an action to configure a backup file that shall receive all
+message permanent failures. This is simple (not only to implement but to
+configure and understand)
+
+b) we push the failed message back to the main queue, but with an indication
+that it failed in an action. This is harder to implement and most importantly
+harder to understand/configure, but more flexible
+
+\section{Future Development}
+This section covers topics that can not currently be developed, but where important thoughts came up in discussions. For obvious reasons, the section has brainstorming character.
+
+\subsection{Audit-Grade High Performance Queue Storage Driver}
+An audit grade driver must ensure that no message is lost, but should also be able to handle large workloads. The sequential disk driver does not support the later.
+
+An additional disk driver is envisioned with the properties like the linked list driver, but a reliable on-disk store. In particular, random access to queue elements is desired, which requires an addressing capability.
+
+A potential implementation requires a pre-formatted file. That file is organized in pages of $n$ bytes (e.g. 1K). The page index is used to address a queue item. If an item fits into 1K, it uses one page. If it is larger than 1K, consequtive pages are used to store the element. A page header must be present to indicate how many pages a single element is made up of.
+
+It may be noted that we could even improve performance by keeping part of the data in-memory. For audit-gradeness, it is required that upon enqueue the message is written to disk and only after final processing it needs to be removed. However, it is not forbidden to keep the same message in main memory. That way, the logical dequeue operation could be done one the in-memory representation. Only the physical dequeue would need to write to disk again. As such, we save one disk read out of three writes and one read otherwise required (so one can roughly say that we save one third of disk operations.
+
+Note that due to potential multi-pages messages we can not directly address individual elements, but we can reliably and quikly address elements whom's address we know (learned, for example, during logical dequeue). This is similar to the organization of the in-memory linked list. Actally, such a store \emph{is} a linked list implementation, just that memory is allocated on disk instead of in main memory.
+
+To further improve speed, object representation could be zipped before being written to a page.
+
+File Layout
+Page 0: control structures (most importantyle queue pointers) (can make sense to store in a separate file, which could be moved to a dedicated disk subsystem - can potentially greatly reduce disk seek times).
+Page 1 to n: actual object storage
+
+Algorithms \ref{alg_AuditGradeStoreEnqueue} and \ref{alg_AuditGradeStoreDelete} show how records are enqueued and deleted. Note that the delete part does not even need to read back the record. If we keep at last some records in-memory, the performance cost of ultra-reliable mode can actually comparatively low. Note that we may not even really need to commit data to the storage system in ``AuditGradeStoreDelete()'', because if a fatal failure occurs at this point, at worst message duplication may happen, what we have considered to be acceptable.
+
+\begin{algorithm}
+\caption{AuditGradeStoreEnqueue($o$)}
+\begin{algorithmic}
+\label{alg_AuditGradeStoreEnqueue}
+\REQUIRE queue mutex is locked by caller
+\STATE write $o$ to current enqueue location
+\STATE update \& write queue structures [page 0]
+\STATE sync all files touched 
+\STATE store $o$ in an in-memory structure (or a cache)
+\end{algorithmic}
+\end{algorithm}
+
+\begin{algorithm}
+\caption{AuditGradeStoreDelete($o$)}
+\begin{algorithmic}
+\label{alg_AuditGradeStoreDelete}
+\REQUIRE queue mutex is locked by caller
+\STATE update queue dequeue pointer \& write queue structures [page 0]
+\STATE sync all files touched 
+\end{algorithmic}
+\end{algorithm}
+
+
+\end{document}
diff --git a/doc/dev_oplugins.html b/doc/dev_oplugins.html
index cc2f7f38..63c186a3 100644
--- a/doc/dev_oplugins.html
+++ b/doc/dev_oplugins.html
@@ -144,19 +144,172 @@ array-passing capability not blindly be used.</b> In such cases, we can not guar
 plugin from segfaulting and if the plugin (as currently always) is run within 
 rsyslog's process space, that results in a segfault for rsyslog. So do not do this.
 <h3>Batching of Messages</h3>
-<p>With the current plugin interface, each message is passed via a separate call to the plugin.
-This is annoying and costs performance in some uses cases (primarily for database outputs).
-However, that's the way it (currently) is, no easy way around it. There are some ideas
-to implement batching capabilities inside the rsyslog core, but without that the only
-resort is to do it inside your plugin yourself. You are not prohibited from doing so.
-There are some consequences, though: most importantly, the rsyslog core is no longer
-intersted in messages that it passed to a plugin. As such, it will not try to make sure
-the message is not lost before it was ultimately processed (because rsyslog, due to
-doAction() returning successfully, thinks the message *was* ultimately processed).
-<p>When the rsyslog core receives batching capabilities, this will be implemented in
-a way that is fully compatible to the existing plugin interface. While we have not yet
-thought about the implementation, that will probably mean that some new interfaces
-or options be used to turn on batching capabilities.
+<p>Starting with rsyslog 4.3.x, batching of output messages is supported. Previously, only
+a single-message interface was supported.
+<p>With the <b>single message</b> plugin interface, each message is passed via a separate call to the plugin.
+Most importantly, the rsyslog engine assumes that each call to the plugin is a complete transaction
+and as such assumes that messages be properly commited after the plugin returns to the engine.
+<p>With the <b>batching</b> interface, rsyslog employs something along the line of
+&quot;transactions&quot;. Obviously, the rsyslog core can not make non-transactional outputs
+to be fully transactional. But what it can is support that the output tells the core which
+messages have been commited by the output and which not yet. The core can than take care
+of those uncommited messages when problems occur. For example, if a plugin has received
+50 messages but not yet told the core that it commited them, and then returns an error state, the
+core assumes that all these 50 messages were <b>not</b> written to the output. The core then
+requeues all 50 messages and does the usual retry processing. Once the output plugin tells the 
+core that it is ready again to accept messages, the rsyslog core will provide it with these 50
+not yet commited messages again (actually, at this point, the rsyslog core no longer knows that
+it is re-submiting the messages). If, in contrary, the plugin had told rsyslog that 40 of these 50
+messages were commited (before it failed), then only 10 would have been requeued and resubmitted.
+<p>In order to provide an efficient implementation, there are some (mild) constraints in that
+transactional model: first of all, rsyslog itself specifies the ultimate transaction boundaries.
+That is, it tells the plugin when a transaction begins and when it must finish. The plugin
+is free to commit messages in between, but it <b>must</b> commit all work done when the core
+tells it that the transaction ends. All messages passed in between a begin and end transaction
+notification are called a batch of messages. They are passed in one by one, just as without
+transaction support. Note that batch sizes are variable within the range of 1 to a user configured
+maximum limit. Most importantly, that means that plugins may receive batches of single messages,
+so they are required to commit each message individually. If the plugin tries to be &quot;smarter&quot;
+than the rsyslog engine and does not commit messages in those cases (for example), the plugin
+puts message stream integrity at risk: once rsyslog has notified the plugin of transacton end,
+it discards all messages as it considers them committed and save. If now something goes wrong,
+the rsyslog core does not try to recover lost messages (and keep in mind that &quot;goes wrong&quot;
+includes such uncontrollable things like connection loss to a database server). So it is
+highly recommended to fully abide to the plugin interface details, even though you may
+think you can do it better. The second reason for that is that the core engine will 
+have configuration settings that enable the user to tune commit rate to their use-case
+specific needs. And, as a relief: why would rsyslog ever decide to use batches of one?
+There is a trivial case and that is when we have very low activity so that no queue of
+messages builds up, in which case it makes sense to commit work as it arrives.
+(As a side-note, there are some valid cases where a timeout-based commit feature makes sense.
+This is also under evaluation and, once decided, the core will offer an interface plus a way
+to preserve message stream integrity for properly-crafted plugins).
+<p>The second restriction is that if a plugin makes commits in between (what is perfectly
+legal) those commits must be in-order. So if a commit is made for message ten out of 50,
+this means that messages one to nine are also commited. It would be possible to remove
+this restriction, but we have decided to deliberately introduce it to simpify things.
+<h3>Output Plugin Transaction Interface</h3>
+<p>In order to keep compatible with existing output plugins (and because it introduces
+no complexity), the transactional plugin interface is build on the traditional
+non-transactional one. Well... actually the traditional interface was transactional
+since its introduction, in the sense that each message was processed in its own
+transaction.
+<p>So the current <code>doAction()</b> entry point can be considered to have this
+structure (from the transactional interface point of view):
+<p><pre><code>
+doAction()
+    {
+    beginTransaction()
+    ProcessMessage()
+    endTransaction()
+    }
+ </code></pre>
+<p>For the <b>transactional interface</b>, we now move these implicit <code>beginTransaction()</code>
+and <code>endTransaction(()</code> call out of the message processing body, resulting is such
+a structure:
+<p><pre><code>
+beginTransaction()
+    {
+    /* prepare for transaction */
+    }
+
+doAction()
+    {
+    ProcessMessage()
+    /* maybe do partial commits */
+    }
+
+endTransaction()
+    {
+    /* commit (rest of) batch */
+    }
+</code></pre>
+<p>And this calling structure actually is the transactional interface! It is as simple as this.
+For the new interface, the core calls a <code>beginTransaction()</code> entry point inside the
+plugin at the start of the batch. Similarly, the core call <code>endTransaction()</code> at the
+end of the batch. The plugin must implement these entry points according to its needs.
+<p>But how does the core know when to use the old or the new calling interface? This is rather
+easy: when loading a plugin, the core queries the plugin for the <code>beginTransaction()</code>
+and <code>endTransaction()</code> entry points. If the plugin supports these, the new interface is
+used. If the plugin does not support them, the old interface is used and rsyslog implies that
+a commit is done after each message. Note that there is no special "downlevel" handling
+necessary to support this. In the case of the non-transactional interface, rsyslog considers
+each completed call to <code>doAction</code> as partial commit up to the current message.
+So implementation inside the core is very straightforward.
+<p>Actually, <b>we recommend that the transactional entry points only be defined by those
+plugins that actually need them</b>. All others should not define them in which case
+the default commit behaviour inside rsyslog will apply (thus removing complexity from the
+plugin).
+<p>In order to support partial commits, special return codes must be defined for
+<code>doAction</code>. All those return codes mean that processing completed successfully.
+But they convey additional information about the commit status as follows:
+<p>
+<table border="0">
+<tr>
+<td valign="top"><i>RS_RET_OK</i></td>
+<td>The record and all previous inside the batch has been commited.
+<i>Note:</i> this definition is what makes integrating plugins without the
+transaction being/end calls so easy - this is the traditional "success" return
+state and if every call returns it, there is no need for actually calling
+<code>endTransaction()</code>, because there is no transaction open).</td>
+</tr>
+<tr>
+<td valign="top"><i>RS_RET_DEFER_COMMIT</i></td>
+<td>The record has been processed, but is not yet commited. This is the
+expected state for transactional-aware plugins.</td>
+</tr>
+<tr>
+<td valign="top"><i>RS_RET_PREVIOUS_COMMITTED</i></td>
+<td>The <b>previous</b> record inside the batch has been committed, but the
+current one not yet. This state is introduced to support sources that fill up
+buffers and commit once a buffer is completely filled. That may occur halfway
+in the next record, so it may be important to be able to tell the
+engine the everything up to the previouos record is commited</td>
+</tr>
+</table>
+<p>Note that the typical <b>calling cycle</b> is <code>beginTransaction()</code>,
+followed by <i>n</i> times
+<code>doAction()</code></n> followed by <code>endTransaction()</code>. However, if either 
+<code>beginTransaction()</code> or <code>doAction()</code> return back an error state
+(including RS_RET_SUSPENDED), then the transaction is considered aborted. In result, the
+remaining calls in this cycle (e.g. <code>endTransaction()</code>) are never made and a 
+new cycle (starting with <code>beginTransaction()</code> is begun when processing resumes.
+So an output plugin must expect and handle those partial cycles gracefully.
+<p><b>The question remains how can a plugin know if the core supports batching?</b>
+First of all, even if the engine would not know it, the plugin would return with RS_RET_DEFER_COMMIT,
+what then would be treated as an error by the engine. This would effectively disable the
+output, but cause no further harm (but may be harm enough in itself).
+<p>The real solution is to enable the plugin to query the rsyslog core if this feature is
+supported or not. At the time of the introduction of batching, no such query-interface
+exists. So we introduce it with that release. What the means is if a rsyslog core can
+not provide this query interface, it is a core that was build before batching support
+was available. So the absence of a query interface indicates that the transactional
+interface is not available. One might now be tempted the think there is no need to do
+the actual check, but is is recommended to ask the rsyslog engine explicitely if
+the transactional interface is present and will be honored. This enables us to
+create versions in the future which have, for whatever reason we do not yet know, no
+support for this interface.
+<p>The logic to do these checks is contained in the <code>INITChkCoreFeature</code> macro,
+which can be used as follows:
+<p><pre><code>
+INITChkCoreFeature(bCoreSupportsBatching, CORE_FEATURE_BATCHING);
+</code></pre>
+<p>Here, bCoreSupportsBatching is a plugin-defined integer which after execution is
+1 if batches (and thus the transational interface) is supported and 0 otherwise.
+CORE_FEATURE_BATCHING is the feature we are interested in. Future versions of rsyslog
+may contain additional feature-test-macros (you can see all of them in
+./runtime/rsyslog.h).
+<p>Note that the ompsql output plugin supports transactional mode in a hybrid way and
+thus can be considered good example code.
+
+<h2>Open Issues</h2>
+<ul>
+<li>Processing errors handling
+<li>reliable re-queue during error handling and queue termination
+</ul>
+
+
+
 <h3>Licensing</h3>
 <p>From the rsyslog point of view, plugins constitute separate projects. As such,
 we think plugins are not required to be compatible with GPLv3. However, this is
diff --git a/doc/manual.html b/doc/manual.html
index 6770ce7e..2af2e564 100644
--- a/doc/manual.html
+++ b/doc/manual.html
@@ -19,7 +19,7 @@ rsyslog support</a> available directly from the source!</p>
 <p><b>Please visit the <a href="http://www.rsyslog.com/sponsors">rsyslog sponsor's page</a>
 to honor the project sponsors or become one yourself!</b> We are very grateful for any help towards the
 project goals.</p>
-<p><b>This documentation is for version 4.5.0 (beta branch) of rsyslog.</b>
+<p><b>This documentation is for version 5.1.1 (devel branch) of rsyslog.</b>
 Visit the <i> <a href="http://www.rsyslog.com/doc-status.html">rsyslog status page</a></i></b> to obtain current
 version information and project status.
 </p><p><b>If you like rsyslog, you might
diff --git a/doc/queue_msg_state.dot b/doc/queue_msg_state.dot
new file mode 100644
index 00000000..bfef2657
--- /dev/null
+++ b/doc/queue_msg_state.dot
@@ -0,0 +1,25 @@
+// This file is part of rsyslog.
+// 
+// rsyslog message state in queue processing
+//
+// see http://www.graphviz.org for how to obtain the graphviz processor
+// which is used to build the actual graph.
+//
+// generate the graph with
+// $ dot file.dot -Tpng >file.png
+
+digraph msgState {
+	rankdir=LR
+
+	prod	[label="producer" style="dotted" shape="box"]
+	que	[label="queued"]
+	deq	[label="dequeued"]
+	del	[label="deleted"]
+
+	prod -> que 	[label="qEnq()" style="dotted"]
+	que  -> deq	[label="qDeq()"]
+	deq  -> del	[label="qDel()"]
+	deq  -> que	[label="fatal failure\n& restart"]
+
+	//{rank=same; del apf pdn }
+}
diff --git a/doc/queue_msg_state.jpeg b/doc/queue_msg_state.jpeg
new file mode 100644
index 00000000..a215f000
--- /dev/null
+++ b/doc/queue_msg_state.jpeg
diff --git a/doc/queues.html b/doc/queues.html
index 45ce1bd1..75b70fbf 100644
--- a/doc/queues.html
+++ b/doc/queues.html
@@ -336,6 +336,33 @@ in this regard - it was just not requested so far. So if you need more
 fine-grained control, let us know and we'll probably implement it.
 There are two configuration directives, both should be used together or
 results are unpredictable:" <i>$&lt;object&gt;QueueDequeueTimeBegin &lt;hour&gt;</i>" and&nbsp;"<i>$&lt;object&gt;QueueDequeueTimeEnd &lt;hour&gt;</i>". The hour parameter must be specified in 24-hour format (so 10pm is 22). A use case for this parameter can be found in the <a href="http://wiki.rsyslog.com/index.php/OffPeakHours">rsyslog wiki</a>. </p>
+<h2>Performance</h2>
+<p>The locking involved with maintaining the queue has a potentially large
+performance impact. How large this is, and if it exists at all, depends much on
+the configuration and actual use case. However, the queue is able to work on
+so-called &quot;batches&quot; when dequeueing data elements. With batches,
+multiple data elements are dequeued at once (with a single locking call).
+The queue dequeues all available elements up to a configured upper
+limit (<i>&lt;object&gt;DequeueBatchSize &lt;number&gt;</i>). It is important
+to note that the actual upper limit is dictated by availability. The queue engine
+will never wait for a batch to fill. So even if a high upper limit is configured,
+batches may consist of fewer elements, even just one, if there are no more elements
+waiting in the queue.
+<p>Batching
+can improve performance considerably. Note, however, that it affects the
+order in which messages are passed to the queue worker threads, as each worker
+now receive as batch of messages. Also, the larger the batch size and the higher
+the maximum number of permitted worker threads, the more main memory is needed. 
+For a busy server, large batch sizes (around 1,000 or even more elements) may be useful.
+Please note that with batching, the main memory must hold BatchSize * NumOfWorkers
+objects in memory (worst-case scenario), even if running in disk-only mode. So if you
+use the default 5 workers at the main message queue and set the batch size to 1,000, you need
+to be prepared that the main message queue holds up to 5,000 messages in main memory
+<b>in addition</b> to the configured queue size limits!
+<p>The queue object's default maximum batch size
+is eight, but there exists different defaults for the actual parts of
+rsyslog processing that utilize queues. So you need to check these object's
+defaults.
 <h2>Terminating Queues</h2>
 <p>Terminating a process sounds easy, but can be complex.
 Terminating a running queue is in fact the most complex operation a queue 
diff --git a/doc/rsyslog_conf_global.html b/doc/rsyslog_conf_global.html
index 03842758..2371c4ae 100644
--- a/doc/rsyslog_conf_global.html
+++ b/doc/rsyslog_conf_global.html
@@ -58,6 +58,7 @@ default template for UDP and plain TCP forwarding action</li>
 <li>$ActionGSSForwardDefaultTemplate [templateName] - sets a
 new default template for GSS-API forwarding action</li>
 <li>$ActionQueueCheckpointInterval &lt;number&gt;</li>
+<li>$ActionQueueDequeueBatchSize &lt;number&gt; [default 16]</li>
 <li>$ActionQueueDequeueSlowdown &lt;number&gt; [number
 is timeout in <i> micro</i>seconds (1000000us is 1sec!),
 default 0 (no delay). Simple rate-limiting!]</li>
@@ -135,6 +136,7 @@ not recommended for use with rsyslog. To do a full restart, simply stop and star
 compatibility reasons. If it is set to "off", a HUP will only close open files. This is a much quicker action and usually
 the only one that is needed e.g. for log rotation. <b>It is recommended to set the setting to "off".</b></li>
 <li><a href="rsconf1_includeconfig.html">$IncludeConfig</a></li><li>MainMsgQueueCheckpointInterval &lt;number&gt;</li>
+<li>$MainMsgQueueDequeueBatchSize &lt;number&gt; [default 32]</li>
 <li>$MainMsgQueueDequeueSlowdown &lt;number&gt; [number
 is timeout in <i> micro</i>seconds (1000000us is 1sec!),
 default 0 (no delay). Simple rate-limiting!]</li>
diff --git a/doc/rsyslog_queue_pointers.jpeg b/doc/rsyslog_queue_pointers.jpeg
new file mode 100644
index 00000000..809dd446
--- /dev/null
+++ b/doc/rsyslog_queue_pointers.jpeg
diff --git a/doc/rsyslog_queue_pointers2.jpeg b/doc/rsyslog_queue_pointers2.jpeg
new file mode 100644
index 00000000..2ad60113
--- /dev/null
+++ b/doc/rsyslog_queue_pointers2.jpeg
diff --git a/doc/src/rsyslog_queue_pointers.dia b/doc/src/rsyslog_queue_pointers.dia
new file mode 100644
index 00000000..2ad4cacb
--- /dev/null
+++ b/doc/src/rsyslog_queue_pointers.dia
diff --git a/doc/src/rsyslog_queue_pointers2.dia b/doc/src/rsyslog_queue_pointers2.dia
new file mode 100644
index 00000000..6a35c664
--- /dev/null
+++ b/doc/src/rsyslog_queue_pointers2.dia
diff --git a/doc/status.html b/doc/status.html
index 4e8f1a5f..272b49bc 100644
--- a/doc/status.html
+++ b/doc/status.html
@@ -2,23 +2,30 @@
 <html><head><title>rsyslog status page</title></head>
 <body>
 <h2>rsyslog status page</h2>
-<p>This page reflects the status as of 2009-05-25.</p>
+<p>This page reflects the status as of 2009-06-24.</p>
 <h2>Current Releases</h2>
 
-<p><b>development:</b> 4.3.1 [2009-05-25] -
-<a href="http://www.rsyslog.com/Article372.phtml">change log</a> -
-<a href="http://www.rsyslog.com/Downloads-req-viewdownloaddetails-lid-159.phtml">download</a>
+<p><b>v5 development:</b> 5.1.0 [2009-05-29] -
+<a href="http://www.rsyslog.com/Article374.phtml">change log</a> -
+<a href="http://www.rsyslog.com/Downloads-req-viewdownloaddetails-lid-160.phtml">download</a>
 
-<br><b>beta:</b> 3.21.11 [2009-04-03] -
-<a href="http://www.rsyslog.com/Article358.phtml">change log</a> -
-<a href="http://www.rsyslog.com/Downloads-req-viewdownloaddetails-lid-152.phtml">download</a></p>
+<br><b>v4 development:</b> currently no version other than beta
 
-<p><b>v3 stable:</b> 3.22.0 [2009-04-21] - <a href="http://www.rsyslog.com/Article368.phtml">change log</a> -
+<br><b>beta:</b> 4.3.2 [2009-06-24] -
+<a href="http://www.rsyslog.com/Article378.phtml">change log</a> -
+<a href="http://www.rsyslog.com/Downloads-req-viewdownloaddetails-lid-162.phtml">download</a></p>
+
+<p><b>v4 stable:</b> 4.2.0 [2009-06-23] -
+<a href="http://www.rsyslog.com/Article376.phtml">change log</a> -
+<a href="http://www.rsyslog.com/Downloads-req-viewdownloaddetails-lid-161.phtml">download</a>
+
+<br><b>v3 stable:</b> 3.22.0 [2009-04-21] -
+<a href="http://www.rsyslog.com/Article368.phtml">change log</a> -
 <a href="http://www.rsyslog.com/Downloads-req-viewdownloaddetails-lid-157.phtml">download</a>
 
-<br><b>v2 stable:</b> 2.0.7 [2009-04-14] - <a href="http://www.rsyslog.com/Article362.phtml">change log</a> -
+<br>v2 stable: 2.0.7 [2009-04-14] - <a href="http://www.rsyslog.com/Article362.phtml">change log</a> -
 <a href="http://www.rsyslog.com/Downloads-req-viewdownloaddetails-lid-154.phtml">download</a>
-<br>v0 and v1 are deprecated and no longer supported. If you absolutely do not like to
+<br>v0 to v2 are deprecated and no longer supported. If you absolutely do not like to
 upgrade, you may consider purchasing a
 <a href="professional_support.html">commercial rsyslog support package</a>. Just let us point
 out that it is really not a good idea to still run a v0 version.
diff --git a/plugins/imdiag/imdiag.c b/plugins/imdiag/imdiag.c
index bf972191..7cacc44a 100644
--- a/plugins/imdiag/imdiag.c
+++ b/plugins/imdiag/imdiag.c
@@ -246,7 +246,7 @@ injectMsg(uchar *pszCmd, tcps_sess_t *pSess)
 		doInjectMsg(i + iFrom);
 	}
 
-	CHKiRet(sendResponse(pSess, "messages injected\n"));
+	CHKiRet(sendResponse(pSess, "%d messages injected\n", nMsgs));
 
 finalize_it:
 	RETiRet;
@@ -259,10 +259,13 @@ static rsRetVal
 waitMainQEmpty(tcps_sess_t *pSess)
 {
 	int iMsgQueueSize;
+	int iPrint = 0;
 	DEFiRet;
 
 	CHKiRet(diagGetMainMsgQSize(&iMsgQueueSize));
 	while(iMsgQueueSize > 0) {
+		if(iPrint++ % 500 == 0) 
+			dbgprintf("imdiag sleeping, wait mainq drain, curr size %d\n", iMsgQueueSize);
 		srSleep(0,2);	/* wait a little bit */
 		CHKiRet(diagGetMainMsgQSize(&iMsgQueueSize));
 	}
@@ -297,6 +300,7 @@ OnMsgReceived(tcps_sess_t *pSess, uchar *pRcv, int iLenMsg)
 
 	getFirstWord(&pszMsg, cmdBuf, sizeof(cmdBuf)/sizeof(uchar), TO_LOWERCASE);
 
+	dbgprintf("imdiag received command '%s'\n", cmdBuf);
 	if(!ustrcmp(cmdBuf, UCHAR_CONSTANT("getmainmsgqueuesize"))) {
 		CHKiRet(diagGetMainMsgQSize(&iMsgQueueSize));
 		CHKiRet(sendResponse(pSess, "%d\n", iMsgQueueSize));
diff --git a/plugins/imuxsock/imuxsock.c b/plugins/imuxsock/imuxsock.c
index 424d0904..6d60efaf 100644
--- a/plugins/imuxsock/imuxsock.c
+++ b/plugins/imuxsock/imuxsock.c
@@ -333,11 +333,8 @@ CODESTARTafterRun
 		if (funixn[i] && funix[i] != -1)
 			unlink((char*) funixn[i]);
 	/* free no longer needed string */
-	if(pLogSockName != NULL)
-		free(pLogSockName);
-	if(pLogHostName != NULL) {
-		free(pLogHostName);
-	}
+	free(pLogSockName);
+	free(pLogHostName);
 
 	discardFunixn();
 	nfunix = 1;
diff --git a/plugins/ompgsql/ompgsql.c b/plugins/ompgsql/ompgsql.c
index eb774835..cb6b6a4d 100644
--- a/plugins/ompgsql/ompgsql.c
+++ b/plugins/ompgsql/ompgsql.c
@@ -170,6 +170,9 @@ tryExec(uchar *pszCmd, instanceData *pData)
 	int bHadError = 0;
 
 	/* try insert */
+BEGINfunc
+RUNLOG_VAR("%p", pData->f_hpgsql);
+RUNLOG_VAR("%s", pszCmd);
 	pgRet = PQexec(pData->f_hpgsql, (char*)pszCmd);
 	execState = PQresultStatus(pgRet);
 	if(execState != PGRES_COMMAND_OK && execState != PGRES_TUPLES_OK) {
@@ -178,6 +181,7 @@ tryExec(uchar *pszCmd, instanceData *pData)
 	}
 	PQclear(pgRet);
 
+ENDfunc
 	return(bHadError);
 }
 
@@ -230,6 +234,14 @@ CODESTARTtryResume
 	}
 ENDtryResume
 
+
+BEGINbeginTransaction
+CODESTARTbeginTransaction
+dbgprintf("ompgsql: beginTransaction\n");
+	iRet = writePgSQL((uchar*) "begin", pData); /* TODO: make user-configurable */
+ENDbeginTransaction
+
+
 BEGINdoAction
 CODESTARTdoAction
 	dbgprintf("\n");
@@ -237,6 +249,13 @@ CODESTARTdoAction
 ENDdoAction
 
 
+BEGINendTransaction
+CODESTARTendTransaction
+	iRet = writePgSQL((uchar*) "commit;", pData); /* TODO: make user-configurable */
+dbgprintf("ompgsql: endTransaction\n");
+ENDendTransaction
+
+
 BEGINparseSelectorAct
 	int iPgSQLPropErr = 0;
 CODESTARTparseSelectorAct
@@ -314,6 +333,7 @@ ENDmodExit
 BEGINqueryEtryPt
 CODESTARTqueryEtryPt
 CODEqueryEtryPt_STD_OMOD_QUERIES
+CODEqueryEtryPt_TXIF_OMOD_QUERIES /* we support the transactional interface! */
 ENDqueryEtryPt
 
 
@@ -322,6 +342,8 @@ CODESTARTmodInit
 	*ipIFVersProvided = CURR_MOD_IF_VERSION; /* we only support the current interface specification */
 CODEmodInit_QueryRegCFSLineHdlr
 	CHKiRet(objUse(errmsg, CORE_COMPONENT));
+	INITChkCoreFeature(bCoreSupportsBatching, CORE_FEATURE_BATCHING);
+	DBGPRINTF("ompgsql: %susing transactional output interface.\n", bCoreSupportsBatching ? "" : "not ");
 ENDmodInit
 /* vi:set ai:
  */
diff --git a/plugins/omtesting/omtesting.c b/plugins/omtesting/omtesting.c
index 411bcf88..8f6cdbe5 100644
--- a/plugins/omtesting/omtesting.c
+++ b/plugins/omtesting/omtesting.c
@@ -22,7 +22,7 @@
  * NOTE: read comments in module-template.h to understand how this file
  *       works!
  *
- * Copyright 2007 Rainer Gerhards and Adiscon GmbH.
+ * Copyright 2007, 2009 Rainer Gerhards and Adiscon GmbH.
  *
  * This file is part of rsyslog.
  *
@@ -46,12 +46,14 @@
 #include <stdio.h>
 #include <stdarg.h>
 #include <stdlib.h>
+#include <time.h>
 #include <string.h>
 #include <ctype.h>
 #include <assert.h>
 #include "dirty.h"
 #include "syslogd-types.h"
 #include "module-template.h"
+#include "cfsysline.h"
 
 MODULE_TYPE_OUTPUT
 
@@ -59,9 +61,18 @@ MODULE_TYPE_OUTPUT
  */
 DEF_OMOD_STATIC_DATA
 
+static int bEchoStdout = 0;	/* echo non-failed messages to stdout */
+
 typedef struct _instanceData {
+	enum { MD_SLEEP, MD_FAIL, MD_RANDFAIL, MD_ALWAYS_SUSPEND }
+		mode;
+	int	bEchoStdout;
 	int	iWaitSeconds;
 	int	iWaitUSeconds;	/* milli-seconds (one million of a second, just to make sure...) */
+	int 	iCurrCallNbr;
+	int	iFailFrequency;
+	int	iResumeAfter;
+	int	iCurrRetries;
 } instanceData;
 
 BEGINcreateInstance
@@ -85,19 +96,106 @@ CODESTARTisCompatibleWithFeature
 ENDisCompatibleWithFeature
 
 
-BEGINtryResume
-CODESTARTtryResume
-ENDtryResume
+/* implement "fail" command in retry processing */
+static rsRetVal doFailOnResume(instanceData *pData)
+{
+	DEFiRet;
 
-BEGINdoAction
-CODESTARTdoAction
+	dbgprintf("fail retry curr %d, max %d\n", pData->iCurrRetries, pData->iResumeAfter);
+	if(++pData->iCurrRetries == pData->iResumeAfter) {
+		iRet = RS_RET_OK;
+	} else {
+		iRet = RS_RET_SUSPENDED;
+	}
+
+	RETiRet;
+}
+
+
+/* implement "fail" command */
+static rsRetVal doFail(instanceData *pData)
+{
+	DEFiRet;
+
+	dbgprintf("fail curr %d, frquency %d\n", pData->iCurrCallNbr, pData->iFailFrequency);
+	if(pData->iCurrCallNbr++ % pData->iFailFrequency == 0) {
+		pData->iCurrRetries = 0;
+		iRet = RS_RET_SUSPENDED;
+	}
+
+	RETiRet;
+}
+
+
+/* implement "sleep" command */
+static rsRetVal doSleep(instanceData *pData)
+{
+	DEFiRet;
 	struct timeval tvSelectTimeout;
 
 	dbgprintf("sleep(%d, %d)\n", pData->iWaitSeconds, pData->iWaitUSeconds);
 	tvSelectTimeout.tv_sec = pData->iWaitSeconds;
 	tvSelectTimeout.tv_usec = pData->iWaitUSeconds; /* milli seconds */
 	select(0, NULL, NULL, NULL, &tvSelectTimeout);
-	//dbgprintf(":omtesting: end doAction(), iRet %d\n", iRet);
+	RETiRet;
+}
+
+
+/* implement "randomfail" command */
+static rsRetVal doRandFail(void)
+{
+	DEFiRet;
+	if((rand() >> 4) < (RAND_MAX >> 5)) { /* rougly same probability */
+		iRet = RS_RET_OK;
+		dbgprintf("omtesting randfail: succeeded this time\n");
+	} else {
+		iRet = RS_RET_SUSPENDED;
+		dbgprintf("omtesting randfail: failed this time\n");
+	}
+	RETiRet;
+}
+
+
+BEGINtryResume
+CODESTARTtryResume
+	dbgprintf("omtesting tryResume() called\n");
+	switch(pData->mode) {
+		case MD_SLEEP:
+			break;
+		case MD_FAIL:
+			iRet = doFailOnResume(pData);
+			break;
+		case MD_RANDFAIL:
+			iRet = doRandFail();
+			break;
+		case MD_ALWAYS_SUSPEND:
+			iRet = RS_RET_SUSPENDED;
+	}
+	dbgprintf("omtesting tryResume() returns iRet %d\n", iRet);
+ENDtryResume
+
+
+BEGINdoAction
+CODESTARTdoAction
+	dbgprintf("omtesting received msg '%s'\n", ppString[0]);
+	switch(pData->mode) {
+		case MD_SLEEP:
+			iRet = doSleep(pData);
+			break;
+		case MD_FAIL:
+			iRet = doFail(pData);
+			break;
+		case MD_RANDFAIL:
+			iRet = doRandFail();
+		case MD_ALWAYS_SUSPEND:
+			iRet = RS_RET_SUSPENDED;
+	}
+
+	if(iRet == RS_RET_OK && pData->bEchoStdout) {
+		fprintf(stdout, "%s", ppString[0]);
+		fflush(stdout);
+	}
+	dbgprintf(":omtesting: end doAction(), iRet %d\n", iRet);
 ENDdoAction
 
 
@@ -113,7 +211,7 @@ BEGINparseSelectorAct
 	int i;
 	uchar szBuf[1024];
 CODESTARTparseSelectorAct
-CODE_STD_STRING_REQUESTparseSelectorAct(0)
+CODE_STD_STRING_REQUESTparseSelectorAct(1)
 	/* code here is quick and dirty - if you like, clean it up. But keep
 	 * in mind it is just a testing aid ;) -- rgerhards, 2007-12-31
 	 */
@@ -135,6 +233,7 @@ CODE_STD_STRING_REQUESTparseSelectorAct(0)
 	if(isspace(*p))
 		++p;
 
+	dbgprintf("omtesting command: '%s'\n", szBuf);
 	if(!strcmp((char*) szBuf, "sleep")) {
 		/* parse seconds */
 		for(i = 0 ; *p && !isspace(*p) && ((unsigned) i < sizeof(szBuf) - 1) ; ++i) {
@@ -152,12 +251,43 @@ CODE_STD_STRING_REQUESTparseSelectorAct(0)
 		if(isspace(*p))
 			++p;
 		pData->iWaitUSeconds = atoi((char*) szBuf);
-	}
-	/* once there are other modes, here is the spot to add it! */
-	else {
+		pData->mode = MD_SLEEP;
+	} else if(!strcmp((char*) szBuf, "fail")) {
+		/* "fail fail-freqency resume-after"
+		 * fail-frequency specifies how often doAction() fails
+		 * resume-after speicifes how fast tryResume() should come back with success
+		 * all numbers being "times called"
+		 */
+		/* parse fail-frequence */
+		for(i = 0 ; *p && !isspace(*p) && ((unsigned) i < sizeof(szBuf) - 1) ; ++i) {
+			szBuf[i] = *p++;
+		}
+		szBuf[i] = '\0';
+		if(isspace(*p))
+			++p;
+		pData->iFailFrequency = atoi((char*) szBuf);
+		/* parse resume-after */
+		for(i = 0 ; *p && !isspace(*p) && ((unsigned) i < sizeof(szBuf) - 1) ; ++i) {
+			szBuf[i] = *p++;
+		}
+		szBuf[i] = '\0';
+		if(isspace(*p))
+			++p;
+		pData->iResumeAfter = atoi((char*) szBuf);
+		pData->iCurrCallNbr = 1;
+		pData->mode = MD_FAIL;
+	} else if(!strcmp((char*) szBuf, "randfail")) {
+		pData->mode = MD_RANDFAIL;
+	} else if(!strcmp((char*) szBuf, "always_suspend")) {
+		pData->mode = MD_ALWAYS_SUSPEND;
+	} else {
 		dbgprintf("invalid mode '%s', doing 'sleep 1 0' - fix your config\n", szBuf);
 	}
 
+	pData->bEchoStdout = bEchoStdout;
+	CHKiRet(cflineParseTemplateName(&p, *ppOMSR, 0, OMSR_NO_RQD_TPL_OPTS,
+				         (uchar*)"RSYSLOG_TraditionalForwardFormat"));
+
 CODE_STD_FINALIZERparseSelectorAct
 ENDparseSelectorAct
 
@@ -177,6 +307,10 @@ BEGINmodInit()
 CODESTARTmodInit
 	*ipIFVersProvided = CURR_MOD_IF_VERSION; /* we only support the current interface specification */
 CODEmodInit_QueryRegCFSLineHdlr
+	CHKiRet(omsdRegCFSLineHdlr((uchar *)"actionomtestingechostdout", 0, eCmdHdlrBinary, NULL,
+				   &bEchoStdout, STD_LOADABLE_MODULE_ID));
+	/* we seed the random-number generator in any case... */
+	srand(time(NULL));
 ENDmodInit
 /*
  * vi:set ai:
diff --git a/runtime/Makefile.am b/runtime/Makefile.am
index 14abe722..caf7c5ca 100644
--- a/runtime/Makefile.am
+++ b/runtime/Makefile.am
@@ -9,6 +9,7 @@ librsyslog_la_SOURCES = \
 	rsyslog.h \
 	unicode-helper.h \
 	atomic.h \
+	batch.h \
 	syslogd-types.h \
 	module-template.h \
 	obj-types.h \
diff --git a/runtime/apc.c b/runtime/apc.c
index 5919191d..bc330e39 100644
--- a/runtime/apc.c
+++ b/runtime/apc.c
@@ -335,9 +335,11 @@ CancelApc(apc_id_t id)
 {
 	DEFVARS_mutexProtection_uncond;
 
+	BEGINfunc
 	BEGIN_MTX_PROTECTED_OPERATIONS_UNCOND(&listMutex);
 	deleteApc(id);
 	END_MTX_PROTECTED_OPERATIONS_UNCOND(&listMutex);
+	ENDfunc
 	return RS_RET_OK;
 }
 
diff --git a/runtime/atomic.h b/runtime/atomic.h
index d5aaf56b..f0733f09 100644
--- a/runtime/atomic.h
+++ b/runtime/atomic.h
@@ -41,6 +41,8 @@
  * They simply came in too late. -- rgerhards, 2008-04-02
  */
 #ifdef HAVE_ATOMIC_BUILTINS
+#	define ATOMIC_SUB(data, val) __sync_fetch_and_sub(&(data), val)
+#	define ATOMIC_ADD(data, val) __sync_fetch_and_add(&(data), val)
 #	define ATOMIC_INC(data) ((void) __sync_fetch_and_add(&(data), 1))
 #	define ATOMIC_INC_AND_FETCH(data) __sync_fetch_and_add(&(data), 1)
 #	define ATOMIC_DEC(data) ((void) __sync_sub_and_fetch(&(data), 1))
diff --git a/runtime/batch.h b/runtime/batch.h
new file mode 100644
index 00000000..031718a7
--- /dev/null
+++ b/runtime/batch.h
@@ -0,0 +1,72 @@
+/* Definition of the batch_t data structure.
+ * I am not sure yet if this will become a full-blown object. For now, this header just
+ * includes the object definition and is not accompanied by code.
+ *
+ * Copyright 2009 by Rainer Gerhards and Adiscon GmbH.
+ *
+ * This file is part of the rsyslog runtime library.
+ *
+ * The rsyslog runtime library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * The rsyslog runtime library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with the rsyslog runtime library.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * A copy of the GPL can be found in the file "COPYING" in this distribution.
+ * A copy of the LGPL can be found in the file "COPYING.LESSER" in this distribution.
+ */
+
+#ifndef BATCH_H_INCLUDED
+#define BATCH_H_INCLUDED
+
+/* enum for batch states. Actually, we violate a layer here, in that we assume that a batch is used
+ * for action processing. So far, this seems acceptable, the status is simply ignored inside the
+ * main message queue. But over time, it could potentially be useful to split the two.
+ * rgerhad, 2009-05-12
+ */
+typedef enum {
+	BATCH_STATE_RDY  = 0,	/* object ready for processing */
+	BATCH_STATE_BAD  = 1,	/* unrecoverable failure while processing, do NOT resubmit to same action */
+	BATCH_STATE_SUB  = 2,	/* message submitted for processing, outcome yet unkonwn */
+	BATCH_STATE_COMM = 3,	/* message successfully commited */
+	BATCH_STATE_DISC = 4, 	/* discarded - processed OK, but do not submit to any other action */
+} batch_state_t;
+
+
+/* an object inside a batch, including any information (state!) needed for it to "life".
+ */
+struct batch_obj_s {
+	obj_t *pUsrp;		/* pointer to user object (most often message) */
+	batch_state_t state;	/* associated state */
+};
+
+/* the batch
+ * This object is used to dequeue multiple user pointers which are than handed over
+ * to processing. The size of elements is fixed after queue creation, but may be 
+ * modified by config variables (better said: queue properties).
+ * Note that a "user pointer" in rsyslog context so far always is a message 
+ * object. We stick to the more generic term because queues may potentially hold
+ * other types of objects, too.
+ * rgerhards, 2009-05-12
+ * Note that nElem is not necessarily equal to nElemDeq. This is the case when we
+ * discard some elements (because of configuration) during dequeue processing. As
+ * all Elements are only deleted when the batch is processed, we can not immediately
+ * delete them. So we need to keep their number that we can delete them when the batch
+ * is completed (else, the whole process does not work correctly).
+ */
+struct batch_s {
+	int nElem;		/* actual number of element in this entry */
+	int nElemDeq;		/* actual number of elements dequeued (and thus to be deleted) - see comment above! */
+	int iDoneUpTo;		/* all messages below this index have state other than RDY */
+	qDeqID	deqID;		/* ID of dequeue operation that generated this batch */
+	batch_obj_t *pElem;	/* batch elements */
+};
+
+#endif /* #ifndef BATCH_H_INCLUDED */
diff --git a/runtime/conf.c b/runtime/conf.c
index 83ed2e9b..84178fd5 100644
--- a/runtime/conf.c
+++ b/runtime/conf.c
@@ -1102,7 +1102,7 @@ static rsRetVal cflineDoAction(uchar **p, action_t **ppAction)
 					dbgprintf("module is incompatible with RepeatedMsgReduction - turned off\n");
 					pAction->f_ReduceRepeated = 0;
 				}
-				pAction->bEnabled = 1; /* action is enabled */
+				pAction->eState = ACT_STATE_RDY; /* action is enabled */
 				iNbrActions++;	/* one more active action! */
 			}
 			break;
diff --git a/runtime/debug.c b/runtime/debug.c
index 4ee90226..248c5ea3 100644
--- a/runtime/debug.c
+++ b/runtime/debug.c
@@ -828,13 +828,12 @@ sigsegvHdlr(int signum)
 	abort();
 }
 
-#if 1
-#pragma GCC diagnostic ignored "-Wempty-body"
-/* write the debug message. This is a helper to dbgprintf and dbgoprint which
- * contains common code. added 2008-09-26 rgerhards
+/* actually write the debug message. This is a separate fuction because the cleanup_push/_pop
+ * interface otherwise is unsafe to use (generates compiler warnings at least).
+ * 2009-05-20 rgerhards
  */
-static void
-dbgprint(obj_t *pObj, char *pszMsg, size_t lenMsg)
+static inline void
+do_dbgprint(uchar *pszObjName, char *pszMsg, size_t lenMsg)
 {
 	static pthread_t ptLastThrdID = 0;
 	static int bWasNL = 0;
@@ -842,20 +841,6 @@ dbgprint(obj_t *pObj, char *pszMsg, size_t lenMsg)
 	char pszWriteBuf[1024];
 	size_t lenWriteBuf;
 	struct timespec t;
-	uchar *pszObjName = NULL;
-
-	/* we must get the object name before we lock the mutex, because the object
-	 * potentially calls back into us. If we locked the mutex, we would deadlock
-	 * ourselfs. On the other hand, the GetName call needs not to be protected, as
-	 * this thread has a valid reference. If such an object is deleted by another
-	 * thread, we are in much more trouble than just for dbgprint(). -- rgerhards, 2008-09-26
-	 */
-	if(pObj != NULL) {
-		pszObjName = obj.GetName(pObj);
-	}
-
-	pthread_mutex_lock(&mutdbgprint);
-	pthread_cleanup_push(dbgMutexCancelCleanupHdlr, &mutdbgprint);
 
 	/* The bWasNL handler does not really work. It works if no thread
 	 * switching occurs during non-NL messages. Else, things are messed
@@ -903,11 +888,35 @@ dbgprint(obj_t *pObj, char *pszMsg, size_t lenMsg)
 	if(altdbg != -1) write(altdbg, pszMsg, lenMsg);
 
 	bWasNL = (pszMsg[lenMsg - 1] == '\n') ? 1 : 0;
+}
+
+#pragma GCC diagnostic ignored "-Wempty-body"
+/* write the debug message. This is a helper to dbgprintf and dbgoprint which
+ * contains common code. added 2008-09-26 rgerhards
+ */
+static void
+dbgprint(obj_t *pObj, char *pszMsg, size_t lenMsg)
+{
+	uchar *pszObjName = NULL;
+
+	/* we must get the object name before we lock the mutex, because the object
+	 * potentially calls back into us. If we locked the mutex, we would deadlock
+	 * ourselfs. On the other hand, the GetName call needs not to be protected, as
+	 * this thread has a valid reference. If such an object is deleted by another
+	 * thread, we are in much more trouble than just for dbgprint(). -- rgerhards, 2008-09-26
+	 */
+	if(pObj != NULL) {
+		pszObjName = obj.GetName(pObj);
+	}
+
+	pthread_mutex_lock(&mutdbgprint);
+	pthread_cleanup_push(dbgMutexCancelCleanupHdlr, &mutdbgprint);
+
+	do_dbgprint(pszObjName, pszMsg, lenMsg);
 
 	pthread_cleanup_pop(1);
 }
 #pragma GCC diagnostic warning "-Wempty-body"
-#endif
 
 /* print some debug output when an object is given
  * This is mostly a copy of dbgprintf, but I do not know how to combine it
@@ -1050,7 +1059,9 @@ int dbgEntrFunc(dbgFuncDB_t **ppFuncDB, const char *file, const char *func, int
 	/* when we reach this point, we have a fully-initialized FuncDB! */
 	ATOMIC_INC(pFuncDB->nTimesCalled);
 	if(bLogFuncFlow && dbgPrintNameIsInList((const uchar*)pFuncDB->file, printNameFileRoot))
-		dbgprintf("%s:%d: %s: enter\n", pFuncDB->file, pFuncDB->line, pFuncDB->func);
+		if(strcmp(pFuncDB->file, "stringbuf.c")) {	/* TODO: make configurable */
+			dbgprintf("%s:%d: %s: enter\n", pFuncDB->file, pFuncDB->line, pFuncDB->func);
+		}
 	if(pThrd->stackPtr >= (int) (sizeof(pThrd->callStack) / sizeof(dbgFuncDB_t*))) {
 		dbgprintf("%s:%d: %s: debug module: call stack for this thread full, suspending call tracking\n",
 			  pFuncDB->file, pFuncDB->line, pFuncDB->func);
@@ -1080,10 +1091,12 @@ void dbgExitFunc(dbgFuncDB_t *pFuncDB, int iStackPtrRestore, int iRet)
 
 	dbgFuncDBPrintActiveMutexes(pFuncDB, "WARNING: mutex still owned by us as we exit function, mutex: ", pthread_self());
 	if(bLogFuncFlow && dbgPrintNameIsInList((const uchar*)pFuncDB->file, printNameFileRoot)) {
-		if(iRet == RS_RET_NO_IRET)
-			dbgprintf("%s:%d: %s: exit: (no iRet)\n", pFuncDB->file, pFuncDB->line, pFuncDB->func);
-		else 
-			dbgprintf("%s:%d: %s: exit: %d\n", pFuncDB->file, pFuncDB->line, pFuncDB->func, iRet);
+		if(strcmp(pFuncDB->file, "stringbuf.c")) {	/* TODO: make configurable */
+			if(iRet == RS_RET_NO_IRET)
+				dbgprintf("%s:%d: %s: exit: (no iRet)\n", pFuncDB->file, pFuncDB->line, pFuncDB->func);
+			else 
+				dbgprintf("%s:%d: %s: exit: %d\n", pFuncDB->file, pFuncDB->line, pFuncDB->func, iRet);
+		}
 	}
 	pThrd->stackPtr = iStackPtrRestore;
 	if(pThrd->stackPtr < 0) {
diff --git a/runtime/debug.h b/runtime/debug.h
index 1375493d..8b66d784 100644
--- a/runtime/debug.h
+++ b/runtime/debug.h
@@ -134,8 +134,8 @@ void dbgPrintAllDebugInfo(void);
 
 
 /* debug aides */
-//#ifdef RTINST
-#if 0 // temporarily removed for helgrind
+#ifdef RTINST
+//#if 0 // temporarily removed for helgrind
 #define d_pthread_mutex_lock(x)      dbgMutexLock(x, pdbgFuncDB, __LINE__, dbgCALLStaCK_POP_POINT )
 #define d_pthread_mutex_trylock(x)   dbgMutexTryLock(x, pdbgFuncDB, __LINE__, dbgCALLStaCK_POP_POINT )
 #define d_pthread_mutex_unlock(x)    dbgMutexUnlock(x, pdbgFuncDB, __LINE__, dbgCALLStaCK_POP_POINT )
diff --git a/runtime/modules.c b/runtime/modules.c
index bd201252..0ec6b1ce 100644
--- a/runtime/modules.c
+++ b/runtime/modules.c
@@ -77,6 +77,21 @@ static modInfo_t *pLoadedModulesLast = NULL;	/* tail-pointer */
 uchar	*pModDir = NULL; /* read-only after startup */
 
 
+/* we provide a set of dummy functions for output modules that do not support the
+ * transactional interface. As they do not do this, they commit each message they
+ * receive, and as such the dummies can always return RS_RET_OK without causing
+ * harm. This simplifies things as in action processing we do not need to check
+ * if the transactional entry points exist.
+ */
+static rsRetVal dummyBeginTransaction() 
+{
+	return RS_RET_OK;
+}
+static rsRetVal dummyEndTransaction() 
+{
+	return RS_RET_OK;
+}
+
 #ifdef DEBUG
 /* we add some home-grown support to track our users (and detect who does not free us). In
  * the long term, this should probably be migrated into debug.c (TODO). -- rgerhards, 2008-03-11
@@ -216,19 +231,38 @@ static void moduleDestruct(modInfo_t *pThis)
 }
 
 
+/* This enables a module to query the core for specific features.
+ * rgerhards, 2009-04-22
+ */
+static rsRetVal queryCoreFeatureSupport(int *pBool, unsigned uFeat)
+{
+	DEFiRet;
+
+	if((pBool == NULL))
+		ABORT_FINALIZE(RS_RET_PARAM_ERROR);
+
+	*pBool = (uFeat & CORE_FEATURE_BATCHING) ? 1 : 0;
+
+finalize_it:
+	RETiRet;
+}
+
+
 /* The following function is the queryEntryPoint for host-based entry points.
  * Modules may call it to get access to core interface functions. Please note
  * that utility functions can be accessed via shared libraries - at least this
  * is my current shool of thinking.
  * Please note that the implementation as a query interface allows to take
  * care of plug-in interface version differences. -- rgerhards, 2007-07-31
+ * ... but often it better not to use a new interface. So we now add core
+ * functions here that a plugin may request. -- rgerhards, 2009-04-22
  */
 static rsRetVal queryHostEtryPt(uchar *name, rsRetVal (**pEtryPoint)())
 {
 	DEFiRet;
 
 	if((name == NULL) || (pEtryPoint == NULL))
-		return RS_RET_PARAM_ERROR;
+		ABORT_FINALIZE(RS_RET_PARAM_ERROR);
 
 	if(!strcmp((char*) name, "regCfSysLineHdlr")) {
 		*pEtryPoint = regCfSysLineHdlr;
@@ -236,6 +270,8 @@ static rsRetVal queryHostEtryPt(uchar *name, rsRetVal (**pEtryPoint)())
 		*pEtryPoint = objGetObjInterface;
 	} else if(!strcmp((char*) name, "OMSRgetSupportedTplOpts")) {
 		*pEtryPoint = OMSRgetSupportedTplOpts;
+	} else if(!strcmp((char*) name, "queryCoreFeatureSupport")) {
+		*pEtryPoint = queryCoreFeatureSupport;
 	} else {
 		*pEtryPoint = NULL; /* to  be on the safe side */
 		ABORT_FINALIZE(RS_RET_ENTRY_POINT_NOT_FOUND);
@@ -412,6 +448,18 @@ doModInit(rsRetVal (*modInit)(int, int*, rsRetVal(**)(), rsRetVal(*)(), modInfo_
 			localRet = (*pNew->modQueryEtryPt)((uchar*)"doHUP", &pNew->doHUP);
 			if(localRet != RS_RET_OK && localRet != RS_RET_MODULE_ENTRY_POINT_NOT_FOUND)
 				ABORT_FINALIZE(localRet);
+
+			localRet = (*pNew->modQueryEtryPt)((uchar*)"beginTransaction", &pNew->mod.om.beginTransaction);
+			if(localRet == RS_RET_MODULE_ENTRY_POINT_NOT_FOUND)
+				pNew->mod.om.beginTransaction = dummyBeginTransaction;
+			else if(localRet != RS_RET_OK)
+				ABORT_FINALIZE(localRet);
+
+			localRet = (*pNew->modQueryEtryPt)((uchar*)"endTransaction", &pNew->mod.om.endTransaction);
+			if(localRet == RS_RET_MODULE_ENTRY_POINT_NOT_FOUND)
+				pNew->mod.om.beginTransaction = dummyEndTransaction;
+			else if(localRet != RS_RET_OK)
+				ABORT_FINALIZE(localRet);
 			break;
 		case eMOD_LIB:
 			break;
diff --git a/runtime/modules.h b/runtime/modules.h
index 4d874019..71e3199c 100644
--- a/runtime/modules.h
+++ b/runtime/modules.h
@@ -111,7 +111,9 @@ typedef struct modInfo_s {
 		struct {/* data for output modules */
 			/* below: perform the configured action
 			 */
+			rsRetVal (*beginTransaction)(void*);
 			rsRetVal (*doAction)(uchar**, unsigned, void*);
+			rsRetVal (*endTransaction)(void*);
 			rsRetVal (*parseSelectorAct)(uchar**, void**,omodStringRequest_t**);
 		} om;
 		struct { /* data for library modules */
diff --git a/runtime/queue.c b/runtime/queue.c
index 13e7007a..7438fbaa 100644
--- a/runtime/queue.c
+++ b/runtime/queue.c
@@ -8,7 +8,11 @@
  * (and in the web doc set on http://www.rsyslog.com/doc). Be sure to read it
  * if you are getting aquainted to the object.
  *
- * Copyright 2008 Rainer Gerhards and Adiscon GmbH.
+ * NOTE: as of 2009-04-22, I have begin to remove the qqueue* prefix from static
+ * function names - this makes it really hard to read and does not provide much
+ * benefit, at least I (now) think so...
+ *
+ * Copyright 2008, 2009 Rainer Gerhards and Adiscon GmbH.
  *
  * This file is part of the rsyslog runtime library.
  *
@@ -51,6 +55,7 @@
 #include "wti.h"
 #include "msg.h"
 #include "atomic.h"
+#include "msg.h" /* TODO: remove once we remove MsgAddRef() call */
 
 #ifdef OS_SOLARIS
 #	include <sched.h>
@@ -63,59 +68,158 @@ DEFobjCurrIf(glbl)
 DEFobjCurrIf(strm)
 
 /* forward-definitions */
-static rsRetVal qqueueChkPersist(qqueue_t *pThis);
-static rsRetVal qqueueSetEnqOnly(qqueue_t *pThis, int bEnqOnly, int bLockMutex);
-static rsRetVal qqueueRateLimiter(qqueue_t *pThis);
+static rsRetVal ChkStrtDA(qqueue_t *pThis);
+static rsRetVal qqueueChkPersist(qqueue_t *pThis, int nUpdates);
+static rsRetVal SetEnqOnly(qqueue_t *pThis, int bEnqOnly, int bLockMutex);
+static rsRetVal RateLimiter(qqueue_t *pThis);
 static int qqueueChkStopWrkrDA(qqueue_t *pThis);
+static rsRetVal GetDeqBatchSize(qqueue_t *pThis, int *pVal);
 static int qqueueIsIdleDA(qqueue_t *pThis);
-static rsRetVal qqueueConsumerDA(qqueue_t *pThis, wti_t *pWti, int iCancelStateSave);
-static rsRetVal qqueueConsumerCancelCleanup(void *arg1, void *arg2);
-static rsRetVal qqueueUngetObj(qqueue_t *pThis, obj_t *pUsr, int bLockMutex);
+static rsRetVal ConsumerDA(qqueue_t *pThis, wti_t *pWti, int iCancelStateSave);
+static rsRetVal batchProcessed(qqueue_t *pThis, wti_t *pWti);
 
 /* some constants for queuePersist () */
 #define QUEUE_CHECKPOINT	1
 #define QUEUE_NO_CHECKPOINT	0
 
+/***********************************************************************
+ * we need a private data structure, the "to-delete" list. As C does
+ * not provide any partly private data structures, we implement this
+ * structure right here inside the module.
+ * Note that this list must always be kept sorted based on a unique
+ * dequeue ID (which is monotonically increasing).
+ * rgerhards, 2009-05-18
+ ***********************************************************************/
+
+/* generate next uniqueue dequeue ID. Note that uniqueness is only required
+ * on a per-queue basis and while this instance runs. So a stricly monotonically
+ * increasing counter is sufficient (if enough bits are used).
+ */
+static inline qDeqID getNextDeqID(qqueue_t *pQueue)
+{
+	ISOBJ_TYPE_assert(pQueue, qqueue);
+	return pQueue->deqIDAdd++;
+}
+
+
+/* return the top element of the to-delete list or NULL, if the
+ * list is empty.
+ */
+static inline toDeleteLst_t *tdlPeek(qqueue_t *pQueue)
+{
+	ISOBJ_TYPE_assert(pQueue, qqueue);
+	return pQueue->toDeleteLst;
+}
+
+
+/* remove the top element of the to-delete list. Nothing but the
+ * element itself is destroyed. Must not be called when the list
+ * is empty.
+ */
+static inline rsRetVal tdlPop(qqueue_t *pQueue)
+{
+	toDeleteLst_t *pRemove;
+	DEFiRet;
+
+	ISOBJ_TYPE_assert(pQueue, qqueue);
+	assert(pQueue->toDeleteLst != NULL);
+
+	pRemove = pQueue->toDeleteLst;
+	pQueue->toDeleteLst = pQueue->toDeleteLst->pNext;
+	free(pRemove);
+
+	RETiRet;
+}
+
+
+/* Add a new to-delete list entry. The function allocates the data
+ * structure, populates it with the values provided and links the new
+ * element into the correct place inside the list.
+ */
+static inline rsRetVal tdlAdd(qqueue_t *pQueue, qDeqID deqID, int nElemDeq)
+{
+	toDeleteLst_t *pNew;
+	toDeleteLst_t *pPrev;
+	DEFiRet;
+
+	ISOBJ_TYPE_assert(pQueue, qqueue);
+	assert(pQueue->toDeleteLst != NULL);
+
+	CHKmalloc(pNew = malloc(sizeof(toDeleteLst_t)));
+	pNew->deqID = deqID;
+	pNew->nElemDeq = nElemDeq;
+
+	/* now find right spot */
+	for(  pPrev = pQueue->toDeleteLst
+	    ; pPrev != NULL && deqID > pPrev->deqID
+	    ; pPrev = pPrev->pNext) {
+		/*JUST SEARCH*/;
+	}
+
+	if(pPrev == NULL) {
+		pNew->pNext = pQueue->toDeleteLst;
+		pQueue->toDeleteLst = pNew;
+	} else {
+		pNew->pNext = pPrev->pNext;
+		pPrev->pNext = pNew;
+	}
+
+finalize_it:
+	RETiRet;
+}
+
+
 /* methods */
 
 
-/* get the overall queue size, which includes ungotten objects. Must only be called
+/* get the physical queue size. Must only be called
  * while mutex is locked!
  * rgerhards, 2008-01-29
  */
 static inline int
-qqueueGetOverallQueueSize(qqueue_t *pThis)
+getPhysicalQueueSize(qqueue_t *pThis)
 {
-#if 0 /* leave a bit in for debugging -- rgerhards, 2008-01-30 */
-BEGINfunc
-dbgoprint((obj_t*) pThis, "queue size: %d (regular %d, ungotten %d)\n",
-	   pThis->iQueueSize + pThis->iUngottenObjs, pThis->iQueueSize, pThis->iUngottenObjs);
-ENDfunc
-#endif
-	return pThis->iQueueSize + pThis->iUngottenObjs;
+	return pThis->iQueueSize;
+}
+
+
+/* get the logical queue size (that is store size minus logically dequeued elements).
+ * Must only be called while mutex is locked!
+ * rgerhards, 2009-05-19
+ */
+static inline int
+getLogicalQueueSize(qqueue_t *pThis)
+{
+	return pThis->iQueueSize - pThis->nLogDeq;
 }
 
 
+
 /* This function drains the queue in cases where this needs to be done. The most probable
  * reason is a HUP which needs to discard data (because the queue is configured to be lossy).
  * During a shutdown, this is typically not needed, as the OS frees up ressources and does
  * this much quicker than when we clean up ourselvs. -- rgerhards, 2008-10-21
  * This function returns void, as it makes no sense to communicate an error back, even if
  * it happens.
+ * This functions works "around" the regular deque mechanism, because it is only used to
+ * clean up (in cases where message loss is acceptable). 
  */
 static inline void queueDrain(qqueue_t *pThis)
 {
 	void *pUsr;
-	
 	ASSERT(pThis != NULL);
 
+	BEGINfunc
+	dbgoprint((obj_t*) pThis, "queue (type %d) will lose %d messages, destroying...\n", pThis->qType, pThis->iQueueSize);
 	/* iQueueSize is not decremented by qDel(), so we need to do it ourselves */
 	while(ATOMIC_DEC_AND_FETCH(pThis->iQueueSize) > 0) {
-		pThis->qDel(pThis, &pUsr);
+		pThis->qDeq(pThis, &pUsr);
 		if(pUsr != NULL) {
 			objDestruct(pUsr);
 		}
+		pThis->qDel(pThis);
 	}
+	ENDfunc
 }
 
 
@@ -138,17 +242,17 @@ static inline rsRetVal qqueueAdviseMaxWorkers(qqueue_t *pThis)
 			/* if we have not yet reached the high water mark, there is no need to start a
 			 * worker. -- rgerhards, 2008-01-26
 			 */
-			if(qqueueGetOverallQueueSize(pThis) >= pThis->iHighWtrMrk || pThis->bQueueStarted == 0) {
+			if(getLogicalQueueSize(pThis) >= pThis->iHighWtrMrk || pThis->bQueueStarted == 0) {
 				wtpAdviseMaxWorkers(pThis->pWtpDA, 1); /* disk queues have always one worker */
 			}
+		}
+		/* regular workers always run */
+		if(pThis->qType == QUEUETYPE_DISK || pThis->iMinMsgsPerWrkr == 0) {
+			iMaxWorkers = 1;
 		} else {
-			if(pThis->qType == QUEUETYPE_DISK || pThis->iMinMsgsPerWrkr == 0) {
-				iMaxWorkers = 1;
-			} else {
-				iMaxWorkers = qqueueGetOverallQueueSize(pThis) / pThis->iMinMsgsPerWrkr + 1;
-			}
-			wtpAdviseMaxWorkers(pThis->pWtpReg, iMaxWorkers); /* disk queues have always one worker */
+			iMaxWorkers = getLogicalQueueSize(pThis) / pThis->iMinMsgsPerWrkr + 1;
 		}
+		wtpAdviseMaxWorkers(pThis->pWtpReg, iMaxWorkers); /* disk queues have always one worker */
 	}
 
 	RETiRet;
@@ -156,7 +260,11 @@ static inline rsRetVal qqueueAdviseMaxWorkers(qqueue_t *pThis)
 
 
 /* wait until we have a fully initialized DA queue. Sometimes, we need to
- * sync with it, as we expect it for some function.
+ * sync with it, as we expect it for some function. Note that in extreme
+ * cases, the DA queue may already have started up AND terminated when we
+ * call this function. As such,it may validly be that DA is already shut down.
+ * So we just check if we are in init phase and then wait for full startup.
+ * If in non-DA mode, we silently return.
  * rgerhards, 2008-02-27
  */
 static rsRetVal
@@ -165,9 +273,8 @@ qqueueWaitDAModeInitialized(qqueue_t *pThis)
 	DEFiRet;
 
 	ISOBJ_TYPE_assert(pThis, qqueue);
-	ASSERT(pThis->bRunsDA);
 
-	while(pThis->bRunsDA != 2) {
+	while(pThis->bRunsDA == 1) {
 		d_pthread_cond_wait(&pThis->condDAReady, pThis->mut);
 	}
 
@@ -186,45 +293,38 @@ qqueueWaitDAModeInitialized(qqueue_t *pThis)
  * rgerhards, 2008-01-15
  */
 static rsRetVal
-qqueueTurnOffDAMode(qqueue_t *pThis)
+TurnOffDAMode(qqueue_t *pThis)
 {
 	DEFiRet;
 
 	ISOBJ_TYPE_assert(pThis, qqueue);
 	ASSERT(pThis->bRunsDA);
 
-	/* at this point, we need a fully initialized DA queue. So if it isn't, we finally need
-	 * to wait for its startup... -- rgerhards, 2008-01-25
-	 */
-	qqueueWaitDAModeInitialized(pThis);
-
 	/* if we need to pull any data that we still need from the (child) disk queue,
 	 * now would be the time to do so. At present, we do not need this, but I'd like to
 	 * keep that comment if future need arises.
 	 */
 
 	/* we need to check if the DA queue is empty because the DA worker may simply have
-	 * terminated do to no new messages arriving. That does not, however, mean that the
+	 * terminated due to no new messages arriving. That does not, however, mean that the
 	 * DA queue is empty. If there is still data in that queue, we do nothing and leave
 	 * that for a later incarnation of this function (it will be called multiple times
 	 * during the lifetime of DA-mode, depending on how often the DA worker receives an
 	 * inactivity timeout. -- rgerhards, 2008-01-25
 	 */
-	if(pThis->pqDA->iQueueSize == 0) {
+dbgprintf("XXX: getLogicalQueueSize(pThis->pqDA): %d\n", getLogicalQueueSize(pThis->pqDA));
+	if(getLogicalQueueSize(pThis->pqDA) == 0) {
 		pThis->bRunsDA = 0; /* tell the world we are back in non-DA mode */
 		/* we destruct the queue object, which will also shutdown the queue worker. As the queue is empty,
 		 * this will be quick.
 		 */
-		qqueueDestruct(&pThis->pqDA); /* and now we are ready to destruct the DA queue */
+//XXX: TODO		qqueueDestruct(&pThis->pqDA); /* and now we are ready to destruct the DA queue */
 		dbgoprint((obj_t*) pThis, "disk-assistance has been turned off, disk queue was empty (iRet %d)\n",
 			  iRet);
-		/* now we need to check if the regular queue has some messages. This may be the case
-		 * when it is waiting that the high water mark is reached again. If so, we need to start up
-		 * a regular worker. -- rgerhards, 2008-01-26
-		 */
-		if(qqueueGetOverallQueueSize(pThis) > 0) {
-			qqueueAdviseMaxWorkers(pThis);
-		}
+	} else {
+		/* the queue has data again! */
+		dbgprintf("DA queue has data during shutdown, restarting...\n");
+		qqueueAdviseMaxWorkers(pThis->pqDA);
 	}
 
 	RETiRet;
@@ -267,7 +367,7 @@ qqueueChkIsDA(qqueue_t *pThis)
  * rgerhards, 2008-01-15
  */
 static rsRetVal
-qqueueStartDA(qqueue_t *pThis)
+StartDA(qqueue_t *pThis)
 {
 	DEFiRet;
 	uchar pszDAQName[128];
@@ -298,11 +398,15 @@ qqueueStartDA(qqueue_t *pThis)
 	CHKiRet(qqueueSetbSyncQueueFiles(pThis->pqDA, pThis->bSyncQueueFiles));
 	CHKiRet(qqueueSettoActShutdown(pThis->pqDA, pThis->toActShutdown));
 	CHKiRet(qqueueSettoEnq(pThis->pqDA, pThis->toEnq));
-	CHKiRet(qqueueSetEnqOnly(pThis->pqDA, pThis->bDAEnqOnly, MUTEX_ALREADY_LOCKED));
+	CHKiRet(SetEnqOnly(pThis->pqDA, pThis->bDAEnqOnly, MUTEX_ALREADY_LOCKED));
 	CHKiRet(qqueueSetiDeqtWinFromHr(pThis->pqDA, pThis->iDeqtWinFromHr));
 	CHKiRet(qqueueSetiDeqtWinToHr(pThis->pqDA, pThis->iDeqtWinToHr));
 	CHKiRet(qqueueSetiHighWtrMrk(pThis->pqDA, 0));
 	CHKiRet(qqueueSetiDiscardMrk(pThis->pqDA, 0));
+
+	// experimental: XXX
+	CHKiRet(qqueueSettoWrkShutdown(pThis->pqDA, 0));
+
 	if(pThis->toQShutdown == 0) {
 		CHKiRet(qqueueSettoQShutdown(pThis->pqDA, 0)); /* if the user really wants... */
 	} else {
@@ -318,14 +422,6 @@ qqueueStartDA(qqueue_t *pThis)
 	if(iRet != RS_RET_OK && iRet != RS_RET_FILE_NOT_FOUND)
 		FINALIZE; /* something is wrong */
 
-	/* as we are right now starting DA mode because we are so busy, it is
-	 * extremely unlikely that any regular worker is sleeping on empty queue. HOWEVER,
-	 * we want to be on the safe side, and so we awake anyone that is waiting
-	 * on one. So even if the scheduler plays badly with us, things should be
-	 * quite well. -- rgerhards, 2008-01-15
-	 */
-	wtpWakeupWrkr(pThis->pWtpReg); /* awake all workers, but not ourselves ;) */
-
 	pThis->bRunsDA = 2;	/* we are now in DA mode, but not fully initialized */
 	pThis->bChildIsDone = 0;/* set to 1 when child's worker detect queue is finished */
 	pthread_cond_broadcast(&pThis->condDAReady); /* signal we are now initialized and ready to go ;) */
@@ -352,8 +448,8 @@ finalize_it:
  * If this function fails (should not happen), DA mode is not turned on.
  * rgerhards, 2008-01-16
  */
-static inline rsRetVal
-qqueueInitDA(qqueue_t *pThis, int bEnqOnly, int bLockMutex)
+static rsRetVal
+InitDA(qqueue_t *pThis, int bEnqOnly, int bLockMutex)
 {
 	DEFiRet;
 	DEFVARS_mutexProtection;
@@ -366,17 +462,19 @@ qqueueInitDA(qqueue_t *pThis, int bEnqOnly, int bLockMutex)
 	 * is intentional. We assume that when we need it once, we may also need it on another
 	 * occasion. Ressources used are quite minimal when no worker is running.
 	 * rgerhards, 2008-01-24
+	 * NOTE: this is the DA worker *pool*, not the DA queue!
 	 */
 	if(pThis->pWtpDA == NULL) {
-		lenBuf = snprintf((char*)pszBuf, sizeof(pszBuf), "%s:DA", obj.GetName((obj_t*) pThis));
+		lenBuf = snprintf((char*)pszBuf, sizeof(pszBuf), "%s:DAwpool", obj.GetName((obj_t*) pThis));
 		CHKiRet(wtpConstruct		(&pThis->pWtpDA));
 		CHKiRet(wtpSetDbgHdr		(pThis->pWtpDA, pszBuf, lenBuf));
 		CHKiRet(wtpSetpfChkStopWrkr	(pThis->pWtpDA, (rsRetVal (*)(void *pUsr, int)) qqueueChkStopWrkrDA));
-		CHKiRet(wtpSetpfIsIdle		(pThis->pWtpDA, (rsRetVal (*)(void *pUsr, int)) qqueueIsIdleDA));
-		CHKiRet(wtpSetpfDoWork		(pThis->pWtpDA, (rsRetVal (*)(void *pUsr, void *pWti, int)) qqueueConsumerDA));
-		CHKiRet(wtpSetpfOnWorkerCancel	(pThis->pWtpDA, (rsRetVal (*)(void *pUsr, void*pWti)) qqueueConsumerCancelCleanup));
-		CHKiRet(wtpSetpfOnWorkerStartup	(pThis->pWtpDA, (rsRetVal (*)(void *pUsr)) qqueueStartDA));
-		CHKiRet(wtpSetpfOnWorkerShutdown(pThis->pWtpDA, (rsRetVal (*)(void *pUsr)) qqueueTurnOffDAMode));
+		CHKiRet(wtpSetpfGetDeqBatchSize	(pThis->pWtpDA, (rsRetVal (*)(void *pUsr, int*)) GetDeqBatchSize));
+		CHKiRet(wtpSetpfIsIdle		(pThis->pWtpDA, (rsRetVal (*)(void *pUsr, wtp_t*)) qqueueIsIdleDA));
+		CHKiRet(wtpSetpfDoWork		(pThis->pWtpDA, (rsRetVal (*)(void *pUsr, void *pWti, int)) ConsumerDA));
+		CHKiRet(wtpSetpfObjProcessed	(pThis->pWtpDA, (rsRetVal (*)(void *pUsr, wti_t *pWti)) batchProcessed));
+		CHKiRet(wtpSetpfOnWorkerStartup	(pThis->pWtpDA, (rsRetVal (*)(void *pUsr)) StartDA));
+		CHKiRet(wtpSetpfOnWorkerShutdown(pThis->pWtpDA, (rsRetVal (*)(void *pUsr)) TurnOffDAMode));
 		CHKiRet(wtpSetpmutUsr		(pThis->pWtpDA, pThis->mut));
 		CHKiRet(wtpSetpcondBusy		(pThis->pWtpDA, &pThis->notEmpty));
 		CHKiRet(wtpSetiNumWorkerThreads	(pThis->pWtpDA, 1));
@@ -394,7 +492,7 @@ qqueueInitDA(qqueue_t *pThis, int bEnqOnly, int bLockMutex)
 	 * that will also start one up. If we forgot that step, everything would be stalled
 	 * until the next enqueue request.
 	 */
-	wtpAdviseMaxWorkers(pThis->pWtpDA, 1); /* DA queues alsways have just one worker max */
+	wtpAdviseMaxWorkers(pThis->pWtpDA, 1); /* DA queues always have just one worker max */
 
 finalize_it:
 	END_MTX_PROTECTED_OPERATIONS(pThis->mut);
@@ -408,15 +506,15 @@ finalize_it:
  * complete.
  * rgerhards, 2008-01-14
  */
-static inline rsRetVal
-qqueueChkStrtDA(qqueue_t *pThis)
+static rsRetVal
+ChkStrtDA(qqueue_t *pThis)
 {
 	DEFiRet;
 
 	ISOBJ_TYPE_assert(pThis, qqueue);
 
 	/* if we do not hit the high water mark, we have nothing to do */
-	if(qqueueGetOverallQueueSize(pThis) != pThis->iHighWtrMrk)
+	if(getPhysicalQueueSize(pThis) != pThis->iHighWtrMrk)
 		ABORT_FINALIZE(RS_RET_OK);
 
 	if(pThis->bRunsDA) {
@@ -430,15 +528,15 @@ qqueueChkStrtDA(qqueue_t *pThis)
 		 * we need at least one).
 		 */
 		dbgoprint((obj_t*) pThis, "%d entries - passed high water mark in DA mode, send notify\n",
-			  qqueueGetOverallQueueSize(pThis));
+			  getPhysicalQueueSize(pThis));
 		qqueueAdviseMaxWorkers(pThis);
 	} else {
 		/* this is the case when we are currently not running in DA mode. So it is time
 		 * to turn it back on.
 		 */
 		dbgoprint((obj_t*) pThis, "%d entries - passed high water mark for disk-assisted mode, initiating...\n",
-			  qqueueGetOverallQueueSize(pThis));
-		qqueueInitDA(pThis, QUEUE_MODE_ENQDEQ, MUTEX_ALREADY_LOCKED); /* initiate DA mode */
+			  getPhysicalQueueSize(pThis));
+		InitDA(pThis, QUEUE_MODE_ENQDEQ, MUTEX_ALREADY_LOCKED); /* initiate DA mode */
 	}
 
 finalize_it:
@@ -469,6 +567,7 @@ static rsRetVal qConstructFixedArray(qqueue_t *pThis)
 		ABORT_FINALIZE(RS_RET_OUT_OF_MEMORY);
 	}
 
+	pThis->tVars.farray.deqhead = 0;
 	pThis->tVars.farray.head = 0;
 	pThis->tVars.farray.tail = 0;
 
@@ -486,9 +585,7 @@ static rsRetVal qDestructFixedArray(qqueue_t *pThis)
 	ASSERT(pThis != NULL);
 
 	queueDrain(pThis); /* discard any remaining queue entries */
-
-	if(pThis->tVars.farray.pBuf != NULL)
-		free(pThis->tVars.farray.pBuf);
+	free(pThis->tVars.farray.pBuf);
 
 	RETiRet;
 }
@@ -507,76 +604,57 @@ static rsRetVal qAddFixedArray(qqueue_t *pThis, void* in)
 	RETiRet;
 }
 
-static rsRetVal qDelFixedArray(qqueue_t *pThis, void **out)
+
+static rsRetVal qDeqFixedArray(qqueue_t *pThis, void **out)
 {
 	DEFiRet;
 
 	ASSERT(pThis != NULL);
-	*out = (void*) pThis->tVars.farray.pBuf[pThis->tVars.farray.head];
+	*out = (void*) pThis->tVars.farray.pBuf[pThis->tVars.farray.deqhead];
 
-	pThis->tVars.farray.head++;
-	if (pThis->tVars.farray.head == pThis->iMaxQueueSize)
-		pThis->tVars.farray.head = 0;
+	pThis->tVars.farray.deqhead++;
+	if (pThis->tVars.farray.deqhead == pThis->iMaxQueueSize)
+		pThis->tVars.farray.deqhead = 0;
 
 	RETiRet;
 }
 
 
-/* -------------------- linked list  -------------------- */
-
-/* first some generic functions which are also used for the unget linked list */
-
-static inline rsRetVal qqueueAddLinkedList(qLinkedList_t **ppRoot, qLinkedList_t **ppLast, void* pUsr)
+static rsRetVal qDelFixedArray(qqueue_t *pThis)
 {
 	DEFiRet;
-	qLinkedList_t *pEntry;
-
-	ASSERT(ppRoot != NULL);
-	ASSERT(ppLast != NULL);
-
-	if((pEntry = (qLinkedList_t*) malloc(sizeof(qLinkedList_t))) == NULL) {
-		ABORT_FINALIZE(RS_RET_OUT_OF_MEMORY);
-	}
 
-	pEntry->pNext = NULL;
-	pEntry->pUsr = pUsr;
+	ASSERT(pThis != NULL);
 
-	if(*ppRoot == NULL) {
-		*ppRoot = *ppLast = pEntry;
-	} else {
-		(*ppLast)->pNext = pEntry;
-		*ppLast = pEntry;
-	}
+	pThis->tVars.farray.head++;
+	if (pThis->tVars.farray.head == pThis->iMaxQueueSize)
+		pThis->tVars.farray.head = 0;
 
-finalize_it:
 	RETiRet;
 }
 
-static inline rsRetVal qqueueDelLinkedList(qLinkedList_t **ppRoot, qLinkedList_t **ppLast, obj_t **ppUsr)
+
+/* reset the logical dequeue pointer to the physical dequeue position.
+ * This is only needed after we cancelled workers (during queue shutdown).
+ */
+static rsRetVal
+qUnDeqAllFixedArray(qqueue_t *pThis)
 {
 	DEFiRet;
-	qLinkedList_t *pEntry;
 
-	ASSERT(ppRoot != NULL);
-	ASSERT(ppLast != NULL);
-	ASSERT(ppUsr != NULL);
-	ASSERT(*ppRoot != NULL);
-	
-	pEntry = *ppRoot;
-	*ppUsr = pEntry->pUsr;
+	ISOBJ_TYPE_assert(pThis, qqueue);
 
-	if(*ppRoot == *ppLast) {
-		*ppRoot = NULL;
-		*ppLast = NULL;
-	} else {
-		*ppRoot = pEntry->pNext;
-	}
-	free(pEntry);
+	dbgoprint((obj_t*) pThis, "resetting FixedArray deq index to %ld (was %ld), logical dequeue count %d\n",
+		  pThis->tVars.farray.head, pThis->tVars.farray.deqhead, pThis->nLogDeq);
+
+	pThis->tVars.farray.deqhead = pThis->tVars.farray.head;
+	pThis->nLogDeq = 0;
 
 	RETiRet;
 }
 
-/* end generic functions which are also used for the unget linked list */
+
+/* -------------------- linked list  -------------------- */
 
 
 static rsRetVal qConstructLinkedList(qqueue_t *pThis)
@@ -585,8 +663,9 @@ static rsRetVal qConstructLinkedList(qqueue_t *pThis)
 
 	ASSERT(pThis != NULL);
 
-	pThis->tVars.linklist.pRoot = 0;
-	pThis->tVars.linklist.pLast = 0;
+	pThis->tVars.linklist.pDeqRoot = NULL;
+	pThis->tVars.linklist.pDelRoot = NULL;
+	pThis->tVars.linklist.pLast = NULL;
 
 	qqueueChkIsDA(pThis);
 
@@ -609,54 +688,79 @@ static rsRetVal qDestructLinkedList(qqueue_t __attribute__((unused)) *pThis)
 
 static rsRetVal qAddLinkedList(qqueue_t *pThis, void* pUsr)
 {
-	DEFiRet;
-
-	iRet = qqueueAddLinkedList(&pThis->tVars.linklist.pRoot, &pThis->tVars.linklist.pLast, pUsr);
-#if 0
 	qLinkedList_t *pEntry;
+	DEFiRet;
 
-	ASSERT(pThis != NULL);
-	if((pEntry = (qLinkedList_t*) malloc(sizeof(qLinkedList_t))) == NULL) {
-		ABORT_FINALIZE(RS_RET_OUT_OF_MEMORY);
-	}
+	CHKmalloc((pEntry = (qLinkedList_t*) malloc(sizeof(qLinkedList_t))));
 
 	pEntry->pNext = NULL;
 	pEntry->pUsr = pUsr;
 
-	if(pThis->tVars.linklist.pRoot == NULL) {
-		pThis->tVars.linklist.pRoot = pThis->tVars.linklist.pLast = pEntry;
+	if(pThis->tVars.linklist.pDelRoot == NULL) {
+		pThis->tVars.linklist.pDelRoot = pThis->tVars.linklist.pDeqRoot = pThis->tVars.linklist.pLast = pEntry;
 	} else {
 		pThis->tVars.linklist.pLast->pNext = pEntry;
 		pThis->tVars.linklist.pLast = pEntry;
 	}
 
+	if(pThis->tVars.linklist.pDeqRoot == NULL) {
+		pThis->tVars.linklist.pDeqRoot = pEntry;
+	}
+
 finalize_it:
-#endif
 	RETiRet;
 }
 
-static rsRetVal qDelLinkedList(qqueue_t *pThis, obj_t **ppUsr)
+
+static rsRetVal qDeqLinkedList(qqueue_t *pThis, obj_t **ppUsr)
 {
-	DEFiRet;
-	iRet = qqueueDelLinkedList(&pThis->tVars.linklist.pRoot, &pThis->tVars.linklist.pLast, ppUsr);
-#if 0
 	qLinkedList_t *pEntry;
+	DEFiRet;
 
-	ASSERT(pThis != NULL);
-	ASSERT(pThis->tVars.linklist.pRoot != NULL);
-	
-	pEntry = pThis->tVars.linklist.pRoot;
+	pEntry = pThis->tVars.linklist.pDeqRoot;
+	ISOBJ_TYPE_assert(pEntry->pUsr, msg);
 	*ppUsr = pEntry->pUsr;
+	pThis->tVars.linklist.pDeqRoot = pEntry->pNext;
+
+	RETiRet;
+}
+
+
+static rsRetVal qDelLinkedList(qqueue_t *pThis)
+{
+	qLinkedList_t *pEntry;
+	DEFiRet;
+
+	pEntry = pThis->tVars.linklist.pDelRoot;
 
-	if(pThis->tVars.linklist.pRoot == pThis->tVars.linklist.pLast) {
-		pThis->tVars.linklist.pRoot = NULL;
-		pThis->tVars.linklist.pLast = NULL;
+	if(pThis->tVars.linklist.pDelRoot == pThis->tVars.linklist.pLast) {
+		pThis->tVars.linklist.pDelRoot = pThis->tVars.linklist.pDeqRoot = pThis->tVars.linklist.pLast = NULL;
 	} else {
-		pThis->tVars.linklist.pRoot = pEntry->pNext;
+		pThis->tVars.linklist.pDelRoot = pEntry->pNext;
 	}
+
 	free(pEntry);
 
-#endif
+	RETiRet;
+}
+
+
+/* reset the logical dequeue pointer to the physical dequeue position.
+ * This is only needed after we cancelled workers (during queue shutdown).
+ */
+static rsRetVal
+qUnDeqAllLinkedList(qqueue_t *pThis)
+{
+	DEFiRet;
+
+	ASSERT(pThis != NULL);
+
+	dbgoprint((obj_t*) pThis, "resetting LinkedList deq ptr to %p (was %p), logical dequeue count %d\n",
+		  pThis->tVars.linklist.pDelRoot, pThis->tVars.linklist.pDeqRoot, pThis->nLogDeq);
+
+	pThis->tVars.linklist.pDeqRoot = pThis->tVars.linklist.pDelRoot;
+	pThis->nLogDeq = 0;
+
 	RETiRet;
 }
 
@@ -725,8 +829,6 @@ qqueueTryLoadPersistedInfo(qqueue_t *pThis)
 	uchar pszQIFNam[MAXFNAME];
 	size_t lenQIFNam;
 	struct stat stat_buf;
-	int iUngottenObjs;
-	obj_t *pUsr;
 
 	ISOBJ_TYPE_assert(pThis, qqueue);
 
@@ -756,25 +858,22 @@ qqueueTryLoadPersistedInfo(qqueue_t *pThis)
 	/* first, we try to read the property bag for ourselfs */
 	CHKiRet(obj.DeserializePropBag((obj_t*) pThis, psQIF));
 	
-	/* then the ungotten object queue */
-	iUngottenObjs = pThis->iUngottenObjs;
-	pThis->iUngottenObjs = 0; /* will be incremented when we add objects! */
-
-	while(iUngottenObjs > 0) {
-		/* fill the queue from disk */
-		CHKiRet(obj.Deserialize((void*) &pUsr, (uchar*)"msg", psQIF, NULL, NULL));
-		qqueueUngetObj(pThis, pUsr, MUTEX_ALREADY_LOCKED);
-		--iUngottenObjs; /* one less */
-	}
-
-	/* and now the stream objects (some order as when persisted!) */
+	/* then the stream objects (same order as when persisted!) */
 	CHKiRet(obj.Deserialize(&pThis->tVars.disk.pWrite, (uchar*) "strm", psQIF,
 			       (rsRetVal(*)(obj_t*,void*))qqueueLoadPersStrmInfoFixup, pThis));
-	CHKiRet(obj.Deserialize(&pThis->tVars.disk.pRead, (uchar*) "strm", psQIF,
+	CHKiRet(obj.Deserialize(&pThis->tVars.disk.pReadDel, (uchar*) "strm", psQIF,
 			       (rsRetVal(*)(obj_t*,void*))qqueueLoadPersStrmInfoFixup, pThis));
 
+	/* create a duplicate for the read "pointer".
+	 */
+
+	CHKiRet(strm.Dup(pThis->tVars.disk.pReadDel, &pThis->tVars.disk.pReadDeq));
+	CHKiRet(strm.SetbDeleteOnClose(pThis->tVars.disk.pReadDeq, 0)); /* deq must NOT delete the files! */
+	CHKiRet(strm.ConstructFinalize(pThis->tVars.disk.pReadDeq));
+
 	CHKiRet(strm.SeekCurrOffs(pThis->tVars.disk.pWrite));
-	CHKiRet(strm.SeekCurrOffs(pThis->tVars.disk.pRead));
+	CHKiRet(strm.SeekCurrOffs(pThis->tVars.disk.pReadDel));
+	CHKiRet(strm.SeekCurrOffs(pThis->tVars.disk.pReadDeq));
 
 	/* OK, we could successfully read the file, so we now can request that it be
 	 * deleted when we are done with the persisted information.
@@ -826,18 +925,26 @@ static rsRetVal qConstructDisk(qqueue_t *pThis)
 		CHKiRet(strm.SetsType(pThis->tVars.disk.pWrite, STREAMTYPE_FILE_CIRCULAR));
 		CHKiRet(strm.ConstructFinalize(pThis->tVars.disk.pWrite));
 
-		CHKiRet(strm.Construct(&pThis->tVars.disk.pRead));
-		CHKiRet(strm.SetbSync(pThis->tVars.disk.pRead, pThis->bSyncQueueFiles));
-		CHKiRet(strm.SetbDeleteOnClose(pThis->tVars.disk.pRead, 1));
-		CHKiRet(strm.SetDir(pThis->tVars.disk.pRead, glbl.GetWorkDir(), strlen((char*)glbl.GetWorkDir())));
-		CHKiRet(strm.SetiMaxFiles(pThis->tVars.disk.pRead, 10000000));
-		CHKiRet(strm.SettOperationsMode(pThis->tVars.disk.pRead, STREAMMODE_READ));
-		CHKiRet(strm.SetsType(pThis->tVars.disk.pRead, STREAMTYPE_FILE_CIRCULAR));
-		CHKiRet(strm.ConstructFinalize(pThis->tVars.disk.pRead));
+		CHKiRet(strm.Construct(&pThis->tVars.disk.pReadDeq));
+		CHKiRet(strm.SetbDeleteOnClose(pThis->tVars.disk.pReadDeq, 0));
+		CHKiRet(strm.SetDir(pThis->tVars.disk.pReadDeq, glbl.GetWorkDir(), strlen((char*)glbl.GetWorkDir())));
+		CHKiRet(strm.SetiMaxFiles(pThis->tVars.disk.pReadDeq, 10000000));
+		CHKiRet(strm.SettOperationsMode(pThis->tVars.disk.pReadDeq, STREAMMODE_READ));
+		CHKiRet(strm.SetsType(pThis->tVars.disk.pReadDeq, STREAMTYPE_FILE_CIRCULAR));
+		CHKiRet(strm.ConstructFinalize(pThis->tVars.disk.pReadDeq));
 
+		CHKiRet(strm.Construct(&pThis->tVars.disk.pReadDel));
+		CHKiRet(strm.SetbSync(pThis->tVars.disk.pReadDel, pThis->bSyncQueueFiles));
+		CHKiRet(strm.SetbDeleteOnClose(pThis->tVars.disk.pReadDel, 1));
+		CHKiRet(strm.SetDir(pThis->tVars.disk.pReadDel, glbl.GetWorkDir(), strlen((char*)glbl.GetWorkDir())));
+		CHKiRet(strm.SetiMaxFiles(pThis->tVars.disk.pReadDel, 10000000));
+		CHKiRet(strm.SettOperationsMode(pThis->tVars.disk.pReadDel, STREAMMODE_READ));
+		CHKiRet(strm.SetsType(pThis->tVars.disk.pReadDel, STREAMTYPE_FILE_CIRCULAR));
+		CHKiRet(strm.ConstructFinalize(pThis->tVars.disk.pReadDel));
 
-		CHKiRet(strm.SetFName(pThis->tVars.disk.pWrite, pThis->pszFilePrefix, pThis->lenFilePrefix));
-		CHKiRet(strm.SetFName(pThis->tVars.disk.pRead,  pThis->pszFilePrefix, pThis->lenFilePrefix));
+		CHKiRet(strm.SetFName(pThis->tVars.disk.pWrite,   pThis->pszFilePrefix, pThis->lenFilePrefix));
+		CHKiRet(strm.SetFName(pThis->tVars.disk.pReadDeq, pThis->pszFilePrefix, pThis->lenFilePrefix));
+		CHKiRet(strm.SetFName(pThis->tVars.disk.pReadDel, pThis->pszFilePrefix, pThis->lenFilePrefix));
 	}
 
 	/* now we set (and overwrite in case of a persisted restart) some parameters which
@@ -846,7 +953,8 @@ static rsRetVal qConstructDisk(qqueue_t *pThis)
 	 * ability to read existing queue files. -- rgerhards, 2008-01-12
 	 */
 	CHKiRet(strm.SetiMaxFileSize(pThis->tVars.disk.pWrite, pThis->iMaxFileSize));
-	CHKiRet(strm.SetiMaxFileSize(pThis->tVars.disk.pRead, pThis->iMaxFileSize));
+	CHKiRet(strm.SetiMaxFileSize(pThis->tVars.disk.pReadDeq, pThis->iMaxFileSize));
+	CHKiRet(strm.SetiMaxFileSize(pThis->tVars.disk.pReadDel, pThis->iMaxFileSize));
 
 finalize_it:
 	RETiRet;
@@ -860,7 +968,8 @@ static rsRetVal qDestructDisk(qqueue_t *pThis)
 	ASSERT(pThis != NULL);
 	
 	strm.Destruct(&pThis->tVars.disk.pWrite);
-	strm.Destruct(&pThis->tVars.disk.pRead);
+	strm.Destruct(&pThis->tVars.disk.pReadDeq);
+	strm.Destruct(&pThis->tVars.disk.pReadDel);
 
 	RETiRet;
 }
@@ -892,16 +1001,30 @@ finalize_it:
 	RETiRet;
 }
 
-static rsRetVal qDelDisk(qqueue_t *pThis, void **ppUsr)
+
+static rsRetVal qDeqDisk(qqueue_t *pThis, void **ppUsr)
+{
+	DEFiRet;
+
+	CHKiRet(obj.Deserialize(ppUsr, (uchar*) "msg", pThis->tVars.disk.pReadDeq, NULL, NULL));
+
+finalize_it:
+	RETiRet;
+}
+
+
+static rsRetVal qDelDisk(qqueue_t *pThis)
 {
+	obj_t *pDummyObj;	/* we need to deserialize it... */
 	DEFiRet;
 
 	int64 offsIn;
 	int64 offsOut;
 
-	CHKiRet(strm.GetCurrOffset(pThis->tVars.disk.pRead, &offsIn));
-	CHKiRet(obj.Deserialize(ppUsr, (uchar*) "msg", pThis->tVars.disk.pRead, NULL, NULL));
-	CHKiRet(strm.GetCurrOffset(pThis->tVars.disk.pRead, &offsOut));
+	CHKiRet(strm.GetCurrOffset(pThis->tVars.disk.pReadDel, &offsIn));
+	CHKiRet(obj.Deserialize(&pDummyObj, (uchar*) "msg", pThis->tVars.disk.pReadDel, NULL, NULL));
+	objDestruct(pDummyObj);
+	CHKiRet(strm.GetCurrOffset(pThis->tVars.disk.pReadDel, &offsOut));
 
 	/* This time it is a bit tricky: we free disk space only upon file deletion. So we need
 	 * to keep track of what we have read until we get an out-offset that is lower than the
@@ -922,6 +1045,17 @@ finalize_it:
 	RETiRet;
 }
 
+
+/* This is a dummy function for disks - we do not need to reset anything
+ * because everything is already persisted...
+ */
+static rsRetVal
+qUnDeqAllDisk(__attribute__((unused)) qqueue_t *pThis)
+{
+	return RS_RET_OK;
+}
+
+
 /* -------------------- direct (no queueing) -------------------- */
 static rsRetVal qConstructDirect(qqueue_t __attribute__((unused)) *pThis)
 {
@@ -936,6 +1070,8 @@ static rsRetVal qDestructDirect(qqueue_t __attribute__((unused)) *pThis)
 
 static rsRetVal qAddDirect(qqueue_t *pThis, void* pUsr)
 {
+	batch_t singleBatch;
+	batch_obj_t batchObj;
 	DEFiRet;
 
 	ASSERT(pThis != NULL);
@@ -945,70 +1081,33 @@ static rsRetVal qAddDirect(qqueue_t *pThis, void* pUsr)
 	 * mode the consumer probably has a lot to convey (which get's lost in the other modes
 	 * because they are asynchronous. But direct mode is deliberately synchronous.
 	 * rgerhards, 2008-02-12
+	 * We use our knowledge about the batch_t structure below, but without that, we
+	 * pay a too-large performance toll... -- rgerhards, 2009-04-22
 	 */
-	iRet = pThis->pConsumer(pThis->pUsr, pUsr);
+	batchObj.state = BATCH_STATE_RDY;
+	batchObj.pUsrp = (obj_t*) pUsr;
+	singleBatch.nElem = 1; /* there always is only one in direct mode */
+	singleBatch.pElem = &batchObj;
+	iRet = pThis->pConsumer(pThis->pUsr, &singleBatch);
+	objDestruct(pUsr);
 
 	RETiRet;
 }
 
-static rsRetVal qDelDirect(qqueue_t __attribute__((unused)) *pThis, __attribute__((unused)) void **out)
+
+static rsRetVal qDelDirect(qqueue_t __attribute__((unused)) *pThis)
 {
 	return RS_RET_OK;
 }
 
-
-/* --------------- end type-specific handlers -------------------- */
-
-
-/* unget a user pointer that has been dequeued. This functionality is especially important
- * for consumer cancel cleanup handlers. To support it, a short list of ungotten user pointers
- * is maintened in memory.
- * rgerhards, 2008-01-20
- */
 static rsRetVal
-qqueueUngetObj(qqueue_t *pThis, obj_t *pUsr, int bLockMutex)
+qUnDeqAllDirect(__attribute__((unused)) qqueue_t *pThis)
 {
-	DEFiRet;
-	DEFVARS_mutexProtection;
-
-	ISOBJ_TYPE_assert(pThis, qqueue);
-	ISOBJ_assert(pUsr); /* TODO: we aborted right at this place at least 3 times -- race? 2008-02-28, -03-10, -03-15
-			       The second time I noticed it the queue was in destruction with NO worker threads
-			       running. The pUsr ptr was totally off and provided no clue what it may be pointing
-			       at (except that it looked like the static data pool). Both times, the abort happend
-			       inside an action queue */
-
-	dbgoprint((obj_t*) pThis, "ungetting user object %s\n", obj.GetName(pUsr));
-	BEGIN_MTX_PROTECTED_OPERATIONS(pThis->mut, bLockMutex);
-	iRet = qqueueAddLinkedList(&pThis->pUngetRoot, &pThis->pUngetLast, pUsr);
-	++pThis->iUngottenObjs;	/* indicate one more */
-	END_MTX_PROTECTED_OPERATIONS(pThis->mut);
-
-	RETiRet;
+	return RS_RET_OK;
 }
 
 
-/* dequeues a user pointer from the ungotten queue. Pointers from there should always be
- * dequeued first.
- *
- * This function must only be called when the mutex is locked!
- *
- * rgerhards, 2008-01-29
- */
-static rsRetVal
-qqueueGetUngottenObj(qqueue_t *pThis, obj_t **ppUsr)
-{
-	DEFiRet;
-
-	ISOBJ_TYPE_assert(pThis, qqueue);
-	ASSERT(ppUsr != NULL);
-
-	iRet = qqueueDelLinkedList(&pThis->pUngetRoot, &pThis->pUngetLast, ppUsr);
-	--pThis->iUngottenObjs;	/* indicate one less */
-	dbgoprint((obj_t*) pThis, "dequeued ungotten user object %s\n", obj.GetName(*ppUsr));
-
-	RETiRet;
-}
+/* --------------- end type-specific handlers -------------------- */
 
 
 /* generic code to add a queue entry
@@ -1027,7 +1126,8 @@ qqueueAdd(qqueue_t *pThis, void *pUsr)
 
 	if(pThis->qType != QUEUETYPE_DIRECT) {
 		ATOMIC_INC(pThis->iQueueSize);
-		dbgoprint((obj_t*) pThis, "entry added, size now %d entries\n", pThis->iQueueSize);
+		dbgoprint((obj_t*) pThis, "entry added, size now log %d, phys %d entries\n",
+			  getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis));
 	}
 
 finalize_it:
@@ -1035,12 +1135,10 @@ finalize_it:
 }
 
 
-/* generic code to remove a queue entry
- * rgerhards, 2008-01-29: we must first see if there is any object in the
- * ungotten list and, if so, dequeue it first.
+/* generic code to dequeue a queue entry
  */
 static rsRetVal
-qqueueDel(qqueue_t *pThis, void *pUsr)
+qqueueDeq(qqueue_t *pThis, void **ppUsr)
 {
 	DEFiRet;
 
@@ -1051,53 +1149,37 @@ qqueueDel(qqueue_t *pThis, void *pUsr)
 	 * If we decrement, however, we may lose a message. But that is better than
 	 * losing the whole process because it loops... -- rgerhards, 2008-01-03
 	 */
-	if(pThis->iUngottenObjs > 0) {
-		iRet = qqueueGetUngottenObj(pThis, (obj_t**) pUsr);
-	} else {
-		iRet = pThis->qDel(pThis, pUsr);
-		ATOMIC_DEC(pThis->iQueueSize);
-	}
+	iRet = pThis->qDeq(pThis, ppUsr);
+	ATOMIC_INC(pThis->nLogDeq);
 
-	dbgoprint((obj_t*) pThis, "entry deleted, state %d, size now %d entries\n",
-		  iRet, pThis->iQueueSize);
+//	dbgoprint((obj_t*) pThis, "entry deleted, size now log %d, phys %d entries\n",
+//		  getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis));
 
 	RETiRet;
 }
 
 
-/* This function shuts down all worker threads and waits until they
- * have terminated. If they timeout, they are cancelled. Parameters have been set
- * before this function is called so that DA queues will be fully persisted to
- * disk (if configured to do so).
- * rgerhards, 2008-01-24
- * Please note that this function shuts down BOTH the parent AND the child queue
- * in DA case. This is necessary because their timeouts are tightly coupled. Most
- * importantly, the timeouts would be applied twice (or logic be extremely
- * complex) if each would have its own shutdown. The function does not self check
- * this condition - the caller must make sure it is not called with a parent.
+/* Try to terminate queue worker threads within the regular shutdown interval.
+ * Both the regular and DA queue (if it exists) is waited for, but on the same timeout.
+ * After this function returns, the workers must either be finished or some force
+ * to finish them must be applied.
+ * This function also instructs the DA worker pool (if it exists) to terminate. This is done
+ * in preparation of final queue shutdown. 
+ * rgerhards, 2009-05-27
  */
-static rsRetVal qqueueShutdownWorkers(qqueue_t *pThis)
+static rsRetVal
+tryShutdownWorkersWithinQueueTimeout(qqueue_t *pThis)
 {
-	DEFiRet;
-	DEFVARS_mutexProtection;
+	DEFVARS_mutexProtection_uncond;
 	struct timespec tTimeout;
 	rsRetVal iRetLocal;
+	DEFiRet;
 
 	ISOBJ_TYPE_assert(pThis, qqueue);
 	ASSERT(pThis->pqParent == NULL); /* detect invalid calling sequence */
 
-	dbgoprint((obj_t*) pThis, "initiating worker thread shutdown sequence\n");
-
-	/* we reduce the low water mark in any case. This is not absolutely necessary, but
-	 * it is useful because we enable DA mode at several spots below and so we do not need
-	 * to think about the low water mark each time. 
-	 */
-	pThis->iHighWtrMrk = 1; /* if we do not do this, the DA queue will not stop! */
-	pThis->iLowWtrMrk = 0;
-
-	/* first try to shutdown the queue within the regular shutdown period */
-	BEGIN_MTX_PROTECTED_OPERATIONS(pThis->mut, LOCK_MUTEX);	/* some workers may be running in parallel! */
-	if(qqueueGetOverallQueueSize(pThis) > 0) {
+	BEGIN_MTX_PROTECTED_OPERATIONS_UNCOND(pThis->mut);	/* some workers may be running in parallel! */
+	if(getPhysicalQueueSize(pThis) > 0) {
 		if(pThis->bRunsDA) {
 			/* We may have waited on the low water mark. As it may have changed, we
 			 * see if we reactivate the worker.
@@ -1105,7 +1187,12 @@ static rsRetVal qqueueShutdownWorkers(qqueue_t *pThis)
 			wtpAdviseMaxWorkers(pThis->pWtpDA, 1);
 		}
 	}
-	END_MTX_PROTECTED_OPERATIONS(pThis->mut);
+	END_MTX_PROTECTED_OPERATIONS_UNCOND(pThis->mut);
+
+	/* at this stage, we need to have the DA worker properly initialized and running (if there is one) */
+	if(pThis->bRunsDA) {
+		qqueueWaitDAModeInitialized(pThis);
+	}
 
 	/* Now wait for the queue's workers to shut down. Note that we run into the code even if we just found
 	 * out there are no active workers - that doesn't matter: the wtp knows about that and so will
@@ -1129,116 +1216,118 @@ static rsRetVal qqueueShutdownWorkers(qqueue_t *pThis)
 	if(iRetLocal == RS_RET_TIMED_OUT) {
 		dbgoprint((obj_t*) pThis, "regular shutdown timed out on primary queue (this is OK)\n");
 	} else {
-		/* OK, the regular queue is now shut down. So we can now wait for the DA queue (if running DA) */
 		dbgoprint((obj_t*) pThis, "regular queue workers shut down.\n");
-		BEGIN_MTX_PROTECTED_OPERATIONS(pThis->mut, LOCK_MUTEX);	/* some workers may be running in parallel! */
-		if(pThis->bRunsDA) {
-			END_MTX_PROTECTED_OPERATIONS(pThis->mut);
-			dbgoprint((obj_t*) pThis, "we have a DA queue (0x%lx), requesting its shutdown.\n",
-				 qqueueGetID(pThis->pqDA));
-			/* we use the same absolute timeout as above, so we do not use more than the configured
-			 * timeout interval!
-			 */
-			dbgoprint((obj_t*) pThis, "trying shutdown of DA workers\n");
-			iRetLocal = wtpShutdownAll(pThis->pWtpDA, wtpState_SHUTDOWN, &tTimeout);
-			if(iRetLocal == RS_RET_TIMED_OUT) {
-				dbgoprint((obj_t*) pThis, "shutdown timed out on DA queue (this is OK)\n");
-			}
+	}
+
+	/* OK, the worker for the regular queue is processed, on the the DA queue regular worker. */
+	if(pThis->bRunsDA) {
+		dbgoprint((obj_t*) pThis, "we have a DA queue (0x%lx), requesting its shutdown.\n",
+			 qqueueGetID(pThis->pqDA));
+		/* we use the same absolute timeout as above, so we do not use more than the configured
+		 * timeout interval!
+		 */
+		dbgoprint((obj_t*) pThis, "trying shutdown of regular worker of DA queue\n");
+		iRetLocal = wtpShutdownAll(pThis->pqDA->pWtpReg, wtpState_SHUTDOWN, &tTimeout);
+		if(iRetLocal == RS_RET_TIMED_OUT) {
+			dbgoprint((obj_t*) pThis, "shutdown timed out on DA queue worker (this is OK)\n");
 		} else {
-			END_MTX_PROTECTED_OPERATIONS(pThis->mut);
+			dbgoprint((obj_t*) pThis, "DA queue worker shut down.\n");
+		}
+		/* we also instruct the DA worker pool to shutdown ASAP. If we need it for persisting
+		 * the queue, it is restarted at a later stage. We don't care here if a timeout happens.
+		 */
+		dbgoprint((obj_t*) pThis, "trying shutdown of regular worker of DA queue\n");
+		iRetLocal = wtpShutdownAll(pThis->pWtpDA, wtpState_SHUTDOWN_IMMEDIATE, &tTimeout);
+		if(iRetLocal == RS_RET_TIMED_OUT) {
+			dbgoprint((obj_t*) pThis, "shutdown timed out on main queue DA worker pool (this is OK)\n");
+		} else {
+			dbgoprint((obj_t*) pThis, "main queue DA worker pool shut down.\n");
 		}
 	}
 
-	/* when we reach this point, both queues are either empty or the regular queue shutdown timeout
-	 * has expired. Now we need to check if we are configured to not loose messages. If so, we need
-	 * to persist the queue to disk (this is only possible if the queue is DA-enabled). We must also
-	 * set the primary queue to SHUTDOWN_IMMEDIATE, as it shall now terminate as soon as its consumer
-	 * is done. This is especially important as we otherwise may interfere with queue order while the
-	 * DA consumer is running. -- rgerhards, 2008-01-27
-	 * Note: there was a note that we should not wait eternally on the DA worker if we run in
-	 * enqueue-only note. I have reviewed the code and think there is no need for this check. Howerver,
-	 * I'd like to keep this note in here should we happen to run into some related trouble.
-	 * rgerhards, 2008-01-28
-	 */
-	wtpSetState(pThis->pWtpReg, wtpState_SHUTDOWN_IMMEDIATE); /* set primary queue to shutdown only */
+	RETiRet;
+}
 
-	/* at this stage, we need to have the DA worker properly initialized and running (if there is one) */
-	if(pThis->bRunsDA)
-		qqueueWaitDAModeInitialized(pThis);
 
+/* Try to shut down regular and DA queue workers, within the action timeout 
+ * period. Note that the main queue DA worker is still unaffected (and may shuffle
+ * data to the disk queue while we terminate the other workers). Not finishing
+ * processing all messages is now OK (but they may be preserved later, depending
+ * on bSaveOnShutdown setting).
+ * rgerhards, 2009-05-27
+ */
+static rsRetVal
+tryShutdownWorkersWithinActionTimeout(qqueue_t *pThis)
+{
+	DEFVARS_mutexProtection;
+	struct timespec tTimeout;
+	rsRetVal iRetLocal;
+	DEFiRet;
+
+	ISOBJ_TYPE_assert(pThis, qqueue);
+	ASSERT(pThis->pqParent == NULL); /* detect invalid calling sequence */
+
+	/* instruct workers to finish ASAP, even if still work exists */
+	/* note that we modify bEnqOnly direclty, because going through the method would
+	 * startup some workers again. So this is OK here. -- rgerhards, 2009-05-28
+	 */
+	pThis->bEnqOnly = 1;
+	wtpSetState(pThis->pWtpReg, wtpState_SHUTDOWN_IMMEDIATE);
+	if(pThis->pqDA != NULL) {
+		pThis->pqDA->bEnqOnly = 1;
+		wtpSetState(pThis->pqDA->pWtpReg, wtpState_SHUTDOWN_IMMEDIATE);
+	}
+
+	/* now give the queue workers a last chance to gracefully shut down (based on action timeout setting) */
+	timeoutComp(&tTimeout, pThis->toActShutdown);
 	BEGIN_MTX_PROTECTED_OPERATIONS(pThis->mut, LOCK_MUTEX);	/* some workers may be running in parallel! */
-	/* optimize parameters for shutdown of DA-enabled queues */
-	if(pThis->bIsDA && qqueueGetOverallQueueSize(pThis) > 0 && pThis->bSaveOnShutdown) {
-		/* switch to enqueue-only mode so that no more actions happen */
-		if(pThis->bRunsDA == 0) {
-			qqueueInitDA(pThis, QUEUE_MODE_ENQONLY, MUTEX_ALREADY_LOCKED); /* switch to DA mode */
-		} else {
-			/* TODO: RACE: we may reach this point when the DA worker has been initialized (state 1)
-			 * but is not yet running (state 2). In this case, pThis->pqDA is NULL! rgerhards, 2008-02-27
-			 */
-			qqueueSetEnqOnly(pThis->pqDA, QUEUE_MODE_ENQONLY, MUTEX_ALREADY_LOCKED); /* switch to enqueue-only mode */
-		}
+	if(wtpGetCurNumWrkr(pThis->pWtpReg, LOCK_MUTEX) > 0) {
 		END_MTX_PROTECTED_OPERATIONS(pThis->mut);
-		/* make sure we do not timeout before we are done */
-		dbgoprint((obj_t*) pThis, "bSaveOnShutdown configured, eternal timeout set\n");
-		timeoutComp(&tTimeout, QUEUE_TIMEOUT_ETERNAL);
-		/* and run the primary queue's DA worker to drain the queue */
-		iRetLocal = wtpShutdownAll(pThis->pWtpDA, wtpState_SHUTDOWN, &tTimeout);
-		if(iRetLocal != RS_RET_OK) {
-			dbgoprint((obj_t*) pThis, "unexpected iRet state %d after trying to shut down primary queue in disk save mode, "
-				  "continuing, but results are unpredictable\n", iRetLocal);
+		dbgoprint((obj_t*) pThis, "trying immediate shutdown of regular workers\n");
+		iRetLocal = wtpShutdownAll(pThis->pWtpReg, wtpState_SHUTDOWN_IMMEDIATE, &tTimeout);
+		if(iRetLocal == RS_RET_TIMED_OUT) {
+			dbgoprint((obj_t*) pThis, "immediate shutdown timed out on primary queue (this is acceptable and "
+				  "triggers cancellation)\n");
+		} else if(iRetLocal != RS_RET_OK) {
+			dbgoprint((obj_t*) pThis, "unexpected iRet state %d after trying immediate shutdown of the primary queue "
+				  "in disk save mode. Continuing, but results are unpredictable\n", iRetLocal);
 		}
-	} else {
-		END_MTX_PROTECTED_OPERATIONS(pThis->mut);
+		/* we need to re-aquire the mutex for the next check in this case! */
+		BEGIN_MTX_PROTECTED_OPERATIONS(pThis->mut, LOCK_MUTEX);
 	}
 
-	/* now the primary queue is either empty, persisted to disk - or set to loose messages. So we
-	 * can now request immediate shutdown of any remaining workers. Note that if bSaveOnShutdown was set,
-	 * the queue is now empty. If regular workers are still running, and try to pull the next message,
-	 * they will automatically terminate as there no longer is any message left to process.
-	 */
-	BEGIN_MTX_PROTECTED_OPERATIONS(pThis->mut, LOCK_MUTEX);	/* some workers may be running in parallel! */
-	if(qqueueGetOverallQueueSize(pThis) > 0) {
-		timeoutComp(&tTimeout, pThis->toActShutdown);
-		if(wtpGetCurNumWrkr(pThis->pWtpReg, LOCK_MUTEX) > 0) {
-			END_MTX_PROTECTED_OPERATIONS(pThis->mut);
-			dbgoprint((obj_t*) pThis, "trying immediate shutdown of regular workers\n");
-			iRetLocal = wtpShutdownAll(pThis->pWtpReg, wtpState_SHUTDOWN_IMMEDIATE, &tTimeout);
-			if(iRetLocal == RS_RET_TIMED_OUT) {
-				dbgoprint((obj_t*) pThis, "immediate shutdown timed out on primary queue (this is acceptable and "
-					  "triggers cancellation)\n");
-			} else if(iRetLocal != RS_RET_OK) {
-				dbgoprint((obj_t*) pThis, "unexpected iRet state %d after trying immediate shutdown of the primary queue "
-					  "in disk save mode. Continuing, but results are unpredictable\n", iRetLocal);
-			}
-			/* we need to re-aquire the mutex for the next check in this case! */
-			BEGIN_MTX_PROTECTED_OPERATIONS(pThis->mut, LOCK_MUTEX);	/* some workers may be running in parallel! */
-		}
-		if(pThis->bIsDA && wtpGetCurNumWrkr(pThis->pWtpDA, LOCK_MUTEX) > 0) {
-			/* and now the same for the DA queue */
-			END_MTX_PROTECTED_OPERATIONS(pThis->mut);
-			dbgoprint((obj_t*) pThis, "trying immediate shutdown of DA workers\n");
-			iRetLocal = wtpShutdownAll(pThis->pWtpDA, wtpState_SHUTDOWN_IMMEDIATE, &tTimeout);
-			if(iRetLocal == RS_RET_TIMED_OUT) {
-				dbgoprint((obj_t*) pThis, "immediate shutdown timed out on DA queue (this is acceptable and "
-					  "triggers cancellation)\n");
-			} else if(iRetLocal != RS_RET_OK) {
-				dbgoprint((obj_t*) pThis, "unexpected iRet state %d after trying immediate shutdown of the DA queue "
-					  "in disk save mode. Continuing, but results are unpredictable\n", iRetLocal);
-			}
-		} else {
-			END_MTX_PROTECTED_OPERATIONS(pThis->mut);
-		}
-	} else {
+	if(pThis->bRunsDA && wtpGetCurNumWrkr(pThis->pqDA->pWtpReg, LOCK_MUTEX) > 0) {
+		/* and now the same for the DA queue */
 		END_MTX_PROTECTED_OPERATIONS(pThis->mut);
+		dbgoprint((obj_t*) pThis, "trying immediate shutdown of DA queue workers\n");
+		iRetLocal = wtpShutdownAll(pThis->pqDA->pWtpReg, wtpState_SHUTDOWN_IMMEDIATE, &tTimeout);
+		if(iRetLocal == RS_RET_TIMED_OUT) {
+			dbgoprint((obj_t*) pThis, "immediate shutdown timed out on DA queue (this is acceptable and "
+				  "triggers cancellation)\n");
+		} else if(iRetLocal != RS_RET_OK) {
+			dbgoprint((obj_t*) pThis, "unexpected iRet state %d after trying immediate shutdown of the DA queue "
+				  "in disk save mode. Continuing, but results are unpredictable\n", iRetLocal);
+		}
 	}
+	END_MTX_PROTECTED_OPERATIONS(pThis->mut);
+
+	RETiRet;
+}
+
+
+/* This function cancels all remenaing regular workers for both the main and the DA
+ * queue. The main queue's DA worker pool continues to run (if it exists and is active).
+ * rgerhards, 2009-05-29
+ */
+static rsRetVal
+cancelWorkers(qqueue_t *pThis)
+{
+	rsRetVal iRetLocal;
+	DEFiRet;
 
 	/* Now queue workers should have terminated. If not, we need to cancel them as we have applied
 	 * all timeout setting. If any worker in any queue still executes, its consumer is possibly
-	 * long-running and cancelling is the only way to get rid of it. Note that the
-	 * cancellation handler will probably re-queue a user pointer, so the queue's enqueue
-	 * function is still needed (what is no problem as we do not yet destroy the queue - but I
-	 * thought it's a good idea to mention that fact). -- rgerhards, 2008-01-25
+	 * long-running and cancelling is the only way to get rid of it.
 	 */
 	dbgoprint((obj_t*) pThis, "checking to see if we need to cancel any worker threads of the primary queue\n");
 	iRetLocal = wtpCancelAll(pThis->pWtpReg); /* returns immediately if all threads already have terminated */
@@ -1247,12 +1336,6 @@ static rsRetVal qqueueShutdownWorkers(qqueue_t *pThis)
 			  "threads, continuing, but results are unpredictable\n", iRetLocal);
 	}
 
-
-	/* TODO: think: do we really need to do this here? Can't it happen on DA queue destruction? If we 
-	 * disable it, we get an assertion... I think this is OK, as we need to have a certain order and
-	 * canceling the DA workers here ensures that order. But in any instant, we may have a look at this
-	 * code after we have reaced the milestone. -- rgerhards, 2008-01-27
-	 */
 	/* ... and now the DA queue, if it exists (should always be after the primary one) */
 	if(pThis->pqDA != NULL) {
 		dbgoprint((obj_t*) pThis, "checking to see if we need to cancel any worker threads of the DA queue\n");
@@ -1261,14 +1344,70 @@ static rsRetVal qqueueShutdownWorkers(qqueue_t *pThis)
 			dbgoprint((obj_t*) pThis, "unexpected iRet state %d trying to cancel DA queue worker "
 				  "threads, continuing, but results are unpredictable\n", iRetLocal);
 		}
+
+		/* finally, we cancel the main queue's DA worker pool, if it still is running. It may be
+		 * restarted later to persist the queue. But we stop it, because otherwise we get into
+		 * big trouble when resetting the logical dequeue pointer. This operation can only be
+		 * done when *no* worker is running. So time for a shutdown... -- rgerhards, 2009-05-28
+		 */
+		dbgoprint((obj_t*) pThis, "checking to see if we need to cancel the main queue's DA worker pool\n");
+		iRetLocal = wtpCancelAll(pThis->pWtpDA); /* returns immediately if all threads already have terminated */
 	}
 
+	RETiRet;
+}
+
+
+/* This function shuts down all worker threads and waits until they
+ * have terminated. If they timeout, they are cancelled.
+ * rgerhards, 2008-01-24
+ * Please note that this function shuts down BOTH the parent AND the child queue
+ * in DA case. This is necessary because their timeouts are tightly coupled. Most
+ * importantly, the timeouts would be applied twice (or logic be extremely
+ * complex) if each would have its own shutdown. The function does not self check
+ * this condition - the caller must make sure it is not called with a parent.
+ * rgerhards, 2009-05-26: we do NO longer persist the queue here if bSaveOnShutdown
+ * is set. This must be handled by the caller. Not doing that cleans up the queue
+ * shutdown considerably. Also, older engines had a potential hang condition when
+ * the DA queue was already started and the DA worker configured for infinite
+ * retries and the action was during retry processing. This was a design issue,
+ * which is solved as of now. Note that the shutdown now may take a little bit
+ * longer, because we no longer can persist the queue in parallel to waiting
+ * on worker timeouts.
+ */
+static rsRetVal
+ShutdownWorkers(qqueue_t *pThis)
+{
+	DEFiRet;
+
+	ISOBJ_TYPE_assert(pThis, qqueue);
+	ASSERT(pThis->pqParent == NULL); /* detect invalid calling sequence */
+
+	dbgoprint((obj_t*) pThis, "initiating worker thread shutdown sequence\n");
+
+	/* we reduce the low water mark in any case. This is not absolutely necessary, but
+	 * it is useful because we enable DA mode at several spots below and so we do not need
+	 * to think about the low water mark each time. 
+	 */
+	pThis->iHighWtrMrk = 1; /* if we do not do this, the DA queue will not stop! */
+	pThis->iLowWtrMrk = 0;
+
+	CHKiRet(tryShutdownWorkersWithinQueueTimeout(pThis));
+
+	if(getPhysicalQueueSize(pThis) > 0) {
+		CHKiRet(tryShutdownWorkersWithinActionTimeout(pThis));
+	}
+
+	CHKiRet(cancelWorkers(pThis));
+
 	/* ... finally ... all worker threads have terminated :-)
 	 * Well, more precisely, they *are in termination*. Some cancel cleanup handlers
-	 * may still be running. 
+	 * may still be running. Note that the main queue's DA worker may still be running.
 	 */
-	dbgoprint((obj_t*) pThis, "worker threads terminated, remaining queue size %d.\n", qqueueGetOverallQueueSize(pThis));
+	dbgoprint((obj_t*) pThis, "worker threads terminated, remaining queue size log %d, phys %d.\n",
+		  getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis));
 
+finalize_it:
 	RETiRet;
 }
 
@@ -1280,7 +1419,7 @@ static rsRetVal qqueueShutdownWorkers(qqueue_t *pThis)
  * to modify some parameters before the queue is actually started.
  */
 rsRetVal qqueueConstruct(qqueue_t **ppThis, queueType_t qType, int iWorkerThreads,
-		        int iMaxQueueSize, rsRetVal (*pConsumer)(void*,void*))
+		        int iMaxQueueSize, rsRetVal (*pConsumer)(void*, batch_t*))
 {
 	DEFiRet;
 	qqueue_t *pThis;
@@ -1305,10 +1444,12 @@ rsRetVal qqueueConstruct(qqueue_t **ppThis, queueType_t qType, int iWorkerThread
 	pThis->lenSpoolDir = strlen((char*)pThis->pszSpoolDir);
 	pThis->iMaxFileSize = 1024 * 1024; /* default is 1 MiB */
 	pThis->iQueueSize = 0;
+	pThis->nLogDeq = 0;
 	pThis->iMaxQueueSize = iMaxQueueSize;
 	pThis->pConsumer = pConsumer;
 	pThis->iNumWorkerThreads = iWorkerThreads;
 	pThis->iDeqtWinToHr = 25; /* disable time-windowed dequeuing by default */
+	pThis->iDeqBatchSize = 8; /* conservative default, should still provide good performance */
 
 	pThis->pszFilePrefix = NULL;
 	pThis->qType = qType;
@@ -1319,19 +1460,25 @@ rsRetVal qqueueConstruct(qqueue_t **ppThis, queueType_t qType, int iWorkerThread
 			pThis->qConstruct = qConstructFixedArray;
 			pThis->qDestruct = qDestructFixedArray;
 			pThis->qAdd = qAddFixedArray;
+			pThis->qDeq = qDeqFixedArray;
 			pThis->qDel = qDelFixedArray;
+			pThis->qUnDeqAll = qUnDeqAllFixedArray;
 			break;
 		case QUEUETYPE_LINKEDLIST:
 			pThis->qConstruct = qConstructLinkedList;
 			pThis->qDestruct = qDestructLinkedList;
 			pThis->qAdd = qAddLinkedList;
-			pThis->qDel = (rsRetVal (*)(qqueue_t*,void**)) qDelLinkedList;
+			pThis->qDeq = (rsRetVal (*)(qqueue_t*,void**)) qDeqLinkedList;
+			pThis->qDel = (rsRetVal (*)(qqueue_t*)) qDelLinkedList;
+			pThis->qUnDeqAll = qUnDeqAllLinkedList;
 			break;
 		case QUEUETYPE_DISK:
 			pThis->qConstruct = qConstructDisk;
 			pThis->qDestruct = qDestructDisk;
 			pThis->qAdd = qAddDisk;
+			pThis->qDeq = qDeqDisk;
 			pThis->qDel = qDelDisk;
+			pThis->qUnDeqAll = qUnDeqAllDisk;
 			/* special handling */
 			pThis->iNumWorkerThreads = 1; /* we need exactly one worker */
 			break;
@@ -1340,6 +1487,7 @@ rsRetVal qqueueConstruct(qqueue_t **ppThis, queueType_t qType, int iWorkerThread
 			pThis->qDestruct = qDestructDirect;
 			pThis->qAdd = qAddDirect;
 			pThis->qDel = qDelDirect;
+			pThis->qUnDeqAll = qUnDeqAllDirect;
 			break;
 	}
 
@@ -1349,36 +1497,6 @@ finalize_it:
 }
 
 
-/* cancellation cleanup handler for queueWorker ()
- * Updates admin structure and frees ressources.
- * Params:
- * arg1 - user pointer (in this case a qqueue_t)
- * arg2 - user data pointer (in this case a queue data element, any object [queue's pUsr ptr!])
- * Note that arg2 may be NULL, in which case no dequeued but unprocessed pUsr exists!
- * rgerhards, 2008-01-16
- */
-static rsRetVal
-qqueueConsumerCancelCleanup(void *arg1, void *arg2)
-{
-	DEFiRet;
-
-	qqueue_t *pThis = (qqueue_t*) arg1;
-	obj_t *pUsr = (obj_t*) arg2;
-
-	ISOBJ_TYPE_assert(pThis, qqueue);
-
-	if(pUsr != NULL) {
-		/* make sure the data element is not lost */
-		dbgoprint((obj_t*) pThis, "cancelation cleanup handler consumer called, we need to unget one user data element\n");
-		CHKiRet(qqueueUngetObj(pThis, pUsr, LOCK_MUTEX));
-	}
-	
-finalize_it:
-	RETiRet;
-}
-
-
-
 /* This function checks if the provided message shall be discarded and does so, if needed.
  * In DA mode, we do not discard any messages as we assume the disk subsystem is fast enough to
  * provide real-time creation of spool files.
@@ -1419,38 +1537,171 @@ finalize_it:
 }
 
 
-/* dequeue the queued object for the queue consumers.
- * rgerhards, 2008-10-21
+/* Finally remove n elements from the queue store.
  */
-static rsRetVal
-qqueueDequeueConsumable(qqueue_t *pThis, wti_t *pWti, int iCancelStateSave)
+static inline rsRetVal
+DoDeleteBatchFromQStore(qqueue_t *pThis, int nElem)
+{
+	int i;
+	DEFiRet;
+
+	ISOBJ_TYPE_assert(pThis, qqueue);
+
+	/* now send delete request to storage driver */
+	for(i = 0 ; i < nElem ; ++i) {
+		pThis->qDel(pThis);
+	}
+
+	/* iQueueSize is not decremented by qDel(), so we need to do it ourselves */
+	ATOMIC_SUB(pThis->iQueueSize, nElem);
+	ATOMIC_SUB(pThis->nLogDeq, nElem);
+dbgprintf("delete batch from store, new sizes: log %d, phys %d\n", getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis));
+	++pThis->deqIDDel; /* one more batch dequeued */
+
+	RETiRet;
+}
+
+
+/* remove messages from the physical queue store that are fully processed. This is
+ * controlled via the to-delete list. We can only delete those elements, that are
+ * at the current physical tail of the queue. If the batch is from another position,
+ * we schedule it for deletion, but actual deletion will happen at a later call
+ * of this function here. We always delete as much as possible, which includes
+ * picking up things from the to-delete list.
+ */
+static inline rsRetVal
+DeleteBatchFromQStore(qqueue_t *pThis, batch_t *pBatch)
 {
+	toDeleteLst_t *pTdl;
+	qDeqID	deqIDDel;
 	DEFiRet;
+
+	ISOBJ_TYPE_assert(pThis, qqueue);
+	assert(pBatch != NULL);
+
+	pTdl = tdlPeek(pThis); /* get current head element */
+	if(pTdl == NULL) { /* to-delete list empty */
+		DoDeleteBatchFromQStore(pThis, pBatch->nElemDeq);
+	} else if(pBatch->deqID == pThis->deqIDDel) {
+		deqIDDel = pThis->deqIDDel;
+		pTdl = tdlPeek(pThis);
+		while(pTdl != NULL && deqIDDel == pTdl->deqID) {
+			DoDeleteBatchFromQStore(pThis, pTdl->nElemDeq);
+			tdlPop(pThis);
+			++deqIDDel;
+			pTdl = tdlPeek(pThis);
+		}
+	} else {
+		/* can not delete, insert into to-delete list */
+		dbgprintf("not at head of to-delete list, enqueue %d\n", (int) pBatch->deqID);
+		CHKiRet(tdlAdd(pThis, pBatch->deqID, pBatch->nElemDeq));
+	}
+
+finalize_it:
+	RETiRet;
+}
+
+
+/* Delete a batch of processed user objects from the queue, which includes
+ * destructing the objects themself.
+ * rgerhards, 2009-05-13
+ */
+static inline rsRetVal
+DeleteProcessedBatch(qqueue_t *pThis, batch_t *pBatch)
+{
+	int i;
 	void *pUsr;
+	DEFiRet;
+
+	ISOBJ_TYPE_assert(pThis, qqueue);
+	assert(pBatch != NULL);
+
+	for(i = 0 ; i < pBatch->nElem ; ++i) {
+		pUsr = pBatch->pElem[i].pUsrp;
+		objDestruct(pUsr);
+	}
+
+	iRet = DeleteBatchFromQStore(pThis, pBatch);
+
+	pBatch->nElem = pBatch->nElemDeq = 0; /* reset batch */
+
+	RETiRet;
+}
+
+
+/* dequeue as many user pointers as are available, until we hit the configured
+ * upper limit of pointers.
+ * This must only be called when the queue mutex is LOOKED, otherwise serious
+ * malfunction will happen.
+ */
+static inline rsRetVal
+DequeueConsumableElements(qqueue_t *pThis, wti_t *pWti, int *piRemainingQueueSize)
+{
+	int nDequeued;
+	int nDiscarded;
+	int nDeleted;
 	int iQueueSize;
-	int bRunsDA;	 /* cache for early mutex release */
-
-	/* dequeue element (still protected from mutex) */
-	iRet = qqueueDel(pThis, &pUsr);
-	qqueueChkPersist(pThis);
-	iQueueSize = qqueueGetOverallQueueSize(pThis); /* cache this for after mutex release */
-	bRunsDA = pThis->bRunsDA; /* cache this for after mutex release */
-
-	/* We now need to save the user pointer for the cancel cleanup handler, BUT ONLY
-	 * if we could successfully obtain a user pointer. Otherwise, we would bring the
-	 * cancel cleanup handler into big troubles (and we did ;)). Note that we can
-	 * NOT set the variable further below, as this may lead to an object leak. We 
-	 * may get cancelled before we reach that part of the code, so the only 
-	 * solution is to do it here. -- rgerhards, 2008-02-27
-	 */
-	if(iRet == RS_RET_OK) {
-		pWti->pUsrp = pUsr;
+	void *pUsr;
+	rsRetVal localRet;
+	DEFiRet;
+
+	nDeleted = pWti->batch.nElemDeq;
+	DeleteProcessedBatch(pThis, &pWti->batch);
+
+	nDequeued = nDiscarded = 0;
+	while((iQueueSize = getLogicalQueueSize(pThis)) > 0 && nDequeued < pThis->iDeqBatchSize) {
+dbgprintf("DequeueConsumableElements, index %d\n", nDequeued);
+		CHKiRet(qqueueDeq(pThis, &pUsr));
+
+		/* check if we should discard this element */
+		localRet = qqueueChkDiscardMsg(pThis, pThis->iQueueSize, pThis->bRunsDA, pUsr);
+		if(localRet == RS_RET_QUEUE_FULL) {
+			++nDiscarded;
+			continue;
+		} else if(localRet != RS_RET_OK) {
+			ABORT_FINALIZE(localRet);
+		}
+
+		/* all well, use this element */
+		pWti->batch.pElem[nDequeued].pUsrp = pUsr;
+		pWti->batch.pElem[nDequeued].state = BATCH_STATE_RDY;
+		++nDequeued;
 	}
 
+	/* it is sufficient to persist only when the bulk of work is done */
+	qqueueChkPersist(pThis, nDequeued+nDiscarded+nDeleted);
+
+	pWti->batch.nElem = nDequeued;
+	pWti->batch.nElemDeq = nDequeued + nDiscarded;
+	pWti->batch.deqID = getNextDeqID(pThis);
+	*piRemainingQueueSize = iQueueSize;
+
+finalize_it:
+	RETiRet;
+}
+
+
+/* dequeue the queued object for the queue consumers.
+ * rgerhards, 2008-10-21
+ * I made a radical change - we now dequeue multiple elements, and store these objects in
+ * an array of user pointers. We expect that this increases performance.
+ * rgerhards, 2009-04-22
+ */
+static rsRetVal
+DequeueConsumable(qqueue_t *pThis, wti_t *pWti, int iCancelStateSave)
+{
+	DEFiRet;
+	int iQueueSize = 0; /* keep the compiler happy... */
+
+	/* dequeue element batch (still protected from mutex) */
+	iRet = DequeueConsumableElements(pThis, pWti, &iQueueSize);
+
 	/* awake some flow-controlled sources if we can do this right now */
 	/* TODO: this could be done better from a performance point of view -- do it only if
 	 * we have someone waiting for the condition (or only when we hit the watermark right
 	 * on the nail [exact value]) -- rgerhards, 2008-03-14
+	 * now that we dequeue batches of pointers, this is much less an issue...
+	 * rgerhards, 2009-04-22
 	 */
 	if(iQueueSize < pThis->iFullDlyMrk) {
 		pthread_cond_broadcast(&pThis->belowFullDlyWtrMrk);
@@ -1460,37 +1711,17 @@ qqueueDequeueConsumable(qqueue_t *pThis, wti_t *pWti, int iCancelStateSave)
 		pthread_cond_broadcast(&pThis->belowLightDlyWtrMrk);
 	}
 
-	/* rgerhards, 2008-09-30: I reversed the order of cond_signal und mutex_unlock
-	 * as of the pthreads recommendation on predictable scheduling behaviour. I don't see
-	 * any problems caused by this, but I add this comment in case some will be seen
-	 * in the next time.
-	 */
+	// TODO: MULTI: check physical queue size?
 	pthread_cond_signal(&pThis->notFull);
 	d_pthread_mutex_unlock(pThis->mut);
 	pthread_setcancelstate(iCancelStateSave, NULL);
 	/* WE ARE NO LONGER PROTECTED BY THE MUTEX */
 
-	/* do actual processing (the lengthy part, runs in parallel)
-	 * If we had a problem while dequeing, we do not call the consumer,
-	 * but we otherwise ignore it. This is in the hopes that it will be
-	 * self-healing. However, this is really not a good thing.
-	 * rgerhards, 2008-01-03
-	 */
-	if(iRet != RS_RET_OK)
-		FINALIZE;
-
-	/* we are running in normal, non-disk-assisted mode do a quick check if we need to drain the queue.
-	 * In DA mode, we do not discard any messages as we assume the disk subsystem is fast enough to
-	 * provide real-time creation of spool files.
-	 * Note: It is OK to use the cached iQueueSize here, because it does not hurt if it is slightly wrong.
-	 */
-	CHKiRet(qqueueChkDiscardMsg(pThis, iQueueSize, bRunsDA, pUsr));
-
-finalize_it:
 	if(iRet != RS_RET_OK && iRet != RS_RET_DISCARDMSG) {
 		dbgoprint((obj_t*) pThis, "error %d dequeueing element - ignoring, but strange things "
 			  "may happen\n", iRet);
 	}
+
 	RETiRet;
 }
 
@@ -1533,7 +1764,7 @@ finalize_it:
  * but you get the idea from the code above.
  */
 static rsRetVal
-qqueueRateLimiter(qqueue_t *pThis)
+RateLimiter(qqueue_t *pThis)
 {
 	DEFiRet;
 	int iDelay;
@@ -1590,25 +1821,68 @@ qqueueRateLimiter(qqueue_t *pThis)
 }
 
 
+/* This dequeues the next batch and checks if the queue is empty. If it is 
+ * empty, return RS_RET_IDLE. That will trigger termination of the function
+ * and tell the upper layer caller to initiate idle processing.
+ * rgerhards, 2009-05-20
+ */
+static inline rsRetVal
+DequeueForConsumer(qqueue_t *pThis, wti_t *pWti, int iCancelStateSave)
+{
+	DEFiRet;
+
+	ISOBJ_TYPE_assert(pThis, qqueue);
+	ISOBJ_TYPE_assert(pWti, wti);
+
+	CHKiRet(DequeueConsumable(pThis, pWti, iCancelStateSave));
+
+	if(pWti->batch.nElem == 0)
+		ABORT_FINALIZE(RS_RET_IDLE);
+
+finalize_it:
+	RETiRet;
+}
+
+
+/* This is called when a batch is processed and the worker does not
+ * ask for another batch (e.g. because it is to be terminated)
+ * rgerhards, 2009-05-27
+ */
+static rsRetVal
+batchProcessed(qqueue_t *pThis, wti_t *pWti)
+{
+	DEFiRet;
+
+	ISOBJ_TYPE_assert(pThis, qqueue);
+	ISOBJ_TYPE_assert(pWti, wti);
+dbgprintf("XXX: batchProcessed deletes %d records\n", pWti->batch.nElemDeq);
+
+	DeleteProcessedBatch(pThis, &pWti->batch);
+	qqueueChkPersist(pThis, pWti->batch.nElemDeq);
+
+	RETiRet;
+}
+
 
 /* This is the queue consumer in the regular (non-DA) case. It is 
  * protected by the queue mutex, but MUST release it as soon as possible.
  * rgerhards, 2008-01-21
  */
 static rsRetVal
-qqueueConsumerReg(qqueue_t *pThis, wti_t *pWti, int iCancelStateSave)
+ConsumerReg(qqueue_t *pThis, wti_t *pWti, int iCancelStateSave)
 {
 	DEFiRet;
 
 	ISOBJ_TYPE_assert(pThis, qqueue);
 	ISOBJ_TYPE_assert(pWti, wti);
 
-	CHKiRet(qqueueDequeueConsumable(pThis, pWti, iCancelStateSave));
-	CHKiRet(pThis->pConsumer(pThis->pUsr, pWti->pUsrp));
+	CHKiRet(DequeueForConsumer(pThis, pWti, iCancelStateSave));
+	CHKiRet(pThis->pConsumer(pThis->pUsr, &pWti->batch));
 
 	/* we now need to check if we should deliberately delay processing a bit
 	 * and, if so, do that. -- rgerhards, 2008-01-30
 	 */
+//TODO: MULTIQUEUE: the following setting is no longer correct - need to think about how to do that...
 	if(pThis->iDeqSlowdown) {
 		dbgoprint((obj_t*) pThis, "sleeping %d microseconds as requested by config params\n",
 			  pThis->iDeqSlowdown);
@@ -1616,11 +1890,12 @@ qqueueConsumerReg(qqueue_t *pThis, wti_t *pWti, int iCancelStateSave)
 	}
 
 finalize_it:
+dbgprintf("XXX: regular consumer finished, iret=%d, szlog %d sz phys %d\n", iRet, getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis));
 	RETiRet;
 }
 
 
-/* This is a special consumer to feed the disk-queue in disk-assited mode.
+/* This is a special consumer to feed the disk-queue in disk-assisted mode.
  * When active, our own queue more or less acts as a memory buffer to the disk.
  * So this consumer just needs to drain the memory queue and submit entries
  * to the disk queue. The disk queue will then call the actual consumer from
@@ -1630,15 +1905,23 @@ finalize_it:
  * rgerhards, 2008-01-14
  */
 static rsRetVal
-qqueueConsumerDA(qqueue_t *pThis, wti_t *pWti, int iCancelStateSave)
+ConsumerDA(qqueue_t *pThis, wti_t *pWti, int iCancelStateSave)
 {
+	int i;
 	DEFiRet;
 
 	ISOBJ_TYPE_assert(pThis, qqueue);
 	ISOBJ_TYPE_assert(pWti, wti);
 
-	CHKiRet(qqueueDequeueConsumable(pThis, pWti, iCancelStateSave));
-	CHKiRet(qqueueEnqObj(pThis->pqDA, eFLOWCTL_NO_DELAY, pWti->pUsrp));
+	CHKiRet(DequeueForConsumer(pThis, pWti, iCancelStateSave));
+	/* iterate over returned results and enqueue them in DA queue */
+	for(i = 0 ; i < pWti->batch.nElem ; i++) {
+		/* TODO: we must add a generic "addRef" mechanism, because the disk queue enqueue destructs
+		 * the message. So far, we simply assume we always have msg_t, what currently is always the case.
+		 * rgerhards, 2009-05-28
+		 */
+		CHKiRet(qqueueEnqObj(pThis->pqDA, eFLOWCTL_NO_DELAY, (obj_t*)MsgAddRef((msg_t*)(pWti->batch.pElem[i].pUsrp))));
+	}
 
 finalize_it:
 	dbgoprint((obj_t*) pThis, "DAConsumer returns with iRet %d\n", iRet);
@@ -1651,20 +1934,17 @@ finalize_it:
  * If we are a child, we have done our duty when the queue is empty. In that case,
  * we can terminate.
  * Version for the DA worker thread. NOTE: the pThis->bRunsDA is different from
- * the DA queue
+ * the DA queue.
+ * If our queue is in destruction, we drain to the DA queue and so we shall not terminate
+ * until we have done so.
  */
-static int
+static rsRetVal
 qqueueChkStopWrkrDA(qqueue_t *pThis)
 {
-	/* if our queue is in destruction, we drain to the DA queue and so we shall not terminate
-	 * until we have done so.
-	 */
-	int bStopWrkr;
-
-	BEGINfunc
+	DEFiRet;
 
 	if(pThis->bEnqOnly) {
-		bStopWrkr = 1;
+		iRet = RS_RET_TERMINATE_WHEN_IDLE;
 	} else {
 		if(pThis->bRunsDA) {
 			ASSERT(pThis->pqDA != NULL);
@@ -1672,19 +1952,21 @@ qqueueChkStopWrkrDA(qqueue_t *pThis)
 			   && pThis->pqDA->sizeOnDiskMax > 0
 			   && pThis->pqDA->tVars.disk.sizeOnDisk > pThis->pqDA->sizeOnDiskMax) {
 				/* this queue can never grow, so we can give up... */
-				bStopWrkr = 1;
-			} else if(qqueueGetOverallQueueSize(pThis) < pThis->iHighWtrMrk && pThis->bQueueStarted == 1) {
-				bStopWrkr = 1;
-			} else {
-				bStopWrkr = 0;
+				iRet = RS_RET_TERMINATE_NOW;
+			} else if(getPhysicalQueueSize(pThis) < pThis->iHighWtrMrk && pThis->bQueueStarted == 1) {
+dbgprintf("XXX: terminate_NOW DA worker: queue size %d, high water mark %d\n", getPhysicalQueueSize(pThis), pThis->iHighWtrMrk);
+				iRet = RS_RET_TERMINATE_NOW;
+RUNLOG_STR("XXX: re-start reg worker");
+qqueueAdviseMaxWorkers(pThis);
+RUNLOG_STR("XXX: done re-start reg worker");
 			}
 		} else {
-			bStopWrkr = 1;
+		// experimental	iRet = RS_RET_TERMINATE_NOW;
+		;
 		}
 	}
 
-	ENDfunc
-	return  bStopWrkr;
+	RETiRet;
 }
 
 
@@ -1695,38 +1977,51 @@ qqueueChkStopWrkrDA(qqueue_t *pThis)
  * Version for the regular worker thread. NOTE: the pThis->bRunsDA is different from
  * the DA queue
  */
-static int
-qqueueChkStopWrkrReg(qqueue_t *pThis)
+static rsRetVal
+ChkStopWrkrReg(qqueue_t *pThis)
+{
+	DEFiRet;
+	if(pThis->bEnqOnly) {
+dbgprintf("XXX: terminate_NOW queue:Reg worker: enqOnly! queue size %d\n", getPhysicalQueueSize(pThis));
+		iRet = RS_RET_TERMINATE_NOW;
+	} else if(pThis->pqParent != NULL) {
+		iRet = RS_RET_TERMINATE_WHEN_IDLE;
+	}
+
+	RETiRet;
+}
+
+
+/* return the configured "deq max at once" interval
+ * rgerhards, 2009-04-22
+ */
+static rsRetVal
+GetDeqBatchSize(qqueue_t *pThis, int *pVal)
 {
-	return pThis->bEnqOnly || pThis->bRunsDA || (pThis->pqParent != NULL && qqueueGetOverallQueueSize(pThis) == 0);
+	DEFiRet;
+	assert(pVal != NULL);
+	*pVal = pThis->iDeqBatchSize;
+if(pThis->pqParent != NULL)
+	*pVal = 16;
+	RETiRet;
 }
 
 
 /* must only be called when the queue mutex is locked, else results
- * are not stable! DA queue version
+ * are not stable! DA worker version (pThis *is* the *main* queue, not DA!)
  */
 static int
 qqueueIsIdleDA(qqueue_t *pThis)
 {
-	/* remember: iQueueSize is the DA queue size, not the main queue! */
-	/* TODO: I think we need just a single function for DA and non-DA mode - but I leave it for now as is */
-	return(qqueueGetOverallQueueSize(pThis) == 0 || (pThis->bRunsDA && qqueueGetOverallQueueSize(pThis) <= pThis->iLowWtrMrk));
+	return(getPhysicalQueueSize(pThis) <= pThis->iLowWtrMrk);
 }
 /* must only be called when the queue mutex is locked, else results
- * are not stable! Regular queue version
+ * are not stable! Regular worker version.
  */
 static int
-qqueueIsIdleReg(qqueue_t *pThis)
-{
-#if 0 /* enable for performance testing */
-	int ret;
-	ret = qqueueGetOverallQueueSize(pThis) == 0 || (pThis->bRunsDA && qqueueGetOverallQueueSize(pThis) <= pThis->iLowWtrMrk);
-	if(ret) fprintf(stderr, "queue is idle\n");
-	return ret;
-#else 
-	/* regular code! */
-	return(qqueueGetOverallQueueSize(pThis) == 0 || (pThis->bRunsDA && qqueueGetOverallQueueSize(pThis) <= pThis->iLowWtrMrk));
-#endif
+IsIdleReg(qqueue_t *pThis)
+{
+	return(getPhysicalQueueSize(pThis) == 0);
 }
 
 
@@ -1744,7 +2039,7 @@ qqueueIsIdleReg(qqueue_t *pThis)
  * I am telling this, because I, too, always get confused by those...
  */
 static rsRetVal
-qqueueRegOnWrkrShutdown(qqueue_t *pThis)
+RegOnWrkrShutdown(qqueue_t *pThis)
 {
 	DEFiRet;
 
@@ -1765,7 +2060,7 @@ qqueueRegOnWrkrShutdown(qqueue_t *pThis)
  * hook to indicate in the parent queue (if we are a child) that we are not done yet.
  */
 static rsRetVal
-qqueueRegOnWrkrStartup(qqueue_t *pThis)
+RegOnWrkrStartup(qqueue_t *pThis)
 {
 	DEFiRet;
 
@@ -1782,7 +2077,8 @@ qqueueRegOnWrkrStartup(qqueue_t *pThis)
 /* start up the queue - it must have been constructed and parameters defined
  * before.
  */
-rsRetVal qqueueStart(qqueue_t *pThis) /* this is the ConstructionFinalizer */
+rsRetVal
+qqueueStart(qqueue_t *pThis) /* this is the ConstructionFinalizer */
 {
 	DEFiRet;
 	rsRetVal iRetLocal;
@@ -1818,11 +2114,12 @@ rsRetVal qqueueStart(qqueue_t *pThis) /* this is the ConstructionFinalizer */
 	/* call type-specific constructor */
 	CHKiRet(pThis->qConstruct(pThis)); /* this also sets bIsDA */
 
-	dbgoprint((obj_t*) pThis, "type %d, enq-only %d, disk assisted %d, maxFileSz %lld, qsize %d, child %d, "
-				  "full delay %d, light delay %d starting\n",
+	dbgoprint((obj_t*) pThis, "type %d, enq-only %d, disk assisted %d, maxFileSz %lld, lqsize %d, pqsize %d, child %d, "
+				  "full delay %d, light delay %d, deq batch size %d starting\n",
 		  pThis->qType, pThis->bEnqOnly, pThis->bIsDA, pThis->iMaxFileSize,
-		  qqueueGetOverallQueueSize(pThis), pThis->pqParent == NULL ? 0 : 1,
-		  pThis->iFullDlyMrk, pThis->iLightDlyMrk);
+		  getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis),
+		  pThis->pqParent == NULL ? 0 : 1, pThis->iFullDlyMrk, pThis->iLightDlyMrk,
+		  pThis->iDeqBatchSize);
 
 	if(pThis->qType == QUEUETYPE_DIRECT)
 		FINALIZE;	/* with direct queues, we are already finished... */
@@ -1833,13 +2130,14 @@ rsRetVal qqueueStart(qqueue_t *pThis) /* this is the ConstructionFinalizer */
 	lenBuf = snprintf((char*)pszBuf, sizeof(pszBuf), "%s:Reg", obj.GetName((obj_t*) pThis));
 	CHKiRet(wtpConstruct		(&pThis->pWtpReg));
 	CHKiRet(wtpSetDbgHdr		(pThis->pWtpReg, pszBuf, lenBuf));
-	CHKiRet(wtpSetpfRateLimiter	(pThis->pWtpReg, (rsRetVal (*)(void *pUsr)) qqueueRateLimiter));
-	CHKiRet(wtpSetpfChkStopWrkr	(pThis->pWtpReg, (rsRetVal (*)(void *pUsr, int)) qqueueChkStopWrkrReg));
-	CHKiRet(wtpSetpfIsIdle		(pThis->pWtpReg, (rsRetVal (*)(void *pUsr, int)) qqueueIsIdleReg));
-	CHKiRet(wtpSetpfDoWork		(pThis->pWtpReg, (rsRetVal (*)(void *pUsr, void *pWti, int)) qqueueConsumerReg));
-	CHKiRet(wtpSetpfOnWorkerCancel	(pThis->pWtpReg, (rsRetVal (*)(void *pUsr, void*pWti))qqueueConsumerCancelCleanup));
-	CHKiRet(wtpSetpfOnWorkerStartup	(pThis->pWtpReg, (rsRetVal (*)(void *pUsr)) qqueueRegOnWrkrStartup));
-	CHKiRet(wtpSetpfOnWorkerShutdown(pThis->pWtpReg, (rsRetVal (*)(void *pUsr)) qqueueRegOnWrkrShutdown));
+	CHKiRet(wtpSetpfRateLimiter	(pThis->pWtpReg, (rsRetVal (*)(void *pUsr)) RateLimiter));
+	CHKiRet(wtpSetpfChkStopWrkr	(pThis->pWtpReg, (rsRetVal (*)(void *pUsr, int)) ChkStopWrkrReg));
+	CHKiRet(wtpSetpfGetDeqBatchSize	(pThis->pWtpReg, (rsRetVal (*)(void *pUsr, int*)) GetDeqBatchSize));
+	CHKiRet(wtpSetpfIsIdle		(pThis->pWtpReg, (rsRetVal (*)(void *pUsr, wtp_t*)) IsIdleReg));
+	CHKiRet(wtpSetpfDoWork		(pThis->pWtpReg, (rsRetVal (*)(void *pUsr, void *pWti, int)) ConsumerReg));
+	CHKiRet(wtpSetpfObjProcessed	(pThis->pWtpReg, (rsRetVal (*)(void *pUsr, wti_t *pWti)) batchProcessed));
+	CHKiRet(wtpSetpfOnWorkerStartup	(pThis->pWtpReg, (rsRetVal (*)(void *pUsr)) RegOnWrkrStartup));
+	CHKiRet(wtpSetpfOnWorkerShutdown(pThis->pWtpReg, (rsRetVal (*)(void *pUsr)) RegOnWrkrShutdown));
 	CHKiRet(wtpSetpmutUsr		(pThis->pWtpReg, pThis->mut));
 	CHKiRet(wtpSetpcondBusy		(pThis->pWtpReg, &pThis->notEmpty));
 	CHKiRet(wtpSetiNumWorkerThreads	(pThis->pWtpReg, pThis->iNumWorkerThreads));
@@ -1855,7 +2153,7 @@ rsRetVal qqueueStart(qqueue_t *pThis) /* this is the ConstructionFinalizer */
 		iRetLocal = qqueueHaveQIF(pThis);
 		if(iRetLocal == RS_RET_OK) {
 			dbgoprint((obj_t*) pThis, "on-disk queue present, needs to be reloaded\n");
-			qqueueInitDA(pThis, QUEUE_MODE_ENQDEQ, LOCK_MUTEX); /* initiate DA mode */
+			InitDA(pThis, QUEUE_MODE_ENQDEQ, LOCK_MUTEX); /* initiate DA mode */
 			bInitialized = 1; /* we are done */
 		} else {
 			/* TODO: use logerror? -- rgerhards, 2008-01-16 */
@@ -1864,7 +2162,7 @@ rsRetVal qqueueStart(qqueue_t *pThis) /* this is the ConstructionFinalizer */
 		}
 	}
 
-	if(!bInitialized) {
+	if(Debug && !bInitialized) {
 		dbgoprint((obj_t*) pThis, "queue starts up without (loading) any DA disk state (this is normal for the DA "
 			  "queue itself!)\n");
 	}
@@ -1893,12 +2191,11 @@ static rsRetVal qqueuePersist(qqueue_t *pThis, int bIsCheckpoint)
 	strm_t *psQIF = NULL; /* Queue Info File */
 	uchar pszQIFNam[MAXFNAME];
 	size_t lenQIFNam;
-	obj_t *pUsr;
 
 	ASSERT(pThis != NULL);
 
 	if(pThis->qType != QUEUETYPE_DISK) {
-		if(qqueueGetOverallQueueSize(pThis) > 0) {
+		if(getPhysicalQueueSize(pThis) > 0) {
 			/* This error code is OK, but we will probably not implement this any time
  			 * The reason is that persistence happens via DA queues. But I would like to
 			 * leave the code as is, as we so have a hook in case we need one.
@@ -1909,19 +2206,19 @@ static rsRetVal qqueuePersist(qqueue_t *pThis, int bIsCheckpoint)
 			FINALIZE; /* if the queue is empty, we are happy and done... */
 	}
 
-	dbgoprint((obj_t*) pThis, "persisting queue to disk, %d entries...\n", qqueueGetOverallQueueSize(pThis));
+	dbgoprint((obj_t*) pThis, "persisting queue to disk, %d entries...\n", getPhysicalQueueSize(pThis));
 
 	/* Construct file name */
 	lenQIFNam = snprintf((char*)pszQIFNam, sizeof(pszQIFNam) / sizeof(uchar), "%s/%s.qi",
 			     (char*) glbl.GetWorkDir(), (char*)pThis->pszFilePrefix);
 
-	if((bIsCheckpoint != QUEUE_CHECKPOINT) && (qqueueGetOverallQueueSize(pThis) == 0)) {
+	if((bIsCheckpoint != QUEUE_CHECKPOINT) && (getPhysicalQueueSize(pThis) == 0)) {
 		if(pThis->bNeedDelQIF) {
 			unlink((char*)pszQIFNam);
 			pThis->bNeedDelQIF = 0;
 		}
 		/* indicate spool file needs to be deleted */
-		CHKiRet(strm.SetbDeleteOnClose(pThis->tVars.disk.pRead, 1));
+		CHKiRet(strm.SetbDeleteOnClose(pThis->tVars.disk.pReadDel, 1));
 		FINALIZE; /* nothing left to do, so be happy */
 	}
 
@@ -1940,29 +2237,19 @@ static rsRetVal qqueuePersist(qqueue_t *pThis, int bIsCheckpoint)
 	 */
 	CHKiRet(obj.BeginSerializePropBag(psQIF, (obj_t*) pThis));
 	objSerializeSCALAR(psQIF, iQueueSize, INT);
-	objSerializeSCALAR(psQIF, iUngottenObjs, INT);
 	objSerializeSCALAR(psQIF, tVars.disk.sizeOnDisk, INT64);
 	objSerializeSCALAR(psQIF, tVars.disk.bytesRead, INT64);
 	CHKiRet(obj.EndSerialize(psQIF));
 
-	/* now we must persist all objects on the ungotten queue - they can not go to
-	 * to the regular files. -- rgerhards, 2008-01-29
-	 */
-	while(pThis->iUngottenObjs > 0) {
-		CHKiRet(qqueueGetUngottenObj(pThis, &pUsr));
-		CHKiRet((objSerialize(pUsr))(pUsr, psQIF));
-		objDestruct(pUsr);
-	}
-
 	/* now persist the stream info */
 	CHKiRet(strm.Serialize(pThis->tVars.disk.pWrite, psQIF));
-	CHKiRet(strm.Serialize(pThis->tVars.disk.pRead, psQIF));
+	CHKiRet(strm.Serialize(pThis->tVars.disk.pReadDel, psQIF));
 	
 	/* tell the input file object that it must not delete the file on close if the queue
 	 * is non-empty - but only if we are not during a simple checkpoint
 	 */
 	if(bIsCheckpoint != QUEUE_CHECKPOINT) {
-		CHKiRet(strm.SetbDeleteOnClose(pThis->tVars.disk.pRead, 0));
+		CHKiRet(strm.SetbDeleteOnClose(pThis->tVars.disk.pReadDel, 0));
 	}
 
 	/* we have persisted the queue object. So whenever it comes to an empty queue,
@@ -1979,20 +2266,67 @@ finalize_it:
 
 
 /* check if we need to persist the current queue info. If an
- * error occurs, thus should be ignored by caller (but we still
+ * error occurs, this should be ignored by caller (but we still
  * abide to our regular call interface)...
  * rgerhards, 2008-01-13
+ * nUpdates is the number of updates since the last call to this function.
+ * It may be > 1 due to batches. -- rgerhards, 2009-05-12
  */
-static rsRetVal qqueueChkPersist(qqueue_t *pThis)
+static rsRetVal qqueueChkPersist(qqueue_t *pThis, int nUpdates)
 {
+	DEFiRet;
 	ISOBJ_TYPE_assert(pThis, qqueue);
+	assert(nUpdates >= 0);
+
+	if(nUpdates == 0)
+		FINALIZE;
 
-	if(pThis->iPersistUpdCnt && ++pThis->iUpdsSincePersist >= pThis->iPersistUpdCnt) {
+	pThis->iUpdsSincePersist += nUpdates;
+	if(pThis->iPersistUpdCnt && pThis->iUpdsSincePersist >= pThis->iPersistUpdCnt) {
 		qqueuePersist(pThis, QUEUE_CHECKPOINT);
 		pThis->iUpdsSincePersist = 0;
 	}
 
-	return RS_RET_OK;
+finalize_it:
+	RETiRet;
+}
+
+
+/* persist a queue with all data elements to disk - this is used to handle
+ * bSaveOnShutdown. We utilize the DA worker to do this. This must only
+ * be called after all workers have been shut down and if bSaveOnShutdown
+ * is actually set. Note that this function may potentially run long,
+ * depending on the queue configuration (e.g. store on remote machine).
+ * rgerhards, 2009-05-26
+ */
+static inline rsRetVal
+DoSaveOnShutdown(qqueue_t *pThis)
+{
+	struct timespec tTimeout;
+	rsRetVal iRetLocal;
+	DEFiRet;
+
+	ISOBJ_TYPE_assert(pThis, qqueue);
+
+dbgprintf("after InitDA, queue log %d, phys %d\n", getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis));
+	if(pThis->bRunsDA != 2) {
+		InitDA(pThis, QUEUE_MODE_ENQONLY, LOCK_MUTEX); /* switch to DA mode */
+dbgprintf("after InitDA, queue log %d, phys %d\n", getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis));
+		qqueueWaitDAModeInitialized(pThis);	/* make sure DA mode is actually started, else we may have a race! */
+	}
+	/* make sure we do not timeout before we are done */
+	dbgoprint((obj_t*) pThis, "bSaveOnShutdown configured, infinite timeout set\n");
+	timeoutComp(&tTimeout, QUEUE_TIMEOUT_ETERNAL);
+	/* and run the primary queue's DA worker to drain the queue */
+	iRetLocal = wtpShutdownAll(pThis->pWtpDA, wtpState_SHUTDOWN, &tTimeout);
+	dbgoprint((obj_t*) pThis, "end queue persistence run, iRet %d, queue size log %d, phys %d\n",
+		  iRetLocal, getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis));
+	if(iRetLocal != RS_RET_OK) {
+		dbgoprint((obj_t*) pThis, "unexpected iRet state %d after trying to shut down primary queue in disk save mode, "
+			  "continuing, but results are unpredictable\n", iRetLocal);
+	}
+
+	RETiRet;
 }
 
 
@@ -2001,14 +2335,26 @@ BEGINobjDestruct(qqueue) /* be sure to specify the object type also in END and C
 CODESTARTobjDestruct(qqueue)
 	pThis->bQueueInDestruction = 1; /* indicate we are in destruction (modifies some behaviour) */
 
-	/* shut down all workers (handles *all* of the persistence logic)
-	 * See function head comment of queueShutdownWorkers () on why we don't call it
-	 * We also do not need to shutdown workers when we are in enqueue-only mode or we are a
+	/* shut down all workers
+	 * We do not need to shutdown workers when we are in enqueue-only mode or we are a
 	 * direct queue - because in both cases we have none... ;)
 	 * with a child! -- rgerhards, 2008-01-28
 	 */
 	if(pThis->qType != QUEUETYPE_DIRECT && !pThis->bEnqOnly && pThis->pqParent == NULL)
-		qqueueShutdownWorkers(pThis);
+		ShutdownWorkers(pThis);
+
+	/* now all workers are terminated. Messages may exist. Also, some logically dequeued
+	 * messages may never have been processed because their worker was terminated. So
+	 * we need to reset the logical dequeue pointer, persist the queue if configured to do
+	 * so and then destruct everything. -- rgerhards, 2009-05-26
+	 */
+dbgprintf("XXX: pre unDeq disk log %d, phys %d\n", getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis));
+	CHKiRet(pThis->qUnDeqAll(pThis));
+dbgprintf("XXX: post unDeq disk log %d, phys %d\n", getLogicalQueueSize(pThis), getPhysicalQueueSize(pThis));
+
+	if(pThis->bIsDA && getPhysicalQueueSize(pThis) > 0 && pThis->bSaveOnShutdown) {
+		CHKiRet(DoSaveOnShutdown(pThis));
+	}
 
 	/* finally destruct our (regular) worker thread pool
 	 * Note: currently pWtpReg is never NULL, but if we optimize our logic, this may happen,
@@ -2063,11 +2409,8 @@ CODESTARTobjDestruct(qqueue)
 	/* type-specific destructor */
 	iRet = pThis->qDestruct(pThis);
 
-	if(pThis->pszFilePrefix != NULL)
-		free(pThis->pszFilePrefix);
-
-	if(pThis->pszSpoolDir != NULL)
-		free(pThis->pszSpoolDir);
+	free(pThis->pszFilePrefix);
+	free(pThis->pszSpoolDir);
 ENDobjDestruct(qqueue)
 
 
@@ -2081,8 +2424,8 @@ qqueueSetFilePrefix(qqueue_t *pThis, uchar *pszPrefix, size_t iLenPrefix)
 {
 	DEFiRet;
 
-	if(pThis->pszFilePrefix != NULL)
-		free(pThis->pszFilePrefix);
+	free(pThis->pszFilePrefix);
+	pThis->pszFilePrefix = NULL;
 
 	if(pszPrefix == NULL) /* just unset the prefix! */
 		ABORT_FINALIZE(RS_RET_OK);
@@ -2129,15 +2472,6 @@ qqueueEnqObj(qqueue_t *pThis, flowControl_t flowCtlType, void *pUsr)
 
 	ISOBJ_TYPE_assert(pThis, qqueue);
 
-	/* first check if we need to discard this message (which will cause CHKiRet() to exit)
-	 * rgerhards, 2008-10-07: It is OK to do this outside of mutex protection. The iQueueSize
-	 * and bRunsDA parameters may not reflect the correct settings here, but they are
-	 * "good enough" in the sense that they can be used to drive the decision. Valgrind's
-	 * threading tools may point this access to be an error, but this is done
-	 * intentional. I do not see this causes problems to us.
-	 */
-	CHKiRet(qqueueChkDiscardMsg(pThis, pThis->iQueueSize, pThis->bRunsDA, pUsr));
-
 	/* Please note that this function is not cancel-safe and consequently
 	 * sets the calling thread's cancelibility state to PTHREAD_CANCEL_DISABLE
 	 * during its execution. If that is not done, race conditions occur if the
@@ -2149,9 +2483,13 @@ qqueueEnqObj(qqueue_t *pThis, flowControl_t flowCtlType, void *pUsr)
 		d_pthread_mutex_lock(pThis->mut);
 	}
 
+	/* first check if we need to discard this message (which will cause CHKiRet() to exit)
+	 */
+	CHKiRet(qqueueChkDiscardMsg(pThis, getPhysicalQueueSize(pThis), pThis->bRunsDA, pUsr));
+
 	/* then check if we need to add an assistance disk queue */
 	if(pThis->bIsDA)
-		CHKiRet(qqueueChkStrtDA(pThis));
+		CHKiRet(ChkStrtDA(pThis));
 	
 	/* handle flow control
 	 * There are two different flow control mechanisms: basic and advanced flow control.
@@ -2174,12 +2512,12 @@ qqueueEnqObj(qqueue_t *pThis, flowControl_t flowCtlType, void *pUsr)
 	 * It's a side effect, but a good one ;) -- rgerhards, 2008-03-14
 	 */
 	if(flowCtlType == eFLOWCTL_FULL_DELAY) {
-		while(pThis->iQueueSize >= pThis->iFullDlyMrk) {
+		while(getPhysicalQueueSize(pThis) >= pThis->iFullDlyMrk) {
 			dbgoprint((obj_t*) pThis, "enqueueMsg: FullDelay mark reached for full delayable message - blocking.\n");
 			pthread_cond_wait(&pThis->belowFullDlyWtrMrk, pThis->mut); /* TODO error check? But what do then? */
 		}
 	} else if(flowCtlType == eFLOWCTL_LIGHT_DELAY) {
-		if(pThis->iQueueSize >= pThis->iLightDlyMrk) {
+		if(getPhysicalQueueSize(pThis) >= pThis->iLightDlyMrk) {
 			dbgoprint((obj_t*) pThis, "enqueueMsg: LightDelay mark reached for light delayable message - blocking a bit.\n");
 			timeoutComp(&t, 1000); /* 1000 millisconds = 1 second TODO: make configurable */
 			pthread_cond_timedwait(&pThis->belowLightDlyWtrMrk, pThis->mut, &t); /* TODO error check? But what do then? */
@@ -2191,7 +2529,7 @@ qqueueEnqObj(qqueue_t *pThis, flowControl_t flowCtlType, void *pUsr)
 	 * is not the case, basic flow control enters the field, which means we wait for
 	 * the queue to become ready or drop the new message. -- rgerhards, 2008-03-14
 	 */
-	while(   (pThis->iMaxQueueSize > 0 && pThis->iQueueSize >= pThis->iMaxQueueSize)
+	while(   (pThis->iMaxQueueSize > 0 && getPhysicalQueueSize(pThis) >= pThis->iMaxQueueSize)
 	      || (pThis->qType == QUEUETYPE_DISK && pThis->sizeOnDiskMax != 0
 	      	  && pThis->tVars.disk.sizeOnDisk > pThis->sizeOnDiskMax)) {
 		dbgoprint((obj_t*) pThis, "enqueueMsg: queue FULL - waiting to drain.\n");
@@ -2205,7 +2543,7 @@ qqueueEnqObj(qqueue_t *pThis, flowControl_t flowCtlType, void *pUsr)
 
 	/* and finally enqueue the message */
 	CHKiRet(qqueueAdd(pThis, pUsr));
-	qqueueChkPersist(pThis);
+	qqueueChkPersist(pThis, 1);
 
 finalize_it:
 	if(pThis->qType != QUEUETYPE_DIRECT) {
@@ -2237,7 +2575,7 @@ doEnqSingleObj(qqueue_t *pThis, flowControl_t flowCtlType, void *pUsr)
 
 	/* then check if we need to add an assistance disk queue */
 	if(pThis->bIsDA)
-		CHKiRet(qqueueChkStrtDA(pThis));
+		CHKiRet(ChkStrtDA(pThis));
 	
 	/* handle flow control
 	 * There are two different flow control mechanisms: basic and advanced flow control.
@@ -2291,7 +2629,6 @@ doEnqSingleObj(qqueue_t *pThis, flowControl_t flowCtlType, void *pUsr)
 
 	/* and finally enqueue the message */
 	CHKiRet(qqueueAdd(pThis, pUsr));
-	qqueueChkPersist(pThis); // TODO: optimize, do in outer function! (but we need parts from v5?)
 
 finalize_it:
 	RETiRet;
@@ -2325,6 +2662,8 @@ dbgprintf("queueMultiEnq: %d\n", i);
 		CHKiRet(doEnqSingleObj(pThis, pMultiSub->ppMsgs[i]->flowCtlType, (void*)pMultiSub->ppMsgs[i]));
 	}
 
+	qqueueChkPersist(pThis, pMultiSub->nElem);
+
 finalize_it:
 	if(pThis->qType != QUEUETYPE_DIRECT) {
 		/* make sure at least one worker is running. */
@@ -2348,7 +2687,7 @@ finalize_it:
  * rgerhards, 2008-01-16
  */
 static rsRetVal
-qqueueSetEnqOnly(qqueue_t *pThis, int bEnqOnly, int bLockMutex)
+SetEnqOnly(qqueue_t *pThis, int bEnqOnly, int bLockMutex)
 {
 	DEFiRet;
 	DEFVARS_mutexProtection;
@@ -2410,6 +2749,7 @@ DEFpropSetMeth(qqueue, iMinMsgsPerWrkr, int)
 DEFpropSetMeth(qqueue, bSaveOnShutdown, int)
 DEFpropSetMeth(qqueue, pUsr, void*)
 DEFpropSetMeth(qqueue, iDeqSlowdown, int)
+DEFpropSetMeth(qqueue, iDeqBatchSize, int)
 DEFpropSetMeth(qqueue, sizeOnDiskMax, int64)
 
 
@@ -2428,8 +2768,6 @@ static rsRetVal qqueueSetProperty(qqueue_t *pThis, var_t *pProp)
 
  	if(isProp("iQueueSize")) {
 		pThis->iQueueSize = pProp->val.num;
- 	} else if(isProp("iUngottenObjs")) {
-		pThis->iUngottenObjs = pProp->val.num;
  	} else if(isProp("tVars.disk.sizeOnDisk")) {
 		pThis->tVars.disk.sizeOnDisk = pProp->val.num;
  	} else if(isProp("tVars.disk.bytesRead")) {
diff --git a/runtime/queue.h b/runtime/queue.h
index 1d82d8d9..e873c456 100644
--- a/runtime/queue.h
+++ b/runtime/queue.h
@@ -27,8 +27,18 @@
 #include <pthread.h>
 #include "obj.h"
 #include "wtp.h"
+#include "batch.h"
 #include "stream.h"
 
+/* support for the toDelete list */
+typedef struct toDeleteLst_s toDeleteLst_t;
+struct toDeleteLst_s {
+	qDeqID	deqID;
+	int	nElemDeq;	/* numbe of elements that were dequeued and as such must now be discarded */
+	struct toDeleteLst_s *pNext;
+};
+
+
 /* queue types */
 typedef enum {
 	QUEUETYPE_FIXED_ARRAY = 0,/* a simple queue made out of a fixed (initially malloced) array fast but memoryhog */
@@ -54,10 +64,12 @@ typedef struct qWrkThrd_s {
 	pthread_mutex_t mut;
 } qWrkThrd_t;	/* type for queue worker threads */
 
+
 /* the queue object */
 typedef struct queue_s {
 	BEGINobjInstance;
 	queueType_t	qType;
+	int	nLogDeq;	/* number of elements currently logically dequeued */
 	bool	bEnqOnly;	/* does queue run in enqueue-only mode (1) or not (0)? */
 	bool	bSaveOnShutdown;/* persists everthing on shutdown (if DA!)? 1-yes, 0-no */
 	bool	bQueueStarted;	/* has queueStart() been called on this queue? 1-yes, 0-no */
@@ -83,7 +95,9 @@ typedef struct queue_s {
 	int	toQShutdown;	/* timeout for regular queue shutdown in ms */
 	int	toActShutdown;	/* timeout for long-running action shutdown in ms */
 	int	toWrkShutdown;	/* timeout for idle workers in ms, -1 means indefinite (0 is immediate) */
+	toDeleteLst_t *toDeleteLst;/* this queue's to-delete list */
 	int	toEnq;		/* enqueue timeout */
+	int	iDeqBatchSize;	/* max number of elements that shall be dequeued at once */
 	/* rate limiting settings (will be expanded) */
 	int	iDeqSlowdown; /* slow down dequeue by specified nbr of microseconds */
 	/* end rate limiting */
@@ -97,18 +111,19 @@ typedef struct queue_s {
 	 * applied to detect user configuration errors (and tell me how should we detect what
 	 * the user really wanted...). -- rgerhards, 2008-04-02
 	 */
-	/* ane dequeue time window */
-	rsRetVal (*pConsumer)(void *,void*); /* user-supplied consumer function for dequeued messages */
+	/* end dequeue time window */
+	rsRetVal (*pConsumer)(void *,batch_t*); /* user-supplied consumer function for dequeued messages */
 	/* calling interface for pConsumer: arg1 is the global user pointer from this structure, arg2 is the
-	 * user pointer that was dequeued (actual sample: for actions, arg1 is the pAction and arg2 is pointer
-	 * to message)
-	 * rgerhards, 2008-01-28
+	 * user pointer array that was dequeued (actual sample: for actions, arg1 is the pAction and arg2
+	 * is pointer to an array of message message pointers)
 	 */
 	/* type-specific handlers (set during construction) */
 	rsRetVal (*qConstruct)(struct queue_s *pThis);
 	rsRetVal (*qDestruct)(struct queue_s *pThis);
 	rsRetVal (*qAdd)(struct queue_s *pThis, void *pUsr);
-	rsRetVal (*qDel)(struct queue_s *pThis, void **ppUsr);
+	rsRetVal (*qDeq)(struct queue_s *pThis, void **ppUsr);
+	rsRetVal (*qDel)(struct queue_s *pThis);
+	rsRetVal (*qUnDeqAll)(struct queue_s *pThis);
 	/* end type-specific handler */
 	/* synchronization variables */
 	pthread_mutex_t mutThrdMgmt; /* mutex for the queue's thread management */
@@ -132,32 +147,30 @@ typedef struct queue_s {
 	int iNumberFiles;	/* how many files make up the queue? */
 	int64 iMaxFileSize;	/* max size for a single queue file */
 	int64 sizeOnDiskMax;    /* maximum size on disk allowed */
+	qDeqID deqIDAdd;	/* next dequeue ID to use during add to queue store */
+	qDeqID deqIDDel;	/* queue store delete position */
 	int bIsDA;		/* is this queue disk assisted? */
 	int bRunsDA;		/* is this queue actually *running* disk assisted? */
 	struct queue_s *pqDA;	/* queue for disk-assisted modes */
 	struct queue_s *pqParent;/* pointer to the parent (if this is a child queue) */
 	int	bDAEnqOnly;	/* EnqOnly setting for DA queue */
-	/* some data elements for the queueUngetObj() functionality. This list should always be short
-	 * and is always kept in memory
-	 */
-	qLinkedList_t *pUngetRoot;
-	qLinkedList_t *pUngetLast;
-	int iUngottenObjs;	/* number of objects currently in the "ungotten" list */
 	/* now follow queueing mode specific data elements */
 	union {			/* different data elements based on queue type (qType) */
 		struct {
-			long head, tail;
+			long deqhead, head, tail;
 			void** pBuf;		/* the queued user data structure */
 		} farray;
 		struct {
-			qLinkedList_t *pRoot;
+			qLinkedList_t *pDeqRoot;
+			qLinkedList_t *pDelRoot;
 			qLinkedList_t *pLast;
 		} linklist;
 		struct {
 			int64 sizeOnDisk; /* current amount of disk space used */
 			int64 bytesRead;  /* number of bytes read from current (undeleted!) file */
-			strm_t *pWrite; /* current file to be written */
-			strm_t *pRead;  /* current file to be read */
+			strm_t *pWrite;   /* current file to be written */
+			strm_t *pReadDeq; /* current file for dequeueing */
+			strm_t *pReadDel; /* current file for deleting */
 		} disk;
 	} tVars;
 } qqueue_t;
@@ -184,7 +197,7 @@ rsRetVal qqueueStart(qqueue_t *pThis);
 rsRetVal qqueueSetMaxFileSize(qqueue_t *pThis, size_t iMaxFileSize);
 rsRetVal qqueueSetFilePrefix(qqueue_t *pThis, uchar *pszPrefix, size_t iLenPrefix);
 rsRetVal qqueueConstruct(qqueue_t **ppThis, queueType_t qType, int iWorkerThreads,
-		        int iMaxQueueSize, rsRetVal (*pConsumer)(void*,void*));
+		        int iMaxQueueSize, rsRetVal (*pConsumer)(void*,batch_t*));
 PROTOTYPEObjClassInit(qqueue);
 PROTOTYPEpropSetMeth(qqueue, iPersistUpdCnt, int);
 PROTOTYPEpropSetMeth(qqueue, bSyncQueueFiles, int);
@@ -203,6 +216,7 @@ PROTOTYPEpropSetMeth(qqueue, bSaveOnShutdown, int);
 PROTOTYPEpropSetMeth(qqueue, pUsr, void*);
 PROTOTYPEpropSetMeth(qqueue, iDeqSlowdown, int);
 PROTOTYPEpropSetMeth(qqueue, sizeOnDiskMax, int64);
+PROTOTYPEpropSetMeth(qqueue, iDeqBatchSize, int);
 #define qqueueGetID(pThis) ((unsigned long) pThis)
 
 #endif /* #ifndef QUEUE_H_INCLUDED */
diff --git a/runtime/rsyslog.h b/runtime/rsyslog.h
index 8a043dde..c51b1272 100644
--- a/runtime/rsyslog.h
+++ b/runtime/rsyslog.h
@@ -69,8 +69,24 @@
 #endif
 
 
+/* the rsyslog core provides information about present feature to plugins
+ * asking it. Below are feature-test macros which must be used to query 
+ * features. Note that this must be powers of two, so that multiple queries
+ * can be combined. -- rgerhards, 2009-04-27
+ */
+#define CORE_FEATURE_BATCHING	1
+/*#define CORE_FEATURE_whatever 2 ... and so on ... */
+
+/* some universal fixed size integer defines ... */
+typedef long long int64;
+typedef long long unsigned uint64;
+typedef int64 number_t; /* type to use for numbers - TODO: maybe an autoconf option? */
+typedef char intTiny; 	/* 0..127! */
+typedef unsigned char uintTiny;	/* 0..255! */
+
 /* define some base data types */
 typedef unsigned char uchar;/* get rid of the unhandy "unsigned char" */
+typedef struct aUsrp_s aUsrp_t;
 typedef struct thrdInfo thrdInfo_t;
 typedef struct obj_s obj_t;
 typedef struct ruleset_s ruleset_t;
@@ -87,6 +103,7 @@ typedef struct nsd_gsspi_s nsd_gsspi_t;
 typedef struct nsd_nss_s nsd_nss_t;
 typedef struct nsdsel_ptcp_s nsdsel_ptcp_t;
 typedef struct nsdsel_gtls_s nsdsel_gtls_t;
+typedef struct wti_s wti_t;
 typedef obj_t nsd_t;
 typedef obj_t nsdsel_t;
 typedef struct msg msg_t;
@@ -102,18 +119,15 @@ typedef struct tcps_sess_s tcps_sess_t;
 typedef struct strmsrv_s strmsrv_t;
 typedef struct strms_sess_s strms_sess_t;
 typedef struct vmstk_s vmstk_t;
+typedef struct batch_obj_s batch_obj_t;
+typedef struct batch_s batch_t;
+typedef struct wtp_s wtp_t;
 typedef rsRetVal (*prsf_t)(struct vmstk_s*, int);	/* pointer to a RainerScript function */
+typedef uint64 qDeqID;	/* queue Dequeue order ID. 32 bits is considered dangerously few */
 
 typedef struct tcpLstnPortList_s tcpLstnPortList_t; // TODO: rename?
 typedef struct strmLstnPortList_s strmLstnPortList_t; // TODO: rename?
 
-/* some universal 64 bit define... */
-typedef long long int64;
-typedef long long unsigned uint64;
-typedef int64 number_t; /* type to use for numbers - TODO: maybe an autoconf option? */
-typedef char intTiny; 	/* 0..127! */
-typedef uchar uintTiny;	/* 0..255! */
-
 #ifdef __hpux
 typedef unsigned int u_int32_t; /* TODO: is this correct? */
 typedef int socklen_t;
@@ -372,10 +386,12 @@ enum rsRetVal_				/** return value. All methods return this if not specified oth
 	RS_RET_SYSVAR_NOT_FOUND = 1001, /**< system variable could not be found (maybe misspelled) */
 
 	/* some generic error/status codes */
+	RS_RET_OK = 0,			/**< operation successful */
 	RS_RET_OK_DELETE_LISTENTRY = 1,	/**< operation successful, but callee requested the deletion of an entry (special state) */
 	RS_RET_TERMINATE_NOW = 2,	/**< operation successful, function is requested to terminate (mostly used with threads) */
 	RS_RET_NO_RUN = 3,		/**< operation successful, but function does not like to be executed */
-	RS_RET_OK = 0			/**< operation successful */
+	RS_RET_IDLE = 4,		/**< operation successful, but callee is idle (e.g. because queue is empty) */
+	RS_RET_TERMINATE_WHEN_IDLE = 5	/**< operation successful, function is requested to terminate when idle */
 };
 
 /* some helpful macros to work with srRetVals.
diff --git a/runtime/srUtils.h b/runtime/srUtils.h
index 16766312..6d5a784b 100644
--- a/runtime/srUtils.h
+++ b/runtime/srUtils.h
@@ -116,11 +116,13 @@ rsRetVal getFileSize(uchar *pszName, off_t *pSize);
 	if(bMustLock == LOCK_MUTEX) { \
 		pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave); \
 		d_pthread_mutex_lock(mut); \
+		assert(bLockedOpIsLocked == 0); \
 		bLockedOpIsLocked = 1; \
 	}
 #define END_MTX_PROTECTED_OPERATIONS(mut) \
 	if(bLockedOpIsLocked) { \
 		d_pthread_mutex_unlock(mut); \
+		bLockedOpIsLocked = 0; \
 		pthread_setcancelstate(iCancelStateSave, NULL); \
 	}
 
diff --git a/runtime/stream.c b/runtime/stream.c
index 00c726d9..67941062 100644
--- a/runtime/stream.c
+++ b/runtime/stream.c
@@ -72,10 +72,12 @@ flushApc(void *param1, void __attribute__((unused)) *param2)
 	strm_t *pThis = (strm_t*)  param1;
 	ISOBJ_TYPE_assert(pThis, strm);
 
+	BEGINfunc
 	BEGIN_MTX_PROTECTED_OPERATIONS_UNCOND(&pThis->mut);
 	strmFlush(pThis);
 	pThis->apcRequested = 0;
 	END_MTX_PROTECTED_OPERATIONS_UNCOND(&pThis->mut);
+	ENDfunc
 }
 
 
@@ -254,7 +256,7 @@ static rsRetVal strmOpenFile(strm_t *pThis)
 				    pThis->pszFName, pThis->lenFName, pThis->iCurrFNum, pThis->iFileNumDigits));
 	} else {
 		if(pThis->pszDir == NULL) {
-			if((pThis->pszCurrFName = (uchar*) strdup((char*) pThis->pszFName)) == NULL)
+			if((pThis->pszCurrFName = ustrdup(pThis->pszFName)) == NULL)
 				ABORT_FINALIZE(RS_RET_OUT_OF_MEMORY);
 		} else {
 			CHKiRet(genFileName(&pThis->pszCurrFName, pThis->pszDir, pThis->lenDir,
@@ -1250,6 +1252,46 @@ finalize_it:
 }
 
 
+/* duplicate a stream object excluding dynamic properties. This function is
+ * primarily meant to provide a duplicate that later on can be used to access
+ * the data. This is needed, for example, for a restart of the disk queue.
+ * Note that ConstructFinalize() is NOT called. So our caller may change some
+ * properties before finalizing things.
+ * rgerhards, 2009-05-26
+ */
+rsRetVal
+strmDup(strm_t *pThis, strm_t **ppNew)
+{
+	strm_t *pNew = NULL;
+	DEFiRet;
+
+	ISOBJ_TYPE_assert(pThis, strm);
+	assert(ppNew != NULL);
+
+	CHKiRet(strmConstruct(&pNew));
+	pNew->sType = pThis->sType;
+	pNew->iCurrFNum = pThis->iCurrFNum;
+	CHKmalloc(pNew->pszFName = ustrdup(pThis->pszFName));
+	pNew->lenFName = pThis->lenFName;
+	CHKmalloc(pNew->pszDir = ustrdup(pThis->pszDir));
+	pNew->lenDir = pThis->lenDir;
+	pNew->tOperationsMode = pThis->tOperationsMode;
+	pNew->tOpenMode = pThis->tOpenMode;
+	pNew->iMaxFileSize = pThis->iMaxFileSize;
+	pNew->iMaxFiles = pThis->iMaxFiles;
+	pNew->iFileNumDigits = pThis->iFileNumDigits;
+	pNew->bDeleteOnClose = pThis->bDeleteOnClose;
+	pNew->iCurrOffs = pThis->iCurrOffs;
+	
+	*ppNew = pNew;
+	pNew = NULL;
+
+finalize_it:
+	if(pNew != NULL)
+		strmDestruct(&pNew);
+
+	RETiRet;
+}
 
 /* set a user write-counter. This counter is initialized to zero and
  * receives the number of bytes written. It is accurate only after a
@@ -1365,6 +1407,7 @@ CODESTARTobjQueryInterface(strm)
 	pIf->RecordEnd = strmRecordEnd;
 	pIf->Serialize = strmSerialize;
 	pIf->GetCurrOffset = strmGetCurrOffset;
+	pIf->Dup = strmDup;
 	pIf->SetWCntr = strmSetWCntr;
 	/* set methods */
 	pIf->SetbDeleteOnClose = strmSetbDeleteOnClose;
diff --git a/runtime/stream.h b/runtime/stream.h
index ac003c7b..e8f4acf9 100644
--- a/runtime/stream.h
+++ b/runtime/stream.h
@@ -152,6 +152,7 @@ BEGINinterface(strm) /* name must also be changed in ENDinterface macro! */
 	rsRetVal (*Serialize)(strm_t *pThis, strm_t *pStrm);
 	rsRetVal (*GetCurrOffset)(strm_t *pThis, int64 *pOffs);
 	rsRetVal (*SetWCntr)(strm_t *pThis, number_t *pWCnt);
+	rsRetVal (*Dup)(strm_t *pThis, strm_t **ppNew);
 	INTERFACEpropSetMeth(strm, bDeleteOnClose, int);
 	INTERFACEpropSetMeth(strm, iMaxFileSize, int);
 	INTERFACEpropSetMeth(strm, iMaxFiles, int);
@@ -166,7 +167,7 @@ BEGINinterface(strm) /* name must also be changed in ENDinterface macro! */
 	INTERFACEpropSetMeth(strm, iFlushInterval, int);
 	INTERFACEpropSetMeth(strm, pszSizeLimitCmd, uchar*);
 ENDinterface(strm)
-#define strmCURR_IF_VERSION 2 /* increment whenever you change the interface structure! */
+#define strmCURR_IF_VERSION 5 /* increment whenever you change the interface structure! */
 
 
 /* prototypes */
diff --git a/runtime/wti.c b/runtime/wti.c
index 156d8116..917b456b 100644
--- a/runtime/wti.c
+++ b/runtime/wti.c
@@ -39,10 +39,10 @@
 #include <pthread.h>
 #include <errno.h>
 
-#ifdef OS_SOLARIS
-#	include <sched.h>
-#	define pthread_yield() sched_yield()
-#endif
+/// TODO: check on solaris if this is any longer needed - I don't think so - rgerhards, 2009-09-20
+//#ifdef OS_SOLARIS
+//#	include <sched.h>
+//#endif
 
 #include "rsyslog.h"
 #include "stringbuf.h"
@@ -147,11 +147,12 @@ wtiSetState(wti_t *pThis, qWrkCmd_t tCmd, int bActiveOnly, int bLockMutex)
 				break;
 		}
 		/* apply the new state */
+dbgprintf("worker terminator will write stateval %d\n", tCmd);
 		unsigned val = ATOMIC_CAS_VAL(pThis->tCurrCmd, tCurrCmd, tCmd);
 		if(val != tCurrCmd) {
 			DBGPRINTF("wtiSetState PROBLEM, tCurrCmd %d overwritten with %d, wanted to set %d\n", tCurrCmd, val, tCmd);
 		}
-
+//dbgprintf("worker terminator has written stateval %d\n", tCmd);
 	}
 
 	END_MTX_PROTECTED_OPERATIONS(&pThis->mut);
@@ -211,6 +212,7 @@ CODESTARTobjDestruct(wti)
 	pthread_cond_destroy(&pThis->condExitDone);
 	pthread_mutex_destroy(&pThis->mut);
 
+	free(pThis->batch.pElem);
 	free(pThis->pszDbgHdr);
 ENDobjDestruct(wti)
 
@@ -230,15 +232,20 @@ rsRetVal
 wtiConstructFinalize(wti_t *pThis)
 {
 	DEFiRet;
+	int iDeqBatchSize;
 
 	ISOBJ_TYPE_assert(pThis, wti);
 
 	dbgprintf("%s: finalizing construction of worker instance data\n", wtiGetDbgHdr(pThis));
 
 	/* initialize our thread instance descriptor */
-	pThis->pUsrp = NULL;
 	pThis->tCurrCmd = eWRKTHRD_STOPPED;
 
+	/* we now alloc the array for user pointers. We obtain the max from the queue itself. */
+	CHKiRet(pThis->pWtp->pfGetDeqBatchSize(pThis->pWtp->pUsr, &iDeqBatchSize));
+	CHKmalloc(pThis->batch.pElem = calloc((size_t)iDeqBatchSize, sizeof(batch_obj_t)));
+
+finalize_it:
 	RETiRet;
 }
 
@@ -322,7 +329,7 @@ wtiWorkerCancelCleanup(void *arg)
 	DBGPRINTF("%s: cancelation cleanup handler called.\n", wtiGetDbgHdr(pThis));
 	
 	/* call user supplied handler (that one e.g. requeues the element) */
-	pWtp->pfOnWorkerCancel(pThis->pWtp->pUsr, pThis->pUsrp);
+	pWtp->pfOnWorkerCancel(pThis->pWtp->pUsr, pThis->batch.pElem[0].pUsrp);
 
 	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &iCancelStateSave);
 	d_pthread_mutex_lock(&pWtp->mut);
@@ -336,75 +343,108 @@ wtiWorkerCancelCleanup(void *arg)
 }
 
 
+/* wait for queue to become non-empty or timeout
+ * helper to wtiWorker
+ * IMPORTANT: mutex must be locked when this code is called!
+ * rgerhards, 2009-05-20
+ */
+static inline void
+doIdleProcessing(wti_t *pThis, wtp_t *pWtp, int *pbInactivityTOOccured)
+{
+	struct timespec t;
+
+	BEGINfunc
+	DBGPRINTF("%s: worker IDLE, waiting for work.\n", wtiGetDbgHdr(pThis));
+	pWtp->pfOnIdle(pWtp->pUsr, MUTEX_ALREADY_LOCKED);
+
+	if(pWtp->toWrkShutdown == -1) {
+		/* never shut down any started worker */
+		d_pthread_cond_wait(pWtp->pcondBusy, pWtp->pmutUsr);
+	} else {
+		timeoutComp(&t, pWtp->toWrkShutdown);/* get absolute timeout */
+		if(d_pthread_cond_timedwait(pWtp->pcondBusy, pWtp->pmutUsr, &t) != 0) {
+			DBGPRINTF("%s: inactivity timeout, worker terminating...\n", wtiGetDbgHdr(pThis));
+			*pbInactivityTOOccured = 1; /* indicate we had a timeout */
+		}
+	}
+	ENDfunc
+}
+
+
 /* generic worker thread framework
  */
 #pragma GCC diagnostic ignored "-Wempty-body"
 rsRetVal
 wtiWorker(wti_t *pThis)
 {
-	DEFiRet;
-	DEFVARS_mutexProtection;
-	struct timespec t;
+	DEFVARS_mutexProtection_uncond;
 	wtp_t *pWtp;		/* our worker thread pool */
 	int bInactivityTOOccured = 0;
+	rsRetVal localRet;
+	rsRetVal terminateRet;
+	bool bMutexIsLocked;
+	DEFiRet;
 
 	ISOBJ_TYPE_assert(pThis, wti);
 	pWtp = pThis->pWtp; /* shortcut */
 	ISOBJ_TYPE_assert(pWtp, wtp);
 
 	dbgSetThrdName(pThis->pszDbgHdr);
-	pThis->pUsrp = NULL;
 	pthread_cleanup_push(wtiWorkerCancelCleanup, pThis);
 
-	BEGIN_MTX_PROTECTED_OPERATIONS(pWtp->pmutUsr, LOCK_MUTEX);
+	BEGIN_MTX_PROTECTED_OPERATIONS_UNCOND(pWtp->pmutUsr);
 	pWtp->pfOnWorkerStartup(pWtp->pUsr);
-	END_MTX_PROTECTED_OPERATIONS(pWtp->pmutUsr);
+	END_MTX_PROTECTED_OPERATIONS_UNCOND(pWtp->pmutUsr);
 
 	/* now we have our identity, on to real processing */
 	while(1) { /* loop will be broken below - need to do mutex locks */
 		/* process any pending thread requests */
 		wtpProcessThrdChanges(pWtp);
 
-		/* if we have a rate-limiter set for this worker pool, let's call it. Please
-		 * keep in mind that the rate-limiter may hold us for an extended period
-		 * of time. -- rgerhards, 2008-04-02
-		 */
-		if(pWtp->pfRateLimiter != NULL) {
+		if(pWtp->pfRateLimiter != NULL) { /* call rate-limiter, if defined */
 			pWtp->pfRateLimiter(pWtp->pUsr);
 		}
 		
 		wtpSetInactivityGuard(pThis->pWtp, 0, LOCK_MUTEX); /* must be set before usr mutex is locked! */
-		BEGIN_MTX_PROTECTED_OPERATIONS(pWtp->pmutUsr, LOCK_MUTEX);
-
-		if(  (bInactivityTOOccured && pWtp->pfIsIdle(pWtp->pUsr, MUTEX_ALREADY_LOCKED))
-		   || wtpChkStopWrkr(pWtp, LOCK_MUTEX, MUTEX_ALREADY_LOCKED)) {
-			END_MTX_PROTECTED_OPERATIONS(pWtp->pmutUsr);
-			break; /* end worker thread run */
+		BEGIN_MTX_PROTECTED_OPERATIONS_UNCOND(pWtp->pmutUsr);
+		bMutexIsLocked = TRUE;
+
+		/* first check if we are in shutdown process (but evaluate a bit later) */
+		terminateRet = wtpChkStopWrkr(pWtp, LOCK_MUTEX, MUTEX_ALREADY_LOCKED);
+		if(terminateRet == RS_RET_TERMINATE_NOW) {
+			/* we now need to free the old batch */
+			localRet = pWtp->pfObjProcessed(pWtp->pUsr, pThis);
+			dbgoprint((obj_t*) pThis, "terminating worker because of TERMINATE_NOW mode, del iRet %d\n",
+				 localRet);
+			break;
 		}
-		bInactivityTOOccured = 0; /* reset for next run */
 
-		/* if we reach this point, we are still protected by the mutex */
-
-		if(pWtp->pfIsIdle(pWtp->pUsr, MUTEX_ALREADY_LOCKED)) {
-			DBGPRINTF("%s: worker IDLE, waiting for work.\n", wtiGetDbgHdr(pThis));
-			pWtp->pfOnIdle(pWtp->pUsr, MUTEX_ALREADY_LOCKED);
-
-			if(pWtp->toWrkShutdown == -1) {
-				/* never shut down any started worker */
-				d_pthread_cond_wait(pWtp->pcondBusy, pWtp->pmutUsr);
-			} else {
-				timeoutComp(&t, pWtp->toWrkShutdown);/* get absolute timeout */
-				if(d_pthread_cond_timedwait(pWtp->pcondBusy, pWtp->pmutUsr, &t) != 0) {
-					DBGPRINTF("%s: inactivity timeout, worker terminating...\n", wtiGetDbgHdr(pThis));
-					bInactivityTOOccured = 1; /* indicate we had a timeout */
-				}
+		/* try to execute and process whatever we have */
+		/* This function must and does RELEASE the MUTEX! */
+		localRet = pWtp->pfDoWork(pWtp->pUsr, pThis, iCancelStateSave);
+		bMutexIsLocked = FALSE;
+
+		if(localRet == RS_RET_IDLE) {
+			if(terminateRet == RS_RET_TERMINATE_WHEN_IDLE) {
+				break;	/* end of loop */
+			}
+
+			if(bInactivityTOOccured) {
+				/* we had an inactivity timeout in the last run and are still idle, so it is time to exit... */
+				break; /* end worker thread run */
 			}
-			END_MTX_PROTECTED_OPERATIONS(pWtp->pmutUsr);
+			BEGIN_MTX_PROTECTED_OPERATIONS_UNCOND(pWtp->pmutUsr);
+			doIdleProcessing(pThis, pWtp, &bInactivityTOOccured);
+			END_MTX_PROTECTED_OPERATIONS_UNCOND(pWtp->pmutUsr);
 			continue; /* request next iteration */
 		}
 
-		/* if we reach this point, we have a non-empty queue (and are still protected by mutex) */
-		pWtp->pfDoWork(pWtp->pUsr, pThis, iCancelStateSave);
+		bInactivityTOOccured = 0; /* reset for next run */
+	}
+
+	/* if we exit the loop, the mutex may be locked and, if so, must be unlocked */
+	if(bMutexIsLocked) {
+		END_MTX_PROTECTED_OPERATIONS_UNCOND(pWtp->pmutUsr);
 	}
 
 	/* indicate termination */
@@ -412,6 +452,7 @@ wtiWorker(wti_t *pThis)
 	d_pthread_mutex_lock(&pThis->mut);
 	pthread_cleanup_pop(0); /* remove cleanup handler */
 
+RUNLOG_STR("XXX: Worker shutdown");
 	pWtp->pfOnWorkerShutdown(pWtp->pUsr);
 
 	wtiSetState(pThis, eWRKTHRD_TERMINATING, 0, MUTEX_ALREADY_LOCKED);
diff --git a/runtime/wti.h b/runtime/wti.h
index 72653b15..2acd3cf6 100644
--- a/runtime/wti.h
+++ b/runtime/wti.h
@@ -1,6 +1,6 @@
 /* Definition of the worker thread instance (wti) class.
  *
- * Copyright 2008 Rainer Gerhards and Adiscon GmbH.
+ * Copyright 2008, 2009 by Rainer Gerhards and Adiscon GmbH.
  *
  * This file is part of the rsyslog runtime library.
  *
@@ -27,19 +27,21 @@
 #include <pthread.h>
 #include "wtp.h"
 #include "obj.h"
+#include "batch.h"
+
 
 /* the worker thread instance class */
-typedef struct wti_s {
+struct wti_s {
 	BEGINobjInstance;
 	pthread_t thrdID;  /* thread ID */
 	qWrkCmd_t tCurrCmd; /* current command to be carried out by worker */
-	obj_t *pUsrp;		/* pointer to an object meaningful for current user pointer (e.g. queue pUsr data elemt) */
 	wtp_t *pWtp; /* my worker thread pool (important if only the work thread instance is passed! */
 	pthread_cond_t condExitDone; /* signaled when the thread exit is done (once per thread existance) */
 	pthread_mutex_t mut;
+	batch_t batch; /* pointer to an object array meaningful for current user pointer (e.g. queue pUsr data elemt) */
 	bool bShutdownRqtd;	/* shutdown for this thread requested? 0 - no , 1 - yes */
 	uchar *pszDbgHdr;	/* header string for debug messages */
-} wti_t;
+};
 
 /* some symbolic constants for easier reference */
 
diff --git a/runtime/wtp.c b/runtime/wtp.c
index 02662cde..dc9cbb75 100644
--- a/runtime/wtp.c
+++ b/runtime/wtp.c
@@ -42,10 +42,10 @@
 #include <atomic.h>
 #include <sys/prctl.h>
 
-#ifdef OS_SOLARIS
-#	include <sched.h>
-#	define pthread_yield() sched_yield()
-#endif
+/// TODO: check on solaris if this is any longer needed - I don't think so - rgerhards, 2009-09-20
+//#ifdef OS_SOLARIS
+//#	include <sched.h>
+//#endif
 
 #include "rsyslog.h"
 #include "stringbuf.h"
@@ -80,7 +80,7 @@ wtpGetDbgHdr(wtp_t *pThis)
 
 
 /* Not implemented dummy function for constructor */
-static rsRetVal NotImplementedDummy() { return RS_RET_OK; }
+static rsRetVal NotImplementedDummy() { return RS_RET_NOT_IMPLEMENTED; }
 /* Standard-Constructor for the wtp object
  */
 BEGINobjConstruct(wtp) /* be sure to specify the object type also in END macro! */
@@ -89,12 +89,15 @@ BEGINobjConstruct(wtp) /* be sure to specify the object type also in END macro!
 	pthread_cond_init(&pThis->condThrdTrm, NULL);
 	/* set all function pointers to "not implemented" dummy so that we can safely call them */
 	pThis->pfChkStopWrkr = NotImplementedDummy;
+	pThis->pfGetDeqBatchSize = NotImplementedDummy;
 	pThis->pfIsIdle = NotImplementedDummy;
 	pThis->pfDoWork = NotImplementedDummy;
+	pThis->pfObjProcessed = NotImplementedDummy;
 	pThis->pfOnIdle = NotImplementedDummy;
 	pThis->pfOnWorkerCancel = NotImplementedDummy;
 	pThis->pfOnWorkerStartup = NotImplementedDummy;
 	pThis->pfOnWorkerShutdown = NotImplementedDummy;
+dbgprintf("XXX: wtpConstruct: %d\n", pThis->wtpState);
 ENDobjConstruct(wtp)
 
 
@@ -118,7 +121,7 @@ wtpConstructFinalize(wtp_t *pThis)
 	 */
 	if((pThis->pWrkr = malloc(sizeof(wti_t*) * pThis->iNumWorkerThreads)) == NULL)
 		ABORT_FINALIZE(RS_RET_OUT_OF_MEMORY);
-
+	
 	for(i = 0 ; i < pThis->iNumWorkerThreads ; ++i) {
 		CHKiRet(wtiConstruct(&pThis->pWrkr[i]));
 		pWti = pThis->pWrkr[i];
@@ -248,7 +251,6 @@ wtpSetState(wtp_t *pThis, wtpState_t iNewState)
 
 
 /* check if the worker shall shutdown (1 = yes, 0 = no)
- * TODO: check if we can use atomic operations to enhance performance
  * Note: there may be two mutexes locked, the bLockUsrMutex is the one in our "user"
  * (e.g. the queue clas)
  * rgerhards, 2008-01-21
@@ -262,16 +264,19 @@ wtpChkStopWrkr(wtp_t *pThis, int bLockMutex, int bLockUsrMutex)
 	ISOBJ_TYPE_assert(pThis, wtp);
 
 	BEGIN_MTX_PROTECTED_OPERATIONS(&pThis->mut, bLockMutex);
-	if(   (pThis->wtpState == wtpState_SHUTDOWN_IMMEDIATE)
-	   || ((pThis->wtpState == wtpState_SHUTDOWN) && pThis->pfIsIdle(pThis->pUsr, bLockUsrMutex)))
-		iRet = RS_RET_TERMINATE_NOW;
-	END_MTX_PROTECTED_OPERATIONS(&pThis->mut);
+	if(pThis->wtpState == wtpState_SHUTDOWN_IMMEDIATE) {
+		ABORT_FINALIZE(RS_RET_TERMINATE_NOW);
+	} else if(pThis->wtpState == wtpState_SHUTDOWN) {
+		ABORT_FINALIZE(RS_RET_TERMINATE_WHEN_IDLE);
+	}
 
 	/* try customer handler if one was set and we do not yet have a definite result */
-	if(iRet == RS_RET_OK && pThis->pfChkStopWrkr != NULL) {
+	if(pThis->pfChkStopWrkr != NULL) {
 		iRet = pThis->pfChkStopWrkr(pThis->pUsr, bLockUsrMutex);
 	}
 
+finalize_it:
+	END_MTX_PROTECTED_OPERATIONS(&pThis->mut);
 	RETiRet;
 }
 
@@ -581,8 +586,10 @@ DEFpropSetMethPTR(wtp, pmutUsr, pthread_mutex_t)
 DEFpropSetMethPTR(wtp, pcondBusy, pthread_cond_t)
 DEFpropSetMethFP(wtp, pfChkStopWrkr, rsRetVal(*pVal)(void*, int))
 DEFpropSetMethFP(wtp, pfRateLimiter, rsRetVal(*pVal)(void*))
-DEFpropSetMethFP(wtp, pfIsIdle, rsRetVal(*pVal)(void*, int))
+DEFpropSetMethFP(wtp, pfGetDeqBatchSize, rsRetVal(*pVal)(void*, int*))
+DEFpropSetMethFP(wtp, pfIsIdle, rsRetVal(*pVal)(void*, wtp_t*))
 DEFpropSetMethFP(wtp, pfDoWork, rsRetVal(*pVal)(void*, void*, int))
+DEFpropSetMethFP(wtp, pfObjProcessed, rsRetVal(*pVal)(void*, wti_t*))
 DEFpropSetMethFP(wtp, pfOnIdle, rsRetVal(*pVal)(void*, int))
 DEFpropSetMethFP(wtp, pfOnWorkerCancel, rsRetVal(*pVal)(void*, void*))
 DEFpropSetMethFP(wtp, pfOnWorkerStartup, rsRetVal(*pVal)(void*))
diff --git a/runtime/wtp.h b/runtime/wtp.h
index 1ce171cc..ef61d7e6 100644
--- a/runtime/wtp.h
+++ b/runtime/wtp.h
@@ -50,7 +50,7 @@ typedef enum {
 
 
 /* the worker thread pool (wtp) object */
-typedef struct wtp_s {
+struct wtp_s {
 	BEGINobjInstance;
 	wtpState_t wtpState;
 	int 	iNumWorkerThreads;/* number of worker threads to use */
@@ -66,12 +66,14 @@ typedef struct wtp_s {
 	int bThrdStateChanged;	/* at least one thread state has changed if 1 */
 	/* end sync variables */
 	/* user objects */
-	void *pUsr;		/* pointer to user object */
+	void *pUsr;		/* pointer to user object (in this case, the queue the wtp belongs to) */
 	pthread_mutex_t *pmutUsr;
 	pthread_cond_t *pcondBusy; /* condition the user will signal "busy again, keep runing" on (awakes worker) */
 	rsRetVal (*pfChkStopWrkr)(void *pUsr, int);
+	rsRetVal (*pfGetDeqBatchSize)(void *pUsr, int*); /* obtains max dequeue count from queue config */
+	rsRetVal (*pfObjProcessed)(void *pUsr, wti_t *pWti); /* indicate user object is processed */
 	rsRetVal (*pfRateLimiter)(void *pUsr);
-	rsRetVal (*pfIsIdle)(void *pUsr, int);
+	rsRetVal (*pfIsIdle)(void *pUsr, wtp_t *pWtp);
 	rsRetVal (*pfDoWork)(void *pUsr, void *pWti, int);
 	rsRetVal (*pfOnIdle)(void *pUsr, int);
 	rsRetVal (*pfOnWorkerCancel)(void *pUsr, void*pWti);
@@ -79,7 +81,7 @@ typedef struct wtp_s {
 	rsRetVal (*pfOnWorkerShutdown)(void *pUsr);
 	/* end user objects */
 	uchar *pszDbgHdr;	/* header string for debug messages */
-} wtp_t;
+};
 
 /* some symbolic constants for easier reference */
 
@@ -103,8 +105,10 @@ int wtpGetCurNumWrkr(wtp_t *pThis, int bLockMutex);
 PROTOTYPEObjClassInit(wtp);
 PROTOTYPEpropSetMethFP(wtp, pfChkStopWrkr, rsRetVal(*pVal)(void*, int));
 PROTOTYPEpropSetMethFP(wtp, pfRateLimiter, rsRetVal(*pVal)(void*));
-PROTOTYPEpropSetMethFP(wtp, pfIsIdle, rsRetVal(*pVal)(void*, int));
+PROTOTYPEpropSetMethFP(wtp, pfGetDeqBatchSize, rsRetVal(*pVal)(void*, int*));
+PROTOTYPEpropSetMethFP(wtp, pfIsIdle, rsRetVal(*pVal)(void*, wtp_t*));
 PROTOTYPEpropSetMethFP(wtp, pfDoWork, rsRetVal(*pVal)(void*, void*, int));
+PROTOTYPEpropSetMethFP(wtp, pfObjProcessed, rsRetVal(*pVal)(void*, wti_t*));
 PROTOTYPEpropSetMethFP(wtp, pfOnIdle, rsRetVal(*pVal)(void*, int));
 PROTOTYPEpropSetMethFP(wtp, pfOnWorkerCancel, rsRetVal(*pVal)(void*,void*));
 PROTOTYPEpropSetMethFP(wtp, pfOnWorkerStartup, rsRetVal(*pVal)(void*));
diff --git a/tests/Makefile.am b/tests/Makefile.am
index b34daa06..e5b1f819 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -2,8 +2,12 @@ if ENABLE_TESTBENCH
 TESTRUNS = rt_init rscript
 check_PROGRAMS = $(TESTRUNS) ourtail nettester tcpflood chkseq
 TESTS = $(TESTRUNS) cfg.sh \
+	arrayqueue.sh \
+	linkedlistqueue.sh \
+	da-mainmsg-q.sh \
 	validation-run.sh \
 	imtcp-multiport.sh \
+	daqueue-persist.sh \
 	diskqueue.sh \
 	diskqueue-fsync.sh \
 	manytcp.sh \
@@ -91,6 +95,12 @@ EXTRA_DIST= 1.rstest 2.rstest 3.rstest err1.rstest \
 	   fieldtest.sh \
 	   diskqueue.sh \
 	   testsuites/diskqueue.conf \
+	   arrayqueue.sh \
+	   testsuites/arrayqueue.conf \
+	   linkedlistqueue.sh \
+	   testsuites/linkedlistqueue.conf \
+	   da-mainmsg-q.sh \
+	   testsuites/da-mainmsg-q.conf \
 	   diskqueue-fsync.sh \
 	   testsuites/diskqueue-fsync.conf \
 	   imtcp-multiport.sh \
@@ -105,6 +115,8 @@ EXTRA_DIST= 1.rstest 2.rstest 3.rstest err1.rstest \
 	   omod-if-array.sh \
 	   diag.sh \
 	   testsuites/diag-common.conf \
+           daqueue-persist.sh \
+	   daqueue-persist-drvr.sh \
 	   queue-persist.sh \
 	   queue-persist-drvr.sh \
 	   testsuites/queue-persist.conf \
diff --git a/tests/arrayqueue.sh b/tests/arrayqueue.sh
new file mode 100755
index 00000000..01fc133b
--- /dev/null
+++ b/tests/arrayqueue.sh
@@ -0,0 +1,14 @@
+# Test for fixedArray queue mode
+# added 2009-05-20 by rgerhards
+# This file is part of the rsyslog project, released  under GPLv3
+echo testing queue fixedArray queue mode
+source $srcdir/diag.sh init
+source $srcdir/diag.sh startup arrayqueue.conf
+
+# 40000 messages should be enough
+source $srcdir/diag.sh injectmsg  0 40000
+
+# terminate *now* (don't wait for queue to drain!)
+kill `cat rsyslog.pid`
+source $srcdir/diag.sh seq-check 0 39999
+source $srcdir/diag.sh exit
diff --git a/tests/da-mainmsg-q.sh b/tests/da-mainmsg-q.sh
new file mode 100755
index 00000000..6ec2f3a9
--- /dev/null
+++ b/tests/da-mainmsg-q.sh
@@ -0,0 +1,31 @@
+# Test for DA mode on the main message queue
+# This test checks if DA mode operates correctly. To do so,
+# it uses a small in-memory queue size, so that DA mode is initiated
+# rather soon, and disk spooling used. There is some uncertainty (based
+# on machine speeds), but in general the test should work rather well.
+# We add a few messages after the initial run, just so that we can
+# check everything recovers from DA mode correctly.
+# added 2009-04-22 by Rgerhards
+# This file is part of the rsyslog project, released  under GPLv3
+echo "testing main message queue in DA mode (going to disk)"
+source $srcdir/diag.sh init
+source $srcdir/diag.sh startup da-mainmsg-q.conf
+
+# part1: send first 50 messages (in memory, only)
+#source $srcdir/diag.sh tcpflood 127.0.0.1 13514 1 50
+source $srcdir/diag.sh injectmsg 0 50
+source $srcdir/diag.sh wait-queueempty # let queue drain for this test case
+
+# part 2: send bunch of messages. This should trigger DA mode
+#source $srcdir/diag.sh injectmsg 50 20000
+source $srcdir/diag.sh injectmsg 50 2000
+ls -l test-spool	 # for manual review
+
+# send another handful
+source $srcdir/diag.sh injectmsg 2050 50
+#sleep 1 # we need this so that rsyslogd can receive all outstanding messages
+
+# clean up and check test result
+source $srcdir/diag.sh shutdown-when-empty # shut down rsyslogd when done processing messages
+source $srcdir/diag.sh seq-check 2099
+source $srcdir/diag.sh exit
diff --git a/tests/daqueue-persist-drvr.sh b/tests/daqueue-persist-drvr.sh
new file mode 100755
index 00000000..7b6ec6dd
--- /dev/null
+++ b/tests/daqueue-persist-drvr.sh
@@ -0,0 +1,30 @@
+# Test for queue data persisting at shutdown. The
+# plan is to start an instance, emit some data, do a relatively
+# fast shutdown and then re-start the engine to process the 
+# remaining data.
+# added 2009-05-27 by Rgerhards
+# This file is part of the rsyslog project, released  under GPLv3
+# uncomment for debugging support:
+echo testing memory daqueue persisting to disk, mode $1
+source $srcdir/diag.sh init
+
+# prepare config
+echo \$MainMsgQueueType $1 > work-queuemode.conf
+echo "*.*     :omtesting:sleep 0 1000" > work-delay.conf
+
+# inject 10000 msgs, so that DO hit the high watermark
+source $srcdir/diag.sh startup queue-persist.conf
+source $srcdir/diag.sh injectmsg 0 10000
+$srcdir/diag.sh shutdown-immediate
+$srcdir/diag.sh wait-shutdown
+source $srcdir/diag.sh check-mainq-spool
+
+#exit
+
+# restart engine and have rest processed
+#remove delay
+echo "#" > work-delay.conf
+source $srcdir/diag.sh startup queue-persist.conf
+source $srcdir/diag.sh shutdown-when-empty # shut down rsyslogd when done processing messages
+source $srcdir/diag.sh seq-check 0 4999
+source $srcdir/diag.sh exit
diff --git a/tests/daqueue-persist.sh b/tests/daqueue-persist.sh
new file mode 100755
index 00000000..157b8f7a
--- /dev/null
+++ b/tests/daqueue-persist.sh
@@ -0,0 +1,11 @@
+# Test for queue data persisting at shutdown. We use the actual driver
+# to carry out multiple tests with different queue modes
+# added 2009-05-27 by Rgerhards
+# This file is part of the rsyslog project, released  under GPLv3
+source $srcdir/daqueue-persist-drvr.sh LinkedList
+#source $srcdir/daqueue-persist-drvr.sh FixedArray
+# the disk test should not fail, however, the config is extreme and using
+# it more or less is a config error
+#source $srcdir/daqueue-persist-drvr.sh Disk
+# we do not test Direct mode because this absolute can not work in direct mode
+# (maybe we should do a fail-type of test?)
diff --git a/tests/linkedlistqueue.sh b/tests/linkedlistqueue.sh
new file mode 100755
index 00000000..9570ed2b
--- /dev/null
+++ b/tests/linkedlistqueue.sh
@@ -0,0 +1,14 @@
+# Test for Linkedlist queue mode
+# added 2009-05-20 by rgerhards
+# This file is part of the rsyslog project, released  under GPLv3
+echo testing queue Linkedlist queue mode
+source $srcdir/diag.sh init
+source $srcdir/diag.sh startup linkedlistqueue.conf
+
+# 40000 messages should be enough
+source $srcdir/diag.sh injectmsg  0 40000
+
+# terminate *now* (don't wait for queue to drain)
+kill `cat rsyslog.pid`
+source $srcdir/diag.sh seq-check 0 39999
+source $srcdir/diag.sh exit
diff --git a/tests/testsuites/arrayqueue.conf b/tests/testsuites/arrayqueue.conf
new file mode 100644
index 00000000..c5874a83
--- /dev/null
+++ b/tests/testsuites/arrayqueue.conf
@@ -0,0 +1,14 @@
+# Test for queue fixedArray mode (see .sh file for details)
+# rgerhards, 2009-04-17
+$IncludeConfig diag-common.conf
+
+$ModLoad ../plugins/imtcp/.libs/imtcp
+$MainMsgQueueTimeoutShutdown 10000
+$InputTCPServerRun 13514
+
+# set spool locations and switch queue to disk-only mode
+$MainMsgQueueType FixedArray
+
+$template outfmt,"%msg:F,58:2%\n"
+$template dynfile,"rsyslog.out.log" # trick to use relative path names!
+:msg, contains, "msgnum:" ?dynfile;outfmt
diff --git a/tests/testsuites/da-mainmsg-q.conf b/tests/testsuites/da-mainmsg-q.conf
new file mode 100644
index 00000000..843a3e4f
--- /dev/null
+++ b/tests/testsuites/da-mainmsg-q.conf
@@ -0,0 +1,21 @@
+# Test for DA mode in main message queue (see .sh file for details)
+# rgerhards, 2009-04-22
+$ModLoad ../plugins/imtcp/.libs/imtcp
+$MainMsgQueueTimeoutShutdown 10000
+$InputTCPServerRun 13514
+
+$IncludeConfig diag-common.conf
+
+# set spool locations and switch queue to disk assisted mode
+$WorkDirectory test-spool
+$MainMsgQueueSize 200 # this *should* trigger moving on to DA mode...
+# note: we must set QueueSize sufficiently high, so that 70% (light delay mark)
+# is high enough above HighWatermark!
+$MainMsgQueueHighWatermark 80
+$MainMsgQueueLowWatermark 40
+$MainMsgQueueFilename mainq
+$MainMsgQueueType linkedlist
+
+$template outfmt,"%msg:F,58:2%\n"
+$template dynfile,"rsyslog.out.log" # trick to use relative path names!
+:msg, contains, "msgnum:" ?dynfile;outfmt
diff --git a/tests/testsuites/linkedlistqueue.conf b/tests/testsuites/linkedlistqueue.conf
new file mode 100644
index 00000000..92a9649c
--- /dev/null
+++ b/tests/testsuites/linkedlistqueue.conf
@@ -0,0 +1,16 @@
+# Test for queue LinkedList mode (see .sh file for details)
+# rgerhards, 2009-04-17
+$IncludeConfig diag-common.conf
+
+$ModLoad ../plugins/imtcp/.libs/imtcp
+$MainMsgQueueTimeoutShutdown 10000
+$InputTCPServerRun 13514
+
+$ErrorMessagesToStderr off
+
+# set spool locations and switch queue to disk-only mode
+$MainMsgQueueType LinkedList
+
+$template outfmt,"%msg:F,58:2%\n"
+$template dynfile,"rsyslog.out.log" # trick to use relative path names!
+:msg, contains, "msgnum:" ?dynfile;outfmt
diff --git a/tools/syslogd.c b/tools/syslogd.c
index f7253a8e..e3824d9e 100644
--- a/tools/syslogd.c
+++ b/tools/syslogd.c
@@ -129,12 +129,15 @@
 #include "omfile.h"
 #include "omdiscard.h"
 #include "threads.h"
+#include "wti.h"
 #include "queue.h"
 #include "stream.h"
 #include "conf.h"
 #include "errmsg.h"
 #include "datetime.h"
 #include "parser.h"
+//#include "sysvar.h"
+#include "batch.h"
 #include "unicode-helper.h"
 #include "ruleset.h"
 #include "rule.h"
@@ -293,6 +296,7 @@ static int iMainMsgQtoWrkShutdown = 60000;			/* timeout for worker thread shutdo
 static int iMainMsgQWrkMinMsgs = 100;				/* minimum messages per worker needed to start a new one */
 static int iMainMsgQDeqSlowdown = 0;				/* dequeue slowdown (simple rate limiting) */
 static int64 iMainMsgQueMaxDiskSpace = 0;			/* max disk space allocated 0 ==> unlimited */
+static int iMainMsgQueDeqBatchSize = 32;			/* dequeue batch size */
 static int bMainMsgQSaveOnShutdown = 1;				/* save queue on shutdown (when DA enabled)? */
 static int iMainMsgQueueDeqtWinFromHr = 0;			/* hour begin of time frame when queue is to be dequeued */
 static int iMainMsgQueueDeqtWinToHr = 25;			/* hour begin of time frame when queue is to be dequeued */
@@ -359,6 +363,7 @@ static rsRetVal resetConfigVariables(uchar __attribute__((unused)) *pp, void __a
 	bMainMsgQSaveOnShutdown = 1;
 	MainMsgQueType = QUEUETYPE_FIXED_ARRAY;
 	iMainMsgQueMaxDiskSpace = 0;
+	iMainMsgQueDeqBatchSize = 32;
 	glbliActionResumeRetryCount = 0;
 
 	return RS_RET_OK;
@@ -915,25 +920,263 @@ finalize_it:
 	RETiRet;
 }
 
+#if 0 // Code obsoleted by merge imfile, but check for changes if there are problems
+//thus I leave it in for the time being TODO: remove
+/* This functions looks at the given message and checks if it matches the
+ * provided filter condition. If so, it returns true, else it returns
+ * false. This is a helper to logmsg() and meant to drive the decision
+ * process if a message is to be processed or not. As I expect this
+ * decision code to grow more complex over time AND logmsg() is already
+ * a very lengthy function, I thought a separate function is more appropriate.
+ * 2005-09-19 rgerhards
+ * 2008-02-25 rgerhards: changed interface, now utilizes iRet, bProcessMsg
+ * returns is message should be procesed.
+ */
+static rsRetVal shouldProcessThisMessage(selector_t *f, msg_t *pMsg, int *bProcessMsg)
+{
+	DEFiRet;
+	unsigned short pbMustBeFreed;
+	char *pszPropVal;
+	int bRet = 0;
+	vm_t *pVM = NULL;
+	var_t *pResult = NULL;
+
+	assert(f != NULL);
+	assert(pMsg != NULL);
+
+	/* we first have a look at the global, BSD-style block filters (for tag
+	 * and host). Only if they match, we evaluate the actual filter.
+	 * rgerhards, 2005-10-18
+	 */
+	if(f->eHostnameCmpMode == HN_NO_COMP) {
+		/* EMPTY BY INTENSION - we check this value first, because
+		 * it is the one most often used, so this saves us time!
+		 */
+	} else if(f->eHostnameCmpMode == HN_COMP_MATCH) {
+		if(rsCStrSzStrCmp(f->pCSHostnameComp, (uchar*) getHOSTNAME(pMsg), getHOSTNAMELen(pMsg))) {
+			/* not equal, so we are already done... */
+			dbgprintf("hostname filter '+%s' does not match '%s'\n", 
+				rsCStrGetSzStrNoNULL(f->pCSHostnameComp), getHOSTNAME(pMsg));
+			FINALIZE;
+		}
+	} else { /* must be -hostname */
+		if(!rsCStrSzStrCmp(f->pCSHostnameComp, (uchar*) getHOSTNAME(pMsg), getHOSTNAMELen(pMsg))) {
+			/* not equal, so we are already done... */
+			dbgprintf("hostname filter '-%s' does not match '%s'\n", 
+				rsCStrGetSzStrNoNULL(f->pCSHostnameComp), getHOSTNAME(pMsg));
+			FINALIZE;
+		}
+	}
+	
+	if(f->pCSProgNameComp != NULL) {
+		int bInv = 0, bEqv = 0, offset = 0;
+		if(*(rsCStrGetSzStrNoNULL(f->pCSProgNameComp)) == '-') {
+			if(*(rsCStrGetSzStrNoNULL(f->pCSProgNameComp) + 1) == '-')
+				offset = 1;
+			else {
+				bInv = 1;
+				offset = 1;
+			}
+		}
+		if(!rsCStrOffsetSzStrCmp(f->pCSProgNameComp, offset, (uchar*) getProgramName(pMsg), getProgramNameLen(pMsg)))
+			bEqv = 1;
+
+		if((!bEqv && !bInv) || (bEqv && bInv)) {
+			/* not equal or inverted selection, so we are already done... */
+			dbgprintf("programname filter '%s' does not match '%s'\n", 
+				rsCStrGetSzStrNoNULL(f->pCSProgNameComp), getProgramName(pMsg));
+			FINALIZE;
+		}
+	}
+	
+	/* done with the BSD-style block filters */
+
+	if(f->f_filter_type == FILTER_PRI) {
+		/* skip messages that are incorrect priority */
+		if ( (f->f_filterData.f_pmask[pMsg->iFacility] == TABLE_NOPRI) || \
+		    ((f->f_filterData.f_pmask[pMsg->iFacility] & (1<<pMsg->iSeverity)) == 0) )
+			bRet = 0;
+		else
+			bRet = 1;
+	} else if(f->f_filter_type == FILTER_EXPR) {
+		CHKiRet(vm.Construct(&pVM));
+		CHKiRet(vm.ConstructFinalize(pVM));
+		CHKiRet(vm.SetMsg(pVM, pMsg));
+		CHKiRet(vm.ExecProg(pVM, f->f_filterData.f_expr->pVmprg));
+		CHKiRet(vm.PopBoolFromStack(pVM, &pResult));
+		dbgprintf("result of expression evaluation: %lld\n", pResult->val.num);
+		/* VM is destructed on function exit */
+		bRet = (pResult->val.num) ? 1 : 0;
+	} else {
+		assert(f->f_filter_type == FILTER_PROP); /* assert() just in case... */
+		pszPropVal = MsgGetProp(pMsg, NULL, f->f_filterData.prop.pCSPropName, &pbMustBeFreed);
+
+		/* Now do the compares (short list currently ;)) */
+		switch(f->f_filterData.prop.operation ) {
+		case FIOP_CONTAINS:
+			if(rsCStrLocateInSzStr(f->f_filterData.prop.pCSCompValue, (uchar*) pszPropVal) != -1)
+				bRet = 1;
+			break;
+		case FIOP_ISEQUAL:
+			if(rsCStrSzStrCmp(f->f_filterData.prop.pCSCompValue,
+					  (uchar*) pszPropVal, strlen(pszPropVal)) == 0)
+				bRet = 1; /* process message! */
+			break;
+		case FIOP_STARTSWITH:
+			if(rsCStrSzStrStartsWithCStr(f->f_filterData.prop.pCSCompValue,
+					  (uchar*) pszPropVal, strlen(pszPropVal)) == 0)
+				bRet = 1; /* process message! */
+			break;
+		case FIOP_REGEX:
+			if(rsCStrSzStrMatchRegex(f->f_filterData.prop.pCSCompValue,
+					(unsigned char*) pszPropVal, 0, &f->f_filterData.prop.regex_cache) == RS_RET_OK)
+				bRet = 1;
+			break;
+		case FIOP_EREREGEX:
+			if(rsCStrSzStrMatchRegex(f->f_filterData.prop.pCSCompValue,
+					  (unsigned char*) pszPropVal, 1, &f->f_filterData.prop.regex_cache) == RS_RET_OK)
+				bRet = 1;
+			break;
+		default:
+			/* here, it handles NOP (for performance reasons) */
+			assert(f->f_filterData.prop.operation == FIOP_NOP);
+			bRet = 1; /* as good as any other default ;) */
+			break;
+		}
+
+		/* now check if the value must be negated */
+		if(f->f_filterData.prop.isNegated)
+			bRet = (bRet == 1) ?  0 : 1;
+
+		if(Debug) {
+			dbgprintf("Filter: check for property '%s' (value '%s') ",
+			        rsCStrGetSzStrNoNULL(f->f_filterData.prop.pCSPropName),
+			        pszPropVal);
+			if(f->f_filterData.prop.isNegated)
+				dbgprintf("NOT ");
+			dbgprintf("%s '%s': %s\n",
+			       getFIOPName(f->f_filterData.prop.operation),
+			       rsCStrGetSzStrNoNULL(f->f_filterData.prop.pCSCompValue),
+			       bRet ? "TRUE" : "FALSE");
+		}
+
+		/* cleanup */
+		if(pbMustBeFreed)
+			free(pszPropVal);
+	}
+
+finalize_it:
+	/* destruct in any case, not just on error, but it makes error handling much easier */
+	if(pVM != NULL)
+		vm.Destruct(&pVM);
+
+	if(pResult != NULL)
+		var.Destruct(&pResult);
+
+	*bProcessMsg = bRet;
+	RETiRet;
+}
+
+
+/* helper to processMsg(), used to call the configured actions. It is
+ * executed from within llExecFunc() of the action list.
+ * rgerhards, 2007-08-02
+ */
+typedef struct processMsgDoActions_s {
+	int bPrevWasSuspended; /* was the previous action suspended? */
+	msg_t *pMsg;
+} processMsgDoActions_t;
+DEFFUNC_llExecFunc(processMsgDoActions)
+{
+	DEFiRet;
+	rsRetVal iRetMod;	/* return value of module - we do not always pass that back */
+	action_t *pAction = (action_t*) pData;
+	processMsgDoActions_t *pDoActData = (processMsgDoActions_t*) pParam;
+
+	assert(pAction != NULL);
+
+	if((pAction->bExecWhenPrevSusp  == 1) && (pDoActData->bPrevWasSuspended == 0)) {
+		dbgprintf("not calling action because the previous one is not suspended\n");
+		ABORT_FINALIZE(RS_RET_OK);
+	}
+
+	/* MULTIQUEUE: look at this below! (I say: batch states!) */
+	iRetMod = actionCallAction(pAction, pDoActData->pMsg);
+	if(iRetMod == RS_RET_DISCARDMSG) {
+		ABORT_FINALIZE(RS_RET_DISCARDMSG);
+	} else if(iRetMod == RS_RET_SUSPENDED) {
+		/* indicate suspension for next module to be called */
+		pDoActData->bPrevWasSuspended = 1;
+	} else {
+		pDoActData->bPrevWasSuspended = 0;
+	}
+
+finalize_it:
+	RETiRet;
+}
+
+
+/* Process (consume) a received message from the main queue. Here, messages are
+ * filtered and those where the filter evaluates to true are passed to the action
+ * queue for further processing.
+ * rgerhards, 2005-10-13
+ */
+static void
+processMsg(msg_t *pMsg)
+{
+	selector_t *f;
+	int bContinue;
+	int bProcessMsg;
+	processMsgDoActions_t DoActData;
+	rsRetVal iRet;
+
+	BEGINfunc
+	assert(pMsg != NULL);
+
+	/* log the message to the particular outputs */
+
+	bContinue = 1;
+	for (f = Files; f != NULL && bContinue ; f = f->f_next) {
+		/* first check the filters... */
+		iRet = shouldProcessThisMessage(f, pMsg, &bProcessMsg);
+		if(!bProcessMsg) {
+			continue;
+		}
+
+		/* ok -- from here, we have action-specific code, nothing really selector-specific -- rger 2007-08-01 */
+		DoActData.pMsg = pMsg;
+		DoActData.bPrevWasSuspended = 0;
+		if(llExecFunc(&f->llActList, processMsgDoActions, (void*)&DoActData) == RS_RET_DISCARDMSG)
+			bContinue = 0;
+	}
+	ENDfunc
+}
+
+#endif // if 0 from merge omfile
 
 /* The consumer of dequeued messages. This function is called by the
  * queue engine on dequeueing of a message. It runs on a SEPARATE
- * THREAD.
- * Please note: the message object is destructed by the queue itself!
+ * THREAD. It receives an array of pointers, which it must iterate
+ * over. We do not do any further batching, as this is of no benefit
+ * for the main queue.
  */
 static rsRetVal
-msgConsumer(void __attribute__((unused)) *notNeeded, void *pUsr)
+msgConsumer(void __attribute__((unused)) *notNeeded, batch_t *pBatch)
 {
+	int i;
+	msg_t *pMsg;
 	DEFiRet;
-	msg_t *pMsg = (msg_t*) pUsr;
 
-	assert(pMsg != NULL);
+	assert(pBatch != NULL);
 
-	if((pMsg->msgFlags & NEEDS_PARSING) != 0) {
-		parseMsg(pMsg);
+	for(i = 0 ; i < pBatch->nElem ; i++) {
+		pMsg = (msg_t*) pBatch->pElem[i].pUsrp;
+		DBGPRINTF("msgConsumer processes msg %d/%d\n", i, pBatch->nElem);
+		if((pMsg->msgFlags & NEEDS_PARSING) != 0) {
+			parseMsg(pMsg);
+		}
+		ruleset.ProcessMsg(pMsg);
 	}
-	ruleset.ProcessMsg(pMsg);
-	msgDestruct(&pMsg);
 
 	RETiRet;
 }
@@ -1588,6 +1831,7 @@ static void doDie(int sig)
 #	define MSG1 "DoDie called.\n"
 #	define MSG2 "DoDie called 5 times - unconditional exit\n"
 	static int iRetries = 0; /* debug aid */
+	dbgprintf(MSG1);
 	if(Debug)
 		write(1, MSG1, sizeof(MSG1) - 1);
 	if(iRetries++ == 4) {
@@ -2304,6 +2548,7 @@ init()
 
 	setQPROP(qqueueSetMaxFileSize, "$MainMsgQueueFileSize", iMainMsgQueMaxFileSize);
 	setQPROP(qqueueSetsizeOnDiskMax, "$MainMsgQueueMaxDiskSpace", iMainMsgQueMaxDiskSpace);
+	setQPROP(qqueueSetiDeqBatchSize, "$MainMsgQueueDequeueBatchSize", iMainMsgQueDeqBatchSize);
 	setQPROPstr(qqueueSetFilePrefix, "$MainMsgQueueFileName", pszMainMsgQFName);
 	setQPROP(qqueueSetiPersistUpdCnt, "$MainMsgQueueCheckpointInterval", iMainMsgQPersistUpdCnt);
 	setQPROP(qqueueSetbSyncQueueFiles, "$MainMsgQueueSyncQueueFiles", bMainMsgQSyncQeueFiles);
@@ -2650,6 +2895,7 @@ static rsRetVal loadBuildInModules(void)
 	CHKiRet(regCfSysLineHdlr((uchar *)"mainmsgqueuedequeueslowdown", 0, eCmdHdlrInt, NULL, &iMainMsgQDeqSlowdown, NULL));
 	CHKiRet(regCfSysLineHdlr((uchar *)"mainmsgqueueworkerthreadminimummessages", 0, eCmdHdlrInt, NULL, &iMainMsgQWrkMinMsgs, NULL));
 	CHKiRet(regCfSysLineHdlr((uchar *)"mainmsgqueuemaxfilesize", 0, eCmdHdlrSize, NULL, &iMainMsgQueMaxFileSize, NULL));
+	CHKiRet(regCfSysLineHdlr((uchar *)"mainmsgqueuedequeuebatchsize", 0, eCmdHdlrSize, NULL, &iMainMsgQueDeqBatchSize, NULL));
 	CHKiRet(regCfSysLineHdlr((uchar *)"mainmsgqueuemaxdiskspace", 0, eCmdHdlrSize, NULL, &iMainMsgQueMaxDiskSpace, NULL));
 	CHKiRet(regCfSysLineHdlr((uchar *)"mainmsgqueuesaveonshutdown", 0, eCmdHdlrBinary, NULL, &bMainMsgQSaveOnShutdown, NULL));
 	CHKiRet(regCfSysLineHdlr((uchar *)"mainmsgqueuedequeuetimebegin", 0, eCmdHdlrInt, NULL, &iMainMsgQueueDeqtWinFromHr, NULL));
author	Rainer Gerhards <rgerhards@adiscon.com>	2009-07-03 12:05:50 +0200
committer	Rainer Gerhards <rgerhards@adiscon.com>	2009-07-03 12:05:50 +0200
commit	22328c72aef2df73741afefe0e8b53d3ad83f15d (patch)
tree	4a0b311508ebb28a0dc72e12993f704c2709d46a
parent	da933a7e105acf814d5e7955d39d29eab3a96613 (diff)
parent	7c0ca7553738161c681c0d0600de99e9fabee81d (diff)
download	rsyslog-22328c72aef2df73741afefe0e8b53d3ad83f15d.tar.gz rsyslog-22328c72aef2df73741afefe0e8b53d3ad83f15d.tar.xz rsyslog-22328c72aef2df73741afefe0e8b53d3ad83f15d.zip