summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--source4/rpc_server/lsa/dcesrv_lsa.c39
-rw-r--r--source4/torture/rpc/lsa.c32
2 files changed, 63 insertions, 8 deletions
diff --git a/source4/rpc_server/lsa/dcesrv_lsa.c b/source4/rpc_server/lsa/dcesrv_lsa.c
index 7ed3b63540..f67b5dee10 100644
--- a/source4/rpc_server/lsa/dcesrv_lsa.c
+++ b/source4/rpc_server/lsa/dcesrv_lsa.c
@@ -23,6 +23,8 @@
#include "rpc_server/lsa/lsa.h"
#include "util/util_ldb.h"
#include "libcli/ldap/ldap_ndr.h"
+#include "system/kerberos.h"
+#include "auth/kerberos/kerberos.h"
/*
this type allows us to distinguish handle types
@@ -2502,7 +2504,42 @@ static NTSTATUS dcesrv_lsa_QueryDomainInformationPolicy(struct dcesrv_call_state
TALLOC_CTX *mem_ctx,
struct lsa_QueryDomainInformationPolicy *r)
{
- DCESRV_FAULT(DCERPC_FAULT_OP_RNG_ERROR);
+ r->out.info = talloc(mem_ctx, union lsa_DomainInformationPolicy);
+ if (!r->out.info) {
+ return NT_STATUS_NO_MEMORY;
+ }
+
+ switch (r->in.level) {
+ case LSA_DOMAIN_INFO_POLICY_EFS:
+ talloc_free(r->out.info);
+ r->out.info = NULL;
+ return NT_STATUS_OBJECT_NAME_NOT_FOUND;
+ case LSA_DOMAIN_INFO_POLICY_KERBEROS:
+ {
+ struct lsa_DomainInfoKerberos *k = &r->out.info->kerberos_info;
+ struct smb_krb5_context *smb_krb5_context;
+ int ret = smb_krb5_init_context(mem_ctx,
+ dce_call->event_ctx,
+ dce_call->conn->dce_ctx->lp_ctx,
+ &smb_krb5_context);
+ if (ret != 0) {
+ talloc_free(r->out.info);
+ r->out.info = NULL;
+ return NT_STATUS_INTERNAL_ERROR;
+ }
+ k->enforce_restrictions = 0; /* FIXME, details missing from MS-LSAD 2.2.53 */
+ k->service_tkt_lifetime = 0; /* Need to find somewhere to store this, and query in KDC too */
+ k->user_tkt_lifetime = 0; /* Need to find somewhere to store this, and query in KDC too */
+ k->user_tkt_renewaltime = 0; /* Need to find somewhere to store this, and query in KDC too */
+ k->clock_skew = krb5_get_max_time_skew(smb_krb5_context->krb5_context);
+ talloc_free(smb_krb5_context);
+ return NT_STATUS_OK;
+ }
+ default:
+ talloc_free(r->out.info);
+ r->out.info = NULL;
+ return NT_STATUS_INVALID_INFO_CLASS;
+ }
}
/*
diff --git a/source4/torture/rpc/lsa.c b/source4/torture/rpc/lsa.c
index 31dc38500d..ec74426ac6 100644
--- a/source4/torture/rpc/lsa.c
+++ b/source4/torture/rpc/lsa.c
@@ -779,6 +779,7 @@ static bool test_LookupPrivName(struct dcerpc_pipe *p,
static bool test_RemovePrivilegesFromAccount(struct dcerpc_pipe *p,
TALLOC_CTX *mem_ctx,
+ struct policy_handle *handle,
struct policy_handle *acct_handle,
struct lsa_LUID *luid)
{
@@ -801,7 +802,25 @@ static bool test_RemovePrivilegesFromAccount(struct dcerpc_pipe *p,
status = dcerpc_lsa_RemovePrivilegesFromAccount(p, mem_ctx, &r);
if (!NT_STATUS_IS_OK(status)) {
- printf("RemovePrivilegesFromAccount failed - %s\n", nt_errstr(status));
+
+ struct lsa_LookupPrivName r_name;
+
+ r_name.in.handle = handle;
+ r_name.in.luid = luid;
+
+ status = dcerpc_lsa_LookupPrivName(p, mem_ctx, &r_name);
+ if (!NT_STATUS_IS_OK(status)) {
+ printf("\nLookupPrivName failed - %s\n", nt_errstr(status));
+ return false;
+ }
+ /* Windows 2008 does not allow this to be removed */
+ if (strcmp("SeAuditPrivilege", r_name.out.name->string) == 0) {
+ return ret;
+ }
+
+ printf("RemovePrivilegesFromAccount failed to remove %s - %s\n",
+ r_name.out.name->string,
+ nt_errstr(status));
return false;
}
@@ -864,7 +883,7 @@ static bool test_EnumPrivsAccount(struct dcerpc_pipe *p,
&r.out.privs->set[i].luid);
}
- ret &= test_RemovePrivilegesFromAccount(p, mem_ctx, acct_handle,
+ ret &= test_RemovePrivilegesFromAccount(p, mem_ctx, handle, acct_handle,
&r.out.privs->set[0].luid);
ret &= test_AddPrivilegesToAccount(p, mem_ctx, acct_handle,
&r.out.privs->set[0].luid);
@@ -2036,10 +2055,6 @@ static bool test_QueryDomainInfoPolicy(struct dcerpc_pipe *p,
NTSTATUS status;
int i;
bool ret = true;
- if (torture_setting_bool(tctx, "samba4", false)) {
- printf("skipping QueryDomainInformationPolicy test against Samba4\n");
- return true;
- }
printf("\nTesting QueryDomainInformationPolicy\n");
@@ -2051,7 +2066,10 @@ static bool test_QueryDomainInfoPolicy(struct dcerpc_pipe *p,
status = dcerpc_lsa_QueryDomainInformationPolicy(p, tctx, &r);
- if (!NT_STATUS_IS_OK(status)) {
+ /* If the server does not support EFS, then this is the correct return */
+ if (i == LSA_DOMAIN_INFO_POLICY_EFS && NT_STATUS_EQUAL(status, NT_STATUS_OBJECT_NAME_NOT_FOUND)) {
+ continue;
+ } else if (!NT_STATUS_IS_OK(status)) {
printf("QueryDomainInformationPolicy failed - %s\n", nt_errstr(status));
ret = false;
continue;
='#n301'>301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859
\documentclass[a4paper,10pt]{article}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{graphicx}
\usepackage{listings}
\usepackage{algorithm,algorithmic}
\usepackage{float}

\pagestyle{headings}

\newcommand{\IN}{\mathbb{N}}
\newcommand{\MM}{\mathcal{M}}
\newcommand{\QQ}{\mathcal{Q}}
\newcommand{\AAA}{\mathcal{A}}
\title{Rsyslog Design and Internals}
\author{Rainer Gerhards\\
rgerhards@adiscon.com}

\begin{document}

\maketitle

\begin{abstract}
This paper describes rsyslog design and internals. It is created to facilitate a discussion about the implementation of "batched queue processing". As such, it does not describe the full design of rsyslog but rather those elements that are relevant to queues. However, the document may be expanded in the future.
\end{abstract}

\tableofcontents

\section{Preliminaries}
\subsection{On the Use of English}
\begin{quotation}
\begin{flushright}
I ventured to write this book in English because ... \\
it will be more easily read in poor English, \\
than in good German by 90\% of my intended readers. \\
--- HANS J. STETTER, Analysis of Discretization Methods for \\
Ordinary Differential Equations (1973)
\end{flushright}
\end{quotation}

There is not much I could add to Mr. Stetter's thought, except, maybe, that the number to quote probably tends more to 99\% in this case than to the 90\% Mr. Stetter notes. So please pardon those errors in language use that I have not yet been able to fix or even see. Suggestions for corrections and improvements are always welcome.
\subsection{Notational Conventions}
In general, in rsyslog there exists single objects $o$, which are used to build larger sets $O$, which form a superset $\mathcal{O}$ of all those objects that exist at a given time inside a running instance of rsyslog. As seen above, single objects are always described by lower case letters ($o$), larger sets by upper case letters ($O$) and the ``all-sets'' in caligraphic letters ($\mathcal{O}$). Often, objects $O_i, i \in \IN, i \le |\mathcal{O}|$ partition $\mathcal{O}$, but this is not necessarily the case.

\subsection{Definitions}
\subsubsection{Sudden Fatal Failure}
As sudden fatal failure is one that occurs at some instant and causes Complete loss of processing capabilities. The two major cases are a sudden power loss or a ``kill -9'' of the process. There are more exotic cases, too, like disasters.

One may argue that it is possible to protect against many sudden fatal failure cases. For example, using an uninterruptable power supply (UPS) will prevent a sudden power loss. While this is true in most cases, it does not hold if looked very closely: in the case of the UPS, for example, a failure in the UPS itself may cause a sudden power loss, which can not be mitigated. Well, actually there can be several layers of mitigation, but always one more potential failure scenario remains. So it is not possible to totally solve the issue.

The concept of ``sudden fatal failure'' now covers all these rest risk that result in termiantion of rsyslogd without the ability execute any code before this happens. This is a very important concept in regard to audit-gradeness.

\subsubsection{Audit Grade}
In the context of this document, ``audit grade'' means that a subsystem never loses a message that it has taken responsibility for, not even in cases of sudden fatal failures. The only limit in this restriction is that a subsystem does not guarantee message survival if the subsytem at large is being destroyed (e.g. during a disaster) or some of its components are not of audit-grade. This draws a fine limitation on the audit-grade of a subsystem.

For example, the rsyslog queue subsystem receives messages and acknowledges them to the submitter (e.g. an input), when they have been enqueued in the storage system. If the queue system is configured to provide audit-grade operation\footnote{Audit-grade queue operation is considerably slower than regular operations, as such this mode is not enabled by default. Most installations will never need a completely audit-grade queue}, the queue relies on the storage subsystem to work properly. If, for example, a disk read error occurs, the message may no longer be readable from the disk and as such is lost. The root cause here is that the disk subsystem was not of audit grade, because it otherwise would not have lost the message. So in this case the queue code is of audit grade, but the one of its components, the disk subsytem, was not. So the overall system is not of audit grade.

To simplify talking about the audit-gradness of several subsytems, we assume that all of their subsystems are also of audit grade. In an actual deployment, however, this means the the system designer must carefully select audit-grade subsystems. Overlooking a single non-audit-grade component will make the whole system of not audit grade quality.

Please note that it can be rather tricky to ensure a complete system is of audit grade. A border case is main memory integrity. Even with error-correcting memory, there may situations arise where a memory error occurs (probably due to a very unlikely series of well-hitting cosmic rays) that is unrecoverable. At this point, system integrity is at risk. The only real solution is to immediately shut down the system and restart it (without giving any process a chance to execute). Note, however, that in an extreme view, an operating system routine that does so can also be considered dangerous, as memory in use by this routine might be affected by the malfunction. We could extend this scenario and further complicate it, but that goes beyond the scope of this paper. The example was primarily meant to show how subtle audit-grade reliability is.

In rsyslog, we currently use a slightly \marginpar{duplication\\permitted}relaxed consistency condition for message integrity inside an audit-grade subsystem. While we do not accept message loss, we permit slight message \emph{duplication}, but only in exceptional cases. This is permitted because, with proper message generation, the dulication problem can be easily fixed at the end-to-end layer. For example, the original sender can include a UUID, which can be used to sort out duplicates at the final destination. Insisting on not allowing duplication complicates matters and is often impossible with today's logging protocols. So, for the time being, we aim at this relaxed criteria, which is hard enough to achive. After we have achieved that goal, we may further try to solve the duplicaton problem. Some hooks already exist. But we do not guarantee such an effort will be made any time soon.

\section{Overall Design}
From a high-level prespective, rsyslogd is ``just'' a high-performance message router. It accepts messages from various sources, applies user-configured filters to them, and routes potentially transformed messages to destinations based on these filters.
\section{Objects}
\subsection{Plugins}
Plugins provide code potentially written by a third party to extend rsyslog.

Conceptually, a plugin is a tuple of callable functions $(\phi_1, \phi_2, \ldots)$ which implement an interface. There are three different types of plugins: input, output and library. The plugin type denotes the primary interface implemented by the plugin. Additional interfaces may be implemented\footnote{This is not yet done in plugins, but is possible and assumed to be done at a later point in time}.

In the context of this paper, the output plugin interface is most important. It implements three entry points:

\paragraph{doAction()}
is used to submit messages to the output plugin. The entry point may or may not commit the messages to their ultimate destination.

\paragraph{beginTransaction()}
is used to inform the plugin that a new transaction begins. It must prepare for processing.

\paragraph{endTransaction()}
is indicated that the upper layer \emph{needs} to close the transaction. If there is any uncommited data left, it must be commited or rolled back.

Every instance of an output plugin is guaranteed \emph{not} to be called concurrently by multiple threads. Further, no context switch will happen between calls to $doAction()$ and $endTransaction()$.

\subsection{State Sets}
Several object have associated state based on a specific state set. These state sets are described together with the objects.

As a general rule, individual state is associated with all instances $o$ of a class of objects. This state is called the object's \marginpar{state component} \emph{state component} $s$. If we want to obtain an object's state, we write $S(o)$. Please note that $S(o)$ is only defined for those objects that have a state component.

\subsection{Messages} 
A message $m$ represents a a single syslog message inside the system. It is a tuple of attributes. Some of these attributes directly orginate from the message content, some others are meta-information taken from the context. For example, there is an meta-attribute ``time of reception'' which conveys when the message was received by rsyslog's input subsystem. We do not list attributes here, as there are many and it is not of importance which exactly they are.

The set $\MM$ is composed of all messages that exist at a given time inside rsyslog.

\subsection{Queue}
A queue
$$Q = (C, \Phi, M)$$
is a triplet of a set of configuration parameters $C$, a set of callbacks $\Phi$ and a set of messages $M \subseteq \MM$.

If we need to obtain the set of message from a queue, we write $M(Q)$. The elements of the set of configuration parameters are written as $C_{param}$ where $param$ is an abbreviation of the parameter's meaning. To obtain a specific parameter from a queue, we write $C_{param}(Q)$. The most important elements of $C$ are:

\paragraph{$C_{type}$} which denotes the queue implementation type. Most importantly, this selects from a set of queue drivers (for example disk-only or in-memory driver), which affects the basic operation of the queue instance.

\paragraph{$C_{mMsg}$} which denotes the upper bound on the cardinality of $M$.

\paragraph{$C_{mBatch}$} which denotes the upper bound of the cardinality of message batches created for this queue.

Be $\QQ = \{Q_m, Q_1, Q_2, \ldots, Q_{|\AAA|}\}$ the set of all queues that exist inside rsyslog after the configuration file has been processed, with $|\QQ| = |\AAA| + 1$.

Then
$$M_0 = \MM \setminus \bigcup_{i=1}^{|\QQ|} Q_i(M)$$
\marginpar{at-risk-set}is the set of non-queued messages. The messages have either never been enqueued or have been dequeued but not finally been processed. This set represents the messages that may potentially be lost during an unclean shutdown of rsyslogd. This is why I call this set the ``\emph{at-risk-set}''.


\subsection{Batches}
A batch represents multiple processable messages. It is a unit of processing inside rsyslog's output system. Batches are used to dequeue a number of messages from a queue and then submit them to the lower action layer. Batches are natural \emph{transaction boundaries}, in the sense that multiple output transactions may be done on the messages inside a batch, but each transaction must end at the end of the batch. A batch is always associated to a specific queue $Q$.

A batch
$$B = (b_1, b_2, \ldots, b_n )$$
is a $n$-tuple of \marginpar{processable\\message}processable messages
$$b = (m, s)$$
which are an ordered pair of a message $m$ and an associated processing state $s$. To denote the $n$-th message inside the batch, we write $m(b_n)$, to denote the status component of the $n$-th message, we write $S(b_n)$.

\begin{figure}
\begin{center}
\includegraphics[scale=0.4]{batch_state.jpeg}
\end{center}
\caption{batch message processing states}
\label{fig_batchmsg_states}
\end{figure}

The state set for the processing states is defined as follows:
$$
S_B = \{ rdy, bad, sub, disc \}
$$

With the semantics of the various states being the following:

\begin{center}
\begin{tabular}{|l|l|} \hline
	State		& Semantics \\\hline
	rdy 		& ready for processing\\
	bad		& this message triggered an unrecoverable failure in action\\
			& processing and must not be resubmitted to this action\\
	sub		& message submitted for processsing, result yet unknown \\
	disc		& action sucessfully processed, but must not be submitted \\
			& to any further action in action unit \\\hline
\end{tabular}
\end{center}
The associated state diagram is shown in figure \ref{fig_batchmsg_states} on page \pageref{fig_batchmsg_states}.

Batch sizes vary. The actual cardinality is a function of the cardinality of $M(Q)$ at the time of batch creation and the queue configuration:

$$1 \leq |B| \leq \max(C_{mBatch}(Q), |M(Q)|)$$

\subsection{Action Unit}
An action unit 
$$u = (f, a_1, \ldots, a_n), a_i \in \AAA \text{ for } i \in \IN, i \le n$$
is a tuple consisting of a filter function $f$ and $n \in \IN$ actions. \emph{Does rsyslog still support nonsense action units with $n=0$? - check!}

\subsection{Action}
An action
$$a = (a_C, a_\psi)$$
is an ordered pair of a tuple of configuration attributes $a_C$, and a tuple of processing functions $a_\psi$. Be the set $\AAA$ composed of all actions that exist in rsyslog after the configuration file has been processed.


\section{Processing}
\subsection{Object States}
Various objects keep state. Some of these objects, like messages, batches and actions seem to share state. However, thinking about shared state leads to very complex setup. As such, state is modelled for each object $o$ individually. Instead, the state function $S_O(o)$ can be used to obtain an obtain an individual objects state. That state can be used to modify the state diagrams of the other objects with which relationships exist.

\subsubsection{Actions}
Actions are provided by output plugins. An action enables the engine to write messages to some destination. It is important to note that ``destination'' is a very broad abstraction. A destination may be a file inside a local or remote file system, a database table or a remote syslog server in another network.

Actions are transactional in the following sense: more than one message can be submitted to an action. The action does not necessarily process the submitted messages unless the caller ends the transaction. However, the action itself may also end the transaction and notify the caller. This is \emph{not} considered an error condition and \emph{must} be handled gracefully by the caller. If a transaction aborts, the caller \emph{must} assume that none of the elements submitted since the begin of transaction have been processed. The action will try to backout anything that was already processed at the time the transaction failed. However, not all outputs work on actually transactional destination. As such, an action is permitted not to backout incomplete interim results. As such, after a transaction abort, some message duplication may occur. We call this the \emph{relaxed integrity condition} for actions.

An output transaction is started by calling \emph{beginTransaction()} either explicitely or implicitely by a call to \emph{doAction()} without calling \emph{beginTransaction()} before. Then, one or more calls to \emph{doAction()} follow. When the caller intends to finish the transaction, it calls \emph{endTransaction()}. However, the transaction may also be terminated from the action itself in response to a \emph{doAction()} call.

Mathematically, an action transaction builds a totally ordered set of uncommitted messages $M_u$. The order relation is defined over the sequence in which messages are being provided to \emph{doAction()}. At any time a commit is attempted, the full set $M_u$ is committed and may either succeeed completely or not at all (in the sense of the relaxed integrity condition described above). 

A commit is attempted when 
\begin{enumerate}
\item the caller decides to call \emph{endTransaction()} 
\item or earlier if the action decides it needs to commit now (e.g. because of buffers filling up).
\end{enumerate}

In the seconds case, the action may decide to commit all message but the current one or all (this is depending on action logic). So if the action decideds to commit a transaction before the caller calls \emph{endTransaction()}, a set of commited messages $M_c$ is build and $M_u$ is modified. Be $n$ the $n$-th iterated \emph{doAction()} call and $m_n$ the current message of this call, then the sets are build as follows:

\begin{algorithm}
%\caption{}
\begin{algorithmic}
\IF{action commits $m_n$}
	\STATE $M_c = M_u \cup m_n$
	\STATE $M_u = \emptyset$
\ELSE
	\STATE $M_c = M_u$
	\STATE $M_u = \{ m_n\}$
\ENDIF
\end{algorithmic}
\end{algorithm}

In other words, if anything is committed early, it is always the full set $M_u$, with or without the current message. The caller needs to know which messages are already commited. As \emph{doAction()} finishes one transaction and starts a new one in a single call, we can not use action state the let the caller know this happened. So we use our above finding and just convey back if the transacton is still continuing or the current message or all others before it were committed. The caller must then act accordingly. Please note that when an error happens, the whole transaction must still be considered failed. As such, ``partial commit'' states need not to be mixed with failure states.

Please note that the above method leaves a small potential issue unaddressed: if the action does an early commit of $M_u \setminus m_n$, an error happens when adding $m_n$ to the new $M_u$ (like running out of resources), the action would need to convey both the successful transaction as well as the failure state. This is not possible with the current interface. We could use callbacks to provide such notification, but this complicates the code. So, if that situaton arises, the action must temporarily buffer the error condition and convey it as part of either the next \emph{doAction()} call or during \emph{endTransation()} processing. This can be done, for example, by advancing its internal state accordingly.

The state set for a actions is defined as follows:
$$
S_A = \{ rdy, itx, comm, rtry, susp, died \}
$$

With the semantics of the various states being the following:

\begin{center}
\begin{tabular}{|l|l|} \hline
	State		& Semantics \\\hline
	rdy 		& ready, waiting for transaction begin\\
	itx		& in transaction, accept more data \\
	comm		& transaction finished \\
	rtry		& action failed but may be able to recover \\
	susp		& action currently defunctional until timeout expires \\
	died		& unrecoverable error condition occured, no longer usable \\\hline
\end{tabular}
\end{center}

In the associated state diagram in figure \ref{fig_action_states}, we do not include the \emph{died} state, because it is entered whenever a totally unrecoverable error state may occur. This is a very exceptional incident (which most output plugins do not even support), so we have kept the diagram simple.

\begin{figure}
\begin{center}
\includegraphics[scale=0.5]{action_state.jpeg}
\end{center}
\caption{Action State Diagram}
\label{fig_action_states}
\end{figure}

\emph{Note well} that the state diagram describes the action state. It does \emph{not} describe the transaction state. While action- and transaction state are closely related to each other, they are different entities.

The return code of \emph{doAction()} and \emph{endTransaction()} is used to convey the transaction state. As such, it is a function of the actions's current state after processing the request. The mapping is as shown below:

\begin{center}
\begin{tabular}{|l|l|} \hline
	State		& Return Code (RS\_RET\_\ldots)\\\hline
	rdy 		& OK \\
	itx		& COMMITTED (if there was an auto-commit without $m_n$)\\
			& DEFER\_COMMIT (if there was no auto-commit)\\
	comm		& internal state, not to be exposed to upper layer \\
	rtry		& SUSPENDED \emph{(new code needed)} \\
	susp		& SUSPENDED \\
	died		& DISABLED \\\hline
\end{tabular}
\end{center}

For the rest of this document, let's assume there is a function \emph{getReturnCode()} that implements this mapping.

It is important to think about how retries are handled. There is a user-configured per-action upper number of retries $C_r$ and retry interval $C_i$. In \emph{rsyslog v3}, there is no concept of output transactions. As such, only single messages are processed. When a temporary action failure occurs, the action is re-tried $C_r$ times, where the action processing thread is waiting in a \emph{sleep()} $C_i$ operating system API call\footnote{a suitable API is used, not \emph{sleep()} itself}. If the action succeeds during the retry processing, everything continues as usual. If it does not succeed, two things happen:
\begin{itemize}
\item the message is flagged as ``action permanent failure'' (what may trigger backup processing)
\item the action is actually suspended for $C_i$ seconds
\end{itemize}
If then a new message is sent to the action, and $C_i$ seconds have not yet elapsed, the action is flagged as having failed without being re-tried again\footnote{During the analysis for this paper, it was seen that actually $C_r$ retries are attempted in v3, but each of them will never actually re-try the action. This is a software bug, which does not cause any harm and thus will not be fixed in v3. The new implementation in v4 will obviously not inherit this problem}. This is done in an effort to reduce resource utilization and prevent the system from slowing down e.g. by too-many retries to a remote server that went offline.

With transactional output mode in \emph{rsyslog v4}, the logic above can no longer work. First of all, retrying single actions does not help, because all of the current transaction needs to be resubmitted. As such, the upper layers need to be notified of failure. Then, they need to resubmit the batch. In that design, the lower layer needs to return immediately after detecting the failure. Recovery handling is now to be done when the next transaction is started. However, we must make sure that we do not do excessive retries. So retry processing is only to be carried out if it was not tried less than $C_i$ seconds ago.

The required functionality can be implemeted by a \emph{prepareAction} function that readies the action for processing if there is need to do so. That function is then called in all entry points before anything else is done. Then, actual processing is carried out and the resulting action state be used to generate the return code for the upper-layer caller. Find below a rough pseudocode to do so:

\lstset{language=python}
\begin{lstlisting}
def prepareAction():
   if state == rtry:
      try recovery (adjust state accordingly)
   if state == rdy:
      beginTransaction() [output plugin]
    
def processMessage(message):
   prepareAction()
   if state == itx