From db5bda56bf089ec6052d92bb78f3b49f7c812e00 Mon Sep 17 00:00:00 2001 From: Volker Lendecke Date: Thu, 21 Feb 2013 16:34:32 +0100 Subject: tdb: add TDB_MUTEX_LOCKING support This adds optional support for locking based on shared robust mutexes. The caller can use the TDB_MUTEX_LOCKING flag together with TDB_CLEAR_IF_FIRST after verifying with tdb_runtime_check_for_robust_mutexes() that it's supported by the current system. The caller should be aware that using TDB_MUTEX_LOCKING implies some limitations, e.g. it's not possible to have multiple read chainlocks on a given hash chain from multiple processes. Note: that this doesn't make tdb thread safe! Pair-Programmed-With: Stefan Metzmacher Pair-Programmed-With: Michael Adam Signed-off-by: Volker Lendecke Signed-off-by: Stefan Metzmacher Signed-off-by: Michael Adam Reviewed-by: Jeremy Allison --- lib/tdb/ABI/tdb-1.3.0.sigs | 68 ++ lib/tdb/common/io.c | 3 +- lib/tdb/common/lock.c | 79 ++- lib/tdb/common/mutex.c | 1000 ++++++++++++++++++++++++++++ lib/tdb/common/open.c | 200 ++++++ lib/tdb/common/summary.c | 2 + lib/tdb/common/tdb.c | 9 + lib/tdb/common/tdb_private.h | 30 +- lib/tdb/common/transaction.c | 3 +- lib/tdb/docs/mutex.txt | 136 ++++ lib/tdb/include/tdb.h | 34 + lib/tdb/test/run-3G-file.c | 1 + lib/tdb/test/run-bad-tdb-header.c | 1 + lib/tdb/test/run-check.c | 1 + lib/tdb/test/run-corrupt.c | 1 + lib/tdb/test/run-die-during-transaction.c | 1 + lib/tdb/test/run-endian.c | 1 + lib/tdb/test/run-incompatible.c | 1 + lib/tdb/test/run-nested-transactions.c | 1 + lib/tdb/test/run-nested-traverse.c | 1 + lib/tdb/test/run-no-lock-during-traverse.c | 1 + lib/tdb/test/run-oldhash.c | 1 + lib/tdb/test/run-open-during-transaction.c | 1 + lib/tdb/test/run-readonly-check.c | 1 + lib/tdb/test/run-rescue-find_entry.c | 1 + lib/tdb/test/run-rescue.c | 1 + lib/tdb/test/run-rwlock-check.c | 1 + lib/tdb/test/run-summary.c | 1 + lib/tdb/test/run-transaction-expand.c | 1 + lib/tdb/test/run-traverse-in-transaction.c | 1 + lib/tdb/test/run-wronghash-fail.c | 1 + lib/tdb/test/run-zero-append.c | 1 + lib/tdb/test/run.c | 1 + lib/tdb/wscript | 36 +- 34 files changed, 1601 insertions(+), 21 deletions(-) create mode 100644 lib/tdb/ABI/tdb-1.3.0.sigs create mode 100644 lib/tdb/common/mutex.c create mode 100644 lib/tdb/docs/mutex.txt (limited to 'lib/tdb') diff --git a/lib/tdb/ABI/tdb-1.3.0.sigs b/lib/tdb/ABI/tdb-1.3.0.sigs new file mode 100644 index 00000000000..7d3e46987ec --- /dev/null +++ b/lib/tdb/ABI/tdb-1.3.0.sigs @@ -0,0 +1,68 @@ +tdb_add_flags: void (struct tdb_context *, unsigned int) +tdb_append: int (struct tdb_context *, TDB_DATA, TDB_DATA) +tdb_chainlock: int (struct tdb_context *, TDB_DATA) +tdb_chainlock_mark: int (struct tdb_context *, TDB_DATA) +tdb_chainlock_nonblock: int (struct tdb_context *, TDB_DATA) +tdb_chainlock_read: int (struct tdb_context *, TDB_DATA) +tdb_chainlock_unmark: int (struct tdb_context *, TDB_DATA) +tdb_chainunlock: int (struct tdb_context *, TDB_DATA) +tdb_chainunlock_read: int (struct tdb_context *, TDB_DATA) +tdb_check: int (struct tdb_context *, int (*)(TDB_DATA, TDB_DATA, void *), void *) +tdb_close: int (struct tdb_context *) +tdb_delete: int (struct tdb_context *, TDB_DATA) +tdb_dump_all: void (struct tdb_context *) +tdb_enable_seqnum: void (struct tdb_context *) +tdb_error: enum TDB_ERROR (struct tdb_context *) +tdb_errorstr: const char *(struct tdb_context *) +tdb_exists: int (struct tdb_context *, TDB_DATA) +tdb_fd: int (struct tdb_context *) +tdb_fetch: TDB_DATA (struct tdb_context *, TDB_DATA) +tdb_firstkey: 
TDB_DATA (struct tdb_context *) +tdb_freelist_size: int (struct tdb_context *) +tdb_get_flags: int (struct tdb_context *) +tdb_get_logging_private: void *(struct tdb_context *) +tdb_get_seqnum: int (struct tdb_context *) +tdb_hash_size: int (struct tdb_context *) +tdb_increment_seqnum_nonblock: void (struct tdb_context *) +tdb_jenkins_hash: unsigned int (TDB_DATA *) +tdb_lock_nonblock: int (struct tdb_context *, int, int) +tdb_lockall: int (struct tdb_context *) +tdb_lockall_mark: int (struct tdb_context *) +tdb_lockall_nonblock: int (struct tdb_context *) +tdb_lockall_read: int (struct tdb_context *) +tdb_lockall_read_nonblock: int (struct tdb_context *) +tdb_lockall_unmark: int (struct tdb_context *) +tdb_log_fn: tdb_log_func (struct tdb_context *) +tdb_map_size: size_t (struct tdb_context *) +tdb_name: const char *(struct tdb_context *) +tdb_nextkey: TDB_DATA (struct tdb_context *, TDB_DATA) +tdb_null: dptr = 0xXXXX, dsize = 0 +tdb_open: struct tdb_context *(const char *, int, int, int, mode_t) +tdb_open_ex: struct tdb_context *(const char *, int, int, int, mode_t, const struct tdb_logging_context *, tdb_hash_func) +tdb_parse_record: int (struct tdb_context *, TDB_DATA, int (*)(TDB_DATA, TDB_DATA, void *), void *) +tdb_printfreelist: int (struct tdb_context *) +tdb_remove_flags: void (struct tdb_context *, unsigned int) +tdb_reopen: int (struct tdb_context *) +tdb_reopen_all: int (int) +tdb_repack: int (struct tdb_context *) +tdb_rescue: int (struct tdb_context *, void (*)(TDB_DATA, TDB_DATA, void *), void *) +tdb_runtime_check_for_robust_mutexes: bool (void) +tdb_set_logging_function: void (struct tdb_context *, const struct tdb_logging_context *) +tdb_set_max_dead: void (struct tdb_context *, int) +tdb_setalarm_sigptr: void (struct tdb_context *, volatile sig_atomic_t *) +tdb_store: int (struct tdb_context *, TDB_DATA, TDB_DATA, int) +tdb_summary: char *(struct tdb_context *) +tdb_transaction_cancel: int (struct tdb_context *) +tdb_transaction_commit: int (struct tdb_context *) +tdb_transaction_prepare_commit: int (struct tdb_context *) +tdb_transaction_start: int (struct tdb_context *) +tdb_transaction_start_nonblock: int (struct tdb_context *) +tdb_transaction_write_lock_mark: int (struct tdb_context *) +tdb_transaction_write_lock_unmark: int (struct tdb_context *) +tdb_traverse: int (struct tdb_context *, tdb_traverse_func, void *) +tdb_traverse_read: int (struct tdb_context *, tdb_traverse_func, void *) +tdb_unlock: int (struct tdb_context *, int, int) +tdb_unlockall: int (struct tdb_context *) +tdb_unlockall_read: int (struct tdb_context *) +tdb_validate_freelist: int (struct tdb_context *, int *) +tdb_wipe_all: int (struct tdb_context *) diff --git a/lib/tdb/common/io.c b/lib/tdb/common/io.c index 07d22ccdb21..fe47d18a5a4 100644 --- a/lib/tdb/common/io.c +++ b/lib/tdb/common/io.c @@ -29,7 +29,8 @@ #include "tdb_private.h" /* - * tdb->hdr_ofs is 0 for now. + * We prepend the mutex area, so fixup offsets. See mutex.c for details. + * tdb->hdr_ofs is 0 or header.mutex_size. * * Note: that we only have the 4GB limit of tdb_off_t for * tdb->map_size. The file size on disk can be 4GB + tdb->hdr_ofs! 
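
The caller contract for the new flag (spelled out in the commit message above and in the tdb.h documentation further down in this patch) is: check tdb_runtime_check_for_robust_mutexes() at runtime first, and only then open the database with TDB_MUTEX_LOCKING combined with TDB_CLEAR_IF_FIRST. Below is a minimal sketch of that calling pattern; it is not part of the patch, the database path, hash size and record contents are made up for illustration, and it needs to be linked against tdb (-ltdb).

#include <tdb.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	int tdb_flags = TDB_CLEAR_IF_FIRST;
	struct tdb_context *tdb;
	TDB_DATA key, val;

	/*
	 * Request mutex locking only if the runtime check passes;
	 * otherwise fall back to plain fcntl locking.
	 */
	if (tdb_runtime_check_for_robust_mutexes()) {
		tdb_flags |= TDB_MUTEX_LOCKING;
	}

	/* Illustrative path and hash size. */
	tdb = tdb_open("/tmp/example.tdb", 10000, tdb_flags,
		       O_RDWR|O_CREAT, 0600);
	if (tdb == NULL) {
		perror("tdb_open");
		return 1;
	}

	key.dptr = (unsigned char *)"key";
	key.dsize = strlen("key");
	val.dptr = (unsigned char *)"value";
	val.dsize = strlen("value");

	/* The chainlock taken internally here uses a mutex if enabled. */
	if (tdb_store(tdb, key, val, TDB_REPLACE) != 0) {
		fprintf(stderr, "tdb_store: %s\n", tdb_errorstr(tdb));
	}

	tdb_close(tdb);
	return 0;
}

On systems where the runtime check fails, the sketch silently falls back to plain fcntl locking. Also note two limitations the patch itself enforces: transactions cannot be started on a database opened with TDB_MUTEX_LOCKING (see the transaction.c hunk below), and multiple read chainlocks on a given hash chain from multiple processes are not possible (see the commit message above).
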
diff --git a/lib/tdb/common/lock.c b/lib/tdb/common/lock.c index 486de797381..6644c4034e0 100644 --- a/lib/tdb/common/lock.c +++ b/lib/tdb/common/lock.c @@ -38,6 +38,15 @@ static int fcntl_lock(struct tdb_context *tdb, struct flock fl; int cmd; +#ifdef USE_TDB_MUTEX_LOCKING + { + int ret; + if (tdb_mutex_lock(tdb, rw, off, len, waitflag, &ret)) { + return ret; + } + } +#endif + fl.l_type = rw; fl.l_whence = SEEK_SET; fl.l_start = off; @@ -110,6 +119,15 @@ static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len) fclose(locks); #endif +#ifdef USE_TDB_MUTEX_LOCKING + { + int ret; + if (tdb_mutex_unlock(tdb, rw, off, len, &ret)) { + return ret; + } + } +#endif + fl.l_type = F_UNLCK; fl.l_whence = SEEK_SET; fl.l_start = off; @@ -248,13 +266,27 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb) return -1; } - ret = tdb_brlock_retry(tdb, F_WRLCK, FREELIST_TOP, 0, - TDB_LOCK_WAIT|TDB_LOCK_PROBE); + if (tdb_have_mutexes(tdb)) { + ret = tdb_mutex_allrecord_upgrade(tdb); + if (ret == -1) { + goto fail; + } + ret = tdb_brlock_retry(tdb, F_WRLCK, lock_offset(tdb->hash_size), + 0, TDB_LOCK_WAIT|TDB_LOCK_PROBE); + if (ret == -1) { + tdb_mutex_allrecord_downgrade(tdb); + } + } else { + ret = tdb_brlock_retry(tdb, F_WRLCK, FREELIST_TOP, 0, + TDB_LOCK_WAIT|TDB_LOCK_PROBE); + } + if (ret == 0) { tdb->allrecord_lock.ltype = F_WRLCK; tdb->allrecord_lock.off = 0; return 0; } +fail: TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_allrecord_upgrade failed\n")); return -1; } @@ -593,6 +625,8 @@ static int tdb_chainlock_gradual(struct tdb_context *tdb, int tdb_allrecord_lock(struct tdb_context *tdb, int ltype, enum tdb_lock_flags flags, bool upgradable) { + int ret; + switch (tdb_allrecord_check(tdb, ltype, flags, upgradable)) { case -1: return -1; @@ -607,16 +641,27 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype, * * It is (1) which cause the starvation problem, so we're only * gradual for that. */ - if (tdb_chainlock_gradual(tdb, ltype, flags, FREELIST_TOP, - tdb->hash_size * 4) == -1) { + + if (tdb_have_mutexes(tdb)) { + ret = tdb_mutex_allrecord_lock(tdb, ltype, flags); + } else { + ret = tdb_chainlock_gradual(tdb, ltype, flags, FREELIST_TOP, + tdb->hash_size * 4); + } + + if (ret == -1) { return -1; } /* Grab individual record locks. */ if (tdb_brlock(tdb, ltype, lock_offset(tdb->hash_size), 0, flags) == -1) { - tdb_brunlock(tdb, ltype, FREELIST_TOP, - tdb->hash_size * 4); + if (tdb_have_mutexes(tdb)) { + tdb_mutex_allrecord_unlock(tdb); + } else { + tdb_brunlock(tdb, ltype, FREELIST_TOP, + tdb->hash_size * 4); + } return -1; } @@ -672,9 +717,25 @@ int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype, bool mark_lock) return 0; } - if (!mark_lock && tdb_brunlock(tdb, ltype, FREELIST_TOP, 0)) { - TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed (%s)\n", strerror(errno))); - return -1; + if (!mark_lock) { + int ret; + + if (tdb_have_mutexes(tdb)) { + ret = tdb_mutex_allrecord_unlock(tdb); + if (ret == 0) { + ret = tdb_brunlock(tdb, ltype, + lock_offset(tdb->hash_size), + 0); + } + } else { + ret = tdb_brunlock(tdb, ltype, FREELIST_TOP, 0); + } + + if (ret != 0) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed " + "(%s)\n", strerror(errno))); + return -1; + } } tdb->allrecord_lock.count = 0; diff --git a/lib/tdb/common/mutex.c b/lib/tdb/common/mutex.c new file mode 100644 index 00000000000..bdc4c28cb6c --- /dev/null +++ b/lib/tdb/common/mutex.c @@ -0,0 +1,1000 @@ +/* + Unix SMB/CIFS implementation. 
+ + trivial database library + + Copyright (C) Volker Lendecke 2012,2013 + Copyright (C) Stefan Metzmacher 2013,2014 + Copyright (C) Michael Adam 2014 + + ** NOTE! The following LGPL license applies to the tdb + ** library. This does NOT imply that all of Samba is released + ** under the LGPL + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 3 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, see . +*/ +#include "tdb_private.h" +#include "system/threads.h" + +#ifdef USE_TDB_MUTEX_LOCKING + +/* + * If we run with mutexes, we store the "struct tdb_mutexes" at the + * beginning of the file. We store an additional tdb_header right + * beyond the mutex area, page aligned. All the offsets within the tdb + * are relative to the area behind the mutex area. tdb->map_ptr points + * behind the mmap area as well, so the read and write path in the + * mutex case can remain unchanged. + * + * Early in the mutex development the mutexes were placed between the hash + * chain pointers and the real tdb data. This had two drawbacks: First, it + * made pointer calculations more complex. Second, we had to mmap the mutex + * area twice. One was the normal map_ptr in the tdb. This frequently changed + * from within tdb_oob. At least the Linux glibc robust mutex code assumes + * constant pointers in memory, so a constantly changing mmap area destroys + * the mutex list. So we had to mmap the first bytes of the file with a second + * mmap call. With that scheme, very weird errors happened that could be + * easily fixed by doing the mutex mmap in a second file. It seemed that + * mapping the same memory area twice does not end up in accessing the same + * physical page, looking at the mutexes in gdb it seemed that old data showed + * up after some re-mapping. To avoid a separate mutex file, the code now puts + * the real content of the tdb file after the mutex area. This way we do not + * have overlapping mmap areas, the mutex area is mmapped once and not + * changed, the tdb data area's mmap is constantly changed but does not + * overlap. + */ + +struct tdb_mutexes { + struct tdb_header hdr; + + /* protect allrecord_lock */ + pthread_mutex_t allrecord_mutex; + + /* + * F_UNLCK: free, + * F_RDLCK: shared, + * F_WRLCK: exclusive + */ + short int allrecord_lock; + + /* + * Index 0 is the freelist mutex, followed by + * one mutex per hashchain. 
+ */ + pthread_mutex_t hashchains[1]; +}; + +bool tdb_have_mutexes(struct tdb_context *tdb) +{ + return ((tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) != 0); +} + +size_t tdb_mutex_size(struct tdb_context *tdb) +{ + size_t mutex_size; + + if (!tdb_have_mutexes(tdb)) { + return 0; + } + + mutex_size = sizeof(struct tdb_mutexes); + mutex_size += tdb->hash_size * sizeof(pthread_mutex_t); + + return TDB_ALIGN(mutex_size, tdb->page_size); +} + +/* + * Get the index for a chain mutex + */ +static bool tdb_mutex_index(struct tdb_context *tdb, off_t off, off_t len, + unsigned *idx) +{ + /* + * Weird but true: We fcntl lock 1 byte at an offset 4 bytes before + * the 4 bytes of the freelist start and the hash chain that is about + * to be locked. See lock_offset() where the freelist is -1 vs the + * "+1" in TDB_HASH_TOP(). Because the mutex array is represented in + * the tdb file itself as data, we need to adjust the offset here. + */ + const off_t freelist_lock_ofs = FREELIST_TOP - sizeof(tdb_off_t); + + if (!tdb_have_mutexes(tdb)) { + return false; + } + if (len != 1) { + /* Possibly the allrecord lock */ + return false; + } + if (off < freelist_lock_ofs) { + /* One of the special locks */ + return false; + } + if (tdb->hash_size == 0) { + /* tdb not initialized yet, called from tdb_open_ex() */ + return false; + } + if (off >= TDB_DATA_START(tdb->hash_size)) { + /* Single record lock from traverses */ + return false; + } + + /* + * Now we know it's a freelist or hash chain lock. Those are always 4 + * byte aligned. Paranoia check. + */ + if ((off % sizeof(tdb_off_t)) != 0) { + abort(); + } + + /* + * Re-index the fcntl offset into an offset into the mutex array + */ + off -= freelist_lock_ofs; /* rebase to index 0 */ + off /= sizeof(tdb_off_t); /* 0 for freelist 1-n for hashchain */ + + *idx = off; + return true; +} + +static bool tdb_have_mutex_chainlocks(struct tdb_context *tdb) +{ + size_t i; + + for (i=0; i < tdb->num_lockrecs; i++) { + bool ret; + unsigned idx; + + ret = tdb_mutex_index(tdb, + tdb->lockrecs[i].off, + tdb->lockrecs[i].count, + &idx); + if (!ret) { + continue; + } + + if (idx == 0) { + /* this is the freelist mutex */ + continue; + } + + return true; + } + + return false; +} + +static int chain_mutex_lock(pthread_mutex_t *m, bool waitflag) +{ + int ret; + + if (waitflag) { + ret = pthread_mutex_lock(m); + } else { + ret = pthread_mutex_trylock(m); + } + if (ret != EOWNERDEAD) { + return ret; + } + + /* + * For chainlocks, we don't do any cleanup (yet?) + */ + return pthread_mutex_consistent(m); +} + +static int allrecord_mutex_lock(struct tdb_mutexes *m, bool waitflag) +{ + int ret; + + if (waitflag) { + ret = pthread_mutex_lock(&m->allrecord_mutex); + } else { + ret = pthread_mutex_trylock(&m->allrecord_mutex); + } + if (ret != EOWNERDEAD) { + return ret; + } + + /* + * The allrecord lock holder died. We need to reset the allrecord_lock + * to F_UNLCK. This should also be the indication for + * tdb_needs_recovery. 
+ */ + m->allrecord_lock = F_UNLCK; + + return pthread_mutex_consistent(&m->allrecord_mutex); +} + +bool tdb_mutex_lock(struct tdb_context *tdb, int rw, off_t off, off_t len, + bool waitflag, int *pret) +{ + struct tdb_mutexes *m = tdb->mutexes; + pthread_mutex_t *chain; + int ret; + unsigned idx; + bool allrecord_ok; + + if (!tdb_mutex_index(tdb, off, len, &idx)) { + return false; + } + chain = &m->hashchains[idx]; + +again: + ret = chain_mutex_lock(chain, waitflag); + if (ret == EBUSY) { + ret = EAGAIN; + } + if (ret != 0) { + errno = ret; + goto fail; + } + + if (idx == 0) { + /* + * This is a freelist lock, which is independent to + * the allrecord lock. So we're done once we got the + * freelist mutex. + */ + *pret = 0; + return true; + } + + if (tdb_have_mutex_chainlocks(tdb)) { + /* + * We can only check the allrecord lock once. If we do it with + * one chain mutex locked, we will deadlock with the allrecord + * locker process in the following way: We lock the first hash + * chain, we check for the allrecord lock. We keep the hash + * chain locked. Then the allrecord locker locks the + * allrecord_mutex. It walks the list of chain mutexes, + * locking them all in sequence. Meanwhile, we have the chain + * mutex locked, so the allrecord locker blocks trying to lock + * our chain mutex. Then we come in and try to lock the second + * chain lock, which in most cases will be the freelist. We + * see that the allrecord lock is locked and put ourselves on + * the allrecord_mutex. This will never be signalled though + * because the allrecord locker waits for us to give up the + * chain lock. + */ + + *pret = 0; + return true; + } + + /* + * Check if someone is has the allrecord lock: queue if so. + */ + + allrecord_ok = false; + + if (m->allrecord_lock == F_UNLCK) { + /* + * allrecord lock not taken + */ + allrecord_ok = true; + } + + if ((m->allrecord_lock == F_RDLCK) && (rw == F_RDLCK)) { + /* + * allrecord shared lock taken, but we only want to read + */ + allrecord_ok = true; + } + + if (allrecord_ok) { + *pret = 0; + return true; + } + + ret = pthread_mutex_unlock(chain); + if (ret != 0) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock" + "(chain_mutex) failed: %s\n", strerror(ret))); + errno = ret; + goto fail; + } + ret = allrecord_mutex_lock(m, waitflag); + if (ret == EBUSY) { + ret = EAGAIN; + } + if (ret != 0) { + if (waitflag || (ret != EAGAIN)) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_%slock" + "(allrecord_mutex) failed: %s\n", + waitflag ? 
"" : "try_", strerror(ret))); + } + errno = ret; + goto fail; + } + ret = pthread_mutex_unlock(&m->allrecord_mutex); + if (ret != 0) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock" + "(allrecord_mutex) failed: %s\n", strerror(ret))); + errno = ret; + goto fail; + } + goto again; + +fail: + *pret = -1; + return true; +} + +bool tdb_mutex_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len, + int *pret) +{ + struct tdb_mutexes *m = tdb->mutexes; + pthread_mutex_t *chain; + int ret; + unsigned idx; + + if (!tdb_mutex_index(tdb, off, len, &idx)) { + return false; + } + chain = &m->hashchains[idx]; + + ret = pthread_mutex_unlock(chain); + if (ret == 0) { + *pret = 0; + return true; + } + errno = ret; + *pret = -1; + return true; +} + +int tdb_mutex_allrecord_lock(struct tdb_context *tdb, int ltype, + enum tdb_lock_flags flags) +{ + struct tdb_mutexes *m = tdb->mutexes; + int ret; + uint32_t i; + bool waitflag = (flags & TDB_LOCK_WAIT); + int saved_errno; + + if (tdb->flags & TDB_NOLOCK) { + return 0; + } + + if (flags & TDB_LOCK_MARK_ONLY) { + return 0; + } + + ret = allrecord_mutex_lock(m, waitflag); + if (!waitflag && (ret == EBUSY)) { + errno = EAGAIN; + tdb->ecode = TDB_ERR_LOCK; + return -1; + } + if (ret != 0) { + if (!(flags & TDB_LOCK_PROBE)) { + TDB_LOG((tdb, TDB_DEBUG_TRACE, + "allrecord_mutex_lock() failed: %s\n", + strerror(ret))); + } + tdb->ecode = TDB_ERR_LOCK; + return -1; + } + + if (m->allrecord_lock != F_UNLCK) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n", + (int)m->allrecord_lock)); + goto fail_unlock_allrecord_mutex; + } + m->allrecord_lock = (ltype == F_RDLCK) ? F_RDLCK : F_WRLCK; + + for (i=0; ihash_size; i++) { + + /* ignore hashchains[0], the freelist */ + pthread_mutex_t *chain = &m->hashchains[i+1]; + + ret = chain_mutex_lock(chain, waitflag); + if (!waitflag && (ret == EBUSY)) { + errno = EAGAIN; + goto fail_unroll_allrecord_lock; + } + if (ret != 0) { + if (!(flags & TDB_LOCK_PROBE)) { + TDB_LOG((tdb, TDB_DEBUG_TRACE, + "chain_mutex_lock() failed: %s\n", + strerror(ret))); + } + errno = ret; + goto fail_unroll_allrecord_lock; + } + + ret = pthread_mutex_unlock(chain); + if (ret != 0) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock" + "(chainlock) failed: %s\n", strerror(ret))); + errno = ret; + goto fail_unroll_allrecord_lock; + } + } + /* + * We leave this routine with m->allrecord_mutex locked + */ + return 0; + +fail_unroll_allrecord_lock: + m->allrecord_lock = F_UNLCK; + +fail_unlock_allrecord_mutex: + saved_errno = errno; + ret = pthread_mutex_unlock(&m->allrecord_mutex); + if (ret != 0) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock" + "(allrecord_mutex) failed: %s\n", strerror(ret))); + } + errno = saved_errno; + tdb->ecode = TDB_ERR_LOCK; + return -1; +} + +int tdb_mutex_allrecord_upgrade(struct tdb_context *tdb) +{ + struct tdb_mutexes *m = tdb->mutexes; + int ret; + uint32_t i; + + if (tdb->flags & TDB_NOLOCK) { + return 0; + } + + /* + * Our only caller tdb_allrecord_upgrade() + * garantees that we already own the allrecord lock. + * + * Which means m->allrecord_mutex is still locked by us. 
+ */ + + if (m->allrecord_lock != F_RDLCK) { + tdb->ecode = TDB_ERR_LOCK; + TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n", + (int)m->allrecord_lock)); + return -1; + } + + m->allrecord_lock = F_WRLCK; + + for (i=0; ihash_size; i++) { + + /* ignore hashchains[0], the freelist */ + pthread_mutex_t *chain = &m->hashchains[i+1]; + + ret = chain_mutex_lock(chain, true); + if (ret != 0) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_lock" + "(chainlock) failed: %s\n", strerror(ret))); + goto fail_unroll_allrecord_lock; + } + + ret = pthread_mutex_unlock(chain); + if (ret != 0) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock" + "(chainlock) failed: %s\n", strerror(ret))); + goto fail_unroll_allrecord_lock; + } + } + + return 0; + +fail_unroll_allrecord_lock: + m->allrecord_lock = F_RDLCK; + tdb->ecode = TDB_ERR_LOCK; + return -1; +} + +void tdb_mutex_allrecord_downgrade(struct tdb_context *tdb) +{ + struct tdb_mutexes *m = tdb->mutexes; + + /* + * Our only caller tdb_allrecord_upgrade() (in the error case) + * garantees that we already own the allrecord lock. + * + * Which means m->allrecord_mutex is still locked by us. + */ + + if (m->allrecord_lock != F_WRLCK) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n", + (int)m->allrecord_lock)); + return; + } + + m->allrecord_lock = F_RDLCK; + return; +} + + +int tdb_mutex_allrecord_unlock(struct tdb_context *tdb) +{ + struct tdb_mutexes *m = tdb->mutexes; + short old; + int ret; + + if (tdb->flags & TDB_NOLOCK) { + return 0; + } + + /* + * Our only callers tdb_allrecord_unlock() and + * tdb_allrecord_lock() (in the error path) + * garantee that we already own the allrecord lock. + * + * Which means m->allrecord_mutex is still locked by us. + */ + + if ((m->allrecord_lock != F_RDLCK) && (m->allrecord_lock != F_WRLCK)) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n", + (int)m->allrecord_lock)); + return -1; + } + + old = m->allrecord_lock; + m->allrecord_lock = F_UNLCK; + + ret = pthread_mutex_unlock(&m->allrecord_mutex); + if (ret != 0) { + m->allrecord_lock = old; + TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock" + "(allrecord_mutex) failed: %s\n", strerror(ret))); + return -1; + } + return 0; +} + +int tdb_mutex_init(struct tdb_context *tdb) +{ + struct tdb_mutexes *m; + pthread_mutexattr_t ma; + int i, ret; + + ret = tdb_mutex_mmap(tdb); + if (ret == -1) { + return -1; + } + m = tdb->mutexes; + + ret = pthread_mutexattr_init(&ma); + if (ret != 0) { + goto fail_munmap; + } + ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); + if (ret != 0) { + goto fail; + } + ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED); + if (ret != 0) { + goto fail; + } + ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); + if (ret != 0) { + goto fail; + } + + for (i=0; ihash_size+1; i++) { + pthread_mutex_t *chain = &m->hashchains[i]; + + ret = pthread_mutex_init(chain, &ma); + if (ret != 0) { + goto fail; + } + } + + m->allrecord_lock = F_UNLCK; + + ret = pthread_mutex_init(&m->allrecord_mutex, &ma); + if (ret != 0) { + goto fail; + } + ret = 0; +fail: + pthread_mutexattr_destroy(&ma); +fail_munmap: + tdb_mutex_munmap(tdb); + + if (ret == 0) { + return 0; + } + + errno = ret; + return -1; +} + +int tdb_mutex_mmap(struct tdb_context *tdb) +{ + size_t len; + void *ptr; + + len = tdb_mutex_size(tdb); + if (len == 0) { + return 0; + } + + ptr = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FILE, + tdb->fd, 0); + if (ptr == MAP_FAILED) { + return -1; + } + tdb->mutexes = (struct 
tdb_mutexes *)ptr; + + return 0; +} + +int tdb_mutex_munmap(struct tdb_context *tdb) +{ + size_t len; + + len = tdb_mutex_size(tdb); + if (len == 0) { + return 0; + } + + return munmap(tdb->mutexes, len); +} + +static bool tdb_mutex_locking_cached; + +static bool tdb_mutex_locking_supported(void) +{ + pthread_mutexattr_t ma; + pthread_mutex_t m; + int ret; + static bool initialized; + + if (initialized) { + return tdb_mutex_locking_cached; + } + + initialized = true; + + ret = pthread_mutexattr_init(&ma); + if (ret != 0) { + return false; + } + ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); + if (ret != 0) { + goto cleanup_ma; + } + ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED); + if (ret != 0) { + goto cleanup_ma; + } + ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); + if (ret != 0) { + goto cleanup_ma; + } + ret = pthread_mutex_init(&m, &ma); + if (ret != 0) { + goto cleanup_ma; + } + ret = pthread_mutex_lock(&m); + if (ret != 0) { + goto cleanup_m; + } + /* + * This makes sure we have real mutexes + * from a threading library instead of just + * stubs from libc. + */ + ret = pthread_mutex_lock(&m); + if (ret != EDEADLK) { + goto cleanup_lock; + } + ret = pthread_mutex_unlock(&m); + if (ret != 0) { + goto cleanup_m; + } + + tdb_mutex_locking_cached = true; + goto cleanup_m; + +cleanup_lock: + pthread_mutex_unlock(&m); +cleanup_m: + pthread_mutex_destroy(&m); +cleanup_ma: + pthread_mutexattr_destroy(&ma); + return tdb_mutex_locking_cached; +} + +static void (*tdb_robust_mutext_old_handler)(int) = SIG_ERR; +static pid_t tdb_robust_mutex_pid = -1; + +static void tdb_robust_mutex_handler(int sig) +{ + if (tdb_robust_mutex_pid != -1) { + pid_t pid; + int status; + + pid = waitpid(tdb_robust_mutex_pid, &status, WNOHANG); + if (pid == tdb_robust_mutex_pid) { + tdb_robust_mutex_pid = -1; + return; + } + } + + if (tdb_robust_mutext_old_handler == SIG_DFL) { + return; + } + if (tdb_robust_mutext_old_handler == SIG_IGN) { + return; + } + if (tdb_robust_mutext_old_handler == SIG_ERR) { + return; + } + + tdb_robust_mutext_old_handler(sig); +} + +_PUBLIC_ bool tdb_runtime_check_for_robust_mutexes(void) +{ + void *ptr; + pthread_mutex_t *m; + pthread_mutexattr_t ma; + int ret = 1; + int pipe_down[2] = { -1, -1 }; + int pipe_up[2] = { -1, -1 }; + ssize_t nread; + char c = 0; + bool ok; + int status; + static bool initialized; + + if (initialized) { + return tdb_mutex_locking_cached; + } + + initialized = true; + + ok = tdb_mutex_locking_supported(); + if (!ok) { + return false; + } + + tdb_mutex_locking_cached = false; + + ptr = mmap(NULL, sizeof(pthread_mutex_t), PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_ANON, -1 /* fd */, 0); + if (ptr == MAP_FAILED) { + return false; + } + m = (pthread_mutex_t *)ptr; + + ret = pipe(pipe_down); + if (ret != 0) { + goto cleanup_mmap; + } + ret = pipe(pipe_up); + if (ret != 0) { + goto cleanup_pipe; + } + + ret = pthread_mutexattr_init(&ma); + if (ret != 0) { + goto cleanup_pipe; + } + ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); + if (ret != 0) { + goto cleanup_ma; + } + ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED); + if (ret != 0) { + goto cleanup_ma; + } + ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); + if (ret != 0) { + goto cleanup_ma; + } + ret = pthread_mutex_init(m, &ma); + if (ret != 0) { + goto cleanup_ma; + } + + tdb_robust_mutext_old_handler = signal(SIGCHLD, + tdb_robust_mutex_handler); + + tdb_robust_mutex_pid = fork(); + if (tdb_robust_mutex_pid == 0) { + 
size_t nwritten; + close(pipe_down[1]); + close(pipe_up[0]); + ret = pthread_mutex_lock(m); + nwritten = write(pipe_up[1], &ret, sizeof(ret)); + if (nwritten != sizeof(ret)) { + exit(1); + } + if (ret != 0) { + exit(1); + } + nread = read(pipe_down[0], &c, 1); + if (nread != 1) { + exit(1); + } + /* leave locked */ + exit(0); + } + if (tdb_robust_mutex_pid == -1) { + goto cleanup_sig_child; + } + close(pipe_down[0]); + pipe_down[0] = -1; + close(pipe_up[1]); + pipe_up[1] = -1; + + nread = read(pipe_up[0], &ret, sizeof(ret)); + if (nread != sizeof(ret)) { + goto cleanup_child; + } + + ret = pthread_mutex_trylock(m); + if (ret != EBUSY) { + if (ret == 0) { + pthread_mutex_unlock(m); + } + goto cleanup_child; + } + + if (write(pipe_down[1], &c, 1) != 1) { + goto cleanup_child; + } + + nread = read(pipe_up[0], &c, 1); + if (nread != 0) { + goto cleanup_child; + } + + while (tdb_robust_mutex_pid > 0) { + pid_t pid; + + errno = 0; + pid = waitpid(tdb_robust_mutex_pid, &status, 0); + if (pid == tdb_robust_mutex_pid) { + tdb_robust_mutex_pid = -1; + break; + } + if (pid == -1 && errno != EINTR) { + goto cleanup_child; + } + } + signal(SIGCHLD, tdb_robust_mutext_old_handler); + + ret = pthread_mutex_trylock(m); + if (ret != EOWNERDEAD) { + if (ret == 0) { + pthread_mutex_unlock(m); + } + goto cleanup_m; + } + + ret = pthread_mutex_consistent(m); + if (ret != 0) { + goto cleanup_m; + } + + ret = pthread_mutex_trylock(m); + if (ret != EDEADLK) { + pthread_mutex_unlock(m); + goto cleanup_m; + } + + ret = pthread_mutex_unlock(m); + if (ret != 0) { + goto cleanup_m; + } + + tdb_mutex_locking_cached = true; + goto cleanup_m; + +cleanup_child: + while (tdb_robust_mutex_pid > 0) { + pid_t pid; + + kill(tdb_robust_mutex_pid, SIGKILL); + + errno = 0; + pid = waitpid(tdb_robust_mutex_pid, &status, 0); + if (pid == tdb_robust_mutex_pid) { + tdb_robust_mutex_pid = -1; + break; + } + if (pid == -1 && errno != EINTR) { + break; + } + } +cleanup_sig_child: + signal(SIGCHLD, tdb_robust_mutext_old_handler); +cleanup_m: + pthread_mutex_destroy(m); +cleanup_ma: + pthread_mutexattr_destroy(&ma); +cleanup_pipe: + if (pipe_down[0] != -1) { + close(pipe_down[0]); + } + if (pipe_down[1] != -1) { + close(pipe_down[1]); + } + if (pipe_up[0] != -1) { + close(pipe_up[0]); + } + if (pipe_up[1] != -1) { + close(pipe_up[1]); + } +cleanup_mmap: + munmap(ptr, sizeof(pthread_mutex_t)); + + return tdb_mutex_locking_cached; +} + +#else + +size_t tdb_mutex_size(struct tdb_context *tdb) +{ + return 0; +} + +bool tdb_have_mutexes(struct tdb_context *tdb) +{ + return false; +} + +int tdb_mutex_allrecord_lock(struct tdb_context *tdb, int ltype, + enum tdb_lock_flags flags) +{ + tdb->ecode = TDB_ERR_LOCK; + return -1; +} + +int tdb_mutex_allrecord_unlock(struct tdb_context *tdb) +{ + return -1; +} + +int tdb_mutex_allrecord_upgrade(struct tdb_context *tdb) +{ + tdb->ecode = TDB_ERR_LOCK; + return -1; +} + +void tdb_mutex_allrecord_downgrade(struct tdb_context *tdb) +{ + return; +} + +int tdb_mutex_mmap(struct tdb_context *tdb) +{ + errno = ENOSYS; + return -1; +} + +int tdb_mutex_munmap(struct tdb_context *tdb) +{ + errno = ENOSYS; + return -1; +} + +int tdb_mutex_init(struct tdb_context *tdb) +{ + errno = ENOSYS; + return -1; +} + +_PUBLIC_ bool tdb_runtime_check_for_robust_mutexes(void) +{ + return false; +} + +#endif diff --git a/lib/tdb/common/open.c b/lib/tdb/common/open.c index 162f30d4047..16a76a347fc 100644 --- a/lib/tdb/common/open.c +++ b/lib/tdb/common/open.c @@ -76,6 +76,15 @@ static int tdb_new_database(struct tdb_context *tdb, 
struct tdb_header *header, if (tdb->flags & TDB_INCOMPATIBLE_HASH) newdb->rwlocks = TDB_HASH_RWLOCK_MAGIC; + /* + * We create a tdb with TDB_FEATURE_FLAG_MUTEX support, + * the flag combination and runtime feature checks + * are done by the caller already. + */ + if (tdb->flags & TDB_MUTEX_LOCKING) { + newdb->feature_flags |= TDB_FEATURE_FLAG_MUTEX; + } + /* * If we have any features we add the FEATURE_FLAG_MAGIC, overwriting the * TDB_HASH_RWLOCK_MAGIC above. @@ -87,8 +96,11 @@ static int tdb_new_database(struct tdb_context *tdb, struct tdb_header *header, /* * It's required for some following code pathes * to have the fields on 'tdb' up-to-date. + * + * E.g. tdb_mutex_size() requires it */ tdb->feature_flags = newdb->feature_flags; + tdb->hash_size = newdb->hash_size; if (tdb->flags & TDB_INTERNAL) { tdb->map_size = size; @@ -104,6 +116,11 @@ static int tdb_new_database(struct tdb_context *tdb, struct tdb_header *header, if (ftruncate(tdb->fd, 0) == -1) goto fail; + if (newdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) { + newdb->mutex_size = tdb_mutex_size(tdb); + tdb->hdr_ofs = newdb->mutex_size; + } + /* This creates an endian-converted header, as if read from disk */ CONVERT(*newdb); memcpy(header, newdb, sizeof(*header)); @@ -113,6 +130,37 @@ static int tdb_new_database(struct tdb_context *tdb, struct tdb_header *header, if (!tdb_write_all(tdb->fd, newdb, size)) goto fail; + if (newdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) { + + /* + * Now we init the mutex area + * followed by a second header. + */ + + ret = ftruncate( + tdb->fd, + newdb->mutex_size + sizeof(struct tdb_header)); + if (ret == -1) { + goto fail; + } + ret = tdb_mutex_init(tdb); + if (ret == -1) { + goto fail; + } + + /* + * Write a second header behind the mutexes. That's the area + * that will be mmapp'ed. + */ + ret = lseek(tdb->fd, newdb->mutex_size, SEEK_SET); + if (ret == -1) { + goto fail; + } + if (!tdb_write_all(tdb->fd, newdb, size)) { + goto fail; + } + } + ret = 0; fail: SAFE_FREE(newdb); @@ -179,6 +227,70 @@ static bool check_header_hash(struct tdb_context *tdb, return check_header_hash(tdb, header, false, m1, m2); } +static bool tdb_mutex_open_ok(struct tdb_context *tdb, + const struct tdb_header *header) +{ + int locked; + + locked = tdb_nest_lock(tdb, ACTIVE_LOCK, F_WRLCK, + TDB_LOCK_NOWAIT|TDB_LOCK_PROBE); + + if ((locked == -1) && (tdb->ecode == TDB_ERR_LOCK)) { + /* + * CLEAR_IF_FIRST still active. The tdb was created on this + * host, so we can assume the mutex implementation is + * compatible. Important for tools like tdbdump on a still + * open locking.tdb. + */ + goto check_local_settings; + } + + /* + * We got the CLEAR_IF_FIRST lock. That means the database was + * potentially copied from somewhere else. The mutex implementation + * might be incompatible. + */ + + if (tdb_nest_unlock(tdb, ACTIVE_LOCK, F_WRLCK, false) == -1) { + /* + * Should not happen + */ + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_mutex_open_ok: " + "failed to release ACTIVE_LOCK on %s: %s\n", + tdb->name, strerror(errno))); + return false; + } + + if (tdb->flags & TDB_NOLOCK) { + /* + * We don't look at locks, so it does not matter to have a + * compatible mutex implementation. Allow the open. 
+ */ + return true; + } + +check_local_settings: + + if (!(tdb->flags & TDB_MUTEX_LOCKING)) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_mutex_open_ok[%s]: " + "Can use mutexes only with " + "MUTEX_LOCKING or NOLOCK\n", + tdb->name)); + return false; + } + + if (tdb_mutex_size(tdb) != header->mutex_size) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_mutex_open_ok[%s]: " + "Mutex size changed from %u to %u\n.", + tdb->name, + (unsigned int)header->mutex_size, + (unsigned int)tdb_mutex_size(tdb))); + return false; + } + + return true; +} + _PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags, int open_flags, mode_t mode, const struct tdb_logging_context *log_ctx, @@ -208,6 +320,9 @@ _PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int td if (tdb_flags & TDB_INTERNAL) { tdb_flags |= TDB_INCOMPATIBLE_HASH; } + if (tdb_flags & TDB_MUTEX_LOCKING) { + tdb_flags |= TDB_INCOMPATIBLE_HASH; + } tdb->fd = -1; #ifdef TDB_TRACE @@ -296,6 +411,64 @@ _PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int td goto fail; } + if (tdb->flags & TDB_MUTEX_LOCKING) { + /* + * Here we catch bugs in the callers, + * the runtime check for existing tdb's comes later. + */ + + if (!(tdb->flags & TDB_CLEAR_IF_FIRST)) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: " + "invalid flags for %s - TDB_MUTEX_LOCKING " + "requires TDB_CLEAR_IF_FIRST\n", name)); + errno = EINVAL; + goto fail; + } + + if (tdb->flags & TDB_INTERNAL) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: " + "invalid flags for %s - TDB_MUTEX_LOCKING and " + "TDB_INTERNAL are not allowed together\n", name)); + errno = EINVAL; + goto fail; + } + + if (tdb->flags & TDB_NOMMAP) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: " + "invalid flags for %s - TDB_MUTEX_LOCKING and " + "TDB_NOMMAP are not allowed together\n", name)); + errno = EINVAL; + goto fail; + } + + if (tdb->read_only) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: " + "invalid flags for %s - TDB_MUTEX_LOCKING " + "not allowed read only\n", name)); + errno = EINVAL; + goto fail; + } + + /* + * The callers should have called + * tdb_runtime_check_for_robust_mutexes() + * before using TDB_MUTEX_LOCKING! + * + * This makes sure the caller understands + * that the locking may behave a bit differently + * than with pure fcntl locking. E.g. multiple + * read locks are not supported. + */ + if (!tdb_runtime_check_for_robust_mutexes()) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: " + "invalid flags for %s - TDB_MUTEX_LOCKING " + "requires support for robust_mutexes\n", + name)); + errno = ENOSYS; + goto fail; + } + } + if (getenv("TDB_NO_FSYNC")) { tdb->flags |= TDB_NOSYNC; } @@ -435,6 +608,21 @@ _PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int td goto fail; } + if (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) { + if (!tdb_mutex_open_ok(tdb, &header)) { + errno = EINVAL; + goto fail; + } + + /* + * We need to remember the hdr_ofs + * also for the TDB_NOLOCK case + * if the current library doesn't support + * mutex locking. 
+ */ + tdb->hdr_ofs = header.mutex_size; + } + if ((header.magic1_hash == 0) && (header.magic2_hash == 0)) { /* older TDB without magic hash references */ tdb->hash_fn = tdb_old_hash; @@ -477,6 +665,15 @@ _PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int td goto fail; } + if (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) { + if (!(tdb->flags & TDB_NOLOCK)) { + ret = tdb_mutex_mmap(tdb); + if (ret != 0) { + goto fail; + } + } + } + if (locked) { if (tdb_nest_unlock(tdb, ACTIVE_LOCK, F_WRLCK, false) == -1) { TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: " @@ -587,6 +784,9 @@ _PUBLIC_ int tdb_close(struct tdb_context *tdb) else tdb_munmap(tdb); } + + tdb_mutex_munmap(tdb); + SAFE_FREE(tdb->name); if (tdb->fd != -1) { ret = close(tdb->fd); diff --git a/lib/tdb/common/summary.c b/lib/tdb/common/summary.c index e9989f676f7..d786132d4a1 100644 --- a/lib/tdb/common/summary.c +++ b/lib/tdb/common/summary.c @@ -23,6 +23,7 @@ "Number of records: %zu\n" \ "Incompatible hash: %s\n" \ "Active/supported feature flags: 0x%08x/0x%08x\n" \ + "Robust mutexes locking: %s\n" \ "Smallest/average/largest keys: %zu/%zu/%zu\n" \ "Smallest/average/largest data: %zu/%zu/%zu\n" \ "Smallest/average/largest padding: %zu/%zu/%zu\n" \ @@ -175,6 +176,7 @@ _PUBLIC_ char *tdb_summary(struct tdb_context *tdb) keys.num, (tdb->hash_fn == tdb_jenkins_hash)?"yes":"no", (unsigned)tdb->feature_flags, TDB_SUPPORTED_FEATURE_FLAGS, + (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX)?"yes":"no", keys.min, tally_mean(&keys), keys.max, data.min, tally_mean(&data), data.max, extra.min, tally_mean(&extra), extra.max, diff --git a/lib/tdb/common/tdb.c b/lib/tdb/common/tdb.c index ebd4ffe3e01..ae98c9619d1 100644 --- a/lib/tdb/common/tdb.c +++ b/lib/tdb/common/tdb.c @@ -723,6 +723,15 @@ _PUBLIC_ void tdb_remove_flags(struct tdb_context *tdb, unsigned flags) return; } + if ((flags & TDB_NOLOCK) && + (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) && + (tdb->mutexes == NULL)) { + tdb->ecode = TDB_ERR_LOCK; + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_remove_flags: " + "Can not remove NOLOCK flag on mutexed databases")); + return; + } + if (flags & TDB_ALLOW_NESTING) { tdb->flags |= TDB_DISALLOW_NESTING; } diff --git a/lib/tdb/common/tdb_private.h b/lib/tdb/common/tdb_private.h index 4981e2cd6ac..de8d9e68fb5 100644 --- a/lib/tdb/common/tdb_private.h +++ b/lib/tdb/common/tdb_private.h @@ -69,7 +69,11 @@ typedef uint32_t tdb_off_t; #define TDB_PAD_BYTE 0x42 #define TDB_PAD_U32 0x42424242 -#define TDB_SUPPORTED_FEATURE_FLAGS 0 +#define TDB_FEATURE_FLAG_MUTEX 0x00000001 + +#define TDB_SUPPORTED_FEATURE_FLAGS ( \ + TDB_FEATURE_FLAG_MUTEX | \ + 0) /* NB assumes there is a local variable called "tdb" that is the * current context, also takes doubly-parenthesized print-style @@ -156,7 +160,8 @@ struct tdb_header { uint32_t magic1_hash; /* hash of TDB_MAGIC_FOOD. */ uint32_t magic2_hash; /* hash of TDB_MAGIC. 
*/ uint32_t feature_flags; - tdb_off_t reserved[26]; + tdb_len_t mutex_size; /* set if TDB_FEATURE_FLAG_MUTEX is set */ + tdb_off_t reserved[25]; }; struct tdb_lock_type { @@ -190,6 +195,8 @@ struct tdb_methods { int (*tdb_expand_file)(struct tdb_context *, tdb_off_t , tdb_off_t ); }; +struct tdb_mutexes; + struct tdb_context { char *name; /* the name of the database */ void *map_ptr; /* where it is currently mapped */ @@ -203,7 +210,8 @@ struct tdb_context { struct tdb_lock_type *lockrecs; /* only real locks, all with count>0 */ int lockrecs_array_length; - tdb_off_t hdr_ofs; /* this is 0 for now */ + tdb_off_t hdr_ofs; /* this is 0 or header.mutex_size */ + struct tdb_mutexes *mutexes; /* mmap of the mutex area */ enum TDB_ERROR ecode; /* error code for last tdb error */ uint32_t hash_size; @@ -300,4 +308,20 @@ bool tdb_add_off_t(tdb_off_t a, tdb_off_t b, tdb_off_t *pret); /* tdb_off_t and tdb_len_t right now are both uint32_t */ #define tdb_add_len_t tdb_add_off_t + +size_t tdb_mutex_size(struct tdb_context *tdb); +bool tdb_have_mutexes(struct tdb_context *tdb); +int tdb_mutex_init(struct tdb_context *tdb); +int tdb_mutex_mmap(struct tdb_context *tdb); +int tdb_mutex_munmap(struct tdb_context *tdb); +bool tdb_mutex_lock(struct tdb_context *tdb, int rw, off_t off, off_t len, + bool waitflag, int *pret); +bool tdb_mutex_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len, + int *pret); +int tdb_mutex_allrecord_lock(struct tdb_context *tdb, int ltype, + enum tdb_lock_flags flags); +int tdb_mutex_allrecord_unlock(struct tdb_context *tdb); +int tdb_mutex_allrecord_upgrade(struct tdb_context *tdb); +void tdb_mutex_allrecord_downgrade(struct tdb_context *tdb); + #endif /* TDB_PRIVATE_H */ diff --git a/lib/tdb/common/transaction.c b/lib/tdb/common/transaction.c index a2c3bbdff37..caef0bedd82 100644 --- a/lib/tdb/common/transaction.c +++ b/lib/tdb/common/transaction.c @@ -421,7 +421,8 @@ static int _tdb_transaction_start(struct tdb_context *tdb, enum tdb_lock_flags lockflags) { /* some sanity checks */ - if (tdb->read_only || (tdb->flags & TDB_INTERNAL) || tdb->traverse_read) { + if (tdb->read_only || (tdb->flags & (TDB_INTERNAL|TDB_MUTEX_LOCKING)) + || tdb->traverse_read) { TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction on a read-only or internal db\n")); tdb->ecode = TDB_ERR_EINVAL; return -1; diff --git a/lib/tdb/docs/mutex.txt b/lib/tdb/docs/mutex.txt new file mode 100644 index 00000000000..7625662e9db --- /dev/null +++ b/lib/tdb/docs/mutex.txt @@ -0,0 +1,136 @@ +Tdb is a hashtable database with multiple concurrent writer and external +record lock support. For speed reasons, wherever possible tdb uses a shared +memory mapped area for data access. In its currently released form, it uses +fcntl byte-range locks to coordinate access to the data itself. + +The tdb data is organized as a hashtable. Hash collisions are dealt with by +forming a linked list of records that share a hash value. The individual +linked lists are protected across processes with 1-byte fcntl locks on the +starting pointer of the linked list representing a hash value. + +The external locking API of tdb allows to lock individual records. Instead of +really locking individual records, the tdb API locks a complete linked list +with a fcntl lock. + +The external locking API of tdb also allows to lock the complete database, and +ctdb uses this facility to freeze databases during a recovery. 
While the +so-called allrecord lock is held, all linked lists and all individual records +are frozen alltogether. Tdb achieves this by locking the complete file range +with a single fcntl lock. Individual 1-byte locks for the linked lists +conflict with this. Access to records is prevented by the one large fnctl byte +range lock. + +Fcntl locks have been chosen for tdb for two reasons: First they are portable +across all current unixes. Secondly they provide auto-cleanup. If a process +dies while holding a fcntl lock, the lock is given up as if it was explicitly +unlocked. Thus fcntl locks provide a very robust locking scheme, if a process +dies for any reason the database will not stay blocked until reboot. This +robustness is very important for long-running services, a reboot is not an +option for most users of tdb. + +Unfortunately, during stress testing, fcntl locks have turned out to be a major +problem for performance. The particular problem that was seen happens when +ctdb on a busy server does a recovery. A recovery means that ctdb has to +freeze all tdb databases for some time, usually a few seconds. This is done +with the allrecord lock. During the recovery phase on a busy server many smbd +processes try to access the tdb file with blocking fcntl calls. The specific +test in question easily reproduces 7,000 processes piling up waiting for +1-byte fcntl locks. When ctdb is done with the recovery, it gives up the +allrecord lock, covering the whole file range. All 7,000 processes waiting for +1-byte fcntl locks are woken up, trying to acquire their lock. The special +implementation of fcntl locks in Linux (up to 2013-02-12 at least) protects +all fcntl lock operations with a single system-wide spinlock. If 7,000 process +waiting for the allrecord lock to become released this leads to a thundering +herd condition, all CPUs are spinning on that single spinlock. + +Functionally the kernel is fine, eventually the thundering herd slows down and +every process correctly gets his share and locking range, but the performance +of the system while the herd is active is worse than expected. + +The thundering herd is only the worst case scenario for fcntl lock use. The +single spinlock for fcntl operations is also a performance penalty for normal +operations. In the cluster case, every read and write SMB request has to do +two fcntl calls to provide correct SMB mandatory locks. The single spinlock +is one source of serialization for the SMB read/write requests, limiting the +parallelism that can be achieved in a multi-core system. + +While trying to tune his servers, Ira Cooper, Samba Team member, found fcntl +locks to be a problem on Solaris as well. Ira pointed out that there is a +potential alternative locking mechanism that might be more scalable: Process +shared robust mutexes, as defined by Posix 2008 for example via + +http://pubs.opengroup.org/onlinepubs/9699919799/functions/pthread_mutexattr_setpshared.html +http://pubs.opengroup.org/onlinepubs/9699919799/functions/pthread_mutexattr_setrobust.html + +Pthread mutexes provide one of the core mechanisms in posix threads to protect +in-process data structures from concurrent access by multiple threads. In the +Linux implementation, a pthread_mutex_t is represented by a data structure in +user space that requires no kernel calls in the uncontended case for locking +and unlocking. Locking and unlocking in the uncontended case is implemented +purely in user space with atomic CPU instructions and thus are very fast. 
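
To make the mechanism described in the preceding and following paragraphs concrete, here is a small self-contained sketch (not part of the patch, error handling abbreviated) of a process-shared robust mutex placed in anonymous shared memory, including the EOWNERDEAD / pthread_mutex_consistent() recovery step that common/mutex.c builds on. It assumes a system implementing the POSIX 2008 robust mutex interfaces and is compiled with -pthread.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pthread_mutexattr_t ma;
	pthread_mutex_t *m;
	int ret;

	/* The mutex must live in memory visible to both processes. */
	m = mmap(NULL, sizeof(*m), PROT_READ|PROT_WRITE,
		 MAP_SHARED|MAP_ANONYMOUS, -1, 0);
	if (m == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	pthread_mutexattr_init(&ma);
	pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
	pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
	pthread_mutex_init(m, &ma);
	pthread_mutexattr_destroy(&ma);

	if (fork() == 0) {
		/* Child: grab the mutex and die without unlocking it. */
		pthread_mutex_lock(m);
		_exit(0);
	}
	wait(NULL);

	ret = pthread_mutex_lock(m);
	if (ret == EOWNERDEAD) {
		/*
		 * The previous owner died while holding the lock. Repair
		 * the protected data here, then mark the mutex consistent
		 * so it can be used again.
		 */
		pthread_mutex_consistent(m);
	}
	pthread_mutex_unlock(m);
	pthread_mutex_destroy(m);
	munmap(m, sizeof(*m));
	return 0;
}

With a non-robust mutex the final lock attempt would block forever, because the dead child never released the lock; that is precisely the auto-cleanup property of fcntl locks that plain process-shared mutexes lack. The real code in common/mutex.c additionally sets PTHREAD_MUTEX_ERRORCHECK on the mutex attribute, which is omitted here for brevity.
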
+ +The setpshared functions indicate to the kernel that the mutex is about to be +shared between processes in a common shared memory area. + +The process shared posix mutexes have the potential to replace fcntl locking +to coordinate mmap access for tdbs. However, they are missing the criticial +auto-cleanup property that fcntl provides when a process dies. A process that +dies hard while holding a shared mutex has no chance to clean up the protected +data structures and unlock the shared mutex. Thus with a pure process shared +mutex the mutex will remain locked forever until the data structures are +re-initialized from scratch. + +With the robust mutexes defined by Posix the process shared mutexes have been +extended with a limited auto-cleanup property. If a mutex has been declared +robust, when a process exits while holding that mutex, the next process trying +to lock the mutex will get the special error message EOWNERDEAD. This informs +the caller that the data structures the mutex protects are potentially corrupt +and need to be cleaned up. + +The error message EOWNERDEAD when trying to lock a mutex is an extension over +the fcntl functionality. A process that does a blocking fcntl lock call is not +informed about whether the lock was explicitly freed by a process still alive +or due to an unplanned process exit. At the time of this writing (February +2013), at least Linux and OpenSolaris also implement the robustness feature of +process-shared mutexes. + +Converting the tdb locking mechanism from fcntl to mutexes has to take care of +both types of locks that are used on tdb files. + +The easy part is to use mutexes to replace the 1-byte linked list locks +covering the individual hashes. Those can be represented by a mutex each. + +Covering the allrecord lock is more difficult. The allrecord lock uses a fcntl +lock spanning all hash list locks simultaneously. This basic functionality is +not easily possible with mutexes. A mutex carries 1 bit of information, a +fcntl lock can carry an arbitrary amount of information. + +In order to support the allrecord lock, we have an allrecord_lock variable +protected by an allrecord_mutex. The coordination between the allrecord lock +and the chainlocks works like this: + +- Getting a chain lock works like this: + + 1. get chain mutex + 2. return success if allrecord_lock is F_UNLCK (not locked) + 3. return success if allrecord_lock is F_RDLCK (locked readonly) + and we only need a read lock. + 4. release chain mutex + 5. wait for allrecord_mutex + 6. unlock allrecord_mutex + 7. goto 1. + +- Getting the allrecord lock: + + 1. get the allrecord mutex + 2. return error if allrecord_lock is not F_UNLCK (it's locked) + 3. set allrecord_lock to the desired value. + 4. in a loop: lock(blocking) / unlock each chain mutex. + 5. return success. + +- allrecord lock upgrade: + + 1. check we already have the allrecord lock with F_RDLCK. + 3. set allrecord_lock to F_WRLCK + 4. in a loop: lock(blocking) / unlock each chain mutex. + 5. return success. diff --git a/lib/tdb/include/tdb.h b/lib/tdb/include/tdb.h index a34f089e66d..5ea5e6048c6 100644 --- a/lib/tdb/include/tdb.h +++ b/lib/tdb/include/tdb.h @@ -80,6 +80,9 @@ extern "C" { #define TDB_ALLOW_NESTING 512 /** Allow transactions to nest */ #define TDB_DISALLOW_NESTING 1024 /** Disallow transactions to nest */ #define TDB_INCOMPATIBLE_HASH 2048 /** Better hashing: can't be opened by tdb < 1.2.6. 
*/ +#define TDB_MUTEX_LOCKING 4096 /** optimized locking using robust mutexes if supported, + only with tdb >= 1.3.0 and TDB_CLEAR_IF_FIRST + after checking tdb_runtime_check_for_robust_mutexes() */ /** The tdb error codes */ enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK, @@ -143,6 +146,11 @@ struct tdb_logging_context { * default 5.\n * TDB_ALLOW_NESTING - Allow transactions to nest.\n * TDB_DISALLOW_NESTING - Disallow transactions to nest.\n + * TDB_INCOMPATIBLE_HASH - Better hashing: can't be opened by tdb < 1.2.6.\n + * TDB_MUTEX_LOCKING - Optimized locking using robust mutexes if supported, + * can't be opened by tdb < 1.3.0. + * Only valid in combination with TDB_CLEAR_IF_FIRST + * after checking tdb_runtime_check_for_robust_mutexes()\n * * @param[in] open_flags Flags for the open(2) function. * @@ -179,6 +187,11 @@ struct tdb_context *tdb_open(const char *name, int hash_size, int tdb_flags, * default 5.\n * TDB_ALLOW_NESTING - Allow transactions to nest.\n * TDB_DISALLOW_NESTING - Disallow transactions to nest.\n + * TDB_INCOMPATIBLE_HASH - Better hashing: can't be opened by tdb < 1.2.6.\n + * TDB_MUTEX_LOCKING - Optimized locking using robust mutexes if supported, + * can't be opened by tdb < 1.3.0. + * Only valid in combination with TDB_CLEAR_IF_FIRST + * after checking tdb_runtime_check_for_robust_mutexes()\n * * @param[in] open_flags Flags for the open(2) function. * @@ -842,6 +855,27 @@ int tdb_rescue(struct tdb_context *tdb, void (*walk) (TDB_DATA key, TDB_DATA data, void *private_data), void *private_data); +/** + * @brief Check if support for TDB_MUTEX_LOCKING is available at runtime. + * + * On some systems the API for pthread_mutexattr_setrobust() is not available. + * On other systems there are some bugs in the interaction between glibc and + * the linux kernel. + * + * This function provides a runtime check if robust mutexes are really + * available. + * + * This needs to be called and return true before TDB_MUTEX_LOCKING + * can be used at runtime. + * + * @note This calls fork(), but the SIGCHILD handling should be transparent. + * + * @return true if supported, false otherwise. 
+ * + * @see TDB_MUTEX_LOCKING + */ +bool tdb_runtime_check_for_robust_mutexes(void); + /* @} ******************************************************************/ /* Low level locking functions: use with care */ diff --git a/lib/tdb/test/run-3G-file.c b/lib/tdb/test/run-3G-file.c index 900b1a667a2..748c972284a 100644 --- a/lib/tdb/test/run-3G-file.c +++ b/lib/tdb/test/run-3G-file.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-bad-tdb-header.c b/lib/tdb/test/run-bad-tdb-header.c index b00fb8934a4..9d29fdf5e8e 100644 --- a/lib/tdb/test/run-bad-tdb-header.c +++ b/lib/tdb/test/run-bad-tdb-header.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-check.c b/lib/tdb/test/run-check.c index b2756914831..ce389a2d14d 100644 --- a/lib/tdb/test/run-check.c +++ b/lib/tdb/test/run-check.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-corrupt.c b/lib/tdb/test/run-corrupt.c index 93eae42ce16..e6fc751842f 100644 --- a/lib/tdb/test/run-corrupt.c +++ b/lib/tdb/test/run-corrupt.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-die-during-transaction.c b/lib/tdb/test/run-die-during-transaction.c index 9b9041552ed..c636d87322d 100644 --- a/lib/tdb/test/run-die-during-transaction.c +++ b/lib/tdb/test/run-die-during-transaction.c @@ -19,6 +19,7 @@ static int ftruncate_check(int fd, off_t length); #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include diff --git a/lib/tdb/test/run-endian.c b/lib/tdb/test/run-endian.c index 3116f7da51b..9d4d5f59f8e 100644 --- a/lib/tdb/test/run-endian.c +++ b/lib/tdb/test/run-endian.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-incompatible.c b/lib/tdb/test/run-incompatible.c index af01ca6a395..b8e95b5e778 100644 --- a/lib/tdb/test/run-incompatible.c +++ b/lib/tdb/test/run-incompatible.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include diff --git a/lib/tdb/test/run-nested-transactions.c b/lib/tdb/test/run-nested-transactions.c index bf08e55afee..864adf2d716 100644 --- a/lib/tdb/test/run-nested-transactions.c +++ b/lib/tdb/test/run-nested-transactions.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include diff --git a/lib/tdb/test/run-nested-traverse.c b/lib/tdb/test/run-nested-traverse.c index 361dc2ece71..22ee3e2a2a6 100644 --- a/lib/tdb/test/run-nested-traverse.c +++ b/lib/tdb/test/run-nested-traverse.c @@ -11,6 +11,7 @@ #include "../common/open.c" #include "../common/check.c" #include 
"../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #undef fcntl #include diff --git a/lib/tdb/test/run-no-lock-during-traverse.c b/lib/tdb/test/run-no-lock-during-traverse.c index b5e31dc1001..737a32f1115 100644 --- a/lib/tdb/test/run-no-lock-during-traverse.c +++ b/lib/tdb/test/run-no-lock-during-traverse.c @@ -13,6 +13,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-oldhash.c b/lib/tdb/test/run-oldhash.c index 535336cb473..aaee6f62ef7 100644 --- a/lib/tdb/test/run-oldhash.c +++ b/lib/tdb/test/run-oldhash.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-open-during-transaction.c b/lib/tdb/test/run-open-during-transaction.c index 04ba956d45a..16053765e92 100644 --- a/lib/tdb/test/run-open-during-transaction.c +++ b/lib/tdb/test/run-open-during-transaction.c @@ -20,6 +20,7 @@ static int ftruncate_check(int fd, off_t length); #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include diff --git a/lib/tdb/test/run-readonly-check.c b/lib/tdb/test/run-readonly-check.c index e5185324246..c5e0f7dccbc 100644 --- a/lib/tdb/test/run-readonly-check.c +++ b/lib/tdb/test/run-readonly-check.c @@ -11,6 +11,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-rescue-find_entry.c b/lib/tdb/test/run-rescue-find_entry.c index 25f4f1c05f4..5d6f8f711d0 100644 --- a/lib/tdb/test/run-rescue-find_entry.c +++ b/lib/tdb/test/run-rescue-find_entry.c @@ -10,6 +10,7 @@ #include "../common/check.c" #include "../common/hash.c" #include "../common/rescue.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-rescue.c b/lib/tdb/test/run-rescue.c index 7c806a40b45..e43f53be9d8 100644 --- a/lib/tdb/test/run-rescue.c +++ b/lib/tdb/test/run-rescue.c @@ -10,6 +10,7 @@ #include "../common/check.c" #include "../common/hash.c" #include "../common/rescue.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-rwlock-check.c b/lib/tdb/test/run-rwlock-check.c index 8b8072db1e6..2ac9dc3d7ca 100644 --- a/lib/tdb/test/run-rwlock-check.c +++ b/lib/tdb/test/run-rwlock-check.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include diff --git a/lib/tdb/test/run-summary.c b/lib/tdb/test/run-summary.c index 22312843e79..8b9a1a0f69d 100644 --- a/lib/tdb/test/run-summary.c +++ b/lib/tdb/test/run-summary.c @@ -10,6 +10,7 @@ #include "../common/check.c" #include "../common/hash.c" #include "../common/summary.c" +#include "../common/mutex.c" #include "tap-interface.h" #include diff --git a/lib/tdb/test/run-transaction-expand.c b/lib/tdb/test/run-transaction-expand.c index ddf1f2432da..d36b894dbed 100644 --- a/lib/tdb/test/run-transaction-expand.c +++ b/lib/tdb/test/run-transaction-expand.c @@ -37,6 +37,7 @@ static inline int fake_fdatasync(int fd) #include "../common/open.c" #include "../common/check.c" #include 
"../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-traverse-in-transaction.c b/lib/tdb/test/run-traverse-in-transaction.c index 48194b8fdbf..17d64129642 100644 --- a/lib/tdb/test/run-traverse-in-transaction.c +++ b/lib/tdb/test/run-traverse-in-transaction.c @@ -11,6 +11,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #undef fcntl_with_lockcheck #include diff --git a/lib/tdb/test/run-wronghash-fail.c b/lib/tdb/test/run-wronghash-fail.c index 9c78fc5e3e1..c44b0f5aaee 100644 --- a/lib/tdb/test/run-wronghash-fail.c +++ b/lib/tdb/test/run-wronghash-fail.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include diff --git a/lib/tdb/test/run-zero-append.c b/lib/tdb/test/run-zero-append.c index a2324c437a4..f9eba1b7b32 100644 --- a/lib/tdb/test/run-zero-append.c +++ b/lib/tdb/test/run-zero-append.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run.c b/lib/tdb/test/run.c index f61fcf68204..c744c4ddca4 100644 --- a/lib/tdb/test/run.c +++ b/lib/tdb/test/run.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/wscript b/lib/tdb/wscript index 70196938ed2..6243ccff8df 100644 --- a/lib/tdb/wscript +++ b/lib/tdb/wscript @@ -1,7 +1,7 @@ #!/usr/bin/env python APPNAME = 'tdb' -VERSION = '1.2.13' +VERSION = '1.3.0' blddir = 'bin' @@ -46,6 +46,10 @@ def set_options(opt): opt.BUILTIN_DEFAULT('replace') opt.PRIVATE_EXTENSION_DEFAULT('tdb', noextension='tdb') opt.RECURSE('lib/replace') + opt.add_option('--disable-tdb-mutex-locking', + help=("Disable the use of pthread robust mutexes"), + action="store_true", dest='disable_tdb_mutex_locking', + default=False) if opt.IN_LAUNCH_DIR(): opt.add_option('--disable-python', help=("disable the pytdb module"), @@ -53,6 +57,11 @@ def set_options(opt): def configure(conf): + conf.env.disable_tdb_mutex_locking = getattr(Options.options, + 'disable_tdb_mutex_locking', + False) + if not conf.env.disable_tdb_mutex_locking: + conf.env.replace_add_global_pthread = True conf.RECURSE('lib/replace') conf.env.standalone_tdb = conf.IN_LAUNCH_DIR() @@ -68,6 +77,11 @@ def configure(conf): conf.env.disable_python = getattr(Options.options, 'disable_python', False) + if (conf.CONFIG_SET('HAVE_ROBUST_MUTEXES') and + conf.env.building_tdb and + not conf.env.disable_tdb_mutex_locking): + conf.define('USE_TDB_MUTEX_LOCKING', 1) + conf.CHECK_XSLTPROC_MANPAGES() if not conf.env.disable_python: @@ -87,10 +101,12 @@ def configure(conf): def build(bld): bld.RECURSE('lib/replace') - COMMON_SRC = bld.SUBDIR('common', - '''check.c error.c tdb.c traverse.c - freelistcheck.c lock.c dump.c freelist.c - io.c open.c transaction.c hash.c summary.c rescue.c''') + COMMON_FILES='''check.c error.c tdb.c traverse.c + freelistcheck.c lock.c dump.c freelist.c + io.c open.c transaction.c hash.c summary.c rescue.c + mutex.c''' + + COMMON_SRC = bld.SUBDIR('common', COMMON_FILES) if bld.env.standalone_tdb: bld.env.PKGCONFIGDIR = '${LIBDIR}/pkgconfig' @@ -99,9 +115,15 @@ def build(bld): private_library = True 
if not bld.CONFIG_SET('USING_SYSTEM_TDB'): + + tdb_deps = 'replace' + + if bld.CONFIG_SET('USE_TDB_MUTEX_LOCKING'): + tdb_deps += ' pthread' + bld.SAMBA_LIBRARY('tdb', COMMON_SRC, - deps='replace', + deps=tdb_deps, includes='include', abi_directory='ABI', abi_match='tdb_*', @@ -137,7 +159,7 @@ def build(bld): # FIXME: This hardcoded list is stupid, stupid, stupid. bld.SAMBA_SUBSYSTEM('tdb-test-helpers', 'test/external-agent.c test/lock-tracking.c test/logging.c', - 'replace', + tdb_deps, includes='include') for t in tdb1_unit_tests: -- cgit
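The tdb.h documentation added above ties TDB_MUTEX_LOCKING to TDB_CLEAR_IF_FIRST and to a successful tdb_runtime_check_for_robust_mutexes() call. The following is a minimal caller-side sketch of that pattern; the database name "example.tdb", the hash size of 0 (meaning the default) and the open mode are illustrative only, not part of the patch.

#include <fcntl.h>
#include <stdio.h>
#include "tdb.h"

int main(void)
{
	struct tdb_context *tdb;
	int tdb_flags = TDB_CLEAR_IF_FIRST;

	/*
	 * TDB_MUTEX_LOCKING is only valid together with TDB_CLEAR_IF_FIRST
	 * and only after the runtime check has confirmed that robust
	 * mutexes actually work on this system.
	 */
	if (tdb_runtime_check_for_robust_mutexes()) {
		tdb_flags |= TDB_MUTEX_LOCKING;
	}

	/* Illustrative name, hash size and mode. */
	tdb = tdb_open("example.tdb", 0, tdb_flags, O_RDWR | O_CREAT, 0600);
	if (tdb == NULL) {
		perror("tdb_open");
		return 1;
	}

	tdb_close(tdb);
	return 0;
}

If the runtime check fails, the open simply falls back to the usual fcntl-based locking; nothing else in the caller has to change.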
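The wscript changes above define USE_TDB_MUTEX_LOCKING only when configure detects HAVE_ROBUST_MUTEXES, tdb is built standalone and --disable-tdb-mutex-locking was not given, and they add pthread to the library's dependencies in that case. The sketch below shows the kind of guarded code this define enables; example_init_robust_mutex() is a hypothetical helper for illustration, not the actual mutex.c implementation.

#ifdef USE_TDB_MUTEX_LOCKING
#include <pthread.h>

/* Initialize a process-shared, robust mutex (simplified error handling). */
static int example_init_robust_mutex(pthread_mutex_t *m)
{
	pthread_mutexattr_t ma;
	int ret;

	ret = pthread_mutexattr_init(&ma);
	if (ret != 0) {
		return ret;
	}
	pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
	pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
	ret = pthread_mutex_init(m, &ma);
	pthread_mutexattr_destroy(&ma);
	return ret;
}
#else
/* Without robust mutex support, locking stays purely fcntl-based. */
#endif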