summaryrefslogtreecommitdiffstats
path: root/lib/tdb/common
diff options
context:
space:
mode:
Diffstat (limited to 'lib/tdb/common')
-rw-r--r--lib/tdb/common/io.c3
-rw-r--r--lib/tdb/common/lock.c79
-rw-r--r--lib/tdb/common/mutex.c1000
-rw-r--r--lib/tdb/common/open.c200
-rw-r--r--lib/tdb/common/summary.c2
-rw-r--r--lib/tdb/common/tdb.c9
-rw-r--r--lib/tdb/common/tdb_private.h30
-rw-r--r--lib/tdb/common/transaction.c3
8 files changed, 1312 insertions, 14 deletions
diff --git a/lib/tdb/common/io.c b/lib/tdb/common/io.c
index 07d22ccdb21..fe47d18a5a4 100644
--- a/lib/tdb/common/io.c
+++ b/lib/tdb/common/io.c
@@ -29,7 +29,8 @@
#include "tdb_private.h"
/*
- * tdb->hdr_ofs is 0 for now.
+ * We prepend the mutex area, so fixup offsets. See mutex.c for details.
+ * tdb->hdr_ofs is 0 or header.mutex_size.
*
* Note: that we only have the 4GB limit of tdb_off_t for
* tdb->map_size. The file size on disk can be 4GB + tdb->hdr_ofs!
diff --git a/lib/tdb/common/lock.c b/lib/tdb/common/lock.c
index 486de797381..6644c4034e0 100644
--- a/lib/tdb/common/lock.c
+++ b/lib/tdb/common/lock.c
@@ -38,6 +38,15 @@ static int fcntl_lock(struct tdb_context *tdb,
struct flock fl;
int cmd;
+#ifdef USE_TDB_MUTEX_LOCKING
+ {
+ int ret;
+ if (tdb_mutex_lock(tdb, rw, off, len, waitflag, &ret)) {
+ return ret;
+ }
+ }
+#endif
+
fl.l_type = rw;
fl.l_whence = SEEK_SET;
fl.l_start = off;
@@ -110,6 +119,15 @@ static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
fclose(locks);
#endif
+#ifdef USE_TDB_MUTEX_LOCKING
+ {
+ int ret;
+ if (tdb_mutex_unlock(tdb, rw, off, len, &ret)) {
+ return ret;
+ }
+ }
+#endif
+
fl.l_type = F_UNLCK;
fl.l_whence = SEEK_SET;
fl.l_start = off;
@@ -248,13 +266,27 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb)
return -1;
}
- ret = tdb_brlock_retry(tdb, F_WRLCK, FREELIST_TOP, 0,
- TDB_LOCK_WAIT|TDB_LOCK_PROBE);
+ if (tdb_have_mutexes(tdb)) {
+ ret = tdb_mutex_allrecord_upgrade(tdb);
+ if (ret == -1) {
+ goto fail;
+ }
+ ret = tdb_brlock_retry(tdb, F_WRLCK, lock_offset(tdb->hash_size),
+ 0, TDB_LOCK_WAIT|TDB_LOCK_PROBE);
+ if (ret == -1) {
+ tdb_mutex_allrecord_downgrade(tdb);
+ }
+ } else {
+ ret = tdb_brlock_retry(tdb, F_WRLCK, FREELIST_TOP, 0,
+ TDB_LOCK_WAIT|TDB_LOCK_PROBE);
+ }
+
if (ret == 0) {
tdb->allrecord_lock.ltype = F_WRLCK;
tdb->allrecord_lock.off = 0;
return 0;
}
+fail:
TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_allrecord_upgrade failed\n"));
return -1;
}
@@ -593,6 +625,8 @@ static int tdb_chainlock_gradual(struct tdb_context *tdb,
int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
enum tdb_lock_flags flags, bool upgradable)
{
+ int ret;
+
switch (tdb_allrecord_check(tdb, ltype, flags, upgradable)) {
case -1:
return -1;
@@ -607,16 +641,27 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
*
* It is (1) which cause the starvation problem, so we're only
* gradual for that. */
- if (tdb_chainlock_gradual(tdb, ltype, flags, FREELIST_TOP,
- tdb->hash_size * 4) == -1) {
+
+ if (tdb_have_mutexes(tdb)) {
+ ret = tdb_mutex_allrecord_lock(tdb, ltype, flags);
+ } else {
+ ret = tdb_chainlock_gradual(tdb, ltype, flags, FREELIST_TOP,
+ tdb->hash_size * 4);
+ }
+
+ if (ret == -1) {
return -1;
}
/* Grab individual record locks. */
if (tdb_brlock(tdb, ltype, lock_offset(tdb->hash_size), 0,
flags) == -1) {
- tdb_brunlock(tdb, ltype, FREELIST_TOP,
- tdb->hash_size * 4);
+ if (tdb_have_mutexes(tdb)) {
+ tdb_mutex_allrecord_unlock(tdb);
+ } else {
+ tdb_brunlock(tdb, ltype, FREELIST_TOP,
+ tdb->hash_size * 4);
+ }
return -1;
}
@@ -672,9 +717,25 @@ int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype, bool mark_lock)
return 0;
}
- if (!mark_lock && tdb_brunlock(tdb, ltype, FREELIST_TOP, 0)) {
- TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed (%s)\n", strerror(errno)));
- return -1;
+ if (!mark_lock) {
+ int ret;
+
+ if (tdb_have_mutexes(tdb)) {
+ ret = tdb_mutex_allrecord_unlock(tdb);
+ if (ret == 0) {
+ ret = tdb_brunlock(tdb, ltype,
+ lock_offset(tdb->hash_size),
+ 0);
+ }
+ } else {
+ ret = tdb_brunlock(tdb, ltype, FREELIST_TOP, 0);
+ }
+
+ if (ret != 0) {
+ TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed "
+ "(%s)\n", strerror(errno)));
+ return -1;
+ }
}
tdb->allrecord_lock.count = 0;
diff --git a/lib/tdb/common/mutex.c b/lib/tdb/common/mutex.c
new file mode 100644
index 00000000000..bdc4c28cb6c
--- /dev/null
+++ b/lib/tdb/common/mutex.c
@@ -0,0 +1,1000 @@
+/*
+ Unix SMB/CIFS implementation.
+
+ trivial database library
+
+ Copyright (C) Volker Lendecke 2012,2013
+ Copyright (C) Stefan Metzmacher 2013,2014
+ Copyright (C) Michael Adam 2014
+
+ ** NOTE! The following LGPL license applies to the tdb
+ ** library. This does NOT imply that all of Samba is released
+ ** under the LGPL
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "tdb_private.h"
+#include "system/threads.h"
+
+#ifdef USE_TDB_MUTEX_LOCKING
+
+/*
+ * If we run with mutexes, we store the "struct tdb_mutexes" at the
+ * beginning of the file. We store an additional tdb_header right
+ * beyond the mutex area, page aligned. All the offsets within the tdb
+ * are relative to the area behind the mutex area. tdb->map_ptr points
+ * behind the mmap area as well, so the read and write path in the
+ * mutex case can remain unchanged.
+ *
+ * Early in the mutex development the mutexes were placed between the hash
+ * chain pointers and the real tdb data. This had two drawbacks: First, it
+ * made pointer calculations more complex. Second, we had to mmap the mutex
+ * area twice. One was the normal map_ptr in the tdb. This frequently changed
+ * from within tdb_oob. At least the Linux glibc robust mutex code assumes
+ * constant pointers in memory, so a constantly changing mmap area destroys
+ * the mutex list. So we had to mmap the first bytes of the file with a second
+ * mmap call. With that scheme, very weird errors happened that could be
+ * easily fixed by doing the mutex mmap in a second file. It seemed that
+ * mapping the same memory area twice does not end up in accessing the same
+ * physical page, looking at the mutexes in gdb it seemed that old data showed
+ * up after some re-mapping. To avoid a separate mutex file, the code now puts
+ * the real content of the tdb file after the mutex area. This way we do not
+ * have overlapping mmap areas, the mutex area is mmapped once and not
+ * changed, the tdb data area's mmap is constantly changed but does not
+ * overlap.
+ */
+
+struct tdb_mutexes {
+ struct tdb_header hdr; /* first member: covers the start of the mapping made at file offset 0 by tdb_mutex_mmap() */
+
+ /*
+ * Protects allrecord_lock. tdb_mutex_allrecord_lock() returns
+ * with this mutex still locked and tdb_mutex_allrecord_unlock()
+ * releases it again.
+ */
+ pthread_mutex_t allrecord_mutex;
+
+ /*
+ * F_UNLCK: free,
+ * F_RDLCK: shared,
+ * F_WRLCK: exclusive
+ */
+ short int allrecord_lock;
+
+ /*
+ * Index 0 is the freelist mutex, followed by
+ * one mutex per hashchain.
+ *
+ * Only one element is declared here; tdb_mutex_size() adds room
+ * for hash_size further mutexes behind the struct.
+ */
+ pthread_mutex_t hashchains[1];
+};
+
+/*
+ * Tell whether TDB_FEATURE_FLAG_MUTEX is set in tdb->feature_flags.
+ */
+bool tdb_have_mutexes(struct tdb_context *tdb)
+{
+	if (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) {
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Size in bytes of the mutex area for this tdb, rounded with TDB_ALIGN()
+ * to tdb->page_size. Returns 0 if the tdb does not use mutexes.
+ */
+size_t tdb_mutex_size(struct tdb_context *tdb)
+{
+	size_t raw_size;
+
+	if (!tdb_have_mutexes(tdb)) {
+		return 0;
+	}
+
+	/*
+	 * struct tdb_mutexes already holds hashchains[0]; add one more
+	 * mutex for each of the hash_size hash chains.
+	 */
+	raw_size = sizeof(struct tdb_mutexes) +
+		   tdb->hash_size * sizeof(pthread_mutex_t);
+
+	return TDB_ALIGN(raw_size, tdb->page_size);
+}
+
+/*
+ * Map an fcntl-style lock request (off, len) to an index into the
+ * hashchains[] mutex array.
+ *
+ * Returns true and stores the index in *idx (0 for the freelist, 1..n for
+ * a hash chain) when the request is a freelist or hash chain lock.
+ * Returns false and leaves *idx untouched for every other request.
+ */
+static bool tdb_mutex_index(struct tdb_context *tdb, off_t off, off_t len,
+			    unsigned *idx)
+{
+	/*
+	 * The fcntl lock for a chain is a single byte placed 4 bytes in
+	 * front of the 4 bytes of the freelist start or hash chain being
+	 * locked: lock_offset() uses -1 for the freelist while
+	 * TDB_HASH_TOP() has a "+1". The mutex array lives in the tdb
+	 * file itself as data, so the offset has to be adjusted here.
+	 */
+	const off_t freelist_lock_ofs = FREELIST_TOP - sizeof(tdb_off_t);
+	off_t slot;
+
+	if (!tdb_have_mutexes(tdb)) {
+		return false;
+	}
+
+	/*
+	 * Anything matching one of these is not a chain lock:
+	 *  - len != 1: possibly the allrecord lock
+	 *  - off below the freelist lock: one of the special locks
+	 *  - hash_size == 0: tdb not initialized yet, called from
+	 *    tdb_open_ex()
+	 *  - off in the data area: single record lock from traverses
+	 */
+	if ((len != 1) ||
+	    (off < freelist_lock_ofs) ||
+	    (tdb->hash_size == 0) ||
+	    (off >= TDB_DATA_START(tdb->hash_size))) {
+		return false;
+	}
+
+	/*
+	 * What is left is a freelist or hash chain lock. Those are always
+	 * 4 byte aligned. Paranoia check.
+	 */
+	if ((off % sizeof(tdb_off_t)) != 0) {
+		abort();
+	}
+
+	/*
+	 * Turn the fcntl offset into a position in the mutex array:
+	 * 0 for the freelist, 1-n for the hash chains.
+	 */
+	slot = off - freelist_lock_ofs;
+	slot /= sizeof(tdb_off_t);
+
+	*idx = slot;
+	return true;
+}
+
+/*
+ * Check whether any entry in tdb->lockrecs maps to a hash chain mutex.
+ * The freelist mutex (index 0) does not count.
+ *
+ * NOTE(review): the lockrec "count" field is passed as the "len"
+ * argument of tdb_mutex_index(), which rejects anything but 1. If count
+ * is a nesting counter, a chain locked more than once would be ignored
+ * here - confirm against the lockrecs definition.
+ */
+static bool tdb_have_mutex_chainlocks(struct tdb_context *tdb)
+{
+	size_t n;
+
+	for (n = 0; n < tdb->num_lockrecs; n++) {
+		unsigned chain_idx;
+
+		if (!tdb_mutex_index(tdb,
+				     tdb->lockrecs[n].off,
+				     tdb->lockrecs[n].count,
+				     &chain_idx)) {
+			/* not a freelist or hash chain lock */
+			continue;
+		}
+
+		if (chain_idx != 0) {
+			/* a real hash chain, not the freelist */
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Lock one chain mutex, blocking if waitflag is set and using trylock
+ * otherwise. If the previous owner died (EOWNERDEAD) the mutex is marked
+ * consistent and the result of pthread_mutex_consistent() is returned.
+ * Any other result of the lock call is returned unchanged.
+ */
+static int chain_mutex_lock(pthread_mutex_t *m, bool waitflag)
+{
+	int rc = waitflag ? pthread_mutex_lock(m) : pthread_mutex_trylock(m);
+
+	if (rc == EOWNERDEAD) {
+		/*
+		 * For chainlocks, we don't do any cleanup (yet?)
+		 */
+		return pthread_mutex_consistent(m);
+	}
+
+	return rc;
+}
+
+/*
+ * Lock m->allrecord_mutex, blocking if waitflag is set and using trylock
+ * otherwise. If the previous owner died (EOWNERDEAD), allrecord_lock is
+ * reset to F_UNLCK, the mutex is marked consistent and the result of
+ * pthread_mutex_consistent() is returned. Any other result of the lock
+ * call is returned unchanged.
+ */
+static int allrecord_mutex_lock(struct tdb_mutexes *m, bool waitflag)
+{
+	int rc;
+
+	rc = waitflag ? pthread_mutex_lock(&m->allrecord_mutex)
+		      : pthread_mutex_trylock(&m->allrecord_mutex);
+
+	if (rc == EOWNERDEAD) {
+		/*
+		 * The allrecord lock holder died. We need to reset the
+		 * allrecord_lock to F_UNLCK. This should also be the
+		 * indication for tdb_needs_recovery.
+		 */
+		m->allrecord_lock = F_UNLCK;
+
+		return pthread_mutex_consistent(&m->allrecord_mutex);
+	}
+
+	return rc;
+}
+
+/*
+ * Mutex replacement for an fcntl lock on the freelist or on a hash chain.
+ *
+ * Returns false if (off, len) is not a freelist/hash chain lock (see
+ * tdb_mutex_index()); *pret is left untouched and the caller carries on
+ * with the fcntl lock.
+ *
+ * Returns true if the request was handled here. *pret is then 0 on
+ * success, or -1 on failure with errno set. EBUSY from a trylock is
+ * reported as EAGAIN.
+ *
+ * rw is only looked at to decide whether an allrecord lock held in
+ * F_RDLCK mode is compatible with this request.
+ */
+bool tdb_mutex_lock(struct tdb_context *tdb, int rw, off_t off, off_t len,
+		    bool waitflag, int *pret)
+{
+	struct tdb_mutexes *m = tdb->mutexes;
+	pthread_mutex_t *chain;
+	int ret;
+	unsigned idx;
+	bool allrecord_ok;
+
+	if (!tdb_mutex_index(tdb, off, len, &idx)) {
+		return false;
+	}
+	chain = &m->hashchains[idx];
+
+again:
+	ret = chain_mutex_lock(chain, waitflag);
+	if (ret == EBUSY) {
+		ret = EAGAIN;
+	}
+	if (ret != 0) {
+		errno = ret;
+		goto fail;
+	}
+
+	if (idx == 0) {
+		/*
+		 * This is a freelist lock, which is independent to
+		 * the allrecord lock. So we're done once we got the
+		 * freelist mutex.
+		 */
+		*pret = 0;
+		return true;
+	}
+
+	if (tdb_have_mutex_chainlocks(tdb)) {
+		/*
+		 * We can only check the allrecord lock once. If we do it with
+		 * one chain mutex locked, we will deadlock with the allrecord
+		 * locker process in the following way: We lock the first hash
+		 * chain, we check for the allrecord lock. We keep the hash
+		 * chain locked. Then the allrecord locker locks the
+		 * allrecord_mutex. It walks the list of chain mutexes,
+		 * locking them all in sequence. Meanwhile, we have the chain
+		 * mutex locked, so the allrecord locker blocks trying to lock
+		 * our chain mutex. Then we come in and try to lock the second
+		 * chain lock, which in most cases will be the freelist. We
+		 * see that the allrecord lock is locked and put ourselves on
+		 * the allrecord_mutex. This will never be signalled though
+		 * because the allrecord locker waits for us to give up the
+		 * chain lock.
+		 */
+
+		*pret = 0;
+		return true;
+	}
+
+	/*
+	 * Check if someone has the allrecord lock: queue if so.
+	 */
+
+	allrecord_ok = false;
+
+	if (m->allrecord_lock == F_UNLCK) {
+		/*
+		 * allrecord lock not taken
+		 */
+		allrecord_ok = true;
+	}
+
+	if ((m->allrecord_lock == F_RDLCK) && (rw == F_RDLCK)) {
+		/*
+		 * allrecord shared lock taken, but we only want to read
+		 */
+		allrecord_ok = true;
+	}
+
+	if (allrecord_ok) {
+		*pret = 0;
+		return true;
+	}
+
+	/*
+	 * Somebody holds a conflicting allrecord lock. Give the chain
+	 * mutex back, wait on the allrecord_mutex until the holder is
+	 * done, release it straight away and start over.
+	 */
+	ret = pthread_mutex_unlock(chain);
+	if (ret != 0) {
+		TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
+			 "(chain_mutex) failed: %s\n", strerror(ret)));
+		errno = ret;
+		goto fail;
+	}
+	ret = allrecord_mutex_lock(m, waitflag);
+	if (ret == EBUSY) {
+		ret = EAGAIN;
+	}
+	if (ret != 0) {
+		/* a busy trylock is an expected outcome, do not log it */
+		if (waitflag || (ret != EAGAIN)) {
+			/* prints pthread_mutex_lock or pthread_mutex_trylock */
+			TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_%slock"
+				 "(allrecord_mutex) failed: %s\n",
+				 waitflag ? "" : "try", strerror(ret)));
+		}
+		errno = ret;
+		goto fail;
+	}
+	ret = pthread_mutex_unlock(&m->allrecord_mutex);
+	if (ret != 0) {
+		TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
+			 "(allrecord_mutex) failed: %s\n", strerror(ret)));
+		errno = ret;
+		goto fail;
+	}
+	goto again;
+
+fail:
+	*pret = -1;
+	return true;
+}
+
+/*
+ * Mutex replacement for releasing an fcntl lock on the freelist or on a
+ * hash chain.
+ *
+ * Returns false if (off, len) is not a freelist/hash chain lock (see
+ * tdb_mutex_index()); *pret is then left untouched. Otherwise returns
+ * true with *pret set to 0 on success, or to -1 with errno set to the
+ * pthread_mutex_unlock() error. rw is not used.
+ */
+bool tdb_mutex_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len,
+		      int *pret)
+{
+	struct tdb_mutexes *m = tdb->mutexes;
+	unsigned idx;
+	int rc;
+
+	if (!tdb_mutex_index(tdb, off, len, &idx)) {
+		return false;
+	}
+
+	rc = pthread_mutex_unlock(&m->hashchains[idx]);
+	if (rc != 0) {
+		errno = rc;
+		*pret = -1;
+		return true;
+	}
+
+	*pret = 0;
+	return true;
+}
+
+/*
+ * Take the allrecord lock through the mutexes.
+ *
+ * Locks allrecord_mutex, stores the lock type in m->allrecord_lock and
+ * then locks and immediately unlocks every hash chain mutex once. The
+ * freelist mutex (hashchains[0]) is not touched.
+ *
+ * Returns 0 right away for TDB_NOLOCK and for TDB_LOCK_MARK_ONLY.
+ * Otherwise returns 0 on success with allrecord_mutex still locked, or
+ * -1 on failure with tdb->ecode set to TDB_ERR_LOCK. On a mutex failure
+ * errno carries the error; a busy trylock is reported as EAGAIN.
+ */
+int tdb_mutex_allrecord_lock(struct tdb_context *tdb, int ltype,
+			     enum tdb_lock_flags flags)
+{
+	struct tdb_mutexes *m = tdb->mutexes;
+	int ret;
+	uint32_t i;
+	bool waitflag = (flags & TDB_LOCK_WAIT);
+	int saved_errno;
+
+	if (tdb->flags & TDB_NOLOCK) {
+		return 0;
+	}
+
+	if (flags & TDB_LOCK_MARK_ONLY) {
+		return 0;
+	}
+
+	ret = allrecord_mutex_lock(m, waitflag);
+	if (!waitflag && (ret == EBUSY)) {
+		errno = EAGAIN;
+		tdb->ecode = TDB_ERR_LOCK;
+		return -1;
+	}
+	if (ret != 0) {
+		if (!(flags & TDB_LOCK_PROBE)) {
+			TDB_LOG((tdb, TDB_DEBUG_TRACE,
+				 "allrecord_mutex_lock() failed: %s\n",
+				 strerror(ret)));
+		}
+		/* report the mutex error like the chain loop below does */
+		errno = ret;
+		tdb->ecode = TDB_ERR_LOCK;
+		return -1;
+	}
+
+	/*
+	 * We hold allrecord_mutex now, so nobody else may have the
+	 * allrecord lock marked as taken.
+	 */
+	if (m->allrecord_lock != F_UNLCK) {
+		TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
+			 (int)m->allrecord_lock));
+		goto fail_unlock_allrecord_mutex;
+	}
+	m->allrecord_lock = (ltype == F_RDLCK) ? F_RDLCK : F_WRLCK;
+
+	for (i=0; i<tdb->hash_size; i++) {
+
+		/* ignore hashchains[0], the freelist */
+		pthread_mutex_t *chain = &m->hashchains[i+1];
+
+		ret = chain_mutex_lock(chain, waitflag);
+		if (!waitflag && (ret == EBUSY)) {
+			errno = EAGAIN;
+			goto fail_unroll_allrecord_lock;
+		}
+		if (ret != 0) {
+			if (!(flags & TDB_LOCK_PROBE)) {
+				TDB_LOG((tdb, TDB_DEBUG_TRACE,
+					 "chain_mutex_lock() failed: %s\n",
+					 strerror(ret)));
+			}
+			errno = ret;
+			goto fail_unroll_allrecord_lock;
+		}
+
+		ret = pthread_mutex_unlock(chain);
+		if (ret != 0) {
+			TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
+				 "(chainlock) failed: %s\n", strerror(ret)));
+			errno = ret;
+			goto fail_unroll_allrecord_lock;
+		}
+	}
+	/*
+	 * We leave this routine with m->allrecord_mutex locked
+	 */
+	return 0;
+
+fail_unroll_allrecord_lock:
+	m->allrecord_lock = F_UNLCK;
+
+fail_unlock_allrecord_mutex:
+	/* keep errno intact across the unlock and the logging */
+	saved_errno = errno;
+	ret = pthread_mutex_unlock(&m->allrecord_mutex);
+	if (ret != 0) {
+		TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
+			 "(allrecord_mutex) failed: %s\n", strerror(ret)));
+	}
+	errno = saved_errno;
+	tdb->ecode = TDB_ERR_LOCK;
+	return -1;
+}
+
+/*
+ * Turn a held F_RDLCK allrecord lock into F_WRLCK.
+ *
+ * Marks allrecord_lock as F_WRLCK and then locks and immediately unlocks
+ * every hash chain mutex once (blocking). The freelist mutex
+ * (hashchains[0]) is not touched.
+ *
+ * Returns 0 on success and for TDB_NOLOCK. Returns -1 with tdb->ecode set
+ * to TDB_ERR_LOCK if allrecord_lock is not F_RDLCK or a chain mutex
+ * operation fails; in the latter case allrecord_lock is put back to
+ * F_RDLCK.
+ */
+int tdb_mutex_allrecord_upgrade(struct tdb_context *tdb)
+{
+	struct tdb_mutexes *m = tdb->mutexes;
+	int rc = 0;
+	uint32_t n;
+
+	if (tdb->flags & TDB_NOLOCK) {
+		return 0;
+	}
+
+	/*
+	 * Our only caller tdb_allrecord_upgrade()
+	 * guarantees that we already own the allrecord lock.
+	 *
+	 * Which means m->allrecord_mutex is still locked by us.
+	 */
+
+	if (m->allrecord_lock != F_RDLCK) {
+		tdb->ecode = TDB_ERR_LOCK;
+		TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
+			 (int)m->allrecord_lock));
+		return -1;
+	}
+
+	m->allrecord_lock = F_WRLCK;
+
+	for (n = 0; n < tdb->hash_size; n++) {
+		/* hashchains[0] is the freelist, skip it */
+		pthread_mutex_t *chain = &m->hashchains[n+1];
+
+		rc = chain_mutex_lock(chain, true);
+		if (rc != 0) {
+			TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_lock"
+				 "(chainlock) failed: %s\n", strerror(rc)));
+			break;
+		}
+
+		rc = pthread_mutex_unlock(chain);
+		if (rc != 0) {
+			TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
+				 "(chainlock) failed: %s\n", strerror(rc)));
+			break;
+		}
+	}
+
+	if (rc != 0) {
+		/* roll back to the shared lock we started with */
+		m->allrecord_lock = F_RDLCK;
+		tdb->ecode = TDB_ERR_LOCK;
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Put allrecord_lock back from F_WRLCK to F_RDLCK. If it is not F_WRLCK
+ * the value is logged and left alone.
+ */
+void tdb_mutex_allrecord_downgrade(struct tdb_context *tdb)
+{
+	struct tdb_mutexes *m = tdb->mutexes;
+
+	/*
+	 * Our only caller tdb_allrecord_upgrade() (in the error case)
+	 * guarantees that we already own the allrecord lock.
+	 *
+	 * Which means m->allrecord_mutex is still locked by us.
+	 */
+
+	if (m->allrecord_lock == F_WRLCK) {
+		m->allrecord_lock = F_RDLCK;
+		return;
+	}
+
+	TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
+		 (int)m->allrecord_lock));
+}
+
+
+/*
+ * Drop the allrecord lock: reset allrecord_lock to F_UNLCK and unlock
+ * allrecord_mutex.
+ *
+ * Returns 0 on success and for TDB_NOLOCK. Returns -1 if allrecord_lock
+ * is neither F_RDLCK nor F_WRLCK, or if the mutex unlock fails; in the
+ * latter case the previous allrecord_lock value is put back.
+ */
+int tdb_mutex_allrecord_unlock(struct tdb_context *tdb)
+{
+	struct tdb_mutexes *m = tdb->mutexes;
+	short prev;
+	int rc;
+
+	if (tdb->flags & TDB_NOLOCK) {
+		return 0;
+	}
+
+	/*
+	 * Our only callers tdb_allrecord_unlock() and
+	 * tdb_allrecord_lock() (in the error path)
+	 * guarantee that we already own the allrecord lock.
+	 *
+	 * Which means m->allrecord_mutex is still locked by us.
+	 */
+
+	prev = m->allrecord_lock;
+	if ((prev != F_RDLCK) && (prev != F_WRLCK)) {
+		TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
+			 (int)m->allrecord_lock));
+		return -1;
+	}
+
+	m->allrecord_lock = F_UNLCK;
+
+	rc = pthread_mutex_unlock(&m->allrecord_mutex);
+	if (rc == 0) {
+		return 0;
+	}
+
+	/* still locked: restore the marker */
+	m->allrecord_lock = prev;
+	TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
+		 "(allrecord_mutex) failed: %s\n", strerror(rc)));
+	return -1;
+}
+
+/*
+ * Initialize the mutex area of a tdb file.
+ *
+ * Maps the mutex area with tdb_mutex_mmap(), initializes the freelist
+ * mutex, all hash chain mutexes and the allrecord mutex as error
+ * checking, process shared, robust mutexes, and sets allrecord_lock to
+ * F_UNLCK.
+ *
+ * The mapping is released again with tdb_mutex_munmap() before
+ * returning, on success as well as on failure, so it has to be mapped
+ * again before the mutexes are used.
+ *
+ * Returns 0 on success. Returns -1 if the mapping fails, or -1 with
+ * errno set to the pthread error if a pthread call fails.
+ */
+int tdb_mutex_init(struct tdb_context *tdb)
+{
+	struct tdb_mutexes *m;
+	pthread_mutexattr_t ma;
+	uint32_t i;
+	int ret;
+
+	ret = tdb_mutex_mmap(tdb);
+	if (ret == -1) {
+		return -1;
+	}
+	m = tdb->mutexes;
+
+	ret = pthread_mutexattr_init(&ma);
+	if (ret != 0) {
+		goto fail_munmap;
+	}
+	ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
+	if (ret != 0) {
+		goto fail;
+	}
+	ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
+	if (ret != 0) {
+		goto fail;
+	}
+	ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
+	if (ret != 0) {
+		goto fail;
+	}
+
+	/* hashchains[0] is the freelist, followed by hash_size chains */
+	for (i=0; i<tdb->hash_size+1; i++) {
+		pthread_mutex_t *chain = &m->hashchains[i];
+
+		ret = pthread_mutex_init(chain, &ma);
+		if (ret != 0) {
+			goto fail;
+		}
+	}
+
+	m->allrecord_lock = F_UNLCK;
+
+	ret = pthread_mutex_init(&m->allrecord_mutex, &ma);
+	if (ret != 0) {
+		goto fail;
+	}
+	ret = 0;
+fail:
+	/* the success path falls through here with ret == 0 */
+	pthread_mutexattr_destroy(&ma);
+fail_munmap:
+	tdb_mutex_munmap(tdb);
+
+	if (ret == 0) {
+		return 0;
+	}
+
+	errno = ret;
+	return -1;
+}
+
+/*
+ * Map the mutex area, tdb_mutex_size() bytes starting at offset 0 of
+ * tdb->fd, shared and read/write, and store the address in tdb->mutexes.
+ *
+ * Returns 0 on success, and also when tdb_mutex_size() is 0, in which
+ * case nothing is mapped. Returns -1 if mmap() fails.
+ */
+int tdb_mutex_mmap(struct tdb_context *tdb)
+{
+	size_t area_len = tdb_mutex_size(tdb);
+	void *area;
+
+	if (area_len == 0) {
+		return 0;
+	}
+
+	area = mmap(NULL, area_len, PROT_READ|PROT_WRITE,
+		    MAP_SHARED|MAP_FILE, tdb->fd, 0);
+	if (area == MAP_FAILED) {
+		return -1;
+	}
+
+	tdb->mutexes = (struct tdb_mutexes *)area;
+	return 0;
+}
+
+/*
+ * Unmap the mutex area, tdb_mutex_size() bytes at tdb->mutexes.
+ *
+ * Returns 0 on success, and also when tdb_mutex_size() is 0, in which
+ * case nothing is unmapped. Returns -1 if munmap() fails; tdb->mutexes
+ * is then left as it was.
+ */
+int tdb_mutex_munmap(struct tdb_context *tdb)
+{
+	size_t len;
+	int ret;
+
+	len = tdb_mutex_size(tdb);
+	if (len == 0) {
+		return 0;
+	}
+
+	ret = munmap(tdb->mutexes, len);
+	if (ret == -1) {
+		return -1;
+	}
+
+	/* do not keep a pointer into memory that is no longer mapped */
+	tdb->mutexes = NULL;
+
+	return 0;
+}
+
+/* Result cache shared by the mutex capability checks in this file. */
+static bool tdb_mutex_locking_cached;
+
+/*
+ * Check, within this process only, that an error checking, process
+ * shared, robust mutex can be created and that locking it a second time
+ * fails with EDEADLK.
+ *
+ * The check runs once; later calls return tdb_mutex_locking_cached.
+ */
+static bool tdb_mutex_locking_supported(void)
+{
+	pthread_mutexattr_t attr;
+	pthread_mutex_t probe;
+	static bool already_run;
+
+	if (already_run) {
+		return tdb_mutex_locking_cached;
+	}
+	already_run = true;
+
+	if (pthread_mutexattr_init(&attr) != 0) {
+		return false;
+	}
+
+	if ((pthread_mutexattr_settype(&attr,
+				       PTHREAD_MUTEX_ERRORCHECK) != 0) ||
+	    (pthread_mutexattr_setpshared(&attr,
+					  PTHREAD_PROCESS_SHARED) != 0) ||
+	    (pthread_mutexattr_setrobust(&attr,
+					 PTHREAD_MUTEX_ROBUST) != 0)) {
+		goto out_attr;
+	}
+
+	if (pthread_mutex_init(&probe, &attr) != 0) {
+		goto out_attr;
+	}
+
+	if (pthread_mutex_lock(&probe) != 0) {
+		goto out_mutex;
+	}
+
+	/*
+	 * This makes sure we have real mutexes
+	 * from a threading library instead of just
+	 * stubs from libc.
+	 */
+	if (pthread_mutex_lock(&probe) != EDEADLK) {
+		goto out_unlock;
+	}
+
+	if (pthread_mutex_unlock(&probe) != 0) {
+		goto out_mutex;
+	}
+
+	tdb_mutex_locking_cached = true;
+	goto out_mutex;
+
+out_unlock:
+	pthread_mutex_unlock(&probe);
+out_mutex:
+	pthread_mutex_destroy(&probe);
+out_attr:
+	pthread_mutexattr_destroy(&attr);
+	return tdb_mutex_locking_cached;
+}
+
+/* SIGCHLD disposition that was in place before the runtime check. */
+static void (*tdb_robust_mutext_old_handler)(int) = SIG_ERR;
+/* pid of the runtime check's child process, -1 when there is none. */
+static pid_t tdb_robust_mutex_pid = -1;
+
+/*
+ * SIGCHLD handler used while the robust mutex runtime check is running.
+ *
+ * If the signal belongs to the check's own child, the child is reaped
+ * and tdb_robust_mutex_pid is reset to -1. Otherwise the signal is passed
+ * on to the previous handler, unless that was SIG_DFL, SIG_IGN or
+ * SIG_ERR.
+ */
+static void tdb_robust_mutex_handler(int sig)
+{
+	if (tdb_robust_mutex_pid != -1) {
+		int status;
+		pid_t reaped;
+
+		reaped = waitpid(tdb_robust_mutex_pid, &status, WNOHANG);
+		if (reaped == tdb_robust_mutex_pid) {
+			tdb_robust_mutex_pid = -1;
+			return;
+		}
+	}
+
+	if ((tdb_robust_mutext_old_handler == SIG_DFL) ||
+	    (tdb_robust_mutext_old_handler == SIG_IGN) ||
+	    (tdb_robust_mutext_old_handler == SIG_ERR)) {
+		/* nothing callable to forward to */
+		return;
+	}
+
+	tdb_robust_mutext_old_handler(sig);
+}
+
+/*
+ * Check at runtime that robust, process shared mutexes really work.
+ *
+ * After the in-process check tdb_mutex_locking_supported() passed, a
+ * mutex is placed in anonymous shared memory and a child is forked:
+ *
+ *  - the child locks the mutex, reports the result through pipe_up,
+ *    waits for one byte on pipe_down and then exits with the mutex
+ *    still locked
+ *  - the parent expects trylock to fail with EBUSY while the child
+ *    lives, lets the child go, waits for EOF on pipe_up and reaps the
+ *    child
+ *  - the parent then expects trylock to return EOWNERDEAD, marks the
+ *    mutex consistent, expects another trylock to fail with EDEADLK
+ *    and finally unlocks it
+ *
+ * While the child is running, SIGCHLD is handled by
+ * tdb_robust_mutex_handler(); the previous handler is put back
+ * afterwards.
+ *
+ * The check runs once per process; later calls return the cached result.
+ */
+_PUBLIC_ bool tdb_runtime_check_for_robust_mutexes(void)
+{
+	void *ptr;
+	pthread_mutex_t *m;
+	pthread_mutexattr_t ma;
+	int ret = 1;
+	int pipe_down[2] = { -1, -1 };
+	int pipe_up[2] = { -1, -1 };
+	ssize_t nread;
+	char c = 0;
+	bool ok;
+	int status;
+	static bool initialized;
+
+	if (initialized) {
+		return tdb_mutex_locking_cached;
+	}
+
+	initialized = true;
+
+	ok = tdb_mutex_locking_supported();
+	if (!ok) {
+		return false;
+	}
+
+	/* the in-process check set this to true, start over */
+	tdb_mutex_locking_cached = false;
+
+	ptr = mmap(NULL, sizeof(pthread_mutex_t), PROT_READ|PROT_WRITE,
+		   MAP_SHARED|MAP_ANON, -1 /* fd */, 0);
+	if (ptr == MAP_FAILED) {
+		return false;
+	}
+	m = (pthread_mutex_t *)ptr;
+
+	ret = pipe(pipe_down);
+	if (ret != 0) {
+		goto cleanup_mmap;
+	}
+	ret = pipe(pipe_up);
+	if (ret != 0) {
+		goto cleanup_pipe;
+	}
+
+	ret = pthread_mutexattr_init(&ma);
+	if (ret != 0) {
+		goto cleanup_pipe;
+	}
+	ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
+	if (ret != 0) {
+		goto cleanup_ma;
+	}
+	ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
+	if (ret != 0) {
+		goto cleanup_ma;
+	}
+	ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
+	if (ret != 0) {
+		goto cleanup_ma;
+	}
+	ret = pthread_mutex_init(m, &ma);
+	if (ret != 0) {
+		goto cleanup_ma;
+	}
+
+	tdb_robust_mutext_old_handler = signal(SIGCHLD,
+					       tdb_robust_mutex_handler);
+
+	tdb_robust_mutex_pid = fork();
+	if (tdb_robust_mutex_pid == 0) {
+		/*
+		 * Child. Leave with _exit() so that neither the atexit
+		 * handlers nor the stdio buffers inherited from the
+		 * parent are run or flushed a second time.
+		 */
+		ssize_t nwritten;
+		close(pipe_down[1]);
+		close(pipe_up[0]);
+		ret = pthread_mutex_lock(m);
+		nwritten = write(pipe_up[1], &ret, sizeof(ret));
+		if (nwritten != (ssize_t)sizeof(ret)) {
+			_exit(1);
+		}
+		if (ret != 0) {
+			_exit(1);
+		}
+		nread = read(pipe_down[0], &c, 1);
+		if (nread != 1) {
+			_exit(1);
+		}
+		/* leave locked */
+		_exit(0);
+	}
+	if (tdb_robust_mutex_pid == -1) {
+		goto cleanup_sig_child;
+	}
+	close(pipe_down[0]);
+	pipe_down[0] = -1;
+	close(pipe_up[1]);
+	pipe_up[1] = -1;
+
+	/* wait until the child reports the result of its lock call */
+	nread = read(pipe_up[0], &ret, sizeof(ret));
+	if (nread != sizeof(ret)) {
+		goto cleanup_child;
+	}
+
+	/* the child holds the mutex, so we must not get it */
+	ret = pthread_mutex_trylock(m);
+	if (ret != EBUSY) {
+		if (ret == 0) {
+			pthread_mutex_unlock(m);
+		}
+		goto cleanup_child;
+	}
+
+	/* let the child exit while it still holds the mutex */
+	if (write(pipe_down[1], &c, 1) != 1) {
+		goto cleanup_child;
+	}
+
+	/* EOF here means the child closed its end of pipe_up */
+	nread = read(pipe_up[0], &c, 1);
+	if (nread != 0) {
+		goto cleanup_child;
+	}
+
+	/* the SIGCHLD handler may already have reaped the child */
+	while (tdb_robust_mutex_pid > 0) {
+		pid_t pid;
+
+		errno = 0;
+		pid = waitpid(tdb_robust_mutex_pid, &status, 0);
+		if (pid == tdb_robust_mutex_pid) {
+			tdb_robust_mutex_pid = -1;
+			break;
+		}
+		if (pid == -1 && errno != EINTR) {
+			goto cleanup_child;
+		}
+	}
+	signal(SIGCHLD, tdb_robust_mutext_old_handler);
+
+	/* the owner died: a robust mutex has to tell us so */
+	ret = pthread_mutex_trylock(m);
+	if (ret != EOWNERDEAD) {
+		if (ret == 0) {
+			pthread_mutex_unlock(m);
+		}
+		goto cleanup_m;
+	}
+
+	ret = pthread_mutex_consistent(m);
+	if (ret != 0) {
+		goto cleanup_m;
+	}
+
+	/* we own the mutex now, so another trylock must report EDEADLK */
+	ret = pthread_mutex_trylock(m);
+	if (ret != EDEADLK) {
+		pthread_mutex_unlock(m);
+		goto cleanup_m;
+	}
+
+	ret = pthread_mutex_unlock(m);
+	if (ret != 0) {
+		goto cleanup_m;
+	}
+
+	tdb_mutex_locking_cached = true;
+	goto cleanup_m;
+
+cleanup_child:
+	while (tdb_robust_mutex_pid > 0) {
+		pid_t pid;
+
+		kill(tdb_robust_mutex_pid, SIGKILL);
+
+		errno = 0;
+		pid = waitpid(tdb_robust_mutex_pid, &status, 0);
+		if (pid == tdb_robust_mutex_pid) {
+			tdb_robust_mutex_pid = -1;
+			break;
+		}
+		if (pid == -1 && errno != EINTR) {
+			break;
+		}
+	}
+cleanup_sig_child:
+	signal(SIGCHLD, tdb_robust_mutext_old_handler);
+cleanup_m:
+	pthread_mutex_destroy(m);
+cleanup_ma:
+	pthread_mutexattr_destroy(&ma);
+cleanup_pipe:
+	/* -1 marks descriptors that were never opened or already closed */
+	if (pipe_down[0] != -1) {
+		close(pipe_down[0]);
+	}
+	if (pipe_down[1] != -1) {
+		close(pipe_down[1]);
+	}
+	if (pipe_up[0] != -1) {
+		close(pipe_up[0]);
+	}
+	if (pipe_up[1] != -1) {
+		close(pipe_up[1]);
+	}
+cleanup_mmap:
+	munmap(ptr, sizeof(pthread_mutex_t));
+
+	return tdb_mutex_locking_cached;
+}
+
+#else /* !USE_TDB_MUTEX_LOCKING
+ *
+ * Built without mutex locking: the functions below are stand-ins that
+ * report "no mutexes" or fail.
+ */
+
+size_t tdb_mutex_size(struct tdb_context *tdb)
+{
+ return 0; /* there is no mutex area */
+}
+
+bool tdb_have_mutexes(struct tdb_context *tdb)
+{
+ return false;
+}
+
+int tdb_mutex_allrecord_lock(struct tdb_context *tdb, int ltype,
+ enum tdb_lock_flags flags)
+{
+ tdb->ecode = TDB_ERR_LOCK; /* always fails */
+ return -1;
+}
+
+int tdb_mutex_allrecord_unlock(struct tdb_context *tdb)
+{
+ return -1; /* always fails; ecode is left alone */
+}
+
+int tdb_mutex_allrecord_upgrade(struct tdb_context *tdb)
+{
+ tdb->ecode = TDB_ERR_LOCK; /* always fails */
+ return -1;
+}
+
+void tdb_mutex_allrecord_downgrade(struct tdb_context *tdb)
+{
+ return; /* nothing to do */
+}
+
+int tdb_mutex_mmap(struct tdb_context *tdb)
+{
+ errno = ENOSYS; /* always fails */
+ return -1;
+}
+
+int tdb_mutex_munmap(struct tdb_context *tdb)
+{
+ errno = ENOSYS; /* always fails */
+ return -1;
+}
+
+int tdb_mutex_init(struct tdb_context *tdb)
+{
+ errno = ENOSYS; /* always fails */
+ return -1;
+}
+
+_PUBLIC_ bool tdb_runtime_check_for_robust_mutexes(void)
+{
+ return false; /* robust mutexes are never reported as available */
+}
+
+#endif /* USE_TDB_MUTEX_LOCKING */
diff --git a/lib/tdb/common/open.c b/lib/tdb/common/open.c
index 162f30d4047..16a76a347fc 100644
--- a/lib/tdb/common/open.c
+++ b/lib/tdb/common/open.c
@@ -77,6 +77,15 @@ static int tdb_new_database(struct tdb_context *tdb, struct tdb_header *header,
newdb->rwlocks = TDB_HASH_RWLOCK_MAGIC;
/*
+ * We create a tdb with TDB_FEATURE_FLAG_MUTEX support,
+ * the flag combination and runtime feature checks
+ * are done by the caller already.
+ */
+ if (tdb->flags & TDB_MUTEX_LOCKING) {
+ newdb->feature_flags |= TDB_FEATURE_FLAG_MUTEX;
+ }
+
+ /*
* If we have any features we add the FEATURE_FLAG_MAGIC, overwriting the
* TDB_HASH_RWLOCK_MAGIC above.
*/
@@ -87,8 +96,11 @@ static int tdb_new_database(struct tdb_context *tdb, struct tdb_header *header,
/*
* It's required for some following code pathes
* to have the fields on 'tdb' up-to-date.
+ *
+ * E.g. tdb_mutex_size() requires it
*/
tdb->feature_flags = newdb->feature_flags;
+ tdb->hash_size = newdb->hash_size;
if (tdb->flags & TDB_INTERNAL) {
tdb->map_size = size;
@@ -104,6 +116,11 @@ static int tdb_new_database(struct tdb_context *tdb, struct tdb_header *header,
if (ftruncate(tdb->fd, 0) == -1)
goto fail;
+ if (newdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) {
+ newdb->mutex_size = tdb_mutex_size(tdb);
+ tdb->hdr_ofs = newdb->mutex_size;
+ }
+
/* This creates an endian-converted header, as if read from disk */
CONVERT(*newdb);
memcpy(header, newdb, sizeof(*header));
@@ -113,6 +130,37 @@ static int tdb_new_database(struct tdb_context *tdb, struct tdb_header *header,
if (!tdb_write_all(tdb->fd, newdb, size))
goto fail;
+ if (newdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) {
+
+ /*
+ * Now we init the mutex area
+ * followed by a second header.
+ */
+
+ ret = ftruncate(
+ tdb->fd,
+ newdb->mutex_size + sizeof(struct tdb_header));
+ if (ret == -1) {
+ goto fail;
+ }
+ ret = tdb_mutex_init(tdb);
+ if (ret == -1) {
+ goto fail;
+ }
+
+ /*
+ * Write a second header behind the mutexes. That's the area
+ * that will be mmapp'ed.
+ */
+ ret = lseek(tdb->fd, newdb->mutex_size, SEEK_SET);
+ if (ret == -1) {
+ goto fail;
+ }
+ if (!tdb_write_all(tdb->fd, newdb, size)) {
+ goto fail;
+ }
+ }
+
ret = 0;
fail:
SAFE_FREE(newdb);
@@ -179,6 +227,70 @@ static bool check_header_hash(struct tdb_context *tdb,
return check_header_hash(tdb, header, false, m1, m2);
}
+/*
+ * Decide whether an existing tdb with mutexes may be opened with the
+ * flags given in tdb->flags.
+ *
+ * The ACTIVE_LOCK is probed without blocking:
+ *
+ *  - if it cannot be taken (TDB_ERR_LOCK), the local settings are
+ *    checked straight away
+ *  - otherwise it is released again; failing to release it refuses
+ *    the open. With TDB_NOLOCK the open is then allowed without
+ *    further checks
+ *
+ * The local settings check requires TDB_MUTEX_LOCKING in tdb->flags and
+ * a tdb_mutex_size() equal to header->mutex_size.
+ *
+ * Returns true if the open may proceed, false (after logging the reason)
+ * otherwise.
+ */
+static bool tdb_mutex_open_ok(struct tdb_context *tdb,
+			      const struct tdb_header *header)
+{
+	int locked;
+
+	locked = tdb_nest_lock(tdb, ACTIVE_LOCK, F_WRLCK,
+			       TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
+
+	if ((locked == -1) && (tdb->ecode == TDB_ERR_LOCK)) {
+		/*
+		 * CLEAR_IF_FIRST still active. The tdb was created on this
+		 * host, so we can assume the mutex implementation is
+		 * compatible. Important for tools like tdbdump on a still
+		 * open locking.tdb.
+		 */
+		goto check_local_settings;
+	}
+
+	/*
+	 * We got the CLEAR_IF_FIRST lock. That means the database was
+	 * potentially copied from somewhere else. The mutex implementation
+	 * might be incompatible.
+	 */
+
+	if (tdb_nest_unlock(tdb, ACTIVE_LOCK, F_WRLCK, false) == -1) {
+		/*
+		 * Should not happen
+		 */
+		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_mutex_open_ok: "
+			 "failed to release ACTIVE_LOCK on %s: %s\n",
+			 tdb->name, strerror(errno)));
+		return false;
+	}
+
+	if (tdb->flags & TDB_NOLOCK) {
+		/*
+		 * We don't look at locks, so it does not matter to have a
+		 * compatible mutex implementation. Allow the open.
+		 */
+		return true;
+	}
+
+check_local_settings:
+
+	if (!(tdb->flags & TDB_MUTEX_LOCKING)) {
+		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_mutex_open_ok[%s]: "
+			 "Can use mutexes only with "
+			 "MUTEX_LOCKING or NOLOCK\n",
+			 tdb->name));
+		return false;
+	}
+
+	if (tdb_mutex_size(tdb) != header->mutex_size) {
+		/* message ends in ".\n" so the period stays on this line */
+		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_mutex_open_ok[%s]: "
+			 "Mutex size changed from %u to %u.\n",
+			 tdb->name,
+			 (unsigned int)header->mutex_size,
+			 (unsigned int)tdb_mutex_size(tdb)));
+		return false;
+	}
+
+	return true;
+}
+
_PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
int open_flags, mode_t mode,
const struct tdb_logging_context *log_ctx,
@@ -208,6 +320,9 @@ _PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int td
if (tdb_flags & TDB_INTERNAL) {
tdb_flags |= TDB_INCOMPATIBLE_HASH;
}
+ if (tdb_flags & TDB_MUTEX_LOCKING) {
+ tdb_flags |= TDB_INCOMPATIBLE_HASH;
+ }
tdb->fd = -1;
#ifdef TDB_TRACE
@@ -296,6 +411,64 @@ _PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int td
goto fail;
}
+ if (tdb->flags & TDB_MUTEX_LOCKING) {
+ /*
+ * Here we catch bugs in the callers,
+ * the runtime check for existing tdb's comes later.
+ */
+
+ if (!(tdb->flags & TDB_CLEAR_IF_FIRST)) {
+ TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
+ "invalid flags for %s - TDB_MUTEX_LOCKING "
+ "requires TDB_CLEAR_IF_FIRST\n", name));
+ errno = EINVAL;
+ goto fail;
+ }
+
+ if (tdb->flags & TDB_INTERNAL) {
+ TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
+ "invalid flags for %s - TDB_MUTEX_LOCKING and "
+ "TDB_INTERNAL are not allowed together\n", name));
+ errno = EINVAL;
+ goto fail;
+ }
+
+ if (tdb->flags & TDB_NOMMAP) {
+ TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
+ "invalid flags for %s - TDB_MUTEX_LOCKING and "
+ "TDB_NOMMAP are not allowed together\n", name));
+ errno = EINVAL;
+ goto fail;
+ }
+
+ if (tdb->read_only) {
+ TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
+ "invalid flags for %s - TDB_MUTEX_LOCKING "
+ "not allowed read only\n", name));
+ errno = EINVAL;
+ goto fail;
+ }
+
+ /*
+ * The callers should have called
+ * tdb_runtime_check_for_robust_mutexes()
+ * before using TDB_MUTEX_LOCKING!
+ *
+ * This makes sure the caller understands
+ * that the locking may behave a bit differently
+ * than with pure fcntl locking. E.g. multiple
+ * read locks are not supported.
+ */
+ if (!tdb_runtime_check_for_robust_mutexes()) {
+ TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
+ "invalid flags for %s - TDB_MUTEX_LOCKING "
+ "requires support for robust_mutexes\n",
+ name));
+ errno = ENOSYS;
+ goto fail;
+ }
+ }
+
if (getenv("TDB_NO_FSYNC")) {
tdb->flags |= TDB_NOSYNC;
}
@@ -435,6 +608,21 @@ _PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int td
goto fail;
}
+ if (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) {
+ if (!tdb_mutex_open_ok(tdb, &header)) {
+ errno = EINVAL;
+ goto fail;
+ }
+
+ /*
+ * We need to remember the hdr_ofs in the
+ * TDB_NOLOCK case as well, where the mutex
+ * area is not mapped, e.g. if the current
+ * library doesn't support mutex locking.
+ */
+ tdb->hdr_ofs = header.mutex_size;
+ }
+
if ((header.magic1_hash == 0) && (header.magic2_hash == 0)) {
/* older TDB without magic hash references */
tdb->hash_fn = tdb_old_hash;
@@ -477,6 +665,15 @@ _PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int td
goto fail;
}
+ if (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) {
+ if (!(tdb->flags & TDB_NOLOCK)) {
+ ret = tdb_mutex_mmap(tdb);
+ if (ret != 0) {
+ goto fail;
+ }
+ }
+ }
+
if (locked) {
if (tdb_nest_unlock(tdb, ACTIVE_LOCK, F_WRLCK, false) == -1) {
TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
@@ -587,6 +784,9 @@ _PUBLIC_ int tdb_close(struct tdb_context *tdb)
else
tdb_munmap(tdb);
}
+
+ tdb_mutex_munmap(tdb);
+
SAFE_FREE(tdb->name);
if (tdb->fd != -1) {
ret = close(tdb->fd);
diff --git a/lib/tdb/common/summary.c b/lib/tdb/common/summary.c
index e9989f676f7..d786132d4a1 100644
--- a/lib/tdb/common/summary.c
+++ b/lib/tdb/common/summary.c
@@ -23,6 +23,7 @@
"Number of records: %zu\n" \
"Incompatible hash: %s\n" \
"Active/supported feature flags: 0x%08x/0x%08x\n" \
+ "Robust mutexes locking: %s\n" \
"Smallest/average/largest keys: %zu/%zu/%zu\n" \
"Smallest/average/largest data: %zu/%zu/%zu\n" \
"Smallest/average/largest padding: %zu/%zu/%zu\n" \
@@ -175,6 +176,7 @@ _PUBLIC_ char *tdb_summary(struct tdb_context *tdb)
keys.num,
(tdb->hash_fn == tdb_jenkins_hash)?"yes":"no",
(unsigned)tdb->feature_flags, TDB_SUPPORTED_FEATURE_FLAGS,
+ (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX)?"yes":"no",
keys.min, tally_mean(&keys), keys.max,
data.min, tally_mean(&data), data.max,
extra.min, tally_mean(&extra), extra.max,
diff --git a/lib/tdb/common/tdb.c b/lib/tdb/common/tdb.c
index ebd4ffe3e01..ae98c9619d1 100644
--- a/lib/tdb/common/tdb.c
+++ b/lib/tdb/common/tdb.c
@@ -723,6 +723,15 @@ _PUBLIC_ void tdb_remove_flags(struct tdb_context *tdb, unsigned flags)
return;
}
+ if ((flags & TDB_NOLOCK) &&
+ (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) &&
+ (tdb->mutexes == NULL)) {
+ tdb->ecode = TDB_ERR_LOCK;
+ TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_remove_flags: "
+ "Can not remove NOLOCK flag on mutexed databases"));
+ return;
+ }
+
if (flags & TDB_ALLOW_NESTING) {
tdb->flags |= TDB_DISALLOW_NESTING;
}
diff --git a/lib/tdb/common/tdb_private.h b/lib/tdb/common/tdb_private.h
index 4981e2cd6ac..de8d9e68fb5 100644
--- a/lib/tdb/common/tdb_private.h
+++ b/lib/tdb/common/tdb_private.h
@@ -69,7 +69,11 @@ typedef uint32_t tdb_off_t;
#define TDB_PAD_BYTE 0x42
#define TDB_PAD_U32 0x42424242
-#define TDB_SUPPORTED_FEATURE_FLAGS 0
+#define TDB_FEATURE_FLAG_MUTEX 0x00000001
+
+#define TDB_SUPPORTED_FEATURE_FLAGS ( \
+ TDB_FEATURE_FLAG_MUTEX | \
+ 0)
/* NB assumes there is a local variable called "tdb" that is the
* current context, also takes doubly-parenthesized print-style
@@ -156,7 +160,8 @@ struct tdb_header {
uint32_t magic1_hash; /* hash of TDB_MAGIC_FOOD. */
uint32_t magic2_hash; /* hash of TDB_MAGIC. */
uint32_t feature_flags;
- tdb_off_t reserved[26];
+ tdb_len_t mutex_size; /* set if TDB_FEATURE_FLAG_MUTEX is set */
+ tdb_off_t reserved[25];
};
struct tdb_lock_type {
@@ -190,6 +195,8 @@ struct tdb_methods {
int (*tdb_expand_file)(struct tdb_context *, tdb_off_t , tdb_off_t );
};
+struct tdb_mutexes;
+
struct tdb_context {
char *name; /* the name of the database */
void *map_ptr; /* where it is currently mapped */
@@ -203,7 +210,8 @@ struct tdb_context {
struct tdb_lock_type *lockrecs; /* only real locks, all with count>0 */
int lockrecs_array_length;
- tdb_off_t hdr_ofs; /* this is 0 for now */
+ tdb_off_t hdr_ofs; /* this is 0 or header.mutex_size */
+ struct tdb_mutexes *mutexes; /* mmap of the mutex area */
enum TDB_ERROR ecode; /* error code for last tdb error */
uint32_t hash_size;
@@ -300,4 +308,20 @@ bool tdb_add_off_t(tdb_off_t a, tdb_off_t b, tdb_off_t *pret);
/* tdb_off_t and tdb_len_t right now are both uint32_t */
#define tdb_add_len_t tdb_add_off_t
+
+size_t tdb_mutex_size(struct tdb_context *tdb);
+bool tdb_have_mutexes(struct tdb_context *tdb);
+int tdb_mutex_init(struct tdb_context *tdb);
+int tdb_mutex_mmap(struct tdb_context *tdb);
+int tdb_mutex_munmap(struct tdb_context *tdb);
+bool tdb_mutex_lock(struct tdb_context *tdb, int rw, off_t off, off_t len,
+ bool waitflag, int *pret);
+bool tdb_mutex_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len,
+ int *pret);
+int tdb_mutex_allrecord_lock(struct tdb_context *tdb, int ltype,
+ enum tdb_lock_flags flags);
+int tdb_mutex_allrecord_unlock(struct tdb_context *tdb);
+int tdb_mutex_allrecord_upgrade(struct tdb_context *tdb);
+void tdb_mutex_allrecord_downgrade(struct tdb_context *tdb);
+
#endif /* TDB_PRIVATE_H */
diff --git a/lib/tdb/common/transaction.c b/lib/tdb/common/transaction.c
index a2c3bbdff37..caef0bedd82 100644
--- a/lib/tdb/common/transaction.c
+++ b/lib/tdb/common/transaction.c
@@ -421,7 +421,8 @@ static int _tdb_transaction_start(struct tdb_context *tdb,
enum tdb_lock_flags lockflags)
{
/* some sanity checks */
- if (tdb->read_only || (tdb->flags & TDB_INTERNAL) || tdb->traverse_read) {
+ if (tdb->read_only || (tdb->flags & (TDB_INTERNAL|TDB_MUTEX_LOCKING))
+ || tdb->traverse_read) {
TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction on a read-only or internal db\n"));
tdb->ecode = TDB_ERR_EINVAL;
return -1;