From db5bda56bf089ec6052d92bb78f3b49f7c812e00 Mon Sep 17 00:00:00 2001 From: Volker Lendecke Date: Thu, 21 Feb 2013 16:34:32 +0100 Subject: tdb: add TDB_MUTEX_LOCKING support This adds optional support for locking based on shared robust mutexes. The caller can use the TDB_MUTEX_LOCKING flag together with TDB_CLEAR_IF_FIRST after verifying with tdb_runtime_check_for_robust_mutexes() that it's supported by the current system. The caller should be aware that using TDB_MUTEX_LOCKING implies some limitations, e.g. it's not possible to have multiple read chainlocks on a given hash chain from multiple processes. Note: that this doesn't make tdb thread safe! Pair-Programmed-With: Stefan Metzmacher Pair-Programmed-With: Michael Adam Signed-off-by: Volker Lendecke Signed-off-by: Stefan Metzmacher Signed-off-by: Michael Adam Reviewed-by: Jeremy Allison --- lib/tdb/ABI/tdb-1.3.0.sigs | 68 ++ lib/tdb/common/io.c | 3 +- lib/tdb/common/lock.c | 79 ++- lib/tdb/common/mutex.c | 1000 ++++++++++++++++++++++++++++ lib/tdb/common/open.c | 200 ++++++ lib/tdb/common/summary.c | 2 + lib/tdb/common/tdb.c | 9 + lib/tdb/common/tdb_private.h | 30 +- lib/tdb/common/transaction.c | 3 +- lib/tdb/docs/mutex.txt | 136 ++++ lib/tdb/include/tdb.h | 34 + lib/tdb/test/run-3G-file.c | 1 + lib/tdb/test/run-bad-tdb-header.c | 1 + lib/tdb/test/run-check.c | 1 + lib/tdb/test/run-corrupt.c | 1 + lib/tdb/test/run-die-during-transaction.c | 1 + lib/tdb/test/run-endian.c | 1 + lib/tdb/test/run-incompatible.c | 1 + lib/tdb/test/run-nested-transactions.c | 1 + lib/tdb/test/run-nested-traverse.c | 1 + lib/tdb/test/run-no-lock-during-traverse.c | 1 + lib/tdb/test/run-oldhash.c | 1 + lib/tdb/test/run-open-during-transaction.c | 1 + lib/tdb/test/run-readonly-check.c | 1 + lib/tdb/test/run-rescue-find_entry.c | 1 + lib/tdb/test/run-rescue.c | 1 + lib/tdb/test/run-rwlock-check.c | 1 + lib/tdb/test/run-summary.c | 1 + lib/tdb/test/run-transaction-expand.c | 1 + lib/tdb/test/run-traverse-in-transaction.c | 1 + lib/tdb/test/run-wronghash-fail.c | 1 + lib/tdb/test/run-zero-append.c | 1 + lib/tdb/test/run.c | 1 + lib/tdb/wscript | 36 +- 34 files changed, 1601 insertions(+), 21 deletions(-) create mode 100644 lib/tdb/ABI/tdb-1.3.0.sigs create mode 100644 lib/tdb/common/mutex.c create mode 100644 lib/tdb/docs/mutex.txt (limited to 'lib/tdb') diff --git a/lib/tdb/ABI/tdb-1.3.0.sigs b/lib/tdb/ABI/tdb-1.3.0.sigs new file mode 100644 index 00000000000..7d3e46987ec --- /dev/null +++ b/lib/tdb/ABI/tdb-1.3.0.sigs @@ -0,0 +1,68 @@ +tdb_add_flags: void (struct tdb_context *, unsigned int) +tdb_append: int (struct tdb_context *, TDB_DATA, TDB_DATA) +tdb_chainlock: int (struct tdb_context *, TDB_DATA) +tdb_chainlock_mark: int (struct tdb_context *, TDB_DATA) +tdb_chainlock_nonblock: int (struct tdb_context *, TDB_DATA) +tdb_chainlock_read: int (struct tdb_context *, TDB_DATA) +tdb_chainlock_unmark: int (struct tdb_context *, TDB_DATA) +tdb_chainunlock: int (struct tdb_context *, TDB_DATA) +tdb_chainunlock_read: int (struct tdb_context *, TDB_DATA) +tdb_check: int (struct tdb_context *, int (*)(TDB_DATA, TDB_DATA, void *), void *) +tdb_close: int (struct tdb_context *) +tdb_delete: int (struct tdb_context *, TDB_DATA) +tdb_dump_all: void (struct tdb_context *) +tdb_enable_seqnum: void (struct tdb_context *) +tdb_error: enum TDB_ERROR (struct tdb_context *) +tdb_errorstr: const char *(struct tdb_context *) +tdb_exists: int (struct tdb_context *, TDB_DATA) +tdb_fd: int (struct tdb_context *) +tdb_fetch: TDB_DATA (struct tdb_context *, TDB_DATA) +tdb_firstkey: 
TDB_DATA (struct tdb_context *) +tdb_freelist_size: int (struct tdb_context *) +tdb_get_flags: int (struct tdb_context *) +tdb_get_logging_private: void *(struct tdb_context *) +tdb_get_seqnum: int (struct tdb_context *) +tdb_hash_size: int (struct tdb_context *) +tdb_increment_seqnum_nonblock: void (struct tdb_context *) +tdb_jenkins_hash: unsigned int (TDB_DATA *) +tdb_lock_nonblock: int (struct tdb_context *, int, int) +tdb_lockall: int (struct tdb_context *) +tdb_lockall_mark: int (struct tdb_context *) +tdb_lockall_nonblock: int (struct tdb_context *) +tdb_lockall_read: int (struct tdb_context *) +tdb_lockall_read_nonblock: int (struct tdb_context *) +tdb_lockall_unmark: int (struct tdb_context *) +tdb_log_fn: tdb_log_func (struct tdb_context *) +tdb_map_size: size_t (struct tdb_context *) +tdb_name: const char *(struct tdb_context *) +tdb_nextkey: TDB_DATA (struct tdb_context *, TDB_DATA) +tdb_null: dptr = 0xXXXX, dsize = 0 +tdb_open: struct tdb_context *(const char *, int, int, int, mode_t) +tdb_open_ex: struct tdb_context *(const char *, int, int, int, mode_t, const struct tdb_logging_context *, tdb_hash_func) +tdb_parse_record: int (struct tdb_context *, TDB_DATA, int (*)(TDB_DATA, TDB_DATA, void *), void *) +tdb_printfreelist: int (struct tdb_context *) +tdb_remove_flags: void (struct tdb_context *, unsigned int) +tdb_reopen: int (struct tdb_context *) +tdb_reopen_all: int (int) +tdb_repack: int (struct tdb_context *) +tdb_rescue: int (struct tdb_context *, void (*)(TDB_DATA, TDB_DATA, void *), void *) +tdb_runtime_check_for_robust_mutexes: bool (void) +tdb_set_logging_function: void (struct tdb_context *, const struct tdb_logging_context *) +tdb_set_max_dead: void (struct tdb_context *, int) +tdb_setalarm_sigptr: void (struct tdb_context *, volatile sig_atomic_t *) +tdb_store: int (struct tdb_context *, TDB_DATA, TDB_DATA, int) +tdb_summary: char *(struct tdb_context *) +tdb_transaction_cancel: int (struct tdb_context *) +tdb_transaction_commit: int (struct tdb_context *) +tdb_transaction_prepare_commit: int (struct tdb_context *) +tdb_transaction_start: int (struct tdb_context *) +tdb_transaction_start_nonblock: int (struct tdb_context *) +tdb_transaction_write_lock_mark: int (struct tdb_context *) +tdb_transaction_write_lock_unmark: int (struct tdb_context *) +tdb_traverse: int (struct tdb_context *, tdb_traverse_func, void *) +tdb_traverse_read: int (struct tdb_context *, tdb_traverse_func, void *) +tdb_unlock: int (struct tdb_context *, int, int) +tdb_unlockall: int (struct tdb_context *) +tdb_unlockall_read: int (struct tdb_context *) +tdb_validate_freelist: int (struct tdb_context *, int *) +tdb_wipe_all: int (struct tdb_context *) diff --git a/lib/tdb/common/io.c b/lib/tdb/common/io.c index 07d22ccdb21..fe47d18a5a4 100644 --- a/lib/tdb/common/io.c +++ b/lib/tdb/common/io.c @@ -29,7 +29,8 @@ #include "tdb_private.h" /* - * tdb->hdr_ofs is 0 for now. + * We prepend the mutex area, so fixup offsets. See mutex.c for details. + * tdb->hdr_ofs is 0 or header.mutex_size. * * Note: that we only have the 4GB limit of tdb_off_t for * tdb->map_size. The file size on disk can be 4GB + tdb->hdr_ofs! 
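
The caller contract for the new flag (spelled out in the commit message above and in the tdb.h documentation further down in this patch) is: check tdb_runtime_check_for_robust_mutexes() at runtime first, and only then open the database with TDB_MUTEX_LOCKING combined with TDB_CLEAR_IF_FIRST. Below is a minimal sketch of that calling pattern; it is not part of the patch, the database path, hash size and record contents are made up for illustration, and it needs to be linked against tdb (-ltdb).

#include <tdb.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	int tdb_flags = TDB_CLEAR_IF_FIRST;
	struct tdb_context *tdb;
	TDB_DATA key, val;

	/*
	 * Request mutex locking only if the runtime check passes;
	 * otherwise fall back to plain fcntl locking.
	 */
	if (tdb_runtime_check_for_robust_mutexes()) {
		tdb_flags |= TDB_MUTEX_LOCKING;
	}

	/* Illustrative path and hash size. */
	tdb = tdb_open("/tmp/example.tdb", 10000, tdb_flags,
		       O_RDWR|O_CREAT, 0600);
	if (tdb == NULL) {
		perror("tdb_open");
		return 1;
	}

	key.dptr = (unsigned char *)"key";
	key.dsize = strlen("key");
	val.dptr = (unsigned char *)"value";
	val.dsize = strlen("value");

	/* The chainlock taken internally here uses a mutex if enabled. */
	if (tdb_store(tdb, key, val, TDB_REPLACE) != 0) {
		fprintf(stderr, "tdb_store: %s\n", tdb_errorstr(tdb));
	}

	tdb_close(tdb);
	return 0;
}

On systems where the runtime check fails, the sketch silently falls back to plain fcntl locking. Also note two limitations the patch itself enforces: transactions cannot be started on a database opened with TDB_MUTEX_LOCKING (see the transaction.c hunk below), and multiple read chainlocks on a given hash chain from multiple processes are not possible (see the commit message above).
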
diff --git a/lib/tdb/common/lock.c b/lib/tdb/common/lock.c index 486de797381..6644c4034e0 100644 --- a/lib/tdb/common/lock.c +++ b/lib/tdb/common/lock.c @@ -38,6 +38,15 @@ static int fcntl_lock(struct tdb_context *tdb, struct flock fl; int cmd; +#ifdef USE_TDB_MUTEX_LOCKING + { + int ret; + if (tdb_mutex_lock(tdb, rw, off, len, waitflag, &ret)) { + return ret; + } + } +#endif + fl.l_type = rw; fl.l_whence = SEEK_SET; fl.l_start = off; @@ -110,6 +119,15 @@ static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len) fclose(locks); #endif +#ifdef USE_TDB_MUTEX_LOCKING + { + int ret; + if (tdb_mutex_unlock(tdb, rw, off, len, &ret)) { + return ret; + } + } +#endif + fl.l_type = F_UNLCK; fl.l_whence = SEEK_SET; fl.l_start = off; @@ -248,13 +266,27 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb) return -1; } - ret = tdb_brlock_retry(tdb, F_WRLCK, FREELIST_TOP, 0, - TDB_LOCK_WAIT|TDB_LOCK_PROBE); + if (tdb_have_mutexes(tdb)) { + ret = tdb_mutex_allrecord_upgrade(tdb); + if (ret == -1) { + goto fail; + } + ret = tdb_brlock_retry(tdb, F_WRLCK, lock_offset(tdb->hash_size), + 0, TDB_LOCK_WAIT|TDB_LOCK_PROBE); + if (ret == -1) { + tdb_mutex_allrecord_downgrade(tdb); + } + } else { + ret = tdb_brlock_retry(tdb, F_WRLCK, FREELIST_TOP, 0, + TDB_LOCK_WAIT|TDB_LOCK_PROBE); + } + if (ret == 0) { tdb->allrecord_lock.ltype = F_WRLCK; tdb->allrecord_lock.off = 0; return 0; } +fail: TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_allrecord_upgrade failed\n")); return -1; } @@ -593,6 +625,8 @@ static int tdb_chainlock_gradual(struct tdb_context *tdb, int tdb_allrecord_lock(struct tdb_context *tdb, int ltype, enum tdb_lock_flags flags, bool upgradable) { + int ret; + switch (tdb_allrecord_check(tdb, ltype, flags, upgradable)) { case -1: return -1; @@ -607,16 +641,27 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype, * * It is (1) which cause the starvation problem, so we're only * gradual for that. */ - if (tdb_chainlock_gradual(tdb, ltype, flags, FREELIST_TOP, - tdb->hash_size * 4) == -1) { + + if (tdb_have_mutexes(tdb)) { + ret = tdb_mutex_allrecord_lock(tdb, ltype, flags); + } else { + ret = tdb_chainlock_gradual(tdb, ltype, flags, FREELIST_TOP, + tdb->hash_size * 4); + } + + if (ret == -1) { return -1; } /* Grab individual record locks. */ if (tdb_brlock(tdb, ltype, lock_offset(tdb->hash_size), 0, flags) == -1) { - tdb_brunlock(tdb, ltype, FREELIST_TOP, - tdb->hash_size * 4); + if (tdb_have_mutexes(tdb)) { + tdb_mutex_allrecord_unlock(tdb); + } else { + tdb_brunlock(tdb, ltype, FREELIST_TOP, + tdb->hash_size * 4); + } return -1; } @@ -672,9 +717,25 @@ int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype, bool mark_lock) return 0; } - if (!mark_lock && tdb_brunlock(tdb, ltype, FREELIST_TOP, 0)) { - TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed (%s)\n", strerror(errno))); - return -1; + if (!mark_lock) { + int ret; + + if (tdb_have_mutexes(tdb)) { + ret = tdb_mutex_allrecord_unlock(tdb); + if (ret == 0) { + ret = tdb_brunlock(tdb, ltype, + lock_offset(tdb->hash_size), + 0); + } + } else { + ret = tdb_brunlock(tdb, ltype, FREELIST_TOP, 0); + } + + if (ret != 0) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed " + "(%s)\n", strerror(errno))); + return -1; + } } tdb->allrecord_lock.count = 0; diff --git a/lib/tdb/common/mutex.c b/lib/tdb/common/mutex.c new file mode 100644 index 00000000000..bdc4c28cb6c --- /dev/null +++ b/lib/tdb/common/mutex.c @@ -0,0 +1,1000 @@ +/* + Unix SMB/CIFS implementation. 
+ + trivial database library + + Copyright (C) Volker Lendecke 2012,2013 + Copyright (C) Stefan Metzmacher 2013,2014 + Copyright (C) Michael Adam 2014 + + ** NOTE! The following LGPL license applies to the tdb + ** library. This does NOT imply that all of Samba is released + ** under the LGPL + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 3 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, see . +*/ +#include "tdb_private.h" +#include "system/threads.h" + +#ifdef USE_TDB_MUTEX_LOCKING + +/* + * If we run with mutexes, we store the "struct tdb_mutexes" at the + * beginning of the file. We store an additional tdb_header right + * beyond the mutex area, page aligned. All the offsets within the tdb + * are relative to the area behind the mutex area. tdb->map_ptr points + * behind the mmap area as well, so the read and write path in the + * mutex case can remain unchanged. + * + * Early in the mutex development the mutexes were placed between the hash + * chain pointers and the real tdb data. This had two drawbacks: First, it + * made pointer calculations more complex. Second, we had to mmap the mutex + * area twice. One was the normal map_ptr in the tdb. This frequently changed + * from within tdb_oob. At least the Linux glibc robust mutex code assumes + * constant pointers in memory, so a constantly changing mmap area destroys + * the mutex list. So we had to mmap the first bytes of the file with a second + * mmap call. With that scheme, very weird errors happened that could be + * easily fixed by doing the mutex mmap in a second file. It seemed that + * mapping the same memory area twice does not end up in accessing the same + * physical page, looking at the mutexes in gdb it seemed that old data showed + * up after some re-mapping. To avoid a separate mutex file, the code now puts + * the real content of the tdb file after the mutex area. This way we do not + * have overlapping mmap areas, the mutex area is mmapped once and not + * changed, the tdb data area's mmap is constantly changed but does not + * overlap. + */ + +struct tdb_mutexes { + struct tdb_header hdr; + + /* protect allrecord_lock */ + pthread_mutex_t allrecord_mutex; + + /* + * F_UNLCK: free, + * F_RDLCK: shared, + * F_WRLCK: exclusive + */ + short int allrecord_lock; + + /* + * Index 0 is the freelist mutex, followed by + * one mutex per hashchain. 
+ */ + pthread_mutex_t hashchains[1]; +}; + +bool tdb_have_mutexes(struct tdb_context *tdb) +{ + return ((tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) != 0); +} + +size_t tdb_mutex_size(struct tdb_context *tdb) +{ + size_t mutex_size; + + if (!tdb_have_mutexes(tdb)) { + return 0; + } + + mutex_size = sizeof(struct tdb_mutexes); + mutex_size += tdb->hash_size * sizeof(pthread_mutex_t); + + return TDB_ALIGN(mutex_size, tdb->page_size); +} + +/* + * Get the index for a chain mutex + */ +static bool tdb_mutex_index(struct tdb_context *tdb, off_t off, off_t len, + unsigned *idx) +{ + /* + * Weird but true: We fcntl lock 1 byte at an offset 4 bytes before + * the 4 bytes of the freelist start and the hash chain that is about + * to be locked. See lock_offset() where the freelist is -1 vs the + * "+1" in TDB_HASH_TOP(). Because the mutex array is represented in + * the tdb file itself as data, we need to adjust the offset here. + */ + const off_t freelist_lock_ofs = FREELIST_TOP - sizeof(tdb_off_t); + + if (!tdb_have_mutexes(tdb)) { + return false; + } + if (len != 1) { + /* Possibly the allrecord lock */ + return false; + } + if (off < freelist_lock_ofs) { + /* One of the special locks */ + return false; + } + if (tdb->hash_size == 0) { + /* tdb not initialized yet, called from tdb_open_ex() */ + return false; + } + if (off >= TDB_DATA_START(tdb->hash_size)) { + /* Single record lock from traverses */ + return false; + } + + /* + * Now we know it's a freelist or hash chain lock. Those are always 4 + * byte aligned. Paranoia check. + */ + if ((off % sizeof(tdb_off_t)) != 0) { + abort(); + } + + /* + * Re-index the fcntl offset into an offset into the mutex array + */ + off -= freelist_lock_ofs; /* rebase to index 0 */ + off /= sizeof(tdb_off_t); /* 0 for freelist 1-n for hashchain */ + + *idx = off; + return true; +} + +static bool tdb_have_mutex_chainlocks(struct tdb_context *tdb) +{ + size_t i; + + for (i=0; i < tdb->num_lockrecs; i++) { + bool ret; + unsigned idx; + + ret = tdb_mutex_index(tdb, + tdb->lockrecs[i].off, + tdb->lockrecs[i].count, + &idx); + if (!ret) { + continue; + } + + if (idx == 0) { + /* this is the freelist mutex */ + continue; + } + + return true; + } + + return false; +} + +static int chain_mutex_lock(pthread_mutex_t *m, bool waitflag) +{ + int ret; + + if (waitflag) { + ret = pthread_mutex_lock(m); + } else { + ret = pthread_mutex_trylock(m); + } + if (ret != EOWNERDEAD) { + return ret; + } + + /* + * For chainlocks, we don't do any cleanup (yet?) + */ + return pthread_mutex_consistent(m); +} + +static int allrecord_mutex_lock(struct tdb_mutexes *m, bool waitflag) +{ + int ret; + + if (waitflag) { + ret = pthread_mutex_lock(&m->allrecord_mutex); + } else { + ret = pthread_mutex_trylock(&m->allrecord_mutex); + } + if (ret != EOWNERDEAD) { + return ret; + } + + /* + * The allrecord lock holder died. We need to reset the allrecord_lock + * to F_UNLCK. This should also be the indication for + * tdb_needs_recovery. 
+ */ + m->allrecord_lock = F_UNLCK; + + return pthread_mutex_consistent(&m->allrecord_mutex); +} + +bool tdb_mutex_lock(struct tdb_context *tdb, int rw, off_t off, off_t len, + bool waitflag, int *pret) +{ + struct tdb_mutexes *m = tdb->mutexes; + pthread_mutex_t *chain; + int ret; + unsigned idx; + bool allrecord_ok; + + if (!tdb_mutex_index(tdb, off, len, &idx)) { + return false; + } + chain = &m->hashchains[idx]; + +again: + ret = chain_mutex_lock(chain, waitflag); + if (ret == EBUSY) { + ret = EAGAIN; + } + if (ret != 0) { + errno = ret; + goto fail; + } + + if (idx == 0) { + /* + * This is a freelist lock, which is independent to + * the allrecord lock. So we're done once we got the + * freelist mutex. + */ + *pret = 0; + return true; + } + + if (tdb_have_mutex_chainlocks(tdb)) { + /* + * We can only check the allrecord lock once. If we do it with + * one chain mutex locked, we will deadlock with the allrecord + * locker process in the following way: We lock the first hash + * chain, we check for the allrecord lock. We keep the hash + * chain locked. Then the allrecord locker locks the + * allrecord_mutex. It walks the list of chain mutexes, + * locking them all in sequence. Meanwhile, we have the chain + * mutex locked, so the allrecord locker blocks trying to lock + * our chain mutex. Then we come in and try to lock the second + * chain lock, which in most cases will be the freelist. We + * see that the allrecord lock is locked and put ourselves on + * the allrecord_mutex. This will never be signalled though + * because the allrecord locker waits for us to give up the + * chain lock. + */ + + *pret = 0; + return true; + } + + /* + * Check if someone is has the allrecord lock: queue if so. + */ + + allrecord_ok = false; + + if (m->allrecord_lock == F_UNLCK) { + /* + * allrecord lock not taken + */ + allrecord_ok = true; + } + + if ((m->allrecord_lock == F_RDLCK) && (rw == F_RDLCK)) { + /* + * allrecord shared lock taken, but we only want to read + */ + allrecord_ok = true; + } + + if (allrecord_ok) { + *pret = 0; + return true; + } + + ret = pthread_mutex_unlock(chain); + if (ret != 0) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock" + "(chain_mutex) failed: %s\n", strerror(ret))); + errno = ret; + goto fail; + } + ret = allrecord_mutex_lock(m, waitflag); + if (ret == EBUSY) { + ret = EAGAIN; + } + if (ret != 0) { + if (waitflag || (ret != EAGAIN)) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_%slock" + "(allrecord_mutex) failed: %s\n", + waitflag ? 
"" : "try_", strerror(ret))); + } + errno = ret; + goto fail; + } + ret = pthread_mutex_unlock(&m->allrecord_mutex); + if (ret != 0) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock" + "(allrecord_mutex) failed: %s\n", strerror(ret))); + errno = ret; + goto fail; + } + goto again; + +fail: + *pret = -1; + return true; +} + +bool tdb_mutex_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len, + int *pret) +{ + struct tdb_mutexes *m = tdb->mutexes; + pthread_mutex_t *chain; + int ret; + unsigned idx; + + if (!tdb_mutex_index(tdb, off, len, &idx)) { + return false; + } + chain = &m->hashchains[idx]; + + ret = pthread_mutex_unlock(chain); + if (ret == 0) { + *pret = 0; + return true; + } + errno = ret; + *pret = -1; + return true; +} + +int tdb_mutex_allrecord_lock(struct tdb_context *tdb, int ltype, + enum tdb_lock_flags flags) +{ + struct tdb_mutexes *m = tdb->mutexes; + int ret; + uint32_t i; + bool waitflag = (flags & TDB_LOCK_WAIT); + int saved_errno; + + if (tdb->flags & TDB_NOLOCK) { + return 0; + } + + if (flags & TDB_LOCK_MARK_ONLY) { + return 0; + } + + ret = allrecord_mutex_lock(m, waitflag); + if (!waitflag && (ret == EBUSY)) { + errno = EAGAIN; + tdb->ecode = TDB_ERR_LOCK; + return -1; + } + if (ret != 0) { + if (!(flags & TDB_LOCK_PROBE)) { + TDB_LOG((tdb, TDB_DEBUG_TRACE, + "allrecord_mutex_lock() failed: %s\n", + strerror(ret))); + } + tdb->ecode = TDB_ERR_LOCK; + return -1; + } + + if (m->allrecord_lock != F_UNLCK) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n", + (int)m->allrecord_lock)); + goto fail_unlock_allrecord_mutex; + } + m->allrecord_lock = (ltype == F_RDLCK) ? F_RDLCK : F_WRLCK; + + for (i=0; ihash_size; i++) { + + /* ignore hashchains[0], the freelist */ + pthread_mutex_t *chain = &m->hashchains[i+1]; + + ret = chain_mutex_lock(chain, waitflag); + if (!waitflag && (ret == EBUSY)) { + errno = EAGAIN; + goto fail_unroll_allrecord_lock; + } + if (ret != 0) { + if (!(flags & TDB_LOCK_PROBE)) { + TDB_LOG((tdb, TDB_DEBUG_TRACE, + "chain_mutex_lock() failed: %s\n", + strerror(ret))); + } + errno = ret; + goto fail_unroll_allrecord_lock; + } + + ret = pthread_mutex_unlock(chain); + if (ret != 0) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock" + "(chainlock) failed: %s\n", strerror(ret))); + errno = ret; + goto fail_unroll_allrecord_lock; + } + } + /* + * We leave this routine with m->allrecord_mutex locked + */ + return 0; + +fail_unroll_allrecord_lock: + m->allrecord_lock = F_UNLCK; + +fail_unlock_allrecord_mutex: + saved_errno = errno; + ret = pthread_mutex_unlock(&m->allrecord_mutex); + if (ret != 0) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock" + "(allrecord_mutex) failed: %s\n", strerror(ret))); + } + errno = saved_errno; + tdb->ecode = TDB_ERR_LOCK; + return -1; +} + +int tdb_mutex_allrecord_upgrade(struct tdb_context *tdb) +{ + struct tdb_mutexes *m = tdb->mutexes; + int ret; + uint32_t i; + + if (tdb->flags & TDB_NOLOCK) { + return 0; + } + + /* + * Our only caller tdb_allrecord_upgrade() + * garantees that we already own the allrecord lock. + * + * Which means m->allrecord_mutex is still locked by us. 
+ */ + + if (m->allrecord_lock != F_RDLCK) { + tdb->ecode = TDB_ERR_LOCK; + TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n", + (int)m->allrecord_lock)); + return -1; + } + + m->allrecord_lock = F_WRLCK; + + for (i=0; ihash_size; i++) { + + /* ignore hashchains[0], the freelist */ + pthread_mutex_t *chain = &m->hashchains[i+1]; + + ret = chain_mutex_lock(chain, true); + if (ret != 0) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_lock" + "(chainlock) failed: %s\n", strerror(ret))); + goto fail_unroll_allrecord_lock; + } + + ret = pthread_mutex_unlock(chain); + if (ret != 0) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock" + "(chainlock) failed: %s\n", strerror(ret))); + goto fail_unroll_allrecord_lock; + } + } + + return 0; + +fail_unroll_allrecord_lock: + m->allrecord_lock = F_RDLCK; + tdb->ecode = TDB_ERR_LOCK; + return -1; +} + +void tdb_mutex_allrecord_downgrade(struct tdb_context *tdb) +{ + struct tdb_mutexes *m = tdb->mutexes; + + /* + * Our only caller tdb_allrecord_upgrade() (in the error case) + * garantees that we already own the allrecord lock. + * + * Which means m->allrecord_mutex is still locked by us. + */ + + if (m->allrecord_lock != F_WRLCK) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n", + (int)m->allrecord_lock)); + return; + } + + m->allrecord_lock = F_RDLCK; + return; +} + + +int tdb_mutex_allrecord_unlock(struct tdb_context *tdb) +{ + struct tdb_mutexes *m = tdb->mutexes; + short old; + int ret; + + if (tdb->flags & TDB_NOLOCK) { + return 0; + } + + /* + * Our only callers tdb_allrecord_unlock() and + * tdb_allrecord_lock() (in the error path) + * garantee that we already own the allrecord lock. + * + * Which means m->allrecord_mutex is still locked by us. + */ + + if ((m->allrecord_lock != F_RDLCK) && (m->allrecord_lock != F_WRLCK)) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n", + (int)m->allrecord_lock)); + return -1; + } + + old = m->allrecord_lock; + m->allrecord_lock = F_UNLCK; + + ret = pthread_mutex_unlock(&m->allrecord_mutex); + if (ret != 0) { + m->allrecord_lock = old; + TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock" + "(allrecord_mutex) failed: %s\n", strerror(ret))); + return -1; + } + return 0; +} + +int tdb_mutex_init(struct tdb_context *tdb) +{ + struct tdb_mutexes *m; + pthread_mutexattr_t ma; + int i, ret; + + ret = tdb_mutex_mmap(tdb); + if (ret == -1) { + return -1; + } + m = tdb->mutexes; + + ret = pthread_mutexattr_init(&ma); + if (ret != 0) { + goto fail_munmap; + } + ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); + if (ret != 0) { + goto fail; + } + ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED); + if (ret != 0) { + goto fail; + } + ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); + if (ret != 0) { + goto fail; + } + + for (i=0; ihash_size+1; i++) { + pthread_mutex_t *chain = &m->hashchains[i]; + + ret = pthread_mutex_init(chain, &ma); + if (ret != 0) { + goto fail; + } + } + + m->allrecord_lock = F_UNLCK; + + ret = pthread_mutex_init(&m->allrecord_mutex, &ma); + if (ret != 0) { + goto fail; + } + ret = 0; +fail: + pthread_mutexattr_destroy(&ma); +fail_munmap: + tdb_mutex_munmap(tdb); + + if (ret == 0) { + return 0; + } + + errno = ret; + return -1; +} + +int tdb_mutex_mmap(struct tdb_context *tdb) +{ + size_t len; + void *ptr; + + len = tdb_mutex_size(tdb); + if (len == 0) { + return 0; + } + + ptr = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FILE, + tdb->fd, 0); + if (ptr == MAP_FAILED) { + return -1; + } + tdb->mutexes = (struct 
tdb_mutexes *)ptr; + + return 0; +} + +int tdb_mutex_munmap(struct tdb_context *tdb) +{ + size_t len; + + len = tdb_mutex_size(tdb); + if (len == 0) { + return 0; + } + + return munmap(tdb->mutexes, len); +} + +static bool tdb_mutex_locking_cached; + +static bool tdb_mutex_locking_supported(void) +{ + pthread_mutexattr_t ma; + pthread_mutex_t m; + int ret; + static bool initialized; + + if (initialized) { + return tdb_mutex_locking_cached; + } + + initialized = true; + + ret = pthread_mutexattr_init(&ma); + if (ret != 0) { + return false; + } + ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); + if (ret != 0) { + goto cleanup_ma; + } + ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED); + if (ret != 0) { + goto cleanup_ma; + } + ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); + if (ret != 0) { + goto cleanup_ma; + } + ret = pthread_mutex_init(&m, &ma); + if (ret != 0) { + goto cleanup_ma; + } + ret = pthread_mutex_lock(&m); + if (ret != 0) { + goto cleanup_m; + } + /* + * This makes sure we have real mutexes + * from a threading library instead of just + * stubs from libc. + */ + ret = pthread_mutex_lock(&m); + if (ret != EDEADLK) { + goto cleanup_lock; + } + ret = pthread_mutex_unlock(&m); + if (ret != 0) { + goto cleanup_m; + } + + tdb_mutex_locking_cached = true; + goto cleanup_m; + +cleanup_lock: + pthread_mutex_unlock(&m); +cleanup_m: + pthread_mutex_destroy(&m); +cleanup_ma: + pthread_mutexattr_destroy(&ma); + return tdb_mutex_locking_cached; +} + +static void (*tdb_robust_mutext_old_handler)(int) = SIG_ERR; +static pid_t tdb_robust_mutex_pid = -1; + +static void tdb_robust_mutex_handler(int sig) +{ + if (tdb_robust_mutex_pid != -1) { + pid_t pid; + int status; + + pid = waitpid(tdb_robust_mutex_pid, &status, WNOHANG); + if (pid == tdb_robust_mutex_pid) { + tdb_robust_mutex_pid = -1; + return; + } + } + + if (tdb_robust_mutext_old_handler == SIG_DFL) { + return; + } + if (tdb_robust_mutext_old_handler == SIG_IGN) { + return; + } + if (tdb_robust_mutext_old_handler == SIG_ERR) { + return; + } + + tdb_robust_mutext_old_handler(sig); +} + +_PUBLIC_ bool tdb_runtime_check_for_robust_mutexes(void) +{ + void *ptr; + pthread_mutex_t *m; + pthread_mutexattr_t ma; + int ret = 1; + int pipe_down[2] = { -1, -1 }; + int pipe_up[2] = { -1, -1 }; + ssize_t nread; + char c = 0; + bool ok; + int status; + static bool initialized; + + if (initialized) { + return tdb_mutex_locking_cached; + } + + initialized = true; + + ok = tdb_mutex_locking_supported(); + if (!ok) { + return false; + } + + tdb_mutex_locking_cached = false; + + ptr = mmap(NULL, sizeof(pthread_mutex_t), PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_ANON, -1 /* fd */, 0); + if (ptr == MAP_FAILED) { + return false; + } + m = (pthread_mutex_t *)ptr; + + ret = pipe(pipe_down); + if (ret != 0) { + goto cleanup_mmap; + } + ret = pipe(pipe_up); + if (ret != 0) { + goto cleanup_pipe; + } + + ret = pthread_mutexattr_init(&ma); + if (ret != 0) { + goto cleanup_pipe; + } + ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); + if (ret != 0) { + goto cleanup_ma; + } + ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED); + if (ret != 0) { + goto cleanup_ma; + } + ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); + if (ret != 0) { + goto cleanup_ma; + } + ret = pthread_mutex_init(m, &ma); + if (ret != 0) { + goto cleanup_ma; + } + + tdb_robust_mutext_old_handler = signal(SIGCHLD, + tdb_robust_mutex_handler); + + tdb_robust_mutex_pid = fork(); + if (tdb_robust_mutex_pid == 0) { + 
size_t nwritten; + close(pipe_down[1]); + close(pipe_up[0]); + ret = pthread_mutex_lock(m); + nwritten = write(pipe_up[1], &ret, sizeof(ret)); + if (nwritten != sizeof(ret)) { + exit(1); + } + if (ret != 0) { + exit(1); + } + nread = read(pipe_down[0], &c, 1); + if (nread != 1) { + exit(1); + } + /* leave locked */ + exit(0); + } + if (tdb_robust_mutex_pid == -1) { + goto cleanup_sig_child; + } + close(pipe_down[0]); + pipe_down[0] = -1; + close(pipe_up[1]); + pipe_up[1] = -1; + + nread = read(pipe_up[0], &ret, sizeof(ret)); + if (nread != sizeof(ret)) { + goto cleanup_child; + } + + ret = pthread_mutex_trylock(m); + if (ret != EBUSY) { + if (ret == 0) { + pthread_mutex_unlock(m); + } + goto cleanup_child; + } + + if (write(pipe_down[1], &c, 1) != 1) { + goto cleanup_child; + } + + nread = read(pipe_up[0], &c, 1); + if (nread != 0) { + goto cleanup_child; + } + + while (tdb_robust_mutex_pid > 0) { + pid_t pid; + + errno = 0; + pid = waitpid(tdb_robust_mutex_pid, &status, 0); + if (pid == tdb_robust_mutex_pid) { + tdb_robust_mutex_pid = -1; + break; + } + if (pid == -1 && errno != EINTR) { + goto cleanup_child; + } + } + signal(SIGCHLD, tdb_robust_mutext_old_handler); + + ret = pthread_mutex_trylock(m); + if (ret != EOWNERDEAD) { + if (ret == 0) { + pthread_mutex_unlock(m); + } + goto cleanup_m; + } + + ret = pthread_mutex_consistent(m); + if (ret != 0) { + goto cleanup_m; + } + + ret = pthread_mutex_trylock(m); + if (ret != EDEADLK) { + pthread_mutex_unlock(m); + goto cleanup_m; + } + + ret = pthread_mutex_unlock(m); + if (ret != 0) { + goto cleanup_m; + } + + tdb_mutex_locking_cached = true; + goto cleanup_m; + +cleanup_child: + while (tdb_robust_mutex_pid > 0) { + pid_t pid; + + kill(tdb_robust_mutex_pid, SIGKILL); + + errno = 0; + pid = waitpid(tdb_robust_mutex_pid, &status, 0); + if (pid == tdb_robust_mutex_pid) { + tdb_robust_mutex_pid = -1; + break; + } + if (pid == -1 && errno != EINTR) { + break; + } + } +cleanup_sig_child: + signal(SIGCHLD, tdb_robust_mutext_old_handler); +cleanup_m: + pthread_mutex_destroy(m); +cleanup_ma: + pthread_mutexattr_destroy(&ma); +cleanup_pipe: + if (pipe_down[0] != -1) { + close(pipe_down[0]); + } + if (pipe_down[1] != -1) { + close(pipe_down[1]); + } + if (pipe_up[0] != -1) { + close(pipe_up[0]); + } + if (pipe_up[1] != -1) { + close(pipe_up[1]); + } +cleanup_mmap: + munmap(ptr, sizeof(pthread_mutex_t)); + + return tdb_mutex_locking_cached; +} + +#else + +size_t tdb_mutex_size(struct tdb_context *tdb) +{ + return 0; +} + +bool tdb_have_mutexes(struct tdb_context *tdb) +{ + return false; +} + +int tdb_mutex_allrecord_lock(struct tdb_context *tdb, int ltype, + enum tdb_lock_flags flags) +{ + tdb->ecode = TDB_ERR_LOCK; + return -1; +} + +int tdb_mutex_allrecord_unlock(struct tdb_context *tdb) +{ + return -1; +} + +int tdb_mutex_allrecord_upgrade(struct tdb_context *tdb) +{ + tdb->ecode = TDB_ERR_LOCK; + return -1; +} + +void tdb_mutex_allrecord_downgrade(struct tdb_context *tdb) +{ + return; +} + +int tdb_mutex_mmap(struct tdb_context *tdb) +{ + errno = ENOSYS; + return -1; +} + +int tdb_mutex_munmap(struct tdb_context *tdb) +{ + errno = ENOSYS; + return -1; +} + +int tdb_mutex_init(struct tdb_context *tdb) +{ + errno = ENOSYS; + return -1; +} + +_PUBLIC_ bool tdb_runtime_check_for_robust_mutexes(void) +{ + return false; +} + +#endif diff --git a/lib/tdb/common/open.c b/lib/tdb/common/open.c index 162f30d4047..16a76a347fc 100644 --- a/lib/tdb/common/open.c +++ b/lib/tdb/common/open.c @@ -76,6 +76,15 @@ static int tdb_new_database(struct tdb_context *tdb, 
struct tdb_header *header, if (tdb->flags & TDB_INCOMPATIBLE_HASH) newdb->rwlocks = TDB_HASH_RWLOCK_MAGIC; + /* + * We create a tdb with TDB_FEATURE_FLAG_MUTEX support, + * the flag combination and runtime feature checks + * are done by the caller already. + */ + if (tdb->flags & TDB_MUTEX_LOCKING) { + newdb->feature_flags |= TDB_FEATURE_FLAG_MUTEX; + } + /* * If we have any features we add the FEATURE_FLAG_MAGIC, overwriting the * TDB_HASH_RWLOCK_MAGIC above. @@ -87,8 +96,11 @@ static int tdb_new_database(struct tdb_context *tdb, struct tdb_header *header, /* * It's required for some following code pathes * to have the fields on 'tdb' up-to-date. + * + * E.g. tdb_mutex_size() requires it */ tdb->feature_flags = newdb->feature_flags; + tdb->hash_size = newdb->hash_size; if (tdb->flags & TDB_INTERNAL) { tdb->map_size = size; @@ -104,6 +116,11 @@ static int tdb_new_database(struct tdb_context *tdb, struct tdb_header *header, if (ftruncate(tdb->fd, 0) == -1) goto fail; + if (newdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) { + newdb->mutex_size = tdb_mutex_size(tdb); + tdb->hdr_ofs = newdb->mutex_size; + } + /* This creates an endian-converted header, as if read from disk */ CONVERT(*newdb); memcpy(header, newdb, sizeof(*header)); @@ -113,6 +130,37 @@ static int tdb_new_database(struct tdb_context *tdb, struct tdb_header *header, if (!tdb_write_all(tdb->fd, newdb, size)) goto fail; + if (newdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) { + + /* + * Now we init the mutex area + * followed by a second header. + */ + + ret = ftruncate( + tdb->fd, + newdb->mutex_size + sizeof(struct tdb_header)); + if (ret == -1) { + goto fail; + } + ret = tdb_mutex_init(tdb); + if (ret == -1) { + goto fail; + } + + /* + * Write a second header behind the mutexes. That's the area + * that will be mmapp'ed. + */ + ret = lseek(tdb->fd, newdb->mutex_size, SEEK_SET); + if (ret == -1) { + goto fail; + } + if (!tdb_write_all(tdb->fd, newdb, size)) { + goto fail; + } + } + ret = 0; fail: SAFE_FREE(newdb); @@ -179,6 +227,70 @@ static bool check_header_hash(struct tdb_context *tdb, return check_header_hash(tdb, header, false, m1, m2); } +static bool tdb_mutex_open_ok(struct tdb_context *tdb, + const struct tdb_header *header) +{ + int locked; + + locked = tdb_nest_lock(tdb, ACTIVE_LOCK, F_WRLCK, + TDB_LOCK_NOWAIT|TDB_LOCK_PROBE); + + if ((locked == -1) && (tdb->ecode == TDB_ERR_LOCK)) { + /* + * CLEAR_IF_FIRST still active. The tdb was created on this + * host, so we can assume the mutex implementation is + * compatible. Important for tools like tdbdump on a still + * open locking.tdb. + */ + goto check_local_settings; + } + + /* + * We got the CLEAR_IF_FIRST lock. That means the database was + * potentially copied from somewhere else. The mutex implementation + * might be incompatible. + */ + + if (tdb_nest_unlock(tdb, ACTIVE_LOCK, F_WRLCK, false) == -1) { + /* + * Should not happen + */ + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_mutex_open_ok: " + "failed to release ACTIVE_LOCK on %s: %s\n", + tdb->name, strerror(errno))); + return false; + } + + if (tdb->flags & TDB_NOLOCK) { + /* + * We don't look at locks, so it does not matter to have a + * compatible mutex implementation. Allow the open. 
+ */ + return true; + } + +check_local_settings: + + if (!(tdb->flags & TDB_MUTEX_LOCKING)) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_mutex_open_ok[%s]: " + "Can use mutexes only with " + "MUTEX_LOCKING or NOLOCK\n", + tdb->name)); + return false; + } + + if (tdb_mutex_size(tdb) != header->mutex_size) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_mutex_open_ok[%s]: " + "Mutex size changed from %u to %u\n.", + tdb->name, + (unsigned int)header->mutex_size, + (unsigned int)tdb_mutex_size(tdb))); + return false; + } + + return true; +} + _PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags, int open_flags, mode_t mode, const struct tdb_logging_context *log_ctx, @@ -208,6 +320,9 @@ _PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int td if (tdb_flags & TDB_INTERNAL) { tdb_flags |= TDB_INCOMPATIBLE_HASH; } + if (tdb_flags & TDB_MUTEX_LOCKING) { + tdb_flags |= TDB_INCOMPATIBLE_HASH; + } tdb->fd = -1; #ifdef TDB_TRACE @@ -296,6 +411,64 @@ _PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int td goto fail; } + if (tdb->flags & TDB_MUTEX_LOCKING) { + /* + * Here we catch bugs in the callers, + * the runtime check for existing tdb's comes later. + */ + + if (!(tdb->flags & TDB_CLEAR_IF_FIRST)) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: " + "invalid flags for %s - TDB_MUTEX_LOCKING " + "requires TDB_CLEAR_IF_FIRST\n", name)); + errno = EINVAL; + goto fail; + } + + if (tdb->flags & TDB_INTERNAL) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: " + "invalid flags for %s - TDB_MUTEX_LOCKING and " + "TDB_INTERNAL are not allowed together\n", name)); + errno = EINVAL; + goto fail; + } + + if (tdb->flags & TDB_NOMMAP) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: " + "invalid flags for %s - TDB_MUTEX_LOCKING and " + "TDB_NOMMAP are not allowed together\n", name)); + errno = EINVAL; + goto fail; + } + + if (tdb->read_only) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: " + "invalid flags for %s - TDB_MUTEX_LOCKING " + "not allowed read only\n", name)); + errno = EINVAL; + goto fail; + } + + /* + * The callers should have called + * tdb_runtime_check_for_robust_mutexes() + * before using TDB_MUTEX_LOCKING! + * + * This makes sure the caller understands + * that the locking may behave a bit differently + * than with pure fcntl locking. E.g. multiple + * read locks are not supported. + */ + if (!tdb_runtime_check_for_robust_mutexes()) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: " + "invalid flags for %s - TDB_MUTEX_LOCKING " + "requires support for robust_mutexes\n", + name)); + errno = ENOSYS; + goto fail; + } + } + if (getenv("TDB_NO_FSYNC")) { tdb->flags |= TDB_NOSYNC; } @@ -435,6 +608,21 @@ _PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int td goto fail; } + if (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) { + if (!tdb_mutex_open_ok(tdb, &header)) { + errno = EINVAL; + goto fail; + } + + /* + * We need to remember the hdr_ofs + * also for the TDB_NOLOCK case + * if the current library doesn't support + * mutex locking. 
+ */ + tdb->hdr_ofs = header.mutex_size; + } + if ((header.magic1_hash == 0) && (header.magic2_hash == 0)) { /* older TDB without magic hash references */ tdb->hash_fn = tdb_old_hash; @@ -477,6 +665,15 @@ _PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int td goto fail; } + if (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) { + if (!(tdb->flags & TDB_NOLOCK)) { + ret = tdb_mutex_mmap(tdb); + if (ret != 0) { + goto fail; + } + } + } + if (locked) { if (tdb_nest_unlock(tdb, ACTIVE_LOCK, F_WRLCK, false) == -1) { TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: " @@ -587,6 +784,9 @@ _PUBLIC_ int tdb_close(struct tdb_context *tdb) else tdb_munmap(tdb); } + + tdb_mutex_munmap(tdb); + SAFE_FREE(tdb->name); if (tdb->fd != -1) { ret = close(tdb->fd); diff --git a/lib/tdb/common/summary.c b/lib/tdb/common/summary.c index e9989f676f7..d786132d4a1 100644 --- a/lib/tdb/common/summary.c +++ b/lib/tdb/common/summary.c @@ -23,6 +23,7 @@ "Number of records: %zu\n" \ "Incompatible hash: %s\n" \ "Active/supported feature flags: 0x%08x/0x%08x\n" \ + "Robust mutexes locking: %s\n" \ "Smallest/average/largest keys: %zu/%zu/%zu\n" \ "Smallest/average/largest data: %zu/%zu/%zu\n" \ "Smallest/average/largest padding: %zu/%zu/%zu\n" \ @@ -175,6 +176,7 @@ _PUBLIC_ char *tdb_summary(struct tdb_context *tdb) keys.num, (tdb->hash_fn == tdb_jenkins_hash)?"yes":"no", (unsigned)tdb->feature_flags, TDB_SUPPORTED_FEATURE_FLAGS, + (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX)?"yes":"no", keys.min, tally_mean(&keys), keys.max, data.min, tally_mean(&data), data.max, extra.min, tally_mean(&extra), extra.max, diff --git a/lib/tdb/common/tdb.c b/lib/tdb/common/tdb.c index ebd4ffe3e01..ae98c9619d1 100644 --- a/lib/tdb/common/tdb.c +++ b/lib/tdb/common/tdb.c @@ -723,6 +723,15 @@ _PUBLIC_ void tdb_remove_flags(struct tdb_context *tdb, unsigned flags) return; } + if ((flags & TDB_NOLOCK) && + (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) && + (tdb->mutexes == NULL)) { + tdb->ecode = TDB_ERR_LOCK; + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_remove_flags: " + "Can not remove NOLOCK flag on mutexed databases")); + return; + } + if (flags & TDB_ALLOW_NESTING) { tdb->flags |= TDB_DISALLOW_NESTING; } diff --git a/lib/tdb/common/tdb_private.h b/lib/tdb/common/tdb_private.h index 4981e2cd6ac..de8d9e68fb5 100644 --- a/lib/tdb/common/tdb_private.h +++ b/lib/tdb/common/tdb_private.h @@ -69,7 +69,11 @@ typedef uint32_t tdb_off_t; #define TDB_PAD_BYTE 0x42 #define TDB_PAD_U32 0x42424242 -#define TDB_SUPPORTED_FEATURE_FLAGS 0 +#define TDB_FEATURE_FLAG_MUTEX 0x00000001 + +#define TDB_SUPPORTED_FEATURE_FLAGS ( \ + TDB_FEATURE_FLAG_MUTEX | \ + 0) /* NB assumes there is a local variable called "tdb" that is the * current context, also takes doubly-parenthesized print-style @@ -156,7 +160,8 @@ struct tdb_header { uint32_t magic1_hash; /* hash of TDB_MAGIC_FOOD. */ uint32_t magic2_hash; /* hash of TDB_MAGIC. 
*/ uint32_t feature_flags; - tdb_off_t reserved[26]; + tdb_len_t mutex_size; /* set if TDB_FEATURE_FLAG_MUTEX is set */ + tdb_off_t reserved[25]; }; struct tdb_lock_type { @@ -190,6 +195,8 @@ struct tdb_methods { int (*tdb_expand_file)(struct tdb_context *, tdb_off_t , tdb_off_t ); }; +struct tdb_mutexes; + struct tdb_context { char *name; /* the name of the database */ void *map_ptr; /* where it is currently mapped */ @@ -203,7 +210,8 @@ struct tdb_context { struct tdb_lock_type *lockrecs; /* only real locks, all with count>0 */ int lockrecs_array_length; - tdb_off_t hdr_ofs; /* this is 0 for now */ + tdb_off_t hdr_ofs; /* this is 0 or header.mutex_size */ + struct tdb_mutexes *mutexes; /* mmap of the mutex area */ enum TDB_ERROR ecode; /* error code for last tdb error */ uint32_t hash_size; @@ -300,4 +308,20 @@ bool tdb_add_off_t(tdb_off_t a, tdb_off_t b, tdb_off_t *pret); /* tdb_off_t and tdb_len_t right now are both uint32_t */ #define tdb_add_len_t tdb_add_off_t + +size_t tdb_mutex_size(struct tdb_context *tdb); +bool tdb_have_mutexes(struct tdb_context *tdb); +int tdb_mutex_init(struct tdb_context *tdb); +int tdb_mutex_mmap(struct tdb_context *tdb); +int tdb_mutex_munmap(struct tdb_context *tdb); +bool tdb_mutex_lock(struct tdb_context *tdb, int rw, off_t off, off_t len, + bool waitflag, int *pret); +bool tdb_mutex_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len, + int *pret); +int tdb_mutex_allrecord_lock(struct tdb_context *tdb, int ltype, + enum tdb_lock_flags flags); +int tdb_mutex_allrecord_unlock(struct tdb_context *tdb); +int tdb_mutex_allrecord_upgrade(struct tdb_context *tdb); +void tdb_mutex_allrecord_downgrade(struct tdb_context *tdb); + #endif /* TDB_PRIVATE_H */ diff --git a/lib/tdb/common/transaction.c b/lib/tdb/common/transaction.c index a2c3bbdff37..caef0bedd82 100644 --- a/lib/tdb/common/transaction.c +++ b/lib/tdb/common/transaction.c @@ -421,7 +421,8 @@ static int _tdb_transaction_start(struct tdb_context *tdb, enum tdb_lock_flags lockflags) { /* some sanity checks */ - if (tdb->read_only || (tdb->flags & TDB_INTERNAL) || tdb->traverse_read) { + if (tdb->read_only || (tdb->flags & (TDB_INTERNAL|TDB_MUTEX_LOCKING)) + || tdb->traverse_read) { TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction on a read-only or internal db\n")); tdb->ecode = TDB_ERR_EINVAL; return -1; diff --git a/lib/tdb/docs/mutex.txt b/lib/tdb/docs/mutex.txt new file mode 100644 index 00000000000..7625662e9db --- /dev/null +++ b/lib/tdb/docs/mutex.txt @@ -0,0 +1,136 @@ +Tdb is a hashtable database with multiple concurrent writer and external +record lock support. For speed reasons, wherever possible tdb uses a shared +memory mapped area for data access. In its currently released form, it uses +fcntl byte-range locks to coordinate access to the data itself. + +The tdb data is organized as a hashtable. Hash collisions are dealt with by +forming a linked list of records that share a hash value. The individual +linked lists are protected across processes with 1-byte fcntl locks on the +starting pointer of the linked list representing a hash value. + +The external locking API of tdb allows to lock individual records. Instead of +really locking individual records, the tdb API locks a complete linked list +with a fcntl lock. + +The external locking API of tdb also allows to lock the complete database, and +ctdb uses this facility to freeze databases during a recovery. 
While the +so-called allrecord lock is held, all linked lists and all individual records +are frozen alltogether. Tdb achieves this by locking the complete file range +with a single fcntl lock. Individual 1-byte locks for the linked lists +conflict with this. Access to records is prevented by the one large fnctl byte +range lock. + +Fcntl locks have been chosen for tdb for two reasons: First they are portable +across all current unixes. Secondly they provide auto-cleanup. If a process +dies while holding a fcntl lock, the lock is given up as if it was explicitly +unlocked. Thus fcntl locks provide a very robust locking scheme, if a process +dies for any reason the database will not stay blocked until reboot. This +robustness is very important for long-running services, a reboot is not an +option for most users of tdb. + +Unfortunately, during stress testing, fcntl locks have turned out to be a major +problem for performance. The particular problem that was seen happens when +ctdb on a busy server does a recovery. A recovery means that ctdb has to +freeze all tdb databases for some time, usually a few seconds. This is done +with the allrecord lock. During the recovery phase on a busy server many smbd +processes try to access the tdb file with blocking fcntl calls. The specific +test in question easily reproduces 7,000 processes piling up waiting for +1-byte fcntl locks. When ctdb is done with the recovery, it gives up the +allrecord lock, covering the whole file range. All 7,000 processes waiting for +1-byte fcntl locks are woken up, trying to acquire their lock. The special +implementation of fcntl locks in Linux (up to 2013-02-12 at least) protects +all fcntl lock operations with a single system-wide spinlock. If 7,000 process +waiting for the allrecord lock to become released this leads to a thundering +herd condition, all CPUs are spinning on that single spinlock. + +Functionally the kernel is fine, eventually the thundering herd slows down and +every process correctly gets his share and locking range, but the performance +of the system while the herd is active is worse than expected. + +The thundering herd is only the worst case scenario for fcntl lock use. The +single spinlock for fcntl operations is also a performance penalty for normal +operations. In the cluster case, every read and write SMB request has to do +two fcntl calls to provide correct SMB mandatory locks. The single spinlock +is one source of serialization for the SMB read/write requests, limiting the +parallelism that can be achieved in a multi-core system. + +While trying to tune his servers, Ira Cooper, Samba Team member, found fcntl +locks to be a problem on Solaris as well. Ira pointed out that there is a +potential alternative locking mechanism that might be more scalable: Process +shared robust mutexes, as defined by Posix 2008 for example via + +http://pubs.opengroup.org/onlinepubs/9699919799/functions/pthread_mutexattr_setpshared.html +http://pubs.opengroup.org/onlinepubs/9699919799/functions/pthread_mutexattr_setrobust.html + +Pthread mutexes provide one of the core mechanisms in posix threads to protect +in-process data structures from concurrent access by multiple threads. In the +Linux implementation, a pthread_mutex_t is represented by a data structure in +user space that requires no kernel calls in the uncontended case for locking +and unlocking. Locking and unlocking in the uncontended case is implemented +purely in user space with atomic CPU instructions and thus are very fast. 
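
To make the mechanism described in the preceding and following paragraphs concrete, here is a small self-contained sketch (not part of the patch, error handling abbreviated) of a process-shared robust mutex placed in anonymous shared memory, including the EOWNERDEAD / pthread_mutex_consistent() recovery step that common/mutex.c builds on. It assumes a system implementing the POSIX 2008 robust mutex interfaces and is compiled with -pthread.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pthread_mutexattr_t ma;
	pthread_mutex_t *m;
	int ret;

	/* The mutex must live in memory visible to both processes. */
	m = mmap(NULL, sizeof(*m), PROT_READ|PROT_WRITE,
		 MAP_SHARED|MAP_ANONYMOUS, -1, 0);
	if (m == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	pthread_mutexattr_init(&ma);
	pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
	pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
	pthread_mutex_init(m, &ma);
	pthread_mutexattr_destroy(&ma);

	if (fork() == 0) {
		/* Child: grab the mutex and die without unlocking it. */
		pthread_mutex_lock(m);
		_exit(0);
	}
	wait(NULL);

	ret = pthread_mutex_lock(m);
	if (ret == EOWNERDEAD) {
		/*
		 * The previous owner died while holding the lock. Repair
		 * the protected data here, then mark the mutex consistent
		 * so it can be used again.
		 */
		pthread_mutex_consistent(m);
	}
	pthread_mutex_unlock(m);
	pthread_mutex_destroy(m);
	munmap(m, sizeof(*m));
	return 0;
}

With a non-robust mutex the final lock attempt would block forever, because the dead child never released the lock; that is precisely the auto-cleanup property of fcntl locks that plain process-shared mutexes lack. The real code in common/mutex.c additionally sets PTHREAD_MUTEX_ERRORCHECK on the mutex attribute, which is omitted here for brevity.
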
+ +The setpshared functions indicate to the kernel that the mutex is about to be +shared between processes in a common shared memory area. + +The process shared posix mutexes have the potential to replace fcntl locking +to coordinate mmap access for tdbs. However, they are missing the criticial +auto-cleanup property that fcntl provides when a process dies. A process that +dies hard while holding a shared mutex has no chance to clean up the protected +data structures and unlock the shared mutex. Thus with a pure process shared +mutex the mutex will remain locked forever until the data structures are +re-initialized from scratch. + +With the robust mutexes defined by Posix the process shared mutexes have been +extended with a limited auto-cleanup property. If a mutex has been declared +robust, when a process exits while holding that mutex, the next process trying +to lock the mutex will get the special error message EOWNERDEAD. This informs +the caller that the data structures the mutex protects are potentially corrupt +and need to be cleaned up. + +The error message EOWNERDEAD when trying to lock a mutex is an extension over +the fcntl functionality. A process that does a blocking fcntl lock call is not +informed about whether the lock was explicitly freed by a process still alive +or due to an unplanned process exit. At the time of this writing (February +2013), at least Linux and OpenSolaris also implement the robustness feature of +process-shared mutexes. + +Converting the tdb locking mechanism from fcntl to mutexes has to take care of +both types of locks that are used on tdb files. + +The easy part is to use mutexes to replace the 1-byte linked list locks +covering the individual hashes. Those can be represented by a mutex each. + +Covering the allrecord lock is more difficult. The allrecord lock uses a fcntl +lock spanning all hash list locks simultaneously. This basic functionality is +not easily possible with mutexes. A mutex carries 1 bit of information, a +fcntl lock can carry an arbitrary amount of information. + +In order to support the allrecord lock, we have an allrecord_lock variable +protected by an allrecord_mutex. The coordination between the allrecord lock +and the chainlocks works like this: + +- Getting a chain lock works like this: + + 1. get chain mutex + 2. return success if allrecord_lock is F_UNLCK (not locked) + 3. return success if allrecord_lock is F_RDLCK (locked readonly) + and we only need a read lock. + 4. release chain mutex + 5. wait for allrecord_mutex + 6. unlock allrecord_mutex + 7. goto 1. + +- Getting the allrecord lock: + + 1. get the allrecord mutex + 2. return error if allrecord_lock is not F_UNLCK (it's locked) + 3. set allrecord_lock to the desired value. + 4. in a loop: lock(blocking) / unlock each chain mutex. + 5. return success. + +- allrecord lock upgrade: + + 1. check we already have the allrecord lock with F_RDLCK. + 3. set allrecord_lock to F_WRLCK + 4. in a loop: lock(blocking) / unlock each chain mutex. + 5. return success. diff --git a/lib/tdb/include/tdb.h b/lib/tdb/include/tdb.h index a34f089e66d..5ea5e6048c6 100644 --- a/lib/tdb/include/tdb.h +++ b/lib/tdb/include/tdb.h @@ -80,6 +80,9 @@ extern "C" { #define TDB_ALLOW_NESTING 512 /** Allow transactions to nest */ #define TDB_DISALLOW_NESTING 1024 /** Disallow transactions to nest */ #define TDB_INCOMPATIBLE_HASH 2048 /** Better hashing: can't be opened by tdb < 1.2.6. 
*/ +#define TDB_MUTEX_LOCKING 4096 /** optimized locking using robust mutexes if supported, + only with tdb >= 1.3.0 and TDB_CLEAR_IF_FIRST + after checking tdb_runtime_check_for_robust_mutexes() */ /** The tdb error codes */ enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK, @@ -143,6 +146,11 @@ struct tdb_logging_context { * default 5.\n * TDB_ALLOW_NESTING - Allow transactions to nest.\n * TDB_DISALLOW_NESTING - Disallow transactions to nest.\n + * TDB_INCOMPATIBLE_HASH - Better hashing: can't be opened by tdb < 1.2.6.\n + * TDB_MUTEX_LOCKING - Optimized locking using robust mutexes if supported, + * can't be opened by tdb < 1.3.0. + * Only valid in combination with TDB_CLEAR_IF_FIRST + * after checking tdb_runtime_check_for_robust_mutexes()\n * * @param[in] open_flags Flags for the open(2) function. * @@ -179,6 +187,11 @@ struct tdb_context *tdb_open(const char *name, int hash_size, int tdb_flags, * default 5.\n * TDB_ALLOW_NESTING - Allow transactions to nest.\n * TDB_DISALLOW_NESTING - Disallow transactions to nest.\n + * TDB_INCOMPATIBLE_HASH - Better hashing: can't be opened by tdb < 1.2.6.\n + * TDB_MUTEX_LOCKING - Optimized locking using robust mutexes if supported, + * can't be opened by tdb < 1.3.0. + * Only valid in combination with TDB_CLEAR_IF_FIRST + * after checking tdb_runtime_check_for_robust_mutexes()\n * * @param[in] open_flags Flags for the open(2) function. * @@ -842,6 +855,27 @@ int tdb_rescue(struct tdb_context *tdb, void (*walk) (TDB_DATA key, TDB_DATA data, void *private_data), void *private_data); +/** + * @brief Check if support for TDB_MUTEX_LOCKING is available at runtime. + * + * On some systems the API for pthread_mutexattr_setrobust() is not available. + * On other systems there are some bugs in the interaction between glibc and + * the linux kernel. + * + * This function provides a runtime check if robust mutexes are really + * available. + * + * This needs to be called and return true before TDB_MUTEX_LOCKING + * can be used at runtime. + * + * @note This calls fork(), but the SIGCHILD handling should be transparent. + * + * @return true if supported, false otherwise. 
+ * + * @see TDB_MUTEX_LOCKING + */ +bool tdb_runtime_check_for_robust_mutexes(void); + /* @} ******************************************************************/ /* Low level locking functions: use with care */ diff --git a/lib/tdb/test/run-3G-file.c b/lib/tdb/test/run-3G-file.c index 900b1a667a2..748c972284a 100644 --- a/lib/tdb/test/run-3G-file.c +++ b/lib/tdb/test/run-3G-file.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-bad-tdb-header.c b/lib/tdb/test/run-bad-tdb-header.c index b00fb8934a4..9d29fdf5e8e 100644 --- a/lib/tdb/test/run-bad-tdb-header.c +++ b/lib/tdb/test/run-bad-tdb-header.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-check.c b/lib/tdb/test/run-check.c index b2756914831..ce389a2d14d 100644 --- a/lib/tdb/test/run-check.c +++ b/lib/tdb/test/run-check.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-corrupt.c b/lib/tdb/test/run-corrupt.c index 93eae42ce16..e6fc751842f 100644 --- a/lib/tdb/test/run-corrupt.c +++ b/lib/tdb/test/run-corrupt.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-die-during-transaction.c b/lib/tdb/test/run-die-during-transaction.c index 9b9041552ed..c636d87322d 100644 --- a/lib/tdb/test/run-die-during-transaction.c +++ b/lib/tdb/test/run-die-during-transaction.c @@ -19,6 +19,7 @@ static int ftruncate_check(int fd, off_t length); #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include diff --git a/lib/tdb/test/run-endian.c b/lib/tdb/test/run-endian.c index 3116f7da51b..9d4d5f59f8e 100644 --- a/lib/tdb/test/run-endian.c +++ b/lib/tdb/test/run-endian.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-incompatible.c b/lib/tdb/test/run-incompatible.c index af01ca6a395..b8e95b5e778 100644 --- a/lib/tdb/test/run-incompatible.c +++ b/lib/tdb/test/run-incompatible.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include diff --git a/lib/tdb/test/run-nested-transactions.c b/lib/tdb/test/run-nested-transactions.c index bf08e55afee..864adf2d716 100644 --- a/lib/tdb/test/run-nested-transactions.c +++ b/lib/tdb/test/run-nested-transactions.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include diff --git a/lib/tdb/test/run-nested-traverse.c b/lib/tdb/test/run-nested-traverse.c index 361dc2ece71..22ee3e2a2a6 100644 --- a/lib/tdb/test/run-nested-traverse.c +++ b/lib/tdb/test/run-nested-traverse.c @@ -11,6 +11,7 @@ #include "../common/open.c" #include "../common/check.c" #include 
"../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #undef fcntl #include diff --git a/lib/tdb/test/run-no-lock-during-traverse.c b/lib/tdb/test/run-no-lock-during-traverse.c index b5e31dc1001..737a32f1115 100644 --- a/lib/tdb/test/run-no-lock-during-traverse.c +++ b/lib/tdb/test/run-no-lock-during-traverse.c @@ -13,6 +13,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-oldhash.c b/lib/tdb/test/run-oldhash.c index 535336cb473..aaee6f62ef7 100644 --- a/lib/tdb/test/run-oldhash.c +++ b/lib/tdb/test/run-oldhash.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-open-during-transaction.c b/lib/tdb/test/run-open-during-transaction.c index 04ba956d45a..16053765e92 100644 --- a/lib/tdb/test/run-open-during-transaction.c +++ b/lib/tdb/test/run-open-during-transaction.c @@ -20,6 +20,7 @@ static int ftruncate_check(int fd, off_t length); #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include diff --git a/lib/tdb/test/run-readonly-check.c b/lib/tdb/test/run-readonly-check.c index e5185324246..c5e0f7dccbc 100644 --- a/lib/tdb/test/run-readonly-check.c +++ b/lib/tdb/test/run-readonly-check.c @@ -11,6 +11,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-rescue-find_entry.c b/lib/tdb/test/run-rescue-find_entry.c index 25f4f1c05f4..5d6f8f711d0 100644 --- a/lib/tdb/test/run-rescue-find_entry.c +++ b/lib/tdb/test/run-rescue-find_entry.c @@ -10,6 +10,7 @@ #include "../common/check.c" #include "../common/hash.c" #include "../common/rescue.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-rescue.c b/lib/tdb/test/run-rescue.c index 7c806a40b45..e43f53be9d8 100644 --- a/lib/tdb/test/run-rescue.c +++ b/lib/tdb/test/run-rescue.c @@ -10,6 +10,7 @@ #include "../common/check.c" #include "../common/hash.c" #include "../common/rescue.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-rwlock-check.c b/lib/tdb/test/run-rwlock-check.c index 8b8072db1e6..2ac9dc3d7ca 100644 --- a/lib/tdb/test/run-rwlock-check.c +++ b/lib/tdb/test/run-rwlock-check.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include diff --git a/lib/tdb/test/run-summary.c b/lib/tdb/test/run-summary.c index 22312843e79..8b9a1a0f69d 100644 --- a/lib/tdb/test/run-summary.c +++ b/lib/tdb/test/run-summary.c @@ -10,6 +10,7 @@ #include "../common/check.c" #include "../common/hash.c" #include "../common/summary.c" +#include "../common/mutex.c" #include "tap-interface.h" #include diff --git a/lib/tdb/test/run-transaction-expand.c b/lib/tdb/test/run-transaction-expand.c index ddf1f2432da..d36b894dbed 100644 --- a/lib/tdb/test/run-transaction-expand.c +++ b/lib/tdb/test/run-transaction-expand.c @@ -37,6 +37,7 @@ static inline int fake_fdatasync(int fd) #include "../common/open.c" #include "../common/check.c" #include 
"../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run-traverse-in-transaction.c b/lib/tdb/test/run-traverse-in-transaction.c index 48194b8fdbf..17d64129642 100644 --- a/lib/tdb/test/run-traverse-in-transaction.c +++ b/lib/tdb/test/run-traverse-in-transaction.c @@ -11,6 +11,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #undef fcntl_with_lockcheck #include diff --git a/lib/tdb/test/run-wronghash-fail.c b/lib/tdb/test/run-wronghash-fail.c index 9c78fc5e3e1..c44b0f5aaee 100644 --- a/lib/tdb/test/run-wronghash-fail.c +++ b/lib/tdb/test/run-wronghash-fail.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include diff --git a/lib/tdb/test/run-zero-append.c b/lib/tdb/test/run-zero-append.c index a2324c437a4..f9eba1b7b32 100644 --- a/lib/tdb/test/run-zero-append.c +++ b/lib/tdb/test/run-zero-append.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/test/run.c b/lib/tdb/test/run.c index f61fcf68204..c744c4ddca4 100644 --- a/lib/tdb/test/run.c +++ b/lib/tdb/test/run.c @@ -9,6 +9,7 @@ #include "../common/open.c" #include "../common/check.c" #include "../common/hash.c" +#include "../common/mutex.c" #include "tap-interface.h" #include #include "logging.h" diff --git a/lib/tdb/wscript b/lib/tdb/wscript index 70196938ed2..6243ccff8df 100644 --- a/lib/tdb/wscript +++ b/lib/tdb/wscript @@ -1,7 +1,7 @@ #!/usr/bin/env python APPNAME = 'tdb' -VERSION = '1.2.13' +VERSION = '1.3.0' blddir = 'bin' @@ -46,6 +46,10 @@ def set_options(opt): opt.BUILTIN_DEFAULT('replace') opt.PRIVATE_EXTENSION_DEFAULT('tdb', noextension='tdb') opt.RECURSE('lib/replace') + opt.add_option('--disable-tdb-mutex-locking', + help=("Disable the use of pthread robust mutexes"), + action="store_true", dest='disable_tdb_mutex_locking', + default=False) if opt.IN_LAUNCH_DIR(): opt.add_option('--disable-python', help=("disable the pytdb module"), @@ -53,6 +57,11 @@ def set_options(opt): def configure(conf): + conf.env.disable_tdb_mutex_locking = getattr(Options.options, + 'disable_tdb_mutex_locking', + False) + if not conf.env.disable_tdb_mutex_locking: + conf.env.replace_add_global_pthread = True conf.RECURSE('lib/replace') conf.env.standalone_tdb = conf.IN_LAUNCH_DIR() @@ -68,6 +77,11 @@ def configure(conf): conf.env.disable_python = getattr(Options.options, 'disable_python', False) + if (conf.CONFIG_SET('HAVE_ROBUST_MUTEXES') and + conf.env.building_tdb and + not conf.env.disable_tdb_mutex_locking): + conf.define('USE_TDB_MUTEX_LOCKING', 1) + conf.CHECK_XSLTPROC_MANPAGES() if not conf.env.disable_python: @@ -87,10 +101,12 @@ def configure(conf): def build(bld): bld.RECURSE('lib/replace') - COMMON_SRC = bld.SUBDIR('common', - '''check.c error.c tdb.c traverse.c - freelistcheck.c lock.c dump.c freelist.c - io.c open.c transaction.c hash.c summary.c rescue.c''') + COMMON_FILES='''check.c error.c tdb.c traverse.c + freelistcheck.c lock.c dump.c freelist.c + io.c open.c transaction.c hash.c summary.c rescue.c + mutex.c''' + + COMMON_SRC = bld.SUBDIR('common', COMMON_FILES) if bld.env.standalone_tdb: bld.env.PKGCONFIGDIR = '${LIBDIR}/pkgconfig' @@ -99,9 +115,15 @@ def build(bld): private_library = True 
if not bld.CONFIG_SET('USING_SYSTEM_TDB'): + + tdb_deps = 'replace' + + if bld.CONFIG_SET('USE_TDB_MUTEX_LOCKING'): + tdb_deps += ' pthread' + bld.SAMBA_LIBRARY('tdb', COMMON_SRC, - deps='replace', + deps=tdb_deps, includes='include', abi_directory='ABI', abi_match='tdb_*', @@ -137,7 +159,7 @@ def build(bld): # FIXME: This hardcoded list is stupid, stupid, stupid. bld.SAMBA_SUBSYSTEM('tdb-test-helpers', 'test/external-agent.c test/lock-tracking.c test/logging.c', - 'replace', + tdb_deps, includes='include') for t in tdb1_unit_tests: -- cgit
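The tdb.h documentation added above ties TDB_MUTEX_LOCKING to TDB_CLEAR_IF_FIRST and to a successful tdb_runtime_check_for_robust_mutexes() call. The following is a minimal caller-side sketch of that pattern; the database name "example.tdb", the hash size of 0 (meaning the default) and the open mode are illustrative only, not part of the patch.

#include <fcntl.h>
#include <stdio.h>
#include "tdb.h"

int main(void)
{
	struct tdb_context *tdb;
	int tdb_flags = TDB_CLEAR_IF_FIRST;

	/*
	 * TDB_MUTEX_LOCKING is only valid together with TDB_CLEAR_IF_FIRST
	 * and only after the runtime check has confirmed that robust
	 * mutexes actually work on this system.
	 */
	if (tdb_runtime_check_for_robust_mutexes()) {
		tdb_flags |= TDB_MUTEX_LOCKING;
	}

	/* Illustrative name, hash size and mode. */
	tdb = tdb_open("example.tdb", 0, tdb_flags, O_RDWR | O_CREAT, 0600);
	if (tdb == NULL) {
		perror("tdb_open");
		return 1;
	}

	tdb_close(tdb);
	return 0;
}

If the runtime check fails, the open simply falls back to the usual fcntl-based locking; nothing else in the caller has to change.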
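The wscript changes above define USE_TDB_MUTEX_LOCKING only when configure detects HAVE_ROBUST_MUTEXES, tdb is built standalone and --disable-tdb-mutex-locking was not given, and they add pthread to the library's dependencies in that case. The sketch below shows the kind of guarded code this define enables; example_init_robust_mutex() is a hypothetical helper for illustration, not the actual mutex.c implementation.

#ifdef USE_TDB_MUTEX_LOCKING
#include <pthread.h>

/* Initialize a process-shared, robust mutex (simplified error handling). */
static int example_init_robust_mutex(pthread_mutex_t *m)
{
	pthread_mutexattr_t ma;
	int ret;

	ret = pthread_mutexattr_init(&ma);
	if (ret != 0) {
		return ret;
	}
	pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
	pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
	ret = pthread_mutex_init(m, &ma);
	pthread_mutexattr_destroy(&ma);
	return ret;
}
#else
/* Without robust mutex support, locking stays purely fcntl-based. */
#endif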