diff options
author | Volker Lendecke <vl@samba.org> | 2014-03-18 08:04:42 +0100 |
---|---|---|
committer | Michael Adam <obnox@samba.org> | 2014-03-18 13:42:10 +0100 |
commit | 3034a5a62b8eaac7bdc52bfb7af1beb29e888b34 (patch) | |
tree | a45c3019d6e2d34f898332bc623b4a5d38602266 | |
parent | 1461362e936e5beebeaae1555cf96f6731287c35 (diff) | |
download | samba-3034a5a62b8eaac7bdc52bfb7af1beb29e888b34.tar.gz samba-3034a5a62b8eaac7bdc52bfb7af1beb29e888b34.tar.xz samba-3034a5a62b8eaac7bdc52bfb7af1beb29e888b34.zip |
tdb: Reduce freelist contention
In a metadata-intensive benchmark we have seen the locking.tdb freelist to be
one of the central contention points. This patch removes most of the contention
on the freelist. Ages ago we already reduced freelist contention by using the
even much older DEAD records: If TDB_VOLATILE is set, don't directly put
deleted records on the freelist, but just mark a few of them just as DEAD. The
next new record can them re-use that space without consulting the freelist.
This patch builds upon the DEAD records: If we need space and the freelist is
busy, instead of doing a blocking wait on the freelist, start looking into
other chains for DEAD records and steal them from there. This way every hash
chain becomes a small freelist. Just wander around the hash chains as long as
the freelist is still busy.
With this patch and the tdb mutex patch (following hopefully some time soon)
you can see a heavily busy clustered smbd run without locking.tdb futex
syscalls.
Signed-off-by: Volker Lendecke <vl@samba.org>
Reviewed-by: Michael Adam <obnox@samba.org>
Reviewed-by: Stefan Metzmacher <metze@samba.org>
-rw-r--r-- | lib/tdb/common/freelist.c | 100 | ||||
-rw-r--r-- | lib/tdb/common/tdb.c | 20 | ||||
-rw-r--r-- | lib/tdb/common/tdb_private.h | 3 |
3 files changed, 93 insertions, 30 deletions
diff --git a/lib/tdb/common/freelist.c b/lib/tdb/common/freelist.c index ea14dd02ab9..6f8f812d416 100644 --- a/lib/tdb/common/freelist.c +++ b/lib/tdb/common/freelist.c @@ -273,7 +273,8 @@ static tdb_off_t tdb_allocate_ofs(struct tdb_context *tdb, 0 is returned if the space could not be allocated */ -tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_record *rec) +static tdb_off_t tdb_allocate_from_freelist( + struct tdb_context *tdb, tdb_len_t length, struct tdb_record *rec) { tdb_off_t rec_ptr, last_ptr, newrec_ptr; struct { @@ -282,9 +283,6 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_rec } bestfit; float multiplier = 1.0; - if (tdb_lock(tdb, -1, F_WRLCK) == -1) - return 0; - /* over-allocate to reduce fragmentation */ length *= 1.25; @@ -297,7 +295,7 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_rec /* read in the freelist top */ if (tdb_ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1) - goto fail; + return 0; bestfit.rec_ptr = 0; bestfit.last_ptr = 0; @@ -310,7 +308,7 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_rec */ while (rec_ptr) { if (tdb_rec_free_read(tdb, rec_ptr, rec) == -1) { - goto fail; + return 0; } if (rec->rec_len >= length) { @@ -344,12 +342,11 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_rec if (bestfit.rec_ptr != 0) { if (tdb_rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) { - goto fail; + return 0; } newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr, rec, bestfit.last_ptr); - tdb_unlock(tdb, -1, F_WRLCK); return newrec_ptr; } @@ -357,12 +354,95 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_rec database and if we can then try again */ if (tdb_expand(tdb, length + sizeof(*rec)) == 0) goto again; - fail: - tdb_unlock(tdb, -1, F_WRLCK); + return 0; } +static bool tdb_alloc_dead( + struct tdb_context *tdb, int hash, tdb_len_t length, + tdb_off_t *rec_ptr, struct tdb_record *rec) +{ + tdb_off_t last_ptr; + + *rec_ptr = tdb_find_dead(tdb, hash, rec, length, &last_ptr); + if (*rec_ptr == 0) { + return false; + } + /* + * Unlink the record from the hash chain, it's about to be moved into + * another one. + */ + return (tdb_ofs_write(tdb, last_ptr, &rec->next) == 0); +} + +/* + * Chain "hash" is assumed to be locked + */ + +tdb_off_t tdb_allocate(struct tdb_context *tdb, int hash, tdb_len_t length, + struct tdb_record *rec) +{ + tdb_off_t ret; + int i; + + if (tdb->max_dead_records == 0) { + /* + * No dead records to expect anywhere. Do the blocking + * freelist lock without trying to steal from others + */ + goto blocking_freelist_allocate; + } + + /* + * The following loop tries to get the freelist lock nonblocking. If + * it gets the lock, allocate from there. If the freelist is busy, + * instead of waiting we try to steal dead records from other hash + * chains. + * + * Be aware that we do nonblocking locks on the other hash chains as + * well and fail gracefully. This way we avoid deadlocks (we block two + * hash chains, something which is pretty bad normally) + */ + + for (i=1; i<tdb->hash_size; i++) { + + int list; + + if (tdb_lock_nonblock(tdb, -1, F_WRLCK) == 0) { + /* + * Under the freelist lock take the chance to give + * back our dead records. + */ + tdb_purge_dead(tdb, hash); + + ret = tdb_allocate_from_freelist(tdb, length, rec); + tdb_unlock(tdb, -1, F_WRLCK); + return ret; + } + + list = BUCKET(hash+i); + + if (tdb_lock_nonblock(tdb, list, F_WRLCK) == 0) { + bool got_dead; + got_dead = tdb_alloc_dead(tdb, list, length, &ret, rec); + tdb_unlock(tdb, list, F_WRLCK); + + if (got_dead) { + return ret; + } + } + } + +blocking_freelist_allocate: + + if (tdb_lock(tdb, -1, F_WRLCK) == -1) { + return 0; + } + ret = tdb_allocate_from_freelist(tdb, length, rec); + tdb_unlock(tdb, -1, F_WRLCK); + return ret; +} /* return the size of the freelist - used to decide if we should repack diff --git a/lib/tdb/common/tdb.c b/lib/tdb/common/tdb.c index a7111dea96d..ba1c98edbe6 100644 --- a/lib/tdb/common/tdb.c +++ b/lib/tdb/common/tdb.c @@ -550,26 +550,8 @@ static int _tdb_store(struct tdb_context *tdb, TDB_DATA key, } } - /* - * We have to allocate some space from the freelist, so this means we - * have to lock it. Use the chance to purge all the DEAD records from - * the hash chain under the freelist lock. - */ - - if (tdb_lock(tdb, -1, F_WRLCK) == -1) { - goto fail; - } - - if ((tdb->max_dead_records != 0) - && (tdb_purge_dead(tdb, hash) == -1)) { - tdb_unlock(tdb, -1, F_WRLCK); - goto fail; - } - /* we have to allocate some space */ - rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec); - - tdb_unlock(tdb, -1, F_WRLCK); + rec_ptr = tdb_allocate(tdb, hash, key.dsize + dbuf.dsize, &rec); if (rec_ptr == 0) { goto fail; diff --git a/lib/tdb/common/tdb_private.h b/lib/tdb/common/tdb_private.h index f62c0a3a6f3..a6721595782 100644 --- a/lib/tdb/common/tdb_private.h +++ b/lib/tdb/common/tdb_private.h @@ -255,7 +255,8 @@ int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d); int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d); void *tdb_convert(void *buf, uint32_t size); int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec); -tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_record *rec); +tdb_off_t tdb_allocate(struct tdb_context *tdb, int hash, tdb_len_t length, + struct tdb_record *rec); int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d); int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d); int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off); |