diff options
| author | Ronnie Sahlberg <ronniesahlberg@gmail.com> | 2010-10-08 12:49:08 +1100 |
|---|---|---|
| committer | Ronnie Sahlberg <ronniesahlberg@gmail.com> | 2010-10-08 12:49:08 +1100 |
| commit | 8bdaa7d41f4aad5f5f245b364816aba6f52c0304 (patch) | |
| tree | 6e7ec54ab2d3e76ae7330f0209ef5d43a08b0b75 /ctdb/lib/tdb/common | |
| parent | 7253342a70b145d0a3689e898acc485859fc5f8b (diff) | |
| parent | c333126496954b4a7a829f3d8fac5190b5b8e816 (diff) | |
| download | samba-8bdaa7d41f4aad5f5f245b364816aba6f52c0304.tar.gz samba-8bdaa7d41f4aad5f5f245b364816aba6f52c0304.tar.xz samba-8bdaa7d41f4aad5f5f245b364816aba6f52c0304.zip | |
Merge commit 'rusty/tdb-update'
(This used to be ctdb commit 23510bf858c06a3710d1cc741d32bad3675fd97e)
Diffstat (limited to 'ctdb/lib/tdb/common')
| -rw-r--r-- | ctdb/lib/tdb/common/check.c | 73 | ||||
| -rw-r--r-- | ctdb/lib/tdb/common/hash.c | 380 | ||||
| -rw-r--r-- | ctdb/lib/tdb/common/open.c | 95 | ||||
| -rw-r--r-- | ctdb/lib/tdb/common/tdb_private.h | 8 |
4 files changed, 531 insertions, 25 deletions
diff --git a/ctdb/lib/tdb/common/check.c b/ctdb/lib/tdb/common/check.c index 2c640434ee..58c9c26540 100644 --- a/ctdb/lib/tdb/common/check.c +++ b/ctdb/lib/tdb/common/check.c @@ -28,8 +28,9 @@ static bool tdb_check_header(struct tdb_context *tdb, tdb_off_t *recovery) { struct tdb_header hdr; + uint32_t h1, h2; - if (tdb->methods->tdb_read(tdb, 0, &hdr, sizeof(hdr), DOCONV()) == -1) + if (tdb->methods->tdb_read(tdb, 0, &hdr, sizeof(hdr), 0) == -1) return false; if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) goto corrupt; @@ -38,7 +39,12 @@ static bool tdb_check_header(struct tdb_context *tdb, tdb_off_t *recovery) if (hdr.version != TDB_VERSION) goto corrupt; - if (hdr.rwlocks != 0) + if (hdr.rwlocks != 0 && hdr.rwlocks != TDB_HASH_RWLOCK_MAGIC) + goto corrupt; + + tdb_header_hash(tdb, &h1, &h2); + if (hdr.magic1_hash && hdr.magic2_hash && + (hdr.magic1_hash != h1 || hdr.magic2_hash != h2)) goto corrupt; if (hdr.hash_size == 0) @@ -301,6 +307,21 @@ static bool tdb_check_free_record(struct tdb_context *tdb, return true; } +/* Slow, but should be very rare. */ +static size_t dead_space(struct tdb_context *tdb, tdb_off_t off) +{ + size_t len; + + for (len = 0; off + len < tdb->map_size; len++) { + char c; + if (tdb->methods->tdb_read(tdb, off, &c, 1, 0)) + return 0; + if (c != 0 && c != 0x42) + break; + } + return len; +} + int tdb_check(struct tdb_context *tdb, int (*check)(TDB_DATA key, TDB_DATA data, void *private_data), void *private_data) @@ -310,9 +331,18 @@ int tdb_check(struct tdb_context *tdb, tdb_off_t off, recovery_start; struct tdb_record rec; bool found_recovery = false; - - if (tdb_lockall_read(tdb) == -1) - return -1; + tdb_len_t dead; + bool locked; + + /* Read-only databases use no locking at all: it's best-effort. + * We may have a write lock already, so skip that case too. 
*/ + if (tdb->read_only || tdb->allrecord_lock.count != 0) { + locked = false; + } else { + if (tdb_lockall_read(tdb) == -1) + return -1; + locked = true; + } /* Make sure we know true size of the underlying file. */ tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1); @@ -369,8 +399,23 @@ int tdb_check(struct tdb_context *tdb, if (!tdb_check_free_record(tdb, off, &rec, hashes)) goto free; break; - case TDB_RECOVERY_MAGIC: + /* If we crash after ftruncate, we can get zeroes or fill. */ case TDB_RECOVERY_INVALID_MAGIC: + case 0x42424242: + if (recovery_start == off) { + found_recovery = true; + break; + } + dead = dead_space(tdb, off); + if (dead < sizeof(rec)) + goto corrupt; + + TDB_LOG((tdb, TDB_DEBUG_ERROR, + "Dead space at %d-%d (of %u)\n", + off, off + dead, tdb->map_size)); + rec.rec_len = dead - sizeof(rec); + break; + case TDB_RECOVERY_MAGIC: if (recovery_start != off) { TDB_LOG((tdb, TDB_DEBUG_ERROR, "Unexpected recovery record at offset %d\n", @@ -379,7 +424,8 @@ int tdb_check(struct tdb_context *tdb, } found_recovery = true; break; - default: + default: ; + corrupt: tdb->ecode = TDB_ERR_CORRUPT; TDB_LOG((tdb, TDB_DEBUG_ERROR, "Bad magic 0x%x at offset %d\n", @@ -405,19 +451,22 @@ int tdb_check(struct tdb_context *tdb, /* We must have found recovery area if there was one. */ if (recovery_start != 0 && !found_recovery) { TDB_LOG((tdb, TDB_DEBUG_ERROR, - "Expected %s recovery area, got %s\n", - recovery_start ? "a" : "no", - found_recovery ? "one" : "none")); + "Expected a recovery area at %u\n", + recovery_start)); goto free; } free(hashes); - tdb_unlockall_read(tdb); + if (locked) { + tdb_unlockall_read(tdb); + } return 0; free: free(hashes); unlock: - tdb_unlockall_read(tdb); + if (locked) { + tdb_unlockall_read(tdb); + } return -1; } diff --git a/ctdb/lib/tdb/common/hash.c b/ctdb/lib/tdb/common/hash.c new file mode 100644 index 0000000000..c07297ec19 --- /dev/null +++ b/ctdb/lib/tdb/common/hash.c @@ -0,0 +1,380 @@ + /* + Unix SMB/CIFS implementation. 
+ + trivial database library + + Copyright (C) Rusty Russell 2010 + + ** NOTE! The following LGPL license applies to the tdb + ** library. This does NOT imply that all of Samba is released + ** under the LGPL + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 3 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, see <http://www.gnu.org/licenses/>. +*/ +#include "tdb_private.h" + +/* This is based on the hash algorithm from gdbm */ +unsigned int tdb_old_hash(TDB_DATA *key) +{ + uint32_t value; /* Used to compute the hash value. */ + uint32_t i; /* Used to cycle through random values. */ + + /* Set the initial value from the key size. */ + for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++) + value = (value + (key->dptr[i] << (i*5 % 24))); + + return (1103515243 * value + 12345); +} + +#ifndef WORDS_BIGENDIAN +# define HASH_LITTLE_ENDIAN 1 +# define HASH_BIG_ENDIAN 0 +#else +# define HASH_LITTLE_ENDIAN 0 +# define HASH_BIG_ENDIAN 1 +#endif + +/* +------------------------------------------------------------------------------- +lookup3.c, by Bob Jenkins, May 2006, Public Domain. + +These are functions for producing 32-bit hashes for hash table lookup. +hash_word(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() +are externally useful functions. Routines to test the hash are included +if SELF_TEST is defined. You can use this free for any purpose. It's in +the public domain. It has no warranty. + +You probably want to use hashlittle(). 
hashlittle() and hashbig() +hash byte arrays. hashlittle() is faster than hashbig() on +little-endian machines. Intel and AMD are little-endian machines. +On second thought, you probably want hashlittle2(), which is identical to +hashlittle() except it returns two 32-bit hashes for the price of one. +You could implement hashbig2() if you wanted but I haven't bothered here. + +If you want to find a hash of, say, exactly 7 integers, do + a = i1; b = i2; c = i3; + mix(a,b,c); + a += i4; b += i5; c += i6; + mix(a,b,c); + a += i7; + final(a,b,c); +then use c as the hash value. If you have a variable length array of +4-byte integers to hash, use hash_word(). If you have a byte array (like +a character string), use hashlittle(). If you have several byte arrays, or +a mix of things, see the comments above hashlittle(). + +Why is this so big? I read 12 bytes at a time into 3 4-byte integers, +then mix those integers. This is fast (you can do a lot more thorough +mixing with 12*3 instructions on 3 integers than you can with 3 instructions +on 1 byte), but shoehorning those bytes into integers efficiently is messy. +*/ + +#define hashsize(n) ((uint32_t)1<<(n)) +#define hashmask(n) (hashsize(n)-1) +#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k)))) + +/* +------------------------------------------------------------------------------- +mix -- mix 3 32-bit values reversibly. + +This is reversible, so any information in (a,b,c) before mix() is +still in (a,b,c) after mix(). + +If four pairs of (a,b,c) inputs are run through mix(), or through +mix() in reverse, there are at least 32 bits of the output that +are sometimes the same for one pair and different for another pair. +This was tested for: +* pairs that differed by one bit, by two bits, in any combination + of top bits of (a,b,c), or in any combination of bottom bits of + (a,b,c). +* "differ" is defined as +, -, ^, or ~^. 
For + and -, I transformed + the output delta to a Gray code (a^(a>>1)) so a string of 1's (as + is commonly produced by subtraction) look like a single 1-bit + difference. +* the base values were pseudorandom, all zero but one bit set, or + all zero plus a counter that starts at zero. + +Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that +satisfy this are + 4 6 8 16 19 4 + 9 15 3 18 27 15 + 14 9 3 7 17 3 +Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing +for "differ" defined as + with a one-bit base and a two-bit delta. I +used http://burtleburtle.net/bob/hash/avalanche.html to choose +the operations, constants, and arrangements of the variables. + +This does not achieve avalanche. There are input bits of (a,b,c) +that fail to affect some output bits of (a,b,c), especially of a. The +most thoroughly mixed value is c, but it doesn't really even achieve +avalanche in c. + +This allows some parallelism. Read-after-writes are good at doubling +the number of bits affected, so the goal of mixing pulls in the opposite +direction as the goal of parallelism. I did what I could. Rotates +seem to cost as much as shifts on every machine I could lay my hands +on, and rotates are much kinder to the top and bottom bits, so I used +rotates. +------------------------------------------------------------------------------- +*/ +#define mix(a,b,c) \ +{ \ + a -= c; a ^= rot(c, 4); c += b; \ + b -= a; b ^= rot(a, 6); a += c; \ + c -= b; c ^= rot(b, 8); b += a; \ + a -= c; a ^= rot(c,16); c += b; \ + b -= a; b ^= rot(a,19); a += c; \ + c -= b; c ^= rot(b, 4); b += a; \ +} + +/* +------------------------------------------------------------------------------- +final -- final mixing of 3 32-bit values (a,b,c) into c + +Pairs of (a,b,c) values differing in only a few bits will usually +produce values of c that look totally different. 
This was tested for +* pairs that differed by one bit, by two bits, in any combination + of top bits of (a,b,c), or in any combination of bottom bits of + (a,b,c). +* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed + the output delta to a Gray code (a^(a>>1)) so a string of 1's (as + is commonly produced by subtraction) look like a single 1-bit + difference. +* the base values were pseudorandom, all zero but one bit set, or + all zero plus a counter that starts at zero. + +These constants passed: + 14 11 25 16 4 14 24 + 12 14 25 16 4 14 24 +and these came close: + 4 8 15 26 3 22 24 + 10 8 15 26 3 22 24 + 11 8 15 26 3 22 24 +------------------------------------------------------------------------------- +*/ +#define final(a,b,c) \ +{ \ + c ^= b; c -= rot(b,14); \ + a ^= c; a -= rot(c,11); \ + b ^= a; b -= rot(a,25); \ + c ^= b; c -= rot(b,16); \ + a ^= c; a -= rot(c,4); \ + b ^= a; b -= rot(a,14); \ + c ^= b; c -= rot(b,24); \ +} + + +/* +------------------------------------------------------------------------------- +hashlittle() -- hash a variable-length key into a 32-bit value + k : the key (the unaligned variable-length array of bytes) + length : the length of the key, counting by bytes + val2 : IN: can be any 4-byte value OUT: second 32 bit hash. +Returns a 32-bit value. Every bit of the key affects every bit of +the return value. Two keys differing by one or two bits will have +totally different hash values. Note that the return value is better +mixed than val2, so use that first. + +The best hash table sizes are powers of 2. There is no need to do +mod a prime (mod is sooo slow!). If you need less than 32 bits, +use a bitmask. For example, if you need only 10 bits, do + h = (h & hashmask(10)); +In which case, the hash table should have hashsize(10) elements. + +If you are hashing n strings (uint8_t **)k, do it like this: + for (i=0, h=0; i<n; ++i) h = hashlittle( k[i], len[i], h); + +By Bob Jenkins, 2006. bob_jenkins@burtleburtle.net. 
You may use this +code any way you wish, private, educational, or commercial. It's free. + +Use for hash table lookup, or anything where one collision in 2^^32 is +acceptable. Do NOT use for cryptographic purposes. +------------------------------------------------------------------------------- +*/ + +static uint32_t hashlittle( const void *key, size_t length ) +{ + uint32_t a,b,c; /* internal state */ + union { const void *ptr; size_t i; } u; /* needed for Mac Powerbook G4 */ + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + ((uint32_t)length); + + u.ptr = key; + if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) { + const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */ +#ifdef VALGRIND + const uint8_t *k8; +#endif + + /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ + while (length > 12) + { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a,b,c); + length -= 12; + k += 3; + } + + /*----------------------------- handle the last (probably partial) block */ + /* + * "k[2]&0xffffff" actually reads beyond the end of the string, but + * then masks off the part it's not allowed to read. Because the + * string is aligned, the masked-off tail is in the same word as the + * rest of the string. Every machine with memory protection I've seen + * does it on word boundaries, so is OK with this. But VALGRIND will + * still catch it and complain. The masking trick does make the hash + * noticeably faster for short strings (like English words). 
+ */ +#ifndef VALGRIND + + switch(length) + { + case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; + case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break; + case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break; + case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break; + case 8 : b+=k[1]; a+=k[0]; break; + case 7 : b+=k[1]&0xffffff; a+=k[0]; break; + case 6 : b+=k[1]&0xffff; a+=k[0]; break; + case 5 : b+=k[1]&0xff; a+=k[0]; break; + case 4 : a+=k[0]; break; + case 3 : a+=k[0]&0xffffff; break; + case 2 : a+=k[0]&0xffff; break; + case 1 : a+=k[0]&0xff; break; + case 0 : return c; /* zero length strings require no mixing */ + } + +#else /* make valgrind happy */ + + k8 = (const uint8_t *)k; + switch(length) + { + case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; + case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ + case 10: c+=((uint32_t)k8[9])<<8; /* fall through */ + case 9 : c+=k8[8]; /* fall through */ + case 8 : b+=k[1]; a+=k[0]; break; + case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ + case 6 : b+=((uint32_t)k8[5])<<8; /* fall through */ + case 5 : b+=k8[4]; /* fall through */ + case 4 : a+=k[0]; break; + case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ + case 2 : a+=((uint32_t)k8[1])<<8; /* fall through */ + case 1 : a+=k8[0]; break; + case 0 : return c; + } + +#endif /* !valgrind */ + + } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) { + const uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */ + const uint8_t *k8; + + /*--------------- all but last block: aligned reads and different mixing */ + while (length > 12) + { + a += k[0] + (((uint32_t)k[1])<<16); + b += k[2] + (((uint32_t)k[3])<<16); + c += k[4] + (((uint32_t)k[5])<<16); + mix(a,b,c); + length -= 12; + k += 6; + } + + /*----------------------------- handle the last (probably partial) block */ + k8 = (const uint8_t *)k; + switch(length) + { + case 12: c+=k[4]+(((uint32_t)k[5])<<16); + b+=k[2]+(((uint32_t)k[3])<<16); + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 11: c+=((uint32_t)k8[10])<<16; /* 
fall through */ + case 10: c+=k[4]; + b+=k[2]+(((uint32_t)k[3])<<16); + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 9 : c+=k8[8]; /* fall through */ + case 8 : b+=k[2]+(((uint32_t)k[3])<<16); + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ + case 6 : b+=k[2]; + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 5 : b+=k8[4]; /* fall through */ + case 4 : a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ + case 2 : a+=k[0]; + break; + case 1 : a+=k8[0]; + break; + case 0 : return c; /* zero length requires no mixing */ + } + + } else { /* need to read the key one byte at a time */ + const uint8_t *k = (const uint8_t *)key; + + /*--------------- all but the last block: affect some 32 bits of (a,b,c) */ + while (length > 12) + { + a += k[0]; + a += ((uint32_t)k[1])<<8; + a += ((uint32_t)k[2])<<16; + a += ((uint32_t)k[3])<<24; + b += k[4]; + b += ((uint32_t)k[5])<<8; + b += ((uint32_t)k[6])<<16; + b += ((uint32_t)k[7])<<24; + c += k[8]; + c += ((uint32_t)k[9])<<8; + c += ((uint32_t)k[10])<<16; + c += ((uint32_t)k[11])<<24; + mix(a,b,c); + length -= 12; + k += 12; + } + + /*-------------------------------- last block: affect all 32 bits of (c) */ + switch(length) /* all the case statements fall through */ + { + case 12: c+=((uint32_t)k[11])<<24; + case 11: c+=((uint32_t)k[10])<<16; + case 10: c+=((uint32_t)k[9])<<8; + case 9 : c+=k[8]; + case 8 : b+=((uint32_t)k[7])<<24; + case 7 : b+=((uint32_t)k[6])<<16; + case 6 : b+=((uint32_t)k[5])<<8; + case 5 : b+=k[4]; + case 4 : a+=((uint32_t)k[3])<<24; + case 3 : a+=((uint32_t)k[2])<<16; + case 2 : a+=((uint32_t)k[1])<<8; + case 1 : a+=k[0]; + break; + case 0 : return c; + } + } + + final(a,b,c); + return c; +} + +unsigned int tdb_jenkins_hash(TDB_DATA *key) +{ + return hashlittle(key->dptr, key->dsize); +} diff --git a/ctdb/lib/tdb/common/open.c b/ctdb/lib/tdb/common/open.c index 7687ff6e32..66539c3f6c 100644 --- 
a/ctdb/lib/tdb/common/open.c +++ b/ctdb/lib/tdb/common/open.c @@ -30,20 +30,25 @@ /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */ static struct tdb_context *tdbs = NULL; - -/* This is based on the hash algorithm from gdbm */ -static unsigned int default_tdb_hash(TDB_DATA *key) +/* We use two hashes to double-check they're using the right hash function. */ +void tdb_header_hash(struct tdb_context *tdb, + uint32_t *magic1_hash, uint32_t *magic2_hash) { - uint32_t value; /* Used to compute the hash value. */ - uint32_t i; /* Used to cycle through random values. */ + TDB_DATA hash_key; + uint32_t tdb_magic = TDB_MAGIC; - /* Set the initial value from the key size. */ - for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++) - value = (value + (key->dptr[i] << (i*5 % 24))); + hash_key.dptr = discard_const_p(unsigned char, TDB_MAGIC_FOOD); + hash_key.dsize = sizeof(TDB_MAGIC_FOOD); + *magic1_hash = tdb->hash_fn(&hash_key); - return (1103515243 * value + 12345); -} + hash_key.dptr = (unsigned char *)CONVERT(tdb_magic); + hash_key.dsize = sizeof(tdb_magic); + *magic2_hash = tdb->hash_fn(&hash_key); + /* Make sure at least one hash is non-zero! */ + if (*magic1_hash == 0 && *magic2_hash == 0) + *magic1_hash = 1; +} /* initialise a new database with a specified hash size */ static int tdb_new_database(struct tdb_context *tdb, int hash_size) @@ -62,6 +67,14 @@ static int tdb_new_database(struct tdb_context *tdb, int hash_size) /* Fill in the header */ newdb->version = TDB_VERSION; newdb->hash_size = hash_size; + + tdb_header_hash(tdb, &newdb->magic1_hash, &newdb->magic2_hash); + + /* Make sure older tdbs (which don't check the magic hash fields) + * will refuse to open this TDB. 
*/ + if (tdb->flags & TDB_INCOMPATIBLE_HASH) + newdb->rwlocks = TDB_HASH_RWLOCK_MAGIC; + if (tdb->flags & TDB_INTERNAL) { tdb->map_size = size; tdb->map_ptr = (char *)newdb; @@ -128,6 +141,26 @@ static void null_log_fn(struct tdb_context *tdb, enum tdb_debug_level level, con { } +static bool check_header_hash(struct tdb_context *tdb, + bool default_hash, uint32_t *m1, uint32_t *m2) +{ + tdb_header_hash(tdb, m1, m2); + if (tdb->header.magic1_hash == *m1 && + tdb->header.magic2_hash == *m2) { + return true; + } + + /* If they explicitly set a hash, always respect it. */ + if (!default_hash) + return false; + + /* Otherwise, try the other inbuilt hash. */ + if (tdb->hash_fn == tdb_old_hash) + tdb->hash_fn = tdb_jenkins_hash; + else + tdb->hash_fn = tdb_old_hash; + return check_header_hash(tdb, false, m1, m2); +} struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags, int open_flags, mode_t mode, @@ -140,6 +173,8 @@ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags, unsigned char *vp; uint32_t vertest; unsigned v; + const char *hash_alg; + uint32_t magic1, magic2; if (!(tdb = (struct tdb_context *)calloc(1, sizeof *tdb))) { /* Can't log this */ @@ -161,7 +196,19 @@ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags, tdb->log.log_fn = null_log_fn; tdb->log.log_private = NULL; } - tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash; + + if (hash_fn) { + tdb->hash_fn = hash_fn; + hash_alg = "the user defined"; + } else { + /* This controls what we use when creating a tdb. 
*/ + if (tdb->flags & TDB_INCOMPATIBLE_HASH) { + tdb->hash_fn = tdb_jenkins_hash; + } else { + tdb->hash_fn = tdb_old_hash; + } + hash_alg = "either default"; + } /* cache the page size */ tdb->page_size = getpagesize(); @@ -196,6 +243,10 @@ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags, goto fail; } + if (getenv("TDB_NO_FSYNC")) { + tdb->flags |= TDB_NOSYNC; + } + /* * TDB_ALLOW_NESTING is the default behavior. * Note: this may change in future versions! @@ -274,11 +325,31 @@ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags, if (fstat(tdb->fd, &st) == -1) goto fail; - if (tdb->header.rwlocks != 0) { + if (tdb->header.rwlocks != 0 && + tdb->header.rwlocks != TDB_HASH_RWLOCK_MAGIC) { TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: spinlocks no longer supported\n")); goto fail; } + if ((tdb->header.magic1_hash == 0) && (tdb->header.magic2_hash == 0)) { + /* older TDB without magic hash references */ + tdb->hash_fn = tdb_old_hash; + } else if (!check_header_hash(tdb, !hash_fn, &magic1, &magic2)) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: " + "%s was not created with %s hash function we are using\n" + "magic1_hash[0x%08X %s 0x%08X] " + "magic2_hash[0x%08X %s 0x%08X]\n", + name, hash_alg, + tdb->header.magic1_hash, + (tdb->header.magic1_hash == magic1) ? "==" : "!=", + magic1, + tdb->header.magic2_hash, + (tdb->header.magic2_hash == magic2) ? "==" : "!=", + magic2)); + errno = EINVAL; + goto fail; + } + /* Is it already in the open list? If so, fail. 
*/ if (tdb_already_open(st.st_dev, st.st_ino)) { TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: " diff --git a/ctdb/lib/tdb/common/tdb_private.h b/ctdb/lib/tdb/common/tdb_private.h index 9d0f3bcd70..0c621636fa 100644 --- a/ctdb/lib/tdb/common/tdb_private.h +++ b/ctdb/lib/tdb/common/tdb_private.h @@ -50,6 +50,7 @@ typedef uint32_t tdb_off_t; #define TDB_DEAD_MAGIC (0xFEE1DEAD) #define TDB_RECOVERY_MAGIC (0xf53bc0e7U) #define TDB_RECOVERY_INVALID_MAGIC (0x0) +#define TDB_HASH_RWLOCK_MAGIC (0xbad1a51U) #define TDB_ALIGNMENT 4 #define DEFAULT_HASH_SIZE 131 #define FREELIST_TOP (sizeof(struct tdb_header)) @@ -147,7 +148,9 @@ struct tdb_header { tdb_off_t rwlocks; /* obsolete - kept to detect old formats */ tdb_off_t recovery_start; /* offset of transaction recovery region */ tdb_off_t sequence_number; /* used when TDB_SEQNUM is set */ - tdb_off_t reserved[29]; + uint32_t magic1_hash; /* hash of TDB_MAGIC_FOOD. */ + uint32_t magic2_hash; /* hash of TDB_MAGIC. */ + tdb_off_t reserved[27]; }; struct tdb_lock_type { @@ -268,3 +271,6 @@ int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off, struct tdb_record *rec); bool tdb_write_all(int fd, const void *buf, size_t count); int tdb_transaction_recover(struct tdb_context *tdb); +void tdb_header_hash(struct tdb_context *tdb, + uint32_t *magic1_hash, uint32_t *magic2_hash); +unsigned int tdb_old_hash(TDB_DATA *key); |
