Merge branch 'master-readonly-records' into foo

Conflicts: Makefile.in tools/ctdb.c (This used to be ctdb commit 0fedef0ffba4178126eee9544c5e2db52f5db893)
author: Ronnie Sahlberg <ronniesahlberg@gmail.com> 2011-09-12 09:34:34 +1000
committer: Ronnie Sahlberg <ronniesahlberg@gmail.com> 2011-09-12 09:34:34 +1000
commit: 0dc5584101e61eeadf908d3340c2ef2fecd4cc22 (patch)
tree: fbd9296e38e71309d80c9ddf9abcc58aae9d9c4e
parent: d78b0ff985c7a389ab4678fef4c2cc30cd278f42 (diff)
parent: 01388c4414fcd976581f661cbe764fa0f984b293 (diff)
download: samba-0dc5584101e61eeadf908d3340c2ef2fecd4cc22.tar.gz
samba-0dc5584101e61eeadf908d3340c2ef2fecd4cc22.tar.xz
samba-0dc5584101e61eeadf908d3340c2ef2fecd4cc22.zip
19 files changed, 2092 insertions, 38 deletions
diff --git a/ctdb/Makefile.in b/ctdb/Makefile.in
index 01483d0b77..7ec430c164 100755
--- a/ctdb/Makefile.in
+++ b/ctdb/Makefile.in
@@ -71,10 +71,12 @@ CTDB_SERVER_OBJ = server/ctdbd.o server/ctdb_daemon.o server/ctdb_lockwait.o \
 	$(CTDB_CLIENT_OBJ) $(CTDB_TCP_OBJ) @INFINIBAND_WRAPPER_OBJ@
 
 TEST_BINS=tests/bin/ctdb_bench tests/bin/ctdb_fetch tests/bin/ctdb_fetch_one \
-	tests/bin/ctdb_fetch_lock_once tests/bin/ctdb_store \
+	tests/bin/ctdb_fetch_lock_once \
+	tests/bin/ctdb_fetch_readonly_once tests/bin/ctdb_fetch_readonly_loop \
+	tests/bin/ctdb_store tests/bin/ctdb_trackingdb_test \
 	tests/bin/ctdb_randrec tests/bin/ctdb_persistent \
 	tests/bin/ctdb_traverse tests/bin/rb_test tests/bin/ctdb_transaction \
-	tests/bin/ctdb_takeover_tests
+	tests/bin/ctdb_takeover_tests tests/bin/ctdb_update_record \
 	@INFINIBAND_BINS@
 
 BINS = bin/ctdb @CTDB_SCSI_IO@ bin/smnotify bin/ping_pong bin/ltdbtool @CTDB_PMDA@
@@ -175,10 +177,26 @@ tests/bin/ctdb_fetch_one: $(CTDB_CLIENT_OBJ) tests/src/ctdb_fetch_one.o
 	@echo Linking $@
 	@$(CC) $(CFLAGS) -o $@ tests/src/ctdb_fetch_one.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
 
-tests/bin/ctdb_fetch_lock_once: libctdb/libctdb.a tests/src/ctdb_fetch_lock_once.o @TDB_OBJ@ @POPT_OBJ@
+tests/bin/ctdb_fetch_lock_once: libctdb/libctdb.a tests/src/ctdb_fetch_lock_once.o 
 	@echo Linking $@
 	@$(CC) $(CFLAGS) -o $@ tests/src/ctdb_fetch_lock_once.o @TDB_OBJ@ @POPT_OBJ@ libctdb/libctdb.a $(LIB_FLAGS)
 
+tests/bin/ctdb_fetch_readonly_once: $(CTDB_CLIENT_OBJ) tests/src/ctdb_fetch_readonly_once.o
+	@echo Linking $@
+	@$(CC) $(CFLAGS) -o $@ tests/src/ctdb_fetch_readonly_once.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_fetch_readonly_loop: $(CTDB_CLIENT_OBJ) tests/src/ctdb_fetch_readonly_loop.o
+	@echo Linking $@
+	@$(CC) $(CFLAGS) -o $@ tests/src/ctdb_fetch_readonly_loop.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_trackingdb_test: $(CTDB_CLIENT_OBJ) tests/src/ctdb_trackingdb_test.o
+	@echo Linking $@
+	@$(CC) $(CFLAGS) -o $@ tests/src/ctdb_trackingdb_test.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_update_record: $(CTDB_CLIENT_OBJ) tests/src/ctdb_update_record.o 
+	@echo Linking $@
+	@$(CC) $(CFLAGS) -o $@ tests/src/ctdb_update_record.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
 tests/bin/ctdb_store: $(CTDB_CLIENT_OBJ) tests/src/ctdb_store.o 
 	@echo Linking $@
 	@$(CC) $(CFLAGS) -o $@ tests/src/ctdb_store.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
index 0828e989d2..89eeb4836a 100644
--- a/ctdb/client/ctdb_client.c
+++ b/ctdb/client/ctdb_client.c
@@ -72,7 +72,7 @@ struct ctdb_req_header *_ctdbd_allocate_pkt(struct ctdb_context *ctdb,
 */
 int ctdb_call_local(struct ctdb_db_context *ctdb_db, struct ctdb_call *call,
 		    struct ctdb_ltdb_header *header, TALLOC_CTX *mem_ctx,
-		    TDB_DATA *data)
+		    TDB_DATA *data, bool updatetdb)
 {
 	struct ctdb_call_info *c;
 	struct ctdb_registered_call *fn;
@@ -89,6 +89,7 @@ int ctdb_call_local(struct ctdb_db_context *ctdb_db, struct ctdb_call *call,
 	c->new_data = NULL;
 	c->reply_data = NULL;
 	c->status = 0;
+	c->header = header;
 
 	for (fn=ctdb_db->calls;fn;fn=fn->next) {
 		if (fn->id == call->call_id) break;
@@ -110,7 +111,7 @@ int ctdb_call_local(struct ctdb_db_context *ctdb_db, struct ctdb_call *call,
 		c->new_data = &c->record_data;
 	}
 
-	if (c->new_data) {
+	if (c->new_data && updatetdb) {
 		/* XXX check that we always have the lock here? */
 		if (ctdb_ltdb_store(ctdb_db, call->key, header, *c->new_data) != 0) {
 			ctdb_set_error(ctdb, "ctdb_call tdb_store failed\n");
@@ -345,7 +346,7 @@ int ctdb_call_recv(struct ctdb_client_call_state *state, struct ctdb_call *call)
 	call->status = state->call->status;
 	talloc_free(state);
 
-	return 0;
+	return call->status;
 }
 
 
@@ -386,7 +387,7 @@ static struct ctdb_client_call_state *ctdb_client_call_local_send(struct ctdb_db
 	*(state->call) = *call;
 	state->ctdb_db = ctdb_db;
 
-	ret = ctdb_call_local(ctdb_db, state->call, header, state, data);
+	ret = ctdb_call_local(ctdb_db, state->call, header, state, data, true);
 
 	return state;
 }
@@ -421,6 +422,10 @@ struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db,
 
 	ret = ctdb_ltdb_fetch(ctdb_db, call->key, &header, ctdb_db, &data);
 
+	if ((call->flags & CTDB_IMMEDIATE_MIGRATION) && (header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) {
+		ret = -1;
+	}
+
 	if (ret == 0 && header.dmaster == ctdb->pnn) {
 		state = ctdb_client_call_local_send(ctdb_db, call, &header, &data);
 		talloc_free(data.dptr);
@@ -584,6 +589,48 @@ static int ctdb_client_force_migration(struct ctdb_db_context *ctdb_db, TDB_DATA
 }
 
 /*
+  try to fetch a readonly copy of a record
+
+  Issues a CTDB_FETCH_WITH_HEADER_FUNC call with the CTDB_WANT_READONLY
+  flag and splits the reply into the ltdb header and the record body.
+
+  Returns 0 on success; *hdr and data->dptr are then talloc copies
+  parented to mem_ctx. Returns -1 if the call fails, the reply is too
+  short to contain a header, or a copy cannot be allocated.
+ */
+static int
+ctdb_client_fetch_readonly(struct ctdb_db_context *ctdb_db, TDB_DATA key, TALLOC_CTX *mem_ctx, struct ctdb_ltdb_header **hdr, TDB_DATA *data)
+{
+	int ret;
+
+	struct ctdb_call call;
+	ZERO_STRUCT(call);
+
+	call.call_id = CTDB_FETCH_WITH_HEADER_FUNC;
+	call.call_data.dptr = NULL;
+	call.call_data.dsize = 0;
+	call.key = key;
+	call.flags = CTDB_WANT_READONLY;
+	ret = ctdb_call(ctdb_db, &call);
+
+	if (ret != 0) {
+		return -1;
+	}
+	if (call.reply_data.dsize < sizeof(struct ctdb_ltdb_header)) {
+		/* reply cannot hold a header; release it instead of leaking it */
+		talloc_free(call.reply_data.dptr);
+		return -1;
+	}
+
+	*hdr = talloc_memdup(mem_ctx, &call.reply_data.dptr[0], sizeof(struct ctdb_ltdb_header));
+	if (*hdr == NULL) {
+		talloc_free(call.reply_data.dptr);
+		return -1;
+	}
+
+	data->dsize = call.reply_data.dsize - sizeof(struct ctdb_ltdb_header);
+	data->dptr  = talloc_memdup(mem_ctx, &call.reply_data.dptr[sizeof(struct ctdb_ltdb_header)], data->dsize);
+	if (data->dptr == NULL) {
+		talloc_free(call.reply_data.dptr);
+		/* free the header copy itself, not the caller's pointer variable */
+		talloc_free(*hdr);
+		*hdr = NULL;
+		data->dsize = 0;
+		return -1;
+	}
+
+	/* header and body are both copied; the reply buffer is no longer needed */
+	talloc_free(call.reply_data.dptr);
+
+	return 0;
+}
+
+/*
   get a lock on a record, and return the records data. Blocks until it gets the lock
  */
 struct ctdb_record_handle *ctdb_fetch_lock(struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx, 
@@ -660,6 +707,185 @@ again:
 }
 
 /*
+  get a readonly lock on a record, and return the records data. Blocks until it gets the lock
+
+  read_only != 0 asks for a readonly copy of the record; read_only == 0
+  asks for the record read/write, which is only satisfied once this node
+  is dmaster and the record has no CTDB_REC_RO_HAVE_DELEGATIONS flag.
+
+  Returns a handle allocated on mem_ctx with the record chain still
+  locked, or NULL on failure. The handle stores the caller's data pointer
+  (h->data); *data is filled by ctdb_ltdb_fetch_readonly() with the handle
+  as talloc parent.
+
+  The function loops ("goto again") until one of the return conditions
+  holds, forcing a migration of the record whenever it cannot proceed.
+ */
+struct ctdb_record_handle *
+ctdb_fetch_readonly_lock(
+	struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx, 
+	TDB_DATA key, TDB_DATA *data,
+	int read_only)
+{
+	int ret;
+	struct ctdb_record_handle *h;
+	struct ctdb_ltdb_header *roheader = NULL;
+
+	h = talloc_zero(mem_ctx, struct ctdb_record_handle);
+	if (h == NULL) {
+		return NULL;
+	}
+
+	/* the handle keeps its own copy of the key bytes */
+	h->ctdb_db = ctdb_db;
+	h->key     = key;
+	h->key.dptr = talloc_memdup(h, key.dptr, key.dsize);
+	if (h->key.dptr == NULL) {
+		talloc_free(h);
+		return NULL;
+	}
+	h->data    = data;
+
+	data->dptr = NULL;
+	data->dsize = 0;
+
+
+again:
+	/* drop whatever the previous attempt fetched before retrying */
+	talloc_free(roheader);
+	roheader = NULL;
+
+	talloc_free(data->dptr);
+	data->dptr = NULL;
+	data->dsize = 0;
+
+	/* Lock the record/chain */
+	ret = ctdb_ltdb_lock(ctdb_db, key);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " failed to lock ltdb record\n"));
+		talloc_free(h);
+		return NULL;
+	}
+
+	/* NOTE(review): fetch_lock_destructor is defined outside this view;
+	   presumably it releases the chain lock when h is freed. confirm,
+	   because the error paths below unlock explicitly and then
+	   talloc_free(h) */
+	talloc_set_destructor(h, fetch_lock_destructor);
+
+	/* Check if record exists yet in the TDB */
+	ret = ctdb_ltdb_fetch_readonly(ctdb_db, key, &h->header, h, data);
+	if (ret != 0) {
+		/* no usable local record: migrate it here and start over */
+		ctdb_ltdb_unlock(ctdb_db, key);
+		ret = ctdb_client_force_migration(ctdb_db, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+			talloc_free(h);
+			return NULL;
+		}
+		goto again;
+	}
+
+	/* if this is a request for read/write and we have delegations
+	   we have to revoke all delegations first
+	*/
+	if ((read_only == 0) 
+	&&  (h->header.dmaster == ctdb_db->ctdb->pnn)
+	&&  (h->header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) {
+		ctdb_ltdb_unlock(ctdb_db, key);
+		ret = ctdb_client_force_migration(ctdb_db, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+			talloc_free(h);
+			return NULL;
+		}
+		goto again;
+	}
+
+	/* if we are dmaster, just return the handle */
+	if (h->header.dmaster == ctdb_db->ctdb->pnn) {
+		return h;
+	}
+
+	if (read_only != 0) {
+		TDB_DATA rodata = {NULL, 0};
+
+		/* the local copy is already flagged as readonly-capable: use it as is */
+		if ((h->header.flags & CTDB_REC_RO_HAVE_READONLY)
+		||  (h->header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) {
+			return h;
+		}
+
+		/* ask for a readonly copy; the chain lock is released while we wait */
+		ctdb_ltdb_unlock(ctdb_db, key);
+		ret = ctdb_client_fetch_readonly(ctdb_db, key, h, &roheader, &rodata);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,("ctdb_fetch_readonly_lock:  failed. force migration and try again\n"));
+			ret = ctdb_client_force_migration(ctdb_db, key);
+			if (ret != 0) {
+				DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+				talloc_free(h);
+				return NULL;
+			}
+
+			goto again;
+		}
+
+		/* the reply is not marked as a readonly copy: fall back to migration */
+		if (!(roheader->flags&CTDB_REC_RO_HAVE_READONLY)) {
+			ret = ctdb_client_force_migration(ctdb_db, key);
+			if (ret != 0) {
+				DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+				talloc_free(h);
+				return NULL;
+			}
+
+			goto again;
+		}
+
+		/* re-take the lock and re-read the local record, which may have
+		   changed while the chain was unlocked */
+		ret = ctdb_ltdb_lock(ctdb_db, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR, (__location__ " failed to lock ltdb record\n"));
+			talloc_free(h);
+			return NULL;
+		}
+
+		ret = ctdb_ltdb_fetch_readonly(ctdb_db, key, &h->header, h, data);
+		if (ret != 0) {
+			ctdb_ltdb_unlock(ctdb_db, key);
+
+			ret = ctdb_client_force_migration(ctdb_db, key);
+			if (ret != 0) {
+				DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+				talloc_free(h);
+				return NULL;
+			}
+
+			goto again;
+		}
+
+		/* only accept the readonly copy if its RSN is strictly newer
+		   than the RSN of the record currently stored locally */
+		if (h->header.rsn >= roheader->rsn) {
+			DEBUG(DEBUG_ERR,("READONLY RECORD: Too small RSN, migrate and try again\n"));
+			ctdb_ltdb_unlock(ctdb_db, key);
+
+			ret = ctdb_client_force_migration(ctdb_db, key);
+			if (ret != 0) {
+				DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+				talloc_free(h);
+				return NULL;
+			}
+
+			goto again;
+		}
+
+		/* write the readonly copy (header + body) into the local tdb */
+		if (ctdb_ltdb_store(ctdb_db, key, roheader, rodata) != 0) {
+			ctdb_ltdb_unlock(ctdb_db, key);
+
+			ret = ctdb_client_force_migration(ctdb_db, key);
+			if (ret != 0) {
+				DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+				talloc_free(h);
+				return NULL;
+			}
+
+			goto again;
+		}
+		/* NOTE(review): *data and h->header still hold what the local
+		   re-fetch above returned, not rodata/roheader. confirm this is
+		   what callers expect */
+		return h;
+	}
+
+	/* we are not dmaster and this was not a request for a readonly lock
+	 * so unlock the record, migrate it and try again
+	 */
+	ctdb_ltdb_unlock(ctdb_db, key);
+	ret = ctdb_client_force_migration(ctdb_db, key);
+	if (ret != 0) {
+		DEBUG(DEBUG_DEBUG,("ctdb_fetch_lock: force_migration failed\n"));
+		talloc_free(h);
+		return NULL;
+	}
+	goto again;
+}
+
+/*
   store some data to the record that was locked with ctdb_fetch_lock()
 */
 int ctdb_record_store(struct ctdb_record_handle *h, TDB_DATA data)
@@ -684,6 +910,7 @@ int ctdb_fetch(struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx,
 	call.call_id = CTDB_FETCH_FUNC;
 	call.call_data.dptr = NULL;
 	call.call_data.dsize = 0;
+	call.key = key;
 
 	ret = ctdb_call(ctdb_db, &call);
 
@@ -1694,6 +1921,27 @@ static int ctdb_fetch_func(struct ctdb_call_info *call)
 }
 
 /*
+  generic fetch call registered for every database: the reply is the
+  record's ltdb header immediately followed by the record data.
+  returns 0 on success, -1 if the reply cannot be allocated
+*/
+static int ctdb_fetch_with_header_func(struct ctdb_call_info *call)
+{
+	const size_t hdr_len = sizeof(struct ctdb_ltdb_header);
+	TDB_DATA *reply;
+
+	reply = talloc(call, TDB_DATA);
+	call->reply_data = reply;
+	if (reply == NULL) {
+		return -1;
+	}
+
+	reply->dsize = hdr_len + call->record_data.dsize;
+	reply->dptr  = talloc_size(reply, reply->dsize);
+	if (reply->dptr == NULL) {
+		return -1;
+	}
+
+	/* header first, payload directly behind it */
+	memcpy(reply->dptr, call->header, hdr_len);
+	memcpy(reply->dptr + hdr_len, call->record_data.dptr, call->record_data.dsize);
+
+	return 0;
+}
+
+/*
   attach to a specific database - client call
 */
 struct ctdb_db_context *ctdb_attach(struct ctdb_context *ctdb,
@@ -1762,6 +2010,7 @@ struct ctdb_db_context *ctdb_attach(struct ctdb_context *ctdb,
 	/* add well known functions */
 	ctdb_set_call(ctdb_db, ctdb_null_func, CTDB_NULL_FUNC);
 	ctdb_set_call(ctdb_db, ctdb_fetch_func, CTDB_FETCH_FUNC);
+	ctdb_set_call(ctdb_db, ctdb_fetch_with_header_func, CTDB_FETCH_WITH_HEADER_FUNC);
 
 	return ctdb_db;
 }
@@ -1927,6 +2176,15 @@ int ctdb_dumpdb_record(struct ctdb_context *ctdb, TDB_DATA key, TDB_DATA data, v
 
 	fprintf(f, "dmaster: %u\n", h->dmaster);
 	fprintf(f, "rsn: %llu\n", (unsigned long long)h->rsn);
+	fprintf(f, "flags: 0x%08x", h->flags);
+	/* flag names must go to f too, otherwise they end up on stdout, detached from the record */
+	if (h->flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA) fprintf(f, " MIGRATED_WITH_DATA");
+	if (h->flags & CTDB_REC_FLAG_VACUUM_MIGRATED) fprintf(f, " VACUUM_MIGRATED");
+	if (h->flags & CTDB_REC_FLAG_AUTOMATIC) fprintf(f, " AUTOMATIC");
+	if (h->flags & CTDB_REC_RO_HAVE_DELEGATIONS) fprintf(f, " RO_HAVE_DELEGATIONS");
+	if (h->flags & CTDB_REC_RO_HAVE_READONLY) fprintf(f, " RO_HAVE_READONLY");
+	if (h->flags & CTDB_REC_RO_REVOKING_READONLY) fprintf(f, " RO_REVOKING_READONLY");
+	if (h->flags & CTDB_REC_RO_REVOKE_COMPLETE) fprintf(f, " RO_REVOKE_COMPLETE");
+	fprintf(f, "\n");
 
 	fprintf(f, "data(%u) = \"", (unsigned)(data.dsize - sizeof(*h)));
 	for (i=sizeof(*h);i<data.dsize;i++) {
@@ -4272,3 +4530,112 @@ struct ctdb_ltdb_header *ctdb_header_from_record_handle(struct ctdb_record_handl
 
 	return &h->header;
 }
+
+
+/*
+  send a CTDB_CONTROL_UPDATE_RECORD control for one record (async)
+
+  The record (key, header, data) is marshalled into a temporary
+  ctdb_marshall_buffer tagged with ctdb_db's db_id and sent to destnode
+  with the given timeout. The temporary buffer is allocated on mem_ctx
+  and freed again before returning; mem_ctx is also handed to
+  ctdb_control_send().
+
+  Returns the control state from ctdb_control_send(), or NULL if the
+  buffer could not be allocated or the record could not be marshalled.
+ */
+struct ctdb_client_control_state *
+ctdb_ctrl_updaterecord_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+	struct ctdb_client_control_state *handle;
+	struct ctdb_marshall_buffer *m;
+	struct ctdb_marshall_buffer *grown;
+	struct ctdb_rec_data *rec;
+	TDB_DATA outdata;
+
+	m = talloc_zero(mem_ctx, struct ctdb_marshall_buffer);
+	if (m == NULL) {
+		DEBUG(DEBUG_ERR, ("Failed to allocate marshall buffer for update record\n"));
+		return NULL;
+	}
+
+	m->db_id = ctdb_db->db_id;
+
+	rec = ctdb_marshall_record(m, 0, key, header, data);
+	if (rec == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to marshall record for update record\n"));
+		talloc_free(m);
+		return NULL;
+	}
+
+	/* keep the old pointer until the realloc is known to have worked,
+	   so the failure path frees the real buffer instead of NULL */
+	grown = talloc_realloc_size(mem_ctx, m, rec->length + offsetof(struct ctdb_marshall_buffer, data));
+	if (grown == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata\n"));
+		talloc_free(m);
+		return NULL;
+	}
+	m = grown;
+
+	m->count++;
+	memcpy((uint8_t *)m + offsetof(struct ctdb_marshall_buffer, data), rec, rec->length);
+
+
+	outdata.dptr = (uint8_t *)m;
+	outdata.dsize = talloc_get_size(m);
+
+	handle = ctdb_control_send(ctdb, destnode, 0,
+			   CTDB_CONTROL_UPDATE_RECORD, 0, outdata,
+			   mem_ctx, &timeout, NULL);
+	talloc_free(m);
+	return handle;
+}
+
+/*
+  wait for the reply to an update-record control.
+  returns 0 only if both the control itself and its status succeeded
+ */
+int ctdb_ctrl_updaterecord_recv(struct ctdb_context *ctdb, struct ctdb_client_control_state *state)
+{
+	int32_t status;
+	int rc = ctdb_control_recv(ctdb, state, state, NULL, &status, NULL);
+
+	if (rc == 0 && status == 0) {
+		return 0;
+	}
+
+	DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_update_record_recv failed\n"));
+	return -1;
+}
+
+/*
+  synchronous update-record: send the control, then wait for its reply
+ */
+int
+ctdb_ctrl_updaterecord(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+	return ctdb_ctrl_updaterecord_recv(ctdb,
+			ctdb_ctrl_updaterecord_send(ctdb, mem_ctx, timeout, destnode, ctdb_db, key, header, data));
+}
+
+
+
+
+
+
+/*
+  send the CTDB_CONTROL_SET_DB_READONLY control for database dbid to
+  destnode (async); the payload is just the 32-bit database id
+ */
+struct ctdb_client_control_state *
+ctdb_ctrl_set_db_readonly_send(struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid)
+{
+	TDB_DATA payload;
+
+	payload.dsize = sizeof(dbid);
+	payload.dptr  = (uint8_t *)&dbid;
+
+	return ctdb_control_send(ctdb, destnode, 0,
+				 CTDB_CONTROL_SET_DB_READONLY, 0, payload,
+				 ctdb, NULL, NULL);
+}
+
+/*
+  wait for the reply to a set-db-readonly control.
+  returns 0 only if both the control itself and its status succeeded
+ */
+int ctdb_ctrl_set_db_readonly_recv(struct ctdb_context *ctdb, struct ctdb_client_control_state *state)
+{
+	int32_t status;
+	int rc = ctdb_control_recv(ctdb, state, ctdb, NULL, &status, NULL);
+
+	if (rc == 0 && status == 0) {
+		return 0;
+	}
+
+	DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_set_db_readonly_recv failed  ret:%d res:%d\n", rc, status));
+	return -1;
+}
+
+/*
+  synchronous set-db-readonly: send the control, then wait for its reply
+ */
+int ctdb_ctrl_set_db_readonly(struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid)
+{
+	return ctdb_ctrl_set_db_readonly_recv(ctdb,
+			ctdb_ctrl_set_db_readonly_send(ctdb, destnode, dbid));
+}
diff --git a/ctdb/common/ctdb_ltdb.c b/ctdb/common/ctdb_ltdb.c
index 0dc8711c7d..76274ceafb 100644
--- a/ctdb/common/ctdb_ltdb.c
+++ b/ctdb/common/ctdb_ltdb.c
@@ -2,6 +2,7 @@
    ctdb ltdb code
 
    Copyright (C) Andrew Tridgell  2006
+   Copyright (C) Ronnie Sahlberg  2011
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -118,6 +119,40 @@ int ctdb_ltdb_fetch(struct ctdb_db_context *ctdb_db,
 	return 0;
 }
 
+/*
+  fetch a record from the ltdb, separating out the header information
+  and returning the body of the record.
+
+  returns 0 on success: *header holds the record's ltdb header and, if
+  data is non-NULL, data->dptr is a talloc copy (parented to mem_ctx) of
+  the record body.
+
+  returns -1 if the record does not exist or is too short to contain an
+  ltdb header (*header is then left untouched), or if the body copy
+  cannot be allocated. on failure *data, when given, is set to {NULL, 0}
+*/
+int ctdb_ltdb_fetch_readonly(struct ctdb_db_context *ctdb_db,
+		    TDB_DATA key, struct ctdb_ltdb_header *header,
+		    TALLOC_CTX *mem_ctx, TDB_DATA *data)
+{
+	TDB_DATA rec;
+
+	rec = tdb_fetch(ctdb_db->ltdb->tdb, key);
+	if (rec.dsize < sizeof(*header)) {
+		free(rec.dptr);
+
+		/* data is optional here, exactly as in the success path below */
+		if (data) {
+			data->dsize = 0;
+			data->dptr = NULL;
+		}
+		return -1;
+	}
+
+	*header = *(struct ctdb_ltdb_header *)rec.dptr;
+	if (data) {
+		data->dsize = rec.dsize - sizeof(struct ctdb_ltdb_header);
+		data->dptr = talloc_memdup(mem_ctx,
+					   sizeof(struct ctdb_ltdb_header)+rec.dptr,
+					   data->dsize);
+		if (data->dptr == NULL) {
+			/* do not hand back a non-zero size with a NULL pointer */
+			free(rec.dptr);
+			data->dsize = 0;
+			return -1;
+		}
+	}
+
+	/* rec.dptr came from tdb_fetch(), hence free() rather than talloc_free() */
+	free(rec.dptr);
+
+	return 0;
+}
+
 
 /*
   write a record to a normal database
@@ -216,3 +251,48 @@ int ctdb_ltdb_delete(struct ctdb_db_context *ctdb_db, TDB_DATA key)
 	}
 	return 0;
 }
+
+/*
+  set the bit for node pnn in a tracking-db bitmap (bit pnn%8 of byte pnn/8)
+
+  if the bitmap in *data is too short to hold that bit, it is replaced by
+  a larger zero-filled buffer with the old contents copied in. the old
+  buffer is released with free(), so data->dptr must be NULL or malloc'ed.
+
+  returns 0 on success, -1 if the larger buffer cannot be allocated
+  (in which case *data is left unchanged)
+*/
+int ctdb_trackingdb_add_pnn(struct ctdb_context *ctdb, TDB_DATA *data, uint32_t pnn)
+{
+	int byte_pos = pnn / 8;
+	int bit_mask   = 1 << (pnn % 8);
+
+	if (byte_pos + 1 > data->dsize) {
+		char *buf;
+
+		/* calloc zero-fills; the result is checked before it is touched */
+		buf = calloc(1, byte_pos + 1);
+		if (buf == NULL) {
+			DEBUG(DEBUG_ERR, ("Out of memory when allocating buffer of %d bytes for trackingdb\n", byte_pos + 1));
+			return -1;
+		}
+		if (data->dptr != NULL) {
+			memcpy(buf, data->dptr, data->dsize);
+			free(data->dptr);
+		}
+		data->dptr  = (uint8_t *)buf;
+		data->dsize = byte_pos + 1;
+	}
+
+	data->dptr[byte_pos] |= bit_mask;
+	return 0;
+}
+
+/*
+  walk a tracking-db bitmap and invoke cb(ctdb, pnn, private_data) once
+  for every pnn whose bit is set, in ascending pnn order
+*/
+void ctdb_trackingdb_traverse(struct ctdb_context *ctdb, TDB_DATA data, ctdb_trackingdb_cb cb, void *private_data)
+{
+	int byte_idx;
+
+	for (byte_idx = 0; byte_idx < data.dsize; byte_idx++) {
+		int bit;
+
+		for (bit = 0; bit < 8; bit++) {
+			if ((data.dptr[byte_idx] & (1 << bit)) == 0) {
+				continue;
+			}
+			cb(ctdb, byte_idx * 8 + bit, private_data);
+		}
+	}
+}
+
+
diff --git a/ctdb/doc/readonlyrecords.txt b/ctdb/doc/readonlyrecords.txt
new file mode 100644
index 0000000000..c07d5830be
--- /dev/null
+++ b/ctdb/doc/readonlyrecords.txt
@@ -0,0 +1,264 @@
+Read-Only locks in CTDB
+=======================
+
+Problem
+=======
+CTDB currently only supports exclusive Read-Write locks for clients(samba) accessing the TDB databases.
+This works well mostly, but when a very large number of clients are accessing the same file at the same time,
+this will cause the exclusive lock as well as the record itself to rapidly bounce between nodes, and acts as
+a scalability limitation.
+
+This primarily affects locking.tdb and brlock.tdb, two databases where record access is mostly read, and where a read request is magnitudes more common than read-write requests.
+
+For the common case, if CTDB provided shared non-exclusive Read-Only lock semantics, this would greatly improve scaling for these workloads.
+
+
+Desired properties
+==================
+We can not make backward incompatible changes to the ctdb/ltdb header for the records.
+
+A Read-Only lock enabled ctdb daemon must be able to interoperate with a non-Read-Only lock enabled daemon.
+
+Getting a Read-Only lock should not be slower than getting a Read-Write lock.
+
+Requesting a Read-Only lock should never trigger a record migration.
+
+When revoking Read-Only locks for a record, this should involve only those nodes that hold a Read-Only lock right now and should avoid broadcasting opportunistic revocations. (must track which nodes are delegated to)
+
+When a Read-Write lock is requested, if there are Read-Only locks delegated to other nodes, the DMASTER will defer the record migration until all read-only locks are first revoked (synchronous revoke).
+
+Due to the cost that revoking Read-Only locks has on getting a Read-Write lock, the implementation should try to avoid
+creating Read-Only locks, unless it has indication that there is contention. This may mean that even if client requests a Read-Only lock we may still provide a full Read-Write lock in order to avoid the cost of revoking the locks in some cases.
+
+Read-Only locks require additional state to be stored in a separate database, containing information about which nodes have been delegated Read-Only locks. This database should be kept at minimal size.
+
+Read-Only locks should not significantly complicate the normal record/create/migration/deletion cycle for normal records.
+
+Read-Only locks should not complicate the recovery process.
+
+Read-Only locks should not complicate the vacuuming process.
+
+We should avoid forking new child processes as far as possible from the main daemon.
+
+Client-side implementation, samba, libctdb, others, should have minimal impact when Read-Only locks are implemented.
+Client-side implementation must be possible with only minor conditionals added to the existing lock-check-fetch-unlock loop that clients use today for Read-Write locks. So that clients only need one single loop that can handle both Read-Write locking as well as Read-Only locking. Clients should not need two nearly identical loops.
+
+
+Implementation
+==============
+
+Four new flags are allocated in the ctdb/ltdb record header.
+HAVE_DELEGATIONS, HAVE_READONLY_LOCK, REVOKING_READONLY and REVOKE_COMPLETE
+
+HAVE_DELEGATIONS is a flag that can only be set on the node that is currently the DMASTER for the record. When set, this
+flag indicates that there are Read-Only locks delegated to other nodes in the cluster for this record.
+
+HAVE_READONLY is a flag that is only set on nodes that are NOT the DMASTER for the record. If set this flag
+indicates that this record contains an up-to-date Read-Only version of this record.
+A client that only needs to read, but not to write, the record can safely use the content of this record as is regardless of the value of the DMASTER field of the record.
+
+REVOKING_READONLY is a flag that is used while a set of read only delegations are being revoked.
+This flag is only set when HAVE_DELEGATIONS is also set, and is cleared at the same time as HAVE_DELEGATIONS is cleared.
+Normal operations is that first the HAVE_DELEGATIONS flag is set when the first delegation is generated.
+When the delegations are about to be revoked, the REVOKING_READONLY flag is set too.
+Once all delegations are revoked, both flags are cleared at the same time.
+While REVOKING_READONLY is set, any requests for the record, either normal request or request for readonly will be deferred.
+Deferred requests are linked to a list of deferred requests for the hash of the record until the time that the revocation is completed.
+This flag is set by the main ctdb daemon when it starts revoking this record.
+
+REVOKE_COMPLETE
+The actual revoke of records is done by a child process, spawned from the ctdb main daemon when it starts the process to revoke the records.
+Once the child process has finished revoking all delegations, it will set the flag REVOKE_COMPLETE for this record to signal to the master daemon that the record has been successfully revoked.
+At this stage the child process will also trigger an event in the main daemon that revoke is complete, and that the main daemon should start re-processing all deferred calls.
+
+
+
+Once the revoke process is completed, there will be at least one deferred call to access this record: the initial call for an exclusive fetch_lock() that triggered the revoke process to be started.
+In addition to this deferred call there may also be additional requests that have also become deferred while the revoke was in process. These can be either exclusive fetch_locks() or they can be other calls to request a new readonly lock on the record.
+Once the revoke is completed, the main daemon will reprocess all exclusive fetch_lock() requests immediately and respond to the clients.
+But any requests for readonly locks will be deferred for an additional period of time before they are re-processed.
+This is to allow the client that needs a fetch_lock() to update the record to get some time to access and work on the record without having to compete with the possibly very many requests to get new readonly delegations created.
+
+
+
+
+
+
+The ctdbdb structure is expanded so that it contains one extra TDB database for each normal, non-persistent database.
+This new database is used for tracking delegations for the records. A record in the normal database that has "HAVE_DELEGATION" set will always have a corresponding record at the same key in the tracking database. This record contains the set of all nodes that the record is delegated to.
+This tracking database is lockless, using TDB_NOLOCK, and is only ever accessed by the main ctdbd daemon.
+The lockless nature and the fact that no other process ever accesses this TDB means we are guaranteed non-blocking access to records in the tracking database.
+
+The ctdb_call PDU is allocated with two new flags WANT_READONLY and WITH_HEADER.
+The first flag is used to explicitly request a read-only record from the DMASTER/LMASTER.
+The second flag is used to request that the fetch operation will return not only the data for the record but also
+the record header. 
+If the record does not yet exist, this is returned as an error to the client and the client will retry the request loop.
+
+A new control is added to make remote nodes remove the HAVE_READONLY_LOCK from a record.
+
+
+
+Client implementation
+=====================
+Clients today use a loop for record fetch lock that looks like this
+    try_again:
+        lock record in tdb
+
+        if record does not exist in tdb,
+            unlock record
+            ask ctdb to migrate record onto the node
+            goto try_again
+
+        if record dmaster != this node pnn
+            unlock record
+            ask ctdb to migrate record onto the node
+            goto try_again
+
+    finished:
+
+where we basically spin, until the record is migrated onto the node and we have managed pinning it down.
+
+This will change to instead do
+
+    try_again:
+        lock record in tdb
+
+        if record does not exist in tdb,
+            unlock record
+            ask ctdb to migrate record onto the node
+            goto try_again
+
+        if record dmaster == current node pnn
+            goto finished
+
+        if read-only lock
+            if HAVE_READONLY_LOCK or HAVE_DELEGATIONS is set
+                goto finished
+            else
+                unlock record 
+                ask ctdb for read-only copy (WANT_READONLY|WITH_HEADER)
+                if failed to get read-only copy (*A)
+                    ask ctdb to migrate the record onto the node
+                    goto try_again
+                lock record in tdb
+                if record fails RSN test
+                    unlock record
+                    ask ctdb to migrate the record onto the node
+                    goto try_again
+                write the updated record from ctdb to tdb
+                goto finished
+
+        unlock record
+        ask ctdb to migrate record onto the node
+        goto try_again
+
+    finished:
+
+If the record does not yet exist in the local TDB, we always perform a full fetch for a Read-Write lock, even if only a Read-Only lock was requested.
+This means that for first access, we use the current approach and grab a Read-Write lock, just like we do today.
+This creates the record and migrates it onto the node. The local node becomes DMASTER for the record.
+This did not yet trigger a Read-Only delegation to be created.
+Future reference to this record by the local samba daemons will still access/lock the record locally without triggering a Read-Only delegation to be created since the record is already hosted on the local node as DMASTER.
+
+Only if the record is contended, i.e. it has been created but we are no longer the DMASTER for the record, only for this case will we create a Read-Only delegation for this record.
+
+This heuristic provides a mechanism where we will not create Read-Only delegations until we have some indication that the record may be contended.
+
+This avoids creating and revoking Read-Only delegations when only a single client is repeatedly accessing the same set of records.
+This also aims to limit the size of the tracking tdb.
+
+Note that writing the copy of the Read-Only record to the TDB database is done by the client, not by ctdbd. This is to avoid a probable need for creating a child process for a likely contended record where locking the record would likely block.
+
+
+Server implementation
+=====================
+When receiving a ctdb_call with the WANT_READONLY flag:
+
+If this is the LMASTER for the record and the record does not yet exist, LMASTER will return an error back to the client (*A above) and the client will try to recover. In particular, LMASTER will not create a new record for this case.
+
+If this is the LMASTER for the record and the record exists, the PDU will be forwarded to the DMASTER for the record.
+
+If this node is not the DMASTER for this record, we forward the PDU back to the LMASTER. Just as we always do today.
+
+If this is the DMASTER for the record, we need to create a Read-Only delegation. This is done by
+     lock record
+     increase the RSN by one for this record
+     set the HAVE_DELEGATIONS flag for the record
+     write the updated record to the TDB
+     create/update the tracking TDB and add this new node to the set of delegations
+     send a modified copy of the record back to the requesting client.
+         modifications are that RSN is decremented by one, so delegated records are "older" than on the DMASTER,
+         it has HAVE_DELEGATIONS flag stripped off, and has HAVE_READONLY_LOCK added.
+     unlock record
+
+Important to note is that this does not trigger a record migration.
+
+
+When receiving a ctdb_call without the WANT_READONLY flag:
+
+If this is the DMASTER for the record, this might trigger a migration. So,
+IF the record has HAVE_DELEGATIONS set, we create a child process and defer processing of this PDU until the child process has completed.
+
+From the child process we will call out to all nodes that have delegations for this record and tell them to invalidate this record by clearing the HAVE_READONLY_LOCK from the record.
+Once all delegated nodes respond back, the child process signals back to the main daemon the revoke has completed.
+(child process may not access the tracking tdb since it is lockless)
+
+Main process is triggered to re-process the PDU once the child process has finished.
+Main daemon deletes the corresponding record in the tracking database, clears the HAVE_DELEGATIONS flag for the record and then proceeds to perform the migration as usual.
+
+When receiving a ctdb_call without the flag we want all delegations to be revoked, so we must take care that the delegations are revoked unconditionally before we even check if we are already the DMASTER (in which case the ctdb_call would normally just be a no-op (*B below))
+
+
+
+Recovery process changes
+========================
+A recovery implicitly clears/revokes any read only records and delegations from all databases.
+
+During delegations of Read-Only locks, this is done in such way that delegated records will have a RSN smaller than the DMASTER. This guarantees that read-only copies always have a RSN that is smaller than the DMASTER.
+
+During recoveries we do not need to take any special action other than always picking the copy of the record that has the highest RSN, which is what we already do today.
+
+During the recovery process, we strip all flags off all records while writing the new content of the database during the PUSH_DB control.
+
+During processing of the PUSH_DB control and once the new database has been written we then also wipe the tracking database.
+
+This makes changes to the recovery process minimal and nonintrusive.
+
+
+
+Vacuuming process
+=================
+Vacuuming needs only minimal changes.
+
+
+When vacuuming runs, it will do a fetch_lock to migrate any remote records back onto the LMASTER before the record can be purged. This will automatically force all delegations for that record to be revoked before the migration is copied back onto the LMASTER.
+This handles the case where LMASTER is not the DMASTER for the record that will be purged.
+The migration here does force any delegations to be revoked before the migration takes place.
+
+Missing is the case when delegations exist and the LMASTER is also the DMASTER.
+For this case we need to change the vacuuming to unconditionally always try to do a fetch_lock when HAVE_DELEGATIONS is set, even if the record is already stored locally. (*B)
+This fetch lock will not cause any migrations by the ctdb daemon, but since it does not have the WANT_READONLY
+this will still force the delegations to be revoked but no migrations trigger.
+
+
+Traversal process
+=================
+Traversal process is changed to ignore any records with the HAVE_READONLY_LOCK
+
+
+Forward/Backward Compatibility
+==============================
+Non-readonly locking daemons must be able to interoperate with readonly locking enabled daemons.
+
+Non-readonly enabled daemons fetching records from Readonly enabled daemons:
+Non-readonly enabled daemons do not know, and never set the WANT_READONLY flag so these daemons will always request a full migration for a full fetch-lock for all records. Thus a request from a non-readonly enabled daemon will always cause any existing delegations to be immediately revoked. Access will work but performance may be harmed since there will be a lot of revoking of delegations.
+
+Readonly enabled daemons fetching records with WANT_READONLY from non-readonly enabled daemons:
+Non-readonly enabled daemons ignore the WANT_READONLY flag and never return delegations. They always return a full record migration.
+Full record migration is allowed by the protocol, even if the originator only requests the 'hint' WANT_READONLY,
+so this access also interoperates between daemons with different capabilities.
+
+
+
+
diff --git a/ctdb/include/ctdb_client.h b/ctdb/include/ctdb_client.h
index 95b907378a..01219f341b 100644
--- a/ctdb/include/ctdb_client.h
+++ b/ctdb/include/ctdb_client.h
@@ -179,6 +179,8 @@ int ctdb_client_send_message(struct ctdb_context *ctdb, uint32_t pnn,
 struct ctdb_record_handle *ctdb_fetch_lock(struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx,
 					   TDB_DATA key, TDB_DATA *data);
 
+struct ctdb_record_handle *ctdb_fetch_readonly_lock(struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx, TDB_DATA key, TDB_DATA *data, int read_only);
+
 int ctdb_record_store(struct ctdb_record_handle *h, TDB_DATA data);
 
 int ctdb_fetch(struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx,
@@ -215,7 +217,9 @@ struct ctdb_dbid_map {
 	uint32_t num;
 	struct ctdb_dbid {
 		uint32_t dbid;
-		bool persistent;
+#define CTDB_DB_FLAGS_PERSISTENT	0x01
+#define CTDB_DB_FLAGS_READONLY		0x02
+		uint8_t flags;
 	} dbs[1];
 };
 int ctdb_ctrl_getdbmap(struct ctdb_context *ctdb,
@@ -592,4 +596,19 @@ int ctdb_ctrl_get_db_priority(struct ctdb_context *ctdb, struct timeval timeout,
 int ctdb_ctrl_getstathistory(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, TALLOC_CTX *mem_ctx, struct ctdb_statistics_wire **stats);
 
 
+
+struct ctdb_client_control_state *
+ctdb_ctrl_updaterecord_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data);
+
+int ctdb_ctrl_updaterecord_recv(struct ctdb_context *ctdb, struct ctdb_client_control_state *state);
+
+int
+ctdb_ctrl_updaterecord(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data);
+
+
+struct ctdb_client_control_state *
+ctdb_ctrl_set_db_readonly_send(struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid);
+int ctdb_ctrl_set_db_readonly_recv(struct ctdb_context *ctdb, struct ctdb_client_control_state *state);
+int ctdb_ctrl_set_db_readonly(struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid);
+
 #endif /* _CTDB_CLIENT_H */
diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h
index edf1d88dd8..b093bf09ac 100644
--- a/ctdb/include/ctdb_private.h
+++ b/ctdb/include/ctdb_private.h
@@ -504,9 +504,11 @@ struct ctdb_db_context {
 	uint32_t db_id;
 	uint32_t priority;
 	bool persistent;
+	bool readonly; /* Do we support read-only delegations ? */
 	const char *db_name;
 	const char *db_path;
 	struct tdb_wrap *ltdb;
+	struct tdb_context *rottdb; /* ReadOnly tracking TDB */
 	struct ctdb_registered_call *calls; /* list of registered calls */
 	uint32_t seqnum;
 	struct timed_event *seqnum_update;
@@ -517,6 +519,7 @@ struct ctdb_db_context {
 	int pending_requests;
 	struct lockwait_handle *lockwait_active;
 	struct lockwait_handle *lockwait_overflow;
+	struct revokechild_handle *revokechild_active;
 	struct ctdb_persistent_state *persistent_state;
 	struct trbt_tree *delete_queue;
 	int (*ctdb_ltdb_store_fn)(struct ctdb_db_context *ctdb_db,
@@ -668,6 +671,9 @@ int ctdb_ltdb_fetch(struct ctdb_db_context *ctdb_db,
 int ctdb_ltdb_store(struct ctdb_db_context *ctdb_db, TDB_DATA key, 
 		    struct ctdb_ltdb_header *header, TDB_DATA data);
 int ctdb_ltdb_delete(struct ctdb_db_context *ctdb_db, TDB_DATA key);
+int ctdb_ltdb_fetch_readonly(struct ctdb_db_context *ctdb_db, 
+		    TDB_DATA key, struct ctdb_ltdb_header *header, 
+		    TALLOC_CTX *mem_ctx, TDB_DATA *data);
 int32_t ctdb_control_start_persistent_update(struct ctdb_context *ctdb, 
 			struct ctdb_req_control *c,
 			TDB_DATA recdata);
@@ -791,7 +797,7 @@ struct ctdb_call_state *ctdb_daemon_call_send_remote(struct ctdb_db_context *ctd
 
 int ctdb_call_local(struct ctdb_db_context *ctdb_db, struct ctdb_call *call,
 		    struct ctdb_ltdb_header *header, TALLOC_CTX *mem_ctx,
-		    TDB_DATA *data);
+		    TDB_DATA *data, bool updatetdb);
 
 #define ctdb_reqid_find(ctdb, reqid, type)	(type *)_ctdb_reqid_find(ctdb, reqid, #type, __location__)
 
@@ -1448,5 +1454,18 @@ void ctdb_takeover_run_core(struct ctdb_context *ctdb,
 			    struct ctdb_node_map *nodemap,
 			    struct ctdb_public_ip_list **all_ips_p);
 
+int ctdb_trackingdb_add_pnn(struct ctdb_context *ctdb, TDB_DATA *data, uint32_t pnn);
+
+typedef void (*ctdb_trackingdb_cb)(struct ctdb_context *ctdb, uint32_t pnn, void *private_data);
+
+void ctdb_trackingdb_traverse(struct ctdb_context *ctdb, TDB_DATA data, ctdb_trackingdb_cb cb, void *private_data);
+
+int ctdb_start_revoke_ro_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data);
+
+typedef void (*deferred_requeue_fn)(void *call_context, struct ctdb_req_header *hdr);
+
+int ctdb_add_revoke_deferred_call(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_req_header *hdr, deferred_requeue_fn fn, void *call_context);
+
+int ctdb_set_db_readonly(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db);
 
 #endif
diff --git a/ctdb/include/ctdb_protocol.h b/ctdb/include/ctdb_protocol.h
index 0422afeb7b..f4019ab172 100644
--- a/ctdb/include/ctdb_protocol.h
+++ b/ctdb/include/ctdb_protocol.h
@@ -30,8 +30,9 @@
 #define CTDB_DS_ALIGNMENT 8
 
 
-#define CTDB_NULL_FUNC      0xFF000001
-#define CTDB_FETCH_FUNC     0xFF000002
+#define CTDB_NULL_FUNC                  0xFF000001
+#define CTDB_FETCH_FUNC                 0xFF000002
+#define CTDB_FETCH_WITH_HEADER_FUNC     0xFF000003
 
 
 struct ctdb_call {
@@ -40,8 +41,9 @@ struct ctdb_call {
 	TDB_DATA call_data;
 	TDB_DATA reply_data;
 	uint32_t status;
-#define CTDB_IMMEDIATE_MIGRATION	0x00000001
+#define CTDB_IMMEDIATE_MIGRATION		0x00000001
 #define CTDB_CALL_FLAG_VACUUM_MIGRATION		0x00000002
+#define CTDB_WANT_READONLY			0x00000004
 	uint32_t flags;
 };
 
@@ -50,6 +52,7 @@ struct ctdb_call {
 */
 struct ctdb_call_info {
 	TDB_DATA key;          /* record key */
+	struct ctdb_ltdb_header *header;
 	TDB_DATA record_data;  /* current data in the record */
 	TDB_DATA *new_data;    /* optionally updated record data */
 	TDB_DATA *call_data;   /* optionally passed from caller */
@@ -363,6 +366,7 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS          = 0,
 		    CTDB_CONTROL_TCP_ADD_DELAYED_UPDATE  = 126,
 		    CTDB_CONTROL_GET_STAT_HISTORY	 = 127,
 		    CTDB_CONTROL_SCHEDULE_FOR_DELETION   = 128,
+		    CTDB_CONTROL_SET_DB_READONLY	 = 129,
 };
 
 /*
@@ -486,6 +490,10 @@ struct ctdb_ltdb_header {
 #define CTDB_REC_FLAG_MIGRATED_WITH_DATA	0x00010000
 #define CTDB_REC_FLAG_VACUUM_MIGRATED		0x00020000
 #define CTDB_REC_FLAG_AUTOMATIC			0x00040000
+#define CTDB_REC_RO_HAVE_DELEGATIONS		0x01000000
+#define CTDB_REC_RO_HAVE_READONLY		0x02000000
+#define CTDB_REC_RO_REVOKING_READONLY		0x04000000
+#define CTDB_REC_RO_REVOKE_COMPLETE		0x08000000
 	uint32_t flags;
 };
 
diff --git a/ctdb/server/ctdb_call.c b/ctdb/server/ctdb_call.c
index 0ea76bf44f..9e8642221c 100644
--- a/ctdb/server/ctdb_call.c
+++ b/ctdb/server/ctdb_call.c
@@ -43,7 +43,6 @@
 	return ctdb_db;
 }
 
-
 /*
   a varient of input packet that can be used in lock requeue
 */
@@ -339,7 +338,7 @@ static void ctdb_become_dmaster(struct ctdb_db_context *ctdb_db,
 		return;
 	}
 
-	ctdb_call_local(ctdb_db, state->call, &header, state, &data);
+	ctdb_call_local(ctdb_db, state->call, &header, state, &data, true);
 
 	ret = ctdb_ltdb_unlock(ctdb_db, state->call->key);
 	if (ret != 0) {
@@ -489,6 +488,8 @@ void ctdb_request_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
 	call->key.dsize = c->keylen;
 	call->call_data.dptr = c->data + c->keylen;
 	call->call_data.dsize = c->calldatalen;
+	call->reply_data.dptr  = NULL;
+	call->reply_data.dsize = 0;
 
 	/* determine if we are the dmaster for this key. This also
 	   fetches the record data (if any), thus avoiding a 2nd fetch of the data 
@@ -505,9 +506,40 @@ void ctdb_request_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
 		return;
 	}
 
-	/* if we are not the dmaster, then send a redirect to the
-	   requesting node */
-	if (header.dmaster != ctdb->pnn) {
+	/* Dont do READONLY if we dont have a tracking database */
+	if ((c->flags & CTDB_WANT_READONLY) && !ctdb_db->readonly) {
+		c->flags &= ~CTDB_WANT_READONLY;
+	}
+
+	if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) {
+		header.flags &= ~(CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY|CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_REVOKE_COMPLETE);
+		if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag");
+		}
+		/* and clear out the tracking data */
+		if (tdb_delete(ctdb_db->rottdb, call->key) != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " Failed to clear out trackingdb record\n"));
+		}
+	}
+
+	/* if we are revoking, we must defer all other calls until the revoke
+	 * had completed.
+	 */
+	if (header.flags & CTDB_REC_RO_REVOKING_READONLY) {
+		talloc_free(data.dptr);
+		ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+
+		if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, call->key, hdr, ctdb_call_input_pkt, ctdb) != 0) {
+			ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+		}
+		talloc_free(call);
+		return;
+	}
+
+	/* if we are not the dmaster and are not hosting any delegations,
+	   then send a redirect to the requesting node */
+	if ((header.dmaster != ctdb->pnn) 
+	    && (!(header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) ) {
 		talloc_free(data.dptr);
 		ctdb_call_send_redirect(ctdb, call->key, c, &header);
 
@@ -518,6 +550,80 @@ void ctdb_request_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
 		return;
 	}
 
+	if ( (!(c->flags & CTDB_WANT_READONLY))
+	&& (header.flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY)) ) {
+		header.flags   |= CTDB_REC_RO_REVOKING_READONLY;
+		if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
+		}
+		ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+
+		if (ctdb_start_revoke_ro_record(ctdb, ctdb_db, call->key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to start record revoke");
+		}
+		talloc_free(data.dptr);
+
+		if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, call->key, hdr, ctdb_call_input_pkt, ctdb) != 0) {
+			ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+		}
+		talloc_free(call);
+
+		return;
+	}		
+
+	/* If this is the first request for delegation. bump rsn and set
+	 * the delegations flag
+	 */
+	if ((c->flags & CTDB_WANT_READONLY)
+	&&  (c->callid == CTDB_FETCH_WITH_HEADER_FUNC)
+	&&  (!(header.flags & CTDB_REC_RO_HAVE_DELEGATIONS))) {
+		header.rsn     += 3;
+		header.flags   |= CTDB_REC_RO_HAVE_DELEGATIONS;
+		if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
+		}
+	}
+	if ((c->flags & CTDB_WANT_READONLY) 
+	&&  (call->call_id == CTDB_FETCH_WITH_HEADER_FUNC)) {
+		TDB_DATA tdata;
+
+		tdata = tdb_fetch(ctdb_db->rottdb, call->key);
+		if (ctdb_trackingdb_add_pnn(ctdb, &tdata, c->hdr.srcnode) != 0) {
+			ctdb_fatal(ctdb, "Failed to add node to trackingdb");
+		}
+		if (tdb_store(ctdb_db->rottdb, call->key, tdata, TDB_REPLACE) != 0) {
+			ctdb_fatal(ctdb, "Failed to store trackingdb data");
+		}
+		free(tdata.dptr);
+
+		ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+		}
+
+		len = offsetof(struct ctdb_reply_call, data) + data.dsize + sizeof(struct ctdb_ltdb_header);
+		r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REPLY_CALL, len, 
+					    struct ctdb_reply_call);
+		CTDB_NO_MEMORY_FATAL(ctdb, r);
+		r->hdr.destnode  = c->hdr.srcnode;
+		r->hdr.reqid     = c->hdr.reqid;
+		r->status        = 0;
+		r->datalen       = data.dsize + sizeof(struct ctdb_ltdb_header);
+		header.rsn      -= 2;
+		header.flags   |= CTDB_REC_RO_HAVE_READONLY;
+		header.flags   &= ~CTDB_REC_RO_HAVE_DELEGATIONS;
+		memcpy(&r->data[0], &header, sizeof(struct ctdb_ltdb_header));
+
+		if (data.dsize) {
+			memcpy(&r->data[sizeof(struct ctdb_ltdb_header)], data.dptr, data.dsize);
+		}
+
+		ctdb_queue_packet(ctdb, &r->hdr);
+
+		talloc_free(r);
+		return;
+	}
+
 	CTDB_UPDATE_STAT(ctdb, max_hop_count, c->hopcount);
 
 	/* Try if possible to migrate the record off to the caller node.
@@ -543,7 +649,11 @@ void ctdb_request_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
 		}
 	}
 
-	ctdb_call_local(ctdb_db, call, &header, hdr, &data);
+	ret = ctdb_call_local(ctdb_db, call, &header, hdr, &data, true);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " ctdb_call_local failed\n"));
+		call->status = -1;
+	}
 
 	ret = ctdb_ltdb_unlock(ctdb_db, call->key);
 	if (ret != 0) {
@@ -766,7 +876,7 @@ struct ctdb_call_state *ctdb_call_local_send(struct ctdb_db_context *ctdb_db,
 	*(state->call) = *call;
 	state->ctdb_db = ctdb_db;
 
-	ret = ctdb_call_local(ctdb_db, state->call, header, state, data);
+	ret = ctdb_call_local(ctdb_db, state->call, header, state, data, true);
 
 	event_add_timed(ctdb->ev, state, timeval_zero(), call_local_trigger, state);
 
@@ -890,3 +1000,362 @@ void ctdb_send_keepalive(struct ctdb_context *ctdb, uint32_t destnode)
 
 	talloc_free(r);
 }
+
+
+
+/*
+  a call that arrived while its record was being revoked. It is allocated
+  as a talloc child of the revokechild_handle for that record (see
+  ctdb_add_revoke_deferred_call); its destructor schedules the packet to
+  be passed to fn again.
+*/
+struct revokechild_deferred_call {
+	struct ctdb_context *ctdb;
+	struct ctdb_req_header *hdr;	/* the deferred packet */
+	deferred_requeue_fn fn;		/* function the packet is handed back to */
+	void *ctx;			/* first argument passed to fn */
+};
+
+/*
+  state for one child process that revokes the read-only delegations of
+  a single record. Active handles are linked on
+  ctdb_db->revokechild_active.
+*/
+struct revokechild_handle {
+	struct revokechild_handle *next, *prev;	/* list linkage */
+	struct ctdb_context *ctdb;
+	struct ctdb_db_context *ctdb_db;
+	struct fd_event *fde;	/* read event on fd[0] */
+	int status;
+	int fd[2];		/* pipe: the child writes its result, the parent reads fd[0] */
+	pid_t child;		/* pid returned by ctdb_fork */
+	TDB_DATA key;		/* talloc copy of the record key */
+};
+
+/*
+  carries a deferred packet from deferred_call_destructor to the timed
+  event (deferred_call_requeue) that hands it back to fn
+*/
+struct revokechild_requeue_handle {
+	struct ctdb_context *ctdb;
+	struct ctdb_req_header *hdr;	/* the deferred packet */
+	deferred_requeue_fn fn;		/* function the packet is handed back to */
+	void *ctx;			/* first argument passed to fn */
+};
+
+/*
+  timed event callback: hand a deferred packet back to its dispatch
+  function and release the requeue handle. The packet was talloc_steal'ed
+  onto the handle in deferred_call_destructor.
+*/
+static void deferred_call_requeue(struct event_context *ev, struct timed_event *te, 
+		       struct timeval t, void *private_data)
+{
+	struct revokechild_requeue_handle *rq;
+
+	rq = talloc_get_type(private_data, struct revokechild_requeue_handle);
+
+	rq->fn(rq->ctx, rq->hdr);
+	talloc_free(rq);
+}
+
+/*
+  talloc destructor for a deferred call. It runs when the owning
+  revokechild_handle is freed and schedules the deferred packet to be
+  dispatched again through a timed event.
+
+  always returns 0 so that talloc goes ahead and frees the deferred call.
+  If the requeue cannot be set up the packet is dropped.
+*/
+static int deferred_call_destructor(struct revokechild_deferred_call *deferred_call)
+{
+	struct ctdb_context *ctdb = deferred_call->ctdb;
+	struct ctdb_req_call *c = (struct ctdb_req_call *)deferred_call->hdr;
+	struct revokechild_requeue_handle *requeue_handle;
+	struct timed_event *te;
+
+	requeue_handle = talloc(ctdb, struct revokechild_requeue_handle);
+	if (requeue_handle == NULL) {
+		/* hdr is still a talloc child of deferred_call and is
+		   released together with it */
+		DEBUG(DEBUG_ERR,("Failed to allocate requeue handle for deferred call\n"));
+		return 0;
+	}
+
+	requeue_handle->ctdb = ctdb;
+	requeue_handle->hdr  = deferred_call->hdr;
+	requeue_handle->fn   = deferred_call->fn;
+	requeue_handle->ctx  = deferred_call->ctx;
+	/* the packet must outlive deferred_call, which is being freed */
+	talloc_steal(requeue_handle, requeue_handle->hdr);
+
+	/* when revoking, any READONLY requests have 1 second grace to let read/write finish first */
+	te = event_add_timed(ctdb->ev, requeue_handle, timeval_current_ofs(c->flags & CTDB_WANT_READONLY ? 1 : 0, 0), deferred_call_requeue, requeue_handle);
+	if (te == NULL) {
+		/* nothing will ever fire for this handle; do not leak it */
+		DEBUG(DEBUG_ERR,("Failed to schedule requeue of deferred call\n"));
+		talloc_free(requeue_handle);
+	}
+
+	return 0;
+}
+
+
+/*
+  talloc destructor for a revokechild_handle: release the pipe, kill the
+  child process and unlink the handle from ctdb_db->revokechild_active.
+  always returns 0.
+*/
+static int revokechild_destructor(struct revokechild_handle *rc)
+{
+	if (rc->fde != NULL) {
+		/* the fd event is switched to auto-close right after it is
+		   created (see ctdb_start_revoke_ro_record), so freeing it
+		   closes fd[0]. Forget the descriptor so that it is not
+		   closed a second time below */
+		talloc_free(rc->fde);
+		rc->fde   = NULL;
+		rc->fd[0] = -1;
+	}
+
+	if (rc->fd[0] != -1) {
+		close(rc->fd[0]);
+	}
+	if (rc->fd[1] != -1) {
+		close(rc->fd[1]);
+	}
+
+	/* child is 0 until ctdb_fork has run and -1 if it failed. kill()
+	   with those values would signal our own process group or every
+	   process we are allowed to signal */
+	if (rc->child > 0) {
+		kill(rc->child, SIGKILL);
+	}
+
+	DLIST_REMOVE(rc->ctdb_db->revokechild_active, rc);
+	return 0;
+}
+
+/*
+  fd event callback: the revoke child has written its one byte result to
+  the pipe (or the pipe broke). Log any failure and free the handle.
+*/
+static void revokechild_handler(struct event_context *ev, struct fd_event *fde, 
+			     uint16_t flags, void *private_data)
+{
+	struct revokechild_handle *rc = talloc_get_type(private_data, 
+						     struct revokechild_handle);
+	char result;
+	int nread;
+
+	nread = read(rc->fd[0], &result, 1);
+	if (nread != 1) {
+		DEBUG(DEBUG_ERR,("Failed to read status from revokechild. errno:%d\n", errno));
+		rc->status = -1;
+	} else if (result != 0) {
+		DEBUG(DEBUG_ERR,("revokechild returned failure. status:%d\n", result));
+		rc->status = -1;
+	}
+
+	/* success and failure both end with the handle being released */
+	talloc_free(rc);
+}
+
+/*
+  bookkeeping used by ctdb_revoke_all_delegations while update-record
+  controls are outstanding
+*/
+struct ctdb_revoke_state {
+	struct ctdb_db_context *ctdb_db;
+	TDB_DATA key;
+	struct ctdb_ltdb_header *header;
+	TDB_DATA data;
+	int count;	/* controls sent and not yet answered */
+	int status;	/* set to -1 on any failure or timeout */
+	int finished;	/* set to 1 to end the wait loop */
+};
+
+/*
+  completion callback for one update-record control sent by
+  revoke_send_cb. Records a failure in the revoke state and marks the
+  revoke as finished once the last outstanding control has been answered.
+*/
+static void update_record_cb(struct ctdb_client_control_state *state)
+{
+	struct ctdb_revoke_state *revoke_state;
+	int ret;
+	/* start from a defined value: res is logged below even when the
+	   receive itself fails */
+	int32_t res = -1;
+
+	if (state == NULL) {
+		return;
+	}
+	revoke_state = state->async.private_data;
+
+	state->async.fn = NULL;
+	ret = ctdb_control_recv(state->ctdb, state, state, NULL, &res, NULL);
+	if ((ret != 0) || (res != 0)) {
+		DEBUG(DEBUG_ERR,("Recv for revoke update record failed ret:%d res:%d\n", ret, res));
+		revoke_state->status = -1;
+	}
+
+	revoke_state->count--;
+	if (revoke_state->count <= 0) {
+		revoke_state->finished = 1;
+	}
+}
+
+/*
+  trackingdb traverse callback: send an update-record control for the
+  record being revoked to node pnn and count it as outstanding. A send
+  failure is recorded in the revoke state.
+*/
+static void revoke_send_cb(struct ctdb_context *ctdb, uint32_t pnn, void *private_data)
+{
+	struct ctdb_revoke_state *rs = private_data;
+	struct ctdb_client_control_state *ctrl;
+
+	ctrl = ctdb_ctrl_updaterecord_send(ctdb, rs, timeval_current_ofs(5,0), pnn,
+					   rs->ctdb_db, rs->key, rs->header, rs->data);
+	if (ctrl == NULL) {
+		DEBUG(DEBUG_ERR,("Failure to send update record to revoke readonly delegation\n"));
+		rs->status = -1;
+		return;
+	}
+
+	rs->count++;
+	ctrl->async.private_data = rs;
+	ctrl->async.fn           = update_record_cb;
+}
+
+/*
+  timed event callback: the revoke did not complete in time. Flag the
+  failure and end the wait loop in ctdb_revoke_all_delegations.
+*/
+static void ctdb_revoke_timeout_handler(struct event_context *ev, struct timed_event *te, 
+			      struct timeval yt, void *private_data)
+{
+	struct ctdb_revoke_state *revoke_state = private_data;
+
+	DEBUG(DEBUG_ERR,("Timed out waiting for revoke to finish\n"));
+	revoke_state->status   = -1;
+	revoke_state->finished = 1;
+}
+
+/*
+  revoke every delegation listed in the tracking data for a record.
+  Called in the revoke child.
+
+  tdata  - tracking db blob listing the nodes to contact
+  key    - key of the record being revoked
+  header - header sent to the nodes; its rsn is incremented here
+  data   - record data sent to the nodes
+
+  sends an update-record control to each tracked node and waits, for at
+  most 5 seconds, until all of them have been answered. On success the
+  local record is re-read under the chainlock, checked, and stored with
+  a bumped rsn and CTDB_REC_RO_REVOKE_COMPLETE set.
+
+  returns 0 on success, -1 on failure
+*/
+static int ctdb_revoke_all_delegations(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA tdata, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+	struct ctdb_revoke_state *state = talloc_zero(ctdb, struct ctdb_revoke_state);
+	int status;
+
+	if (state == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate revoke state in revokechild\n"));
+		return -1;
+	}
+
+	state->ctdb_db = ctdb_db;
+	state->key     = key;
+	state->header  = header;
+	state->data    = data;
+ 
+	ctdb_trackingdb_traverse(ctdb, tdata, revoke_send_cb, state);
+
+	/* without the timeout nothing is guaranteed to end the loop below */
+	if (event_add_timed(ctdb->ev, state, timeval_current_ofs(5, 0), ctdb_revoke_timeout_handler, state) == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to set up revoke timeout in revokechild\n"));
+		talloc_free(state);
+		return -1;
+	}
+
+	while (state->finished == 0) {
+		event_loop_once(ctdb->ev);
+	}
+
+	status = state->status;
+
+	if (status == 0) {
+		struct ctdb_ltdb_header new_header;
+		TDB_DATA new_data;
+
+		if (ctdb_ltdb_lock(ctdb_db, key) != 0) {
+			DEBUG(DEBUG_ERR,("Failed to chainlock the database in revokechild\n"));
+			talloc_free(state);
+			return -1;
+		}
+		if (ctdb_ltdb_fetch(ctdb_db, key, &new_header, state, &new_data) != 0) {
+			ctdb_ltdb_unlock(ctdb_db, key);
+			DEBUG(DEBUG_ERR,("Failed for fetch tdb record in revokechild\n"));
+			talloc_free(state);
+			return -1;
+		}
+		/* the stored record must not have a higher rsn than the
+		   header we sent out, once that has been bumped by one */
+		header->rsn++;
+		if (new_header.rsn > header->rsn) {
+			ctdb_ltdb_unlock(ctdb_db, key);
+			DEBUG(DEBUG_ERR,("RSN too high in tdb record in revokechild\n"));
+			talloc_free(state);
+			return -1;
+		}
+		/* both REVOKING_READONLY and HAVE_DELEGATIONS must still be set */
+		if ( (new_header.flags & (CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_HAVE_DELEGATIONS)) != (CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_HAVE_DELEGATIONS) ) {
+			ctdb_ltdb_unlock(ctdb_db, key);
+			DEBUG(DEBUG_ERR,("Flags are wrong in tdb record in revokechild\n"));
+			talloc_free(state);
+			return -1;
+		}
+		new_header.rsn++;
+		new_header.flags |= CTDB_REC_RO_REVOKE_COMPLETE;
+		if (ctdb_ltdb_store(ctdb_db, key, &new_header, new_data) != 0) {
+			ctdb_ltdb_unlock(ctdb_db, key);
+			DEBUG(DEBUG_ERR,("Failed to write new record in revokechild\n"));
+			talloc_free(state);
+			return -1;
+		}
+		ctdb_ltdb_unlock(ctdb_db, key);
+	}
+
+	talloc_free(state);
+	return status;
+}
+
+
+/*
+  start revoking the read-only delegations of a record.
+
+  forks a child that runs ctdb_revoke_all_delegations and reports a one
+  byte result over a pipe; revokechild_handler picks that result up in
+  the parent. The handle for the child is linked on
+  ctdb_db->revokechild_active so that calls for the same key can be
+  deferred onto it.
+
+  header is modified in place: the REVOKING_READONLY, HAVE_DELEGATIONS
+  and HAVE_READONLY flags are cleared and rsn is decremented before it
+  is handed to the child.
+
+  returns 0 if the child was started, -1 on failure
+*/
+int ctdb_start_revoke_ro_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+	TDB_DATA tdata;
+	struct revokechild_handle *rc;
+	pid_t parent = getpid();
+	int ret;
+
+	header->flags &= ~(CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY);
+	header->rsn   -= 1;
+
+	if ((rc = talloc_zero(ctdb_db, struct revokechild_handle)) == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate revokechild_handle\n"));
+		return -1;
+	}
+
+	tdata = tdb_fetch(ctdb_db->rottdb, key);
+	if (tdata.dsize > 0) {
+		uint8_t *tmp;
+
+		/* move the malloc'ed tracking data under rc */
+		tmp = tdata.dptr;
+		tdata.dptr = talloc_memdup(rc, tdata.dptr, tdata.dsize);
+		free(tmp);
+		if (tdata.dptr == NULL) {
+			DEBUG(DEBUG_ERR,("Failed to copy tracking data for revokechild_handle\n"));
+			talloc_free(rc);
+			return -1;
+		}
+	}
+
+	rc->status    = 0;
+	rc->ctdb      = ctdb;
+	rc->ctdb_db   = ctdb_db;
+	rc->fd[0]     = -1;
+	rc->fd[1]     = -1;
+
+	rc->key.dsize = key.dsize;
+	rc->key.dptr  = talloc_memdup(rc, key.dptr, key.dsize);
+	if (rc->key.dptr == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate key for revokechild_handle\n"));
+		talloc_free(rc);
+		return -1;
+	}
+
+	ret = pipe(rc->fd);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,("Failed to create pipe for revokechild_handle\n"));
+		talloc_free(rc);
+		return -1;
+	}
+
+	rc->child = ctdb_fork(ctdb);
+	if (rc->child == (pid_t)-1) {
+		DEBUG(DEBUG_ERR,("Failed to fork child for revokechild\n"));
+		/* the destructor is not armed yet, release the pipe here */
+		close(rc->fd[0]);
+		close(rc->fd[1]);
+		talloc_free(rc);
+		return -1;
+	}
+
+	if (rc->child == 0) {
+		char c = 0;
+		close(rc->fd[0]);
+		debug_extra = talloc_asprintf(NULL, "revokechild-%s:", ctdb_db->db_name);
+
+		if (switch_from_server_to_client(ctdb, "revokechild-%s", ctdb_db->db_name) != 0) {
+			DEBUG(DEBUG_ERR,("Failed to switch from server to client for revokechild process\n"));
+			c = 1;
+			goto child_finished;
+		}
+
+		c = ctdb_revoke_all_delegations(ctdb, ctdb_db, tdata, key, header, data);
+
+child_finished:
+		write(rc->fd[1], &c, 1);
+		/* make sure we die when our parent dies */
+		while (kill(parent, 0) == 0 || errno != ESRCH) {
+			sleep(5);
+		}
+		_exit(0);
+	}
+
+	/* arm the destructor only now that rc->child is a real pid: it
+	   sends SIGKILL to rc->child, and a pid of 0 or -1 would hit our
+	   own process group or every process we may signal */
+	talloc_set_destructor(rc, revokechild_destructor);
+
+	close(rc->fd[1]);
+	rc->fd[1] = -1;
+	set_close_on_exec(rc->fd[0]);
+
+	/* This is an active revokechild child process */
+	DLIST_ADD_END(ctdb_db->revokechild_active, rc, NULL);
+
+	rc->fde = event_add_fd(ctdb->ev, rc, rc->fd[0],
+				   EVENT_FD_READ, revokechild_handler,
+				   (void *)rc);
+	if (rc->fde == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to set up fd event for revokechild process\n"));
+		talloc_free(rc);
+		/* rc is gone, it must not be touched below */
+		return -1;
+	}
+	tevent_fd_set_auto_close(rc->fde);
+
+	return 0;
+}
+
+/*
+  park a call on the active revoke for its record.
+
+  looks up the revokechild_handle whose key matches and attaches hdr to
+  it as a deferred call. The deferred call takes ownership of hdr; its
+  destructor passes the packet to fn(call_context, hdr) again once the
+  handle is freed.
+
+  returns 0 on success, -1 if no revoke is active for the key or on
+  allocation failure
+*/
+int ctdb_add_revoke_deferred_call(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_req_header *hdr, deferred_requeue_fn fn, void *call_context)
+{
+	struct revokechild_handle *rc;
+	struct revokechild_deferred_call *dcall;
+
+	/* find the revoke in progress for exactly this key */
+	for (rc = ctdb_db->revokechild_active; rc != NULL; rc = rc->next) {
+		if (rc->key.dsize != 0 &&
+		    rc->key.dsize == key.dsize &&
+		    memcmp(rc->key.dptr, key.dptr, key.dsize) == 0) {
+			break;
+		}
+	}
+
+	if (rc == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to add deferred call to revoke list. revoke structure not found\n"));
+		return -1;
+	}
+
+	dcall = talloc(rc, struct revokechild_deferred_call);
+	if (dcall == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate deferred call structure for revoking record\n"));
+		return -1;
+	}
+
+	dcall->ctdb = ctdb;
+	dcall->hdr  = hdr;
+	dcall->fn   = fn;
+	dcall->ctx  = call_context;
+
+	talloc_set_destructor(dcall, deferred_call_destructor);
+	talloc_steal(dcall, hdr);
+
+	return 0;
+}
diff --git a/ctdb/server/ctdb_control.c b/ctdb/server/ctdb_control.c
index 748907f2a9..9c2f7429dd 100644
--- a/ctdb/server/ctdb_control.c
+++ b/ctdb/server/ctdb_control.c
@@ -194,6 +194,16 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
 		CHECK_CONTROL_DATA_SIZE(0);
 		return ctdb->statistics.num_clients;
 
+	case CTDB_CONTROL_SET_DB_READONLY: {
+		uint32_t db_id;
+		struct ctdb_db_context *ctdb_db;
+
+		CHECK_CONTROL_DATA_SIZE(sizeof(db_id));
+		db_id = *(uint32_t *)indata.dptr;
+		ctdb_db = find_ctdb_db(ctdb, db_id);
+		if (ctdb_db == NULL) return -1;
+		return ctdb_set_db_readonly(ctdb, ctdb_db);
+	}
 	case CTDB_CONTROL_GET_DBNAME: {
 		uint32_t db_id;
 		struct ctdb_db_context *ctdb_db;
diff --git a/ctdb/server/ctdb_daemon.c b/ctdb/server/ctdb_daemon.c
index 75344ad386..88d12103f7 100644
--- a/ctdb/server/ctdb_daemon.c
+++ b/ctdb/server/ctdb_daemon.c
@@ -312,6 +312,7 @@ static void daemon_call_from_client_callback(struct ctdb_call_state *state)
 	}
 	r->hdr.reqid        = dstate->reqid;
 	r->datalen          = dstate->call->reply_data.dsize;
+	r->status           = dstate->call->status;
 	memcpy(&r->data[0], dstate->call->reply_data.dptr, r->datalen);
 
 	res = daemon_queue_send(client, &r->hdr);
@@ -423,6 +424,56 @@ static void daemon_request_call_from_client(struct ctdb_client *client,
 		return;
 	}
 
+	/* Dont do READONLY if we dont have a tracking database */
+	if ((c->flags & CTDB_WANT_READONLY) && !ctdb_db->readonly) {
+		c->flags &= ~CTDB_WANT_READONLY;
+	}
+
+	if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) {
+		header.flags &= ~(CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY|CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_REVOKE_COMPLETE);
+		if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag");
+		}
+		/* and clear out the tracking data */
+		if (tdb_delete(ctdb_db->rottdb, key) != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " Failed to clear out trackingdb record\n"));
+		}
+	}
+
+	/* if we are revoking, we must defer all other calls until the revoke
+	 * had completed.
+	 */
+	if (header.flags & CTDB_REC_RO_REVOKING_READONLY) {
+		talloc_free(data.dptr);
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+
+		if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
+			ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+		}
+		return;
+	}
+
+	if ((header.dmaster == ctdb->pnn)
+	&& (!(c->flags & CTDB_WANT_READONLY))
+	&& (header.flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY)) ) {
+		header.flags   |= CTDB_REC_RO_REVOKING_READONLY;
+		if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
+		}
+		ret = ctdb_ltdb_unlock(ctdb_db, key);
+
+		if (ctdb_start_revoke_ro_record(ctdb, ctdb_db, key, &header, data) != 0) {
+			ctdb_fatal(ctdb, "Failed to start record revoke");
+		}
+		talloc_free(data.dptr);
+
+		if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
+			ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+		}
+
+		return;
+	}		
+
 	dstate = talloc(client, struct daemon_call_state);
 	if (dstate == NULL) {
 		ret = ctdb_ltdb_unlock(ctdb_db, key);
diff --git a/ctdb/server/ctdb_ltdb_server.c b/ctdb/server/ctdb_ltdb_server.c
index a0fe2c529c..b76ae6a622 100644
--- a/ctdb/server/ctdb_ltdb_server.c
+++ b/ctdb/server/ctdb_ltdb_server.c
@@ -49,6 +49,27 @@ static int ctdb_fetch_func(struct ctdb_call_info *call)
 	return 0;
 }
 
+/*
+  this is a plain fetch procedure that all databases support
+  this returns the full record including the ltdb header:
+  reply_data holds a struct ctdb_ltdb_header followed by the record data
+
+  returns 0 on success, -1 if memory could not be allocated
+*/
+static int ctdb_fetch_with_header_func(struct ctdb_call_info *call)
+{
+	call->reply_data = talloc(call, TDB_DATA);
+	if (call->reply_data == NULL) {
+		return -1;
+	}
+	call->reply_data->dsize = sizeof(struct ctdb_ltdb_header) + call->record_data.dsize;
+	call->reply_data->dptr  = talloc_size(call->reply_data, call->reply_data->dsize);
+	if (call->reply_data->dptr == NULL) {
+		return -1;
+	}
+	memcpy(call->reply_data->dptr, call->header, sizeof(struct ctdb_ltdb_header));
+	/* an empty record may come with a NULL data pointer, and memcpy
+	   must not be given a NULL source even for zero bytes */
+	if (call->record_data.dsize > 0) {
+		memcpy(&call->reply_data->dptr[sizeof(struct ctdb_ltdb_header)], call->record_data.dptr, call->record_data.dsize);
+	}
+
+	return 0;
+}
+
 
 /**
  * write a record to a normal database
@@ -702,6 +723,44 @@ int32_t ctdb_control_db_get_health(struct ctdb_context *ctdb,
 	return 0;
 }
 
+
+/*
+  enable read-only delegation support for a database by opening its
+  tracking database ("<db_path>.RO").
+
+  returns 0 on success or if support is already enabled, -1 if the
+  database is persistent or the tracking database cannot be opened
+*/
+int ctdb_set_db_readonly(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db)
+{
+	char *ropath;
+
+	DEBUG(DEBUG_DEBUG,("set db readonly %s\n", ctdb_db->db_name));
+
+	if (ctdb_db->readonly) {
+		return 0;
+	}
+
+	if (ctdb_db->persistent) {
+		DEBUG(DEBUG_ERR,("Trying to set persistent database with readonly property\n"));
+		return -1;
+	}
+
+	ropath = talloc_asprintf(ctdb_db, "%s.RO", ctdb_db->db_path);
+	if (ropath == NULL) {
+		DEBUG(DEBUG_CRIT,("Failed to asprintf the tracking database\n"));
+		return -1;
+	}
+	/* create the file owner read/write; a mode of 0 would leave a
+	   file that only a privileged process could open again */
+	ctdb_db->rottdb = tdb_open(ropath, 
+			      ctdb->tunable.database_hash_size, 
+			      TDB_NOLOCK|TDB_CLEAR_IF_FIRST|TDB_NOSYNC,
+			      O_CREAT|O_RDWR, 0600);
+	if (ctdb_db->rottdb == NULL) {
+		DEBUG(DEBUG_CRIT,("Failed to open/create the tracking database '%s'\n", ropath));
+		talloc_free(ropath);
+		return -1;
+	}
+
+	DEBUG(DEBUG_NOTICE,("OPENED tracking database : '%s'\n", ropath));
+
+	ctdb_db->readonly = true;
+	talloc_free(ropath);
+	return 0;
+}
+
 /*
   attach to a database, handling both persistent and non-persistent databases
   return 0 on success, -1 on failure
@@ -932,6 +991,17 @@ again:
 		return -1;
 	}
 
+	/* 
+	   all databases support the "fetch_with_header" function. we need this
+	   for efficient readonly record fetches
+	*/
+	ret = ctdb_daemon_set_call(ctdb, ctdb_db->db_id, ctdb_fetch_with_header_func, CTDB_FETCH_WITH_HEADER_FUNC);
+	if (ret != 0) {
+		DEBUG(DEBUG_CRIT,("Failed to setup fetch function for '%s'\n", ctdb_db->db_name));
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
 	ret = ctdb_vacuum_init(ctdb_db);
 	if (ret != 0) {
 		DEBUG(DEBUG_CRIT,("Failed to setup vacuuming for "
diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c
index eb3bf0a50c..5d98c4480b 100644
--- a/ctdb/server/ctdb_recover.c
+++ b/ctdb/server/ctdb_recover.c
@@ -187,7 +187,12 @@ ctdb_control_getdbmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indat
 	dbid_map->num = len;
 	for (i=0,ctdb_db=ctdb->db_list;ctdb_db;i++,ctdb_db=ctdb_db->next){
 		dbid_map->dbs[i].dbid       = ctdb_db->db_id;
-		dbid_map->dbs[i].persistent = ctdb_db->persistent;
+		if (ctdb_db->persistent != 0) {
+			dbid_map->dbs[i].flags |= CTDB_DB_FLAGS_PERSISTENT;
+		}
+		if (ctdb_db->readonly != 0) {
+			dbid_map->dbs[i].flags |= CTDB_DB_FLAGS_READONLY;
+		}
 	}
 
 	return 0;
@@ -469,6 +474,11 @@ int32_t ctdb_control_push_db(struct ctdb_context *ctdb, TDB_DATA indata)
 			goto failed;
 		}
 		hdr = (struct ctdb_ltdb_header *)data.dptr;
+		/* strip off any read only record flags. All readonly records
+		   are revoked implicitely by a recovery
+		*/
+		hdr->flags &= ~(CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY|CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_REVOKE_COMPLETE);
+
 		data.dptr += sizeof(*hdr);
 		data.dsize -= sizeof(*hdr);
 
@@ -484,6 +494,22 @@ int32_t ctdb_control_push_db(struct ctdb_context *ctdb, TDB_DATA indata)
 	DEBUG(DEBUG_DEBUG,("finished push of %u records for dbid 0x%x\n",
 		 reply->count, reply->db_id));
 
+	if (ctdb_db->readonly) {
+		DEBUG(DEBUG_CRIT,("Clearing the tracking database for dbid 0x%x\n",
+				  ctdb_db->db_id));
+		if (tdb_wipe_all(ctdb_db->rottdb) != 0) {
+			DEBUG(DEBUG_ERR,("Failed to wipe tracking database for 0x%x. Dropping read-only delegation support\n", ctdb_db->db_id));
+			ctdb_db->readonly = false;
+			tdb_close(ctdb_db->rottdb);
+			ctdb_db->rottdb = NULL;
+			ctdb_db->readonly = false;
+		}
+		while (ctdb_db->revokechild_active != NULL) {
+			talloc_free(ctdb_db->revokechild_active);
+			ctdb_db->revokechild_active = NULL;
+		}
+	}
+
 	ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
 	return 0;
 
@@ -1298,3 +1324,4 @@ int32_t ctdb_control_continue_node(struct ctdb_context *ctdb)
 
 	return 0;
 }
+
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index 0fcfd3370d..631f53e89b 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -439,7 +439,8 @@ static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctd
 				return -1;
 			}
 			ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, 
-					   mem_ctx, name, dbmap->dbs[db].persistent);
+					   mem_ctx, name,
+					   dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
 			if (ret != 0) {
 				DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
 				return -1;
@@ -502,7 +503,7 @@ static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb
 				return -1;
 			}
 			ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name, 
-					   remote_dbmap->dbs[db].persistent);
+					   remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
 			if (ret != 0) {
 				DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
 				return -1;
@@ -823,7 +824,7 @@ static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
 
 	for (i=0;i<dbmap->num;i++) {
 		if (dbmap->dbs[i].dbid == recs->db_id) {
-			persistent = dbmap->dbs[i].persistent;
+			persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
 			break;
 		}
 	}
@@ -1515,7 +1516,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
 	for (i=0;i<dbmap->num;i++) {
 		ret = recover_database(rec, mem_ctx,
 				       dbmap->dbs[i].dbid,
-				       dbmap->dbs[i].persistent,
+				       dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
 				       pnn, nodemap, generation);
 		if (ret != 0) {
 			DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
diff --git a/ctdb/tests/src/ctdb_fetch_readonly_loop.c b/ctdb/tests/src/ctdb_fetch_readonly_loop.c
new file mode 100644
index 0000000000..7d73046300
--- /dev/null
+++ b/ctdb/tests/src/ctdb_fetch_readonly_loop.c
@@ -0,0 +1,134 @@
+/* 
+   simple ctdb test tool
+   This test just takes a read-only fetch lock on a record and releases it, in a loop.
+
+   Copyright (C) Ronnie Sahlberg 2009
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "lib/tevent/tevent.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "cmdline.h"
+#include "ctdb_private.h"
+
+static struct ctdb_db_context *ctdb_db;
+
+char *TESTKEY = "testkey";
+static int count;
+
+/*
+	Take a read-only fetch lock on TESTKEY and print the running call count
+	and the record data; exits with status 10 if the lock cannot be taken.
+*/
+static void fetch_lock_once(struct ctdb_context *ctdb, struct event_context *ev)
+{
+	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+	TDB_DATA key, data;
+	struct ctdb_record_handle *h;
+
+	key.dptr = discard_const(TESTKEY);
+	key.dsize = strlen(TESTKEY);
+
+	h = ctdb_fetch_readonly_lock(ctdb_db, tmp_ctx, key, &data, true);
+	if (h == NULL) {
+		printf("Failed to fetch record '%s' on node %d\n",
+		       (const char *)key.dptr, ctdb_get_pnn(ctdb));
+		talloc_free(tmp_ctx);
+		exit(10);
+	}
+
+	count++;
+	/* "%.*s" wants an int precision and a char pointer, so cast both */
+	printf("%d   data:%.*s\n", count, (int)data.dsize, (const char *)data.dptr);
+	talloc_free(tmp_ctx);
+}
+
+/*
+  main program: attach to test.tdb and take read-only fetch locks on the record in an endless loop
+*/
+int main(int argc, const char *argv[])
+{
+	struct ctdb_context *ctdb;
+	TDB_DATA key;
+
+	struct poptOption popt_options[] = {
+		POPT_AUTOHELP
+		POPT_CTDB_CMDLINE
+		{ "record",      'r', POPT_ARG_STRING, &TESTKEY, 0, "record", "string" },
+		POPT_TABLEEND
+	};
+	int opt, ret;
+	const char **extra_argv;
+	int extra_argc = 0;
+	poptContext pc;
+	struct event_context *ev;
+
+	pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+	while ((opt = poptGetNextOpt(pc)) != -1) {
+		switch (opt) {
+		default:
+			fprintf(stderr, "Invalid option %s: %s\n",
+				poptBadOption(pc, 0), poptStrerror(opt));
+			exit(1);
+		}
+	}
+
+	/* setup the remaining options for the main program to use */
+	extra_argv = poptGetArgs(pc);
+	if (extra_argv) {
+		extra_argv++;
+		while (extra_argv[extra_argc]) extra_argc++;
+	}
+
+	ev = event_context_init(NULL);
+
+	ctdb = ctdb_cmdline_client(ev);
+
+	key.dptr  = discard_const(TESTKEY);	/* same idiom as fetch_lock_once() */
+	key.dsize = strlen(TESTKEY);
+
+	ret = ctdb_ctrl_getvnnmap(ctdb, timeval_zero(), CTDB_CURRENT_NODE, ctdb, &ctdb->vnn_map);
+	if (ret != 0) {
+		printf("failed to get vnnmap\n");
+		exit(10);
+	}
+	printf("Record:%s\n", TESTKEY);
+	printf("Lmaster : %d\n", ctdb_lmaster(ctdb, &key));
+
+	/* attach to a specific database; ctdb_attach() takes a timeout as its 2nd argument (timeval_zero() as for the controls here) */
+	ctdb_db = ctdb_attach(ctdb, timeval_zero(), "test.tdb", false, 0);
+	if (!ctdb_db) {
+		printf("ctdb_attach failed - %s\n", ctdb_errstr(ctdb));
+		exit(1);
+	}
+
+	printf("Waiting for cluster\n");
+	while (1) {
+		uint32_t recmode=1;
+		ctdb_ctrl_getrecmode(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
+		if (recmode == 0) break;	/* loop until the reported recmode is 0 */
+		event_loop_once(ev);
+	}
+
+	while (1) {
+		fetch_lock_once(ctdb, ev);
+		usleep(10000);
+	}
+
+	return 0;
+}
diff --git a/ctdb/tests/src/ctdb_fetch_readonly_once.c b/ctdb/tests/src/ctdb_fetch_readonly_once.c
new file mode 100644
index 0000000000..662f4704d4
--- /dev/null
+++ b/ctdb/tests/src/ctdb_fetch_readonly_once.c
@@ -0,0 +1,134 @@
+/* 
+   simple ctdb test tool
+   This test just takes a read-only fetch lock on a record and releases it once.
+
+   Copyright (C) Ronnie Sahlberg 2009
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "lib/tevent/tevent.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "cmdline.h"
+#include "ctdb_private.h"
+
+static struct ctdb_db_context *ctdb_db;
+
+char *TESTKEY = "testkey";
+
+
+/*
+	Take a read-only fetch lock on TESTKEY, wait until the user presses
+	enter, then free the temporary talloc context passed to the lock call.
+*/
+static void fetch_lock_once(struct ctdb_context *ctdb, struct event_context *ev)
+{
+	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
+	struct ctdb_record_handle *handle;
+	TDB_DATA rec_key;
+	TDB_DATA rec_data;
+
+	rec_key.dsize = strlen(TESTKEY);
+	rec_key.dptr = discard_const(TESTKEY);
+
+	printf("Trying to fetch lock the record ...\n");
+	handle = ctdb_fetch_readonly_lock(ctdb_db, mem_ctx, rec_key, &rec_data, true);
+	if (!handle) {
+		printf("Failed to fetch record '%s' on node %d\n",
+		       (const char *)rec_key.dptr, ctdb_get_pnn(ctdb));
+		talloc_free(mem_ctx);
+		exit(10);
+	}
+
+	printf("Record fetchlocked.\n");
+	printf("Press enter to release the record ...\n");
+	(void)getchar();
+	talloc_free(mem_ctx);
+	printf("Record released.\n");
+}
+
+/*
+  main program: attach to test.tdb and take one read-only fetch lock on the record
+*/
+int main(int argc, const char *argv[])
+{
+	struct ctdb_context *ctdb;
+	TDB_DATA key;
+
+	struct poptOption popt_options[] = {
+		POPT_AUTOHELP
+		POPT_CTDB_CMDLINE
+		{ "record",      'r', POPT_ARG_STRING, &TESTKEY, 0, "record", "string" },
+		POPT_TABLEEND
+	};
+	int opt, ret;
+	const char **extra_argv;
+	int extra_argc = 0;
+	poptContext pc;
+	struct event_context *ev;
+
+	pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+	while ((opt = poptGetNextOpt(pc)) != -1) {
+		switch (opt) {
+		default:
+			fprintf(stderr, "Invalid option %s: %s\n",
+				poptBadOption(pc, 0), poptStrerror(opt));
+			exit(1);
+		}
+	}
+
+	/* setup the remaining options for the main program to use */
+	extra_argv = poptGetArgs(pc);
+	if (extra_argv) {
+		extra_argv++;
+		while (extra_argv[extra_argc]) extra_argc++;
+	}
+
+	ev = event_context_init(NULL);
+
+	ctdb = ctdb_cmdline_client(ev);
+
+	key.dptr  = discard_const(TESTKEY);	/* same idiom as fetch_lock_once() */
+	key.dsize = strlen(TESTKEY);
+
+	ret = ctdb_ctrl_getvnnmap(ctdb, timeval_zero(), CTDB_CURRENT_NODE, ctdb, &ctdb->vnn_map);
+	if (ret != 0) {
+		printf("failed to get vnnmap\n");
+		exit(10);
+	}
+	printf("Record:%s\n", TESTKEY);
+	printf("Lmaster : %d\n", ctdb_lmaster(ctdb, &key));
+
+	/* attach to a specific database; ctdb_attach() takes a timeout as its 2nd argument (timeval_zero() as for the controls here) */
+	ctdb_db = ctdb_attach(ctdb, timeval_zero(), "test.tdb", false, 0);
+	if (!ctdb_db) {
+		printf("ctdb_attach failed - %s\n", ctdb_errstr(ctdb));
+		exit(1);
+	}
+
+	printf("Waiting for cluster\n");
+	while (1) {
+		uint32_t recmode=1;
+		ctdb_ctrl_getrecmode(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
+		if (recmode == 0) break;	/* loop until the reported recmode is 0 */
+		event_loop_once(ev);
+	}
+
+	fetch_lock_once(ctdb, ev);
+
+	return 0;
+}
diff --git a/ctdb/tests/src/ctdb_trackingdb_test.c b/ctdb/tests/src/ctdb_trackingdb_test.c
new file mode 100644
index 0000000000..77eb7b9f08
--- /dev/null
+++ b/ctdb/tests/src/ctdb_trackingdb_test.c
@@ -0,0 +1,136 @@
+/* 
+   simple trackingdb test tool
+
+   This program is used to test the functions to manipulate and enumerate
+   the trackingdb records :
+	ctdb_trackingdb_add_pnn()
+	ctdb_trackingdb_traverse()
+
+   Copyright (C) Ronnie Sahlberg 2009
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <stdlib.h>
+#include <time.h>
+#include "includes.h"
+#include "lib/tevent/tevent.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "cmdline.h"
+#include "ctdb_private.h"
+#include "db_wrap.h"
+
+#define MAXINDEX 64
+char indices[MAXINDEX];
+
+/* ctdb_trackingdb_traverse() callback: tick off pnn in the char array passed as private_data */
+void vn_cb(struct ctdb_context *ctdb, uint32_t pnn, void *private_data)
+{
+	char *ind = private_data;
+	printf("Callback for node %u\n", pnn);
+	if (pnn >= MAXINDEX || ind[pnn] == 0) {	/* an out-of-range pnn was never added either */
+		printf("ERROR, node %u from callback was never added\n", pnn);
+		exit(10);
+	}
+	ind[pnn] = 0;
+}
+
+/* Traverse data with vn_cb and fail if any slot set by add_node() is still set afterwards */
+void verify_nodes(struct ctdb_context *ctdb, TDB_DATA data)
+{
+	int i;
+	printf("Verify the nodes\n");
+	ctdb_trackingdb_traverse(ctdb, data, vn_cb, indices);
+	for(i = 0; i < MAXINDEX; i++) {
+		if (indices[i] != 0) {
+			printf("Callback for %d was never invoked\n", i);
+			exit(10);	/* a missed callback is a failure; do not exit with status 0 */
+		}
+	}
+}
+
+	
+	
+/* Add pnn to the tracking data and mark it in indices[]; exits with status 10 on failure */
+void add_node(struct ctdb_context *ctdb, TDB_DATA *data, int pnn)
+{
+	printf("Add node %d\n", pnn);
+	if (ctdb_trackingdb_add_pnn(ctdb, data, pnn)) {
+		printf("Failed to add tracking db data\n");
+		exit(10);
+	}
+	/* remember the pnn so verify_nodes() can check that its callback fires */
+	indices[pnn] = 1;
+}
+
+/* Add ten randomly chosen pnns to an empty tracking blob, then verify them */
+static void trackdb_test(struct ctdb_context *ctdb)
+{
+	TDB_DATA blob = {NULL,0};
+	int remaining = 10;
+
+	printf("Add 10 nodes\n");
+	srandom(time(NULL));
+	while (remaining-- > 0) {
+		add_node(ctdb, &blob, random()%MAXINDEX);
+	}
+	verify_nodes(ctdb, blob);
+	printf("OK all seems well\n");
+}
+
+/*
+  main program: parse the command line, connect as a ctdb client and run
+  the tracking db test
+*/
+int main(int argc, const char *argv[])
+{
+	struct poptOption popt_options[] = {
+		POPT_AUTOHELP
+		POPT_CTDB_CMDLINE
+		POPT_TABLEEND
+	};
+	struct ctdb_context *ctdb;
+	struct event_context *ev;
+	const char **extra_argv;
+	int extra_argc = 0;
+	poptContext pc;
+	int opt;
+
+	pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+	/* anything other than -1 from poptGetNextOpt() is treated as a fatal option error */
+	opt = poptGetNextOpt(pc);
+	if (opt != -1) {
+		fprintf(stderr, "Invalid option %s: %s\n",
+			poptBadOption(pc, 0), poptStrerror(opt));
+		exit(1);
+	}
+
+	/* setup the remaining options for the main program to use */
+	extra_argv = poptGetArgs(pc);
+	if (extra_argv != NULL) {
+		extra_argv++;
+		for (; extra_argv[extra_argc] != NULL; extra_argc++) {
+			/* just counting */
+		}
+	}
+
+	ev = event_context_init(NULL);
+	ctdb = ctdb_cmdline_client(ev);
+
+	trackdb_test(ctdb);
+
+	return 0;
+}
diff --git a/ctdb/tests/src/ctdb_update_record.c b/ctdb/tests/src/ctdb_update_record.c
new file mode 100644
index 0000000000..5f4b9c1e21
--- /dev/null
+++ b/ctdb/tests/src/ctdb_update_record.c
@@ -0,0 +1,158 @@
+/* 
+   simple ctdb test tool
+   This test just fetch_locks a record, bumps the RSN and then writes new content
+
+   Copyright (C) Ronnie Sahlberg 2009
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "lib/tevent/tevent.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "cmdline.h"
+#include "ctdb_private.h"
+
+static struct ctdb_db_context *ctdb_db;
+
+#define TESTKEY "testkey"
+
+
+/*
+	Fetch-lock TESTKEY, print its RSN, store a new record with RSN+10 via
+	ctdb_ctrl_updaterecord(), then fetch-lock it again and print the RSN.
+*/
+static void fetch_lock_once(struct ctdb_context *ctdb, struct event_context *ev, uint32_t generation)
+{
+	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
+	struct ctdb_record_handle *handle;
+	struct ctdb_ltdb_header *hdr;
+	TDB_DATA rec_key;
+	TDB_DATA rec_data;
+	int status;
+
+	rec_key.dsize = strlen(TESTKEY);
+	rec_key.dptr = discard_const(TESTKEY);
+
+	printf("Trying to fetch lock the record ...\n");
+	handle = ctdb_fetch_readonly_lock(ctdb_db, mem_ctx, rec_key, &rec_data, false);
+	if (!handle) {
+		printf("Failed to fetch record '%s' on node %d\n",
+		       (const char *)rec_key.dptr, ctdb_get_pnn(ctdb));
+		talloc_free(mem_ctx);
+		exit(10);
+	}
+	printf("Record fetchlocked.\n");
+
+	/* copy the header before the handle is freed; the copy is modified below */
+	hdr = talloc_memdup(mem_ctx, ctdb_header_from_record_handle(handle), sizeof(*hdr));
+	printf("RSN:%d\n", (int)hdr->rsn);
+	talloc_free(handle);
+	printf("Record released.\n");
+
+	printf("Write new record with RSN+10\n");
+	hdr->rsn += 10;
+	rec_data.dptr = (void *)talloc_asprintf(mem_ctx, "%d", (int)hdr->rsn);
+	rec_data.dsize = strlen((char *)rec_data.dptr);
+	status = ctdb_ctrl_updaterecord(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, ctdb_db, rec_key, hdr, rec_data);
+	if (status != 0) {
+		printf("Failed to writerecord,  ret==%d\n", status);
+		exit(1);
+	}
+
+	printf("re-fetch the record\n");
+	handle = ctdb_fetch_lock(ctdb_db, mem_ctx, rec_key, &rec_data);
+	if (!handle) {
+		printf("Failed to fetch record '%s' on node %d\n",
+		       (const char *)rec_key.dptr, ctdb_get_pnn(ctdb));
+		talloc_free(mem_ctx);
+		exit(10);
+	}
+	printf("Record fetchlocked.\n");
+
+	hdr = talloc_memdup(mem_ctx, ctdb_header_from_record_handle(handle), sizeof(*hdr));
+	printf("RSN:%d\n", (int)hdr->rsn);
+	talloc_free(handle);
+	printf("Record released.\n");
+	talloc_free(mem_ctx);
+}
+
+/*
+  main program: attach to test.tdb, wait for recmode 0, print the generation and run the update test
+*/
+int main(int argc, const char *argv[])
+{
+	struct ctdb_context *ctdb;
+
+	struct poptOption popt_options[] = {
+		POPT_AUTOHELP
+		POPT_CTDB_CMDLINE
+		POPT_TABLEEND
+	};
+	int opt;
+	const char **extra_argv;
+	int extra_argc = 0;
+	poptContext pc;
+	struct event_context *ev;
+	struct ctdb_vnn_map *vnnmap=NULL;
+
+	pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+	while ((opt = poptGetNextOpt(pc)) != -1) {
+		switch (opt) {
+		default:
+			fprintf(stderr, "Invalid option %s: %s\n",
+				poptBadOption(pc, 0), poptStrerror(opt));
+			exit(1);
+		}
+	}
+
+	/* setup the remaining options for the main program to use */
+	extra_argv = poptGetArgs(pc);
+	if (extra_argv) {
+		extra_argv++;
+		while (extra_argv[extra_argc]) extra_argc++;
+	}
+
+	ev = event_context_init(NULL);
+
+	ctdb = ctdb_cmdline_client(ev);
+
+	/* attach to a specific database; ctdb_attach() takes a timeout as its 2nd argument (timeval_zero() as for the controls here) */
+	ctdb_db = ctdb_attach(ctdb, timeval_zero(), "test.tdb", false, 0);
+	if (!ctdb_db) {
+		printf("ctdb_attach failed - %s\n", ctdb_errstr(ctdb));
+		exit(1);
+	}
+
+	printf("Waiting for cluster\n");
+	while (1) {
+		uint32_t recmode=1;
+		ctdb_ctrl_getrecmode(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
+		if (recmode == 0) break;	/* loop until the reported recmode is 0 */
+		event_loop_once(ev);
+	}
+
+
+	if (ctdb_ctrl_getvnnmap(ctdb, timeval_zero(), CTDB_CURRENT_NODE, ctdb, &vnnmap) != 0) {
+		printf("Unable to get vnnmap from local node\n");
+		exit(1);
+	}
+	printf("Current Generation %d\n", (int)vnnmap->generation);
+
+	fetch_lock_once(ctdb, ev, vnnmap->generation);
+
+	return 0;
+}
diff --git a/ctdb/tools/ctdb.c b/ctdb/tools/ctdb.c
index a8474cf9b8..26c0bfb769 100644
--- a/ctdb/tools/ctdb.c
+++ b/ctdb/tools/ctdb.c
@@ -123,7 +123,7 @@ static int db_exists(struct ctdb_context *ctdb, const char *db_name, bool *persi
 		ctdb_ctrl_getdbname(ctdb, TIMELIMIT(), options.pnn, dbmap->dbs[i].dbid, ctdb, &name);
 		if (!strcmp(name, db_name)) {
 			if (persistent) {
-				*persistent = dbmap->dbs[i].persistent;
+				*persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
 			}
 			return 0;
 		}
@@ -3007,6 +3007,62 @@ static int control_catdb(struct ctdb_context *ctdb, int argc, const char **argv)
 	return 0;
 }
 
+/* state shared with cattdb_traverse() while dumping a local tdb */
+struct cattdb_data {
+	struct ctdb_context *ctdb;	/* handed to ctdb_dumpdb_record() */
+	uint32_t count;			/* records visited so far */
+};
+
+/* tdb traverse callback: count the record and pass it to ctdb_dumpdb_record() with stdout */
+static int cattdb_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private_data)
+{
+	struct cattdb_data *state = private_data;
+	state->count += 1;
+	return ctdb_dumpdb_record(state->ctdb, key, data, stdout);
+}
+
+/*
+  cat the local tdb database using same format as catdb
+ */
+static int control_cattdb(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+	const char *db_name;
+	struct ctdb_db_context *ctdb_db;
+	struct cattdb_data d;
+	bool persistent;
+
+	if (argc < 1) {
+		usage();
+	}
+
+	db_name = argv[0];
+
+	/* db_exists() returns 0 when the database is found and fills in persistent */
+	if (db_exists(ctdb, db_name, &persistent)) {
+		DEBUG(DEBUG_ERR,("Database '%s' does not exist\n", db_name));
+		return -1;
+	}
+
+	/* attach with the persistence reported by db_exists(), not a fixed "false" */
+	ctdb_db = ctdb_attach(ctdb, TIMELIMIT(), db_name, persistent, 0);
+	if (ctdb_db == NULL) {
+		DEBUG(DEBUG_ERR,("Unable to attach to database '%s'\n", db_name));
+		return -1;
+	}
+
+	/* traverse the local tdb */
+	d.count = 0;
+	d.ctdb  = ctdb;
+	if (tdb_traverse_read(ctdb_db->ltdb->tdb, cattdb_traverse, &d) == -1) {
+		printf("Failed to cattdb data\n");
+		exit(10);
+	}
+	talloc_free(ctdb_db);
+
+	printf("Dumped %u records\n", d.count);	/* count is a uint32_t */
+	return 0;
+}
+
 /*
   display the content of a database key
  */
@@ -3492,12 +3548,13 @@ static int control_getdbmap(struct ctdb_context *ctdb, int argc, const char **ar
 	}
 
 	if(options.machinereadable){
-		printf(":ID:Name:Path:Persistent:Unhealthy:\n");
+		printf(":ID:Name:Path:Persistent:Unhealthy:ReadOnly:\n");
 		for(i=0;i<dbmap->num;i++){
 			const char *path;
 			const char *name;
 			const char *health;
 			bool persistent;
+			bool readonly;
 
 			ctdb_ctrl_getdbpath(ctdb, TIMELIMIT(), options.pnn,
 					    dbmap->dbs[i].dbid, ctdb, &path);
@@ -3505,10 +3562,11 @@ static int control_getdbmap(struct ctdb_context *ctdb, int argc, const char **ar
 					    dbmap->dbs[i].dbid, ctdb, &name);
 			ctdb_ctrl_getdbhealth(ctdb, TIMELIMIT(), options.pnn,
 					      dbmap->dbs[i].dbid, ctdb, &health);
-			persistent = dbmap->dbs[i].persistent;
-			printf(":0x%08X:%s:%s:%d:%d:\n",
+			persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
+			readonly   = dbmap->dbs[i].flags & CTDB_DB_FLAGS_READONLY;
+			printf(":0x%08X:%s:%s:%d:%d:%d:\n",
 			       dbmap->dbs[i].dbid, name, path,
-			       !!(persistent), !!(health));
+			       !!(persistent), !!(health), !!(readonly));
 		}
 		return 0;
 	}
@@ -3519,14 +3577,17 @@ static int control_getdbmap(struct ctdb_context *ctdb, int argc, const char **ar
 		const char *name;
 		const char *health;
 		bool persistent;
+		bool readonly;
 
 		ctdb_ctrl_getdbpath(ctdb, TIMELIMIT(), options.pnn, dbmap->dbs[i].dbid, ctdb, &path);
 		ctdb_ctrl_getdbname(ctdb, TIMELIMIT(), options.pnn, dbmap->dbs[i].dbid, ctdb, &name);
 		ctdb_ctrl_getdbhealth(ctdb, TIMELIMIT(), options.pnn, dbmap->dbs[i].dbid, ctdb, &health);
-		persistent = dbmap->dbs[i].persistent;
-		printf("dbid:0x%08x name:%s path:%s%s%s\n",
+		persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
+		readonly   = dbmap->dbs[i].flags & CTDB_DB_FLAGS_READONLY;
+		printf("dbid:0x%08x name:%s path:%s%s%s%s\n",
 		       dbmap->dbs[i].dbid, name, path,
 		       persistent?" PERSISTENT":"",
+		       readonly?" READONLY":"",
 		       health?" UNHEALTHY":"");
 	}
 
@@ -3559,6 +3620,7 @@ static int control_getdbstatus(struct ctdb_context *ctdb, int argc, const char *
 		const char *name;
 		const char *health;
 		bool persistent;
+		bool readonly;
 
 		ctdb_ctrl_getdbname(ctdb, TIMELIMIT(), options.pnn, dbmap->dbs[i].dbid, ctdb, &name);
 		if (strcmp(name, db_name) != 0) {
@@ -3567,10 +3629,12 @@ static int control_getdbstatus(struct ctdb_context *ctdb, int argc, const char *
 
 		ctdb_ctrl_getdbpath(ctdb, TIMELIMIT(), options.pnn, dbmap->dbs[i].dbid, ctdb, &path);
 		ctdb_ctrl_getdbhealth(ctdb, TIMELIMIT(), options.pnn, dbmap->dbs[i].dbid, ctdb, &health);
-		persistent = dbmap->dbs[i].persistent;
-		printf("dbid: 0x%08x\nname: %s\npath: %s\nPERSISTENT: %s\nHEALTH: %s\n",
+		persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
+		readonly   = dbmap->dbs[i].flags & CTDB_DB_FLAGS_READONLY;
+		printf("dbid: 0x%08x\nname: %s\npath: %s\nPERSISTENT: %s\nREADONLY: %s\nHEALTH: %s\n",
 		       dbmap->dbs[i].dbid, name, path,
 		       persistent?"yes":"no",
+		       readonly?"yes":"no",
 		       health?health:"OK");
 		return 0;
 	}
@@ -4010,6 +4074,29 @@ static int control_getdbprio(struct ctdb_context *ctdb, int argc, const char **a
 }
 
 /*
+  set the readonly capability for a database
+ */
+static int control_setdbreadonly(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+	uint32_t dbid;
+
+	/* the <dbid> argument is required */
+	if (argc < 1) {
+		usage();
+	}
+
+	/* base 0: strtoul() accepts decimal, octal (0...) and hex (0x...) ids */
+	dbid = strtoul(argv[0], NULL, 0);
+
+	if (ctdb_ctrl_set_db_readonly(ctdb, options.pnn, dbid) != 0) {
+		DEBUG(DEBUG_ERR,("Unable to set db to support readonly\n"));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
   run an eventscript on a node
  */
 static int control_eventscript(struct ctdb_context *ctdb, int argc, const char **argv)
@@ -4159,7 +4246,7 @@ static int control_backupdb(struct ctdb_context *ctdb, int argc, const char **ar
 				     allow_unhealthy));
 	}
 
-	ctdb_db = ctdb_attach(ctdb, TIMELIMIT(), argv[0], dbmap->dbs[i].persistent, 0);
+	ctdb_db = ctdb_attach(ctdb, TIMELIMIT(), argv[0], dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT, 0);
 	if (ctdb_db == NULL) {
 		DEBUG(DEBUG_ERR,("Unable to attach to database '%s'\n", argv[0]));
 		talloc_free(tmp_ctx);
@@ -4211,7 +4298,7 @@ static int control_backupdb(struct ctdb_context *ctdb, int argc, const char **ar
 
 	dbhdr.version = DB_VERSION;
 	dbhdr.timestamp = time(NULL);
-	dbhdr.persistent = dbmap->dbs[i].persistent;
+	dbhdr.persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
 	dbhdr.size = bd->len;
 	if (strlen(argv[0]) >= MAX_DB_NAME) {
 		DEBUG(DEBUG_ERR,("Too long dbname\n"));
@@ -4558,7 +4645,7 @@ static int control_wipedb(struct ctdb_context *ctdb, int argc,
 		return -1;
 	}
 
-	ctdb_db = ctdb_attach(ctdb, TIMELIMIT(), argv[0], dbmap->dbs[i].persistent, 0);
+	ctdb_db = ctdb_attach(ctdb, TIMELIMIT(), argv[0], dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT, 0);
 	if (ctdb_db == NULL) {
 		DEBUG(DEBUG_ERR, ("Unable to attach to database '%s'\n",
 				  argv[0]));
@@ -4934,6 +5021,7 @@ static const struct {
 	{ "getdbmap",        control_getdbmap,          true,	false,  "show the database map" },
 	{ "getdbstatus",     control_getdbstatus,       true,	false,  "show the status of a database", "<dbname>" },
 	{ "catdb",           control_catdb,             true,	false,  "dump a database" ,                     "<dbname>"},
+	{ "cattdb",          control_cattdb,            true,	false,  "dump a database" ,                     "<dbname>"},
 	{ "getmonmode",      control_getmonmode,        true,	false,  "show monitoring mode" },
 	{ "getcapabilities", control_getcapabilities,   true,	false,  "show node capabilities" },
 	{ "pnn",             control_pnn,               true,	false,  "show the pnn of the currnet node" },
@@ -4999,6 +5087,7 @@ static const struct {
 	{ "setrecmasterrole", control_setrecmasterrole,	false,	false, "Set RECMASTER role to on/off", "{on|off}"},
 	{ "setdbprio",        control_setdbprio,	false,	false, "Set DB priority", "<dbid> <prio:1-3>"},
 	{ "getdbprio",        control_getdbprio,	false,	false, "Get DB priority", "<dbid>"},
+	{ "setdbreadonly",    control_setdbreadonly,	false,	false, "Set DB readonly capable", "<dbid>"},
 	{ "msglisten",        control_msglisten,	false,	false, "Listen on a srvid port for messages", "<msg srvid>"},
 	{ "msgsend",          control_msgsend,	false,	false, "Send a message to srvid", "<srvid> <message>"},
 	{ "sync", 	     control_ipreallocate,      false,	false,  "wait until ctdbd has synced all state changes" },
diff --git a/ctdb/tools/ctdb_vacuum.c b/ctdb/tools/ctdb_vacuum.c
index e5ed9515fd..a9dc28a9a6 100644
--- a/ctdb/tools/ctdb_vacuum.c
+++ b/ctdb/tools/ctdb_vacuum.c
@@ -467,7 +467,7 @@ int ctdb_vacuum(struct ctdb_context *ctdb, int argc, const char **argv)
 
 	for (i=0;i<dbmap->num;i++) {
 		if (ctdb_vacuum_db(ctdb, dbmap->dbs[i].dbid, nodemap, 
-				   dbmap->dbs[i].persistent, vacuum_limit) != 0) {
+				   dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT, vacuum_limit) != 0) {
 			DEBUG(DEBUG_ERR,("Failed to vacuum db 0x%x\n", dbmap->dbs[i].dbid));
 			return -1;
 		}
@@ -630,7 +630,7 @@ int ctdb_repack(struct ctdb_context *ctdb, int argc, const char **argv)
 
 	for (i=0;i<dbmap->num;i++) {
 		if (ctdb_repack_db(ctdb, dbmap->dbs[i].dbid, 
-				   dbmap->dbs[i].persistent, repack_limit) != 0) {
+				   dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT, repack_limit) != 0) {
 			DEBUG(DEBUG_ERR,("Failed to repack db 0x%x\n", dbmap->dbs[i].dbid));
 			return -1;
 		}
author	Ronnie Sahlberg <ronniesahlberg@gmail.com>	2011-09-12 09:34:34 +1000
committer	Ronnie Sahlberg <ronniesahlberg@gmail.com>	2011-09-12 09:34:34 +1000
commit	0dc5584101e61eeadf908d3340c2ef2fecd4cc22 (patch)
tree	fbd9296e38e71309d80c9ddf9abcc58aae9d9c4e
parent	d78b0ff985c7a389ab4678fef4c2cc30cd278f42 (diff)
parent	01388c4414fcd976581f661cbe764fa0f984b293 (diff)
download	samba-0dc5584101e61eeadf908d3340c2ef2fecd4cc22.tar.gz samba-0dc5584101e61eeadf908d3340c2ef2fecd4cc22.tar.xz samba-0dc5584101e61eeadf908d3340c2ef2fecd4cc22.zip