summary | refs | log | tree | commit | diff | stats
path: root/ctdb/client
diff options
context:
space:
mode:
author: Andrew Tridgell <tridge@samba.org>, 2008-08-04 14:51:51 +1000
committer: Andrew Tridgell <tridge@samba.org>, 2008-08-04 14:51:51 +1000
commit: 78acc59784522f52fe75b9a6dd7784595035a93e (patch)
tree: 5d6c6e155187fcd870784173fe328e55ddbda03d /ctdb/client
parent: 8d76f55bfc0806bf339c548e53fd1a142b464e5c (diff)
download: samba-78acc59784522f52fe75b9a6dd7784595035a93e.tar.gz
samba-78acc59784522f52fe75b9a6dd7784595035a93e.tar.xz
samba-78acc59784522f52fe75b9a6dd7784595035a93e.zip
implemented replayable transactions in ctdb to prevent deadlock
(This used to be ctdb commit b6d9a0396fb4b325778d3810dc656f719f31b9f1)
Diffstat (limited to 'ctdb/client')
-rw-r--r-- ctdb/client/ctdb_client.c 176
1 files changed, 145 insertions, 31 deletions
diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
index 5004a69b0e..48eb19d969 100644
--- a/ctdb/client/ctdb_client.c
+++ b/ctdb/client/ctdb_client.c
@@ -2956,7 +2956,12 @@ int ctdb_ctrl_getcapabilities(struct ctdb_context *ctdb, struct timeval timeout,
struct ctdb_transaction_handle {
struct ctdb_db_context *ctdb_db;
- struct ctdb_marshall_buffer *m;
+ bool in_replay; /* set while ctdb_replay_transaction() re-runs the recorded operations */
+ /* we store the reads and writes done under a transaction: one
+ list (m_all) stores both reads and writes, the other (m_write) just writes
+ */
+ struct ctdb_marshall_buffer *m_all;
+ struct ctdb_marshall_buffer *m_write;
};
/* start a transaction on a database */
@@ -2967,33 +2972,32 @@ static int ctdb_transaction_destructor(struct ctdb_transaction_handle *h)
}
/* start a transaction on a database */
-struct ctdb_transaction_handle *ctdb_transaction_start(struct ctdb_db_context *ctdb_db,
- TALLOC_CTX *mem_ctx)
+static int ctdb_transaction_fetch_start(struct ctdb_transaction_handle *h)
{
struct ctdb_record_handle *rh;
- struct ctdb_transaction_handle *h;
TDB_DATA key;
struct ctdb_ltdb_header header;
TALLOC_CTX *tmp_ctx;
const char *keyname = CTDB_TRANSACTION_LOCK_KEY;
int ret;
+ struct ctdb_db_context *ctdb_db = h->ctdb_db;
key.dptr = discard_const(keyname);
key.dsize = strlen(keyname);
if (!ctdb_db->persistent) {
DEBUG(DEBUG_ERR,(__location__ " Attempted transaction on non-persistent database\n"));
- return NULL;
+ return -1;
}
again:
- tmp_ctx = talloc_new(mem_ctx);
+ tmp_ctx = talloc_new(h);
rh = ctdb_fetch_lock(ctdb_db, tmp_ctx, key, NULL);
if (rh == NULL) {
DEBUG(DEBUG_ERR,(__location__ " Failed to fetch_lock database\n"));
talloc_free(tmp_ctx);
- return NULL;
+ return -1;
}
talloc_free(rh);
@@ -3001,7 +3005,7 @@ again:
if (ret != 0) {
DEBUG(DEBUG_ERR,(__location__ " Failed to start tdb transaction\n"));
talloc_free(tmp_ctx);
- return NULL;
+ return -1;
}
ret = ctdb_ltdb_fetch(ctdb_db, key, &header, tmp_ctx, NULL);
@@ -3013,16 +3017,32 @@ again:
talloc_free(tmp_ctx);
+ return 0;
+}
+
+
+/* start a transaction on a database */
+struct ctdb_transaction_handle *ctdb_transaction_start(struct ctdb_db_context *ctdb_db,
+ TALLOC_CTX *mem_ctx)
+{
+ struct ctdb_transaction_handle *h;
+ int ret;
+
/* we have a good transaction */
h = talloc_zero(mem_ctx, struct ctdb_transaction_handle);
if (h == NULL) {
- tdb_transaction_cancel(ctdb_db->ltdb->tdb);
DEBUG(DEBUG_ERR,(__location__ " oom for transaction handle\n"));
return NULL;
}
h->ctdb_db = ctdb_db;
+ ret = ctdb_transaction_fetch_start(h);
+ if (ret != 0) {
+ talloc_free(h);
+ return NULL;
+ }
+
talloc_set_destructor(h, ctdb_transaction_destructor);
return h;
@@ -3046,9 +3066,22 @@ int ctdb_transaction_fetch(struct ctdb_transaction_handle *h,
if (ret == -1 && header.dmaster == (uint32_t)-1) {
/* record doesn't exist yet */
*data = tdb_null;
- return 0;
+ ret = 0;
}
- return ret;
+
+ if (ret != 0) {
+ return ret;
+ }
+
+ if (!h->in_replay) {
+ h->m_all = ctdb_marshall_add(h, h->m_all, h->ctdb_db->db_id, 1, key, NULL, *data);
+ if (h->m_all == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to add to marshalling record\n"));
+ return -1;
+ }
+ }
+
+ return 0;
}
/*
@@ -3079,11 +3112,20 @@ int ctdb_transaction_store(struct ctdb_transaction_handle *h,
header.rsn++;
- h->m = ctdb_marshall_add(h, h->m, h->ctdb_db->db_id, 0, key, &header, data);
- if (h->m == NULL) {
- DEBUG(DEBUG_ERR,(__location__ " Failed to add to marshalling record\n"));
- talloc_free(tmp_ctx);
- return -1;
+ if (!h->in_replay) {
+ h->m_all = ctdb_marshall_add(h, h->m_all, h->ctdb_db->db_id, 0, key, NULL, data);
+ if (h->m_all == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to add to marshalling record\n"));
+ talloc_free(tmp_ctx);
+ return -1;
+ }
+
+ h->m_write = ctdb_marshall_add(h, h->m_write, h->ctdb_db->db_id, 0, key, &header, data);
+ if (h->m_write == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to add to marshalling record\n"));
+ talloc_free(tmp_ctx);
+ return -1;
+ }
}
ret = ctdb_ltdb_store(h->ctdb_db, key, &header, data);
@@ -3094,6 +3136,61 @@ int ctdb_transaction_store(struct ctdb_transaction_handle *h,
}
/*
+ replay a transaction: restart it, then redo each operation recorded in h->m_all in order; returns 0 on success, non-zero on failure
+ */
+static int ctdb_replay_transaction(struct ctdb_transaction_handle *h)
+{
+ int ret, i;
+ struct ctdb_rec_data *rec = NULL;
+
+ h->in_replay = true; /* fetch/store skip adding to m_all/m_write while this is set */
+
+ ret = ctdb_transaction_fetch_start(h);
+ if (ret != 0) {
+ return ret;
+ }
+
+ for (i=0;i<h->m_all->count;i++) {
+ TDB_DATA key, data;
+
+ rec = ctdb_marshall_loop_next(h->m_all, rec, NULL, NULL, &key, &data);
+ if (rec == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " Out of records in ctdb_replay_transaction?\n"));
+ goto failed;
+ }
+
+ if (rec->reqid == 0) {
+ /* it's a store: apply the recorded write again */
+ if (ctdb_transaction_store(h, key, data) != 0) {
+ goto failed;
+ }
+ } else { /* otherwise a recorded read: fetch again and compare with the data seen the first time */
+ TDB_DATA data2;
+ TALLOC_CTX *tmp_ctx = talloc_new(h);
+
+ if (ctdb_transaction_fetch(h, tmp_ctx, key, &data2) != 0) {
+ talloc_free(tmp_ctx);
+ goto failed;
+ }
+ if (data2.dsize != data.dsize ||
+ memcmp(data2.dptr, data.dptr, data.dsize) != 0) {
+ /* the record has changed on us - we have to give up */
+ talloc_free(tmp_ctx);
+ goto failed;
+ }
+ talloc_free(tmp_ctx);
+ }
+ }
+
+ return 0;
+
+failed: /* any replay error cancels the tdb transaction before returning -1 */
+ tdb_transaction_cancel(h->ctdb_db->ltdb->tdb);
+ return -1;
+}
+
+
+/*
commit a transaction
*/
int ctdb_transaction_commit(struct ctdb_transaction_handle *h)
@@ -3101,23 +3198,47 @@ int ctdb_transaction_commit(struct ctdb_transaction_handle *h)
int ret;
int32_t status;
struct ctdb_context *ctdb = h->ctdb_db->ctdb;
+ struct timeval timeout;
talloc_set_destructor(h, NULL);
- if (h->m == NULL) {
+ if (h->m_write == NULL) {
/* no changes were made */
talloc_free(h);
return 0;
}
+ /* our commit strategy is quite complex.
+
+ - we first try to commit the changes to all other nodes
+
+ - if that works, then we commit locally and we are done
+
+ - if a commit on another node fails, then we need to cancel
+ the transaction, then restart the transaction (thus
+ opening a window of time for a pending recovery to
+ complete), then replay the transaction, checking all the
+ reads and writes (checking that reads give the same data,
+ and writes succeed). Then we retry the transaction to the
+ other nodes
+ */
+
+again:
/* tell ctdbd to commit to the other nodes */
+ timeout = timeval_current_ofs(1, 0);
ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, h->ctdb_db->db_id,
CTDB_CONTROL_TRANS2_COMMIT, 0,
- ctdb_marshall_finish(h->m), NULL, NULL, &status, NULL, NULL);
+ ctdb_marshall_finish(h->m_write), NULL, NULL, &status,
+ &timeout, NULL);
if (ret != 0 || status != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Control failed for remote transaction commit\n"));
- talloc_free(h);
- return -1;
+ tdb_transaction_cancel(h->ctdb_db->ltdb->tdb);
+ sleep(1);
+ if (ctdb_replay_transaction(h) != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to replay transaction\n"));
+ talloc_free(h);
+ return -1;
+ }
+ goto again;
}
/* do the real commit locally */
@@ -3132,16 +3253,9 @@ int ctdb_transaction_commit(struct ctdb_transaction_handle *h)
}
/* tell ctdbd that we are finished with our local commit */
- ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, h->ctdb_db->db_id,
- CTDB_CONTROL_TRANS2_FINISHED, 0,
- tdb_null, NULL, NULL, &status, NULL, NULL);
- if (ret != 0 || status != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Control failed to finish transaction commit\n"));
- talloc_free(h);
- return -1;
- }
-
-
+ ctdb_control(ctdb, CTDB_CURRENT_NODE, h->ctdb_db->db_id,
+ CTDB_CONTROL_TRANS2_FINISHED, CTDB_CTRL_FLAG_NOREPLY,
+ tdb_null, NULL, NULL, &status, NULL, NULL);
talloc_free(h);
return 0;
}