summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorkarthik-us <ksubrahm@redhat.com>2018-01-17 17:30:06 +0530
committerShyamsundar Ranganathan <srangana@redhat.com>2018-01-19 14:24:52 +0000
commitb9e3b7f2753a5c8d56c1ed627a22dca3f2c3dd89 (patch)
tree1729543b0b3dcc4c737489c06d267877bc9fb7e0
parentc5ed8dfeb1168e764e53a69eb3ac564e8d0172bf (diff)
downloadglusterfs-b9e3b7f2753a5c8d56c1ed627a22dca3f2c3dd89.tar.gz
glusterfs-b9e3b7f2753a5c8d56c1ed627a22dca3f2c3dd89.tar.xz
glusterfs-b9e3b7f2753a5c8d56c1ed627a22dca3f2c3dd89.zip
cluster/afr: Adding option to take full file lock
Problem: In replica 3 volumes there is a possibility of ending up in a split-brain scenario when multiple clients write data to the same file at non-overlapping regions in parallel. Scenario: - Initially all the copies are good and all the clients get the value of data readables as all good. - Client C0 performs write W1, which fails on brick B0 and succeeds on the other two bricks. - C1 performs write W2, which fails on B1 and succeeds on the other two bricks. - C2 performs write W3, which fails on B2 and succeeds on the other two bricks. - All three writes above happen in parallel and fall on different ranges, so afr takes granular locks and all the writes are performed in parallel. Since each client had data-readables as good, it does not see the file going into split-brain in the in_flight_split_brain check, and hence performs the post-op, marking the pending xattrs. Now all the bricks are blamed by each other, ending up in split-brain. Fix: Have an option to take either a full lock or a range lock on files while doing data transactions, to prevent the possibility of ending up in split-brain. With this change, by default the files will take a full lock while doing IO. If you want to use the old range lock, change the value of "cluster.full-lock" to "no". Change-Id: I7893fa33005328ed63daa2f7c35eeed7c5218962 BUG: 1535438 Signed-off-by: karthik-us <ksubrahm@redhat.com>
-rw-r--r--libglusterfs/src/globals.h5
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c2
-rw-r--r--xlators/cluster/afr/src/afr.c11
-rw-r--r--xlators/cluster/afr/src/afr.h5
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c6
5 files changed, 25 insertions, 4 deletions
diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h
index bbc87f502d..9231fbc138 100644
--- a/libglusterfs/src/globals.h
+++ b/libglusterfs/src/globals.h
@@ -38,7 +38,7 @@
*/
#define GD_OP_VERSION_MIN 1 /* MIN is the fresh start op-version, mostly
should not change */
-#define GD_OP_VERSION_MAX GD_OP_VERSION_3_13_1 /* MAX VERSION is the maximum
+#define GD_OP_VERSION_MAX GD_OP_VERSION_3_13_2 /* MAX VERSION is the maximum
count in VME table, should
keep changing with
introduction of newer
@@ -96,6 +96,9 @@
#define GD_OP_VERSION_3_13_1 31301 /* Op-version for GlusterFS 3.13.1 */
+#define GD_OP_VERSION_3_13_2 31302 /* Op-version for GlusterFS 3.13.2 */
+
+
#define GD_OP_VER_PERSISTENT_AFR_XATTRS GD_OP_VERSION_3_6_0
#include "xlator.h"
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 7e40bba5b5..ba621204fd 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -1962,7 +1962,7 @@ afr_set_transaction_flock (xlator_t *this, afr_local_t *local)
inodelk = afr_get_inodelk (int_lock, int_lock->domain);
priv = this->private;
- if (priv->arbiter_count &&
+ if ((priv->arbiter_count || priv->full_lock) &&
local->transaction.type == AFR_DATA_TRANSACTION) {
/*Lock entire file to avoid network split brains.*/
inodelk->flock.l_len = 0;
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index b18f60fcc7..4338939073 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -244,6 +244,7 @@ reconfigure (xlator_t *this, dict_t *options)
out);
GF_OPTION_RECONF ("locking-scheme", priv->locking_scheme, options, str,
out);
+ GF_OPTION_RECONF ("full-lock", priv->full_lock, options, bool, out);
GF_OPTION_RECONF ("use-compound-fops", priv->use_compound_fops,
options, bool,
out);
@@ -532,6 +533,7 @@ init (xlator_t *this)
GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out);
GF_OPTION_INIT ("locking-scheme", priv->locking_scheme, str, out);
+ GF_OPTION_INIT ("full-lock", priv->full_lock, bool, out);
GF_OPTION_INIT ("use-compound-fops", priv->use_compound_fops,
bool, out);
GF_OPTION_INIT ("granular-entry-heal", priv->esh_granular, bool, out);
@@ -1088,6 +1090,15 @@ struct volume_options options[] = {
"stop being compatible with afr-v1, which helps afr "
"be more granular while self-healing",
},
+ { .key = {"full-lock"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "yes",
+ .op_version = {GD_OP_VERSION_3_13_2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ .description = "If this option is disabled, then the IOs will take "
+ "range locks same as versions till 3.13.1."
+ },
{ .key = {"granular-entry-heal"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "no",
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 18bf249fa3..dbc9e3cc55 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -177,9 +177,10 @@ typedef struct _afr_private {
void *pump_private;
gf_boolean_t use_afr_in_pump;
char *locking_scheme;
- gf_boolean_t esh_granular;
+ gf_boolean_t full_lock;
+ gf_boolean_t esh_granular;
gf_boolean_t consistent_io;
- gf_boolean_t use_compound_fops;
+ gf_boolean_t use_compound_fops;
} afr_private_t;
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index edb31b685c..9de9cc560f 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -1506,6 +1506,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = GD_OP_VERSION_3_7_12,
.flags = VOLOPT_FLAG_CLIENT_OPT
},
+ { .key = "cluster.full-lock",
+ .voltype = "cluster/replicate",
+ .type = NO_DOC,
+ .op_version = GD_OP_VERSION_3_13_2,
+ .flags = VOLOPT_FLAG_CLIENT_OPT
+ },
/* stripe xlator options */
{ .key = "cluster.stripe-block-size",