diff options
-rw-r--r-- | tests/basic/afr/ta-shd.t | 49 | ||||
-rw-r--r-- | tests/thin-arbiter.rc | 181 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 7 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heald.c | 245 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.c | 1 |
5 files changed, 392 insertions, 91 deletions
diff --git a/tests/basic/afr/ta-shd.t b/tests/basic/afr/ta-shd.t new file mode 100644 index 0000000000..bb2e58b3f7 --- /dev/null +++ b/tests/basic/afr/ta-shd.t @@ -0,0 +1,49 @@ +#!/bin/bash +#Self-heal tests + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. $(dirname $0)/../../thin-arbiter.rc +cleanup; +TEST ta_create_brick_and_volfile brick0 +TEST ta_create_brick_and_volfile brick1 +TEST ta_create_ta_and_volfile ta +TEST ta_start_brick_process brick0 +TEST ta_start_brick_process brick1 +TEST ta_start_ta_process ta + +TEST ta_create_mount_volfile brick0 brick1 ta +TEST ta_start_mount_process $M0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "trusted.afr.patchy-ta-2" ls $B0/ta + +TEST ta_create_shd_volfile brick0 brick1 ta +TEST ta_start_shd_process glustershd + +TEST touch $M0/a.txt +TEST ta_kill_brick brick0 +EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0 +echo "Hello" >> $M0/a.txt +EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/brick1/a.txt +EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/ta/trusted.afr.$V0-ta-2 + +#TODO: After the write txn changes are merged, take statedump of TA process and +#check whether AFR_TA_DOM_NOTIFY lock is held by the client here. Take the +#statedump again after line #38 to check AFR_TA_DOM_NOTIFY lock is released by +#the SHD process. + +TEST ta_start_brick_process brick0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 +EXPECT_WITHIN $HEAL_TIMEOUT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/brick1/a.txt +EXPECT_WITHIN $HEAL_TIMEOUT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/ta/trusted.afr.$V0-ta-2 + +#Kill the previously up brick and try reading from other brick. Since the heal +#has happened file content should be same. +TEST ta_kill_brick brick1 +EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 1 +#Umount and mount to remove cached data. +TEST umount $M0 +TEST ta_start_mount_process $M0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0 +EXPECT "Hello" cat $M0/a.txt +cleanup; diff --git a/tests/thin-arbiter.rc b/tests/thin-arbiter.rc index 36d11cea61..c5ac00baaa 100644 --- a/tests/thin-arbiter.rc +++ b/tests/thin-arbiter.rc @@ -431,3 +431,184 @@ function ta_up_status() local replica_id=$3 grep -E "^up = " $m/.meta/graphs/active/${v}-replicate-${replica_id}/private | cut -f2 -d'=' } + +function ta_create_shd_volfile() +{ + local b0=$B0/$1 + local b1=$B0/$2 + local ta=$B0/$3 + local b0_port=${PORTMAP[$1]} + local b1_port=${PORTMAP[$2]} + local ta_port=${PORTMAP[$3]} +cat > $B0/glustershd.vol <<EOF +volume ${V0}-replicate-0-client-0 + type protocol/client + option send-gids on + option transport.socket.lowlat off + option transport.socket.keepalive-interval 2 + option remote-host $H0 + option remote-subvolume $b0 + option ping-timeout 42 + option client-bind-insecure off + option transport.socket.own-thread off + option frame-timeout 1800 + option non-blocking-io off + option transport.socket.keepalive 1 + option transport.socket.keepalive-count 9 + option transport.tcp-user-timeout 0 + option transport.socket.nodelay 1 + option transport.socket.keepalive-time 20 + option transport.socket.read-fail-log off + option transport-type tcp + option filter-O_DIRECT disable + option event-threads 2 + option transport.listen-backlog 1024 + option transport.socket.ssl-enabled off + option password a0ad63dd-8314-4f97-9160-1b93e3cb1f0b + option username 459d48e8-2a92-4f11-89f2-077b29f6f86d + option remote-port $b0_port +end-volume + +volume ${V0}-replicate-0-client-1 + type protocol/client + option remote-host $H0 + option transport.socket.keepalive-time 20 + option transport.socket.keepalive-count 9 + option transport.socket.own-thread off + option transport.socket.ssl-enabled off + option transport-type tcp + option remote-subvolume $b1 + option event-threads 2 + option transport.tcp-user-timeout 0 + option transport.socket.keepalive 1 + option transport.socket.nodelay 1 + option transport.socket.read-fail-log off + option frame-timeout 1800 + option ping-timeout 42 + option client-bind-insecure off + option filter-O_DIRECT disable + option send-gids on + option non-blocking-io off + option transport.listen-backlog 1024 + option transport.socket.lowlat off + option transport.socket.keepalive-interval 2 + option password a0ad63dd-8314-4f97-9160-1b93e3cb1f0b + option username 459d48e8-2a92-4f11-89f2-077b29f6f86d + option remote-port $b1_port +end-volume + +volume ${V0}-replicate-0-thin-arbiter-client + type protocol/client + option frame-timeout 1800 + option event-threads 2 + option transport.listen-backlog 1024 + option transport.socket.nodelay 1 + option transport.socket.keepalive-count 9 + option transport.socket.ssl-enabled off + option transport-type tcp + option remote-subvolume $ta + option filter-O_DIRECT disable + option non-blocking-io off + option transport.socket.lowlat off + option transport.socket.keepalive-interval 2 + option transport.socket.read-fail-log off + option remote-host $H0 + option send-gids on + option transport.tcp-user-timeout 0 + option transport.socket.keepalive-time 20 + option ping-timeout 42 + option client-bind-insecure off + option transport.socket.keepalive 1 + option transport.socket.own-thread off + option remote-port $ta_port +end-volume + +volume ${V0}-replicate-0 + type cluster/replicate + option background-self-heal-count 8 + option metadata-self-heal on + option data-change-log on + option entrylk-trace off + option iam-self-heal-daemon yes + option afr-dirty-xattr trusted.afr.dirty + option heal-timeout 10 + option read-hash-mode 1 + option metadata-splitbrain-forced-heal off + option thin-arbiter $H0:$ta + option shd-max-threads 1 + option afr-pending-xattr ${V0}-client-0,${V0}-client-1,${V0}-ta-2 + option halo-max-latency 5 + option halo-max-replicas 99999 + option entry-change-log on + option halo-nfsd-max-latency 5 + option inodelk-trace off + option pre-op-compat on + option eager-lock on + option self-heal-readdir-size 1KB + option ensure-durability on + option locking-scheme full + option halo-enabled False + option heal-wait-queue-length 128 + option entry-self-heal on + option self-heal-daemon on + option quorum-reads no + option shd-wait-qlength 1024 + option choose-local true + option halo-min-replicas 2 + option data-self-heal on + option metadata-change-log on + option consistent-metadata no + option full-lock yes + option use-compound-fops no + option halo-shd-max-latency 99999 + option quorum-type none + option favorite-child-policy none + option read-subvolume-index -1 + option optimistic-change-log on + option iam-nfs-daemon off + option post-op-delay-secs 1 + option granular-entry-heal no + option consistent-io no + option data-self-heal-window-size 1 + subvolumes ${V0}-replicate-0-client-0 ${V0}-replicate-0-client-1 ${V0}-replicate-0-thin-arbiter-client +end-volume + +volume glustershd + type debug/io-stats + option log-buf-size 5 + option ios-dump-format json + option latency-measurement off + option sys-log-level CRITICAL + option brick-log-level INFO + option client-logger gluster-log + option client-log-format with-msg-id + option brick-log-format with-msg-id + option client-log-buf-size 5 + option log-flush-timeout 120 + option ios-dump-interval 0 + option ios-sample-interval 0 + option ios-dnscache-ttl-sec 86400 + option count-fop-hits off + option client-log-level INFO + option brick-logger gluster-log + option brick-log-buf-size 5 + option ios-sample-buf-size 65535 + option client-log-flush-timeout 120 + option brick-log-flush-timeout 120 + option unique-id /no/such/path + option dump-fd-stats off + subvolumes ${V0}-replicate-0 +end-volume +EOF +} + +function ta_start_shd_process() +{ + if glusterfs -p $B0/${1}.pid --volfile=$B0/${1}.vol -l $(gluster --print-logdir)/${1}.log --process-name=glustershd + then + cat $B0/${1}.pid + else + echo "" + return 1 + fi +} diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index eb0e7330a9..73f1d72880 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -6717,7 +6717,8 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc) }; int32_t cmd = 0; - GF_ASSERT(afr_ta_is_fop_called_from_synctask(this)); + if (!priv->shd.iamshd) + GF_ASSERT(afr_ta_is_fop_called_from_synctask(this)); flock1.l_type = F_WRLCK; while (!locked) { @@ -6725,7 +6726,6 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc) cmd = F_SETLKW; flock1.l_start = 0; flock1.l_len = 0; - } else { cmd = F_SETLK; if (priv->ta_notify_dom_lock_offset) { @@ -6780,7 +6780,8 @@ afr_ta_post_op_unlock(xlator_t *this, loc_t *loc) }; int ret = 0; - GF_ASSERT(afr_ta_is_fop_called_from_synctask(this)); + if (!priv->shd.iamshd) + GF_ASSERT(afr_ta_is_fop_called_from_synctask(this)); flock.l_type = F_UNLCK; flock.l_start = 0; flock.l_len = 0; diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index 0cf01a041b..53d7ef8bb8 100644 --- a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -546,14 +546,128 @@ afr_shd_full_sweep(struct subvol_healer *healer, inode_t *inode) GF_CLIENT_PID_SELF_HEALD, healer, afr_shd_full_heal); } -void -afr_shd_ta_set_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer) +int +afr_shd_fill_ta_loc(xlator_t *this, loc_t *loc) { afr_private_t *priv = NULL; - dict_t *xattr = NULL; - struct gf_flock flock = { + struct iatt stbuf = { 0, }; + int ret = -1; + + priv = this->private; + loc->parent = inode_ref(this->itable->root); + gf_uuid_copy(loc->pargfid, loc->parent->gfid); + loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX]; + loc->inode = inode_new(loc->parent->table); + GF_CHECK_ALLOC(loc->inode, ret, out); + + if (!gf_uuid_is_null(priv->ta_gfid)) + goto assign_gfid; + + ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], loc, &stbuf, + 0, 0, 0); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed lookup on file %s.", loc->name); + goto out; + } + + gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid); + +assign_gfid: + gf_uuid_copy(loc->gfid, priv->ta_gfid); + ret = 0; + +out: + if (ret) + loc_wipe(loc); + + return ret; +} + +int +_afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata) +{ + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int *raw = NULL; + int ret = -1; + int i = 0; + + priv = this->private; + + xattr = dict_new(); + if (!xattr) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_GET_FAILED, + "Failed to create dict."); + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + raw = GF_CALLOC(AFR_NUM_CHANGE_LOGS, sizeof(int), gf_afr_mt_int32_t); + if (!raw) + goto out; + + ret = dict_set_bin(xattr, priv->pending_key[i], raw, + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret) { + GF_FREE(raw); + goto out; + } + } + + ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL, xdata, NULL); + if (ret || !(*xdata)) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Xattrop failed on %s.", loc->name); + } + +out: + if (xattr) + dict_unref(xattr); + + return ret; +} + +void +afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, struct subvol_healer *healer, + dict_t **xdata) +{ + int ret = 0; + + loc_wipe(loc); + if (afr_shd_fill_ta_loc(this, loc)) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to populate thin-arbiter loc for: %s.", loc->name); + goto out; + } + + ret = afr_ta_post_op_lock(this, loc); + if (ret) + goto out; + + ret = _afr_shd_ta_get_xattrs(this, loc, xdata); + if (ret) { + if (*xdata) { + dict_unref(*xdata); + *xdata = NULL; + } + } + + afr_ta_post_op_unlock(this, loc); + +out: + if (ret) + healer->rerun = 1; +} + +int +afr_shd_ta_unset_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer) +{ + afr_private_t *priv = NULL; + dict_t *xattr = NULL; gf_boolean_t need_xattrop = _gf_false; void *pending_raw = NULL; int *raw = NULL; @@ -563,7 +677,7 @@ afr_shd_ta_set_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer) int i = 0; int j = 0; int val = 0; - int ret = 0; + int ret = -1; priv = this->private; @@ -598,6 +712,7 @@ afr_shd_ta_set_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer) "not the good shd. Skipping. " "SHD = %d.", healer); + ret = 0; GF_FREE(raw); goto out; } @@ -607,113 +722,69 @@ afr_shd_ta_set_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer) } ret = dict_set_bin(xattr, priv->pending_key[i], raw, - AFR_NUM_CHANGE_LOGS * sizeof(int)); + AFR_NUM_CHANGE_LOGS * sizeof (int)); if (ret) { - GF_FREE(raw); + GF_FREE (raw); goto out; } - memset(pending, 0, sizeof(pending)); + if (need_xattrop) + break; } if (!need_xattrop) { + ret = 0; goto out; } - flock.l_type = F_WRLCK; - flock.l_start = 0; - flock.l_len = 0; - - ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], - AFR_TA_DOM_NOTIFY, loc, F_SETLKW, &flock, NULL, NULL); - if (ret) - goto out; - ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc, GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL); if (ret) gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, "Xattrop failed."); - flock.l_type = F_UNLCK; - syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], AFR_TA_DOM_NOTIFY, - loc, F_SETLKW, &flock, NULL, NULL); - out: if (xattr) dict_unref(xattr); - return; + + return ret; } void -afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata) +afr_shd_ta_check_and_unset_xattrs(xlator_t *this, loc_t *loc, + struct subvol_healer *healer, + dict_t *pre_crawl_xdata) { - afr_private_t *priv = NULL; - dict_t *xattr = NULL; - struct iatt stbuf = { - 0, - }; - int *raw = NULL; + int ret_lock = 0; int ret = 0; - int i = 0; + dict_t *post_crawl_xdata = NULL; - priv = this->private; - - loc->parent = inode_ref(this->itable->root); - gf_uuid_copy(loc->pargfid, loc->parent->gfid); - loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX]; - loc->inode = inode_new(loc->parent->table); - if (!loc->inode) { - goto out; - } + ret_lock = afr_ta_post_op_lock(this, loc); + if (ret_lock) + goto unref; - ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], loc, &stbuf, - 0, 0, 0); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, - "Failed lookup on file %s.", loc->name); - goto out; - } - - gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid); - gf_uuid_copy(loc->gfid, priv->ta_gfid); + ret = _afr_shd_ta_get_xattrs(this, loc, &post_crawl_xdata); + if (ret) + goto unref; - xattr = dict_new(); - if (!xattr) { - gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_GET_FAILED, - "Failed to create dict."); - goto out; + if (!are_dicts_equal(pre_crawl_xdata, post_crawl_xdata, NULL, NULL)) { + ret = -1; + goto unref; } - for (i = 0; i < priv->child_count; i++) { - raw = GF_CALLOC(AFR_NUM_CHANGE_LOGS, sizeof(int), gf_afr_mt_int32_t); - if (!raw) { - goto out; - } + ret = afr_shd_ta_unset_xattrs(this, loc, &post_crawl_xdata, healer->subvol); - ret = dict_set_bin(xattr, priv->pending_key[i], raw, - AFR_NUM_CHANGE_LOGS * sizeof(int)); - if (ret) { - GF_FREE(raw); - goto out; - } +unref: + if (post_crawl_xdata) { + dict_unref(post_crawl_xdata); + post_crawl_xdata = NULL; } - ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc, - GF_XATTROP_ADD_ARRAY, xattr, NULL, xdata, NULL); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, - "Xattrop failed."); - goto out; - } - if (!(*xdata)) - gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_GET_FAILED, - "Xdata response is empty."); + if (ret || ret_lock) + healer->rerun = 1; -out: - if (xattr) - dict_unref(xattr); - return; + if (!ret_lock) + afr_ta_post_op_unlock(this, loc); } void * @@ -723,7 +794,7 @@ afr_shd_index_healer(void *data) xlator_t *this = NULL; int ret = 0; afr_private_t *priv = NULL; - dict_t *xdata = NULL; + dict_t *pre_crawl_xdata = NULL; loc_t loc = { 0, }; @@ -739,8 +810,7 @@ afr_shd_index_healer(void *data) priv->local[healer->subvol] = healer->local; if (priv->thin_arbiter_count) { - loc_wipe(&loc); - afr_shd_ta_get_xattrs(this, &loc, &xdata); + afr_shd_ta_get_xattrs(this, &loc, healer, &pre_crawl_xdata); } do { @@ -770,15 +840,14 @@ afr_shd_index_healer(void *data) sleep(1); } while (ret > 0); - if (xdata && !healer->crawl_event.heal_failed_count) { - afr_shd_ta_set_xattrs(this, &loc, &xdata, healer->subvol); - dict_unref(xdata); - xdata = NULL; + if (pre_crawl_xdata && !healer->crawl_event.heal_failed_count) { + afr_shd_ta_check_and_unset_xattrs(this, &loc, healer, + pre_crawl_xdata); + dict_unref(pre_crawl_xdata); + pre_crawl_xdata = NULL; } } - loc_wipe(&loc); - return NULL; } diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 568293cdf2..26950fd792 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -384,6 +384,7 @@ init(xlator_t *this) priv->child_count--; priv->ta_bad_child_index = AFR_CHILD_UNKNOWN; priv->ta_notify_dom_lock_offset = 0; + *priv->ta_gfid = 0; } INIT_LIST_HEAD(&priv->healing); INIT_LIST_HEAD(&priv->heal_waiting); |