diff options
-rw-r--r-- | tests/basic/afr/split-brain-favorite-child-policy-client-side-healing.t | 124 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 45 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-open.c | 8 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-read-txn.c | 8 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 4 |
5 files changed, 173 insertions, 16 deletions
diff --git a/tests/basic/afr/split-brain-favorite-child-policy-client-side-healing.t b/tests/basic/afr/split-brain-favorite-child-policy-client-side-healing.t new file mode 100644 index 0000000000..7c249c4bcb --- /dev/null +++ b/tests/basic/afr/split-brain-favorite-child-policy-client-side-healing.t @@ -0,0 +1,124 @@ +#!/bin/bash + +#Test the client side split-brain resolution +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +cleanup; + +GET_MDATA_PATH=$(dirname $0)/../../utils +build_tester $GET_MDATA_PATH/get-mdata-xattr.c + +TEST glusterd +TEST pidof glusterd + +count_files () { + ls $1 | wc -l +} + +#Create replica 2 volume +TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume heal $V0 disable +TEST $CLI volume set $V0 cluster.quorum-type fixed +TEST $CLI volume set $V0 cluster.quorum-count 1 +TEST $CLI volume set $V0 cluster.metadata-self-heal on +TEST $CLI volume set $V0 cluster.data-self-heal on +TEST $CLI volume set $V0 cluster.entry-self-heal on + +TEST $CLI volume start $V0 +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1 + +TEST mkdir $M0/data +TEST touch $M0/data/file + + +############ Client side healing using favorite-child-policy = mtime ################# +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST dd if=/dev/urandom of=$M0/data/file bs=1024 count=1024 +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 +TEST kill_brick $V0 $H0 $B0/${V0}1 +TEST dd if=/dev/urandom of=$M0/data/file bs=1024 count=1024 + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 + +mtime1=$(get_mtime 
$B0/${V0}0/data/file) +mtime2=$(get_mtime $B0/${V0}1/data/file) +if (( $(echo "$mtime1 > $mtime2" | bc -l) )); then + LATEST_MTIME_MD5=$(md5sum $B0/${V0}0/data/file | cut -d\ -f1) +else + LATEST_MTIME_MD5=$(md5sum $B0/${V0}1/data/file | cut -d\ -f1) +fi + +#file will be in split-brain +cat $M0/data/file > /dev/null +EXPECT "1" echo $? + +TEST $CLI volume set $V0 cluster.favorite-child-policy mtime +TEST $CLI volume start $V0 force + +EXPECT_WITHIN $HEAL_TIMEOUT "^2$" afr_get_split_brain_count $V0 +cat $M0/data/file > /dev/null +EXPECT "0" echo $? +M0_MD5=$(md5sum $M0/data/file | cut -d\ -f1) +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" afr_get_split_brain_count $V0 +TEST [ "$LATEST_MTIME_MD5" == "$M0_MD5" ] + +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +B0_MD5=$(md5sum $B0/${V0}0/data/file | cut -d\ -f1) +B1_MD5=$(md5sum $B0/${V0}1/data/file | cut -d\ -f1) +TEST [ "$LATEST_MTIME_MD5" == "$B0_MD5" ] +TEST [ "$LATEST_MTIME_MD5" == "$B1_MD5" ] + +############ Client side directory conservative merge ################# +TEST $CLI volume reset $V0 cluster.favorite-child-policy +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST touch $M0/data/test +files=$(count_files $M0/data) +EXPECT "2" echo $files +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 +TEST kill_brick $V0 $H0 $B0/${V0}1 +TEST touch $M0/data/test1 +files=$(count_files $M0/data) +EXPECT "2" echo $files + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 + +#data dir will be in entry split-brain +ls $M0/data > /dev/null +EXPECT "2" echo $? + +TEST $CLI volume set $V0 cluster.favorite-child-policy mtime + +EXPECT_WITHIN $HEAL_TIMEOUT "^2$" afr_get_split_brain_count $V0 + + +ls $M0/data > /dev/null +EXPECT "0" echo $? 
+ +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" afr_get_split_brain_count $V0 +#Entry Split-brain is gone, but data self-heal is pending on the files +EXPECT_WITHIN $HEAL_TIMEOUT "^2$" get_pending_heal_count $V0 + +cat $M0/data/test > /dev/null +cat $M0/data/test1 > /dev/null + +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 +files=$(count_files $M0/data) +EXPECT "3" echo $files + +TEST force_umount $M0 +TEST rm $GET_MDATA_PATH/get-mdata-xattr + +cleanup diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 8dbdb572ab..1fc66ba1b2 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1197,12 +1197,11 @@ afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this, return 0; } -int +static int afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this, int *spb_choice) { int ret = -1; - GF_VALIDATE_OR_GOTO(this->name, inode, out); LOCK(&inode->lock); @@ -1214,6 +1213,40 @@ out: return ret; } +/* + * frame is used to get the favourite policy. Since + * afr_inode_split_brain_choice_get was called from afr_open, it is possible to + * have a frame without local->replies. So in that case, frame is passed as + * NULL, hence this function will handle the frame NULL case. 
+ */ +int +afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this, + call_frame_t *frame, int *spb_subvol) +{ + int ret = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("afr", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, spb_subvol, out); + + priv = this->private; + + ret = afr_inode_split_brain_choice_get(inode, this, spb_subvol); + if (*spb_subvol < 0 && priv->fav_child_policy && frame && frame->local) { + local = frame->local; + *spb_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode, + NULL); + if (*spb_subvol >= 0) { + ret = 0; + } + } + +out: + return ret; +} int afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data, unsigned char *metadata, int event) @@ -2823,7 +2856,7 @@ afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int spb_choice = -1; + int spb_subvol = -1; int child_count = -1; if (*read_subvol != -1) @@ -2833,10 +2866,10 @@ afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this, local = frame->local; child_count = priv->child_count; - afr_inode_split_brain_choice_get(local->inode, this, &spb_choice); - if ((spb_choice >= 0) && + afr_split_brain_read_subvol_get(local->inode, this, frame, &spb_subvol); + if ((spb_subvol >= 0) && (AFR_COUNT(success_replies, child_count) == child_count)) { - *read_subvol = spb_choice; + *read_subvol = spb_subvol; } else if (!priv->quorum_count || frame->root->pid == GF_CLIENT_PID_GLFS_HEAL) { *read_subvol = afr_first_up_child(frame, this); diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index a5b004f425..64856042b6 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -137,7 +137,7 @@ afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, { afr_private_t *priv = 
NULL; afr_local_t *local = NULL; - int spb_choice = 0; + int spb_subvol = 0; int event_generation = 0; int ret = 0; int32_t op_errno = 0; @@ -179,9 +179,9 @@ afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, ret = afr_inode_get_readable(frame, local->inode, this, NULL, &event_generation, AFR_DATA_TRANSACTION); if ((ret < 0) && - (afr_inode_split_brain_choice_get(local->inode, this, &spb_choice) == - 0) && - spb_choice < 0) { + (afr_split_brain_read_subvol_get(local->inode, this, NULL, + &spb_subvol) == 0) && + spb_subvol < 0) { afr_inode_refresh(frame, this, local->inode, local->inode->gfid, afr_open_continue); } else { diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index 772b59f9a2..5964ba80ad 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -272,7 +272,7 @@ afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) int read_subvol = -1; inode_t *inode = NULL; int ret = -1; - int spb_choice = -1; + int spb_subvol = -1; local = frame->local; inode = local->inode; @@ -303,9 +303,9 @@ afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) local->read_attempted[read_subvol] = 1; readfn: if (read_subvol == -1) { - ret = afr_inode_split_brain_choice_get(inode, this, &spb_choice); - if ((ret == 0) && spb_choice >= 0) - read_subvol = spb_choice; + ret = afr_split_brain_read_subvol_get(inode, this, frame, &spb_subvol); + if ((ret == 0) && spb_subvol >= 0) + read_subvol = spb_subvol; } if (read_subvol == -1) { diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 1fff564094..667b5636d5 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -1271,8 +1271,8 @@ int afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this, int spb_choice); int -afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this, - int *spb_choice); +afr_split_brain_read_subvol_get(inode_t 
*inode, xlator_t *this, + call_frame_t *frame, int *spb_subvol); int afr_get_child_index_from_name(xlator_t *this, char *name); |