cluster/afr: Delegate metadata heal with pending xattrs to SHD

Problem: When metadata self-heal is triggered on the mount, it blocks lookup until the heal completes. This can lead to hangs when a lot of clients are accessing a directory that needs metadata heal and all of them trigger heals, each waiting for the other clients to complete theirs.

Fix: Trigger the metadata heal that could block lookup only when the heal is needed but the pending xattrs are not set. This is the only case where, without heals, different clients may return different metadata, which should be avoided.

Updates bz#1625588
Change-Id: I6089e9fda0770a83fb287941b229c882711f4e66
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
author: Pranith Kumar K <pkarampu@redhat.com> 2018-08-27 11:46:33 +0530
committer: jiffin tony Thottan <jthottan@redhat.com> 2018-10-12 04:08:57 +0000
commit: f030db7bec36f0d97f2beacb3306d31379e4a79f (patch)
tree: 3f81165d7d7e8925d017a7267afb79f98947a512
parent: a570ee702d968d1733a3e31b259d4d0fbf5bca3c (diff)
download: glusterfs-f030db7bec36f0d97f2beacb3306d31379e4a79f.tar.gz
glusterfs-f030db7bec36f0d97f2beacb3306d31379e4a79f.tar.xz
glusterfs-f030db7bec36f0d97f2beacb3306d31379e4a79f.zip
5 files changed, 72 insertions, 51 deletions
diff --git a/tests/basic/afr/client-side-heal.t b/tests/basic/afr/client-side-heal.t
index eba7dc2b3c..1e9336184b 100755
--- a/tests/basic/afr/client-side-heal.t
+++ b/tests/basic/afr/client-side-heal.t
@@ -17,6 +17,7 @@ TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
 echo "some data" > $M0/datafile
 EXPECT 0 echo $?
 TEST touch $M0/mdatafile
+TEST touch $M0/mdatafile-backend-direct-modify
 TEST mkdir $M0/dir
 
 #Kill a brick and perform I/O to have pending heals.
@@ -29,6 +30,7 @@ EXPECT 0 echo $?
 
 #pending metadata heal
 TEST chmod +x $M0/mdatafile
+TEST chmod +x $B0/${V0}0/mdatafile-backend-direct-modify
 
 #pending entry heal. Also causes pending metadata/data heals on file{1..5}
 TEST touch $M0/dir/file{1..5}
@@ -40,9 +42,12 @@ TEST $CLI volume start $V0 force
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
 
 #Medatada heal via explicit lookup must not happen
-TEST ls $M0/mdatafile
+TEST getfattr -d -m. -e hex $M0/mdatafile
+TEST ls $M0/mdatafile-backend-direct-modify
 
-#Inode refresh must not trigger data and entry heals.
+TEST [[ "$(stat -c %A $B0/${V0}0/mdatafile-backend-direct-modify)" != "$(stat -c %A $B0/${V0}1/mdatafile-backend-direct-modify)" ]]
+
+#Inode refresh must not trigger data, metadata and entry heals.
 #To trigger inode refresh for sure, the volume is unmounted and mounted each time.
 #Check that data heal does not happen.
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
@@ -52,7 +57,6 @@ TEST cat $M0/datafile
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
 TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
 TEST ls $M0/dir
-
 #No heal must have happened
 EXPECT 8 get_pending_heal_count $V0
 
@@ -61,21 +65,25 @@ TEST $CLI volume set $V0 cluster.data-self-heal on
 TEST $CLI volume set $V0 cluster.metadata-self-heal on
 TEST $CLI volume set $V0 cluster.entry-self-heal on
 
-#Metadata heal is triggered by lookup without need for inode refresh.
-TEST ls $M0/mdatafile
-EXPECT 7 get_pending_heal_count $V0
-
-#Inode refresh must trigger data and entry heals.
+#Inode refresh must trigger data, metadata and entry heals.
 #To trigger inode refresh for sure, the volume is unmounted and mounted each time.
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
 TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
+TEST ls $M0/mdatafile-backend-direct-modify
+
+TEST [[ "$(stat -c %A $B0/${V0}0/mdatafile-backend-direct-modify)" == "$(stat -c %A $B0/${V0}1/mdatafile-backend-direct-modify)" ]]
+
+
+TEST getfattr -d -m. -e hex $M0/mdatafile
+EXPECT_WITHIN $HEAL_TIMEOUT 7 get_pending_heal_count $V0
+
 TEST cat $M0/datafile
 EXPECT_WITHIN $HEAL_TIMEOUT 6 get_pending_heal_count $V0
 
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
 TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
 TEST ls $M0/dir
-EXPECT 5 get_pending_heal_count $V0
+EXPECT_WITHIN $HEAL_TIMEOUT 5 get_pending_heal_count $V0
 
 TEST cat  $M0/dir/file1
 TEST cat  $M0/dir/file2
@@ -83,5 +91,5 @@ TEST cat  $M0/dir/file3
 TEST cat  $M0/dir/file4
 TEST cat  $M0/dir/file5
 
-EXPECT 0 get_pending_heal_count $V0
+EXPECT_WITHIN $HEAL_TIMEOUT 0 get_pending_heal_count $V0
 cleanup;
diff --git a/tests/bugs/glusterfs/bug-906646.t b/tests/bugs/glusterfs/bug-906646.t
index 45c85d9f67..37b8fe5c8e 100644
--- a/tests/bugs/glusterfs/bug-906646.t
+++ b/tests/bugs/glusterfs/bug-906646.t
@@ -13,7 +13,6 @@ TEST pidof glusterd
 TEST $CLI volume create $V0 replica $REPLICA $H0:$B0/${V0}-00 $H0:$B0/${V0}-01 $H0:$B0/${V0}-10 $H0:$B0/${V0}-11
 TEST $CLI volume start $V0
 
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
 TEST $CLI volume set $V0 cluster.background-self-heal-count 0
 
 ## Mount FUSE with caching disabled
@@ -82,10 +81,15 @@ EXPECT 1 xattr_query_check ${backend_paths_array[1]} "trusted.name"
 # restart the brick process
 TEST $CLI volume start $V0 force
 
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 `expr $brick_id - 1`
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 3
 
-cat $pth >/dev/null
+TEST $CLI volume heal $V0
 
+EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
 # check backends - xattr should not be present anywhere
 EXPECT 1 xattr_query_check ${backend_paths_array[0]} "trusted.name"
 EXPECT 1 xattr_query_check ${backend_paths_array[1]} "trusted.name"
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index f17ae09ccc..aa05454175 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -2575,6 +2575,42 @@ out:
         return 0;
 }
 
+gf_boolean_t
+afr_is_pending_set (xlator_t *this, dict_t *xdata, int type)
+{
+        int idx = -1;
+        afr_private_t *priv = NULL;
+        void *pending_raw = NULL;
+        int *pending_int = NULL;
+        int i = 0;
+        /* Report whether xdata holds a non-zero dirty or pending count for 'type'. */
+        priv = this->private;
+        idx = afr_index_for_transaction_type (type); /* slot of 'type' in each counter array */
+        /* First check the AFR_DIRTY value: a non-zero count in slot idx returns true. */
+        if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw) == 0) {
+                if (pending_raw) {
+                        pending_int = pending_raw;
+                        /* counters are read through ntoh32 (presumably stored big-endian) */
+                        if (ntoh32 (pending_int[idx]))
+                                return _gf_true;
+                }
+        }
+        /* Then check the value under each child's key in priv->pending_key[]. */
+        for (i = 0; i < priv->child_count; i++) {
+                if (dict_get_ptr (xdata, priv->pending_key[i],
+                                  &pending_raw))
+                        continue; /* non-zero return: no usable value for this key */
+                if (!pending_raw)
+                        continue; /* lookup succeeded but yielded a NULL pointer */
+                pending_int = pending_raw;
+                /* same slot as the dirty check, for this child's counters */
+                if (ntoh32 (pending_int[idx]))
+                        return _gf_true;
+        }
+        /* Neither the dirty value nor any child's pending value is set for 'type'. */
+        return _gf_false;
+}
+
 static gf_boolean_t
 afr_can_start_metadata_self_heal(call_frame_t *frame, xlator_t *this)
 {
@@ -2601,6 +2637,14 @@ afr_can_start_metadata_self_heal(call_frame_t *frame, xlator_t *this)
                         continue;
                 }
 
+                if (afr_is_pending_set (this, replies[i].xdata,
+                                        AFR_METADATA_TRANSACTION)) {
+                        /* Let shd do the heal so that lookup is not blocked
+                         * on getting metadata lock/doing the heal */
+                        start = _gf_false;
+                        break;
+                }
+
                 if (gf_uuid_compare (stbuf.ia_gfid, replies[i].poststat.ia_gfid)) {
                         start = _gf_false;
                         break;
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index c2d5058081..a84386d0af 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -2176,44 +2176,6 @@ afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
 	return 0;
 }
 
-
-gf_boolean_t
-afr_is_pending_set (xlator_t *this, dict_t *xdata, int type)
-{
-	int idx = -1;
-	afr_private_t *priv = NULL;
-	void *pending_raw = NULL;
-	int *pending_int = NULL;
-	int i = 0;
-
-	priv = this->private;
-	idx = afr_index_for_transaction_type (type);
-
-	if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw) == 0) {
-		if (pending_raw) {
-			pending_int = pending_raw;
-
-			if (ntoh32 (pending_int[idx]))
-				return _gf_true;
-		}
-	}
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (dict_get_ptr (xdata, priv->pending_key[i],
-				  &pending_raw))
-			continue;
-		if (!pending_raw)
-			continue;
-		pending_int = pending_raw;
-
-		if (ntoh32 (pending_int[idx]))
-			return _gf_true;
-	}
-
-	return _gf_false;
-}
-
-
 gf_boolean_t
 afr_is_data_set (xlator_t *this, dict_t *xdata)
 {
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 36333a3cfa..8d11f2bb64 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -1287,4 +1287,7 @@ afr_write_subvol_reset (call_frame_t *frame, xlator_t *this);
 
 int
 afr_set_inode_local (xlator_t *this, afr_local_t *local, inode_t *inode);
+
+gf_boolean_t
+afr_is_pending_set (xlator_t *this, dict_t *xdata, int type);
 #endif /* __AFR_H__ */
author	Pranith Kumar K <pkarampu@redhat.com>	2018-08-27 11:46:33 +0530
committer	jiffin tony Thottan <jthottan@redhat.com>	2018-10-12 04:08:57 +0000
commit	f030db7bec36f0d97f2beacb3306d31379e4a79f (patch)
tree	3f81165d7d7e8925d017a7267afb79f98947a512
parent	a570ee702d968d1733a3e31b259d4d0fbf5bca3c (diff)
download	glusterfs-f030db7bec36f0d97f2beacb3306d31379e4a79f.tar.gz glusterfs-f030db7bec36f0d97f2beacb3306d31379e4a79f.tar.xz glusterfs-f030db7bec36f0d97f2beacb3306d31379e4a79f.zip