dht: fix rebalance of sparse files (#2318)

Current implementation of rebalance for sparse files has a bug that, in some cases, causes a read of 0 bytes from the source subvolume. Posix xlator doesn't allow 0 byte reads and fails them with EINVAL, which causes rebalance to abort the migration. This patch implements a more robust way of finding data segments in a sparse file that avoids 0 byte reads, allowing the file to be migrated successfully. Fixes: #2317 Change-Id: Iff168dda2fb0f2edf716b21eb04cc2cc8ac3915c Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
author: Xavi Hernandez <xhernandez@gmail.com> 2021-04-09 18:13:30 +0200
committer: GitHub <noreply@github.com> 2021-04-09 18:13:30 +0200
commit: a62c5666e2b58668d0ddec7f3331446313539e72 (patch)
tree: 9e6c50d543a37203b653eb8fc4ae79bbe0c9f4bd
parent: 7a9f7cb3f6929ed2d4af603f1aa64ca967140940 (diff)
download: glusterfs-a62c5666e2b58668d0ddec7f3331446313539e72.tar.gz
glusterfs-a62c5666e2b58668d0ddec7f3331446313539e72.tar.xz
glusterfs-a62c5666e2b58668d0ddec7f3331446313539e72.zip
3 files changed, 93 insertions, 56 deletions
diff --git a/tests/bugs/distribute/issue-2317.t b/tests/bugs/distribute/issue-2317.t
new file mode 100755
index 0000000000..e29d0039d8
--- /dev/null
+++ b/tests/bugs/distribute/issue-2317.t
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+TESTS_EXPECTED_IN_LOOP=126
+
+cleanup
+
+TEST glusterd
+TEST ${CLI} volume create ${V0} replica 3 ${H0}:/$B0/${V0}_{0..2}
+TEST ${CLI} volume start ${V0}
+
+TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0}
+
+# Create several files to make sure that at least some of them should be
+# migrated by rebalance.
+for i in {0..63}; do
+    TEST dd if=/dev/urandom of=${M0}/file.${i} bs=4k count=1
+    TEST dd if=/dev/urandom of=${M0}/file.${i} bs=4k count=1 seek=128
+done
+
+TEST ${CLI} volume add-brick ${V0} ${H0}:${B0}/${V0}_{3..5}
+TEST ${CLI} volume rebalance ${V0} start force
+EXPECT_WITHIN ${REBALANCE_TIMEOUT} "completed" rebalance_status_field "${V0}"
+
+EXPECT "^0$" rebalance_failed_field "${V0}"
+
+cleanup
diff --git a/tests/volume.rc b/tests/volume.rc
index afb296cb9e..6a0b5b80be 100644
--- a/tests/volume.rc
+++ b/tests/volume.rc
@@ -83,6 +83,10 @@ function rebalanced_files_field {
         $CLI volume rebalance $1 status | awk '{print $2}' | sed -n 3p
 }
 
+function rebalance_failed_field {
+        $CLI volume rebalance $1 status | awk '{print $5}' | sed -n 3p
+}
+
 function fix-layout_status_field {
         #The fix-layout status can be up to 3 words, (ex:'fix-layout in progress'), hence the awk-print $2 thru $4.
         #But if the status is less than 3 words, it also prints the next field i.e the run_time_in_secs.(ex:'completed 3.00').
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index d15c36bf57..6edcca721c 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -994,6 +994,46 @@ out:
     return ret;
 }
 
+static int32_t
+dht_rebalance_sparse_segment(xlator_t *subvol, fd_t *fd, off_t *offset,
+                             size_t *size)
+{
+    off_t hole;
+    int32_t ret;
+
+    do {
+        ret = syncop_seek(subvol, fd, *offset, GF_SEEK_DATA, NULL, offset);
+        if (ret >= 0) {
+            /* Starting at the offset of the last data segment, find the
+             * next hole. After a data segment there should always be a
+             * hole, since EOF is considered a hole. */
+            ret = syncop_seek(subvol, fd, *offset, GF_SEEK_HOLE, NULL, &hole);
+        }
+
+        if (ret < 0) {
+            if (ret == -ENXIO) {
+                /* This can happen if there are no more data segments (i.e.
+                 * the offset is at EOF), or there was a data segment but the
+                 * file has been truncated to a smaller size between both
+                 * seek requests. In both cases we are done. The file doesn't
+                 * contain more data. */
+                ret = 0;
+            }
+            return ret;
+        }
+
+        /* It could happen that at the same offset we detected data in the
+         * first seek, there could be a hole in the second seek if user is
+         * modifying the file concurrently. In this case we need to find a
+         * new data segment to migrate. */
+    } while (hole <= *offset);
+
+    /* Calculate the total size of the current data block */
+    *size = hole - *offset;
+
+    return 1;
+}
+
 static int
 __dht_rebalance_migrate_data(xlator_t *this, xlator_t *from, xlator_t *to,
                              fd_t *src, fd_t *dst, uint64_t ia_size,
@@ -1002,8 +1042,6 @@ __dht_rebalance_migrate_data(xlator_t *this, xlator_t *from, xlator_t *to,
     int ret = 0;
     int count = 0;
     off_t offset = 0;
-    off_t data_offset = 0;
-    off_t hole_offset = 0;
     struct iovec *vector = NULL;
     struct iobref *iobref = NULL;
     uint64_t total = 0;
@@ -1018,71 +1056,36 @@ __dht_rebalance_migrate_data(xlator_t *this, xlator_t *from, xlator_t *to,
     while (total < ia_size) {
         /* This is a regular file - read it sequentially */
         if (!hole_exists) {
-            read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
-                             ? DHT_REBALANCE_BLKSIZE
-                             : (ia_size - total));
+            data_block_size = ia_size - total;
         } else {
             /* This is a sparse file - read only the data segments in the file
              */
 
             /* If the previous data block is fully copied, find the next data
-             * segment
-             * starting at the offset of the last read and written byte,  */
+             * segment starting at the offset of the last read and written
+             * byte. */
             if (data_block_size <= 0) {
-                ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL,
-                                  &data_offset);
-                if (ret) {
-                    if (ret == -ENXIO)
-                        ret = 0; /* No more data segments */
-                    else
-                        *fop_errno = -ret; /* Error occurred */
-
+                ret = dht_rebalance_sparse_segment(from, src, &offset,
+                                                   &data_block_size);
+                if (ret <= 0) {
+                    *fop_errno = -ret;
                     break;
                 }
-
-                /* If the position of the current data segment is greater than
-                 * the position of the next hole, find the next hole in order to
-                 * calculate the length of the new data segment */
-                if (data_offset > hole_offset) {
-                    /* Starting at the offset of the last data segment, find the
-                     * next hole */
-                    ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE,
-                                      NULL, &hole_offset);
-                    if (ret) {
-                        /* If an error occurred here it's a real error because
-                         * if the seek for a data segment was successful then
-                         * necessarily another hole must exist (EOF is a hole)
-                         */
-                        *fop_errno = -ret;
-                        break;
-                    }
-
-                    /* Calculate the total size of the current data block */
-                    data_block_size = hole_offset - data_offset;
-                }
-            } else {
-                /* There is still data in the current segment, move the
-                 * data_offset to the position of the last written byte */
-                data_offset = offset;
             }
-
-            /* Calculate how much data needs to be read and written. If the data
-             * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and
-             * write DHT_REBALANCE_BLKSIZE data length and the rest in the
-             * next iteration(s) */
-            read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
-                             ? DHT_REBALANCE_BLKSIZE
-                             : data_block_size);
-
-            /* Calculate the remaining size of the data block - maybe there's no
-             * need to seek for data in the next iteration */
-            data_block_size -= read_size;
-
-            /* Set offset to the offset of the data segment so read and write
-             * will have the correct position */
-            offset = data_offset;
         }
 
+        /* Calculate how much data needs to be read and written. If the data
+         * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and
+         * write DHT_REBALANCE_BLKSIZE data length and the rest in the
+         * next iteration(s) */
+        read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
+                         ? DHT_REBALANCE_BLKSIZE
+                         : data_block_size);
+
+        /* Calculate the remaining size of the data block - maybe there's no
+         * need to seek for data in the next iteration */
+        data_block_size -= read_size;
+
         ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count,
                            &iobref, NULL, NULL, NULL);
 
@@ -1145,6 +1148,7 @@ __dht_rebalance_migrate_data(xlator_t *this, xlator_t *from, xlator_t *to,
         iobref = NULL;
         vector = NULL;
     }
+
     if (iobref)
         iobref_unref(iobref);
     GF_FREE(vector);
author	Xavi Hernandez <xhernandez@gmail.com>	2021-04-09 18:13:30 +0200
committer	GitHub <noreply@github.com>	2021-04-09 18:13:30 +0200
commit	a62c5666e2b58668d0ddec7f3331446313539e72 (patch)
tree	9e6c50d543a37203b653eb8fc4ae79bbe0c9f4bd
parent	7a9f7cb3f6929ed2d4af603f1aa64ca967140940 (diff)
download	glusterfs-a62c5666e2b58668d0ddec7f3331446313539e72.tar.gz glusterfs-a62c5666e2b58668d0ddec7f3331446313539e72.tar.xz glusterfs-a62c5666e2b58668d0ddec7f3331446313539e72.zip