diff options
author | Xavi Hernandez <xhernandez@gmail.com> | 2021-04-09 18:13:30 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-04-09 18:13:30 +0200 |
commit | a62c5666e2b58668d0ddec7f3331446313539e72 (patch) | |
tree | 9e6c50d543a37203b653eb8fc4ae79bbe0c9f4bd | |
parent | 7a9f7cb3f6929ed2d4af603f1aa64ca967140940 (diff) | |
download | glusterfs-a62c5666e2b58668d0ddec7f3331446313539e72.tar.gz glusterfs-a62c5666e2b58668d0ddec7f3331446313539e72.tar.xz glusterfs-a62c5666e2b58668d0ddec7f3331446313539e72.zip |
dht: fix rebalance of sparse files (#2318)
Current implementation of rebalance for sparse files has a bug that,
in some cases, causes a read of 0 bytes from the source subvolume.
Posix xlator doesn't allow 0 byte reads and fails them with EINVAL,
which causes rebalance to abort the migration.
This patch implements a more robust way of finding data segments in
a sparse file that avoids 0 byte reads, allowing the file to be
migrated successfully.
Fixes: #2317
Change-Id: Iff168dda2fb0f2edf716b21eb04cc2cc8ac3915c
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
-rwxr-xr-x | tests/bugs/distribute/issue-2317.t | 29 | ||||
-rw-r--r-- | tests/volume.rc | 4 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 116 |
3 files changed, 93 insertions, 56 deletions
diff --git a/tests/bugs/distribute/issue-2317.t b/tests/bugs/distribute/issue-2317.t new file mode 100755 index 0000000000..e29d0039d8 --- /dev/null +++ b/tests/bugs/distribute/issue-2317.t @@ -0,0 +1,29 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +TESTS_EXPECTED_IN_LOOP=126 + +cleanup + +TEST glusterd +TEST ${CLI} volume create ${V0} replica 3 ${H0}:/$B0/${V0}_{0..2} +TEST ${CLI} volume start ${V0} + +TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0} + +# Create several files to make sure that at least some of them should be +# migrated by rebalance. +for i in {0..63}; do + TEST dd if=/dev/urandom of=${M0}/file.${i} bs=4k count=1 + TEST dd if=/dev/urandom of=${M0}/file.${i} bs=4k count=1 seek=128 +done + +TEST ${CLI} volume add-brick ${V0} ${H0}:${B0}/${V0}_{3..5} +TEST ${CLI} volume rebalance ${V0} start force +EXPECT_WITHIN ${REBALANCE_TIMEOUT} "completed" rebalance_status_field "${V0}" + +EXPECT "^0$" rebalance_failed_field "${V0}" + +cleanup diff --git a/tests/volume.rc b/tests/volume.rc index afb296cb9e..6a0b5b80be 100644 --- a/tests/volume.rc +++ b/tests/volume.rc @@ -83,6 +83,10 @@ function rebalanced_files_field { $CLI volume rebalance $1 status | awk '{print $2}' | sed -n 3p } +function rebalance_failed_field { + $CLI volume rebalance $1 status | awk '{print $5}' | sed -n 3p +} + function fix-layout_status_field { #The fix-layout status can be up to 3 words, (ex:'fix-layout in progress'), hence the awk-print $2 thru $4. #But if the status is less than 3 words, it also prints the next field i.e the run_time_in_secs.(ex:'completed 3.00'). diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index d15c36bf57..6edcca721c 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -994,6 +994,46 @@ out: return ret; } +static int32_t +dht_rebalance_sparse_segment(xlator_t *subvol, fd_t *fd, off_t *offset, + size_t *size) +{ + off_t hole; + int32_t ret; + + do { + ret = syncop_seek(subvol, fd, *offset, GF_SEEK_DATA, NULL, offset); + if (ret >= 0) { + /* Starting at the offset of the last data segment, find the + * next hole. After a data segment there should always be a + * hole, since EOF is considered a hole. */ + ret = syncop_seek(subvol, fd, *offset, GF_SEEK_HOLE, NULL, &hole); + } + + if (ret < 0) { + if (ret == -ENXIO) { + /* This can happen if there are no more data segments (i.e. + * the offset is at EOF), or there was a data segment but the + * file has been truncated to a smaller size between both + * seek requests. In both cases we are done. The file doesn't + * contain more data. */ + ret = 0; + } + return ret; + } + + /* It could happen that at the same offset we detected data in the + * first seek, there could be a hole in the second seek if user is + * modifying the file concurrently. In this case we need to find a + * new data segment to migrate. */ + } while (hole <= *offset); + + /* Calculate the total size of the current data block */ + *size = hole - *offset; + + return 1; +} + static int __dht_rebalance_migrate_data(xlator_t *this, xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst, uint64_t ia_size, @@ -1002,8 +1042,6 @@ __dht_rebalance_migrate_data(xlator_t *this, xlator_t *from, xlator_t *to, int ret = 0; int count = 0; off_t offset = 0; - off_t data_offset = 0; - off_t hole_offset = 0; struct iovec *vector = NULL; struct iobref *iobref = NULL; uint64_t total = 0; @@ -1018,71 +1056,36 @@ __dht_rebalance_migrate_data(xlator_t *this, xlator_t *from, xlator_t *to, while (total < ia_size) { /* This is a regular file - read it sequentially */ if (!hole_exists) { - read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) - ? DHT_REBALANCE_BLKSIZE - : (ia_size - total)); + data_block_size = ia_size - total; } else { /* This is a sparse file - read only the data segments in the file */ /* If the previous data block is fully copied, find the next data - * segment - * starting at the offset of the last read and written byte, */ + * segment starting at the offset of the last read and written + * byte. */ if (data_block_size <= 0) { - ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL, - &data_offset); - if (ret) { - if (ret == -ENXIO) - ret = 0; /* No more data segments */ - else - *fop_errno = -ret; /* Error occurred */ - + ret = dht_rebalance_sparse_segment(from, src, &offset, + &data_block_size); + if (ret <= 0) { + *fop_errno = -ret; break; } - - /* If the position of the current data segment is greater than - * the position of the next hole, find the next hole in order to - * calculate the length of the new data segment */ - if (data_offset > hole_offset) { - /* Starting at the offset of the last data segment, find the - * next hole */ - ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE, - NULL, &hole_offset); - if (ret) { - /* If an error occurred here it's a real error because - * if the seek for a data segment was successful then - * necessarily another hole must exist (EOF is a hole) - */ - *fop_errno = -ret; - break; - } - - /* Calculate the total size of the current data block */ - data_block_size = hole_offset - data_offset; - } - } else { - /* There is still data in the current segment, move the - * data_offset to the position of the last written byte */ - data_offset = offset; } - - /* Calculate how much data needs to be read and written. If the data - * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and - * write DHT_REBALANCE_BLKSIZE data length and the rest in the - * next iteration(s) */ - read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE) - ? DHT_REBALANCE_BLKSIZE - : data_block_size); - - /* Calculate the remaining size of the data block - maybe there's no - * need to seek for data in the next iteration */ - data_block_size -= read_size; - - /* Set offset to the offset of the data segment so read and write - * will have the correct position */ - offset = data_offset; } + /* Calculate how much data needs to be read and written. If the data + * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and + * write DHT_REBALANCE_BLKSIZE data length and the rest in the + * next iteration(s) */ + read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE) + ? DHT_REBALANCE_BLKSIZE + : data_block_size); + + /* Calculate the remaining size of the data block - maybe there's no + * need to seek for data in the next iteration */ + data_block_size -= read_size; + ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count, &iobref, NULL, NULL, NULL); @@ -1145,6 +1148,7 @@ __dht_rebalance_migrate_data(xlator_t *this, xlator_t *from, xlator_t *to, iobref = NULL; vector = NULL; } + if (iobref) iobref_unref(iobref); GF_FREE(vector); |