diff options
author | Pranith Kumar Karampuri <pranith.karampuri@phonepe.com> | 2021-03-22 10:19:27 +0530 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-03-22 10:19:27 +0530 |
commit | ec189a499d85c2aad1d54e55e47df6b95ba02922 (patch) | |
tree | 3211292a75638e8612486b6e981158a3a5bce0cf | |
parent | 1da141ab38463c4a26474456eec81c5992367af9 (diff) | |
download | glusterfs-ec189a499d85c2aad1d54e55e47df6b95ba02922.tar.gz glusterfs-ec189a499d85c2aad1d54e55e47df6b95ba02922.tar.xz glusterfs-ec189a499d85c2aad1d54e55e47df6b95ba02922.zip |
cluster/dht: use readdir for fix-layout in rebalance (#2243)
Problem:
On a cluster with 15 million files, when fix-layout was started, it was
not progressing at all. So we tried to do a os.walk() + os.stat() on the
backend filesystem directly. It took 2.5 days. We removed os.stat() and
re-ran it on another brick with similar data-set. It took 15 minutes. We
realized that readdirp is extremely costly compared to readdir if the
stat is not useful. fix-layout operation only needs to know that the
entry is a directory so that fix-layout operation can be triggered on
it. Most of the modern filesystems provide this information in readdir
operation. We don't need readdirp i.e. readdir+stat.
Fix:
Use readdir operation in fix-layout. Do readdir+stat/lookup for
filesystems that don't provide d_type in readdir operation.
fixes: #2241
Change-Id: I5fe2ecea25a399ad58e31a2e322caf69fc7f49eb
Signed-off-by: Pranith Kumar K <pranith.karampuri@phonepe.com>
-rw-r--r-- | cli/src/cli-rpc-ops.c | 8 | ||||
-rw-r--r-- | libglusterfs/src/common-utils.c | 28 | ||||
-rw-r--r-- | libglusterfs/src/glusterfs/common-utils.h | 2 | ||||
-rw-r--r-- | libglusterfs/src/libglusterfs.sym | 1 | ||||
-rw-r--r-- | tests/bugs/glusterd/rebalance-operations-in-single-node.t | 6 | ||||
-rw-r--r-- | tests/bugs/glusterd/replace-brick-operations.t | 2 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-common.c | 24 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 59 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-rebalance.c | 2 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix-inode-fd-ops.c | 97 |
10 files changed, 131 insertions, 98 deletions
diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c index 55e4a346d2..9f22ca14c3 100644 --- a/cli/src/cli-rpc-ops.c +++ b/cli/src/cli-rpc-ops.c @@ -906,9 +906,9 @@ xml_output: replica_count, disperse_count, redundancy_count, arbiter_count); - cli_out("Transport-type: %s", - ((transport == 0) ? "tcp" - : (transport == 1) ? "rdma" : "tcp,rdma")); + cli_out("Transport-type: %s", ((transport == 0) ? "tcp" + : (transport == 1) ? "rdma" + : "tcp,rdma")); j = 1; GF_FREE(local->get_vol.volname); @@ -1576,7 +1576,7 @@ gf_cli_print_rebalance_status(dict_t *dict, enum gf_task_types task_type) sec = ((uint64_t)elapsed % 3600) % 60; if (fix_layout) { - cli_out("%35s %50s %8d:%d:%d", node_name, status_str, hrs, min, + cli_out("%35s %50s %8d:%02d:%02d", node_name, status_str, hrs, min, sec); } else { if (size_str) { diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c index 1fa37ee01f..69e344741d 100644 --- a/libglusterfs/src/common-utils.c +++ b/libglusterfs/src/common-utils.c @@ -3325,7 +3325,7 @@ gf_process_reserved_ports(unsigned char *ports, uint32_t ceiling) out: GF_FREE(ports_info); -#else /* FIXME: Non Linux Host */ +#else /* FIXME: Non Linux Host */ ret = 0; #endif /* GF_LINUX_HOST_OS */ @@ -5197,7 +5197,7 @@ close_fds_except_custom(int *fdv, size_t count, void *prm, closer(i, prm); } sys_closedir(d); -#else /* !GF_LINUX_HOST_OS */ +#else /* !GF_LINUX_HOST_OS */ struct rlimit rl; int ret = -1; @@ -5434,6 +5434,30 @@ gf_d_type_from_ia_type(ia_type_t type) } int +gf_d_type_from_st_mode(mode_t st_mode) +{ + switch (st_mode & S_IFMT) { + case S_IFREG: + return DT_REG; + case S_IFDIR: + return DT_DIR; + case S_IFLNK: + return DT_LNK; + case S_IFBLK: + return DT_BLK; + case S_IFCHR: + return DT_CHR; + case S_IFIFO: + return DT_FIFO; + case S_IFSOCK: + return DT_SOCK; + default: + return DT_UNKNOWN; + } + return DT_UNKNOWN; +} + +int gf_nanosleep(uint64_t nsec) { struct timespec req; diff --git a/libglusterfs/src/glusterfs/common-utils.h b/libglusterfs/src/glusterfs/common-utils.h index a93baa3545..3999c57f92 100644 --- a/libglusterfs/src/glusterfs/common-utils.h +++ b/libglusterfs/src/glusterfs/common-utils.h @@ -1257,4 +1257,6 @@ gf_tsdiff(struct timespec *start, struct timespec *end) (int64_t)(end->tv_nsec - start->tv_nsec); } +int +gf_d_type_from_st_mode(mode_t st_mode); #endif /* _COMMON_UTILS_H */ diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym index f4fd4a71a5..d92dc8c952 100644 --- a/libglusterfs/src/libglusterfs.sym +++ b/libglusterfs/src/libglusterfs.sym @@ -1179,6 +1179,7 @@ gf_changelog_register_generic gf_gfid_generate_from_xxh64 find_xlator_option_in_cmd_args_t gf_d_type_from_ia_type +gf_d_type_from_st_mode glusterfs_graph_fini glusterfs_process_svc_attach_volfp glusterfs_mux_volfile_reconfigure diff --git a/tests/bugs/glusterd/rebalance-operations-in-single-node.t b/tests/bugs/glusterd/rebalance-operations-in-single-node.t index ef85887f44..6b6f0177aa 100644 --- a/tests/bugs/glusterd/rebalance-operations-in-single-node.t +++ b/tests/bugs/glusterd/rebalance-operations-in-single-node.t @@ -108,10 +108,16 @@ done TEST $CLI volume add-brick $V0 $H0:$B0/${V0}{5,6}; #perform rebalance fix-layout +TEST $CLI volume profile $V0 start TEST $CLI volume rebalance $V0 fix-layout start EXPECT_WITHIN $REBALANCE_TIMEOUT "fix-layout completed" fix-layout_status_field $V0; +readdir_count=$($CLI volume profile $V0 info | grep -w READDIR | wc -l) +readdirp_count=$($CLI volume profile $V0 info | grep -w READDIRP | wc -l) +EXPECT_NOT "^0$" echo $readdir_count +EXPECT "^0$" echo $readdirp_count + #bug-1075087 - rebalance post add brick TEST mkdir $M0/dir{21..30}; TEST touch $M0/dir{21..30}/files{1..10}; diff --git a/tests/bugs/glusterd/replace-brick-operations.t b/tests/bugs/glusterd/replace-brick-operations.t index 044aa3d6c6..c8b6596903 100644 --- a/tests/bugs/glusterd/replace-brick-operations.t +++ b/tests/bugs/glusterd/replace-brick-operations.t @@ -32,6 +32,8 @@ TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}2 $H0:$B0/${V0}3 commit force #bug-1242543-replace-brick validation TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0; +#Make sure new brick comes online before doing replace-brick on next-brick. +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 # Replace brick1 without killing TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}1 $H0:$B0/${V0}1_new commit force diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index fcaf1fc336..4b7bc46860 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -7033,6 +7033,8 @@ dht_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, dht_conf_t *conf = NULL; dht_methods_t *methods = NULL; gf_boolean_t skip_hashed_check = _gf_false; + gf_boolean_t readdir_optimize = _gf_false; + gf_boolean_t add = _gf_false; INIT_LIST_HEAD(&entries.list); @@ -7042,6 +7044,7 @@ dht_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, conf = this->private; GF_VALIDATE_OR_GOTO(this->name, conf, done); + readdir_optimize = conf->readdir_optimize; methods = &(conf->methods); if (op_ret <= 0) @@ -7065,12 +7068,25 @@ dht_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, { next_offset = orig_entry->d_off; - gf_msg_debug(this->name, 0, "%s: entry = %s, type = %d", prev->name, - orig_entry->d_name, orig_entry->d_type); - subvol = methods->layout_search(this, layout, orig_entry->d_name); + gf_msg_debug(this->name, 0, "%s: entry = %s, type = %d %p, %p", + prev->name, orig_entry->d_name, orig_entry->d_type, subvol, + prev); + + /* a) If rebalance is running, pick from first_up_subvol + */ + if (DT_ISDIR(orig_entry->d_type) && readdir_optimize) { + if (prev == local->first_up_subvol) { + add = _gf_true; + } else { + continue; + } + } else if (!subvol || (subvol == prev)) { + add = _gf_true; + } - if (!subvol || (subvol == prev)) { + if (add) { + add = _gf_false; entry = gf_dirent_for_name(orig_entry->d_name); if (!entry) { gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index c3d60b7d2d..d15c36bf57 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -3608,6 +3608,9 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, struct iatt iatt = { 0, }; + struct iatt entry_iatt = { + 0, + }; inode_t *linked_inode = NULL, *inode = NULL; dht_conf_t *conf = NULL; int perrno = 0; @@ -3643,6 +3646,12 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, goto out; } + linked_inode = inode_link(loc->inode, loc->parent, loc->name, &iatt); + + inode = loc->inode; + loc->inode = linked_inode; + inode_unref(inode); + fd = fd_create(loc->inode, defrag->pid); if (!fd) { gf_log(this->name, GF_LOG_ERROR, "Failed to create fd"); @@ -3675,8 +3684,8 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, fd_bind(fd); INIT_LIST_HEAD(&entries.list); - while ((ret = syncop_readdirp(this, fd, 131072, offset, &entries, NULL, - NULL)) != 0) { + while ((ret = syncop_readdir(this, fd, 131072, offset, &entries, NULL, + NULL)) != 0) { if (ret < 0) { if (-ret == ENOENT || -ret == ESTALE) { if (conf->decommission_subvols_cnt) { @@ -3711,9 +3720,11 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) continue; - if (!IA_ISDIR(entry->d_stat.ia_type)) { + + if ((DT_DIR != entry->d_type) && (DT_UNKNOWN != entry->d_type)) { continue; } + loc_wipe(&entry_loc); ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name); @@ -3734,40 +3745,18 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, } } - if (gf_uuid_is_null(entry->d_stat.ia_gfid)) { - gf_log(this->name, GF_LOG_ERROR, - "%s/%s" - " gfid not present", - loc->path, entry->d_name); - continue; - } - - gf_uuid_copy(entry_loc.gfid, entry->d_stat.ia_gfid); - - /*In case the gfid stored in the inode by inode_link - * and the gfid obtained in the lookup differs, then - * client3_3_lookup_cbk will return ESTALE and proper - * error will be captured - */ - - linked_inode = inode_link(entry_loc.inode, loc->inode, - entry->d_name, &entry->d_stat); - - inode = entry_loc.inode; - entry_loc.inode = linked_inode; - inode_unref(inode); - - if (gf_uuid_is_null(loc->gfid)) { - gf_log(this->name, GF_LOG_ERROR, - "%s/%s" - " gfid not present", - loc->path, entry->d_name); - defrag->total_failures++; - continue; + if (DT_UNKNOWN == entry->d_type) { + ret = syncop_lookup(this, &entry_loc, &entry_iatt, NULL, NULL, + NULL); + if ((ret == 0) && (entry_iatt.ia_type != IA_IFDIR)) { + continue; + } + /*If it is directory, gf_defrag_fix_layout() call will again do + * one more lookup. Not optimizing this part as all modern + * filesystems populate entry->d_type. We can optimize it when + * such a filesystem is found.*/ } - gf_uuid_copy(entry_loc.pargfid, loc->gfid); - /* A return value of 2 means, either process_dir or * lookup of a dir failed. Hence, don't commit hash * for the current directory*/ diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c index def5420f85..cdd111529e 100644 --- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c +++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c @@ -289,7 +289,7 @@ glusterd_handle_defrag_start(glusterd_volinfo_t *volinfo, char *op_errstr, runner_add_args( &runner, SBIN_DIR "/glusterfs", "-s", volfileserver, "--volfile-id", - volname, "--xlator-option", "*dht.use-readdirp=yes", "--xlator-option", + volname, "--xlator-option", "*dht.use-readdirp=no", "--xlator-option", "*dht.lookup-unhashed=yes", "--xlator-option", "*dht.assert-no-child-down=yes", "--xlator-option", "*dht.readdir-optimize=on", "--process-name", "rebalance", NULL); diff --git a/xlators/storage/posix/src/posix-inode-fd-ops.c b/xlators/storage/posix/src/posix-inode-fd-ops.c index 1ecf272d01..bc43a3525d 100644 --- a/xlators/storage/posix/src/posix-inode-fd-ops.c +++ b/xlators/storage/posix/src/posix-inode-fd-ops.c @@ -1095,7 +1095,7 @@ posix_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, #ifndef FALLOC_FL_KEEP_SIZE ret = EOPNOTSUPP; -#else /* FALLOC_FL_KEEP_SIZE */ +#else /* FALLOC_FL_KEEP_SIZE */ int32_t flags = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; struct iatt statpre = { 0, @@ -5488,8 +5488,8 @@ posix_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, return 0; } -int -posix_fill_readdir(fd_t *fd, DIR *dir, off_t off, size_t size, +static int +posix_fill_readdir(fd_t *fd, struct posix_fd *pfd, off_t off, size_t size, gf_dirent_t *entries, xlator_t *this, int32_t skip_dirs) { off_t in_case = -1; @@ -5498,14 +5498,10 @@ posix_fill_readdir(fd_t *fd, DIR *dir, off_t off, size_t size, int count = 0; int32_t this_size = -1; gf_dirent_t *this_entry = NULL; - struct posix_fd *pfd = NULL; struct stat stbuf = { 0, }; - char *hpath = NULL; - int len = 0; - int ret = 0; - int op_errno = 0; + int stat_ret = 1; struct dirent *entry = NULL; struct dirent scratch[2] = { { @@ -5513,38 +5509,17 @@ posix_fill_readdir(fd_t *fd, DIR *dir, off_t off, size_t size, }, }; - ret = posix_fd_ctx_get(fd, this, &pfd, &op_errno); - if (ret < 0) { - gf_msg(this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL, - "pfd is NULL, fd=%p", fd); - count = -1; - errno = op_errno; - goto out; - } - - if (skip_dirs) { - hpath = alloca(PATH_MAX); - len = posix_handle_path(this, fd->inode->gfid, NULL, hpath, PATH_MAX); - if (len <= 0) { - errno = ESTALE; - count = -1; - goto out; - } - len = strlen(hpath); - hpath[len] = '/'; - } - if (!off) { - rewinddir(dir); + rewinddir(pfd->dir); } else { - seekdir(dir, off); + seekdir(pfd->dir, off); #ifndef GF_LINUX_HOST_OS - if ((u_long)telldir(dir) != off && off != pfd->dir_eof) { + if ((u_long)telldir(pfd->dir) != off && off != pfd->dir_eof) { gf_msg(THIS->name, GF_LOG_ERROR, EINVAL, P_MSG_DIR_OPERATION_FAILED, "seekdir(0x%llx) failed on dir=%p: " "Invalid argument (offset reused from " "another DIR * structure?)", - off, dir); + off, pfd->dir); errno = EINVAL; count = -1; goto out; @@ -5553,27 +5528,28 @@ posix_fill_readdir(fd_t *fd, DIR *dir, off_t off, size_t size, } while (filled <= size) { - in_case = (u_long)telldir(dir); + in_case = (u_long)telldir(pfd->dir); if (in_case == -1) { gf_msg(THIS->name, GF_LOG_ERROR, errno, P_MSG_DIR_OPERATION_FAILED, - "telldir failed on dir=%p", dir); + "telldir failed on dir=%p", pfd->dir); goto out; } errno = 0; - entry = sys_readdir(dir, scratch); + entry = sys_readdir(pfd->dir, scratch); if (!entry || errno != 0) { if (errno == EBADF) { gf_msg(THIS->name, GF_LOG_WARNING, errno, P_MSG_DIR_OPERATION_FAILED, "readdir failed on dir=%p", - dir); + pfd->dir); goto out; } break; } + stat_ret = 1; #ifdef __NetBSD__ /* @@ -5597,10 +5573,11 @@ posix_fill_readdir(fd_t *fd, DIR *dir, off_t off, size_t size, if (skip_dirs) { if (DT_ISDIR(entry->d_type)) { continue; - } else if (hpath) { - strcpy(&hpath[len + 1], entry->d_name); - ret = sys_lstat(hpath, &stbuf); - if (!ret && S_ISDIR(stbuf.st_mode)) + } else if (DT_UNKNOWN == entry->d_type) { + memset(&stbuf, 0, sizeof(stbuf)); + stat_ret = sys_fstatat(pfd->fd, entry->d_name, &stbuf, + AT_SYMLINK_NOFOLLOW); + if ((stat_ret == 0) && S_ISDIR(stbuf.st_mode)) continue; } } @@ -5609,15 +5586,15 @@ posix_fill_readdir(fd_t *fd, DIR *dir, off_t off, size_t size, strlen(entry->d_name) + 1; if (this_size + filled > size) { - seekdir(dir, in_case); + seekdir(pfd->dir, in_case); #ifndef GF_LINUX_HOST_OS - if ((u_long)telldir(dir) != in_case && in_case != pfd->dir_eof) { + if ((u_long)telldir(pfd->dir) != in_case && in_case != pfd->dir_eof) { gf_msg(THIS->name, GF_LOG_ERROR, EINVAL, P_MSG_DIR_OPERATION_FAILED, "seekdir(0x%llx) failed on dir=%p: " "Invalid argument (offset reused from " "another DIR * structure?)", - in_case, dir); + in_case, pfd->dir); errno = EINVAL; count = -1; goto out; @@ -5636,13 +5613,31 @@ posix_fill_readdir(fd_t *fd, DIR *dir, off_t off, size_t size, entry->d_name); goto out; } + + if (DT_UNKNOWN == entry->d_type) { + if (stat_ret == 1) { + memset(&stbuf, 0, sizeof(stbuf)); + stat_ret = sys_fstatat(pfd->fd, entry->d_name, &stbuf, + AT_SYMLINK_NOFOLLOW); + } + + if (stat_ret == 0) { + entry->d_type = gf_d_type_from_st_mode(stbuf.st_mode); + } else { + gf_msg_debug(THIS->name, errno, + "fstatat failed on" + " %s/%s", + uuid_utoa(fd->inode->gfid), entry->d_name); + } + } /* * we store the offset of next entry here, which is * probably not intended, but code using syncop_readdir() * (glfs-heal.c, afr-self-heald.c, pump.c) rely on it * for directory read resumption. */ - last_off = (u_long)telldir(dir); + + last_off = (u_long)telldir(pfd->dir); this_entry->d_off = last_off; this_entry->d_ino = entry->d_ino; this_entry->d_type = entry->d_type; @@ -5653,7 +5648,7 @@ posix_fill_readdir(fd_t *fd, DIR *dir, off_t off, size_t size, count++; } - if ((!sys_readdir(dir, scratch) && (errno == 0))) { + if ((!sys_readdir(pfd->dir, scratch) && (errno == 0))) { /* Indicate EOF */ errno = ENOENT; /* Remember EOF offset for later detection */ @@ -5763,7 +5758,6 @@ posix_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t off, int whichop, dict_t *dict) { struct posix_fd *pfd = NULL; - DIR *dir = NULL; int ret = -1; int count = 0; int32_t op_ret = -1; @@ -5784,11 +5778,10 @@ posix_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, goto out; } - dir = pfd->dir; - - if (!dir) { + if ((!pfd->dir) || (pfd->fd < 0)) { gf_msg(this->name, GF_LOG_WARNING, EINVAL, P_MSG_PFD_NULL, - "dir is NULL for fd=%p", fd); + "posix ctx(%p, %d) is invalid for %s", pfd->dir, pfd->fd, + uuid_utoa(fd->inode->gfid)); op_errno = EINVAL; goto out; } @@ -5811,7 +5804,7 @@ posix_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, It would also help, in the future, to replace the loop around readdir() with a single large getdents() call. */ - count = posix_fill_readdir(fd, dir, off, size, &entries, this, + count = posix_fill_readdir(fd, pfd, off, size, &entries, this, skip_dirs); } UNLOCK(&fd->lock); |