diff options
author | Ravishankar N <ravishankar@redhat.com> | 2020-09-04 10:37:32 +0530 |
---|---|---|
committer | Xavi Hernandez <xhernandez@users.noreply.github.com> | 2020-10-23 19:12:35 +0200 |
commit | 4b9f95b6993b606b2f97d55694910c2cbe3107a2 (patch) | |
tree | 89f47782ed0460ae088948d1d04783267cb066cb | |
parent | f5e1eb87d4af44be3b317b7f99ab88f89c2f0b1a (diff) | |
download | glusterfs-4b9f95b6993b606b2f97d55694910c2cbe3107a2.tar.gz glusterfs-4b9f95b6993b606b2f97d55694910c2cbe3107a2.tar.xz glusterfs-4b9f95b6993b606b2f97d55694910c2cbe3107a2.zip |
posix: add io_uring support
This patch adds support for reads, writes and fsyncs using io-uring in
posix xlator. A volume option 'storage.linux-io_uring' is introduced to turn
it on or off.
When I ran tests with fio and iozone on physical machines on a 2x3
volume + single client, the numbers were not any worse than regular
pwrite/pread syscalls which posix does with default volume
configurations. But given that io_uring itself undergoing rapid
improvement and bug fixes,it is good to have an intial working implementation
in gluster and iteratively develop on top of it.
Updates: #1398
Change-Id: Ia47456ebb4c16a3b66ad9beb6a9043cc090fed2b
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
-rw-r--r-- | configure.ac | 11 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-op-sm.c | 19 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 3 | ||||
-rw-r--r-- | xlators/storage/posix/src/Makefile.am | 6 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix-common.c | 23 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix-handle.h | 3 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix-io-uring.c | 633 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix-io-uring.h | 31 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix-mem-types.h | 1 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix-messages.h | 2 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix.h | 17 |
11 files changed, 744 insertions, 5 deletions
diff --git a/configure.ac b/configure.ac index d40b8c6d92..2ce2a91137 100644 --- a/configure.ac +++ b/configure.ac @@ -6,6 +6,7 @@ dnl General Public License, version 3 or any later version (LGPLv3 or dnl later), or the GNU General Public License, version 2 (GPLv2), in all dnl cases as published by the Free Software Foundation. +// clang-format off AC_INIT([glusterfs], [m4_esyscmd([build-aux/pkg-version --version])], [gluster-users@gluster.org],,[https://github.com/gluster/glusterfs.git]) @@ -1427,6 +1428,14 @@ if test -n "$LIBAIO"; then BUILD_LIBAIO=yes fi +BUILD_LIBURING=no +AC_CHECK_HEADERS([liburing.h]) +AC_CHECK_LIB([uring],[io_uring_queue_init],[LIBURING="-luring"]) + +if test -n "$LIBURING"; then + AC_DEFINE(HAVE_LIBURING, 1, [io-uring based POSIX enabled]) + BUILD_LIBURING=yes +fi dnl gnfs section BUILD_GNFS="no" RPCBIND_SERVICE="" @@ -1608,6 +1617,7 @@ AC_SUBST(GF_FUSE_LDADD) AC_SUBST(GF_FUSE_CFLAGS) AC_SUBST(RLLIBS) AC_SUBST(LIBAIO) +AC_SUBST(LIBURING) AC_SUBST(AM_MAKEFLAGS) AC_SUBST(AM_LIBTOOLFLAGS) AC_SUBST(GF_NO_UNDEFINED) @@ -1690,6 +1700,7 @@ echo "fusermount : $BUILD_FUSERMOUNT" echo "readline : $BUILD_READLINE" echo "georeplication : $BUILD_SYNCDAEMON" echo "Linux-AIO : $BUILD_LIBAIO" +echo "Linux-io_uring : $BUILD_LIBURING" echo "Enable Debug : $BUILD_DEBUG" echo "Run with Valgrind : $VALGRIND_TOOL" echo "Sanitizer enabled : $SANITIZER" diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c index c537fc33a8..7762fe44fa 100644 --- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c +++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c @@ -1224,7 +1224,24 @@ glusterd_op_stage_set_volume(dict_t *dict, char **op_errstr) key = key_fixed; keylen = strlen(key_fixed); } - +#ifdef HAVE_LIBURING + if (len_strcmp(key, keylen, "storage.linux-io_uring")) { + if (volinfo == NULL) { + snprintf(errstr, sizeof(errstr), "vol info is NULL for %s.", + volname); + ret = -1; + goto out; + } + if (volinfo->status == GLUSTERD_STATUS_STARTED) { + snprintf(errstr, sizeof(errstr), + "Changing this option is " + "not supported when volume is in started state. " + "Please stop the volume."); + ret = -1; + goto out; + } + } +#endif if (len_strcmp(key, keylen, "cluster.granular-entry-heal")) { /* For granular entry-heal, if the set command was * invoked through volume-set CLI, then allow the diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 398b4d76f5..fb93bc2432 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -2387,6 +2387,9 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = GD_OP_VERSION_3_8_0, }, {.key = "storage.linux-aio", .voltype = "storage/posix", .op_version = 1}, + {.key = "storage.linux-io_uring", + .voltype = "storage/posix", + .op_version = GD_OP_VERSION_9_0}, {.key = "storage.batch-fsync-mode", .voltype = "storage/posix", .op_version = 3}, diff --git a/xlators/storage/posix/src/Makefile.am b/xlators/storage/posix/src/Makefile.am index c080a229ff..ed69720f5e 100644 --- a/xlators/storage/posix/src/Makefile.am +++ b/xlators/storage/posix/src/Makefile.am @@ -7,13 +7,13 @@ posix_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c posix-aio.c \ posix-gfid-path.c posix-entry-ops.c posix-inode-fd-ops.c \ - posix-common.c posix-metadata.c + posix-common.c posix-metadata.c posix-io-uring.c posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBAIO) \ - $(ACL_LIBS) + $(LIBURING) $(ACL_LIBS) noinst_HEADERS = posix.h posix-mem-types.h posix-handle.h posix-aio.h \ posix-messages.h posix-gfid-path.h posix-inode-handle.h \ - posix-metadata.h posix-metadata-disk.h + posix-metadata.h posix-metadata-disk.h posix-io-uring.h AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ diff --git a/xlators/storage/posix/src/posix-common.c b/xlators/storage/posix/src/posix-common.c index f10722ec3f..db30b595d1 100644 --- a/xlators/storage/posix/src/posix-common.c +++ b/xlators/storage/posix/src/posix-common.c @@ -45,6 +45,7 @@ #include <glusterfs/timer.h> #include "glusterfs3-xdr.h" #include "posix-aio.h" +#include "posix-io-uring.h" #include <glusterfs/glusterfs-acl.h> #include "posix-messages.h" #include <glusterfs/events.h> @@ -359,6 +360,14 @@ posix_reconfigure(xlator_t *this, dict_t *options) else posix_aio_off(this); + GF_OPTION_RECONF("linux-io_uring", priv->io_uring_configured, options, bool, + out); + + if (priv->io_uring_configured) + posix_io_uring_on(this); + else + posix_io_uring_off(this); + GF_OPTION_RECONF("update-link-count-parent", priv->update_pgfid_nlinks, options, bool, out); @@ -1049,6 +1058,14 @@ posix_init(xlator_t *this) } } + GF_OPTION_INIT("linux-io_uring", _private->io_uring_configured, bool, out); + if (_private->io_uring_configured) { + op_ret = posix_io_uring_on(this); + if (op_ret < 0) { + _private->io_uring_configured = _gf_false; + } + } + GF_OPTION_INIT("node-uuid-pathinfo", _private->node_uuid_pathinfo, bool, out); if (_private->node_uuid_pathinfo && @@ -1325,6 +1342,12 @@ struct volume_options posix_options[] = { .description = "Support for native Linux AIO", .op_version = {1}, .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"linux-io_uring"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Support for Linux io_uring", + .op_version = {GD_OP_VERSION_9_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, {.key = {"brick-uid"}, .type = GF_OPTION_TYPE_INT, .min = -1, diff --git a/xlators/storage/posix/src/posix-handle.h b/xlators/storage/posix/src/posix-handle.h index f33ed92620..940939ec6e 100644 --- a/xlators/storage/posix/src/posix-handle.h +++ b/xlators/storage/posix/src/posix-handle.h @@ -218,4 +218,7 @@ posix_check_internal_writes(xlator_t *this, fd_t *fd, int sysfd, dict_t *xdata); void posix_disk_space_check(xlator_t *this); + +dict_t * +_fill_writev_xdata(fd_t *fd, dict_t *xdata, xlator_t *this, int is_append); #endif /* !_POSIX_HANDLE_H */ diff --git a/xlators/storage/posix/src/posix-io-uring.c b/xlators/storage/posix/src/posix-io-uring.c new file mode 100644 index 0000000000..567a6a83ef --- /dev/null +++ b/xlators/storage/posix/src/posix-io-uring.c @@ -0,0 +1,633 @@ +/* + Copyright (c) 2020 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "posix.h" +#include "posix-messages.h" +#include "posix-io-uring.h" +#include "posix-handle.h" + +#ifdef HAVE_LIBURING +#include <liburing.h> + +struct posix_uring_ctx; +typedef void(fop_unwind_f)(struct posix_uring_ctx *, int32_t); +typedef void(fop_prep_f)(struct io_uring_sqe *sqe, struct posix_uring_ctx *); +static int +posix_io_uring_submit(xlator_t *this, struct posix_uring_ctx *ctx); + +struct posix_uring_ctx { + call_frame_t *frame; + struct iatt prebuf; + dict_t *xdata; + fd_t *fd; + int _fd; + int op; + + union { + struct { + struct iovec *iov; + int count; + off_t offset; + } write; + + struct { + struct iobuf *iobuf; + struct iovec iovec; + off_t offset; + } read; + + struct { + int32_t datasync; + } fsync; + } fop; + + fop_prep_f *prepare; + fop_unwind_f *unwind; +}; + +static void +posix_io_uring_ctx_free(struct posix_uring_ctx *ctx) +{ + if (!ctx) + return; + if (ctx->fd) + fd_unref(ctx->fd); + if (ctx->xdata) + dict_unref(ctx->xdata); + switch (ctx->op) { + case GF_FOP_READ: + if (ctx->fop.read.iobuf) + iobuf_unref(ctx->fop.read.iobuf); + break; + default: + break; + } + GF_FREE(ctx); +} + +struct posix_uring_ctx * +posix_io_uring_ctx_init(call_frame_t *frame, xlator_t *this, fd_t *fd, int op, + fop_prep_f prepare, fop_unwind_f unwind, + int32_t *op_errno, dict_t *xdata) +{ + struct posix_uring_ctx *ctx = NULL; + struct posix_fd *pfd = NULL; + int ret = 0; + + ctx = GF_CALLOC(1, sizeof(*ctx), gf_posix_mt_uring_ctx); + if (!ctx) { + return NULL; + } + + ctx->frame = frame; + ctx->fd = fd_ref(fd); + ctx->prepare = prepare; + ctx->unwind = unwind; + if (xdata) + ctx->xdata = dict_ref(xdata); + ctx->op = op; + + ret = posix_fd_ctx_get(fd, this, &pfd, op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, *op_errno, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd); + goto err; + } + ctx->_fd = pfd->fd; + + /* TODO: Explore filling up pre and post bufs using IOSQE_IO_LINK*/ + if ((op == GF_FOP_WRITE) || (op == GF_FOP_FSYNC)) { + if (posix_fdstat(this, fd->inode, pfd->fd, &ctx->prebuf) != 0) { + *op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, *op_errno, P_MSG_FSTAT_FAILED, + "fstat failed on fd=%p", fd); + goto err; + } + } + + return ctx; + +err: + posix_io_uring_ctx_free(ctx); + return NULL; +} + +static void +posix_io_uring_readv_complete(struct posix_uring_ctx *ctx, int32_t res) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct posix_private *priv = NULL; + struct iobref *iobref = NULL; + struct iobuf *iobuf = NULL; + struct iatt postbuf = { + 0, + }; + struct iovec iov = { + 0, + }; + fd_t *fd = NULL; + int _fd = -1; + int ret = 0; + int op_ret = -1; + int op_errno = 0; + off_t offset = 0; + + frame = ctx->frame; + this = frame->this; + priv = this->private; + fd = ctx->fd; + _fd = ctx->_fd; + iobuf = ctx->fop.read.iobuf; + offset = ctx->fop.read.offset; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_READV_FAILED, + "readv(async) failed fd=%d.", _fd); + goto out; + } + + ret = posix_fdstat(this, fd->inode, _fd, &postbuf); + if (ret != 0) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_FSTAT_FAILED, + "fstat failed on fd=%d", _fd); + goto out; + } + + op_ret = res; + op_errno = 0; + + iobref = iobref_new(); + if (!iobref) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + iobref_add(iobref, iobuf); + + iov.iov_base = iobuf_ptr(iobuf); + iov.iov_len = op_ret; + + /* Hack to notify higher layers of EOF. */ + if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size) + op_errno = ENOENT; + + GF_ATOMIC_ADD(priv->read_value, op_ret); + // response xdata is used only for cloudsync, so ignore for now. +out: + STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, &iov, 1, &postbuf, + iobref, NULL); + if (iobref) + iobref_unref(iobref); + posix_io_uring_ctx_free(ctx); +} + +static void +posix_prep_readv(struct io_uring_sqe *sqe, struct posix_uring_ctx *ctx) +{ + sqe->flags |= IOSQE_ASYNC; + io_uring_prep_readv(sqe, ctx->_fd, &ctx->fop.read.iovec, 1, + ctx->fop.read.offset); +} + +int +posix_io_uring_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + struct posix_uring_ctx *ctx = NULL; + int32_t op_errno = ENOMEM; + struct iobuf *iobuf = NULL; + int ret = 0; + + ctx = posix_io_uring_ctx_init( + frame, this, fd, GF_FOP_READ, posix_prep_readv, + posix_io_uring_readv_complete, &op_errno, xdata); + if (!ctx) { + goto err; + } + + iobuf = iobuf_get2(this->ctx->iobuf_pool, size); + if (!iobuf) { + op_errno = ENOMEM; + goto err; + } + ctx->fop.read.iobuf = iobuf; + ctx->fop.read.iovec.iov_base = iobuf_ptr(iobuf); + ctx->fop.read.iovec.iov_len = size; + ctx->fop.read.offset = offset; + + ret = posix_io_uring_submit(this, ctx); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, -ret, P_MSG_POSIX_IO_URING, + "Failed to submit sqe"); + op_errno = -ret; + goto err; + } + if (ret == 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, P_MSG_POSIX_IO_URING, + "submit sqe got zero"); + } + return 0; +err: + STACK_UNWIND_STRICT(readv, frame, -1, op_errno, NULL, 1, NULL, NULL, NULL); + posix_io_uring_ctx_free(ctx); + return 0; +} + +static void +posix_writev_fill_rsp_dict(struct posix_uring_ctx *ctx, xlator_t *this, + dict_t **rsp_xdata) +{ + int is_append = 0; + + if (ctx->xdata && dict_get(ctx->xdata, GLUSTERFS_WRITE_IS_APPEND)) { + if (ctx->prebuf.ia_size == ctx->fop.write.offset || + (ctx->fd->flags & O_APPEND)) + is_append = 1; + } + *rsp_xdata = _fill_writev_xdata(ctx->fd, ctx->xdata, this, is_append); +} + +static void +posix_io_uring_writev_complete(struct posix_uring_ctx *ctx, int32_t res) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct posix_private *priv = NULL; + struct iatt postbuf = { + 0, + }; + fd_t *fd = NULL; + int _fd = -1; + int ret = 0; + int op_ret = -1; + int op_errno = 0; + dict_t *rsp_xdata = NULL; + frame = ctx->frame; + this = frame->this; + priv = this->private; + fd = ctx->fd; + _fd = ctx->_fd; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_WRITEV_FAILED, + "writev(async) failed fd=%d.", _fd); + goto out; + } + + ret = posix_fdstat(this, fd->inode, _fd, &postbuf); + if (ret != 0) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_FSTAT_FAILED, + "fstat failed on fd=%d", _fd); + goto out; + } + + op_ret = res; + op_errno = 0; + posix_writev_fill_rsp_dict(ctx, this, &rsp_xdata); + GF_ATOMIC_ADD(priv->write_value, op_ret); +out: + STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, &ctx->prebuf, &postbuf, + rsp_xdata); + if (rsp_xdata) + dict_unref(rsp_xdata); + posix_io_uring_ctx_free(ctx); +} + +static void +posix_prep_writev(struct io_uring_sqe *sqe, struct posix_uring_ctx *ctx) +{ + io_uring_prep_writev(sqe, ctx->_fd, ctx->fop.write.iov, + ctx->fop.write.count, ctx->fop.write.offset); +} + +int +posix_io_uring_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *iov, int count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ + struct posix_uring_ctx *ctx = NULL; + int32_t op_errno = ENOMEM; + int ret = 0; + + ctx = posix_io_uring_ctx_init( + frame, this, fd, GF_FOP_WRITE, posix_prep_writev, + posix_io_uring_writev_complete, &op_errno, xdata); + if (!ctx) { + goto err; + } + + ctx->fop.write.iov = iov; + ctx->fop.write.count = count; + ctx->fop.write.offset = offset; + + ret = posix_io_uring_submit(this, ctx); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, -ret, P_MSG_POSIX_IO_URING, + "Failed to submit sqe"); + op_errno = -ret; + goto err; + } + if (ret == 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, P_MSG_POSIX_IO_URING, + "submit sqe got zero"); + } + return 0; +err: + STACK_UNWIND_STRICT(writev, frame, -1, op_errno, 0, 0, 0); + posix_io_uring_ctx_free(ctx); + return 0; +} + +static void +posix_io_uring_fsync_complete(struct posix_uring_ctx *ctx, int32_t res) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct posix_private *priv = NULL; + struct iatt postbuf = { + 0, + }; + fd_t *fd = NULL; + int _fd = -1; + int ret = 0; + int op_ret = -1; + int op_errno = 0; + + frame = ctx->frame; + this = frame->this; + priv = this->private; + fd = ctx->fd; + _fd = ctx->_fd; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_FSYNC_FAILED, + "writev(async) failed fd=%d.", _fd); + goto out; + } + + ret = posix_fdstat(this, fd->inode, _fd, &postbuf); + if (ret != 0) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_FSTAT_FAILED, + "fstat failed on fd=%d", _fd); + goto out; + } + + op_ret = res; + op_errno = 0; + GF_ATOMIC_ADD(priv->write_value, op_ret); +out: + STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, &ctx->prebuf, &postbuf, + NULL); + posix_io_uring_ctx_free(ctx); +} + +static void +posix_prep_fsync(struct io_uring_sqe *sqe, struct posix_uring_ctx *ctx) +{ + io_uring_prep_fsync(sqe, ctx->_fd, ctx->fop.fsync.datasync); +} + +int +posix_io_uring_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t datasync, dict_t *xdata) +{ + struct posix_uring_ctx *ctx = NULL; + int32_t op_errno = ENOMEM; + int ret = 0; + + ctx = posix_io_uring_ctx_init( + frame, this, fd, GF_FOP_FSYNC, posix_prep_fsync, + posix_io_uring_fsync_complete, &op_errno, xdata); + if (!ctx) { + goto err; + } + + if (datasync) + ctx->fop.fsync.datasync |= IORING_FSYNC_DATASYNC; + + ret = posix_io_uring_submit(this, ctx); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, -ret, P_MSG_POSIX_IO_URING, + "Failed to submit sqe"); + op_errno = -ret; + goto err; + } + if (ret == 0) { + gf_msg(this->name, GF_LOG_ERROR, -ret, P_MSG_POSIX_IO_URING, + "submit sqe got zero"); + } + return 0; +err: + posix_io_uring_ctx_free(ctx); + STACK_UNWIND_STRICT(fsync, frame, -1, op_errno, 0, 0, NULL); + return 0; +} + +static int +posix_io_uring_submit(xlator_t *this, struct posix_uring_ctx *ctx) +{ + struct posix_private *priv = this->private; + struct io_uring_sqe *sqe = NULL; + int ret = 0; + + pthread_mutex_lock(&priv->sq_mutex); + { + sqe = io_uring_get_sqe(&priv->ring); + if (!sqe) { + /*TODO: Retry until we get an sqe instead of failing. */ + pthread_mutex_unlock(&priv->sq_mutex); + ret = -EAGAIN; + gf_msg(this->name, GF_LOG_ERROR, -ret, P_MSG_POSIX_IO_URING, + "Failed to get sqe"); + goto out; + } + ctx->prepare(sqe, ctx); + io_uring_sqe_set_data(sqe, ctx); + ret = io_uring_submit(&priv->ring); + } + pthread_mutex_unlock(&priv->sq_mutex); + +out: + return ret; +} + +static void * +posix_io_uring_thread(void *data) +{ + xlator_t *this = NULL; + struct posix_private *priv = NULL; + int ret = 0; + int32_t res = 0; + struct io_uring_cqe *cqe = NULL; + struct posix_uring_ctx *ctx = NULL; + + this = data; + THIS = this; + priv = this->private; + while (1) { + pthread_mutex_lock(&priv->cq_mutex); + { + ret = io_uring_wait_cqe(&priv->ring, &cqe); + } + pthread_mutex_unlock(&priv->cq_mutex); + if (ret != 0) { + if (ret == -EINTR) + continue; + gf_msg(this->name, GF_LOG_WARNING, -ret, P_MSG_POSIX_IO_URING, + "Unable to get cqe. Exiting."); + abort(); + } + + ctx = (struct posix_uring_ctx *)io_uring_cqe_get_data(cqe); + if (priv->uring_thread_exit == _gf_true && ctx == NULL) + pthread_exit(NULL); + res = cqe->res; + io_uring_cqe_seen(&priv->ring, cqe); + ctx->unwind(ctx, res); + } + + return NULL; +} + +int +posix_io_uring_init(xlator_t *this) +{ + int ret = -1; + unsigned flags = 0; + struct posix_private *priv = this->private; + + // TODO:Try-out flags |= IORING_SETUP_IOPOLL; + ret = io_uring_queue_init(POSIX_URING_MAX_ENTRIES, &priv->ring, flags); + if (ret == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_POSIX_IO_URING, + "io_uring init failed."); + goto out; + } + + pthread_mutex_init(&priv->sq_mutex, NULL); + pthread_mutex_init(&priv->cq_mutex, NULL); + ret = gf_thread_create(&priv->uring_thread, NULL, posix_io_uring_thread, + this, "posix-iouring"); + if (ret != 0) { + io_uring_queue_exit(&priv->ring); + pthread_mutex_destroy(&priv->sq_mutex); + pthread_mutex_destroy(&priv->cq_mutex); + goto out; + } + +out: + return ret; +} + +static int +posix_io_uring_drain(struct posix_private *priv) +{ + struct io_uring_sqe *sqe = NULL; + int ret = -1; + + priv->uring_thread_exit = _gf_true; + sqe = io_uring_get_sqe(&priv->ring); + if (!sqe) + return ret; + + io_uring_sqe_set_flags(sqe, IOSQE_IO_DRAIN); + io_uring_sqe_set_data(sqe, NULL); + io_uring_prep_nop(sqe); + ret = io_uring_submit(&priv->ring); + + return ret; +} + +void +posix_io_uring_fini(xlator_t *this) +{ + struct posix_private *priv = this->private; + + posix_io_uring_drain(priv); + (void)pthread_join(priv->uring_thread, NULL); + io_uring_queue_exit(&priv->ring); + pthread_mutex_destroy(&priv->sq_mutex); + pthread_mutex_destroy(&priv->cq_mutex); +} + +int +posix_io_uring_on(xlator_t *this) +{ + struct posix_private *priv = NULL; + int ret = -1; + + priv = this->private; + + if (!priv->io_uring_init_done) { + ret = posix_io_uring_init(this); + if (ret == 0) + priv->io_uring_capable = _gf_true; + else + priv->io_uring_capable = _gf_false; + priv->io_uring_init_done = _gf_true; + } + + if (priv->io_uring_capable) { + this->fops->readv = posix_io_uring_readv; + this->fops->writev = posix_io_uring_writev; + this->fops->fsync = posix_io_uring_fsync; + ret = 0; + } + + if (ret != 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_POSIX_IO_URING, + "Posix io_uring init failed, falling back to the previous " + "IO mechanism."); + } + return ret; +} + +int +posix_io_uring_off(xlator_t *this) +{ + this->fops->readv = posix_readv; + this->fops->writev = posix_writev; + this->fops->fsync = posix_fsync; + posix_io_uring_fini(this); + + return 0; +} + +#else +int +posix_io_uring_on(xlator_t *this) +{ + gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_AIO_UNAVAILABLE, + "Linux io_uring not available at build-time. " + "Continuing with synchronous IO"); + return -1; +} + +int +posix_io_uring_off(xlator_t *this) +{ + return 0; +} + +#endif diff --git a/xlators/storage/posix/src/posix-io-uring.h b/xlators/storage/posix/src/posix-io-uring.h new file mode 100644 index 0000000000..3d902e8b32 --- /dev/null +++ b/xlators/storage/posix/src/posix-io-uring.h @@ -0,0 +1,31 @@ +/* + Copyright (c) 2020 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _POSIX_IO_URING_H +#define _POSIX_IO_URING_H + +#define POSIX_URING_MAX_ENTRIES 512 +int +posix_io_uring_on(xlator_t *this); + +int +posix_io_uring_off(xlator_t *this); + +#ifdef HAVE_LIBURING +int +posix_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); + +int +posix_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata); +#endif + +#endif /* _POSIX_IO_URING_H */ diff --git a/xlators/storage/posix/src/posix-mem-types.h b/xlators/storage/posix/src/posix-mem-types.h index 2253f381ac..81957736a4 100644 --- a/xlators/storage/posix/src/posix-mem-types.h +++ b/xlators/storage/posix/src/posix-mem-types.h @@ -20,6 +20,7 @@ enum gf_posix_mem_types_ { gf_posix_mt_paiocb, gf_posix_mt_inode_ctx_t, gf_posix_mt_mdata_attr, + gf_posix_mt_uring_ctx, gf_posix_mt_end }; #endif diff --git a/xlators/storage/posix/src/posix-messages.h b/xlators/storage/posix/src/posix-messages.h index f5bede266d..46518eb616 100644 --- a/xlators/storage/posix/src/posix-messages.h +++ b/xlators/storage/posix/src/posix-messages.h @@ -69,6 +69,6 @@ GLFS_MSGID(POSIX, P_MSG_XATTR_FAILED, P_MSG_NULL_GFID, P_MSG_FCNTL_FAILED, P_MSG_FETCHMDATA_FAILED, P_MSG_GETMDATA_FAILED, P_MSG_SETMDATA_FAILED, P_MSG_FRESHFILE, P_MSG_MUTEX_FAILED, P_MSG_COPY_FILE_RANGE_FAILED, P_MSG_TIMER_DELETE_FAILED, P_MSG_NOMEM, - P_MSG_PSTAT_FAILED, P_MSG_FDSTAT_FAILED); + P_MSG_PSTAT_FAILED, P_MSG_FDSTAT_FAILED, P_MSG_POSIX_IO_URING); #endif /* !_GLUSTERD_MESSAGES_H_ */ diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index b8db146eef..92f7272325 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -38,6 +38,11 @@ #include "posix-aio.h" #endif +#ifdef HAVE_LIBURING +#include <liburing.h> +#include "posix-io-uring.h" +#endif + #define VECTOR_SIZE 64 * 1024 /* vector size 64KB*/ #define MAX_NO_VECT 1024 @@ -252,6 +257,18 @@ struct posix_private { gf_boolean_t aio_init_done; gf_boolean_t aio_capable; uint32_t rel_fdcount; + + /*io_uring related.*/ + gf_boolean_t io_uring_configured; +#ifdef HAVE_LIBURING + struct io_uring ring; + gf_boolean_t io_uring_init_done; + gf_boolean_t io_uring_capable; + gf_boolean_t uring_thread_exit; + pthread_t uring_thread; + pthread_mutex_t sq_mutex; + pthread_mutex_t cq_mutex; +#endif }; typedef struct { |