summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRavishankar N <ravishankar@redhat.com>2020-09-04 10:37:32 +0530
committerXavi Hernandez <xhernandez@users.noreply.github.com>2020-10-23 19:12:35 +0200
commit4b9f95b6993b606b2f97d55694910c2cbe3107a2 (patch)
tree89f47782ed0460ae088948d1d04783267cb066cb
parentf5e1eb87d4af44be3b317b7f99ab88f89c2f0b1a (diff)
downloadglusterfs-4b9f95b6993b606b2f97d55694910c2cbe3107a2.tar.gz
glusterfs-4b9f95b6993b606b2f97d55694910c2cbe3107a2.tar.xz
glusterfs-4b9f95b6993b606b2f97d55694910c2cbe3107a2.zip
posix: add io_uring support
This patch adds support for reads, writes and fsyncs using io-uring in posix xlator. A volume option 'storage.linux-io_uring' is introduced to turn it on or off. When I ran tests with fio and iozone on physical machines on a 2x3 volume + single client, the numbers were not any worse than regular pwrite/pread syscalls which posix does with default volume configurations. But given that io_uring itself undergoing rapid improvement and bug fixes,it is good to have an intial working implementation in gluster and iteratively develop on top of it. Updates: #1398 Change-Id: Ia47456ebb4c16a3b66ad9beb6a9043cc090fed2b Signed-off-by: Ravishankar N <ravishankar@redhat.com>
-rw-r--r--configure.ac11
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-op-sm.c19
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c3
-rw-r--r--xlators/storage/posix/src/Makefile.am6
-rw-r--r--xlators/storage/posix/src/posix-common.c23
-rw-r--r--xlators/storage/posix/src/posix-handle.h3
-rw-r--r--xlators/storage/posix/src/posix-io-uring.c633
-rw-r--r--xlators/storage/posix/src/posix-io-uring.h31
-rw-r--r--xlators/storage/posix/src/posix-mem-types.h1
-rw-r--r--xlators/storage/posix/src/posix-messages.h2
-rw-r--r--xlators/storage/posix/src/posix.h17
11 files changed, 744 insertions, 5 deletions
diff --git a/configure.ac b/configure.ac
index d40b8c6d92..2ce2a91137 100644
--- a/configure.ac
+++ b/configure.ac
@@ -6,6 +6,7 @@ dnl General Public License, version 3 or any later version (LGPLv3 or
dnl later), or the GNU General Public License, version 2 (GPLv2), in all
dnl cases as published by the Free Software Foundation.
+// clang-format off
AC_INIT([glusterfs],
[m4_esyscmd([build-aux/pkg-version --version])],
[gluster-users@gluster.org],,[https://github.com/gluster/glusterfs.git])
@@ -1427,6 +1428,14 @@ if test -n "$LIBAIO"; then
BUILD_LIBAIO=yes
fi
+BUILD_LIBURING=no
+AC_CHECK_HEADERS([liburing.h])
+AC_CHECK_LIB([uring],[io_uring_queue_init],[LIBURING="-luring"])
+
+if test -n "$LIBURING"; then
+ AC_DEFINE(HAVE_LIBURING, 1, [io-uring based POSIX enabled])
+ BUILD_LIBURING=yes
+fi
dnl gnfs section
BUILD_GNFS="no"
RPCBIND_SERVICE=""
@@ -1608,6 +1617,7 @@ AC_SUBST(GF_FUSE_LDADD)
AC_SUBST(GF_FUSE_CFLAGS)
AC_SUBST(RLLIBS)
AC_SUBST(LIBAIO)
+AC_SUBST(LIBURING)
AC_SUBST(AM_MAKEFLAGS)
AC_SUBST(AM_LIBTOOLFLAGS)
AC_SUBST(GF_NO_UNDEFINED)
@@ -1690,6 +1700,7 @@ echo "fusermount : $BUILD_FUSERMOUNT"
echo "readline : $BUILD_READLINE"
echo "georeplication : $BUILD_SYNCDAEMON"
echo "Linux-AIO : $BUILD_LIBAIO"
+echo "Linux-io_uring : $BUILD_LIBURING"
echo "Enable Debug : $BUILD_DEBUG"
echo "Run with Valgrind : $VALGRIND_TOOL"
echo "Sanitizer enabled : $SANITIZER"
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
index c537fc33a8..7762fe44fa 100644
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
@@ -1224,7 +1224,24 @@ glusterd_op_stage_set_volume(dict_t *dict, char **op_errstr)
key = key_fixed;
keylen = strlen(key_fixed);
}
-
+#ifdef HAVE_LIBURING
+ if (len_strcmp(key, keylen, "storage.linux-io_uring")) {
+ if (volinfo == NULL) {
+ snprintf(errstr, sizeof(errstr), "vol info is NULL for %s.",
+ volname);
+ ret = -1;
+ goto out;
+ }
+ if (volinfo->status == GLUSTERD_STATUS_STARTED) {
+ snprintf(errstr, sizeof(errstr),
+ "Changing this option is "
+ "not supported when volume is in started state. "
+ "Please stop the volume.");
+ ret = -1;
+ goto out;
+ }
+ }
+#endif
if (len_strcmp(key, keylen, "cluster.granular-entry-heal")) {
/* For granular entry-heal, if the set command was
* invoked through volume-set CLI, then allow the
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 398b4d76f5..fb93bc2432 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -2387,6 +2387,9 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = GD_OP_VERSION_3_8_0,
},
{.key = "storage.linux-aio", .voltype = "storage/posix", .op_version = 1},
+ {.key = "storage.linux-io_uring",
+ .voltype = "storage/posix",
+ .op_version = GD_OP_VERSION_9_0},
{.key = "storage.batch-fsync-mode",
.voltype = "storage/posix",
.op_version = 3},
diff --git a/xlators/storage/posix/src/Makefile.am b/xlators/storage/posix/src/Makefile.am
index c080a229ff..ed69720f5e 100644
--- a/xlators/storage/posix/src/Makefile.am
+++ b/xlators/storage/posix/src/Makefile.am
@@ -7,13 +7,13 @@ posix_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c posix-aio.c \
posix-gfid-path.c posix-entry-ops.c posix-inode-fd-ops.c \
- posix-common.c posix-metadata.c
+ posix-common.c posix-metadata.c posix-io-uring.c
posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBAIO) \
- $(ACL_LIBS)
+ $(LIBURING) $(ACL_LIBS)
noinst_HEADERS = posix.h posix-mem-types.h posix-handle.h posix-aio.h \
posix-messages.h posix-gfid-path.h posix-inode-handle.h \
- posix-metadata.h posix-metadata-disk.h
+ posix-metadata.h posix-metadata-disk.h posix-io-uring.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
diff --git a/xlators/storage/posix/src/posix-common.c b/xlators/storage/posix/src/posix-common.c
index f10722ec3f..db30b595d1 100644
--- a/xlators/storage/posix/src/posix-common.c
+++ b/xlators/storage/posix/src/posix-common.c
@@ -45,6 +45,7 @@
#include <glusterfs/timer.h>
#include "glusterfs3-xdr.h"
#include "posix-aio.h"
+#include "posix-io-uring.h"
#include <glusterfs/glusterfs-acl.h>
#include "posix-messages.h"
#include <glusterfs/events.h>
@@ -359,6 +360,14 @@ posix_reconfigure(xlator_t *this, dict_t *options)
else
posix_aio_off(this);
+ GF_OPTION_RECONF("linux-io_uring", priv->io_uring_configured, options, bool,
+ out);
+
+ if (priv->io_uring_configured)
+ posix_io_uring_on(this);
+ else
+ posix_io_uring_off(this);
+
GF_OPTION_RECONF("update-link-count-parent", priv->update_pgfid_nlinks,
options, bool, out);
@@ -1049,6 +1058,14 @@ posix_init(xlator_t *this)
}
}
+ GF_OPTION_INIT("linux-io_uring", _private->io_uring_configured, bool, out);
+ if (_private->io_uring_configured) {
+ op_ret = posix_io_uring_on(this);
+ if (op_ret < 0) {
+ _private->io_uring_configured = _gf_false;
+ }
+ }
+
GF_OPTION_INIT("node-uuid-pathinfo", _private->node_uuid_pathinfo, bool,
out);
if (_private->node_uuid_pathinfo &&
@@ -1325,6 +1342,12 @@ struct volume_options posix_options[] = {
.description = "Support for native Linux AIO",
.op_version = {1},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+ {.key = {"linux-io_uring"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Support for Linux io_uring",
+ .op_version = {GD_OP_VERSION_9_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{.key = {"brick-uid"},
.type = GF_OPTION_TYPE_INT,
.min = -1,
diff --git a/xlators/storage/posix/src/posix-handle.h b/xlators/storage/posix/src/posix-handle.h
index f33ed92620..940939ec6e 100644
--- a/xlators/storage/posix/src/posix-handle.h
+++ b/xlators/storage/posix/src/posix-handle.h
@@ -218,4 +218,7 @@ posix_check_internal_writes(xlator_t *this, fd_t *fd, int sysfd, dict_t *xdata);
void
posix_disk_space_check(xlator_t *this);
+
+dict_t *
+_fill_writev_xdata(fd_t *fd, dict_t *xdata, xlator_t *this, int is_append);
#endif /* !_POSIX_HANDLE_H */
diff --git a/xlators/storage/posix/src/posix-io-uring.c b/xlators/storage/posix/src/posix-io-uring.c
new file mode 100644
index 0000000000..567a6a83ef
--- /dev/null
+++ b/xlators/storage/posix/src/posix-io-uring.c
@@ -0,0 +1,633 @@
+/*
+ Copyright (c) 2020 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "posix.h"
+#include "posix-messages.h"
+#include "posix-io-uring.h"
+#include "posix-handle.h"
+
+#ifdef HAVE_LIBURING
+#include <liburing.h>
+
+struct posix_uring_ctx;
+typedef void(fop_unwind_f)(struct posix_uring_ctx *, int32_t);
+typedef void(fop_prep_f)(struct io_uring_sqe *sqe, struct posix_uring_ctx *);
+static int
+posix_io_uring_submit(xlator_t *this, struct posix_uring_ctx *ctx);
+
+struct posix_uring_ctx {
+ call_frame_t *frame;
+ struct iatt prebuf;
+ dict_t *xdata;
+ fd_t *fd;
+ int _fd;
+ int op;
+
+ union {
+ struct {
+ struct iovec *iov;
+ int count;
+ off_t offset;
+ } write;
+
+ struct {
+ struct iobuf *iobuf;
+ struct iovec iovec;
+ off_t offset;
+ } read;
+
+ struct {
+ int32_t datasync;
+ } fsync;
+ } fop;
+
+ fop_prep_f *prepare;
+ fop_unwind_f *unwind;
+};
+
+static void
+posix_io_uring_ctx_free(struct posix_uring_ctx *ctx)
+{
+ if (!ctx)
+ return;
+ if (ctx->fd)
+ fd_unref(ctx->fd);
+ if (ctx->xdata)
+ dict_unref(ctx->xdata);
+ switch (ctx->op) {
+ case GF_FOP_READ:
+ if (ctx->fop.read.iobuf)
+ iobuf_unref(ctx->fop.read.iobuf);
+ break;
+ default:
+ break;
+ }
+ GF_FREE(ctx);
+}
+
+struct posix_uring_ctx *
+posix_io_uring_ctx_init(call_frame_t *frame, xlator_t *this, fd_t *fd, int op,
+ fop_prep_f prepare, fop_unwind_f unwind,
+ int32_t *op_errno, dict_t *xdata)
+{
+ struct posix_uring_ctx *ctx = NULL;
+ struct posix_fd *pfd = NULL;
+ int ret = 0;
+
+ ctx = GF_CALLOC(1, sizeof(*ctx), gf_posix_mt_uring_ctx);
+ if (!ctx) {
+ return NULL;
+ }
+
+ ctx->frame = frame;
+ ctx->fd = fd_ref(fd);
+ ctx->prepare = prepare;
+ ctx->unwind = unwind;
+ if (xdata)
+ ctx->xdata = dict_ref(xdata);
+ ctx->op = op;
+
+ ret = posix_fd_ctx_get(fd, this, &pfd, op_errno);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, *op_errno, P_MSG_PFD_NULL,
+ "pfd is NULL from fd=%p", fd);
+ goto err;
+ }
+ ctx->_fd = pfd->fd;
+
+ /* TODO: Explore filling up pre and post bufs using IOSQE_IO_LINK*/
+ if ((op == GF_FOP_WRITE) || (op == GF_FOP_FSYNC)) {
+ if (posix_fdstat(this, fd->inode, pfd->fd, &ctx->prebuf) != 0) {
+ *op_errno = errno;
+ gf_msg(this->name, GF_LOG_ERROR, *op_errno, P_MSG_FSTAT_FAILED,
+ "fstat failed on fd=%p", fd);
+ goto err;
+ }
+ }
+
+ return ctx;
+
+err:
+ posix_io_uring_ctx_free(ctx);
+ return NULL;
+}
+
+static void
+posix_io_uring_readv_complete(struct posix_uring_ctx *ctx, int32_t res)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ struct posix_private *priv = NULL;
+ struct iobref *iobref = NULL;
+ struct iobuf *iobuf = NULL;
+ struct iatt postbuf = {
+ 0,
+ };
+ struct iovec iov = {
+ 0,
+ };
+ fd_t *fd = NULL;
+ int _fd = -1;
+ int ret = 0;
+ int op_ret = -1;
+ int op_errno = 0;
+ off_t offset = 0;
+
+ frame = ctx->frame;
+ this = frame->this;
+ priv = this->private;
+ fd = ctx->fd;
+ _fd = ctx->_fd;
+ iobuf = ctx->fop.read.iobuf;
+ offset = ctx->fop.read.offset;
+
+ if (res < 0) {
+ op_ret = -1;
+ op_errno = -res;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_READV_FAILED,
+ "readv(async) failed fd=%d.", _fd);
+ goto out;
+ }
+
+ ret = posix_fdstat(this, fd->inode, _fd, &postbuf);
+ if (ret != 0) {
+ op_ret = -1;
+ op_errno = errno;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_FSTAT_FAILED,
+ "fstat failed on fd=%d", _fd);
+ goto out;
+ }
+
+ op_ret = res;
+ op_errno = 0;
+
+ iobref = iobref_new();
+ if (!iobref) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ iobref_add(iobref, iobuf);
+
+ iov.iov_base = iobuf_ptr(iobuf);
+ iov.iov_len = op_ret;
+
+ /* Hack to notify higher layers of EOF. */
+ if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size)
+ op_errno = ENOENT;
+
+ GF_ATOMIC_ADD(priv->read_value, op_ret);
+ // response xdata is used only for cloudsync, so ignore for now.
+out:
+ STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, &iov, 1, &postbuf,
+ iobref, NULL);
+ if (iobref)
+ iobref_unref(iobref);
+ posix_io_uring_ctx_free(ctx);
+}
+
+static void
+posix_prep_readv(struct io_uring_sqe *sqe, struct posix_uring_ctx *ctx)
+{
+ sqe->flags |= IOSQE_ASYNC;
+ io_uring_prep_readv(sqe, ctx->_fd, &ctx->fop.read.iovec, 1,
+ ctx->fop.read.offset);
+}
+
+int
+posix_io_uring_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
+{
+ struct posix_uring_ctx *ctx = NULL;
+ int32_t op_errno = ENOMEM;
+ struct iobuf *iobuf = NULL;
+ int ret = 0;
+
+ ctx = posix_io_uring_ctx_init(
+ frame, this, fd, GF_FOP_READ, posix_prep_readv,
+ posix_io_uring_readv_complete, &op_errno, xdata);
+ if (!ctx) {
+ goto err;
+ }
+
+ iobuf = iobuf_get2(this->ctx->iobuf_pool, size);
+ if (!iobuf) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ ctx->fop.read.iobuf = iobuf;
+ ctx->fop.read.iovec.iov_base = iobuf_ptr(iobuf);
+ ctx->fop.read.iovec.iov_len = size;
+ ctx->fop.read.offset = offset;
+
+ ret = posix_io_uring_submit(this, ctx);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, P_MSG_POSIX_IO_URING,
+ "Failed to submit sqe");
+ op_errno = -ret;
+ goto err;
+ }
+ if (ret == 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, P_MSG_POSIX_IO_URING,
+ "submit sqe got zero");
+ }
+ return 0;
+err:
+ STACK_UNWIND_STRICT(readv, frame, -1, op_errno, NULL, 1, NULL, NULL, NULL);
+ posix_io_uring_ctx_free(ctx);
+ return 0;
+}
+
+static void
+posix_writev_fill_rsp_dict(struct posix_uring_ctx *ctx, xlator_t *this,
+ dict_t **rsp_xdata)
+{
+ int is_append = 0;
+
+ if (ctx->xdata && dict_get(ctx->xdata, GLUSTERFS_WRITE_IS_APPEND)) {
+ if (ctx->prebuf.ia_size == ctx->fop.write.offset ||
+ (ctx->fd->flags & O_APPEND))
+ is_append = 1;
+ }
+ *rsp_xdata = _fill_writev_xdata(ctx->fd, ctx->xdata, this, is_append);
+}
+
+static void
+posix_io_uring_writev_complete(struct posix_uring_ctx *ctx, int32_t res)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ struct posix_private *priv = NULL;
+ struct iatt postbuf = {
+ 0,
+ };
+ fd_t *fd = NULL;
+ int _fd = -1;
+ int ret = 0;
+ int op_ret = -1;
+ int op_errno = 0;
+ dict_t *rsp_xdata = NULL;
+ frame = ctx->frame;
+ this = frame->this;
+ priv = this->private;
+ fd = ctx->fd;
+ _fd = ctx->_fd;
+
+ if (res < 0) {
+ op_ret = -1;
+ op_errno = -res;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_WRITEV_FAILED,
+ "writev(async) failed fd=%d.", _fd);
+ goto out;
+ }
+
+ ret = posix_fdstat(this, fd->inode, _fd, &postbuf);
+ if (ret != 0) {
+ op_ret = -1;
+ op_errno = errno;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_FSTAT_FAILED,
+ "fstat failed on fd=%d", _fd);
+ goto out;
+ }
+
+ op_ret = res;
+ op_errno = 0;
+ posix_writev_fill_rsp_dict(ctx, this, &rsp_xdata);
+ GF_ATOMIC_ADD(priv->write_value, op_ret);
+out:
+ STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, &ctx->prebuf, &postbuf,
+ rsp_xdata);
+ if (rsp_xdata)
+ dict_unref(rsp_xdata);
+ posix_io_uring_ctx_free(ctx);
+}
+
+static void
+posix_prep_writev(struct io_uring_sqe *sqe, struct posix_uring_ctx *ctx)
+{
+ io_uring_prep_writev(sqe, ctx->_fd, ctx->fop.write.iov,
+ ctx->fop.write.count, ctx->fop.write.offset);
+}
+
+int
+posix_io_uring_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *iov, int count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{
+ struct posix_uring_ctx *ctx = NULL;
+ int32_t op_errno = ENOMEM;
+ int ret = 0;
+
+ ctx = posix_io_uring_ctx_init(
+ frame, this, fd, GF_FOP_WRITE, posix_prep_writev,
+ posix_io_uring_writev_complete, &op_errno, xdata);
+ if (!ctx) {
+ goto err;
+ }
+
+ ctx->fop.write.iov = iov;
+ ctx->fop.write.count = count;
+ ctx->fop.write.offset = offset;
+
+ ret = posix_io_uring_submit(this, ctx);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, P_MSG_POSIX_IO_URING,
+ "Failed to submit sqe");
+ op_errno = -ret;
+ goto err;
+ }
+ if (ret == 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, P_MSG_POSIX_IO_URING,
+ "submit sqe got zero");
+ }
+ return 0;
+err:
+ STACK_UNWIND_STRICT(writev, frame, -1, op_errno, 0, 0, 0);
+ posix_io_uring_ctx_free(ctx);
+ return 0;
+}
+
+static void
+posix_io_uring_fsync_complete(struct posix_uring_ctx *ctx, int32_t res)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ struct posix_private *priv = NULL;
+ struct iatt postbuf = {
+ 0,
+ };
+ fd_t *fd = NULL;
+ int _fd = -1;
+ int ret = 0;
+ int op_ret = -1;
+ int op_errno = 0;
+
+ frame = ctx->frame;
+ this = frame->this;
+ priv = this->private;
+ fd = ctx->fd;
+ _fd = ctx->_fd;
+
+ if (res < 0) {
+ op_ret = -1;
+ op_errno = -res;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_FSYNC_FAILED,
+ "writev(async) failed fd=%d.", _fd);
+ goto out;
+ }
+
+ ret = posix_fdstat(this, fd->inode, _fd, &postbuf);
+ if (ret != 0) {
+ op_ret = -1;
+ op_errno = errno;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_FSTAT_FAILED,
+ "fstat failed on fd=%d", _fd);
+ goto out;
+ }
+
+ op_ret = res;
+ op_errno = 0;
+ GF_ATOMIC_ADD(priv->write_value, op_ret);
+out:
+ STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, &ctx->prebuf, &postbuf,
+ NULL);
+ posix_io_uring_ctx_free(ctx);
+}
+
+static void
+posix_prep_fsync(struct io_uring_sqe *sqe, struct posix_uring_ctx *ctx)
+{
+ io_uring_prep_fsync(sqe, ctx->_fd, ctx->fop.fsync.datasync);
+}
+
+int
+posix_io_uring_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t datasync, dict_t *xdata)
+{
+ struct posix_uring_ctx *ctx = NULL;
+ int32_t op_errno = ENOMEM;
+ int ret = 0;
+
+ ctx = posix_io_uring_ctx_init(
+ frame, this, fd, GF_FOP_FSYNC, posix_prep_fsync,
+ posix_io_uring_fsync_complete, &op_errno, xdata);
+ if (!ctx) {
+ goto err;
+ }
+
+ if (datasync)
+ ctx->fop.fsync.datasync |= IORING_FSYNC_DATASYNC;
+
+ ret = posix_io_uring_submit(this, ctx);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, P_MSG_POSIX_IO_URING,
+ "Failed to submit sqe");
+ op_errno = -ret;
+ goto err;
+ }
+ if (ret == 0) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, P_MSG_POSIX_IO_URING,
+ "submit sqe got zero");
+ }
+ return 0;
+err:
+ posix_io_uring_ctx_free(ctx);
+ STACK_UNWIND_STRICT(fsync, frame, -1, op_errno, 0, 0, NULL);
+ return 0;
+}
+
+static int
+posix_io_uring_submit(xlator_t *this, struct posix_uring_ctx *ctx)
+{
+ struct posix_private *priv = this->private;
+ struct io_uring_sqe *sqe = NULL;
+ int ret = 0;
+
+ pthread_mutex_lock(&priv->sq_mutex);
+ {
+ sqe = io_uring_get_sqe(&priv->ring);
+ if (!sqe) {
+ /*TODO: Retry until we get an sqe instead of failing. */
+ pthread_mutex_unlock(&priv->sq_mutex);
+ ret = -EAGAIN;
+ gf_msg(this->name, GF_LOG_ERROR, -ret, P_MSG_POSIX_IO_URING,
+ "Failed to get sqe");
+ goto out;
+ }
+ ctx->prepare(sqe, ctx);
+ io_uring_sqe_set_data(sqe, ctx);
+ ret = io_uring_submit(&priv->ring);
+ }
+ pthread_mutex_unlock(&priv->sq_mutex);
+
+out:
+ return ret;
+}
+
+static void *
+posix_io_uring_thread(void *data)
+{
+ xlator_t *this = NULL;
+ struct posix_private *priv = NULL;
+ int ret = 0;
+ int32_t res = 0;
+ struct io_uring_cqe *cqe = NULL;
+ struct posix_uring_ctx *ctx = NULL;
+
+ this = data;
+ THIS = this;
+ priv = this->private;
+ while (1) {
+ pthread_mutex_lock(&priv->cq_mutex);
+ {
+ ret = io_uring_wait_cqe(&priv->ring, &cqe);
+ }
+ pthread_mutex_unlock(&priv->cq_mutex);
+ if (ret != 0) {
+ if (ret == -EINTR)
+ continue;
+ gf_msg(this->name, GF_LOG_WARNING, -ret, P_MSG_POSIX_IO_URING,
+ "Unable to get cqe. Exiting.");
+ abort();
+ }
+
+ ctx = (struct posix_uring_ctx *)io_uring_cqe_get_data(cqe);
+ if (priv->uring_thread_exit == _gf_true && ctx == NULL)
+ pthread_exit(NULL);
+ res = cqe->res;
+ io_uring_cqe_seen(&priv->ring, cqe);
+ ctx->unwind(ctx, res);
+ }
+
+ return NULL;
+}
+
+int
+posix_io_uring_init(xlator_t *this)
+{
+ int ret = -1;
+ unsigned flags = 0;
+ struct posix_private *priv = this->private;
+
+ // TODO:Try-out flags |= IORING_SETUP_IOPOLL;
+ ret = io_uring_queue_init(POSIX_URING_MAX_ENTRIES, &priv->ring, flags);
+ if (ret == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_POSIX_IO_URING,
+ "io_uring init failed.");
+ goto out;
+ }
+
+ pthread_mutex_init(&priv->sq_mutex, NULL);
+ pthread_mutex_init(&priv->cq_mutex, NULL);
+ ret = gf_thread_create(&priv->uring_thread, NULL, posix_io_uring_thread,
+ this, "posix-iouring");
+ if (ret != 0) {
+ io_uring_queue_exit(&priv->ring);
+ pthread_mutex_destroy(&priv->sq_mutex);
+ pthread_mutex_destroy(&priv->cq_mutex);
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+static int
+posix_io_uring_drain(struct posix_private *priv)
+{
+ struct io_uring_sqe *sqe = NULL;
+ int ret = -1;
+
+ priv->uring_thread_exit = _gf_true;
+ sqe = io_uring_get_sqe(&priv->ring);
+ if (!sqe)
+ return ret;
+
+ io_uring_sqe_set_flags(sqe, IOSQE_IO_DRAIN);
+ io_uring_sqe_set_data(sqe, NULL);
+ io_uring_prep_nop(sqe);
+ ret = io_uring_submit(&priv->ring);
+
+ return ret;
+}
+
+void
+posix_io_uring_fini(xlator_t *this)
+{
+ struct posix_private *priv = this->private;
+
+ posix_io_uring_drain(priv);
+ (void)pthread_join(priv->uring_thread, NULL);
+ io_uring_queue_exit(&priv->ring);
+ pthread_mutex_destroy(&priv->sq_mutex);
+ pthread_mutex_destroy(&priv->cq_mutex);
+}
+
+int
+posix_io_uring_on(xlator_t *this)
+{
+ struct posix_private *priv = NULL;
+ int ret = -1;
+
+ priv = this->private;
+
+ if (!priv->io_uring_init_done) {
+ ret = posix_io_uring_init(this);
+ if (ret == 0)
+ priv->io_uring_capable = _gf_true;
+ else
+ priv->io_uring_capable = _gf_false;
+ priv->io_uring_init_done = _gf_true;
+ }
+
+ if (priv->io_uring_capable) {
+ this->fops->readv = posix_io_uring_readv;
+ this->fops->writev = posix_io_uring_writev;
+ this->fops->fsync = posix_io_uring_fsync;
+ ret = 0;
+ }
+
+ if (ret != 0) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_POSIX_IO_URING,
+ "Posix io_uring init failed, falling back to the previous "
+ "IO mechanism.");
+ }
+ return ret;
+}
+
+int
+posix_io_uring_off(xlator_t *this)
+{
+ this->fops->readv = posix_readv;
+ this->fops->writev = posix_writev;
+ this->fops->fsync = posix_fsync;
+ posix_io_uring_fini(this);
+
+ return 0;
+}
+
+#else
+int
+posix_io_uring_on(xlator_t *this)
+{
+ gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_AIO_UNAVAILABLE,
+ "Linux io_uring not available at build-time. "
+ "Continuing with synchronous IO");
+ return -1;
+}
+
+int
+posix_io_uring_off(xlator_t *this)
+{
+ return 0;
+}
+
+#endif
diff --git a/xlators/storage/posix/src/posix-io-uring.h b/xlators/storage/posix/src/posix-io-uring.h
new file mode 100644
index 0000000000..3d902e8b32
--- /dev/null
+++ b/xlators/storage/posix/src/posix-io-uring.h
@@ -0,0 +1,31 @@
+/*
+ Copyright (c) 2020 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _POSIX_IO_URING_H
+#define _POSIX_IO_URING_H
+
+#define POSIX_URING_MAX_ENTRIES 512
+int
+posix_io_uring_on(xlator_t *this);
+
+int
+posix_io_uring_off(xlator_t *this);
+
+#ifdef HAVE_LIBURING
+int
+posix_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata);
+
+int
+posix_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata);
+#endif
+
+#endif /* _POSIX_IO_URING_H */
diff --git a/xlators/storage/posix/src/posix-mem-types.h b/xlators/storage/posix/src/posix-mem-types.h
index 2253f381ac..81957736a4 100644
--- a/xlators/storage/posix/src/posix-mem-types.h
+++ b/xlators/storage/posix/src/posix-mem-types.h
@@ -20,6 +20,7 @@ enum gf_posix_mem_types_ {
gf_posix_mt_paiocb,
gf_posix_mt_inode_ctx_t,
gf_posix_mt_mdata_attr,
+ gf_posix_mt_uring_ctx,
gf_posix_mt_end
};
#endif
diff --git a/xlators/storage/posix/src/posix-messages.h b/xlators/storage/posix/src/posix-messages.h
index f5bede266d..46518eb616 100644
--- a/xlators/storage/posix/src/posix-messages.h
+++ b/xlators/storage/posix/src/posix-messages.h
@@ -69,6 +69,6 @@ GLFS_MSGID(POSIX, P_MSG_XATTR_FAILED, P_MSG_NULL_GFID, P_MSG_FCNTL_FAILED,
P_MSG_FETCHMDATA_FAILED, P_MSG_GETMDATA_FAILED,
P_MSG_SETMDATA_FAILED, P_MSG_FRESHFILE, P_MSG_MUTEX_FAILED,
P_MSG_COPY_FILE_RANGE_FAILED, P_MSG_TIMER_DELETE_FAILED, P_MSG_NOMEM,
- P_MSG_PSTAT_FAILED, P_MSG_FDSTAT_FAILED);
+ P_MSG_PSTAT_FAILED, P_MSG_FDSTAT_FAILED, P_MSG_POSIX_IO_URING);
#endif /* !_GLUSTERD_MESSAGES_H_ */
diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h
index b8db146eef..92f7272325 100644
--- a/xlators/storage/posix/src/posix.h
+++ b/xlators/storage/posix/src/posix.h
@@ -38,6 +38,11 @@
#include "posix-aio.h"
#endif
+#ifdef HAVE_LIBURING
+#include <liburing.h>
+#include "posix-io-uring.h"
+#endif
+
#define VECTOR_SIZE 64 * 1024 /* vector size 64KB*/
#define MAX_NO_VECT 1024
@@ -252,6 +257,18 @@ struct posix_private {
gf_boolean_t aio_init_done;
gf_boolean_t aio_capable;
uint32_t rel_fdcount;
+
+ /*io_uring related.*/
+ gf_boolean_t io_uring_configured;
+#ifdef HAVE_LIBURING
+ struct io_uring ring;
+ gf_boolean_t io_uring_init_done;
+ gf_boolean_t io_uring_capable;
+ gf_boolean_t uring_thread_exit;
+ pthread_t uring_thread;
+ pthread_mutex_t sq_mutex;
+ pthread_mutex_t cq_mutex;
+#endif
};
typedef struct {