diff options
-rw-r--r-- | configure.ac | 3 | ||||
-rw-r--r-- | libglusterfs/src/common-utils.h | 1 | ||||
-rw-r--r-- | libglusterfs/src/glusterfs.h | 17 | ||||
-rw-r--r-- | xlators/features/Makefile.am | 2 | ||||
-rw-r--r-- | xlators/features/bit-rot/Makefile.am | 1 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/Makefile.am | 1 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/stub/Makefile.am | 17 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-common.h | 165 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h | 24 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-stub.c | 1428 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-stub.h | 269 | ||||
-rw-r--r-- | xlators/features/changelog/lib/src/changelog.h | 22 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volgen.c | 21 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 6 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix-helpers.c | 70 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix.c | 23 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix.h | 3 |
17 files changed, 2066 insertions, 7 deletions
diff --git a/configure.ac b/configure.ac index 81faaf4654..ee89ce9916 100644 --- a/configure.ac +++ b/configure.ac @@ -162,6 +162,9 @@ AC_CONFIG_FILES([Makefile xlators/features/upcall/src/Makefile xlators/features/shard/Makefile xlators/features/shard/src/Makefile + xlators/features/bit-rot/Makefile + xlators/features/bit-rot/src/Makefile + xlators/features/bit-rot/src/stub/Makefile xlators/playground/Makefile xlators/playground/template/Makefile xlators/playground/template/src/Makefile diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h index 5f67a162d5..c1deeef3c9 100644 --- a/libglusterfs/src/common-utils.h +++ b/libglusterfs/src/common-utils.h @@ -117,6 +117,7 @@ enum _gf_client_pid GF_CLIENT_PID_QUOTA_MOUNT = -5, GF_CLIENT_PID_AFR_SELF_HEALD = -6, GF_CLIENT_PID_GLFS_HEAL = -7, + GF_CLIENT_PID_BITD = -8, }; enum _gf_xlator_ipc_targets { diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index 095ca2a138..bcc9f57f99 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -117,6 +117,23 @@ #define GET_ANCESTRY_PATH_KEY "glusterfs.ancestry.path" #define GET_ANCESTRY_DENTRY_KEY "glusterfs.ancestry.dentry" +#define BITROT_DEFAULT_CURRENT_VERSION (unsigned long)1 +#define BITROT_DEFAULT_SIGNING_VERSION (unsigned long)0 + +/* on-disk object signature keys */ +#define BITROT_CURRENT_VERSION_KEY "trusted.glusterfs.bit-rot.version" +#define BITROT_SIGNING_VERSION_KEY "trusted.glusterfs.bit-rot.signature" + +/* GET/SET object signature */ +#define GLUSTERFS_GET_OBJECT_SIGNATURE "trusted.glusterfs.get-signature" +#define GLUSTERFS_SET_OBJECT_SIGNATURE "trusted.glusterfs.set-signature" + +/* operation needs to be durable on-disk */ +#define GLUSTERFS_DURABLE_OP "trusted.glusterfs.durable-op" + +/* key for version exchange b/w bitrot stub and changelog */ +#define GLUSTERFS_VERSION_XCHG_KEY "glusterfs.version.xchg" + #define GLUSTERFS_INTERNAL_FOP_KEY "glusterfs-internal-fop" #define ZR_FILE_CONTENT_STR "glusterfs.file." diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am index 8dc7051a30..7e5783f4f3 100644 --- a/xlators/features/Makefile.am +++ b/xlators/features/Makefile.am @@ -1,5 +1,5 @@ SUBDIRS = locks quota read-only mac-compat quiesce marker index barrier arbiter\ protect compress changelog changetimerecorder ganesha gfid-access $(GLUPY_SUBDIR) qemu-block \ - upcall snapview-client snapview-server trash shard #path-converter # filter + upcall snapview-client snapview-server trash shard bit-rot #path-converter # filter CLEANFILES = diff --git a/xlators/features/bit-rot/Makefile.am b/xlators/features/bit-rot/Makefile.am new file mode 100644 index 0000000000..f963effea2 --- /dev/null +++ b/xlators/features/bit-rot/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src
\ No newline at end of file diff --git a/xlators/features/bit-rot/src/Makefile.am b/xlators/features/bit-rot/src/Makefile.am new file mode 100644 index 0000000000..7581732c7d --- /dev/null +++ b/xlators/features/bit-rot/src/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = stub diff --git a/xlators/features/bit-rot/src/stub/Makefile.am b/xlators/features/bit-rot/src/stub/Makefile.am new file mode 100644 index 0000000000..9abcbb76db --- /dev/null +++ b/xlators/features/bit-rot/src/stub/Makefile.am @@ -0,0 +1,17 @@ +xlator_LTLIBRARIES = bitrot-stub.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +bitrot_stub_la_LDFLAGS = -module -avoid-version + +bitrot_stub_la_SOURCES = bit-rot-stub.c +bitrot_stub_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = bit-rot-stub.h bit-rot-common.h bit-rot-stub-mem-types.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/xlators/features/changelog/lib/src + + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/bit-rot/src/stub/bit-rot-common.h b/xlators/features/bit-rot/src/stub/bit-rot-common.h new file mode 100644 index 0000000000..9e10152355 --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-common.h @@ -0,0 +1,165 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __BIT_ROT_COMMON_H__ +#define __BIT_ROT_COMMON_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" + +/** + * on-disk formats for ongoing version and object signature. + */ +typedef struct br_version { + unsigned long ongoingversion; + uint32_t timebuf[2]; +} br_version_t; + +typedef struct br_signature { + int8_t signaturetype; + + unsigned long signedversion; + + char signature[0]; +} br_signature_t; + +#define BR_VXATTR_VERSION (1 << 0) +#define BR_VXATTR_SIGNATURE (1 << 1) + +#define BR_VXATTR_ALL_MISSING \ + (BR_VXATTR_VERSION | BR_VXATTR_SIGNATURE) + +typedef enum br_vxattr_state { + BR_VXATTR_STATUS_MISSING = 0, + BR_VXATTR_STATUS_PARTIAL = 1, + BR_VXATTR_STATUS_FULL = 2, +} br_vxattr_status_t; + +static inline br_vxattr_status_t +br_version_xattr_state (dict_t *xattr, + br_version_t **obuf, br_signature_t **sbuf) +{ + int32_t ret = 0; + int32_t vxattr = 0; + br_vxattr_status_t status; + + ret = dict_get_bin (xattr, BITROT_CURRENT_VERSION_KEY, (void **)obuf); + if (ret) + vxattr |= BR_VXATTR_VERSION; + + ret = dict_get_bin (xattr, BITROT_SIGNING_VERSION_KEY, (void **)sbuf); + if (ret) + vxattr |= BR_VXATTR_SIGNATURE; + + switch (vxattr) { + case 0: + status = BR_VXATTR_STATUS_FULL; + break; + case BR_VXATTR_ALL_MISSING: + status = BR_VXATTR_STATUS_MISSING; + break; + default: + status = BR_VXATTR_STATUS_PARTIAL; + } + + return status; +} + +/** + * in-memory representation of signature used by signer for object + * signing. + */ +typedef struct br_isignature_in { + int8_t signaturetype; /* signature type */ + + unsigned long signedversion; /* version against which the + object was signed */ + + char signature[0]; /* object signature */ +} br_isignature_t; + +/** + * in-memory representation of signature used by scrubber for object + * verification. + */ +typedef struct br_isignature_out { + char stale; /* stale signature? */ + + uint32_t time[2]; /* time when the object + got dirtied */ + + int8_t signaturetype; /* hash type */ + char signature[0]; /* signature (hash) */ +} br_isignature_out_t; + +typedef struct br_stub_init { + uint32_t timebuf[2]; + char export[PATH_MAX]; +} br_stub_init_t; + +typedef enum { + BR_SIGNATURE_TYPE_VOID = -1, /* object is not signed */ + BR_SIGNATURE_TYPE_ZERO = 0, /* min boundary */ + BR_SIGNATURE_TYPE_SHA256 = 1, /* signed with SHA256 */ + BR_SIGNATURE_TYPE_MAX = 2, /* max boundary */ +} br_signature_type; + +/* BitRot stub start time (virtual xattr) */ +#define GLUSTERFS_GET_BR_STUB_INIT_TIME "trusted.glusterfs.bit-rot.stub-init" + +static inline int +br_is_signature_type_valid (int8_t signaturetype) +{ + return ((signaturetype > BR_SIGNATURE_TYPE_ZERO) + && (signaturetype < BR_SIGNATURE_TYPE_MAX)); +} + +static inline void +br_set_default_ongoingversion (br_version_t *buf, uint32_t *tv) +{ + buf->ongoingversion = BITROT_DEFAULT_CURRENT_VERSION; + buf->timebuf[0] = tv[0]; + buf->timebuf[1] = tv[1]; +} + +static inline void +br_set_default_signature (br_signature_t *buf, size_t *size) +{ + buf->signaturetype = (int8_t) BR_SIGNATURE_TYPE_VOID; + buf->signedversion = BITROT_DEFAULT_SIGNING_VERSION; + + *size = sizeof (br_signature_t); /* no signature */ +} + +static inline void +br_set_ongoingversion (br_version_t *buf, + unsigned long version, uint32_t *tv) +{ + buf->ongoingversion = version; + buf->timebuf[0] = tv[0]; + buf->timebuf[1] = tv[1]; +} + +static inline void +br_set_signature (br_signature_t *buf, + br_isignature_t *sign, size_t signaturelen, size_t *size) +{ + buf->signaturetype = sign->signaturetype; + buf->signedversion = ntohl (sign->signedversion); + + memcpy (buf->signature, sign->signature, signaturelen); + *size = sizeof (br_signature_t) + signaturelen; +} + +#endif /* __BIT_ROT_COMMON_H__ */ diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h new file mode 100644 index 0000000000..64779923fd --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h @@ -0,0 +1,24 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _BR_MEM_TYPES_H +#define _BR_MEM_TYPES_H + +#include "mem-types.h" + +enum br_mem_types { + gf_br_stub_mt_private_t = gf_common_mt_end + 1, + gf_br_stub_mt_version_t = gf_common_mt_end + 2, + gf_br_stub_mt_inode_ctx_t = gf_common_mt_end + 3, + gf_br_stub_mt_signature_t = gf_common_mt_end + 4, + gf_br_stub_mt_end +}; + +#endif diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c new file mode 100644 index 0000000000..420f145a84 --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c @@ -0,0 +1,1428 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <ctype.h> +#include <sys/uio.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "xlator.h" +#include "logging.h" +#include "changelog.h" +#include "compat-errno.h" + +#include "bit-rot-stub.h" +#include "bit-rot-stub-mem-types.h" + +#include "bit-rot-common.h" + +#define BR_STUB_REQUEST_COOKIE 0x1 + +int32_t +mem_acct_init (xlator_t *this) +{ + int32_t ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_br_stub_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, "Memory accounting" + " init failed"); + return ret; + } + + return ret; +} + +int32_t +init (xlator_t *this) +{ + char *tmp = NULL; + struct timeval tv = {0,}; + br_stub_private_t *priv = NULL; + + if (!this->children) { + gf_log (this->name, GF_LOG_ERROR, "FATAL: no children"); + goto error_return; + } + + priv = GF_CALLOC (1, sizeof (*priv), gf_br_stub_mt_private_t); + if (!priv) + goto error_return; + + priv->local_pool = mem_pool_new (br_stub_local_t, 512); + if (!priv->local_pool) + goto free_priv; + + GF_OPTION_INIT ("bitrot", priv->go, bool, free_mempool); + + GF_OPTION_INIT ("export", tmp, str, free_mempool); + memcpy (priv->export, tmp, strlen (tmp) + 1); + + (void) gettimeofday (&tv, NULL); + + /* boot time is in network endian format */ + priv->boot[0] = htonl (tv.tv_sec); + priv->boot[1] = htonl (tv.tv_usec); + + gf_log (this->name, GF_LOG_DEBUG, "bit-rot stub loaded"); + this->private = priv; + return 0; + + free_mempool: + mem_pool_destroy (priv->local_pool); + free_priv: + GF_FREE (priv); + error_return: + return -1; +} + +void +fini (xlator_t *this) +{ + br_stub_private_t *priv = this->private; + + if (!priv) + return; + this->private = NULL; + GF_FREE (priv); + + return; +} + +static inline int +br_stub_alloc_versions (br_version_t **obuf, + br_signature_t **sbuf, size_t signaturelen) +{ + void *mem = NULL; + size_t size = 0; + + if (obuf) + size += sizeof (br_version_t); + if (sbuf) + size += sizeof (br_signature_t) + signaturelen; + + mem = GF_CALLOC (1, size, gf_br_stub_mt_version_t); + if (!mem) + goto error_return; + + if (obuf) { + *obuf = (br_version_t *)mem; + mem = ((char *)mem + sizeof (br_version_t)); + } + if (sbuf) { + *sbuf = (br_signature_t *)mem; + } + + return 0; + + error_return: + return -1; +} + +static inline void +br_stub_dealloc_versions (void *mem) +{ + GF_FREE (mem); +} + +static inline br_stub_local_t * +br_stub_alloc_local (xlator_t *this) +{ + br_stub_private_t *priv = this->private; + + return mem_get0 (priv->local_pool); +} + +static inline void +br_stub_dealloc_local (br_stub_local_t *ptr) +{ + mem_put (ptr); +} + +static inline int +br_stub_prepare_default_request (xlator_t *this, dict_t *dict, + br_version_t *obuf, br_signature_t *sbuf) +{ + int32_t ret = 0; + size_t size = 0; + br_stub_private_t *priv = NULL; + + priv = this->private; + + /** Prepare ongoing version */ + br_set_default_ongoingversion (obuf, priv->boot); + ret = dict_set_static_bin (dict, BITROT_CURRENT_VERSION_KEY, + (void *)obuf, sizeof (br_version_t)); + if (ret) + return -1; + + /** Prepare signature version */ + br_set_default_signature (sbuf, &size); + return dict_set_static_bin (dict, BITROT_SIGNING_VERSION_KEY, + (void *)sbuf, size); +} + +static inline int +br_stub_prepare_version_request (xlator_t *this, dict_t *dict, + br_version_t *obuf, unsigned long oversion) +{ + br_stub_private_t *priv = NULL; + + priv = this->private; + br_set_ongoingversion (obuf, oversion, priv->boot); + + return dict_set_static_bin (dict, BITROT_CURRENT_VERSION_KEY, + (void *)obuf, sizeof (br_version_t)); +} + +static inline int +br_stub_prepare_signing_request (dict_t *dict, + br_signature_t *sbuf, + br_isignature_t *sign, size_t signaturelen) +{ + size_t size = 0; + + br_set_signature (sbuf, sign, signaturelen, &size); + + return dict_set_static_bin (dict, BITROT_SIGNING_VERSION_KEY, + (void *)sbuf, size); +} + +/** + * initialize an inode context starting with a given ongoing version. + * a fresh lookup() or a first creat() call initializes the inode + * context, hence the inode is marked dirty. this routine also + * initializes the transient inode version. + */ +static inline int +br_stub_init_inode_versions (xlator_t *this, fd_t *fd, inode_t *inode, + unsigned long version, gf_boolean_t markdirty) +{ + int32_t ret = 0; + br_stub_inode_ctx_t *ctx = NULL; + + ctx = GF_CALLOC (1, sizeof (br_stub_inode_ctx_t), + gf_br_stub_mt_inode_ctx_t); + if (!ctx) + goto error_return; + + (markdirty) ? __br_stub_mark_inode_dirty (ctx) + : __br_stub_mark_inode_synced (ctx); + __br_stub_set_ongoing_version (ctx, version); + __br_stub_reset_release_counters (ctx); + + if (fd) { + br_stub_require_release_call (this, fd); + __br_stub_track_openfd (fd, ctx); + } + ret = br_stub_set_inode_ctx (this, inode, ctx); + if (ret) + goto free_ctx; + return 0; + + free_ctx: + GF_FREE (ctx); + error_return: + return -1; +} + +/** + * modify the ongoing version of an inode. + */ +static inline int +br_stub_mod_inode_versions (xlator_t *this, + fd_t *fd, inode_t *inode, unsigned long version) +{ + int32_t ret = -1; + br_stub_inode_ctx_t *ctx = 0; + + LOCK (&inode->lock); + { + ctx = __br_stub_get_ongoing_version_ctx (this, inode, NULL); + if (ctx == NULL) + goto unblock; + if (__br_stub_is_inode_dirty (ctx)) { + __br_stub_set_ongoing_version (ctx, version); + __br_stub_mark_inode_synced (ctx); + } + + __br_stub_track_openfd (fd, ctx); + ret = 0; + } + unblock: + UNLOCK (&inode->lock); + + return ret; +} + +static inline void +br_stub_fill_local (br_stub_local_t *local, + call_stub_t *stub, fd_t *fd, inode_t *inode, uuid_t gfid, + int versioningtype, unsigned long memversion, int dirty) +{ + local->fopstub = stub; + local->versioningtype = versioningtype; + local->u.context.version = memversion; + if (fd) + local->u.context.fd = fd_ref (fd); + if (inode) + local->u.context.inode = inode_ref (inode); + uuid_copy (local->u.context.gfid, gfid); + + /* mark inode dirty/fresh according to durability */ + local->u.context.markdirty = (dirty) ? _gf_true : _gf_false; +} + +static inline void +br_stub_cleanup_local (br_stub_local_t *local) +{ + local->fopstub = NULL; + local->versioningtype = 0; + local->u.context.version = 0; + if (local->u.context.fd) { + fd_unref (local->u.context.fd); + local->u.context.fd = NULL; + } + if (local->u.context.inode) { + inode_unref (local->u.context.inode); + local->u.context.inode = NULL; + } + local->u.context.markdirty = _gf_true; + memset (local->u.context.gfid, '\0', sizeof (uuid_t)); +} + +/** + * callback for inode/fd full versioning + */ +int +br_stub_inode_fullversioning_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + fd_t *fd = NULL; + inode_t *inode = NULL; + unsigned long version = 0; + gf_boolean_t dirty = _gf_true; + br_stub_local_t *local = NULL; + + local = (br_stub_local_t *)frame->local; + + /* be graceful to EEXIST */ + if ((op_ret < 0) && (op_errno == EEXIST)) { + op_ret = 0; + goto done; + } + + if (op_ret < 0) + goto done; + + fd = local->u.context.fd; + inode = local->u.context.inode; + version = local->u.context.version; + dirty = local->u.context.markdirty; + + op_ret = br_stub_init_inode_versions (this, fd, inode, version, dirty); + if (op_ret < 0) + op_errno = EINVAL; + + done: + frame->local = NULL; + if (op_ret < 0) + call_unwind_error (local->fopstub, op_ret, op_errno); + else + call_resume (local->fopstub); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + + return 0; +} + +int +br_stub_fd_incversioning_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + fd_t *fd = NULL; + inode_t *inode = NULL; + unsigned long version = 0; + br_stub_local_t *local = NULL; + + local = (br_stub_local_t *)frame->local; + if (op_ret < 0) + goto done; + fd = local->u.context.fd; + inode = local->u.context.inode; + version = local->u.context.version; + + op_ret = br_stub_mod_inode_versions (this, fd, inode, version); + if (op_ret < 0) + op_errno = EINVAL; + + done: + frame->local = NULL; + if (op_ret < 0) + call_unwind_error (local->fopstub, -1, op_errno); + else + call_resume (local->fopstub); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + + return 0; +} + +/** + * Initial object versioning + * + * Version persists two (2) extended attributes as explained below: + * 1. Current (ongoing) version: This is incremented on an open() + * or creat() and is the running version for an object. + * 2. Signing version: This is the version against which an object + * was signed (checksummed). + * + * During initial versioning, both ongoing and signing versions are + * set of one and zero respectively. An open() call increments the + * ongoing version as an indication of modification to the object. + * Additionally this needs to be persisted on disk and needs to be + * durable: fsync().. :-/ + * As an optimization only the first open() synchronizes the ongoing + * version to disk, subsequent open()s before the *last* release() + * are no-op's. + * + * create(), just like lookup() initializes the object versions to + * the default, but persists the version to disk. As an optimization + * this is not a durable operation: in case of a crash, hard reboot + * etc.. absence of versioning xattrs is ignored in scrubber along + * with the one time crawler explicitly triggering signing for such + * objects. + * + * c.f. br_stub_open_cbk() / br_stub_create_cbk() + */ + +/** + * perform full or incremental versioning on an inode pointd by an + * fd. incremental versioning is done when an inode is dirty and a + * writeback is trigerred. + */ + +int +br_stub_fd_versioning (xlator_t *this, call_frame_t *frame, + call_stub_t *stub, dict_t *dict, fd_t *fd, + br_stub_version_cbk *callback, unsigned long memversion, + int versioningtype, int durable, int dirty) +{ + int32_t ret = -1; + int flags = 0; + dict_t *xdata = NULL; + br_stub_local_t *local = NULL; + + if (durable) { + xdata = dict_new (); + if (!xdata) + goto done; + ret = dict_set_int32 (xdata, GLUSTERFS_DURABLE_OP, 0); + if (ret) + goto dealloc_xdata; + } + + local = br_stub_alloc_local (this); + if (!local) { + ret = -1; + goto dealloc_xdata; + } + + if (versioningtype == BR_STUB_FULL_VERSIONING) + flags |= XATTR_CREATE; + + br_stub_fill_local (local, stub, fd, + fd->inode, fd->inode->gfid, + versioningtype, memversion, dirty); + + frame->local = local; + STACK_WIND (frame, callback, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, + fd, dict, flags, xdata); + + ret = 0; + + dealloc_xdata: + if (durable) + dict_unref (xdata); + done: + return ret; +} + +static inline int +br_stub_perform_fullversioning (xlator_t *this, call_frame_t *frame, + call_stub_t *stub, fd_t *fd) +{ + int32_t ret = -1; + dict_t *dict = NULL; + br_version_t *obuf = NULL; + br_signature_t *sbuf = NULL; + int op_errno = 0; + + op_errno = ENOMEM; + dict = dict_new (); + if (!dict) + goto done; + ret = br_stub_alloc_versions (&obuf, &sbuf, 0); + if (ret) + goto dealloc_dict; + + op_errno = EINVAL; + ret = br_stub_prepare_default_request (this, dict, obuf, sbuf); + if (ret) + goto dealloc_versions; + + /** + * Version extended attributes need not be durable at this point of + * time. If the objects (inode) data gets persisted on disk but the + * version extended attributes are lost due to a crash/power failure, + * a subsequent lookup marks the objects signature as stale. This way, + * dentry operation times do not shoot up. + */ + ret = br_stub_fd_versioning (this, frame, stub, dict, fd, + br_stub_inode_fullversioning_cbk, + BITROT_DEFAULT_CURRENT_VERSION, + BR_STUB_FULL_VERSIONING, !WRITEBACK_DURABLE, 0); + + dealloc_versions: + br_stub_dealloc_versions (obuf); + dealloc_dict: + dict_unref (dict); + done: + if (ret) + call_unwind_error (stub, -1, op_errno); + return ret; +} + +static inline int +br_stub_perform_incversioning (xlator_t *this, + call_frame_t *frame, call_stub_t *stub, + fd_t *fd, br_stub_inode_ctx_t *ctx) +{ + int32_t ret = -1; + dict_t *dict = NULL; + inode_t *inode = NULL; + br_version_t *obuf = NULL; + unsigned long writeback_version = 0; + int op_errno = 0; + + inode = fd->inode; + + op_errno = EINVAL; + ret = br_stub_require_release_call (this, fd); + if (ret) + goto done; + + LOCK (&inode->lock); + { + if (__br_stub_is_inode_dirty (ctx)) + writeback_version = __br_stub_writeback_version (ctx); + else + __br_stub_track_openfd (fd, ctx); + } + UNLOCK (&inode->lock); + + if (!writeback_version) { + ret = 0; + goto done; + } + + /* inode requires writeback to disk */ + op_errno = ENOMEM; + dict = dict_new (); + if (!dict) + goto done; + ret = br_stub_alloc_versions (&obuf, NULL, 0); + if (ret) + goto dealloc_dict; + ret = br_stub_prepare_version_request (this, dict, + obuf, writeback_version); + if (ret) + goto dealloc_versions; + + ret = br_stub_fd_versioning + (this, frame, stub, dict, + fd, br_stub_fd_incversioning_cbk, writeback_version, + BR_STUB_INCREMENTAL_VERSIONING, WRITEBACK_DURABLE, 0); + + dealloc_versions: + br_stub_dealloc_versions (obuf); + dealloc_dict: + dict_unref (dict); + done: + if (!ret && !writeback_version) + call_resume (stub); + if (ret) + call_unwind_error (stub, -1, op_errno); + return ret; +} + +/** {{{ */ + +/* fsetxattr() */ + +static inline int +br_stub_prepare_signature (xlator_t *this, dict_t *dict, + inode_t *inode, br_isignature_t *sign) +{ + int32_t ret = 0; + size_t signaturelen = 0; + br_signature_t *sbuf = NULL; + + if (!br_is_signature_type_valid (sign->signaturetype)) + goto error_return; + + signaturelen = strlen (sign->signature); + ret = br_stub_alloc_versions (NULL, &sbuf, signaturelen); + if (ret) + goto error_return; + ret = br_stub_prepare_signing_request (dict, sbuf, sign, signaturelen); + if (ret) + goto dealloc_versions; + return 0; + + dealloc_versions: + br_stub_dealloc_versions (sbuf); + error_return: + return -1; +} + +int +br_stub_fsetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *dict, int flags, dict_t *xdata) +{ + int32_t ret = 0; + br_isignature_t *sign = NULL; + gf_boolean_t xref = _gf_false; + + if (!IA_ISREG (fd->inode->ia_type)) + goto wind; + ret = dict_get_bin (dict, GLUSTERFS_SET_OBJECT_SIGNATURE, + (void **) &sign); + if (ret < 0) + goto wind; + if (frame->root->pid != GF_CLIENT_PID_BITD) + goto unwind; + + ret = br_stub_prepare_signature (this, dict, fd->inode, sign); + if (ret) + goto unwind; + dict_del (dict, GLUSTERFS_SET_OBJECT_SIGNATURE); + + if (!xdata) { + xdata = dict_new (); + if (!xdata) + goto unwind; + } else { + dict_ref (xdata); + } + + xref = _gf_true; + ret = dict_set_int32 (xdata, GLUSTERFS_DURABLE_OP, 0); + if (ret) + goto unwind; + + wind: + STACK_WIND (frame, default_setxattr_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, fd, + dict, flags, xdata); + goto done; + + unwind: + STACK_UNWIND_STRICT (setxattr, frame, -1, EINVAL, NULL); + done: + if (xref) + dict_unref (xdata); + return 0; +} + +/** }}} */ + + +/** {{{ */ + +/* {f}getxattr() */ + +int +br_stub_listxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + if (op_ret < 0) + goto unwind; + + br_stub_remove_vxattrs (xattr); + + unwind: + STACK_UNWIND (frame, op_ret, op_errno, xattr, xdata); + return 0; +} + + +int +br_stub_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + int32_t ret = 0; + ssize_t totallen = 0; + ssize_t signaturelen = 0; + br_version_t *obuf = NULL; + br_signature_t *sbuf = NULL; + br_isignature_out_t *sign = NULL; + br_vxattr_status_t status; + + if (op_ret < 0) + goto unwind; + if (cookie != (void *) BR_STUB_REQUEST_COOKIE) + goto unwind; + + op_ret = -1; + op_errno = EINVAL; + + status = br_version_xattr_state (xattr, &obuf, &sbuf); + if (status == BR_VXATTR_STATUS_PARTIAL) + goto delkeys; + + op_errno = ENODATA; + if (status == BR_VXATTR_STATUS_MISSING) + goto delkeys; + + signaturelen = strlen (sbuf->signature); + totallen = signaturelen + sizeof (br_isignature_out_t); + + op_errno = ENOMEM; + sign = GF_CALLOC (1, totallen, gf_br_stub_mt_signature_t); + if (!sign) + goto delkeys; + + sign->time[0] = obuf->timebuf[0]; + sign->time[1] = obuf->timebuf[1]; + + /* Object's dirty state */ + sign->stale = (obuf->ongoingversion != sbuf->signedversion) ? 1 : 0; + + /* Object's signature */ + sign->signaturetype = sbuf->signaturetype; + (void) memcpy (sign->signature, sbuf->signature, signaturelen); + + op_errno = EINVAL; + ret = dict_set_bin (xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, + (void *)sign, totallen); + if (ret < 0) + goto delkeys; + op_errno = 0; + op_ret = totallen; + + delkeys: + br_stub_remove_vxattrs (xattr); + + unwind: + STACK_UNWIND (frame, op_ret, op_errno, xattr, xdata); + return 0; +} + +static inline void +br_stub_send_stub_init_time (call_frame_t *frame, xlator_t *this) +{ + int op_ret = 0; + int op_errno = 0; + dict_t *xattr = NULL; + br_stub_init_t stub = {{0,},}; + br_stub_private_t *priv = NULL; + + priv = this->private; + + xattr = dict_new (); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + stub.timebuf[0] = priv->boot[0]; + stub.timebuf[1] = priv->boot[1]; + memcpy (stub.export, priv->export, strlen (priv->export) + 1); + + op_ret = dict_set_static_bin (xattr, GLUSTERFS_GET_BR_STUB_INIT_TIME, + (void *) &stub, sizeof (br_stub_init_t)); + if (op_ret < 0) { + op_errno = EINVAL; + goto unwind; + } + + op_ret = sizeof (br_stub_init_t); + + unwind: + STACK_UNWIND (frame, op_ret, op_errno, xattr, NULL); + + if (xattr) + dict_unref (xattr); +} + +int +br_stub_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + void *cookie = NULL; + uuid_t rootgfid = {0, }; + fop_getxattr_cbk_t cbk = br_stub_getxattr_cbk; + + rootgfid[15] = 1; + + if (!name) { + cbk = br_stub_listxattr_cbk; + goto wind; + } + + if (br_stub_is_internal_xattr (name)) + goto wind; + + /** + * this special extended attribute is allowed only on root + */ + if (name + && (strncmp (name, GLUSTERFS_GET_BR_STUB_INIT_TIME, + strlen (GLUSTERFS_GET_BR_STUB_INIT_TIME)) == 0) + && ((uuid_compare (loc->gfid, rootgfid) == 0) + || (uuid_compare (loc->inode->gfid, rootgfid) == 0))) { + br_stub_send_stub_init_time (frame, this); + return 0; + } + + if (!IA_ISREG (loc->inode->ia_type)) + goto wind; + + if (name && (strncmp (name, GLUSTERFS_GET_OBJECT_SIGNATURE, + strlen (GLUSTERFS_GET_OBJECT_SIGNATURE)) == 0)) { + cookie = (void *) BR_STUB_REQUEST_COOKIE; + } + + wind: + STACK_WIND_COOKIE + (frame, cbk, cookie, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->getxattr, loc, name, xdata); + return 0; +} + +int +br_stub_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + void *cookie = NULL; + uuid_t rootgfid = {0, }; + fop_fgetxattr_cbk_t cbk = br_stub_getxattr_cbk; + + rootgfid[15] = 1; + + if (!name) { + cbk = br_stub_listxattr_cbk; + goto wind; + } + + if (br_stub_is_internal_xattr (name)) + goto wind; + + /** + * this special extended attribute is allowed only on root + */ + if (name + && (strncmp (name, GLUSTERFS_GET_BR_STUB_INIT_TIME, + strlen (GLUSTERFS_GET_BR_STUB_INIT_TIME)) == 0) + && (uuid_compare (fd->inode->gfid, rootgfid) == 0)) { + br_stub_send_stub_init_time (frame, this); + return 0; + } + + if (!IA_ISREG (fd->inode->ia_type)) + goto wind; + + if (name && (strncmp (name, GLUSTERFS_GET_OBJECT_SIGNATURE, + strlen (GLUSTERFS_GET_OBJECT_SIGNATURE)) == 0)) { + cookie = (void *) BR_STUB_REQUEST_COOKIE; + } + + wind: + STACK_WIND_COOKIE + (frame, cbk, cookie, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fgetxattr, fd, name, xdata); + return 0; +} + +/** }}} */ + + +/** {{{ */ + +/* open() */ + +int +br_stub_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + call_stub_t *stub = NULL; + + if (op_ret < 0) + goto unwind; + if (cookie != (void *) BR_STUB_REQUEST_COOKIE) + goto unwind; + + ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); + if (ret < 0) + goto unwind; + + stub = fop_open_cbk_stub (frame, NULL, op_ret, op_errno, fd, xdata); + if (!stub) { + op_ret = -1; + op_errno = EINVAL; + goto unwind; + } + + /** + * Ongoing version needs to be incremented. If the inode is not dirty, + * things are simple: increment the ongoing version safely and be done. + * If inode is dirty, a writeback to disk is required. This is tricky in + * case of multiple open()'s as ongoing version needs to be incremented + * on a successful writeback. It's probably safe to remember the ongoing + * version before writeback and *assigning* it in the callback, but that + * may lead to a trustable checksum to be treated as stale by scrubber + * (the case where the in-memory ongoing version is lesser than the + * on-disk version). Therefore, *all* open() calls (which might have + * come in parallel) try to synchronize the next ongoing version to + * disk. In the callback path, the winner marks the inode as synced + * therby loosing open() calls become no-op's. + */ + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + return br_stub_perform_incversioning (this, frame, stub, fd, ctx); + + unwind: + STACK_UNWIND_STRICT (open, frame, + op_ret, op_errno, fd, xdata); + return 0; +} + +int +br_stub_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata) +{ + void *cookie = NULL; + + if (!flags) + goto wind; + cookie = (void *) BR_STUB_REQUEST_COOKIE; + + wind: + STACK_WIND_COOKIE (frame, br_stub_open_cbk, cookie, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->open, + loc, flags, fd, xdata); + return 0; +} + +/** }}} */ + + +/** {{{ */ + +/* creat() */ + +int +br_stub_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + call_stub_t *stub = NULL; + br_stub_inode_ctx_t *ctx = NULL; + + if (op_ret < 0) + goto unwind; + + stub = fop_create_cbk_stub (frame, NULL, op_ret, op_errno, fd, inode, + stbuf, preparent, postparent, xdata); + if (!stub) { + op_ret = -1; + op_errno = EINVAL; + goto unwind; + } + + ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); + if (ret < 0) + ctx_addr = 0; + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + + /* see comment in br_stub_open_cbk().. */ + return (ctx) + ? br_stub_perform_incversioning (this, frame, stub, fd, ctx) + : br_stub_perform_fullversioning (this, frame, stub, fd); + + unwind: + STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, + fd, inode, stbuf, preparent, postparent, xdata); + return 0; +} + +int +br_stub_create (call_frame_t *frame, + xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + STACK_WIND (frame, br_stub_create_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->create, + loc, flags, mode, umask, fd, xdata); + return 0; +} + +/** }}} */ + +static inline int32_t +br_stub_lookup_version (xlator_t *this, + uuid_t gfid, inode_t *inode, dict_t *xattr) +{ + unsigned long version = 0; + br_version_t *obuf = NULL; + br_signature_t *sbuf = NULL; + br_vxattr_status_t status; + + /** + * versioning xattrs were requested from POSIX. if available, figure + * out the correct version to use in the inode context (start with + * the default version if unavailable). As of now versions are not + * persisted on-disk. The inode is marked dirty, so that the first + * operation (such as open(), etc..) would trigger synchronization + * to disk. + */ + status = br_version_xattr_state (xattr, &obuf, &sbuf); + + /** + * stub does not know how to handle partial presence of version + * extended attributes, therefore, bail out in such cases. + */ + if (status == BR_VXATTR_STATUS_PARTIAL) { + gf_log (this->name, GF_LOG_ERROR, "Partial version xattrs!.. " + "bailing out [GFID: %s]", uuid_utoa (gfid)); + return -1; + } + + version = (status == BR_VXATTR_STATUS_FULL) + ? obuf->ongoingversion : BITROT_DEFAULT_CURRENT_VERSION; + return br_stub_init_inode_versions (this, NULL, + inode, version, _gf_true); +} + + +/** {{{ */ + +int +br_stub_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *dict) +{ + int32_t ret = 0; + uint64_t ctxaddr = 0; + gf_dirent_t *entry = NULL; + + if (op_ret < 0) + goto unwind; + + list_for_each_entry (entry, &entries->list, list) { + if ((strcmp (entry->d_name, ".") == 0) + || (strcmp (entry->d_name, "..") == 0)) + continue; + + if (!IA_ISREG (entry->d_stat.ia_type)) + continue; + + if (entry->dict) { + br_stub_remove_vxattrs (entry->dict); + } + + ret = br_stub_get_inode_ctx (this, entry->inode, &ctxaddr); + if (ret < 0) + ctxaddr = 0; + if (ctxaddr) /* already has the context */ + continue; + + ret = br_stub_lookup_version + (this, entry->inode->gfid, entry->inode, entry->dict); + if (ret) { + /** + * there's no per-file granularity support in case of + * failure. let's fail the entire request for now.. + */ + break; + } + } + + if (ret) { + op_ret = -1; + op_errno = EINVAL; + } + + unwind: + STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, dict); + + return 0; +} + +int +br_stub_readdirp (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, dict_t *dict) +{ + int32_t ret = -1; + int op_errno = 0; + gf_boolean_t xref = _gf_false; + + op_errno = ENOMEM; + if (!dict) { + dict = dict_new (); + if (!dict) + goto unwind; + } else { + dict = dict_ref (dict); + } + + xref = _gf_true; + + op_errno = EINVAL; + ret = dict_set_uint32 (dict, BITROT_CURRENT_VERSION_KEY, 0); + if (ret) + goto unwind; + ret = dict_set_uint32 (dict, BITROT_SIGNING_VERSION_KEY, 0); + if (ret) + goto unwind; + + STACK_WIND (frame, br_stub_readdirp_cbk, FIRST_CHILD (this), + FIRST_CHILD(this)->fops->readdirp, fd, size, + offset, dict); + goto unref_dict; + + unwind: + STACK_UNWIND_STRICT (readdirp, frame, -1, op_errno, NULL, NULL); + return 0; + + unref_dict: + if (xref) + dict_unref (dict); + return 0; +} + +/** }}} */ + + +/** {{{ */ + +/* lookup() */ + +int +br_stub_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, struct iatt *postparent) +{ + int32_t ret = 0; + + if (op_ret < 0) + goto unwind; + if (!IA_ISREG (stbuf->ia_type)) + goto unwind; + + /** + * perform this before checking if we requested xattrs as this + * can happen during revalidate. + */ + br_stub_remove_vxattrs (xattr); + if (cookie != (void *) BR_STUB_REQUEST_COOKIE) + goto unwind; + + ret = br_stub_lookup_version (this, stbuf->ia_gfid, inode, xattr); + if (ret < 0) { + op_ret = -1; + op_errno = EINVAL; + } + + unwind: + STACK_UNWIND_STRICT (lookup, frame, + op_ret, op_errno, inode, stbuf, xattr, postparent); + + return 0; +} + +int +br_stub_lookup (call_frame_t *frame, + xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int32_t ret = 0; + int op_errno = 0; + void *cookie = NULL; + uint64_t ctx_addr = 0; + gf_boolean_t xref = _gf_false; + + ret = br_stub_get_inode_ctx (this, loc->inode, &ctx_addr); + if (ret < 0) + ctx_addr = 0; + if (ctx_addr != 0) + goto wind; + + /** + * fresh lookup: request version keys from POSIX + */ + op_errno = ENOMEM; + if (!xdata) { + xdata = dict_new (); + if (!xdata) + goto unwind; + } else { + xdata = dict_ref (xdata); + } + + xref = _gf_true; + + op_errno = EINVAL; + ret = dict_set_uint32 (xdata, BITROT_CURRENT_VERSION_KEY, 0); + if (ret) + goto unwind; + ret = dict_set_uint32 (xdata, BITROT_SIGNING_VERSION_KEY, 0); + if (ret) + goto unwind; + cookie = (void *) BR_STUB_REQUEST_COOKIE; + + wind: + STACK_WIND_COOKIE (frame, br_stub_lookup_cbk, cookie, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, + loc, xdata); + goto dealloc_dict; + + unwind: + STACK_UNWIND_STRICT (lookup, frame, + -1, op_errno, NULL, NULL, NULL, NULL); + dealloc_dict: + if (xref) + dict_unref (xdata); + return 0; +} + +/** }}} */ + +/** {{{ */ + +/* forget() */ + +int +br_stub_forget (xlator_t *this, inode_t *inode) +{ + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + + inode_ctx_del (inode, this, &ctx_addr); + if (!ctx_addr) + return 0; + + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + GF_FREE (ctx); + + return 0; +} + +/** }}} */ + +/** {{{ */ + +int32_t +br_stub_noop (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + STACK_DESTROY (frame->root); + return 0; +} + +static inline void +br_stub_send_ipc_fop (xlator_t *this, + fd_t *fd, unsigned long releaseversion, int32_t flags) +{ + int32_t op = 0; + int32_t ret = 0; + dict_t *xdata = NULL; + call_frame_t *frame = NULL; + changelog_event_t ev = {0,}; + + ev.ev_type = CHANGELOG_OP_TYPE_BR_RELEASE; + ev.u.releasebr.flags = flags; + ev.u.releasebr.version = releaseversion; + uuid_copy (ev.u.releasebr.gfid, fd->inode->gfid); + + xdata = dict_new (); + if (!xdata) { + gf_log (this->name, GF_LOG_WARNING, + "dict allocation failed: cannot send IPC FOP " + "to changelog"); + goto out; + } + + ret = dict_set_static_bin (xdata, + "RELEASE-EVENT", &ev, CHANGELOG_EV_SIZE); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "cannot set release event in dict"); + goto dealloc_dict; + } + + frame = create_frame (this, this->ctx->pool); + if (!frame) { + gf_log (this->name, GF_LOG_WARNING, "create_frame() failure"); + goto dealloc_dict; + } + + op = GF_IPC_TARGET_CHANGELOG; + STACK_WIND (frame, br_stub_noop, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->ipc, op, xdata); + return; + + dealloc_dict: + dict_unref (xdata); + out: + return; +} + +int32_t +br_stub_release (xlator_t *this, fd_t *fd) +{ + int32_t ret = 0; + int32_t flags = 0; + inode_t *inode = NULL; + unsigned long releaseversion = 0; + br_stub_inode_ctx_t *ctx = NULL; + + inode = fd->inode; + + LOCK (&inode->lock); + { + ctx = __br_stub_get_ongoing_version_ctx (this, inode, NULL); + if (ctx == NULL) + goto unblock; + __br_stub_track_release (ctx); + ret = __br_stub_can_trigger_release + (inode, ctx, &releaseversion, &flags); + if (ret) { + GF_ASSERT (__br_stub_is_inode_dirty (ctx) == 0); + __br_stub_mark_inode_dirty (ctx); + } + } + unblock: + UNLOCK (&inode->lock); + + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "releaseversion: %lu|flags: %d", releaseversion, flags); + br_stub_send_ipc_fop (this, fd, releaseversion, flags); + } + + return 0; +} + +/** }}} */ + +/** {{{ */ + +/* ictxmerge */ + +void +br_stub_ictxmerge (xlator_t *this, fd_t *fd, + inode_t *inode, inode_t *linked_inode) +{ + int32_t ret = 0; + uint64_t ctxaddr = 0; + uint64_t lctxaddr = 0; + br_stub_inode_ctx_t *ctx = NULL; + br_stub_inode_ctx_t *lctx = NULL; + + ret = br_stub_get_inode_ctx (this, inode, &ctxaddr); + if (ret < 0) + goto done; + ctx = (br_stub_inode_ctx_t *) ctxaddr; + + LOCK (&linked_inode->lock); + { + ret = __br_stub_get_inode_ctx (this, linked_inode, &lctxaddr); + if (ret < 0) + goto unblock; + lctx = (br_stub_inode_ctx_t *) lctxaddr; + + if (__br_stub_is_inode_dirty (lctx)) { + /** + * RACY code: An inode can end up in this situation + * after a lookup() or after a create() followed by + * a release(). Even if we distinguish b/w the two, + * there needs to be more infrastructure built up + * in stub to handle these races. Note, that it's + * probably OK to ignore the race iff the version + * was initialized on the very first lookup(), i.e., + * [ongoingversion: default]. + * + * FIXME: fixup races [create(1..n)/lookup(1..n)]. + */ + GF_ASSERT (lctx->currentversion + == BITROT_DEFAULT_CURRENT_VERSION); + __br_stub_track_openfd (fd, lctx); + __br_stub_mark_inode_synced (lctx); + } else { + GF_ASSERT (ctx->currentversion <= lctx->currentversion); + __br_stub_track_openfd (fd, lctx); + } + } + unblock: + UNLOCK (&linked_inode->lock); + + done: + return; +} + +/** }}} */ + + +struct xlator_fops fops = { + .lookup = br_stub_lookup, + .open = br_stub_open, + .create = br_stub_create, + .readdirp = br_stub_readdirp, + .getxattr = br_stub_getxattr, + .fgetxattr = br_stub_fgetxattr, + .fsetxattr = br_stub_fsetxattr, +}; + +struct xlator_cbks cbks = { + .forget = br_stub_forget, + .release = br_stub_release, + .ictxmerge = br_stub_ictxmerge, +}; + +struct volume_options options[] = { + { .key = {"bitrot"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "enable/disable bitrot stub" + }, + { .key = {"export"}, + .type = GF_OPTION_TYPE_PATH, + .description = "brick path for versioning" + }, + { .key = {NULL} }, +}; diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.h b/xlators/features/bit-rot/src/stub/bit-rot-stub.h new file mode 100644 index 0000000000..d565112b1a --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.h @@ -0,0 +1,269 @@ + /* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef __BIT_ROT_STUB_H__ +#define __BIT_ROT_STUB_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "defaults.h" +#include "call-stub.h" + +#include "bit-rot-common.h" + +typedef int (br_stub_version_cbk) (call_frame_t *, void *, + xlator_t *, int32_t, int32_t, dict_t *); + +typedef struct br_stub_inode_ctx { + int need_writeback; /* does the inode need + a writeback to disk? */ + unsigned long currentversion; /* ongoing version */ + + struct release { + int32_t ordflags; + unsigned long opencount; /* number of open()s before + final release() */ + unsigned long releasecount; /* number of release()s */ + } releasectx; +#define BR_STUB_REQUIRE_RELEASE_CBK 0x0E0EA0E +} br_stub_inode_ctx_t; + + +#define I_DIRTY (1<<0) /* inode needs writeback */ +#define WRITEBACK_DURABLE 1 /* writeback is durable */ + +/** + * This could just have been a plain struct without unions and all, + * but we may need additional things in the future. + */ +typedef struct br_stub_local { + call_stub_t *fopstub; /* stub for original fop */ + + int versioningtype; /* not much used atm */ + + union { + struct br_stub_ctx { + fd_t *fd; + uuid_t gfid; + inode_t *inode; + unsigned long version; + gf_boolean_t markdirty; + } context; + } u; +} br_stub_local_t; + +#define BR_STUB_FULL_VERSIONING (1<<0) +#define BR_STUB_INCREMENTAL_VERSIONING (1<<1) + +typedef struct br_stub_private { + gf_boolean_t go; + + uint32_t boot[2]; + char export[PATH_MAX]; + + struct mem_pool *local_pool; +} br_stub_private_t; + +/* inode writeback helpers */ +static inline void +__br_stub_mark_inode_dirty (br_stub_inode_ctx_t *ctx) +{ + ctx->need_writeback |= I_DIRTY; +} + +static inline void +__br_stub_mark_inode_synced (br_stub_inode_ctx_t *ctx) +{ + ctx->need_writeback &= ~I_DIRTY; +} + +static inline int +__br_stub_is_inode_dirty (br_stub_inode_ctx_t *ctx) +{ + return (ctx->need_writeback & I_DIRTY); +} + +static inline int +br_stub_require_release_call (xlator_t *this, fd_t *fd) +{ + int32_t ret = 0; + + ret = fd_ctx_set (fd, this, + (uint64_t)(long)BR_STUB_REQUIRE_RELEASE_CBK); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "could not set fd context (for release callback"); + return ret; +} + +/* get/set inode context helpers */ + +static inline int +__br_stub_get_inode_ctx (xlator_t *this, + inode_t *inode, uint64_t *ctx) +{ + return __inode_ctx_get (inode, this, ctx); +} + +static inline int +br_stub_get_inode_ctx (xlator_t *this, + inode_t *inode, uint64_t *ctx) +{ + return inode_ctx_get (inode, this, ctx); +} + +static inline int +br_stub_set_inode_ctx (xlator_t *this, + inode_t *inode, br_stub_inode_ctx_t *ctx) +{ + uint64_t ctx_addr = (uint64_t) ctx; + return inode_ctx_set (inode, this, &ctx_addr); +} + +/* version get/set helpers */ + +static inline unsigned long +__br_stub_writeback_version (br_stub_inode_ctx_t *ctx) +{ + return (ctx->currentversion + 1); +} + +static inline void +__br_stub_set_ongoing_version (br_stub_inode_ctx_t *ctx, unsigned long version) +{ + ctx->currentversion = version; +} + +static inline void +__br_stub_reset_release_counters (br_stub_inode_ctx_t *ctx) +{ + ctx->releasectx.ordflags = 0; + ctx->releasectx.opencount = 0; + ctx->releasectx.releasecount = 0; +} + +static inline void +__br_stub_track_release (br_stub_inode_ctx_t *ctx) +{ + ++ctx->releasectx.releasecount; +} + +static inline void +___br_stub_track_open (br_stub_inode_ctx_t *ctx) +{ + ++ctx->releasectx.opencount; +} + +static inline void +___br_stub_track_open_flags (fd_t *fd, br_stub_inode_ctx_t *ctx) +{ + ctx->releasectx.ordflags |= fd->flags; +} + +static inline void +__br_stub_track_openfd (fd_t *fd, br_stub_inode_ctx_t *ctx) +{ + ___br_stub_track_open (ctx); + ___br_stub_track_open_flags (fd, ctx); +} + +static inline int +__br_stub_can_trigger_release (inode_t *inode, + br_stub_inode_ctx_t *ctx, + unsigned long *version, int32_t *flags) +{ + if (list_empty (&inode->fd_list) + && (ctx->releasectx.releasecount == ctx->releasectx.opencount)) { + if (flags) + *flags = htonl (ctx->releasectx.ordflags); + if (version) + *version = htonl (ctx->currentversion); + + __br_stub_reset_release_counters (ctx); + return 1; + } + + return 0; +} + +static inline int32_t +br_stub_get_ongoing_version (xlator_t *this, + inode_t *inode, unsigned long *version) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + + LOCK (&inode->lock); + { + ret = __inode_ctx_get (inode, this, &ctx_addr); + if (ret < 0) + goto unblock; + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + *version = ctx->currentversion; + } + unblock: + UNLOCK (&inode->lock); + + return ret; +} + +/** + * fetch the current version from inode and return the context. + * inode->lock should be held before invoking this as context + * *needs* to be valid in the caller. + */ +static inline br_stub_inode_ctx_t * +__br_stub_get_ongoing_version_ctx (xlator_t *this, + inode_t *inode, unsigned long *version) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get (inode, this, &ctx_addr); + if (ret < 0) + return NULL; + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + if (version) + *version = ctx->currentversion; + + return ctx; +} + +/* filter for xattr fetch */ +static inline int +br_stub_is_internal_xattr (const char *name) +{ + if (name + && ((strncmp (name, BITROT_CURRENT_VERSION_KEY, + strlen (BITROT_CURRENT_VERSION_KEY)) == 0) + || (strncmp (name, BITROT_SIGNING_VERSION_KEY, + strlen (BITROT_SIGNING_VERSION_KEY)) == 0))) + return 1; + return 0; +} + +static inline void +br_stub_remove_vxattrs (dict_t *xattr) +{ + if (xattr) { + dict_del (xattr, BITROT_CURRENT_VERSION_KEY); + dict_del (xattr, BITROT_SIGNING_VERSION_KEY); + } +} + +#endif /* __BIT_ROT_STUB_H__ */ diff --git a/xlators/features/changelog/lib/src/changelog.h b/xlators/features/changelog/lib/src/changelog.h index 1e0df053a9..0830781070 100644 --- a/xlators/features/changelog/lib/src/changelog.h +++ b/xlators/features/changelog/lib/src/changelog.h @@ -16,13 +16,15 @@ struct gf_brick_spec; /** * Max bit shiter for event selection */ -#define CHANGELOG_EV_SELECTION_RANGE 4 +#define CHANGELOG_EV_SELECTION_RANGE 5 -#define CHANGELOG_OP_TYPE_JOURNAL (1<<0) -#define CHANGELOG_OP_TYPE_OPEN (1<<1) -#define CHANGELOG_OP_TYPE_CREATE (1<<2) -#define CHANGELOG_OP_TYPE_RELEASE (1<<3) -#define CHANGELOG_OP_TYPE_MAX (1<<CHANGELOG_EV_SELECTION_RANGE) +#define CHANGELOG_OP_TYPE_JOURNAL (1<<0) +#define CHANGELOG_OP_TYPE_OPEN (1<<1) +#define CHANGELOG_OP_TYPE_CREATE (1<<2) +#define CHANGELOG_OP_TYPE_RELEASE (1<<3) +#define CHANGELOG_OP_TYPE_BR_RELEASE (1<<4) /* logical release (last close()), + sent by bitrot stub */ +#define CHANGELOG_OP_TYPE_MAX (1<<CHANGELOG_EV_SELECTION_RANGE) struct ev_open { @@ -39,17 +41,25 @@ struct ev_release { unsigned char gfid[16]; }; +struct ev_release_br { + int32_t flags; + unsigned long version; + unsigned char gfid[16]; +}; + struct ev_changelog { char path[PATH_MAX]; }; typedef struct changelog_event { unsigned int ev_type; + union { struct ev_open open; struct ev_creat create; struct ev_release release; struct ev_changelog journal; + struct ev_release_br releasebr; } u; } changelog_event_t; diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index b098a29713..3edd4e2800 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -1550,6 +1550,26 @@ out: } static int +brick_graph_add_bitrot_stub (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, + dict_t *set_dict, glusterd_brickinfo_t *brickinfo) +{ + xlator_t *xl = NULL; + int ret = -1; + + if (!graph || !volinfo || !set_dict || !brickinfo) + goto out; + + xl = volgen_graph_add (graph, "features/bitrot-stub", volinfo->volname); + if (!xl) + goto out; + + ret = xlator_set_option (xl, "export", brickinfo->path); + +out: + return ret; +} + +static int brick_graph_add_changelog (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, dict_t *set_dict, glusterd_brickinfo_t *brickinfo) { @@ -2155,6 +2175,7 @@ static volgen_brick_xlator_t server_graph_table[] = { {brick_graph_add_pump, NULL}, {brick_graph_add_locks, "locks"}, {brick_graph_add_acl, "acl"}, + {brick_graph_add_bitrot_stub, "bitrot-stub"}, {brick_graph_add_changelog, "changelog"}, {brick_graph_add_changetimerecorder, "changetimerecorder"}, {brick_graph_add_bd, "bd"}, diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 68e3fb7d46..1e2bbcad26 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1774,6 +1774,12 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = GD_OP_VERSION_3_7_0, .flags = OPT_FLAG_CLIENT_OPT }, + { .key = "features.bitrot", + .voltype = "features/bitrot-stub", + .value = "disable", + .op_version = GD_OP_VERSION_3_7_0, + .type = NO_DOC, + }, { .key = NULL } }; diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index edbf0241f2..4a062970f7 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -1855,3 +1855,73 @@ posix_fsyncer (void *d) } } } + +/** + * fetch on-disk ongoing version and object signature extended + * attribute. + */ +int32_t +posix_get_objectsignature (char *real_path, dict_t *xattr) +{ + int32_t op_ret = 0; + char *memptr = NULL; + ssize_t xattrsize = 0; + ssize_t allocsize = 0; + + op_ret = -EINVAL; + xattrsize = sys_lgetxattr (real_path, + BITROT_CURRENT_VERSION_KEY, NULL, 0); + if (xattrsize == -1) + goto error_return; + allocsize += xattrsize; + + xattrsize = sys_lgetxattr (real_path, + BITROT_SIGNING_VERSION_KEY, NULL, 0); + if (xattrsize == -1) + goto error_return; + allocsize += xattrsize; + + op_ret = -ENOMEM; + /* bulk alloc */ + memptr = GF_CALLOC (allocsize + 2, sizeof (char), gf_posix_mt_char); + if (!memptr) + goto error_return; + + op_ret = sys_lgetxattr (real_path, BITROT_CURRENT_VERSION_KEY, + memptr, allocsize - xattrsize); + if (op_ret == -1) { + op_ret = -errno; + goto dealloc_mem; + } + + xattrsize = op_ret; /* save for correct _in_ memory pointing */ + + op_ret = sys_lgetxattr (real_path, BITROT_SIGNING_VERSION_KEY, + (memptr + op_ret + 1), allocsize - op_ret); + if (op_ret == -1) { + op_ret = -errno; + goto dealloc_mem; + } + + /* this is a dynamic set */ + op_ret = dict_set_dynptr (xattr, BITROT_CURRENT_VERSION_KEY, + memptr, allocsize); + if (op_ret < 0) + goto dealloc_mem; + + /* rest all should be static */ + op_ret = dict_set_static_ptr (xattr, BITROT_SIGNING_VERSION_KEY, + memptr + xattrsize + 1); + if (op_ret < 0) + goto delkey; + + return allocsize; + + delkey: + dict_del (xattr, BITROT_CURRENT_VERSION_KEY); + dealloc_mem: + GF_FREE (memptr); + error_return: + return op_ret; + +} diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index 8001f23861..ae08adcc8e 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -3882,6 +3882,18 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, goto done; } + if (loc->inode && name + && (strncmp (name, GLUSTERFS_GET_OBJECT_SIGNATURE, + strlen (GLUSTERFS_GET_OBJECT_SIGNATURE)) == 0)) { + op_ret = posix_get_objectsignature (real_path, dict); + if (op_ret < 0) { + op_errno = -op_ret; + op_ret = -1; + } + + goto done; + } + if (name) { strcpy (keybuffer, name); char *key = keybuffer; @@ -4316,6 +4328,17 @@ posix_fsetxattr (call_frame_t *frame, xlator_t *this, op_ret = -1; } + if (!ret && xdata && dict_get (xdata, GLUSTERFS_DURABLE_OP)) { + op_ret = fsync (_fd); + if (op_ret < 0) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "could not satisfy durability request: " + "reason (%s)", strerror (errno)); + } + } + out: SET_TO_OLD_FS_ID (); diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index bdb56b1d59..452248dd79 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -242,4 +242,7 @@ posix_pacl_set (const char *path, const char *key, const char *acl_s); int posix_pacl_get (const char *path, const char *key, char **acl_s); +int32_t +posix_get_objectsignature (char *, dict_t *); + #endif /* _POSIX_H */ |