From 0a43265f1b10c35506fe82a525aa0fa43af6c0cd Mon Sep 17 00:00:00 2001 From: Avra Sengupta Date: Mon, 4 Apr 2016 14:55:20 +0530 Subject: nsr/jbr: Renaming nsr to jbr As per community consensus, we have decided to rename nsr to jbr(Journal-Based-Replication). This is the patch to rename the "nsr" code to "jbr" Change-Id: Id2a9837f2ec4da89afc32438b91a1c302bb4104f BUG: 1328043 Signed-off-by: Avra Sengupta Reviewed-on: http://review.gluster.org/13899 Smoke: Gluster Build System CentOS-regression: Gluster Build System NetBSD-regression: NetBSD Build System Reviewed-by: Jeff Darcy --- .gitignore | 4 +- configure.ac | 8 +- glusterfs.spec.in | 4 +- libglusterfs/src/glfs-message-id.h | 6 +- tests/basic/ec/ec.t | 2 +- tests/basic/jbr/jbr-volgen.t | 37 + tests/basic/jbr/jbr.t | 33 + tests/basic/nsr/nsr-volgen.t | 37 - tests/basic/nsr/nsr.t | 33 - tests/volume.rc | 6 +- xlators/experimental/Makefile.am | 2 +- xlators/experimental/jbr-client/Makefile.am | 3 + xlators/experimental/jbr-client/src/Makefile.am | 32 + xlators/experimental/jbr-client/src/fop-template.c | 113 ++ xlators/experimental/jbr-client/src/gen-fops.py | 57 + xlators/experimental/jbr-client/src/jbr-messages.h | 105 ++ xlators/experimental/jbr-client/src/jbrc.c | 320 ++++++ xlators/experimental/jbr-client/src/jbrc.h | 27 + xlators/experimental/jbr-server/Makefile.am | 3 + xlators/experimental/jbr-server/src/Makefile.am | 35 + .../experimental/jbr-server/src/all-templates.c | 437 ++++++++ xlators/experimental/jbr-server/src/gen-fops.py | 138 +++ xlators/experimental/jbr-server/src/jbr-internal.h | 116 ++ xlators/experimental/jbr-server/src/jbr.c | 1147 ++++++++++++++++++++ xlators/experimental/nsr-client/Makefile.am | 3 - xlators/experimental/nsr-client/src/Makefile.am | 32 - xlators/experimental/nsr-client/src/fop-template.c | 113 -- xlators/experimental/nsr-client/src/gen-fops.py | 57 - xlators/experimental/nsr-client/src/nsr-messages.h | 105 -- xlators/experimental/nsr-client/src/nsrc.c | 320 ------ xlators/experimental/nsr-client/src/nsrc.h | 27 - xlators/experimental/nsr-server/Makefile.am | 3 - xlators/experimental/nsr-server/src/Makefile.am | 35 - .../experimental/nsr-server/src/all-templates.c | 437 -------- xlators/experimental/nsr-server/src/gen-fops.py | 138 --- xlators/experimental/nsr-server/src/nsr-internal.h | 116 -- xlators/experimental/nsr-server/src/nsr.c | 1147 -------------------- xlators/mgmt/glusterd/src/glusterd-volgen.c | 20 +- xlators/mgmt/glusterd/src/glusterd-volume-set.c | 12 +- xlators/mgmt/glusterd/src/glusterd.h | 4 +- 40 files changed, 2637 insertions(+), 2637 deletions(-) create mode 100644 tests/basic/jbr/jbr-volgen.t create mode 100755 tests/basic/jbr/jbr.t delete mode 100644 tests/basic/nsr/nsr-volgen.t delete mode 100755 tests/basic/nsr/nsr.t create mode 100644 xlators/experimental/jbr-client/Makefile.am create mode 100644 xlators/experimental/jbr-client/src/Makefile.am create mode 100644 xlators/experimental/jbr-client/src/fop-template.c create mode 100755 xlators/experimental/jbr-client/src/gen-fops.py create mode 100644 xlators/experimental/jbr-client/src/jbr-messages.h create mode 100644 xlators/experimental/jbr-client/src/jbrc.c create mode 100644 xlators/experimental/jbr-client/src/jbrc.h create mode 100644 xlators/experimental/jbr-server/Makefile.am create mode 100644 xlators/experimental/jbr-server/src/Makefile.am create mode 100644 xlators/experimental/jbr-server/src/all-templates.c create mode 100755 xlators/experimental/jbr-server/src/gen-fops.py create mode 100644 xlators/experimental/jbr-server/src/jbr-internal.h create mode 100644 xlators/experimental/jbr-server/src/jbr.c delete mode 100644 xlators/experimental/nsr-client/Makefile.am delete mode 100644 xlators/experimental/nsr-client/src/Makefile.am delete mode 100644 xlators/experimental/nsr-client/src/fop-template.c delete mode 100755 xlators/experimental/nsr-client/src/gen-fops.py delete mode 100644 xlators/experimental/nsr-client/src/nsr-messages.h delete mode 100644 xlators/experimental/nsr-client/src/nsrc.c delete mode 100644 xlators/experimental/nsr-client/src/nsrc.h delete mode 100644 xlators/experimental/nsr-server/Makefile.am delete mode 100644 xlators/experimental/nsr-server/src/Makefile.am delete mode 100644 xlators/experimental/nsr-server/src/all-templates.c delete mode 100755 xlators/experimental/nsr-server/src/gen-fops.py delete mode 100644 xlators/experimental/nsr-server/src/nsr-internal.h delete mode 100644 xlators/experimental/nsr-server/src/nsr.c diff --git a/.gitignore b/.gitignore index e7cbba41f9..edc935595f 100644 --- a/.gitignore +++ b/.gitignore @@ -98,5 +98,5 @@ xlators/experimental/fdl/src/gf_logdump xlators/experimental/fdl/src/gf_recon xlators/experimental/fdl/src/libfdl.c xlators/experimental/fdl/src/librecon.c -xlators/experimental/nsr-client/src/nsrc-cg.c -xlators/experimental/nsr-server/src/nsr-cg.c +xlators/experimental/jbr-client/src/jbrc-cg.c +xlators/experimental/jbr-server/src/jbr-cg.c diff --git a/configure.ac b/configure.ac index c3d3743e6b..f479dd7113 100644 --- a/configure.ac +++ b/configure.ac @@ -187,10 +187,10 @@ AC_CONFIG_FILES([Makefile xlators/mgmt/glusterd/Makefile xlators/mgmt/glusterd/src/Makefile xlators/experimental/Makefile - xlators/experimental/nsr-client/Makefile - xlators/experimental/nsr-client/src/Makefile - xlators/experimental/nsr-server/Makefile - xlators/experimental/nsr-server/src/Makefile + xlators/experimental/jbr-client/Makefile + xlators/experimental/jbr-client/src/Makefile + xlators/experimental/jbr-server/Makefile + xlators/experimental/jbr-server/src/Makefile xlators/experimental/dht2/Makefile xlators/experimental/dht2/dht2-client/Makefile xlators/experimental/dht2/dht2-client/src/Makefile diff --git a/glusterfs.spec.in b/glusterfs.spec.in index 866bec846f..f91b19879a 100644 --- a/glusterfs.spec.in +++ b/glusterfs.spec.in @@ -1096,8 +1096,8 @@ exit 0 %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/arbiter.so %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/bit-rot.so %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/bitrot-stub.so -%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/experimental/nsrc.so -%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/experimental/nsr.so +%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/experimental/jbrc.so +%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/experimental/jbr.so %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/experimental/dht2s.so %if ( 0%{!?_without_tiering:1} ) %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/changetimerecorder.so diff --git a/libglusterfs/src/glfs-message-id.h b/libglusterfs/src/glfs-message-id.h index a521b7be9e..18104dab90 100644 --- a/libglusterfs/src/glfs-message-id.h +++ b/libglusterfs/src/glfs-message-id.h @@ -163,11 +163,11 @@ GLFS_MSGID_COMP_SYMLINK_CACHE_END #define GLFS_MSGID_COMP_SHARD_END (GLFS_MSGID_COMP_SHARD +\ GLFS_MSGID_SEGMENT) -#define GLFS_MSGID_COMP_NSR GLFS_MSGID_COMP_SHARD_END -#define GLFS_MSGID_COMP_NSR_END (GLFS_MSGID_COMP_SHARD_END+\ +#define GLFS_MSGID_COMP_JBR GLFS_MSGID_COMP_SHARD_END +#define GLFS_MSGID_COMP_JBR_END (GLFS_MSGID_COMP_SHARD_END+\ GLFS_MSGID_SEGMENT) -#define GLFS_MSGID_COMP_PL GLFS_MSGID_COMP_NSR_END +#define GLFS_MSGID_COMP_PL GLFS_MSGID_COMP_JBR_END #define GLFS_MSGID_COMP_PL_END (GLFS_MSGID_COMP_PL +\ GLFS_MSGID_SEGMENT) /* --- new segments for messages goes above this line --- */ diff --git a/tests/basic/ec/ec.t b/tests/basic/ec/ec.t index f98f2110e8..a6ae287b7a 100644 --- a/tests/basic/ec/ec.t +++ b/tests/basic/ec/ec.t @@ -12,7 +12,7 @@ function my_getfattr { } function get_rep_count { - v=$(my_getfattr -n trusted.nsr.rep-count $1) + v=$(my_getfattr -n trusted.jbr.rep-count $1) #echo $v > /dev/tty echo $v } diff --git a/tests/basic/jbr/jbr-volgen.t b/tests/basic/jbr/jbr-volgen.t new file mode 100644 index 0000000000..fcd20e5f99 --- /dev/null +++ b/tests/basic/jbr/jbr-volgen.t @@ -0,0 +1,37 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +volfiles=${GLUSTERD_WORKDIR}/vols/${V0}/ +check_brick_volfiles () { + for vf in ${volfiles}${V0}.$(hostname).*.vol; do + grep -qs experimental/jbr $vf || return + # At least for now, nothing else would put a client translator + # in a brick volfile. + grep -qs protocol/client $vf || return + done + echo "OK" +} + +TEST glusterd +TEST pidof glusterd + +TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2} +TEST $CLI volume set $V0 cluster.jbr on + +# Check that the client volfile got modified properly. +TEST grep -qs experimental/jbrc ${volfiles}${V0}.tcp-fuse.vol + +# Check that the brick volfiles got modified as well. +EXPECT "OK" check_brick_volfiles + +# Put things back and make sure the "undo" worked. +TEST $CLI volume set $V0 cluster.jbr off +TEST $CLI volume start $V0 +TEST $GFS -s $H0 --volfile-id $V0 $M0 +echo hello > $M0/probe +EXPECT hello cat ${B0}/${V0}1/probe +EXPECT hello cat ${B0}/${V0}2/probe + +cleanup diff --git a/tests/basic/jbr/jbr.t b/tests/basic/jbr/jbr.t new file mode 100755 index 0000000000..283446c963 --- /dev/null +++ b/tests/basic/jbr/jbr.t @@ -0,0 +1,33 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. $(dirname $0)/../../cluster.rc +. $(dirname $0)/../../snapshot.rc + +cleanup; + +TEST verify_lvm_version; +#Create cluster with 3 nodes +TEST launch_cluster 3; +TEST setup_lvm 3 + +TEST $CLI_1 peer probe $H2; +TEST $CLI_1 peer probe $H3; +EXPECT_WITHIN $PROBE_TIMEOUT 2 peer_count; + +TEST $CLI_1 volume create $V0 replica 3 $H1:$L1 $H2:$L2 $H3:$L3 +TEST $CLI_1 volume set $V0 cluster.jbr on +#TEST $CLI_1 volume set $V0 diagnostics.brick-log-level DEBUG +TEST $CLI_1 volume start $V0 + +TEST glusterfs --volfile-id=$V0 --volfile-server=$H1 --entry-timeout=0 $M0; + +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" jbrc_child_up_status $V0 0 + +echo "file" > $M0/file1 +TEST stat $L1/file1 +TEST stat $L2/file1 +TEST stat $L3/file1 + +cleanup; diff --git a/tests/basic/nsr/nsr-volgen.t b/tests/basic/nsr/nsr-volgen.t deleted file mode 100644 index 99563ef608..0000000000 --- a/tests/basic/nsr/nsr-volgen.t +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -. $(dirname $0)/../../include.rc -. $(dirname $0)/../../volume.rc - -volfiles=${GLUSTERD_WORKDIR}/vols/${V0}/ -check_brick_volfiles () { - for vf in ${volfiles}${V0}.$(hostname).*.vol; do - grep -qs experimental/nsr $vf || return - # At least for now, nothing else would put a client translator - # in a brick volfile. - grep -qs protocol/client $vf || return - done - echo "OK" -} - -TEST glusterd -TEST pidof glusterd - -TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2} -TEST $CLI volume set $V0 cluster.nsr on - -# Check that the client volfile got modified properly. -TEST grep -qs experimental/nsrc ${volfiles}${V0}.tcp-fuse.vol - -# Check that the brick volfiles got modified as well. -EXPECT "OK" check_brick_volfiles - -# Put things back and make sure the "undo" worked. -TEST $CLI volume set $V0 cluster.nsr off -TEST $CLI volume start $V0 -TEST $GFS -s $H0 --volfile-id $V0 $M0 -echo hello > $M0/probe -EXPECT hello cat ${B0}/${V0}1/probe -EXPECT hello cat ${B0}/${V0}2/probe - -cleanup diff --git a/tests/basic/nsr/nsr.t b/tests/basic/nsr/nsr.t deleted file mode 100755 index b5a4aaf105..0000000000 --- a/tests/basic/nsr/nsr.t +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -. $(dirname $0)/../../include.rc -. $(dirname $0)/../../volume.rc -. $(dirname $0)/../../cluster.rc -. $(dirname $0)/../../snapshot.rc - -cleanup; - -TEST verify_lvm_version; -#Create cluster with 3 nodes -TEST launch_cluster 3; -TEST setup_lvm 3 - -TEST $CLI_1 peer probe $H2; -TEST $CLI_1 peer probe $H3; -EXPECT_WITHIN $PROBE_TIMEOUT 2 peer_count; - -TEST $CLI_1 volume create $V0 replica 3 $H1:$L1 $H2:$L2 $H3:$L3 -TEST $CLI_1 volume set $V0 cluster.nsr on -#TEST $CLI_1 volume set $V0 diagnostics.brick-log-level DEBUG -TEST $CLI_1 volume start $V0 - -TEST glusterfs --volfile-id=$V0 --volfile-server=$H1 --entry-timeout=0 $M0; - -EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" nsrc_child_up_status $V0 0 - -echo "file" > $M0/file1 -TEST stat $L1/file1 -TEST stat $L2/file1 -TEST stat $L3/file1 - -cleanup; diff --git a/tests/volume.rc b/tests/volume.rc index 71b40b72d6..f46f8a19e6 100644 --- a/tests/volume.rc +++ b/tests/volume.rc @@ -110,7 +110,7 @@ function snap_client_connected_status { echo "$up" } -function _nsrc_child_up_status { +function _jbrc_child_up_status { local vol=$1 #brick_id is (brick-num in volume info - 1) local brick_id=$2 @@ -121,11 +121,11 @@ function _nsrc_child_up_status { echo "$up" } -function nsrc_child_up_status { +function jbrc_child_up_status { local vol=$1 #brick_id is (brick-num in volume info - 1) local brick_id=$2 - _nsrc_child_up_status $vol $brick_id generate_mount_statedump + _jbrc_child_up_status $vol $brick_id generate_mount_statedump } function _afr_child_up_status { diff --git a/xlators/experimental/Makefile.am b/xlators/experimental/Makefile.am index be53a44d4b..927e2df654 100644 --- a/xlators/experimental/Makefile.am +++ b/xlators/experimental/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = nsr-client nsr-server fdl dht2 +SUBDIRS = jbr-client jbr-server fdl dht2 CLEANFILES = diff --git a/xlators/experimental/jbr-client/Makefile.am b/xlators/experimental/jbr-client/Makefile.am new file mode 100644 index 0000000000..a985f42a87 --- /dev/null +++ b/xlators/experimental/jbr-client/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/experimental/jbr-client/src/Makefile.am b/xlators/experimental/jbr-client/src/Makefile.am new file mode 100644 index 0000000000..58f399f060 --- /dev/null +++ b/xlators/experimental/jbr-client/src/Makefile.am @@ -0,0 +1,32 @@ +xlator_LTLIBRARIES = jbrc.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/experimental + +nodist_jbrc_la_SOURCES = jbrc-cg.c +CLEANFILES = $(nodist_jbrc_la_SOURCES) + +jbrc_la_LDFLAGS = -module -avoid-version +jbrc_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = \ + $(top_srcdir)/xlators/lib/src/libxlator.h \ + $(top_srcdir)/glusterfsd/src/glusterfsd.h \ + jbrc.h jbr-messages.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) \ + -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +JBRC_PREFIX = $(top_srcdir)/xlators/experimental/jbr-client/src +JBRC_GEN_FOPS = $(JBRC_PREFIX)/gen-fops.py +JBRC_TEMPLATES = $(JBRC_PREFIX)/fop-template.c +JBRC_WRAPPER = $(JBRC_PREFIX)/jbrc.c +noinst_PYTHON = $(JBRC_GEN_FOPS) +EXTRA_DIST = $(JBRC_TEMPLATES) $(JBRC_WRAPPER) + +jbrc-cg.c: $(JBRC_GEN_FOPS) $(JBRC_TEMPLATES) $(JBRC_WRAPPER) + $(PYTHON) $(JBRC_GEN_FOPS) $(JBRC_TEMPLATES) $(JBRC_WRAPPER) > $@ + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/jbr.so diff --git a/xlators/experimental/jbr-client/src/fop-template.c b/xlators/experimental/jbr-client/src/fop-template.c new file mode 100644 index 0000000000..7719f511f0 --- /dev/null +++ b/xlators/experimental/jbr-client/src/fop-template.c @@ -0,0 +1,113 @@ +/* template-name fop */ +int32_t +jbrc_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + jbrc_local_t *local = NULL; + xlator_t *target_xl = ACTIVE_CHILD(this); + + local = mem_get(this->local_pool); + if (!local) { + goto err; + } + + local->stub = fop_@NAME@_stub (frame, jbrc_@NAME@_continue, + @SHORT_ARGS@); + if (!local->stub) { + goto err; + } + local->curr_xl = target_xl; + local->scars = 0; + + frame->local = local; + STACK_WIND_COOKIE (frame, jbrc_@NAME@_cbk, target_xl, + target_xl, target_xl->fops->@NAME@, + @SHORT_ARGS@); + return 0; + +err: + if (local) { + mem_put(local); + } + STACK_UNWIND_STRICT (@NAME@, frame, -1, ENOMEM, + @ERROR_ARGS@); + return 0; +} + +/* template-name cbk */ +int32_t +jbrc_@NAME@_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + @LONG_ARGS@) +{ + jbrc_local_t *local = frame->local; + xlator_t *last_xl = cookie; + xlator_t *next_xl; + jbrc_private_t *priv = this->private; + struct timespec spec; + + if (op_ret != (-1)) { + if (local->scars) { + gf_msg (this->name, GF_LOG_INFO, 0, J_MSG_RETRY_MSG, + HILITE("retried %p OK"), frame->local); + } + priv->active = last_xl; + goto unwind; + } + if ((op_errno != EREMOTE) && (op_errno != ENOTCONN)) { + goto unwind; + } + + /* TBD: get leader ID from xdata? */ + next_xl = next_xlator(this, last_xl); + /* + * We can't just give up after we've tried all bricks, because it's + * quite likely that a new leader election just hasn't finished yet. + * We also shouldn't retry endlessly, and especially not at a high + * rate, but that's good enough while we work on other things. + * + * TBD: implement slow/finite retry via a worker thread + */ + if (!next_xl || (local->scars >= SCAR_LIMIT)) { + gf_msg (this->name, GF_LOG_DEBUG, 0, J_MSG_RETRY_MSG, + HILITE("ran out of retries for %p"), frame->local); + goto unwind; + } + + local->curr_xl = next_xl; + local->scars += 1; + spec.tv_sec = 1; + spec.tv_nsec = 0; + /* + * WARNING + * + * Just calling gf_timer_call_after like this leaves open the + * possibility that writes will get reordered, if a first write is + * rescheduled and then a second comes along to find an updated + * priv->active before the first actually executes. We might need to + * implement a stricter (and more complicated) queuing mechanism to + * ensure absolute consistency in this case. + */ + if (gf_timer_call_after(this->ctx, spec, jbrc_retry_cb, local)) { + return 0; + } + +unwind: + call_stub_destroy(local->stub); + STACK_UNWIND_STRICT (@NAME@, frame, op_ret, op_errno, + @SHORT_ARGS@); + return 0; +} + +/* template-name cont-func */ +int32_t +jbrc_@NAME@_continue (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + jbrc_local_t *local = frame->local; + + STACK_WIND_COOKIE (frame, jbrc_@NAME@_cbk, local->curr_xl, + local->curr_xl, local->curr_xl->fops->@NAME@, + @SHORT_ARGS@); + return 0; +} diff --git a/xlators/experimental/jbr-client/src/gen-fops.py b/xlators/experimental/jbr-client/src/gen-fops.py new file mode 100755 index 0000000000..4d9451f717 --- /dev/null +++ b/xlators/experimental/jbr-client/src/gen-fops.py @@ -0,0 +1,57 @@ +#!/usr/bin/python + +import os +import re +import string +import sys + +curdir = os.path.dirname(sys.argv[0]) +gendir = os.path.join(curdir,'../../../../libglusterfs/src') +sys.path.append(gendir) +from generator import ops, fop_subs, cbk_subs, generate + +# We really want the callback argument list, even when we're generating fop +# code, so we propagate here. +# TBD: this should probably be right in generate.py +for k, v in cbk_subs.iteritems(): + fop_subs[k]['@ERROR_ARGS@'] = v['@ERROR_ARGS@'] + +# Stolen from old codegen.py +def load_templates (path): + templates = {} + tmpl_re = re.compile("/\* template-name (.*) \*/") + templates = {} + t_name = None + for line in open(path,"r").readlines(): + if not line: + break + m = tmpl_re.match(line) + if m: + if t_name: + templates[t_name] = string.join(t_contents,'') + t_name = m.group(1).strip() + t_contents = [] + elif t_name: + t_contents.append(line) + if t_name: + templates[t_name] = string.join(t_contents,'') + return templates + +# Stolen from gen_fdl.py +def gen_client (templates): + for name, value in ops.iteritems(): + if name == 'getspec': + # It's not real if it doesn't have a stub function. + continue + print generate(templates['cbk'],name,cbk_subs) + print generate(templates['cont-func'],name,fop_subs) + print generate(templates['fop'],name,fop_subs) + +tmpl = load_templates(sys.argv[1]) +for l in open(sys.argv[2],'r').readlines(): + if l.find('#pragma generate') != -1: + print "/* BEGIN GENERATED CODE - DO NOT MODIFY */" + gen_client(tmpl) + print "/* END GENERATED CODE */" + else: + print l[:-1] diff --git a/xlators/experimental/jbr-client/src/jbr-messages.h b/xlators/experimental/jbr-client/src/jbr-messages.h new file mode 100644 index 0000000000..61fa725d56 --- /dev/null +++ b/xlators/experimental/jbr-client/src/jbr-messages.h @@ -0,0 +1,105 @@ +/* + Copyright (c) 2015 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _JBR_MESSAGES_H_ +#define _JBR_MESSAGES_H_ + +#include "glfs-message-id.h" + +/* NOTE: Rules for message additions + * 1) Each instance of a message is _better_ left with a unique message ID, even + * if the message format is the same. Reasoning is that, if the message + * format needs to change in one instance, the other instances are not + * impacted or the new change does not change the ID of the instance being + * modified. + * 2) Addition of a message, + * - Should increment the GLFS_NUM_MESSAGES + * - Append to the list of messages defined, towards the end + * - Retain macro naming as glfs_msg_X (for redability across developers) + * NOTE: Rules for message format modifications + * 3) Check acorss the code if the message ID macro in question is reused + * anywhere. If reused then then the modifications should ensure correctness + * everywhere, or needs a new message ID as (1) above was not adhered to. If + * not used anywhere, proceed with the required modification. + * NOTE: Rules for message deletion + * 4) Check (3) and if used anywhere else, then cannot be deleted. If not used + * anywhere, then can be deleted, but will leave a hole by design, as + * addition rules specify modification to the end of the list and not filling + * holes. + */ + +#define JBR_COMP_BASE GLFS_MSGID_COMP_JBR +#define GLFS_NUM_MESSAGES 1 +#define GLFS_MSGID_END (JBR_COMP_BASE + GLFS_NUM_MESSAGES + 1) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define J_MSG_INIT_FAIL (JBR_COMP_BASE + 1) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define J_MSG_RETRY_MSG (JBR_COMP_BASE + 2) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define J_MSG_MEM_ERR (JBR_COMP_BASE + 3) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define J_MSG_DICT_FLR (JBR_COMP_BASE + 4) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define J_MSG_GENERIC (JBR_COMP_BASE + 5) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define J_MSG_INVALID (JBR_COMP_BASE + 6) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define J_MSG_NO_DATA (JBR_COMP_BASE + 7) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define J_MSG_SYS_CALL_FAILURE (JBR_COMP_BASE + 8) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define J_MSG_QUORUM_NOT_MET (JBR_COMP_BASE + 9) + +#endif /* _JBR_MESSAGES_H_ */ diff --git a/xlators/experimental/jbr-client/src/jbrc.c b/xlators/experimental/jbr-client/src/jbrc.c new file mode 100644 index 0000000000..9bb9346c5c --- /dev/null +++ b/xlators/experimental/jbr-client/src/jbrc.c @@ -0,0 +1,320 @@ +/* + Copyright (c) 2013 Red Hat, Inc. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "call-stub.h" +#include "defaults.h" +#include "timer.h" +#include "xlator.h" +#include "jbr-messages.h" +#include "jbrc.h" +#include "statedump.h" + +#define SCAR_LIMIT 20 +#define HILITE(x) (""x"") + +/* + * The fops are actually generated by gen-fops.py; the rest was mostly copied + * from defaults.c (commit cd253754 on 27 August 2013). + */ + +enum gf_dht_mem_types_ { + gf_mt_jbrc_private_t = gf_common_mt_end + 1, + gf_mt_jbrc_end +}; + +char *JBRC_XATTR = "user.jbr.active"; + +static inline +xlator_t * +ACTIVE_CHILD (xlator_t *parent) +{ + jbrc_private_t *priv = parent->private; + + return priv ? priv->active : FIRST_CHILD(parent); +} + +xlator_t * +next_xlator (xlator_t *this, xlator_t *prev) +{ + xlator_list_t *trav; + + for (trav = this->children; trav; trav = trav->next) { + if (trav->xlator == prev) { + return trav->next ? trav->next->xlator + : this->children->xlator; + } + } + + return NULL; +} + +void +jbrc_retry_cb (void *cb_arg) +{ + jbrc_local_t *local = cb_arg; + + gf_msg (__func__, GF_LOG_INFO, 0, J_MSG_RETRY_MSG, + HILITE("retrying %p"), local); + call_resume_wind(local->stub); +} + +#pragma generate + +int32_t +jbrc_forget (xlator_t *this, inode_t *inode) +{ + gf_msg_callingfn (this->name, GF_LOG_WARNING, 0, J_MSG_INIT_FAIL, + "xlator does not implement forget_cbk"); + return 0; +} + + +int32_t +jbrc_releasedir (xlator_t *this, fd_t *fd) +{ + gf_msg_callingfn (this->name, GF_LOG_WARNING, 0, J_MSG_INIT_FAIL, + "xlator does not implement releasedir_cbk"); + return 0; +} + +int32_t +jbrc_release (xlator_t *this, fd_t *fd) +{ + gf_msg_callingfn (this->name, GF_LOG_WARNING, 0, J_MSG_INIT_FAIL, + "xlator does not implement release_cbk"); + return 0; +} + +struct xlator_fops fops = { + .lookup = jbrc_lookup, + .stat = jbrc_stat, + .fstat = jbrc_fstat, + .truncate = jbrc_truncate, + .ftruncate = jbrc_ftruncate, + .access = jbrc_access, + .readlink = jbrc_readlink, + .mknod = jbrc_mknod, + .mkdir = jbrc_mkdir, + .unlink = jbrc_unlink, + .rmdir = jbrc_rmdir, + .symlink = jbrc_symlink, + .rename = jbrc_rename, + .link = jbrc_link, + .create = jbrc_create, + .open = jbrc_open, + .readv = jbrc_readv, + .writev = jbrc_writev, + .flush = jbrc_flush, + .fsync = jbrc_fsync, + .opendir = jbrc_opendir, + .readdir = jbrc_readdir, + .readdirp = jbrc_readdirp, + .fsyncdir = jbrc_fsyncdir, + .statfs = jbrc_statfs, + .setxattr = jbrc_setxattr, + .getxattr = jbrc_getxattr, + .fsetxattr = jbrc_fsetxattr, + .fgetxattr = jbrc_fgetxattr, + .removexattr = jbrc_removexattr, + .fremovexattr = jbrc_fremovexattr, + .lk = jbrc_lk, + .inodelk = jbrc_inodelk, + .finodelk = jbrc_finodelk, + .entrylk = jbrc_entrylk, + .fentrylk = jbrc_fentrylk, + .rchecksum = jbrc_rchecksum, + .xattrop = jbrc_xattrop, + .fxattrop = jbrc_fxattrop, + .setattr = jbrc_setattr, + .fsetattr = jbrc_fsetattr, + .fallocate = jbrc_fallocate, + .discard = jbrc_discard, +}; + +struct xlator_cbks cbks = { +}; + + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO ("jbrc", this, out); + + ret = xlator_mem_acct_init (this, gf_mt_jbrc_end + 1); + + if (ret != 0) { + gf_msg (this->name, GF_LOG_ERROR, ENOMEM, J_MSG_MEM_ERR, + "Memory accounting init failed"); + return ret; + } +out: + return ret; +} + + +int32_t +jbrc_init (xlator_t *this) +{ + jbrc_private_t *priv = NULL; + xlator_list_t *trav = NULL; + + this->local_pool = mem_pool_new (jbrc_local_t, 128); + if (!this->local_pool) { + gf_msg (this->name, GF_LOG_ERROR, ENOMEM, J_MSG_MEM_ERR, + "failed to create jbrc_local_t pool"); + goto err; + } + + priv = GF_CALLOC (1, sizeof (*priv), gf_mt_jbrc_private_t); + if (!priv) { + goto err; + } + + for (trav = this->children; trav; trav = trav->next) { + ++(priv->n_children); + } + + priv->active = FIRST_CHILD(this); + this->private = priv; + return 0; + +err: + if (priv) { + GF_FREE(priv); + } + return -1; +} + +void +jbrc_fini (xlator_t *this) +{ + GF_FREE(this->private); +} + +int +jbrc_get_child_index (xlator_t *this, xlator_t *kid) +{ + xlator_list_t *trav; + int retval = -1; + + for (trav = this->children; trav; trav = trav->next) { + ++retval; + if (trav->xlator == kid) { + return retval; + } + } + + return -1; +} + +uint8_t +jbrc_count_up_kids (jbrc_private_t *priv) +{ + uint8_t retval = 0; + uint8_t i; + + for (i = 0; i < priv->n_children; ++i) { + if (priv->kid_state & (1 << i)) { + ++retval; + } + } + + return retval; +} + +int32_t +jbrc_notify (xlator_t *this, int32_t event, void *data, ...) +{ + int32_t ret = 0; + int32_t index = 0; + jbrc_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO (THIS->name, this, out); + priv = this->private; + GF_VALIDATE_OR_GOTO (this->name, priv, out); + + switch (event) { + case GF_EVENT_CHILD_UP: + index = jbrc_get_child_index(this, data); + if (index >= 0) { + priv->kid_state |= (1 << index); + priv->up_children = jbrc_count_up_kids(priv); + gf_msg (this->name, GF_LOG_INFO, 0, J_MSG_GENERIC, + "got CHILD_UP for %s, now %u kids", + ((xlator_t *)data)->name, + priv->up_children); + } + ret = default_notify (this, event, data); + break; + case GF_EVENT_CHILD_DOWN: + index = jbrc_get_child_index(this, data); + if (index >= 0) { + priv->kid_state &= ~(1 << index); + priv->up_children = jbrc_count_up_kids(priv); + gf_msg (this->name, GF_LOG_INFO, 0, J_MSG_GENERIC, + "got CHILD_DOWN for %s, now %u kids", + ((xlator_t *)data)->name, + priv->up_children); + } + break; + default: + ret = default_notify (this, event, data); + } + +out: + return ret; +} + +int +jbrc_priv_dump (xlator_t *this) +{ + jbrc_private_t *priv = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + xlator_list_t *trav = NULL; + int32_t i = -1; + + GF_VALIDATE_OR_GOTO (THIS->name, this, out); + priv = this->private; + GF_VALIDATE_OR_GOTO (this->name, priv, out); + + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", + this->type, this->name); + gf_proc_dump_add_section(key_prefix); + + gf_proc_dump_write("up_children", "%u", priv->up_children); + + for (trav = this->children, i = 0; trav; trav = trav->next, i++) { + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "child_%d", i); + gf_proc_dump_write(key_prefix, "%s", trav->xlator->name); + } + +out: + return 0; +} + +struct xlator_dumpops dumpops = { + .priv = jbrc_priv_dump, +}; + +class_methods_t class_methods = { + .init = jbrc_init, + .fini = jbrc_fini, + .notify = jbrc_notify, +}; + +struct volume_options options[] = { + { .key = {NULL} }, +}; diff --git a/xlators/experimental/jbr-client/src/jbrc.h b/xlators/experimental/jbr-client/src/jbrc.h new file mode 100644 index 0000000000..c83259ca1b --- /dev/null +++ b/xlators/experimental/jbr-client/src/jbrc.h @@ -0,0 +1,27 @@ +/* + Copyright (c) 2016 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _JBRC_H_ +#define _JBRC_H_ + +typedef struct { + xlator_t *active; + uint8_t up_children; + uint8_t n_children; + uint32_t kid_state; +} jbrc_private_t; + +typedef struct { + call_stub_t *stub; + xlator_t *curr_xl; + uint16_t scars; +} jbrc_local_t; + +#endif /* _JBRC_H_ */ diff --git a/xlators/experimental/jbr-server/Makefile.am b/xlators/experimental/jbr-server/Makefile.am new file mode 100644 index 0000000000..a985f42a87 --- /dev/null +++ b/xlators/experimental/jbr-server/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/experimental/jbr-server/src/Makefile.am b/xlators/experimental/jbr-server/src/Makefile.am new file mode 100644 index 0000000000..66f73ba8c9 --- /dev/null +++ b/xlators/experimental/jbr-server/src/Makefile.am @@ -0,0 +1,35 @@ +xlator_LTLIBRARIES = jbr.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/experimental + +nodist_jbr_la_SOURCES = jbr-cg.c +CLEANFILES = $(nodist_jbr_la_SOURCES) + +jbr_la_LDFLAGS = -module -avoid-version +jbr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(top_builddir)/api/src/libgfapi.la + +noinst_HEADERS = jbr-internal.h \ + $(top_srcdir)/xlators/lib/src/libxlator.h \ + $(top_srcdir)/glusterfsd/src/glusterfsd.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) \ + -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/xlators/lib/src \ + -I$(top_srcdir)/rpc/rpc-lib/src -DSBIN_DIR=\"$(sbindir)\" \ + -I$(top_srcdir)/api/src -DJBR_SCRIPT_PREFIX=\"$(jbrdir)\" \ + -I$(top_srcdir)/xlators/experimental/jbr-client/src/ + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +JBR_PREFIX = $(top_srcdir)/xlators/experimental/jbr-server/src +JBR_GEN_FOPS = $(JBR_PREFIX)/gen-fops.py +JBR_TEMPLATES = $(JBR_PREFIX)/all-templates.c +JBR_WRAPPER = $(JBR_PREFIX)/jbr.c +noinst_PYTHON = $(JBR_GEN_FOPS) +EXTRA_DIST = $(JBR_TEMPLATES) $(JBR_WRAPPER) + +jbr-cg.c: $(JBR_GEN_FOPS) $(JBR_TEMPLATES) $(JBR_WRAPPER) + $(PYTHON) $(JBR_GEN_FOPS) $(JBR_TEMPLATES) $(JBR_WRAPPER) > $@ + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/jbr.so diff --git a/xlators/experimental/jbr-server/src/all-templates.c b/xlators/experimental/jbr-server/src/all-templates.c new file mode 100644 index 0000000000..9b9a3e0be5 --- /dev/null +++ b/xlators/experimental/jbr-server/src/all-templates.c @@ -0,0 +1,437 @@ +/* + * You can put anything here - it doesn't even have to be a comment - and it + * will be ignored until we reach the first template-name comment. + */ + + +/* template-name read-fop */ +int32_t +jbr_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + jbr_private_t *priv = this->private; + gf_boolean_t in_recon = _gf_false; + int32_t recon_term, recon_index; + + /* allow reads during reconciliation * + * TBD: allow "dirty" reads on non-leaders * + */ + if (xdata && + (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) && + (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) { + in_recon = _gf_true; + } + + if ((!priv->leader) && (in_recon == _gf_false)) { + goto err; + } + + STACK_WIND (frame, default_@NAME@_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@, + @SHORT_ARGS@); + return 0; + +err: + STACK_UNWIND_STRICT (@NAME@, frame, -1, EREMOTE, + @ERROR_ARGS@); + return 0; +} + +/* template-name read-dispatch */ +/* No "dispatch" function needed for @NAME@ */ + +/* template-name read-fan-in */ +/* No "fan-in" function needed for @NAME@ */ + +/* template-name read-continue */ +/* No "continue" function needed for @NAME@ */ + +/* template-name read-complete */ +/* No "complete" function needed for @NAME@ */ + +/* template-name write-fop */ +int32_t +jbr_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + jbr_local_t *local = NULL; + jbr_private_t *priv = this->private; + gf_boolean_t result = _gf_false; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + uint32_t ti = 0; + + /* + * Our first goal here is to avoid "split brain surprise" for users who + * specify exactly 50% with two- or three-way replication. That means + * either a more-than check against half the total replicas or an + * at-least check against half of our peers (one less). Of the two, + * only an at-least check supports the intuitive use of 100% to mean + * all replicas must be present, because "more than 100%" will never + * succeed regardless of which count we use. This leaves us with a + * slightly non-traditional definition of quorum ("at least X% of peers + * not including ourselves") but one that's useful enough to be worth + * it. + * + * Note that n_children and up_children *do* include the local + * subvolume, so we need to subtract one in each case. + */ + if (priv->leader) { + result = fop_quorum_check (this, (double)(priv->n_children - 1), + (double)(priv->up_children - 1)); + + if (result == _gf_false) { + /* Emulate the AFR client-side-quorum behavior. */ + gf_msg (this->name, GF_LOG_ERROR, EROFS, + J_MSG_QUORUM_NOT_MET, "Sufficient number of " + "subvolumes are not up to meet quorum."); + op_errno = EROFS; + goto err; + } + } else { + if (xdata) { + from_leader = !!dict_get(xdata, JBR_TERM_XATTR); + from_recon = !!dict_get(xdata, RECON_TERM_XATTR) + && !!dict_get(xdata, RECON_INDEX_XATTR); + } else { + from_leader = from_recon = _gf_false; + } + + /* follower/recon path * + * just send it to local node * + */ + if (!from_leader && !from_recon) { + op_errno = EREMOTE; + goto err; + } + } + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if defined(JBR_CG_NEED_FD) + local->fd = fd_ref(fd); +#else + local->fd = NULL; +#endif + INIT_LIST_HEAD(&local->qlinks); + frame->local = local; + + /* + * If we let it through despite not being the leader, then we just want + * to pass it on down without all of the additional xattrs, queuing, and + * so on. However, jbr_*_complete does depend on the initialization + * immediately above this. + */ + if (!priv->leader) { + STACK_WIND (frame, jbr_@NAME@_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@, + @SHORT_ARGS@); + return 0; + } + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_msg (this->name, GF_LOG_ERROR, ENOMEM, + J_MSG_MEM_ERR, "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata, JBR_TERM_XATTR, priv->current_term) != 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_DICT_FLR, "failed to set jbr-term"); + goto err; + } + + LOCK(&priv->index_lock); + ti = ++(priv->index); + UNLOCK(&priv->index_lock); + if (dict_set_int32(xdata, JBR_INDEX_XATTR, ti) != 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_DICT_FLR, "failed to set index"); + goto err; + } + + local->stub = fop_@NAME@_stub (frame, jbr_@NAME@_continue, + @SHORT_ARGS@); + if (!local->stub) { + goto err; + } + + +#if defined(JBR_CG_QUEUE) + jbr_inode_ctx_t *ictx = jbr_get_inode_ctx(this, fd->inode); + + if (!ictx) { + op_errno = EIO; + goto err; + } + LOCK(&ictx->lock); + if (ictx->active) { + gf_msg_debug (this->name, 0, + "queuing request due to conflict"); + /* + * TBD: enqueue only for real conflict + * + * Currently we just act like all writes are in + * conflict with one another. What we should really do + * is check the active/pending queues and defer only if + * there's a conflict there. + * + * It's important to check the pending queue because we + * might have an active request X which conflicts with + * a pending request Y, and this request Z might + * conflict with Y but not X. If we checked only the + * active queue then Z could jump ahead of Y, which + * would be incorrect. + */ + local->qstub = fop_@NAME@_stub (frame, + jbr_@NAME@_dispatch, + @SHORT_ARGS@); + if (!local->qstub) { + UNLOCK(&ictx->lock); + goto err; + } + list_add_tail(&local->qlinks, &ictx->pqueue); + ++(ictx->pending); + UNLOCK(&ictx->lock); + return 0; + } else { + list_add_tail(&local->qlinks, &ictx->aqueue); + ++(ictx->active); + } + UNLOCK(&ictx->lock); +#endif + + return jbr_@NAME@_dispatch (frame, this, @SHORT_ARGS@); + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->qstub) { + call_stub_destroy(local->qstub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (@NAME@, frame, -1, op_errno, + @ERROR_ARGS@); + return 0; +} + +/* template-name write-dispatch */ +int32_t +jbr_@NAME@_dispatch (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + jbr_local_t *local = frame->local; + jbr_private_t *priv = this->private; + xlator_list_t *trav; + + /* + * TBD: unblock pending request(s) if we fail after this point but + * before we get to jbr_@NAME@_complete (where that code currently + * resides). + */ + + local->call_count = priv->n_children - 1; + local->successful_acks = 0; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, jbr_@NAME@_fan_in, + trav->xlator, trav->xlator->fops->@NAME@, + @SHORT_ARGS@); + } + + /* TBD: variable Issue count */ + return 0; +} + +/* template-name write-fan-in */ +int32_t +jbr_@NAME@_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + @LONG_ARGS@) +{ + jbr_local_t *local = frame->local; + uint8_t call_count; + + gf_msg_trace (this->name, 0, "op_ret = %d, op_errno = %d\n", + op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + if (op_ret != -1) { + /* Increment the number of successful acks * + * received for the operation. * + */ + (local->successful_acks)++; + local->successful_op_ret = op_ret; + } + gf_msg_debug (this->name, 0, "succ_acks = %d, op_ret = %d, op_errno = %d\n", + op_ret, op_errno, local->successful_acks); + UNLOCK(&frame->lock); + + /* TBD: variable Completion count */ + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +/* template-name write-continue */ +int32_t +jbr_@NAME@_continue (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + int32_t ret = -1; + gf_boolean_t result = _gf_false; + jbr_local_t *local = NULL; + jbr_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO ("jbr", this, out); + GF_VALIDATE_OR_GOTO (this->name, frame, out); + priv = this->private; + local = frame->local; + GF_VALIDATE_OR_GOTO (this->name, priv, out); + GF_VALIDATE_OR_GOTO (this->name, local, out); + + /* Perform quorum check to see if the leader needs * + * to perform the operation. If the operation will not * + * meet quorum irrespective of the leader's result * + * there is no point in the leader performing the fop * + */ + result = fop_quorum_check (this, (double)priv->n_children, + (double)local->successful_acks + 1); + if (result == _gf_false) { + gf_msg (this->name, GF_LOG_ERROR, EROFS, + J_MSG_QUORUM_NOT_MET, "Didn't receive enough acks " + "to meet quorum. Failing the operation without trying " + "it on the leader."); + STACK_UNWIND_STRICT (@NAME@, frame, -1, EROFS, + @ERROR_ARGS@); + } else { + STACK_WIND (frame, jbr_@NAME@_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@, + @SHORT_ARGS@); + } + + ret = 0; +out: + return ret; +} + +/* template-name write-complete */ +int32_t +jbr_@NAME@_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + @LONG_ARGS@) +{ + gf_boolean_t result = _gf_false; + jbr_private_t *priv = this->private; + + jbr_local_t *local = frame->local; + + /* If the fop failed on the leader, then reduce one succesful ack + * before calculating the fop quorum + */ + LOCK(&frame->lock); + if (op_ret == -1) + (local->successful_acks)--; + UNLOCK(&frame->lock); + +#if defined(JBR_CG_QUEUE) + jbr_inode_ctx_t *ictx; + jbr_local_t *next; + + if (local->qlinks.next != &local->qlinks) { + list_del(&local->qlinks); + ictx = jbr_get_inode_ctx(this, local->fd->inode); + if (ictx) { + LOCK(&ictx->lock); + if (ictx->pending) { + /* + * TBD: dequeue *all* non-conflicting + * reqs + * + * With the stub implementation there + * can only be one request active at a + * time (zero here) so it's not an + * issue. In a real implementation + * there might still be other active + * requests to check against, and + * multiple pending requests that could + * continue. + */ + gf_msg_debug (this->name, 0, + "unblocking next request"); + --(ictx->pending); + next = list_entry (ictx->pqueue.next, + jbr_local_t, qlinks); + list_del(&next->qlinks); + list_add_tail(&next->qlinks, + &ictx->aqueue); + call_resume(next->qstub); + } else { + --(ictx->active); + } + UNLOCK(&ictx->lock); + } + } +#endif + +#if defined(JBR_CG_FSYNC) + jbr_mark_fd_dirty(this, local); +#endif + +#if defined(JBR_CG_NEED_FD) + fd_unref(local->fd); +#endif + + /* After the leader completes the fop, a quorum check is * + * performed, taking into account the outcome of the fop * + * on the leader. Irrespective of the fop being successful * + * or failing on the leader, the result of the quorum will * + * determine if the overall fop is successful or not. For * + * example, a fop might have succeeded on every node except * + * the leader, in which case as quorum is being met, the fop * + * will be treated as a successful fop, even though it failed * + * on the leader. On follower nodes, no quorum check should * + * be done, and the result is returned to the leader as is. * + */ + if (priv->leader) { + result = fop_quorum_check (this, (double)priv->n_children, + (double)local->successful_acks + 1); + if (result == _gf_false) { + op_ret = -1; + op_errno = EROFS; + gf_msg (this->name, GF_LOG_ERROR, EROFS, + J_MSG_QUORUM_NOT_MET, "Quorum is not met. " + "The operation has failed."); + } else { +#if defined(JBR_CG_NEED_FD) + op_ret = local->successful_op_ret; +#else + op_ret = 0; +#endif + op_errno = 0; + gf_msg_debug (this->name, 0, + "Quorum has met. The operation has succeeded."); + } + } + + STACK_UNWIND_STRICT (@NAME@, frame, op_ret, op_errno, + @SHORT_ARGS@); + + + return 0; + +} diff --git a/xlators/experimental/jbr-server/src/gen-fops.py b/xlators/experimental/jbr-server/src/gen-fops.py new file mode 100755 index 0000000000..64cbe4f760 --- /dev/null +++ b/xlators/experimental/jbr-server/src/gen-fops.py @@ -0,0 +1,138 @@ +#!/usr/bin/python + +# This script generates the boilerplate versions of most fops and cbks in the +# server. This allows the details of leadership-status checking, sequencing +# between leader and followers (including fan-out), and basic error checking +# to be centralized one place, with per-operation code kept to a minimum. + +import os +import re +import string +import sys + +curdir = os.path.dirname(sys.argv[0]) +gendir = os.path.join(curdir,'../../../../libglusterfs/src') +sys.path.append(gendir) +from generator import ops, fop_subs, cbk_subs, generate + +# We really want the callback argument list, even when we're generating fop +# code, so we propagate here. +# TBD: this should probably be right in generate.py +for k, v in cbk_subs.iteritems(): + fop_subs[k]['@ERROR_ARGS@'] = v['@ERROR_ARGS@'] + +# Stolen from old codegen.py +def load_templates (path): + templates = {} + tmpl_re = re.compile("/\* template-name (.*) \*/") + templates = {} + t_name = None + for line in open(path,"r").readlines(): + if not line: + break + m = tmpl_re.match(line) + if m: + if t_name: + templates[t_name] = string.join(t_contents,'') + t_name = m.group(1).strip() + t_contents = [] + elif t_name: + t_contents.append(line) + if t_name: + templates[t_name] = string.join(t_contents,'') + return templates + +# We need two types of templates. The first, for pure read operations, just +# needs to do a simple am-i-leader check (augmented to allow dirty reads). +# The second, for pure writes, needs to do fan-out to followers between those +# initial checks and local execution. There are other operations that don't +# fit neatly into either category - e.g. lock ops or fsync - so we'll just have +# to handle those manually. The table thus includes entries only for those we +# can categorize. The special cases, plus any new operations we've never even +# heard of, aren't in there. +# +# Various keywords can be used to define/undefine preprocessor symbols used +# in the templates, on a per-function basis. For example, if the keyword here +# is "fsync" (lowercase word or abbreviation) that will cause JBR_CG_FSYNC +# (prefix plus uppercase version) to be defined above all of the generated code +# for that fop. + +fop_table = { + "access": "read", + "create": "write", + "discard": "write", +# "entrylk": "read", + "fallocate": "write", +# "fentrylk": "read", + "fgetxattr": "read", +# "finodelk": "read", +# "flush": "read", + "fremovexattr": "write", + "fsetattr": "write", + "fsetxattr": "write", + "fstat": "read", +# "fsync": "read", +# "fsyncdir": "read", + "ftruncate": "write", + "fxattrop": "write", + "getxattr": "read", +# "inodelk": "read", + "link": "write", +# "lk": "read", +# "lookup": "read", + "mkdir": "write", + "mknod": "write", + "open": "write", + "opendir": "read", + "rchecksum": "read", + "readdir": "read", + "readdirp": "read", + "readlink": "read", + "readv": "read", + "removexattr": "write", + "rename": "write", + "rmdir": "write", + "setattr": "write", + "setxattr": "write", + "stat": "read", + "statfs": "read", + "symlink": "write", + "truncate": "write", + "unlink": "write", + "writev": "write,fsync,queue", + "xattrop": "write", +} + +# Stolen from gen_fdl.py +def gen_server (templates): + fops_done = [] + for name in fop_table.keys(): + info = fop_table[name].split(",") + kind = info[0] + flags = info[1:] + if ("fsync" in flags) or ("queue" in flags): + flags.append("need_fd") + for fname in flags: + print "#define JBR_CG_%s" % fname.upper() + print generate(templates[kind+"-complete"],name,cbk_subs) + print generate(templates[kind+"-continue"],name,fop_subs) + print generate(templates[kind+"-fan-in"],name,cbk_subs) + print generate(templates[kind+"-dispatch"],name,fop_subs) + print generate(templates[kind+"-fop"],name,fop_subs) + for fname in flags: + print "#undef JBR_CG_%s" % fname.upper() + fops_done.append(name) + # Just for fun, emit the fops table too. + print("struct xlator_fops fops = {") + for x in fops_done: + print(" .%s = jbr_%s,"%(x,x)) + print("};") + +tmpl = load_templates(sys.argv[1]) +for l in open(sys.argv[2],'r').readlines(): + if l.find('#pragma generate') != -1: + print "/* BEGIN GENERATED CODE - DO NOT MODIFY */" + gen_server(tmpl) + print "/* END GENERATED CODE */" + else: + print l[:-1] diff --git a/xlators/experimental/jbr-server/src/jbr-internal.h b/xlators/experimental/jbr-server/src/jbr-internal.h new file mode 100644 index 0000000000..ab1dfc16de --- /dev/null +++ b/xlators/experimental/jbr-server/src/jbr-internal.h @@ -0,0 +1,116 @@ +/* + Copyright (c) 2013 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include +#include + +#define LEADER_XATTR "user.jbr.leader" +#define SECOND_CHILD(xl) (xl->children->next->xlator) +#define RECONCILER_PATH JBR_SCRIPT_PREFIX"/reconciler.py" +#define CHANGELOG_ENTRY_SIZE 128 + +enum { + gf_mt_jbr_private_t = gf_common_mt_end + 1, + gf_mt_jbr_fd_ctx_t, + gf_mt_jbr_inode_ctx_t, + gf_mt_jbr_dirty_t, + gf_mt_jbr_end +}; + +typedef enum jbr_recon_notify_ev_id_t { + JBR_RECON_SET_LEADER = 1, + JBR_RECON_ADD_CHILD = 2 +} jbr_recon_notify_ev_id_t; + +typedef struct _jbr_recon_notify_ev_s { + jbr_recon_notify_ev_id_t id; + uint32_t index; /* in case of add */ + struct list_head list; +} jbr_recon_notify_ev_t; + +typedef struct { + /* + * This is a hack to allow a non-leader to accept requests while the + * leader is down, and it only works for n=2. The way it works is that + * "config_leader" indicates the state from our options (via init or + * reconfigure) but "leader" is what the fop code actually looks at. If + * config_leader is true, then leader will *always* be true as well, + * giving that brick precedence. If config_leader is false, then + * leader will only be true if there is no connection to the other + * brick (tracked in jbr_notify). + * + * TBD: implement real leader election + */ + gf_boolean_t config_leader; + gf_boolean_t leader; + uint8_t up_children; + uint8_t n_children; + char *vol_file; + uint32_t current_term; + uint32_t kid_state; + gf_lock_t dirty_lock; + struct list_head dirty_fds; + uint32_t index; + gf_lock_t index_lock; + double quorum_pct; + int term_fd; + long term_total; + long term_read; + /* + * This is a super-duper hack, but it will do for now. The reason it's + * a hack is that we pass this to dict_set_static_bin, so we don't have + * to mess around with allocating and freeing it on every single IPC + * request, but it's totally not thread-safe. On the other hand, there + * should only be one reconciliation thread running and calling these + * functions at a time, so maybe that doesn't matter. + * + * TBD: re-evaluate how to manage this + */ + char term_buf[CHANGELOG_ENTRY_SIZE]; + gf_boolean_t child_up; /* To maintain the state of * + * the translator */ +} jbr_private_t; + +typedef struct { + call_stub_t *stub; + call_stub_t *qstub; + uint32_t call_count; + uint32_t successful_acks; + uint32_t successful_op_ret; + fd_t *fd; + struct list_head qlinks; +} jbr_local_t; + +/* + * This should match whatever changelog returns on the pre-op for us to pass + * when we're ready for our post-op. + */ +typedef uint32_t log_id_t; + +typedef struct { + struct list_head links; + log_id_t id; +} jbr_dirty_list_t; + +typedef struct { + fd_t *fd; + struct list_head dirty_list; + struct list_head fd_list; +} jbr_fd_ctx_t; + +typedef struct { + gf_lock_t lock; + uint32_t active; + struct list_head aqueue; + uint32_t pending; + struct list_head pqueue; +} jbr_inode_ctx_t; + +void jbr_start_reconciler (xlator_t *this); diff --git a/xlators/experimental/jbr-server/src/jbr.c b/xlators/experimental/jbr-server/src/jbr.c new file mode 100644 index 0000000000..984392c2f8 --- /dev/null +++ b/xlators/experimental/jbr-server/src/jbr.c @@ -0,0 +1,1147 @@ +/* + Copyright (c) 2013 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include "call-stub.h" +#include "defaults.h" +#include "xlator.h" +#include "glfs.h" +#include "glfs-internal.h" +#include "run.h" +#include "common-utils.h" +#include "syncop.h" +#include "syscall.h" +#include "compat-errno.h" + +#include "jbr-internal.h" +#include "jbr-messages.h" + +#define JBR_FLUSH_INTERVAL 5 + +enum { + /* echo "cluster/jbr-server" | md5sum | cut -c 1-8 */ + JBR_SERVER_IPC_BASE = 0x0e2d66a5, + JBR_SERVER_TERM_RANGE, + JBR_SERVER_OPEN_TERM, + JBR_SERVER_NEXT_ENTRY +}; + +/* Used to check the quorum of acks received after the fop + * confirming the status of the fop on all the brick processes + * for this particular subvolume + */ +gf_boolean_t +fop_quorum_check (xlator_t *this, double n_children, + double current_state) +{ + jbr_private_t *priv = NULL; + gf_boolean_t result = _gf_false; + double required = 0; + double current = 0; + + GF_VALIDATE_OR_GOTO ("jbr", this, out); + priv = this->private; + GF_VALIDATE_OR_GOTO (this->name, priv, out); + + required = n_children * priv->quorum_pct; + + /* + * Before performing the fop on the leader, we need to check, + * if there is any merit in performing the fop on the leader. + * In a case, where even a successful write on the leader, will + * not meet quorum, there is no point in trying the fop on the + * leader. + * When this function is called after the leader has tried + * performing the fop, this check will calculate quorum taking into + * account the status of the fop on the leader. If the leader's + * op_ret was -1, the complete function would account that by + * decrementing successful_acks by 1 + */ + + current = current_state * 100.0; + + if (current < required) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_QUORUM_NOT_MET, + "Quorum not met. quorum_pct = %f " + "Current State = %f, Required State = %f", + priv->quorum_pct, current, + required); + } else + result = _gf_true; + +out: + return result; +} + +jbr_inode_ctx_t * +jbr_get_inode_ctx (xlator_t *this, inode_t *inode) +{ + uint64_t ctx_int = 0LL; + jbr_inode_ctx_t *ctx_ptr; + + if (__inode_ctx_get(inode, this, &ctx_int) == 0) { + ctx_ptr = (jbr_inode_ctx_t *)(long)ctx_int; + } else { + ctx_ptr = GF_CALLOC (1, sizeof(*ctx_ptr), + gf_mt_jbr_inode_ctx_t); + if (ctx_ptr) { + ctx_int = (uint64_t)(long)ctx_ptr; + if (__inode_ctx_set(inode, this, &ctx_int) == 0) { + LOCK_INIT(&ctx_ptr->lock); + INIT_LIST_HEAD(&ctx_ptr->aqueue); + INIT_LIST_HEAD(&ctx_ptr->pqueue); + } else { + GF_FREE(ctx_ptr); + ctx_ptr = NULL; + } + } + + } + + return ctx_ptr; +} + +jbr_fd_ctx_t * +jbr_get_fd_ctx (xlator_t *this, fd_t *fd) +{ + uint64_t ctx_int = 0LL; + jbr_fd_ctx_t *ctx_ptr; + + if (__fd_ctx_get(fd, this, &ctx_int) == 0) { + ctx_ptr = (jbr_fd_ctx_t *)(long)ctx_int; + } else { + ctx_ptr = GF_CALLOC (1, sizeof(*ctx_ptr), gf_mt_jbr_fd_ctx_t); + if (ctx_ptr) { + if (__fd_ctx_set(fd, this, (uint64_t)ctx_ptr) == 0) { + INIT_LIST_HEAD(&ctx_ptr->dirty_list); + INIT_LIST_HEAD(&ctx_ptr->fd_list); + } else { + GF_FREE(ctx_ptr); + ctx_ptr = NULL; + } + } + + } + + return ctx_ptr; +} + +void +jbr_mark_fd_dirty (xlator_t *this, jbr_local_t *local) +{ + fd_t *fd = local->fd; + jbr_fd_ctx_t *ctx_ptr; + jbr_dirty_list_t *dirty; + jbr_private_t *priv = this->private; + + /* + * TBD: don't do any of this for O_SYNC/O_DIRECT writes. + * Unfortunately, that optimization requires that we distinguish + * between writev and other "write" calls, saving the original flags + * and checking them in the callback. Too much work for too little + * gain right now. + */ + + LOCK(&fd->lock); + ctx_ptr = jbr_get_fd_ctx(this, fd); + dirty = GF_CALLOC(1, sizeof(*dirty), gf_mt_jbr_dirty_t); + if (ctx_ptr && dirty) { + gf_msg_trace (this->name, 0, + "marking fd %p as dirty (%p)", fd, dirty); + /* TBD: fill dirty->id from what changelog gave us */ + list_add_tail(&dirty->links, &ctx_ptr->dirty_list); + if (list_empty(&ctx_ptr->fd_list)) { + /* Add a ref so _release doesn't get called. */ + ctx_ptr->fd = fd_ref(fd); + LOCK(&priv->dirty_lock); + list_add_tail (&ctx_ptr->fd_list, + &priv->dirty_fds); + UNLOCK(&priv->dirty_lock); + } + } else { + gf_msg (this->name, GF_LOG_ERROR, ENOMEM, + J_MSG_MEM_ERR, "could not mark %p dirty", fd); + if (ctx_ptr) { + GF_FREE(ctx_ptr); + } + if (dirty) { + GF_FREE(dirty); + } + } + UNLOCK(&fd->lock); +} + +#define JBR_TERM_XATTR "trusted.jbr.term" +#define JBR_INDEX_XATTR "trusted.jbr.index" +#define JBR_REP_COUNT_XATTR "trusted.jbr.rep-count" +#define RECON_TERM_XATTR "trusted.jbr.recon-term" +#define RECON_INDEX_XATTR "trusted.jbr.recon-index" + +#pragma generate + +uint8_t +jbr_count_up_kids (jbr_private_t *priv) +{ + uint8_t retval = 0; + uint8_t i; + + for (i = 0; i < priv->n_children; ++i) { + if (priv->kid_state & (1 << i)) { + ++retval; + } + } + + return retval; +} + +/* + * The fsync machinery looks a lot like that for any write call, but there are + * some important differences that are easy to miss. First, we don't care + * about the xdata that shows whether the call came from a leader or + * reconciliation process. If we're the leader we fan out; if we're not we + * don't. Second, we don't wait for followers before we issue the local call. + * The code generation system could be updated to handle this, and still might + * if we need to implement other "almost identical" paths (e.g. for open), but + * a copy is more readable as long as it's just one. + */ + +int32_t +jbr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + jbr_local_t *local = frame->local; + gf_boolean_t unwind; + + LOCK(&frame->lock); + unwind = !--(local->call_count); + UNLOCK(&frame->lock); + + if (unwind) { + STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + } + return 0; +} + +int32_t +jbr_fsync_local_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + jbr_dirty_list_t *dirty; + jbr_dirty_list_t *dtmp; + jbr_local_t *local = frame->local; + + list_for_each_entry_safe (dirty, dtmp, &local->qlinks, links) { + gf_msg_trace (this->name, 0, + "sending post-op on %p (%p)", local->fd, dirty); + GF_FREE(dirty); + } + + return jbr_fsync_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); +} + +int32_t +jbr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata) +{ + jbr_private_t *priv = this->private; + jbr_local_t *local; + uint64_t ctx_int = 0LL; + jbr_fd_ctx_t *ctx_ptr; + xlator_list_t *trav; + + local = mem_get0(this->local_pool); + if (!local) { + STACK_UNWIND_STRICT(fsync, frame, -1, ENOMEM, + NULL, NULL, xdata); + return 0; + } + INIT_LIST_HEAD(&local->qlinks); + frame->local = local; + + /* Move the dirty list from the fd to the fsync request. */ + LOCK(&fd->lock); + if (__fd_ctx_get(fd, this, &ctx_int) == 0) { + ctx_ptr = (jbr_fd_ctx_t *)(long)ctx_int; + list_splice_init (&ctx_ptr->dirty_list, + &local->qlinks); + } + UNLOCK(&fd->lock); + + /* Issue the local call. */ + local->call_count = priv->leader ? priv->n_children : 1; + STACK_WIND (frame, jbr_fsync_local_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, + fd, flags, xdata); + + /* Issue remote calls if we're the leader. */ + if (priv->leader) { + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, jbr_fsync_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, + fd, flags, xdata); + } + } + + return 0; +} + +int32_t +jbr_getxattr_special (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + dict_t *result; + jbr_private_t *priv = this->private; + + if (!priv->leader) { + STACK_UNWIND_STRICT (getxattr, frame, -1, EREMOTE, NULL, NULL); + return 0; + } + + if (!name || (strcmp(name, JBR_REP_COUNT_XATTR) != 0)) { + STACK_WIND_TAIL (frame, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + loc, name, xdata); + return 0; + } + + result = dict_new(); + if (!result) { + goto dn_failed; + } + + priv->up_children = jbr_count_up_kids(this->private); + if (dict_set_uint32(result, JBR_REP_COUNT_XATTR, + priv->up_children) != 0) { + goto dsu_failed; + } + + STACK_UNWIND_STRICT (getxattr, frame, 0, 0, result, NULL); + dict_destroy(result); + return 0; + +dsu_failed: + dict_destroy(result); +dn_failed: + STACK_UNWIND_STRICT (getxattr, frame, -1, ENOMEM, NULL, NULL); + return 0; +} + +void +jbr_flush_fd (xlator_t *this, jbr_fd_ctx_t *fd_ctx) +{ + jbr_dirty_list_t *dirty; + jbr_dirty_list_t *dtmp; + + list_for_each_entry_safe (dirty, dtmp, &fd_ctx->dirty_list, links) { + gf_msg_trace (this->name, 0, + "sending post-op on %p (%p)", fd_ctx->fd, dirty); + GF_FREE(dirty); + } + + INIT_LIST_HEAD(&fd_ctx->dirty_list); +} + +void * +jbr_flush_thread (void *ctx) +{ + xlator_t *this = ctx; + jbr_private_t *priv = this->private; + struct list_head dirty_fds; + jbr_fd_ctx_t *fd_ctx; + jbr_fd_ctx_t *fd_tmp; + int ret; + + for (;;) { + /* + * We have to be very careful to avoid lock inversions here, so + * we can't just hold priv->dirty_lock while we take and + * release locks for each fd. Instead, we only hold dirty_lock + * at the beginning of each iteration, as we (effectively) make + * a copy of the current list head and then clear the original. + * This leads to four scenarios for adding the first entry to + * an fd and potentially putting it on the global list. + * + * (1) While we're asleep. No lock contention, it just gets + * added and will be processed on the next iteration. + * + * (2) After we've made a local copy, but before we've started + * processing that fd. The new entry will be added to the + * fd (under its lock), and we'll process it on the current + * iteration. + * + * (3) While we're processing the fd. They'll block on the fd + * lock, then see that the list is empty and put it on the + * global list. We'll process it here on the next + * iteration. + * + * (4) While we're working, but after we've processed that fd. + * Same as (1) as far as that fd is concerned. + */ + INIT_LIST_HEAD(&dirty_fds); + LOCK(&priv->dirty_lock); + list_splice_init(&priv->dirty_fds, &dirty_fds); + UNLOCK(&priv->dirty_lock); + + list_for_each_entry_safe (fd_ctx, fd_tmp, &dirty_fds, fd_list) { + ret = syncop_fsync(FIRST_CHILD(this), fd_ctx->fd, 0, + NULL, NULL); + if (ret) { + gf_msg (this->name, GF_LOG_WARNING, 0, + J_MSG_SYS_CALL_FAILURE, + "failed to fsync %p (%d)", + fd_ctx->fd, -ret); + } + + LOCK(&fd_ctx->fd->lock); + jbr_flush_fd(this, fd_ctx); + list_del_init(&fd_ctx->fd_list); + UNLOCK(&fd_ctx->fd->lock); + fd_unref(fd_ctx->fd); + } + + sleep(JBR_FLUSH_INTERVAL); + } + + return NULL; +} + + +int32_t +jbr_get_changelog_dir (xlator_t *this, char **cl_dir_p) +{ + xlator_t *cl_xl; + + /* Find our changelog translator. */ + cl_xl = this; + while (cl_xl) { + if (strcmp(cl_xl->type, "features/changelog") == 0) { + break; + } + cl_xl = cl_xl->children->xlator; + } + if (!cl_xl) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_INIT_FAIL, + "failed to find changelog translator"); + return ENOENT; + } + + /* Find the actual changelog directory. */ + if (dict_get_str(cl_xl->options, "changelog-dir", cl_dir_p) != 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_INIT_FAIL, + "failed to find changelog-dir for %s", cl_xl->name); + return ENODATA; + } + + return 0; +} + + +void +jbr_get_terms (call_frame_t *frame, xlator_t *this) +{ + int32_t op_errno; + char *cl_dir; + DIR *fp = NULL; + struct dirent *rd_entry; + struct dirent *rd_result; + int32_t term_first = -1; + int32_t term_contig = -1; + int32_t term_last = -1; + int term_num; + char *probe_str; + dict_t *my_xdata = NULL; + + op_errno = jbr_get_changelog_dir(this, &cl_dir); + if (op_errno) { + goto err; /* Error was already logged. */ + } + op_errno = ENODATA; /* Most common error after this. */ + + rd_entry = alloca (offsetof(struct dirent, d_name) + + pathconf(cl_dir, _PC_NAME_MAX) + 1); + if (!rd_entry) { + goto err; + } + + fp = sys_opendir (cl_dir); + if (!fp) { + op_errno = errno; + goto err; + } + + /* Find first and last terms. */ + for (;;) { + if (readdir_r(fp, rd_entry, &rd_result) != 0) { + op_errno = errno; + goto err; + } + if (!rd_result) { + break; + } + if (fnmatch("TERM.*", rd_entry->d_name, FNM_PATHNAME) != 0) { + continue; + } + /* +5 points to the character after the period */ + term_num = atoi(rd_entry->d_name+5); + gf_msg (this->name, GF_LOG_INFO, 0, + J_MSG_GENERIC, + "%s => %d", rd_entry->d_name, term_num); + if (term_num < 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_INVALID, + "invalid term file name %s", rd_entry->d_name); + op_errno = EINVAL; + goto err; + } + if ((term_first < 0) || (term_first > term_num)) { + term_first = term_num; + } + if ((term_last < 0) || (term_last < term_num)) { + term_last = term_num; + } + } + if ((term_first < 0) || (term_last < 0)) { + /* TBD: are we *sure* there should always be at least one? */ + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_NO_DATA, "no terms found"); + op_errno = EINVAL; + goto err; + } + + sys_closedir (fp); + fp = NULL; + + /* + * Find term_contig, which is the earliest term for which there are + * no gaps between it and term_last. + */ + for (term_contig = term_last; term_contig > 0; --term_contig) { + if (gf_asprintf(&probe_str, "%s/TERM.%d", + cl_dir, term_contig-1) <= 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_MEM_ERR, + "failed to format term %d", term_contig-1); + goto err; + } + if (sys_access(probe_str, F_OK) != 0) { + GF_FREE(probe_str); + break; + } + GF_FREE(probe_str); + } + + gf_msg (this->name, GF_LOG_INFO, 0, + J_MSG_GENERIC, + "found terms %d-%d (%d)", + term_first, term_last, term_contig); + + /* Return what we've found */ + my_xdata = dict_new(); + if (!my_xdata) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_MEM_ERR, + "failed to allocate reply dictionary"); + goto err; + } + if (dict_set_int32(my_xdata, "term-first", term_first) != 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_DICT_FLR, + "failed to set term-first"); + goto err; + } + if (dict_set_int32(my_xdata, "term-contig", term_contig) != 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_DICT_FLR, + "failed to set term-contig"); + goto err; + } + if (dict_set_int32(my_xdata, "term-last", term_last) != 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_DICT_FLR, + "failed to set term-last"); + goto err; + } + + /* Finally! */ + STACK_UNWIND_STRICT (ipc, frame, 0, 0, my_xdata); + dict_unref(my_xdata); + return; + +err: + if (fp) { + sys_closedir (fp); + } + if (my_xdata) { + dict_unref(my_xdata); + } + STACK_UNWIND_STRICT (ipc, frame, -1, op_errno, NULL); +} + + +long +get_entry_count (xlator_t *this, int fd) +{ + struct stat buf; + long min; /* last entry not known to be empty */ + long max; /* first entry known to be empty */ + long curr; + char entry[CHANGELOG_ENTRY_SIZE]; + + if (sys_fstat (fd, &buf) < 0) { + return -1; + } + + min = 0; + max = buf.st_size / CHANGELOG_ENTRY_SIZE; + + while ((min+1) < max) { + curr = (min + max) / 2; + if (sys_lseek(fd, curr*CHANGELOG_ENTRY_SIZE, SEEK_SET) < 0) { + return -1; + } + if (sys_read(fd, entry, sizeof(entry)) != sizeof(entry)) { + return -1; + } + if ((entry[0] == '_') && (entry[1] == 'P')) { + min = curr; + } else { + max = curr; + } + } + + if (sys_lseek(fd, 0, SEEK_SET) < 0) { + gf_msg (this->name, GF_LOG_WARNING, 0, + J_MSG_SYS_CALL_FAILURE, + "failed to reset offset"); + } + return max; +} + + +void +jbr_open_term (call_frame_t *frame, xlator_t *this, dict_t *xdata) +{ + int32_t op_errno; + char *cl_dir; + char *term; + char *path; + jbr_private_t *priv = this->private; + + op_errno = jbr_get_changelog_dir(this, &cl_dir); + if (op_errno) { + goto err; + } + + if (dict_get_str(xdata, "term", &term) != 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_NO_DATA, "missing term"); + op_errno = ENODATA; + goto err; + } + + if (gf_asprintf(&path, "%s/TERM.%s", cl_dir, term) < 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_MEM_ERR, "failed to construct path"); + op_errno = ENOMEM; + goto err; + } + + if (priv->term_fd >= 0) { + sys_close (priv->term_fd); + } + priv->term_fd = open(path, O_RDONLY); + if (priv->term_fd < 0) { + op_errno = errno; + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_SYS_CALL_FAILURE, + "failed to open term file"); + goto err; + } + + priv->term_total = get_entry_count(this, priv->term_fd); + if (priv->term_total < 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_NO_DATA, "failed to get entry count"); + sys_close (priv->term_fd); + priv->term_fd = -1; + op_errno = EIO; + goto err; + } + priv->term_read = 0; + + /* Success! */ + STACK_UNWIND_STRICT (ipc, frame, 0, 0, NULL); + return; + +err: + STACK_UNWIND_STRICT (ipc, frame, -1, op_errno, NULL); +} + + +void +jbr_next_entry (call_frame_t *frame, xlator_t *this) +{ + int32_t op_errno = ENOMEM; + jbr_private_t *priv = this->private; + ssize_t nbytes; + dict_t *my_xdata; + + if (priv->term_fd < 0) { + op_errno = EBADFD; + goto err; + } + + if (priv->term_read >= priv->term_total) { + op_errno = ENODATA; + goto err; + } + + nbytes = sys_read (priv->term_fd, priv->term_buf, CHANGELOG_ENTRY_SIZE); + if (nbytes < CHANGELOG_ENTRY_SIZE) { + if (nbytes < 0) { + op_errno = errno; + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_SYS_CALL_FAILURE, + "error reading next entry: %s", + strerror(errno)); + } else { + op_errno = EIO; + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_SYS_CALL_FAILURE, + "got %ld/%d bytes for next entry", + nbytes, CHANGELOG_ENTRY_SIZE); + } + goto err; + } + ++(priv->term_read); + + my_xdata = dict_new(); + if (!my_xdata) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_MEM_ERR, "failed to allocate reply xdata"); + goto err; + } + + if (dict_set_static_bin(my_xdata, "data", + priv->term_buf, CHANGELOG_ENTRY_SIZE) != 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_DICT_FLR, "failed to assign reply xdata"); + goto err; + } + + STACK_UNWIND_STRICT (ipc, frame, 0, 0, my_xdata); + dict_unref(my_xdata); + return; + +err: + STACK_UNWIND_STRICT (ipc, frame, -1, op_errno, NULL); +} + + +int32_t +jbr_ipc (call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) +{ + switch (op) { + case JBR_SERVER_TERM_RANGE: + jbr_get_terms(frame, this); + break; + case JBR_SERVER_OPEN_TERM: + jbr_open_term(frame, this, xdata); + break; + case JBR_SERVER_NEXT_ENTRY: + jbr_next_entry(frame, this); + break; + default: + STACK_WIND_TAIL (frame, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ipc, + op, xdata); + } + + return 0; +} + + +int32_t +jbr_forget (xlator_t *this, inode_t *inode) +{ + uint64_t ctx = 0LL; + + if ((inode_ctx_del(inode, this, &ctx) == 0) && ctx) { + GF_FREE((void *)(long)ctx); + } + + return 0; +} + +int32_t +jbr_release (xlator_t *this, fd_t *fd) +{ + uint64_t ctx = 0LL; + + if ((fd_ctx_del(fd, this, &ctx) == 0) && ctx) { + GF_FREE((void *)(long)ctx); + } + + return 0; +} + +struct xlator_cbks cbks = { + .forget = jbr_forget, + .release = jbr_release, +}; + +int +jbr_reconfigure (xlator_t *this, dict_t *options) +{ + jbr_private_t *priv = this->private; + + GF_OPTION_RECONF ("leader", + priv->config_leader, options, bool, err); + GF_OPTION_RECONF ("quorum-percent", + priv->quorum_pct, options, percent, err); + gf_msg (this->name, GF_LOG_INFO, 0, J_MSG_GENERIC, + "reconfigure called, config_leader = %d, quorum_pct = %.1f\n", + priv->leader, priv->quorum_pct); + + priv->leader = priv->config_leader; + + return 0; + +err: + return -1; +} + +int +jbr_get_child_index (xlator_t *this, xlator_t *kid) +{ + xlator_list_t *trav; + int retval = -1; + + for (trav = this->children; trav; trav = trav->next) { + ++retval; + if (trav->xlator == kid) { + return retval; + } + } + + return -1; +} + +/* + * Child notify handling is unreasonably FUBAR. Sometimes we'll get a + * CHILD_DOWN for a protocol/client child before we ever got a CHILD_UP for it. + * Other times we won't. Because it's effectively random (probably racy), we + * can't just maintain a count. We actually have to keep track of the state + * for each child separately, to filter out the bogus CHILD_DOWN events, and + * then generate counts on demand. + */ +int +jbr_notify (xlator_t *this, int event, void *data, ...) +{ + jbr_private_t *priv = this->private; + int index = -1; + int ret = -1; + gf_boolean_t result = _gf_false; + gf_boolean_t relevant = _gf_false; + + switch (event) { + case GF_EVENT_CHILD_UP: + index = jbr_get_child_index(this, data); + if (index >= 0) { + /* Check if the child was previously down + * and it's not a false CHILD_UP + */ + if (!(priv->kid_state & (1 << index))) { + relevant = _gf_true; + } + + priv->kid_state |= (1 << index); + priv->up_children = jbr_count_up_kids(priv); + gf_msg (this->name, GF_LOG_INFO, 0, J_MSG_GENERIC, + "got CHILD_UP for %s, now %u kids", + ((xlator_t *)data)->name, + priv->up_children); + if (!priv->config_leader && (priv->up_children > 1)) { + priv->leader = _gf_false; + } + + /* If it's not relevant, or we have already * + * sent CHILD_UP just break */ + if (!relevant || priv->child_up) + break; + + /* If it's not a leader, just send the notify up */ + if (!priv->leader) { + ret = default_notify(this, event, data); + if (!ret) + priv->child_up = _gf_true; + break; + } + + result = fop_quorum_check (this, + (double)(priv->n_children - 1), + (double)(priv->up_children - 1)); + if (result == _gf_false) { + gf_msg (this->name, GF_LOG_INFO, 0, + J_MSG_GENERIC, "Not enough children " + "are up to meet quorum. Waiting to " + "send CHILD_UP from leader"); + } else { + gf_msg (this->name, GF_LOG_INFO, 0, + J_MSG_GENERIC, "Enough children are up " + "to meet quorum. Sending CHILD_UP " + "from leader"); + ret = default_notify(this, event, data); + if (!ret) + priv->child_up = _gf_true; + } + } + break; + case GF_EVENT_CHILD_DOWN: + index = jbr_get_child_index(this, data); + if (index >= 0) { + /* Check if the child was previously up + * and it's not a false CHILD_DOWN + */ + if (priv->kid_state & (1 << index)) { + relevant = _gf_true; + } + priv->kid_state &= ~(1 << index); + priv->up_children = jbr_count_up_kids(priv); + gf_msg (this->name, GF_LOG_INFO, 0, J_MSG_GENERIC, + "got CHILD_DOWN for %s, now %u kids", + ((xlator_t *)data)->name, + priv->up_children); + if (!priv->config_leader && (priv->up_children < 2) + && relevant) { + priv->leader = _gf_true; + } + + /* If it's not relevant, or we have already * + * sent CHILD_DOWN just break */ + if (!relevant || !priv->child_up) + break; + + /* If it's not a leader, just break coz we shouldn't * + * propagate the failure from the failure till it * + * itself goes down * + */ + if (!priv->leader) { + break; + } + + result = fop_quorum_check (this, + (double)(priv->n_children - 1), + (double)(priv->up_children - 1)); + if (result == _gf_false) { + gf_msg (this->name, GF_LOG_INFO, 0, + J_MSG_GENERIC, "Enough children are " + "to down to fail quorum. " + "Sending CHILD_DOWN from leader"); + ret = default_notify(this, event, data); + if (!ret) + priv->child_up = _gf_false; + } else { + gf_msg (this->name, GF_LOG_INFO, 0, + J_MSG_GENERIC, "Not enough children " + "are down to fail quorum. Waiting to " + "send CHILD_DOWN from leader"); + } + } + break; + default: + ret = default_notify(this, event, data); + } + + return ret; +} + + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO ("jbr", this, out); + + ret = xlator_mem_acct_init (this, gf_mt_jbr_end + 1); + + if (ret != 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, J_MSG_MEM_ERR, + "Memory accounting init" "failed"); + return ret; + } +out: + return ret; +} + + +void +jbr_deallocate_priv (jbr_private_t *priv) +{ + if (!priv) { + return; + } + + GF_FREE(priv); +} + + +int32_t +jbr_init (xlator_t *this) +{ + xlator_list_t *remote; + xlator_list_t *local; + jbr_private_t *priv = NULL; + xlator_list_t *trav; + pthread_t kid; + extern xlator_t global_xlator; + glusterfs_ctx_t *oldctx = global_xlator.ctx; + + /* + * Any fop that gets special treatment has to be patched in here, + * because the compiled-in table is produced by the code generator and + * only contains generated functions. Note that we have to go through + * this->fops because of some dynamic-linking strangeness; modifying + * the static table doesn't work. + */ + this->fops->getxattr = jbr_getxattr_special; + this->fops->fsync = jbr_fsync; + this->fops->ipc = jbr_ipc; + + local = this->children; + if (!local) { + gf_msg (this->name, GF_LOG_ERROR, 0, J_MSG_NO_DATA, + "no local subvolume"); + goto err; + } + + remote = local->next; + if (!remote) { + gf_msg (this->name, GF_LOG_ERROR, 0, J_MSG_NO_DATA, + "no remote subvolumes"); + goto err; + } + + this->local_pool = mem_pool_new (jbr_local_t, 128); + if (!this->local_pool) { + gf_msg (this->name, GF_LOG_ERROR, 0, J_MSG_MEM_ERR, + "failed to create jbr_local_t pool"); + goto err; + } + + priv = GF_CALLOC (1, sizeof(*priv), gf_mt_jbr_private_t); + if (!priv) { + gf_msg (this->name, GF_LOG_ERROR, 0, J_MSG_MEM_ERR, + "could not allocate priv"); + goto err; + } + + for (trav = this->children; trav; trav = trav->next) { + ++(priv->n_children); + } + + LOCK_INIT(&priv->dirty_lock); + LOCK_INIT(&priv->index_lock); + INIT_LIST_HEAD(&priv->dirty_fds); + priv->term_fd = -1; + + this->private = priv; + + GF_OPTION_INIT ("leader", priv->config_leader, bool, err); + GF_OPTION_INIT ("quorum-percent", priv->quorum_pct, percent, err); + + priv->leader = priv->config_leader; + priv->child_up = _gf_false; + + if (pthread_create(&kid, NULL, jbr_flush_thread, + this) != 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, J_MSG_SYS_CALL_FAILURE, + "could not start flush thread"); + /* TBD: treat this as a fatal error? */ + } + + /* + * Calling glfs_new changes old->ctx, even if THIS still points + * to global_xlator. That causes problems later in the main + * thread, when gf_log_dump_graph tries to use the FILE after + * we've mucked with it and gets a segfault in __fprintf_chk. + * We can avoid all that by undoing the damage before we + * continue. + */ + global_xlator.ctx = oldctx; + + return 0; + +err: + jbr_deallocate_priv(priv); + return -1; +} + + +void +jbr_fini (xlator_t *this) +{ + jbr_deallocate_priv(this->private); +} + +class_methods_t class_methods = { + .init = jbr_init, + .fini = jbr_fini, + .reconfigure = jbr_reconfigure, + .notify = jbr_notify, +}; + +struct volume_options options[] = { + { .key = {"leader"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .description = "Start in the leader role. This is only for " + "bootstrapping the code, and should go away when we " + "have real leader election." + }, + { .key = {"vol-name"}, + .type = GF_OPTION_TYPE_STR, + .description = "volume name" + }, + { .key = {"my-name"}, + .type = GF_OPTION_TYPE_STR, + .description = "brick name in form of host:/path" + }, + { .key = {"etcd-servers"}, + .type = GF_OPTION_TYPE_STR, + .description = "list of comma seperated etc servers" + }, + { .key = {"subvol-uuid"}, + .type = GF_OPTION_TYPE_STR, + .description = "UUID for this JBR (sub)volume" + }, + { .key = {"quorum-percent"}, + .type = GF_OPTION_TYPE_PERCENT, + .default_value = "50.0", + .description = "percentage of rep_count-1 that must be up" + }, + { .key = {NULL} }, +}; diff --git a/xlators/experimental/nsr-client/Makefile.am b/xlators/experimental/nsr-client/Makefile.am deleted file mode 100644 index a985f42a87..0000000000 --- a/xlators/experimental/nsr-client/Makefile.am +++ /dev/null @@ -1,3 +0,0 @@ -SUBDIRS = src - -CLEANFILES = diff --git a/xlators/experimental/nsr-client/src/Makefile.am b/xlators/experimental/nsr-client/src/Makefile.am deleted file mode 100644 index 0dbba285cc..0000000000 --- a/xlators/experimental/nsr-client/src/Makefile.am +++ /dev/null @@ -1,32 +0,0 @@ -xlator_LTLIBRARIES = nsrc.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/experimental - -nodist_nsrc_la_SOURCES = nsrc-cg.c -CLEANFILES = $(nodist_nsrc_la_SOURCES) - -nsrc_la_LDFLAGS = -module -avoid-version -nsrc_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -noinst_HEADERS = \ - $(top_srcdir)/xlators/lib/src/libxlator.h \ - $(top_srcdir)/glusterfsd/src/glusterfsd.h \ - nsrc.h nsr-messages.h - -AM_CPPFLAGS = $(GF_CPPFLAGS) \ - -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ - -I$(top_srcdir)/rpc/rpc-lib/src - -AM_CFLAGS = -Wall $(GF_CFLAGS) - -NSRC_PREFIX = $(top_srcdir)/xlators/experimental/nsr-client/src -NSRC_GEN_FOPS = $(NSRC_PREFIX)/gen-fops.py -NSRC_TEMPLATES = $(NSRC_PREFIX)/fop-template.c -NSRC_WRAPPER = $(NSRC_PREFIX)/nsrc.c -noinst_PYTHON = $(NSRC_GEN_FOPS) -EXTRA_DIST = $(NSRC_TEMPLATES) $(NSRC_WRAPPER) - -nsrc-cg.c: $(NSRC_GEN_FOPS) $(NSRC_TEMPLATES) $(NSRC_WRAPPER) - $(PYTHON) $(NSRC_GEN_FOPS) $(NSRC_TEMPLATES) $(NSRC_WRAPPER) > $@ - -uninstall-local: - rm -f $(DESTDIR)$(xlatordir)/nsr.so diff --git a/xlators/experimental/nsr-client/src/fop-template.c b/xlators/experimental/nsr-client/src/fop-template.c deleted file mode 100644 index 59708732aa..0000000000 --- a/xlators/experimental/nsr-client/src/fop-template.c +++ /dev/null @@ -1,113 +0,0 @@ -/* template-name fop */ -int32_t -nsrc_@NAME@ (call_frame_t *frame, xlator_t *this, - @LONG_ARGS@) -{ - nsrc_local_t *local = NULL; - xlator_t *target_xl = ACTIVE_CHILD(this); - - local = mem_get(this->local_pool); - if (!local) { - goto err; - } - - local->stub = fop_@NAME@_stub (frame, nsrc_@NAME@_continue, - @SHORT_ARGS@); - if (!local->stub) { - goto err; - } - local->curr_xl = target_xl; - local->scars = 0; - - frame->local = local; - STACK_WIND_COOKIE (frame, nsrc_@NAME@_cbk, target_xl, - target_xl, target_xl->fops->@NAME@, - @SHORT_ARGS@); - return 0; - -err: - if (local) { - mem_put(local); - } - STACK_UNWIND_STRICT (@NAME@, frame, -1, ENOMEM, - @ERROR_ARGS@); - return 0; -} - -/* template-name cbk */ -int32_t -nsrc_@NAME@_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - @LONG_ARGS@) -{ - nsrc_local_t *local = frame->local; - xlator_t *last_xl = cookie; - xlator_t *next_xl; - nsrc_private_t *priv = this->private; - struct timespec spec; - - if (op_ret != (-1)) { - if (local->scars) { - gf_msg (this->name, GF_LOG_INFO, 0, N_MSG_RETRY_MSG, - HILITE("retried %p OK"), frame->local); - } - priv->active = last_xl; - goto unwind; - } - if ((op_errno != EREMOTE) && (op_errno != ENOTCONN)) { - goto unwind; - } - - /* TBD: get leader ID from xdata? */ - next_xl = next_xlator(this, last_xl); - /* - * We can't just give up after we've tried all bricks, because it's - * quite likely that a new leader election just hasn't finished yet. - * We also shouldn't retry endlessly, and especially not at a high - * rate, but that's good enough while we work on other things. - * - * TBD: implement slow/finite retry via a worker thread - */ - if (!next_xl || (local->scars >= SCAR_LIMIT)) { - gf_msg (this->name, GF_LOG_DEBUG, 0, N_MSG_RETRY_MSG, - HILITE("ran out of retries for %p"), frame->local); - goto unwind; - } - - local->curr_xl = next_xl; - local->scars += 1; - spec.tv_sec = 1; - spec.tv_nsec = 0; - /* - * WARNING - * - * Just calling gf_timer_call_after like this leaves open the - * possibility that writes will get reordered, if a first write is - * rescheduled and then a second comes along to find an updated - * priv->active before the first actually executes. We might need to - * implement a stricter (and more complicated) queuing mechanism to - * ensure absolute consistency in this case. - */ - if (gf_timer_call_after(this->ctx, spec, nsrc_retry_cb, local)) { - return 0; - } - -unwind: - call_stub_destroy(local->stub); - STACK_UNWIND_STRICT (@NAME@, frame, op_ret, op_errno, - @SHORT_ARGS@); - return 0; -} - -/* template-name cont-func */ -int32_t -nsrc_@NAME@_continue (call_frame_t *frame, xlator_t *this, - @LONG_ARGS@) -{ - nsrc_local_t *local = frame->local; - - STACK_WIND_COOKIE (frame, nsrc_@NAME@_cbk, local->curr_xl, - local->curr_xl, local->curr_xl->fops->@NAME@, - @SHORT_ARGS@); - return 0; -} diff --git a/xlators/experimental/nsr-client/src/gen-fops.py b/xlators/experimental/nsr-client/src/gen-fops.py deleted file mode 100755 index 4d9451f717..0000000000 --- a/xlators/experimental/nsr-client/src/gen-fops.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/python - -import os -import re -import string -import sys - -curdir = os.path.dirname(sys.argv[0]) -gendir = os.path.join(curdir,'../../../../libglusterfs/src') -sys.path.append(gendir) -from generator import ops, fop_subs, cbk_subs, generate - -# We really want the callback argument list, even when we're generating fop -# code, so we propagate here. -# TBD: this should probably be right in generate.py -for k, v in cbk_subs.iteritems(): - fop_subs[k]['@ERROR_ARGS@'] = v['@ERROR_ARGS@'] - -# Stolen from old codegen.py -def load_templates (path): - templates = {} - tmpl_re = re.compile("/\* template-name (.*) \*/") - templates = {} - t_name = None - for line in open(path,"r").readlines(): - if not line: - break - m = tmpl_re.match(line) - if m: - if t_name: - templates[t_name] = string.join(t_contents,'') - t_name = m.group(1).strip() - t_contents = [] - elif t_name: - t_contents.append(line) - if t_name: - templates[t_name] = string.join(t_contents,'') - return templates - -# Stolen from gen_fdl.py -def gen_client (templates): - for name, value in ops.iteritems(): - if name == 'getspec': - # It's not real if it doesn't have a stub function. - continue - print generate(templates['cbk'],name,cbk_subs) - print generate(templates['cont-func'],name,fop_subs) - print generate(templates['fop'],name,fop_subs) - -tmpl = load_templates(sys.argv[1]) -for l in open(sys.argv[2],'r').readlines(): - if l.find('#pragma generate') != -1: - print "/* BEGIN GENERATED CODE - DO NOT MODIFY */" - gen_client(tmpl) - print "/* END GENERATED CODE */" - else: - print l[:-1] diff --git a/xlators/experimental/nsr-client/src/nsr-messages.h b/xlators/experimental/nsr-client/src/nsr-messages.h deleted file mode 100644 index aa28a639a1..0000000000 --- a/xlators/experimental/nsr-client/src/nsr-messages.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - Copyright (c) 2015 Red Hat, Inc. - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef _NSR_MESSAGES_H_ -#define _NSR_MESSAGES_H_ - -#include "glfs-message-id.h" - -/* NOTE: Rules for message additions - * 1) Each instance of a message is _better_ left with a unique message ID, even - * if the message format is the same. Reasoning is that, if the message - * format needs to change in one instance, the other instances are not - * impacted or the new change does not change the ID of the instance being - * modified. - * 2) Addition of a message, - * - Should increment the GLFS_NUM_MESSAGES - * - Append to the list of messages defined, towards the end - * - Retain macro naming as glfs_msg_X (for redability across developers) - * NOTE: Rules for message format modifications - * 3) Check acorss the code if the message ID macro in question is reused - * anywhere. If reused then then the modifications should ensure correctness - * everywhere, or needs a new message ID as (1) above was not adhered to. If - * not used anywhere, proceed with the required modification. - * NOTE: Rules for message deletion - * 4) Check (3) and if used anywhere else, then cannot be deleted. If not used - * anywhere, then can be deleted, but will leave a hole by design, as - * addition rules specify modification to the end of the list and not filling - * holes. - */ - -#define NSR_COMP_BASE GLFS_MSGID_COMP_NSR -#define GLFS_NUM_MESSAGES 1 -#define GLFS_MSGID_END (NSR_COMP_BASE + GLFS_NUM_MESSAGES + 1) - -/*! - * @messageid - * @diagnosis - * @recommendedaction - */ -#define N_MSG_INIT_FAIL (NSR_COMP_BASE + 1) - -/*! - * @messageid - * @diagnosis - * @recommendedaction - */ -#define N_MSG_RETRY_MSG (NSR_COMP_BASE + 2) - -/*! - * @messageid - * @diagnosis - * @recommendedaction - */ -#define N_MSG_MEM_ERR (NSR_COMP_BASE + 3) - -/*! - * @messageid - * @diagnosis - * @recommendedaction - */ -#define N_MSG_DICT_FLR (NSR_COMP_BASE + 4) - -/*! - * @messageid - * @diagnosis - * @recommendedaction - */ -#define N_MSG_GENERIC (NSR_COMP_BASE + 5) - -/*! - * @messageid - * @diagnosis - * @recommendedaction - */ -#define N_MSG_INVALID (NSR_COMP_BASE + 6) - -/*! - * @messageid - * @diagnosis - * @recommendedaction - */ -#define N_MSG_NO_DATA (NSR_COMP_BASE + 7) - -/*! - * @messageid - * @diagnosis - * @recommendedaction - */ -#define N_MSG_SYS_CALL_FAILURE (NSR_COMP_BASE + 8) - -/*! - * @messageid - * @diagnosis - * @recommendedaction - */ -#define N_MSG_QUORUM_NOT_MET (NSR_COMP_BASE + 9) - -#endif /* _NSR_MESSAGES_H_ */ diff --git a/xlators/experimental/nsr-client/src/nsrc.c b/xlators/experimental/nsr-client/src/nsrc.c deleted file mode 100644 index 13f1a2d38c..0000000000 --- a/xlators/experimental/nsr-client/src/nsrc.c +++ /dev/null @@ -1,320 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "call-stub.h" -#include "defaults.h" -#include "timer.h" -#include "xlator.h" -#include "nsr-messages.h" -#include "nsrc.h" -#include "statedump.h" - -#define SCAR_LIMIT 20 -#define HILITE(x) (""x"") - -/* - * The fops are actually generated by gen-fops.py; the rest was mostly copied - * from defaults.c (commit cd253754 on 27 August 2013). - */ - -enum gf_dht_mem_types_ { - gf_mt_nsrc_private_t = gf_common_mt_end + 1, - gf_mt_nsrc_end -}; - -char *NSRC_XATTR = "user.nsr.active"; - -static inline -xlator_t * -ACTIVE_CHILD (xlator_t *parent) -{ - nsrc_private_t *priv = parent->private; - - return priv ? priv->active : FIRST_CHILD(parent); -} - -xlator_t * -next_xlator (xlator_t *this, xlator_t *prev) -{ - xlator_list_t *trav; - - for (trav = this->children; trav; trav = trav->next) { - if (trav->xlator == prev) { - return trav->next ? trav->next->xlator - : this->children->xlator; - } - } - - return NULL; -} - -void -nsrc_retry_cb (void *cb_arg) -{ - nsrc_local_t *local = cb_arg; - - gf_msg (__func__, GF_LOG_INFO, 0, N_MSG_RETRY_MSG, - HILITE("retrying %p"), local); - call_resume_wind(local->stub); -} - -#pragma generate - -int32_t -nsrc_forget (xlator_t *this, inode_t *inode) -{ - gf_msg_callingfn (this->name, GF_LOG_WARNING, 0, N_MSG_INIT_FAIL, - "xlator does not implement forget_cbk"); - return 0; -} - - -int32_t -nsrc_releasedir (xlator_t *this, fd_t *fd) -{ - gf_msg_callingfn (this->name, GF_LOG_WARNING, 0, N_MSG_INIT_FAIL, - "xlator does not implement releasedir_cbk"); - return 0; -} - -int32_t -nsrc_release (xlator_t *this, fd_t *fd) -{ - gf_msg_callingfn (this->name, GF_LOG_WARNING, 0, N_MSG_INIT_FAIL, - "xlator does not implement release_cbk"); - return 0; -} - -struct xlator_fops fops = { - .lookup = nsrc_lookup, - .stat = nsrc_stat, - .fstat = nsrc_fstat, - .truncate = nsrc_truncate, - .ftruncate = nsrc_ftruncate, - .access = nsrc_access, - .readlink = nsrc_readlink, - .mknod = nsrc_mknod, - .mkdir = nsrc_mkdir, - .unlink = nsrc_unlink, - .rmdir = nsrc_rmdir, - .symlink = nsrc_symlink, - .rename = nsrc_rename, - .link = nsrc_link, - .create = nsrc_create, - .open = nsrc_open, - .readv = nsrc_readv, - .writev = nsrc_writev, - .flush = nsrc_flush, - .fsync = nsrc_fsync, - .opendir = nsrc_opendir, - .readdir = nsrc_readdir, - .readdirp = nsrc_readdirp, - .fsyncdir = nsrc_fsyncdir, - .statfs = nsrc_statfs, - .setxattr = nsrc_setxattr, - .getxattr = nsrc_getxattr, - .fsetxattr = nsrc_fsetxattr, - .fgetxattr = nsrc_fgetxattr, - .removexattr = nsrc_removexattr, - .fremovexattr = nsrc_fremovexattr, - .lk = nsrc_lk, - .inodelk = nsrc_inodelk, - .finodelk = nsrc_finodelk, - .entrylk = nsrc_entrylk, - .fentrylk = nsrc_fentrylk, - .rchecksum = nsrc_rchecksum, - .xattrop = nsrc_xattrop, - .fxattrop = nsrc_fxattrop, - .setattr = nsrc_setattr, - .fsetattr = nsrc_fsetattr, - .fallocate = nsrc_fallocate, - .discard = nsrc_discard, -}; - -struct xlator_cbks cbks = { -}; - - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - GF_VALIDATE_OR_GOTO ("nsrc", this, out); - - ret = xlator_mem_acct_init (this, gf_mt_nsrc_end + 1); - - if (ret != 0) { - gf_msg (this->name, GF_LOG_ERROR, ENOMEM, N_MSG_MEM_ERR, - "Memory accounting init failed"); - return ret; - } -out: - return ret; -} - - -int32_t -nsrc_init (xlator_t *this) -{ - nsrc_private_t *priv = NULL; - xlator_list_t *trav = NULL; - - this->local_pool = mem_pool_new (nsrc_local_t, 128); - if (!this->local_pool) { - gf_msg (this->name, GF_LOG_ERROR, ENOMEM, N_MSG_MEM_ERR, - "failed to create nsrc_local_t pool"); - goto err; - } - - priv = GF_CALLOC (1, sizeof (*priv), gf_mt_nsrc_private_t); - if (!priv) { - goto err; - } - - for (trav = this->children; trav; trav = trav->next) { - ++(priv->n_children); - } - - priv->active = FIRST_CHILD(this); - this->private = priv; - return 0; - -err: - if (priv) { - GF_FREE(priv); - } - return -1; -} - -void -nsrc_fini (xlator_t *this) -{ - GF_FREE(this->private); -} - -int -nsrc_get_child_index (xlator_t *this, xlator_t *kid) -{ - xlator_list_t *trav; - int retval = -1; - - for (trav = this->children; trav; trav = trav->next) { - ++retval; - if (trav->xlator == kid) { - return retval; - } - } - - return -1; -} - -uint8_t -nsrc_count_up_kids (nsrc_private_t *priv) -{ - uint8_t retval = 0; - uint8_t i; - - for (i = 0; i < priv->n_children; ++i) { - if (priv->kid_state & (1 << i)) { - ++retval; - } - } - - return retval; -} - -int32_t -nsrc_notify (xlator_t *this, int32_t event, void *data, ...) -{ - int32_t ret = 0; - int32_t index = 0; - nsrc_private_t *priv = NULL; - - GF_VALIDATE_OR_GOTO (THIS->name, this, out); - priv = this->private; - GF_VALIDATE_OR_GOTO (this->name, priv, out); - - switch (event) { - case GF_EVENT_CHILD_UP: - index = nsrc_get_child_index(this, data); - if (index >= 0) { - priv->kid_state |= (1 << index); - priv->up_children = nsrc_count_up_kids(priv); - gf_msg (this->name, GF_LOG_INFO, 0, N_MSG_GENERIC, - "got CHILD_UP for %s, now %u kids", - ((xlator_t *)data)->name, - priv->up_children); - } - ret = default_notify (this, event, data); - break; - case GF_EVENT_CHILD_DOWN: - index = nsrc_get_child_index(this, data); - if (index >= 0) { - priv->kid_state &= ~(1 << index); - priv->up_children = nsrc_count_up_kids(priv); - gf_msg (this->name, GF_LOG_INFO, 0, N_MSG_GENERIC, - "got CHILD_DOWN for %s, now %u kids", - ((xlator_t *)data)->name, - priv->up_children); - } - break; - default: - ret = default_notify (this, event, data); - } - -out: - return ret; -} - -int -nsrc_priv_dump (xlator_t *this) -{ - nsrc_private_t *priv = NULL; - char key_prefix[GF_DUMP_MAX_BUF_LEN]; - xlator_list_t *trav = NULL; - int32_t i = -1; - - GF_VALIDATE_OR_GOTO (THIS->name, this, out); - priv = this->private; - GF_VALIDATE_OR_GOTO (this->name, priv, out); - - snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", - this->type, this->name); - gf_proc_dump_add_section(key_prefix); - - gf_proc_dump_write("up_children", "%u", priv->up_children); - - for (trav = this->children, i = 0; trav; trav = trav->next, i++) { - snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "child_%d", i); - gf_proc_dump_write(key_prefix, "%s", trav->xlator->name); - } - -out: - return 0; -} - -struct xlator_dumpops dumpops = { - .priv = nsrc_priv_dump, -}; - -class_methods_t class_methods = { - .init = nsrc_init, - .fini = nsrc_fini, - .notify = nsrc_notify, -}; - -struct volume_options options[] = { - { .key = {NULL} }, -}; diff --git a/xlators/experimental/nsr-client/src/nsrc.h b/xlators/experimental/nsr-client/src/nsrc.h deleted file mode 100644 index 15f0d7c85a..0000000000 --- a/xlators/experimental/nsr-client/src/nsrc.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - Copyright (c) 2016 Red Hat, Inc. - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef _NSRC_H_ -#define _NSRC_H_ - -typedef struct { - xlator_t *active; - uint8_t up_children; - uint8_t n_children; - uint32_t kid_state; -} nsrc_private_t; - -typedef struct { - call_stub_t *stub; - xlator_t *curr_xl; - uint16_t scars; -} nsrc_local_t; - -#endif /* _NSRC_H_ */ diff --git a/xlators/experimental/nsr-server/Makefile.am b/xlators/experimental/nsr-server/Makefile.am deleted file mode 100644 index a985f42a87..0000000000 --- a/xlators/experimental/nsr-server/Makefile.am +++ /dev/null @@ -1,3 +0,0 @@ -SUBDIRS = src - -CLEANFILES = diff --git a/xlators/experimental/nsr-server/src/Makefile.am b/xlators/experimental/nsr-server/src/Makefile.am deleted file mode 100644 index 6c0597610a..0000000000 --- a/xlators/experimental/nsr-server/src/Makefile.am +++ /dev/null @@ -1,35 +0,0 @@ -xlator_LTLIBRARIES = nsr.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/experimental - -nodist_nsr_la_SOURCES = nsr-cg.c -CLEANFILES = $(nodist_nsr_la_SOURCES) - -nsr_la_LDFLAGS = -module -avoid-version -nsr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ - $(top_builddir)/api/src/libgfapi.la - -noinst_HEADERS = nsr-internal.h \ - $(top_srcdir)/xlators/lib/src/libxlator.h \ - $(top_srcdir)/glusterfsd/src/glusterfsd.h - -AM_CPPFLAGS = $(GF_CPPFLAGS) \ - -I$(top_srcdir)/libglusterfs/src \ - -I$(top_srcdir)/xlators/lib/src \ - -I$(top_srcdir)/rpc/rpc-lib/src -DSBIN_DIR=\"$(sbindir)\" \ - -I$(top_srcdir)/api/src -DNSR_SCRIPT_PREFIX=\"$(nsrdir)\" \ - -I$(top_srcdir)/xlators/experimental/nsr-client/src/ - -AM_CFLAGS = -Wall $(GF_CFLAGS) - -NSR_PREFIX = $(top_srcdir)/xlators/experimental/nsr-server/src -NSR_GEN_FOPS = $(NSR_PREFIX)/gen-fops.py -NSR_TEMPLATES = $(NSR_PREFIX)/all-templates.c -NSR_WRAPPER = $(NSR_PREFIX)/nsr.c -noinst_PYTHON = $(NSR_GEN_FOPS) -EXTRA_DIST = $(NSR_TEMPLATES) $(NSR_WRAPPER) - -nsr-cg.c: $(NSR_GEN_FOPS) $(NSR_TEMPLATES) $(NSR_WRAPPER) - $(PYTHON) $(NSR_GEN_FOPS) $(NSR_TEMPLATES) $(NSR_WRAPPER) > $@ - -uninstall-local: - rm -f $(DESTDIR)$(xlatordir)/nsr.so diff --git a/xlators/experimental/nsr-server/src/all-templates.c b/xlators/experimental/nsr-server/src/all-templates.c deleted file mode 100644 index c3819d2af5..0000000000 --- a/xlators/experimental/nsr-server/src/all-templates.c +++ /dev/null @@ -1,437 +0,0 @@ -/* - * You can put anything here - it doesn't even have to be a comment - and it - * will be ignored until we reach the first template-name comment. - */ - - -/* template-name read-fop */ -int32_t -nsr_@NAME@ (call_frame_t *frame, xlator_t *this, - @LONG_ARGS@) -{ - nsr_private_t *priv = this->private; - gf_boolean_t in_recon = _gf_false; - int32_t recon_term, recon_index; - - /* allow reads during reconciliation * - * TBD: allow "dirty" reads on non-leaders * - */ - if (xdata && - (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) && - (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) { - in_recon = _gf_true; - } - - if ((!priv->leader) && (in_recon == _gf_false)) { - goto err; - } - - STACK_WIND (frame, default_@NAME@_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@, - @SHORT_ARGS@); - return 0; - -err: - STACK_UNWIND_STRICT (@NAME@, frame, -1, EREMOTE, - @ERROR_ARGS@); - return 0; -} - -/* template-name read-dispatch */ -/* No "dispatch" function needed for @NAME@ */ - -/* template-name read-fan-in */ -/* No "fan-in" function needed for @NAME@ */ - -/* template-name read-continue */ -/* No "continue" function needed for @NAME@ */ - -/* template-name read-complete */ -/* No "complete" function needed for @NAME@ */ - -/* template-name write-fop */ -int32_t -nsr_@NAME@ (call_frame_t *frame, xlator_t *this, - @LONG_ARGS@) -{ - nsr_local_t *local = NULL; - nsr_private_t *priv = this->private; - gf_boolean_t result = _gf_false; - int op_errno = ENOMEM; - int from_leader; - int from_recon; - uint32_t ti = 0; - - /* - * Our first goal here is to avoid "split brain surprise" for users who - * specify exactly 50% with two- or three-way replication. That means - * either a more-than check against half the total replicas or an - * at-least check against half of our peers (one less). Of the two, - * only an at-least check supports the intuitive use of 100% to mean - * all replicas must be present, because "more than 100%" will never - * succeed regardless of which count we use. This leaves us with a - * slightly non-traditional definition of quorum ("at least X% of peers - * not including ourselves") but one that's useful enough to be worth - * it. - * - * Note that n_children and up_children *do* include the local - * subvolume, so we need to subtract one in each case. - */ - if (priv->leader) { - result = fop_quorum_check (this, (double)(priv->n_children - 1), - (double)(priv->up_children - 1)); - - if (result == _gf_false) { - /* Emulate the AFR client-side-quorum behavior. */ - gf_msg (this->name, GF_LOG_ERROR, EROFS, - N_MSG_QUORUM_NOT_MET, "Sufficient number of " - "subvolumes are not up to meet quorum."); - op_errno = EROFS; - goto err; - } - } else { - if (xdata) { - from_leader = !!dict_get(xdata, NSR_TERM_XATTR); - from_recon = !!dict_get(xdata, RECON_TERM_XATTR) - && !!dict_get(xdata, RECON_INDEX_XATTR); - } else { - from_leader = from_recon = _gf_false; - } - - /* follower/recon path * - * just send it to local node * - */ - if (!from_leader && !from_recon) { - op_errno = EREMOTE; - goto err; - } - } - - local = mem_get0(this->local_pool); - if (!local) { - goto err; - } -#if defined(NSR_CG_NEED_FD) - local->fd = fd_ref(fd); -#else - local->fd = NULL; -#endif - INIT_LIST_HEAD(&local->qlinks); - frame->local = local; - - /* - * If we let it through despite not being the leader, then we just want - * to pass it on down without all of the additional xattrs, queuing, and - * so on. However, nsr_*_complete does depend on the initialization - * immediately above this. - */ - if (!priv->leader) { - STACK_WIND (frame, nsr_@NAME@_complete, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@, - @SHORT_ARGS@); - return 0; - } - - if (!xdata) { - xdata = dict_new(); - if (!xdata) { - gf_msg (this->name, GF_LOG_ERROR, ENOMEM, - N_MSG_MEM_ERR, "failed to allocate xdata"); - goto err; - } - } - - if (dict_set_int32(xdata, NSR_TERM_XATTR, priv->current_term) != 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_DICT_FLR, "failed to set nsr-term"); - goto err; - } - - LOCK(&priv->index_lock); - ti = ++(priv->index); - UNLOCK(&priv->index_lock); - if (dict_set_int32(xdata, NSR_INDEX_XATTR, ti) != 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_DICT_FLR, "failed to set index"); - goto err; - } - - local->stub = fop_@NAME@_stub (frame, nsr_@NAME@_continue, - @SHORT_ARGS@); - if (!local->stub) { - goto err; - } - - -#if defined(NSR_CG_QUEUE) - nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this, fd->inode); - - if (!ictx) { - op_errno = EIO; - goto err; - } - LOCK(&ictx->lock); - if (ictx->active) { - gf_msg_debug (this->name, 0, - "queuing request due to conflict"); - /* - * TBD: enqueue only for real conflict - * - * Currently we just act like all writes are in - * conflict with one another. What we should really do - * is check the active/pending queues and defer only if - * there's a conflict there. - * - * It's important to check the pending queue because we - * might have an active request X which conflicts with - * a pending request Y, and this request Z might - * conflict with Y but not X. If we checked only the - * active queue then Z could jump ahead of Y, which - * would be incorrect. - */ - local->qstub = fop_@NAME@_stub (frame, - nsr_@NAME@_dispatch, - @SHORT_ARGS@); - if (!local->qstub) { - UNLOCK(&ictx->lock); - goto err; - } - list_add_tail(&local->qlinks, &ictx->pqueue); - ++(ictx->pending); - UNLOCK(&ictx->lock); - return 0; - } else { - list_add_tail(&local->qlinks, &ictx->aqueue); - ++(ictx->active); - } - UNLOCK(&ictx->lock); -#endif - - return nsr_@NAME@_dispatch (frame, this, @SHORT_ARGS@); - -err: - if (local) { - if (local->stub) { - call_stub_destroy(local->stub); - } - if (local->qstub) { - call_stub_destroy(local->qstub); - } - if (local->fd) { - fd_unref(local->fd); - } - mem_put(local); - } - STACK_UNWIND_STRICT (@NAME@, frame, -1, op_errno, - @ERROR_ARGS@); - return 0; -} - -/* template-name write-dispatch */ -int32_t -nsr_@NAME@_dispatch (call_frame_t *frame, xlator_t *this, - @LONG_ARGS@) -{ - nsr_local_t *local = frame->local; - nsr_private_t *priv = this->private; - xlator_list_t *trav; - - /* - * TBD: unblock pending request(s) if we fail after this point but - * before we get to nsr_@NAME@_complete (where that code currently - * resides). - */ - - local->call_count = priv->n_children - 1; - local->successful_acks = 0; - for (trav = this->children->next; trav; trav = trav->next) { - STACK_WIND (frame, nsr_@NAME@_fan_in, - trav->xlator, trav->xlator->fops->@NAME@, - @SHORT_ARGS@); - } - - /* TBD: variable Issue count */ - return 0; -} - -/* template-name write-fan-in */ -int32_t -nsr_@NAME@_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - @LONG_ARGS@) -{ - nsr_local_t *local = frame->local; - uint8_t call_count; - - gf_msg_trace (this->name, 0, "op_ret = %d, op_errno = %d\n", - op_ret, op_errno); - - LOCK(&frame->lock); - call_count = --(local->call_count); - if (op_ret != -1) { - /* Increment the number of successful acks * - * received for the operation. * - */ - (local->successful_acks)++; - local->successful_op_ret = op_ret; - } - gf_msg_debug (this->name, 0, "succ_acks = %d, op_ret = %d, op_errno = %d\n", - op_ret, op_errno, local->successful_acks); - UNLOCK(&frame->lock); - - /* TBD: variable Completion count */ - if (call_count == 0) { - call_resume(local->stub); - } - - return 0; -} - -/* template-name write-continue */ -int32_t -nsr_@NAME@_continue (call_frame_t *frame, xlator_t *this, - @LONG_ARGS@) -{ - int32_t ret = -1; - gf_boolean_t result = _gf_false; - nsr_local_t *local = NULL; - nsr_private_t *priv = NULL; - - GF_VALIDATE_OR_GOTO ("nsr", this, out); - GF_VALIDATE_OR_GOTO (this->name, frame, out); - priv = this->private; - local = frame->local; - GF_VALIDATE_OR_GOTO (this->name, priv, out); - GF_VALIDATE_OR_GOTO (this->name, local, out); - - /* Perform quorum check to see if the leader needs * - * to perform the operation. If the operation will not * - * meet quorum irrespective of the leader's result * - * there is no point in the leader performing the fop * - */ - result = fop_quorum_check (this, (double)priv->n_children, - (double)local->successful_acks + 1); - if (result == _gf_false) { - gf_msg (this->name, GF_LOG_ERROR, EROFS, - N_MSG_QUORUM_NOT_MET, "Didn't receive enough acks " - "to meet quorum. Failing the operation without trying " - "it on the leader."); - STACK_UNWIND_STRICT (@NAME@, frame, -1, EROFS, - @ERROR_ARGS@); - } else { - STACK_WIND (frame, nsr_@NAME@_complete, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@, - @SHORT_ARGS@); - } - - ret = 0; -out: - return ret; -} - -/* template-name write-complete */ -int32_t -nsr_@NAME@_complete (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - @LONG_ARGS@) -{ - gf_boolean_t result = _gf_false; - nsr_private_t *priv = this->private; - - nsr_local_t *local = frame->local; - - /* If the fop failed on the leader, then reduce one succesful ack - * before calculating the fop quorum - */ - LOCK(&frame->lock); - if (op_ret == -1) - (local->successful_acks)--; - UNLOCK(&frame->lock); - -#if defined(NSR_CG_QUEUE) - nsr_inode_ctx_t *ictx; - nsr_local_t *next; - - if (local->qlinks.next != &local->qlinks) { - list_del(&local->qlinks); - ictx = nsr_get_inode_ctx(this, local->fd->inode); - if (ictx) { - LOCK(&ictx->lock); - if (ictx->pending) { - /* - * TBD: dequeue *all* non-conflicting - * reqs - * - * With the stub implementation there - * can only be one request active at a - * time (zero here) so it's not an - * issue. In a real implementation - * there might still be other active - * requests to check against, and - * multiple pending requests that could - * continue. - */ - gf_msg_debug (this->name, 0, - "unblocking next request"); - --(ictx->pending); - next = list_entry (ictx->pqueue.next, - nsr_local_t, qlinks); - list_del(&next->qlinks); - list_add_tail(&next->qlinks, - &ictx->aqueue); - call_resume(next->qstub); - } else { - --(ictx->active); - } - UNLOCK(&ictx->lock); - } - } -#endif - -#if defined(NSR_CG_FSYNC) - nsr_mark_fd_dirty(this, local); -#endif - -#if defined(NSR_CG_NEED_FD) - fd_unref(local->fd); -#endif - - /* After the leader completes the fop, a quorum check is * - * performed, taking into account the outcome of the fop * - * on the leader. Irrespective of the fop being successful * - * or failing on the leader, the result of the quorum will * - * determine if the overall fop is successful or not. For * - * example, a fop might have succeeded on every node except * - * the leader, in which case as quorum is being met, the fop * - * will be treated as a successful fop, even though it failed * - * on the leader. On follower nodes, no quorum check should * - * be done, and the result is returned to the leader as is. * - */ - if (priv->leader) { - result = fop_quorum_check (this, (double)priv->n_children, - (double)local->successful_acks + 1); - if (result == _gf_false) { - op_ret = -1; - op_errno = EROFS; - gf_msg (this->name, GF_LOG_ERROR, EROFS, - N_MSG_QUORUM_NOT_MET, "Quorum is not met. " - "The operation has failed."); - } else { -#if defined(NSR_CG_NEED_FD) - op_ret = local->successful_op_ret; -#else - op_ret = 0; -#endif - op_errno = 0; - gf_msg_debug (this->name, 0, - "Quorum has met. The operation has succeeded."); - } - } - - STACK_UNWIND_STRICT (@NAME@, frame, op_ret, op_errno, - @SHORT_ARGS@); - - - return 0; - -} diff --git a/xlators/experimental/nsr-server/src/gen-fops.py b/xlators/experimental/nsr-server/src/gen-fops.py deleted file mode 100755 index 336b218a8f..0000000000 --- a/xlators/experimental/nsr-server/src/gen-fops.py +++ /dev/null @@ -1,138 +0,0 @@ -#!/usr/bin/python - -# This script generates the boilerplate versions of most fops and cbks in the -# server. This allows the details of leadership-status checking, sequencing -# between leader and followers (including fan-out), and basic error checking -# to be centralized one place, with per-operation code kept to a minimum. - -import os -import re -import string -import sys - -curdir = os.path.dirname(sys.argv[0]) -gendir = os.path.join(curdir,'../../../../libglusterfs/src') -sys.path.append(gendir) -from generator import ops, fop_subs, cbk_subs, generate - -# We really want the callback argument list, even when we're generating fop -# code, so we propagate here. -# TBD: this should probably be right in generate.py -for k, v in cbk_subs.iteritems(): - fop_subs[k]['@ERROR_ARGS@'] = v['@ERROR_ARGS@'] - -# Stolen from old codegen.py -def load_templates (path): - templates = {} - tmpl_re = re.compile("/\* template-name (.*) \*/") - templates = {} - t_name = None - for line in open(path,"r").readlines(): - if not line: - break - m = tmpl_re.match(line) - if m: - if t_name: - templates[t_name] = string.join(t_contents,'') - t_name = m.group(1).strip() - t_contents = [] - elif t_name: - t_contents.append(line) - if t_name: - templates[t_name] = string.join(t_contents,'') - return templates - -# We need two types of templates. The first, for pure read operations, just -# needs to do a simple am-i-leader check (augmented to allow dirty reads). -# The second, for pure writes, needs to do fan-out to followers between those -# initial checks and local execution. There are other operations that don't -# fit neatly into either category - e.g. lock ops or fsync - so we'll just have -# to handle those manually. The table thus includes entries only for those we -# can categorize. The special cases, plus any new operations we've never even -# heard of, aren't in there. -# -# Various keywords can be used to define/undefine preprocessor symbols used -# in the templates, on a per-function basis. For example, if the keyword here -# is "fsync" (lowercase word or abbreviation) that will cause NSR_CG_FSYNC -# (prefix plus uppercase version) to be defined above all of the generated code -# for that fop. - -fop_table = { - "access": "read", - "create": "write", - "discard": "write", -# "entrylk": "read", - "fallocate": "write", -# "fentrylk": "read", - "fgetxattr": "read", -# "finodelk": "read", -# "flush": "read", - "fremovexattr": "write", - "fsetattr": "write", - "fsetxattr": "write", - "fstat": "read", -# "fsync": "read", -# "fsyncdir": "read", - "ftruncate": "write", - "fxattrop": "write", - "getxattr": "read", -# "inodelk": "read", - "link": "write", -# "lk": "read", -# "lookup": "read", - "mkdir": "write", - "mknod": "write", - "open": "write", - "opendir": "read", - "rchecksum": "read", - "readdir": "read", - "readdirp": "read", - "readlink": "read", - "readv": "read", - "removexattr": "write", - "rename": "write", - "rmdir": "write", - "setattr": "write", - "setxattr": "write", - "stat": "read", - "statfs": "read", - "symlink": "write", - "truncate": "write", - "unlink": "write", - "writev": "write,fsync,queue", - "xattrop": "write", -} - -# Stolen from gen_fdl.py -def gen_server (templates): - fops_done = [] - for name in fop_table.keys(): - info = fop_table[name].split(",") - kind = info[0] - flags = info[1:] - if ("fsync" in flags) or ("queue" in flags): - flags.append("need_fd") - for fname in flags: - print "#define NSR_CG_%s" % fname.upper() - print generate(templates[kind+"-complete"],name,cbk_subs) - print generate(templates[kind+"-continue"],name,fop_subs) - print generate(templates[kind+"-fan-in"],name,cbk_subs) - print generate(templates[kind+"-dispatch"],name,fop_subs) - print generate(templates[kind+"-fop"],name,fop_subs) - for fname in flags: - print "#undef NSR_CG_%s" % fname.upper() - fops_done.append(name) - # Just for fun, emit the fops table too. - print("struct xlator_fops fops = {") - for x in fops_done: - print(" .%s = nsr_%s,"%(x,x)) - print("};") - -tmpl = load_templates(sys.argv[1]) -for l in open(sys.argv[2],'r').readlines(): - if l.find('#pragma generate') != -1: - print "/* BEGIN GENERATED CODE - DO NOT MODIFY */" - gen_server(tmpl) - print "/* END GENERATED CODE */" - else: - print l[:-1] diff --git a/xlators/experimental/nsr-server/src/nsr-internal.h b/xlators/experimental/nsr-server/src/nsr-internal.h deleted file mode 100644 index d43fbac9a5..0000000000 --- a/xlators/experimental/nsr-server/src/nsr-internal.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#include -#include - -#define LEADER_XATTR "user.nsr.leader" -#define SECOND_CHILD(xl) (xl->children->next->xlator) -#define RECONCILER_PATH NSR_SCRIPT_PREFIX"/reconciler.py" -#define CHANGELOG_ENTRY_SIZE 128 - -enum { - gf_mt_nsr_private_t = gf_common_mt_end + 1, - gf_mt_nsr_fd_ctx_t, - gf_mt_nsr_inode_ctx_t, - gf_mt_nsr_dirty_t, - gf_mt_nsr_end -}; - -typedef enum nsr_recon_notify_ev_id_t { - NSR_RECON_SET_LEADER = 1, - NSR_RECON_ADD_CHILD = 2 -} nsr_recon_notify_ev_id_t; - -typedef struct _nsr_recon_notify_ev_s { - nsr_recon_notify_ev_id_t id; - uint32_t index; /* in case of add */ - struct list_head list; -} nsr_recon_notify_ev_t; - -typedef struct { - /* - * This is a hack to allow a non-leader to accept requests while the - * leader is down, and it only works for n=2. The way it works is that - * "config_leader" indicates the state from our options (via init or - * reconfigure) but "leader" is what the fop code actually looks at. If - * config_leader is true, then leader will *always* be true as well, - * giving that brick precedence. If config_leader is false, then - * leader will only be true if there is no connection to the other - * brick (tracked in nsr_notify). - * - * TBD: implement real leader election - */ - gf_boolean_t config_leader; - gf_boolean_t leader; - uint8_t up_children; - uint8_t n_children; - char *vol_file; - uint32_t current_term; - uint32_t kid_state; - gf_lock_t dirty_lock; - struct list_head dirty_fds; - uint32_t index; - gf_lock_t index_lock; - double quorum_pct; - int term_fd; - long term_total; - long term_read; - /* - * This is a super-duper hack, but it will do for now. The reason it's - * a hack is that we pass this to dict_set_static_bin, so we don't have - * to mess around with allocating and freeing it on every single IPC - * request, but it's totally not thread-safe. On the other hand, there - * should only be one reconciliation thread running and calling these - * functions at a time, so maybe that doesn't matter. - * - * TBD: re-evaluate how to manage this - */ - char term_buf[CHANGELOG_ENTRY_SIZE]; - gf_boolean_t child_up; /* To maintain the state of * - * the translator */ -} nsr_private_t; - -typedef struct { - call_stub_t *stub; - call_stub_t *qstub; - uint32_t call_count; - uint32_t successful_acks; - uint32_t successful_op_ret; - fd_t *fd; - struct list_head qlinks; -} nsr_local_t; - -/* - * This should match whatever changelog returns on the pre-op for us to pass - * when we're ready for our post-op. - */ -typedef uint32_t log_id_t; - -typedef struct { - struct list_head links; - log_id_t id; -} nsr_dirty_list_t; - -typedef struct { - fd_t *fd; - struct list_head dirty_list; - struct list_head fd_list; -} nsr_fd_ctx_t; - -typedef struct { - gf_lock_t lock; - uint32_t active; - struct list_head aqueue; - uint32_t pending; - struct list_head pqueue; -} nsr_inode_ctx_t; - -void nsr_start_reconciler (xlator_t *this); diff --git a/xlators/experimental/nsr-server/src/nsr.c b/xlators/experimental/nsr-server/src/nsr.c deleted file mode 100644 index 0fb618f236..0000000000 --- a/xlators/experimental/nsr-server/src/nsr.c +++ /dev/null @@ -1,1147 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include -#include "call-stub.h" -#include "defaults.h" -#include "xlator.h" -#include "glfs.h" -#include "glfs-internal.h" -#include "run.h" -#include "common-utils.h" -#include "syncop.h" -#include "syscall.h" -#include "compat-errno.h" - -#include "nsr-internal.h" -#include "nsr-messages.h" - -#define NSR_FLUSH_INTERVAL 5 - -enum { - /* echo "cluster/nsr-server" | md5sum | cut -c 1-8 */ - NSR_SERVER_IPC_BASE = 0x0e2d66a5, - NSR_SERVER_TERM_RANGE, - NSR_SERVER_OPEN_TERM, - NSR_SERVER_NEXT_ENTRY -}; - -/* Used to check the quorum of acks received after the fop - * confirming the status of the fop on all the brick processes - * for this particular subvolume - */ -gf_boolean_t -fop_quorum_check (xlator_t *this, double n_children, - double current_state) -{ - nsr_private_t *priv = NULL; - gf_boolean_t result = _gf_false; - double required = 0; - double current = 0; - - GF_VALIDATE_OR_GOTO ("nsr", this, out); - priv = this->private; - GF_VALIDATE_OR_GOTO (this->name, priv, out); - - required = n_children * priv->quorum_pct; - - /* - * Before performing the fop on the leader, we need to check, - * if there is any merit in performing the fop on the leader. - * In a case, where even a successful write on the leader, will - * not meet quorum, there is no point in trying the fop on the - * leader. - * When this function is called after the leader has tried - * performing the fop, this check will calculate quorum taking into - * account the status of the fop on the leader. If the leader's - * op_ret was -1, the complete function would account that by - * decrementing successful_acks by 1 - */ - - current = current_state * 100.0; - - if (current < required) { - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_QUORUM_NOT_MET, - "Quorum not met. quorum_pct = %f " - "Current State = %f, Required State = %f", - priv->quorum_pct, current, - required); - } else - result = _gf_true; - -out: - return result; -} - -nsr_inode_ctx_t * -nsr_get_inode_ctx (xlator_t *this, inode_t *inode) -{ - uint64_t ctx_int = 0LL; - nsr_inode_ctx_t *ctx_ptr; - - if (__inode_ctx_get(inode, this, &ctx_int) == 0) { - ctx_ptr = (nsr_inode_ctx_t *)(long)ctx_int; - } else { - ctx_ptr = GF_CALLOC (1, sizeof(*ctx_ptr), - gf_mt_nsr_inode_ctx_t); - if (ctx_ptr) { - ctx_int = (uint64_t)(long)ctx_ptr; - if (__inode_ctx_set(inode, this, &ctx_int) == 0) { - LOCK_INIT(&ctx_ptr->lock); - INIT_LIST_HEAD(&ctx_ptr->aqueue); - INIT_LIST_HEAD(&ctx_ptr->pqueue); - } else { - GF_FREE(ctx_ptr); - ctx_ptr = NULL; - } - } - - } - - return ctx_ptr; -} - -nsr_fd_ctx_t * -nsr_get_fd_ctx (xlator_t *this, fd_t *fd) -{ - uint64_t ctx_int = 0LL; - nsr_fd_ctx_t *ctx_ptr; - - if (__fd_ctx_get(fd, this, &ctx_int) == 0) { - ctx_ptr = (nsr_fd_ctx_t *)(long)ctx_int; - } else { - ctx_ptr = GF_CALLOC (1, sizeof(*ctx_ptr), gf_mt_nsr_fd_ctx_t); - if (ctx_ptr) { - if (__fd_ctx_set(fd, this, (uint64_t)ctx_ptr) == 0) { - INIT_LIST_HEAD(&ctx_ptr->dirty_list); - INIT_LIST_HEAD(&ctx_ptr->fd_list); - } else { - GF_FREE(ctx_ptr); - ctx_ptr = NULL; - } - } - - } - - return ctx_ptr; -} - -void -nsr_mark_fd_dirty (xlator_t *this, nsr_local_t *local) -{ - fd_t *fd = local->fd; - nsr_fd_ctx_t *ctx_ptr; - nsr_dirty_list_t *dirty; - nsr_private_t *priv = this->private; - - /* - * TBD: don't do any of this for O_SYNC/O_DIRECT writes. - * Unfortunately, that optimization requires that we distinguish - * between writev and other "write" calls, saving the original flags - * and checking them in the callback. Too much work for too little - * gain right now. - */ - - LOCK(&fd->lock); - ctx_ptr = nsr_get_fd_ctx(this, fd); - dirty = GF_CALLOC(1, sizeof(*dirty), gf_mt_nsr_dirty_t); - if (ctx_ptr && dirty) { - gf_msg_trace (this->name, 0, - "marking fd %p as dirty (%p)", fd, dirty); - /* TBD: fill dirty->id from what changelog gave us */ - list_add_tail(&dirty->links, &ctx_ptr->dirty_list); - if (list_empty(&ctx_ptr->fd_list)) { - /* Add a ref so _release doesn't get called. */ - ctx_ptr->fd = fd_ref(fd); - LOCK(&priv->dirty_lock); - list_add_tail (&ctx_ptr->fd_list, - &priv->dirty_fds); - UNLOCK(&priv->dirty_lock); - } - } else { - gf_msg (this->name, GF_LOG_ERROR, ENOMEM, - N_MSG_MEM_ERR, "could not mark %p dirty", fd); - if (ctx_ptr) { - GF_FREE(ctx_ptr); - } - if (dirty) { - GF_FREE(dirty); - } - } - UNLOCK(&fd->lock); -} - -#define NSR_TERM_XATTR "trusted.nsr.term" -#define NSR_INDEX_XATTR "trusted.nsr.index" -#define NSR_REP_COUNT_XATTR "trusted.nsr.rep-count" -#define RECON_TERM_XATTR "trusted.nsr.recon-term" -#define RECON_INDEX_XATTR "trusted.nsr.recon-index" - -#pragma generate - -uint8_t -nsr_count_up_kids (nsr_private_t *priv) -{ - uint8_t retval = 0; - uint8_t i; - - for (i = 0; i < priv->n_children; ++i) { - if (priv->kid_state & (1 << i)) { - ++retval; - } - } - - return retval; -} - -/* - * The fsync machinery looks a lot like that for any write call, but there are - * some important differences that are easy to miss. First, we don't care - * about the xdata that shows whether the call came from a leader or - * reconciliation process. If we're the leader we fan out; if we're not we - * don't. Second, we don't wait for followers before we issue the local call. - * The code generation system could be updated to handle this, and still might - * if we need to implement other "almost identical" paths (e.g. for open), but - * a copy is more readable as long as it's just one. - */ - -int32_t -nsr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - nsr_local_t *local = frame->local; - gf_boolean_t unwind; - - LOCK(&frame->lock); - unwind = !--(local->call_count); - UNLOCK(&frame->lock); - - if (unwind) { - STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, - postbuf, xdata); - } - return 0; -} - -int32_t -nsr_fsync_local_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - nsr_dirty_list_t *dirty; - nsr_dirty_list_t *dtmp; - nsr_local_t *local = frame->local; - - list_for_each_entry_safe (dirty, dtmp, &local->qlinks, links) { - gf_msg_trace (this->name, 0, - "sending post-op on %p (%p)", local->fd, dirty); - GF_FREE(dirty); - } - - return nsr_fsync_cbk (frame, cookie, this, op_ret, op_errno, - prebuf, postbuf, xdata); -} - -int32_t -nsr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, - dict_t *xdata) -{ - nsr_private_t *priv = this->private; - nsr_local_t *local; - uint64_t ctx_int = 0LL; - nsr_fd_ctx_t *ctx_ptr; - xlator_list_t *trav; - - local = mem_get0(this->local_pool); - if (!local) { - STACK_UNWIND_STRICT(fsync, frame, -1, ENOMEM, - NULL, NULL, xdata); - return 0; - } - INIT_LIST_HEAD(&local->qlinks); - frame->local = local; - - /* Move the dirty list from the fd to the fsync request. */ - LOCK(&fd->lock); - if (__fd_ctx_get(fd, this, &ctx_int) == 0) { - ctx_ptr = (nsr_fd_ctx_t *)(long)ctx_int; - list_splice_init (&ctx_ptr->dirty_list, - &local->qlinks); - } - UNLOCK(&fd->lock); - - /* Issue the local call. */ - local->call_count = priv->leader ? priv->n_children : 1; - STACK_WIND (frame, nsr_fsync_local_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, - fd, flags, xdata); - - /* Issue remote calls if we're the leader. */ - if (priv->leader) { - for (trav = this->children->next; trav; trav = trav->next) { - STACK_WIND (frame, nsr_fsync_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsync, - fd, flags, xdata); - } - } - - return 0; -} - -int32_t -nsr_getxattr_special (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) -{ - dict_t *result; - nsr_private_t *priv = this->private; - - if (!priv->leader) { - STACK_UNWIND_STRICT (getxattr, frame, -1, EREMOTE, NULL, NULL); - return 0; - } - - if (!name || (strcmp(name, NSR_REP_COUNT_XATTR) != 0)) { - STACK_WIND_TAIL (frame, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, - loc, name, xdata); - return 0; - } - - result = dict_new(); - if (!result) { - goto dn_failed; - } - - priv->up_children = nsr_count_up_kids(this->private); - if (dict_set_uint32(result, NSR_REP_COUNT_XATTR, - priv->up_children) != 0) { - goto dsu_failed; - } - - STACK_UNWIND_STRICT (getxattr, frame, 0, 0, result, NULL); - dict_destroy(result); - return 0; - -dsu_failed: - dict_destroy(result); -dn_failed: - STACK_UNWIND_STRICT (getxattr, frame, -1, ENOMEM, NULL, NULL); - return 0; -} - -void -nsr_flush_fd (xlator_t *this, nsr_fd_ctx_t *fd_ctx) -{ - nsr_dirty_list_t *dirty; - nsr_dirty_list_t *dtmp; - - list_for_each_entry_safe (dirty, dtmp, &fd_ctx->dirty_list, links) { - gf_msg_trace (this->name, 0, - "sending post-op on %p (%p)", fd_ctx->fd, dirty); - GF_FREE(dirty); - } - - INIT_LIST_HEAD(&fd_ctx->dirty_list); -} - -void * -nsr_flush_thread (void *ctx) -{ - xlator_t *this = ctx; - nsr_private_t *priv = this->private; - struct list_head dirty_fds; - nsr_fd_ctx_t *fd_ctx; - nsr_fd_ctx_t *fd_tmp; - int ret; - - for (;;) { - /* - * We have to be very careful to avoid lock inversions here, so - * we can't just hold priv->dirty_lock while we take and - * release locks for each fd. Instead, we only hold dirty_lock - * at the beginning of each iteration, as we (effectively) make - * a copy of the current list head and then clear the original. - * This leads to four scenarios for adding the first entry to - * an fd and potentially putting it on the global list. - * - * (1) While we're asleep. No lock contention, it just gets - * added and will be processed on the next iteration. - * - * (2) After we've made a local copy, but before we've started - * processing that fd. The new entry will be added to the - * fd (under its lock), and we'll process it on the current - * iteration. - * - * (3) While we're processing the fd. They'll block on the fd - * lock, then see that the list is empty and put it on the - * global list. We'll process it here on the next - * iteration. - * - * (4) While we're working, but after we've processed that fd. - * Same as (1) as far as that fd is concerned. - */ - INIT_LIST_HEAD(&dirty_fds); - LOCK(&priv->dirty_lock); - list_splice_init(&priv->dirty_fds, &dirty_fds); - UNLOCK(&priv->dirty_lock); - - list_for_each_entry_safe (fd_ctx, fd_tmp, &dirty_fds, fd_list) { - ret = syncop_fsync(FIRST_CHILD(this), fd_ctx->fd, 0, - NULL, NULL); - if (ret) { - gf_msg (this->name, GF_LOG_WARNING, 0, - N_MSG_SYS_CALL_FAILURE, - "failed to fsync %p (%d)", - fd_ctx->fd, -ret); - } - - LOCK(&fd_ctx->fd->lock); - nsr_flush_fd(this, fd_ctx); - list_del_init(&fd_ctx->fd_list); - UNLOCK(&fd_ctx->fd->lock); - fd_unref(fd_ctx->fd); - } - - sleep(NSR_FLUSH_INTERVAL); - } - - return NULL; -} - - -int32_t -nsr_get_changelog_dir (xlator_t *this, char **cl_dir_p) -{ - xlator_t *cl_xl; - - /* Find our changelog translator. */ - cl_xl = this; - while (cl_xl) { - if (strcmp(cl_xl->type, "features/changelog") == 0) { - break; - } - cl_xl = cl_xl->children->xlator; - } - if (!cl_xl) { - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_INIT_FAIL, - "failed to find changelog translator"); - return ENOENT; - } - - /* Find the actual changelog directory. */ - if (dict_get_str(cl_xl->options, "changelog-dir", cl_dir_p) != 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_INIT_FAIL, - "failed to find changelog-dir for %s", cl_xl->name); - return ENODATA; - } - - return 0; -} - - -void -nsr_get_terms (call_frame_t *frame, xlator_t *this) -{ - int32_t op_errno; - char *cl_dir; - DIR *fp = NULL; - struct dirent *rd_entry; - struct dirent *rd_result; - int32_t term_first = -1; - int32_t term_contig = -1; - int32_t term_last = -1; - int term_num; - char *probe_str; - dict_t *my_xdata = NULL; - - op_errno = nsr_get_changelog_dir(this, &cl_dir); - if (op_errno) { - goto err; /* Error was already logged. */ - } - op_errno = ENODATA; /* Most common error after this. */ - - rd_entry = alloca (offsetof(struct dirent, d_name) + - pathconf(cl_dir, _PC_NAME_MAX) + 1); - if (!rd_entry) { - goto err; - } - - fp = sys_opendir (cl_dir); - if (!fp) { - op_errno = errno; - goto err; - } - - /* Find first and last terms. */ - for (;;) { - if (readdir_r(fp, rd_entry, &rd_result) != 0) { - op_errno = errno; - goto err; - } - if (!rd_result) { - break; - } - if (fnmatch("TERM.*", rd_entry->d_name, FNM_PATHNAME) != 0) { - continue; - } - /* +5 points to the character after the period */ - term_num = atoi(rd_entry->d_name+5); - gf_msg (this->name, GF_LOG_INFO, 0, - N_MSG_GENERIC, - "%s => %d", rd_entry->d_name, term_num); - if (term_num < 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_INVALID, - "invalid term file name %s", rd_entry->d_name); - op_errno = EINVAL; - goto err; - } - if ((term_first < 0) || (term_first > term_num)) { - term_first = term_num; - } - if ((term_last < 0) || (term_last < term_num)) { - term_last = term_num; - } - } - if ((term_first < 0) || (term_last < 0)) { - /* TBD: are we *sure* there should always be at least one? */ - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_NO_DATA, "no terms found"); - op_errno = EINVAL; - goto err; - } - - sys_closedir (fp); - fp = NULL; - - /* - * Find term_contig, which is the earliest term for which there are - * no gaps between it and term_last. - */ - for (term_contig = term_last; term_contig > 0; --term_contig) { - if (gf_asprintf(&probe_str, "%s/TERM.%d", - cl_dir, term_contig-1) <= 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_MEM_ERR, - "failed to format term %d", term_contig-1); - goto err; - } - if (sys_access(probe_str, F_OK) != 0) { - GF_FREE(probe_str); - break; - } - GF_FREE(probe_str); - } - - gf_msg (this->name, GF_LOG_INFO, 0, - N_MSG_GENERIC, - "found terms %d-%d (%d)", - term_first, term_last, term_contig); - - /* Return what we've found */ - my_xdata = dict_new(); - if (!my_xdata) { - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_MEM_ERR, - "failed to allocate reply dictionary"); - goto err; - } - if (dict_set_int32(my_xdata, "term-first", term_first) != 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_DICT_FLR, - "failed to set term-first"); - goto err; - } - if (dict_set_int32(my_xdata, "term-contig", term_contig) != 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_DICT_FLR, - "failed to set term-contig"); - goto err; - } - if (dict_set_int32(my_xdata, "term-last", term_last) != 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_DICT_FLR, - "failed to set term-last"); - goto err; - } - - /* Finally! */ - STACK_UNWIND_STRICT (ipc, frame, 0, 0, my_xdata); - dict_unref(my_xdata); - return; - -err: - if (fp) { - sys_closedir (fp); - } - if (my_xdata) { - dict_unref(my_xdata); - } - STACK_UNWIND_STRICT (ipc, frame, -1, op_errno, NULL); -} - - -long -get_entry_count (xlator_t *this, int fd) -{ - struct stat buf; - long min; /* last entry not known to be empty */ - long max; /* first entry known to be empty */ - long curr; - char entry[CHANGELOG_ENTRY_SIZE]; - - if (sys_fstat (fd, &buf) < 0) { - return -1; - } - - min = 0; - max = buf.st_size / CHANGELOG_ENTRY_SIZE; - - while ((min+1) < max) { - curr = (min + max) / 2; - if (sys_lseek(fd, curr*CHANGELOG_ENTRY_SIZE, SEEK_SET) < 0) { - return -1; - } - if (sys_read(fd, entry, sizeof(entry)) != sizeof(entry)) { - return -1; - } - if ((entry[0] == '_') && (entry[1] == 'P')) { - min = curr; - } else { - max = curr; - } - } - - if (sys_lseek(fd, 0, SEEK_SET) < 0) { - gf_msg (this->name, GF_LOG_WARNING, 0, - N_MSG_SYS_CALL_FAILURE, - "failed to reset offset"); - } - return max; -} - - -void -nsr_open_term (call_frame_t *frame, xlator_t *this, dict_t *xdata) -{ - int32_t op_errno; - char *cl_dir; - char *term; - char *path; - nsr_private_t *priv = this->private; - - op_errno = nsr_get_changelog_dir(this, &cl_dir); - if (op_errno) { - goto err; - } - - if (dict_get_str(xdata, "term", &term) != 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_NO_DATA, "missing term"); - op_errno = ENODATA; - goto err; - } - - if (gf_asprintf(&path, "%s/TERM.%s", cl_dir, term) < 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_MEM_ERR, "failed to construct path"); - op_errno = ENOMEM; - goto err; - } - - if (priv->term_fd >= 0) { - sys_close (priv->term_fd); - } - priv->term_fd = open(path, O_RDONLY); - if (priv->term_fd < 0) { - op_errno = errno; - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_SYS_CALL_FAILURE, - "failed to open term file"); - goto err; - } - - priv->term_total = get_entry_count(this, priv->term_fd); - if (priv->term_total < 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_NO_DATA, "failed to get entry count"); - sys_close (priv->term_fd); - priv->term_fd = -1; - op_errno = EIO; - goto err; - } - priv->term_read = 0; - - /* Success! */ - STACK_UNWIND_STRICT (ipc, frame, 0, 0, NULL); - return; - -err: - STACK_UNWIND_STRICT (ipc, frame, -1, op_errno, NULL); -} - - -void -nsr_next_entry (call_frame_t *frame, xlator_t *this) -{ - int32_t op_errno = ENOMEM; - nsr_private_t *priv = this->private; - ssize_t nbytes; - dict_t *my_xdata; - - if (priv->term_fd < 0) { - op_errno = EBADFD; - goto err; - } - - if (priv->term_read >= priv->term_total) { - op_errno = ENODATA; - goto err; - } - - nbytes = sys_read (priv->term_fd, priv->term_buf, CHANGELOG_ENTRY_SIZE); - if (nbytes < CHANGELOG_ENTRY_SIZE) { - if (nbytes < 0) { - op_errno = errno; - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_SYS_CALL_FAILURE, - "error reading next entry: %s", - strerror(errno)); - } else { - op_errno = EIO; - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_SYS_CALL_FAILURE, - "got %ld/%d bytes for next entry", - nbytes, CHANGELOG_ENTRY_SIZE); - } - goto err; - } - ++(priv->term_read); - - my_xdata = dict_new(); - if (!my_xdata) { - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_MEM_ERR, "failed to allocate reply xdata"); - goto err; - } - - if (dict_set_static_bin(my_xdata, "data", - priv->term_buf, CHANGELOG_ENTRY_SIZE) != 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, - N_MSG_DICT_FLR, "failed to assign reply xdata"); - goto err; - } - - STACK_UNWIND_STRICT (ipc, frame, 0, 0, my_xdata); - dict_unref(my_xdata); - return; - -err: - STACK_UNWIND_STRICT (ipc, frame, -1, op_errno, NULL); -} - - -int32_t -nsr_ipc (call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) -{ - switch (op) { - case NSR_SERVER_TERM_RANGE: - nsr_get_terms(frame, this); - break; - case NSR_SERVER_OPEN_TERM: - nsr_open_term(frame, this, xdata); - break; - case NSR_SERVER_NEXT_ENTRY: - nsr_next_entry(frame, this); - break; - default: - STACK_WIND_TAIL (frame, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ipc, - op, xdata); - } - - return 0; -} - - -int32_t -nsr_forget (xlator_t *this, inode_t *inode) -{ - uint64_t ctx = 0LL; - - if ((inode_ctx_del(inode, this, &ctx) == 0) && ctx) { - GF_FREE((void *)(long)ctx); - } - - return 0; -} - -int32_t -nsr_release (xlator_t *this, fd_t *fd) -{ - uint64_t ctx = 0LL; - - if ((fd_ctx_del(fd, this, &ctx) == 0) && ctx) { - GF_FREE((void *)(long)ctx); - } - - return 0; -} - -struct xlator_cbks cbks = { - .forget = nsr_forget, - .release = nsr_release, -}; - -int -nsr_reconfigure (xlator_t *this, dict_t *options) -{ - nsr_private_t *priv = this->private; - - GF_OPTION_RECONF ("leader", - priv->config_leader, options, bool, err); - GF_OPTION_RECONF ("quorum-percent", - priv->quorum_pct, options, percent, err); - gf_msg (this->name, GF_LOG_INFO, 0, N_MSG_GENERIC, - "reconfigure called, config_leader = %d, quorum_pct = %.1f\n", - priv->leader, priv->quorum_pct); - - priv->leader = priv->config_leader; - - return 0; - -err: - return -1; -} - -int -nsr_get_child_index (xlator_t *this, xlator_t *kid) -{ - xlator_list_t *trav; - int retval = -1; - - for (trav = this->children; trav; trav = trav->next) { - ++retval; - if (trav->xlator == kid) { - return retval; - } - } - - return -1; -} - -/* - * Child notify handling is unreasonably FUBAR. Sometimes we'll get a - * CHILD_DOWN for a protocol/client child before we ever got a CHILD_UP for it. - * Other times we won't. Because it's effectively random (probably racy), we - * can't just maintain a count. We actually have to keep track of the state - * for each child separately, to filter out the bogus CHILD_DOWN events, and - * then generate counts on demand. - */ -int -nsr_notify (xlator_t *this, int event, void *data, ...) -{ - nsr_private_t *priv = this->private; - int index = -1; - int ret = -1; - gf_boolean_t result = _gf_false; - gf_boolean_t relevant = _gf_false; - - switch (event) { - case GF_EVENT_CHILD_UP: - index = nsr_get_child_index(this, data); - if (index >= 0) { - /* Check if the child was previously down - * and it's not a false CHILD_UP - */ - if (!(priv->kid_state & (1 << index))) { - relevant = _gf_true; - } - - priv->kid_state |= (1 << index); - priv->up_children = nsr_count_up_kids(priv); - gf_msg (this->name, GF_LOG_INFO, 0, N_MSG_GENERIC, - "got CHILD_UP for %s, now %u kids", - ((xlator_t *)data)->name, - priv->up_children); - if (!priv->config_leader && (priv->up_children > 1)) { - priv->leader = _gf_false; - } - - /* If it's not relevant, or we have already * - * sent CHILD_UP just break */ - if (!relevant || priv->child_up) - break; - - /* If it's not a leader, just send the notify up */ - if (!priv->leader) { - ret = default_notify(this, event, data); - if (!ret) - priv->child_up = _gf_true; - break; - } - - result = fop_quorum_check (this, - (double)(priv->n_children - 1), - (double)(priv->up_children - 1)); - if (result == _gf_false) { - gf_msg (this->name, GF_LOG_INFO, 0, - N_MSG_GENERIC, "Not enough children " - "are up to meet quorum. Waiting to " - "send CHILD_UP from leader"); - } else { - gf_msg (this->name, GF_LOG_INFO, 0, - N_MSG_GENERIC, "Enough children are up " - "to meet quorum. Sending CHILD_UP " - "from leader"); - ret = default_notify(this, event, data); - if (!ret) - priv->child_up = _gf_true; - } - } - break; - case GF_EVENT_CHILD_DOWN: - index = nsr_get_child_index(this, data); - if (index >= 0) { - /* Check if the child was previously up - * and it's not a false CHILD_DOWN - */ - if (priv->kid_state & (1 << index)) { - relevant = _gf_true; - } - priv->kid_state &= ~(1 << index); - priv->up_children = nsr_count_up_kids(priv); - gf_msg (this->name, GF_LOG_INFO, 0, N_MSG_GENERIC, - "got CHILD_DOWN for %s, now %u kids", - ((xlator_t *)data)->name, - priv->up_children); - if (!priv->config_leader && (priv->up_children < 2) - && relevant) { - priv->leader = _gf_true; - } - - /* If it's not relevant, or we have already * - * sent CHILD_DOWN just break */ - if (!relevant || !priv->child_up) - break; - - /* If it's not a leader, just break coz we shouldn't * - * propagate the failure from the failure till it * - * itself goes down * - */ - if (!priv->leader) { - break; - } - - result = fop_quorum_check (this, - (double)(priv->n_children - 1), - (double)(priv->up_children - 1)); - if (result == _gf_false) { - gf_msg (this->name, GF_LOG_INFO, 0, - N_MSG_GENERIC, "Enough children are " - "to down to fail quorum. " - "Sending CHILD_DOWN from leader"); - ret = default_notify(this, event, data); - if (!ret) - priv->child_up = _gf_false; - } else { - gf_msg (this->name, GF_LOG_INFO, 0, - N_MSG_GENERIC, "Not enough children " - "are down to fail quorum. Waiting to " - "send CHILD_DOWN from leader"); - } - } - break; - default: - ret = default_notify(this, event, data); - } - - return ret; -} - - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - GF_VALIDATE_OR_GOTO ("nsr", this, out); - - ret = xlator_mem_acct_init (this, gf_mt_nsr_end + 1); - - if (ret != 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, N_MSG_MEM_ERR, - "Memory accounting init" "failed"); - return ret; - } -out: - return ret; -} - - -void -nsr_deallocate_priv (nsr_private_t *priv) -{ - if (!priv) { - return; - } - - GF_FREE(priv); -} - - -int32_t -nsr_init (xlator_t *this) -{ - xlator_list_t *remote; - xlator_list_t *local; - nsr_private_t *priv = NULL; - xlator_list_t *trav; - pthread_t kid; - extern xlator_t global_xlator; - glusterfs_ctx_t *oldctx = global_xlator.ctx; - - /* - * Any fop that gets special treatment has to be patched in here, - * because the compiled-in table is produced by the code generator and - * only contains generated functions. Note that we have to go through - * this->fops because of some dynamic-linking strangeness; modifying - * the static table doesn't work. - */ - this->fops->getxattr = nsr_getxattr_special; - this->fops->fsync = nsr_fsync; - this->fops->ipc = nsr_ipc; - - local = this->children; - if (!local) { - gf_msg (this->name, GF_LOG_ERROR, 0, N_MSG_NO_DATA, - "no local subvolume"); - goto err; - } - - remote = local->next; - if (!remote) { - gf_msg (this->name, GF_LOG_ERROR, 0, N_MSG_NO_DATA, - "no remote subvolumes"); - goto err; - } - - this->local_pool = mem_pool_new (nsr_local_t, 128); - if (!this->local_pool) { - gf_msg (this->name, GF_LOG_ERROR, 0, N_MSG_MEM_ERR, - "failed to create nsr_local_t pool"); - goto err; - } - - priv = GF_CALLOC (1, sizeof(*priv), gf_mt_nsr_private_t); - if (!priv) { - gf_msg (this->name, GF_LOG_ERROR, 0, N_MSG_MEM_ERR, - "could not allocate priv"); - goto err; - } - - for (trav = this->children; trav; trav = trav->next) { - ++(priv->n_children); - } - - LOCK_INIT(&priv->dirty_lock); - LOCK_INIT(&priv->index_lock); - INIT_LIST_HEAD(&priv->dirty_fds); - priv->term_fd = -1; - - this->private = priv; - - GF_OPTION_INIT ("leader", priv->config_leader, bool, err); - GF_OPTION_INIT ("quorum-percent", priv->quorum_pct, percent, err); - - priv->leader = priv->config_leader; - priv->child_up = _gf_false; - - if (pthread_create(&kid, NULL, nsr_flush_thread, - this) != 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, N_MSG_SYS_CALL_FAILURE, - "could not start flush thread"); - /* TBD: treat this as a fatal error? */ - } - - /* - * Calling glfs_new changes old->ctx, even if THIS still points - * to global_xlator. That causes problems later in the main - * thread, when gf_log_dump_graph tries to use the FILE after - * we've mucked with it and gets a segfault in __fprintf_chk. - * We can avoid all that by undoing the damage before we - * continue. - */ - global_xlator.ctx = oldctx; - - return 0; - -err: - nsr_deallocate_priv(priv); - return -1; -} - - -void -nsr_fini (xlator_t *this) -{ - nsr_deallocate_priv(this->private); -} - -class_methods_t class_methods = { - .init = nsr_init, - .fini = nsr_fini, - .reconfigure = nsr_reconfigure, - .notify = nsr_notify, -}; - -struct volume_options options[] = { - { .key = {"leader"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "false", - .description = "Start in the leader role. This is only for " - "bootstrapping the code, and should go away when we " - "have real leader election." - }, - { .key = {"vol-name"}, - .type = GF_OPTION_TYPE_STR, - .description = "volume name" - }, - { .key = {"my-name"}, - .type = GF_OPTION_TYPE_STR, - .description = "brick name in form of host:/path" - }, - { .key = {"etcd-servers"}, - .type = GF_OPTION_TYPE_STR, - .description = "list of comma seperated etc servers" - }, - { .key = {"subvol-uuid"}, - .type = GF_OPTION_TYPE_STR, - .description = "UUID for this NSR (sub)volume" - }, - { .key = {"quorum-percent"}, - .type = GF_OPTION_TYPE_PERCENT, - .default_value = "50.0", - .description = "percentage of rep_count-1 that must be up" - }, - { .key = {NULL} }, -}; diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 44b3e77749..62a4f31cd6 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -1876,7 +1876,7 @@ add_one_peer (volgen_graph_t *graph, glusterd_brickinfo_t *peer, } int -add_nsr_stuff (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, +add_jbr_stuff (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, glusterd_brickinfo_t *brickinfo) { xlator_t *me; @@ -1886,8 +1886,8 @@ add_nsr_stuff (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, uint16_t index = 0; xlator_t *kid; - /* Create the NSR xlator, but defer linkage for now. */ - me = xlator_instantiate ("experimental/nsr", "%s-nsr", + /* Create the JBR xlator, but defer linkage for now. */ + me = xlator_instantiate ("experimental/jbr", "%s-jbr", volinfo->volname); if (!me || volgen_xlator_link(me, first_of(graph))) { return -1; @@ -1960,9 +1960,9 @@ brick_graph_add_index (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, if (!graph || !volinfo || !brickinfo || !set_dict) goto out; - /* For NSR we don't need/want index. */ - if (glusterd_volinfo_get_boolean(volinfo, "cluster.nsr") > 0) { - return add_nsr_stuff (graph, volinfo, brickinfo); + /* For JBR we don't need/want index. */ + if (glusterd_volinfo_get_boolean(volinfo, "cluster.jbr") > 0) { + return add_jbr_stuff (graph, volinfo, brickinfo); } xl = volgen_graph_add (graph, "features/index", volinfo->volname); @@ -3552,8 +3552,8 @@ volgen_graph_build_afr_clusters (volgen_graph_t *graph, char option[32] = {0}; int start_count = 0; - if (glusterd_volinfo_get_boolean(volinfo, "cluster.nsr") > 0) { - replicate_type = "experimental/nsrc"; + if (glusterd_volinfo_get_boolean(volinfo, "cluster.jbr") > 0) { + replicate_type = "experimental/jbrc"; } else { replicate_type = "cluster/replicate"; } @@ -5309,7 +5309,7 @@ assign_groups (glusterd_volinfo_t *volinfo) gf_uuid_generate(tmp_uuid); } brickinfo->group = group_num; - gf_uuid_copy(brickinfo->nsr_uuid, tmp_uuid); + gf_uuid_copy(brickinfo->jbr_uuid, tmp_uuid); if (++in_group >= volinfo->replica_count) { in_group = 0; ++group_num; @@ -5385,7 +5385,7 @@ generate_brick_volfiles (glusterd_volinfo_t *volinfo) } } - if (glusterd_volinfo_get_boolean(volinfo, "cluster.nsr") > 0) { + if (glusterd_volinfo_get_boolean(volinfo, "cluster.jbr") > 0) { assign_groups(volinfo); } diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index ecec57b415..40323bb7ec 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -2735,15 +2735,15 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = GD_OP_VERSION_3_7_6, .flags = OPT_FLAG_CLIENT_OPT }, - { .key = "cluster.nsr", - .voltype = "experimental/nsr", - .option = "!nsr", + { .key = "cluster.jbr", + .voltype = "experimental/jbr", + .option = "!jbr", .op_version = GD_OP_VERSION_4_0_0, - .description = "enable NSR instead of AFR for replication", + .description = "enable JBR instead of AFR for replication", .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT }, - { .key = "cluster.nsr.quorum-percent", - .voltype = "experimental/nsr", + { .key = "cluster.jbr.quorum-percent", + .voltype = "experimental/jbr", .option = "quorum-percent", .op_version = GD_OP_VERSION_4_0_0, .description = "percent of rep_count-1 bricks that must be up" diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index 34e3e19d32..587c2e8277 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -210,13 +210,13 @@ struct glusterd_brickinfo { int32_t snap_status; /* * The group is used to identify which bricks are part of the same - * replica set during brick-volfile generation, so that NSR volfiles + * replica set during brick-volfile generation, so that JBR volfiles * can "cross-connect" the bricks to one another. It is also used by * AFR to load the arbiter xlator in the appropriate brick in case of * a replica 3 volume with arbiter enabled. */ uint16_t group; - uuid_t nsr_uuid; + uuid_t jbr_uuid; }; typedef struct glusterd_brickinfo glusterd_brickinfo_t; -- cgit