From 1392da3e237d8ea080573909015916e3544a6d2c Mon Sep 17 00:00:00 2001 From: Xavier Hernandez Date: Thu, 15 May 2014 10:35:14 +0200 Subject: cli/glusterd: Added support for dispersed volumes Two new options have been added to the 'create' command of the cli interface: disperse [<count>] redundancy <count> Both are optional. A dispersed volume is created by specifying at least one of them. If 'disperse' is missing, or it is present but '<count>' is not given, the number of bricks enumerated in the command line is taken as the disperse count. If 'redundancy' is missing, the lowest optimal value is assumed. A configuration is considered optimal (for most workloads) when the difference (disperse count - redundancy count) is a power of 2. If the resulting redundancy is 1, the volume is created normally, but if it's greater than 1, a warning is shown to the user and he/she must answer yes/no to continue volume creation. If there isn't any optimal value for the given number of bricks, a warning is also shown and, if the user accepts, a redundancy of 1 is used. If 'redundancy' is specified and the resulting volume is not optimal, another warning is shown to the user. A distributed-disperse volume can be created using a number of bricks that is a multiple of the disperse count. 
Change-Id: Iab93efbe78e905cdb91f54f3741599f7ea6645e4 BUG: 1118629 Signed-off-by: Xavier Hernandez Reviewed-on: http://review.gluster.org/7782 Tested-by: Gluster Build System Reviewed-by: Jeff Darcy Reviewed-by: Vijay Bellur --- cli/src/cli-cmd-parser.c | 203 ++++++++++++++++++++- cli/src/cli-cmd-volume.c | 58 ++++-- cli/src/cli-rpc-ops.c | 21 ++- cli/src/cli-xml-output.c | 28 ++- cli/src/cli.h | 4 +- rpc/xdr/src/cli1-xdr.x | 3 +- tests/basic/ec/ec-12-4.t | 14 ++ tests/basic/ec/ec-3-1.t | 14 ++ tests/basic/ec/ec-4-1.t | 14 ++ tests/basic/ec/ec-5-1.t | 14 ++ tests/basic/ec/ec-5-2.t | 14 ++ tests/basic/ec/ec-6-2.t | 14 ++ tests/basic/ec/ec-7-3.t | 14 ++ tests/basic/ec/ec-common | 143 +++++++++++++++ tests/basic/ec/ec.t | 233 ++++++++++++++++++++++++ tests/basic/ec/self-heal.t | 123 +++++++++++++ xlators/mgmt/glusterd/src/glusterd-brick-ops.c | 15 ++ xlators/mgmt/glusterd/src/glusterd-handler.c | 10 + xlators/mgmt/glusterd/src/glusterd-store.c | 23 +++ xlators/mgmt/glusterd/src/glusterd-store.h | 2 + xlators/mgmt/glusterd/src/glusterd-utils.c | 80 +++++++- xlators/mgmt/glusterd/src/glusterd-volgen.c | 24 +++ xlators/mgmt/glusterd/src/glusterd-volume-ops.c | 21 +++ xlators/mgmt/glusterd/src/glusterd.h | 2 + 24 files changed, 1054 insertions(+), 37 deletions(-) create mode 100644 tests/basic/ec/ec-12-4.t create mode 100644 tests/basic/ec/ec-3-1.t create mode 100644 tests/basic/ec/ec-4-1.t create mode 100644 tests/basic/ec/ec-5-1.t create mode 100644 tests/basic/ec/ec-5-2.t create mode 100644 tests/basic/ec/ec-6-2.t create mode 100644 tests/basic/ec/ec-7-3.t create mode 100644 tests/basic/ec/ec-common create mode 100644 tests/basic/ec/ec.t create mode 100644 tests/basic/ec/self-heal.t diff --git a/cli/src/cli-cmd-parser.c b/cli/src/cli-cmd-parser.c index 1a39be8d12..4a00b8485d 100644 --- a/cli/src/cli-cmd-parser.c +++ b/cli/src/cli-cmd-parser.c @@ -177,7 +177,86 @@ out: } int32_t -cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options) 
+cli_cmd_create_disperse_check(struct cli_state * state, int * disperse, + int * redundancy, int count) +{ + int i = 0; + int tmp = 0; + gf_answer_t answer = GF_ANSWER_NO; + char question[128]; + + const char * question1 = "There isn't an optimal redundancy value " + "for this configuration. Do you want to " + "create the volume with redundancy 1 ?"; + + const char * question2 = "The optimal redundancy for this " + "configuration is %d. Do you want to create " + "the volume with this value ?"; + + const char * question3 = "This configuration is not optimal on most " + "workloads. Do you want to use it ?"; + + if (*disperse <= 0) { + if (count < 3) { + cli_err ("number of bricks must be greater " + "than 2"); + + return -1; + } + *disperse = count; + } + + if (*redundancy == 0) { + tmp = *disperse - 1; + for (i = tmp / 2; + (i > 0) && ((tmp & -tmp) != tmp); + i--, tmp--); + + if (i == 0) { + answer = cli_cmd_get_confirmation(state, question1); + if (answer == GF_ANSWER_NO) + return -1; + + *redundancy = 1; + } + else + { + *redundancy = *disperse - tmp; + if (*redundancy > 1) { + sprintf(question, question2, *redundancy); + answer = cli_cmd_get_confirmation(state, + question); + if (answer == GF_ANSWER_NO) + return -1; + } + } + + tmp = 0; + } + else { + tmp = *disperse - *redundancy; + } + + if (*redundancy > (*disperse - 1) / 2) { + cli_err ("redundancy must be less than %d for a " + "disperse %d volume", + (*disperse + 1) / 2, *disperse); + + return -1; + } + + if ((tmp & -tmp) != tmp) { + answer = cli_cmd_get_confirmation(state, question3); + if (answer == GF_ANSWER_NO) + return -1; + } + + return 0; +} + +int32_t +cli_cmd_volume_create_parse (struct cli_state *state, const char **words, + int wordcount, dict_t **options) { dict_t *dict = NULL; char *volname = NULL; @@ -191,7 +270,8 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options int32_t index = 0; char *bricks = NULL; int32_t brick_count = 0; - char *opwords[] = { "replica", 
"stripe", "transport", NULL }; + char *opwords[] = { "replica", "stripe", "transport", "disperse", + "redundancy", NULL }; char *invalid_volnames[] = {"volume", "type", "subvolumes", "option", "end-volume", "all", "volume_not_in_ring", @@ -200,9 +280,12 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options "snap-max-soft-limit", "auto-delete", NULL}; char *w = NULL; + char *ptr = NULL; int op_count = 0; int32_t replica_count = 1; int32_t stripe_count = 1; + int32_t disperse_count = -1; + int32_t redundancy_count = 0; gf_boolean_t is_force = _gf_false; int wc = wordcount; @@ -279,6 +362,10 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options case GF_CLUSTER_TYPE_STRIPE: type = GF_CLUSTER_TYPE_STRIPE_REPLICATE; break; + case GF_CLUSTER_TYPE_DISPERSE: + cli_err ("replicated-dispersed volume is not " + "supported"); + goto out; } if (wordcount < (index+2)) { @@ -310,6 +397,10 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options case GF_CLUSTER_TYPE_REPLICATE: type = GF_CLUSTER_TYPE_STRIPE_REPLICATE; break; + case GF_CLUSTER_TYPE_DISPERSE: + cli_err ("striped-dispersed volume is not " + "supported"); + goto out; } if (wordcount < (index + 2)) { ret = -1; @@ -348,6 +439,90 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options goto out; } index += 2; + + } else if ((strcmp (w, "disperse")) == 0) { + switch (type) { + case GF_CLUSTER_TYPE_DISPERSE: + if (disperse_count >= 0) { + cli_err ("disperse option given " + "twice"); + goto out; + } + break; + case GF_CLUSTER_TYPE_NONE: + type = GF_CLUSTER_TYPE_DISPERSE; + break; + case GF_CLUSTER_TYPE_STRIPE_REPLICATE: + cli_err ("striped-replicated-dispersed volume " + "is not supported"); + goto out; + case GF_CLUSTER_TYPE_STRIPE: + cli_err ("striped-dispersed volume is not " + "supported"); + goto out; + case GF_CLUSTER_TYPE_REPLICATE: + cli_err ("replicated-dispersed volume is not " + "supported"); + goto 
out; + } + + if (wordcount >= (index+2)) { + disperse_count = strtol (words[index + 1], + &ptr, 0); + if (*ptr != 0) + disperse_count = 0; + else { + if (disperse_count < 3) { + cli_err ("disperse count must " + "be greater than 2"); + ret = -1; + goto out; + } + index++; + } + } + + index++; + + } else if ((strcmp (w, "redundancy")) == 0) { + switch (type) { + case GF_CLUSTER_TYPE_NONE: + type = GF_CLUSTER_TYPE_DISPERSE; + break; + case GF_CLUSTER_TYPE_DISPERSE: + if (redundancy_count > 0) { + cli_err ("redundancy option given " + "twice"); + goto out; + } + break; + case GF_CLUSTER_TYPE_STRIPE_REPLICATE: + cli_err ("striped-replicated-dispersed volume " + "is not supported"); + goto out; + case GF_CLUSTER_TYPE_STRIPE: + cli_err ("striped-dispersed volume is not " + "supported"); + goto out; + case GF_CLUSTER_TYPE_REPLICATE: + cli_err ("replicated-dispersed volume is not " + "supported"); + goto out; + } + + if (wordcount < (index+2)) { + ret = -1; + goto out; + } + redundancy_count = strtol (words[index+1], NULL, 0); + if (redundancy_count < 1) { + cli_err ("redundancy must be greater than 0"); + ret = -1; + goto out; + } + + index += 2; + } else { GF_ASSERT (!"opword mismatch"); ret = -1; @@ -359,8 +534,6 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options if (!trans_type) trans_type = gf_strdup ("tcp"); - sub_count = stripe_count * replica_count; - /* reset the count value now */ count = 1; @@ -389,6 +562,23 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options goto out; } + if (type == GF_CLUSTER_TYPE_DISPERSE) { + ret = cli_cmd_create_disperse_check(state, &disperse_count, + &redundancy_count, + brick_count); + if (!ret) + ret = dict_set_int32 (dict, "disperse-count", + disperse_count); + if (!ret) + ret = dict_set_int32 (dict, "redundancy-count", + redundancy_count); + if (ret) + goto out; + + sub_count = disperse_count; + } else + sub_count = stripe_count * replica_count; + if (brick_count % 
sub_count) { if (type == GF_CLUSTER_TYPE_STRIPE) cli_err ("number of bricks is not a multiple of " @@ -396,6 +586,9 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options else if (type == GF_CLUSTER_TYPE_REPLICATE) cli_err ("number of bricks is not a multiple of " "replica count"); + else if (type == GF_CLUSTER_TYPE_DISPERSE) + cli_err ("number of bricks is not a multiple of " + "disperse count"); else cli_err ("number of bricks given doesn't match " "required count"); @@ -404,7 +597,7 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options goto out; } - /* Everything if parsed fine. start setting info in dict */ + /* Everything is parsed fine. start setting info in dict */ ret = dict_set_str (dict, "volname", volname); if (ret) goto out; diff --git a/cli/src/cli-cmd-volume.c b/cli/src/cli-cmd-volume.c index b1b6c8275b..43e696d56c 100644 --- a/cli/src/cli-cmd-volume.c +++ b/cli/src/cli-cmd-volume.c @@ -362,7 +362,7 @@ cli_cmd_volume_create_cbk (struct cli_state *state, struct cli_cmd_word *word, if (!frame) goto out; - ret = cli_cmd_volume_create_parse (words, wordcount, &options); + ret = cli_cmd_volume_create_parse (state, words, wordcount, &options); if (ret) { cli_usage_out (word->pattern); @@ -376,32 +376,55 @@ cli_cmd_volume_create_cbk (struct cli_state *state, struct cli_cmd_word *word, goto out; } if ((type == GF_CLUSTER_TYPE_REPLICATE) || - (type == GF_CLUSTER_TYPE_STRIPE_REPLICATE)) { - if ((ret = dict_get_str (options, "bricks", &brick_list)) != 0) { - gf_log ("cli", GF_LOG_ERROR, "Replica bricks check : " - "Could not retrieve bricks list"); + (type == GF_CLUSTER_TYPE_STRIPE_REPLICATE) || + (type == GF_CLUSTER_TYPE_DISPERSE)) { + if ((ret = dict_get_str (options, "bricks", + &brick_list)) != 0) { + gf_log ("cli", GF_LOG_ERROR, "Bricks check : Could " + "not retrieve bricks " + "list"); goto out; } - if ((ret = dict_get_int32 (options, "count", &brick_count)) != 0) { - gf_log ("cli", GF_LOG_ERROR, 
"Replica bricks check : " - "Could not retrieve brick count"); + if ((ret = dict_get_int32 (options, "count", + &brick_count)) != 0) { + gf_log ("cli", GF_LOG_ERROR, "Bricks check : Could " + "not retrieve brick " + "count"); goto out; } - if ((ret = dict_get_int32 (options, "replica-count", &sub_count)) != 0) { - gf_log ("cli", GF_LOG_ERROR, "Replica bricks check : " - "Could not retrieve replica count"); - goto out; + + if (type != GF_CLUSTER_TYPE_DISPERSE) { + if ((ret = dict_get_int32 (options, "replica-count", + &sub_count)) != 0) { + gf_log ("cli", GF_LOG_ERROR, "Bricks check : " + "Could not retrieve " + "replica count"); + goto out; + } + gf_log ("cli", GF_LOG_INFO, "Replicate cluster type found." + " Checking brick order."); + } else { + ret = dict_get_int32 (options, "disperse-count", + &sub_count); + if (ret) { + gf_log ("cli", GF_LOG_ERROR, "Bricks check : " + "Could not retrieve " + "disperse count"); + goto out; + } + gf_log ("cli", GF_LOG_INFO, "Disperse cluster type found. " + "Checking brick order."); } - gf_log ("cli", GF_LOG_INFO, "Replicate cluster type found." - " Checking brick order."); - ret = cli_cmd_check_brick_order (state, brick_list, brick_count, sub_count); + ret = cli_cmd_check_brick_order (state, brick_list, + brick_count, sub_count); if (ret) { - gf_log("cli", GF_LOG_INFO, "Not creating volume because of bad brick order"); + gf_log("cli", GF_LOG_INFO, "Not creating volume " + "because of bad brick " + "order"); goto out; } } - ret = dict_get_str (options, "transport", &trans_type); if (ret) { gf_log("cli", GF_LOG_ERROR, "Unable to get transport type"); @@ -2328,6 +2351,7 @@ struct cli_cmd volume_cmds[] = { "list information of all volumes"}, { "volume create [stripe ] [replica ] " + "[disperse []] [redundancy ] " "[transport ] " #ifdef HAVE_BD_XLATOR "?" 
diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c index c077622c0f..43db8358bc 100644 --- a/cli/src/cli-rpc-ops.c +++ b/cli/src/cli-rpc-ops.c @@ -59,9 +59,11 @@ char *cli_vol_type_str[] = {"Distribute", "Stripe", "Replicate", "Striped-Replicate", + "Disperse", "Distributed-Stripe", "Distributed-Replicate", "Distributed-Striped-Replicate", + "Distributed-Disperse", }; char *cli_vol_status_str[] = {"Created", @@ -518,6 +520,8 @@ gf_cli_get_volume_cbk (struct rpc_req *req, struct iovec *iov, int32_t dist_count = 0; int32_t stripe_count = 0; int32_t replica_count = 0; + int32_t disperse_count = 0; + int32_t redundancy_count = 0; int32_t vol_type = 0; int32_t transport = 0; char *volume_id_str = NULL; @@ -671,6 +675,16 @@ xml_output: if (ret) goto out; + snprintf (key, 256, "volume%d.disperse_count", i); + ret = dict_get_int32 (dict, key, &disperse_count); + if (ret) + goto out; + + snprintf (key, 256, "volume%d.redundancy_count", i); + ret = dict_get_int32 (dict, key, &redundancy_count); + if (ret) + goto out; + snprintf (key, 256, "volume%d.transport", i); ret = dict_get_int32 (dict, key, &transport); if (ret) @@ -685,7 +699,7 @@ xml_output: // Distributed (stripe/replicate/stripe-replica) setups if ((type > 0) && ( dist_count < brick_count)) - vol_type = type + 3; + vol_type = type + 4; cli_out ("Volume Name: %s", volname); cli_out ("Type: %s", cli_vol_type_str[vol_type]); @@ -734,6 +748,11 @@ next: brick_count); } else if (type == GF_CLUSTER_TYPE_NONE) { cli_out ("Number of Bricks: %d", brick_count); + } else if (type == GF_CLUSTER_TYPE_DISPERSE) { + cli_out ("Number of Bricks: %d x (%d + %d) = %d", + (brick_count / dist_count), + disperse_count - redundancy_count, + redundancy_count, brick_count); } else { /* For both replicate and stripe, dist_count is good enough */ diff --git a/cli/src/cli-xml-output.c b/cli/src/cli-xml-output.c index b16c238f7f..1bf4e87464 100644 --- a/cli/src/cli-xml-output.c +++ b/cli/src/cli-xml-output.c @@ -2528,6 +2528,8 @@ 
cli_xml_output_vol_info (cli_local_t *local, dict_t *dict) int dist_count = 0; int stripe_count = 0; int replica_count = 0; + int disperse_count = 0; + int redundancy_count = 0; int transport = 0; char *brick = NULL; char key[1024] = {0,}; @@ -2621,14 +2623,36 @@ cli_xml_output_vol_info (cli_local_t *local, dict_t *dict) "%d", replica_count); XML_RET_CHECK_AND_GOTO (ret, out); + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "volume%d.disperse_count", i); + ret = dict_get_int32 (dict, key, &disperse_count); + if (ret) + goto out; + ret = xmlTextWriterWriteFormatElement (local->writer, + (xmlChar *)"disperseCount", + "%d", disperse_count); + XML_RET_CHECK_AND_GOTO (ret, out); + + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "volume%d.redundancy_count", i); + ret = dict_get_int32 (dict, key, &redundancy_count); + if (ret) + goto out; + ret = xmlTextWriterWriteFormatElement (local->writer, + (xmlChar *)"redundancyCount", + "%d", redundancy_count); + XML_RET_CHECK_AND_GOTO (ret, out); + memset (key, 0, sizeof (key)); snprintf (key, sizeof (key), "volume%d.type", i); ret = dict_get_int32 (dict, key, &type); if (ret) goto out; - /* For Distributed-(stripe,replicate,stipe-replicate) types */ + /* For Distributed-(stripe,replicate,stipe-replicate,disperse) + types + */ if ((type > 0) && (dist_count < brick_count)) - type += 3; + type += 4; ret = xmlTextWriterWriteFormatElement (local->writer, (xmlChar *)"type", "%d", type); diff --git a/cli/src/cli.h b/cli/src/cli.h index 69a7e82bf6..a1a78eca2b 100644 --- a/cli/src/cli.h +++ b/cli/src/cli.h @@ -221,8 +221,8 @@ cli_submit_request (struct rpc_clnt *rpc, void *req, call_frame_t *frame, xlator_t *this, fop_cbk_fn_t cbkfn, xdrproc_t xdrproc); int32_t -cli_cmd_volume_create_parse (const char **words, int wordcount, - dict_t **options); +cli_cmd_volume_create_parse (struct cli_state *state, const char **words, + int wordcount, dict_t **options); int32_t cli_cmd_volume_reset_parse (const char 
**words, int wordcount, dict_t **opt); diff --git a/rpc/xdr/src/cli1-xdr.x b/rpc/xdr/src/cli1-xdr.x index 3c43e374d9..3a9841934c 100644 --- a/rpc/xdr/src/cli1-xdr.x +++ b/rpc/xdr/src/cli1-xdr.x @@ -23,7 +23,8 @@ GF_CLUSTER_TYPE_NONE = 0, GF_CLUSTER_TYPE_STRIPE, GF_CLUSTER_TYPE_REPLICATE, - GF_CLUSTER_TYPE_STRIPE_REPLICATE + GF_CLUSTER_TYPE_STRIPE_REPLICATE, + GF_CLUSTER_TYPE_DISPERSE }; enum gf1_cli_replace_op { diff --git a/tests/basic/ec/ec-12-4.t b/tests/basic/ec/ec-12-4.t new file mode 100644 index 0000000000..9ab4701861 --- /dev/null +++ b/tests/basic/ec/ec-12-4.t @@ -0,0 +1,14 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks basic dispersed volume functionality and cli interface + +DISPERSE=12 +REDUNDANCY=4 + +# This must be equal to 44 * $DISPERSE + 106 +TESTS_EXPECTED_IN_LOOP=634 + +. $(dirname $0)/ec-common diff --git a/tests/basic/ec/ec-3-1.t b/tests/basic/ec/ec-3-1.t new file mode 100644 index 0000000000..5769c20228 --- /dev/null +++ b/tests/basic/ec/ec-3-1.t @@ -0,0 +1,14 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks basic dispersed volume functionality and cli interface + +DISPERSE=3 +REDUNDANCY=1 + +# This must be equal to 44 * $DISPERSE + 106 +TESTS_EXPECTED_IN_LOOP=238 + +. $(dirname $0)/ec-common diff --git a/tests/basic/ec/ec-4-1.t b/tests/basic/ec/ec-4-1.t new file mode 100644 index 0000000000..d34e1fb4e9 --- /dev/null +++ b/tests/basic/ec/ec-4-1.t @@ -0,0 +1,14 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks basic dispersed volume functionality and cli interface + +DISPERSE=4 +REDUNDANCY=1 + +# This must be equal to 44 * $DISPERSE + 106 +TESTS_EXPECTED_IN_LOOP=282 + +. 
$(dirname $0)/ec-common diff --git a/tests/basic/ec/ec-5-1.t b/tests/basic/ec/ec-5-1.t new file mode 100644 index 0000000000..61d1cb6ce4 --- /dev/null +++ b/tests/basic/ec/ec-5-1.t @@ -0,0 +1,14 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks basic dispersed volume functionality and cli interface + +DISPERSE=5 +REDUNDANCY=1 + +# This must be equal to 44 * $DISPERSE + 106 +TESTS_EXPECTED_IN_LOOP=326 + +. $(dirname $0)/ec-common diff --git a/tests/basic/ec/ec-5-2.t b/tests/basic/ec/ec-5-2.t new file mode 100644 index 0000000000..4dc1c186f0 --- /dev/null +++ b/tests/basic/ec/ec-5-2.t @@ -0,0 +1,14 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks basic dispersed volume functionality and cli interface + +DISPERSE=5 +REDUNDANCY=2 + +# This must be equal to 44 * $DISPERSE + 106 +TESTS_EXPECTED_IN_LOOP=326 + +. $(dirname $0)/ec-common diff --git a/tests/basic/ec/ec-6-2.t b/tests/basic/ec/ec-6-2.t new file mode 100644 index 0000000000..23ec84e60e --- /dev/null +++ b/tests/basic/ec/ec-6-2.t @@ -0,0 +1,14 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks basic dispersed volume functionality and cli interface + +DISPERSE=6 +REDUNDANCY=2 + +# This must be equal to 44 * $DISPERSE + 106 +TESTS_EXPECTED_IN_LOOP=370 + +. $(dirname $0)/ec-common diff --git a/tests/basic/ec/ec-7-3.t b/tests/basic/ec/ec-7-3.t new file mode 100644 index 0000000000..4ebba2a1de --- /dev/null +++ b/tests/basic/ec/ec-7-3.t @@ -0,0 +1,14 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks basic dispersed volume functionality and cli interface + +DISPERSE=7 +REDUNDANCY=3 + +# This must be equal to 44 * $DISPERSE + 106 +TESTS_EXPECTED_IN_LOOP=414 + +. 
$(dirname $0)/ec-common diff --git a/tests/basic/ec/ec-common b/tests/basic/ec/ec-common new file mode 100644 index 0000000000..95f53f250b --- /dev/null +++ b/tests/basic/ec/ec-common @@ -0,0 +1,143 @@ + +SIZE_LIST="1048576 1000 12345 0" + +LAST_BRICK=$(($DISPERSE - 1)) + +function fragment_size +{ + local fragments=$(($DISPERSE - $REDUNDANCY)) + local block_size=$((128 * $fragments)) + local size=$(($1 + $block_size - 1)) + + echo $((( $size - ( $size ) % $block_size ) / $fragments)) +} + +cleanup + +tmp=`mktemp -d` +if [ ! -d $tmp ]; then + exit 1 +fi + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 redundancy $REDUNDANCY $H0:$B0/${V0}{0..$LAST_BRICK} +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 + +TEST dd if=/dev/urandom of=$tmp/small bs=1024 count=1 +TEST dd if=/dev/urandom of=$tmp/big bs=1024 count=4096 + +cs_small=$(sha1sum $tmp/small | awk '{ print $1 }') +cs_big=$(sha1sum $tmp/big | awk '{ print $1 }') +cp $tmp/small $tmp/small1 +for size in $SIZE_LIST; do + truncate -s $size $tmp/small1 + eval cs_small_truncate[$size]=$(sha1sum $tmp/small1 | awk '{ print $1 }') +done +cp $tmp/big $tmp/big1 +for size in $SIZE_LIST; do + truncate -s $size $tmp/big1 + eval cs_big_truncate[$size]=$(sha1sum $tmp/big1 | awk '{ print $1 }') +done + +TEST df -h +TEST stat $M0 + +for idx in `seq 0 $LAST_BRICK`; do + brick[$idx]=$(gf_get_gfid_backend_file_path $B0/$V0$idx) +done + +cd $M0 +EXPECT "2" echo $(ls -a1 | wc -l) +TEST mkdir dir1 +TEST [ -d dir1 ] +TEST touch file1 +TEST [ -f file1 ] + +for dir in . 
dir1; do + TEST cp $tmp/small $dir/small + TEST [ -f $dir/small ] + fsize=$(fragment_size 1024) + EXPECT "1024" stat -c "%s" $dir/small + for idx in `seq 0 $LAST_BRICK`; do + EXPECT "$fsize" stat -c "%s" ${brick[$idx]}/$dir/small + done + + EXPECT "$cs_small" echo $(sha1sum $dir/small | awk '{ print $1 }') + + TEST cp $tmp/big $dir/big + TEST [ -f $dir/big ] + fsize=$(fragment_size 4194304) + EXPECT "4194304" stat -c "%s" $dir/big + for idx in `seq 0 $LAST_BRICK`; do + EXPECT "$fsize" stat -c "%s" ${brick[$idx]}/$dir/big + done + + EXPECT "$cs_big" echo $(sha1sum $dir/big | awk '{ print $1 }') + + for idx in `seq 0 $LAST_BRICK`; do + TEST kill_brick $V0 $H0 $B0/$V0$idx + + EXPECT "1024" stat -c "%s" $dir/small + EXPECT "4194304" stat -c "%s" $dir/big + EXPECT "$cs_small" echo $(sha1sum $dir/small | awk '{ print $1 }') + EXPECT "$cs_big" echo $(sha1sum $dir/big | awk '{ print $1 }') + + cd + TEST umount $M0 + TEST $CLI volume stop $V0 force + TEST $CLI volume start $V0 + TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 + cd $M0 + done + + for size in $SIZE_LIST; do + TEST truncate -s $size $dir/small + TEST [ -f $dir/small ] + fsize=$(fragment_size $size) + EXPECT "$size" stat -c "%s" $dir/small + for idx in `seq 0 $LAST_BRICK`; do + EXPECT "$fsize" stat -c "%s" ${brick[$idx]}/$dir/small + done + + EXPECT "${cs_small_truncate[$size]}" echo $(sha1sum $dir/small | awk '{ print $1 }') + + TEST truncate -s $size $dir/big + TEST [ -f $dir/big ] + EXPECT "$size" stat -c "%s" $dir/big + for idx in `seq 0 $LAST_BRICK`; do + EXPECT "$fsize" stat -c "%s" ${brick[$idx]}/$dir/big + done + + EXPECT "${cs_big_truncate[$size]}" echo $(sha1sum $dir/big | awk '{ print $1 }') + done + + TEST rm -f $dir/small + TEST [ ! -e $dir/small ] + for idx in `seq 0 $LAST_BRICK`; do + TEST [ ! -e ${brick[$idx]}/$dir/small ] + done + + TEST rm -f $dir/big + TEST [ ! -e $dir/big ] + for idx in `seq 0 $LAST_BRICK`; do + TEST [ ! 
-e ${brick[$idx]}/$dir/big ] + done +done + +TEST rmdir dir1 +TEST [ ! -e dir1 ] +for idx in `seq 0 $LAST_BRICK`; do + TEST [ ! -e ${brick[$idx]}/dir1 ] +done + +TEST rm -f file1 +TEST [ ! -e file1 ] +for idx in `seq 0 $LAST_BRICK`; do + TEST [ ! -e ${brick[$idx]}/file1 ] +done + +rm -rf $tmp + +cleanup diff --git a/tests/basic/ec/ec.t b/tests/basic/ec/ec.t new file mode 100644 index 0000000000..e81de0d97b --- /dev/null +++ b/tests/basic/ec/ec.t @@ -0,0 +1,233 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +TEST_USER=test-ec-user +TEST_UID=27341 + +function my_getfattr { + getfattr --only-values -e text $* 2> /dev/null +} + +function get_rep_count { + v=$(my_getfattr -n trusted.nsr.rep-count $1) + #echo $v > /dev/tty + echo $v +} + +function create_file { + dd if=/dev/urandom of=$1 bs=4k count=$2 conv=sync 2> /dev/null +} + +function setup_perm_file { + mkdir $1/perm_dir || return 1 + chown ${TEST_USER} $1/perm_dir || return 1 + su ${TEST_USER} -c "touch $1/perm_dir/perm_file" || return 1 + return 0 +} + +# Functions to check repair for specific operation types. + +function check_create_write { + for b in $*; do + cmp $tmpdir/create-write $b/create-write || return 1 + done + return 0 +} + +function check_truncate { + truncate --size=8192 $tmpdir/truncate + for b in $*; do + cmp $tmpdir/truncate $b/truncate || return 1 + done + return 0 +} + +function check_hard_link { + for b in $*; do + inum1=$(ls -i $b/hard-link-1 | cut -d' ' -f1) + inum2=$(ls -i $b/hard-link-2 | cut -d' ' -f1) + [ "$inum1" = "$inum2" ] || return 1 + done + echo "Y" + return 0 +} + +function check_soft_link { + for b in $*; do + [ "$(readlink $b/soft-link)" = "soft-link-tgt" ] || return 1 + done + echo "Y" + return 0 +} + +function check_unlink { + for b in $*; do + [ ! 
-e $b/unlink ] || return 1 + done + echo "Y" + return 0 +} + +function check_mkdir { + for b in $*; do + [ -d $b/mkdir ] || return 1 + done + echo "Y" + return 0 +} + +function check_rmdir { + for b in $*; do + [ ! -e $b/rmdir ] || return 1 + done + echo "Y" + return 0 +} + +function check_setxattr { + for b in $*; do + v=$(my_getfattr -n user.foo $b/setxattr) + [ "$v" = "ash_nazg_durbatuluk" ] || return 1 + done + echo "Y" + return 0 +} + +function check_removexattr { + for b in $*; do + my_getfattr -n user.bar $b/removexattr 2> /dev/null + [ $? = 0 ] && return 1 + done + echo "Y" + return 0 +} + +function check_perm_file { + b1=$1 + shift 1 + ftext=$(stat -c "%u %g %a" $b1/perm_dir/perm_file) + #echo "first u/g/a = $ftext" > /dev/tty + for b in $*; do + btext=$(stat -c "%u %g %a" $b/perm_dir/perm_file) + #echo " next u/a/a = $btext" > /dev/tty + if [ x"$btext" != x"$ftext" ]; then + return 1 + fi + done + echo "Y" + return 0 +} + +cleanup + +TEST useradd -o -M -u ${TEST_UID} ${TEST_USER} +trap "userdel --force ${TEST_USER}" EXIT + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info + +TEST mkdir -p $B0/${V0}{0,1,2,3,4,5,6,7,8,9} +TEST $CLI volume create $V0 disperse 10 redundancy 2 $H0:$B0/${V0}{0,1,2,3,4,5,6,7,8,9} + +EXPECT "$V0" volinfo_field $V0 'Volume Name' +EXPECT 'Created' volinfo_field $V0 'Status' +EXPECT '10' brick_count $V0 + +TEST $CLI volume start $V0 +EXPECT 'Started' volinfo_field $V0 'Status' + +# Mount FUSE with caching disabled +TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 + +# Create local files for comparisons etc. +tmpdir=$(mktemp -d) +trap "rm -rf $tmpdir" EXIT +TEST create_file $tmpdir/create-write 10 +TEST create_file $tmpdir/truncate 10 + +# Prepare files and directories we'll need later. 
+TEST cp $tmpdir/truncate $M0/ +TEST touch $M0/hard-link-1 +TEST touch $M0/unlink +TEST mkdir $M0/rmdir +TEST touch $M0/setxattr +TEST touch $M0/removexattr +TEST setfattr -n user.bar -v "ash_nazg_gimbatul" $M0/removexattr + +# Kill a couple of bricks and allow some time for things to settle. +TEST kill_brick $V0 $H0 $B0/${V0}3 +TEST kill_brick $V0 $H0 $B0/${V0}8 +sleep 10 + +# Test create+write +TEST cp $tmpdir/create-write $M0/ +# Test truncate +TEST truncate --size=8192 $M0/truncate +# Test hard link +TEST ln $M0/hard-link-1 $M0/hard-link-2 +# Test soft link +TEST ln -s soft-link-tgt $M0/soft-link +# Test unlink +TEST rm $M0/unlink +# Test rmdir +TEST rmdir $M0/rmdir +# Test mkdir +TEST mkdir $M0/mkdir +# Test setxattr +TEST setfattr -n user.foo -v "ash_nazg_durbatuluk" $M0/setxattr +# Test removexattr +TEST setfattr -x user.bar $M0/removexattr +# Test uid/gid behavior +TEST setup_perm_file $M0 + +# Unmount/remount so that create/write and truncate don't see cached data. +TEST umount $M0 +TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 + +# Test create/write and truncate *before* the bricks are brought back. +TEST check_create_write $M0 +TEST check_truncate $M0 + +# Restart the bricks and allow repair to occur. +TEST $CLI volume start $V0 force +sleep 10 + +# Unmount/remount again, same reason as before. +TEST umount $M0 +TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 + +# Make sure everything is as it should be. Most tests check for consistency +# between the bricks and the front end. This is not valid for disperse, so we +# check the mountpoint state instead. + +TEST check_create_write $M0 +TEST check_truncate $M0 + +TEST stat $M0/hard-link-1 +TEST stat $M0/hard-link-2 +TEST stat $M0/soft-link +TEST ! stat $M0/unlink +TEST ! 
stat $M0/rmdir +TEST stat $M0/mkdir +TEST stat $M0/setxattr +TEST stat $M0/removexattr +TEST stat $M0/perm_dir +TEST stat $M0/perm_dir/perm_file + +EXPECT_WITHIN 5 "Y" check_hard_link $B0/${V0}{0..9} +EXPECT_WITHIN 5 "Y" check_soft_link $B0/${V0}{0..9} +EXPECT_WITHIN 5 "Y" check_unlink $B0/${V0}{0..9} +EXPECT_WITHIN 5 "Y" check_rmdir $B0/${V0}{0..9} +EXPECT_WITHIN 5 "Y" check_mkdir $B0/${V0}{0..9} +EXPECT_WITHIN 5 "Y" check_setxattr $B0/${V0}{0..9} +EXPECT_WITHIN 5 "Y" check_removexattr $B0/${V0}{0..9} +EXPECT_WITHIN 5 "Y" check_perm_file $B0/${V0}{0..9} + +rm -rf $tmpdir +userdel --force ${TEST_USER} + +cleanup + diff --git a/tests/basic/ec/self-heal.t b/tests/basic/ec/self-heal.t new file mode 100644 index 0000000000..99cfd9420a --- /dev/null +++ b/tests/basic/ec/self-heal.t @@ -0,0 +1,123 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks self-healing feature of dispersed volumes + +cleanup + +tmp=`mktemp -d` +if [ ! -d $tmp ]; then + exit 1 +fi + +TESTS_EXPECTED_IN_LOOP=85 + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 redundancy 2 $H0:$B0/${V0}{0..5} +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 + +TEST dd if=/dev/urandom of=$tmp/test bs=1024 count=1024 + +cs=$(sha1sum $tmp/test | awk '{ print $1 }') + +TEST df -h +TEST stat $M0 + +for idx in {0..5}; do + brick[$idx]=$(gf_get_gfid_backend_file_path $B0/$V0$idx) +done + +cd $M0 +TEST cp $tmp/test test +TEST chmod 644 test +EXPECT "-rw-r--r--" stat -c "%A" test + +for idx1 in {0..5}; do + TEST chmod 666 ${brick[$idx1]}/test + sleep 1 + EXPECT "-rw-r--r--" stat -c "%A" test + EXPECT_WITHIN 5 "-rw-r--r--" stat -c "%A" ${brick[$idx1]}/test +done + +for idx1 in {0..4}; do + for idx2 in `seq $(($idx1 + 1)) 5`; do + if [ $idx1 -ne $idx2 ]; then + TEST chmod 666 ${brick[$idx1]}/test + TEST chmod 600 ${brick[$idx2]}/test + sleep 1 + EXPECT "-rw-r--r--" stat -c "%A" 
test + EXPECT_WITHIN 5 "-rw-r--r--" stat -c "%A" ${brick[$idx1]}/test + EXPECT_WITHIN 5 "-rw-r--r--" stat -c "%A" ${brick[$idx2]}/test + fi + done +done + +TEST truncate -s 0 ${brick[0]}/test +TEST truncate -s 2097152 ${brick[1]}/test +TEST setfattr -n user.test -v "test1" ${brick[0]}/test +TEST setfattr -n user.test -v "test2" ${brick[1]}/test +TEST chmod 600 ${brick[0]}/test +TEST chmod 666 ${brick[1]}/test +sleep 1 + +EXPECT "1048576" stat -c "%s" test +TEST ! getfattr -n user.test test + +EXPECT_WITHIN 5 "262144" stat -c "%s" ${brick[0]}/test +EXPECT_WITHIN 5 "262144" stat -c "%s" ${brick[1]}/test +TEST ! getfattr -n user.test ${brick[0]}/test +TEST ! getfattr -n user.test ${brick[1]}/test +EXPECT "-rw-r--r--" stat -c "%A" ${brick[0]}/test +EXPECT "-rw-r--r--" stat -c "%A" ${brick[1]}/test + +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST kill_brick $V0 $H0 $B0/${V0}1 +TEST cp $tmp/test test2 +EXPECT "1048576" stat -c "%s" test2 +TEST chmod 777 test2 +EXPECT "-rwxrwxrwx" stat -c "%A" test2 + +TEST mkdir dir1 +TEST ls -al dir1 + +TEST ln -s test2 test3 +TEST [ -h test3 ] + +TEST ln test2 test4 +TEST [ -f test4 ] +EXPECT "2" stat -c "%h" test2 +EXPECT "2" stat -c "%h" test4 + +cd +TEST umount $M0 +TEST $CLI volume stop $V0 force +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 +cd $M0 + +EXPECT "1048576" stat -c "%s" test2 +EXPECT "-rwxrwxrwx" stat -c "%A" test2 +EXPECT_WITHIN 5 "262144" stat -c "%s" ${brick[0]}/test2 +EXPECT_WITHIN 5 "262144" stat -c "%s" ${brick[1]}/test2 +EXPECT "-rwxrwxrwx" stat -c "%A" ${brick[0]}/test2 +EXPECT "-rwxrwxrwx" stat -c "%A" ${brick[1]}/test2 + +TEST ls -al dir1 +EXPECT_WITHIN 5 "1" eval "if [ -d ${brick[0]}/dir1 ]; then echo 1; fi" +EXPECT_WITHIN 5 "1" eval "if [ -d ${brick[1]}/dir1 ]; then echo 1; fi" + +TEST [ -h test3 ] +EXPECT_WITHIN 5 "1" eval "if [ -h ${brick[0]}/test3 ]; then echo 1; fi" +EXPECT_WITHIN 5 "1" eval "if [ -h ${brick[1]}/test3 ]; then 
echo 1; fi" + +EXPECT "2" stat -c "%h" test4 +EXPECT_WITHIN 5 "3" stat -c "%h" ${brick[0]}/test4 +EXPECT_WITHIN 5 "3" stat -c "%h" ${brick[1]}/test4 + +rm -rf $tmp + +cleanup diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c index 452df759ad..089c7d637c 100644 --- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c @@ -169,6 +169,12 @@ gd_addbr_validate_stripe_count (glusterd_volinfo_t *volinfo, int stripe_count, } } break; + case GF_CLUSTER_TYPE_DISPERSE: + snprintf (err_str, err_len, "Volume %s cannot be converted " + "from dispersed to striped-" + "dispersed", volinfo->volname); + gf_log(THIS->name, GF_LOG_ERROR, "%s", err_str); + goto out; } out: @@ -259,6 +265,12 @@ gd_addbr_validate_replica_count (glusterd_volinfo_t *volinfo, int replica_count, } } break; + case GF_CLUSTER_TYPE_DISPERSE: + snprintf (err_str, err_len, "Volume %s cannot be converted " + "from dispersed to replicated-" + "dispersed", volinfo->volname); + gf_log(THIS->name, GF_LOG_ERROR, "%s", err_str); + goto out; } out: return ret; @@ -276,6 +288,7 @@ gd_rmbr_validate_replica_count (glusterd_volinfo_t *volinfo, switch (volinfo->type) { case GF_CLUSTER_TYPE_NONE: case GF_CLUSTER_TYPE_STRIPE: + case GF_CLUSTER_TYPE_DISPERSE: snprintf (err_str, err_len, "replica count (%d) option given for non replicate " "volume %s", replica_count, volinfo->volname); @@ -737,6 +750,8 @@ __glusterd_handle_remove_brick (rpcsvc_request_t *req) strcpy (vol_type, "stripe"); } else if (volinfo->type == GF_CLUSTER_TYPE_STRIPE_REPLICATE) { strcpy (vol_type, "stripe-replicate"); + } else if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) { + strcpy (vol_type, "disperse"); } else { strcpy (vol_type, "distribute"); } diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c index ed4bd60f88..e10dc22b56 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handler.c +++ 
b/xlators/mgmt/glusterd/src/glusterd-handler.c @@ -398,6 +398,16 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo, if (ret) goto out; + snprintf (key, 256, "volume%d.disperse_count", count); + ret = dict_set_int32 (volumes, key, volinfo->disperse_count); + if (ret) + goto out; + + snprintf (key, 256, "volume%d.redundancy_count", count); + ret = dict_set_int32 (volumes, key, volinfo->redundancy_count); + if (ret) + goto out; + snprintf (key, 256, "volume%d.transport", count); ret = dict_set_int32 (volumes, key, volinfo->transport_type); if (ret) diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c index c31d8a8ad7..086a6550a7 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.c +++ b/xlators/mgmt/glusterd/src/glusterd-store.c @@ -844,6 +844,18 @@ glusterd_volume_exclude_options_write (int fd, glusterd_volinfo_t *volinfo) if (ret) goto out; + snprintf (buf, sizeof (buf), "%d", volinfo->disperse_count); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT, + buf); + if (ret) + goto out; + + snprintf (buf, sizeof (buf), "%d", volinfo->redundancy_count); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_REDUNDANCY_CNT, + buf); + if (ret) + goto out; + snprintf (buf, sizeof (buf), "%d", volinfo->version); ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_VERSION, buf); if (ret) @@ -2618,6 +2630,12 @@ glusterd_store_update_volinfo (glusterd_volinfo_t *volinfo) } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_REPLICA_CNT, strlen (GLUSTERD_STORE_KEY_VOL_REPLICA_CNT))) { volinfo->replica_count = atoi (value); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT, + strlen (GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT))) { + volinfo->disperse_count = atoi (value); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_REDUNDANCY_CNT, + strlen (GLUSTERD_STORE_KEY_VOL_REDUNDANCY_CNT))) { + volinfo->redundancy_count = atoi (value); } else if (!strncmp (key, 
GLUSTERD_STORE_KEY_VOL_TRANSPORT, strlen (GLUSTERD_STORE_KEY_VOL_TRANSPORT))) { volinfo->transport_type = atoi (value); @@ -2754,6 +2772,11 @@ glusterd_store_update_volinfo (glusterd_volinfo_t *volinfo) GF_ASSERT (volinfo->replica_count > 0); break; + case GF_CLUSTER_TYPE_DISPERSE: + GF_ASSERT (volinfo->disperse_count > 0); + GF_ASSERT (volinfo->redundancy_count > 0); + break; + default: GF_ASSERT (0); break; diff --git a/xlators/mgmt/glusterd/src/glusterd-store.h b/xlators/mgmt/glusterd/src/glusterd-store.h index 89cf24de78..fb7de7b1b1 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.h +++ b/xlators/mgmt/glusterd/src/glusterd-store.h @@ -44,6 +44,8 @@ typedef enum glusterd_store_ver_ac_{ #define GLUSTERD_STORE_KEY_VOL_SUB_COUNT "sub_count" #define GLUSTERD_STORE_KEY_VOL_STRIPE_CNT "stripe_count" #define GLUSTERD_STORE_KEY_VOL_REPLICA_CNT "replica_count" +#define GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT "disperse_count" +#define GLUSTERD_STORE_KEY_VOL_REDUNDANCY_CNT "redundancy_count" #define GLUSTERD_STORE_KEY_VOL_BRICK "brick" #define GLUSTERD_STORE_KEY_VOL_VERSION "version" #define GLUSTERD_STORE_KEY_VOL_TRANSPORT "transport-type" diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index dc923b1eeb..aff2356eb4 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -548,6 +548,8 @@ glusterd_volinfo_dup (glusterd_volinfo_t *volinfo, new_volinfo->type = volinfo->type; new_volinfo->replica_count = volinfo->replica_count; new_volinfo->stripe_count = volinfo->stripe_count; + new_volinfo->disperse_count = volinfo->disperse_count; + new_volinfo->redundancy_count = volinfo->redundancy_count; new_volinfo->dist_leaf_count = volinfo->dist_leaf_count; new_volinfo->sub_count = volinfo->sub_count; new_volinfo->transport_type = volinfo->transport_type; @@ -2524,6 +2526,18 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo, if (ret) goto out; + memset (key, 0, sizeof 
(key)); + snprintf (key, sizeof (key), "%s%d.disperse_count", prefix, count); + ret = dict_set_int32 (dict, key, volinfo->disperse_count); + if (ret) + goto out; + + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "%s%d.redundancy_count", prefix, count); + ret = dict_set_int32 (dict, key, volinfo->redundancy_count); + if (ret) + goto out; + memset (key, 0, sizeof (key)); snprintf (key, sizeof (key), "%s%d.dist_count", prefix, count); ret = dict_set_int32 (dict, key, volinfo->dist_leaf_count); @@ -4206,6 +4220,24 @@ glusterd_import_volinfo (dict_t *peer_data, int count, gf_log (THIS->name, GF_LOG_INFO, "peer is possibly old version"); + /* not having a 'disperse_count' key is not a error + (as peer may be of old version) */ + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "%s%d.disperse_count", prefix, count); + ret = dict_get_int32 (peer_data, key, &new_volinfo->disperse_count); + if (ret) + gf_log (THIS->name, GF_LOG_INFO, + "peer is possibly old version"); + + /* not having a 'redundancy_count' key is not a error + (as peer may be of old version) */ + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "%s%d.redundancy_count", prefix, count); + ret = dict_get_int32 (peer_data, key, &new_volinfo->redundancy_count); + if (ret) + gf_log (THIS->name, GF_LOG_INFO, + "peer is possibly old version"); + /* not having a 'dist_count' key is not a error (as peer may be of old version) */ memset (key, 0, sizeof (key)); @@ -6932,6 +6964,9 @@ glusterd_get_dist_leaf_count (glusterd_volinfo_t *volinfo) int rcount = volinfo->replica_count; int scount = volinfo->stripe_count; + if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) + return volinfo->disperse_count; + return (rcount ? rcount : 1) * (scount ? 
scount : 1); } @@ -11694,6 +11729,13 @@ gd_update_volume_op_versions (glusterd_volinfo_t *volinfo) } } + if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) { + if (volinfo->op_version < GD_OP_VERSION_3_6_0) + volinfo->op_version = GD_OP_VERSION_3_6_0; + if (volinfo->client_op_version < GD_OP_VERSION_3_6_0) + volinfo->client_op_version = GD_OP_VERSION_3_6_0; + } + return; } @@ -12774,7 +12816,7 @@ glusterd_volume_quorum_calculate (glusterd_volinfo_t *volinfo, dict_t *dict, goto out; } - up_count = volinfo->replica_count - down_count; + up_count = volinfo->dist_leaf_count - down_count; if (quorum_type && !strcmp (quorum_type, "fixed")) { if (up_count >= quorum_count) { @@ -12782,7 +12824,8 @@ glusterd_volume_quorum_calculate (glusterd_volinfo_t *volinfo, dict_t *dict, goto out; } } else { - if (volinfo->replica_count % 2 == 0) { + if ((GF_CLUSTER_TYPE_DISPERSE != volinfo->type) && + (volinfo->dist_leaf_count % 2 == 0)) { if ((up_count > quorum_count) || ((up_count == quorum_count) && first_brick_on)) { quorum_met = _gf_true; @@ -12835,8 +12878,9 @@ glusterd_volume_quorum_check (glusterd_volinfo_t *volinfo, int64_t index, goto out; } - if (!glusterd_is_volume_replicate (volinfo) || - volinfo->replica_count < 3) { + if ((!glusterd_is_volume_replicate (volinfo) || + volinfo->replica_count < 3) && + (GF_CLUSTER_TYPE_DISPERSE != volinfo->type)) { for (i = 0; i < volinfo->brick_count ; i++) { /* for a pure distribute volume, and replica volume with replica count 2, quorum is not met if even @@ -12858,7 +12902,8 @@ glusterd_volume_quorum_check (glusterd_volinfo_t *volinfo, int64_t index, ret = 0; quorum_met = _gf_true; } else { - distribute_subvols = volinfo->brick_count / volinfo->replica_count; + distribute_subvols = volinfo->brick_count / + volinfo->dist_leaf_count; for (j = 0; j < distribute_subvols; j++) { // by default assume quorum is not met /* TODO: Handle distributed striped replicate volumes @@ -12867,11 +12912,11 @@ glusterd_volume_quorum_check (glusterd_volinfo_t 
*volinfo, int64_t index, */ ret = 1; quorum_met = _gf_false; - for (i = 0; i < volinfo->replica_count; i++) { + for (i = 0; i < volinfo->dist_leaf_count; i++) { snprintf (key, sizeof (key), "%s%"PRId64".brick%"PRId64".status", key_prefix, index, - (j * volinfo->replica_count) + i); + (j * volinfo->dist_leaf_count) + i); ret = dict_get_int32 (dict, key, &brick_online); if (ret || !brick_online) { if (i == 0) @@ -13043,6 +13088,9 @@ glusterd_snap_quorum_check_for_create (dict_t *dict, gf_boolean_t snap_volume, else quorum_count = volinfo->replica_count/2 + 1; + } else if (GF_CLUSTER_TYPE_DISPERSE == volinfo->type) { + quorum_count = volinfo->disperse_count - + volinfo->redundancy_count; } else { quorum_count = volinfo->brick_count; } @@ -13061,8 +13109,22 @@ glusterd_snap_quorum_check_for_create (dict_t *dict, gf_boolean_t snap_volume, if the quorum-type option is not set to auto, the behavior is set to the default behavior) */ - if (!ret) - quorum_count = tmp; + if (!ret) { + /* for dispersed volumes, only allow quorums + equal or larger than minimum functional + value. 
+ */ + if ((GF_CLUSTER_TYPE_DISPERSE != + volinfo->type) || + (tmp >= quorum_count)) { + quorum_count = tmp; + } else { + gf_log(this->name, GF_LOG_INFO, + "Ignoring small quorum-count " + "(%d) on dispersed volume", tmp); + quorum_type = NULL; + } + } else quorum_type = NULL; } diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 6ab899a16c..9701c6b939 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -2684,10 +2684,14 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph, "%s-replicate-%d"}; char *stripe_args[] = {"cluster/stripe", "%s-stripe-%d"}; + char *disperse_args[] = {"cluster/disperse", + "%s-disperse-%d"}; + char option[32] = ""; int rclusters = 0; int clusters = 0; int dist_count = 0; int ret = -1; + xlator_t * ec = NULL; if (!volinfo->dist_leaf_count) goto out; @@ -2737,6 +2741,26 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph, if (clusters < 0) goto out; break; + case GF_CLUSTER_TYPE_DISPERSE: + clusters = volgen_graph_build_clusters (graph, volinfo, + disperse_args[0], + disperse_args[1], + volinfo->brick_count, + volinfo->disperse_count); + if (clusters < 0) + goto out; + + sprintf(option, "%d", volinfo->redundancy_count); + ec = first_of (graph); + while (clusters-- > 0) { + ret = xlator_set_option (ec, "redundancy", option); + if (ret) + goto out; + + ec = ec->next; + } + + break; default: gf_log ("", GF_LOG_ERROR, "volume inconsistency: " "unrecognized clustering type"); diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c index 53beebe055..f23a9eb96b 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c @@ -1689,6 +1689,27 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr) "replica count for volume %s", volname); goto out; } + } else if (GF_CLUSTER_TYPE_DISPERSE == volinfo->type) { + 
ret = dict_get_int32 (dict, "disperse-count", + &volinfo->disperse_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "disperse count for volume %s", volname); + goto out; + } + ret = dict_get_int32 (dict, "redundancy-count", + &volinfo->redundancy_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "redundancy count for volume %s", volname); + goto out; + } + if (priv->op_version < GD_OP_VERSION_3_6_0) { + gf_log (this->name, GF_LOG_ERROR, "Disperse volume " + "needs op-version 3.6.0 or higher"); + ret = -1; + goto out; + } } /* dist-leaf-count is the count of brick nodes for a given diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index a8ecb505a5..ddbb2c8133 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -336,6 +336,8 @@ struct glusterd_volinfo_ { int sub_count; /* backward compatibility */ int stripe_count; int replica_count; + int disperse_count; + int redundancy_count; int subvol_count; /* Number of subvolumes in a distribute volume */ int dist_leaf_count; /* Number of bricks in one -- cgit