summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Assemble.c15
-rw-r--r--Detail.c4
-rw-r--r--Grow.c1509
-rw-r--r--Manage.c18
-rw-r--r--ReadMe.c9
-rw-r--r--mdadm.8111
-rw-r--r--mdadm.c125
-rw-r--r--mdadm.h16
-rw-r--r--restripe.c418
-rw-r--r--super0.c10
-rw-r--r--super1.c8
-rw-r--r--sysfs.c58
-rw-r--r--util.c65
13 files changed, 1926 insertions, 440 deletions
diff --git a/Assemble.c b/Assemble.c
index 4578906..7c3a249 100644
--- a/Assemble.c
+++ b/Assemble.c
@@ -984,6 +984,8 @@ int Assemble(struct supertype *st, char *mddev,
}
if (err) {
fprintf(stderr, Name ": Failed to restore critical section for reshape, sorry.\n");
+ if (backup_file == NULL)
+ fprintf(stderr," Possibly you needed to specify the --backup-file\n");
close(mdfd);
return err;
}
@@ -1092,7 +1094,18 @@ int Assemble(struct supertype *st, char *mddev,
content->array.layout, clean, avail, okcnt) &&
(okcnt >= req_cnt || start_partial_ok)
))) {
- if (ioctl(mdfd, RUN_ARRAY, NULL)==0) {
+ /* This array is good-to-go.
+ * If a reshape is in progress then we might need to
+ * continue monitoring it. In that case we start
+ * it read-only and let the grow code make it writable.
+ */
+ int rv;
+ if (content->reshape_active &&
+ content->delta_disks <= 0)
+ rv = Grow_continue(mdfd, st, content, backup_file);
+ else
+ rv = ioctl(mdfd, RUN_ARRAY, NULL);
+ if (rv == 0) {
if (verbose >= 0) {
fprintf(stderr, Name ": %s has been started with %d drive%s",
mddev, okcnt, okcnt==1?"":"s");
diff --git a/Detail.c b/Detail.c
index a70db34..55d5481 100644
--- a/Detail.c
+++ b/Detail.c
@@ -320,6 +320,10 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
c = map_num(r5layout, array.layout);
printf(" Layout : %s\n", c?c:"-unknown-");
}
+ if (array.level == 6) {
+ c = map_num(r6layout, array.layout);
+ printf(" Layout : %s\n", c?c:"-unknown-");
+ }
if (array.level == 10) {
printf(" Layout :");
print_r10_layout(array.layout);
diff --git a/Grow.c b/Grow.c
index 1805604..5ebb482 100644
--- a/Grow.c
+++ b/Grow.c
@@ -23,6 +23,7 @@
*/
#include "mdadm.h"
#include "dlink.h"
+#include <sys/mman.h>
#if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN)
#error no endian defined
@@ -30,6 +31,10 @@
#include "md_u.h"
#include "md_p.h"
+#ifndef offsetof
+#define offsetof(t,f) ((size_t)&(((t*)0)->f))
+#endif
+
int Grow_Add_device(char *devname, int fd, char *newdev)
{
/* Add a device to an active array.
@@ -383,15 +388,20 @@ int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int
*/
struct mdp_backup_super {
- char magic[16]; /* md_backup_data-1 */
+ char magic[16]; /* md_backup_data-1 or -2 */
__u8 set_uuid[16];
__u64 mtime;
/* start/sizes in 512byte sectors */
- __u64 devstart;
+ __u64 devstart; /* address on backup device/file of data */
__u64 arraystart;
__u64 length;
__u32 sb_csum; /* csum of preceeding bytes. */
- __u8 pad[512-68];
+ __u32 pad1;
+ __u64 devstart2; /* offset in to data of second section */
+ __u64 arraystart2;
+ __u64 length2;
+ __u32 sb_csum2; /* csum of preceeding bytes. */
+ __u8 pad[512-68-32];
} __attribute__((aligned(512))) bsb;
int bsb_csum(char *buf, int len)
@@ -403,32 +413,107 @@ int bsb_csum(char *buf, int len)
return __cpu_to_le32(csum);
}
+static int child_grow(int afd, struct mdinfo *sra, unsigned long blocks,
+ int *fds, unsigned long long *offsets,
+ int disks, int chunk, int level, int layout, int data,
+ int dests, int *destfd, unsigned long long *destoffsets);
+static int child_shrink(int afd, struct mdinfo *sra, unsigned long blocks,
+ int *fds, unsigned long long *offsets,
+ int disks, int chunk, int level, int layout, int data,
+ int dests, int *destfd, unsigned long long *destoffsets);
+static int child_same_size(int afd, struct mdinfo *sra, unsigned long blocks,
+ int *fds, unsigned long long *offsets,
+ unsigned long long start,
+ int disks, int chunk, int level, int layout, int data,
+ int dests, int *destfd, unsigned long long *destoffsets);
+
+int freeze_array(struct mdinfo *sra)
+{
+ /* Try to freeze resync on this array.
+ * Return -1 if the array is busy,
+ * return 0 if this kernel doesn't support 'frozen'
+ * return 1 if it worked.
+ */
+ char buf[20];
+ if (sysfs_get_str(sra, NULL, "sync_action", buf, 20) <= 0)
+ return 0;
+ if (strcmp(buf, "idle\n") != 0 &&
+ strcmp(buf, "frozen\n") != 0)
+ return -1;
+ if (sysfs_set_str(sra, NULL, "sync_action", "frozen") < 0)
+ return 0;
+ return 1;
+}
+
+void unfreeze_array(struct mdinfo *sra, int frozen)
+{
+ /* If 'frozen' is 1, unfreeze the array */
+ if (frozen > 0)
+ sysfs_set_str(sra, NULL, "sync_action", "idle");
+}
+
+void wait_reshape(struct mdinfo *sra)
+{
+ int fd = sysfs_get_fd(sra, NULL, "sync_action");
+ char action[20];
+
+ do {
+ fd_set rfds;
+ FD_ZERO(&rfds);
+ FD_SET(fd, &rfds);
+ select(fd+1, NULL, NULL, &rfds, NULL);
+
+ if (sysfs_fd_get_str(fd, action, 20) < 0) {
+ close(fd);
+ return;
+ }
+ } while (strncmp(action, "reshape", 7) == 0);
+}
+
+
int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
long long size,
- int level, int layout, int chunksize, int raid_disks)
+ int level, char *layout_str, int chunksize, int raid_disks)
{
/* Make some changes in the shape of an array.
* The kernel must support the change.
- * Different reshapes have subtly different meaning for different
- * levels, so we need to check the current state of the array
- * and go from there.
+ *
+ * There are three different changes. Each can trigger
+ * a resync or recovery so we freeze that until we have
+ * requested everything (if kernel supports freezing - 2.6.30).
+ * The steps are:
+ * - change size (i.e. component_size)
+ * - change level
+ * - change layout/chunksize/ndisks
+ *
+ * The last can require a reshape. It is different on different
+ * levels so we need to check the level before actioning it.
+ * Sometimes the level change needs to be requested after the
+ * reshape (e.g. raid6->raid5, raid5->raid0)
+ *
*/
- struct mdu_array_info_s array;
+ struct mdu_array_info_s array, orig;
char *c;
-
+ int rv = 0;
struct supertype *st;
- int nlevel, olevel;
int nchunk, ochunk;
int nlayout, olayout;
int ndisks, odisks;
int ndata, odata;
- unsigned long long nstripe, ostripe, last_block;
+ int orig_level = UnSet;
+ char alt_layout[40];
int *fdlist;
unsigned long long *offsets;
- int d, i, spares;
+ int d, i;
int nrdisks;
int err;
+ int frozen;
+ unsigned long a,b, blocks, stripes;
+ int cache;
+ unsigned long long array_size;
+ int changed = 0;
+ int done;
struct mdinfo *sra;
struct mdinfo *sd;
@@ -438,127 +523,287 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
devname);
return 1;
}
+ sra = sysfs_read(fd, 0, GET_LEVEL);
+ frozen = freeze_array(sra);
+ if (frozen < 0) {
+ fprintf(stderr, Name ": %s is performing resync/recovery and cannot"
+ " be reshaped\n", devname);
+ return 1;
+ }
+
+ /* ========= set size =============== */
+ if (size >= 0 && (size == 0 || size != array.size)) {
+ array.size = size;
+ if (array.size != size) {
+ /* got truncated to 32bit, write to
+ * component_size instead
+ */
+ if (sra)
+ rv = sysfs_set_num(sra, NULL,
+ "component_size", size);
+ else
+ rv = -1;
+ } else
+ rv = ioctl(fd, SET_ARRAY_INFO, &array);
+ if (rv != 0) {
+ fprintf(stderr, Name ": Cannot set device size for %s: %s\n",
+ devname, strerror(errno));
+ rv = 1;
+ goto release;
+ }
+ ioctl(fd, GET_ARRAY_INFO, &array);
+ if (!quiet)
+ fprintf(stderr, Name ": component size of %s has been set to %dK\n",
+ devname, array.size);
+ changed = 1;
+ }
+
+ /* ======= set level =========== */
+ if (level != UnSet && level != array.level) {
+ /* Trying to change the level.
+ * We might need to change layout first and schedule a
+ * level change for later.
+ * Level changes that can happen immediately are:
+ * 0->4,5,6 1->5 4->5,6 5->1,6
+ * Level changes that need a layout change first are:
+ * 6->5,4,0 : need a -6 layout, or parity-last
+ * 5->4,0 : need parity-last
+ */
+ if ((array.level == 6 || array.level == 5) &&
+ (level == 5 || level == 4 || level == 0)) {
+ /* Don't change level yet, but choose intermediate
+ * layout
+ */
+ if (level == 5) {
+ if (layout_str == NULL)
+ switch (array.layout) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+ case ALGORITHM_LEFT_ASYMMETRIC_6:
+ case ALGORITHM_ROTATING_N_RESTART:
+ layout_str = "left-asymmetric-6";
+ break;
+ case ALGORITHM_LEFT_SYMMETRIC:
+ case ALGORITHM_LEFT_SYMMETRIC_6:
+ case ALGORITHM_ROTATING_N_CONTINUE:
+ layout_str = "left-symmetric-6";
+ break;
+ case ALGORITHM_RIGHT_ASYMMETRIC:
+ case ALGORITHM_RIGHT_ASYMMETRIC_6:
+ case ALGORITHM_ROTATING_ZERO_RESTART:
+ layout_str = "right-asymmetric-6";
+ break;
+ case ALGORITHM_RIGHT_SYMMETRIC:
+ case ALGORITHM_RIGHT_SYMMETRIC_6:
+ layout_str = "right-symmetric-6";
+ break;
+ case ALGORITHM_PARITY_0:
+ case ALGORITHM_PARITY_0_6:
+ layout_str = "parity-first-6";
+ break;
+ case ALGORITHM_PARITY_N:
+ layout_str = "parity-last";
+ break;
+ default:
+ fprintf(stderr, Name ": %s: cannot"
+ "convert layout to RAID5 equivalent\n",
+ devname);
+ rv = 1;
+ goto release;
+ }
+ else {
+ int l = map_name(r5layout, layout_str);
+ if (l == UnSet) {
+ fprintf(stderr, Name ": %s: layout '%s' not recognised\n",
+ devname, layout_str);
+ rv = 1;
+ goto release;
+ }
+ if (l != ALGORITHM_PARITY_N) {
+ /* need the -6 version */
+ char *ls = map_num(r5layout, l);
+ strcat(strcpy(alt_layout, ls),
+ "-6");
+ layout_str = alt_layout;
+ }
+ }
+ if (raid_disks)
+ /* The final raid6->raid5 conversion
+ * will reduce the number of disks,
+ * so now we need to aim higher
+ */
+ raid_disks++;
+ } else
+ layout_str = "parity-last";
+ } else {
+ c = map_num(pers, level);
+ if (c == NULL)
+ return 1;/* not possible */
+ err = sysfs_set_str(sra, NULL, "level", c);
+ if (err) {
+ fprintf(stderr, Name ": %s: could not set level to %s\n",
+ devname, c);
+ rv = 1;
+ goto release;
+ }
+ orig = array;
+ orig_level = orig.level;
+ ioctl(fd, GET_ARRAY_INFO, &array);
+ if (layout_str == NULL &&
+ orig.level == 5 && level == 6 &&
+ array.layout != orig.layout)
+ layout_str = map_num(r5layout, orig.layout);
+ if (!quiet)
+ fprintf(stderr, Name " level of %s changed to %s\n",
+ devname, c);
+ changed = 1;
+ }
+ }
+
+ /* ========= set shape (chunk_size / layout / ndisks) ============== */
+ /* Check if layout change is a no-op */
+ if (layout_str) switch(array.level) {
+ case 5:
+ if (array.layout == map_name(r5layout, layout_str))
+ layout_str = NULL;
+ break;
+ case 6:
+ if (layout_str == NULL &&
+ ((chunksize && chunksize * 1024 != array.chunk_size) ||
+ (raid_disks && raid_disks != array.raid_disks)) &&
+ array.layout >= 16) {
+ fprintf(stderr, Name
+ ": %s has a non-standard layout. If you wish to preserve this\n"
+ " during the reshape, please specify --layout=preserve\n"
+ " If you want to change it, specify a layout or use --layout=normalise\n",
+ devname);
+ rv = 1;
+ goto release;
+ }
+ if (strcmp(layout_str, "normalise") == 0 ||
+ strcmp(layout_str, "normalize") == 0) {
+ char *hyphen;
+ strcpy(alt_layout, map_num(r6layout, array.layout));
+ hyphen = strrchr(alt_layout, '-');
+ if (hyphen && strcmp(hyphen, "-6") == 0) {
+ *hyphen = 0;
+ layout_str = alt_layout;
+ }
+ }
+
+ if (array.layout == map_name(r6layout, layout_str))
+ layout_str = NULL;
+ if (layout_str && strcmp(layout_str, "preserve") == 0)
+ layout_str = NULL;
+ break;
+ }
+ if (layout_str == NULL
+ && (chunksize == 0 || chunksize*1024 == array.chunk_size)
+ && (raid_disks == 0 || raid_disks == array.raid_disks)) {
+ rv = 0;
+ if (level != UnSet && level != array.level) {
+ /* Looks like this level change doesn't need
+ * a reshape after all.
+ */
+ c = map_num(pers, level);
+ if (c) {
+ rv = sysfs_set_str(sra, NULL, "level", c);
+ if (rv)
+ fprintf(stderr, Name ": %s: could not set level to %s\n",
+ devname, c);
+ }
+ } else if (!changed && !quiet)
+ fprintf(stderr, Name ": %s: no change requested\n",
+ devname);
+ goto release;
+ }
+
c = map_num(pers, array.level);
if (c == NULL) c = "-unknown-";
switch(array.level) {
default: /* raid0, linear, multipath cannot be reconfigured */
fprintf(stderr, Name ": %s array %s cannot be reshaped.\n",
c, devname);
- return 1;
+ rv = 1;
+ break;
case LEVEL_FAULTY: /* only 'layout' change is permitted */
- if (size >= 0) {
- fprintf(stderr, Name ": %s: Cannot change size of a 'faulty' array\n",
- devname);
- return 1;
- }
- if (level != UnSet && level != LEVEL_FAULTY) {
- fprintf(stderr, Name ": %s: Cannot change RAID level of a 'faulty' array\n",
- devname);
- return 1;
- }
if (chunksize || raid_disks) {
fprintf(stderr, Name ": %s: Cannot change chunksize or disks of a 'faulty' array\n",
devname);
- return 1;
+ rv = 1;
+ break;
+ }
+ if (layout_str == NULL)
+ break; /* nothing to do.... */
+
+ array.layout = parse_layout_faulty(layout_str);
+ if (array.layout < 0) {
+ int rv;
+ fprintf(stderr, Name ": %s: layout %s not understood for 'faulty' array\n",
+ devname, layout_str);
+ rv = 1;
+ break;
}
- if (layout == UnSet)
- return 0; /* nothing to do.... */
-
- array.layout = layout;
if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
fprintf(stderr, Name ": Cannot set layout for %s: %s\n",
devname, strerror(errno));
- return 1;
- }
- if (!quiet)
+ rv = 1;
+ } else if (!quiet)
printf("layout for %s set to %d\n", devname, array.layout);
- return 0;
+ break;
- case 1: /* raid_disks and size can each be changed. They are independant */
+ case 1: /* only raid_disks can each be changed. */
- if (level != UnSet && level != 1) {
- fprintf(stderr, Name ": %s: Cannot change RAID level of a RAID1 array.\n",
+ if (chunksize || layout_str != NULL) {
+ fprintf(stderr, Name ": %s: Cannot change chunk size or layout for a RAID1 array.\n",
devname);
- return 1;
- }
- if (chunksize || layout != UnSet) {
- fprintf(stderr, Name ": %s: Cannot change chunk size of layout for a RAID1 array.\n",
- devname);
- return 1;
+ rv = 1;
+ break;
}
-
- /* Each can trigger a resync/recovery which will block the
- * other from happening. Later we could block
- * resync for the duration via 'sync_action'...
- */
if (raid_disks > 0) {
array.raid_disks = raid_disks;
if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
fprintf(stderr, Name ": Cannot set raid-devices for %s: %s\n",
devname, strerror(errno));
- return 1;
- }
- }
- if (size >= 0) {
- array.size = size;
- if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
- fprintf(stderr, Name ": Cannot set device size for %s: %s\n",
- devname, strerror(errno));
- return 1;
+ rv = 1;
}
}
- return 0;
+ break;
case 4:
case 5:
case 6:
- st = super_by_fd(fd);
- /* size can be changed independently.
- * layout/chunksize/raid_disks/level can be changed
+ /*
+ * layout/chunksize/raid_disks can be changed
* though the kernel may not support it all.
- * If 'suspend_lo' is not present in devfs, then
- * these cannot be changed.
*/
- if (size >= 0) {
- /* Cannot change other details as well.. */
- if (layout != UnSet ||
- chunksize != 0 ||
- raid_disks != 0 ||
- level != UnSet) {
- fprintf(stderr, Name ": %s: Cannot change shape as well as size of a %s array.\n",
- devname, c);
- return 1;
- }
- array.size = size;
- if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
- fprintf(stderr, Name ": Cannot set device size/shape for %s: %s\n",
- devname, strerror(errno));
- return 1;
- }
- return 0;
- }
- /* Ok, just change the shape. This can be awkward.
- * There are three possibilities.
- * 1/ The array will shrink. We don't support this
- * possibility. Maybe one day...
- * 2/ The array will not change size. This is easy enough
- * to do, but not reliably. If the process is aborted
- * the array *will* be corrupted. So maybe we can allow
- * this but only if the user is really certain. e.g.
- * --really-risk-everything
- * 3/ The array will grow. This can be reliably achieved.
+ st = super_by_fd(fd);
+
+ /*
+ * There are three possibilities.
+ * 1/ The array will shrink.
+ * We need to ensure the reshape will pause before reaching
+ * the 'critical section'. We also need to fork and wait for
+ * that to happen. When it does we
+ * suspend/backup/complete/unfreeze
+ *
+ * 2/ The array will not change size.
+ * This requires that we keep a backup of a sliding window
+ * so that we can restore data after a crash. So we need
+ * to fork and monitor progress.
+ *
+ * 3/ The array will grow. This is relatively easy.
* However the kernel's restripe routines will cheerfully
* overwrite some early data before it is safe. So we
* need to make a backup of the early parts of the array
* and be ready to restore it if rebuild aborts very early.
*
- * We backup data by writing it to all spares (there must be
- * at least 1, so even raid6->raid5 requires a spare to be
- * present).
+ * We backup data by writing it to one spare, or to a
+ * file which was given on command line.
*
+ * [FOLLOWING IS OLD AND PARTLY WRONG]
* So: we enumerate the devices in the array and
* make sure we can open all of them.
* Then we freeze the early part of the array and
@@ -568,71 +813,108 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
* and finally invalidate the copied data and unfreeze the
* start of the array.
*
- * Before we can do this we need to decide:
- * - will the array grow? Just calculate size
- * - how much needs to be saved: count stripes.
- * - where to save data... good question.
- *
+ * In each case, we first make sure that storage is available
+ * for the required backup.
+ * Then we:
+ * - request the shape change.
+ * - fork to handle backup etc.
*/
- nlevel = olevel = array.level;
nchunk = ochunk = array.chunk_size;
nlayout = olayout = array.layout;
ndisks = odisks = array.raid_disks;
- if (level != UnSet) nlevel = level;
- if (chunksize) nchunk = chunksize;
- if (layout != UnSet) nlayout = layout;
+ if (chunksize) {
+ nchunk = chunksize * 1024;
+ if (array.size % chunksize) {
+ fprintf(stderr, Name ": component size %dK is not"
+ " a multiple of chunksize %dK\n",
+ array.size, chunksize);
+ break;
+ }
+ }
+ if (layout_str != NULL)
+ switch(array.level) {
+ case 4: /* ignore layout */
+ break;
+ case 5:
+ nlayout = map_name(r5layout, layout_str);
+ if (nlayout == UnSet) {
+ fprintf(stderr, Name ": layout %s not understood for raid5.\n",
+ layout_str);
+ return 1;
+ }
+ break;
+
+ case 6:
+ nlayout = map_name(r6layout, layout_str);
+ if (nlayout == UnSet) {
+ fprintf(stderr, Name ": layout %s not understood for raid6.\n",
+ layout_str);
+ return 1;
+ }
+ break;
+ }
if (raid_disks) ndisks = raid_disks;
odata = odisks-1;
- if (olevel == 6) odata--; /* number of data disks */
ndata = ndisks-1;
- if (nlevel == 6) ndata--;
-
- if (ndata < odata) {
- fprintf(stderr, Name ": %s: Cannot reduce number of data disks (yet).\n",
- devname);
- return 1;
+ if (array.level == 6) {
+ odata--; /* number of data disks */
+ ndata--;
}
- if (ndata == odata) {
- fprintf(stderr, Name ": %s: Cannot reshape array without increasing size (yet).\n",
- devname);
- return 1;
+
+ /* Check that we can hold all the data */
+ size = ndata * array.size;
+ get_dev_size(fd, NULL, &array_size);
+ if (size < (array_size/1024)) {
+ fprintf(stderr, Name ": this change will reduce the size of the array.\n"
+ " use --grow --array-size first to truncate array.\n"
+ " e.g. mdadm --grow %s --array-size %llu\n",
+ devname, size);
+ rv = 1;
+ break;
}
- /* Well, it is growing... so how much do we need to backup.
- * Need to backup a full number of new-stripes, such that the
- * last one does not over-write any place that it would be read
- * from
+
+ /* So how much do we need to backup.
+ * We need an amount of data which is both a whole number of
+ * old stripes and a whole number of new stripes.
+ * So LCM for (chunksize*datadisks).
*/
- nstripe = ostripe = 0;
- while (nstripe >= ostripe) {
- nstripe += nchunk/512;
- last_block = nstripe * ndata;
- ostripe = last_block / odata / (ochunk/512) * (ochunk/512);
+ a = ochunk/512 * odata;
+ b = nchunk/512 * ndata;
+ /* Find GCD */
+ while (a != b) {
+ if (a < b)
+ b -= a;
+ if (b < a)
+ a -= b;
}
- fprintf(stderr, Name ": Need to backup %lluK of critical "
- "section..\n", last_block/2);
+ /* LCM == product / GCD */
+ blocks = ochunk/512 * nchunk/512 * odata * ndata / a;
+
+ if (ndata == odata)
+ blocks *= 16;
+ else
+ fprintf(stderr, Name ": Need to backup %luK of critical "
+ "section..\n", blocks/2);
+ sysfs_free(sra);
sra = sysfs_read(fd, 0,
GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
GET_CACHE);
if (!sra) {
fprintf(stderr, Name ": %s: Cannot get array details from sysfs\n",
devname);
- return 1;
+ rv = 1;
+ break;
}
- if (last_block >= sra->component_size/2) {
+ if (blocks >= sra->component_size/2) {
fprintf(stderr, Name ": %s: Something wrong - reshape aborted\n",
devname);
- return 1;
+ rv = 1;
+ break;
}
- if (sra->array.spare_disks == 0 && backup_file == NULL) {
- fprintf(stderr, Name ": %s: Cannot grow - need a spare or backup-file to backup critical section\n",
- devname);
- return 1;
- }
-
nrdisks = array.nr_disks + sra->array.spare_disks;
/* Now we need to open all these devices so we can read/write.
*/
@@ -640,7 +922,8 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
offsets = malloc((1+nrdisks) * sizeof(offsets[0]));
if (!fdlist || !offsets) {
fprintf(stderr, Name ": malloc failed: grow aborted\n");
- return 1;
+ rv = 1;
+ break;
}
for (d=0; d <= nrdisks; d++)
fdlist[d] = -1;
@@ -653,214 +936,602 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
sd->disk.minor, 1);
fdlist[sd->disk.raid_disk]
= dev_open(dn, O_RDONLY);
- offsets[sd->disk.raid_disk] = sd->data_offset;
+ offsets[sd->disk.raid_disk] = sd->data_offset*512;
if (fdlist[sd->disk.raid_disk] < 0) {
fprintf(stderr, Name ": %s: cannot open component %s\n",
devname, dn?dn:"-unknown-");
- goto abort;
+ rv = 1;
+ goto release;
}
- } else {
+ } else if (backup_file == NULL) {
/* spare */
char *dn = map_dev(sd->disk.major,
sd->disk.minor, 1);
fdlist[d] = dev_open(dn, O_RDWR);
- offsets[d] = sd->data_offset;
+ offsets[d] = (sra->component_size - blocks - 8)*512;
if (fdlist[d]<0) {
fprintf(stderr, Name ": %s: cannot open component %s\n",
devname, dn?dn:"-unknown");
- goto abort;
+ rv = 1;
+ goto release;
}
d++;
}
}
- for (i=0 ; i<array.raid_disks; i++)
- if (fdlist[i] < 0) {
- fprintf(stderr, Name ": %s: failed to find device %d. Array might be degraded.\n"
- " --grow aborted\n", devname, i);
- goto abort;
+ if (backup_file == NULL) {
+ if (ndata <= odata) {
+ fprintf(stderr, Name ": %s: Cannot grow - need backup-file\n",
+ devname);
+ rv = 1;
+ break;
+ } else if (sra->array.spare_disks == 0) {
+ fprintf(stderr, Name ": %s: Cannot grow - need a spare or "
+ "backup-file to backup critical section\n",
+ devname);
+ rv = 1;
+ break;
}
- spares = sra->array.spare_disks;
- if (backup_file) {
- fdlist[d] = open(backup_file, O_RDWR|O_CREAT|O_EXCL, S_IRUSR | S_IWUSR);
+ if (d == array.raid_disks) {
+ fprintf(stderr, Name ": %s: No spare device for backup\n",
+ devname);
+ rv = 1;
+ break;
+ }
+ } else {
+ /* need to check backup file is large enough */
+ char buf[512];
+ fdlist[d] = open(backup_file, O_RDWR|O_CREAT|O_EXCL,
+ S_IRUSR | S_IWUSR);
+ offsets[d] = 8 * 512;
if (fdlist[d] < 0) {
fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
devname, backup_file, strerror(errno));
- goto abort;
+ rv = 1;
+ break;
+ }
+ memset(buf, 0, 512);
+ for (i=0; i < blocks + 1 ; i++) {
+ if (write(fdlist[d], buf, 512) != 512) {
+ fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
+ devname, backup_file, strerror(errno));
+ rv = 1;
+ break;
+ }
+ }
+ if (fsync(fdlist[d]) != 0) {
+ fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
+ devname, backup_file, strerror(errno));
+ rv = 1;
+ break;
}
- offsets[d] = 8;
d++;
- spares++;
}
- if (fdlist[array.raid_disks] < 0) {
- fprintf(stderr, Name ": %s: failed to find a spare and no backup-file given - --grow aborted\n",
- devname);
- goto abort;
+
+ /* lastly, check that the internal stripe cache is
+ * large enough, or it won't work.
+ */
+
+ cache = (nchunk < ochunk) ? ochunk : nchunk;
+ cache = cache * 4 / 4096;
+ if (sra->cache_size < cache)
+ sysfs_set_num(sra, NULL, "stripe_cache_size",
+ cache+1);
+ /* Right, everything seems fine. Let's kick things off.
+ * If only changing raid_disks, use ioctl, else use
+ * sysfs.
+ */
+ if (ochunk == nchunk && olayout == nlayout) {
+ array.raid_disks = ndisks;
+ if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
+ rv = 1;
+ fprintf(stderr, Name ": Cannot set device shape for %s: %s\n",
+ devname, strerror(errno));
+ if (ndisks < odisks &&
+ get_linux_version() < 2006030)
+ fprintf(stderr, Name ": linux 2.6.30 or later required\n");
+
+ break;
+ }
+ } else {
+ /* set them all just in case some old 'new_*' value
+ * persists from some earlier problem
+ */
+ if (sysfs_set_num(sra, NULL, "chunk_size", nchunk) < 0)
+ rv = 1;
+ if (sysfs_set_num(sra, NULL, "layout", nlayout) < 0)
+ rv = 1;
+ if (sysfs_set_num(sra, NULL, "raid_disks", ndisks) < 0)
+ rv = 1;
+ if (rv) {
+ fprintf(stderr, Name ": Cannot set device shape for %s\n",
+ devname);
+ if (get_linux_version() < 2006030)
+ fprintf(stderr, Name ": linux 2.6.30 or later required\n");
+ break;
+ }
}
+ if (ndisks == 2 && odisks == 2) {
+ /* No reshape is needed in this trivial case */
+ rv = 0;
+ break;
+ }
+
+ /* set up the backup-super-block. This requires the
+ * uuid from the array.
+ */
/* Find a superblock */
- if (st->ss->load_super(st, fdlist[0], NULL)) {
+ for (sd = sra->devs; sd; sd = sd->next) {
+ char *dn;
+ int devfd;
+ int ok;
+ if (sd->disk.state & (1<<MD_DISK_FAULTY))
+ continue;
+ dn = map_dev(sd->disk.major, sd->disk.minor, 1);
+ devfd = dev_open(dn, O_RDONLY);
+ if (devfd < 0)
+ continue;
+ ok = st->ss->load_super(st, devfd, NULL);
+ close(devfd);
+ if (ok >= 0)
+ break;
+ }
+ if (!sd) {
fprintf(stderr, Name ": %s: Cannot find a superblock\n",
devname);
- goto abort;
+ rv = 1;
+ break;
}
-
+ memset(&bsb, 0, 512);
memcpy(bsb.magic, "md_backup_data-1", 16);
st->ss->uuid_from_super(st, (int*)&bsb.set_uuid);
bsb.mtime = __cpu_to_le64(time(0));
- bsb.arraystart = 0;
- bsb.length = __cpu_to_le64(last_block);
-
- /* Decide offset for the backup, llseek the spares, and write
- * a leading superblock 4K earlier.
+ bsb.devstart2 = blocks;
+ stripes = blocks / (ochunk/512) / odata;
+ /* Now we just need to kick off the reshape and watch, while
+ * handling backups of the data...
+ * This is all done by a forked background process.
*/
- for (i=array.raid_disks; i<d; i++) {
- char abuf[4096+512];
- char *buf = (char*)(((unsigned long)abuf+511)& ~511);
- if (i==d-1 && backup_file) {
- /* This is the backup file */
- offsets[i] = 8;
- } else
- offsets[i] += sra->component_size - last_block - 8;
- if (lseek64(fdlist[i], (offsets[i]<<9) - 4096, 0)
- != (offsets[i]<<9) - 4096) {
- fprintf(stderr, Name ": could not seek...\n");
- goto abort;
- }
- memset(buf, 0, 4096);
- bsb.devstart = __cpu_to_le64(offsets[i]);
- bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
- memcpy(buf, &bsb, sizeof(bsb));
- if (write(fdlist[i], buf, 4096) != 4096) {
- fprintf(stderr, Name ": could not write leading superblock\n");
- goto abort;
- }
- }
- array.level = nlevel;
- array.raid_disks = ndisks;
- array.chunk_size = nchunk;
- array.layout = nlayout;
- if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
- if (errno == ENOSPC) {
- /* stripe cache is not big enough.
- * It needs to be 4 times chunksize_size,
- * and we assume pagesize is 4K
+ switch(fork()) {
+ case 0:
+ close(fd);
+ if (check_env("MDADM_GROW_VERIFY"))
+ fd = open(devname, O_RDONLY | O_DIRECT);
+ else
+ fd = -1;
+ mlockall(MCL_FUTURE);
+
+ if (odata < ndata)
+ done = child_grow(fd, sra, stripes,
+ fdlist, offsets,
+ odisks, ochunk, array.level, olayout, odata,
+ d - odisks, fdlist+odisks, offsets+odisks);
+ else if (odata > ndata)
+ done = child_shrink(fd, sra, stripes,
+ fdlist, offsets,
+ odisks, ochunk, array.level, olayout, odata,
+ d - odisks, fdlist+odisks, offsets+odisks);
+ else
+ done = child_same_size(fd, sra, stripes,
+ fdlist, offsets,
+ 0,
+ odisks, ochunk, array.level, olayout, odata,
+ d - odisks, fdlist+odisks, offsets+odisks);
+ if (backup_file && done)
+ unlink(backup_file);
+ if (level != UnSet && level != array.level) {
+ /* We need to wait for the reshape to finish
+ * (which will have happened unless odata < ndata)
+ * and then set the level
*/
- if (sra->cache_size < 4 * (nchunk/4096)) {
- sysfs_set_num(sra, NULL,
- "stripe_cache_size",
- 4 * (nchunk/4096) +1);
- if (ioctl(fd, SET_ARRAY_INFO,
- &array) == 0)
- goto ok;
- }
+
+ c = map_num(pers, level);
+ if (c == NULL)
+ exit(0);/* not possible */
+
+ if (odata < ndata)
+ wait_reshape(sra);
+ err = sysfs_set_str(sra, NULL, "level", c);
+ if (err)
+ fprintf(stderr, Name ": %s: could not set level to %s\n",
+ devname, c);
}
- fprintf(stderr, Name ": Cannot set device size/shape for %s: %s\n",
- devname, strerror(errno));
- goto abort;
+ exit(0);
+ case -1:
+ fprintf(stderr, Name ": Cannot run child to monitor reshape: %s\n",
+ strerror(errno));
+ rv = 1;
+ break;
+ default:
+ /* The child will take care of unfreezing the array */
+ frozen = 0;
+ break;
}
- ok: ;
+ break;
- /* suspend the relevant region */
- sysfs_set_num(sra, NULL, "suspend_hi", 0); /* just in case */
- if (sysfs_set_num(sra, NULL, "suspend_lo", 0) < 0 ||
- sysfs_set_num(sra, NULL, "suspend_hi", last_block) < 0) {
- fprintf(stderr, Name ": %s: failed to suspend device.\n",
- devname);
- goto abort_resume;
- }
+ }
+ release:
+ if (rv && orig_level != UnSet && sra) {
+ c = map_num(pers, orig_level);
+ if (c && sysfs_set_str(sra, NULL, "level", c) == 0)
+ fprintf(stderr, Name ": aborting level change\n");
+ }
+ if (sra)
+ unfreeze_array(sra, frozen);
+ return rv;
+}
- err = save_stripes(fdlist, offsets,
- odisks, ochunk, olevel, olayout,
- spares, fdlist+odisks,
- 0ULL, last_block*512);
+/*
+ * We run a child process in the background which performs the following
+ * steps:
+ * - wait for resync to reach a certain point
+ * - suspend io to the following section
+ * - backup that section
+ * - allow resync to proceed further
+ * - resume io
+ * - discard the backup.
+ *
+ * These are combined in slightly different ways in the three cases.
+ * Grow:
+ * - suspend/backup/allow/wait/resume/discard
+ * Shrink:
+ * - allow/wait/suspend/backup/allow/wait/resume/discard
+ * same-size:
+ * - wait/resume/discard/suspend/backup/allow
+ *
+ * suspend/backup/allow always come together
+ * wait/resume/discard do too.
+ * For the same-size case we have two backups to improve flow.
+ *
+ */
- /* abort if there was an error */
- if (err < 0) {
- fprintf(stderr, Name ": %s: failed to save critical region\n",
- devname);
- goto abort_resume;
- }
+int grow_backup(struct mdinfo *sra,
+ unsigned long long offset, /* per device */
+ unsigned long stripes, /* per device */
+ int *sources, unsigned long long *offsets,
+ int disks, int chunk, int level, int layout,
+ int dests, int *destfd, unsigned long long *destoffsets,
+ int part,
+ char *buf)
+{
+ /* Backup 'stripes' stripes starting at sector 'offset' on each device of the array,
+ * to storage 'destfd' (offset 'destoffsets'), after first
+ * suspending IO. Then allow resync to continue
+ * over the suspended section.
+ * Use part 'part' of the backup-super-block.
+ */
+ int odata = disks;
+ int rv = 0;
+ int i;
+ //printf("offset %llu\n", offset);
+ if (level >= 4)
+ odata--;
+ if (level == 6)
+ odata--;
+ sysfs_set_num(sra, NULL, "suspend_hi", (offset + stripes * chunk/512) * odata);
+ if (part) {
+ bsb.arraystart2 = __cpu_to_le64(offset * odata);
+ bsb.length2 = __cpu_to_le64(stripes * chunk/512 * odata);
+ } else {
+ bsb.arraystart = __cpu_to_le64(offset * odata);
+ bsb.length = __cpu_to_le64(stripes * chunk/512 * odata);
+ }
+ if (part)
+ bsb.magic[15] = '2';
+ for (i = 0; i < dests; i++)
+ if (part)
+ lseek64(destfd[i], destoffsets[i] + __le64_to_cpu(bsb.devstart2)*512, 0);
+ else
+ lseek64(destfd[i], destoffsets[i], 0);
+
+ rv = save_stripes(sources, offsets,
+ disks, chunk, level, layout,
+ dests, destfd,
+ offset*512*odata, stripes * chunk * odata,
+ buf);
+
+ if (rv)
+ return rv;
+ for (i = 0; i < dests; i++) {
+ bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
+
+ bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
+ if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
+ bsb.sb_csum2 = bsb_csum((char*)&bsb,
+ ((char*)&bsb.sb_csum2)-((char*)&bsb));
+
+ lseek64(destfd[i], destoffsets[i] - 4096, 0);
+ write(destfd[i], &bsb, 512);
+ fsync(destfd[i]);
+ }
- for (i=odisks; i<d ; i++) {
- bsb.devstart = __cpu_to_le64(offsets[i]);
- bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
- if (lseek64(fdlist[i], (offsets[i]+last_block)<<9, 0) < 0 ||
- write(fdlist[i], &bsb, sizeof(bsb)) != sizeof(bsb) ||
- fsync(fdlist[i]) != 0) {
- fprintf(stderr, Name ": %s: failed to save metadata for critical region backups.\n",
- devname);
- goto abort_resume;
- }
+ return 0;
+}
+
+/* in 2.6.30, the value reported by sync_completed can be
+ * less than it should be by one stripe.
+ * This only happens when reshape hits sync_max and pauses.
+ * So allow wait_backup to either extend sync_max further
+ * than strictly necessary, or return before the
+ * sync has got quite as far as we would really like.
+ * This is what 'blocks2' is for.
+ * The various callers give appropriate values so that
+ * everything works.
+ */
+int wait_backup(struct mdinfo *sra,
+ unsigned long long offset, /* per device */
+ unsigned long long blocks, /* per device */
+ unsigned long long blocks2, /* per device - hack */
+ int dests, int *destfd, unsigned long long *destoffsets,
+ int part)
+{
+ /* Wait for resync to pass the section that was backed up
+ * then erase the backup and allow IO
+ */
+ int fd = sysfs_get_fd(sra, NULL, "sync_completed");
+ unsigned long long completed;
+ int i;
+
+ if (fd < 0)
+ return -1;
+ sysfs_set_num(sra, NULL, "sync_max", offset + blocks + blocks2);
+ if (offset == 0)
+ sysfs_set_str(sra, NULL, "sync_action", "reshape");
+ do {
+ char action[20];
+ fd_set rfds;
+ FD_ZERO(&rfds);
+ FD_SET(fd, &rfds);
+ select(fd+1, NULL, NULL, &rfds, NULL);
+ if (sysfs_fd_get_ll(fd, &completed) < 0) {
+ close(fd);
+ return -1;
}
+ if (sysfs_get_str(sra, NULL, "sync_action",
+ action, 20) > 0 &&
+ strncmp(action, "reshape", 7) != 0)
+ break;
+ } while (completed < offset + blocks);
+ close(fd);
+
+ if (part) {
+ bsb.arraystart2 = __cpu_to_le64(0);
+ bsb.length2 = __cpu_to_le64(0);
+ } else {
+ bsb.arraystart = __cpu_to_le64(0);
+ bsb.length = __cpu_to_le64(0);
+ }
+ for (i = 0; i < dests; i++) {
+ bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
+ bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
+ if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
+ bsb.sb_csum2 = bsb_csum((char*)&bsb,
+ ((char*)&bsb.sb_csum2)-((char*)&bsb));
+ lseek64(destfd[i], destoffsets[i]-4096, 0);
+ write(destfd[i], &bsb, 512);
+ fsync(destfd[i]);
+ }
+ return 0;
+}
- /* start the reshape happening */
- if (sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) {
- fprintf(stderr, Name ": %s: failed to initiate reshape\n",
- devname);
- goto abort_resume;
+static void fail(char *msg)
+{
+ write(2, msg, strlen(msg));
+ write(2, "\n", 1);
+ exit(1);
+}
+
+static char *abuf, *bbuf;
+static int abuflen;
+static void validate(int afd, int bfd, unsigned long long offset)
+{
+ /* check that the data in the backup against the array.
+ * This is only used for regression testing and should not
+ * be used while the array is active
+ */
+ struct mdp_backup_super bsb2;
+ if (afd < 0)
+ return;
+ lseek64(bfd, offset - 4096, 0);
+ if (read(bfd, &bsb2, 512) != 512)
+ fail("cannot read bsb");
+ if (bsb2.sb_csum != bsb_csum((char*)&bsb2,
+ ((char*)&bsb2.sb_csum)-((char*)&bsb2)))
+ fail("first csum bad");
+ if (memcmp(bsb2.magic, "md_backup_data", 14) != 0)
+ fail("magic is bad");
+ if (memcmp(bsb2.magic, "md_backup_data-2", 16) == 0 &&
+ bsb2.sb_csum2 != bsb_csum((char*)&bsb2,
+ ((char*)&bsb2.sb_csum2)-((char*)&bsb2)))
+ fail("second csum bad");
+
+ if (__le64_to_cpu(bsb2.devstart)*512 != offset)
+ fail("devstart is wrong");
+
+ if (bsb2.length) {
+ unsigned long long len = __le64_to_cpu(bsb2.length)*512;
+
+ if (abuflen < len) {
+ free(abuf);
+ free(bbuf);
+ abuflen = len;
+ posix_memalign((void**)&abuf, 4096, abuflen);
+ posix_memalign((void**)&bbuf, 4096, abuflen);
}
- /* wait for reshape to pass the critical region */
- while(1) {
- unsigned long long comp;
- if (sysfs_get_ll(sra, NULL, "sync_completed", &comp)<0) {
- sleep(5);
- break;
- }
- if (comp >= nstripe)
- break;
- if (comp == 0) {
- /* Maybe it finished already */
- char action[20];
- if (sysfs_get_str(sra, NULL, "sync_action",
- action, 20) > 0 &&
- strncmp(action, "reshape", 7) != 0)
+ lseek64(bfd, offset, 0);
+ if (read(bfd, bbuf, len) != len) {
+ printf("len %llu\n", len);
+ fail("read first backup failed");
+ }
+ lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0);
+ if (read(afd, abuf, len) != len)
+ fail("read first from array failed");
+ if (memcmp(bbuf, abuf, len) != 0) {
+ int i;
+ printf("offset=%llu len=%llu\n",
+ __le64_to_cpu(bsb2.arraystart)*512, len);
+ for (i=0; i<len; i++)
+ if (bbuf[i] != abuf[i]) {
+ printf("first diff byte %d\n", i);
break;
- }
- sleep(1);
+ }
+ fail("data1 compare failed");
}
-
- /* invalidate superblocks */
- memset(&bsb, 0, sizeof(bsb));
- for (i=odisks; i<d ; i++) {
- lseek64(fdlist[i], (offsets[i]+last_block)<<9, 0);
- if (write(fdlist[i], &bsb, sizeof(bsb)) < 0) {
- fprintf(stderr, Name ": %s: failed to invalidate metadata for raid disk %d\n",
- devname, i);
- }
+ }
+ if (bsb2.length2) {
+ unsigned long long len = __le64_to_cpu(bsb2.length2)*512;
+
+ if (abuflen < len) {
+ free(abuf);
+ free(bbuf);
+ abuflen = len;
+ abuf = malloc(abuflen);
+ bbuf = malloc(abuflen);
}
- /* unsuspend. */
- sysfs_set_num(sra, NULL, "suspend_lo", last_block);
-
- for (i=0; i<d; i++)
- if (fdlist[i] >= 0)
- close(fdlist[i]);
- free(fdlist);
- free(offsets);
- if (backup_file)
- unlink(backup_file);
-
- fprintf(stderr, Name ": ... critical section passed.\n");
- break;
+ lseek64(bfd, offset+__le64_to_cpu(bsb2.devstart2)*512, 0);
+ if (read(bfd, bbuf, len) != len)
+ fail("read second backup failed");
+ lseek64(afd, __le64_to_cpu(bsb2.arraystart2)*512, 0);
+ if (read(afd, abuf, len) != len)
+ fail("read second from array failed");
+ if (memcmp(bbuf, abuf, len) != 0)
+ fail("data2 compare failed");
}
- return 0;
+}
+static int child_grow(int afd, struct mdinfo *sra, unsigned long stripes,
+ int *fds, unsigned long long *offsets,
+ int disks, int chunk, int level, int layout, int data,
+ int dests, int *destfd, unsigned long long *destoffsets)
+{
+ char *buf;
+
+ posix_memalign((void**)&buf, 4096, disks * chunk);
+ sysfs_set_num(sra, NULL, "suspend_hi", 0);
+ sysfs_set_num(sra, NULL, "suspend_lo", 0);
+ grow_backup(sra, 0, stripes,
+ fds, offsets, disks, chunk, level, layout,
+ dests, destfd, destoffsets,
+ 0, buf);
+ validate(afd, destfd[0], destoffsets[0]);
+ if (wait_backup(sra, 0, stripes * chunk / 512, stripes * chunk / 512,
+ dests, destfd, destoffsets,
+ 0) < 0)
+ return 0;
+ sysfs_set_num(sra, NULL, "suspend_lo", (stripes * chunk/512) * data);
+ free(buf);
+ /* FIXME this should probably be numeric */
+ sysfs_set_str(sra, NULL, "sync_max", "max");
+ return 1;
+}
- abort_resume:
- sysfs_set_num(sra, NULL, "suspend_lo", last_block);
- abort:
- for (i=0; i<array.nr_disks; i++)
- if (fdlist[i] >= 0)
- close(fdlist[i]);
- free(fdlist);
- free(offsets);
- if (backup_file)
- unlink(backup_file);
+static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes,
+ int *fds, unsigned long long *offsets,
+ int disks, int chunk, int level, int layout, int data,
+ int dests, int *destfd, unsigned long long *destoffsets)
+{
+ char *buf;
+ unsigned long long start;
+ int rv;
+
+ posix_memalign((void**)&buf, 4096, disks * chunk);
+ start = sra->component_size - stripes * chunk/512;
+ sysfs_set_num(sra, NULL, "sync_max", start);
+ sysfs_set_str(sra, NULL, "sync_action", "reshape");
+ sysfs_set_num(sra, NULL, "suspend_lo", 0);
+ sysfs_set_num(sra, NULL, "suspend_hi", 0);
+ rv = wait_backup(sra, 0, start - stripes * chunk/512, stripes * chunk/512,
+ dests, destfd, destoffsets, 0);
+ if (rv < 0)
+ return 0;
+ grow_backup(sra, 0, stripes,
+ fds, offsets,
+ disks, chunk, level, layout,
+ dests, destfd, destoffsets,
+ 0, buf);
+ validate(afd, destfd[0], destoffsets[0]);
+ rv = wait_backup(sra, start, stripes*chunk/512, 0,
+ dests, destfd, destoffsets, 0);
+ if (rv < 0)
+ return 0;
+ sysfs_set_num(sra, NULL, "suspend_lo", (stripes * chunk/512) * data);
+ free(buf);
+ /* FIXME this should probably be numeric */
+ sysfs_set_str(sra, NULL, "sync_max", "max");
return 1;
+}
+static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
+ int *fds, unsigned long long *offsets,
+ unsigned long long start,
+ int disks, int chunk, int level, int layout, int data,
+ int dests, int *destfd, unsigned long long *destoffsets)
+{
+ unsigned long long size;
+ unsigned long tailstripes = stripes;
+ int part;
+ char *buf;
+ unsigned long long speed;
+
+
+ posix_memalign((void**)&buf, 4096, disks * chunk);
+
+ sysfs_set_num(sra, NULL, "suspend_lo", 0);
+ sysfs_set_num(sra, NULL, "suspend_hi", 0);
+
+ sysfs_get_ll(sra, NULL, "sync_speed_min", &speed);
+ sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
+
+ grow_backup(sra, start, stripes,
+ fds, offsets,
+ disks, chunk, level, layout,
+ dests, destfd, destoffsets,
+ 0, buf);
+ grow_backup(sra, (start + stripes) * chunk/512, stripes,
+ fds, offsets,
+ disks, chunk, level, layout,
+ dests, destfd, destoffsets,
+ 1, buf);
+ validate(afd, destfd[0], destoffsets[0]);
+ part = 0;
+ start += stripes * 2; /* where to read next */
+ size = sra->component_size / (chunk/512);
+ while (start < size) {
+ if (wait_backup(sra, (start-stripes*2)*chunk/512,
+ stripes*chunk/512, 0,
+ dests, destfd, destoffsets,
+ part) < 0)
+ return 0;
+ sysfs_set_num(sra, NULL, "suspend_lo", start*chunk/512 * data);
+ if (start + stripes > size)
+ tailstripes = (size - start);
+
+ grow_backup(sra, start*chunk/512, tailstripes,
+ fds, offsets,
+ disks, chunk, level, layout,
+ dests, destfd, destoffsets,
+ part, buf);
+ start += stripes;
+ part = 1 - part;
+ validate(afd, destfd[0], destoffsets[0]);
+ }
+ if (wait_backup(sra, (start-stripes*2) * chunk/512, stripes * chunk/512, 0,
+ dests, destfd, destoffsets,
+ part) < 0)
+ return 0;
+ sysfs_set_num(sra, NULL, "suspend_lo", ((start-stripes)*chunk/512) * data);
+ if (wait_backup(sra, (start-stripes) * chunk/512, tailstripes * chunk/512, 0,
+ dests, destfd, destoffsets,
+ 1-part) < 0)
+ return 0;
+ sysfs_set_num(sra, NULL, "suspend_lo", (size*chunk/512) * data);
+ sysfs_set_num(sra, NULL, "sync_speed_min", speed);
+ free(buf);
+ return 1;
}
/*
@@ -876,19 +1547,26 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
unsigned long long nstripe, ostripe, last_block;
int ndata, odata;
- if (info->delta_disks < 0)
- return 1; /* cannot handle a shrink */
- if (info->new_level != info->array.level ||
- info->new_layout != info->array.layout ||
- info->new_chunk != info->array.chunk_size)
- return 1; /* Can only handle change in disks */
+ if (info->new_level != info->array.level)
+ return 1; /* Cannot handle level changes (they are instantaneous) */
+
+ odata = info->array.raid_disks - info->delta_disks - 1;
+ if (info->array.level == 6) odata--; /* number of data disks */
+ ndata = info->array.raid_disks - 1;
+ if (info->new_level == 6) ndata--;
old_disks = info->array.raid_disks - info->delta_disks;
+ if (info->delta_disks <= 0)
+ /* Didn't grow, so the backup file must have
+ * been used
+ */
+ old_disks = cnt;
for (i=old_disks-(backup_file?1:0); i<cnt; i++) {
struct mdinfo dinfo;
char buf[4096];
int fd;
+ int bsbsize;
/* This was a spare and may have some saved data on it.
* Load the superblock, find and load the
@@ -899,8 +1577,11 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
*/
if (i == old_disks-1) {
fd = open(backup_file, O_RDONLY);
- if (fd<0)
+ if (fd<0) {
+ fprintf(stderr, Name ": backup file %s inaccessible: %s\n",
+ backup_file, strerror(errno));
continue;
+ }
} else {
fd = fdlist[i];
if (fd < 0)
@@ -918,10 +1599,13 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
}
if (read(fd, &bsb, sizeof(bsb)) != sizeof(bsb))
continue; /* Cannot read */
- if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0)
+ if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0 &&
+ memcmp(bsb.magic, "md_backup_data-2", 16) != 0)
continue;
if (bsb.sb_csum != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb)))
continue; /* bad checksum */
+ if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0 &&
+ bsb.sb_csum2 != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb)))
if (memcmp(bsb.set_uuid,info->uuid, 16) != 0)
continue; /* Wrong uuid */
@@ -929,18 +1613,46 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
info->array.utime < __le64_to_cpu(bsb.mtime))
continue; /* time stamp is too bad */
- if (__le64_to_cpu(bsb.arraystart) != 0)
- continue; /* Can only handle backup from start of array */
- if (__le64_to_cpu(bsb.length) <
- info->reshape_progress)
- continue; /* No new data here */
-
+ if (bsb.magic[15] == '1') {
+ if (info->delta_disks >= 0) {
+ /* reshape_progress is increasing */
+ if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) <
+ info->reshape_progress)
+ continue; /* No new data here */
+ } else {
+ /* reshape_progress is decreasing */
+ if (__le64_to_cpu(bsb.arraystart) >=
+ info->reshape_progress)
+ continue; /* No new data here */
+ }
+ } else {
+ if (info->delta_disks >= 0) {
+ /* reshape_progress is increasing */
+ if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) <
+ info->reshape_progress &&
+ __le64_to_cpu(bsb.arraystart2) + __le64_to_cpu(bsb.length2) <
+ info->reshape_progress)
+ continue; /* No new data here */
+ } else {
+ /* reshape_progress is decreasing */
+ if (__le64_to_cpu(bsb.arraystart) >=
+ info->reshape_progress &&
+ __le64_to_cpu(bsb.arraystart2) >=
+ info->reshape_progress)
+ continue; /* No new data here */
+ }
+ }
if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0)
continue; /* Cannot seek */
/* There should be a duplicate backup superblock 4k before here */
if (lseek64(fd, -4096, 1) < 0 ||
- read(fd, buf, 4096) != 4096 ||
- memcmp(buf, &bsb, sizeof(bsb)) != 0)
+ read(fd, buf, 4096) != 4096)
+ continue; /* Cannot find leading superblock */
+ if (bsb.magic[15] == '1')
+ bsbsize = offsetof(struct mdp_backup_super, pad1);
+ else
+ bsbsize = offsetof(struct mdp_backup_super, pad);
+ if (memcmp(buf, &bsb, bsbsize) != 0)
continue; /* Cannot find leading superblock */
/* Now need the data offsets for all devices. */
@@ -963,37 +1675,67 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
info->new_level,
info->new_layout,
fd, __le64_to_cpu(bsb.devstart)*512,
- 0, __le64_to_cpu(bsb.length)*512)) {
+ __le64_to_cpu(bsb.arraystart),
+ __le64_to_cpu(bsb.length)*512)) {
+ /* didn't succeed, so giveup */
+ return 1;
+ }
+
+ if (bsb.magic[15] == '2' &&
+ restore_stripes(fdlist, offsets,
+ info->array.raid_disks,
+ info->new_chunk,
+ info->new_level,
+ info->new_layout,
+ fd, __le64_to_cpu(bsb.devstart)*512 +
+ __le64_to_cpu(bsb.devstart2)*512,
+ __le64_to_cpu(bsb.arraystart2),
+ __le64_to_cpu(bsb.length2)*512)) {
/* didn't succeed, so giveup */
return 1;
}
+
/* Ok, so the data is restored. Let's update those superblocks. */
+ if (info->delta_disks >= 0) {
+ info->reshape_progress = __le64_to_cpu(bsb.arraystart) +
+ __le64_to_cpu(bsb.length);
+ if (bsb.magic[15] == '2') {
+ unsigned long long p2 = __le64_to_cpu(bsb.arraystart2) +
+ __le64_to_cpu(bsb.length2);
+ if (p2 > info->reshape_progress)
+ info->reshape_progress = p2;
+ }
+ } else {
+ info->reshape_progress = __le64_to_cpu(bsb.arraystart);
+ if (bsb.magic[15] == '2') {
+ unsigned long long p2 = __le64_to_cpu(bsb.arraystart2);
+ if (p2 < info->reshape_progress)
+ info->reshape_progress = p2;
+ }
+ }
for (j=0; j<info->array.raid_disks; j++) {
if (fdlist[j] < 0) continue;
if (st->ss->load_super(st, fdlist[j], NULL))
continue;
st->ss->getinfo_super(st, &dinfo);
- dinfo.reshape_progress = __le64_to_cpu(bsb.length);
+ dinfo.reshape_progress = info->reshape_progress;
st->ss->update_super(st, &dinfo,
"_reshape_progress",
NULL,0, 0, NULL);
st->ss->store_super(st, fdlist[j]);
st->ss->free_super(st);
}
-
- /* And we are done! */
return 0;
}
/* Didn't find any backup data, try to see if any
* was needed.
*/
+ if (info->delta_disks == 0)
+ /* Alway need backup data when size doesn't change */
+ return 1;
nstripe = ostripe = 0;
- odata = info->array.raid_disks - info->delta_disks - 1;
- if (info->array.level == 6) odata--; /* number of data disks */
- ndata = info->array.raid_disks - 1;
- if (info->new_level == 6) ndata--;
last_block = 0;
while (nstripe >= ostripe) {
nstripe += info->new_chunk / 512;
@@ -1007,3 +1749,148 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
/* needed to recover critical section! */
return 1;
}
+
+int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
+ char *backup_file)
+{
+ /* Array is assembled and ready to be started, but
+ * monitoring is probably required.
+ * So:
+ * - start read-only
+ * - set upper bound for resync
+ * - initialise the 'suspend' boundaries
+ * - switch to read-write
+ * - fork and continue monitoring
+ */
+ int err;
+ int backup_list[1];
+ unsigned long long backup_offsets[1];
+ int odisks, ndisks, ochunk, nchunk,odata,ndata;
+ unsigned long a,b,blocks,stripes;
+ int backup_fd;
+ int *fds;
+ unsigned long long *offsets;
+ int d;
+ struct mdinfo *sra, *sd;
+ int rv;
+ int done = 0;
+
+ err = sysfs_set_str(info, NULL, "array_state", "readonly");
+ if (err)
+ return err;
+
+ /* make sure reshape doesn't progress until we are ready */
+ sysfs_set_str(info, NULL, "sync_max", "0");
+ sysfs_set_str(info, NULL, "array_state", "active"); /* FIXME or clean */
+
+ /* ndisks is not growing, so raid_disks is old and +delta is new */
+ odisks = info->array.raid_disks;
+ ndisks = odisks + info->delta_disks;
+ odata = odisks - 1;
+ ndata = ndisks - 1;
+ if (info->array.level == 6) {
+ odata--;
+ ndata--;
+ }
+ ochunk = info->array.chunk_size;
+ nchunk = info->new_chunk;
+
+
+ a = ochunk/512 * odata;
+ b = nchunk/512 * ndata;
+ /* Find GCD */
+ while (a != b) {
+ if (a < b)
+ b -= a;
+ if (b < a)
+ a -= b;
+ }
+ /* LCM == product / GCD */
+ blocks = ochunk/512 * nchunk/512 * odata * ndata / a;
+
+ if (ndata == odata)
+ blocks *= 16;
+ stripes = blocks / (info->array.chunk_size/512) / odata;
+
+
+ memset(&bsb, 0, 512);
+ memcpy(bsb.magic, "md_backup_data-1", 16);
+ memcpy(&bsb.set_uuid, info->uuid, 16);
+ bsb.mtime = __cpu_to_le64(time(0));
+ bsb.devstart2 = blocks;
+
+ backup_fd = open(backup_file, O_RDWR|O_CREAT, S_IRUSR | S_IWUSR);
+ backup_list[0] = backup_fd;
+ backup_offsets[0] = 8 * 512;
+ fds = malloc(odisks * sizeof(fds[0]));
+ offsets = malloc(odisks * sizeof(offsets[0]));
+ for (d=0; d<odisks; d++)
+ fds[d] = -1;
+
+ sra = sysfs_read(-1, devname2devnum(info->sys_name),
+ GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
+ GET_CACHE);
+
+ for (sd = sra->devs; sd; sd = sd->next) {
+ if (sd->disk.state & (1<<MD_DISK_FAULTY))
+ continue;
+ if (sd->disk.state & (1<<MD_DISK_SYNC)) {
+ char *dn = map_dev(sd->disk.major,
+ sd->disk.minor, 1);
+ fds[sd->disk.raid_disk]
+ = dev_open(dn, O_RDONLY);
+ offsets[sd->disk.raid_disk] = sd->data_offset*512;
+ if (fds[sd->disk.raid_disk] < 0) {
+ fprintf(stderr, Name ": %s: cannot open component %s\n",
+ info->sys_name, dn?dn:"-unknown-");
+ rv = 1;
+ goto release;
+ }
+ free(dn);
+ }
+ }
+
+ switch(fork()) {
+ case 0:
+ close(mdfd);
+ mlockall(MCL_FUTURE);
+ if (info->delta_disks < 0)
+ done = child_shrink(-1, info, stripes,
+ fds, offsets,
+ info->array.raid_disks,
+ info->array.chunk_size,
+ info->array.level, info->array.layout,
+ odata,
+ 1, backup_list, backup_offsets);
+ else if (info->delta_disks == 0) {
+ /* The 'start' is a per-device stripe number.
+ * reshape_progress is a per-array sector number.
+ * So divide by ndata * chunk_size
+ */
+ unsigned long long start = info->reshape_progress / ndata;
+ start /= (info->array.chunk_size/512);
+ done = child_same_size(-1, info, stripes,
+ fds, offsets,
+ start,
+ info->array.raid_disks,
+ info->array.chunk_size,
+ info->array.level, info->array.layout,
+ odata,
+ 1, backup_list, backup_offsets);
+ }
+ if (backup_file && done)
+ unlink(backup_file);
+ /* FIXME should I intuit a level change */
+ exit(0);
+ case -1:
+ fprintf(stderr, Name ": Cannot run child to continue monitoring reshape: %s\n",
+ strerror(errno));
+ return 1;
+ default:
+ break;
+ }
+release:
+ return 0;
+}
+
+
diff --git a/Manage.c b/Manage.c
index 3aa09bc..9217139 100644
--- a/Manage.c
+++ b/Manage.c
@@ -305,24 +305,6 @@ int Manage_resize(char *devname, int fd, long long size, int raid_disks)
return 0;
}
-int Manage_reconfig(char *devname, int fd, int layout)
-{
- mdu_array_info_t info;
- if (ioctl(fd, GET_ARRAY_INFO, &info) != 0) {
- fprintf(stderr, Name ": Cannot get array information for %s: %s\n",
- devname, strerror(errno));
- return 1;
- }
- info.layout = layout;
- printf("layout set to %d\n", info.layout);
- if (ioctl(fd, SET_ARRAY_INFO, &info) != 0) {
- fprintf(stderr, Name ": Cannot set layout for %s: %s\n",
- devname, strerror(errno));
- return 1;
- }
- return 0;
-}
-
int Manage_subdevs(char *devname, int fd,
mddev_dev_t devlist, int verbose)
{
diff --git a/ReadMe.c b/ReadMe.c
index 90b4daf..0a50acb 100644
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -86,11 +86,11 @@ char Version[] = Name " - v3.0.2 - 25th September 2009\n";
* At the time if writing, there is only minimal support.
*/
-char short_options[]="-ABCDEFGIQhVXWvqbc:i:l:p:m:n:x:u:c:d:z:U:sarfRSow1tye:";
+char short_options[]="-ABCDEFGIQhVXWZvqbc:i:l:p:m:n:x:u:c:d:z:U:sarfRSow1tye:";
char short_bitmap_options[]=
- "-ABCDEFGIQhVXWvqb:c:i:l:p:m:n:x:u:c:d:z:U:sarfRSow1tye:";
+ "-ABCDEFGIQhVXWZvqb:c:i:l:p:m:n:x:u:c:d:z:U:sarfRSow1tye:";
char short_bitmap_auto_options[]=
- "-ABCDEFGIQhVXWvqb:c:i:l:p:m:n:x:u:c:d:z:U:sa:rfRSow1tye:";
+ "-ABCDEFGIQhVXWZvqb:c:i:l:p:m:n:x:u:c:d:z:U:sa:rfRSow1tye:";
struct option long_options[] = {
{"manage", 0, 0, '@'},
@@ -184,6 +184,7 @@ struct option long_options[] = {
{"syslog", 0, 0, 'y'},
/* For Grow */
{"backup-file", 1,0, BackupFile},
+ {"array-size", 1, 0, 'Z'},
/* For Incremental */
{"rebuild-map", 0, 0, 'r'},
@@ -527,6 +528,8 @@ char Help_grow[] =
" --backup-file= file : A file on a differt device to store data for a\n"
" : short time while increasing raid-devices on a\n"
" : RAID4/5/6 array. Not needed when a spare is present.\n"
+" --array-size= -Z : Change visible size of array. This does not change\n"
+" : any data on the device, and is not stable across restarts.\n"
;
char Help_incr[] =
diff --git a/mdadm.8 b/mdadm.8
index 7f19918..8022014 100644
--- a/mdadm.8
+++ b/mdadm.8
@@ -123,7 +123,9 @@ missing, spare, or failed drives, so there is nothing to monitor.
Grow (or shrink) an array, or otherwise reshape it in some way.
Currently supported growth options including changing the active size
of component devices and changing the number of active devices in RAID
-levels 1/4/5/6, as well as adding or removing a write-intent bitmap.
+levels 1/4/5/6, changing the RAID level between 1, 5, and 6, changing
+the chunk size and layout for RAID5 and RAID6, as well as adding or
+removing a write-intent bitmap.
.TP
.B "Incremental Assembly"
@@ -422,6 +424,21 @@ This value can not be used with
metadata such as DDF and IMSM.
.TP
+.BR \-Z ", " \-\-array-size=
+This is only meaningful with
+.B \-\-grow
+and its effect is not persistent: when the array is stopped and
+restarted the default array size will be restored.
+
+Setting the array-size causes the array to appear smaller to programs
+that access the data. This is particularly needed before reshaping an
+array so that it will be smaller. As the reshape is not reversible,
+but setting the size with
+.B \-\-array-size
+is, it is required that the array size is reduced as appropriate
+before the number of devices in the array is reduced.
+
+.TP
.BR \-c ", " \-\-chunk=
Specify chunk size of kibibytes. The default is 64.
This is only meaningful for RAID0, RAID4, RAID5, RAID6, and RAID10.
@@ -520,11 +537,6 @@ option to set subsequent failure modes.
"clear" or "none" will remove any pending or periodic failure modes,
and "flush" will clear any persistent faults.
-To set the parity with
-.BR \-\-grow ,
-the level of the array ("faulty")
-must be specified before the fault mode is specified.
-
Finally, the layout options for RAID10 are one of 'n', 'o' or 'f' followed
by a small number. The default is 'n2'. The supported options are:
@@ -550,6 +562,18 @@ devices in the array. It does not need to divide evenly into that
number (e.g. it is perfectly legal to have an 'n2' layout for an array
with an odd number of devices).
+When an array is converted between RAID5 and RAID6 an intermediate
+RAID6 layout is used in which the second parity block (Q) is always on
+the last device. To convert a RAID5 to RAID6 and leave it in this new
+layout (which does not require re-striping) use
+.BR \-\-layout=preserve .
+This will try to avoid any restriping.
+
+The converse of this is
+.B \-\-layout=normalise
+which will change a non-standard RAID6 layout into a more standard
+arrangement.
+
.TP
.BR \-\-parity=
same as
@@ -632,6 +656,21 @@ should be stored on a separate device, not on the RAID array being
reshaped.
.TP
+.BR \-\-array-size= ", " \-Z
+Set the size of the array which is seen by users of the device such as
+filesystems. This can be less than the real size, but never greater.
+The size set this way does not persist across restarts of the array.
+
+This is most useful when reducing the number of devices in a RAID5 or
+RAID6. Such arrays require the array-size to be reduced before a
+reshape can be performed that reduces the real size.
+
+A value of
+.B max
+restores the apparent size of the array to be whatever the real
+amount of available space is.
+
+.TP
.BR \-N ", " \-\-name=
Set a
.B name
@@ -722,6 +761,7 @@ number, and there is no entry in /dev for that number and with a
non-standard name. Names that are not in 'standard' format are only
allowed in "/dev/md/".
+.ig XX
.\".TP
.\".BR \-\-symlink = no
.\"Normally when
@@ -743,6 +783,7 @@ allowed in "/dev/md/".
.\"to enforce this even if it is suppressing
.\".IR mdadm.conf .
.\"
+.XX
.SH For assemble:
@@ -1942,7 +1983,12 @@ Currently the only support available is to
change the "size" attribute
for RAID1, RAID5 and RAID6.
.IP \(bu 4
-increase the "raid\-devices" attribute of RAID1, RAID5, and RAID6.
+increase or decrease the "raid\-devices" attribute of RAID1, RAID5,
+and RAID6.
+.IP \(bu 4
+change the chunk-size and layout of RAID5 and RAID6.
+.IP \(bu 4
+convert between RAID1 and RAID5, and between RAID5 and RAID6.
.IP \(bu 4
add a write-intent bitmap to any array which supports these bitmaps, or
remove a write-intent bitmap from such an array.
@@ -1985,10 +2031,22 @@ devices which were in those slots must be failed and removed.
When the number of devices is increased, any hot spares that are
present will be activated immediately.
-Increasing the number of active devices in a RAID5 is much more
+Changing the number of active devices in a RAID5 or RAID6 is much more
effort. Every block in the array will need to be read and written
-back to a new location. From 2.6.17, the Linux Kernel is able to do
-this safely, including restarting an interrupted "reshape".
+back to a new location. From 2.6.17, the Linux Kernel is able to
+increase the number of devices in a RAID5 safely, including restarting
+an interrupted "reshape". From 2.6.31, the Linux Kernel is able to
+increase or decrease the number of devices in a RAID5 or RAID6.
+
+When decreasing the number of devices, the size of the array will also
+decrease. If there was data in the array, it could get destroyed and
+this is not reversible. To help prevent accidents,
+.I mdadm
+requires that the size of the array be decreased first with
+.BR "mdadm --grow --array-size" .
+This is a reversible change which simply makes the end of the array
+inaccessible. The integrity of any data can then be checked before
+the non-reversible reduction in the number of devices is requested.
When relocating the first few stripes on a RAID5, it is not possible
to keep the data on disk completely consistent and crash-proof. To
@@ -2003,6 +2061,31 @@ critical period, the same file must be passed to
.B \-\-assemble
to restore the backup and reassemble the array.
+.SS LEVEL CHANGES
+
+Changing the RAID level of any array happens instantaneously. However
+in the RAID5 to RAID6 case this requires a non-standard layout of the
+RAID6 data, and in the RAID6 to RAID5 case that non-standard layout is
+required before the change can be accomplished. So while the level
+change is instant, the accompanying layout change can take quite a
+long time.
+
+.SS CHUNK-SIZE AND LAYOUT CHANGES
+
+Changing the chunk-size or layout without also changing the number of
+devices at the same time will involve re-writing all blocks in-place.
+To ensure against data loss in the case of a crash, a
+.B --backup-file
+must be provided for these changes. Small sections of the array will
+be copied to the backup file while they are being rearranged.
+
+If the reshape is interrupted for any reason, this backup file must be
+made available to
+.B "mdadm --assemble"
+so the array can be reassembled. Consequently the file cannot be
+stored on the device being reshaped.
+
+
.SS BITMAP CHANGES
A write-intent bitmap can be added to, or removed from, an active
@@ -2277,6 +2360,14 @@ can be started.
Any devices which are components of /dev/md4 will be marked as faulty
and then remove from the array.
+.B " mdadm --grow /dev/md4 --level=6 --backup-file=/root/backup-md4"
+.br
+The array
+.B /dev/md4
+which is currently a RAID5 array will be converted to RAID6. There
+should normally already be a spare drive attached to the array as a
+RAID6 needs one more drive than a matching RAID5.
+
.B " mdadm --create /dev/md/ddf --metadata=ddf --raid-disks 6 /dev/sd[a-f]"
.br
Create a DDF array over 6 devices.
diff --git a/mdadm.c b/mdadm.c
index bb3e5bb..a4f2d90 100644
--- a/mdadm.c
+++ b/mdadm.c
@@ -41,8 +41,10 @@ int main(int argc, char *argv[])
int chunk = 0;
long long size = -1;
+ long long array_size = -1;
int level = UnSet;
int layout = UnSet;
+ char *layout_str = NULL;
int raiddisks = 0;
int max_disks = MD_SB_DISKS; /* just a default */
int sparedisks = 0;
@@ -101,7 +103,6 @@ int main(int argc, char *argv[])
int rebuild_map = 0;
int auto_update_home = 0;
- int copies;
int print_help = 0;
FILE *outf;
@@ -323,6 +324,7 @@ int main(int argc, char *argv[])
* could depend on the mode */
#define O(a,b) ((a<<8)|b)
switch (O(mode,opt)) {
+ case O(GROW,'c'):
case O(CREATE,'c'):
case O(BUILD,'c'): /* chunk or rounding */
if (chunk) {
@@ -385,16 +387,36 @@ int main(int argc, char *argv[])
if (strcmp(optarg, "max")==0)
size = 0;
else {
- size = strtoll(optarg, &c, 10);
- if (!optarg[0] || *c || size < 4) {
+ size = parse_size(optarg);
+ if (size < 8) {
fprintf(stderr, Name ": invalid size: %s\n",
optarg);
exit(2);
}
+ /* convert sectors to K */
+ size /= 2;
}
continue;
- case O(GROW,'l'): /* hack - needed to understand layout */
+ case O(GROW,'Z'): /* array size */
+ if (array_size >= 0) {
+ fprintf(stderr, Name ": array-size may only be specified once. "
+ "Second value is %s.\n", optarg);
+ exit(2);
+ }
+ if (strcmp(optarg, "max") == 0)
+ array_size = 0;
+ else {
+ array_size = parse_size(optarg);
+ if (array_size <= 0) {
+ fprintf(stderr, Name ": invalid array size: %s\n",
+ optarg);
+ exit(2);
+ }
+ }
+ continue;
+
+ case O(GROW,'l'):
case O(CREATE,'l'):
case O(BUILD,'l'): /* set raid level*/
if (level != UnSet) {
@@ -424,9 +446,18 @@ int main(int argc, char *argv[])
ident.level = level;
continue;
+ case O(GROW, 'p'): /* new layout */
+ if (layout_str) {
+ fprintf(stderr,Name ": layout may only be sent once. "
+ "Second value was %s\n", optarg);
+ exit(2);
+ }
+ layout_str = optarg;
+ /* 'Grow' will parse the value */
+ continue;
+
case O(CREATE,'p'): /* raid5 layout */
case O(BUILD,'p'): /* faulty layout */
- case O(GROW, 'p'): /* faulty reconfig */
if (layout != UnSet) {
fprintf(stderr,Name ": layout may only be sent once. "
"Second value was %s\n", optarg);
@@ -459,38 +490,23 @@ int main(int argc, char *argv[])
break;
case 10:
- /* 'f', 'o' or 'n' followed by a number <= raid_disks */
- if ((optarg[0] != 'n' && optarg[0] != 'f' && optarg[0] != 'o') ||
- (copies = strtoul(optarg+1, &cp, 10)) < 1 ||
- copies > 200 ||
- *cp) {
+ layout = parse_layout_10(optarg);
+ if (layout < 0) {
fprintf(stderr, Name ": layout for raid10 must be 'nNN', 'oNN' or 'fNN' where NN is a number, not %s\n", optarg);
exit(2);
}
- if (optarg[0] == 'n')
- layout = 256 + copies;
- else if (optarg[0] == 'o')
- layout = 0x10000 + (copies<<8) + 1;
- else
- layout = 1 + (copies<<8);
break;
- case -5: /* Faulty
- * modeNNN
- */
-
- {
- int ln = strcspn(optarg, "0123456789");
- char *m = strdup(optarg);
- int mode;
- m[ln] = 0;
- mode = map_name(faultylayout, m);
- if (mode == UnSet) {
+ case LEVEL_FAULTY:
+ /* Faulty
+ * modeNNN
+ */
+ layout = parse_layout_faulty(optarg);
+ if (layout == -1) {
fprintf(stderr, Name ": layout %s not understood for faulty.\n",
optarg);
exit(2);
}
- layout = mode | (atoi(optarg+ln)<< ModeShift);
- }
+ break;
}
continue;
@@ -1381,11 +1397,33 @@ int main(int argc, char *argv[])
break;
case GROW:
+ if (array_size >= 0) {
+ /* always impose array size first, independent of
+ * anything else
+ */
+ struct mdinfo sra;
+ int err;
+ sysfs_init(&sra, mdfd, 0);
+ if (array_size == 0)
+ err = sysfs_set_str(&sra, NULL, "array_size", "default");
+ else
+ err = sysfs_set_num(&sra, NULL, "array_size", array_size / 2);
+ if (err < 0) {
+ if (errno == E2BIG)
+ fprintf(stderr, Name ": --array-size setting"
+ " is too large.\n");
+ else
+ fprintf(stderr, Name ": current kernel does"
+ " not support setting --array-size\n");
+ rv = 1;
+ break;
+ }
+ }
if (devs_found > 1) {
/* must be '-a'. */
- if (size >= 0 || raiddisks) {
- fprintf(stderr, Name ": --size, --raiddisks, and --add are exclusing in --grow mode\n");
+ if (size >= 0 || raiddisks || chunk || layout_str != NULL || bitmap_file) {
+ fprintf(stderr, Name ": --add cannot be used with other geometry changes in --grow mode\n");
rv = 1;
break;
}
@@ -1394,20 +1432,21 @@ int main(int argc, char *argv[])
if (rv)
break;
}
- } else if ((size >= 0) + (raiddisks != 0) + (layout != UnSet) + (bitmap_file != NULL)> 1) {
- fprintf(stderr, Name ": can change at most one of size, raiddisks, bitmap, and layout\n");
- rv = 1;
- break;
- } else if (layout != UnSet)
- rv = Manage_reconfig(devlist->devname, mdfd, layout);
- else if (size >= 0 || raiddisks)
- rv = Grow_reshape(devlist->devname, mdfd, quiet, backup_file,
- size, level, layout, chunk, raiddisks);
- else if (bitmap_file) {
- if (delay == 0) delay = DEFAULT_BITMAP_DELAY;
+ } else if (bitmap_file) {
+ if (size >= 0 || raiddisks || chunk || layout_str != NULL) {
+ fprintf(stderr, Name ": --bitmap changes cannot be used with other geometry changes in --grow mode\n");
+ rv = 1;
+ break;
+ }
+ if (delay == 0)
+ delay = DEFAULT_BITMAP_DELAY;
rv = Grow_addbitmap(devlist->devname, mdfd, bitmap_file,
bitmap_chunk, delay, write_behind, force);
- } else
+ } else if (size >= 0 || raiddisks != 0 || layout_str != NULL
+ || chunk != 0 || level != UnSet) {
+ rv = Grow_reshape(devlist->devname, mdfd, quiet, backup_file,
+ size, level, layout_str, chunk, raiddisks);
+ } else if (array_size < 0)
fprintf(stderr, Name ": no changes to --grow\n");
break;
case INCREMENTAL:
diff --git a/mdadm.h b/mdadm.h
index 91ba624..74a1b71 100644
--- a/mdadm.h
+++ b/mdadm.h
@@ -365,8 +365,12 @@ extern int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev,
extern int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev,
char *name, unsigned long long val);
extern int sysfs_uevent(struct mdinfo *sra, char *event);
+extern int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev,
+ char *name);
+extern int sysfs_fd_get_ll(int fd, unsigned long long *val);
extern int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
char *name, unsigned long long *val);
+extern int sysfs_fd_get_str(int fd, char *val, int size);
extern int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev,
char *name, char *val, int size);
extern int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms);
@@ -381,7 +385,8 @@ extern int load_sys(char *path, char *buf);
extern int save_stripes(int *source, unsigned long long *offsets,
int raid_disks, int chunk_size, int level, int layout,
int nwrites, int *dest,
- unsigned long long start, unsigned long long length);
+ unsigned long long start, unsigned long long length,
+ char *buf);
extern int restore_stripes(int *dest, unsigned long long *offsets,
int raid_disks, int chunk_size, int level, int layout,
int source, unsigned long long read_offset,
@@ -703,7 +708,6 @@ extern int add_dev(const char *name, const struct stat *stb, int flag, struct FT
extern int Manage_ro(char *devname, int fd, int readonly);
extern int Manage_runstop(char *devname, int fd, int runstop, int quiet);
extern int Manage_resize(char *devname, int fd, long long size, int raid_disks);
-extern int Manage_reconfig(char *devname, int fd, int layout);
extern int Manage_subdevs(char *devname, int fd,
mddev_dev_t devlist, int verbose);
extern int autodetect(void);
@@ -711,10 +715,11 @@ extern int Grow_Add_device(char *devname, int fd, char *newdev);
extern int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int write_behind, int force);
extern int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
long long size,
- int level, int layout, int chunksize, int raid_disks);
+ int level, char *layout_str, int chunksize, int raid_disks);
extern int Grow_restart(struct supertype *st, struct mdinfo *info,
int *fdlist, int cnt, char *backup_file);
-
+extern int Grow_continue(int mdfd, struct supertype *st,
+ struct mdinfo *info, char *backup_file);
extern int Assemble(struct supertype *st, char *mddev,
mddev_ident_t ident,
@@ -770,7 +775,10 @@ extern unsigned long bitmap_sectors(struct bitmap_super_s *bsb);
extern int md_get_version(int fd);
extern int get_linux_version(void);
+extern long long parse_size(char *size);
extern int parse_uuid(char *str, int uuid[4]);
+extern int parse_layout_10(char *layout);
+extern int parse_layout_faulty(char *layout);
extern int check_ext2(int fd, char *name);
extern int check_reiser(int fd, char *name);
extern int check_raid(int fd, char *name);
diff --git a/restripe.c b/restripe.c
index 29c7336..e5ecd10 100644
--- a/restripe.c
+++ b/restripe.c
@@ -23,14 +23,18 @@
*/
#include "mdadm.h"
+#include <stdint.h>
/* To restripe, we read from old geometry to a buffer, and
* read from buffer to new geometry.
- * When reading we don't worry about parity. When writing we do.
+ * When reading, we might have missing devices and so could need
+ * to reconstruct.
+ * When writing, we need to create correct parity and Q.
*
*/
-static int geo_map(int block, unsigned long long stripe, int raid_disks, int level, int layout)
+static int geo_map(int block, unsigned long long stripe, int raid_disks,
+ int level, int layout)
{
/* On the given stripe, find which disk in the array will have
* block numbered 'block'.
@@ -42,6 +46,7 @@ static int geo_map(int block, unsigned long long stripe, int raid_disks, int lev
switch(level*100 + layout) {
case 000:
case 400:
+ case 500 + ALGORITHM_PARITY_N:
/* raid 4 isn't messed around by parity blocks */
if (block == -1)
return raid_disks-1; /* parity block */
@@ -70,6 +75,65 @@ static int geo_map(int block, unsigned long long stripe, int raid_disks, int lev
if (block == -1) return pd;
return (pd + 1 + block) % raid_disks;
+ case 500 + ALGORITHM_PARITY_0:
+ return block + 1;
+
+
+ case 600 + ALGORITHM_PARITY_N_6:
+ if (block == -2)
+ return raid_disks - 1;
+ if (block == -1)
+ return raid_disks - 2; /* parity block */
+ return block;
+ case 600 + ALGORITHM_LEFT_ASYMMETRIC_6:
+ if (block == -2)
+ return raid_disks - 1;
+ raid_disks--;
+ pd = (raid_disks-1) - stripe % raid_disks;
+ if (block == -1) return pd;
+ if (block >= pd)
+ block++;
+ return block;
+
+ case 600 + ALGORITHM_RIGHT_ASYMMETRIC_6:
+ if (block == -2)
+ return raid_disks - 1;
+ raid_disks--;
+ pd = stripe % raid_disks;
+ if (block == -1) return pd;
+ if (block >= pd)
+ block++;
+ return block;
+
+ case 600 + ALGORITHM_LEFT_SYMMETRIC_6:
+ if (block == -2)
+ return raid_disks - 1;
+ raid_disks--;
+ pd = (raid_disks - 1) - stripe % raid_disks;
+ if (block == -1) return pd;
+ return (pd + 1 + block) % raid_disks;
+
+ case 600 + ALGORITHM_RIGHT_SYMMETRIC_6:
+ if (block == -2)
+ return raid_disks - 1;
+ raid_disks--;
+ pd = stripe % raid_disks;
+ if (block == -1) return pd;
+ return (pd + 1 + block) % raid_disks;
+
+ case 600 + ALGORITHM_PARITY_0_6:
+ if (block == -2)
+ return raid_disks - 1;
+ return block + 1;
+
+
+ case 600 + ALGORITHM_PARITY_0:
+ if (block == -1)
+ return 0;
+ if (block == -2)
+ return 1;
+ return block + 2;
+
case 600 + ALGORITHM_LEFT_ASYMMETRIC:
pd = raid_disks - 1 - (stripe % raid_disks);
if (block == -1) return pd;
@@ -80,6 +144,8 @@ static int geo_map(int block, unsigned long long stripe, int raid_disks, int lev
return block+2;
return block;
+ case 600 + ALGORITHM_ROTATING_ZERO_RESTART:
+ /* Different order for calculating Q, otherwise same as ... */
case 600 + ALGORITHM_RIGHT_ASYMMETRIC:
pd = stripe % raid_disks;
if (block == -1) return pd;
@@ -101,9 +167,43 @@ static int geo_map(int block, unsigned long long stripe, int raid_disks, int lev
if (block == -1) return pd;
if (block == -2) return (pd+1) % raid_disks;
return (pd + 2 + block) % raid_disks;
+
+
+ case 600 + ALGORITHM_ROTATING_N_RESTART:
+ /* Same as left_asymmetric, but first stripe is
+ * D D D P Q rather than
+ * Q D D D P
+ */
+ pd = raid_disks - 1 - ((stripe + 1) % raid_disks);
+ if (block == -1) return pd;
+ if (block == -2) return (pd+1) % raid_disks;
+ if (pd == raid_disks - 1)
+ return block+1;
+ if (block >= pd)
+ return block+2;
+ return block;
+
+ case 600 + ALGORITHM_ROTATING_N_CONTINUE:
+ /* Same as left_symmetric but Q is before P */
+ pd = raid_disks - 1 - (stripe % raid_disks);
+ if (block == -1) return pd;
+ if (block == -2) return (pd+raid_disks-1) % raid_disks;
+ return (pd + 1 + block) % raid_disks;
}
return -1;
}
+static int is_ddf(int layout)
+{
+ switch (layout)
+ {
+ default:
+ return 0;
+ case ALGORITHM_ROTATING_N_CONTINUE:
+ case ALGORITHM_ROTATING_N_RESTART:
+ case ALGORITHM_ROTATING_ZERO_RESTART:
+ return 1;
+ }
+}
static void xor_blocks(char *target, char **sources, int disks, int size)
@@ -118,10 +218,10 @@ static void xor_blocks(char *target, char **sources, int disks, int size)
}
}
-static void qsyndrome(char *p, char *q, char **sources, int disks, int size)
+static void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size)
{
int d, z;
- char wq0, wp0, wd0, w10, w20;
+ uint8_t wq0, wp0, wd0, w10, w20;
for ( d = 0; d < size; d++) {
wq0 = wp0 = sources[disks-1][d];
for ( z = disks-2 ; z >= 0 ; z-- ) {
@@ -138,50 +238,267 @@ static void qsyndrome(char *p, char *q, char **sources, int disks, int size)
}
}
+
+/*
+ * The following was taken from linux/drivers/md/mktables.c, and modified
+ * to create in-memory tables rather than C code
+ */
+static uint8_t gfmul(uint8_t a, uint8_t b)
+{
+ uint8_t v = 0;
+
+ while (b) {
+ if (b & 1)
+ v ^= a;
+ a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
+ b >>= 1;
+ }
+
+ return v;
+}
+
+static uint8_t gfpow(uint8_t a, int b)
+{
+ uint8_t v = 1;
+
+ b %= 255;
+ if (b < 0)
+ b += 255;
+
+ while (b) {
+ if (b & 1)
+ v = gfmul(v, a);
+ a = gfmul(a, a);
+ b >>= 1;
+ }
+
+ return v;
+}
+
+int tables_ready = 0;
+uint8_t raid6_gfmul[256][256];
+uint8_t raid6_gfexp[256];
+uint8_t raid6_gfinv[256];
+uint8_t raid6_gfexi[256];
+void make_tables(void)
+{
+ int i, j;
+ uint8_t v;
+
+ /* Compute multiplication table */
+ for (i = 0; i < 256; i++)
+ for (j = 0; j < 256; j++)
+ raid6_gfmul[i][j] = gfmul(i, j);
+
+ /* Compute power-of-2 table (exponent) */
+ v = 1;
+ for (i = 0; i < 256; i++) {
+ raid6_gfexp[i] = v;
+ v = gfmul(v, 2);
+ if (v == 1)
+ v = 0; /* For entry 255, not a real entry */
+ }
+
+ /* Compute inverse table x^-1 == x^254 */
+ for (i = 0; i < 256; i++)
+ raid6_gfinv[i] = gfpow(i, 254);
+
+ /* Compute inv(2^x + 1) (exponent-xor-inverse) table */
+ for (i = 0; i < 256; i ++)
+ raid6_gfexi[i] = raid6_gfinv[raid6_gfexp[i] ^ 1];
+
+ tables_ready = 1;
+}
+
+uint8_t *zero;
+/* Following was taken from linux/drivers/md/raid6recov.c */
+
+/* Recover two failed data blocks. */
+void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
+ uint8_t **ptrs)
+{
+ uint8_t *p, *q, *dp, *dq;
+ uint8_t px, qx, db;
+ const uint8_t *pbmul; /* P multiplier table for B data */
+ const uint8_t *qmul; /* Q multiplier table (for both) */
+
+ p = ptrs[disks-2];
+ q = ptrs[disks-1];
+
+ /* Compute syndrome with zero for the missing data pages
+ Use the dead data pages as temporary storage for
+ delta p and delta q */
+ dp = ptrs[faila];
+ ptrs[faila] = zero;
+ dq = ptrs[failb];
+ ptrs[failb] = zero;
+
+ qsyndrome(dp, dq, ptrs, disks-2, bytes);
+
+ /* Restore pointer table */
+ ptrs[faila] = dp;
+ ptrs[failb] = dq;
+
+ /* Now, pick the proper data tables */
+ pbmul = raid6_gfmul[raid6_gfexi[failb-faila]];
+ qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]];
+
+ /* Now do it... */
+ while ( bytes-- ) {
+ px = *p ^ *dp;
+ qx = qmul[*q ^ *dq];
+ *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */
+ *dp++ = db ^ px; /* Reconstructed A */
+ p++; q++;
+ }
+}
+
+/* Recover failure of one data block plus the P block */
+void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs)
+{
+ uint8_t *p, *q, *dq;
+ const uint8_t *qmul; /* Q multiplier table */
+
+ p = ptrs[disks-2];
+ q = ptrs[disks-1];
+
+ /* Compute syndrome with zero for the missing data page
+ Use the dead data page as temporary storage for delta q */
+ dq = ptrs[faila];
+ ptrs[faila] = zero;
+
+ qsyndrome(p, dq, ptrs, disks-2, bytes);
+
+ /* Restore pointer table */
+ ptrs[faila] = dq;
+
+ /* Now, pick the proper data tables */
+ qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+ /* Now do it... */
+ while ( bytes-- ) {
+ *p++ ^= *dq = qmul[*q ^ *dq];
+ q++; dq++;
+ }
+}
+
/* Save data:
* We are given:
- * A list of 'fds' of the active disks. For now we require all to be present.
+ * A list of 'fds' of the active disks. Some may be absent.
* A geometry: raid_disks, chunk_size, level, layout
* A list of 'fds' for mirrored targets. They are already seeked to
* right (Write) location
- * A start and length
+ * A start and length which must be stripe-aligned
+ * 'buf' is large enough to hold one stripe, and is aligned
*/
int save_stripes(int *source, unsigned long long *offsets,
int raid_disks, int chunk_size, int level, int layout,
int nwrites, int *dest,
- unsigned long long start, unsigned long long length)
+ unsigned long long start, unsigned long long length,
+ char *buf)
{
- char abuf[8192+512];
- char *buf = (char*)(((unsigned long)abuf+511)&~511UL);
- int cpos = start % chunk_size; /* where in chunk we are up to */
int len;
int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2);
int disk;
+ int i;
+
+ if (!tables_ready)
+ make_tables();
+
+ if (zero == NULL) {
+ zero = malloc(chunk_size);
+ memset(zero, 0, chunk_size);
+ }
+ len = data_disks * chunk_size;
while (length > 0) {
- unsigned long long offset;
- int i;
- len = chunk_size - cpos;
- if (len > 8192) len = 8192;
- if (len > length) len = length;
- /* len bytes to be moved from one device */
-
- offset = (start/chunk_size/data_disks)*chunk_size + cpos;
- disk = start/chunk_size % data_disks;
- disk = geo_map(disk, start/chunk_size/data_disks,
- raid_disks, level, layout);
- if (lseek64(source[disk], offsets[disk]+offset, 0) < 0)
- return -1;
- if (read(source[disk], buf, len) != len)
+ int failed = 0;
+ int fdisk[3], fblock[3];
+ for (disk = 0; disk < raid_disks ; disk++) {
+ unsigned long long offset;
+ int dnum;
+
+ offset = (start/chunk_size/data_disks)*chunk_size;
+ dnum = geo_map(disk < data_disks ? disk : data_disks - disk - 1,
+ start/chunk_size/data_disks,
+ raid_disks, level, layout);
+ if (dnum < 0) abort();
+ if (source[dnum] < 0 ||
+ lseek64(source[dnum], offsets[disk]+offset, 0) < 0 ||
+ read(source[dnum], buf+disk * chunk_size, chunk_size)
+ != chunk_size)
+ if (failed <= 2) {
+ fdisk[failed] = dnum;
+ fblock[failed] = disk;
+ failed++;
+ }
+ }
+ if (failed == 0 || fblock[0] >= data_disks)
+ /* all data disks are good */
+ ;
+ else if (failed == 1 || fblock[1] >= data_disks+1) {
+ /* one failed data disk and good parity */
+ char *bufs[data_disks];
+ for (i=0; i < data_disks; i++)
+ if (fblock[0] == i)
+ bufs[i] = buf + data_disks*chunk_size;
+ else
+ bufs[i] = buf + i*chunk_size;
+
+ xor_blocks(buf + fblock[0]*chunk_size,
+ bufs, data_disks, chunk_size);
+ } else if (failed > 2 || level != 6)
+ /* too much failure */
return -1;
+ else {
+ /* RAID6 computations needed. */
+ uint8_t *bufs[data_disks+4];
+ int qdisk;
+ int syndrome_disks;
+ disk = geo_map(-1, start/chunk_size/data_disks,
+ raid_disks, level, layout);
+ qdisk = geo_map(-2, start/chunk_size/data_disks,
+ raid_disks, level, layout);
+ if (is_ddf(layout)) {
+ /* q over 'raid_disks' blocks, in device order.
+ * 'p' and 'q' get to be all zero
+ */
+ for (i = 0; i < raid_disks; i++)
+ if (i == disk || i == qdisk)
+ bufs[i] = zero;
+ else
+ bufs[i] = (uint8_t*)buf+i*chunk_size;
+ syndrome_disks = raid_disks;
+ } else {
+ /* for md, q is over 'data_disks' blocks,
+ * starting immediately after 'q'
+ */
+ for (i = 0; i < data_disks; i++)
+ bufs[i] = (uint8_t*)buf + chunk_size * ((qdisk+1+i) % raid_disks);
+
+ fdisk[0] = (qdisk + 1 + fdisk[0]) % raid_disks;
+ fdisk[1] = (qdisk + 1 + fdisk[1]) % raid_disks;
+ syndrome_disks = data_disks;
+ }
+ bufs[syndrome_disks] = (uint8_t*)buf + chunk_size * disk;
+ bufs[syndrome_disks+1] = (uint8_t*)buf + chunk_size * qdisk;
+ if (fblock[1] == data_disks)
+ /* One data failed, and parity failed */
+ raid6_datap_recov(syndrome_disks+2, chunk_size,
+ fdisk[0], bufs);
+ else
+ /* Two data blocks failed, P,Q OK */
+ raid6_2data_recov(syndrome_disks+2, chunk_size,
+ fdisk[0], fdisk[1], bufs);
+ }
+
for (i=0; i<nwrites; i++)
if (write(dest[i], buf, len) != len)
return -1;
+
length -= len;
start += len;
- cpos += len;
- while (cpos >= chunk_size) cpos -= chunk_size;
}
return 0;
}
@@ -202,17 +519,25 @@ int restore_stripes(int *dest, unsigned long long *offsets,
int source, unsigned long long read_offset,
unsigned long long start, unsigned long long length)
{
- char *stripe_buf = malloc(raid_disks * chunk_size);
+ char *stripe_buf;
char **stripes = malloc(raid_disks * sizeof(char*));
char **blocks = malloc(raid_disks * sizeof(char*));
int i;
- int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2);
+ int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 1 : 2);
- if (stripe_buf == NULL || stripes == NULL || blocks == NULL) {
+ posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size);
+ if (zero == NULL) {
+ zero = malloc(chunk_size);
+ if (zero)
+ memset(zero, 0, chunk_size);
+ }
+ if (stripe_buf == NULL || stripes == NULL || blocks == NULL
+ || zero == NULL) {
free(stripe_buf);
free(stripes);
free(blocks);
+ free(zero);
return -2;
}
for (i=0; i<raid_disks; i++)
@@ -221,12 +546,12 @@ int restore_stripes(int *dest, unsigned long long *offsets,
int len = data_disks * chunk_size;
unsigned long long offset;
int disk, qdisk;
+ int syndrome_disks;
if (length < len)
return -3;
for (i=0; i < data_disks; i++) {
int disk = geo_map(i, start/chunk_size/data_disks,
raid_disks, level, layout);
- blocks[i] = stripes[disk];
if (lseek64(source, read_offset, 0) != read_offset)
return -1;
if (read(source, stripes[disk], chunk_size) != chunk_size)
@@ -240,6 +565,8 @@ int restore_stripes(int *dest, unsigned long long *offsets,
case 5:
disk = geo_map(-1, start/chunk_size/data_disks,
raid_disks, level, layout);
+ for (i = 0; i < data_disks; i++)
+ blocks[i] = stripes[(disk+1+i) % raid_disks];
xor_blocks(stripes[disk], blocks, data_disks, chunk_size);
break;
case 6:
@@ -247,9 +574,29 @@ int restore_stripes(int *dest, unsigned long long *offsets,
raid_disks, level, layout);
qdisk = geo_map(-2, start/chunk_size/data_disks,
raid_disks, level, layout);
-
- qsyndrome(stripes[disk], stripes[qdisk], blocks,
- data_disks, chunk_size);
+ if (is_ddf(layout)) {
+ /* q over 'raid_disks' blocks, in device order.
+ * 'p' and 'q' get to be all zero
+ */
+ for (i = 0; i < raid_disks; i++)
+ if (i == disk || i == qdisk)
+ blocks[i] = (char*)zero;
+ else
+ blocks[i] = stripes[i];
+ syndrome_disks = raid_disks;
+ } else {
+ /* for md, q is over 'data_disks' blocks,
+ * starting immediately after 'q'
+ */
+ for (i = 0; i < data_disks; i++)
+ blocks[i] = stripes[(qdisk+1+i) % raid_disks];
+
+ syndrome_disks = data_disks;
+ }
+ qsyndrome((uint8_t*)stripes[disk],
+ (uint8_t*)stripes[qdisk],
+ (uint8_t**)blocks,
+ syndrome_disks, chunk_size);
break;
}
for (i=0; i < raid_disks ; i++)
@@ -337,6 +684,7 @@ main(int argc, char *argv[])
int save;
int *fds;
char *file;
+ char *buf;
int storefd;
unsigned long long *offsets;
int raid_disks, chunk_size, level, layout;
@@ -395,11 +743,13 @@ main(int argc, char *argv[])
}
}
+ buf = malloc(raid_disks * chunk_size);
+
if (save == 1) {
int rv = save_stripes(fds, offsets,
raid_disks, chunk_size, level, layout,
1, &storefd,
- start, length);
+ start, length, buf);
if (rv != 0) {
fprintf(stderr,
"test_stripe: save_stripes returned %d\n", rv);
diff --git a/super0.c b/super0.c
index 07f4792..b84db29 100644
--- a/super0.c
+++ b/super0.c
@@ -135,7 +135,7 @@ static void examine_super0(struct supertype *st, char *homehost)
printf(" Reshape pos'n : %llu%s\n", (unsigned long long)sb->reshape_position/2, human_size((long long)sb->reshape_position<<9));
if (sb->delta_disks) {
printf(" Delta Devices : %d", sb->delta_disks);
- if (sb->delta_disks)
+ if (sb->delta_disks > 0)
printf(" (%d->%d)\n", sb->raid_disks-sb->delta_disks, sb->raid_disks);
else
printf(" (%d->%d)\n", sb->raid_disks, sb->raid_disks+sb->delta_disks);
@@ -149,6 +149,10 @@ static void examine_super0(struct supertype *st, char *homehost)
c = map_num(r5layout, sb->new_layout);
printf(" New Layout : %s\n", c?c:"-unknown-");
}
+ if (sb->level == 6) {
+ c = map_num(r6layout, sb->new_layout);
+ printf(" New Layout : %s\n", c?c:"-unknown-");
+ }
if (sb->level == 10) {
printf(" New Layout : near=%d, %s=%d\n",
sb->new_layout&255,
@@ -182,6 +186,10 @@ static void examine_super0(struct supertype *st, char *homehost)
c = map_num(r5layout, sb->layout);
printf(" Layout : %s\n", c?c:"-unknown-");
}
+ if (sb->level == 6) {
+ c = map_num(r6layout, sb->layout);
+ printf(" Layout : %s\n", c?c:"-unknown-");
+ }
if (sb->level == 10) {
printf(" Layout :");
print_r10_layout(sb->layout);
diff --git a/super1.c b/super1.c
index fee22a9..540c776 100644
--- a/super1.c
+++ b/super1.c
@@ -300,6 +300,10 @@ static void examine_super1(struct supertype *st, char *homehost)
c = map_num(r5layout, __le32_to_cpu(sb->new_layout));
printf(" New Layout : %s\n", c?c:"-unknown-");
}
+ if (__le32_to_cpu(sb->level) == 6) {
+ c = map_num(r6layout, __le32_to_cpu(sb->new_layout));
+ printf(" New Layout : %s\n", c?c:"-unknown-");
+ }
if (__le32_to_cpu(sb->level) == 10) {
printf(" New Layout :");
print_r10_layout(__le32_to_cpu(sb->new_layout));
@@ -331,6 +335,10 @@ static void examine_super1(struct supertype *st, char *homehost)
c = map_num(r5layout, __le32_to_cpu(sb->layout));
printf(" Layout : %s\n", c?c:"-unknown-");
}
+ if (__le32_to_cpu(sb->level) == 6) {
+ c = map_num(r6layout, __le32_to_cpu(sb->layout));
+ printf(" Layout : %s\n", c?c:"-unknown-");
+ }
if (__le32_to_cpu(sb->level) == 10) {
int lo = __le32_to_cpu(sb->layout);
printf(" Layout :");
diff --git a/sysfs.c b/sysfs.c
index 81ccb53..5806fa7 100644
--- a/sysfs.c
+++ b/sysfs.c
@@ -442,21 +442,28 @@ int sysfs_uevent(struct mdinfo *sra, char *event)
return 0;
}
-int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
- char *name, unsigned long long *val)
+int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev,
+ char *name)
{
char fname[50];
- char buf[50];
- int n;
int fd;
- char *ep;
+
sprintf(fname, "/sys/block/%s/md/%s/%s",
sra->sys_name, dev?dev->sys_name:"", name);
- fd = open(fname, O_RDONLY);
+ fd = open(fname, O_RDWR);
if (fd < 0)
- return -1;
+ fd = open(fname, O_RDONLY);
+ return fd;
+}
+
+int sysfs_fd_get_ll(int fd, unsigned long long *val)
+{
+ char buf[50];
+ int n;
+ char *ep;
+
+ lseek(fd, 0, 0);
n = read(fd, buf, sizeof(buf));
- close(fd);
if (n <= 0)
return -1;
buf[n] = 0;
@@ -466,25 +473,46 @@ int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
return 0;
}
-int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev,
- char *name, char *val, int size)
+int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
+ char *name, unsigned long long *val)
{
- char fname[50];
int n;
int fd;
- sprintf(fname, "/sys/block/%s/md/%s/%s",
- sra->sys_name, dev?dev->sys_name:"", name);
- fd = open(fname, O_RDONLY);
+
+ fd = sysfs_get_fd(sra, dev, name);
if (fd < 0)
return -1;
- n = read(fd, val, size);
+ n = sysfs_fd_get_ll(fd, val);
close(fd);
+ return n;
+}
+
+int sysfs_fd_get_str(int fd, char *val, int size)
+{
+ int n;
+
+ lseek(fd, 0, 0);
+ n = read(fd, val, size);
if (n <= 0)
return -1;
val[n] = 0;
return n;
}
+int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev,
+ char *name, char *val, int size)
+{
+ int n;
+ int fd;
+
+ fd = sysfs_get_fd(sra, dev, name);
+ if (fd < 0)
+ return -1;
+ n = sysfs_fd_get_str(fd, val, size);
+ close(fd);
+ return n;
+}
+
int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms)
{
unsigned long sec;
diff --git a/util.c b/util.c
index 4ccb1bb..f646bb9 100644
--- a/util.c
+++ b/util.c
@@ -149,6 +149,71 @@ int get_linux_version()
return (a*1000000)+(b*1000)+c;
}
+long long parse_size(char *size)
+{
+ /* parse 'size' which should be a number optionally
+ * followed by 'K', 'M', or 'G'.
+ * Without a suffix, K is assumed.
+ * Number returned is in sectors (half-K)
+ */
+ char *c;
+ long long s = strtoll(size, &c, 10);
+ if (s > 0) {
+ switch (*c) {
+ case 'K':
+ c++;
+ default:
+ s *= 2;
+ break;
+ case 'M':
+ c++;
+ s *= 1024 * 2;
+ break;
+ case 'G':
+ c++;
+ s *= 1024 * 1024 * 2;
+ break;
+ }
+ }
+ if (*c)
+ s = 0;
+ return s;
+}
+
+int parse_layout_10(char *layout)
+{
+ int copies, rv;
+ char *cp;
+ /* Parse the layout string for raid10 */
+ /* 'f', 'o' or 'n' followed by a number <= raid_disks */
+ if ((layout[0] != 'n' && layout[0] != 'f' && layout[0] != 'o') ||
+ (copies = strtoul(layout+1, &cp, 10)) < 1 ||
+ copies > 200 ||
+ *cp)
+ return -1;
+ if (layout[0] == 'n')
+ rv = 256 + copies;
+ else if (layout[0] == 'o')
+ rv = 0x10000 + (copies<<8) + 1;
+ else
+ rv = 1 + (copies<<8);
+ return rv;
+}
+
+int parse_layout_faulty(char *layout)
+{
+ /* Parse the layout string for 'faulty' */
+ int ln = strcspn(layout, "0123456789");
+ char *m = strdup(layout);
+ int mode;
+ m[ln] = 0;
+ mode = map_name(faultylayout, m);
+ if (mode == UnSet)
+ return -1;
+
+ return mode | (atoi(layout+ln)<< ModeShift);
+}
+
void remove_partitions(int fd)
{
/* remove partitions from this block devices.