From 7236ee7ad4a15b0d800cfc1ccbcee78158a05ada Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 11 Aug 2009 13:02:49 +1000 Subject: Handle extra 'grow' variations. UNFINISHED --- Grow.c | 1219 +++++++++++++++++++++++++++++++++++++++++++++--------------- mdadm.c | 28 +- mdadm.h | 4 + restripe.c | 5 +- sysfs.c | 23 +- 5 files changed, 959 insertions(+), 320 deletions(-) diff --git a/Grow.c b/Grow.c index c4e8cb4..6bc00b8 100644 --- a/Grow.c +++ b/Grow.c @@ -28,6 +28,7 @@ */ #include "mdadm.h" #include "dlink.h" +#include #if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN) #error no endian defined @@ -388,15 +389,20 @@ int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int */ struct mdp_backup_super { - char magic[16]; /* md_backup_data-1 */ + char magic[16]; /* md_backup_data-1 or -2 */ __u8 set_uuid[16]; __u64 mtime; /* start/sizes in 512byte sectors */ - __u64 devstart; + __u64 devstart; /* address on backup device/file of data */ __u64 arraystart; __u64 length; __u32 sb_csum; /* csum of preceeding bytes. */ - __u8 pad[512-68]; + __u32 pad1; + __u64 devstart2; /* offset in to data of second section */ + __u64 arraystart2; + __u64 length2; + __u32 sb_csum2; /* csum of preceeding bytes. */ + __u8 pad[512-68-32]; } __attribute__((aligned(512))) bsb; int bsb_csum(char *buf, int len) @@ -408,33 +414,106 @@ int bsb_csum(char *buf, int len) return __cpu_to_le32(csum); } +static int child_grow(int afd, struct mdinfo *sra, unsigned long blocks, + int *fds, unsigned long long *offsets, + int disks, int chunk, int level, int layout, int data, + int dests, int *destfd, unsigned long long *destoffsets); +static int child_shrink(int afd, struct mdinfo *sra, unsigned long blocks, + int *fds, unsigned long long *offsets, + int disks, int chunk, int level, int layout, int data, + int dests, int *destfd, unsigned long long *destoffsets); +static int child_same_size(int afd, struct mdinfo *sra, unsigned long blocks, + int *fds, unsigned long long *offsets, + int disks, int chunk, int level, int layout, int data, + int dests, int *destfd, unsigned long long *destoffsets); + +int freeze_array(struct mdinfo *sra) +{ + /* Try to freeze resync on this array. + * Return -1 if the array is busy, + * return 0 if this kernel doesn't support 'frozen' + * return 1 if it worked. + */ + char buf[20]; + if (sysfs_get_str(sra, NULL, "sync_action", buf, 20) <= 0) + return 0; + if (strcmp(buf, "idle\n") != 0 && + strcmp(buf, "frozen\n") != 0) + return -1; + if (sysfs_set_str(sra, NULL, "sync_action", "frozen") < 0) + return 0; + return 1; +} + +void unfreeze_array(struct mdinfo *sra, int frozen) +{ + /* If 'frozen' is 1, unfreeze the array */ + if (frozen > 0) + sysfs_set_str(sra, NULL, "sync_action", "idle"); +} + +void wait_reshape(struct mdinfo *sra) +{ + int fd = sysfs_get_fd(sra, NULL, "sync_action"); + char action[20]; + + do { + fd_set rfds; + FD_ZERO(&rfds); + FD_SET(fd, &rfds); + select(fd+1, NULL, NULL, &rfds, NULL); + + if (sysfs_fd_get_str(fd, action, 20) < 0) { + close(fd); + return; + } + } while (strncmp(action, "reshape", 7) == 0); +} + + int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, long long size, int level, char *layout_str, int chunksize, int raid_disks) { /* Make some changes in the shape of an array. * The kernel must support the change. - * Different reshapes have subtly different meaning for different - * levels, so we need to check the current state of the array - * and go from there. + * + * There are three different changes. Each can trigger + * a resync or recovery so we freeze that until we have + * requested everything (if kernel supports freezing - 2.6.30). + * The steps are: + * - change size (i.e. component_size) + * - change level + * - change layout/chunksize/ndisks + * + * The last can require a reshape. It is different on different + * levels so we need to check the level before actioning it. + * Some times the level change needs to be requested after the + * reshape (e.g. raid6->raid5, raid5->raid0) + * */ - struct mdu_array_info_s array; + struct mdu_array_info_s array, orig; char *c; - + int rv = 0; struct supertype *st; - int nlevel, olevel; int nchunk, ochunk; int nlayout, olayout; int ndisks, odisks; int ndata, odata; - unsigned long long nstripe, ostripe, last_block; + int orig_level = UnSet; + char alt_layout[40]; int *fdlist; unsigned long long *offsets; - int d, i, spares; + int d, i; int nrdisks; int err; - char *buf; + int frozen; + unsigned long a,b, blocks, stripes; + int cache; + unsigned long long array_size; + int changed = 0; + int done; struct mdinfo *sra; struct mdinfo *sd; @@ -444,145 +523,287 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, devname); return 1; } + sra = sysfs_read(fd, 0, GET_LEVEL); + frozen = freeze_array(sra); + if (frozen < 0) { + fprintf(stderr, Name ": %s is performing resync/recovery and cannot" + " be reshaped\n", devname); + return 1; + } + + /* ========= set size =============== */ + if (size >= 0 && (size == 0 || size != array.size)) { + array.size = size; + if (array.size != size) { + /* got truncated to 32bit, write to + * component_size instead + */ + if (sra) + rv = sysfs_set_num(sra, NULL, + "component_size", size); + else + rv = -1; + } else + rv = ioctl(fd, SET_ARRAY_INFO, &array); + if (rv != 0) { + fprintf(stderr, Name ": Cannot set device size for %s: %s\n", + devname, strerror(errno)); + rv = 1; + goto release; + } + ioctl(fd, GET_ARRAY_INFO, &array); + if (!quiet) + fprintf(stderr, Name ": component size of %s has been set to %dK\n", + devname, array.size); + changed = 1; + } + + /* ======= set level =========== */ + if (level != UnSet && level != array.level) { + /* Trying to change the level. + * We might need to change layout first and schedule a + * level change for later. + * Level changes that can happen immediately are: + * 0->4,5,6 1->5 4->5,6 5->1,6 + * Level changes that need a layout change first are: + * 6->5,4,0 : need a -6 layout, or parity-last + * 5->4,0 : need parity-last + */ + if ((array.level == 6 || array.level == 5) && + (level == 5 || level == 4 || level == 0)) { + /* Don't change level yet, but choose intermediate + * layout + */ + if (level == 5) { + if (layout_str == NULL) + switch (array.layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + case ALGORITHM_LEFT_ASYMMETRIC_6: + case ALGORITHM_ROTATING_N_RESTART: + layout_str = "left-asymmetric-6"; + break; + case ALGORITHM_LEFT_SYMMETRIC: + case ALGORITHM_LEFT_SYMMETRIC_6: + case ALGORITHM_ROTATING_N_CONTINUE: + layout_str = "left-symmetric-6"; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + case ALGORITHM_RIGHT_ASYMMETRIC_6: + case ALGORITHM_ROTATING_ZERO_RESTART: + layout_str = "right-asymmetric-6"; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + case ALGORITHM_RIGHT_SYMMETRIC_6: + layout_str = "right-symmetric-6"; + break; + case ALGORITHM_PARITY_0: + case ALGORITHM_PARITY_0_6: + layout_str = "parity-first-6"; + break; + case ALGORITHM_PARITY_N: + layout_str = "parity-last"; + break; + default: + fprintf(stderr, Name ": %s: cannot" + "convert layout to RAID5 equivalent\n", + devname); + rv = 1; + goto release; + } + else { + int l = map_name(r5layout, layout_str); + if (l == UnSet) { + fprintf(stderr, Name ": %s: layout '%s' not recognised\n", + devname, layout_str); + rv = 1; + goto release; + } + if (l != ALGORITHM_PARITY_N) { + /* need the -6 version */ + char *ls = map_num(r5layout, l); + strcat(strcpy(alt_layout, ls), + "-6"); + layout_str = alt_layout; + } + } + if (raid_disks) + /* The find raid6->raid5 conversion + * will reduce the number of disks, + * so now we need to aim higher + */ + raid_disks++; + } else + layout_str = "parity-last"; + } else { + c = map_num(pers, level); + if (c == NULL) + return 1;/* not possible */ + err = sysfs_set_str(sra, NULL, "level", c); + if (err) { + fprintf(stderr, Name ": %s: could not set level to %s\n", + devname, c); + rv = 1; + goto release; + } + orig = array; + orig_level = orig.level; + ioctl(fd, GET_ARRAY_INFO, &array); + if (layout_str == NULL && + orig.level == 5 && level == 6 && + array.layout != orig.layout) + layout_str = map_num(r5layout, orig.layout); + if (!quiet) + fprintf(stderr, Name " level of %s changed to %s\n", + devname, c); + changed = 1; + } + } + + /* ========= set shape (chunk_size / layout / ndisks) ============== */ + /* Check if layout change is a no-op */ + if (layout_str) switch(array.level) { + case 5: + if (array.layout == map_name(r5layout, layout_str)) + layout_str = NULL; + break; + case 6: + if (layout_str == NULL && + ((chunksize && chunksize * 1024 != array.chunk_size) || + (raid_disks && raid_disks != array.raid_disks)) && + array.layout >= 16) { + fprintf(stderr, Name + ": %s has a non-standard layout. If you wish to preserve this\n" + " during the reshape, please specify --layout=preserve\n" + " If you want to change it, specify a layout or use --layout=normalise\n", + devname); + rv = 1; + goto release; + } + if (strcmp(layout_str, "normalise") == 0 || + strcmp(layout_str, "normalize") == 0) { + char *hyphen; + strcpy(alt_layout, map_num(r6layout, array.layout)); + hyphen = strrchr(alt_layout, '-'); + if (hyphen && strcmp(hyphen, "-6") == 0) { + *hyphen = 0; + layout_str = alt_layout; + } + } + + if (array.layout == map_name(r6layout, layout_str)) + layout_str = NULL; + if (layout_str && strcmp(layout_str, "preserve") == 0) + layout_str = NULL; + break; + } + if (layout_str == NULL + && (chunksize == 0 || chunksize*1024 == array.chunk_size) + && (raid_disks == 0 || raid_disks == array.raid_disks)) { + rv = 0; + if (level != UnSet && level != array.level) { + /* Looks like this level change doesn't need + * a reshape after all. + */ + c = map_num(pers, level); + if (c) { + rv = sysfs_set_str(sra, NULL, "level", c); + if (rv) + fprintf(stderr, Name ": %s: could not set level to %s\n", + devname, c); + } + } else if (!changed && !quiet) + fprintf(stderr, Name ": %s: no change requested\n", + devname); + goto release; + } + c = map_num(pers, array.level); if (c == NULL) c = "-unknown-"; switch(array.level) { default: /* raid0, linear, multipath cannot be reconfigured */ fprintf(stderr, Name ": %s array %s cannot be reshaped.\n", c, devname); - return 1; + rv = 1; + break; case LEVEL_FAULTY: /* only 'layout' change is permitted */ - if (size >= 0) { - fprintf(stderr, Name ": %s: Cannot change size of a 'faulty' array\n", - devname); - return 1; - } - if (level != UnSet && level != LEVEL_FAULTY) { - fprintf(stderr, Name ": %s: Cannot change RAID level of a 'faulty' array\n", - devname); - return 1; - } if (chunksize || raid_disks) { fprintf(stderr, Name ": %s: Cannot change chunksize or disks of a 'faulty' array\n", devname); - return 1; + rv = 1; + break; } if (layout_str == NULL) - return 0; /* nothing to do.... */ + break; /* nothing to do.... */ array.layout = parse_layout_faulty(layout_str); if (array.layout < 0) { + int rv; fprintf(stderr, Name ": %s: layout %s not understood for 'faulty' array\n", devname, layout_str); - return 1; + rv = 1; + break; } if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { fprintf(stderr, Name ": Cannot set layout for %s: %s\n", devname, strerror(errno)); - return 1; - } - if (!quiet) + rv = 1; + } else if (!quiet) printf("layout for %s set to %d\n", devname, array.layout); - return 0; + break; - case 1: /* raid_disks and size can each be changed. They are independant */ + case 1: /* only raid_disks can each be changed. */ - if (level != UnSet && level != 1) { - fprintf(stderr, Name ": %s: Cannot change RAID level of a RAID1 array.\n", - devname); - return 1; - } if (chunksize || layout_str != NULL) { - fprintf(stderr, Name ": %s: Cannot change chunk size of layout for a RAID1 array.\n", + fprintf(stderr, Name ": %s: Cannot change chunk size or layout for a RAID1 array.\n", devname); - return 1; + rv = 1; + break; } - - /* Each can trigger a resync/recovery which will block the - * other from happening. Later we could block - * resync for the duration via 'sync_action'... - */ if (raid_disks > 0) { array.raid_disks = raid_disks; if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { fprintf(stderr, Name ": Cannot set raid-devices for %s: %s\n", devname, strerror(errno)); - return 1; - } - } - if (size >= 0) { - int rv; - array.size = size; - if (array.size != size) { - /* got truncated to 32bit, write to - * component_size instead - */ - sra = sysfs_read(fd, 0, 0); - if (sra) - rv = sysfs_set_num(sra, NULL, - "component_size", size); - else - rv = -1; - } else - rv = ioctl(fd, SET_ARRAY_INFO, &array); - if (rv != 0) { - fprintf(stderr, Name ": Cannot set device size for %s: %s\n", - devname, strerror(errno)); - return 1; + rv = 1; } } - return 0; + break; case 4: case 5: case 6: - st = super_by_fd(fd); - /* size can be changed independently. - * layout/chunksize/raid_disks/level can be changed + /* + * layout/chunksize/raid_disks can be changed * though the kernel may not support it all. - * If 'suspend_lo' is not present in devfs, then - * these cannot be changed. */ - if (size >= 0) { - /* Cannot change other details as well.. */ - if (layout_str != NULL || - chunksize != 0 || - raid_disks != 0 || - level != UnSet) { - fprintf(stderr, Name ": %s: Cannot change shape as well as size of a %s array.\n", - devname, c); - return 1; - } - array.size = size; - if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { - fprintf(stderr, Name ": Cannot set device size/shape for %s: %s\n", - devname, strerror(errno)); - return 1; - } - return 0; - } - /* Ok, just change the shape. This can be awkward. - * There are three possibilities. - * 1/ The array will shrink. We don't support this - * possibility. Maybe one day... - * 2/ The array will not change size. This is easy enough - * to do, but not reliably. If the process is aborted - * the array *will* be corrupted. So maybe we can allow - * this but only if the user is really certain. e.g. - * --really-risk-everything - * 3/ The array will grow. This can be reliably achieved. + st = super_by_fd(fd); + + /* + * There are three possibilities. + * 1/ The array will shrink. + * We need to ensure the reshape will pause before reaching + * the 'critical section'. We also need to fork and wait for + * that to happen. When it does we + * suspend/backup/complete/unfreeze + * + * 2/ The array will not change size. + * This requires that we keep a backup of a sliding window + * so that we can restore data after a crash. So we need + * to fork and monitor progress. + * + * 3/ The array will grow. This is relatively easy. * However the kernel's restripe routines will cheerfully * overwrite some early data before it is safe. So we * need to make a backup of the early parts of the array * and be ready to restore it if rebuild aborts very early. * - * We backup data by writing it to all spares (there must be - * at least 1, so even raid6->raid5 requires a spare to be - * present). + * We backup data by writing it to one spare, or to a + * file which was given on command line. * + * [FOLLOWING IS OLD AND PARTLY WRONG] * So: we enumerate the devices in the array and * make sure we can open all of them. * Then we freeze the early part of the array and @@ -592,21 +813,27 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, * and finally invalidate the copied data and unfreeze the * start of the array. * - * Before we can do this we need to decide: - * - will the array grow? Just calculate size - * - how much needs to be saved: count stripes. - * - where to save data... good question. - * + * In each case, we first make sure that storage is available + * for the required backup. + * Then we: + * - request the shape change. + * - for to handle backup etc. */ - nlevel = olevel = array.level; nchunk = ochunk = array.chunk_size; nlayout = olayout = array.layout; ndisks = odisks = array.raid_disks; - if (level != UnSet) nlevel = level; - if (chunksize) nchunk = chunksize; + if (chunksize) { + nchunk = chunksize * 1024; + if (array.size % chunksize) { + fprintf(stderr, Name ": component size %dK is not" + " a multiple of chunksize %dK\n", + array.size, chunksize); + break; + } + } if (layout_str != NULL) - switch(nlevel) { + switch(array.level) { case 4: /* ignore layout */ break; case 5: @@ -630,54 +857,64 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, if (raid_disks) ndisks = raid_disks; odata = odisks-1; - if (olevel == 6) odata--; /* number of data disks */ ndata = ndisks-1; - if (nlevel == 6) ndata--; - - if (ndata < odata) { - fprintf(stderr, Name ": %s: Cannot reduce number of data disks (yet).\n", - devname); - return 1; + if (array.level == 6) { + odata--; /* number of data disks */ + ndata--; } - if (ndata == odata) { - fprintf(stderr, Name ": %s: Cannot reshape array without increasing size (yet).\n", - devname); - return 1; + + /* Check that we can hold all the data */ + size = ndata * array.size; + get_dev_size(fd, NULL, &array_size); + if (size < (array_size/1024)) { + fprintf(stderr, Name ": this change will reduce the size of the array.\n" + " use --grow --array-size first to truncate array.\n" + " e.g. mdadm --grow %s --array-size %llu\n", + devname, size); + rv = 1; + break; } - /* Well, it is growing... so how much do we need to backup. - * Need to backup a full number of new-stripes, such that the - * last one does not over-write any place that it would be read - * from + + /* So how much do we need to backup. + * We need an amount of data which is both a whole number of + * old stripes and a whole number of new stripes. + * So LCM for (chunksize*datadisks). */ - nstripe = ostripe = 0; - while (nstripe >= ostripe) { - nstripe += nchunk/512; - last_block = nstripe * ndata; - ostripe = last_block / odata / (ochunk/512) * (ochunk/512); + a = ochunk/512 * odata; + b = nchunk/512 * ndata; + /* Find GCD */ + while (a != b) { + if (a < b) + b -= a; + if (b < a) + a -= b; } - fprintf(stderr, Name ": Need to backup %lluK of critical " - "section..\n", last_block/2); + /* LCM == product / GCD */ + blocks = ochunk/512 * nchunk/512 * odata * ndata / a; + + if (ndata == odata) + blocks *= 16; + else + fprintf(stderr, Name ": Need to backup %luK of critical " + "section..\n", blocks/2); + sysfs_free(sra); sra = sysfs_read(fd, 0, GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE| GET_CACHE); if (!sra) { fprintf(stderr, Name ": %s: Cannot get array details from sysfs\n", devname); - return 1; + rv = 1; + break; } - if (last_block >= sra->component_size/2) { + if (blocks >= sra->component_size/2) { fprintf(stderr, Name ": %s: Something wrong - reshape aborted\n", devname); - return 1; - } - if (sra->array.spare_disks == 0 && backup_file == NULL) { - fprintf(stderr, Name ": %s: Cannot grow - need a spare or backup-file to backup critical section\n", - devname); - return 1; + rv = 1; + break; } - nrdisks = array.nr_disks + sra->array.spare_disks; /* Now we need to open all these devices so we can read/write. */ @@ -685,7 +922,8 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, offsets = malloc((1+nrdisks) * sizeof(offsets[0])); if (!fdlist || !offsets) { fprintf(stderr, Name ": malloc failed: grow aborted\n"); - return 1; + rv = 1; + break; } for (d=0; d <= nrdisks; d++) fdlist[d] = -1; @@ -698,215 +936,600 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, sd->disk.minor, 1); fdlist[sd->disk.raid_disk] = dev_open(dn, O_RDONLY); - offsets[sd->disk.raid_disk] = sd->data_offset; + offsets[sd->disk.raid_disk] = sd->data_offset*512; if (fdlist[sd->disk.raid_disk] < 0) { fprintf(stderr, Name ": %s: cannot open component %s\n", devname, dn?dn:"-unknown-"); - goto abort; + rv = 1; + goto release; } - } else { + } else if (backup_file == NULL) { /* spare */ char *dn = map_dev(sd->disk.major, sd->disk.minor, 1); fdlist[d] = dev_open(dn, O_RDWR); - offsets[d] = sd->data_offset; + offsets[d] = (sra->component_size - blocks - 8)*512; if (fdlist[d]<0) { fprintf(stderr, Name ": %s: cannot open component %s\n", devname, dn?dn:"-unknown"); - goto abort; + rv = 1; + goto release; } d++; } } - for (i=0 ; iarray.spare_disks == 0) { + fprintf(stderr, Name ": %s: Cannot grow - need a spare or " + "backup-file to backup critical section\n", + devname); + rv = 1; + break; + } + if (d == array.raid_disks) { + fprintf(stderr, Name ": %s: No spare device for backup\n", + devname); + rv = 1; + break; } - spares = sra->array.spare_disks; - if (backup_file) { - fdlist[d] = open(backup_file, O_RDWR|O_CREAT|O_EXCL, S_IRUSR | S_IWUSR); + } else { + /* need to check backup file is large enough */ + char buf[512]; + fdlist[d] = open(backup_file, O_RDWR|O_CREAT|O_EXCL, + S_IRUSR | S_IWUSR); + offsets[d] = 8 * 512; if (fdlist[d] < 0) { fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n", devname, backup_file, strerror(errno)); - goto abort; + rv = 1; + break; + } + memset(buf, 0, 512); + for (i=0; i < blocks + 1 ; i++) { + if (write(fdlist[d], buf, 512) != 512) { + fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n", + devname, backup_file, strerror(errno)); + rv = 1; + break; + } + } + if (fsync(fdlist[d]) != 0) { + fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n", + devname, backup_file, strerror(errno)); + rv = 1; + break; } - offsets[d] = 8; d++; - spares++; } - if (fdlist[array.raid_disks] < 0) { - fprintf(stderr, Name ": %s: failed to find a spare and no backup-file given - --grow aborted\n", - devname); - goto abort; + + /* lastly, check that the internal stripe cache is + * large enough, or it won't work. + */ + + cache = (nchunk < ochunk) ? ochunk : nchunk; + cache = cache * 4 / 4096; + if (sra->cache_size < cache) + sysfs_set_num(sra, NULL, "stripe_cache_size", + cache+1); + /* Right, everything seems fine. Let's kick things off. + * If only changing raid_disks, use ioctl, else use + * sysfs. + */ + if (ochunk == nchunk && olayout == nlayout) { + array.raid_disks = ndisks; + if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { + rv = 1; + fprintf(stderr, Name ": Cannot set device shape for %s: %s\n", + devname, strerror(errno)); + if (ndisks < odisks && + get_linux_version() < 2006030) + fprintf(stderr, Name ": linux 2.6.30 or later required\n"); + + break; + } + } else { + /* set them all just in case some old 'new_*' value + * persists from some earlier problem + */ + if (sysfs_set_num(sra, NULL, "chunk_size", nchunk) < 0) + rv = 1; + if (sysfs_set_num(sra, NULL, "layout", nlayout) < 0) + rv = 1; + if (sysfs_set_num(sra, NULL, "raid_disks", ndisks) < 0) + rv = 1; + if (rv) { + fprintf(stderr, Name ": Cannot set device shape for %s\n", + devname); + if (get_linux_version() < 2006030) + fprintf(stderr, Name ": linux 2.6.30 or later required\n"); + break; + } } + if (ndisks == 2 && odisks == 2) { + /* No reshape is needed in this trivial case */ + rv = 0; + break; + } + + /* set up the backup-super-block. This requires the + * uuid from the array. + */ /* Find a superblock */ - if (st->ss->load_super(st, fdlist[0], NULL)) { + for (sd = sra->devs; sd; sd = sd->next) { + char *dn; + int devfd; + int ok; + if (sd->disk.state & (1<disk.major, sd->disk.minor, 1); + devfd = dev_open(dn, O_RDONLY); + if (devfd < 0) + continue; + ok = st->ss->load_super(st, devfd, NULL); + close(devfd); + if (ok >= 0) + break; + } + if (!sd) { fprintf(stderr, Name ": %s: Cannot find a superblock\n", devname); - goto abort; + rv = 1; + break; } - + memset(&bsb, 0, 512); memcpy(bsb.magic, "md_backup_data-1", 16); st->ss->uuid_from_super(st, (int*)&bsb.set_uuid); bsb.mtime = __cpu_to_le64(time(0)); - bsb.arraystart = 0; - bsb.length = __cpu_to_le64(last_block); - - /* Decide offset for the backup, llseek the spares, and write - * a leading superblock 4K earlier. + bsb.devstart2 = blocks; + stripes = blocks / (ochunk/512) / odata; + /* Now we just need to kick off the reshape and watch, while + * handling backups of the data... + * This is all done by a forked background process. */ - for (i=array.raid_disks; icomponent_size - last_block - 8; - if (lseek64(fdlist[i], (offsets[i]<<9) - 4096, 0) - != (offsets[i]<<9) - 4096) { - fprintf(stderr, Name ": could not seek...\n"); - goto abort; - } - memset(buf, 0, 4096); - bsb.devstart = __cpu_to_le64(offsets[i]); - bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb)); - memcpy(buf, &bsb, sizeof(bsb)); - if (write(fdlist[i], buf, 4096) != 4096) { - fprintf(stderr, Name ": could not write leading superblock\n"); - goto abort; - } - } - array.level = nlevel; - array.raid_disks = ndisks; - array.chunk_size = nchunk; - array.layout = nlayout; - if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { - if (errno == ENOSPC) { - /* stripe cache is not big enough. - * It needs to be 4 times chunksize_size, - * and we assume pagesize is 4K + switch(fork()) { + case 0: + close(fd); + if (check_env("MDADM_GROW_VERIFY")) + fd = open(devname, O_RDONLY | O_DIRECT); + else + fd = -1; + mlockall(MCL_FUTURE); + + if (odata < ndata) + done = child_grow(fd, sra, stripes, + fdlist, offsets, + odisks, ochunk, array.level, olayout, odata, + d - odisks, fdlist+odisks, offsets+odisks); + else if (odata > ndata) + done = child_shrink(fd, sra, stripes, + fdlist, offsets, + odisks, ochunk, array.level, olayout, odata, + d - odisks, fdlist+odisks, offsets+odisks); + else + done = child_same_size(fd, sra, stripes, + fdlist, offsets, + odisks, ochunk, array.level, olayout, odata, + d - odisks, fdlist+odisks, offsets+odisks); + if (backup_file && done) + unlink(backup_file); + if (level != UnSet && level != array.level) { + /* We need to wait for the reshape to finish + * (which will have happened unless odata < ndata) + * and then set the level */ - if (sra->cache_size < 4 * (nchunk/4096)) { - sysfs_set_num(sra, NULL, - "stripe_cache_size", - 4 * (nchunk/4096) +1); - if (ioctl(fd, SET_ARRAY_INFO, - &array) == 0) - goto ok; - } + + c = map_num(pers, level); + if (c == NULL) + exit(0);/* not possible */ + + if (odata < ndata) + wait_reshape(sra); + err = sysfs_set_str(sra, NULL, "level", c); + if (err) + fprintf(stderr, Name ": %s: could not set level to %s\n", + devname, c); } - fprintf(stderr, Name ": Cannot set device size/shape for %s: %s\n", - devname, strerror(errno)); - goto abort; + exit(0); + case -1: + fprintf(stderr, Name ": Cannot run child to monitor reshape: %s\n", + strerror(errno)); + rv = 1; + break; + default: + /* The child will take care of unfreezing the array */ + frozen = 0; + break; } - ok: ; + break; - /* suspend the relevant region */ - sysfs_set_num(sra, NULL, "suspend_hi", 0); /* just in case */ - if (sysfs_set_num(sra, NULL, "suspend_lo", 0) < 0 || - sysfs_set_num(sra, NULL, "suspend_hi", last_block) < 0) { - fprintf(stderr, Name ": %s: failed to suspend device.\n", - devname); - goto abort_resume; - } + } - buf = malloc(odisks * ochunk); + release: + if (rv && orig_level != UnSet && sra) { + c = map_num(pers, orig_level); + if (c && sysfs_set_str(sra, NULL, "level", c) == 0) + fprintf(stderr, Name ": aborting level change\n"); + } + if (sra) + unfreeze_array(sra, frozen); + return rv; +} - err = save_stripes(fdlist, offsets, - odisks, ochunk, olevel, olayout, - spares, fdlist+odisks, - 0ULL, last_block*512, buf); +/* + * We run a child process in the background which performs the following + * steps: + * - wait for resync to reach a certain point + * - suspend io to the following section + * - backup that section + * - allow resync to proceed further + * - resume io + * - discard the backup. + * + * When are combined in slightly different ways in the three cases. + * Grow: + * - suspend/backup/allow/wait/resume/discard + * Shrink: + * - allow/wait/suspend/backup/allow/wait/resume/discard + * same-size: + * - wait/resume/discard/suspend/backup/allow + * + * suspend/backup/allow always come together + * wait/resume/discard do too. + * For the same-size case we have two backups to improve flow. + * + */ - /* abort if there was an error */ - if (err < 0) { - fprintf(stderr, Name ": %s: failed to save critical region\n", - devname); - goto abort_resume; - } +int grow_backup(struct mdinfo *sra, + unsigned long long offset, /* per device */ + unsigned long stripes, /* per device */ + int *sources, unsigned long long *offsets, + int disks, int chunk, int level, int layout, + int dests, int *destfd, unsigned long long *destoffsets, + int part, + char *buf) +{ + /* Backup 'blocks' sectors at 'offset' on each device of the array, + * to storage 'destfd' (offset 'destoffsets'), after first + * suspending IO. Then allow resync to continue + * over the suspended section. + * Use part 'part' of the backup-super-block. + */ + int odata = disks; + int rv = 0; + int i; + //printf("offset %llu\n", offset); + if (level >= 4) + odata--; + if (level == 6) + odata--; + sysfs_set_num(sra, NULL, "suspend_hi", (offset + stripes * chunk/512) * odata); + if (part) { + bsb.arraystart2 = __cpu_to_le64(offset * odata); + bsb.length2 = __cpu_to_le64(stripes * chunk/512 * odata); + } else { + bsb.arraystart = __cpu_to_le64(offset * odata); + bsb.length = __cpu_to_le64(stripes * chunk/512 * odata); + } + if (part) + bsb.magic[15] = '2'; + for (i = 0; i < dests; i++) + if (part) + lseek64(destfd[i], destoffsets[i] + __le64_to_cpu(bsb.devstart2)*512, 0); + else + lseek64(destfd[i], destoffsets[i], 0); + + rv = save_stripes(sources, offsets, + disks, chunk, level, layout, + dests, destfd, + offset*512*odata, stripes * chunk * odata, + buf); + + if (rv) + return rv; + for (i = 0; i < dests; i++) { + bsb.devstart = __cpu_to_le64(destoffsets[i]/512); + + bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb)); + if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0) + bsb.sb_csum2 = bsb_csum((char*)&bsb, + ((char*)&bsb.sb_csum2)-((char*)&bsb)); + + lseek64(destfd[i], destoffsets[i] - 4096, 0); + write(destfd[i], &bsb, 512); + fsync(destfd[i]); + } - for (i=odisks; i 0 && + strncmp(action, "reshape", 7) != 0) + break; + } while (completed < offset + blocks); + close(fd); + + if (part) { + bsb.arraystart2 = __cpu_to_le64(0); + bsb.length2 = __cpu_to_le64(0); + } else { + bsb.arraystart = __cpu_to_le64(0); + bsb.length = __cpu_to_le64(0); + } + for (i = 0; i < dests; i++) { + bsb.devstart = __cpu_to_le64(destoffsets[i]/512); + bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb)); + if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0) + bsb.sb_csum2 = bsb_csum((char*)&bsb, + ((char*)&bsb.sb_csum2)-((char*)&bsb)); + lseek64(destfd[i], destoffsets[i]-4096, 0); + write(destfd[i], &bsb, 512); + fsync(destfd[i]); + } + return 0; +} - /* start the reshape happening */ - if (sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) { - fprintf(stderr, Name ": %s: failed to initiate reshape\n", - devname); - goto abort_resume; +static void fail(char *msg) +{ + write(2, msg, strlen(msg)); + write(2, "\n", 1); + exit(1); +} + +static char *abuf, *bbuf; +static int abuflen; +static void validate(int afd, int bfd, unsigned long long offset) +{ + /* check that the data in the backup against the array. + * This is only used for regression testing and should not + * be used while the array is active + */ + struct mdp_backup_super bsb2; + if (afd < 0) + return; + lseek64(bfd, offset - 4096, 0); + if (read(bfd, &bsb2, 512) != 512) + fail("cannot read bsb"); + if (bsb2.sb_csum != bsb_csum((char*)&bsb2, + ((char*)&bsb2.sb_csum)-((char*)&bsb2))) + fail("first csum bad"); + if (memcmp(bsb2.magic, "md_backup_data", 14) != 0) + fail("magic is bad"); + if (memcmp(bsb2.magic, "md_backup_data-2", 16) == 0 && + bsb2.sb_csum2 != bsb_csum((char*)&bsb2, + ((char*)&bsb2.sb_csum2)-((char*)&bsb2))) + fail("second csum bad"); + + if (__le64_to_cpu(bsb2.devstart)*512 != offset) + fail("devstart is wrong"); + + if (bsb2.length) { + unsigned long long len = __le64_to_cpu(bsb2.length)*512; + + if (abuflen < len) { + free(abuf); + free(bbuf); + abuflen = len; + posix_memalign((void**)&abuf, 4096, abuflen); + posix_memalign((void**)&bbuf, 4096, abuflen); } - /* wait for reshape to pass the critical region */ - while(1) { - unsigned long long comp; - if (sysfs_get_ll(sra, NULL, "sync_completed", &comp)<0) { - sleep(5); - break; - } - if (comp >= nstripe) - break; - if (comp == 0) { - /* Maybe it finished already */ - char action[20]; - if (sysfs_get_str(sra, NULL, "sync_action", - action, 20) > 0 && - strncmp(action, "reshape", 7) != 0) + lseek64(bfd, offset, 0); + if (read(bfd, bbuf, len) != len) { + printf("len %llu\n", len); + fail("read first backup failed"); + } + lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0); + if (read(afd, abuf, len) != len) + fail("read first from array failed"); + if (memcmp(bbuf, abuf, len) != 0) { + int i; + printf("offset=%llu len=%llu\n", + __le64_to_cpu(bsb2.arraystart)*512, len); + for (i=0; i= 0) - close(fdlist[i]); - free(fdlist); - free(offsets); - if (backup_file) - unlink(backup_file); - - fprintf(stderr, Name ": ... critical section passed.\n"); - break; + lseek64(bfd, offset+__le64_to_cpu(bsb2.devstart2)*512, 0); + if (read(bfd, bbuf, len) != len) + fail("read second backup failed"); + lseek64(afd, __le64_to_cpu(bsb2.arraystart2)*512, 0); + if (read(afd, abuf, len) != len) + fail("read second from array failed"); + if (memcmp(bbuf, abuf, len) != 0) + fail("data2 compare failed"); } - return 0; +} + +static int child_grow(int afd, struct mdinfo *sra, unsigned long stripes, + int *fds, unsigned long long *offsets, + int disks, int chunk, int level, int layout, int data, + int dests, int *destfd, unsigned long long *destoffsets) +{ + char *buf; + posix_memalign((void**)&buf, 4096, disks * chunk); + sysfs_set_num(sra, NULL, "suspend_hi", 0); + sysfs_set_num(sra, NULL, "suspend_lo", 0); + grow_backup(sra, 0, stripes, + fds, offsets, disks, chunk, level, layout, + dests, destfd, destoffsets, + 0, buf); + validate(afd, destfd[0], destoffsets[0]); + if (wait_backup(sra, 0, stripes * chunk / 512, stripes * chunk / 512, + dests, destfd, destoffsets, + 0) < 0) + return 0; + sysfs_set_num(sra, NULL, "suspend_lo", (stripes * chunk/512) * data); + free(buf); + /* FIXME this should probably be numeric */ + sysfs_set_str(sra, NULL, "sync_max", "max"); + return 1; +} - abort_resume: - sysfs_set_num(sra, NULL, "suspend_lo", last_block); - abort: - for (i=0; i= 0) - close(fdlist[i]); - free(fdlist); - free(offsets); - if (backup_file) - unlink(backup_file); +static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes, + int *fds, unsigned long long *offsets, + int disks, int chunk, int level, int layout, int data, + int dests, int *destfd, unsigned long long *destoffsets) +{ + char *buf; + unsigned long long start; + int rv; + + posix_memalign((void**)&buf, 4096, disks * chunk); + start = sra->component_size - stripes * chunk/512; + sysfs_set_num(sra, NULL, "sync_max", start); + sysfs_set_str(sra, NULL, "sync_action", "reshape"); + sysfs_set_num(sra, NULL, "suspend_lo", 0); + sysfs_set_num(sra, NULL, "suspend_hi", 0); + rv = wait_backup(sra, 0, start - stripes * chunk/512, stripes * chunk/512, + dests, destfd, destoffsets, 0); + if (rv < 0) + return 0; + grow_backup(sra, 0, stripes, + fds, offsets, + disks, chunk, level, layout, + dests, destfd, destoffsets, + 0, buf); + validate(afd, destfd[0], destoffsets[0]); + rv = wait_backup(sra, start, stripes*chunk/512, 0, + dests, destfd, destoffsets, 0); + if (rv < 0) + return 0; + sysfs_set_num(sra, NULL, "suspend_lo", (stripes * chunk/512) * data); + free(buf); + /* FIXME this should probably be numeric */ + sysfs_set_str(sra, NULL, "sync_max", "max"); return 1; +} +static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes, + int *fds, unsigned long long *offsets, + int disks, int chunk, int level, int layout, int data, + int dests, int *destfd, unsigned long long *destoffsets) +{ + unsigned long long start, size; + unsigned long tailstripes = stripes; + int part; + char *buf; + unsigned long long speed; + + + posix_memalign((void**)&buf, 4096, disks * chunk); + + sysfs_set_num(sra, NULL, "suspend_lo", 0); + sysfs_set_num(sra, NULL, "suspend_hi", 0); + + sysfs_get_ll(sra, NULL, "sync_speed_min", &speed); + sysfs_set_num(sra, NULL, "sync_speed_min", 200000); + + grow_backup(sra, 0, stripes, + fds, offsets, + disks, chunk, level, layout, + dests, destfd, destoffsets, + 0, buf); + grow_backup(sra, stripes * chunk/512, stripes, + fds, offsets, + disks, chunk, level, layout, + dests, destfd, destoffsets, + 1, buf); + validate(afd, destfd[0], destoffsets[0]); + part = 0; + start = stripes * 2; /* where to read next */ + size = sra->component_size / (chunk/512); + while (start < size) { + if (wait_backup(sra, (start-stripes*2)*chunk/512, + stripes*chunk/512, 0, + dests, destfd, destoffsets, + part) < 0) + return 0; + sysfs_set_num(sra, NULL, "suspend_lo", start*chunk/512 * data); + if (start + stripes > size) + tailstripes = (size - start); + + grow_backup(sra, start*chunk/512, tailstripes, + fds, offsets, + disks, chunk, level, layout, + dests, destfd, destoffsets, + part, buf); + start += stripes; + part = 1 - part; + validate(afd, destfd[0], destoffsets[0]); + } + if (wait_backup(sra, (start-stripes*2) * chunk/512, stripes * chunk/512, 0, + dests, destfd, destoffsets, + part) < 0) + return 0; + sysfs_set_num(sra, NULL, "suspend_lo", ((start-stripes)*chunk/512) * data); + if (wait_backup(sra, (start-stripes) * chunk/512, tailstripes * chunk/512, 0, + dests, destfd, destoffsets, + 1-part) < 0) + return 0; + sysfs_set_num(sra, NULL, "suspend_lo", (size*chunk/512) * data); + sysfs_set_num(sra, NULL, "sync_speed_min", speed); + free(buf); + return 1; } /* diff --git a/mdadm.c b/mdadm.c index a77c4ad..6b571d7 100644 --- a/mdadm.c +++ b/mdadm.c @@ -329,6 +329,7 @@ int main(int argc, char *argv[]) * could depend on the mode */ #define O(a,b) ((a<<8)|b) switch (O(mode,opt)) { + case O(GROW,'c'): case O(CREATE,'c'): case O(BUILD,'c'): /* chunk or rounding */ if (chunk) { @@ -418,7 +419,7 @@ int main(int argc, char *argv[]) } continue; - case O(GROW,'l'): /* hack - needed to understand layout */ + case O(GROW,'l'): case O(CREATE,'l'): case O(BUILD,'l'): /* set raid level*/ if (level != UnSet) { @@ -1412,8 +1413,8 @@ int main(int argc, char *argv[]) if (devs_found > 1) { /* must be '-a'. */ - if (size >= 0 || raiddisks) { - fprintf(stderr, Name ": --size, --raiddisks, and --add are exclusing in --grow mode\n"); + if (size >= 0 || raiddisks || chunk || layout_str != NULL || bitmap_file) { + fprintf(stderr, Name ": --add cannot be used with other geometry changes in --grow mode\n"); rv = 1; break; } @@ -1422,17 +1423,20 @@ int main(int argc, char *argv[]) if (rv) break; } - } else if ((size >= 0) + (raiddisks != 0) + (layout_str != NULL) + (bitmap_file != NULL)> 1) { - fprintf(stderr, Name ": can change at most one of size, raiddisks, bitmap, and layout\n"); - rv = 1; - break; - } else if (size >= 0 || raiddisks || layout_str != NULL) - rv = Grow_reshape(devlist->devname, mdfd, quiet, backup_file, - size, level, layout_str, chunk, raiddisks); - else if (bitmap_file) { - if (delay == 0) delay = DEFAULT_BITMAP_DELAY; + } else if (bitmap_file) { + if (size >= 0 || raiddisks || chunk || layout_str != NULL) { + fprintf(stderr, Name ": --bitmap changes cannot be used with other geometry changes in --grow mode\n"); + rv = 1; + break; + } + if (delay == 0) + delay = DEFAULT_BITMAP_DELAY; rv = Grow_addbitmap(devlist->devname, mdfd, bitmap_file, bitmap_chunk, delay, write_behind, force); + } else if (size >= 0 || raiddisks != 0 || layout_str != NULL + || chunk != 0 || level != UnSet) { + rv = Grow_reshape(devlist->devname, mdfd, quiet, backup_file, + size, level, layout_str, chunk, raiddisks); } else if (array_size < 0) fprintf(stderr, Name ": no changes to --grow\n"); break; diff --git a/mdadm.h b/mdadm.h index 26ef69d..181de07 100644 --- a/mdadm.h +++ b/mdadm.h @@ -370,8 +370,12 @@ extern int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev, extern int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev, char *name, unsigned long long val); extern int sysfs_uevent(struct mdinfo *sra, char *event); +extern int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev, + char *name); +extern int sysfs_fd_get_ll(int fd, unsigned long long *val); extern int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, char *name, unsigned long long *val); +extern int sysfs_fd_get_str(int fd, char *val, int size); extern int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev, char *name, char *val, int size); extern int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms); diff --git a/restripe.c b/restripe.c index ea348e1..38b4412 100644 --- a/restripe.c +++ b/restripe.c @@ -418,15 +418,16 @@ int save_stripes(int *source, unsigned long long *offsets, for (disk = 0; disk < raid_disks ; disk++) { unsigned long long offset; int dnum; - len = chunk_size; offset = (start/chunk_size/data_disks)*chunk_size; dnum = geo_map(disk < data_disks ? disk : data_disks - disk - 1, start/chunk_size/data_disks, raid_disks, level, layout); + if (dnum < 0) abort(); if (source[dnum] < 0 || lseek64(source[dnum], offsets[disk]+offset, 0) < 0 || - read(source[dnum], buf+disk * chunk_size, len) != len) + read(source[dnum], buf+disk * chunk_size, chunk_size) + != chunk_size) if (failed <= 2) { fdisk[failed] = dnum; fblock[failed] = disk; diff --git a/sysfs.c b/sysfs.c index 56fd968..1452939 100644 --- a/sysfs.c +++ b/sysfs.c @@ -487,22 +487,29 @@ int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, return n; } +int sysfs_fd_get_str(int fd, char *val, int size) +{ + int n; + + lseek(fd, 0, 0); + n = read(fd, val, size); + if (n <= 0) + return -1; + val[n] = 0; + return n; +} + int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev, char *name, char *val, int size) { - char fname[50]; int n; int fd; - sprintf(fname, "/sys/block/%s/md/%s/%s", - sra->sys_name, dev?dev->sys_name:"", name); - fd = open(fname, O_RDONLY); + + fd = sysfs_get_fd(sra, dev, name); if (fd < 0) return -1; - n = read(fd, val, size); + n = sysfs_fd_get_str(fd, val, size); close(fd); - if (n <= 0) - return -1; - val[n] = 0; return n; } -- cgit