summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Create.c8
-rw-r--r--managemon.c45
-rw-r--r--mdadm.87
-rw-r--r--mdadm.h2
-rw-r--r--mdmon.h9
-rw-r--r--monitor.c37
-rw-r--r--platform-intel.h49
-rw-r--r--super-intel.c30
-rw-r--r--sysfs.c3
9 files changed, 149 insertions, 41 deletions
diff --git a/Create.c b/Create.c
index b04388f..43e5f37 100644
--- a/Create.c
+++ b/Create.c
@@ -235,9 +235,13 @@ int Create(struct supertype *st, char *mddev,
case 6:
case 0:
if (chunk == 0) {
- chunk = 512;
+ if (st && st->ss->default_chunk)
+ chunk = st->ss->default_chunk(st);
+
+ chunk = chunk ? : 512;
+
if (verbose > 0)
- fprintf(stderr, Name ": chunk size defaults to 512K\n");
+ fprintf(stderr, Name ": chunk size defaults to %dK\n", chunk);
}
break;
case LEVEL_LINEAR:
diff --git a/managemon.c b/managemon.c
index 037406f..d5ba6d6 100644
--- a/managemon.c
+++ b/managemon.c
@@ -361,6 +361,23 @@ static void manage_container(struct mdstat_ent *mdstat,
}
}
+/* Initialise @disk as a copy of @clone, open its per-device
+ * "recovery_start" and "state" attributes under array @aa, seed both the
+ * previous and current state from the "state" attribute, and push the
+ * disk onto the head of @aa's device list.
+ *
+ * The descriptors returned by sysfs_open() are stored as-is; they are
+ * not validated here.
+ *
+ * Returns 0 on success, or -1 (without touching anything) when either
+ * @disk or @clone is NULL.
+ */
+static int disk_init_and_add(struct mdinfo *disk, struct mdinfo *clone,
+			     struct active_array *aa)
+{
+	if (disk == NULL || clone == NULL)
+		return -1;
+
+	/* start from a full copy; the fields below are then overridden */
+	*disk = *clone;
+
+	disk->recovery_fd = sysfs_open(aa->devnum, disk->sys_name,
+				       "recovery_start");
+	disk->state_fd = sysfs_open(aa->devnum, disk->sys_name, "state");
+
+	/* no transition has been observed yet, so prev == curr */
+	disk->curr_state = disk->prev_state = read_dev_state(disk->state_fd);
+
+	/* link in at the head of the array's device list */
+	disk->next = aa->info.devs;
+	aa->info.devs = disk;
+
+	return 0;
+}
+
static void manage_member(struct mdstat_ent *mdstat,
struct active_array *a)
{
@@ -414,14 +431,7 @@ static void manage_member(struct mdstat_ent *mdstat,
free(newd);
continue;
}
- *newd = *d;
- newd->next = newa->info.devs;
- newa->info.devs = newd;
-
- newd->state_fd = sysfs_open(a->devnum, newd->sys_name,
- "state");
- newd->prev_state = read_dev_state(newd->state_fd);
- newd->curr_state = newd->prev_state;
+ disk_init_and_add(newd, d, newa);
}
queue_metadata_update(updates);
updates = NULL;
@@ -513,19 +523,7 @@ static void manage_new(struct mdstat_ent *mdstat,
if (i == di->disk.raid_disk)
break;
- if (di && newd) {
- memcpy(newd, di, sizeof(*newd));
-
- newd->state_fd = sysfs_open(new->devnum,
- newd->sys_name,
- "state");
- newd->recovery_fd = sysfs_open(new->devnum,
- newd->sys_name,
- "recovery_start");
-
- newd->prev_state = read_dev_state(newd->state_fd);
- newd->curr_state = newd->prev_state;
- } else {
+ if (disk_init_and_add(newd, di, new) != 0) {
if (newd)
free(newd);
@@ -535,17 +533,14 @@ static void manage_new(struct mdstat_ent *mdstat,
new->container = NULL;
break;
}
- continue;
}
- sprintf(newd->sys_name, "rd%d", i);
- newd->next = new->info.devs;
- new->info.devs = newd;
}
new->action_fd = sysfs_open(new->devnum, NULL, "sync_action");
new->info.state_fd = sysfs_open(new->devnum, NULL, "array_state");
new->resync_start_fd = sysfs_open(new->devnum, NULL, "resync_start");
new->metadata_fd = sysfs_open(new->devnum, NULL, "metadata_version");
+ new->sync_completed_fd = sysfs_open(new->devnum, NULL, "sync_completed");
dprintf("%s: inst: %d action: %d state: %d\n", __func__, atoi(inst),
new->action_fd, new->info.state_fd);
diff --git a/mdadm.8 b/mdadm.8
index 1909819..7e252fa 100644
--- a/mdadm.8
+++ b/mdadm.8
@@ -1213,14 +1213,11 @@ listed, otherwise it will return failure.
For each md device given, or each device in /proc/mdstat if
.B \-\-scan
is given, arrange for the array to be marked clean as soon as possible.
-Also, quiesce resync so that the monitor for external metadata arrays
-(mdmon) has an opportunity to checkpoint the resync position.
.I mdadm
will return with success if the array uses external metadata and we
successfully waited. For native arrays this returns immediately as the
-kernel handles both dirty-clean transitions and resync checkpointing in
-the kernel at shutdown. No action is taken if safe-mode handling is
-disabled.
+kernel handles dirty-clean transitions at shutdown. No action is taken
+if safe-mode handling is disabled.
.SH For Incremental Assembly mode:
.TP
diff --git a/mdadm.h b/mdadm.h
index 68d61a3..798713c 100644
--- a/mdadm.h
+++ b/mdadm.h
@@ -612,6 +612,8 @@ extern struct superswitch {
struct mdinfo *(*container_content)(struct supertype *st);
/* Allow a metadata handler to override mdadm's default layouts */
int (*default_layout)(int level); /* optional */
+ /* query the supertype for default chunk size */
+ int (*default_chunk)(struct supertype *st); /* optional */
/* Permit subarray's to be deleted from inactive containers */
int (*kill_subarray)(struct supertype *st); /* optional */
/* Permit subarray's to be modified */
diff --git a/mdmon.h b/mdmon.h
index 20a0a01..5c51566 100644
--- a/mdmon.h
+++ b/mdmon.h
@@ -32,6 +32,15 @@ struct active_array {
int action_fd;
int resync_start_fd;
int metadata_fd; /* for monitoring rw/ro status */
+ int sync_completed_fd; /* for checkpoint notification events */
+ unsigned long long last_checkpoint; /* sync_completed fires for many
+ * reasons; this field makes sure the
+ * kernel has made progress before
+ * moving the checkpoint. It is
+ * cleared by the metadata handler
+ * when it determines recovery is
+ * terminated.
+ */
enum array_state prev_state, curr_state, next_state;
enum sync_action prev_action, curr_action, next_action;
diff --git a/monitor.c b/monitor.c
index e43e545..59b4181 100644
--- a/monitor.c
+++ b/monitor.c
@@ -80,6 +80,24 @@ static unsigned long long read_resync_start(int fd)
return strtoull(buf, NULL, 10);
}
+/* Parse the leading number from the attribute behind @fd (the caller
+ * passes the array's sync_completed descriptor).
+ *
+ * Returns the parsed value, or 0 when nothing could be read, when the
+ * text does not start with a number, or when the number is followed by
+ * anything other than end-of-string, a newline or a space.
+ */
+static unsigned long long read_sync_completed(int fd)
+{
+	char text[50];
+	char *end;
+	unsigned long long completed;
+	int len = read_attr(text, (int)sizeof(text), fd);
+
+	if (len <= 0)
+		return 0;
+	text[len] = 0;
+
+	/* base 0: strtoull picks decimal, octal or hex from the prefix */
+	completed = strtoull(text, &end, 0);
+	if (end == text)
+		return 0;	/* no digits were consumed */
+
+	switch (*end) {
+	case 0:
+	case '\n':
+	case ' ':
+		return completed;
+	default:
+		return 0;	/* trailing garbage after the number */
+	}
+}
+
static enum array_state read_state(int fd)
{
char buf[20];
@@ -195,6 +213,7 @@ static void signal_manager(void)
static int read_and_act(struct active_array *a)
{
+ unsigned long long sync_completed;
int check_degraded = 0;
int deactivate = 0;
struct mdinfo *mdi;
@@ -206,6 +225,7 @@ static int read_and_act(struct active_array *a)
a->curr_state = read_state(a->info.state_fd);
a->curr_action = read_action(a->action_fd);
a->info.resync_start = read_resync_start(a->resync_start_fd);
+ sync_completed = read_sync_completed(a->sync_completed_fd);
for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
mdi->next_state = 0;
if (mdi->state_fd >= 0) {
@@ -307,6 +327,22 @@ static int read_and_act(struct active_array *a)
}
}
+ /* Check for recovery checkpoint notifications. We need to be a
+ * minimum distance away from the last checkpoint to prevent
+ * over checkpointing. Note reshape checkpointing is not
+ * handled here.
+ */
+ if (sync_completed > a->last_checkpoint &&
+ sync_completed - a->last_checkpoint > a->info.component_size >> 4 &&
+ a->curr_action > reshape) {
+ /* A (non-reshape) sync_action has reached a checkpoint.
+ * Record the updated position in the metadata
+ */
+ a->last_checkpoint = sync_completed;
+ a->container->ss->set_array_state(a, a->curr_state <= clean);
+ } else if (sync_completed > a->last_checkpoint)
+ a->last_checkpoint = sync_completed;
+
a->container->ss->sync_metadata(a->container);
dprintf("%s(%d): state:%s action:%s next(", __func__, a->info.container_member,
array_states[a->curr_state], sync_actions[a->curr_action]);
@@ -461,6 +497,7 @@ static int wait_and_act(struct supertype *container, int nowait)
add_fd(&rfds, &maxfd, a->info.state_fd);
add_fd(&rfds, &maxfd, a->action_fd);
+ add_fd(&rfds, &maxfd, a->sync_completed_fd);
for (mdi = a->info.devs ; mdi ; mdi = mdi->next)
add_fd(&rfds, &maxfd, mdi->state_fd);
diff --git a/platform-intel.h b/platform-intel.h
index bbdc9f9..9088436 100644
--- a/platform-intel.h
+++ b/platform-intel.h
@@ -115,6 +115,55 @@ static inline int imsm_orom_has_chunk(const struct imsm_orom *orom, int chunk)
return !!(orom->sss & (1 << (fs - 1)));
}
+/**
+ * fls - find last (most-significant) bit set
+ * @x: the word to search
+ *
+ * Returns the 1-based position of the most-significant set bit of @x
+ * (fls(1) == 1, fls(0x80000000) == 32), or 0 when @x is 0.  The masks
+ * below assume a 32-bit int.
+ *
+ * The function is borrowed from Linux kernel code
+ * include/asm-generic/bitops/fls.h
+ */
+static inline int fls(int x)
+{
+	/* Work on an unsigned copy: each step below shifts the value up
+	 * towards bit 31, and left-shifting a signed int into or past the
+	 * sign bit is undefined behaviour.
+	 */
+	unsigned int v = (unsigned int)x;
+	int r = 32;
+
+	if (!v)
+		return 0;
+	if (!(v & 0xffff0000u)) {
+		v <<= 16;
+		r -= 16;
+	}
+	if (!(v & 0xff000000u)) {
+		v <<= 8;
+		r -= 8;
+	}
+	if (!(v & 0xf0000000u)) {
+		v <<= 4;
+		r -= 4;
+	}
+	if (!(v & 0xc0000000u)) {
+		v <<= 2;
+		r -= 2;
+	}
+	if (!(v & 0x80000000u)) {
+		v <<= 1;
+		r -= 1;
+	}
+	return r;
+}
+
+/**
+ * imsm_orom_default_chunk - return the largest chunk size supported via orom
+ * @orom: orom pointer from find_imsm_orom
+ *
+ * The result is derived from the highest bit set in @orom->sss and is
+ * capped at 512.  Returns 0 when no bit is set.
+ */
+static inline int imsm_orom_default_chunk(const struct imsm_orom *orom)
+{
+	int top_bit = fls(orom->sss);
+	int largest;
+
+	if (top_bit == 0)
+		return 0;
+
+	/* fls() is 1-based, so the top bit maps to a size of 1 << top_bit */
+	largest = 1 << top_bit;
+
+	return min(512, largest);
+}
+
struct sys_dev {
char *path;
struct sys_dev *next;
diff --git a/super-intel.c b/super-intel.c
index f0377b8..159ae4a 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -654,7 +654,7 @@ static __u64 blocks_per_migr_unit(struct imsm_dev *dev);
static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx)
{
__u64 sz;
- int slot;
+ int slot, i;
struct imsm_map *map = get_imsm_map(dev, 0);
__u32 ord;
@@ -663,6 +663,12 @@ static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx)
printf(" UUID : %s\n", uuid);
printf(" RAID Level : %d\n", get_imsm_raid_level(map));
printf(" Members : %d\n", map->num_members);
+ printf(" Slots : [");
+ for (i = 0; i < map->num_members; i++) {
+ ord = get_imsm_ord_tbl_ent(dev, i);
+ printf("%s", ord & IMSM_ORD_REBUILD ? "_" : "U");
+ }
+ printf("]\n");
slot = get_imsm_disk_slot(map, disk_idx);
if (slot >= 0) {
ord = get_imsm_ord_tbl_ent(dev, slot);
@@ -4055,6 +4061,16 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout,
return 0;
}
+/* ->default_chunk handler for imsm: report the default chunk size that
+ * imsm_orom_default_chunk() derives from the orom attached to @st's
+ * intel_super, or 0 when no orom is attached.
+ */
+static int default_chunk_imsm(struct supertype *st)
+{
+	struct intel_super *super = st->sb;
+
+	return super->orom ? imsm_orom_default_chunk(super->orom) : 0;
+}
+
static void handle_missing(struct intel_super *super, struct imsm_dev *dev);
static int kill_subarray_imsm(struct supertype *st)
@@ -4570,6 +4586,7 @@ static int imsm_set_array_state(struct active_array *a, int consistent)
dprintf("imsm: mark resync done\n");
end_migration(dev, map_state);
super->updates_pending++;
+ a->last_checkpoint = 0;
}
} else if (!is_resyncing(dev) && !failed) {
/* mark the start of the init process if nothing is failed */
@@ -4583,14 +4600,11 @@ static int imsm_set_array_state(struct active_array *a, int consistent)
/* check if we can update curr_migr_unit from resync_start, recovery_start */
blocks_per_unit = blocks_per_migr_unit(dev);
- if (blocks_per_unit && failed <= 1) {
+ if (blocks_per_unit) {
__u32 units32;
__u64 units;
- if (migr_type(dev) == MIGR_REBUILD)
- units = min_recovery_start(&a->info) / blocks_per_unit;
- else
- units = a->info.resync_start / blocks_per_unit;
+ units = a->last_checkpoint / blocks_per_unit;
units32 = units;
/* check that we did not overflow 32-bits, and that
@@ -4662,17 +4676,20 @@ static void imsm_set_disk(struct active_array *a, int n, int state)
map = get_imsm_map(dev, 0);
map->failed_disk_num = ~0;
super->updates_pending++;
+ a->last_checkpoint = 0;
} else if (map_state == IMSM_T_STATE_DEGRADED &&
map->map_state != map_state &&
!dev->vol.migr_state) {
dprintf("imsm: mark degraded\n");
map->map_state = map_state;
super->updates_pending++;
+ a->last_checkpoint = 0;
} else if (map_state == IMSM_T_STATE_FAILED &&
map->map_state != map_state) {
dprintf("imsm: mark failed\n");
end_migration(dev, map_state);
super->updates_pending++;
+ a->last_checkpoint = 0;
}
}
@@ -5486,6 +5503,7 @@ struct superswitch super_imsm = {
.brief_detail_super = brief_detail_super_imsm,
.write_init_super = write_init_super_imsm,
.validate_geometry = validate_geometry_imsm,
+ .default_chunk = default_chunk_imsm,
.add_to_super = add_to_super_imsm,
.detail_platform = detail_platform_imsm,
.kill_subarray = kill_subarray_imsm,
diff --git a/sysfs.c b/sysfs.c
index ebf9d8a..72c7c5b 100644
--- a/sysfs.c
+++ b/sysfs.c
@@ -852,9 +852,6 @@ int WaitClean(char *dev, int sock, int verbose)
tm.tv_sec = 5;
tm.tv_usec = 0;
- /* give mdmon a chance to checkpoint resync */
- sysfs_set_str(mdi, NULL, "sync_action", "idle");
-
FD_ZERO(&fds);
/* wait for array_state to be clean */