summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNeil Brown <neilb@suse.de>2008-05-15 16:48:37 +1000
committerNeil Brown <neilb@suse.de>2008-05-15 16:48:37 +1000
commit549e9569c6006433512801ae76b34abc0d3e1ac0 (patch)
treecc17c37e280d4d0acd40009285a497eda693999d
parentf7dd881f909a7bc552a6de3c1fc4920bb0bfdff2 (diff)
downloadmdadm-549e9569c6006433512801ae76b34abc0d3e1ac0.tar.gz
mdadm-549e9569c6006433512801ae76b34abc0d3e1ac0.tar.xz
mdadm-549e9569c6006433512801ae76b34abc0d3e1ac0.zip
Merge mdmon
-rw-r--r--Makefile13
-rw-r--r--managemon.c309
-rw-r--r--mdadm.h35
-rw-r--r--mdmon.c222
-rw-r--r--mdmon.h41
-rw-r--r--mdstat.c60
-rw-r--r--monitor.c372
-rw-r--r--super-ddf.c39
-rw-r--r--sysfs.c29
9 files changed, 1101 insertions, 19 deletions
diff --git a/Makefile b/Makefile
index 46d7594..b2087d0 100644
--- a/Makefile
+++ b/Makefile
@@ -77,6 +77,11 @@ SRCS = mdadm.c config.c mdstat.c ReadMe.c util.c Manage.c Assemble.c Build.c \
mdopen.c super0.c super1.c super-ddf.c super-intel.c bitmap.c \
restripe.c sysfs.c sha1.c mapfile.c crc32.c sg_io.c msg.c
+MON_OBJS = mdmon.o monitor.o managemon.o util.o mdstat.o sysfs.o config.o \
+ Kill.o sg_io.o dlink.o ReadMe.o super0.o super1.o super-intel.o \
+ super-ddf.o sha1.o crc32.o
+
+
STATICSRC = pwgr.c
STATICOBJS = pwgr.o
@@ -88,7 +93,7 @@ ASSEMBLE_SRCS += mdopen.c mdstat.c
ASSEMBLE_FLAGS += -DMDASSEMBLE_AUTO
endif
-all : mdadm mdadm.man md.man mdadm.conf.man
+all : mdadm mdmon mdadm.man md.man mdadm.conf.man
everything: all mdadm.static swap_super test_stripe \
mdassemble mdassemble.static mdassemble.man \
@@ -118,6 +123,9 @@ mdadm.Os : $(SRCS) mdadm.h
mdadm.O2 : $(SRCS) mdadm.h
gcc -o mdadm.O2 $(CFLAGS) -DHAVE_STDINT_H -O2 $(SRCS)
+mdmon : $(MON_OBJS)
+ $(CC) $(LDFLAGS) -o mdmon $(MON_OBJS) $(LDLIBS)
+
test_stripe : restripe.c mdadm.h
$(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe -DMAIN restripe.c
@@ -182,7 +190,8 @@ test: mdadm test_stripe swap_super
@echo "Please run 'sh ./test' as root"
clean :
- rm -f mdadm $(OBJS) $(STATICOBJS) core *.man mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \
+ rm -f mdadm mdmon $(OBJS) $(MON_OBJS) $(STATICOBJS) core *.man \
+ mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \
mdadm.Os mdadm.O2 \
mdassemble mdassemble.static mdassemble.uclibc mdassemble.klibc swap_super \
init.cpio.gz mdadm.uclibc.static test_stripe
diff --git a/managemon.c b/managemon.c
new file mode 100644
index 0000000..ee4ee2b
--- /dev/null
+++ b/managemon.c
@@ -0,0 +1,309 @@
+
+/*
+ * The management thread for monitoring active md arrays.
+ * This thread does things which might block such as memory
+ * allocation.
+ * In particular:
+ *
+ * - Find out about new arrays in this container.
+ * Allocate the data structures and open the files.
+ *
+ * For this we watch /proc/mdstat and find new arrays with
+ * metadata type that confirms sharing. e.g. "md4"
+ * When we find a new array we slip it into the list of
+ * arrays and signal 'monitor' by writing to a pipe.
+ *
+ * - Respond to reshape requests by allocating new data structures
+ * and opening new files.
+ *
+ * These come as a change to raid_disks. We allocate a new
+ * version of the data structures and slip it into the list.
+ * 'monitor' will notice and release the old version.
+ * Changes to level, chunksize, layout.. do not need re-allocation.
+ * Reductions in raid_disks don't really either, but we handle
+ * them the same way for consistency.
+ *
+ * - When a device is added to the container, we add it to the metadata
+ * as a spare.
+ *
+ * - assist with activating spares by opening relevant sysfs file.
+ *
+ * - Pass on metadata updates from external programs such as
+ * mdadm creating a new array.
+ *
+ * This is most-messy.
+ * It might involve adding a new array or changing the status of
+ * a spare, or any reconfig that the kernel doesn't get involved in.
+ *
+ * The required updates are received via a named pipe. There will
+ * be one named pipe for each container. Each message contains a
+ * sync marker: 0x5a5aa5a5, A byte count, and the message. This is
+ * passed to the metadata handler which will interpret and process it.
+ * For 'DDF' messages are internal data blocks with the leading
+ * 'magic number' signifying what sort of data it is.
+ *
+ */
+
+/*
+ * We select on /proc/mdstat and the named pipe.
+ * We create new arrays or updated version of arrays and slip
+ * them into the head of the list, then signal 'monitor' via a pipe write.
+ * 'monitor' will notice and place the old array on a return list.
+ * Metadata updates are placed on a queue just like they arrive
+ * from the named pipe.
+ *
+ * When new arrays are found based on correct metadata string, we
+ * need to identify them with an entry in the metadata. Maybe we require
+ * the metadata to be mdX/NN when NN is the index into an appropriate table.
+ *
+ */
+
+/*
+ * List of tasks:
+ * - Watch for spares to be added to the container, and write updated
+ * metadata to them.
+ * - Watch for new arrays using this container, confirm they match metadata
+ * and if so, start monitoring them
+ * - Watch for spares being added to monitored arrays. This shouldn't
+ * happen, as we should do all the adding. Just remove them.
+ * - Watch for change in raid-disks, chunk-size, etc. Update metadata and
+ * start a reshape.
+ */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include "mdadm.h"
+#include "mdmon.h"
+#include <sys/socket.h>
+
+
+static void free_aa(struct active_array *aa)
+{
+	/* Release an array descriptor together with every per-device
+	 * record chained from aa->info.devs.
+	 * File descriptors are NOT closed here, as they may still be
+	 * in use by a clone (the original note points at close_aa for
+	 * that; it is not defined in the visible part of this patch).
+	 */
+	struct mdinfo *dev = aa->info.devs;
+
+	while (dev != NULL) {
+		struct mdinfo *following = dev->next;
+
+		free(dev);
+		dev = following;
+	}
+	free(aa);
+}
+
+static void replace_array(struct supertype *container,
+ struct active_array *old,
+ struct active_array *new)
+{
+ /* Publish 'new' at the head of container->arrays, optionally
+ * superseding 'old' (old may be NULL when the array is brand new).
+ *
+ * To replace an array, we add it to the top of the list
+ * marked with ->replaces to point to the original.
+ * 'monitor' (wait_and_act) will take the original out of the
+ * list and put it on 'discard_this'. We take it from there
+ * and discard it.
+ *
+ * Only one replacement may be outstanding: if a previous one is
+ * still recorded in pending_discard we poll once a second until
+ * the monitor has handed it back, and abort() if what it handed
+ * back is not the array we were waiting for.
+ */
+
+ while (pending_discard) {
+ while (discard_this == NULL)
+ sleep(1);
+ if (discard_this != pending_discard)
+ abort();
+ /* detach before freeing; free_aa does not follow ->next */
+ discard_this->next = NULL;
+ free_aa(discard_this);
+ discard_this = NULL;
+ pending_discard = NULL;
+ }
+ /* remember what the monitor must return to us, then link in */
+ pending_discard = old;
+ new->replaces = old;
+ new->next = container->arrays;
+ container->arrays = new;
+}
+
+
+static void manage_container(struct mdstat_ent *mdstat,
+			     struct supertype *container)
+{
+	/* Track the number of devices in the container itself.
+	 * The only event of interest is a device being added to (or
+	 * removed from) the container; a new device is meant to be
+	 * added ignoring any metadata on it.
+	 * FIXME should we look for compatible metadata and take hints
+	 * about spare assignment.... probably not.
+	 */
+	if (mdstat->devcnt == container->devcnt)
+		return;
+
+	/* The count changed.  The intended work is to read
+	 * /sys/block/NAME/md/dev-??/block/dev, compare the result with
+	 * container->info.devs and add/remove members accordingly.
+	 * None of that is implemented yet; only the count is recorded.
+	 */
+	// FIXME
+	container->devcnt = mdstat->devcnt;
+}
+
+static void manage_member(struct mdstat_ent *mdstat,
+			  struct active_array *a)
+{
+	/* Refresh our view of a member array from its mdstat entry.
+	 * Device state changes are not examined here; the monitor
+	 * deals with those.
+	 *
+	 * The aim is to spot changes that suggest a reshape is being
+	 * requested.  Unfortunately decreases in raid_disks don't show
+	 * up in mdstat until the reshape completes FIXME.
+	 * For now the values are simply copied.
+	 */
+	struct mdinfo *info = &a->info;
+
+	// FIXME
+	info->array.chunk_size = mdstat->chunk_size;
+	info->array.raid_disks = mdstat->raid_disks;
+	// MORE
+}
+
+static void write_wakeup(struct supertype *c)
+{
+	/* Nudge the monitor thread by sending four bytes down the
+	 * write end of the inter-thread pipe.  The result of the
+	 * write is not examined.
+	 */
+	int wake_fd = c->pipe[1];
+
+	write(wake_fd, "PING", 4);
+}
+
+static void manage_new(struct mdstat_ent *mdstat,
+		       struct supertype *container)
+{
+	/* A new array has appeared in this container.
+	 * Hopefully it is already recorded in the metadata.
+	 * Check, then create the new array to report it to
+	 * the monitor.
+	 *
+	 * mdstat->metadata_version has already been checked by manage()
+	 * to look like "external:/<container>/<inst>"; <inst> is handed
+	 * to the metadata handler's open_new().
+	 *
+	 * On allocation failure nothing is linked into the list and we
+	 * simply return.  If sysfs cannot be read, or open_new()
+	 * rejects the array, the array is linked in with
+	 * ->container == NULL so that it is ignored from then on.
+	 */
+
+	struct active_array *new;
+	struct mdinfo *mdi, *di;
+	char *n;
+	int inst;
+	int i;
+
+	/* Zeroed allocation: info.devs, next and replaces must start
+	 * out NULL because the device list is built by prepending.
+	 */
+	new = calloc(1, sizeof(*new));
+	if (!new)
+		return;
+
+	/* 0 is a valid descriptor, so mark every fd as "not open" */
+	new->action_fd = -1;
+	new->sync_pos_fd = -1;
+	new->info.state_fd = -1;
+
+	new->devnum = mdstat->devnum;
+
+	new->prev_state = new->curr_state = new->next_state = inactive;
+	new->prev_action= new->curr_action= new->next_action= idle;
+
+	new->container = container;
+
+	/* skip "external:/" (10 chars), the container name and the '/' */
+	n = &mdstat->metadata_version[10+strlen(container->devname)+1];
+	inst = atoi(n);
+	if (inst < 0)
+		abort();//FIXME
+
+	mdi = sysfs_read(-1, new->devnum,
+			 GET_LEVEL|GET_CHUNK|GET_DISKS|
+			 GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE);
+	if (!mdi) {
+		/* Eeek. Cannot monitor this array.
+		 * Mark it to be ignored by setting container to NULL
+		 */
+		new->container = NULL;
+		replace_array(container, NULL, new);
+		return;
+	}
+
+	new->info.array = mdi->array;
+
+	/* One record per raid slot, whether or not a device fills it */
+	for (i = 0; i < new->info.array.raid_disks; i++) {
+		struct mdinfo *newd = calloc(1, sizeof(*newd));
+
+		if (!newd) {
+			/* Cannot describe every slot: undo and give up */
+			for (di = new->info.devs; di; di = di->next)
+				if (di->state_fd >= 0)
+					close(di->state_fd);
+			sysfs_free(mdi);
+			free_aa(new);
+			return;
+		}
+
+		for (di = mdi->devs; di; di = di->next)
+			if (i == di->disk.raid_disk)
+				break;
+
+		if (di) {
+			memcpy(newd, di, sizeof(*newd));
+			/* the copy must not reference memory owned by mdi */
+			newd->devs = NULL;
+
+			sprintf(newd->sys_name, "rd%d", i);
+
+			newd->state_fd = sysfs_open(new->devnum,
+						    newd->sys_name,
+						    "state");
+
+			newd->prev_state = read_dev_state(newd->state_fd);
+			/* was a self-assignment, leaving curr_state stale */
+			newd->curr_state = newd->prev_state;
+		} else {
+			/* empty slot: no state file to watch */
+			newd->state_fd = -1;
+			newd->disk.raid_disk = i;
+		}
+		newd->next = new->info.devs;
+		new->info.devs = newd;
+	}
+	/* everything needed has been copied out of the sysfs snapshot */
+	sysfs_free(mdi);
+
+	new->action_fd = sysfs_open(new->devnum, NULL, "sync_action");
+	new->info.state_fd = sysfs_open(new->devnum, NULL, "array_state");
+	new->sync_pos_fd = sysfs_open(new->devnum, NULL, "sync_completed");
+	new->sync_pos = 0;
+
+	// finds and compares.
+	if (container->ss->open_new(container, new, inst) < 0) {
+		// FIXME close all those files
+		new->container = NULL;
+		replace_array(container, NULL, new);
+		return;
+	}
+	replace_array(container, NULL, new);
+	write_wakeup(container);
+	return;
+}
+
+void manage(struct mdstat_ent *mdstat, struct active_array *aa,
+	    struct supertype *container)
+{
+	/* Walk a freshly read mdstat list and reconcile it with the
+	 * arrays we already know about (aa).
+	 *  - the entry for the container itself goes to manage_container
+	 *  - entries whose metadata_version is not exactly
+	 *    "external:/<container->devname>/..." are skipped
+	 *  - known members go to manage_member (unless they were marked
+	 *    to be ignored by a NULL ->container)
+	 *  - unknown members go to manage_new
+	 */
+	struct mdstat_ent *ent;
+
+	for (ent = mdstat; ent; ent = ent->next) {
+		struct active_array *known;
+		const char *ver = ent->metadata_version;
+		size_t namelen;
+
+		if (ent->devnum == container->devnum) {
+			manage_container(ent, container);
+			continue;
+		}
+
+		/* Not for this array unless every test below passes */
+		if (ver == NULL)
+			continue;
+		if (strncmp(ver, "external:/", 10) != 0)
+			continue;
+		namelen = strlen(container->devname);
+		if (strncmp(ver + 10, container->devname, namelen) != 0)
+			continue;
+		if (ver[10 + namelen] != '/')
+			continue;
+
+		/* Looks like a member of this container */
+		known = aa;
+		while (known && known->devnum != ent->devnum)
+			known = known->next;
+
+		if (known == NULL)
+			manage_new(ent, container);
+		else if (known->container)
+			manage_member(ent, known);
+	}
+}
+
+void read_sock(int pfd)
+{
+	/* Accept one pending connection on the control socket and
+	 * drop it straight away; nothing is read from it yet.
+	 */
+	int conn;
+
+	// FIXME set non-blocking
+	conn = accept(pfd, NULL, NULL);
+	if (conn >= 0) {
+		// FIXME do something useful
+		close(conn);
+	}
+}
+void do_manager(struct supertype *container)
+{
+	/* Main loop of the management thread; never returns.
+	 * Each pass: read /proc/mdstat, reconcile it with the arrays
+	 * of this container, service the control socket, then block
+	 * until either mdstat or the socket has something new.
+	 */
+	struct mdstat_ent *mdstat;
+
+	do {
+		mdstat = mdstat_read(1, 0);
+
+		/* Compare against the list that replace_array()
+		 * actually maintains.  The global array_list is never
+		 * assigned in this patch, so using it made every member
+		 * look new on every pass.
+		 */
+		manage(mdstat, container->arrays, container);
+
+		/* manage() keeps no pointers into the list, so release
+		 * it rather than leaking one list per iteration.
+		 */
+		if (mdstat)
+			free_mdstat(mdstat);
+
+		read_sock(container->sock);
+
+		mdstat_wait_fd(container->sock);
+	} while(1);
+}
diff --git a/mdadm.h b/mdadm.h
index 64f41fd..3f778f1 100644
--- a/mdadm.h
+++ b/mdadm.h
@@ -159,6 +159,11 @@ struct mdinfo {
char sys_name[20];
struct mdinfo *devs;
struct mdinfo *next;
+
+ /* Device info for mdmon: */
+ int state_fd;
+ int prev_state, curr_state, next_state;
+
};
struct createinfo {
@@ -271,12 +276,17 @@ struct mdstat_ent {
char *pattern; /* U or up, _ for down */
int percent; /* -1 if no resync */
int resync; /* 1 if resync, 0 if recovery */
+ int devcnt;
+ int raid_disks;
+ int chunk_size;
+ char * metadata_version;
struct mdstat_ent *next;
};
extern struct mdstat_ent *mdstat_read(int hold, int start);
extern void free_mdstat(struct mdstat_ent *ms);
extern void mdstat_wait(int seconds);
+extern void mdstat_wait_fd(int fd);
extern int mddev_busy(int devnum);
struct map_ent {
@@ -304,6 +314,7 @@ extern void map_add(struct map_ent **melp,
#define GET_CACHE 16
#define GET_MISMATCH 32
#define GET_VERSION 64
+#define GET_DISKS 128
#define GET_DEVS 1024 /* gets role, major, minor */
#define GET_OFFSET 2048
@@ -314,6 +325,7 @@ extern void map_add(struct map_ent **melp,
/* If fd >= 0, get the array it is open on,
* else use devnum. >=0 -> major9. <0.....
*/
+extern int sysfs_open(int devnum, char *devname, char *attr);
extern void sysfs_free(struct mdinfo *sra);
extern struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options);
extern int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev,
@@ -350,6 +362,7 @@ extern mapping_t r5layout[], pers[], modes[], faultylayout[];
extern char *map_dev(int major, int minor, int create);
+struct active_array;
extern struct superswitch {
void (*examine_super)(struct supertype *st, char *homehost);
@@ -390,6 +403,14 @@ extern struct superswitch {
struct mdinfo *(*container_content)(struct supertype *st);
+/* for mdmon */
+ int (*open_new)(struct supertype *c, struct active_array *a, int inst);
+ void (*mark_clean)(struct active_array *a, unsigned long long sync_pos);
+ void (*mark_dirty)(struct active_array *a);
+ void (*set_disk)(struct active_array *a, int n);
+ void (*sync_metadata)(struct active_array *a);
+
+
int major;
char *text_version;
int swapuuid; /* true if uuid is bigending rather than hostendian */
@@ -406,6 +427,20 @@ struct supertype {
int container_member; /* numerical position in container */
void *sb;
void *info;
+
+ /* extra stuff used by mdmon */
+ struct active_array *arrays;
+ int devfd;
+ int sock; /* listen to external programs */
+ int pipe[2]; /* communicate between threads */
+ int devnum;
+ char *devname; /* e.g. md0. This appears in metadata_version:
+ * external:/md0/12
+ */
+ int devcnt;
+
+ struct mdinfo *devs;
+
};
extern struct supertype supertype_container_member;
diff --git a/mdmon.c b/mdmon.c
new file mode 100644
index 0000000..1284a12
--- /dev/null
+++ b/mdmon.c
@@ -0,0 +1,222 @@
+
+/*
+ * md array manager.
+ * When md arrays have user-space managed metadata, this is the program
+ * that does the managing.
+ *
+ * Given one argument: the name of the array (e.g. /dev/md0) that is
+ * the container.
+ * We fork off a helper that runs high priority and mlocked. It responds to
+ * device failures and other events that might stop writeout, or that are
+ * trivial to deal with.
+ * The main thread then watches for new arrays being created in the container
+ * and starts monitoring them too ... along with a few other tasks.
+ *
+ * The main thread communicates with the priority thread by writing over
+ * a pipe.
+ * Separate programs can communicate with the main thread via Unix-domain
+ * socket.
+ * The two threads share address space and open file table.
+ *
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/mman.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include <sched.h>
+
+#include "mdadm.h"
+#include "mdmon.h"
+
+/* State shared between the manager and monitor threads.
+ *
+ * array_list      - passed by do_manager() to manage() as the list of
+ *                   known arrays; nothing in this patch assigns it.
+ * discard_this    - set by the monitor (wait_and_act) to an array it has
+ *                   unlinked; consumed and freed by replace_array().
+ * pending_discard - set by replace_array() to the array it expects the
+ *                   monitor to hand back through discard_this.
+ */
+struct active_array *array_list;
+struct active_array *discard_this;
+struct active_array *pending_discard;
+
+int run_child(void *v)
+{
+	/* Entry point handed to clone(): 'v' is the container's
+	 * supertype.  Runs the monitor loop and reports success
+	 * should that loop ever end.
+	 */
+	do_monitor((struct supertype *)v);
+	return 0;
+}
+
+int clone_monitor(struct supertype *container)
+{
+	/* Create the manager->monitor pipe in container->pipe and start
+	 * the monitor (run_child) as a thread sharing our address space,
+	 * file table, filesystem info and signal handlers.
+	 *
+	 * Returns the write end of the pipe on success, or a negative
+	 * value (with errno describing the failure) if either the pipe
+	 * or the clone could not be created.
+	 */
+	static char stack[4096];
+	int rv;
+
+	if (pipe(container->pipe) < 0)
+		return -1;
+
+	/* the stack grows down, so start near the top of the buffer */
+	rv = clone(run_child, stack+4096-64,
+		   CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD,
+		   container);
+
+	if (rv < 0) {
+		/* don't leak the pipe, and keep errno for the caller */
+		int err = errno;
+
+		close(container->pipe[0]);
+		close(container->pipe[1]);
+		errno = err;
+		return rv;
+	}
+	/* previously returned an uninitialised local (pfd[1]) */
+	return container->pipe[1];
+}
+
+static struct superswitch *find_metadata_methods(char *vers)
+{
+	/* Map a metadata name, as found in sysfs, to its handler
+	 * table.  Only "ddf" is known; anything else yields NULL.
+	 */
+	if (strcmp(vers, "ddf") != 0)
+		return NULL;
+	return &super_ddf;
+}
+
+
+static int make_pidfile(char *devname)
+{
+	/* Create /var/run/mdadm/<devname>.pid holding our pid.
+	 * The file is created exclusively, so -1 is returned when it
+	 * already exists (or cannot be created); 0 otherwise.
+	 */
+	char pidpath[100];
+	char text[10];
+	int len;
+	int pidfd;
+
+	sprintf(pidpath, "/var/run/mdadm/%s.pid", devname);
+
+	pidfd = open(pidpath, O_RDWR|O_CREAT|O_EXCL, 0600);
+	if (pidfd < 0)
+		return -1;
+
+	/* sprintf reports the number of characters it produced */
+	len = sprintf(text, "%d\n", getpid());
+	write(pidfd, text, len);
+	close(pidfd);
+	return 0;
+}
+
+static int make_control_sock(char *devname)
+{
+	/* Create the non-blocking Unix-domain listening socket
+	 * /var/run/mdadm/<devname>.sock through which external
+	 * programs talk to the manager.  Any stale socket file of
+	 * the same name is removed first.
+	 *
+	 * Returns the listening descriptor, or -1 on any failure.
+	 */
+	char path[100];
+	int sfd;
+	long fl;
+	struct sockaddr_un addr;
+
+	snprintf(path, sizeof(path), "/var/run/mdadm/%s.sock", devname);
+	unlink(path);
+	sfd = socket(PF_LOCAL, SOCK_STREAM, 0);
+	if (sfd < 0)
+		return -1;
+
+	/* start from a fully defined address structure */
+	memset(&addr, 0, sizeof(addr));
+	addr.sun_family = PF_LOCAL;
+	/* path is at most 99 chars, which fits in sun_path */
+	strcpy(addr.sun_path, path);
+	if (bind(sfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+		close(sfd);
+		return -1;
+	}
+	if (listen(sfd, 10) < 0) {
+		close(sfd);
+		return -1;
+	}
+	fl = fcntl(sfd, F_GETFL, 0);
+	fl |= O_NONBLOCK;
+	fcntl(sfd, F_SETFL, fl);
+	return sfd;
+}
+
+int main(int argc, char *argv[])
+{
+	/* mdmon entry point.
+	 * argv[1] names the container device.  After validating that
+	 * it is an md container with external metadata we know how to
+	 * handle, create the pid file and control socket, load the
+	 * metadata, start the monitor thread and become the manager.
+	 *
+	 * Exit codes: 2 for usage or thread start failure, 1 when the
+	 * device cannot be opened or is not md, 3 for everything else.
+	 */
+	int mdfd;
+	int pipefd;
+	struct mdinfo *mdi, *di;
+	struct supertype *container;
+
+	if (argc != 2) {
+		fprintf(stderr, "Usage: md-manage /device/name/for/container\n");
+		exit(2);
+	}
+	mdfd = open(argv[1], O_RDWR);
+	if (mdfd < 0) {
+		fprintf(stderr, "md-manage: %s: %s\n", argv[1],
+			strerror(errno));
+		exit(1);
+	}
+	if (md_get_version(mdfd) < 0) {
+		fprintf(stderr, "md-manage: %s: Not an md device\n",
+			argv[1]);
+		exit(1);
+	}
+
+	/* hopefully it is a container - we'll check later */
+
+	/* zeroed so that fields not set below (sb, info, ...) are
+	 * well defined when the metadata handler sees them
+	 */
+	container = calloc(1, sizeof(*container));
+	if (!container) {
+		fprintf(stderr, "mdmon: out of memory\n");
+		exit(3);
+	}
+	container->devfd = mdfd;
+	container->devnum = fd2devnum(mdfd);
+	container->devname = devnum2devname(container->devnum);
+	if (!container->devname) {
+		fprintf(stderr, "mdmon: failed to determine device name for %s\n",
+			argv[1]);
+		exit(3);
+	}
+
+	/* If this fails, we hope it already exists */
+	mkdir("/var/run/mdadm", 0600);
+	/* pid file lives in /var/run/mdadm/mdXX.pid */
+	if (make_pidfile(container->devname) < 0) {
+		fprintf(stderr, "md-manage: %s already managed\n",
+			container->devname);
+		exit(3);
+	}
+
+	container->sock = make_control_sock(container->devname);
+	if (container->sock < 0) {
+		fprintf(stderr, "mdmon: Cannot create socket in /var/run/mdadm\n");
+		exit(3);
+	}
+	container->arrays = NULL;
+
+	mdi = sysfs_read(mdfd, container->devnum,
+			 GET_VERSION|GET_LEVEL|GET_DEVS);
+
+	if (!mdi) {
+		fprintf(stderr, "mdmon: failed to load sysfs info for %s\n",
+			container->devname);
+		exit(3);
+	}
+	if (mdi->array.level != UnSet) {
+		fprintf(stderr, "mdmon: %s is not a container - cannot monitor\n",
+			argv[1]);
+		exit(3);
+	}
+	if (mdi->array.major_version != -1 ||
+	    mdi->array.minor_version != -2) {
+		fprintf(stderr, "mdmon: %s does not use external metadata - cannot monitor\n",
+			argv[1]);
+		exit(3);
+	}
+
+	container->ss = find_metadata_methods(mdi->text_version);
+	if (container->ss == NULL) {
+		fprintf(stderr, "mdmon: %s uses unknown metadata: %s\n",
+			argv[1], mdi->text_version);
+		exit(3);
+	}
+
+	/* Keep a private copy of each device record: the originals
+	 * belong to mdi and are released by sysfs_free() below.
+	 * (The old code overwrote the fresh allocation with 'di'
+	 * itself, leaking it, cutting the walk short and leaving
+	 * container->devs pointing at freed memory.)
+	 */
+	container->devs = NULL;
+	for (di = mdi->devs; di; di = di->next) {
+		struct mdinfo *cd = malloc(sizeof(*cd));
+
+		if (!cd) {
+			fprintf(stderr, "mdmon: out of memory\n");
+			exit(3);
+		}
+		*cd = *di;
+		cd->devs = NULL;
+		cd->next = container->devs;
+		container->devs = cd;
+	}
+	sysfs_free(mdi);
+
+
+	if (container->ss->load_super(container, mdfd, argv[1])) {
+		fprintf(stderr, "mdmon: Cannot load metadata for %s\n",
+			argv[1]);
+		exit(3);
+	}
+
+
+	mlockall(MCL_FUTURE);
+
+	pipefd = clone_monitor(container);
+	if (pipefd < 0) {
+		fprintf(stderr, "md-manage: failed to start monitor process: %s\n",
+			strerror(errno));
+		exit(2);
+	}
+
+	do_manager(container);
+
+	exit(0);
+}
diff --git a/mdmon.h b/mdmon.h
new file mode 100644
index 0000000..497bbec
--- /dev/null
+++ b/mdmon.h
@@ -0,0 +1,41 @@
+
+/* Values of the md 'array_state' sysfs attribute.  monitor.c converts
+ * the text to this enum by its position in the array_states[] table, so
+ * the order here must match that table; bad_word is the value produced
+ * when the text matches no entry.
+ */
+enum array_state { clear, inactive, suspended, readonly, read_auto,
+ clean, active, write_pending, active_idle, bad_word};
+
+/* Values of the md 'sync_action' sysfs attribute, in the same order as
+ * the sync_actions[] table in monitor.c; bad_action means "no match".
+ */
+enum sync_action { idle, reshape, resync, recover, check, repair, bad_action };
+
+
+/* One member array being monitored within a container. */
+struct active_array {
+ /* array geometry (info.array), the per-slot device list (info.devs)
+ * and the fd of the 'array_state' attribute (info.state_fd) */
+ struct mdinfo info;
+ /* owning container; set to NULL to mark the array as ignored */
+ struct supertype *container;
+ /* next: link in container->arrays.
+ * replaces: older entry this one supersedes, cleared by the monitor
+ * once it has unlinked that entry */
+ struct active_array *next, *replaces;
+
+ int action_fd; /* fd of the 'sync_action' attribute */
+ int sync_pos_fd; /* fd of the 'sync_completed' attribute */
+
+ /* prev_: value at the previous pass, curr_: value just read,
+ * next_: value to be written back (bad_word/bad_action = none) */
+ enum array_state prev_state, curr_state, next_state;
+ enum sync_action prev_action, curr_action, next_action;
+
+ int devnum; /* md device number, as reported by mdstat */
+
+ /* last position parsed by get_sync_pos(); all-ones means "max" */
+ unsigned long long sync_pos;
+};
+
+
+
+#define MD_MAJOR 9
+
+extern struct active_array *container;
+extern struct active_array *array_list;
+extern struct active_array *discard_this;
+extern struct active_array *pending_discard;
+
+
+void do_monitor(struct supertype *container);
+void do_manager(struct supertype *container);
+
+int read_dev_state(int fd);
+
+struct mdstat_ent *mdstat_read(int hold, int start);
+
+extern struct superswitch super_ddf, super_ddf_bvd, super_ddf_svd;
diff --git a/mdstat.c b/mdstat.c
index a8f7ce7..c75260e 100644
--- a/mdstat.c
+++ b/mdstat.c
@@ -86,6 +86,7 @@
#include "mdadm.h"
#include "dlink.h"
#include <sys/select.h>
+#include <ctype.h>
void free_mdstat(struct mdstat_ent *ms)
{
@@ -158,6 +159,10 @@ struct mdstat_ent *mdstat_read(int hold, int start)
ent->percent = -1;
ent->active = -1;
ent->resync = 0;
+ ent->metadata_version = NULL;
+ ent->raid_disks = 0;
+ ent->chunk_size = 0;
+ ent->devcnt = 0;
ent->dev = strdup(line);
ent->devnum = devnum;
@@ -176,22 +181,32 @@ struct mdstat_ent *mdstat_read(int hold, int start)
in_devs = 1;
} else if (in_devs && strcmp(w, "blocks")==0)
in_devs = 0;
- else if (in_devs && strncmp(w, "md", 2)==0) {
- /* This has an md device as a component.
- * If that device is already in the list,
- * make sure we insert before there.
- */
- struct mdstat_ent **ih;
- int dn2;
- if (strncmp(w, "md_d", 4)==0)
- dn2 = -1-strtoul(w+4, &ep, 10);
- else
- dn2 = strtoul(w+2, &ep, 10);
- ih = &all;
- while (ih != insert_here && *ih &&
- (*ih)->devnum != dn2)
- ih = & (*ih)->next;
- insert_here = ih;
+ else if (in_devs) {
+ ent->devcnt++;
+ if (strncmp(w, "md", 2)==0) {
+ /* This has an md device as a component.
+ * If that device is already in the
+ * list, make sure we insert before
+ * there.
+ */
+ struct mdstat_ent **ih;
+ int dn2;
+ if (strncmp(w, "md_d", 4)==0)
+ dn2 = -1-strtoul(w+4, &ep, 10);
+ else
+ dn2 = strtoul(w+2, &ep, 10);
+ ih = &all;
+ while (ih != insert_here && *ih &&
+ (*ih)->devnum != dn2)
+ ih = & (*ih)->next;
+ insert_here = ih;
+ }
+ } else if (strcmp(w, "super") == 0 &&
+ dl_next(w) != line) {
+ w = dl_next(w);
+ ent->metadata_version = strdup(w);
+ } else if (w[0] == '[' && isdigit(w[1])) {
+ ent->raid_disks = atoi(w+1);
} else if (!ent->pattern &&
w[0] == '[' &&
(w[1] == 'U' || w[1] == '_')) {
@@ -256,6 +271,19 @@ void mdstat_wait(int seconds)
select(mdstat_fd >2 ? mdstat_fd+1:3, NULL, NULL, &fds, &tm);
}
+void mdstat_wait_fd(int fd)
+{
+	/* Block, with no timeout, until either /proc/mdstat signals an
+	 * exceptional condition (if it is open) or 'fd' becomes
+	 * readable.
+	 */
+	fd_set fds, rfds;
+	int maxfd = -1;
+
+	FD_ZERO(&fds);
+	FD_ZERO(&rfds);
+	if (mdstat_fd >= 0) {
+		FD_SET(mdstat_fd, &fds);
+		maxfd = mdstat_fd;
+	}
+	if (fd >= 0) {
+		FD_SET(fd, &rfds);
+		if (fd > maxfd)
+			maxfd = fd;
+	}
+
+	/* nfds must cover BOTH descriptors; it used to be derived from
+	 * mdstat_fd alone, so a higher-numbered 'fd' was never watched.
+	 */
+	select(maxfd + 1, &rfds, NULL, &fds, NULL);
+}
+
int mddev_busy(int devnum)
{
struct mdstat_ent *mdstat = mdstat_read(0, 0);
diff --git a/monitor.c b/monitor.c
new file mode 100644
index 0000000..38725d1
--- /dev/null
+++ b/monitor.c
@@ -0,0 +1,372 @@
+
+#include "mdadm.h"
+#include "mdmon.h"
+
+#include <sys/select.h>
+
+
+/* Text of each 'array_state' value, indexed by enum array_state.
+ * match_word() returns the index of the terminating NULL when nothing
+ * matches, which corresponds to bad_word.
+ */
+static char *array_states[] = {
+ "clear", "inactive", "suspended", "readonly", "read-auto",
+ "clean", "active", "write-pending", "active-idle", NULL };
+/* Text of each 'sync_action' value, indexed by enum sync_action;
+ * the NULL slot corresponds to bad_action.
+ */
+static char *sync_actions[] = {
+ "idle", "reshape", "resync", "recover", "check", "repair", NULL
+};
+
+static int write_attr(char *attr, int fd)
+{
+	/* Write the string 'attr' (without its terminator) to fd and
+	 * pass back whatever write() returned.
+	 */
+	size_t len = strlen(attr);
+
+	return write(fd, attr, len);
+}
+
+static void add_fd(fd_set *fds, int *maxfd, int fd)
+{
+	/* Add fd to the set and raise *maxfd if needed.
+	 * Negative descriptors ("not open") are silently skipped.
+	 */
+	if (fd >= 0) {
+		FD_SET(fd, fds);
+		if (*maxfd < fd)
+			*maxfd = fd;
+	}
+}
+
+static int read_attr(char *buf, int len, int fd)
+{
+	/* Read a sysfs-style attribute from the start of fd into buf
+	 * (at most len-1 bytes), NUL-terminate it and strip one
+	 * trailing newline.
+	 * Returns the number of bytes read, or 0 with an empty string
+	 * if fd is not open or nothing could be read.
+	 */
+	int got = 0;
+
+	if (fd >= 0) {
+		lseek(fd, 0, 0);
+		got = read(fd, buf, len - 1);
+	}
+	if (got <= 0) {
+		buf[0] = 0;
+		return 0;
+	}
+
+	buf[got] = 0;
+	if (buf[got-1] == '\n')
+		buf[got-1] = 0;
+	return got;
+}
+
+static int get_sync_pos(struct active_array *a)
+{
+	/* Refresh a->sync_pos from the array's sync position file.
+	 * The text "max" is stored as all-ones; anything else is
+	 * parsed as a decimal number.
+	 * Returns 1 when a->sync_pos was updated, otherwise the
+	 * (non-positive) result of read_attr.
+	 */
+	char text[30];
+	int got = read_attr(text, 30, a->sync_pos_fd);
+
+	if (got <= 0)
+		return got;
+
+	if (strncmp(text, "max", 3) != 0)
+		a->sync_pos = strtoull(text, NULL, 10);
+	else
+		a->sync_pos = ~(unsigned long long)0;
+	return 1;
+}
+
+
+static int attr_match(const char *attr, const char *str)
+{
+	/* Does attr, as read from a sysfs file, spell the word str?
+	 * All of str must be matched, and attr must then either end
+	 * or continue with a comma or a newline.
+	 * Returns 1 on a match, 0 otherwise.
+	 */
+	for (; *attr && *str && *attr == *str; attr++, str++)
+		;
+
+	if (*str)
+		return 0;	/* str not fully consumed */
+	if (*attr == '\0' || *attr == ',' || *attr == '\n')
+		return 1;
+	return 0;
+}
+
+static int match_word(const char *word, char **list)
+{
+	/* Index of the first entry of the NULL-terminated list that
+	 * 'word' matches (see attr_match); when none matches, the
+	 * index of the terminating NULL.
+	 */
+	int idx = 0;
+
+	while (list[idx] && !attr_match(word, list[idx]))
+		idx++;
+	return idx;
+}
+
+static enum array_state read_state(int fd)
+{
+	/* Read an array state file and translate its text to the
+	 * enum; bad_word if the file could not be read.
+	 */
+	char text[20];
+
+	if (read_attr(text, 20, fd) <= 0)
+		return bad_word;
+	return (enum array_state) match_word(text, array_states);
+}
+
+static enum sync_action read_action( int fd)
+{
+	/* Read a sync action file and translate its text to the
+	 * enum; bad_action if the file could not be read.
+	 */
+	char text[20];
+
+	if (read_attr(text, 20, fd) <= 0)
+		return bad_action;
+	return (enum sync_action) match_word(text, sync_actions);
+}
+
+/* Bit flags describing one member device.  The first four are what
+ * read_dev_state() reports from the device's state file; DS_REMOVE is
+ * only ever used as a requested next_state ("write 'remove'").
+ */
+#define DS_FAULTY 1
+#define DS_INSYNC 2
+#define DS_WRITE_MOSTLY 4
+#define DS_SPARE 8
+#define DS_REMOVE 1024
+
+int read_dev_state(int fd)
+{
+	/* Read a device 'state' file, which holds a comma separated
+	 * list of flag words, and return the matching DS_* bits
+	 * (DS_FAULTY, DS_INSYNC, DS_WRITE_MOSTLY, DS_SPARE) or'ed
+	 * together.  Returns 0 if the file cannot be read.
+	 */
+	char buf[60];
+	int n = read_attr(buf, 60, fd);
+	char *cp;
+	int rv = 0;
+
+	if (n <= 0)
+		return 0;
+
+	cp = buf;
+	while (cp) {
+		/* attr_match(attr, word): the text read from sysfs goes
+		 * FIRST.  With the arguments the other way round, a word
+		 * followed by ",<more>" never matched, so only the last
+		 * flag in the list was ever seen.
+		 */
+		if (attr_match(cp, "faulty"))
+			rv |= DS_FAULTY;
+		if (attr_match(cp, "in_sync"))
+			rv |= DS_INSYNC;
+		if (attr_match(cp, "write_mostly"))
+			rv |= DS_WRITE_MOSTLY;
+		if (attr_match(cp, "spare"))
+			rv |= DS_SPARE;
+		/* step to the word after the next comma, if any */
+		cp = strchr(cp, ',');
+		if (cp)
+			cp++;
+	}
+	return rv;
+}
+
+
+/* Monitor a set of active md arrays - all of which share the
+ * same metadata - and respond to events that require
+ * metadata update.
+ *
+ * New arrays are detected by another thread which allocates
+ * required memory and attaches the data structure to our list.
+ *
+ * Events:
+ * Array stops.
+ * This is detected by array_state going to 'clear' or 'inactive'.
+ * while we thought it was active.
+ * Response is to mark metadata as clean and 'clear' the array(??)
+ * write-pending
+ * array_state is 'write-pending'
+ * We mark metadata as 'dirty' then set array to 'active'.
+ * active_idle
+ * Either ignore, or mark clean, then mark metadata as clean.
+ *
+ * device fails
+ * detected by rd-N/state reporting "faulty"
+ * mark device as 'failed' in metadata, then remove the device
+ * by writing 'remove' to rd-N/state.
+ *
+ * sync completes
+ * sync_action was 'resync' and becomes 'idle' and resync_start becomes
+ * MaxSector
+ * Notify metadata that sync is complete.
+ * "Deal with Degraded"
+ *
+ * recovery completes
+ * sync_action changes from 'recover' to 'idle'
+ * Check each device state and mark metadata if 'faulty' or 'in_sync'.
+ * "Deal with Degraded"
+ *
+ * deal with degraded array
+ * We only do this when first noticing the array is degraded.
+ * This can be when we first see the array, when sync completes or
+ * when recovery completes.
+ *
+ * Check if number of failed devices suggests recovery is needed, and
+ * skip if not.
+ * Ask metadata for a spare device
+ * Add device as not in_sync and give a role
+ * Update metadata.
+ * Start recovery.
+ *
+ * deal with resync
+ * This only happens on finding a new array....
+ * Maybe this is done by mdadm before passing the array to us?
+ *
+ * If array is 'clean' but metadata is 'dirty', start a resync
+ * and mark array as 'dirty'.
+ *
+ *
+ *
+ *
+ * We wait for a change (poll/select) on array_state, sync_action, and
+ * each rd-X/state file.
+ * When we get any change, we check everything. So read each state file,
+ * then decide what to do.
+ *
+ * The core action is to write new metadata to all devices in the array.
+ * This is done at most once on any wakeup.
+ * After that we might:
+ * - update the array_state
+ * - set the role of some devices.
+ * - request a sync_action
+ *
+ */
+
+static int read_and_act(struct active_array *a)
+{
+	/* One monitoring pass over a single array:
+	 *  1. read the current array state, sync action and the state
+	 *     of every device
+	 *  2. tell the metadata handler about anything noteworthy and
+	 *     decide what, if anything, must be written back
+	 *  3. ask the handler to sync the metadata
+	 *  4. write the requested state changes to sysfs
+	 *  5. remember the current values as "previous" for next time
+	 *
+	 * Returns 1 when the array was processed, 0 when it was skipped
+	 * because it is marked to be ignored (->container == NULL).
+	 */
+	int check_degraded = 0;	/* was read without ever being set */
+	struct mdinfo *mdi;
+
+	/* manage_new() marks arrays it could not set up by clearing
+	 * ->container; there are no handler methods to call for those.
+	 */
+	if (!a->container)
+		return 0;
+
+	a->next_state = bad_word;
+	a->next_action = bad_action;
+
+	a->curr_state = read_state(a->info.state_fd);
+	a->curr_action = read_action(a->action_fd);
+	for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+		mdi->next_state = 0;
+		mdi->curr_state = read_dev_state(mdi->state_fd);
+	}
+
+	if (a->curr_state <= inactive &&
+	    a->prev_state > inactive) {
+		/* array has been stopped */
+		get_sync_pos(a);
+		a->container->ss->mark_clean(a, a->sync_pos);
+		a->next_state = clear;
+	}
+	if (a->curr_state == write_pending) {
+		a->container->ss->mark_dirty(a);
+		a->next_state = active;
+	}
+	if (a->curr_state == active_idle) {
+		/* Set array to 'clean' FIRST, then
+		 * a->ss->mark_clean(a);
+		 * just ignore for now.
+		 */
+	}
+
+	if (a->curr_state == readonly) {
+		/* Well, I'm ready to handle things, so
+		 * read-auto is OK. FIXME what if we really want
+		 * readonly ???
+		 */
+		a->next_state = read_auto;
+	}
+
+	if (a->curr_action == idle &&
+	    a->prev_action == resync) {
+		/* check resync_start to see if it is 'max'.
+		 * Do I open here, or have it open the whole time?
+		 */
+		get_sync_pos(a);
+		check_degraded = 1;
+	}
+
+	if (a->curr_action == idle &&
+	    a->prev_action == recover) {
+		/* recovery finished: report every slot to the handler */
+		for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
+			a->container->ss->set_disk(a, mdi->disk.raid_disk);
+			if (! (mdi->curr_state & DS_INSYNC))
+				check_degraded = 1;
+		}
+	}
+
+
+	/* failed devices are reported, then scheduled for removal */
+	for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
+		if (mdi->curr_state & DS_FAULTY) {
+			a->container->ss->set_disk(a, mdi->disk.raid_disk);
+			check_degraded = 1;
+			mdi->next_state = DS_REMOVE;
+		}
+	}
+
+	if (check_degraded) {
+		// FIXME;
+	}
+
+	a->container->ss->sync_metadata(a);
+
+	/* Effect state changes in the array */
+	if (a->next_state != bad_word)
+		write_attr(array_states[a->next_state], a->info.state_fd);
+	if (a->next_action != bad_action)
+		write_attr(sync_actions[a->next_action], a->action_fd);
+	for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+		if (mdi->next_state == DS_REMOVE)
+			write_attr("remove", mdi->state_fd);
+		if (mdi->next_state & DS_INSYNC)
+			write_attr("+in_sync", mdi->state_fd);
+	}
+
+	/* move curr_ to prev_ */
+	a->prev_state = a->curr_state;
+
+	a->prev_action = a->curr_action;
+
+	for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+		mdi->prev_state = mdi->curr_state;
+		mdi->next_state = 0;
+	}
+
+	return 1;
+}
+
+static int wait_and_act(struct active_array *aa, int pfd, int nowait)
+{
+	/* Wait for activity on the wakeup pipe or on any watched sysfs
+	 * file of the arrays in 'aa', then run read_and_act() on every
+	 * array.  With 'nowait' set the wait is skipped and the arrays
+	 * are examined immediately.
+	 *
+	 * While walking the list, an array carrying ->replaces has the
+	 * entry it supersedes unlinked and handed to the manager through
+	 * 'discard_this'.
+	 *
+	 * Returns select()'s result if that was <= 0, otherwise a
+	 * non-negative count (select's result plus one per array
+	 * processed).
+	 */
+	fd_set rfds;
+	int maxfd = 0;
+	struct active_array *a;
+	int rv = 0;	/* was uninitialised on the nowait path */
+
+	FD_ZERO(&rfds);
+
+	add_fd(&rfds, &maxfd, pfd);
+	for (a = aa ; a ; a = a->next) {
+		struct mdinfo *mdi;
+
+		/* arrays marked as ignored may have no usable fds */
+		if (!a->container)
+			continue;
+
+		add_fd(&rfds, &maxfd, a->info.state_fd);
+		add_fd(&rfds, &maxfd, a->action_fd);
+		for (mdi = a->info.devs ; mdi ; mdi = mdi->next)
+			add_fd(&rfds, &maxfd, mdi->state_fd);
+	}
+
+	if (!nowait) {
+		rv = select(maxfd+1, &rfds, NULL, NULL, NULL);
+
+		if (rv <= 0)
+			return rv;
+
+		if (FD_ISSET(pfd, &rfds)) {
+			/* drain the manager's wakeup message */
+			char buf[4];
+			read(pfd, buf, 4);
+			; // FIXME read from the pipe
+		}
+	}
+
+	for (a = aa; a ; a = a->next) {
+		if (a->replaces) {
+			/* unlink the superseded entry from later in the list */
+			struct active_array **ap;
+			for (ap = &a->next; *ap && *ap != a->replaces;
+			     ap = & (*ap)->next)
+				;
+			if (*ap)
+				*ap = (*ap)->next;
+			discard_this = a->replaces;
+			a->replaces = NULL;
+		}
+		/* nothing to monitor on an ignored array */
+		if (a->container)
+			rv += read_and_act(a);
+	}
+	return rv;
+}
+
+void do_monitor(struct supertype *container)
+{
+	/* Monitor thread main loop.  The very first pass examines the
+	 * arrays without waiting; later passes block in wait_and_act.
+	 * The loop ends once wait_and_act reports a negative result.
+	 */
+	int nowait = 1;
+
+	for (;;) {
+		int rv = wait_and_act(container->arrays,
+				      container->pipe[0], nowait);
+
+		nowait = 0;
+		if (rv < 0)
+			break;
+	}
+}
diff --git a/super-ddf.c b/super-ddf.c
index 1031e22..c11fa1c 100644
--- a/super-ddf.c
+++ b/super-ddf.c
@@ -27,6 +27,7 @@
#define HAVE_STDINT_H 1
#include "mdadm.h"
+#include "mdmon.h"
#include "sha1.h"
#include <values.h>
@@ -416,7 +417,7 @@ struct ddf_super {
#define offsetof(t,f) ((size_t)&(((t*)0)->f))
#endif
-extern struct superswitch super_ddf_container, super_ddf_bvd;
+extern struct superswitch super_ddf_container, super_ddf_bvd, super_ddf;
static int calc_crc(void *buf, int len)
{
@@ -2442,6 +2443,32 @@ static int compare_super_ddf(struct supertype *st, struct supertype *tst)
return 0;
}
+static int ddf_open_new(struct supertype *c, struct active_array *a, int inst)
+{
+	/* Placeholder mdmon hook: logs the instance number and
+	 * reports success without inspecting the container or array.
+	 */
+	(void)c;
+	(void)a;
+	fprintf(stderr, "ddf: open_new %d\n", inst);
+	return 0;
+}
+
+static void ddf_mark_clean(struct active_array *a, unsigned long long sync_pos)
+{
+	/* Placeholder mdmon hook: only logs the sync position. */
+	(void)a;
+	fprintf(stderr, "ddf: mark clean %llu\n", sync_pos);
+}
+
+static void ddf_mark_dirty(struct active_array *a)
+{
+	/* Placeholder mdmon hook: only logs that it was called. */
+	(void)a;
+	fprintf(stderr, "ddf: mark dirty\n");
+}
+
+static void ddf_set_disk(struct active_array *a, int n)
+{
+	/* Placeholder mdmon hook: only logs the slot number. */
+	(void)a;
+	fprintf(stderr, "ddf: set_disk %d\n", n);
+}
+
+static void ddf_sync_metadata(struct active_array *a)
+{
+	/* Placeholder mdmon hook: only logs that it was called. */
+	(void)a;
+	fprintf(stderr, "ddf: sync_metadata\n");
+}
+
struct superswitch super_ddf = {
#ifndef MDASSEMBLE
.examine_super = examine_super_ddf,
@@ -2471,6 +2498,16 @@ struct superswitch super_ddf = {
.swapuuid = 0,
.external = 1,
.text_version = "ddf",
+
+/* for mdmon */
+ .open_new = ddf_open_new,
+ .load_super = load_super_ddf,
+ .mark_clean = ddf_mark_clean,
+ .mark_dirty = ddf_mark_dirty,
+ .set_disk = ddf_set_disk,
+ .sync_metadata = ddf_sync_metadata,
+
+
};
/* Super_ddf_container is set by validate_geometry_ddf when given a
diff --git a/sysfs.c b/sysfs.c
index f0e9512..34840f7 100644
--- a/sysfs.c
+++ b/sysfs.c
@@ -56,6 +56,29 @@ void sysfs_free(struct mdinfo *sra)
}
}
+int sysfs_open(int devnum, char *devname, char *attr)
+{
+	/* Open /sys/block/<md>/md/[<devname>/]<attr>.
+	 * <md> is "mdN" for devnum >= 0 and "md_dN" (N = -1-devnum)
+	 * otherwise; devname may be NULL for an array-level attribute.
+	 * The file is opened read-write, falling back to read-only
+	 * when write access is refused.
+	 *
+	 * Returns the descriptor, or -1 on failure (including a path
+	 * too long for the local buffer).
+	 */
+	char fname[128];
+	char sys_name[16];
+	int len;
+	int fd;
+
+	if (devnum >= 0)
+		snprintf(sys_name, sizeof(sys_name), "md%d", devnum);
+	else
+		snprintf(sys_name, sizeof(sys_name), "md_d%d",
+			 -1-devnum);
+
+	if (devname)
+		len = snprintf(fname, sizeof(fname), "/sys/block/%s/md/%s/%s",
+			       sys_name, devname, attr);
+	else
+		len = snprintf(fname, sizeof(fname), "/sys/block/%s/md/%s",
+			       sys_name, attr);
+	if (len < 0 || (size_t)len >= sizeof(fname))
+		return -1;
+
+	fd = open(fname, O_RDWR);
+	/* errno values are positive; comparing with -EACCES meant the
+	 * read-only fallback could never be taken
+	 */
+	if (fd < 0 && errno == EACCES)
+		fd = open(fname, O_RDONLY);
+	return fd;
+}
+
struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
{
/* Longest possible name in sysfs, mounted at /sys, is
@@ -128,6 +151,12 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
goto abort;
sra->array.layout = strtoul(buf, NULL, 0);
}
+ if (options & GET_DISKS) {
+ strcpy(base, "raid_disks");
+ if (load_sys(fname, buf))
+ goto abort;
+ sra->array.raid_disks = strtoul(buf, NULL, 0);
+ }
if (options & GET_COMPONENT) {
strcpy(base, "component_size");
if (load_sys(fname, buf))