summaryrefslogtreecommitdiffstats
path: root/Monitor.c
diff options
context:
space:
mode:
Diffstat (limited to 'Monitor.c')
-rw-r--r--Monitor.c211
1 files changed, 211 insertions, 0 deletions
diff --git a/Monitor.c b/Monitor.c
new file mode 100644
index 0000000..968e4b3
--- /dev/null
+++ b/Monitor.c
@@ -0,0 +1,211 @@
+/*
+ * mdctl - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001 Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@cse.unsw.edu.au>
+ * Paper: Neil Brown
+ * School of Computer Science and Engineering
+ * The University of New South Wales
+ * Sydney, 2052
+ * Australia
+ */
+
+#include "mdctl.h"
+#include "md_p.h"
+#include "md_u.h"
+#include <sys/signal.h>
+
+static void alert(char *event, char *dev, char *disc, char *mailaddr, char *cmd);
+
+int Monitor(int num_devs, char *devlist[],
+ char *mailaddr, char *alert_cmd,
+ int period,
+ char *config)
+{
+ /*
+ * Every few seconds, scan every md device looking for changes
+ * When a change is found, log it, possibly run the alert command,
+ * and possibly send Email
+ *
+ * For each array, we record:
+ * Update time
+ * active/working/failed/spare drives
+ * State of each device.
+ *
+ * If the update time changes, check out all the data again
+ * It is possible that we cannot get the state of each device
+ * due to bugs in the md kernel module.
+ *
+ * if active_drives decreases, generate a "Fail" event
+ * if active_drives increases, generate a "SpareActive" event
+ *
+ * if we detect an array with active<raid and spare==0
+ * we look at other arrays that have same spare-group
+ * If we find one with active==raid and spare>0,
+ * and if we can get_disk_info and find a name
+ * Then we hot-remove and hot-add to the other array
+ *
+ */
+
+ struct state {
+ char *devname;
+ long utime;
+ int err;
+ int active, working, failed, spare;
+ int devstate[MD_SB_DISKS];
+ struct state *next;
+ } *statelist = NULL;
+ int finished = 0;
+ while (! finished) {
+ mddev_ident_t mdlist = NULL;
+ int dnum=0;
+ if (num_devs == 0)
+ mdlist = conf_get_ident(config, NULL);
+ while (dnum < num_devs || mdlist) {
+ mddev_ident_t mdident;
+ struct state *st;
+ mdu_array_info_t array;
+ char *dev;
+ int fd;
+ char *event = NULL;
+ int i;
+ char *event_disc = NULL;
+ if (num_devs) {
+ dev = devlist[dnum++];
+ mdident = conf_get_ident(config, dev);
+ } else {
+ mdident = mdlist;
+ dev = mdident->devname;
+ mdlist = mdlist->next;
+ }
+ for (st=statelist; st ; st=st->next)
+ if (strcmp(st->devname, dev)==0)
+ break;
+ if (!st) {
+ st =malloc(sizeof *st);
+ if (st == NULL)
+ continue;
+ st->devname = strdup(dev);
+ st->utime = 0;
+ st->next = statelist;
+ st->err = 0;
+ statelist = st;
+ }
+ fd = open(dev, O_RDONLY);
+ if (fd < 0) {
+ if (!st->err)
+ fprintf(stderr, Name ": cannot open %s: %s\n",
+ dev, strerror(errno));
+ st->err=1;
+ continue;
+ }
+ if (ioctl(fd, GET_ARRAY_INFO, &array)<0) {
+ if (!st->err)
+ fprintf(stderr, Name ": cannot get array info for %s: %s\n",
+ dev, strerror(errno));
+ st->err=1;
+ close(fd);
+ continue;
+ }
+ st->err = 0;
+
+ if (st->utime == array.utime &&
+ st->failed == array.failed_disks) {
+ close(fd);
+ continue;
+ }
+ event = NULL;
+ if (st->utime) {
+ int i;
+ if (st->active > array.active_disks)
+ event = "Fail";
+ else if (st->working > array.working_disks)
+ event = "FailSpare";
+ else if (st->active < array.active_disks)
+ event = "ActiveSpare";
+ }
+ for (i=0; i<array.raid_disks+array.spare_disks; i++) {
+ mdu_disk_info_t disc;
+ disc.number = i;
+ if (ioctl(fd, GET_DISK_INFO, &disc)>= 0) {
+ if (event && event_disc == NULL &&
+ st->devstate[i] != disc.state) {
+ char * dv = map_dev(disc.major, disc.minor);
+ if (dv)
+ event_disc = strdup(dv);
+ }
+ st->devstate[i] = disc.state;
+ }
+ }
+ close(fd);
+ st->active = array.active_disks;
+ st->working = array.working_disks;
+ st->spare = array.spare_disks;
+ st->failed = array.failed_disks;
+ st->utime = array.utime;
+ if (event)
+ alert(event, dev, event_disc, mailaddr, alert_cmd);
+ }
+ sleep(period);
+ }
+ return 0;
+}
+
+
+static void alert(char *event, char *dev, char *disc, char *mailaddr, char *cmd)
+{
+ if (cmd) {
+ int pid = fork();
+ switch(pid) {
+ default:
+ waitpid(pid, NULL, 0);
+ break;
+ case -1:
+ break;
+ case 0:
+ execl(cmd, cmd, event, dev, disc, NULL);
+ exit(2);
+ }
+ }
+ if (mailaddr && strncmp(event, "Fail", 4)==0) {
+ FILE *mp = popen(Sendmail, "w");
+ if (mp) {
+ char hname[256];
+ gethostname(hname, sizeof(hname));
+ signal(SIGPIPE, SIG_IGN);
+ fprintf(mp, "From: " Name " monitoring <root>\n");
+ fprintf(mp, "To: %s\n", mailaddr);
+ fprintf(mp, "Subject: %s event on %s:%s\n\n", event, dev, hname);
+
+ fprintf(mp, "This is an automatically generated mail message from " Name "\n");
+ fprintf(mp, "running on %s\n\n", hname);
+
+ fprintf(mp, "A %s event had been detected on md device %s.\n\n", event, dev);
+
+ if (disc)
+ fprintf(mp, "It could be related to sub-device %s.\n\n", disc);
+
+ fprintf(mp, "Faithfully yours, etc.\n");
+ fclose(mp);
+ }
+
+ }
+ /* FIXME log the event to syslog maybe */
+}