handle disk failures

From: Dan Williams <dan.j.williams@intel.com>

Added curr_state as a parameter to set_disk. Handlers look at this to record component failures, and set global 'degraded' or 'failed' status.

When reading the state as faulty:
1/ mark the disk failed in the metadata
2/ write '-blocked' to the rdev state to allow the kernel's failure mechanism to advance
3/ the kernel will take away the drive's role in remove_and_add_spares()
4/ once the disk no longer has a role, writing 'remove' to the rdev state will get the disk out of the array.

There is a window after writing '-blocked' where the kernel will return -EBUSY to remove requests. We rely on the fact that the disk will continue to show faulty, so we lazily wait until the kernel is ready to remove the disk. If the manager thread needs to get the disk out of the way it can ping the monitor and wait, just like the replace_array() case.

[buglet fix: swap the parameters of attr_match in read_dev_state]

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
author: Dan Williams <dan.j.williams@intel.com> 2008-05-15 16:48:49 +1000
committer: Neil Brown <neilb@suse.de> 2008-05-15 16:48:49 +1000
commit: 8d45d1969bc299040201df82c51f7fbbc985c401 (patch)
tree: aaa4a6f63d3dd1682388d48f293b892b0d6c0567 /monitor.c
parent: c2a1e7dad7a2ca88bd6667f8ad0853a62fe8f874 (diff)
download: mdadm-8d45d1969bc299040201df82c51f7fbbc985c401.tar.gz
mdadm-8d45d1969bc299040201df82c51f7fbbc985c401.tar.xz
mdadm-8d45d1969bc299040201df82c51f7fbbc985c401.zip
1 files changed, 29 insertions, 17 deletions
diff --git a/monitor.c b/monitor.c
index 9e98aeb..98d0219 100644
--- a/monitor.c
+++ b/monitor.c
@@ -123,12 +123,6 @@ static enum sync_action read_action( int fd)
 	return (enum sync_action) match_word(buf, sync_actions);
 }
 
-#define DS_FAULTY	1
-#define	DS_INSYNC	2
-#define	DS_WRITE_MOSTLY	4
-#define	DS_SPARE	8
-#define	DS_REMOVE	1024
-
 int read_dev_state(int fd)
 {
 	char buf[60];
@@ -141,14 +135,16 @@ int read_dev_state(int fd)
 
 	cp = buf;
 	while (cp) {
-		if (attr_match("faulty", cp))
+		if (attr_match(cp, "faulty"))
 			rv |= DS_FAULTY;
-		if (attr_match("in_sync", cp))
+		if (attr_match(cp, "in_sync"))
 			rv |= DS_INSYNC;
-		if (attr_match("write_mostly", cp))
+		if (attr_match(cp, "write_mostly"))
 			rv |= DS_WRITE_MOSTLY;
-		if (attr_match("spare", cp))
+		if (attr_match(cp, "spare"))
 			rv |= DS_SPARE;
+		if (attr_match(cp, "blocked"))
+			rv |= DS_BLOCKED;
 		cp = strchr(cp, ',');
 		if (cp)
 			cp++;
@@ -177,8 +173,9 @@ int read_dev_state(int fd)
  *
  *  device fails
  *    detected by rd-N/state reporting "faulty"
- *    mark device as 'failed' in metadata, the remove device
- *    by writing 'remove' to rd/state.
+ *    mark device as 'failed' in metadata, let the kernel release the
+ *    device by writing '-blocked' to rd/state, and finally write 'remove' to
+ *    rd/state
  *
  *  sync completes
  *    sync_action was 'resync' and becomes 'idle' and resync_start becomes
@@ -238,7 +235,8 @@ static int read_and_act(struct active_array *a)
 	a->curr_action = read_action(a->action_fd);
 	for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
 		mdi->next_state = 0;
-		mdi->curr_state = read_dev_state(mdi->state_fd);
+		if (mdi->state_fd > 0)
+			mdi->curr_state = read_dev_state(mdi->state_fd);
 	}
 
 	if (a->curr_state <= inactive &&
@@ -285,7 +283,8 @@ static int read_and_act(struct active_array *a)
 	if (a->curr_action == idle &&
 	    a->prev_action == recover) {
 		for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
-			a->container->ss->set_disk(a, mdi->disk.raid_disk);
+			a->container->ss->set_disk(a, mdi->disk.raid_disk,
+						   mdi->curr_state);
 			if (! (mdi->curr_state & DS_INSYNC))
 				check_degraded = 1;
 		}
@@ -294,7 +293,8 @@ static int read_and_act(struct active_array *a)
 
 	for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
 		if (mdi->curr_state & DS_FAULTY) {
-			a->container->ss->set_disk(a, mdi->disk.raid_disk);
+			a->container->ss->set_disk(a, mdi->disk.raid_disk,
+						   mdi->curr_state);
 			check_degraded = 1;
 			mdi->next_state = DS_REMOVE;
 		}
@@ -312,8 +312,20 @@ static int read_and_act(struct active_array *a)
 	if (a->next_action != bad_action)
 		write_attr(sync_actions[a->next_action], a->action_fd);
 	for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
-		if (mdi->next_state == DS_REMOVE)
-			write_attr("remove", mdi->state_fd);
+		if (mdi->next_state == DS_REMOVE && mdi->state_fd > 0) {
+			int remove_err;
+
+			write_attr("-blocked", mdi->state_fd);
+			/* the kernel may not be able to immediately remove the
+			 * disk, we can simply wait until the next event to try
+			 * again.
+			 */
+			remove_err = write_attr("remove", mdi->state_fd);
+			if (!remove_err) {
+				close(mdi->state_fd);
+				mdi->state_fd = -1;
+			}
+		}
 		if (mdi->next_state & DS_INSYNC)
 			write_attr("+in_sync", mdi->state_fd);
 	}
author	Dan Williams <dan.j.williams@intel.com>	2008-05-15 16:48:49 +1000
committer	Neil Brown <neilb@suse.de>	2008-05-15 16:48:49 +1000
commit	8d45d1969bc299040201df82c51f7fbbc985c401 (patch)
tree	aaa4a6f63d3dd1682388d48f293b892b0d6c0567 /monitor.c
parent	c2a1e7dad7a2ca88bd6667f8ad0853a62fe8f874 (diff)
download	mdadm-8d45d1969bc299040201df82c51f7fbbc985c401.tar.gz mdadm-8d45d1969bc299040201df82c51f7fbbc985c401.tar.xz mdadm-8d45d1969bc299040201df82c51f7fbbc985c401.zip