/* * sysfs - extract md related information from sysfs. Part of: * mdadm - manage Linux "md" devices aka RAID arrays. * * Copyright (C) 2006-2009 Neil Brown * * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Neil Brown * Email: */ #include "mdadm.h" #include #include int load_sys(char *path, char *buf) { int fd = open(path, O_RDONLY); int n; if (fd < 0) return -1; n = read(fd, buf, 1024); close(fd); if (n <0 || n >= 1024) return -1; buf[n] = 0; if (n && buf[n-1] == '\n') buf[n-1] = 0; return 0; } void sysfs_free(struct mdinfo *sra) { while (sra) { struct mdinfo *sra2 = sra->next; while (sra->devs) { struct mdinfo *d = sra->devs; sra->devs = d->next; free(d); } free(sra); sra = sra2; } } int sysfs_open(int devnum, char *devname, char *attr) { char fname[50]; int fd; char *mdname = devnum2devname(devnum); if (!mdname) return -1; sprintf(fname, "/sys/block/%s/md/", mdname); if (devname) { strcat(fname, devname); strcat(fname, "/"); } strcat(fname, attr); fd = open(fname, O_RDWR); if (fd < 0 && errno == EACCES) fd = open(fname, O_RDONLY); free(mdname); return fd; } void sysfs_init(struct mdinfo *mdi, int fd, int devnum) { mdi->sys_name[0] = 0; if (fd >= 0) { mdu_version_t vers; if (ioctl(fd, RAID_VERSION, &vers) != 0) return; devnum = fd2devnum(fd); } if (devnum == NoMdDev) return; if (devnum >= 0) sprintf(mdi->sys_name, "md%d", devnum); else sprintf(mdi->sys_name, "md_d%d", -1-devnum); } struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) { char fname[PATH_MAX]; char buf[PATH_MAX]; char *base; char *dbase; struct mdinfo *sra; struct mdinfo *dev; DIR *dir = NULL; struct dirent *de; sra = malloc(sizeof(*sra)); if (sra == NULL) return sra; memset(sra, 0, sizeof(*sra)); sysfs_init(sra, fd, devnum); if (sra->sys_name[0] == 0) { free(sra); return NULL; } sprintf(fname, "/sys/block/%s/md/", sra->sys_name); base = fname + strlen(fname); sra->devs = NULL; if (options & GET_VERSION) { strcpy(base, "metadata_version"); if (load_sys(fname, buf)) goto abort; if (strncmp(buf, "none", 4) == 0) { sra->array.major_version = sra->array.minor_version = -1; strcpy(sra->text_version, ""); } else if (strncmp(buf, "external:", 9) == 0) { sra->array.major_version = -1; sra->array.minor_version = -2; strcpy(sra->text_version, buf+9); } else { sscanf(buf, "%d.%d", &sra->array.major_version, &sra->array.minor_version); strcpy(sra->text_version, buf); } } if (options & GET_LEVEL) { strcpy(base, "level"); if (load_sys(fname, buf)) goto abort; sra->array.level = map_name(pers, buf); } if (options & GET_LAYOUT) { strcpy(base, "layout"); if (load_sys(fname, buf)) goto abort; sra->array.layout = strtoul(buf, NULL, 0); } if (options & GET_DISKS) { strcpy(base, "raid_disks"); if (load_sys(fname, buf)) goto abort; sra->array.raid_disks = strtoul(buf, NULL, 0); } if (options & GET_DEGRADED) { strcpy(base, "degraded"); if (load_sys(fname, buf)) goto abort; sra->array.failed_disks = strtoul(buf, NULL, 0); } if (options & GET_COMPONENT) { strcpy(base, "component_size"); if (load_sys(fname, buf)) goto abort; sra->component_size = strtoull(buf, NULL, 0); /* sysfs reports "K", but we want sectors */ sra->component_size *= 2; } if (options & GET_CHUNK) { strcpy(base, "chunk_size"); if (load_sys(fname, buf)) goto abort; sra->array.chunk_size = strtoul(buf, NULL, 0); } if (options & GET_CACHE) { strcpy(base, "stripe_cache_size"); if (load_sys(fname, buf)) goto abort; sra->cache_size = strtoul(buf, NULL, 0); } if (options & GET_MISMATCH) { strcpy(base, "mismatch_cnt"); if (load_sys(fname, buf)) goto abort; sra->mismatch_cnt = strtoul(buf, NULL, 0); } if (options & GET_SAFEMODE) { int scale = 1; int dot = 0; int i; unsigned long msec; size_t len; strcpy(base, "safe_mode_delay"); if (load_sys(fname, buf)) goto abort; /* remove a period, and count digits after it */ len = strlen(buf); for (i = 0; i < len; i++) { if (dot) { if (isdigit(buf[i])) { buf[i-1] = buf[i]; scale *= 10; } buf[i] = 0; } else if (buf[i] == '.') { dot=1; buf[i] = 0; } } msec = strtoul(buf, NULL, 10); msec = (msec * 1000) / scale; sra->safe_mode_delay = msec; } if (! (options & GET_DEVS)) return sra; /* Get all the devices as well */ *base = 0; dir = opendir(fname); if (!dir) goto abort; sra->array.spare_disks = 0; while ((de = readdir(dir)) != NULL) { char *ep; if (de->d_ino == 0 || strncmp(de->d_name, "dev-", 4) != 0) continue; strcpy(base, de->d_name); dbase = base + strlen(base); *dbase++ = '/'; dev = malloc(sizeof(*dev)); if (!dev) goto abort; /* Always get slot, major, minor */ strcpy(dbase, "slot"); if (load_sys(fname, buf)) { /* hmm... unable to read 'slot' maybe the device * is going away? */ strcpy(dbase, "block"); if (readlink(fname, buf, sizeof(buf)) < 0 && errno != ENAMETOOLONG) { /* ...yup device is gone */ free(dev); continue; } else { /* slot is unreadable but 'block' link * still intact... something bad is happening * so abort */ free(dev); goto abort; } } strcpy(dev->sys_name, de->d_name); dev->disk.raid_disk = strtoul(buf, &ep, 10); if (*ep) dev->disk.raid_disk = -1; strcpy(dbase, "block/dev"); if (load_sys(fname, buf)) { free(dev); if (options & SKIP_GONE_DEVS) continue; else goto abort; } sscanf(buf, "%d:%d", &dev->disk.major, &dev->disk.minor); /* special case check for block devices that can go 'offline' */ if (options & SKIP_GONE_DEVS) { strcpy(dbase, "block/device/state"); if (load_sys(fname, buf) == 0 && strncmp(buf, "offline", 7) == 0) { free(dev); continue; } } /* finally add this disk to the array */ dev->next = sra->devs; sra->devs = dev; if (options & GET_OFFSET) { strcpy(dbase, "offset"); if (load_sys(fname, buf)) goto abort; dev->data_offset = strtoull(buf, NULL, 0); } if (options & GET_SIZE) { strcpy(dbase, "size"); if (load_sys(fname, buf)) goto abort; dev->component_size = strtoull(buf, NULL, 0) * 2; } if (options & GET_STATE) { dev->disk.state = 0; strcpy(dbase, "state"); if (load_sys(fname, buf)) goto abort; if (strstr(buf, "in_sync")) dev->disk.state |= (1<disk.state |= (1<disk.state == 0) sra->array.spare_disks++; } if (options & GET_ERROR) { strcpy(buf, "errors"); if (load_sys(fname, buf)) goto abort; dev->errors = strtoul(buf, NULL, 0); } } closedir(dir); return sra; abort: if (dir) closedir(dir); sysfs_free(sra); return NULL; } int sysfs_attr_match(const char *attr, const char *str) { /* See if attr, read from a sysfs file, matches * str. They must either be the same, or attr can * have a trailing newline or comma */ while (*attr && *str && *attr == *str) { attr++; str++; } if (*str || (*attr && *attr != ',' && *attr != '\n')) return 0; return 1; } int sysfs_match_word(const char *word, char **list) { int n; for (n=0; list[n]; n++) if (sysfs_attr_match(word, list[n])) break; return n; } unsigned long long get_component_size(int fd) { /* Find out the component size of the array. * We cannot trust GET_ARRAY_INFO ioctl as it's * size field is only 32bits. * So look in /sys/block/mdXXX/md/component_size * * This returns in units of sectors. */ struct stat stb; char fname[50]; int n; if (fstat(fd, &stb)) return 0; if (major(stb.st_rdev) != get_mdp_major()) sprintf(fname, "/sys/block/md%d/md/component_size", (int)minor(stb.st_rdev)); else sprintf(fname, "/sys/block/md_d%d/md/component_size", (int)minor(stb.st_rdev)>>MdpMinorShift); fd = open(fname, O_RDONLY); if (fd < 0) return 0; n = read(fd, fname, sizeof(fname)); close(fd); if (n == sizeof(fname)) return 0; fname[n] = 0; return strtoull(fname, NULL, 10) * 2; } int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev, char *name, char *val) { char fname[50]; int n; int fd; sprintf(fname, "/sys/block/%s/md/%s/%s", sra->sys_name, dev?dev->sys_name:"", name); fd = open(fname, O_WRONLY); if (fd < 0) return -1; n = write(fd, val, strlen(val)); close(fd); if (n != strlen(val)) { dprintf(Name ": failed to write '%s' to '%s' (%s)\n", val, fname, strerror(errno)); return -1; } return 0; } int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev, char *name, unsigned long long val) { char valstr[50]; sprintf(valstr, "%llu", val); return sysfs_set_str(sra, dev, name, valstr); } int sysfs_uevent(struct mdinfo *sra, char *event) { char fname[50]; int n; int fd; sprintf(fname, "/sys/block/%s/uevent", sra->sys_name); fd = open(fname, O_WRONLY); if (fd < 0) return -1; n = write(fd, event, strlen(event)); close(fd); return 0; } int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev, char *name) { char fname[50]; int fd; sprintf(fname, "/sys/block/%s/md/%s/%s", sra->sys_name, dev?dev->sys_name:"", name); fd = open(fname, O_RDWR); if (fd < 0) fd = open(fname, O_RDONLY); return fd; } int sysfs_fd_get_ll(int fd, unsigned long long *val) { char buf[50]; int n; char *ep; lseek(fd, 0, 0); n = read(fd, buf, sizeof(buf)); if (n <= 0) return -1; buf[n] = 0; *val = strtoull(buf, &ep, 0); if (ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' ')) return -1; return 0; } int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, char *name, unsigned long long *val) { int n; int fd; fd = sysfs_get_fd(sra, dev, name); if (fd < 0) return -1; n = sysfs_fd_get_ll(fd, val); close(fd); return n; } int sysfs_fd_get_str(int fd, char *val, int size) { int n; lseek(fd, 0, 0); n = read(fd, val, size); if (n <= 0) return -1; val[n] = 0; return n; } int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev, char *name, char *val, int size) { int n; int fd; fd = sysfs_get_fd(sra, dev, name); if (fd < 0) return -1; n = sysfs_fd_get_str(fd, val, size); close(fd); return n; } int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms) { unsigned long sec; unsigned long msec; char delay[30]; sec = ms / 1000; msec = ms % 1000; sprintf(delay, "%ld.%03ld\n", sec, msec); /* this '\n' ^ needed for kernels older than 2.6.28 */ return sysfs_set_str(sra, NULL, "safe_mode_delay", delay); } int sysfs_set_array(struct mdinfo *info, int vers) { int rv = 0; char ver[100]; ver[0] = 0; if (info->array.major_version == -1 && info->array.minor_version == -2) { strcat(strcpy(ver, "external:"), info->text_version); if ((vers % 100) < 2 || sysfs_set_str(info, NULL, "metadata_version", ver) < 0) { fprintf(stderr, Name ": This kernel does not " "support external metadata.\n"); return 1; } } if (info->array.level < 0) return 0; /* FIXME */ rv |= sysfs_set_str(info, NULL, "level", map_num(pers, info->array.level)); rv |= sysfs_set_num(info, NULL, "raid_disks", info->array.raid_disks); rv |= sysfs_set_num(info, NULL, "chunk_size", info->array.chunk_size); rv |= sysfs_set_num(info, NULL, "layout", info->array.layout); rv |= sysfs_set_num(info, NULL, "component_size", info->component_size/2); if (info->custom_array_size) { int rc; rc = sysfs_set_num(info, NULL, "array_size", info->custom_array_size/2); if (rc && errno == ENOENT) { fprintf(stderr, Name ": This kernel does not " "have the md/array_size attribute, " "the array may be larger than expected\n"); rc = 0; } rv |= rc; } if (info->array.level > 0) rv |= sysfs_set_num(info, NULL, "resync_start", info->resync_start); return rv; } int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume) { char dv[PATH_MAX]; char nm[PATH_MAX]; char *dname; int rv; sprintf(dv, "%d:%d", sd->disk.major, sd->disk.minor); rv = sysfs_set_str(sra, NULL, "new_dev", dv); if (rv) return rv; memset(nm, 0, sizeof(nm)); sprintf(dv, "/sys/dev/block/%d:%d", sd->disk.major, sd->disk.minor); rv = readlink(dv, nm, sizeof(nm)); if (rv <= 0) return -1; nm[rv] = '\0'; dname = strrchr(nm, '/'); if (dname) dname++; strcpy(sd->sys_name, "dev-"); strcpy(sd->sys_name+4, dname); /* test write to see if 'recovery_start' is available */ if (resume && sd->recovery_start < MaxSector && sysfs_set_num(sra, sd, "recovery_start", 0)) { sysfs_set_str(sra, sd, "state", "remove"); return -1; } rv = sysfs_set_num(sra, sd, "offset", sd->data_offset); rv |= sysfs_set_num(sra, sd, "size", (sd->component_size+1) / 2); if (sra->array.level != LEVEL_CONTAINER) { if (sd->recovery_start == MaxSector) /* This can correctly fail if array isn't started, * yet, so just ignore status for now. */ sysfs_set_str(sra, sd, "state", "insync"); rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk); if (resume) sysfs_set_num(sra, sd, "recovery_start", sd->recovery_start); } return rv; } #if 0 int sysfs_disk_to_sg(int fd) { /* from an open block device, try find and open its corresponding * scsi_generic interface */ struct stat st; char path[256]; char sg_path[256]; char sg_major_minor[8]; char *c; DIR *dir; struct dirent *de; int major, minor, rv; if (fstat(fd, &st)) return -1; snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device", major(st.st_rdev), minor(st.st_rdev)); dir = opendir(path); if (!dir) return -1; de = readdir(dir); while (de) { if (strncmp("scsi_generic:", de->d_name, strlen("scsi_generic:")) == 0) break; de = readdir(dir); } closedir(dir); if (!de) return -1; snprintf(sg_path, sizeof(sg_path), "%s/%s/dev", path, de->d_name); fd = open(sg_path, O_RDONLY); if (fd < 0) return fd; rv = read(fd, sg_major_minor, sizeof(sg_major_minor)); close(fd); if (rv < 0) return -1; else sg_major_minor[rv - 1] = '\0'; c = strchr(sg_major_minor, ':'); *c = '\0'; c++; major = strtol(sg_major_minor, NULL, 10); minor = strtol(c, NULL, 10); snprintf(path, sizeof(path), "/dev/.tmp.md.%d:%d:%d", (int) getpid(), major, minor); if (mknod(path, S_IFCHR|0600, makedev(major, minor))==0) { fd = open(path, O_RDONLY); unlink(path); return fd; } return -1; } #endif int sysfs_disk_to_scsi_id(int fd, __u32 *id) { /* from an open block device, try to retrieve it scsi_id */ struct stat st; char path[256]; char *c1, *c2; DIR *dir; struct dirent *de; if (fstat(fd, &st)) return 1; snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device", major(st.st_rdev), minor(st.st_rdev)); dir = opendir(path); if (!dir) return 1; de = readdir(dir); while (de) { if (strncmp("scsi_disk:", de->d_name, strlen("scsi_disk:")) == 0) break; de = readdir(dir); } closedir(dir); if (!de) return 1; c1 = strchr(de->d_name, ':'); c1++; c2 = strchr(c1, ':'); *c2 = '\0'; *id = strtol(c1, NULL, 10) << 24; /* host */ c1 = c2 + 1; c2 = strchr(c1, ':'); *c2 = '\0'; *id |= strtol(c1, NULL, 10) << 16; /* channel */ c1 = c2 + 1; c2 = strchr(c1, ':'); *c2 = '\0'; *id |= strtol(c1, NULL, 10) << 8; /* lun */ c1 = c2 + 1; *id |= strtol(c1, NULL, 10); /* id */ return 0; } int sysfs_unique_holder(int devnum, long rdev) { /* Check that devnum is a holder of rdev, * and is the only holder. * we should be locked against races by * an O_EXCL on devnum */ DIR *dir; struct dirent *de; char dirname[100]; char l; int found = 0; sprintf(dirname, "/sys/dev/block/%d:%d/holders", major(rdev), minor(rdev)); dir = opendir(dirname); errno = ENOENT; if (!dir) return 0; l = strlen(dirname); while ((de = readdir(dir)) != NULL) { char buf[10]; int n; int mj, mn; char c; int fd; if (de->d_ino == 0) continue; if (de->d_name[0] == '.') continue; strcpy(dirname+l, "/"); strcat(dirname+l, de->d_name); strcat(dirname+l, "/dev"); fd = open(dirname, O_RDONLY); if (fd < 0) { errno = ENOENT; break; } n = read(fd, buf, sizeof(buf)-1); close(fd); buf[n] = 0; if (sscanf(buf, "%d:%d%c", &mj, &mn, &c) != 3 || c != '\n') { errno = ENOENT; break; } if (mj != MD_MAJOR) mn = -1-(mn>>6); if (devnum != mn) { errno = EEXIST; break; } found = 1; } closedir(dir); if (de) return 0; else return found; } #ifndef MDASSEMBLE static char *clean_states[] = { "clear", "inactive", "readonly", "read-auto", "clean", NULL }; int WaitClean(char *dev, int sock, int verbose) { int fd; struct mdinfo *mdi; int rv = 1; int devnum; fd = open(dev, O_RDONLY); if (fd < 0) { if (verbose) fprintf(stderr, Name ": Couldn't open %s: %s\n", dev, strerror(errno)); return 1; } devnum = fd2devnum(fd); mdi = sysfs_read(fd, devnum, GET_VERSION|GET_LEVEL|GET_SAFEMODE); if (!mdi) { if (verbose) fprintf(stderr, Name ": Failed to read sysfs attributes for " "%s\n", dev); close(fd); return 0; } switch(mdi->array.level) { case LEVEL_LINEAR: case LEVEL_MULTIPATH: case 0: /* safemode delay is irrelevant for these levels */ rv = 0; } /* for internal metadata the kernel handles the final clean * transition, containers can never be dirty */ if (!is_subarray(mdi->text_version)) rv = 0; /* safemode disabled ? */ if (mdi->safe_mode_delay == 0) rv = 0; if (rv) { int state_fd = sysfs_open(fd2devnum(fd), NULL, "array_state"); char buf[20]; fd_set fds; struct timeval tm; /* minimize the safe_mode_delay and prepare to wait up to 5s * for writes to quiesce */ sysfs_set_safemode(mdi, 1); tm.tv_sec = 5; tm.tv_usec = 0; /* give mdmon a chance to checkpoint resync */ sysfs_set_str(mdi, NULL, "sync_action", "idle"); FD_ZERO(&fds); /* wait for array_state to be clean */ while (1) { rv = read(state_fd, buf, sizeof(buf)); if (rv < 0) break; if (sysfs_match_word(buf, clean_states) <= 4) break; FD_SET(state_fd, &fds); rv = select(state_fd + 1, NULL, NULL, &fds, &tm); if (rv < 0 && errno != EINTR) break; lseek(state_fd, 0, SEEK_SET); } if (rv < 0) rv = 1; else if (fping_monitor(sock) == 0 || ping_monitor(mdi->text_version) == 0) { /* we need to ping to close the window between array * state transitioning to clean and the metadata being * marked clean */ rv = 0; } else rv = 1; if (rv && verbose) fprintf(stderr, Name ": Error waiting for %s to be clean\n", dev); /* restore the original safe_mode_delay */ sysfs_set_safemode(mdi, mdi->safe_mode_delay); close(state_fd); } sysfs_free(mdi); close(fd); return rv; } #endif /* MDASSEMBLE */