From ed198cb49750fd9ec564e9f1df66c10efea605f1 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 22 Apr 2006 02:38:50 +0100 Subject: [RBTREE] Update hrtimers to use rb_parent() accessor macro. Also switch it to use the same method of using off-tree nodes as everyone else now does -- set them to point to themselves. Signed-off-by: David Woodhouse --- kernel/hrtimer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index d2a7296c825..04ab27ddfd9 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -393,7 +393,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) if (base->first == &timer->node) base->first = rb_next(&timer->node); rb_erase(&timer->node, &base->active); - timer->node.rb_parent = HRTIMER_INACTIVE; + rb_set_parent(&timer->node, &timer->node); } /* @@ -578,7 +578,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, clock_id = CLOCK_MONOTONIC; timer->base = &bases[clock_id]; - timer->node.rb_parent = HRTIMER_INACTIVE; + rb_set_parent(&timer->node, &timer->node); } /** -- cgit From 6f18a022fb311f07f3b32f2c0e1b5c9477dc4439 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 8 May 2006 22:40:05 +0100 Subject: Finally remove the obnoxious inter_module_xxx() This was already a bad plan when I argued against adding it in the first place. Good riddance. Signed-off-by: David Woodhouse --- kernel/Makefile | 1 - kernel/intermodule.c | 184 --------------------------------------------------- 2 files changed, 185 deletions(-) delete mode 100644 kernel/intermodule.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 58908f9d156..f6ef00f4f90 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -20,7 +20,6 @@ obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o diff --git a/kernel/intermodule.c b/kernel/intermodule.c deleted file mode 100644 index 55b1e5b85db..00000000000 --- a/kernel/intermodule.c +++ /dev/null @@ -1,184 +0,0 @@ -/* Deprecated, do not use. Moved from module.c to here. --RR */ - -/* Written by Keith Owens Oct 2000 */ -#include -#include -#include -#include -#include - -/* inter_module functions are always available, even when the kernel is - * compiled without modules. Consumers of inter_module_xxx routines - * will always work, even when both are built into the kernel, this - * approach removes lots of #ifdefs in mainline code. - */ - -static struct list_head ime_list = LIST_HEAD_INIT(ime_list); -static DEFINE_SPINLOCK(ime_lock); -static int kmalloc_failed; - -struct inter_module_entry { - struct list_head list; - const char *im_name; - struct module *owner; - const void *userdata; -}; - -/** - * inter_module_register - register a new set of inter module data. - * @im_name: an arbitrary string to identify the data, must be unique - * @owner: module that is registering the data, always use THIS_MODULE - * @userdata: pointer to arbitrary userdata to be registered - * - * Description: Check that the im_name has not already been registered, - * complain if it has. For new data, add it to the inter_module_entry - * list. 
- */ -void inter_module_register(const char *im_name, struct module *owner, const void *userdata) -{ - struct list_head *tmp; - struct inter_module_entry *ime, *ime_new; - - if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) { - /* Overloaded kernel, not fatal */ - printk(KERN_ERR - "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", - im_name); - kmalloc_failed = 1; - return; - } - ime_new->im_name = im_name; - ime_new->owner = owner; - ime_new->userdata = userdata; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - spin_unlock(&ime_lock); - kfree(ime_new); - /* Program logic error, fatal */ - printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name); - BUG(); - } - } - list_add(&(ime_new->list), &ime_list); - spin_unlock(&ime_lock); -} - -/** - * inter_module_unregister - unregister a set of inter module data. - * @im_name: an arbitrary string to identify the data, must be unique - * - * Description: Check that the im_name has been registered, complain if - * it has not. For existing data, remove it from the - * inter_module_entry list. - */ -void inter_module_unregister(const char *im_name) -{ - struct list_head *tmp; - struct inter_module_entry *ime; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - list_del(&(ime->list)); - spin_unlock(&ime_lock); - kfree(ime); - return; - } - } - spin_unlock(&ime_lock); - if (kmalloc_failed) { - printk(KERN_ERR - "inter_module_unregister: no entry for '%s', " - "probably caused by previous kmalloc failure\n", - im_name); - return; - } - else { - /* Program logic error, fatal */ - printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name); - BUG(); - } -} - -/** - * inter_module_get - return arbitrary userdata from another module. - * @im_name: an arbitrary string to identify the data, must be unique - * - * Description: If the im_name has not been registered, return NULL. - * Try to increment the use count on the owning module, if that fails - * then return NULL. Otherwise return the userdata. - */ -static const void *inter_module_get(const char *im_name) -{ - struct list_head *tmp; - struct inter_module_entry *ime; - const void *result = NULL; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - if (try_module_get(ime->owner)) - result = ime->userdata; - break; - } - } - spin_unlock(&ime_lock); - return(result); -} - -/** - * inter_module_get_request - im get with automatic request_module. - * @im_name: an arbitrary string to identify the data, must be unique - * @modname: module that is expected to register im_name - * - * Description: If inter_module_get fails, do request_module then retry. - */ -const void *inter_module_get_request(const char *im_name, const char *modname) -{ - const void *result = inter_module_get(im_name); - if (!result) { - request_module("%s", modname); - result = inter_module_get(im_name); - } - return(result); -} - -/** - * inter_module_put - release use of data from another module. - * @im_name: an arbitrary string to identify the data, must be unique - * - * Description: If the im_name has not been registered, complain, - * otherwise decrement the use count on the owning module. 
- */ -void inter_module_put(const char *im_name) -{ - struct list_head *tmp; - struct inter_module_entry *ime; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - if (ime->owner) - module_put(ime->owner); - spin_unlock(&ime_lock); - return; - } - } - spin_unlock(&ime_lock); - printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name); - BUG(); -} - -EXPORT_SYMBOL(inter_module_register); -EXPORT_SYMBOL(inter_module_unregister); -EXPORT_SYMBOL(inter_module_get_request); -EXPORT_SYMBOL(inter_module_put); - -MODULE_LICENSE("GPL"); - -- cgit From 651d765d0b2c72d33430487c8b6ef64c60cd2134 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 7 Jun 2006 16:10:19 +1000 Subject: [PATCH] Add a prctl to change the endianness of a process. This new prctl is intended for changing the execution mode of the processor, on processors that support both a little-endian mode and a big-endian mode. It is intended for use by programs such as instruction set emulators (for example an x86 emulator on PowerPC), which may find it convenient to use the processor in an alternate endianness mode when executing translated instructions. Note that this does not imply the existence of a fully-fledged ABI for both endiannesses, or of compatibility code for converting system calls done in the non-native endianness mode. The program is expected to arrange for all of its system call arguments to be presented in the native endianness. Switching between big and little-endian mode will require some care in constructing the instruction sequence for the switch. Generally the instructions up to the instruction that invokes the prctl system call will have to be in the old endianness, and subsequent instructions will have to be in the new endianness. Signed-off-by: Anton Blanchard Signed-off-by: Paul Mackerras --- kernel/sys.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 0b6ec0e7936..12d2d753dc3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -57,6 +57,12 @@ #ifndef GET_FPEXC_CTL # define GET_FPEXC_CTL(a,b) (-EINVAL) #endif +#ifndef GET_ENDIAN +# define GET_ENDIAN(a,b) (-EINVAL) +#endif +#ifndef SET_ENDIAN +# define SET_ENDIAN(a,b) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2057,6 +2063,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, return -EFAULT; return 0; } + case PR_GET_ENDIAN: + error = GET_ENDIAN(current, arg2); + break; + case PR_SET_ENDIAN: + error = SET_ENDIAN(current, arg2); + break; + default: error = -EINVAL; break; -- cgit From b817f6feff4a565b08f0e699a5790b4008b8f494 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Fri, 9 Jun 2006 21:53:55 +0200 Subject: kbuild: check license compatibility when building modules Modules that uses GPL symbols can no longer be build with kbuild, the build will fail during the modpost step. When a GPL-incompatible module uses a EXPORT_SYMBOL_GPL_FUTURE symbol then warn during modpost so author are actually notified. The actual license compatibility check is shared with the kernel to make sure it is in sync. 
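For reference, the compatibility check now shared between modpost and the kernel is the helper being dropped from kernel/module.c in the hunk below. A minimal sketch of it, quoted from that hunk (the name of the shared header it moves to is not visible in this log, so include/linux/license.h is an assumption):

/* strcmp() comes from <linux/string.h>; returns non-zero when the module
 * license string is one of the GPL-compatible identifiers. */
static inline int license_is_gpl_compatible(const char *license)
{
	return (strcmp(license, "GPL") == 0
		|| strcmp(license, "GPL v2") == 0
		|| strcmp(license, "GPL and additional rights") == 0
		|| strcmp(license, "Dual BSD/GPL") == 0
		|| strcmp(license, "Dual MIT/GPL") == 0
		|| strcmp(license, "Dual MPL/GPL") == 0);
}

Because set_license() in kernel/module.c and the modpost step both use this one helper, the build-time check and the kernel's own license handling cannot drift apart.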
Patch originally from: Andreas Gruenbacher and Ram Pai Signed-off-by: Sam Ravnborg --- kernel/module.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index bbe04862e1b..690381508d0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -43,6 +43,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -1248,16 +1249,6 @@ static void layout_sections(struct module *mod, } } -static inline int license_is_gpl_compatible(const char *license) -{ - return (strcmp(license, "GPL") == 0 - || strcmp(license, "GPL v2") == 0 - || strcmp(license, "GPL and additional rights") == 0 - || strcmp(license, "Dual BSD/GPL") == 0 - || strcmp(license, "Dual MIT/GPL") == 0 - || strcmp(license, "Dual MPL/GPL") == 0); -} - static void set_license(struct module *mod, const char *license) { if (!license) -- cgit From 2d9048e201bfb67ba21f05e647b1286b8a4a5667 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Thu, 1 Jun 2006 13:10:59 -0700 Subject: [PATCH] inotify (1/5): split kernel API from userspace support The following series of patches introduces a kernel API for inotify, making it possible for kernel modules to benefit from inotify's mechanism for watching inodes. With these patches, inotify will maintain for each caller a list of watches (via an embedded struct inotify_watch), where each inotify_watch is associated with a corresponding struct inode. The caller registers an event handler and specifies for which filesystem events their event handler should be called per inotify_watch. Signed-off-by: Amy Griffis Acked-by: Robert Love Acked-by: John McCutchan Signed-off-by: Al Viro --- kernel/sysctl.c | 4 ++-- kernel/user.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e82726faeef..0d656e61621 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -150,7 +150,7 @@ extern ctl_table random_table[]; #ifdef CONFIG_UNIX98_PTYS extern ctl_table pty_table[]; #endif -#ifdef CONFIG_INOTIFY +#ifdef CONFIG_INOTIFY_USER extern ctl_table inotify_table[]; #endif @@ -1028,7 +1028,7 @@ static ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, -#ifdef CONFIG_INOTIFY +#ifdef CONFIG_INOTIFY_USER { .ctl_name = FS_INOTIFY, .procname = "inotify", diff --git a/kernel/user.c b/kernel/user.c index 2116642f42c..4b1eb745afa 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -140,7 +140,7 @@ struct user_struct * alloc_uid(uid_t uid) atomic_set(&new->processes, 0); atomic_set(&new->files, 0); atomic_set(&new->sigpending, 0); -#ifdef CONFIG_INOTIFY +#ifdef CONFIG_INOTIFY_USER atomic_set(&new->inotify_watches, 0); atomic_set(&new->inotify_devs, 0); #endif -- cgit From 9044e6bca5a4a575d3c068dfccb5651a2d6a13bc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 22 May 2006 01:09:24 -0400 Subject: [PATCH] fix deadlocks in AUDIT_LIST/AUDIT_LIST_RULES We should not send a pile of replies while holding audit_netlink_mutex since we hold the same mutex when we receive commands. As the result, we can get blocked while sending and sit there holding the mutex while auditctl is unable to send the next command and get around to receiving what we'd sent. Solution: create skb and put them into a queue instead of sending; once we are done, send what we've got on the list. The former can be done synchronously while we are handling AUDIT_LIST or AUDIT_LIST_RULES; we are holding audit_netlink_mutex at that point. 
The latter is done asynchronously and without messing with audit_netlink_mutex. Signed-off-by: Al Viro --- kernel/audit.c | 62 ++++++++++++++++++++++++++++++++++++++-------------- kernel/audit.h | 11 ++++++++++ kernel/auditfilter.c | 60 +++++++++++++++++++++----------------------------- 3 files changed, 81 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index df57b493e1c..bf74bf02aa4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -366,6 +366,50 @@ static int kauditd_thread(void *dummy) return 0; } +int audit_send_list(void *_dest) +{ + struct audit_netlink_list *dest = _dest; + int pid = dest->pid; + struct sk_buff *skb; + + /* wait for parent to finish and send an ACK */ + mutex_lock(&audit_netlink_mutex); + mutex_unlock(&audit_netlink_mutex); + + while ((skb = __skb_dequeue(&dest->q)) != NULL) + netlink_unicast(audit_sock, skb, pid, 0); + + kfree(dest); + + return 0; +} + +struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, + int multi, void *payload, int size) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + int len = NLMSG_SPACE(size); + void *data; + int flags = multi ? NLM_F_MULTI : 0; + int t = done ? NLMSG_DONE : type; + + skb = alloc_skb(len, GFP_KERNEL); + if (!skb) + return NULL; + + nlh = NLMSG_PUT(skb, pid, seq, t, size); + nlh->nlmsg_flags = flags; + data = NLMSG_DATA(nlh); + memcpy(data, payload, size); + return skb; + +nlmsg_failure: /* Used by NLMSG_PUT */ + if (skb) + kfree_skb(skb); + return NULL; +} + /** * audit_send_reply - send an audit reply message via netlink * @pid: process id to send reply to @@ -383,29 +427,13 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi, void *payload, int size) { struct sk_buff *skb; - struct nlmsghdr *nlh; - int len = NLMSG_SPACE(size); - void *data; - int flags = multi ? NLM_F_MULTI : 0; - int t = done ? NLMSG_DONE : type; - - skb = alloc_skb(len, GFP_KERNEL); + skb = audit_make_reply(pid, seq, type, done, multi, payload, size); if (!skb) return; - - nlh = NLMSG_PUT(skb, pid, seq, t, size); - nlh->nlmsg_flags = flags; - data = NLMSG_DATA(nlh); - memcpy(data, payload, size); - /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ netlink_unicast(audit_sock, skb, pid, 0); return; - -nlmsg_failure: /* Used by NLMSG_PUT */ - if (skb) - kfree_skb(skb); } /* diff --git a/kernel/audit.h b/kernel/audit.h index 6f733920fd3..8948fc1e9e5 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -22,6 +22,7 @@ #include #include #include +#include /* 0 = no checking 1 = put_count checking @@ -82,6 +83,9 @@ struct audit_entry { extern int audit_pid; extern int audit_comparator(const u32 left, const u32 op, const u32 right); +extern struct sk_buff * audit_make_reply(int pid, int seq, int type, + int done, int multi, + void *payload, int size); extern void audit_send_reply(int pid, int seq, int type, int done, int multi, void *payload, int size); @@ -89,4 +93,11 @@ extern void audit_log_lost(const char *message); extern void audit_panic(const char *message); extern struct mutex audit_netlink_mutex; +struct audit_netlink_list { + int pid; + struct sk_buff_head q; +}; + +int audit_send_list(void *); + extern int selinux_audit_rule_update(void); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 7c134906d68..ccfea6d82cc 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -510,19 +510,12 @@ static inline int audit_del_rule(struct audit_entry *entry, /* List rules using struct audit_rule. 
Exists for backward * compatibility with userspace. */ -static int audit_list(void *_dest) +static void audit_list(int pid, int seq, struct sk_buff_head *q) { - int pid, seq; - int *dest = _dest; + struct sk_buff *skb; struct audit_entry *entry; int i; - pid = dest[0]; - seq = dest[1]; - kfree(dest); - - mutex_lock(&audit_netlink_mutex); - /* The *_rcu iterators not needed here because we are always called with audit_netlink_mutex held. */ for (i=0; irule); if (unlikely(!rule)) break; - audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, + skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, rule, sizeof(*rule)); + if (skb) + skb_queue_tail(q, skb); kfree(rule); } } - audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); - - mutex_unlock(&audit_netlink_mutex); - return 0; + skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); + if (skb) + skb_queue_tail(q, skb); } /* List rules using struct audit_rule_data. */ -static int audit_list_rules(void *_dest) +static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) { - int pid, seq; - int *dest = _dest; + struct sk_buff *skb; struct audit_entry *e; int i; - pid = dest[0]; - seq = dest[1]; - kfree(dest); - - mutex_lock(&audit_netlink_mutex); - /* The *_rcu iterators not needed here because we are always called with audit_netlink_mutex held. */ for (i=0; irule); if (unlikely(!data)) break; - audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, + skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, data, sizeof(*data)); + if (skb) + skb_queue_tail(q, skb); kfree(data); } } - audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); - - mutex_unlock(&audit_netlink_mutex); - return 0; + skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); + if (skb) + skb_queue_tail(q, skb); } /** @@ -592,7 +580,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, size_t datasz, uid_t loginuid, u32 sid) { struct task_struct *tsk; - int *dest; + struct audit_netlink_list *dest; int err = 0; struct audit_entry *entry; @@ -605,18 +593,20 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, * happen if we're actually running in the context of auditctl * trying to _send_ the stuff */ - dest = kmalloc(2 * sizeof(int), GFP_KERNEL); + dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); if (!dest) return -ENOMEM; - dest[0] = pid; - dest[1] = seq; + dest->pid = pid; + skb_queue_head_init(&dest->q); if (type == AUDIT_LIST) - tsk = kthread_run(audit_list, dest, "audit_list"); + audit_list(pid, seq, &dest->q); else - tsk = kthread_run(audit_list_rules, dest, - "audit_list_rules"); + audit_list_rules(pid, seq, &dest->q); + + tsk = kthread_run(audit_send_list, dest, "audit_send_list"); if (IS_ERR(tsk)) { + skb_queue_purge(&dest->q); kfree(dest); err = PTR_ERR(tsk); } -- cgit From 473ae30bc7b1dda5c5791c773f95e9424ddfead9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Apr 2006 14:04:08 -0400 Subject: [PATCH] execve argument logging Signed-off-by: Al Viro --- kernel/audit.c | 8 +++++--- kernel/auditsc.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index bf74bf02aa4..d09f131b111 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1026,18 +1026,20 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, * or a space. Unescaped strings will start and end with a double quote mark. * Strings that are escaped are printed in hex (2 digits per char). 
*/ -void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) +const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { const unsigned char *p = string; + size_t len = strlen(string); while (*p) { if (*p == '"' || *p < 0x21 || *p > 0x7f) { - audit_log_hex(ab, string, strlen(string)); - return; + audit_log_hex(ab, string, len); + return string + len + 1; } p++; } audit_log_format(ab, "\"%s\"", string); + return p + 1; } /* This is a helper-function to print the escaped d_path */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1c03a4ed1b2..114f921979e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -59,6 +59,7 @@ #include #include #include +#include #include "audit.h" @@ -110,6 +111,13 @@ struct audit_aux_data_ipcctl { u32 osid; }; +struct audit_aux_data_execve { + struct audit_aux_data d; + int argc; + int envc; + char mem[0]; +}; + struct audit_aux_data_socketcall { struct audit_aux_data d; int nargs; @@ -667,6 +675,16 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts kfree(ctx); } break; } + case AUDIT_EXECVE: { + struct audit_aux_data_execve *axi = (void *)aux; + int i; + const char *p; + for (i = 0, p = axi->mem; i < axi->argc; i++) { + audit_log_format(ab, "a%d=", i); + p = audit_log_untrustedstring(ab, p); + audit_log_format(ab, "\n"); + } + break; } case AUDIT_SOCKETCALL: { int i; @@ -1231,6 +1249,39 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, return 0; } +int audit_bprm(struct linux_binprm *bprm) +{ + struct audit_aux_data_execve *ax; + struct audit_context *context = current->audit_context; + unsigned long p, next; + void *to; + + if (likely(!audit_enabled || !context)) + return 0; + + ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, + GFP_KERNEL); + if (!ax) + return -ENOMEM; + + ax->argc = bprm->argc; + ax->envc = bprm->envc; + for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) { + struct page *page = bprm->page[p / PAGE_SIZE]; + void *kaddr = kmap(page); + next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1); + memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p); + to += next - p; + kunmap(page); + } + + ax->d.type = AUDIT_EXECVE; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + + /** * audit_socketcall - record audit data for sys_socketcall * @nargs: number of args -- cgit From e1396065e0489f98b35021b97907ab4edbfb24e1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 25 May 2006 10:19:47 -0400 Subject: [PATCH] collect sid of those who send signals to auditd Signed-off-by: Al Viro --- kernel/audit.c | 31 ++++++++++++++++++++----------- kernel/audit.h | 11 +++++++++++ kernel/auditsc.c | 23 ++++++++++++----------- kernel/signal.c | 2 +- 4 files changed, 44 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d09f131b111..bb20922d08c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -89,6 +89,7 @@ static int audit_backlog_wait_overflow = 0; /* The identity of the user shutting down the audit system. 
*/ uid_t audit_sig_uid = -1; pid_t audit_sig_pid = -1; +u32 audit_sig_sid = 0; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] @@ -479,7 +480,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; uid_t loginuid; /* loginuid of sender */ - struct audit_sig_info sig_data; + struct audit_sig_info *sig_data; + char *ctx; + u32 len; err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); if (err) @@ -531,12 +534,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_PID) { int old = audit_pid; if (sid) { - char *ctx = NULL; - u32 len; - int rc; - if ((rc = selinux_ctxid_to_string( + if ((err = selinux_ctxid_to_string( sid, &ctx, &len))) - return rc; + return err; else audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, @@ -572,8 +572,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) "user pid=%d uid=%u auid=%u", pid, uid, loginuid); if (sid) { - char *ctx = NULL; - u32 len; if (selinux_ctxid_to_string( sid, &ctx, &len)) { audit_log_format(ab, @@ -612,10 +610,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) loginuid, sid); break; case AUDIT_SIGNAL_INFO: - sig_data.uid = audit_sig_uid; - sig_data.pid = audit_sig_pid; + err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len); + if (err) + return err; + sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); + if (!sig_data) { + kfree(ctx); + return -ENOMEM; + } + sig_data->uid = audit_sig_uid; + sig_data->pid = audit_sig_pid; + memcpy(sig_data->ctx, ctx, len); + kfree(ctx); audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, - 0, 0, &sig_data, sizeof(sig_data)); + 0, 0, sig_data, sizeof(*sig_data) + len); + kfree(sig_data); break; default: err = -EINVAL; diff --git a/kernel/audit.h b/kernel/audit.h index 8948fc1e9e5..52cb1e31d52 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -101,3 +101,14 @@ struct audit_netlink_list { int audit_send_list(void *); extern int selinux_audit_rule_update(void); + +#ifdef CONFIG_AUDITSYSCALL +extern void __audit_signal_info(int sig, struct task_struct *t); +static inline void audit_signal_info(int sig, struct task_struct *t) +{ + if (unlikely(audit_pid && t->tgid == audit_pid)) + __audit_signal_info(sig, t); +} +#else +#define audit_signal_info(s,t) +#endif diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 114f921979e..4ca913daa7d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1376,19 +1376,20 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. 
*/ -void audit_signal_info(int sig, struct task_struct *t) +void __audit_signal_info(int sig, struct task_struct *t) { extern pid_t audit_sig_pid; extern uid_t audit_sig_uid; - - if (unlikely(audit_pid && t->tgid == audit_pid)) { - if (sig == SIGTERM || sig == SIGHUP) { - struct audit_context *ctx = current->audit_context; - audit_sig_pid = current->pid; - if (ctx) - audit_sig_uid = ctx->loginuid; - else - audit_sig_uid = current->uid; - } + extern u32 audit_sig_sid; + + if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { + struct task_struct *tsk = current; + struct audit_context *ctx = tsk->audit_context; + audit_sig_pid = tsk->pid; + if (ctx) + audit_sig_uid = ctx->loginuid; + else + audit_sig_uid = tsk->uid; + selinux_get_task_sid(tsk, &audit_sig_sid); } } diff --git a/kernel/signal.c b/kernel/signal.c index e5f8aea78ff..1b3c921737e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -23,12 +23,12 @@ #include #include #include -#include #include #include #include #include #include +#include "audit.h" /* audit_signal_info() */ /* * SLAB caches for signal bits. -- cgit From f46038ff7d23ae092d61b366332c05aab8227b48 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 6 May 2006 08:22:52 -0400 Subject: [PATCH] log ppid Signed-off-by: Al Viro --- kernel/auditsc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4ca913daa7d..4fc3867fa25 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -60,6 +60,7 @@ #include #include #include +#include #include "audit.h" @@ -156,7 +157,7 @@ struct audit_context { struct audit_aux_data *aux; /* Save things to print about task_struct */ - pid_t pid; + pid_t pid, ppid; uid_t uid, euid, suid, fsuid; gid_t gid, egid, sgid, fsgid; unsigned long personality; @@ -379,6 +380,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, } context->pid = tsk->pid; + context->ppid = sys_getppid(); /* sic. 
tsk == current in all cases */ context->uid = tsk->uid; context->gid = tsk->gid; context->euid = tsk->euid; @@ -614,7 +616,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts tty = "(none)"; audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" - " pid=%d auid=%u uid=%u gid=%u" + " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s", context->argv[0], @@ -622,6 +624,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->argv[2], context->argv[3], context->name_count, + context->ppid, context->pid, context->loginuid, context->uid, -- cgit From 3c66251e573219a0532a5a07381b2f60a412d9eb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 6 May 2006 08:26:27 -0400 Subject: [PATCH] add filtering by ppid Signed-off-by: Al Viro --- kernel/auditsc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4fc3867fa25..e4551659ad7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -188,6 +188,10 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_PID: result = audit_comparator(tsk->pid, f->op, f->val); break; + case AUDIT_PPID: + if (ctx) + result = audit_comparator(ctx->ppid, f->op, f->val); + break; case AUDIT_UID: result = audit_comparator(tsk->uid, f->op, f->val); break; -- cgit From 0a3b483e83edb6aa6d3c49db70eeb6f1cd9f6c6b Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Tue, 2 May 2006 15:06:01 -0400 Subject: [PATCH] fix audit_krule_to_{rule,data} return values Don't return -ENOMEM when callers of these functions are checking for a NULL return. Bug noticed by Serge Hallyn. Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- kernel/auditfilter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index ccfea6d82cc..b3fccd6808f 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -291,7 +291,7 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) rule = kmalloc(sizeof(*rule), GFP_KERNEL); if (unlikely(!rule)) - return ERR_PTR(-ENOMEM); + return NULL; memset(rule, 0, sizeof(*rule)); rule->flags = krule->flags | krule->listnr; @@ -322,7 +322,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); if (unlikely(!data)) - return ERR_PTR(-ENOMEM); + return NULL; memset(data, 0, sizeof(*data)); data->flags = krule->flags | krule->listnr; -- cgit From 5d136a010de3bc16fe595987feb9ef8868f064c2 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Thu, 27 Apr 2006 16:45:14 -0500 Subject: [PATCH] minor audit updates Just a few minor proposed updates. Only the last one will actually affect behavior. The rest are just misleading code. Several AUDIT_SET functions return 'old' value, but only return value <0 is checked for. So just return 0. propagate audit_set_rate_limit and audit_set_backlog_limit error values In audit_buffer_free, the audit_freelist_count was being incremented even when we discard the return buffer, so audit_freelist_count can end up wrong. This could cause the actual freelist to shrink over time, eventually threatening to degrate audit performance. Signed-off-by: Serge E. 
Hallyn Signed-off-by: Al Viro --- kernel/audit.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index bb20922d08c..0738a4b290e 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -251,7 +251,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) "audit_rate_limit=%d old=%d by auid=%u", limit, old, loginuid); audit_rate_limit = limit; - return old; + return 0; } static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) @@ -274,7 +274,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) "audit_backlog_limit=%d old=%d by auid=%u", limit, old, loginuid); audit_backlog_limit = limit; - return old; + return 0; } static int audit_set_enabled(int state, uid_t loginuid, u32 sid) @@ -300,7 +300,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid) "audit_enabled=%d old=%d by auid=%u", state, old, loginuid); audit_enabled = state; - return old; + return 0; } static int audit_set_failure(int state, uid_t loginuid, u32 sid) @@ -328,7 +328,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid) "audit_failure=%d old=%d by auid=%u", state, old, loginuid); audit_failure = state; - return old; + return 0; } static int kauditd_thread(void *dummy) @@ -364,7 +364,6 @@ static int kauditd_thread(void *dummy) remove_wait_queue(&kauditd_wait, &wait); } } - return 0; } int audit_send_list(void *_dest) @@ -551,10 +550,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_pid = status_get->pid; } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) - audit_set_rate_limit(status_get->rate_limit, + err = audit_set_rate_limit(status_get->rate_limit, loginuid, sid); if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) - audit_set_backlog_limit(status_get->backlog_limit, + err = audit_set_backlog_limit(status_get->backlog_limit, loginuid, sid); break; case AUDIT_USER: @@ -727,10 +726,12 @@ static void audit_buffer_free(struct audit_buffer *ab) kfree_skb(ab->skb); spin_lock_irqsave(&audit_freelist_lock, flags); - if (++audit_freelist_count > AUDIT_MAXFREE) + if (audit_freelist_count > AUDIT_MAXFREE) kfree(ab); - else + else { + audit_freelist_count++; list_add(&ab->list, &audit_freelist); + } spin_unlock_irqrestore(&audit_freelist_lock, flags); } -- cgit From ac03221a4fdda9bfdabf99bcd129847f20fc1d80 Mon Sep 17 00:00:00 2001 From: Linda Knippers Date: Tue, 16 May 2006 22:03:48 -0400 Subject: [PATCH] update of IPC audit record cleanup The following patch addresses most of the issues with the IPC_SET_PERM records as described in: https://www.redhat.com/archives/linux-audit/2006-May/msg00010.html and addresses the comments I received on the record field names. To summarize, I made the following changes: 1. Changed sys_msgctl() and semctl_down() so that an IPC_SET_PERM record is emitted in the failure case as well as the success case. This matches the behavior in sys_shmctl(). I could simplify the code in sys_msgctl() and semctl_down() slightly but it would mean that in some error cases we could get an IPC_SET_PERM record without an IPC record and that seemed odd. 2. No change to the IPC record type, given no feedback on the backward compatibility question. 3. Removed the qbytes field from the IPC record. It wasn't being set and when audit_ipc_obj() is called from ipcperms(), the information isn't available. If we want the information in the IPC record, more extensive changes will be necessary. 
Since it only applies to message queues and it isn't really permission related, it doesn't seem worth it. 4. Removed the obj field from the IPC_SET_PERM record. This means that the kern_ipc_perm argument is no longer needed. 5. Removed the spaces and renamed the IPC_SET_PERM field names. Replaced iuid and igid fields with ouid and ogid in the IPC record. I tested this with the lspp.22 kernel on an x86_64 box. I believe it applies cleanly on the latest kernel. -- ljk Signed-off-by: Linda Knippers Signed-off-by: Al Viro --- kernel/auditsc.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e4551659ad7..fa4bf962545 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -648,8 +648,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case AUDIT_IPC: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - " qbytes=%lx iuid=%u igid=%u mode=%x", - axi->qbytes, axi->uid, axi->gid, axi->mode); + "ouid=%u ogid=%u mode=%x", + axi->uid, axi->gid, axi->mode); if (axi->osid != 0) { char *ctx = NULL; u32 len; @@ -667,21 +667,10 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case AUDIT_IPC_SET_PERM: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - " new qbytes=%lx new iuid=%u new igid=%u new mode=%x", + "qbytes=%lx ouid=%u ogid=%u mode=%x", axi->qbytes, axi->uid, axi->gid, axi->mode); - if (axi->osid != 0) { - char *ctx = NULL; - u32 len; - if (selinux_ctxid_to_string( - axi->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", - axi->osid); - call_panic = 1; - } else - audit_log_format(ab, " obj=%s", ctx); - kfree(ctx); - } break; } + case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; int i; @@ -1232,7 +1221,7 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp) * * Returns 0 for success or NULL context or < 0 on error. */ -int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) +int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) { struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; @@ -1248,7 +1237,6 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, ax->uid = uid; ax->gid = gid; ax->mode = mode; - selinux_get_ipc_sid(ipcp, &ax->osid); ax->d.type = AUDIT_IPC_SET_PERM; ax->d.next = context->aux; -- cgit From d8945bb51a2bb6623cfa36b9ff63594f46d513aa Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 May 2006 16:01:30 -0400 Subject: [PATCH] inline more audit helpers pull checks for ->audit_context into inlined wrappers Signed-off-by: Al Viro --- kernel/auditsc.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fa4bf962545..05d31ee4f3d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -922,11 +922,11 @@ void audit_syscall_exit(int valid, long return_code) * Add a name to the list of audit names for this context. * Called from fs/namei.c:getname(). */ -void audit_getname(const char *name) +void __audit_getname(const char *name) { struct audit_context *context = current->audit_context; - if (!context || IS_ERR(name) || !name) + if (IS_ERR(name) || !name) return; if (!context->in_syscall) { @@ -1189,14 +1189,11 @@ uid_t audit_get_loginuid(struct audit_context *ctx) * * Returns 0 for success or NULL context or < 0 on error. 
*/ -int audit_ipc_obj(struct kern_ipc_perm *ipcp) +int __audit_ipc_obj(struct kern_ipc_perm *ipcp) { struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; - if (likely(!context)) - return 0; - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); if (!ax) return -ENOMEM; @@ -1221,14 +1218,11 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp) * * Returns 0 for success or NULL context or < 0 on error. */ -int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) +int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) { struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; - if (likely(!context)) - return 0; - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); if (!ax) return -ENOMEM; -- cgit From 014149cce19c5acb19014e57a5b739b7f64e6fbf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 23 May 2006 01:36:13 -0400 Subject: [PATCH] deprecate AUDIT_POSSBILE Signed-off-by: Al Viro --- kernel/auditfilter.c | 8 +++++--- kernel/auditsc.c | 1 - 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b3fccd6808f..df9503da40f 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -128,8 +128,11 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) #endif ; } - if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE && - rule->action != AUDIT_ALWAYS) + if (unlikely(rule->action == AUDIT_POSSIBLE)) { + printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n"); + goto exit_err; + } + if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS) goto exit_err; if (rule->field_count > AUDIT_MAX_FIELDS) goto exit_err; @@ -734,7 +737,6 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, } switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; } return 1; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 05d31ee4f3d..4503c4663cf 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -307,7 +307,6 @@ static int audit_filter_rules(struct task_struct *tsk, } switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; } return 1; -- cgit From 20ca73bc792be9625af184cbec36e1372611d1c3 Mon Sep 17 00:00:00 2001 From: "George C. Wilson" Date: Wed, 24 May 2006 16:09:55 -0500 Subject: [PATCH] Audit of POSIX Message Queue Syscalls v.2 This patch adds audit support to POSIX message queues. It applies cleanly to the lspp.b15 branch of Al Viro's git tree. There are new auxiliary data structures, and collection and emission routines in kernel/auditsc.c. New hooks in ipc/mqueue.c collect arguments from the syscalls. I tested the patch by building the examples from the POSIX MQ library tarball. Build them -lrt, not against the old MQ library in the tarball. Here's the URL: http://www.geocities.com/wronski12/posix_ipc/libmqueue-4.41.tar.gz Do auditctl -a exit,always -S for mq_open, mq_timedsend, mq_timedreceive, mq_notify, mq_getsetattr. mq_unlink has no new hooks. Please see the corresponding userspace patch to get correct output from auditd for the new record types. 
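The ipc/mqueue.c half of this patch falls outside this kernel/-only log, so the new hooks themselves are not shown here. As a minimal sketch of how a syscall would reach the collection routines added to kernel/auditsc.c below, a header-side wrapper along these lines is assumed (the name audit_mq_open() and the exact guard are assumptions, mirroring the audit_signal_info()/__audit_signal_info() split used earlier in this series):

/* Sketch: take the slow path only when the task has an audit context;
 * __audit_mq_open() additionally checks audit_enabled and returns 0 on
 * success or a negative errno (-ENOMEM, -EFAULT). */
static inline int audit_mq_open(int oflag, mode_t mode,
				struct mq_attr __user *u_attr)
{
	if (unlikely(current->audit_context))
		return __audit_mq_open(oflag, mode, u_attr);
	return 0;
}

A caller in sys_mq_open() would invoke this before acting on the request and abort the syscall on a non-zero return, so the arguments are recorded before the operation proceeds.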
[fixes folded] Signed-off-by: George Wilson Signed-off-by: Al Viro --- kernel/auditsc.c | 274 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 273 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4503c4663cf..14e295a4121 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -3,7 +3,7 @@ * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * Copyright 2005 Hewlett-Packard Development Company, L.P. - * Copyright (C) 2005 IBM Corporation + * Copyright (C) 2005, 2006 IBM Corporation * All Rights Reserved. * * This program is free software; you can redistribute it and/or modify @@ -29,6 +29,9 @@ * this file -- see entry.S) is based on a GPL'd patch written by * okir@suse.de and Copyright 2003 SuSE Linux AG. * + * POSIX message queue support added by George Wilson , + * 2006. + * * The support of additional filter rules compares (>, <, >=, <=) was * added by Dustin Kirkland , 2005. * @@ -49,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -102,6 +106,33 @@ struct audit_aux_data { #define AUDIT_AUX_IPCPERM 0 +struct audit_aux_data_mq_open { + struct audit_aux_data d; + int oflag; + mode_t mode; + struct mq_attr attr; +}; + +struct audit_aux_data_mq_sendrecv { + struct audit_aux_data d; + mqd_t mqdes; + size_t msg_len; + unsigned int msg_prio; + struct timespec abs_timeout; +}; + +struct audit_aux_data_mq_notify { + struct audit_aux_data d; + mqd_t mqdes; + struct sigevent notification; +}; + +struct audit_aux_data_mq_getsetattr { + struct audit_aux_data d; + mqd_t mqdes; + struct mq_attr mqstat; +}; + struct audit_aux_data_ipcctl { struct audit_aux_data d; struct ipc_perm p; @@ -644,6 +675,43 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts continue; /* audit_panic has been called */ switch (aux->type) { + case AUDIT_MQ_OPEN: { + struct audit_aux_data_mq_open *axi = (void *)aux; + audit_log_format(ab, + "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " + "mq_msgsize=%ld mq_curmsgs=%ld", + axi->oflag, axi->mode, axi->attr.mq_flags, + axi->attr.mq_maxmsg, axi->attr.mq_msgsize, + axi->attr.mq_curmsgs); + break; } + + case AUDIT_MQ_SENDRECV: { + struct audit_aux_data_mq_sendrecv *axi = (void *)aux; + audit_log_format(ab, + "mqdes=%d msg_len=%zd msg_prio=%u " + "abs_timeout_sec=%ld abs_timeout_nsec=%ld", + axi->mqdes, axi->msg_len, axi->msg_prio, + axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec); + break; } + + case AUDIT_MQ_NOTIFY: { + struct audit_aux_data_mq_notify *axi = (void *)aux; + audit_log_format(ab, + "mqdes=%d sigev_signo=%d", + axi->mqdes, + axi->notification.sigev_signo); + break; } + + case AUDIT_MQ_GETSETATTR: { + struct audit_aux_data_mq_getsetattr *axi = (void *)aux; + audit_log_format(ab, + "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " + "mq_curmsgs=%ld ", + axi->mqdes, + axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg, + axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs); + break; } + case AUDIT_IPC: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, @@ -1182,6 +1250,210 @@ uid_t audit_get_loginuid(struct audit_context *ctx) return ctx ? ctx->loginuid : -1; } +/** + * __audit_mq_open - record audit data for a POSIX MQ open + * @oflag: open flag + * @mode: mode bits + * @u_attr: queue attributes + * + * Returns 0 for success or NULL context or < 0 on error. 
+ */ +int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) +{ + struct audit_aux_data_mq_open *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + if (u_attr != NULL) { + if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) { + kfree(ax); + return -EFAULT; + } + } else + memset(&ax->attr, 0, sizeof(ax->attr)); + + ax->oflag = oflag; + ax->mode = mode; + + ax->d.type = AUDIT_MQ_OPEN; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * __audit_mq_timedsend - record audit data for a POSIX MQ timed send + * @mqdes: MQ descriptor + * @msg_len: Message length + * @msg_prio: Message priority + * @abs_timeout: Message timeout in absolute time + * + * Returns 0 for success or NULL context or < 0 on error. + */ +int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, + const struct timespec __user *u_abs_timeout) +{ + struct audit_aux_data_mq_sendrecv *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + if (u_abs_timeout != NULL) { + if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { + kfree(ax); + return -EFAULT; + } + } else + memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); + + ax->mqdes = mqdes; + ax->msg_len = msg_len; + ax->msg_prio = msg_prio; + + ax->d.type = AUDIT_MQ_SENDRECV; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive + * @mqdes: MQ descriptor + * @msg_len: Message length + * @msg_prio: Message priority + * @abs_timeout: Message timeout in absolute time + * + * Returns 0 for success or NULL context or < 0 on error. + */ +int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, + unsigned int __user *u_msg_prio, + const struct timespec __user *u_abs_timeout) +{ + struct audit_aux_data_mq_sendrecv *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + if (u_msg_prio != NULL) { + if (get_user(ax->msg_prio, u_msg_prio)) { + kfree(ax); + return -EFAULT; + } + } else + ax->msg_prio = 0; + + if (u_abs_timeout != NULL) { + if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { + kfree(ax); + return -EFAULT; + } + } else + memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); + + ax->mqdes = mqdes; + ax->msg_len = msg_len; + + ax->d.type = AUDIT_MQ_SENDRECV; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * __audit_mq_notify - record audit data for a POSIX MQ notify + * @mqdes: MQ descriptor + * @u_notification: Notification event + * + * Returns 0 for success or NULL context or < 0 on error. 
+ */ + +int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) +{ + struct audit_aux_data_mq_notify *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + if (u_notification != NULL) { + if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) { + kfree(ax); + return -EFAULT; + } + } else + memset(&ax->notification, 0, sizeof(ax->notification)); + + ax->mqdes = mqdes; + + ax->d.type = AUDIT_MQ_NOTIFY; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute + * @mqdes: MQ descriptor + * @mqstat: MQ flags + * + * Returns 0 for success or NULL context or < 0 on error. + */ +int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) +{ + struct audit_aux_data_mq_getsetattr *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + ax->mqdes = mqdes; + ax->mqstat = *mqstat; + + ax->d.type = AUDIT_MQ_GETSETATTR; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + /** * audit_ipc_obj - record audit data for ipc object * @ipcp: ipc permissions -- cgit From f368c07d7214a7c41dfceb76c8db473b850f0229 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Fri, 7 Apr 2006 16:55:56 -0400 Subject: [PATCH] audit: path-based rules In this implementation, audit registers inotify watches on the parent directories of paths specified in audit rules. When audit's inotify event handler is called, it updates any affected rules based on the filesystem event. If the parent directory is renamed, removed, or its filesystem is unmounted, audit removes all rules referencing that inotify watch. To keep things simple, this implementation limits location-based auditing to the directory entries in an existing directory. Given a path-based rule for /foo/bar/passwd, the following table applies: passwd modified -- audit event logged passwd replaced -- audit event logged, rules list updated bar renamed -- rule removed foo renamed -- untracked, meaning that the rule now applies to the new location Audit users typically want to have many rules referencing filesystem objects, which can significantly impact filtering performance. This patch also adds an inode-number-based rule hash to mitigate this situation. The patch is relative to the audit git tree: http://kernel.org/git/?p=linux/kernel/git/viro/audit-current.git;a=summary and uses the inotify kernel API: http://lkml.org/lkml/2006/6/1/145 Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- kernel/audit.c | 41 ++- kernel/audit.h | 38 ++- kernel/auditfilter.c | 785 ++++++++++++++++++++++++++++++++++++++++++++++++--- kernel/auditsc.c | 124 +++++--- 4 files changed, 900 insertions(+), 88 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 0738a4b290e..0fbf1c11636 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "audit.h" @@ -103,6 +104,12 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. */ static struct sock *audit_sock; +/* Inotify handle. 
*/ +struct inotify_handle *audit_ih; + +/* Hash for inode-based rules */ +struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; + /* The audit_freelist is a list of pre-allocated audit buffers (if more * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of * being placed on the freelist). */ @@ -115,10 +122,8 @@ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); -/* The netlink socket is only to be read by 1 CPU, which lets us assume - * that list additions and deletions never happen simultaneously in - * auditsc.c */ -DEFINE_MUTEX(audit_netlink_mutex); +/* Serialize requests from userspace. */ +static DEFINE_MUTEX(audit_cmd_mutex); /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. Since printk uses a 1024 byte buffer, this buffer @@ -373,8 +378,8 @@ int audit_send_list(void *_dest) struct sk_buff *skb; /* wait for parent to finish and send an ACK */ - mutex_lock(&audit_netlink_mutex); - mutex_unlock(&audit_netlink_mutex); + mutex_lock(&audit_cmd_mutex); + mutex_unlock(&audit_cmd_mutex); while ((skb = __skb_dequeue(&dest->q)) != NULL) netlink_unicast(audit_sock, skb, pid, 0); @@ -665,20 +670,30 @@ static void audit_receive(struct sock *sk, int length) struct sk_buff *skb; unsigned int qlen; - mutex_lock(&audit_netlink_mutex); + mutex_lock(&audit_cmd_mutex); for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { skb = skb_dequeue(&sk->sk_receive_queue); audit_receive_skb(skb); kfree_skb(skb); } - mutex_unlock(&audit_netlink_mutex); + mutex_unlock(&audit_cmd_mutex); } +#ifdef CONFIG_AUDITSYSCALL +static const struct inotify_operations audit_inotify_ops = { + .handle_event = audit_handle_ievent, + .destroy_watch = audit_free_parent, +}; +#endif /* Initialize audit support at boot time. */ static int __init audit_init(void) { +#ifdef CONFIG_AUDITSYSCALL + int i; +#endif + printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? 
"enabled" : "disabled"); audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, @@ -697,6 +712,16 @@ static int __init audit_init(void) selinux_audit_set_callback(&selinux_audit_rule_update); audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); + +#ifdef CONFIG_AUDITSYSCALL + audit_ih = inotify_init(&audit_inotify_ops); + if (IS_ERR(audit_ih)) + audit_panic("cannot initialize inotify handle"); + + for (i = 0; i < AUDIT_INODE_BUCKETS; i++) + INIT_LIST_HEAD(&audit_inode_hash[i]); +#endif + return 0; } __initcall(audit_init); diff --git a/kernel/audit.h b/kernel/audit.h index 52cb1e31d52..58fa44cb8d0 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -19,7 +19,6 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include #include #include #include @@ -54,6 +53,18 @@ enum audit_state { }; /* Rule lists */ +struct audit_parent; + +struct audit_watch { + atomic_t count; /* reference count */ + char *path; /* insertion path */ + dev_t dev; /* associated superblock device */ + unsigned long ino; /* associated inode number */ + struct audit_parent *parent; /* associated parent */ + struct list_head wlist; /* entry in parent->watches list */ + struct list_head rules; /* associated rules */ +}; + struct audit_field { u32 type; u32 val; @@ -71,6 +82,9 @@ struct audit_krule { u32 buflen; /* for data alloc on list rules */ u32 field_count; struct audit_field *fields; + struct audit_field *inode_f; /* quick access to an inode field */ + struct audit_watch *watch; /* associated watch */ + struct list_head rlist; /* entry in audit_watch.rules list */ }; struct audit_entry { @@ -79,10 +93,18 @@ struct audit_entry { struct audit_krule rule; }; - extern int audit_pid; -extern int audit_comparator(const u32 left, const u32 op, const u32 right); +#define AUDIT_INODE_BUCKETS 32 +extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; + +static inline int audit_hash_ino(u32 ino) +{ + return (ino & (AUDIT_INODE_BUCKETS-1)); +} + +extern int audit_comparator(const u32 left, const u32 op, const u32 right); +extern int audit_compare_dname_path(const char *dname, const char *path); extern struct sk_buff * audit_make_reply(int pid, int seq, int type, int done, int multi, void *payload, int size); @@ -91,7 +113,6 @@ extern void audit_send_reply(int pid, int seq, int type, void *payload, int size); extern void audit_log_lost(const char *message); extern void audit_panic(const char *message); -extern struct mutex audit_netlink_mutex; struct audit_netlink_list { int pid; @@ -100,6 +121,10 @@ struct audit_netlink_list { int audit_send_list(void *); +struct inotify_watch; +extern void audit_free_parent(struct inotify_watch *); +extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32, + const char *, struct inode *); extern int selinux_audit_rule_update(void); #ifdef CONFIG_AUDITSYSCALL @@ -109,6 +134,11 @@ static inline void audit_signal_info(int sig, struct task_struct *t) if (unlikely(audit_pid && t->tgid == audit_pid)) __audit_signal_info(sig, t); } +extern enum audit_state audit_filter_inodes(struct task_struct *, + struct audit_context *); +extern void audit_set_auditable(struct audit_context *); #else #define audit_signal_info(s,t) +#define audit_filter_inodes(t,c) AUDIT_DISABLED +#define audit_set_auditable(c) #endif diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index df9503da40f..03a6919103d 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -22,13 +22,59 @@ #include #include #include +#include +#include +#include #include 
+#include +#include #include #include "audit.h" -/* There are three lists of rules -- one to search at task creation - * time, one to search at syscall entry time, and another to search at - * syscall exit time. */ +/* + * Locking model: + * + * audit_filter_mutex: + * Synchronizes writes and blocking reads of audit's filterlist + * data. Rcu is used to traverse the filterlist and access + * contents of structs audit_entry, audit_watch and opaque + * selinux rules during filtering. If modified, these structures + * must be copied and replace their counterparts in the filterlist. + * An audit_parent struct is not accessed during filtering, so may + * be written directly provided audit_filter_mutex is held. + */ + +/* + * Reference counting: + * + * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED + * event. Each audit_watch holds a reference to its associated parent. + * + * audit_watch: if added to lists, lifetime is from audit_init_watch() to + * audit_remove_watch(). Additionally, an audit_watch may exist + * temporarily to assist in searching existing filter data. Each + * audit_krule holds a reference to its associated watch. + */ + +struct audit_parent { + struct list_head ilist; /* entry in inotify registration list */ + struct list_head watches; /* associated watches */ + struct inotify_watch wdata; /* inotify watch data */ + unsigned flags; /* status flags */ +}; + +/* + * audit_parent status flags: + * + * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to + * a filesystem event to ensure we're adding audit watches to a valid parent. + * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot + * receive them while we have nameidata, but must be used for IN_MOVE_SELF which + * we can receive while holding nameidata. + */ +#define AUDIT_PARENT_INVALID 0x001 + +/* Audit filter lists, defined in */ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_filter_list[0]), LIST_HEAD_INIT(audit_filter_list[1]), @@ -41,9 +87,53 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { #endif }; +static DEFINE_MUTEX(audit_filter_mutex); + +/* Inotify handle */ +extern struct inotify_handle *audit_ih; + +/* Inotify events we care about. */ +#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF + +void audit_free_parent(struct inotify_watch *i_watch) +{ + struct audit_parent *parent; + + parent = container_of(i_watch, struct audit_parent, wdata); + WARN_ON(!list_empty(&parent->watches)); + kfree(parent); +} + +static inline void audit_get_watch(struct audit_watch *watch) +{ + atomic_inc(&watch->count); +} + +static void audit_put_watch(struct audit_watch *watch) +{ + if (atomic_dec_and_test(&watch->count)) { + WARN_ON(watch->parent); + WARN_ON(!list_empty(&watch->rules)); + kfree(watch->path); + kfree(watch); + } +} + +static void audit_remove_watch(struct audit_watch *watch) +{ + list_del(&watch->wlist); + put_inotify_watch(&watch->parent->wdata); + watch->parent = NULL; + audit_put_watch(watch); /* match initial get */ +} + static inline void audit_free_rule(struct audit_entry *e) { int i; + + /* some rules don't have associated watches */ + if (e->rule.watch) + audit_put_watch(e->rule.watch); if (e->rule.fields) for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; @@ -60,6 +150,50 @@ static inline void audit_free_rule_rcu(struct rcu_head *head) audit_free_rule(e); } +/* Initialize a parent watch entry. 
*/ +static struct audit_parent *audit_init_parent(struct nameidata *ndp) +{ + struct audit_parent *parent; + s32 wd; + + parent = kzalloc(sizeof(*parent), GFP_KERNEL); + if (unlikely(!parent)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&parent->watches); + parent->flags = 0; + + inotify_init_watch(&parent->wdata); + /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ + get_inotify_watch(&parent->wdata); + wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode, + AUDIT_IN_WATCH); + if (wd < 0) { + audit_free_parent(&parent->wdata); + return ERR_PTR(wd); + } + + return parent; +} + +/* Initialize a watch entry. */ +static struct audit_watch *audit_init_watch(char *path) +{ + struct audit_watch *watch; + + watch = kzalloc(sizeof(*watch), GFP_KERNEL); + if (unlikely(!watch)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&watch->rules); + atomic_set(&watch->count, 1); + watch->path = path; + watch->dev = (dev_t)-1; + watch->ino = (unsigned long)-1; + + return watch; +} + /* Initialize an audit filterlist entry. */ static inline struct audit_entry *audit_init_entry(u32 field_count) { @@ -107,6 +241,43 @@ static char *audit_unpack_string(void **bufp, size_t *remain, size_t len) return str; } +/* Translate an inode field to kernel respresentation. */ +static inline int audit_to_inode(struct audit_krule *krule, + struct audit_field *f) +{ + if (krule->listnr != AUDIT_FILTER_EXIT || + krule->watch || krule->inode_f) + return -EINVAL; + + krule->inode_f = f; + return 0; +} + +/* Translate a watch string to kernel respresentation. */ +static int audit_to_watch(struct audit_krule *krule, char *path, int len, + u32 op) +{ + struct audit_watch *watch; + + if (!audit_ih) + return -EOPNOTSUPP; + + if (path[0] != '/' || path[len-1] == '/' || + krule->listnr != AUDIT_FILTER_EXIT || + op & ~AUDIT_EQUAL || + krule->inode_f || krule->watch) /* 1 inode # per rule, for hash */ + return -EINVAL; + + watch = audit_init_watch(path); + if (unlikely(IS_ERR(watch))) + return PTR_ERR(watch); + + audit_get_watch(watch); + krule->watch = watch; + + return 0; +} + /* Common user-space to kernel rule translation. */ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) { @@ -161,6 +332,7 @@ exit_err: static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) { struct audit_entry *entry; + struct audit_field *f; int err = 0; int i; @@ -175,14 +347,23 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); f->val = rule->values[i]; - if (f->type & AUDIT_UNUSED_BITS || - f->type == AUDIT_SE_USER || - f->type == AUDIT_SE_ROLE || - f->type == AUDIT_SE_TYPE || - f->type == AUDIT_SE_SEN || - f->type == AUDIT_SE_CLR) { - err = -EINVAL; + err = -EINVAL; + if (f->type & AUDIT_UNUSED_BITS) + goto exit_free; + + switch(f->type) { + case AUDIT_SE_USER: + case AUDIT_SE_ROLE: + case AUDIT_SE_TYPE: + case AUDIT_SE_SEN: + case AUDIT_SE_CLR: + case AUDIT_WATCH: goto exit_free; + case AUDIT_INODE: + err = audit_to_inode(&entry->rule, f); + if (err) + goto exit_free; + break; } entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 
2 : 1; @@ -199,6 +380,18 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) } } + f = entry->rule.inode_f; + if (f) { + switch(f->op) { + case AUDIT_NOT_EQUAL: + entry->rule.inode_f = NULL; + case AUDIT_EQUAL: + break; + default: + goto exit_free; + } + } + exit_nofree: return entry; @@ -213,6 +406,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, { int err = 0; struct audit_entry *entry; + struct audit_field *f; void *bufp; size_t remain = datasz - sizeof(struct audit_rule_data); int i; @@ -263,6 +457,35 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, } else f->se_str = str; break; + case AUDIT_WATCH: + str = audit_unpack_string(&bufp, &remain, f->val); + if (IS_ERR(str)) + goto exit_free; + entry->rule.buflen += f->val; + + err = audit_to_watch(&entry->rule, str, f->val, f->op); + if (err) { + kfree(str); + goto exit_free; + } + break; + case AUDIT_INODE: + err = audit_to_inode(&entry->rule, f); + if (err) + goto exit_free; + break; + } + } + + f = entry->rule.inode_f; + if (f) { + switch(f->op) { + case AUDIT_NOT_EQUAL: + entry->rule.inode_f = NULL; + case AUDIT_EQUAL: + break; + default: + goto exit_free; } } @@ -346,6 +569,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->buflen += data->values[i] = audit_pack_string(&bufp, f->se_str); break; + case AUDIT_WATCH: + data->buflen += data->values[i] = + audit_pack_string(&bufp, krule->watch->path); + break; default: data->values[i] = f->val; } @@ -381,6 +608,10 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) return 1; break; + case AUDIT_WATCH: + if (strcmp(a->watch->path, b->watch->path)) + return 1; + break; default: if (a->fields[i].val != b->fields[i].val) return 1; @@ -394,6 +625,32 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) return 0; } +/* Duplicate the given audit watch. The new watch's rules list is initialized + * to an empty list and wlist is undefined. */ +static struct audit_watch *audit_dupe_watch(struct audit_watch *old) +{ + char *path; + struct audit_watch *new; + + path = kstrdup(old->path, GFP_KERNEL); + if (unlikely(!path)) + return ERR_PTR(-ENOMEM); + + new = audit_init_watch(path); + if (unlikely(IS_ERR(new))) { + kfree(path); + goto out; + } + + new->dev = old->dev; + new->ino = old->ino; + get_inotify_watch(&old->parent->wdata); + new->parent = old->parent; + +out: + return new; +} + /* Duplicate selinux field information. The se_rule is opaque, so must be * re-initialized. */ static inline int audit_dupe_selinux_field(struct audit_field *df, @@ -425,8 +682,11 @@ static inline int audit_dupe_selinux_field(struct audit_field *df, /* Duplicate an audit rule. This will be a deep copy with the exception * of the watch - that pointer is carried over. The selinux specific fields * will be updated in the copy. The point is to be able to replace the old - * rule with the new rule in the filterlist, then free the old rule. */ -static struct audit_entry *audit_dupe_rule(struct audit_krule *old) + * rule with the new rule in the filterlist, then free the old rule. + * The rlist element is undefined; list manipulations are handled apart from + * the initial copy. 
*/ +static struct audit_entry *audit_dupe_rule(struct audit_krule *old, + struct audit_watch *watch) { u32 fcount = old->field_count; struct audit_entry *entry; @@ -445,6 +705,8 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old) for (i = 0; i < AUDIT_BITMASK_SIZE; i++) new->mask[i] = old->mask[i]; new->buflen = old->buflen; + new->inode_f = old->inode_f; + new->watch = NULL; new->field_count = old->field_count; memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); @@ -466,21 +728,318 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old) } } + if (watch) { + audit_get_watch(watch); + new->watch = watch; + } + return entry; } -/* Add rule to given filterlist if not a duplicate. Protected by - * audit_netlink_mutex. */ +/* Update inode info in audit rules based on filesystem event. */ +static void audit_update_watch(struct audit_parent *parent, + const char *dname, dev_t dev, + unsigned long ino, unsigned invalidating) +{ + struct audit_watch *owatch, *nwatch, *nextw; + struct audit_krule *r, *nextr; + struct audit_entry *oentry, *nentry; + struct audit_buffer *ab; + + mutex_lock(&audit_filter_mutex); + list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { + if (audit_compare_dname_path(dname, owatch->path)) + continue; + + /* If the update involves invalidating rules, do the inode-based + * filtering now, so we don't omit records. */ + if (invalidating && + audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT) + audit_set_auditable(current->audit_context); + + nwatch = audit_dupe_watch(owatch); + if (unlikely(IS_ERR(nwatch))) { + mutex_unlock(&audit_filter_mutex); + audit_panic("error updating watch, skipping"); + return; + } + nwatch->dev = dev; + nwatch->ino = ino; + + list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) { + + oentry = container_of(r, struct audit_entry, rule); + list_del(&oentry->rule.rlist); + list_del_rcu(&oentry->list); + + nentry = audit_dupe_rule(&oentry->rule, nwatch); + if (unlikely(IS_ERR(nentry))) + audit_panic("error updating watch, removing"); + else { + int h = audit_hash_ino((u32)ino); + list_add(&nentry->rule.rlist, &nwatch->rules); + list_add_rcu(&nentry->list, &audit_inode_hash[h]); + } + + call_rcu(&oentry->rcu, audit_free_rule_rcu); + } + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, "audit updated rules specifying watch="); + audit_log_untrustedstring(ab, owatch->path); + audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); + audit_log_end(ab); + + audit_remove_watch(owatch); + goto add_watch_to_parent; /* event applies to a single watch */ + } + mutex_unlock(&audit_filter_mutex); + return; + +add_watch_to_parent: + list_add(&nwatch->wlist, &parent->watches); + mutex_unlock(&audit_filter_mutex); + return; +} + +/* Remove all watches & rules associated with a parent that is going away. 
*/ +static void audit_remove_parent_watches(struct audit_parent *parent) +{ + struct audit_watch *w, *nextw; + struct audit_krule *r, *nextr; + struct audit_entry *e; + + mutex_lock(&audit_filter_mutex); + parent->flags |= AUDIT_PARENT_INVALID; + list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { + list_for_each_entry_safe(r, nextr, &w->rules, rlist) { + e = container_of(r, struct audit_entry, rule); + list_del(&r->rlist); + list_del_rcu(&e->list); + call_rcu(&e->rcu, audit_free_rule_rcu); + + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit implicitly removed rule from list=%d\n", + AUDIT_FILTER_EXIT); + } + audit_remove_watch(w); + } + mutex_unlock(&audit_filter_mutex); +} + +/* Unregister inotify watches for parents on in_list. + * Generates an IN_IGNORED event. */ +static void audit_inotify_unregister(struct list_head *in_list) +{ + struct audit_parent *p, *n; + + list_for_each_entry_safe(p, n, in_list, ilist) { + list_del(&p->ilist); + inotify_rm_watch(audit_ih, &p->wdata); + /* the put matching the get in audit_do_del_rule() */ + put_inotify_watch(&p->wdata); + } +} + +/* Find an existing audit rule. + * Caller must hold audit_filter_mutex to prevent stale rule data. */ +static struct audit_entry *audit_find_rule(struct audit_entry *entry, + struct list_head *list) +{ + struct audit_entry *e, *found = NULL; + int h; + + if (entry->rule.watch) { + /* we don't know the inode number, so must walk entire hash */ + for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { + list = &audit_inode_hash[h]; + list_for_each_entry(e, list, list) + if (!audit_compare_rule(&entry->rule, &e->rule)) { + found = e; + goto out; + } + } + goto out; + } + + list_for_each_entry(e, list, list) + if (!audit_compare_rule(&entry->rule, &e->rule)) { + found = e; + goto out; + } + +out: + return found; +} + +/* Get path information necessary for adding watches. */ +static int audit_get_nd(char *path, struct nameidata **ndp, + struct nameidata **ndw) +{ + struct nameidata *ndparent, *ndwatch; + int err; + + ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); + if (unlikely(!ndparent)) + return -ENOMEM; + + ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); + if (unlikely(!ndwatch)) { + kfree(ndparent); + return -ENOMEM; + } + + err = path_lookup(path, LOOKUP_PARENT, ndparent); + if (err) { + kfree(ndparent); + kfree(ndwatch); + return err; + } + + err = path_lookup(path, 0, ndwatch); + if (err) { + kfree(ndwatch); + ndwatch = NULL; + } + + *ndp = ndparent; + *ndw = ndwatch; + + return 0; +} + +/* Release resources used for watch path information. */ +static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) +{ + if (ndp) { + path_release(ndp); + kfree(ndp); + } + if (ndw) { + path_release(ndw); + kfree(ndw); + } +} + +/* Associate the given rule with an existing parent inotify_watch. + * Caller must hold audit_filter_mutex. 
*/ +static void audit_add_to_parent(struct audit_krule *krule, + struct audit_parent *parent) +{ + struct audit_watch *w, *watch = krule->watch; + int watch_found = 0; + + list_for_each_entry(w, &parent->watches, wlist) { + if (strcmp(watch->path, w->path)) + continue; + + watch_found = 1; + + /* put krule's and initial refs to temporary watch */ + audit_put_watch(watch); + audit_put_watch(watch); + + audit_get_watch(w); + krule->watch = watch = w; + break; + } + + if (!watch_found) { + get_inotify_watch(&parent->wdata); + watch->parent = parent; + + list_add(&watch->wlist, &parent->watches); + } + list_add(&krule->rlist, &watch->rules); +} + +/* Find a matching watch entry, or add this one. + * Caller must hold audit_filter_mutex. */ +static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, + struct nameidata *ndw) +{ + struct audit_watch *watch = krule->watch; + struct inotify_watch *i_watch; + struct audit_parent *parent; + int ret = 0; + + /* update watch filter fields */ + if (ndw) { + watch->dev = ndw->dentry->d_inode->i_sb->s_dev; + watch->ino = ndw->dentry->d_inode->i_ino; + } + + /* The audit_filter_mutex must not be held during inotify calls because + * we hold it during inotify event callback processing. If an existing + * inotify watch is found, inotify_find_watch() grabs a reference before + * returning. + */ + mutex_unlock(&audit_filter_mutex); + + if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) { + parent = audit_init_parent(ndp); + if (IS_ERR(parent)) { + /* caller expects mutex locked */ + mutex_lock(&audit_filter_mutex); + return PTR_ERR(parent); + } + } else + parent = container_of(i_watch, struct audit_parent, wdata); + + mutex_lock(&audit_filter_mutex); + + /* parent was moved before we took audit_filter_mutex */ + if (parent->flags & AUDIT_PARENT_INVALID) + ret = -ENOENT; + else + audit_add_to_parent(krule, parent); + + /* match get in audit_init_parent or inotify_find_watch */ + put_inotify_watch(&parent->wdata); + return ret; +} + +/* Add rule to given filterlist if not a duplicate. */ static inline int audit_add_rule(struct audit_entry *entry, - struct list_head *list) + struct list_head *list) { struct audit_entry *e; + struct audit_field *inode_f = entry->rule.inode_f; + struct audit_watch *watch = entry->rule.watch; + struct nameidata *ndp, *ndw; + int h, err, putnd_needed = 0; + + if (inode_f) { + h = audit_hash_ino(inode_f->val); + list = &audit_inode_hash[h]; + } + + mutex_lock(&audit_filter_mutex); + e = audit_find_rule(entry, list); + mutex_unlock(&audit_filter_mutex); + if (e) { + err = -EEXIST; + goto error; + } - /* Do not use the _rcu iterator here, since this is the only - * addition routine. */ - list_for_each_entry(e, list, list) { - if (!audit_compare_rule(&entry->rule, &e->rule)) - return -EEXIST; + /* Avoid calling path_lookup under audit_filter_mutex. 
*/ + if (watch) { + err = audit_get_nd(watch->path, &ndp, &ndw); + if (err) + goto error; + putnd_needed = 1; + } + + mutex_lock(&audit_filter_mutex); + if (watch) { + /* audit_filter_mutex is dropped and re-taken during this call */ + err = audit_add_watch(&entry->rule, ndp, ndw); + if (err) { + mutex_unlock(&audit_filter_mutex); + goto error; + } + h = audit_hash_ino((u32)watch->ino); + list = &audit_inode_hash[h]; } if (entry->rule.flags & AUDIT_FILTER_PREPEND) { @@ -488,27 +1047,77 @@ static inline int audit_add_rule(struct audit_entry *entry, } else { list_add_tail_rcu(&entry->list, list); } + mutex_unlock(&audit_filter_mutex); - return 0; + if (putnd_needed) + audit_put_nd(ndp, ndw); + + return 0; + +error: + if (putnd_needed) + audit_put_nd(ndp, ndw); + if (watch) + audit_put_watch(watch); /* tmp watch, matches initial get */ + return err; } -/* Remove an existing rule from filterlist. Protected by - * audit_netlink_mutex. */ +/* Remove an existing rule from filterlist. */ static inline int audit_del_rule(struct audit_entry *entry, struct list_head *list) { struct audit_entry *e; + struct audit_field *inode_f = entry->rule.inode_f; + struct audit_watch *watch, *tmp_watch = entry->rule.watch; + LIST_HEAD(inotify_list); + int h, ret = 0; + + if (inode_f) { + h = audit_hash_ino(inode_f->val); + list = &audit_inode_hash[h]; + } - /* Do not use the _rcu iterator here, since this is the only - * deletion routine. */ - list_for_each_entry(e, list, list) { - if (!audit_compare_rule(&entry->rule, &e->rule)) { - list_del_rcu(&e->list); - call_rcu(&e->rcu, audit_free_rule_rcu); - return 0; + mutex_lock(&audit_filter_mutex); + e = audit_find_rule(entry, list); + if (!e) { + mutex_unlock(&audit_filter_mutex); + ret = -ENOENT; + goto out; + } + + watch = e->rule.watch; + if (watch) { + struct audit_parent *parent = watch->parent; + + list_del(&e->rule.rlist); + + if (list_empty(&watch->rules)) { + audit_remove_watch(watch); + + if (list_empty(&parent->watches)) { + /* Put parent on the inotify un-registration + * list. Grab a reference before releasing + * audit_filter_mutex, to be released in + * audit_inotify_unregister(). */ + list_add(&parent->ilist, &inotify_list); + get_inotify_watch(&parent->wdata); + } } } - return -ENOENT; /* No matching rule */ + + list_del_rcu(&e->list); + call_rcu(&e->rcu, audit_free_rule_rcu); + + mutex_unlock(&audit_filter_mutex); + + if (!list_empty(&inotify_list)) + audit_inotify_unregister(&inotify_list); + +out: + if (tmp_watch) + audit_put_watch(tmp_watch); /* match initial get */ + + return ret; } /* List rules using struct audit_rule. Exists for backward @@ -519,8 +1128,8 @@ static void audit_list(int pid, int seq, struct sk_buff_head *q) struct audit_entry *entry; int i; - /* The *_rcu iterators not needed here because we are - always called with audit_netlink_mutex held. */ + /* This is a blocking read, so use audit_filter_mutex instead of rcu + * iterator to sync with list writers. */ for (i=0; irule); + if (unlikely(!rule)) + break; + skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, + rule, sizeof(*rule)); + if (skb) + skb_queue_tail(q, skb); + kfree(rule); + } + } skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); if (skb) skb_queue_tail(q, skb); @@ -547,8 +1170,8 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) struct audit_entry *e; int i; - /* The *_rcu iterators not needed here because we are - always called with audit_netlink_mutex held. 
*/ + /* This is a blocking read, so use audit_filter_mutex instead of rcu + * iterator to sync with list writers. */ for (i=0; ibuflen); + if (skb) + skb_queue_tail(q, skb); + kfree(data); + } + } + for (i=0; i< AUDIT_INODE_BUCKETS; i++) { + list_for_each_entry(e, &audit_inode_hash[i], list) { + struct audit_rule_data *data; + + data = audit_krule_to_data(&e->rule); + if (unlikely(!data)) + break; + skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, + data, sizeof(*data) + data->buflen); if (skb) skb_queue_tail(q, skb); kfree(data); @@ -602,10 +1239,12 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, dest->pid = pid; skb_queue_head_init(&dest->q); + mutex_lock(&audit_filter_mutex); if (type == AUDIT_LIST) audit_list(pid, seq, &dest->q); else audit_list_rules(pid, seq, &dest->q); + mutex_unlock(&audit_filter_mutex); tsk = kthread_run(audit_send_list, dest, "audit_send_list"); if (IS_ERR(tsk)) { @@ -625,6 +1264,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_add_rule(entry, &audit_filter_list[entry->rule.listnr]); + if (sid) { char *ctx = NULL; u32 len; @@ -705,7 +1345,39 @@ int audit_comparator(const u32 left, const u32 op, const u32 right) return 0; } +/* Compare given dentry name with last component in given path, + * return of 0 indicates a match. */ +int audit_compare_dname_path(const char *dname, const char *path) +{ + int dlen, plen; + const char *p; + + if (!dname || !path) + return 1; + + dlen = strlen(dname); + plen = strlen(path); + if (plen < dlen) + return 1; + + /* disregard trailing slashes */ + p = path + plen - 1; + while ((*p == '/') && (p > path)) + p--; + + /* find last path component */ + p = p - dlen + 1; + if (p < path) + return 1; + else if (p > path) { + if (*--p != '/') + return 1; + else + p++; + } + return strncmp(p, dname, dlen); +} static int audit_filter_user_rules(struct netlink_skb_parms *cb, struct audit_krule *rule, @@ -818,32 +1490,65 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule) int selinux_audit_rule_update(void) { struct audit_entry *entry, *n, *nentry; + struct audit_watch *watch; int i, err = 0; - /* audit_netlink_mutex synchronizes the writers */ - mutex_lock(&audit_netlink_mutex); + /* audit_filter_mutex synchronizes the writers */ + mutex_lock(&audit_filter_mutex); for (i = 0; i < AUDIT_NR_FILTERS; i++) { list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { if (!audit_rule_has_selinux(&entry->rule)) continue; - nentry = audit_dupe_rule(&entry->rule); + watch = entry->rule.watch; + nentry = audit_dupe_rule(&entry->rule, watch); if (unlikely(IS_ERR(nentry))) { /* save the first error encountered for the * return value */ if (!err) err = PTR_ERR(nentry); audit_panic("error updating selinux filters"); + if (watch) + list_del(&entry->rule.rlist); list_del_rcu(&entry->list); } else { + if (watch) { + list_add(&nentry->rule.rlist, + &watch->rules); + list_del(&entry->rule.rlist); + } list_replace_rcu(&entry->list, &nentry->list); } call_rcu(&entry->rcu, audit_free_rule_rcu); } } - mutex_unlock(&audit_netlink_mutex); + mutex_unlock(&audit_filter_mutex); return err; } + +/* Update watch data in audit rules based on inotify events. 
*/ +void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, + u32 cookie, const char *dname, struct inode *inode) +{ + struct audit_parent *parent; + + parent = container_of(i_watch, struct audit_parent, wdata); + + if (mask & (IN_CREATE|IN_MOVED_TO) && inode) + audit_update_watch(parent, dname, inode->i_sb->s_dev, + inode->i_ino, 0); + else if (mask & (IN_DELETE|IN_MOVED_FROM)) + audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); + /* inotify automatically removes the watch and sends IN_IGNORED */ + else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) + audit_remove_parent_watches(parent); + /* inotify does not remove the watch, so remove it manually */ + else if(mask & IN_MOVE_SELF) { + audit_remove_parent_watches(parent); + inotify_remove_watch_locked(audit_ih, i_watch); + } else if (mask & IN_IGNORED) + put_inotify_watch(i_watch); +} diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 14e295a4121..174a3f62489 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -200,12 +200,13 @@ struct audit_context { #endif }; - +/* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. */ static int audit_filter_rules(struct task_struct *tsk, struct audit_krule *rule, struct audit_context *ctx, + struct audit_names *name, enum audit_state *state) { int i, j, need_sid = 1; @@ -268,7 +269,10 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMAJOR: - if (ctx) { + if (name) + result = audit_comparator(MAJOR(name->dev), + f->op, f->val); + else if (ctx) { for (j = 0; j < ctx->name_count; j++) { if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { ++result; @@ -278,7 +282,10 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMINOR: - if (ctx) { + if (name) + result = audit_comparator(MINOR(name->dev), + f->op, f->val); + else if (ctx) { for (j = 0; j < ctx->name_count; j++) { if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { ++result; @@ -288,7 +295,10 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_INODE: - if (ctx) { + if (name) + result = (name->ino == f->val || + name->pino == f->val); + else if (ctx) { for (j = 0; j < ctx->name_count; j++) { if (audit_comparator(ctx->names[j].ino, f->op, f->val) || audit_comparator(ctx->names[j].pino, f->op, f->val)) { @@ -298,6 +308,12 @@ static int audit_filter_rules(struct task_struct *tsk, } } break; + case AUDIT_WATCH: + if (name && rule->watch->ino != (unsigned long)-1) + result = (name->dev == rule->watch->dev && + (name->ino == rule->watch->ino || + name->pino == rule->watch->ino)); + break; case AUDIT_LOGINUID: result = 0; if (ctx) @@ -354,7 +370,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { - if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { + if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { rcu_read_unlock(); return state; } @@ -384,8 +400,9 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, int bit = AUDIT_BIT(ctx->major); list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit - && audit_filter_rules(tsk, &e->rule, ctx, &state)) { + if ((e->rule.mask[word] & bit) == bit && + audit_filter_rules(tsk, &e->rule, ctx, NULL, + &state)) { rcu_read_unlock(); return state; } @@ -395,6 +412,49 @@ static enum audit_state audit_filter_syscall(struct 
task_struct *tsk, return AUDIT_BUILD_CONTEXT; } +/* At syscall exit time, this filter is called if any audit_names[] have been + * collected during syscall processing. We only check rules in sublists at hash + * buckets applicable to the inode numbers in audit_names[]. + * Regarding audit_state, same rules apply as for audit_filter_syscall(). + */ +enum audit_state audit_filter_inodes(struct task_struct *tsk, + struct audit_context *ctx) +{ + int i; + struct audit_entry *e; + enum audit_state state; + + if (audit_pid && tsk->tgid == audit_pid) + return AUDIT_DISABLED; + + rcu_read_lock(); + for (i = 0; i < ctx->name_count; i++) { + int word = AUDIT_WORD(ctx->major); + int bit = AUDIT_BIT(ctx->major); + struct audit_names *n = &ctx->names[i]; + int h = audit_hash_ino((u32)n->ino); + struct list_head *list = &audit_inode_hash[h]; + + if (list_empty(list)) + continue; + + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit && + audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { + rcu_read_unlock(); + return state; + } + } + } + rcu_read_unlock(); + return AUDIT_BUILD_CONTEXT; +} + +void audit_set_auditable(struct audit_context *ctx) +{ + ctx->auditable = 1; +} + static inline struct audit_context *audit_get_context(struct task_struct *tsk, int return_valid, int return_code) @@ -408,11 +468,20 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, if (context->in_syscall && !context->auditable) { enum audit_state state; + state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); + if (state == AUDIT_RECORD_CONTEXT) { + context->auditable = 1; + goto get_context; + } + + state = audit_filter_inodes(tsk, context); if (state == AUDIT_RECORD_CONTEXT) context->auditable = 1; + } +get_context: context->pid = tsk->pid; context->ppid = sys_getppid(); /* sic. tsk == current in all cases */ context->uid = tsk->uid; @@ -1142,37 +1211,20 @@ void __audit_inode_child(const char *dname, const struct inode *inode, return; /* determine matching parent */ - if (dname) - for (idx = 0; idx < context->name_count; idx++) - if (context->names[idx].pino == pino) { - const char *n; - const char *name = context->names[idx].name; - int dlen = strlen(dname); - int nlen = name ? 
strlen(name) : 0; - - if (nlen < dlen) - continue; - - /* disregard trailing slashes */ - n = name + nlen - 1; - while ((*n == '/') && (n > name)) - n--; - - /* find last path component */ - n = n - dlen + 1; - if (n < name) - continue; - else if (n > name) { - if (*--n != '/') - continue; - else - n++; - } + if (!dname) + goto no_match; + for (idx = 0; idx < context->name_count; idx++) + if (context->names[idx].pino == pino) { + const char *name = context->names[idx].name; - if (strncmp(n, dname, dlen) == 0) - goto update_context; - } + if (!name) + continue; + + if (audit_compare_dname_path(dname, name) == 0) + goto update_context; + } +no_match: /* catch-all in case match not found */ idx = context->name_count++; context->names[idx].name = NULL; -- cgit From 0a73dccc4fd472e65887eae6fbf4afc030541709 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Jun 2006 08:15:59 -0400 Subject: [PATCH] validate rule fields' types Signed-off-by: Al Viro --- kernel/auditfilter.c | 57 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 03a6919103d..9f985dd5e93 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -348,17 +348,31 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) f->val = rule->values[i]; err = -EINVAL; - if (f->type & AUDIT_UNUSED_BITS) - goto exit_free; - switch(f->type) { - case AUDIT_SE_USER: - case AUDIT_SE_ROLE: - case AUDIT_SE_TYPE: - case AUDIT_SE_SEN: - case AUDIT_SE_CLR: - case AUDIT_WATCH: + default: goto exit_free; + case AUDIT_PID: + case AUDIT_UID: + case AUDIT_EUID: + case AUDIT_SUID: + case AUDIT_FSUID: + case AUDIT_GID: + case AUDIT_EGID: + case AUDIT_SGID: + case AUDIT_FSGID: + case AUDIT_LOGINUID: + case AUDIT_PERS: + case AUDIT_ARCH: + case AUDIT_MSGTYPE: + case AUDIT_DEVMAJOR: + case AUDIT_DEVMINOR: + case AUDIT_EXIT: + case AUDIT_SUCCESS: + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + break; case AUDIT_INODE: err = audit_to_inode(&entry->rule, f); if (err) @@ -432,6 +446,29 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->se_str = NULL; f->se_rule = NULL; switch(f->type) { + case AUDIT_PID: + case AUDIT_UID: + case AUDIT_EUID: + case AUDIT_SUID: + case AUDIT_FSUID: + case AUDIT_GID: + case AUDIT_EGID: + case AUDIT_SGID: + case AUDIT_FSGID: + case AUDIT_LOGINUID: + case AUDIT_PERS: + case AUDIT_ARCH: + case AUDIT_MSGTYPE: + case AUDIT_PPID: + case AUDIT_DEVMAJOR: + case AUDIT_DEVMINOR: + case AUDIT_EXIT: + case AUDIT_SUCCESS: + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + break; case AUDIT_SE_USER: case AUDIT_SE_ROLE: case AUDIT_SE_TYPE: @@ -474,6 +511,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, if (err) goto exit_free; break; + default: + goto exit_free; } } -- cgit From 6a2bceec0ea7fdc47aef9a3f2f771c201eaabe5d Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Fri, 2 Jun 2006 13:16:01 -0400 Subject: [PATCH] fix AUDIT_FILTER_PREPEND handling Clear AUDIT_FILTER_PREPEND flag after adding rule to list. 
This fixes three problems when a rule is added with the -A syntax: - auditctl displays filter list as "(null)" - the rule cannot be removed using -d - a duplicate rule can be added with -a Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- kernel/auditfilter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 9f985dd5e93..a536f7148bc 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1083,6 +1083,7 @@ static inline int audit_add_rule(struct audit_entry *entry, if (entry->rule.flags & AUDIT_FILTER_PREPEND) { list_add_rcu(&entry->list, list); + entry->rule.flags &= ~AUDIT_FILTER_PREPEND; } else { list_add_tail_rcu(&entry->list, list); } -- cgit From 9c937dcc71021f2dbf78f904f03d962dd9bcc130 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Thu, 8 Jun 2006 23:19:31 -0400 Subject: [PATCH] log more info for directory entry change events When an audit event involves changes to a directory entry, include a PATH record for the directory itself. A few other notable changes: - fixed audit_inode_child() hooks in fsnotify_move() - removed unused flags arg from audit_inode() - added audit log routines for logging a portion of a string Here's some sample output. before patch: type=SYSCALL msg=audit(1149821605.320:26): arch=40000003 syscall=39 success=yes exit=0 a0=bf8d3c7c a1=1ff a2=804e1b8 a3=bf8d3c7c items=1 ppid=739 pid=800 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=ttyS0 comm="mkdir" exe="/bin/mkdir" subj=root:system_r:unconfined_t:s0-s0:c0.c255 type=CWD msg=audit(1149821605.320:26): cwd="/root" type=PATH msg=audit(1149821605.320:26): item=0 name="foo" parent=164068 inode=164010 dev=03:00 mode=040755 ouid=0 ogid=0 rdev=00:00 obj=root:object_r:user_home_t:s0 after patch: type=SYSCALL msg=audit(1149822032.332:24): arch=40000003 syscall=39 success=yes exit=0 a0=bfdd9c7c a1=1ff a2=804e1b8 a3=bfdd9c7c items=2 ppid=714 pid=777 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=ttyS0 comm="mkdir" exe="/bin/mkdir" subj=root:system_r:unconfined_t:s0-s0:c0.c255 type=CWD msg=audit(1149822032.332:24): cwd="/root" type=PATH msg=audit(1149822032.332:24): item=0 name="/root" inode=164068 dev=03:00 mode=040750 ouid=0 ogid=0 rdev=00:00 obj=root:object_r:user_home_dir_t:s0 type=PATH msg=audit(1149822032.332:24): item=1 name="foo" inode=164010 dev=03:00 mode=040755 ouid=0 ogid=0 rdev=00:00 obj=root:object_r:user_home_t:s0 Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- kernel/audit.c | 54 ++++++++++++++++++++-- kernel/audit.h | 3 +- kernel/auditfilter.c | 8 +++- kernel/auditsc.c | 123 +++++++++++++++++++++++++++++---------------------- 4 files changed, 127 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 0fbf1c11636..7dfac7031bd 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1051,20 +1051,53 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, skb_put(skb, len << 1); /* new string is twice the old string */ } +/* + * Format a string of no more than slen characters into the audit buffer, + * enclosed in quote marks. 
+ */ +static void audit_log_n_string(struct audit_buffer *ab, size_t slen, + const char *string) +{ + int avail, new_len; + unsigned char *ptr; + struct sk_buff *skb; + + BUG_ON(!ab->skb); + skb = ab->skb; + avail = skb_tailroom(skb); + new_len = slen + 3; /* enclosing quotes + null terminator */ + if (new_len > avail) { + avail = audit_expand(ab, new_len); + if (!avail) + return; + } + ptr = skb->tail; + *ptr++ = '"'; + memcpy(ptr, string, slen); + ptr += slen; + *ptr++ = '"'; + *ptr = 0; + skb_put(skb, slen + 2); /* don't include null terminator */ +} + /** - * audit_log_unstrustedstring - log a string that may contain random characters + * audit_log_n_unstrustedstring - log a string that may contain random characters * @ab: audit_buffer + * @len: lenth of string (not including trailing null) * @string: string to be logged * * This code will escape a string that is passed to it if the string * contains a control character, unprintable character, double quote mark, * or a space. Unescaped strings will start and end with a double quote mark. * Strings that are escaped are printed in hex (2 digits per char). + * + * The caller specifies the number of characters in the string to log, which may + * or may not be the entire string. */ -const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) +const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, + const char *string) { const unsigned char *p = string; - size_t len = strlen(string); while (*p) { if (*p == '"' || *p < 0x21 || *p > 0x7f) { @@ -1073,10 +1106,23 @@ const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *strin } p++; } - audit_log_format(ab, "\"%s\"", string); + audit_log_n_string(ab, len, string); return p + 1; } +/** + * audit_log_unstrustedstring - log a string that may contain random characters + * @ab: audit_buffer + * @string: string to be logged + * + * Same as audit_log_n_unstrustedstring(), except that strlen is used to + * determine string length. 
+ */ +const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) +{ + return audit_log_n_untrustedstring(ab, strlen(string), string); +} + /* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, struct dentry *dentry, struct vfsmount *vfsmnt) diff --git a/kernel/audit.h b/kernel/audit.h index 58fa44cb8d0..8323e4132a3 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -104,7 +104,8 @@ static inline int audit_hash_ino(u32 ino) } extern int audit_comparator(const u32 left, const u32 op, const u32 right); -extern int audit_compare_dname_path(const char *dname, const char *path); +extern int audit_compare_dname_path(const char *dname, const char *path, + int *dirlen); extern struct sk_buff * audit_make_reply(int pid, int seq, int type, int done, int multi, void *payload, int size); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a536f7148bc..4c99d2c586e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -787,7 +787,7 @@ static void audit_update_watch(struct audit_parent *parent, mutex_lock(&audit_filter_mutex); list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(dname, owatch->path)) + if (audit_compare_dname_path(dname, owatch->path, NULL)) continue; /* If the update involves invalidating rules, do the inode-based @@ -1387,7 +1387,8 @@ int audit_comparator(const u32 left, const u32 op, const u32 right) /* Compare given dentry name with last component in given path, * return of 0 indicates a match. */ -int audit_compare_dname_path(const char *dname, const char *path) +int audit_compare_dname_path(const char *dname, const char *path, + int *dirlen) { int dlen, plen; const char *p; @@ -1416,6 +1417,9 @@ int audit_compare_dname_path(const char *dname, const char *path) p++; } + /* return length of path's directory component */ + if (dirlen) + *dirlen = p - path; return strncmp(p, dname, dlen); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 174a3f62489..851ae0217e4 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -82,6 +82,9 @@ extern int audit_enabled; * path_lookup. */ #define AUDIT_NAMES_RESERVED 7 +/* Indicates that audit should log the full pathname. */ +#define AUDIT_NAME_FULL -1 + /* When fs/namei.c:getname() is called, we store the pointer in name and * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). @@ -89,8 +92,9 @@ extern int audit_enabled; * Further, in fs/namei.c:path_lookup() we store the inode and device. 
*/ struct audit_names { const char *name; + int name_len; /* number of name's characters to log */ + unsigned name_put; /* call __putname() for this name */ unsigned long ino; - unsigned long pino; dev_t dev; umode_t mode; uid_t uid; @@ -296,12 +300,10 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_INODE: if (name) - result = (name->ino == f->val || - name->pino == f->val); + result = (name->ino == f->val); else if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(ctx->names[j].ino, f->op, f->val) || - audit_comparator(ctx->names[j].pino, f->op, f->val)) { + if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { ++result; break; } @@ -311,8 +313,7 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_WATCH: if (name && rule->watch->ino != (unsigned long)-1) result = (name->dev == rule->watch->dev && - (name->ino == rule->watch->ino || - name->pino == rule->watch->ino)); + name->ino == rule->watch->ino); break; case AUDIT_LOGINUID: result = 0; @@ -526,7 +527,7 @@ static inline void audit_free_names(struct audit_context *context) #endif for (i = 0; i < context->name_count; i++) { - if (context->names[i].name) + if (context->names[i].name && context->names[i].name_put) __putname(context->names[i].name); } context->name_count = 0; @@ -850,8 +851,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } } for (i = 0; i < context->name_count; i++) { - unsigned long ino = context->names[i].ino; - unsigned long pino = context->names[i].pino; + struct audit_names *n = &context->names[i]; ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); if (!ab) @@ -859,33 +859,47 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_format(ab, "item=%d", i); - audit_log_format(ab, " name="); - if (context->names[i].name) - audit_log_untrustedstring(ab, context->names[i].name); - else - audit_log_format(ab, "(null)"); - - if (pino != (unsigned long)-1) - audit_log_format(ab, " parent=%lu", pino); - if (ino != (unsigned long)-1) - audit_log_format(ab, " inode=%lu", ino); - if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1)) - audit_log_format(ab, " dev=%02x:%02x mode=%#o" - " ouid=%u ogid=%u rdev=%02x:%02x", - MAJOR(context->names[i].dev), - MINOR(context->names[i].dev), - context->names[i].mode, - context->names[i].uid, - context->names[i].gid, - MAJOR(context->names[i].rdev), - MINOR(context->names[i].rdev)); - if (context->names[i].osid != 0) { + if (n->name) { + switch(n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd */ + audit_log_d_path(ab, " name=", context->pwd, + context->pwdmnt); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name_len, + n->name); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != (unsigned long)-1) { + audit_log_format(ab, " inode=%lu" + " dev=%02x:%02x mode=%#o" + " ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + n->uid, + n->gid, + MAJOR(n->rdev), + MINOR(n->rdev)); + } + if (n->osid != 0) { char *ctx = NULL; u32 len; if (selinux_ctxid_to_string( - context->names[i].osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", - context->names[i].osid); + n->osid, &ctx, &len)) { + audit_log_format(ab, 
" osid=%u", n->osid); call_panic = 2; } else audit_log_format(ab, " obj=%s", ctx); @@ -1075,6 +1089,8 @@ void __audit_getname(const char *name) } BUG_ON(context->name_count >= AUDIT_NAMES); context->names[context->name_count].name = name; + context->names[context->name_count].name_len = AUDIT_NAME_FULL; + context->names[context->name_count].name_put = 1; context->names[context->name_count].ino = (unsigned long)-1; ++context->name_count; if (!context->pwd) { @@ -1141,11 +1157,10 @@ static void audit_inode_context(int idx, const struct inode *inode) * audit_inode - store the inode and device from a lookup * @name: name being audited * @inode: inode being audited - * @flags: lookup flags (as used in path_lookup()) * * Called from fs/namei.c:path_lookup(). */ -void __audit_inode(const char *name, const struct inode *inode, unsigned flags) +void __audit_inode(const char *name, const struct inode *inode) { int idx; struct audit_context *context = current->audit_context; @@ -1171,20 +1186,13 @@ void __audit_inode(const char *name, const struct inode *inode, unsigned flags) ++context->ino_count; #endif } + context->names[idx].ino = inode->i_ino; context->names[idx].dev = inode->i_sb->s_dev; context->names[idx].mode = inode->i_mode; context->names[idx].uid = inode->i_uid; context->names[idx].gid = inode->i_gid; context->names[idx].rdev = inode->i_rdev; audit_inode_context(idx, inode); - if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) && - (strcmp(name, ".") != 0)) { - context->names[idx].ino = (unsigned long)-1; - context->names[idx].pino = inode->i_ino; - } else { - context->names[idx].ino = inode->i_ino; - context->names[idx].pino = (unsigned long)-1; - } } /** @@ -1206,34 +1214,40 @@ void __audit_inode_child(const char *dname, const struct inode *inode, { int idx; struct audit_context *context = current->audit_context; + const char *found_name = NULL; + int dirlen = 0; if (!context->in_syscall) return; /* determine matching parent */ if (!dname) - goto no_match; + goto update_context; for (idx = 0; idx < context->name_count; idx++) - if (context->names[idx].pino == pino) { + if (context->names[idx].ino == pino) { const char *name = context->names[idx].name; if (!name) continue; - if (audit_compare_dname_path(dname, name) == 0) - goto update_context; + if (audit_compare_dname_path(dname, name, &dirlen) == 0) { + context->names[idx].name_len = dirlen; + found_name = name; + break; + } } -no_match: - /* catch-all in case match not found */ +update_context: idx = context->name_count++; - context->names[idx].name = NULL; - context->names[idx].pino = pino; #if AUDIT_DEBUG context->ino_count++; #endif + /* Re-use the name belonging to the slot for a matching parent directory. 
+ * All names for this context are relinquished in audit_free_names() */ + context->names[idx].name = found_name; + context->names[idx].name_len = AUDIT_NAME_FULL; + context->names[idx].name_put = 0; /* don't call __putname() */ -update_context: if (inode) { context->names[idx].ino = inode->i_ino; context->names[idx].dev = inode->i_sb->s_dev; @@ -1242,7 +1256,8 @@ update_context: context->names[idx].gid = inode->i_gid; context->names[idx].rdev = inode->i_rdev; audit_inode_context(idx, inode); - } + } else + context->names[idx].ino = (unsigned long)-1; } /** -- cgit From 41757106b9ca7867dafb2404d618f947b4786fd7 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Mon, 12 Jun 2006 07:48:28 -0400 Subject: [PATCH] make set_loginuid obey audit_enabled Hi, I was doing some testing and noticed that when the audit system was disabled, I was still getting messages about the loginuid being set. The following patch makes audit_set_loginuid look at in_syscall to determine if it should create an audit event. The loginuid will continue to be set as long as there is a context. Signed-off-by: Steve Grubb Signed-off-by: Al Viro --- kernel/auditsc.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 851ae0217e4..b097ccb4eb7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1290,18 +1290,23 @@ void auditsc_get_stamp(struct audit_context *ctx, */ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) { - if (task->audit_context) { - struct audit_buffer *ab; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); - if (ab) { - audit_log_format(ab, "login pid=%d uid=%u " - "old auid=%u new auid=%u", - task->pid, task->uid, - task->audit_context->loginuid, loginuid); - audit_log_end(ab); + struct audit_context *context = task->audit_context; + + if (context) { + /* Only log if audit is enabled */ + if (context->in_syscall) { + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + if (ab) { + audit_log_format(ab, "login pid=%d uid=%u " + "old auid=%u new auid=%u", + task->pid, task->uid, + context->loginuid, loginuid); + audit_log_end(ab); + } } - task->audit_context->loginuid = loginuid; + context->loginuid = loginuid; } return 0; } -- cgit From d720024e94de4e8b7f10ee83c532926f3ad5d708 Mon Sep 17 00:00:00 2001 From: Michael LeMay Date: Thu, 22 Jun 2006 14:47:17 -0700 Subject: [PATCH] selinux: add hooks for key subsystem Introduce SELinux hooks to support the access key retention subsystem within the kernel. Incorporate new flask headers from a modified version of the SELinux reference policy, with support for the new security class representing retained keys. Extend the "key_alloc" security hook with a task parameter representing the intended ownership context for the key being allocated. Attach security information to root's default keyrings within the SELinux initialization routine. Has passed David's testsuite. 
Signed-off-by: Michael LeMay Signed-off-by: David Howells Signed-off-by: James Morris Acked-by: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 4b1eb745afa..6408c042429 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -148,7 +148,7 @@ struct user_struct * alloc_uid(uid_t uid) new->mq_bytes = 0; new->locked_shm = 0; - if (alloc_uid_keyring(new) < 0) { + if (alloc_uid_keyring(new, current) < 0) { kmem_cache_free(uid_cachep, new); return NULL; } -- cgit From 6cc0719181a7aa8883855140541e7892250e66af Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 22 Jun 2006 14:47:18 -0700 Subject: [PATCH] suspend_console() warning fix kernel/power/main.c: In function 'suspend_prepare': kernel/power/main.c:89: warning: implicit declaration of function 'suspend_console' kernel/power/main.c: In function 'suspend_finish': kernel/power/main.c:137: warning: implicit declaration of function 'resume_console' Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 0a907f0dc56..cdf0f07af92 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -15,7 +15,7 @@ #include #include #include - +#include #include "power.h" -- cgit From de047c1bcd7f7bcfbdc29eb5b439fb332594da3f Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Thu, 22 Jun 2006 14:47:26 -0700 Subject: [PATCH] avoid tasklist_lock at getrusage for multithreaded case too Avoid taking tasklist_lock for at getrusage for the multithreaded case too. We don't need to take the tasklist lock for thread traversal of a process since Oleg's do-__unhash_process-under-siglock.patch and related work. Signed-off-by: Ravikiran Thirumalai Cc: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 56 ++++++++++++++++++++++---------------------------------- 1 file changed, 22 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 0b6ec0e7936..fc9ebbbaba0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1860,23 +1860,20 @@ out: * fields when reaping, so a sample either gets all the additions of a * given child after it's reaped, or none so this sample is before reaping. * - * tasklist_lock locking optimisation: - * If we are current and single threaded, we do not need to take the tasklist - * lock or the siglock. No one else can take our signal_struct away, - * no one else can reap the children to update signal->c* counters, and - * no one else can race with the signal-> fields. - * If we do not take the tasklist_lock, the signal-> fields could be read - * out of order while another thread was just exiting. So we place a - * read memory barrier when we avoid the lock. On the writer side, - * write memory barrier is implied in __exit_signal as __exit_signal releases - * the siglock spinlock after updating the signal-> fields. - * - * We don't really need the siglock when we access the non c* fields - * of the signal_struct (for RUSAGE_SELF) even in multithreaded - * case, since we take the tasklist lock for read and the non c* signal-> - * fields are updated only in __exit_signal, which is called with - * tasklist_lock taken for write, hence these two threads cannot execute - * concurrently. 
+ * Locking: + * We need to take the siglock for CHILDEREN, SELF and BOTH + * for the cases current multithreaded, non-current single threaded + * non-current multithreaded. Thread traversal is now safe with + * the siglock held. + * Strictly speaking, we donot need to take the siglock if we are current and + * single threaded, as no one else can take our signal_struct away, no one + * else can reap the children to update signal->c* counters, and no one else + * can race with the signal-> fields. If we do not take any lock, the + * signal-> fields could be read out of order while another thread was just + * exiting. So we should place a read memory barrier when we avoid the lock. + * On the writer side, write memory barrier is implied in __exit_signal + * as __exit_signal releases the siglock spinlock after updating the signal-> + * fields. But we don't do this yet to keep things simple. * */ @@ -1885,35 +1882,25 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; cputime_t utime, stime; - int need_lock = 0; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; - if (p != current || !thread_group_empty(p)) - need_lock = 1; - - if (need_lock) { - read_lock(&tasklist_lock); - if (unlikely(!p->signal)) { - read_unlock(&tasklist_lock); - return; - } - } else - /* See locking comments above */ - smp_rmb(); + rcu_read_lock(); + if (!lock_task_sighand(p, &flags)) { + rcu_read_unlock(); + return; + } switch (who) { case RUSAGE_BOTH: case RUSAGE_CHILDREN: - spin_lock_irqsave(&p->sighand->siglock, flags); utime = p->signal->cutime; stime = p->signal->cstime; r->ru_nvcsw = p->signal->cnvcsw; r->ru_nivcsw = p->signal->cnivcsw; r->ru_minflt = p->signal->cmin_flt; r->ru_majflt = p->signal->cmaj_flt; - spin_unlock_irqrestore(&p->sighand->siglock, flags); if (who == RUSAGE_CHILDREN) break; @@ -1941,8 +1928,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) BUG(); } - if (need_lock) - read_unlock(&tasklist_lock); + unlock_task_sighand(p, &flags); + rcu_read_unlock(); + cputime_to_timeval(utime, &r->ru_utime); cputime_to_timeval(stime, &r->ru_stime); } -- cgit From 454e2398be9b9fa30433fccc548db34d19aa9958 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Jun 2006 02:02:57 -0700 Subject: [PATCH] VFS: Permit filesystem to override root dentry on mount Extend the get_sb() filesystem operation to take an extra argument that permits the VFS to pass in the target vfsmount that defines the mountpoint. The filesystem is then required to manually set the superblock and root dentry pointers. For most filesystems, this should be done with simple_set_mnt() which will set the superblock pointer and then set the root dentry to the superblock's s_root (as per the old default behaviour). The get_sb() op now returns an integer as there's now no need to return the superblock pointer. This patch permits a superblock to be implicitly shared amongst several mount points, such as can be done with NFS to avoid potential inode aliasing. In such a case, simple_set_mnt() would not be called, and instead the mnt_root and mnt_sb would be set directly. The patch also makes the following changes: (*) the get_sb_*() convenience functions in the core kernel now take a vfsmount pointer argument and return an integer, so most filesystems have to change very little. (*) If one of the convenience function is not used, then get_sb() should normally call simple_set_mnt() to instantiate the vfsmount. 
This will always return 0, and so can be tail-called from get_sb(). (*) generic_shutdown_super() now calls shrink_dcache_sb() to clean up the dcache upon superblock destruction rather than shrink_dcache_anon(). This is required because the superblock may now have multiple trees that aren't actually bound to s_root, but that still need to be cleaned up. The currently called functions assume that the whole tree is rooted at s_root, and that anonymous dentries are not the roots of trees which results in dentries being left unculled. However, with the way NFS superblock sharing are currently set to be implemented, these assumptions are violated: the root of the filesystem is simply a dummy dentry and inode (the real inode for '/' may well be inaccessible), and all the vfsmounts are rooted on anonymous[*] dentries with child trees. [*] Anonymous until discovered from another tree. (*) The documentation has been adjusted, including the additional bit of changing ext2_* into foo_* in the documentation. [akpm@osdl.org: convert ipath_fs, do other stuff] Signed-off-by: David Howells Acked-by: Al Viro Cc: Nathan Scott Cc: Roland Dreier Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 8 ++++---- kernel/futex.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ab81fdd4572..77f45ffd5ea 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -392,11 +392,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data, return 0; } -static struct super_block *cpuset_get_sb(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) +static int cpuset_get_sb(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, cpuset_fill_super); + return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt); } static struct file_system_type cpuset_fs_type = { diff --git a/kernel/futex.c b/kernel/futex.c index 5699c512057..e1a380c77a5 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1056,11 +1056,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, (unsigned long)uaddr2, val2, val3); } -static struct super_block * -futexfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int futexfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA); + return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt); } static struct file_system_type futex_fs_type = { -- cgit From 726c334223180e3c0197cc980a432681370d4baf Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Jun 2006 02:02:58 -0700 Subject: [PATCH] VFS: Permit filesystem to perform statfs with a known root dentry Give the statfs superblock operation a dentry pointer rather than a superblock pointer. This complements the get_sb() patch. That reduced the significance of sb->s_root, allowing NFS to place a fake root there. However, NFS does require a dentry to use as a target for the statfs operation. This permits the root in the vfsmount to be used instead. linux/mount.h has been added where necessary to make allyesconfig build successfully. Interest has also been expressed for use with the FUSE and XFS filesystems. 
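To make the new calling convention concrete, here is a minimal, hypothetical statfs method using the dentry-based signature this patch introduces; the foo_* names and magic number are invented for illustration and this sketch is not part of the patch itself:

#include <linux/fs.h>
#include <linux/statfs.h>
#include <linux/dcache.h>

#define FOOFS_MAGIC	0x00f00f00	/* hypothetical filesystem magic */

static int foo_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	/* the dentry identifies the mounted tree; the superblock hangs off it */
	struct super_block *sb = dentry->d_sb;

	buf->f_type    = FOOFS_MAGIC;
	buf->f_bsize   = sb->s_blocksize;
	buf->f_namelen = 255;
	/* a real filesystem would also fill in f_blocks, f_bfree, f_files, ... */
	return 0;
}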
Signed-off-by: David Howells Acked-by: Al Viro Cc: Nathan Scott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index b327f4d2010..6802020e0ce 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -118,7 +118,7 @@ static int check_free_space(struct file *file) spin_unlock(&acct_globals.lock); /* May block */ - if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf)) + if (vfs_statfs(file->f_dentry, &sbuf)) return res; suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; -- cgit From fadd8fbd153c12963f8fe3c9ef7f8967f286f98b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 23 Jun 2006 02:03:13 -0700 Subject: [PATCH] support for panic at OOM This patch adds panic_on_oom sysctl under sys.vm. When sysctl vm.panic_on_oom = 1, the kernel panics intead of killing rogue processes. And if vm.panic_on_oom is 0 the kernel will do oom_kill() in the same way as it does today. Of course, the default value is 0 and only root can modifies it. In general, oom_killer works well and kill rogue processes. So the whole system can survive. But there are environments where panic is preferable rather than kill some processes. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0d656e61621..072ac446810 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -59,6 +59,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, extern int C_A_D; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; +extern int sysctl_panic_on_oom; extern int max_threads; extern int sysrq_enabled; extern int core_uses_pid; @@ -701,6 +702,14 @@ static ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = VM_PANIC_ON_OOM, + .procname = "panic_on_oom", + .data = &sysctl_panic_on_oom, + .maxlen = sizeof(sysctl_panic_on_oom), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = VM_OVERCOMMIT_RATIO, .procname = "overcommit_ratio", -- cgit From d6277db4ab271862ed599da08d78961c70f00002 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 23 Jun 2006 02:03:18 -0700 Subject: [PATCH] swsusp: rework memory shrinker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rework the swsusp's memory shrinker in the following way: - Simplify balance_pgdat() by removing all of the swsusp-related code from it. - Make shrink_all_memory() use shrink_slab() and a new function shrink_all_zones() which calls shrink_active_list() and shrink_inactive_list() directly for each zone in a way that's optimized for suspend. In shrink_all_memory() we try to free exactly as many pages as the caller asks for, preferably in one shot, starting from easier targets.  If slab caches are huge, they are most likely to have enough pages to reclaim.  The inactive lists are next (the zones with more inactive pages go first) etc. Each time shrink_all_memory() attempts to shrink the active and inactive lists for each zone in 5 passes.  In the first pass, only the inactive lists are taken into consideration.  In the next two passes the active lists are also shrunk, but mapped pages are not reclaimed.  In the last two passes the active and inactive lists are shrunk and mapped pages are reclaimed as well. 
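[Illustrative aside, not part of the patch: a rough sketch of the pass structure described above. The sketch_* names are invented and stand in for the real per-zone shrinking done in mm/vmscan.c; slab shrinking, which the real code tries first, is omitted.]

static unsigned long sketch_shrink_zones(unsigned long nr_to_free,
					 int shrink_active, int reclaim_mapped)
{
	return 0;	/* stub: the real helper walks the zones' LRU lists */
}

static unsigned long sketch_shrink_all_memory(unsigned long nr_to_free)
{
	unsigned long freed = 0;
	int pass;

	for (pass = 0; pass < 5 && freed < nr_to_free; pass++) {
		int shrink_active  = (pass >= 1); /* pass 0: inactive lists only */
		int reclaim_mapped = (pass >= 3); /* last two passes: mapped pages too */

		freed += sketch_shrink_zones(nr_to_free - freed,
					     shrink_active, reclaim_mapped);
	}
	return freed;
}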
The aim of this is to alter the reclaim logic to choose the best pages to keep on resume and improve the responsiveness of the resumed system. Signed-off-by: Rafael J. Wysocki Signed-off-by: Con Kolivas Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swsusp.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index c4016cbbd3e..f9238faf76e 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -175,6 +175,12 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap) */ #define SHRINK_BITE 10000 +static inline unsigned long __shrink_memory(long tmp) +{ + if (tmp > SHRINK_BITE) + tmp = SHRINK_BITE; + return shrink_all_memory(tmp); +} int swsusp_shrink_memory(void) { @@ -195,12 +201,12 @@ int swsusp_shrink_memory(void) if (!is_highmem(zone)) tmp -= zone->free_pages; if (tmp > 0) { - tmp = shrink_all_memory(SHRINK_BITE); + tmp = __shrink_memory(tmp); if (!tmp) return -ENOMEM; pages += tmp; } else if (size > image_size / PAGE_SIZE) { - tmp = shrink_all_memory(SHRINK_BITE); + tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); pages += tmp; } printk("\b%c", p[i++%4]); -- cgit From 742755a1d8ce2b548428f7aacf1758b4bba50080 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:55 -0700 Subject: [PATCH] page migration: sys_move_pages(): support moving of individual pages move_pages() is used to move individual pages of a process. The function can be used to determine the location of pages and to move them onto the desired node. move_pages() returns status information for each page. long move_pages(pid, number_of_pages_to_move, addresses_of_pages[], nodes[] or NULL, status[], flags); The addresses of pages is an array of void * pointing to the pages to be moved. The nodes array contains the node numbers that the pages should be moved to. If a NULL is passed instead of an array then no pages are moved but the status array is updated. The status request may be used to determine the page state before issuing another move_pages() to move pages. The status array will contain the state of all individual page migration attempts when the function terminates. The status array is only valid if move_pages() completed successfullly. Possible page states in status[]: 0..MAX_NUMNODES The page is now on the indicated node. -ENOENT Page is not present -EACCES Page is mapped by multiple processes and can only be moved if MPOL_MF_MOVE_ALL is specified. -EPERM The page has been mlocked by a process/driver and cannot be moved. -EBUSY Page is busy and cannot be moved. Try again later. -EFAULT Invalid address (no VMA or zero page). -ENOMEM Unable to allocate memory on target node. -EIO Unable to write back page. The page must be written back in order to move it since the page is dirty and the filesystem does not provide a migration function that would allow the moving of dirty pages. -EINVAL A dirty page cannot be moved. The filesystem does not provide a migration function and has no ability to write back pages. The flags parameter indicates what types of pages to move: MPOL_MF_MOVE Move pages that are only mapped by the process. MPOL_MF_MOVE_ALL Also move pages that are mapped by multiple processes. Requires sufficient capabilities. Possible return codes from move_pages() -ENOENT No pages found that would require moving. 
All pages are either already on the target node, not present, had an invalid address or could not be moved because they were mapped by multiple processes. -EINVAL Flags other than MPOL_MF_MOVE(_ALL) specified or an attempt to migrate pages in a kernel thread. -EPERM MPOL_MF_MOVE_ALL specified without sufficient priviledges. or an attempt to move a process belonging to another user. -EACCES One of the target nodes is not allowed by the current cpuset. -ENODEV One of the target nodes is not online. -ESRCH Process does not exist. -E2BIG Too many pages to move. -ENOMEM Not enough memory to allocate control array. -EFAULT Parameters could not be accessed. A test program for move_pages() may be found with the patches on ftp.kernel.org:/pub/linux/kernel/people/christoph/pmig/patches-2.6.17-rc4-mm3 From: Christoph Lameter Detailed results for sys_move_pages() Pass a pointer to an integer to get_new_page() that may be used to indicate where the completion status of a migration operation should be placed. This allows sys_move_pags() to report back exactly what happened to each page. Wish there would be a better way to do this. Looks a bit hacky. Signed-off-by: Christoph Lameter Cc: Hugh Dickins Cc: Jes Sorensen Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Andi Kleen Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5433195040f..597229749de 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); cond_syscall(sys_migrate_pages); +cond_syscall(sys_move_pages); cond_syscall(sys_chown16); cond_syscall(sys_fchown16); cond_syscall(sys_getegid16); -- cgit From 1b2db9fb7adc4d67d9ce7d16ce79c41ee84730fe Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:56 -0700 Subject: [PATCH] sys_move_pages: 32bit support (i386, x86_64) sys_move_pages() support for 32bit (i386 plus x86_64 compat layer) Add support for move_pages() on i386 and also add the compat functions necessary to run 32 bit binaries on x86_64. Add compat_sys_move_pages to the x86_64 32bit binary layer. Note that it is not up to date so I added the missing pieces. Not sure if this is done the right way. 
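[Illustrative aside, not part of either patch: a hedged userspace sketch of the move_pages() interface documented in the previous patch, invoked through syscall(2). It assumes the kernel and headers provide __NR_move_pages and that node 1 exists and is online, and it open-codes MPOL_MF_MOVE (value 2 in <linux/mempolicy.h>); a real program would also inspect the per-page status codes listed above.]

#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#define MPOL_MF_MOVE	2	/* move pages mapped only by this process */

int main(void)
{
	unsigned long nr = 1;
	int node = 1;		/* assumed target node */
	int status = -1;
	void *page = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (page == MAP_FAILED)
		return 1;
	*(char *)page = 1;	/* touch the page so it actually has a location */

	if (syscall(__NR_move_pages, getpid(), nr, &page, &node,
		    &status, MPOL_MF_MOVE) < 0) {
		perror("move_pages");
		return 1;
	}
	printf("page is now on node %d\n", status);
	return 0;
}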
[akpm@osdl.org: compile fix] Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 23 +++++++++++++++++++++++ kernel/sys_ni.c | 1 + 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index c1601a84f8d..ccea93e2895 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -934,3 +935,25 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) return ret; } + +#ifdef CONFIG_NUMA +asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, + void __user *pages32, + const int __user *nodes, + int __user *status, + int flags) +{ + const void __user * __user *pages; + int i; + + pages = compat_alloc_user_space(nr_pages * sizeof(void *)); + for (i = 0; i < nr_pages; i++) { + compat_uptr_t p; + + if (get_user(p, (compat_uptr_t *)(pages32 + i)) || + put_user(compat_ptr(p), pages + i)) + return -EFAULT; + } + return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); +} +#endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 597229749de..6991bece67e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -133,3 +133,4 @@ cond_syscall(sys_mincore); cond_syscall(sys_madvise); cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); +cond_syscall(compat_sys_move_pages); -- cgit From 9216dfad4fc97ab639ef0885efc713f3d7a20d5b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:57 -0700 Subject: [PATCH] move_pages: fix 32 -> 64 bit compat function The definition of the third parameter is a pointer to an array of virtual addresses which give us some trouble. The existing code calculated the wrong address in the array since I used void to avoid having to specify a type. I now use the correct type "compat_uptr_t __user *" in the definition of the function in kernel/compat.c. However, I used __u32 in syscalls.h. Would have to include compat.h there in order to provide the same definition which would generate an ugly include situation. On both ia64 and x86_64 compat_uptr_t is u32. So this works although parameter declarations differ. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index ccea93e2895..2f672332430 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -938,7 +938,7 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) #ifdef CONFIG_NUMA asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, - void __user *pages32, + compat_uptr_t __user *pages32, const int __user *nodes, int __user *status, int flags) @@ -950,7 +950,7 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, for (i = 0; i < nr_pages; i++) { compat_uptr_t p; - if (get_user(p, (compat_uptr_t *)(pages32 + i)) || + if (get_user(p, pages32 + i) || put_user(compat_ptr(p), pages + i)) return -EFAULT; } -- cgit From e7834f8fccd791225a1cf91c2c3e740ad8e2e145 Mon Sep 17 00:00:00 2001 From: David Quigley Date: Fri, 23 Jun 2006 02:03:59 -0700 Subject: [PATCH] SELinux: add security hooks to {get,set}affinity This patch adds LSM hooks into the setaffinity and getaffinity functions to enable security modules to control these operations between tasks with task_setscheduler and task_getscheduler LSM hooks. 
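[Illustrative aside, not part of the patch: what a security module's side of these two hooks might look like, using the signatures implied by the call sites below, security_task_setscheduler(p, 0, NULL) and security_task_getscheduler(p). The example_* names and the uid-based policy are invented; a real module would plug these into its security_operations and apply its own policy.]

#include <linux/sched.h>
#include <linux/capability.h>
#include <linux/errno.h>

static int example_task_setscheduler(struct task_struct *p, int policy,
				     struct sched_param *lp)
{
	/* sample policy: only touch scheduling/affinity of your own tasks */
	if (p->uid != current->uid && !capable(CAP_SYS_NICE))
		return -EPERM;
	return 0;
}

static int example_task_getscheduler(struct task_struct *p)
{
	return 0;	/* allow reading scheduling and affinity information */
}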
Signed-off-by: David Quigley Acked-by: Stephen Smalley Signed-off-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c13f1bd2df7..87665132cec 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3886,6 +3886,10 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) !capable(CAP_SYS_NICE)) goto out_unlock; + retval = security_task_setscheduler(p, 0, NULL); + if (retval) + goto out_unlock; + cpus_allowed = cpuset_cpus_allowed(p); cpus_and(new_mask, new_mask, cpus_allowed); retval = set_cpus_allowed(p, new_mask); @@ -3954,7 +3958,10 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) if (!p) goto out_unlock; - retval = 0; + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + cpus_and(*mask, p->cpus_allowed, cpu_online_map); out_unlock: -- cgit From 22fb52dd736a62e24c44c50739007496265dc38c Mon Sep 17 00:00:00 2001 From: David Quigley Date: Fri, 23 Jun 2006 02:04:00 -0700 Subject: [PATCH] SELinux: add security hook call to mediate attach_task (kernel/cpuset.c) Add a security hook call to enable security modules to control the ability to attach a task to a cpuset. While limited control over this operation is possible via permission checks on the pseudo fs interface, those checks are not sufficient to control access to the target task, which is looked up in this function. The existing task_setscheduler hook is re-used for this operation since this falls under the same class of operations. Signed-off-by: David Quigley Acked-by: Stephen Smalley Signed-off-by: James Morris Acked-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 77f45ffd5ea..b602f73fb38 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -1177,6 +1178,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) cpumask_t cpus; nodemask_t from, to; struct mm_struct *mm; + int retval; if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; @@ -1205,6 +1207,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) get_task_struct(tsk); } + retval = security_task_setscheduler(tsk, 0, NULL); + if (retval) { + put_task_struct(tsk); + return retval; + } + mutex_lock(&callback_mutex); task_lock(tsk); -- cgit From 1b61b910e99059abdd54c93aa70e84e076e33d16 Mon Sep 17 00:00:00 2001 From: Zhang Yanmin Date: Fri, 23 Jun 2006 02:04:22 -0700 Subject: [PATCH] x86: kernel irq balance doesn't work On i386, kernel irq balance doesn't work. 1) In function do_irq_balance, after kernel finds the min_loaded cpu but before calling set_pending_irq to really pin the selected_irq to the target cpu, kernel does a cpus_and with irq_affinity[selected_irq]. Later on, when the irq is acked, kernel would calls move_native_irq=>desc->handler->set_affinity to change the irq affinity. However, every function pointed by hw_interrupt_type->set_affinity(unsigned int irq, cpumask_t cpumask) always changes irq_affinity[irq] to cpumask. Next time when recalling do_irq_balance, it has to do cpu_ands again with irq_affinity[selected_irq], but irq_affinity[selected_irq] already becomes one cpu selected by the first irq balance. 
2) Function balance_irq in file arch/i386/kernel/io_apic.c has the same issue. [akpm@osdl.org: cleanups] Signed-off-by: Zhang Yanmin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/proc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d03b5eef8ce..afacd6f585f 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -24,6 +24,8 @@ static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; #ifdef CONFIG_GENERIC_PENDING_IRQ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { + set_balance_irq_affinity(irq, mask_val); + /* * Save these away for later use. Re-progam when the * interrupt is pending @@ -33,6 +35,7 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) #else void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { + set_balance_irq_affinity(irq, mask_val); irq_affinity[irq] = mask_val; irq_desc[irq].handler->set_affinity(irq, mask_val); } -- cgit From ce4ab0012b32c1a4a1d6e934aeb73bf3151c48d9 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 23 Jun 2006 02:04:44 -0700 Subject: [PATCH] swsusp: add architecture special saveable pages support 1. Add architecture specific pages save/restore support. Next two patches will use this to save/restore 'ACPI NVS' pages. 2. Allow reserved pages 'nosave'. This could avoid save/restore BIOS reserved pages. Signed-off-by: Shaohua Li Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 4 ++ kernel/power/snapshot.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++- kernel/power/swsusp.c | 18 ++------ 3 files changed, 117 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index f06f12f2176..c81f0ed3eeb 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -105,6 +105,10 @@ extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); +extern unsigned int count_special_pages(void); +extern int save_special_mem(void); +extern int restore_special_mem(void); + extern int swsusp_check(void); extern int swsusp_shrink_memory(void); extern void swsusp_free(void); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3eeedbb13b7..7f511d89c66 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -39,6 +39,88 @@ static unsigned int nr_copy_pages; static unsigned int nr_meta_pages; static unsigned long *buffer; +struct arch_saveable_page { + unsigned long start; + unsigned long end; + char *data; + struct arch_saveable_page *next; +}; +static struct arch_saveable_page *arch_pages; + +int swsusp_add_arch_pages(unsigned long start, unsigned long end) +{ + struct arch_saveable_page *tmp; + + while (start < end) { + tmp = kzalloc(sizeof(struct arch_saveable_page), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + tmp->start = start; + tmp->end = ((start >> PAGE_SHIFT) + 1) << PAGE_SHIFT; + if (tmp->end > end) + tmp->end = end; + tmp->next = arch_pages; + start = tmp->end; + arch_pages = tmp; + } + return 0; +} + +static unsigned int count_arch_pages(void) +{ + unsigned int count = 0; + struct arch_saveable_page *tmp = arch_pages; + while (tmp) { + count++; + tmp = tmp->next; + } + return count; +} + +static int save_arch_mem(void) +{ + char *kaddr; + struct arch_saveable_page *tmp = 
arch_pages; + int offset; + + pr_debug("swsusp: Saving arch specific memory"); + while (tmp) { + tmp->data = (char *)__get_free_page(GFP_ATOMIC); + if (!tmp->data) + return -ENOMEM; + offset = tmp->start - (tmp->start & PAGE_MASK); + /* arch pages might haven't a 'struct page' */ + kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0); + memcpy(tmp->data + offset, kaddr + offset, + tmp->end - tmp->start); + kunmap_atomic(kaddr, KM_USER0); + + tmp = tmp->next; + } + return 0; +} + +static int restore_arch_mem(void) +{ + char *kaddr; + struct arch_saveable_page *tmp = arch_pages; + int offset; + + while (tmp) { + if (!tmp->data) + continue; + offset = tmp->start - (tmp->start & PAGE_MASK); + kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0); + memcpy(kaddr + offset, tmp->data + offset, + tmp->end - tmp->start); + kunmap_atomic(kaddr, KM_USER0); + free_page((long)tmp->data); + tmp->data = NULL; + tmp = tmp->next; + } + return 0; +} + #ifdef CONFIG_HIGHMEM unsigned int count_highmem_pages(void) { @@ -150,8 +232,35 @@ int restore_highmem(void) } return 0; } +#else +static unsigned int count_highmem_pages(void) {return 0;} +static int save_highmem(void) {return 0;} +static int restore_highmem(void) {return 0;} #endif +unsigned int count_special_pages(void) +{ + return count_arch_pages() + count_highmem_pages(); +} + +int save_special_mem(void) +{ + int ret; + ret = save_arch_mem(); + if (!ret) + ret = save_highmem(); + return ret; +} + +int restore_special_mem(void) +{ + int ret; + ret = restore_arch_mem(); + if (!ret) + ret = restore_highmem(); + return ret; +} + static int pfn_is_nosave(unsigned long pfn) { unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; @@ -177,7 +286,6 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn) return 0; page = pfn_to_page(pfn); - BUG_ON(PageReserved(page) && PageNosave(page)); if (PageNosave(page)) return 0; if (PageReserved(page) && pfn_is_nosave(pfn)) diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index f9238faf76e..78b6e71b081 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -62,16 +62,6 @@ unsigned long image_size = 500 * 1024 * 1024; int in_suspend __nosavedata = 0; -#ifdef CONFIG_HIGHMEM -unsigned int count_highmem_pages(void); -int save_highmem(void); -int restore_highmem(void); -#else -static int save_highmem(void) { return 0; } -static int restore_highmem(void) { return 0; } -static unsigned int count_highmem_pages(void) { return 0; } -#endif - /** * The following functions are used for tracing the allocated * swap pages, so that they can be freed in case of an error. @@ -192,7 +182,7 @@ int swsusp_shrink_memory(void) printk("Shrinking memory... 
"); do { - size = 2 * count_highmem_pages(); + size = 2 * count_special_pages(); size += size / 50 + count_data_pages(); size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE + PAGES_FOR_IO; @@ -234,7 +224,7 @@ int swsusp_suspend(void) goto Enable_irqs; } - if ((error = save_highmem())) { + if ((error = save_special_mem())) { printk(KERN_ERR "swsusp: Not enough free pages for highmem\n"); goto Restore_highmem; } @@ -245,7 +235,7 @@ int swsusp_suspend(void) /* Restore control flow magically appears here */ restore_processor_state(); Restore_highmem: - restore_highmem(); + restore_special_mem(); device_power_up(); Enable_irqs: local_irq_enable(); @@ -271,7 +261,7 @@ int swsusp_resume(void) */ swsusp_free(); restore_processor_state(); - restore_highmem(); + restore_special_mem(); touch_softlockup_watchdog(); device_power_up(); local_irq_enable(); -- cgit From a938c356d5b007fe6d28251c0ddbf6c11d0d92b5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 23 Jun 2006 02:04:46 -0700 Subject: [PATCH] swsusp: take lowmem reserves into account swsusp allocates memory from the normal zone, so it cannot use lowmem reserve pages from the lower zones. Therefore it should not count these pages as available to it. Signed-off-by: Rafael J. Wysocki Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swsusp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 78b6e71b081..f0ee4e7780d 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -188,8 +188,10 @@ int swsusp_shrink_memory(void) PAGES_FOR_IO; tmp = size; for_each_zone (zone) - if (!is_highmem(zone)) + if (!is_highmem(zone) && populated_zone(zone)) { tmp -= zone->free_pages; + tmp += zone->lowmem_reserve[ZONE_NORMAL]; + } if (tmp > 0) { tmp = __shrink_memory(tmp); if (!tmp) -- cgit From 7bff24e255ee11ecbc304315a252fcbd84f9ffce Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 23 Jun 2006 02:04:47 -0700 Subject: [PATCH] kernel/power/snapshot.c: cleanups - make needlessly global functions static - make dummy functions static inline Signed-off-by: Adrian Bunk Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 7f511d89c66..513eef3391a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -122,7 +122,7 @@ static int restore_arch_mem(void) } #ifdef CONFIG_HIGHMEM -unsigned int count_highmem_pages(void) +static unsigned int count_highmem_pages(void) { struct zone *zone; unsigned long zone_pfn; @@ -199,7 +199,7 @@ static int save_highmem_zone(struct zone *zone) return 0; } -int save_highmem(void) +static int save_highmem(void) { struct zone *zone; int res = 0; @@ -216,7 +216,7 @@ int save_highmem(void) return 0; } -int restore_highmem(void) +static int restore_highmem(void) { printk("swsusp: Restoring Highmem\n"); while (highmem_copy) { @@ -233,9 +233,9 @@ int restore_highmem(void) return 0; } #else -static unsigned int count_highmem_pages(void) {return 0;} -static int save_highmem(void) {return 0;} -static int restore_highmem(void) {return 0;} +static inline unsigned int count_highmem_pages(void) {return 0;} +static inline int save_highmem(void) {return 0;} +static inline int restore_highmem(void) {return 0;} #endif unsigned int count_special_pages(void) @@ -482,7 +482,8 @@ 
unsigned long get_safe_page(gfp_t gfp_mask) * On each page we set up a list of struct_pbe elements. */ -struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed) +static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, + int safe_needed) { unsigned int num; struct pbe *pblist, *pbe; -- cgit From 968808b8956e332e556b1eae9b4f7df77518f53b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 23 Jun 2006 02:04:48 -0700 Subject: [PATCH] swsusp: use less memory during resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make swsusp allocate only as much memory as needed to store the image data and metadata during resume. Without this patch swsusp additionally allocates many page frames that will conflict with the "original" locations of the image data and are considered as "unsafe", treating them as "eaten" pages (ie. allocated but unusable). The patch makes swsusp allocate as many pages as it'll need to store the data read from the image in one shot, creating a list of allocated "safe" pages, and use the observation that all pages allocated by it are marked with the PG_nosave and PG_nosave_free flags set.  Namely, when it's about to load an image page, swsusp can check whether the page frame corresponding to the "original" location of this page has been allocated (ie. if the page frame has the PG_nosave and PG_nosave_free flags set) and if so, it can load the page directly into this page frame.  Otherwise it uses an allocated "safe" page from the list to store the data that will be copied to their "original" location later on. This allows us to save many page copyings and page allocations during resume and in the future it may allow us to load images greater than 50% of the normal zone. Signed-off-by: Rafael J. 
Wysocki Acked-by: "Pavel Machek" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 2 +- kernel/power/snapshot.c | 141 ++++++++++++++++++++++++++++-------------------- 2 files changed, 85 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index c81f0ed3eeb..98c41423f3b 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -55,7 +55,7 @@ struct snapshot_handle { unsigned int page; unsigned int page_offset; unsigned int prev; - struct pbe *pbe; + struct pbe *pbe, *last_pbe; void *buffer; unsigned int buf_offset; }; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 513eef3391a..3d9284100b2 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -401,62 +401,29 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) } } -/** - * On resume it is necessary to trace and eventually free the unsafe - * pages that have been allocated, because they are needed for I/O - * (on x86-64 we likely will "eat" these pages once again while - * creating the temporary page translation tables) - */ - -struct eaten_page { - struct eaten_page *next; - char padding[PAGE_SIZE - sizeof(void *)]; -}; - -static struct eaten_page *eaten_pages = NULL; - -static void release_eaten_pages(void) -{ - struct eaten_page *p, *q; - - p = eaten_pages; - while (p) { - q = p->next; - /* We don't want swsusp_free() to free this page again */ - ClearPageNosave(virt_to_page(p)); - free_page((unsigned long)p); - p = q; - } - eaten_pages = NULL; -} +static unsigned int unsafe_pages; /** * @safe_needed - on resume, for storing the PBE list and the image, * we can only use memory pages that do not conflict with the pages - * which had been used before suspend. + * used before suspend. 
* * The unsafe pages are marked with the PG_nosave_free flag - * - * Allocated but unusable (ie eaten) memory pages should be marked - * so that swsusp_free() can release them + * and we count them using unsafe_pages */ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) { void *res; + res = (void *)get_zeroed_page(gfp_mask); if (safe_needed) - do { + while (res && PageNosaveFree(virt_to_page(res))) { + /* The page is unsafe, mark it for swsusp_free() */ + SetPageNosave(virt_to_page(res)); + unsafe_pages++; res = (void *)get_zeroed_page(gfp_mask); - if (res && PageNosaveFree(virt_to_page(res))) { - /* This is for swsusp_free() */ - SetPageNosave(virt_to_page(res)); - ((struct eaten_page *)res)->next = eaten_pages; - eaten_pages = res; - } - } while (res && PageNosaveFree(virt_to_page(res))); - else - res = (void *)get_zeroed_page(gfp_mask); + } if (res) { SetPageNosave(virt_to_page(res)); SetPageNosaveFree(virt_to_page(res)); @@ -751,6 +718,8 @@ static int mark_unsafe_pages(struct pbe *pblist) return -EFAULT; } + unsafe_pages = 0; + return 0; } @@ -828,42 +797,99 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf, } /** - * create_image - use metadata contained in the PBE list + * prepare_image - use metadata contained in the PBE list * pointed to by pagedir_nosave to mark the pages that will * be overwritten in the process of restoring the system - * memory state from the image and allocate memory for - * the image avoiding these pages + * memory state from the image ("unsafe" pages) and allocate + * memory for the image + * + * The idea is to allocate the PBE list first and then + * allocate as many pages as it's needed for the image data, + * but not to assign these pages to the PBEs initially. + * Instead, we just mark them as allocated and create a list + * of "safe" which will be used later */ -static int create_image(struct snapshot_handle *handle) +struct safe_page { + struct safe_page *next; + char padding[PAGE_SIZE - sizeof(void *)]; +}; + +static struct safe_page *safe_pages; + +static int prepare_image(struct snapshot_handle *handle) { int error = 0; - struct pbe *p, *pblist; + unsigned int nr_pages = nr_copy_pages; + struct pbe *p, *pblist = NULL; p = pagedir_nosave; error = mark_unsafe_pages(p); if (!error) { - pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); + pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); if (pblist) copy_page_backup_list(pblist, p); free_pagedir(p, 0); if (!pblist) error = -ENOMEM; } - if (!error) - error = alloc_data_pages(pblist, GFP_ATOMIC, 1); + safe_pages = NULL; + if (!error && nr_pages > unsafe_pages) { + nr_pages -= unsafe_pages; + while (nr_pages--) { + struct safe_page *ptr; + + ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC); + if (!ptr) { + error = -ENOMEM; + break; + } + if (!PageNosaveFree(virt_to_page(ptr))) { + /* The page is "safe", add it to the list */ + ptr->next = safe_pages; + safe_pages = ptr; + } + /* Mark the page as allocated */ + SetPageNosave(virt_to_page(ptr)); + SetPageNosaveFree(virt_to_page(ptr)); + } + } if (!error) { - release_eaten_pages(); pagedir_nosave = pblist; } else { - pagedir_nosave = NULL; handle->pbe = NULL; - nr_copy_pages = 0; - nr_meta_pages = 0; + swsusp_free(); } return error; } +static void *get_buffer(struct snapshot_handle *handle) +{ + struct pbe *pbe = handle->pbe, *last = handle->last_pbe; + struct page *page = virt_to_page(pbe->orig_address); + + if (PageNosave(page) && PageNosaveFree(page)) { + /* + * We have allocated the "original" page frame and 
we can + * use it directly to store the read page + */ + pbe->address = 0; + if (last && last->next) + last->next = NULL; + return (void *)pbe->orig_address; + } + /* + * The "original" page frame has not been allocated and we have to + * use a "safe" page frame to store the read page + */ + pbe->address = (unsigned long)safe_pages; + safe_pages = safe_pages->next; + if (last) + last->next = pbe; + handle->last_pbe = pbe; + return (void *)pbe->address; +} + /** * snapshot_write_next - used for writing the system memory snapshot. * @@ -908,15 +934,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) } else if (handle->prev <= nr_meta_pages) { handle->pbe = unpack_orig_addresses(buffer, handle->pbe); if (!handle->pbe) { - error = create_image(handle); + error = prepare_image(handle); if (error) return error; handle->pbe = pagedir_nosave; - handle->buffer = (void *)handle->pbe->address; + handle->last_pbe = NULL; + handle->buffer = get_buffer(handle); } } else { handle->pbe = handle->pbe->next; - handle->buffer = (void *)handle->pbe->address; + handle->buffer = get_buffer(handle); } handle->prev = handle->page; } -- cgit From c330dda908b5a46469a997eea90b66f2f9f02b34 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 23 Jun 2006 02:05:07 -0700 Subject: [PATCH] Add a sysfs file to determine if a kexec kernel is loaded Create two files in /sys/kernel, kexec_loaded and kexec_crash_loaded. Each file contains a simple boolean value indicating whether the relevant kernel has been loaded into memory. The motivation for this is geared around support. Signed-off-by: Jeff Moyer Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 6 +++--- kernel/ksysfs.c | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index bf39d28e4c0..58f0f382597 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -902,14 +902,14 @@ static int kimage_load_segment(struct kimage *image, * kexec does not sync, or unmount filesystems so if you need * that to happen you need to do that yourself. */ -struct kimage *kexec_image = NULL; -static struct kimage *kexec_crash_image = NULL; +struct kimage *kexec_image; +struct kimage *kexec_crash_image; /* * A home grown binary mutex. 
* Nothing can wait so this mutex is safe to use * in interrupt context :) */ -static int kexec_lock = 0; +static int kexec_lock; asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index f119e098e67..9e28478a17a 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -14,6 +14,7 @@ #include #include #include +#include #define KERNEL_ATTR_RO(_name) \ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) @@ -48,6 +49,20 @@ static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, s KERNEL_ATTR_RW(uevent_helper); #endif +#ifdef CONFIG_KEXEC +static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%d\n", !!kexec_image); +} +KERNEL_ATTR_RO(kexec_loaded); + +static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%d\n", !!kexec_crash_image); +} +KERNEL_ATTR_RO(kexec_crash_loaded); +#endif /* CONFIG_KEXEC */ + decl_subsys(kernel, NULL, NULL); EXPORT_SYMBOL_GPL(kernel_subsys); @@ -55,6 +70,10 @@ static struct attribute * kernel_attrs[] = { #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, +#endif +#ifdef CONFIG_KEXEC + &kexec_loaded_attr.attr, + &kexec_crash_loaded_attr.attr, #endif NULL }; -- cgit From 0ae26f1b3159f3acb21ae1e866c3c7e16edd450f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 23 Jun 2006 02:05:15 -0700 Subject: [PATCH] mmput() might sleep exit_aio() and exit_mmap() can sleep. But it's easy to accidentally call mmput() from inside locks. Cc: Dave Peterson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ac8100e3088..195958a3a4d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -368,6 +368,8 @@ void fastcall __mmdrop(struct mm_struct *mm) */ void mmput(struct mm_struct *mm) { + might_sleep(); + if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); exit_mmap(mm); -- cgit From 6e6672604773b9bae44d88d38afdf0763c104b1c Mon Sep 17 00:00:00 2001 From: Prasanna Meda Date: Fri, 23 Jun 2006 02:05:23 -0700 Subject: [PATCH] dup fd error fix Set errorp in dup_fd, it will be used in sys_unshare also. Signed-off-by: Prasanna Meda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 195958a3a4d..49adc0e8d47 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -625,6 +625,7 @@ out: /* * Allocate a new files structure and copy contents from the * passed in files structure. + * errorp will be valid only when the returned files_struct is NULL. */ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { @@ -633,6 +634,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) int open_files, size, i, expand; struct fdtable *old_fdt, *new_fdt; + *errorp = -ENOMEM; newf = alloc_files(); if (!newf) goto out; @@ -746,7 +748,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) * break this. 
*/ tsk->files = NULL; - error = -ENOMEM; newf = dup_fd(oldf, &error); if (!newf) goto out; -- cgit From 8e0a43d8fa953179505869ec28de78550246e795 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 23 Jun 2006 02:05:23 -0700 Subject: [PATCH] cond_resched() might_sleep() fix add the __might_sleep() check back to cond_resched(). Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 87665132cec..5dbc4269447 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4053,6 +4053,9 @@ asmlinkage long sys_sched_yield(void) static inline void __cond_resched(void) { +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP + __might_sleep(__FILE__, __LINE__); +#endif /* * The BKS might be reacquired before we have dropped * PREEMPT_ACTIVE, which could trigger a second -- cgit From 89d0cf01c0aa9e8241cc3703a359ecd6abf3c28a Mon Sep 17 00:00:00 2001 From: Daniel Walker Date: Fri, 23 Jun 2006 02:05:29 -0700 Subject: [PATCH] invert irq/migration.c brach prediction If you get to that point in the code it means that desc->move_irq is set, pending_irq_cpumask[irq] and cpu_online_map should have a value. Still pretty good chance anding those two you'll still have a value. So these two branch predictors should be inverted. Signed-off-by: Daniel Walker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/migration.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 134f9f2e0e3..a12d00eb5e7 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -30,7 +30,7 @@ void move_native_irq(int irq) desc->move_irq = 0; - if (likely(cpus_empty(pending_irq_cpumask[irq]))) + if (unlikely(cpus_empty(pending_irq_cpumask[irq]))) return; if (!desc->handler->set_affinity) @@ -49,7 +49,7 @@ void move_native_irq(int irq) * cause some ioapics to mal-function. * Being paranoid i guess! */ - if (unlikely(!cpus_empty(tmp))) { + if (likely(!cpus_empty(tmp))) { if (likely(!(desc->status & IRQ_DISABLED))) desc->handler->disable(irq); -- cgit From 83d4e6e7fba0b2a01092f0cf14ba2e33bd1253e9 Mon Sep 17 00:00:00 2001 From: Andreas Mohr Date: Fri, 23 Jun 2006 02:05:32 -0700 Subject: [PATCH] make noirqdebug/irqfixup __read_mostly, add (un)likely() Signed-off-by: Andreas Mohr Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/spurious.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 7df9abd5ec8..b2fb3c18d06 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -11,7 +11,7 @@ #include #include -static int irqfixup; +static int irqfixup __read_mostly; /* * Recovery handler for misrouted interrupts. 
@@ -136,9 +136,9 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, struct pt_regs *regs) { - if (action_ret != IRQ_HANDLED) { + if (unlikely(action_ret != IRQ_HANDLED)) { desc->irqs_unhandled++; - if (action_ret != IRQ_NONE) + if (unlikely(action_ret != IRQ_NONE)) report_bad_irq(irq, desc, action_ret); } @@ -152,11 +152,11 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, } desc->irq_count++; - if (desc->irq_count < 100000) + if (likely(desc->irq_count < 100000)) return; desc->irq_count = 0; - if (desc->irqs_unhandled > 99900) { + if (unlikely(desc->irqs_unhandled > 99900)) { /* * The interrupt is stuck */ @@ -171,7 +171,7 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, desc->irqs_unhandled = 0; } -int noirqdebug; +int noirqdebug __read_mostly; int __init noirqdebug_setup(char *str) { -- cgit From 57ae2508610d50893cb3e3bbb869ff70ff724a2a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 23 Jun 2006 02:05:47 -0700 Subject: [PATCH] CONFIG_NET=n build fix Cc: Greg KH Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 072ac446810..eb8bd214e7d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -399,7 +399,7 @@ static ctl_table kern_table[] = { .strategy = &sysctl_string, }, #endif -#ifdef CONFIG_HOTPLUG +#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) { .ctl_name = KERN_HOTPLUG, .procname = "hotplug", -- cgit From 55f4e8d156d23709739029afb108932ef94cac94 Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Fri, 23 Jun 2006 02:05:50 -0700 Subject: [PATCH] kernel/sys.c doesn't need init.h kernel/sys.c doesn't have anything in it relying on linux/init.h - remove the include. Signed-off-by: Jes Sorensen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index a57a00597ce..90930b28d2c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include -- cgit From d83015b8f62ee3fcd338f6f009051ed57f77a531 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Jun 2006 02:05:51 -0700 Subject: [PATCH] Make RCU API inaccessible to non-GPL Linux kernel modules Remove synchronize_kernel() (deprecated 2-APR-2005 in http://lkml.org/lkml/2005/4/3/11) and makes the RCU API inaccessible to non-GPL Linux kernel modules (as was announced more than one year ago in http://lkml.org/lkml/2005/4/3/8). Tested on x86 and ppc64. Signed-off-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2058f88c7bb..20e9710fc21 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -612,14 +612,6 @@ void synchronize_rcu(void) wait_for_completion(&rcu.completion); } -/* - * Deprecated, use synchronize_rcu() or synchronize_sched() instead. 
- */ -void synchronize_kernel(void) -{ - synchronize_rcu(); -} - module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); @@ -627,7 +619,6 @@ module_param(qlowmark, int, 0); module_param(rsinterval, int, 0); #endif EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */ -EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ +EXPORT_SYMBOL_GPL(call_rcu); +EXPORT_SYMBOL_GPL(call_rcu_bh); EXPORT_SYMBOL_GPL(synchronize_rcu); -EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */ -- cgit From 862f5f0133f1c8a179dd93adc03d43f8f7e8bac5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 23 Jun 2006 02:05:52 -0700 Subject: [PATCH] Doc: add audit & acct to DocBook Fix one audit kernel-doc description (one parameter was missing). Add audit*.c interfaces to DocBook. Add BSD accounting interfaces to DocBook. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b097ccb4eb7..9ebd96fda29 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1558,6 +1558,7 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp) * @uid: msgq user id * @gid: msgq group id * @mode: msgq mode (permissions) + * @ipcp: in-kernel IPC permissions * * Returns 0 for success or NULL context or < 0 on error. */ -- cgit From 626ab0e69d376fa07599af669af8ba92d58e87c1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Jun 2006 02:05:55 -0700 Subject: [PATCH] list: use list_replace_init() instead of list_splice_init() list_splice_init(list, head) does unneeded job if it is known that list_empty(head) == 1. We can use list_replace_init() instead. Signed-off-by: Oleg Nesterov Acked-by: David S. 
Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 8 ++++---- kernel/workqueue.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 9e49deed468..3bf0e9ed2db 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -419,10 +419,10 @@ static inline void __run_timers(tvec_base_t *base) spin_lock_irq(&base->lock); while (time_after_eq(jiffies, base->timer_jiffies)) { - struct list_head work_list = LIST_HEAD_INIT(work_list); + struct list_head work_list; struct list_head *head = &work_list; int index = base->timer_jiffies & TVR_MASK; - + /* * Cascade timers: */ @@ -431,8 +431,8 @@ static inline void __run_timers(tvec_base_t *base) (!cascade(base, &base->tv3, INDEX(1))) && !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); - ++base->timer_jiffies; - list_splice_init(base->tv1.vec + index, &work_list); + ++base->timer_jiffies; + list_replace_init(base->tv1.vec + index, &work_list); while (!list_empty(head)) { void (*fn)(unsigned long); unsigned long data; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 880fb415a8f..740c5abceb0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -531,11 +531,11 @@ int current_is_keventd(void) static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) { struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - LIST_HEAD(list); + struct list_head list; struct work_struct *work; spin_lock_irq(&cwq->lock); - list_splice_init(&cwq->worklist, &list); + list_replace_init(&cwq->worklist, &list); while (!list_empty(&list)) { printk("Taking work for %s\n", wq->name); -- cgit From 3439dd86e34580384d3b58cf8d54a9283cd7a342 Mon Sep 17 00:00:00 2001 From: Porpoise Date: Fri, 23 Jun 2006 02:05:56 -0700 Subject: [PATCH] When CONFIG_BASE_SMALL=1, cascade() may enter an infinite loop When CONFIG_BASE_SAMLL=1, cascade() in may enter the infinite loop. Because of CONFIG_BASE_SMALL=1(TVR_BITS=6 and TVN_BITS=4), the list base->tv5 may cascade into base->tv5. So, the kernel enters the infinite loop in the function cascade(). I created a test module to verify this bug, and a patch to fix it. #include #include #include #include #if 0 #include #else #define kdb_printf printk #endif #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) #define TVR_BITS (CONFIG_BASE_SMALL ? 
6 : 8) #define TVN_SIZE (1 << TVN_BITS) #define TVR_SIZE (1 << TVR_BITS) #define TVN_MASK (TVN_SIZE - 1) #define TVR_MASK (TVR_SIZE - 1) #define TV_SIZE(N) (N*TVN_BITS + TVR_BITS) struct timer_list timer0; struct timer_list dummy_timer1; struct timer_list dummy_timer2; void dummy_timer_fun(unsigned long data) { } unsigned long j=0; void check_timer_base(unsigned long data) { kdb_printf("check_timer_base %08x\n",jiffies); mod_timer(&timer0,(jiffies & (~0xFFF)) + 0x1FFF); } int init_module(void) { init_timer(&timer0); timer0.data = (unsigned long)0; timer0.function = check_timer_base; mod_timer(&timer0,jiffies+1); init_timer(&dummy_timer1); dummy_timer1.data = (unsigned long)0; dummy_timer1.function = dummy_timer_fun; init_timer(&dummy_timer2); dummy_timer2.data = (unsigned long)0; dummy_timer2.function = dummy_timer_fun; j=jiffies; j&=(~((1< Cc: Matt Mackall Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 3bf0e9ed2db..f35b3939e93 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -383,23 +383,19 @@ EXPORT_SYMBOL(del_timer_sync); static int cascade(tvec_base_t *base, tvec_t *tv, int index) { /* cascade all the timers from tv up one level */ - struct list_head *head, *curr; + struct timer_list *timer, *tmp; + struct list_head tv_list; + + list_replace_init(tv->vec + index, &tv_list); - head = tv->vec + index; - curr = head->next; /* - * We are removing _all_ timers from the list, so we don't have to - * detach them individually, just clear the list afterwards. + * We are removing _all_ timers from the list, so we + * don't have to detach them individually. */ - while (curr != head) { - struct timer_list *tmp; - - tmp = list_entry(curr, struct timer_list, entry); - BUG_ON(tmp->base != base); - curr = curr->next; - internal_add_timer(base, tmp); + list_for_each_entry_safe(timer, tmp, &tv_list, entry) { + BUG_ON(timer->base != base); + internal_add_timer(base, timer); } - INIT_LIST_HEAD(head); return index; } -- cgit From 908dcecda1d18803b5823f30e6c47d2882dc0cf1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 23 Jun 2006 02:06:00 -0700 Subject: [PATCH] adjust handle_IRR_event() return type Correct the return type of handle_IRQ_event() (inconsistency noticed during Xen development), and remove redundant declarations. The return type adjustment required breaking out the definition of irqreturn_t into a separate header, in order to satisfy current include order dependencies. Signed-off-by: Jan Beulich Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Russell King Cc: Ian Molton Cc: Mikael Starvik Cc: Yoshinori Sato Cc: Hirokazu Takata Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: William Lee Irwin III Cc: "David S. 
Miller" Cc: Miles Bader Cc: Geert Uytterhoeven Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 51df337b37d..0f653011710 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -76,10 +76,11 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) /* * Have got an event to handle: */ -fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, +fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, struct irqaction *action) { - int ret, retval = 0, status = 0; + irqreturn_t ret, retval = IRQ_NONE; + unsigned int status = 0; if (!(action->flags & SA_INTERRUPT)) local_irq_enable(); -- cgit From 125e18745f16685f69a34fd6130d47598fc4bf54 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Fri, 23 Jun 2006 02:06:06 -0700 Subject: [PATCH] More BUG_ON conversion Signed-off-by: Eric Sesterhenn Signed-off-by: Alexey Dobriyan Cc: Bartlomiej Zolnierkiewicz Cc: Alan Cox Cc: James Bottomley Acked-by: "Salyzyn, Mark" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index e06d0c10a24..a3baf92462b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -579,7 +579,7 @@ static void exit_mm(struct task_struct * tsk) down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); - if (mm != tsk->active_mm) BUG(); + BUG_ON(mm != tsk->active_mm); /* more a memory barrier than a real lock */ task_lock(tsk); tsk->mm = NULL; @@ -1530,8 +1530,7 @@ check_continued: if (options & __WNOTHREAD) break; tsk = next_thread(tsk); - if (tsk->signal != current->signal) - BUG(); + BUG_ON(tsk->signal != current->signal); } while (tsk != current); read_unlock(&tasklist_lock); -- cgit From eb71c87a492b7090ff9e8ac46912c480a1687e38 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 24 Jun 2006 14:27:42 -0700 Subject: Add some basic resume trace facilities Considering that there isn't a lot of hw we can depend on during resume, this is about as good as it gets. This is x86-only for now, although the basic concept (and most of the code) will certainly work on almost any platform. Signed-off-by: Linus Torvalds --- kernel/power/Kconfig | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ce0dfb8f4a4..cdf315e794f 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -36,6 +36,15 @@ config PM_DEBUG code. This is helpful when debugging and reporting various PM bugs, like suspend support. +config PM_TRACE + bool "Suspend/resume event tracing" + depends on PM && PM_DEBUG && X86 + default y + ---help--- + This enables some cheesy code to save the last PM event point in the + RTC across reboots, so that you can debug a machine that just hangs + during suspend (or more commonly, during resume). 
+ config SOFTWARE_SUSPEND bool "Software Suspend" depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) -- cgit From f867d2a2e5f3f0ce6356f452cc27b70d577de7c7 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sun, 25 Jun 2006 05:47:09 -0700 Subject: [PATCH] ensure NULL deref can't possibly happen in is_exported() If CONFIG_KALLSYMS is defined and if it should happen that is_exported() is given a NULL 'mod' and lookup_symbol(name, __start___ksymtab, __stop___ksymtab) returns 0, then we'll end up dereferencing a NULL pointer. Signed-off-by: Jesper Juhl Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index bbe04862e1b..d75275de1c2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1326,7 +1326,7 @@ int is_exported(const char *name, const struct module *mod) if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) return 1; else - if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) + if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) return 1; else return 0; -- cgit From bfe5d834195b3089b8846577311340376cc0f450 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sun, 25 Jun 2006 05:47:14 -0700 Subject: [PATCH] Define __raw_get_cpu_var and use it There are several instances of per_cpu(foo, raw_smp_processor_id()), which is semantically equivalent to __get_cpu_var(foo) but without the warning that smp_processor_id() can give if CONFIG_DEBUG_PREEMPT is enabled. For those architectures with optimized per-cpu implementations, namely ia64, powerpc, s390, sparc64 and x86_64, per_cpu() turns into more and slower code than __get_cpu_var(), so it would be preferable to use __get_cpu_var on those platforms. This defines a __raw_get_cpu_var(x) macro which turns into per_cpu(x, raw_smp_processor_id()) on architectures that use the generic per-cpu implementation, and turns into __get_cpu_var(x) on the architectures that have an optimized per-cpu implementation. Signed-off-by: Paul Mackerras Acked-by: David S. 
Miller Acked-by: Ingo Molnar Acked-by: Martin Schwidefsky Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 4 ++-- kernel/sched.c | 4 ++-- kernel/softlockup.c | 2 +- kernel/timer.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 18324305724..9587aac72f4 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -576,7 +576,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, memset(timer, 0, sizeof(struct hrtimer)); - bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); + bases = __raw_get_cpu_var(hrtimer_bases); if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) clock_id = CLOCK_MONOTONIC; @@ -599,7 +599,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { struct hrtimer_base *bases; - bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); + bases = __raw_get_cpu_var(hrtimer_bases); *tp = ktime_to_timespec(bases[which_clock].resolution); return 0; diff --git a/kernel/sched.c b/kernel/sched.c index 5dbc4269447..f8d540b324c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4152,7 +4152,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + struct runqueue *rq = &__raw_get_cpu_var(runqueues); atomic_inc(&rq->nr_iowait); schedule(); @@ -4163,7 +4163,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + struct runqueue *rq = &__raw_get_cpu_var(runqueues); long ret; atomic_inc(&rq->nr_iowait); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 14c7faf0290..2c1be1163ed 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -36,7 +36,7 @@ static struct notifier_block panic_block = { void touch_softlockup_watchdog(void) { - per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; + __raw_get_cpu_var(touch_timestamp) = jiffies; } EXPORT_SYMBOL(touch_softlockup_watchdog); diff --git a/kernel/timer.c b/kernel/timer.c index f35b3939e93..eb97371b87d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -146,7 +146,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; - timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); + timer->base = __raw_get_cpu_var(tvec_bases); } EXPORT_SYMBOL(init_timer); -- cgit From bbb1747d4e44ce49acc73daa8d66e5f6bd546f1b Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Sun, 25 Jun 2006 05:47:15 -0700 Subject: [PATCH] Allow raw_notifier callouts to unregister themselves Since raw_notifier chains don't benefit from any centralized locking protections, they shouldn't suffer from the associated limitations. Under some circumstances it might make sense for a raw_notifier callout routine to unregister itself from the notifier chain. This patch (as678) changes the notifier core to allow for such things. 
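[Editor's note: a minimal sketch, not taken from the patch, of the kind of
callout this change makes safe -- a callback that unregisters itself from a
raw notifier chain.  The chain and callback names are hypothetical, and the
caller is assumed to provide its own serialization, since raw chains do no
locking of their own.]

#include <linux/notifier.h>

static RAW_NOTIFIER_HEAD(example_chain);

static int one_shot_notify(struct notifier_block *nb,
			   unsigned long event, void *data)
{
	/*
	 * Safe with this patch: notifier_call_chain() has already cached
	 * nb->next, so removing ourselves does not break the traversal.
	 */
	raw_notifier_chain_unregister(&example_chain, nb);
	return NOTIFY_OK;
}

static struct notifier_block one_shot_nb = {
	.notifier_call = one_shot_notify,
};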
Signed-off-by: Alan Stern Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 90930b28d2c..7e0927bad71 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -137,14 +137,15 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, unsigned long val, void *v) { int ret = NOTIFY_DONE; - struct notifier_block *nb; + struct notifier_block *nb, *next_nb; nb = rcu_dereference(*nl); while (nb) { + next_nb = rcu_dereference(nb->next); ret = nb->notifier_call(nb, val, v); if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) break; - nb = rcu_dereference(nb->next); + nb = next_nb; } return ret; } -- cgit From 76a8ad293912cd2f01eca075d80cd0ddec30c627 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sun, 25 Jun 2006 05:47:40 -0700 Subject: [PATCH] Make printk work for really early debugging Currently printk is no use for early debugging because it refuses to actually print anything to the console unless cpu_online(smp_processor_id()) is true. The stated explanation is that console drivers may require per-cpu resources, or otherwise barf, because the system is not yet setup correctly. Fair enough. However some console drivers might be quite happy running early during boot, in fact we have one, and so it'd be nice if printk understood that. So I added a flag (which I would have called CON_BOOT, but that's taken) called CON_ANYTIME, which indicates that a console is happy to be called anytime, even if the cpu is not yet online. Tested on a Power 5 machine, with both a CON_ANYTIME driver and a bogus console driver that BUG()s if called while offline. No problems AFAICT. Built for i386 UP & SMP. Signed-off-by: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 50 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 19a95561929..6b89dd9d11b 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -327,7 +327,9 @@ static void __call_console_drivers(unsigned long start, unsigned long end) struct console *con; for (con = console_drivers; con; con = con->next) { - if ((con->flags & CON_ENABLED) && con->write) + if ((con->flags & CON_ENABLED) && con->write && + (cpu_online(smp_processor_id()) || + (con->flags & CON_ANYTIME))) con->write(con, &LOG_BUF(start), end - start); } } @@ -453,6 +455,18 @@ __attribute__((weak)) unsigned long long printk_clock(void) return sched_clock(); } +/* Check if we have any console registered that can be called early in boot. */ +static int have_callable_console(void) +{ + struct console *con; + + for (con = console_drivers; con; con = con->next) + if (con->flags & CON_ANYTIME) + return 1; + + return 0; +} + /** * printk - print a kernel message * @fmt: format string @@ -566,27 +580,29 @@ asmlinkage int vprintk(const char *fmt, va_list args) log_level_unknown = 1; } - if (!cpu_online(smp_processor_id())) { + if (!down_trylock(&console_sem)) { /* - * Some console drivers may assume that per-cpu resources have - * been allocated. So don't allow them to be called by this - * CPU until it is officially up. We shouldn't be calling into - * random console drivers on a CPU which doesn't exist yet.. + * We own the drivers. We can drop the spinlock and + * let release_console_sem() print the text, maybe ... 
*/ + console_locked = 1; printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); - goto out; - } - if (!down_trylock(&console_sem)) { - console_locked = 1; + /* - * We own the drivers. We can drop the spinlock and let - * release_console_sem() print the text + * Console drivers may assume that per-cpu resources have + * been allocated. So unless they're explicitly marked as + * being able to cope (CON_ANYTIME) don't call them until + * this CPU is officially up. */ - printk_cpu = UINT_MAX; - spin_unlock_irqrestore(&logbuf_lock, flags); - console_may_schedule = 0; - release_console_sem(); + if (cpu_online(smp_processor_id()) || have_callable_console()) { + console_may_schedule = 0; + release_console_sem(); + } else { + /* Release by hand to avoid flushing the buffer. */ + console_locked = 0; + up(&console_sem); + } } else { /* * Someone else owns the drivers. We drop the spinlock, which @@ -596,7 +612,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); } -out: + preempt_enable(); return printed_len; } -- cgit From 83cc5ed3c4c65fc4c3729a5cec2111ede1ebf85e Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 25 Jun 2006 05:47:41 -0700 Subject: [PATCH] kernel/sys.c: cleanups - proper prototypes for the following functions: - ctrl_alt_del() (in include/linux/reboot.h) - getrusage() (in include/linux/resource.h) - make the following needlessly global functions static: - kernel_restart_prepare() - kernel_kexec() [akpm@osdl.org: compile fix] Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 +-- kernel/sys.c | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a3baf92462b..b12a4706f73 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -36,6 +36,7 @@ #include #include #include /* for audit_free() */ +#include #include #include @@ -45,8 +46,6 @@ extern void sem_exit (void); extern struct task_struct *child_reaper; -int getrusage(struct task_struct *, int, struct rusage __user *); - static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) diff --git a/kernel/sys.c b/kernel/sys.c index 7e0927bad71..2d5179c67ce 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -589,7 +589,7 @@ void emergency_restart(void) } EXPORT_SYMBOL_GPL(emergency_restart); -void kernel_restart_prepare(char *cmd) +static void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; @@ -623,7 +623,7 @@ EXPORT_SYMBOL_GPL(kernel_restart); * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. */ -void kernel_kexec(void) +static void kernel_kexec(void) { #ifdef CONFIG_KEXEC struct kimage *image; @@ -637,7 +637,6 @@ void kernel_kexec(void) machine_kexec(image); #endif } -EXPORT_SYMBOL_GPL(kernel_kexec); void kernel_shutdown_prepare(enum system_states state) { -- cgit From b61367732fc273977cc3fb85c272ce1a7bb1f533 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 25 Jun 2006 05:47:49 -0700 Subject: [PATCH] schedule_on_each_cpu(): reduce kmalloc() size schedule_on_each_cpu() presently does a large kmalloc - 96 kbytes on 1024 CPU 64-bit. Rework it so that we do one 8192-byte allocation and then a pile of tiny ones, via alloc_percpu(). This has a much higher chance of success (100% in the current VM). 
This also has the effect of reducing the memory requirements from NR_CPUS*n to num_possible_cpus()*n. Cc: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 740c5abceb0..f869aff6bc0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -428,22 +428,34 @@ int schedule_delayed_work_on(int cpu, return ret; } -int schedule_on_each_cpu(void (*func) (void *info), void *info) +/** + * schedule_on_each_cpu - call a function on each online CPU from keventd + * @func: the function to call + * @info: a pointer to pass to func() + * + * Returns zero on success. + * Returns -ve errno on failure. + * + * Appears to be racy against CPU hotplug. + * + * schedule_on_each_cpu() is very slow. + */ +int schedule_on_each_cpu(void (*func)(void *info), void *info) { int cpu; - struct work_struct *work; + struct work_struct *works; - work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); - - if (!work) + works = alloc_percpu(struct work_struct); + if (!works) return -ENOMEM; + for_each_online_cpu(cpu) { - INIT_WORK(work + cpu, func, info); + INIT_WORK(per_cpu_ptr(works, cpu), func, info); __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), - work + cpu); + per_cpu_ptr(works, cpu)); } flush_workqueue(keventd_wq); - kfree(work); + free_percpu(works); return 0; } -- cgit From 3b364b8d584b94777f8446a943b3c65e75e758f8 Mon Sep 17 00:00:00 2001 From: Andreas Mohr Date: Sun, 25 Jun 2006 05:47:56 -0700 Subject: [PATCH] constify parts of kernel/power/ Signed-off-by: Andreas Mohr Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 2 +- kernel/power/main.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 81d4d982f3f..e13e7406784 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -231,7 +231,7 @@ static int software_resume(void) late_initcall(software_resume); -static char * pm_disk_modes[] = { +static const char * const pm_disk_modes[] = { [PM_DISK_FIRMWARE] = "firmware", [PM_DISK_PLATFORM] = "platform", [PM_DISK_SHUTDOWN] = "shutdown", diff --git a/kernel/power/main.c b/kernel/power/main.c index cdf0f07af92..6d295c77679 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -145,7 +145,7 @@ static void suspend_finish(suspend_state_t state) -static char *pm_states[PM_SUSPEND_MAX] = { +static const char * const pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", #ifdef CONFIG_SOFTWARE_SUSPEND @@ -262,7 +262,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) { suspend_state_t state = PM_SUSPEND_STANDBY; - char ** s; + const char * const *s; char *p; int error; int len; -- cgit From 11e64757f9fb32f13f51596bbf01988f42fca764 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Sun, 25 Jun 2006 05:48:03 -0700 Subject: [PATCH] Remove unecessary NULL check in kernel/acct.c copy_process() appears to be the only caller of acct_clear_integrals() and does not pass in NULL task pointers. Remove the unecessary check. 
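[Editor's note: the schedule_on_each_cpu() rework a few patches above leans
on the generic per-cpu allocator.  A hedged sketch of that allocation
pattern on its own, with a made-up structure and function names:]

#include <linux/percpu.h>
#include <linux/cpu.h>

struct foo_stats {			/* hypothetical per-cpu payload */
	unsigned long events;
};

static struct foo_stats *stats;

static int foo_init(void)
{
	int cpu;

	/* One small object per possible CPU instead of one NR_CPUS-sized kmalloc. */
	stats = alloc_percpu(struct foo_stats);
	if (!stats)
		return -ENOMEM;

	/* per_cpu_ptr() gives the instance belonging to a given CPU. */
	for_each_online_cpu(cpu)
		per_cpu_ptr(stats, cpu)->events = 0;
	return 0;
}

static void foo_exit(void)
{
	free_percpu(stats);
}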
Signed-off-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 6802020e0ce..44dd6bd6351 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -599,9 +599,7 @@ void acct_update_integrals(struct task_struct *tsk) */ void acct_clear_integrals(struct task_struct *tsk) { - if (tsk) { - tsk->acct_stimexpd = 0; - tsk->acct_rss_mem1 = 0; - tsk->acct_vm_mem1 = 0; - } + tsk->acct_stimexpd = 0; + tsk->acct_rss_mem1 = 0; + tsk->acct_vm_mem1 = 0; } -- cgit From 3b9c04106b70e46803c69d13d5da32f6129fa76d Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Sun, 25 Jun 2006 05:48:15 -0700 Subject: [PATCH] printk time parameter Currently, enabling/disabling printk timestamps is only possible through reboot (bootparam) or recompile. I normally do not run with timestamps (since syslog handles that in a good manner), but for measuring small kernel delays (e.g. irq probing - see parport thread) I needed subsecond precision, but then again, just for some minutes rather than all kernel messages to come. The following patch adds a module_param() with which the timestamps can be en-/disabled in a live system through /sys/modules/printk/parameters/printk_time. Signed-off-by: Jan Engelhardt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 6b89dd9d11b..95b7fe17f12 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -24,6 +24,7 @@ #include #include #include +#include #include /* For in_interrupt() */ #include #include @@ -439,6 +440,7 @@ static int printk_time = 1; #else static int printk_time = 0; #endif +module_param(printk_time, int, S_IRUGO | S_IWUSR); static int __init printk_time_setup(char *str) { -- cgit From eab03ac7bd3e0da99eb9dc068772a85a5e3f3577 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 25 Jun 2006 05:48:31 -0700 Subject: [PATCH] Get rid of /proc/sys/proc The table is empty, why does it still exist? Signed-off-by: Stephen Hemminger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index eb8bd214e7d..2c0e6581944 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -143,7 +143,6 @@ static struct ctl_table_header root_table_header = static ctl_table kern_table[]; static ctl_table vm_table[]; -static ctl_table proc_table[]; static ctl_table fs_table[]; static ctl_table debug_table[]; static ctl_table dev_table[]; @@ -202,12 +201,6 @@ static ctl_table root_table[] = { .child = net_table, }, #endif - { - .ctl_name = CTL_PROC, - .procname = "proc", - .mode = 0555, - .child = proc_table, - }, { .ctl_name = CTL_FS, .procname = "fs", @@ -927,10 +920,6 @@ static ctl_table vm_table[] = { { .ctl_name = 0 } }; -static ctl_table proc_table[] = { - { .ctl_name = 0 } -}; - static ctl_table fs_table[] = { { .ctl_name = FS_NRINODE, -- cgit From 838cd153a5250a79a302f6c5d68a4794b70c4ccb Mon Sep 17 00:00:00 2001 From: "akpm@osdl.org" Date: Sun, 25 Jun 2006 05:48:35 -0700 Subject: [PATCH] N32 sigset and __COMPAT_ENDIAN_SWAP__ I'm testing glibc on MIPS64, little-endian, N32, O32 and N64 multilibs. 
Among the NPTL test failures seen are some arising from sigsuspend problems for N32: it blocks the wrong signals, so SIGCANCEL (SIGRTMIN) is blocked despite glibc's carefully excluding it from sets of signals to block. Specifically, testing suggests it blocks signal N^32 instead of signal N, so (in the example tested) blocking SIGUSR1 (17) blocks signal 49 instead. glibc's sigset_t uses an array of unsigned long, as does the kernel. In both cases, signal N+1 is represented as (1UL << (N % (8 * sizeof (unsigned long)))) in word number (N / (8 * sizeof (unsigned long))). Thus the N32 glibc uses an array of 32-bit words and the N64 kernel uses an array of 64-bit words. For little-endian, the layout is the same, with signals 1-32 in the first 4 bytes, signals 33-64 in the second, etc.; for big-endian, userspace has that layout while in the kernel each 8 bytes have the two halves swapped from the userspace layout. The N32 sigsuspend syscall uses sigset_from_compat to convert the userspace sigset to kernel format. If __COMPAT_ENDIAN_SWAP__ is *not* set, this uses logic of the form set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ) to convert the userspace sigset to a kernel one. This looks correct to me for both big and little endian, given that in userspace compat->sig[1] will represent signals 33-64, and so will the high 32 bits of set->sig[0] in the kernel. If however __COMPAT_ENDIAN_SWAP__ *is* set, as it is for __MIPSEL__, it uses set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 ); which seems incorrect for both big and little endian, and would explain the observed symptoms. This code is the only use of __COMPAT_ENDIAN_SWAP__, so if incorrect then that macro serves no purpose, in which case something like the following patch would seem appropriate to remove it. Signed-off-by: Joseph Myers Signed-off-by: Ralf Baechle Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 2f672332430..126dee9530a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -730,17 +730,10 @@ void sigset_from_compat (sigset_t *set, compat_sigset_t *compat) { switch (_NSIG_WORDS) { -#if defined (__COMPAT_ENDIAN_SWAP__) - case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 ); - case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 ); - case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 ); - case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 ); -#else case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); -#endif } } -- cgit From 2aa92581fb13e04e1440e5041b412cc06c782e0e Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sun, 25 Jun 2006 05:48:44 -0700 Subject: [PATCH] Link error when futexes are disabled on 64bit architectures If futexes are disabled we fail to link on ppc64. 
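[Editor's note: a small user-space demonstration, not part of either patch,
of the layout argument in the N32 sigset fix above.  SIGUSR1 (17) and the
bit positions come from that commit message; the program itself is purely
illustrative.]

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Userspace compat sigset: two 32-bit words.  SIGUSR1 = 17 is bit 16 of word 0. */
	uint32_t compat_sig[2] = { UINT32_C(1) << 16, 0 };

	/* Correct conversion: word 0 stays in the low half of the 64-bit kernel word. */
	uint64_t correct = compat_sig[0] | ((uint64_t)compat_sig[1] << 32);
	/* The __COMPAT_ENDIAN_SWAP__ variant: word 0 lands in the high half instead. */
	uint64_t swapped = compat_sig[1] | ((uint64_t)compat_sig[0] << 32);

	printf("correct conversion blocks signal %d\n", __builtin_ctzll(correct) + 1); /* 17 */
	printf("swapped conversion blocks signal %d\n", __builtin_ctzll(swapped) + 1); /* 49 */
	return 0;
}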
Signed-off-by: Anton Blanchard Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index b12a4706f73..601263c0806 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -898,7 +898,7 @@ fastcall NORET_TYPE void do_exit(long code) } if (unlikely(tsk->robust_list)) exit_robust_list(tsk); -#ifdef CONFIG_COMPAT +#if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT) if (unlikely(tsk->compat_robust_list)) compat_exit_robust_list(tsk); #endif -- cgit From 8bdd1d1250d55afe403ac4affa6ccc5f9e60468f Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Sun, 25 Jun 2006 05:49:08 -0700 Subject: [PATCH] kthread: convert stop_machine into a kthread - Update stop_machine.c to spawn stop_machine as kthreads rather than the deprecated kernel_threads. - Update stop_machine to use the more efficient kthread_bind() before running task in place of set_cpus_allowed() after. [akpm@osdl.org: remove now-wrong set_cpus_allowed()] Signed-off-by: Serge E. Hallyn Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index dcfb5d73146..2c0aacc37c5 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -25,13 +26,11 @@ static unsigned int stopmachine_num_threads; static atomic_t stopmachine_thread_ack; static DECLARE_MUTEX(stopmachine_mutex); -static int stopmachine(void *cpu) +static int stopmachine(void *unused) { int irqs_disabled = 0; int prepared = 0; - set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); - /* Ack: we are alive */ smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ atomic_inc(&stopmachine_thread_ack); @@ -85,7 +84,8 @@ static void stopmachine_set_state(enum stopmachine_state state) static int stop_machine(void) { - int i, ret = 0; + int ret = 0; + unsigned int i; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; /* One high-prio thread per cpu. We'll do this one. */ @@ -96,11 +96,16 @@ static int stop_machine(void) stopmachine_state = STOPMACHINE_WAIT; for_each_online_cpu(i) { + struct task_struct *tsk; if (i == raw_smp_processor_id()) continue; - ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); - if (ret < 0) + tsk = kthread_create(stopmachine, NULL, "stopmachine"); + if (IS_ERR(tsk)) { + ret = PTR_ERR(tsk); break; + } + kthread_bind(tsk, i); + wake_up_process(tsk); stopmachine_num_threads++; } -- cgit From fc75cdfa5b43ac4d3232b490800cd35063adafd3 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 25 Jun 2006 05:49:10 -0700 Subject: [PATCH] cpu hotplug: fix CPU_UP_CANCEL handling If a cpu hotplug callback fails on CPU_UP_PREPARE, all callbacks will be called with CPU_UP_CANCELED. A few of these callbacks assume that on CPU_UP_PREPARE a pointer to task has been stored in a percpu array. This assumption is not true if CPU_UP_PREPARE fails and the following calls to kthread_bind() in CPU_UP_CANCELED will cause an addressing exception because of passing a NULL pointer. 
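[Editor's note: both this fix and the stop_machine conversion above revolve
around the kthread_create()/kthread_bind()/wake_up_process() idiom.  A
minimal sketch of that idiom with illustrative names, not code from either
patch:]

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static int worker_fn(void *unused)
{
	while (!kthread_should_stop()) {
		/* per-cpu work would go here */
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	return 0;
}

static struct task_struct *start_worker_on(int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create(worker_fn, NULL, "worker/%d", cpu);
	if (IS_ERR(tsk))
		return tsk;		/* caller must check with IS_ERR() */
	kthread_bind(tsk, cpu);		/* must happen before the thread first runs */
	wake_up_process(tsk);
	return tsk;
}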
Signed-off-by: Heiko Carstens Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 ++ kernel/softirq.c | 2 ++ kernel/softlockup.c | 2 ++ kernel/workqueue.c | 2 ++ 4 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f8d540b324c..f06d059edef 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4756,6 +4756,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + if (!cpu_rq(cpu)->migration_thread) + break; /* Unbind it from offline cpu so it can run. Fall thru. */ kthread_bind(cpu_rq(cpu)->migration_thread, any_online_cpu(cpu_online_map)); diff --git a/kernel/softirq.c b/kernel/softirq.c index 336f92d64e2..9e2f1c6e73d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -470,6 +470,8 @@ static int cpu_callback(struct notifier_block *nfb, break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + if (!per_cpu(ksoftirqd, hotcpu)) + break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(ksoftirqd, hotcpu), any_online_cpu(cpu_online_map)); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 2c1be1163ed..b5c3b94e01c 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -127,6 +127,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + if (!per_cpu(watchdog_task, hotcpu)) + break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(watchdog_task, hotcpu), any_online_cpu(cpu_online_map)); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f869aff6bc0..565cf7a1feb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -590,6 +590,8 @@ static int workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_CANCELED: list_for_each_entry(wq, &workqueues, list) { + if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread) + continue; /* Unbind so it can run. */ kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, any_online_cpu(cpu_online_map)); -- cgit From fa9799e33d362aeca4555cd6318735bab1c04d16 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 25 Jun 2006 05:49:15 -0700 Subject: [PATCH] ktime/hrtimer: fix kernel-doc comments Fix kernel-doc formatting in ktime.h and hrtimer.[ch] files. 
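[Editor's note: for reference, the kernel-doc layout these formatting fixes
converge on -- a one-line summary followed directly by the parameter lines,
with any longer text after a blank comment line.  The function below is
made up:]

/**
 * example_func - one-line summary of what the function does
 * @arg:	description of the argument
 *
 * Longer description, if any, goes here after a blank comment line.
 */
int example_func(int arg);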
Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 9587aac72f4..55601b3ce60 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -98,7 +98,6 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = /** * ktime_get_ts - get the monotonic clock in timespec format - * * @ts: pointer to timespec variable * * The function calculates the monotonic clock from the realtime @@ -238,7 +237,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) # ifndef CONFIG_KTIME_SCALAR /** * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable - * * @kt: addend * @nsec: the scalar nsec value to add * @@ -299,7 +297,6 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) /** * hrtimer_forward - forward the timer expiry - * * @timer: hrtimer to forward * @now: forward past this time * @interval: the interval to forward @@ -411,7 +408,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) /** * hrtimer_start - (re)start an relative timer on the current CPU - * * @timer: the timer to be added * @tim: expiry time * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) @@ -460,14 +456,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start); /** * hrtimer_try_to_cancel - try to deactivate a timer - * * @timer: hrtimer to stop * * Returns: * 0 when the timer was not active * 1 when the timer was active * -1 when the timer is currently excuting the callback function and - * can not be stopped + * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) { @@ -489,7 +484,6 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. - * * @timer: the timer to be cancelled * * Returns: @@ -510,7 +504,6 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * hrtimer_get_remaining - get remaining time for the timer - * * @timer: the timer to read */ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) @@ -564,7 +557,6 @@ ktime_t hrtimer_get_next_event(void) /** * hrtimer_init - initialize a timer to the given clock - * * @timer: the timer to be initialized * @clock_id: the clock to be used * @mode: timer mode abs/rel @@ -588,7 +580,6 @@ EXPORT_SYMBOL_GPL(hrtimer_init); /** * hrtimer_get_res - get the timer resolution for a clock - * * @which_clock: which clock to query * @tp: pointer to timespec variable to store the resolution * -- cgit From 9e37bd301ee130598fa1406c1281caa159473bf8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 25 Jun 2006 05:49:19 -0700 Subject: [PATCH] kthread: move kernel-doc and put it into DocBook Move kthread API kernel-doc from kthread.h to kthread.c & fix it. Add kthread API to kernel-api DocBook. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index c5f3c6613b6..24be714b04c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -45,6 +45,13 @@ struct kthread_stop_info static DEFINE_MUTEX(kthread_stop_lock); static struct kthread_stop_info kthread_stop_info; +/** + * kthread_should_stop - should this kthread return now? 
+ * + * When someone calls kthread_stop on your kthread, it will be woken + * and this will return true. You should then return, and your return + * value will be passed through to kthread_stop(). + */ int kthread_should_stop(void) { return (kthread_stop_info.k == current); @@ -122,6 +129,25 @@ static void keventd_create_kthread(void *_create) complete(&create->done); } +/** + * kthread_create - create a kthread. + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @namefmt: printf-style name for the thread. + * + * Description: This helper function creates and names a kernel + * thread. The thread will be stopped: use wake_up_process() to start + * it. See also kthread_run(), kthread_create_on_cpu(). + * + * When woken, the thread will run @threadfn() with @data as its + * argument. @threadfn can either call do_exit() directly if it is a + * standalone thread for which noone will call kthread_stop(), or + * return when 'kthread_should_stop()' is true (which means + * kthread_stop() has been called). The return value should be zero + * or a negative error number; it will be passed to kthread_stop(). + * + * Returns a task_struct or ERR_PTR(-ENOMEM). + */ struct task_struct *kthread_create(int (*threadfn)(void *data), void *data, const char namefmt[], @@ -156,6 +182,15 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create); +/** + * kthread_bind - bind a just-created kthread to a cpu. + * @k: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create(). + */ void kthread_bind(struct task_struct *k, unsigned int cpu) { BUG_ON(k->state != TASK_INTERRUPTIBLE); @@ -166,12 +201,36 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) } EXPORT_SYMBOL(kthread_bind); +/** + * kthread_stop - stop a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_stop() for @k to return true, wakes it, and + * waits for it to exit. Your threadfn() must not call do_exit() + * itself if you use this function! This can also be called after + * kthread_create() instead of calling wake_up_process(): the thread + * will exit without calling threadfn(). + * + * Returns the result of threadfn(), or %-EINTR if wake_up_process() + * was never called. + */ int kthread_stop(struct task_struct *k) { return kthread_stop_sem(k, NULL); } EXPORT_SYMBOL(kthread_stop); +/** + * kthread_stop_sem - stop a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * @s: semaphore that @k waits on while idle. + * + * Does essentially the same thing as kthread_stop() above, but wakes + * @k by calling up(@s). + * + * Returns the result of threadfn(), or %-EINTR if wake_up_process() + * was never called. + */ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) { int ret; @@ -210,5 +269,5 @@ static __init int helper_init(void) return 0; } -core_initcall(helper_init); +core_initcall(helper_init); -- cgit From 0e4648141af02331f21aabcd34940c70f09a2d04 Mon Sep 17 00:00:00 2001 From: KaiGai Kohei Date: Sun, 25 Jun 2006 05:49:24 -0700 Subject: [PATCH] pacct: add pacct_struct to fix some pacct bugs. The pacct facility need an i/o operation when an accounting record is generated. 
There is a possibility to wake OOM killer up. If OOM killer is activated, it kills some processes to make them release process memory regions. But acct_process() is called in the killed processes context before calling exit_mm(), so those processes cannot release own memory. In the results, any processes stop in this point and it finally cause a system stall. --- kernel/acct.c | 51 ++++++++++++++++++++++++++++++++++++--------------- kernel/exit.c | 4 +++- kernel/fork.c | 1 + 3 files changed, 40 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 44dd6bd6351..b3526313782 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -421,9 +421,9 @@ static u32 encode_float(u64 value) */ static void do_acct_process(long exitcode, struct file *file) { + struct pacct_struct *pacct = ¤t->signal->pacct; acct_t ac; mm_segment_t fs; - unsigned long vsize; unsigned long flim; u64 elapsed; u64 run_time; @@ -505,20 +505,9 @@ static void do_acct_process(long exitcode, struct file *file) ac.ac_flag |= ACORE; if (current->flags & PF_SIGNALED) ac.ac_flag |= AXSIG; - - vsize = 0; - if (current->mm) { - struct vm_area_struct *vma; - down_read(¤t->mm->mmap_sem); - vma = current->mm->mmap; - while (vma) { - vsize += vma->vm_end - vma->vm_start; - vma = vma->vm_next; - } - up_read(¤t->mm->mmap_sem); - } - vsize = vsize / 1024; - ac.ac_mem = encode_comp_t(vsize); + spin_lock(¤t->sighand->siglock); + ac.ac_mem = encode_comp_t(pacct->ac_mem); + spin_unlock(¤t->sighand->siglock); ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ ac.ac_rw = encode_comp_t(ac.ac_io / 1024); ac.ac_minflt = encode_comp_t(current->signal->min_flt + @@ -545,6 +534,38 @@ static void do_acct_process(long exitcode, struct file *file) set_fs(fs); } +/** + * acct_init_pacct - initialize a new pacct_struct + */ +void acct_init_pacct(struct pacct_struct *pacct) +{ + memset(pacct, 0, sizeof(struct pacct_struct)); +} + +/** + * acct_collect - collect accounting information into pacct_struct + */ +void acct_collect(void) +{ + struct pacct_struct *pacct = ¤t->signal->pacct; + unsigned long vsize = 0; + + if (current->mm) { + struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); + vma = current->mm->mmap; + while (vma) { + vsize += vma->vm_end - vma->vm_start; + vma = vma->vm_next; + } + up_read(¤t->mm->mmap_sem); + } + + spin_lock(¤t->sighand->siglock); + pacct->ac_mem = vsize / 1024; + spin_unlock(¤t->sighand->siglock); +} + /** * acct_process - now just a wrapper around do_acct_process * @exitcode: task exit code diff --git a/kernel/exit.c b/kernel/exit.c index 601263c0806..819d82c2efb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -894,7 +894,7 @@ fastcall NORET_TYPE void do_exit(long code) if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); - acct_process(code); + acct_collect(); } if (unlikely(tsk->robust_list)) exit_robust_list(tsk); @@ -906,6 +906,8 @@ fastcall NORET_TYPE void do_exit(long code) audit_free(tsk); exit_mm(tsk); + if (group_dead) + acct_process(code); exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 49adc0e8d47..dfd10cb370c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -874,6 +874,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts tsk->it_prof_expires = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); } + acct_init_pacct(&sig->pacct); return 0; } -- cgit From f6ec29a42d7ac3b309a9cef179b686d23986ab98 Mon Sep 17 00:00:00 2001 From: KaiGai Kohei Date: 
Sun, 25 Jun 2006 05:49:25 -0700 Subject: [PATCH] pacct: avoidance to refer the last thread as a representation of the process When pacct facility generate an 'ac_flag' field in accounting record, it refers a task_struct of the thread which died last in the process. But any other task_structs are ignored. Therefore, pacct facility drops ASU flag even if root-privilege operations are used by any other threads except the last one. In addition, AFORK flag is always set when the thread of group-leader didn't die last, although this process has called execve() after fork(). We have a same matter in ac_exitcode. The recorded ac_exitcode is an exit code of the last thread in the process. There is a possibility this exitcode is not the group leader's one. --- kernel/acct.c | 42 ++++++++++++++++++++++++------------------ kernel/exit.c | 4 ++-- 2 files changed, 26 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index b3526313782..4c85fdf615d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -75,7 +75,7 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(long, struct file *); +static void do_acct_process(struct file *); /* * This structure is used so that all the data protected by lock @@ -196,7 +196,7 @@ static void acct_file_reopen(struct file *file) if (old_acct) { mnt_unpin(old_acct->f_vfsmnt); spin_unlock(&acct_globals.lock); - do_acct_process(0, old_acct); + do_acct_process(old_acct); filp_close(old_acct, NULL); spin_lock(&acct_globals.lock); } @@ -419,7 +419,7 @@ static u32 encode_float(u64 value) /* * do_acct_process does all actual work. Caller holds the reference to file. */ -static void do_acct_process(long exitcode, struct file *file) +static void do_acct_process(struct file *file) { struct pacct_struct *pacct = ¤t->signal->pacct; acct_t ac; @@ -496,17 +496,10 @@ static void do_acct_process(long exitcode, struct file *file) old_encode_dev(tty_devnum(current->signal->tty)) : 0; read_unlock(&tasklist_lock); - ac.ac_flag = 0; - if (current->flags & PF_FORKNOEXEC) - ac.ac_flag |= AFORK; - if (current->flags & PF_SUPERPRIV) - ac.ac_flag |= ASU; - if (current->flags & PF_DUMPCORE) - ac.ac_flag |= ACORE; - if (current->flags & PF_SIGNALED) - ac.ac_flag |= AXSIG; spin_lock(¤t->sighand->siglock); + ac.ac_flag = pacct->ac_flag; ac.ac_mem = encode_comp_t(pacct->ac_mem); + ac.ac_exitcode = pacct->ac_exitcode; spin_unlock(¤t->sighand->siglock); ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ ac.ac_rw = encode_comp_t(ac.ac_io / 1024); @@ -515,7 +508,6 @@ static void do_acct_process(long exitcode, struct file *file) ac.ac_majflt = encode_comp_t(current->signal->maj_flt + current->maj_flt); ac.ac_swaps = encode_comp_t(0); - ac.ac_exitcode = exitcode; /* * Kernel segment override to datasegment and write it @@ -544,13 +536,15 @@ void acct_init_pacct(struct pacct_struct *pacct) /** * acct_collect - collect accounting information into pacct_struct + * @exitcode: task exit code + * @group_dead: not 0, if this thread is the last one in the process. 
*/ -void acct_collect(void) +void acct_collect(long exitcode, int group_dead) { struct pacct_struct *pacct = ¤t->signal->pacct; unsigned long vsize = 0; - if (current->mm) { + if (group_dead && current->mm) { struct vm_area_struct *vma; down_read(¤t->mm->mmap_sem); vma = current->mm->mmap; @@ -562,7 +556,19 @@ void acct_collect(void) } spin_lock(¤t->sighand->siglock); - pacct->ac_mem = vsize / 1024; + if (group_dead) + pacct->ac_mem = vsize / 1024; + if (thread_group_leader(current)) { + pacct->ac_exitcode = exitcode; + if (current->flags & PF_FORKNOEXEC) + pacct->ac_flag |= AFORK; + } + if (current->flags & PF_SUPERPRIV) + pacct->ac_flag |= ASU; + if (current->flags & PF_DUMPCORE) + pacct->ac_flag |= ACORE; + if (current->flags & PF_SIGNALED) + pacct->ac_flag |= AXSIG; spin_unlock(¤t->sighand->siglock); } @@ -572,7 +578,7 @@ void acct_collect(void) * * handles process accounting for an exiting task */ -void acct_process(long exitcode) +void acct_process() { struct file *file = NULL; @@ -591,7 +597,7 @@ void acct_process(long exitcode) get_file(file); spin_unlock(&acct_globals.lock); - do_acct_process(exitcode, file); + do_acct_process(file); fput(file); } diff --git a/kernel/exit.c b/kernel/exit.c index 819d82c2efb..e76bd02e930 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -894,8 +894,8 @@ fastcall NORET_TYPE void do_exit(long code) if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); - acct_collect(); } + acct_collect(code, group_dead); if (unlikely(tsk->robust_list)) exit_robust_list(tsk); #if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT) @@ -907,7 +907,7 @@ fastcall NORET_TYPE void do_exit(long code) exit_mm(tsk); if (group_dead) - acct_process(code); + acct_process(); exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); -- cgit From 77787bfb44da6e6166af088226707aeccee27968 Mon Sep 17 00:00:00 2001 From: KaiGai Kohei Date: Sun, 25 Jun 2006 05:49:26 -0700 Subject: [PATCH] pacct: none-delayed process accounting accumulation In current 2.6.17 implementation, signal_struct refered from task_struct is used for per-process data structure. The pacct facility also uses it as a per-process data structure to store stime, utime, minflt, majflt. But those members are saved in __exit_signal(). It's too late. For example, if some threads exits at same time, pacct facility has a possibility to drop accountings for a part of those threads. (see, the following 'The results of original 2.6.17 kernel') I think accounting information should be completely collected into the per-process data structure before writing out an accounting record. This patch fixes this matter. Accumulation of stime, utime, minflt and majflt are done before generating accounting record. 
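[Editor's note: a condensed sketch, not the patch itself, of the
accumulate-at-exit idea described above -- each exiting thread folds its own
counters into the shared pacct_struct under siglock, so concurrent thread
exits cannot drop each other's contribution.  Field names follow the patch;
the helper name is illustrative.]

#include <linux/sched.h>

static void fold_thread_into_pacct(struct task_struct *tsk)
{
	struct pacct_struct *pacct = &tsk->signal->pacct;

	spin_lock_irq(&tsk->sighand->siglock);
	pacct->ac_utime = cputime_add(pacct->ac_utime, tsk->utime);
	pacct->ac_stime = cputime_add(pacct->ac_stime, tsk->stime);
	pacct->ac_minflt += tsk->min_flt;
	pacct->ac_majflt += tsk->maj_flt;
	spin_unlock_irq(&tsk->sighand->siglock);
}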
[mingo@elte.hu: fix acct_collect() siglock bug found by lockdep] Signed-off-by: KaiGai Kohei Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 4c85fdf615d..368c4f03fe0 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -428,7 +428,6 @@ static void do_acct_process(struct file *file) u64 elapsed; u64 run_time; struct timespec uptime; - unsigned long jiffies; /* * First check to see if there is enough free_space to continue @@ -469,12 +468,6 @@ static void do_acct_process(struct file *file) #endif do_div(elapsed, AHZ); ac.ac_btime = xtime.tv_sec - elapsed; - jiffies = cputime_to_jiffies(cputime_add(current->utime, - current->signal->utime)); - ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); - jiffies = cputime_to_jiffies(cputime_add(current->stime, - current->signal->stime)); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); /* we really need to bite the bullet and change layout */ ac.ac_uid = current->uid; ac.ac_gid = current->gid; @@ -497,16 +490,16 @@ static void do_acct_process(struct file *file) read_unlock(&tasklist_lock); spin_lock(¤t->sighand->siglock); + ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); + ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); ac.ac_flag = pacct->ac_flag; ac.ac_mem = encode_comp_t(pacct->ac_mem); + ac.ac_minflt = encode_comp_t(pacct->ac_minflt); + ac.ac_majflt = encode_comp_t(pacct->ac_majflt); ac.ac_exitcode = pacct->ac_exitcode; spin_unlock(¤t->sighand->siglock); ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ ac.ac_rw = encode_comp_t(ac.ac_io / 1024); - ac.ac_minflt = encode_comp_t(current->signal->min_flt + - current->min_flt); - ac.ac_majflt = encode_comp_t(current->signal->maj_flt + - current->maj_flt); ac.ac_swaps = encode_comp_t(0); /* @@ -532,6 +525,7 @@ static void do_acct_process(struct file *file) void acct_init_pacct(struct pacct_struct *pacct) { memset(pacct, 0, sizeof(struct pacct_struct)); + pacct->ac_utime = pacct->ac_stime = cputime_zero; } /** @@ -555,7 +549,7 @@ void acct_collect(long exitcode, int group_dead) up_read(¤t->mm->mmap_sem); } - spin_lock(¤t->sighand->siglock); + spin_lock_irq(¤t->sighand->siglock); if (group_dead) pacct->ac_mem = vsize / 1024; if (thread_group_leader(current)) { @@ -569,7 +563,11 @@ void acct_collect(long exitcode, int group_dead) pacct->ac_flag |= ACORE; if (current->flags & PF_SIGNALED) pacct->ac_flag |= AXSIG; - spin_unlock(¤t->sighand->siglock); + pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); + pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); + pacct->ac_minflt += current->min_flt; + pacct->ac_majflt += current->maj_flt; + spin_unlock_irq(¤t->sighand->siglock); } /** -- cgit From 72cf2709bf8e0410800f118c4298bfbf8715b303 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 25 Jun 2006 10:04:15 -0700 Subject: Fix PM_TRACE dependency: works only on 32-bit x86 for now Not that x86-64 and other architecture support should be difficult to add (trivial fixups to the data format and add the proper linker script entry). 
Signed-off-by: Linus Torvalds --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index cdf315e794f..fc311a4673a 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -38,7 +38,7 @@ config PM_DEBUG config PM_TRACE bool "Suspend/resume event tracing" - depends on PM && PM_DEBUG && X86 + depends on PM && PM_DEBUG && X86_32 default y ---help--- This enables some cheesy code to save the last PM event point in the -- cgit From 3448097fccdce4ea8f0fcad4f37f502a8cd72e68 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 25 Jun 2006 18:41:00 -0700 Subject: Revert "swsusp special saveable pages support" commits This reverts commits 3e3318dee0878d42ed62a19c292a2ac284135db3 [PATCH] swsusp: x86_64 mark special saveable/unsaveable pages b6370d96e09944c6e3ae8d5743ca8a8ab1f79f6c [PATCH] swsusp: i386 mark special saveable/unsaveable pages ce4ab0012b32c1a4a1d6e934aeb73bf3151c48d9 [PATCH] swsusp: add architecture special saveable pages support because not only do they apparently cause page faults on x86, the infrastructure doesn't compile on powerpc. Signed-off-by: Linus Torvalds --- kernel/power/power.h | 4 -- kernel/power/snapshot.c | 112 ++---------------------------------------------- kernel/power/swsusp.c | 18 ++++++-- 3 files changed, 18 insertions(+), 116 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 98c41423f3b..57a792982fb 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -105,10 +105,6 @@ extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); -extern unsigned int count_special_pages(void); -extern int save_special_mem(void); -extern int restore_special_mem(void); - extern int swsusp_check(void); extern int swsusp_shrink_memory(void); extern void swsusp_free(void); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3d9284100b2..24c96f35423 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -39,90 +39,8 @@ static unsigned int nr_copy_pages; static unsigned int nr_meta_pages; static unsigned long *buffer; -struct arch_saveable_page { - unsigned long start; - unsigned long end; - char *data; - struct arch_saveable_page *next; -}; -static struct arch_saveable_page *arch_pages; - -int swsusp_add_arch_pages(unsigned long start, unsigned long end) -{ - struct arch_saveable_page *tmp; - - while (start < end) { - tmp = kzalloc(sizeof(struct arch_saveable_page), GFP_KERNEL); - if (!tmp) - return -ENOMEM; - tmp->start = start; - tmp->end = ((start >> PAGE_SHIFT) + 1) << PAGE_SHIFT; - if (tmp->end > end) - tmp->end = end; - tmp->next = arch_pages; - start = tmp->end; - arch_pages = tmp; - } - return 0; -} - -static unsigned int count_arch_pages(void) -{ - unsigned int count = 0; - struct arch_saveable_page *tmp = arch_pages; - while (tmp) { - count++; - tmp = tmp->next; - } - return count; -} - -static int save_arch_mem(void) -{ - char *kaddr; - struct arch_saveable_page *tmp = arch_pages; - int offset; - - pr_debug("swsusp: Saving arch specific memory"); - while (tmp) { - tmp->data = (char *)__get_free_page(GFP_ATOMIC); - if (!tmp->data) - return -ENOMEM; - offset = tmp->start - (tmp->start & PAGE_MASK); - /* arch pages might haven't a 'struct page' */ - kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0); - memcpy(tmp->data + 
offset, kaddr + offset, - tmp->end - tmp->start); - kunmap_atomic(kaddr, KM_USER0); - - tmp = tmp->next; - } - return 0; -} - -static int restore_arch_mem(void) -{ - char *kaddr; - struct arch_saveable_page *tmp = arch_pages; - int offset; - - while (tmp) { - if (!tmp->data) - continue; - offset = tmp->start - (tmp->start & PAGE_MASK); - kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0); - memcpy(kaddr + offset, tmp->data + offset, - tmp->end - tmp->start); - kunmap_atomic(kaddr, KM_USER0); - free_page((long)tmp->data); - tmp->data = NULL; - tmp = tmp->next; - } - return 0; -} - #ifdef CONFIG_HIGHMEM -static unsigned int count_highmem_pages(void) +unsigned int count_highmem_pages(void) { struct zone *zone; unsigned long zone_pfn; @@ -199,7 +117,7 @@ static int save_highmem_zone(struct zone *zone) return 0; } -static int save_highmem(void) +int save_highmem(void) { struct zone *zone; int res = 0; @@ -216,7 +134,7 @@ static int save_highmem(void) return 0; } -static int restore_highmem(void) +int restore_highmem(void) { printk("swsusp: Restoring Highmem\n"); while (highmem_copy) { @@ -238,29 +156,6 @@ static inline int save_highmem(void) {return 0;} static inline int restore_highmem(void) {return 0;} #endif -unsigned int count_special_pages(void) -{ - return count_arch_pages() + count_highmem_pages(); -} - -int save_special_mem(void) -{ - int ret; - ret = save_arch_mem(); - if (!ret) - ret = save_highmem(); - return ret; -} - -int restore_special_mem(void) -{ - int ret; - ret = restore_arch_mem(); - if (!ret) - ret = restore_highmem(); - return ret; -} - static int pfn_is_nosave(unsigned long pfn) { unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; @@ -286,6 +181,7 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn) return 0; page = pfn_to_page(pfn); + BUG_ON(PageReserved(page) && PageNosave(page)); if (PageNosave(page)) return 0; if (PageReserved(page) && pfn_is_nosave(pfn)) diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index f0ee4e7780d..17f669c8301 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -62,6 +62,16 @@ unsigned long image_size = 500 * 1024 * 1024; int in_suspend __nosavedata = 0; +#ifdef CONFIG_HIGHMEM +unsigned int count_highmem_pages(void); +int save_highmem(void); +int restore_highmem(void); +#else +static inline int save_highmem(void) { return 0; } +static inline int restore_highmem(void) { return 0; } +static inline unsigned int count_highmem_pages(void) { return 0; } +#endif + /** * The following functions are used for tracing the allocated * swap pages, so that they can be freed in case of an error. @@ -182,7 +192,7 @@ int swsusp_shrink_memory(void) printk("Shrinking memory... 
"); do { - size = 2 * count_special_pages(); + size = 2 * count_highmem_pages(); size += size / 50 + count_data_pages(); size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE + PAGES_FOR_IO; @@ -226,7 +236,7 @@ int swsusp_suspend(void) goto Enable_irqs; } - if ((error = save_special_mem())) { + if ((error = save_highmem())) { printk(KERN_ERR "swsusp: Not enough free pages for highmem\n"); goto Restore_highmem; } @@ -237,7 +247,7 @@ int swsusp_suspend(void) /* Restore control flow magically appears here */ restore_processor_state(); Restore_highmem: - restore_special_mem(); + restore_highmem(); device_power_up(); Enable_irqs: local_irq_enable(); @@ -263,7 +273,7 @@ int swsusp_resume(void) */ swsusp_free(); restore_processor_state(); - restore_special_mem(); + restore_highmem(); touch_softlockup_watchdog(); device_power_up(); local_irq_enable(); -- cgit From 1fb00c6cbd8356f43b46322742f3c01c2a1f02da Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 26 Jun 2006 00:24:31 -0700 Subject: [PATCH] work around ppc64 bootup bug by making mutex-debugging save/restore irqs It seems ppc64 wants to lock mutexes in early bootup code, with interrupts disabled, and they expect interrupts to stay disabled, else they crash. Work around this bug by making mutex debugging variants save/restore irq flags. Signed-off-by: Ingo Molnar Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/mutex-debug.c | 12 ++++++------ kernel/mutex-debug.h | 25 +++++-------------------- kernel/mutex.c | 21 ++++++++++++--------- kernel/mutex.h | 6 ++++-- 4 files changed, 27 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index f4913c37695..036b6285b15 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -153,13 +153,13 @@ next: continue; count++; cursor = curr->next; - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); printk("\n#%03d: ", count); printk_lock(lock, filter ? 
0 : 1); goto next; } - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); printk("\n"); } @@ -316,7 +316,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task) continue; list_del_init(curr); DEBUG_OFF(); - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); printk("BUG: %s/%d, lock held at task exit time!\n", task->comm, task->pid); @@ -325,7 +325,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task) printk("exiting task is not even the owner??\n"); return; } - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); } /* @@ -352,7 +352,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len) continue; list_del_init(curr); DEBUG_OFF(); - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", current->comm, current->pid, lock, from, to); @@ -362,7 +362,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len) printk("freeing task is not even the owner??\n"); return; } - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); } /* diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index fd384050acb..a5196c36a5f 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -46,21 +46,6 @@ extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name); -#define debug_spin_lock(lock) \ - do { \ - local_irq_disable(); \ - if (debug_mutex_on) \ - spin_lock(lock); \ - } while (0) - -#define debug_spin_unlock(lock) \ - do { \ - if (debug_mutex_on) \ - spin_unlock(lock); \ - local_irq_enable(); \ - preempt_check_resched(); \ - } while (0) - #define debug_spin_lock_save(lock, flags) \ do { \ local_irq_save(flags); \ @@ -68,7 +53,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name); spin_lock(lock); \ } while (0) -#define debug_spin_lock_restore(lock, flags) \ +#define debug_spin_unlock_restore(lock, flags) \ do { \ if (debug_mutex_on) \ spin_unlock(lock); \ @@ -76,20 +61,20 @@ extern void debug_mutex_init(struct mutex *lock, const char *name); preempt_check_resched(); \ } while (0) -#define spin_lock_mutex(lock) \ +#define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ \ DEBUG_WARN_ON(in_interrupt()); \ - debug_spin_lock(&debug_mutex_lock); \ + debug_spin_lock_save(&debug_mutex_lock, flags); \ spin_lock(lock); \ DEBUG_WARN_ON(l->magic != l); \ } while (0) -#define spin_unlock_mutex(lock) \ +#define spin_unlock_mutex(lock, flags) \ do { \ spin_unlock(lock); \ - debug_spin_unlock(&debug_mutex_lock); \ + debug_spin_unlock_restore(&debug_mutex_lock, flags); \ } while (0) #define DEBUG_OFF() \ diff --git a/kernel/mutex.c b/kernel/mutex.c index 5449b210d9e..7043db21bbc 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -125,10 +125,11 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) struct task_struct *task = current; struct mutex_waiter waiter; unsigned int old_val; + unsigned long flags; debug_mutex_init_waiter(&waiter); - spin_lock_mutex(&lock->wait_lock); + spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); @@ -157,7 +158,7 @@ 
__mutex_lock_common(struct mutex *lock, long state __IP_DECL__) if (unlikely(state == TASK_INTERRUPTIBLE && signal_pending(task))) { mutex_remove_waiter(lock, &waiter, task->thread_info); - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); return -EINTR; @@ -165,9 +166,9 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) __set_task_state(task, state); /* didnt get the lock, go to sleep: */ - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); schedule(); - spin_lock_mutex(&lock->wait_lock); + spin_lock_mutex(&lock->wait_lock, flags); } /* got the lock - rejoice! */ @@ -178,7 +179,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); @@ -203,10 +204,11 @@ static fastcall noinline void __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) { struct mutex *lock = container_of(lock_count, struct mutex, count); + unsigned long flags; DEBUG_WARN_ON(lock->owner != current_thread_info()); - spin_lock_mutex(&lock->wait_lock); + spin_lock_mutex(&lock->wait_lock, flags); /* * some architectures leave the lock unlocked in the fastpath failure @@ -231,7 +233,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) debug_mutex_clear_owner(lock); - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); } /* @@ -276,9 +278,10 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__) static inline int __mutex_trylock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); + unsigned long flags; int prev; - spin_lock_mutex(&lock->wait_lock); + spin_lock_mutex(&lock->wait_lock, flags); prev = atomic_xchg(&lock->count, -1); if (likely(prev == 1)) @@ -287,7 +290,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); return prev == 1; } diff --git a/kernel/mutex.h b/kernel/mutex.h index 00fe84e7b67..06918994725 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -9,8 +9,10 @@ * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: */ -#define spin_lock_mutex(lock) spin_lock(lock) -#define spin_unlock_mutex(lock) spin_unlock(lock) +#define spin_lock_mutex(lock, flags) \ + do { spin_lock(lock); (void)(flags); } while (0) +#define spin_unlock_mutex(lock, flags) \ + do { spin_unlock(lock); (void)(flags); } while (0) #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) -- cgit From 81615b624a45621b758380ec45d750483eae281d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 26 Jun 2006 00:24:32 -0700 Subject: [PATCH] Convert kernel/cpu.c to mutexes Convert kernel/cpu.c from semaphore to mutex. I've reviewed all lock_cpu_hotplug() critical sections, and they all seem to fit mutex semantics. 
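[Editor's sketch: the semaphore-to-mutex mapping such a conversion follows, shown as a minimal hypothetical example; example_lock and do_protected_work are illustrative names that do not appear in this patch.]

#include <linux/mutex.h>

/* was: static DECLARE_MUTEX(example_lock); */
static DEFINE_MUTEX(example_lock);

static int do_protected_work(int interruptible)
{
	if (interruptible) {
		/* was: ret = down_interruptible(&example_lock); */
		if (mutex_lock_interruptible(&example_lock))
			return -EINTR;
	} else {
		/* was: down(&example_lock); */
		mutex_lock(&example_lock);
	}

	/* ... critical section; may sleep, never taken from IRQ context ... */

	/* was: up(&example_lock); */
	mutex_unlock(&example_lock);
	return 0;
}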
Signed-off-by: Ingo Molnar Cc: Rusty Russell Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index fe2b8d0bfe4..03dcd981846 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -13,10 +13,10 @@ #include #include #include -#include +#include /* This protects CPUs going up and down... */ -static DECLARE_MUTEX(cpucontrol); +static DEFINE_MUTEX(cpucontrol); static BLOCKING_NOTIFIER_HEAD(cpu_chain); @@ -30,9 +30,9 @@ static int __lock_cpu_hotplug(int interruptible) if (lock_cpu_hotplug_owner != current) { if (interruptible) - ret = down_interruptible(&cpucontrol); + ret = mutex_lock_interruptible(&cpucontrol); else - down(&cpucontrol); + mutex_lock(&cpucontrol); } /* @@ -56,7 +56,7 @@ void unlock_cpu_hotplug(void) { if (--lock_cpu_hotplug_depth == 0) { lock_cpu_hotplug_owner = NULL; - up(&cpucontrol); + mutex_unlock(&cpucontrol); } } EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); -- cgit From 734efb467b31e56c2f9430590a9aa867ecf3eea1 Mon Sep 17 00:00:00 2001 From: john stultz Date: Mon, 26 Jun 2006 00:25:05 -0700 Subject: [PATCH] Time: Clocksource Infrastructure This introduces the clocksource management infrastructure. A clocksource is a driver-like architecture generic abstraction of a free-running counter. This code defines the clocksource structure, and provides management code for registering, selecting, accessing and scaling clocksources. Additionally, this includes the trivial jiffies clocksource, a lowest common denominator clocksource, provided mainly for use as an example. [hirofumi@mail.parknet.co.jp: Don't enable IRQ too early] Signed-off-by: John Stultz Signed-off-by: Ingo Molnar Signed-off-by: Paul Mundt Signed-off-by: John Stultz Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/Makefile | 1 + kernel/time/clocksource.c | 344 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/time/jiffies.c | 73 ++++++++++ 3 files changed, 418 insertions(+) create mode 100644 kernel/time/Makefile create mode 100644 kernel/time/clocksource.c create mode 100644 kernel/time/jiffies.c (limited to 'kernel') diff --git a/kernel/time/Makefile b/kernel/time/Makefile new file mode 100644 index 00000000000..e1dfd8e86cc --- /dev/null +++ b/kernel/time/Makefile @@ -0,0 +1 @@ +obj-y += clocksource.o jiffies.o diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c new file mode 100644 index 00000000000..95dd2200a10 --- /dev/null +++ b/kernel/time/clocksource.c @@ -0,0 +1,344 @@ +/* + * linux/kernel/time/clocksource.c + * + * This file contains the functions which manage clocksource drivers. + * + * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * TODO WishList: + * o Allow clocksource drivers to be unregistered + * o get rid of clocksource_jiffies extern + */ + +#include +#include +#include +#include + +/* XXX - Would like a better way for initializing curr_clocksource */ +extern struct clocksource clocksource_jiffies; + +/*[Clocksource internal variables]--------- + * curr_clocksource: + * currently selected clocksource. Initialized to clocksource_jiffies. + * next_clocksource: + * pending next selected clocksource. + * clocksource_list: + * linked list with the registered clocksources + * clocksource_lock: + * protects manipulations to curr_clocksource and next_clocksource + * and the clocksource_list + * override_name: + * Name of the user-specified clocksource. + */ +static struct clocksource *curr_clocksource = &clocksource_jiffies; +static struct clocksource *next_clocksource; +static LIST_HEAD(clocksource_list); +static DEFINE_SPINLOCK(clocksource_lock); +static char override_name[32]; +static int finished_booting; + +/* clocksource_done_booting - Called near the end of bootup + * + * Hack to avoid lots of clocksource churn at boot time + */ +static int clocksource_done_booting(void) +{ + finished_booting = 1; + return 0; +} + +late_initcall(clocksource_done_booting); + +/** + * get_next_clocksource - Returns the selected clocksource + * + */ +struct clocksource *get_next_clocksource(void) +{ + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + if (next_clocksource && finished_booting) { + curr_clocksource = next_clocksource; + next_clocksource = NULL; + } + spin_unlock_irqrestore(&clocksource_lock, flags); + + return curr_clocksource; +} + +/** + * select_clocksource - Finds the best registered clocksource. + * + * Private function. Must hold clocksource_lock when called. + * + * Looks through the list of registered clocksources, returning + * the one with the highest rating value. If there is a clocksource + * name that matches the override string, it returns that clocksource. + */ +static struct clocksource *select_clocksource(void) +{ + struct clocksource *best = NULL; + struct list_head *tmp; + + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + if (!best) + best = src; + + /* check for override: */ + if (strlen(src->name) == strlen(override_name) && + !strcmp(src->name, override_name)) { + best = src; + break; + } + /* pick the highest rating: */ + if (src->rating > best->rating) + best = src; + } + + return best; +} + +/** + * is_registered_source - Checks if clocksource is registered + * @c: pointer to a clocksource + * + * Private helper function. Must hold clocksource_lock when called. + * + * Returns one if the clocksource is already registered, zero otherwise. + */ +static int is_registered_source(struct clocksource *c) +{ + int len = strlen(c->name); + struct list_head *tmp; + + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + if (strlen(src->name) == len && !strcmp(src->name, c->name)) + return 1; + } + + return 0; +} + +/** + * register_clocksource - Used to install new clocksources + * @t: clocksource to be registered + * + * Returns -EBUSY if registration fails, zero otherwise. 
+ */ +int register_clocksource(struct clocksource *c) +{ + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + /* check if clocksource is already registered */ + if (is_registered_source(c)) { + printk("register_clocksource: Cannot register %s. " + "Already registered!", c->name); + ret = -EBUSY; + } else { + /* register it */ + list_add(&c->list, &clocksource_list); + /* scan the registered clocksources, and pick the best one */ + next_clocksource = select_clocksource(); + } + spin_unlock_irqrestore(&clocksource_lock, flags); + return ret; +} + +EXPORT_SYMBOL(register_clocksource); + +/** + * reselect_clocksource - Rescan list for next clocksource + * + * A quick helper function to be used if a clocksource changes its + * rating. Forces the clocksource list to be re-scaned for the best + * clocksource. + */ +void reselect_clocksource(void) +{ + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + next_clocksource = select_clocksource(); + spin_unlock_irqrestore(&clocksource_lock, flags); +} + +/** + * sysfs_show_current_clocksources - sysfs interface for current clocksource + * @dev: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing current clocksource. + */ +static ssize_t +sysfs_show_current_clocksources(struct sys_device *dev, char *buf) +{ + char *curr = buf; + + spin_lock_irq(&clocksource_lock); + curr += sprintf(curr, "%s ", curr_clocksource->name); + spin_unlock_irq(&clocksource_lock); + + curr += sprintf(curr, "\n"); + + return curr - buf; +} + +/** + * sysfs_override_clocksource - interface for manually overriding clocksource + * @dev: unused + * @buf: name of override clocksource + * @count: length of buffer + * + * Takes input from sysfs interface for manually overriding the default + * clocksource selction. + */ +static ssize_t sysfs_override_clocksource(struct sys_device *dev, + const char *buf, size_t count) +{ + size_t ret = count; + /* strings from sysfs write are not 0 terminated! 
*/ + if (count >= sizeof(override_name)) + return -EINVAL; + + /* strip of \n: */ + if (buf[count-1] == '\n') + count--; + if (count < 1) + return -EINVAL; + + spin_lock_irq(&clocksource_lock); + + /* copy the name given: */ + memcpy(override_name, buf, count); + override_name[count] = 0; + + /* try to select it: */ + next_clocksource = select_clocksource(); + + spin_unlock_irq(&clocksource_lock); + + return ret; +} + +/** + * sysfs_show_available_clocksources - sysfs interface for listing clocksource + * @dev: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing registered clocksources + */ +static ssize_t +sysfs_show_available_clocksources(struct sys_device *dev, char *buf) +{ + struct list_head *tmp; + char *curr = buf; + + spin_lock_irq(&clocksource_lock); + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + curr += sprintf(curr, "%s ", src->name); + } + spin_unlock_irq(&clocksource_lock); + + curr += sprintf(curr, "\n"); + + return curr - buf; +} + +/* + * Sysfs setup bits: + */ +static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, + sysfs_override_clocksource); + +static SYSDEV_ATTR(available_clocksource, 0600, + sysfs_show_available_clocksources, NULL); + +static struct sysdev_class clocksource_sysclass = { + set_kset_name("clocksource"), +}; + +static struct sys_device device_clocksource = { + .id = 0, + .cls = &clocksource_sysclass, +}; + +static int init_clocksource_sysfs(void) +{ + int error = sysdev_class_register(&clocksource_sysclass); + + if (!error) + error = sysdev_register(&device_clocksource); + if (!error) + error = sysdev_create_file( + &device_clocksource, + &attr_current_clocksource); + if (!error) + error = sysdev_create_file( + &device_clocksource, + &attr_available_clocksource); + return error; +} + +device_initcall(init_clocksource_sysfs); + +/** + * boot_override_clocksource - boot clock override + * @str: override name + * + * Takes a clocksource= boot argument and uses it + * as the clocksource override name. + */ +static int __init boot_override_clocksource(char* str) +{ + unsigned long flags; + spin_lock_irqsave(&clocksource_lock, flags); + if (str) + strlcpy(override_name, str, sizeof(override_name)); + spin_unlock_irqrestore(&clocksource_lock, flags); + return 1; +} + +__setup("clocksource=", boot_override_clocksource); + +/** + * boot_override_clock - Compatibility layer for deprecated boot option + * @str: override name + * + * DEPRECATED! Takes a clock= boot argument and uses it + * as the clocksource override name + */ +static int __init boot_override_clock(char* str) +{ + printk("Warning! clock= boot option is deprecated.\n"); + + return boot_override_clocksource(str); +} + +__setup("clock=", boot_override_clock); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c new file mode 100644 index 00000000000..1fe8376e717 --- /dev/null +++ b/kernel/time/jiffies.c @@ -0,0 +1,73 @@ +/*********************************************************************** +* linux/kernel/time/jiffies.c +* +* This file contains the jiffies based clocksource. +* +* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +* +************************************************************************/ +#include +#include +#include + +/* The Jiffies based clocksource is the lowest common + * denominator clock source which should function on + * all systems. It has the same coarse resolution as + * the timer interrupt frequency HZ and it suffers + * inaccuracies caused by missed or lost timer + * interrupts and the inability for the timer + * interrupt hardware to accuratly tick at the + * requested HZ value. It is also not reccomended + * for "tick-less" systems. + */ +#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) + +/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier + * conversion, the .shift value could be zero. However + * this would make NTP adjustments impossible as they are + * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to + * shift both the nominator and denominator the same + * amount, and give ntp adjustments in units of 1/2^8 + * + * The value 8 is somewhat carefully chosen, as anything + * larger can result in overflows. NSEC_PER_JIFFY grows as + * HZ shrinks, so values greater then 8 overflow 32bits when + * HZ=100. + */ +#define JIFFIES_SHIFT 8 + +static cycle_t jiffies_read(void) +{ + return (cycle_t) jiffies; +} + +struct clocksource clocksource_jiffies = { + .name = "jiffies", + .rating = 0, /* lowest rating*/ + .read = jiffies_read, + .mask = 0xffffffff, /*32bits*/ + .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ + .shift = JIFFIES_SHIFT, + .is_continuous = 0, /* tick based, not free running */ +}; + +static int __init init_jiffies_clocksource(void) +{ + return register_clocksource(&clocksource_jiffies); +} + +module_init(init_jiffies_clocksource); -- cgit From ad596171ed635c51a9eef829187af100cbf8dcf7 Mon Sep 17 00:00:00 2001 From: john stultz Date: Mon, 26 Jun 2006 00:25:06 -0700 Subject: [PATCH] Time: Use clocksource infrastructure for update_wall_time Modify the update_wall_time function so it increments time using the clocksource abstraction instead of jiffies. Since the only clocksource driver currently provided is the jiffies clocksource, this should result in no functional change. Additionally, a timekeeping_init and timekeeping_resume function has been added to initialize and maintain some of the new timekeping state. 
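[Editor's sketch: the accumulation idea behind the clocksource-based update_wall_time(), in simplified hypothetical form; simple_clocksource and its fields below are stand-ins for the real clocksource members, not kernel API.]

#include <linux/types.h>

struct simple_clocksource {
	u64 (*read)(void);	/* free-running counter */
	u64 mask;		/* mask for the counter width */
	u64 interval_cycles;	/* counter cycles per tick-length interval */
	u64 interval_nsecs;	/* nanoseconds added per interval */
};

static u64 last_cycle;		/* counter value at the last accumulation */
static u64 wall_nsecs;		/* accumulated wall-clock nanoseconds */

static void accumulate_time(struct simple_clocksource *cs)
{
	u64 now = cs->read();
	u64 offset = (now - last_cycle) & cs->mask;	/* masking handles counter wrap */

	/* normally one pass; lost or late ticks simply mean more passes */
	while (offset >= cs->interval_cycles) {
		last_cycle += cs->interval_cycles;
		offset -= cs->interval_cycles;
		wall_nsecs += cs->interval_nsecs;
	}
}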
[hirofumi@mail.parknet.co.jp: fixlet] Signed-off-by: John Stultz Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/time/clocksource.c | 4 +- kernel/timer.c | 93 ++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 83 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index f6ef00f4f90..bc4b8a7161f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,6 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o +obj-y += time/ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_FUTEX) += futex.o ifeq ($(CONFIG_COMPAT),y) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 95dd2200a10..4288bfa12c3 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -56,7 +56,7 @@ static int finished_booting; * * Hack to avoid lots of clocksource churn at boot time */ -static int clocksource_done_booting(void) +static int __init clocksource_done_booting(void) { finished_booting = 1; return 0; @@ -289,7 +289,7 @@ static struct sys_device device_clocksource = { .cls = &clocksource_sysclass, }; -static int init_clocksource_sysfs(void) +static int __init init_clocksource_sysfs(void) { int error = sysdev_class_register(&clocksource_sysclass); diff --git a/kernel/timer.c b/kernel/timer.c index eb97371b87d..524c7f63836 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -792,24 +792,93 @@ u64 current_tick_length(void) return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; } +/* XXX - all of this timekeeping code should be later moved to time.c */ +#include +static struct clocksource *clock; /* pointer to current clocksource */ +static cycle_t last_clock_cycle; /* cycle value at last update_wall_time */ /* - * Using a loop looks inefficient, but "ticks" is - * usually just one (we shouldn't be losing ticks, - * we're doing this this way mainly for interrupt - * latency reasons, not because we think we'll - * have lots of lost timer ticks + * timekeeping_init - Initializes the clocksource and common timekeeping values */ -static void update_wall_time(unsigned long ticks) +void __init timekeeping_init(void) { - do { - ticks--; + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + clock = get_next_clocksource(); + calculate_clocksource_interval(clock, tick_nsec); + last_clock_cycle = read_clocksource(clock); + ntp_clear(); + write_sequnlock_irqrestore(&xtime_lock, flags); +} + + +/* + * timekeeping_resume - Resumes the generic timekeeping subsystem. + * @dev: unused + * + * This is for the generic clocksource timekeeping. + * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are + * still managed by arch specific suspend/resume code. 
+ */ +static int timekeeping_resume(struct sys_device *dev) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + /* restart the last cycle value */ + last_clock_cycle = read_clocksource(clock); + write_sequnlock_irqrestore(&xtime_lock, flags); + return 0; +} + +/* sysfs resume/suspend bits for timekeeping */ +static struct sysdev_class timekeeping_sysclass = { + .resume = timekeeping_resume, + set_kset_name("timekeeping"), +}; + +static struct sys_device device_timer = { + .id = 0, + .cls = &timekeeping_sysclass, +}; + +static int __init timekeeping_init_device(void) +{ + int error = sysdev_class_register(&timekeeping_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(timekeeping_init_device); + +/* + * update_wall_time - Uses the current clocksource to increment the wall time + * + * Called from the timer interrupt, must hold a write on xtime_lock. + */ +static void update_wall_time(void) +{ + cycle_t now, offset; + + now = read_clocksource(clock); + offset = (now - last_clock_cycle)&clock->mask; + + /* normally this loop will run just once, however in the + * case of lost or late ticks, it will accumulate correctly. + */ + while (offset > clock->interval_cycles) { + /* accumulate one interval */ + last_clock_cycle += clock->interval_cycles; + offset -= clock->interval_cycles; + update_wall_time_one_tick(); if (xtime.tv_nsec >= 1000000000) { xtime.tv_nsec -= 1000000000; xtime.tv_sec++; second_overflow(); } - } while (ticks); + } } /* @@ -915,10 +984,8 @@ static inline void update_times(void) unsigned long ticks; ticks = jiffies - wall_jiffies; - if (ticks) { - wall_jiffies += ticks; - update_wall_time(ticks); - } + wall_jiffies += ticks; + update_wall_time(); calc_load(ticks); } -- cgit From 260a42309b31cbc54eb4b6b85649e412bcad053f Mon Sep 17 00:00:00 2001 From: john stultz Date: Mon, 26 Jun 2006 00:25:07 -0700 Subject: [PATCH] Time: Let user request precision from current_tick_length() Change the current_tick_length() function so it takes an argument which specifies how much precision to return in shifted nanoseconds. This provides a simple way to convert between NTPs internal nanoseconds shifted by (SHIFT_SCALE - 10) to other shifted nanosecond units that are used by the clocksource abstraction. Signed-off-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 524c7f63836..623f9ea198d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -780,16 +780,29 @@ static void update_wall_time_one_tick(void) * Return how long ticks are at the moment, that is, how much time * update_wall_time_one_tick will add to xtime next time we call it * (assuming no calls to do_adjtimex in the meantime). - * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 - * bits to the right of the binary point. + * The return value is in fixed-point nanoseconds shifted by the + * specified number of bits to the right of the binary point. * This function has no side-effects. */ -u64 current_tick_length(void) +u64 current_tick_length(long shift) { long delta_nsec; + u64 ret; + /* calculate the finest interval NTP will allow. 
+ * ie: nanosecond value shifted by (SHIFT_SCALE - 10) + */ delta_nsec = tick_nsec + adjtime_adjustment() * 1000; - return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; + ret = ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; + + /* convert from (SHIFT_SCALE - 10) to specified shift scale: */ + shift = shift - (SHIFT_SCALE - 10); + if (shift < 0) + ret >>= -shift; + else + ret <<= shift; + + return ret; } /* XXX - all of this timekeeping code should be later moved to time.c */ -- cgit From 5eb6d20533d14a432df714520939a6181e28f099 Mon Sep 17 00:00:00 2001 From: john stultz Date: Mon, 26 Jun 2006 00:25:07 -0700 Subject: [PATCH] Time: Use clocksource abstraction for NTP adjustments Instead of incrementing xtime by tick_nsec + ntp adjustments, use the clocksource abstraction to increment and scale time. Using the clocksource abstraction allows other clocksources to be used consistently in the face of late or lost ticks, while preserving the existing behavior via the jiffies clocksource. This removes the need to keep time_phase adjustments as we just use the current_tick_length() function as the NTP interface and accumulate time using shifted nanoseconds. The basics of this design was by Roman Zippel, however it is my own interpretation and implementation, so the credit should go to him and the blame to me. Signed-off-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 623f9ea198d..6811436a031 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -597,7 +597,6 @@ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ long time_precision = 1; /* clock precision (us) */ long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -static long time_phase; /* phase offset (scaled us) */ long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; /* frequency offset (scaled ppm)*/ static long time_adj; /* tick adjust (scaled 1 / HZ) */ @@ -747,27 +746,14 @@ static long adjtime_adjustment(void) } /* in the NTP reference this is called "hardclock()" */ -static void update_wall_time_one_tick(void) +static void update_ntp_one_tick(void) { - long time_adjust_step, delta_nsec; + long time_adjust_step; time_adjust_step = adjtime_adjustment(); if (time_adjust_step) /* Reduce by this step the amount of time left */ time_adjust -= time_adjust_step; - delta_nsec = tick_nsec + time_adjust_step * 1000; - /* - * Advance the phase, once it gets to one microsecond, then - * advance the tick more. - */ - time_phase += time_adj; - if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { - long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); - time_phase -= ltemp << (SHIFT_SCALE - 10); - delta_nsec += ltemp; - } - xtime.tv_nsec += delta_nsec; - time_interpolator_update(delta_nsec); /* Changes by adjtime() do not take effect till next tick. 
*/ if (time_next_adjust != 0) { @@ -872,8 +858,13 @@ device_initcall(timekeeping_init_device); */ static void update_wall_time(void) { + static s64 remainder_snsecs, error; + s64 snsecs_per_sec; cycle_t now, offset; + snsecs_per_sec = (s64)NSEC_PER_SEC << clock->shift; + remainder_snsecs += (s64)xtime.tv_nsec << clock->shift; + now = read_clocksource(clock); offset = (now - last_clock_cycle)&clock->mask; @@ -881,17 +872,35 @@ static void update_wall_time(void) * case of lost or late ticks, it will accumulate correctly. */ while (offset > clock->interval_cycles) { + /* get the ntp interval in clock shifted nanoseconds */ + s64 ntp_snsecs = current_tick_length(clock->shift); + /* accumulate one interval */ + remainder_snsecs += clock->interval_snsecs; last_clock_cycle += clock->interval_cycles; offset -= clock->interval_cycles; - update_wall_time_one_tick(); - if (xtime.tv_nsec >= 1000000000) { - xtime.tv_nsec -= 1000000000; + /* interpolator bits */ + time_interpolator_update(clock->interval_snsecs + >> clock->shift); + /* increment the NTP state machine */ + update_ntp_one_tick(); + + /* accumulate error between NTP and clock interval */ + error += (ntp_snsecs - (s64)clock->interval_snsecs); + + /* correct the clock when NTP error is too big */ + remainder_snsecs += make_ntp_adj(clock, offset, &error); + + if (remainder_snsecs >= snsecs_per_sec) { + remainder_snsecs -= snsecs_per_sec; xtime.tv_sec++; second_overflow(); } } + /* store full nanoseconds into xtime */ + xtime.tv_nsec = remainder_snsecs >> clock->shift; + remainder_snsecs -= (s64)xtime.tv_nsec << clock->shift; } /* -- cgit From cf3c769b4b0dd1146da84d5cf045dcfe53bd0f13 Mon Sep 17 00:00:00 2001 From: john stultz Date: Mon, 26 Jun 2006 00:25:08 -0700 Subject: [PATCH] Time: Introduce arch generic time accessors Introduces clocksource switching code and the arch generic time accessor functions that use the clocksource infrastructure. Signed-off-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time.c | 2 + kernel/timer.c | 170 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 172 insertions(+) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index b00ddc71ced..5bd48974764 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday); #else +#ifndef CONFIG_GENERIC_TIME /* * Simulate gettimeofday using do_gettimeofday which only allows a timeval * and therefore only yields usec accuracy @@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv) } EXPORT_SYMBOL_GPL(getnstimeofday); #endif +#endif /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 diff --git a/kernel/timer.c b/kernel/timer.c index 6811436a031..e5adb9e2e7a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -795,6 +795,169 @@ u64 current_tick_length(long shift) #include static struct clocksource *clock; /* pointer to current clocksource */ static cycle_t last_clock_cycle; /* cycle value at last update_wall_time */ + +#ifdef CONFIG_GENERIC_TIME +/** + * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook + * + * private function, must hold xtime_lock lock when being + * called. 
Returns the number of nanoseconds since the + * last call to update_wall_time() (adjusted by NTP scaling) + */ +static inline s64 __get_nsec_offset(void) +{ + cycle_t cycle_now, cycle_delta; + s64 ns_offset; + + /* read clocksource: */ + cycle_now = read_clocksource(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - last_clock_cycle) & clock->mask; + + /* convert to nanoseconds: */ + ns_offset = cyc2ns(clock, cycle_delta); + + return ns_offset; +} + +/** + * __get_realtime_clock_ts - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. Used by + * do_gettimeofday() and get_realtime_clock_ts(). + */ +static inline void __get_realtime_clock_ts(struct timespec *ts) +{ + unsigned long seq; + s64 nsecs; + + do { + seq = read_seqbegin(&xtime_lock); + + *ts = xtime; + nsecs = __get_nsec_offset(); + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); +} + +/** + * get_realtime_clock_ts - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. + */ +void getnstimeofday(struct timespec *ts) +{ + __get_realtime_clock_ts(ts); +} + +EXPORT_SYMBOL(getnstimeofday); + +/** + * do_gettimeofday - Returns the time of day in a timeval + * @tv: pointer to the timeval to be set + * + * NOTE: Users should be converted to using get_realtime_clock_ts() + */ +void do_gettimeofday(struct timeval *tv) +{ + struct timespec now; + + __get_realtime_clock_ts(&now); + tv->tv_sec = now.tv_sec; + tv->tv_usec = now.tv_nsec/1000; +} + +EXPORT_SYMBOL(do_gettimeofday); +/** + * do_settimeofday - Sets the time of day + * @tv: pointer to the timespec variable containing the new time + * + * Sets the time of day to the new time and update NTP and notify hrtimers + */ +int do_settimeofday(struct timespec *tv) +{ + unsigned long flags; + time_t wtm_sec, sec = tv->tv_sec; + long wtm_nsec, nsec = tv->tv_nsec; + + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irqsave(&xtime_lock, flags); + + nsec -= __get_nsec_offset(); + + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + + set_normalized_timespec(&xtime, sec, nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + + ntp_clear(); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return 0; +} + +EXPORT_SYMBOL(do_settimeofday); + +/** + * change_clocksource - Swaps clocksources if a new one is available + * + * Accumulates current time interval and initializes new clocksource + */ +static int change_clocksource(void) +{ + struct clocksource *new; + cycle_t now; + u64 nsec; + new = get_next_clocksource(); + if (clock != new) { + now = read_clocksource(new); + nsec = __get_nsec_offset(); + timespec_add_ns(&xtime, nsec); + + clock = new; + last_clock_cycle = now; + printk(KERN_INFO "Time: %s clocksource has been installed.\n", + clock->name); + return 1; + } else if (clock->update_callback) { + return clock->update_callback(); + } + return 0; +} +#else +#define change_clocksource() (0) +#endif + +/** + * timeofday_is_continuous - check to see if timekeeping is free running + */ +int timekeeping_is_continuous(void) +{ + unsigned long seq; + int ret; + + do { + seq = read_seqbegin(&xtime_lock); + + ret = clock->is_continuous; + + } while (read_seqretry(&xtime_lock, seq)); + + 
return ret; +} + /* * timekeeping_init - Initializes the clocksource and common timekeeping values */ @@ -901,6 +1064,13 @@ static void update_wall_time(void) /* store full nanoseconds into xtime */ xtime.tv_nsec = remainder_snsecs >> clock->shift; remainder_snsecs -= (s64)xtime.tv_nsec << clock->shift; + + /* check to see if there is a new clocksource to use */ + if (change_clocksource()) { + error = 0; + remainder_snsecs = 0; + calculate_clocksource_interval(clock, tick_nsec); + } } /* -- cgit From 5d0cf410e94b1f1ff852c3f210d22cc6c5a27ffa Mon Sep 17 00:00:00 2001 From: john stultz Date: Mon, 26 Jun 2006 00:25:12 -0700 Subject: [PATCH] Time: i386 Clocksource Drivers Implement the time sources for i386 (acpi_pm, cyclone, hpet, pit, and tsc). With this patch, the conversion of the i386 arch to the generic timekeeping code should be complete. The patch should be fairly straight forward, only adding the new clocksources. [hirofumi@mail.parknet.co.jp: acpi_pm cleanup] Signed-off-by: John Stultz Signed-off-by: Adrian Bunk Signed-off-by: Paul Mundt Signed-off-by: John Stultz Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/clocksource.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 4288bfa12c3..a9f387ea83b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -174,7 +174,7 @@ EXPORT_SYMBOL(register_clocksource); * reselect_clocksource - Rescan list for next clocksource * * A quick helper function to be used if a clocksource changes its - * rating. Forces the clocksource list to be re-scaned for the best + * rating. Forces the clocksource list to be re-scanned for the best * clocksource. */ void reselect_clocksource(void) @@ -336,8 +336,13 @@ __setup("clocksource=", boot_override_clocksource); */ static int __init boot_override_clock(char* str) { - printk("Warning! clock= boot option is deprecated.\n"); - + if (!strcmp(str, "pmtmr")) { + printk("Warning: clock=pmtmr is deprecated. " + "Use clocksource=acpi_pm.\n"); + return boot_override_clocksource("acpi_pm"); + } + printk("Warning! clock= boot option is deprecated. " + "Use clocksource=xyz\n"); return boot_override_clocksource(str); } -- cgit From a275254975a29c51929ee175b92ac471ac2a0043 Mon Sep 17 00:00:00 2001 From: john stultz Date: Mon, 26 Jun 2006 00:25:14 -0700 Subject: [PATCH] time: rename clocksource functions As suggested by Roman Zippel, change clocksource functions to use clocksource_xyz rather then xyz_clocksource to avoid polluting the namespace. 
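[Editor's sketch: what a clocksource driver looks like under the renamed API; the "example" clocksource below is hypothetical and abbreviated, with the pre-rename call kept as a comment.]

#include <linux/module.h>
#include <linux/init.h>
#include <linux/clocksource.h>

static cycle_t example_read(void)
{
	return 0;	/* a real driver would read its hardware counter here */
}

static struct clocksource clocksource_example = {
	.name	= "example",
	.rating	= 100,
	.read	= example_read,
	.mask	= 0xffffffff,
	/* a real driver also fills in .mult and .shift for cycle-to-ns conversion */
};

static int __init example_clocksource_init(void)
{
	/* was: return register_clocksource(&clocksource_example); */
	return clocksource_register(&clocksource_example);
}
module_init(example_clocksource_init);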
Signed-off-by: John Stultz Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/clocksource.c | 16 ++++++++-------- kernel/time/jiffies.c | 2 +- kernel/timer.c | 20 ++++++++++---------- 3 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index a9f387ea83b..74eca5939bd 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -65,10 +65,10 @@ static int __init clocksource_done_booting(void) late_initcall(clocksource_done_booting); /** - * get_next_clocksource - Returns the selected clocksource + * clocksource_get_next - Returns the selected clocksource * */ -struct clocksource *get_next_clocksource(void) +struct clocksource *clocksource_get_next(void) { unsigned long flags; @@ -142,12 +142,12 @@ static int is_registered_source(struct clocksource *c) } /** - * register_clocksource - Used to install new clocksources + * clocksource_register - Used to install new clocksources * @t: clocksource to be registered * * Returns -EBUSY if registration fails, zero otherwise. */ -int register_clocksource(struct clocksource *c) +int clocksource_register(struct clocksource *c) { int ret = 0; unsigned long flags; @@ -167,17 +167,16 @@ int register_clocksource(struct clocksource *c) spin_unlock_irqrestore(&clocksource_lock, flags); return ret; } - -EXPORT_SYMBOL(register_clocksource); +EXPORT_SYMBOL(clocksource_register); /** - * reselect_clocksource - Rescan list for next clocksource + * clocksource_reselect - Rescan list for next clocksource * * A quick helper function to be used if a clocksource changes its * rating. Forces the clocksource list to be re-scanned for the best * clocksource. */ -void reselect_clocksource(void) +void clocksource_reselect(void) { unsigned long flags; @@ -185,6 +184,7 @@ void reselect_clocksource(void) next_clocksource = select_clocksource(); spin_unlock_irqrestore(&clocksource_lock, flags); } +EXPORT_SYMBOL(clocksource_reselect); /** * sysfs_show_current_clocksources - sysfs interface for current clocksource diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 1fe8376e717..126bb30c4af 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -67,7 +67,7 @@ struct clocksource clocksource_jiffies = { static int __init init_jiffies_clocksource(void) { - return register_clocksource(&clocksource_jiffies); + return clocksource_register(&clocksource_jiffies); } module_init(init_jiffies_clocksource); diff --git a/kernel/timer.c b/kernel/timer.c index e5adb9e2e7a..890a56937cf 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -810,7 +810,7 @@ static inline s64 __get_nsec_offset(void) s64 ns_offset; /* read clocksource: */ - cycle_now = read_clocksource(clock); + cycle_now = clocksource_read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - last_clock_cycle) & clock->mask; @@ -845,7 +845,7 @@ static inline void __get_realtime_clock_ts(struct timespec *ts) } /** - * get_realtime_clock_ts - Returns the time of day in a timespec + * getnstimeofday - Returns the time of day in a timespec * @ts: pointer to the timespec to be set * * Returns the time of day in a timespec. 
@@ -920,9 +920,9 @@ static int change_clocksource(void) struct clocksource *new; cycle_t now; u64 nsec; - new = get_next_clocksource(); + new = clocksource_get_next(); if (clock != new) { - now = read_clocksource(new); + now = clocksource_read(new); nsec = __get_nsec_offset(); timespec_add_ns(&xtime, nsec); @@ -966,9 +966,9 @@ void __init timekeeping_init(void) unsigned long flags; write_seqlock_irqsave(&xtime_lock, flags); - clock = get_next_clocksource(); - calculate_clocksource_interval(clock, tick_nsec); - last_clock_cycle = read_clocksource(clock); + clock = clocksource_get_next(); + clocksource_calculate_interval(clock, tick_nsec); + last_clock_cycle = clocksource_read(clock); ntp_clear(); write_sequnlock_irqrestore(&xtime_lock, flags); } @@ -988,7 +988,7 @@ static int timekeeping_resume(struct sys_device *dev) write_seqlock_irqsave(&xtime_lock, flags); /* restart the last cycle value */ - last_clock_cycle = read_clocksource(clock); + last_clock_cycle = clocksource_read(clock); write_sequnlock_irqrestore(&xtime_lock, flags); return 0; } @@ -1028,7 +1028,7 @@ static void update_wall_time(void) snsecs_per_sec = (s64)NSEC_PER_SEC << clock->shift; remainder_snsecs += (s64)xtime.tv_nsec << clock->shift; - now = read_clocksource(clock); + now = clocksource_read(clock); offset = (now - last_clock_cycle)&clock->mask; /* normally this loop will run just once, however in the @@ -1069,7 +1069,7 @@ static void update_wall_time(void) if (change_clocksource()) { error = 0; remainder_snsecs = 0; - calculate_clocksource_interval(clock, tick_nsec); + clocksource_calculate_interval(clock, tick_nsec); } } -- cgit From 19923c190e0932bf0ac1e1d06a48f5c3678dd0de Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Mon, 26 Jun 2006 00:25:18 -0700 Subject: [PATCH] fix and optimize clock source update This fixes the clock source updates in update_wall_time() to correctly track the time coming in via current_tick_length(). Optimize the fast paths to be as short as possible to keep the overhead low. Signed-off-by: Roman Zippel Acked-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 151 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 109 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 890a56937cf..5bb6b7976ee 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -770,7 +770,7 @@ static void update_ntp_one_tick(void) * specified number of bits to the right of the binary point. * This function has no side-effects. 
*/ -u64 current_tick_length(long shift) +u64 current_tick_length(void) { long delta_nsec; u64 ret; @@ -779,14 +779,8 @@ u64 current_tick_length(long shift) * ie: nanosecond value shifted by (SHIFT_SCALE - 10) */ delta_nsec = tick_nsec + adjtime_adjustment() * 1000; - ret = ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; - - /* convert from (SHIFT_SCALE - 10) to specified shift scale: */ - shift = shift - (SHIFT_SCALE - 10); - if (shift < 0) - ret >>= -shift; - else - ret <<= shift; + ret = (u64)delta_nsec << TICK_LENGTH_SHIFT; + ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10)); return ret; } @@ -794,7 +788,6 @@ u64 current_tick_length(long shift) /* XXX - all of this timekeeping code should be later moved to time.c */ #include static struct clocksource *clock; /* pointer to current clocksource */ -static cycle_t last_clock_cycle; /* cycle value at last update_wall_time */ #ifdef CONFIG_GENERIC_TIME /** @@ -813,7 +806,7 @@ static inline s64 __get_nsec_offset(void) cycle_now = clocksource_read(clock); /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - last_clock_cycle) & clock->mask; + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ ns_offset = cyc2ns(clock, cycle_delta); @@ -927,7 +920,7 @@ static int change_clocksource(void) timespec_add_ns(&xtime, nsec); clock = new; - last_clock_cycle = now; + clock->cycle_last = now; printk(KERN_INFO "Time: %s clocksource has been installed.\n", clock->name); return 1; @@ -968,7 +961,7 @@ void __init timekeeping_init(void) write_seqlock_irqsave(&xtime_lock, flags); clock = clocksource_get_next(); clocksource_calculate_interval(clock, tick_nsec); - last_clock_cycle = clocksource_read(clock); + clock->cycle_last = clocksource_read(clock); ntp_clear(); write_sequnlock_irqrestore(&xtime_lock, flags); } @@ -988,7 +981,7 @@ static int timekeeping_resume(struct sys_device *dev) write_seqlock_irqsave(&xtime_lock, flags); /* restart the last cycle value */ - last_clock_cycle = clocksource_read(clock); + clock->cycle_last = clocksource_read(clock); write_sequnlock_irqrestore(&xtime_lock, flags); return 0; } @@ -1014,6 +1007,81 @@ static int __init timekeeping_init_device(void) device_initcall(timekeeping_init_device); +/* + * If the error is already larger, we look ahead another tick, + * to compensate for late or lost adjustments. + */ +static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset) +{ + int adj; + + /* + * As soon as the machine is synchronized to the external time + * source this should be the common case. + */ + error >>= 2; + if (likely(sign > 0 ? error <= *interval : error >= *interval)) + return sign; + + /* + * An extra look ahead dampens the effect of the current error, + * which can grow quite large with continously late updates, as + * it would dominate the adjustment value and can lead to + * oscillation. + */ + error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); + error -= clock->xtime_interval >> 1; + + adj = 0; + while (1) { + error >>= 1; + if (sign > 0 ? error <= *interval : error >= *interval) + break; + adj++; + } + + /* + * Add the current adjustments to the error and take the offset + * into account, the latter can cause the error to be hardly + * reduced at the next tick. Check the error again if there's + * room for another adjustment, thus further reducing the error + * which otherwise had to be corrected at the next update. 
+ */ + error = (error << 1) - *interval + *offset; + if (sign > 0 ? error > *interval : error < *interval) + adj++; + + *interval <<= adj; + *offset <<= adj; + return sign << adj; +} + +/* + * Adjust the multiplier to reduce the error value, + * this is optimized for the most common adjustments of -1,0,1, + * for other values we can do a bit more work. + */ +static void clocksource_adjust(struct clocksource *clock, s64 offset) +{ + s64 error, interval = clock->cycle_interval; + int adj; + + error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); + if (error > interval) { + adj = clocksource_bigadjust(1, error, &interval, &offset); + } else if (error < -interval) { + interval = -interval; + offset = -offset; + adj = clocksource_bigadjust(-1, error, &interval, &offset); + } else + return; + + clock->mult += adj; + clock->xtime_interval += interval; + clock->xtime_nsec -= offset; + clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); +} + /* * update_wall_time - Uses the current clocksource to increment the wall time * @@ -1021,54 +1089,53 @@ device_initcall(timekeeping_init_device); */ static void update_wall_time(void) { - static s64 remainder_snsecs, error; - s64 snsecs_per_sec; - cycle_t now, offset; + cycle_t offset; - snsecs_per_sec = (s64)NSEC_PER_SEC << clock->shift; - remainder_snsecs += (s64)xtime.tv_nsec << clock->shift; + clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; - now = clocksource_read(clock); - offset = (now - last_clock_cycle)&clock->mask; +#ifdef CONFIG_GENERIC_TIME + offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; +#else + offset = clock->cycle_interval; +#endif /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. 
*/ - while (offset > clock->interval_cycles) { - /* get the ntp interval in clock shifted nanoseconds */ - s64 ntp_snsecs = current_tick_length(clock->shift); - + while (offset >= clock->cycle_interval) { /* accumulate one interval */ - remainder_snsecs += clock->interval_snsecs; - last_clock_cycle += clock->interval_cycles; - offset -= clock->interval_cycles; + clock->xtime_nsec += clock->xtime_interval; + clock->cycle_last += clock->cycle_interval; + offset -= clock->cycle_interval; + + if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { + clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; + xtime.tv_sec++; + second_overflow(); + } /* interpolator bits */ - time_interpolator_update(clock->interval_snsecs + time_interpolator_update(clock->xtime_interval >> clock->shift); /* increment the NTP state machine */ update_ntp_one_tick(); /* accumulate error between NTP and clock interval */ - error += (ntp_snsecs - (s64)clock->interval_snsecs); + clock->error += current_tick_length(); + clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); + } - /* correct the clock when NTP error is too big */ - remainder_snsecs += make_ntp_adj(clock, offset, &error); + /* correct the clock when NTP error is too big */ + clocksource_adjust(clock, offset); - if (remainder_snsecs >= snsecs_per_sec) { - remainder_snsecs -= snsecs_per_sec; - xtime.tv_sec++; - second_overflow(); - } - } /* store full nanoseconds into xtime */ - xtime.tv_nsec = remainder_snsecs >> clock->shift; - remainder_snsecs -= (s64)xtime.tv_nsec << clock->shift; + xtime.tv_nsec = clock->xtime_nsec >> clock->shift; + clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; /* check to see if there is a new clocksource to use */ if (change_clocksource()) { - error = 0; - remainder_snsecs = 0; + clock->error = 0; + clock->xtime_nsec = 0; clocksource_calculate_interval(clock, tick_nsec); } } -- cgit From 36721656776f177280ccb50477a02e86e6444292 Mon Sep 17 00:00:00 2001 From: "mao, bibo" Date: Mon, 26 Jun 2006 00:25:22 -0700 Subject: [PATCH] Kprobe: multi kprobe posthandler for booster If there are multi kprobes on the same probepoint, there will be one extra aggr_kprobe on the head of kprobe list. The aggr_kprobe has aggr_post_handler/aggr_break_handler whether the other kprobe post_hander/break_handler is NULL or not. This patch modifies this, only when there is one or more kprobe in the list whose post_handler is not NULL, post_handler of aggr_kprobe will be set as aggr_post_handler. 
[soshima@redhat.com: !CONFIG_PREEMPT fix] Signed-off-by: bibo, mao Cc: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: "Keshavamurthy, Anil S" Cc: Prasanna S Panchamukhi Cc: Jim Keniston Cc: Yumiko Sugita Cc: Hideo Aoki Signed-off-by: Satoshi Oshima Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1fbf466a29a..f095178e48c 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -368,16 +368,15 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) */ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) { - struct kprobe *kp; - if (p->break_handler) { - list_for_each_entry_rcu(kp, &old_p->list, list) { - if (kp->break_handler) - return -EEXIST; - } + if (old_p->break_handler) + return -EEXIST; list_add_tail_rcu(&p->list, &old_p->list); + old_p->break_handler = aggr_break_handler; } else list_add_rcu(&p->list, &old_p->list); + if (p->post_handler && !old_p->post_handler) + old_p->post_handler = aggr_post_handler; return 0; } @@ -390,9 +389,11 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) copy_kprobe(p, ap); ap->addr = p->addr; ap->pre_handler = aggr_pre_handler; - ap->post_handler = aggr_post_handler; ap->fault_handler = aggr_fault_handler; - ap->break_handler = aggr_break_handler; + if (p->post_handler) + ap->post_handler = aggr_post_handler; + if (p->break_handler) + ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); list_add_rcu(&p->list, &ap->list); @@ -536,6 +537,21 @@ valid_p: kfree(old_p); } arch_remove_kprobe(p); + } else { + mutex_lock(&kprobe_mutex); + if (p->break_handler) + old_p->break_handler = NULL; + if (p->post_handler){ + list_for_each_entry_rcu(list_p, &old_p->list, list){ + if (list_p->post_handler){ + cleanup_p = 2; + break; + } + } + if (cleanup_p == 0) + old_p->post_handler = NULL; + } + mutex_unlock(&kprobe_mutex); } } -- cgit From 3d5631e0631a11633c649bc995a6537ec21b67b4 Mon Sep 17 00:00:00 2001 From: Anil S Keshavamurthy Date: Mon, 26 Jun 2006 00:25:28 -0700 Subject: [PATCH] Kprobes registers for notify page fault Kprobes now registers for page fault notifications. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f095178e48c..507f26e7ae7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -556,6 +556,11 @@ valid_p: } static struct notifier_block kprobe_exceptions_nb = { + .notifier_call = kprobe_exceptions_notify, + .priority = 0x7fffffff /* we need to be notified first */ +}; + +static struct notifier_block kprobe_page_fault_nb = { .notifier_call = kprobe_exceptions_notify, .priority = 0x7fffffff /* we need to notified first */ }; @@ -673,6 +678,9 @@ static int __init init_kprobes(void) if (!err) err = register_die_notifier(&kprobe_exceptions_nb); + if (!err) + err = register_page_fault_notifier(&kprobe_page_fault_nb); + return err; } -- cgit From e6f47f978bcd5413fff610613b18e9e0eab9bc1b Mon Sep 17 00:00:00 2001 From: Anil S Keshavamurthy Date: Mon, 26 Jun 2006 00:25:29 -0700 Subject: [PATCH] Notify page fault call chain With this patch Kprobes now registers for page fault notifications only when their is an active probe registered. 
Once all the active probes are unregistered their is no need to be notified of page faults and kprobes unregisters itself from the page fault notifications. Hence we will have ZERO side effects when no probes are active. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 507f26e7ae7..64aab081153 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -47,11 +47,17 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; +static atomic_t kprobe_count; DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; +static struct notifier_block kprobe_page_fault_nb = { + .notifier_call = kprobe_exceptions_notify, + .priority = 0x7fffffff /* we need to notified first */ +}; + #ifdef __ARCH_WANT_KPROBES_INSN_SLOT /* * kprobe->ainsn.insn points to the copy of the instruction to be @@ -465,6 +471,8 @@ static int __kprobes __register_kprobe(struct kprobe *p, old_p = get_kprobe(p->addr); if (old_p) { ret = register_aggr_kprobe(old_p, p); + if (!ret) + atomic_inc(&kprobe_count); goto out; } @@ -475,6 +483,10 @@ static int __kprobes __register_kprobe(struct kprobe *p, hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); + if (atomic_add_return(1, &kprobe_count) == \ + (ARCH_INACTIVE_KPROBE_COUNT + 1)) + register_page_fault_notifier(&kprobe_page_fault_nb); + arch_arm_kprobe(p); out: @@ -553,6 +565,16 @@ valid_p: } mutex_unlock(&kprobe_mutex); } + + /* Call unregister_page_fault_notifier() + * if no probes are active + */ + mutex_lock(&kprobe_mutex); + if (atomic_add_return(-1, &kprobe_count) == \ + ARCH_INACTIVE_KPROBE_COUNT) + unregister_page_fault_notifier(&kprobe_page_fault_nb); + mutex_unlock(&kprobe_mutex); + return; } static struct notifier_block kprobe_exceptions_nb = { @@ -560,10 +582,6 @@ static struct notifier_block kprobe_exceptions_nb = { .priority = 0x7fffffff /* we need to be notified first */ }; -static struct notifier_block kprobe_page_fault_nb = { - .notifier_call = kprobe_exceptions_notify, - .priority = 0x7fffffff /* we need to notified first */ -}; int __kprobes register_jprobe(struct jprobe *jp) { @@ -673,14 +691,12 @@ static int __init init_kprobes(void) INIT_HLIST_HEAD(&kprobe_table[i]); INIT_HLIST_HEAD(&kretprobe_inst_table[i]); } + atomic_set(&kprobe_count, 0); err = arch_init_kprobes(); if (!err) err = register_die_notifier(&kprobe_exceptions_nb); - if (!err) - err = register_page_fault_notifier(&kprobe_page_fault_nb); - return err; } -- cgit From 48e6484d49020dba3578ad117b461e8a391e8f0f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 26 Jun 2006 00:25:48 -0700 Subject: [PATCH] proc: Rewrite the proc dentry flush on exit optimization To keep the dcache from filling up with dead /proc entries we flush them on process exit. However over the years that code has gotten hairy with a dentry_pointer and a lock in task_struct and misdocumented as a correctness feature. I have rewritten this code to look and see if we have a corresponding entry in the dcache and if so flush it on process exit. 
This removes the extra fields in the task_struct and allows me to trivially handle the case of a /proc//task/ entry as well as the current /proc/ entries. Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 7 +------ kernel/fork.c | 3 --- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index e76bd02e930..304ef637be6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -137,12 +137,8 @@ void release_task(struct task_struct * p) { int zap_leader; task_t *leader; - struct dentry *proc_dentry; - repeat: atomic_dec(&p->user->processes); - spin_lock(&p->proc_lock); - proc_dentry = proc_pid_unhash(p); write_lock_irq(&tasklist_lock); ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); @@ -171,8 +167,7 @@ repeat: sched_exit(p); write_unlock_irq(&tasklist_lock); - spin_unlock(&p->proc_lock); - proc_pid_flush(proc_dentry); + proc_flush_task(p); release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); diff --git a/kernel/fork.c b/kernel/fork.c index dfd10cb370c..79e91046f36 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -993,13 +993,10 @@ static task_t *copy_process(unsigned long clone_flags, if (put_user(p->pid, parent_tidptr)) goto bad_fork_cleanup; - p->proc_dentry = NULL; - INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); - spin_lock_init(&p->proc_lock); clear_tsk_thread_flag(p, TIF_SIGPENDING); init_sigpending(&p->pending); -- cgit From 99f895518368252ba862cc15ce4eb98ebbe1bec6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 26 Jun 2006 00:25:55 -0700 Subject: [PATCH] proc: don't lock task_structs indefinitely Every inode in /proc holds a reference to a struct task_struct. If a directory or file is opened and remains open after the task exits, this pinning continues. With 8K stacks on a 32bit machine the amount pinned per file descriptor is about 10K. Normally I would figure a reasonable per user process limit is about 100 processes. With 80 processes, each with 1000 file descriptors, I can trigger the OOM killer on a 32bit kernel, because I have pinned about 800MB of useless data. This patch replaces the struct task_struct pointer with a pointer to a struct task_ref which has a struct task_struct pointer. So the pinning of dead tasks does not happen. The code now has to contend with the fact that the task may now exit at any time. Which is a little, but not much, more complicated. With this change it takes about 1000 processes each opening up 1000 file descriptors before I can trigger the OOM killer. Much better. [mlp@google.com: task_mmu small fixes] Signed-off-by: Eric W. 
Biederman Cc: Trond Myklebust Cc: Paul Jackson Cc: Oleg Nesterov Cc: Albert Cahalan Signed-off-by: Prasanna Meda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b602f73fb38..3e991c0c02e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -2442,31 +2443,43 @@ void __cpuset_memory_pressure_bump(void) */ static int proc_cpuset_show(struct seq_file *m, void *v) { + struct task_ref *tref; struct task_struct *tsk; char *buf; - int retval = 0; + int retval; + retval = -ENOMEM; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) - return -ENOMEM; + goto out; + + retval = -ESRCH; + tref = m->private; + tsk = get_tref_task(tref); + if (!tsk) + goto out_free; - tsk = m->private; + retval = -EINVAL; mutex_lock(&manage_mutex); + retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); if (retval < 0) - goto out; + goto out_unlock; seq_puts(m, buf); seq_putc(m, '\n'); -out: +out_unlock: mutex_unlock(&manage_mutex); + put_task_struct(tsk); +out_free: kfree(buf); +out: return retval; } static int cpuset_open(struct inode *inode, struct file *file) { - struct task_struct *tsk = PROC_I(inode)->task; - return single_open(file, proc_cpuset_show, tsk); + struct task_ref *tref = PROC_I(inode)->tref; + return single_open(file, proc_cpuset_show, tref); } struct file_operations proc_cpuset_operations = { -- cgit From 13b41b09491e5d75e8027dca1ee78f5e073bc4c0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 26 Jun 2006 00:25:56 -0700 Subject: [PATCH] proc: Use struct pid not struct task_ref Incrementally update my proc-dont-lock-task_structs-indefinitely patches so that they work with struct pid instead of struct task_ref. Mostly this is a straight 1-1 substitution. Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3e991c0c02e..1535af3a912 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -50,7 +50,6 @@ #include #include #include -#include #include #include @@ -2443,7 +2442,7 @@ void __cpuset_memory_pressure_bump(void) */ static int proc_cpuset_show(struct seq_file *m, void *v) { - struct task_ref *tref; + struct pid *pid; struct task_struct *tsk; char *buf; int retval; @@ -2454,8 +2453,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v) goto out; retval = -ESRCH; - tref = m->private; - tsk = get_tref_task(tref); + pid = m->private; + tsk = get_pid_task(pid, PIDTYPE_PID); if (!tsk) goto out_free; @@ -2478,8 +2477,8 @@ out: static int cpuset_open(struct inode *inode, struct file *file) { - struct task_ref *tref = PROC_I(inode)->tref; - return single_open(file, proc_cpuset_show, tref); + struct pid *pid = PROC_I(inode)->pid; + return single_open(file, proc_cpuset_show, pid); } struct file_operations proc_cpuset_operations = { -- cgit From df26c40e567356caeefe2861311e19c54444d917 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 26 Jun 2006 00:25:59 -0700 Subject: [PATCH] proc: Cleanup proc_fd_access_allowed In process of getting proc_fd_access_allowed to work it has developed a few warts. In particular the special case that always allows introspection and the special case to allow inspection of kernel threads. 
The special case for introspection is needed for /proc/self/mem. The special case for kernel threads really should be overridable by security modules. So consolidate these checks into ptrace.c:may_attach(). The check to always allow introspection is trivial. The check to allow access to kernel threads, and zombies is a little trickier. mem_read and mem_write already verify an mm exists so it isn't needed twice. proc_fd_access_allowed only doesn't want a check to verify task->mm exits, s it prevents all access to kernel threads. So just move the task->mm check into ptrace_attach where it is needed for practical reasons. I did a quick audit and none of the security modules in the kernel seem to care if they are passed a task without an mm into security_ptrace. So the above move should be safe and it allows security modules to come up with more restrictive policy. Signed-off-by: Eric W. Biederman Cc: Stephen Smalley Cc: Chris Wright Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 921c22ad16e..6252d2fa2bf 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -120,8 +120,18 @@ int ptrace_check_attach(struct task_struct *child, int kill) static int may_attach(struct task_struct *task) { - if (!task->mm) - return -EPERM; + /* May we inspect the given task? + * This check is used both for attaching with ptrace + * and for allowing access to sensitive information in /proc. + * + * ptrace_attach denies several cases that /proc allows + * because setting up the necessary parent/child relationship + * or halting the specified task is impossible. + */ + int dumpable = 0; + /* Don't let security modules deny introspection */ + if (task == current) + return 0; if (((current->uid != task->euid) || (current->uid != task->suid) || (current->uid != task->uid) || @@ -130,7 +140,9 @@ static int may_attach(struct task_struct *task) (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) return -EPERM; smp_rmb(); - if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) + if (task->mm) + dumpable = task->mm->dumpable; + if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; return security_ptrace(current, task); @@ -176,6 +188,8 @@ repeat: goto repeat; } + if (!task->mm) + goto bad; /* the same process cannot be attached many times */ if (task->ptrace & PT_PTRACED) goto bad; -- cgit From d5f70c00ad24cd1158d3678b44ff969b4c971d49 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 26 Jun 2006 00:26:07 -0700 Subject: [PATCH] coredump: kill ptrace related stuff With this patch zap_process() sets SIGNAL_GROUP_EXIT while sending SIGKILL to the thread group. This means that a TASK_TRACED task 1. Will be awakened by signal_wake_up(1) 2. Can't sleep again via ptrace_notify() 3. Can't go to do_signal_stop() after return from ptrace_stop() in get_signal_to_deliver() So we can remove all ptrace related stuff from coredump path. Signed-off-by: Oleg Nesterov Cc: "Eric W. 
Biederman" Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 3 ++- kernel/signal.c | 35 ++++++++++++++++++++++++++++++----- 2 files changed, 32 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6252d2fa2bf..335c5b932e1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -214,7 +214,7 @@ out: return retval; } -void __ptrace_detach(struct task_struct *child, unsigned int data) +static inline void __ptrace_detach(struct task_struct *child, unsigned int data) { child->exit_code = data; /* .. re-parent .. */ @@ -233,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) ptrace_disable(child); write_lock_irq(&tasklist_lock); + /* protect against de_thread()->release_task() */ if (child->ptrace) __ptrace_detach(child, data); write_unlock_irq(&tasklist_lock); diff --git a/kernel/signal.c b/kernel/signal.c index 1b3c921737e..52adf53929f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1531,6 +1531,35 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) spin_unlock_irqrestore(&sighand->siglock, flags); } +static inline int may_ptrace_stop(void) +{ + if (!likely(current->ptrace & PT_PTRACED)) + return 0; + + if (unlikely(current->parent == current->real_parent && + (current->ptrace & PT_ATTACHED))) + return 0; + + if (unlikely(current->signal == current->parent->signal) && + unlikely(current->signal->flags & SIGNAL_GROUP_EXIT)) + return 0; + + /* + * Are we in the middle of do_coredump? + * If so and our tracer is also part of the coredump stopping + * is a deadlock situation, and pointless because our tracer + * is dead so don't allow us to stop. + * If SIGKILL was already sent before the caller unlocked + * ->siglock we must see ->core_waiters != 0. Otherwise it + * is safe to enter schedule(). + */ + if (unlikely(current->mm->core_waiters) && + unlikely(current->mm == current->parent->mm)) + return 0; + + return 1; +} + /* * This must be called with current->sighand->siglock held. * @@ -1559,11 +1588,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) spin_unlock_irq(¤t->sighand->siglock); try_to_freeze(); read_lock(&tasklist_lock); - if (likely(current->ptrace & PT_PTRACED) && - likely(current->parent != current->real_parent || - !(current->ptrace & PT_ATTACHED)) && - (likely(current->parent->signal != current->signal) || - !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { + if (may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); read_unlock(&tasklist_lock); schedule(); -- cgit From cf2dfbfbf4c5cb489ea12defd85a484307b955b8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 26 Jun 2006 00:26:10 -0700 Subject: [PATCH] coredump: copy_process: don't check SIGNAL_GROUP_EXIT After the previous patch SIGNAL_GROUP_EXIT implies a pending SIGKILL, we can remove this check from copy_process() because we already checked !signal_pending(). 
Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 79e91046f36..9b4e54ef022 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1156,18 +1156,6 @@ static task_t *copy_process(unsigned long clone_flags, } if (clone_flags & CLONE_THREAD) { - /* - * Important: if an exit-all has been started then - * do not create this new thread - the whole thread - * group is supposed to exit anyway. - */ - if (current->signal->flags & SIGNAL_GROUP_EXIT) { - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); - retval = -EAGAIN; - goto bad_fork_cleanup_namespace; - } - p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); -- cgit From b78709cfd4387c15a9894748bcada8a4ca75c561 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Mon, 26 Jun 2006 16:58:00 +1000 Subject: [PATCH] sched: fix SCHED_FIFO bug in sys_sched_rr_get_interval() The introduction of SCHED_BATCH scheduling class with a value of 3 means that the expression (p->policy & SCHED_FIFO) will return true if policy is SCHED_BATCH or SCHED_FIFO. Unfortunately, this expression is used in sys_sched_rr_get_interval() and in the absence of a comment to say that this is intentional I presume that it is unintentional and erroneous. The fix is to change the expression to (p->policy == SCHED_FIFO). Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f06d059edef..cfaf3fabeec 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4247,7 +4247,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) if (retval) goto out_unlock; - jiffies_to_timespec(p->policy & SCHED_FIFO ? + jiffies_to_timespec(p->policy == SCHED_FIFO ? 0 : task_timeslice(p), &t); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; -- cgit From bebfa1013eee1d91b3242e5801cc8fbdfaf148ec Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 26 Jun 2006 13:56:52 +0200 Subject: [PATCH] x86_64: Add compat_printk and sysctl to turn off compat layer warnings Sometimes e.g. with crashme the compat layer warnings can be noisy. Add a way to turn them off by gating all output through compat_printk that checks a global sysctl. The default is not changed. 
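The sysctl hunk below only wires up the knob; the helper itself lives in kernel/compat.c and is outside this kernel/sysctl.c diff. In essence it is a guarded printk; a minimal sketch under that assumption:

int compat_log = 1;	/* toggled through /proc/sys/kernel/compat-log */

int compat_printk(const char *fmt, ...)
{
	va_list ap;
	int ret;

	if (!compat_log)
		return 0;
	va_start(ap, fmt);
	ret = vprintk(fmt, ap);
	va_end(ap);
	return ret;
}

Callers in the compat layer then use compat_printk() where they previously called printk() directly, and `echo 0 > /proc/sys/kernel/compat-log` silences them.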
Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2c0e6581944..f1a4eb1a655 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -73,6 +73,7 @@ extern int printk_ratelimit_burst; extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; +extern int compat_log; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; @@ -676,6 +677,16 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif +#ifdef CONFIG_COMPAT + { + .ctl_name = KERN_COMPAT_LOG, + .procname = "compat-log", + .data = &compat_log, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif { .ctl_name = 0 } }; -- cgit From 4552d5dc08b79868829b4be8951b29b07284753f Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 26 Jun 2006 13:57:28 +0200 Subject: [PATCH] x86_64: reliable stack trace support These are the generic bits needed to enable reliable stack traces based on Dwarf2-like (.eh_frame) unwind information. Subsequent patches will enable x86-64 and i386 to make use of this. Thanks to Andi Kleen and Ingo Molnar, who pointed out several possibilities for improvement. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/module.c | 16 +- kernel/unwind.c | 915 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 931 insertions(+), 1 deletion(-) create mode 100644 kernel/unwind.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index f6ef00f4f90..a31276e190f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o +obj-$(CONFIG_STACK_UNWIND) += unwind.o obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o diff --git a/kernel/module.c b/kernel/module.c index d75275de1c2..08811e26ac9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -1051,6 +1052,8 @@ static void free_module(struct module *mod) remove_sect_attrs(mod); mod_kobject_remove(mod); + unwind_remove_table(mod->unwind_info, 0); + /* Arch-specific cleanup. 
*/ module_arch_cleanup(mod); @@ -1412,7 +1415,7 @@ static struct module *load_module(void __user *umod, unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, exportindex, modindex, obsparmindex, infoindex, gplindex, crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, - gplfuturecrcindex; + gplfuturecrcindex, unwindex = 0; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -1502,6 +1505,9 @@ static struct module *load_module(void __user *umod, versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); +#ifdef ARCH_UNWIND_SECTION_NAME + unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); +#endif /* Don't keep modinfo section */ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; @@ -1510,6 +1516,8 @@ static struct module *load_module(void __user *umod, sechdrs[symindex].sh_flags |= SHF_ALLOC; sechdrs[strindex].sh_flags |= SHF_ALLOC; #endif + if (unwindex) + sechdrs[unwindex].sh_flags |= SHF_ALLOC; /* Check module struct version now, before we try to use module. */ if (!check_modstruct_version(sechdrs, versindex, mod)) { @@ -1738,6 +1746,11 @@ static struct module *load_module(void __user *umod, goto arch_cleanup; add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); + /* Size of section 0 is 0, so this works well if no unwind info. */ + mod->unwind_info = unwind_add_table(mod, + (void *)sechdrs[unwindex].sh_addr, + sechdrs[unwindex].sh_size); + /* Get rid of temporary copy */ vfree(hdr); @@ -1836,6 +1849,7 @@ sys_init_module(void __user *umod, mod->state = MODULE_STATE_LIVE; /* Drop initial reference. */ module_put(mod); + unwind_remove_table(mod->unwind_info, 1); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; diff --git a/kernel/unwind.c b/kernel/unwind.c new file mode 100644 index 00000000000..d36bcd3ad3b --- /dev/null +++ b/kernel/unwind.c @@ -0,0 +1,915 @@ +/* + * Copyright (C) 2002-2006 Novell, Inc. + * Jan Beulich + * This code is released under version 2 of the GNU GPL. + * + * A simple API for unwinding kernel stacks. This is used for + * debugging and error reporting purposes. The kernel doesn't need + * full-blown stack unwinding with all the bells and whistles, so there + * is not much point in implementing the full Dwarf2 unwind API. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +extern char __start_unwind[], __end_unwind[]; + +#define MAX_STACK_DEPTH 8 + +#define EXTRA_INFO(f) { \ + BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \ + % FIELD_SIZEOF(struct unwind_frame_info, f)) \ + + offsetof(struct unwind_frame_info, f) \ + / FIELD_SIZEOF(struct unwind_frame_info, f), \ + FIELD_SIZEOF(struct unwind_frame_info, f) \ + } +#define PTREGS_INFO(f) EXTRA_INFO(regs.f) + +static const struct { + unsigned offs:BITS_PER_LONG / 2; + unsigned width:BITS_PER_LONG / 2; +} reg_info[] = { + UNW_REGISTER_INFO +}; + +#undef PTREGS_INFO +#undef EXTRA_INFO + +#ifndef REG_INVALID +#define REG_INVALID(r) (reg_info[r].width == 0) +#endif + +#define DW_CFA_nop 0x00 +#define DW_CFA_set_loc 0x01 +#define DW_CFA_advance_loc1 0x02 +#define DW_CFA_advance_loc2 0x03 +#define DW_CFA_advance_loc4 0x04 +#define DW_CFA_offset_extended 0x05 +#define DW_CFA_restore_extended 0x06 +#define DW_CFA_undefined 0x07 +#define DW_CFA_same_value 0x08 +#define DW_CFA_register 0x09 +#define DW_CFA_remember_state 0x0a +#define DW_CFA_restore_state 0x0b +#define DW_CFA_def_cfa 0x0c +#define DW_CFA_def_cfa_register 0x0d +#define DW_CFA_def_cfa_offset 0x0e +#define DW_CFA_def_cfa_expression 0x0f +#define DW_CFA_expression 0x10 +#define DW_CFA_offset_extended_sf 0x11 +#define DW_CFA_def_cfa_sf 0x12 +#define DW_CFA_def_cfa_offset_sf 0x13 +#define DW_CFA_val_offset 0x14 +#define DW_CFA_val_offset_sf 0x15 +#define DW_CFA_val_expression 0x16 +#define DW_CFA_lo_user 0x1c +#define DW_CFA_GNU_window_save 0x2d +#define DW_CFA_GNU_args_size 0x2e +#define DW_CFA_GNU_negative_offset_extended 0x2f +#define DW_CFA_hi_user 0x3f + +#define DW_EH_PE_FORM 0x07 +#define DW_EH_PE_native 0x00 +#define DW_EH_PE_leb128 0x01 +#define DW_EH_PE_data2 0x02 +#define DW_EH_PE_data4 0x03 +#define DW_EH_PE_data8 0x04 +#define DW_EH_PE_signed 0x08 +#define DW_EH_PE_ADJUST 0x70 +#define DW_EH_PE_abs 0x00 +#define DW_EH_PE_pcrel 0x10 +#define DW_EH_PE_textrel 0x20 +#define DW_EH_PE_datarel 0x30 +#define DW_EH_PE_funcrel 0x40 +#define DW_EH_PE_aligned 0x50 +#define DW_EH_PE_indirect 0x80 +#define DW_EH_PE_omit 0xff + +typedef unsigned long uleb128_t; +typedef signed long sleb128_t; + +static struct unwind_table { + struct { + unsigned long pc; + unsigned long range; + } core, init; + const void *address; + unsigned long size; + struct unwind_table *link; + const char *name; +} root_table, *last_table; + +struct unwind_item { + enum item_location { + Nowhere, + Memory, + Register, + Value + } where; + uleb128_t value; +}; + +struct unwind_state { + uleb128_t loc, org; + const u8 *cieStart, *cieEnd; + uleb128_t codeAlign; + sleb128_t dataAlign; + struct cfa { + uleb128_t reg, offs; + } cfa; + struct unwind_item regs[ARRAY_SIZE(reg_info)]; + unsigned stackDepth:8; + unsigned version:8; + const u8 *label; + const u8 *stack[MAX_STACK_DEPTH]; +}; + +static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 }; + +static struct unwind_table *find_table(unsigned long pc) +{ + struct unwind_table *table; + + for (table = &root_table; table; table = table->link) + if ((pc >= table->core.pc + && pc < table->core.pc + table->core.range) + || (pc >= table->init.pc + && pc < table->init.pc + table->init.range)) + break; + + return table; +} + +static void init_unwind_table(struct unwind_table *table, + const char *name, + const void *core_start, + unsigned long core_size, + const void *init_start, + unsigned long init_size, + const void *table_start, + unsigned long 
table_size) +{ + table->core.pc = (unsigned long)core_start; + table->core.range = core_size; + table->init.pc = (unsigned long)init_start; + table->init.range = init_size; + table->address = table_start; + table->size = table_size; + table->link = NULL; + table->name = name; +} + +void __init unwind_init(void) +{ + init_unwind_table(&root_table, "kernel", + _text, _end - _text, + NULL, 0, + __start_unwind, __end_unwind - __start_unwind); +} + +/* Must be called with module_mutex held. */ +void *unwind_add_table(struct module *module, + const void *table_start, + unsigned long table_size) +{ + struct unwind_table *table; + + if (table_size <= 0) + return NULL; + + table = kmalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return NULL; + + init_unwind_table(table, module->name, + module->module_core, module->core_size, + module->module_init, module->init_size, + table_start, table_size); + + if (last_table) + last_table->link = table; + else + root_table.link = table; + last_table = table; + + return table; +} + +struct unlink_table_info +{ + struct unwind_table *table; + int init_only; +}; + +static int unlink_table(void *arg) +{ + struct unlink_table_info *info = arg; + struct unwind_table *table = info->table, *prev; + + for (prev = &root_table; prev->link && prev->link != table; prev = prev->link) + ; + + if (prev->link) { + if (info->init_only) { + table->init.pc = 0; + table->init.range = 0; + info->table = NULL; + } else { + prev->link = table->link; + if (!prev->link) + last_table = prev; + } + } else + info->table = NULL; + + return 0; +} + +/* Must be called with module_mutex held. */ +void unwind_remove_table(void *handle, int init_only) +{ + struct unwind_table *table = handle; + struct unlink_table_info info; + + if (!table || table == &root_table) + return; + + if (init_only && table == last_table) { + table->init.pc = 0; + table->init.range = 0; + return; + } + + info.table = table; + info.init_only = init_only; + stop_machine_run(unlink_table, &info, NR_CPUS); + + if (info.table) + kfree(table); +} + +static uleb128_t get_uleb128(const u8 **pcur, const u8 *end) +{ + const u8 *cur = *pcur; + uleb128_t value; + unsigned shift; + + for (shift = 0, value = 0; cur < end; shift += 7) { + if (shift + 7 > 8 * sizeof(value) + && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { + cur = end + 1; + break; + } + value |= (uleb128_t)(*cur & 0x7f) << shift; + if (!(*cur++ & 0x80)) + break; + } + *pcur = cur; + + return value; +} + +static sleb128_t get_sleb128(const u8 **pcur, const u8 *end) +{ + const u8 *cur = *pcur; + sleb128_t value; + unsigned shift; + + for (shift = 0, value = 0; cur < end; shift += 7) { + if (shift + 7 > 8 * sizeof(value) + && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { + cur = end + 1; + break; + } + value |= (sleb128_t)(*cur & 0x7f) << shift; + if (!(*cur & 0x80)) { + value |= -(*cur++ & 0x40) << shift; + break; + } + } + *pcur = cur; + + return value; +} + +static unsigned long read_pointer(const u8 **pLoc, + const void *end, + signed ptrType) +{ + unsigned long value = 0; + union { + const u8 *p8; + const u16 *p16u; + const s16 *p16s; + const u32 *p32u; + const s32 *p32s; + const unsigned long *pul; + } ptr; + + if (ptrType < 0 || ptrType == DW_EH_PE_omit) + return 0; + ptr.p8 = *pLoc; + switch(ptrType & DW_EH_PE_FORM) { + case DW_EH_PE_data2: + if (end < (const void *)(ptr.p16u + 1)) + return 0; + if(ptrType & DW_EH_PE_signed) + value = get_unaligned(ptr.p16s++); + else + value = get_unaligned(ptr.p16u++); + break; + case DW_EH_PE_data4: 
+#ifdef CONFIG_64BIT + if (end < (const void *)(ptr.p32u + 1)) + return 0; + if(ptrType & DW_EH_PE_signed) + value = get_unaligned(ptr.p32s++); + else + value = get_unaligned(ptr.p32u++); + break; + case DW_EH_PE_data8: + BUILD_BUG_ON(sizeof(u64) != sizeof(value)); +#else + BUILD_BUG_ON(sizeof(u32) != sizeof(value)); +#endif + case DW_EH_PE_native: + if (end < (const void *)(ptr.pul + 1)) + return 0; + value = get_unaligned(ptr.pul++); + break; + case DW_EH_PE_leb128: + BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value)); + value = ptrType & DW_EH_PE_signed + ? get_sleb128(&ptr.p8, end) + : get_uleb128(&ptr.p8, end); + if ((const void *)ptr.p8 > end) + return 0; + break; + default: + return 0; + } + switch(ptrType & DW_EH_PE_ADJUST) { + case DW_EH_PE_abs: + break; + case DW_EH_PE_pcrel: + value += (unsigned long)*pLoc; + break; + default: + return 0; + } + if ((ptrType & DW_EH_PE_indirect) + && __get_user(value, (unsigned long *)value)) + return 0; + *pLoc = ptr.p8; + + return value; +} + +static signed fde_pointer_type(const u32 *cie) +{ + const u8 *ptr = (const u8 *)(cie + 2); + unsigned version = *ptr; + + if (version != 1) + return -1; /* unsupported */ + if (*++ptr) { + const char *aug; + const u8 *end = (const u8 *)(cie + 1) + *cie; + uleb128_t len; + + /* check if augmentation size is first (and thus present) */ + if (*ptr != 'z') + return -1; + /* check if augmentation string is nul-terminated */ + if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL) + return -1; + ++ptr; /* skip terminator */ + get_uleb128(&ptr, end); /* skip code alignment */ + get_sleb128(&ptr, end); /* skip data alignment */ + /* skip return address column */ + version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end); + len = get_uleb128(&ptr, end); /* augmentation length */ + if (ptr + len < ptr || ptr + len > end) + return -1; + end = ptr + len; + while (*++aug) { + if (ptr >= end) + return -1; + switch(*aug) { + case 'L': + ++ptr; + break; + case 'P': { + signed ptrType = *ptr++; + + if (!read_pointer(&ptr, end, ptrType) || ptr > end) + return -1; + } + break; + case 'R': + return *ptr; + default: + return -1; + } + } + } + return DW_EH_PE_native|DW_EH_PE_abs; +} + +static int advance_loc(unsigned long delta, struct unwind_state *state) +{ + state->loc += delta * state->codeAlign; + + return delta > 0; +} + +static void set_rule(uleb128_t reg, + enum item_location where, + uleb128_t value, + struct unwind_state *state) +{ + if (reg < ARRAY_SIZE(state->regs)) { + state->regs[reg].where = where; + state->regs[reg].value = value; + } +} + +static int processCFI(const u8 *start, + const u8 *end, + unsigned long targetLoc, + signed ptrType, + struct unwind_state *state) +{ + union { + const u8 *p8; + const u16 *p16; + const u32 *p32; + } ptr; + int result = 1; + + if (start != state->cieStart) { + state->loc = state->org; + result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state); + if (targetLoc == 0 && state->label == NULL) + return result; + } + for (ptr.p8 = start; result && ptr.p8 < end; ) { + switch(*ptr.p8 >> 6) { + uleb128_t value; + + case 0: + switch(*ptr.p8++) { + case DW_CFA_nop: + break; + case DW_CFA_set_loc: + if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0) + result = 0; + break; + case DW_CFA_advance_loc1: + result = ptr.p8 < end && advance_loc(*ptr.p8++, state); + break; + case DW_CFA_advance_loc2: + result = ptr.p8 <= end + 2 + && advance_loc(*ptr.p16++, state); + break; + case DW_CFA_advance_loc4: + result = ptr.p8 <= end + 4 + && advance_loc(*ptr.p32++, 
state); + break; + case DW_CFA_offset_extended: + value = get_uleb128(&ptr.p8, end); + set_rule(value, Memory, get_uleb128(&ptr.p8, end), state); + break; + case DW_CFA_val_offset: + value = get_uleb128(&ptr.p8, end); + set_rule(value, Value, get_uleb128(&ptr.p8, end), state); + break; + case DW_CFA_offset_extended_sf: + value = get_uleb128(&ptr.p8, end); + set_rule(value, Memory, get_sleb128(&ptr.p8, end), state); + break; + case DW_CFA_val_offset_sf: + value = get_uleb128(&ptr.p8, end); + set_rule(value, Value, get_sleb128(&ptr.p8, end), state); + break; + case DW_CFA_restore_extended: + case DW_CFA_undefined: + case DW_CFA_same_value: + set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state); + break; + case DW_CFA_register: + value = get_uleb128(&ptr.p8, end); + set_rule(value, + Register, + get_uleb128(&ptr.p8, end), state); + break; + case DW_CFA_remember_state: + if (ptr.p8 == state->label) { + state->label = NULL; + return 1; + } + if (state->stackDepth >= MAX_STACK_DEPTH) + return 0; + state->stack[state->stackDepth++] = ptr.p8; + break; + case DW_CFA_restore_state: + if (state->stackDepth) { + const uleb128_t loc = state->loc; + const u8 *label = state->label; + + state->label = state->stack[state->stackDepth - 1]; + memcpy(&state->cfa, &badCFA, sizeof(state->cfa)); + memset(state->regs, 0, sizeof(state->regs)); + state->stackDepth = 0; + result = processCFI(start, end, 0, ptrType, state); + state->loc = loc; + state->label = label; + } else + return 0; + break; + case DW_CFA_def_cfa: + state->cfa.reg = get_uleb128(&ptr.p8, end); + /*nobreak*/ + case DW_CFA_def_cfa_offset: + state->cfa.offs = get_uleb128(&ptr.p8, end); + break; + case DW_CFA_def_cfa_sf: + state->cfa.reg = get_uleb128(&ptr.p8, end); + /*nobreak*/ + case DW_CFA_def_cfa_offset_sf: + state->cfa.offs = get_sleb128(&ptr.p8, end) + * state->dataAlign; + break; + case DW_CFA_def_cfa_register: + state->cfa.reg = get_uleb128(&ptr.p8, end); + break; + /*todo case DW_CFA_def_cfa_expression: */ + /*todo case DW_CFA_expression: */ + /*todo case DW_CFA_val_expression: */ + case DW_CFA_GNU_args_size: + get_uleb128(&ptr.p8, end); + break; + case DW_CFA_GNU_negative_offset_extended: + value = get_uleb128(&ptr.p8, end); + set_rule(value, + Memory, + (uleb128_t)0 - get_uleb128(&ptr.p8, end), state); + break; + case DW_CFA_GNU_window_save: + default: + result = 0; + break; + } + break; + case 1: + result = advance_loc(*ptr.p8++ & 0x3f, state); + break; + case 2: + value = *ptr.p8++ & 0x3f; + set_rule(value, Memory, get_uleb128(&ptr.p8, end), state); + break; + case 3: + set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state); + break; + } + if (ptr.p8 > end) + result = 0; + if (result && targetLoc != 0 && targetLoc < state->loc) + return 1; + } + + return result + && ptr.p8 == end + && (targetLoc == 0 + || (/*todo While in theory this should apply, gcc in practice omits + everything past the function prolog, and hence the location + never reaches the end of the function. + targetLoc < state->loc &&*/ state->label == NULL)); +} + +/* Unwind to previous to frame. Returns 0 if successful, negative + * number in case of an error. 
*/ +int unwind(struct unwind_frame_info *frame) +{ +#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) + const u32 *fde = NULL, *cie = NULL; + const u8 *ptr = NULL, *end = NULL; + unsigned long startLoc = 0, endLoc = 0, cfa; + unsigned i; + signed ptrType = -1; + uleb128_t retAddrReg = 0; + struct unwind_table *table; + struct unwind_state state; + + if (UNW_PC(frame) == 0) + return -EINVAL; + if ((table = find_table(UNW_PC(frame))) != NULL + && !(table->size & (sizeof(*fde) - 1))) { + unsigned long tableSize = table->size; + + for (fde = table->address; + tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; + tableSize -= sizeof(*fde) + *fde, + fde += 1 + *fde / sizeof(*fde)) { + if (!*fde || (*fde & (sizeof(*fde) - 1))) + break; + if (!fde[1]) + continue; /* this is a CIE */ + if ((fde[1] & (sizeof(*fde) - 1)) + || fde[1] > (unsigned long)(fde + 1) + - (unsigned long)table->address) + continue; /* this is not a valid FDE */ + cie = fde + 1 - fde[1] / sizeof(*fde); + if (*cie <= sizeof(*cie) + 4 + || *cie >= fde[1] - sizeof(*fde) + || (*cie & (sizeof(*cie) - 1)) + || cie[1] + || (ptrType = fde_pointer_type(cie)) < 0) { + cie = NULL; /* this is not a (valid) CIE */ + continue; + } + ptr = (const u8 *)(fde + 2); + startLoc = read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType); + endLoc = startLoc + + read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType & DW_EH_PE_indirect + ? ptrType + : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); + if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc) + break; + cie = NULL; + } + } + if (cie != NULL) { + memset(&state, 0, sizeof(state)); + state.cieEnd = ptr; /* keep here temporarily */ + ptr = (const u8 *)(cie + 2); + end = (const u8 *)(cie + 1) + *cie; + if ((state.version = *ptr) != 1) + cie = NULL; /* unsupported version */ + else if (*++ptr) { + /* check if augmentation size is first (and thus present) */ + if (*ptr == 'z') { + /* check for ignorable (or already handled) + * nul-terminated augmentation string */ + while (++ptr < end && *ptr) + if (strchr("LPR", *ptr) == NULL) + break; + } + if (ptr >= end || *ptr) + cie = NULL; + } + ++ptr; + } + if (cie != NULL) { + /* get code aligment factor */ + state.codeAlign = get_uleb128(&ptr, end); + /* get data aligment factor */ + state.dataAlign = get_sleb128(&ptr, end); + if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) + cie = NULL; + else { + retAddrReg = state.version <= 1 ? 
*ptr++ : get_uleb128(&ptr, end); + /* skip augmentation */ + if (((const char *)(cie + 2))[1] == 'z') + ptr += get_uleb128(&ptr, end); + if (ptr > end + || retAddrReg >= ARRAY_SIZE(reg_info) + || REG_INVALID(retAddrReg) + || reg_info[retAddrReg].width != sizeof(unsigned long)) + cie = NULL; + } + } + if (cie != NULL) { + state.cieStart = ptr; + ptr = state.cieEnd; + state.cieEnd = end; + end = (const u8 *)(fde + 1) + *fde; + /* skip augmentation */ + if (((const char *)(cie + 2))[1] == 'z') { + uleb128_t augSize = get_uleb128(&ptr, end); + + if ((ptr += augSize) > end) + fde = NULL; + } + } + if (cie == NULL || fde == NULL) { +#ifdef CONFIG_FRAME_POINTER + unsigned long top, bottom; +#endif + +#ifdef CONFIG_FRAME_POINTER + top = STACK_TOP(frame->task); + bottom = STACK_BOTTOM(frame->task); +# if FRAME_RETADDR_OFFSET < 0 + if (UNW_SP(frame) < top + && UNW_FP(frame) <= UNW_SP(frame) + && bottom < UNW_FP(frame) +# else + if (UNW_SP(frame) > top + && UNW_FP(frame) >= UNW_SP(frame) + && bottom > UNW_FP(frame) +# endif + && !((UNW_SP(frame) | UNW_FP(frame)) + & (sizeof(unsigned long) - 1))) { + unsigned long link; + + if (!__get_user(link, + (unsigned long *)(UNW_FP(frame) + + FRAME_LINK_OFFSET)) +# if FRAME_RETADDR_OFFSET < 0 + && link > bottom && link < UNW_FP(frame) +# else + && link > UNW_FP(frame) && link < bottom +# endif + && !(link & (sizeof(link) - 1)) + && !__get_user(UNW_PC(frame), + (unsigned long *)(UNW_FP(frame) + + FRAME_RETADDR_OFFSET))) { + UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET +# if FRAME_RETADDR_OFFSET < 0 + - +# else + + +# endif + sizeof(UNW_PC(frame)); + UNW_FP(frame) = link; + return 0; + } + } +#endif + return -ENXIO; + } + state.org = startLoc; + memcpy(&state.cfa, &badCFA, sizeof(state.cfa)); + /* process instructions */ + if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state) + || state.loc > endLoc + || state.regs[retAddrReg].where == Nowhere + || state.cfa.reg >= ARRAY_SIZE(reg_info) + || reg_info[state.cfa.reg].width != sizeof(unsigned long) + || state.cfa.offs % sizeof(unsigned long)) + return -EIO; + /* update frame */ + cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs; + startLoc = min((unsigned long)UNW_SP(frame), cfa); + endLoc = max((unsigned long)UNW_SP(frame), cfa); + if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) { + startLoc = min(STACK_LIMIT(cfa), cfa); + endLoc = max(STACK_LIMIT(cfa), cfa); + } +#ifndef CONFIG_64BIT +# define CASES CASE(8); CASE(16); CASE(32) +#else +# define CASES CASE(8); CASE(16); CASE(32); CASE(64) +#endif + for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { + if (REG_INVALID(i)) { + if (state.regs[i].where == Nowhere) + continue; + return -EIO; + } + switch(state.regs[i].where) { + default: + break; + case Register: + if (state.regs[i].value >= ARRAY_SIZE(reg_info) + || REG_INVALID(state.regs[i].value) + || reg_info[i].width > reg_info[state.regs[i].value].width) + return -EIO; + switch(reg_info[state.regs[i].value].width) { +#define CASE(n) \ + case sizeof(u##n): \ + state.regs[i].value = FRAME_REG(state.regs[i].value, \ + const u##n); \ + break + CASES; +#undef CASE + default: + return -EIO; + } + break; + } + } + for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { + if (REG_INVALID(i)) + continue; + switch(state.regs[i].where) { + case Nowhere: + if (reg_info[i].width != sizeof(UNW_SP(frame)) + || &FRAME_REG(i, __typeof__(UNW_SP(frame))) + != &UNW_SP(frame)) + continue; + UNW_SP(frame) = cfa; + break; + case Register: + switch(reg_info[i].width) { +#define CASE(n) case sizeof(u##n): \ + FRAME_REG(i, 
u##n) = state.regs[i].value; \ + break + CASES; +#undef CASE + default: + return -EIO; + } + break; + case Value: + if (reg_info[i].width != sizeof(unsigned long)) + return -EIO; + FRAME_REG(i, unsigned long) = cfa + state.regs[i].value + * state.dataAlign; + break; + case Memory: { + unsigned long addr = cfa + state.regs[i].value + * state.dataAlign; + + if ((state.regs[i].value * state.dataAlign) + % sizeof(unsigned long) + || addr < startLoc + || addr + sizeof(unsigned long) < addr + || addr + sizeof(unsigned long) > endLoc) + return -EIO; + switch(reg_info[i].width) { +#define CASE(n) case sizeof(u##n): \ + __get_user(FRAME_REG(i, u##n), (u##n *)addr); \ + break + CASES; +#undef CASE + default: + return -EIO; + } + } + break; + } + } + + return 0; +#undef CASES +#undef FRAME_REG +} +EXPORT_SYMBOL(unwind); + +int unwind_init_frame_info(struct unwind_frame_info *info, + struct task_struct *tsk, + /*const*/ struct pt_regs *regs) +{ + info->task = tsk; + arch_unw_init_frame_info(info, regs); + + return 0; +} +EXPORT_SYMBOL(unwind_init_frame_info); + +/* + * Prepare to unwind a blocked task. + */ +int unwind_init_blocked(struct unwind_frame_info *info, + struct task_struct *tsk) +{ + info->task = tsk; + arch_unw_init_blocked(info); + + return 0; +} +EXPORT_SYMBOL(unwind_init_blocked); + +/* + * Prepare to unwind the currently running thread. + */ +int unwind_init_running(struct unwind_frame_info *info, + asmlinkage void (*callback)(struct unwind_frame_info *, + void *arg), + void *arg) +{ + info->task = current; + arch_unwind_init_running(info, callback, arg); + + return 0; +} +EXPORT_SYMBOL(unwind_init_running); + +/* + * Unwind until the return pointer is in user-land (or until an error + * occurs). Returns 0 if successful, negative number in case of + * error. + */ +int unwind_to_user(struct unwind_frame_info *info) +{ + while (!arch_unw_user_mode(info)) { + int err = unwind(info); + + if (err < 0) + return err; + } + + return 0; +} +EXPORT_SYMBOL(unwind_to_user); -- cgit From c33bd9aac0597eeedaaa01ea5aafe456894b2f2b Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 26 Jun 2006 13:57:47 +0200 Subject: [PATCH] i386/x86-64: fall back to old-style call trace if no unwinding If no unwinding is possible at all for a certain exception instance, fall back to the old style call trace instead of not showing any trace at all. Also, allow setting the stack trace mode at the command line. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/unwind.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/unwind.c b/kernel/unwind.c index d36bcd3ad3b..0421035272d 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -885,14 +885,13 @@ EXPORT_SYMBOL(unwind_init_blocked); * Prepare to unwind the currently running thread. */ int unwind_init_running(struct unwind_frame_info *info, - asmlinkage void (*callback)(struct unwind_frame_info *, - void *arg), + asmlinkage int (*callback)(struct unwind_frame_info *, + void *arg), void *arg) { info->task = current; - arch_unwind_init_running(info, callback, arg); - return 0; + return arch_unwind_init_running(info, callback, arg); } EXPORT_SYMBOL(unwind_init_running); -- cgit From 83f4fcce7fdd213bd570b899862c3838871f8cf7 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 26 Jun 2006 13:57:50 +0200 Subject: [PATCH] x86_64: allow unwinder to build without module support Add proper conditionals to be able to build with CONFIG_MODULES=n. 
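The kernel/unwind.c side of the conditional is shown in the diff below. The usual companion to such a change is a pair of inline stubs in the header, so any caller compiles unchanged whichever way CONFIG_MODULES is set; a sketch of that idiom (not the actual include/linux/unwind.h hunk, which lies outside the kernel/ directory):

#ifdef CONFIG_MODULES
extern void *unwind_add_table(struct module *module,
			      const void *table_start,
			      unsigned long table_size);
extern void unwind_remove_table(void *handle, int init_only);
#else
static inline void *unwind_add_table(struct module *module,
				     const void *table_start,
				     unsigned long table_size)
{
	return NULL;
}

static inline void unwind_remove_table(void *handle, int init_only)
{
}
#endif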
Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/unwind.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/unwind.c b/kernel/unwind.c index 0421035272d..f69c804c8e6 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -172,6 +172,8 @@ void __init unwind_init(void) __start_unwind, __end_unwind - __start_unwind); } +#ifdef CONFIG_MODULES + /* Must be called with module_mutex held. */ void *unwind_add_table(struct module *module, const void *table_start, @@ -253,6 +255,8 @@ void unwind_remove_table(void *handle, int init_only) kfree(table); } +#endif /* CONFIG_MODULES */ + static uleb128_t get_uleb128(const u8 **pcur, const u8 *end) { const u8 *cur = *pcur; -- cgit From 495ab9c045e1b0e5c82951b762257fe1c9d81564 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 26 Jun 2006 13:59:11 +0200 Subject: [PATCH] i386/x86-64/ia64: Move polling flag into thread_info_status During some profiling I noticed that default_idle causes a lot of memory traffic. I think that is caused by the atomic operations to clear/set the polling flag in thread_info. There is actually no reason to make this atomic - only the idle thread does it to itself, other CPUs only read it. So I moved it into ti->status. Converted i386/x86-64/ia64 for now because that was the easiest way to fix ACPI which also manipulates these flags in its idle function. Cc: Nick Piggin Cc: Tony Luck Cc: Len Brown Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/sched.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f06d059edef..7d1027a4dd2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -818,6 +818,11 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq) * the target CPU. */ #ifdef CONFIG_SMP + +#ifndef tsk_is_polling +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#endif + static void resched_task(task_t *p) { int cpu; @@ -833,9 +838,9 @@ static void resched_task(task_t *p) if (cpu == smp_processor_id()) return; - /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ + /* NEED_RESCHED must be visible before we test polling */ smp_mb(); - if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) + if (!tsk_is_polling(p)) smp_send_reschedule(cpu); } #else -- cgit From 685143ac1f7a579a3fac9c7f2ac8f82e95af6864 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 12 Jun 2006 15:18:31 -0700 Subject: [PATCH] 64bit resource: fix up printks for resources in arch and core code This is needed if we wish to change the size of the resource structures. Based on an original patch from Vivek Goyal and Andrew Morton. (tweaked by Andy Isaacson ) Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Andy Isaacson Signed-off-by: Greg Kroah-Hartman --- kernel/resource.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index e3080fcc66a..ea5f7811a40 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -83,10 +83,10 @@ static int r_show(struct seq_file *m, void *v) for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) if (p->parent == root) break; - seq_printf(m, "%*s%0*lx-%0*lx : %s\n", + seq_printf(m, "%*s%0*llx-%0*llx : %s\n", depth * 2, "", - width, r->start, - width, r->end, + width, (unsigned long long) r->start, + width, (unsigned long long) r->end, r->name ? 
r->name : ""); return 0; } @@ -511,7 +511,9 @@ void __release_region(struct resource *parent, unsigned long start, unsigned lon write_unlock(&resource_lock); - printk(KERN_WARNING "Trying to free nonexistent resource <%08lx-%08lx>\n", start, end); + printk(KERN_WARNING "Trying to free nonexistent resource " + "<%016llx-%016llx>\n", (unsigned long long)start, + (unsigned long long)end); } EXPORT_SYMBOL(__release_region); -- cgit From d75fc8bbccf7c019994bcfd6255d5b56335ed21d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 12 Jun 2006 16:09:23 -0700 Subject: [PATCH] 64bit resource: change resource core to use resource_size_t Based on a patch series originally from Vivek Goyal Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- kernel/resource.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index ea5f7811a40..54835c02ab3 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -151,8 +151,8 @@ __initcall(ioresources_init); /* Return the conflict entry if you can't request it */ static struct resource * __request_resource(struct resource *root, struct resource *new) { - unsigned long start = new->start; - unsigned long end = new->end; + resource_size_t start = new->start; + resource_size_t end = new->end; struct resource *tmp, **p; if (end < start) @@ -236,11 +236,10 @@ EXPORT_SYMBOL(release_resource); * Find empty slot in the resource tree given range and alignment. */ static int find_resource(struct resource *root, struct resource *new, - unsigned long size, - unsigned long min, unsigned long max, - unsigned long align, + resource_size_t size, resource_size_t min, + resource_size_t max, resource_size_t align, void (*alignf)(void *, struct resource *, - unsigned long, unsigned long), + resource_size_t, resource_size_t), void *alignf_data) { struct resource *this = root->child; @@ -282,11 +281,10 @@ static int find_resource(struct resource *root, struct resource *new, * Allocate empty slot in the resource tree given range and alignment. */ int allocate_resource(struct resource *root, struct resource *new, - unsigned long size, - unsigned long min, unsigned long max, - unsigned long align, + resource_size_t size, resource_size_t min, + resource_size_t max, resource_size_t align, void (*alignf)(void *, struct resource *, - unsigned long, unsigned long), + resource_size_t, resource_size_t), void *alignf_data) { int err; @@ -378,10 +376,10 @@ EXPORT_SYMBOL(insert_resource); * arguments. Returns -EBUSY if it can't fit. Existing children of * the resource are assumed to be immutable. */ -int adjust_resource(struct resource *res, unsigned long start, unsigned long size) +int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) { struct resource *tmp, *parent = res->parent; - unsigned long end = start + size - 1; + resource_size_t end = start + size - 1; int result = -EBUSY; write_lock(&resource_lock); @@ -428,7 +426,9 @@ EXPORT_SYMBOL(adjust_resource); * * Release-region releases a matching busy region. 
*/ -struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) +struct resource * __request_region(struct resource *parent, + resource_size_t start, resource_size_t n, + const char *name) { struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); @@ -464,7 +464,8 @@ struct resource * __request_region(struct resource *parent, unsigned long start, EXPORT_SYMBOL(__request_region); -int __check_region(struct resource *parent, unsigned long start, unsigned long n) +int __check_region(struct resource *parent, resource_size_t start, + resource_size_t n) { struct resource * res; @@ -479,10 +480,11 @@ int __check_region(struct resource *parent, unsigned long start, unsigned long n EXPORT_SYMBOL(__check_region); -void __release_region(struct resource *parent, unsigned long start, unsigned long n) +void __release_region(struct resource *parent, resource_size_t start, + resource_size_t n) { struct resource **p; - unsigned long end; + resource_size_t end; p = &parent->child; end = start + n - 1; -- cgit From 6550e07f41ce8473ed684dac54fbfbd42183ffda Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 12 Jun 2006 17:11:31 -0700 Subject: [PATCH] 64bit Resource: finally enable 64bit resource sizes Introduce the Kconfig entry and actually switch to a 64bit value, if wanted, for resource_size_t. Based on a patch series originally from Vivek Goyal Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- kernel/resource.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 54835c02ab3..cc73029088a 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -23,20 +23,18 @@ struct resource ioport_resource = { .name = "PCI IO", - .start = 0x0000, + .start = 0, .end = IO_SPACE_LIMIT, .flags = IORESOURCE_IO, }; - EXPORT_SYMBOL(ioport_resource); struct resource iomem_resource = { .name = "PCI mem", - .start = 0UL, - .end = ~0UL, + .start = 0, + .end = -1, .flags = IORESOURCE_MEM, }; - EXPORT_SYMBOL(iomem_resource); static DEFINE_RWLOCK(resource_lock); -- cgit From 7f32a25f63358aa993a3403c047f3ecfa6d96367 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 27 Jun 2006 02:53:25 -0700 Subject: [PATCH] kernel/acct: fix function definition kernel/acct.c:579:19: warning: non-ANSI function declaration of function 'acct_process' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 368c4f03fe0..4c613de6602 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -576,7 +576,7 @@ void acct_collect(long exitcode, int group_dead) * * handles process accounting for an exiting task */ -void acct_process() +void acct_process(void) { struct file *file = NULL; -- cgit From 5c31f2738ab124ebc1f8948a5fc17dd7a08ed1ec Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 27 Jun 2006 02:53:26 -0700 Subject: [PATCH] pm_trace is dangerous CONFIG_PM_TRACES scrogs your RTC. Mark it as experimental, and defaulting to `off'. Also beef up the help message a bit. 
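For context on why the option needs such a strong warning: the tracing side (which lives in the PM core and architecture code, not in this Kconfig hunk) abuses battery-backed RTC storage as the only memory that reliably survives a hard hang. A purely conceptual sketch with hypothetical helper names:

static void example_pm_trace_point(const char *name, unsigned int line)
{
	unsigned int hash = line;

	/* fold the trace point identity into a value small enough for the RTC */
	while (*name)
		hash = hash * 31 + *name++;
	example_store_in_rtc(hash & 0xffff);	/* hypothetical battery-backed store */
}

After the hang you reboot and run the dmesg command added to the help text below: early boot code reads the stored value back, compares it against the hashes of all known trace points, and the 'hash matches' line names the last step reached before the machine died. Storing the hash is also exactly what clobbers the wall-clock time, hence the CAUTION paragraph in the new help text.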
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/Kconfig | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index fc311a4673a..857b4fa0912 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -38,13 +38,22 @@ config PM_DEBUG config PM_TRACE bool "Suspend/resume event tracing" - depends on PM && PM_DEBUG && X86_32 - default y + depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL + default n ---help--- This enables some cheesy code to save the last PM event point in the RTC across reboots, so that you can debug a machine that just hangs during suspend (or more commonly, during resume). + To use this debugging feature you should attempt to suspend the machine, + then reboot it, then run + + dmesg -s 1000000 | grep 'hash matches' + + CAUTION: this option will cause your machine's real-time clock to be + set to an invalid time after a resume. + + config SOFTWARE_SUSPEND bool "Software Suspend" depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) -- cgit From 2842f11419704f8707fffc82e10d2263427fc130 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 27 Jun 2006 02:53:36 -0700 Subject: [PATCH] catch valid mem range at onlining memory This patch allows hot-add memory which is not aligned to section. Now, hot-added memory has to be aligned to section size. Considering big section sized archs, this is not useful. When hot-added memory is registerd as iomem resoruce by iomem resource patch, we can make use of that information to detect valid memory range. Note: With this, not-aligned memory can be registerd. To allow hot-add memory with holes, we have to do more work around add_memory(). (It doesn't allows add memory to already existing mem section.) Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index e3080fcc66a..2404f9b0bc4 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -232,6 +232,44 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * Finds the lowest memory reosurce exists within [res->start.res->end) + * the caller must specify res->start, res->end, res->flags. + * If found, returns 0, res is overwritten, if not found, returns -1. + */ +int find_next_system_ram(struct resource *res) +{ + resource_size_t start, end; + struct resource *p; + + BUG_ON(!res); + + start = res->start; + end = res->end; + + read_lock(&resource_lock); + for (p = iomem_resource.child; p ; p = p->sibling) { + /* system ram is just marked as IORESOURCE_MEM */ + if (p->flags != res->flags) + continue; + if (p->start > end) { + p = NULL; + break; + } + if (p->start >= start) + break; + } + read_unlock(&resource_lock); + if (!p) + return -1; + /* copy data */ + res->start = p->start; + res->end = p->end; + return 0; +} +#endif + /* * Find empty slot in the resource tree given range and alignment. */ -- cgit From e6e5494cb23d1933735ee47cc674ffe1c4afed6f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jun 2006 02:53:50 -0700 Subject: [PATCH] vdso: randomize the i386 vDSO by moving it into a vma Move the i386 VDSO down into a vma and thus randomize it. 
Besides the security implications, this feature also helps debuggers, which can COW a vma-backed VDSO just like a normal DSO and can thus do single-stepping and other debugging features. It's good for hypervisors (Xen, VMWare) too, which typically live in the same high-mapped address space as the VDSO, hence whenever the VDSO is used, they get lots of guest pagefaults and have to fix such guest accesses up - which slows things down instead of speeding things up (the primary purpose of the VDSO). There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support for older glibcs that still rely on a prelinked high-mapped VDSO. Newer distributions (using glibc 2.3.3 or later) can turn this option off. Turning it off is also recommended for security reasons: attackers cannot use the predictable high-mapped VDSO page as syscall trampoline anymore. There is a new vdso=[0|1] boot option as well, and a runtime /proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned on/off. (This version of the VDSO-randomization patch also has working ELF coredumping, the previous patch crashed in the coredumping code.) This code is a combined work of the exec-shield VDSO randomization code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell started this patch and i completed it. [akpm@osdl.org: cleanups] [akpm@osdl.org: compile fix] [akpm@osdl.org: compile fix 2] [akpm@osdl.org: compile fix 3] [akpm@osdl.org: revernt MAXMEM change] Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Cc: Gerd Hoffmann Cc: Rusty Russell Cc: Zachary Amsden Cc: Andi Kleen Cc: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f1a4eb1a655..f54afed8426 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -927,6 +927,18 @@ static ctl_table vm_table[] = { .proc_handler = &proc_dointvec_jiffies, .strategy = &sysctl_jiffies, }, +#endif +#ifdef CONFIG_X86_32 + { + .ctl_name = VM_VDSO_ENABLED, + .procname = "vdso_enabled", + .data = &vdso_enabled, + .maxlen = sizeof(vdso_enabled), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, #endif { .ctl_name = 0 } }; -- cgit From a7807a32bbb027ab9955b96734fdc7f1e6497a9f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 27 Jun 2006 02:53:54 -0700 Subject: [PATCH] poison: add & use more constants Add more poison values to include/linux/poison.h. 
It's not clear to me whether some others should be added or not, so I haven't added any of these: ./include/linux/libata.h:#define ATA_TAG_POISON 0xfafbfcfdU ./arch/ppc/8260_io/fcc_enet.c:1918: memset((char *)(&(immap->im_dprambase[(mem_addr+64)])), 0x88, 32); ./drivers/usb/mon/mon_text.c:429: memset(mem, 0xe5, sizeof(struct mon_event_text)); ./drivers/char/ftape/lowlevel/ftape-ctl.c:738: memset(ft_buffer[i]->address, 0xAA, FT_BUFF_SIZE); ./drivers/block/sx8.c:/* 0xf is just arbitrary, non-zero noise; this is sorta like poisoning */ Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/mutex-debug.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 036b6285b15..e38e4bac97c 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -381,7 +382,7 @@ void debug_mutex_set_owner(struct mutex *lock, void debug_mutex_init_waiter(struct mutex_waiter *waiter) { - memset(waiter, 0x11, sizeof(*waiter)); + memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); waiter->magic = waiter; INIT_LIST_HEAD(&waiter->list); } @@ -397,7 +398,7 @@ void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) void debug_mutex_free_waiter(struct mutex_waiter *waiter) { DEBUG_WARN_ON(!list_empty(&waiter->list)); - memset(waiter, 0x22, sizeof(*waiter)); + memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter)); } void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, -- cgit From 34af946a22724c4e2b204957f2b24b22a0fb121c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jun 2006 02:53:55 -0700 Subject: [PATCH] spin/rwlock init cleanups locking init cleanups: - convert " = SPIN_LOCK_UNLOCKED" to spin_lock_init() or DEFINE_SPINLOCK() - convert rwlocks in a similar manner this patch was generated automatically. 
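For illustration, the conversion pattern this patch applies looks roughly like the sketch below; the lock and structure names are hypothetical, not taken from the patch itself.

#include <linux/spinlock.h>

/*
 * Old, open-coded style being removed:
 *     static spinlock_t my_lock   = SPIN_LOCK_UNLOCKED;
 *     static rwlock_t   my_rwlock = RW_LOCK_UNLOCKED;
 * New, preferred static initializers:
 */
static DEFINE_SPINLOCK(my_lock);
static DEFINE_RWLOCK(my_rwlock);

/* For locks embedded in run-time allocated objects: */
struct my_obj {
	spinlock_t lock;
	rwlock_t   rwlock;
};

static void my_obj_init(struct my_obj *obj)
{
	spin_lock_init(&obj->lock);
	rwlock_init(&obj->rwlock);
}
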
Motivation: - cleanliness - lockdep needs control of lock initialization, which the open-coded variants do not give - it's also useful for -rt and for lock debugging in general Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 7dfac7031bd..82443fb433e 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -818,7 +818,7 @@ err: */ unsigned int audit_serial(void) { - static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; + static DEFINE_SPINLOCK(serial_lock); static unsigned int serial = 0; unsigned long flags; -- cgit From 1dbe83c3445a1604546620a60888cf26b63f8782 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 27 Jun 2006 02:54:01 -0700 Subject: [PATCH] fix kernel-doc in kernel/ dir Fix kernel-doc parameters in kernel/ Warning(/var/linsrc/linux-2617-g9//kernel/auditsc.c:1376): No description found for parameter 'u_abs_timeout' Warning(/var/linsrc/linux-2617-g9//kernel/auditsc.c:1420): No description found for parameter 'u_msg_prio' Warning(/var/linsrc/linux-2617-g9//kernel/auditsc.c:1420): No description found for parameter 'u_abs_timeout' Warning(/var/linsrc/linux-2617-g9//kernel/acct.c:526): No description found for parameter 'pacct' Signed-off-by: Randy Dunlap Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 1 + kernel/auditsc.c | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 4c613de6602..126ca43d5d2 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -521,6 +521,7 @@ static void do_acct_process(struct file *file) /** * acct_init_pacct - initialize a new pacct_struct + * @pacct: per-process accounting info struct to initialize */ void acct_init_pacct(struct pacct_struct *pacct) { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9ebd96fda29..bdfb580a067 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1367,7 +1367,7 @@ int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) * @mqdes: MQ descriptor * @msg_len: Message length * @msg_prio: Message priority - * @abs_timeout: Message timeout in absolute time + * @u_abs_timeout: Message timeout in absolute time * * Returns 0 for success or NULL context or < 0 on error. */ @@ -1409,8 +1409,8 @@ int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive * @mqdes: MQ descriptor * @msg_len: Message length - * @msg_prio: Message priority - * @abs_timeout: Message timeout in absolute time + * @u_msg_prio: Message priority + * @u_abs_timeout: Message timeout in absolute time * * Returns 0 for success or NULL context or < 0 on error. */ @@ -1558,7 +1558,6 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp) * @uid: msgq user id * @gid: msgq group id * @mode: msgq mode (permissions) - * @ipcp: in-kernel IPC permissions * * Returns 0 for success or NULL context or < 0 on error. */ -- cgit From 29766f1eb3d8d6cfaf1d6623fb4c3f7e5a822fe4 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 27 Jun 2006 02:54:02 -0700 Subject: [PATCH] rcutorture: catchup doc fixes for idle-hz tests This just catches the RCU torture documentation up with the recent fixes that test RCU for architectures that turn of the scheduling-clock interrupt for idle CPUs and the addition of a SUCCESS/FAILURE indication, fixing up an obsolete comment as well. Signed-off-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 8154e7589d1..2a65bd8a3d1 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1,5 +1,5 @@ /* - * Read-Copy Update /proc-based torture test facility + * Read-Copy Update module-based torture test facility * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by -- cgit From 72e9bb549280b354311af30640c9433474f3a32c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 27 Jun 2006 02:54:03 -0700 Subject: [PATCH] rcutorture: add ops vector and Classic RCU ops Add an ops vector to rcutorture, and add the ops for Classic RCU. Update the rcutorture documentation to reflect slight change to the dmesg formats. Signed-off-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 163 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 120 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 2a65bd8a3d1..c96b5edd6ed 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -53,6 +53,7 @@ static int stat_interval; /* Interval between stats, in seconds. */ static int verbose; /* Print more debug info. */ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ +static char *torture_type = "rcu"; /* What to torture. */ module_param(nreaders, int, 0); MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); @@ -64,13 +65,16 @@ module_param(test_no_idle_hz, bool, 0); MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); module_param(shuffle_interval, int, 0); MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); -#define TORTURE_FLAG "rcutorture: " +module_param(torture_type, charp, 0); +MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu)"); + +#define TORTURE_FLAG "-torture:" #define PRINTK_STRING(s) \ - do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) + do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) #define VERBOSE_PRINTK_STRING(s) \ - do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) + do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) #define VERBOSE_PRINTK_ERRSTRING(s) \ - do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) + do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) static char printk_buf[4096]; @@ -139,28 +143,6 @@ rcu_torture_free(struct rcu_torture *p) spin_unlock_bh(&rcu_torture_lock); } -static void -rcu_torture_cb(struct rcu_head *p) -{ - int i; - struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - - if (fullstop) { - /* Test is ending, just drop callbacks on the floor. */ - /* The next initialization will pick up the pieces. 
*/ - return; - } - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - rcu_torture_free(rp); - } else - call_rcu(p, rcu_torture_cb); -} - struct rcu_random_state { unsigned long rrs_state; unsigned long rrs_count; @@ -190,6 +172,83 @@ rcu_random(struct rcu_random_state *rrsp) return swahw32(rrsp->rrs_state); } +/* + * Operations vector for selecting different types of tests. + */ + +struct rcu_torture_ops { + void (*init)(void); + void (*cleanup)(void); + int (*readlock)(void); + void (*readunlock)(int idx); + int (*completed)(void); + void (*deferredfree)(struct rcu_torture *p); + int (*stats)(char *page); + char *name; +}; +static struct rcu_torture_ops *cur_ops = NULL; + +/* + * Definitions for rcu torture testing. + */ + +static int rcu_torture_read_lock(void) +{ + rcu_read_lock(); + return 0; +} + +static void rcu_torture_read_unlock(int idx) +{ + rcu_read_unlock(); +} + +static int rcu_torture_completed(void) +{ + return rcu_batches_completed(); +} + +static void +rcu_torture_cb(struct rcu_head *p) +{ + int i; + struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); + + if (fullstop) { + /* Test is ending, just drop callbacks on the floor. */ + /* The next initialization will pick up the pieces. */ + return; + } + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + rcu_torture_free(rp); + } else + cur_ops->deferredfree(rp); +} + +static void rcu_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops rcu_ops = { + .init = NULL, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferredfree = rcu_torture_deferred_free, + .stats = NULL, + .name = "rcu" +}; + +static struct rcu_torture_ops *torture_ops[] = + { &rcu_ops, NULL }; + /* * RCU torture writer kthread. 
Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure @@ -209,8 +268,6 @@ rcu_torture_writer(void *arg) do { schedule_timeout_uninterruptible(1); - if (rcu_batches_completed() == oldbatch) - continue; if ((rp = rcu_torture_alloc()) == NULL) continue; rp->rtort_pipe_count = 0; @@ -225,10 +282,10 @@ rcu_torture_writer(void *arg) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); old_rp->rtort_pipe_count++; - call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); + cur_ops->deferredfree(old_rp); } rcu_torture_current_version++; - oldbatch = rcu_batches_completed(); + oldbatch = cur_ops->completed(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); while (!kthread_should_stop()) @@ -246,6 +303,7 @@ static int rcu_torture_reader(void *arg) { int completed; + int idx; DEFINE_RCU_RANDOM(rand); struct rcu_torture *p; int pipe_count; @@ -254,12 +312,12 @@ rcu_torture_reader(void *arg) set_user_nice(current, 19); do { - rcu_read_lock(); - completed = rcu_batches_completed(); + idx = cur_ops->readlock(); + completed = cur_ops->completed(); p = rcu_dereference(rcu_torture_current); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ - rcu_read_unlock(); + cur_ops->readunlock(idx); schedule_timeout_interruptible(HZ); continue; } @@ -273,14 +331,14 @@ rcu_torture_reader(void *arg) pipe_count = RCU_TORTURE_PIPE_LEN; } ++__get_cpu_var(rcu_torture_count)[pipe_count]; - completed = rcu_batches_completed() - completed; + completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } ++__get_cpu_var(rcu_torture_batch)[completed]; preempt_enable(); - rcu_read_unlock(); + cur_ops->readunlock(idx); schedule(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); @@ -311,7 +369,7 @@ rcu_torture_printk(char *page) if (pipesummary[i] != 0) break; } - cnt += sprintf(&page[cnt], "rcutorture: "); + cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " "rtmbe: %d", @@ -324,7 +382,7 @@ rcu_torture_printk(char *page) atomic_read(&n_rcu_torture_mberror)); if (atomic_read(&n_rcu_torture_mberror) != 0) cnt += sprintf(&page[cnt], " !!!"); - cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); if (i > 1) { cnt += sprintf(&page[cnt], "!!! 
"); atomic_inc(&n_rcu_torture_error); @@ -332,17 +390,19 @@ rcu_torture_printk(char *page) cnt += sprintf(&page[cnt], "Reader Pipe: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); - cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], "Reader Batch: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); - cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], "Free-Block Circulation: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { cnt += sprintf(&page[cnt], " %d", atomic_read(&rcu_torture_wcount[i])); } cnt += sprintf(&page[cnt], "\n"); + if (cur_ops->stats != NULL) + cnt += cur_ops->stats(&page[cnt]); return cnt; } @@ -444,11 +504,11 @@ rcu_torture_shuffle(void *arg) static inline void rcu_torture_print_module_parms(char *tag) { - printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d " + printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " "shuffle_interval = %d\n", - tag, nrealreaders, stat_interval, verbose, test_no_idle_hz, - shuffle_interval); + torture_type, tag, nrealreaders, stat_interval, verbose, + test_no_idle_hz, shuffle_interval); } static void @@ -493,6 +553,9 @@ rcu_torture_cleanup(void) rcu_barrier(); rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ + + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); if (atomic_read(&n_rcu_torture_error)) rcu_torture_print_module_parms("End of test: FAILURE"); else @@ -508,6 +571,20 @@ rcu_torture_init(void) /* Process args and tell the world that the torturer is on the job. */ + for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) { + cur_ops = torture_ops[i]; + if (strcmp(torture_type, cur_ops->name) == 0) { + break; + } + } + if (cur_ops == NULL) { + printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", + torture_type); + return (-EINVAL); + } + if (cur_ops->init != NULL) + cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + if (nreaders >= 0) nrealreaders = nreaders; else -- cgit From c32e066057fe0914da262c94e52cefb142f965b4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 27 Jun 2006 02:54:04 -0700 Subject: [PATCH] rcutorture: add call_rcu_bh() operations Add operations for the call_rcu_bh() variant of RCU. Also add an rcu_batches_completed_bh() function, which is needed by rcutorture. Signed-off-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 10 ++++++++++ kernel/rcutorture.c | 40 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 48 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 20e9710fc21..c0e1cb95dd4 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -182,6 +182,15 @@ long rcu_batches_completed(void) return rcu_ctrlblk.completed; } +/* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. 
+ */ +long rcu_batches_completed_bh(void) +{ + return rcu_bh_ctrlblk.completed; +} + static void rcu_barrier_callback(struct rcu_head *notused) { if (atomic_dec_and_test(&rcu_barrier_cpu_count)) @@ -619,6 +628,7 @@ module_param(qlowmark, int, 0); module_param(rsinterval, int, 0); #endif EXPORT_SYMBOL_GPL(rcu_batches_completed); +EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); EXPORT_SYMBOL_GPL(call_rcu); EXPORT_SYMBOL_GPL(call_rcu_bh); EXPORT_SYMBOL_GPL(synchronize_rcu); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c96b5edd6ed..4d1c3d24712 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -66,7 +66,7 @@ MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); module_param(shuffle_interval, int, 0); MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); module_param(torture_type, charp, 0); -MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu)"); +MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)"); #define TORTURE_FLAG "-torture:" #define PRINTK_STRING(s) \ @@ -246,8 +246,44 @@ static struct rcu_torture_ops rcu_ops = { .name = "rcu" }; +/* + * Definitions for rcu_bh torture testing. + */ + +static int rcu_bh_torture_read_lock(void) +{ + rcu_read_lock_bh(); + return 0; +} + +static void rcu_bh_torture_read_unlock(int idx) +{ + rcu_read_unlock_bh(); +} + +static int rcu_bh_torture_completed(void) +{ + return rcu_batches_completed_bh(); +} + +static void rcu_bh_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops rcu_bh_ops = { + .init = NULL, + .cleanup = NULL, + .readlock = rcu_bh_torture_read_lock, + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferredfree = rcu_bh_torture_deferred_free, + .stats = NULL, + .name = "rcu_bh" +}; + static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, NULL }; + { &rcu_ops, &rcu_bh_ops, NULL }; /* * RCU torture writer kthread. Repeatedly substitutes a new structure -- cgit From 9c7b216d23e820e0e148d5be01bbb5bd2d8378fe Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Tue, 27 Jun 2006 02:54:07 -0700 Subject: [PATCH] cpu hotplug: revert init patch submitted for 2.6.17 In 2.6.17, there was a problem with cpu_notifiers and XFS. I provided a band-aid solution to solve that problem. In the process, i undid all the changes you both were making to ensure that these notifiers were available only at init time (unless CONFIG_HOTPLUG_CPU is defined). We deferred the real fix to 2.6.18. Here is a set of patches that fixes the XFS problem cleanly and makes the cpu notifiers available only at init time (unless CONFIG_HOTPLUG_CPU is defined). If CONFIG_HOTPLUG_CPU is defined then cpu notifiers are available at run time. 
This patch reverts the notifier_call changes made in 2.6.17 Signed-off-by: Chandra Seetharaman Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 2 +- kernel/profile.c | 2 +- kernel/rcupdate.c | 2 +- kernel/softirq.c | 2 +- kernel/softlockup.c | 2 +- kernel/timer.c | 2 +- kernel/workqueue.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 55601b3ce60..f9f53191661 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -833,7 +833,7 @@ static void migrate_hrtimers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int hrtimer_cpu_notify(struct notifier_block *self, +static int __devinit hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; diff --git a/kernel/profile.c b/kernel/profile.c index 68afe121e50..5a730fdb1a2 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -299,7 +299,7 @@ out: } #ifdef CONFIG_HOTPLUG_CPU -static int profile_cpu_callback(struct notifier_block *info, +static int __devinit profile_cpu_callback(struct notifier_block *info, unsigned long action, void *__cpu) { int node, cpu = (unsigned long)__cpu; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c0e1cb95dd4..a8d80b7048b 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -548,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu) tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); } -static int rcu_cpu_notify(struct notifier_block *self, +static int __devinit rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; diff --git a/kernel/softirq.c b/kernel/softirq.c index 9e2f1c6e73d..db65a311f14 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -446,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int cpu_callback(struct notifier_block *nfb, +static int __devinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { diff --git a/kernel/softlockup.c b/kernel/softlockup.c index b5c3b94e01c..29da0a847ba 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu) /* * Create/destroy watchdog threads as CPUs come and go: */ -static int +static int __devinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; diff --git a/kernel/timer.c b/kernel/timer.c index 5bb6b7976ee..878194ec8bd 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1652,7 +1652,7 @@ static void __devinit migrate_timers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int timer_cpu_notify(struct notifier_block *self, +static int __devinit timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 565cf7a1feb..59f0b42bd89 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -559,7 +559,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) } /* We're holding the cpucontrol mutex here */ -static int workqueue_cpu_callback(struct notifier_block *nfb, +static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { -- cgit From 054cc8a2d808822dadf488a61729e3e550f114c4 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Tue, 27 Jun 2006 02:54:07 -0700 Subject: [PATCH] cpu hotplug: revert initdata patch submitted for 
2.6.17 This patch reverts notifier_block changes made in 2.6.17 Signed-off-by: Chandra Seetharaman Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 2 +- kernel/rcupdate.c | 2 +- kernel/sched.c | 2 +- kernel/softirq.c | 2 +- kernel/softlockup.c | 2 +- kernel/timer.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f9f53191661..8d3dc29ef41 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -857,7 +857,7 @@ static int __devinit hrtimer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block hrtimers_nb = { +static struct notifier_block __devinitdata hrtimers_nb = { .notifier_call = hrtimer_cpu_notify, }; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a8d80b7048b..f464f5ae3f1 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -565,7 +565,7 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block rcu_nb = { +static struct notifier_block __devinitdata rcu_nb = { .notifier_call = rcu_cpu_notify, }; diff --git a/kernel/sched.c b/kernel/sched.c index a856040c200..b8deddb7833 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4805,7 +4805,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, /* Register at highest priority so that task migration (migrate_all_tasks) * happens before everything else. */ -static struct notifier_block migration_notifier = { +static struct notifier_block __devinitdata migration_notifier = { .notifier_call = migration_call, .priority = 10 }; diff --git a/kernel/softirq.c b/kernel/softirq.c index db65a311f14..8f03e3b89b5 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -486,7 +486,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block cpu_nfb = { +static struct notifier_block __devinitdata cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 29da0a847ba..6b76caa2298 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block cpu_nfb = { +static struct notifier_block __devinitdata cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/timer.c b/kernel/timer.c index 878194ec8bd..5a896025306 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1672,7 +1672,7 @@ static int __devinit timer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block timers_nb = { +static struct notifier_block __devinitdata timers_nb = { .notifier_call = timer_cpu_notify, }; -- cgit From 65edc68c345cbe21d0b0375c3452a3ed5e322868 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Tue, 27 Jun 2006 02:54:08 -0700 Subject: [PATCH] cpu hotplug: make [un]register_cpu_notifier init time only CPUs come online only at init time (unless CONFIG_HOTPLUG_CPU is defined). So, cpu_notifier functionality need to be available only at init time. This patch makes register_cpu_notifier() available only at init time, unless CONFIG_HOTPLUG_CPU is defined. This patch exports register_cpu_notifier() and unregister_cpu_notifier() only if CONFIG_HOTPLUG_CPU is defined. 
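For illustration only, a hypothetical built-in consumer of the notifier chain (not part of this patch) would now look roughly as follows; with CONFIG_HOTPLUG_CPU=n the callback and notifier_block can be discarded after boot:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/cpu.h>

static int __cpuinit my_cpu_callback(struct notifier_block *nfb,
				     unsigned long action, void *hcpu)
{
	switch (action) {
	case CPU_UP_PREPARE:
		/* set up per-cpu state for cpu (long)hcpu */
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
		/* tear down per-cpu state for cpu (long)hcpu */
		break;
#endif
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata my_cpu_nb = {
	.notifier_call = my_cpu_callback,
};

static int __init my_cpu_hooks_init(void)
{
	/* init-time (or CONFIG_HOTPLUG_CPU) callers only */
	return register_cpu_notifier(&my_cpu_nb);
}
module_init(my_cpu_hooks_init);
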
Signed-off-by: Chandra Seetharaman Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 03dcd981846..70fbf2e8376 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -18,7 +18,7 @@ /* This protects CPUs going up and down... */ static DEFINE_MUTEX(cpucontrol); -static BLOCKING_NOTIFIER_HEAD(cpu_chain); +static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); #ifdef CONFIG_HOTPLUG_CPU static struct task_struct *lock_cpu_hotplug_owner; @@ -69,10 +69,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); #endif /* CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? */ -int register_cpu_notifier(struct notifier_block *nb) +int __cpuinit register_cpu_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&cpu_chain, nb); } + +#ifdef CONFIG_HOTPLUG_CPU + EXPORT_SYMBOL(register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) @@ -81,7 +84,6 @@ void unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); -#ifdef CONFIG_HOTPLUG_CPU static inline void check_for_tasks(int cpu) { struct task_struct *p; -- cgit From 26c2143b63b8078d08d562733716de142927e17a Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Tue, 27 Jun 2006 02:54:10 -0700 Subject: [PATCH] cpu hotplug: make cpu_notifier related notifier calls __cpuinit only Make notifier_calls associated with cpu_notifier as __cpuinit. __cpuinit makes sure that the function is init time only unless CONFIG_HOTPLUG_CPU is defined. [akpm@osdl.org: section fix] Signed-off-by: Chandra Seetharaman Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b8deddb7833..3e57712aefd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4734,8 +4734,9 @@ static void migrate_dead_tasks(unsigned int dead_cpu) * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. */ -static int migration_call(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int __cpuinit migration_call(struct notifier_block *nfb, + unsigned long action, + void *hcpu) { int cpu = (long)hcpu; struct task_struct *p; @@ -4805,7 +4806,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, /* Register at highest priority so that task migration (migrate_all_tasks) * happens before everything else. */ -static struct notifier_block __devinitdata migration_notifier = { +static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, .priority = 10 }; -- cgit From c96d145e71c5c84601322d85748512e09d7b325f Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Tue, 27 Jun 2006 02:54:28 -0700 Subject: [PATCH] sched: fix smt nice lock contention and optimization Initial report and lock contention fix from Chris Mason: Recent benchmarks showed some performance regressions between 2.6.16 and 2.6.5. We tracked down one of the regressions to lock contention in schedule heavy workloads (~70,000 context switches per second) kernel/sched.c:dependent_sleeper() was responsible for most of the lock contention, hammering on the run queue locks. 
The patch below is more of a discussion point than a suggested fix (although it does reduce lock contention significantly). The dependent_sleeper code looks very expensive to me, especially for using a spinlock to bounce control between two different siblings in the same cpu. It is further optimized: * perform dependent_sleeper check after next task is determined * convert wake_sleeping_dependent to use trylock * skip smt runqueue check if trylock fails * optimize double_rq_lock now that smt nice is converted to trylock * early exit in searching first SD_SHARE_CPUPOWER domain * speedup fast path of dependent_sleeper [akpm@osdl.org: cleanup] Signed-off-by: Ken Chen Acked-by: Ingo Molnar Acked-by: Con Kolivas Signed-off-by: Nick Piggin Acked-by: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 182 +++++++++++++++++++-------------------------------------- 1 file changed, 59 insertions(+), 123 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3e57712aefd..50a67edc358 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -239,7 +239,6 @@ struct runqueue { task_t *migration_thread; struct list_head migration_queue; - int cpu; #endif #ifdef CONFIG_SCHEDSTATS @@ -1074,9 +1073,10 @@ static int sched_balance_self(int cpu, int flag) struct task_struct *t = current; struct sched_domain *tmp, *sd = NULL; - for_each_domain(cpu, tmp) + for_each_domain(cpu, tmp) { if (tmp->flags & flag) sd = tmp; + } while (sd) { cpumask_t span; @@ -1691,9 +1691,6 @@ unsigned long nr_active(void) /* * double_rq_lock - safely lock two runqueues * - * We must take them in cpu order to match code in - * dependent_sleeper and wake_dependent_sleeper. - * * Note this does not disable interrupts like task_rq_lock, * you need to do so manually before calling. */ @@ -1705,7 +1702,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) spin_lock(&rq1->lock); __acquire(rq2->lock); /* Fake it out ;) */ } else { - if (rq1->cpu < rq2->cpu) { + if (rq1 < rq2) { spin_lock(&rq1->lock); spin_lock(&rq2->lock); } else { @@ -1741,7 +1738,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) __acquires(this_rq->lock) { if (unlikely(!spin_trylock(&busiest->lock))) { - if (busiest->cpu < this_rq->cpu) { + if (busiest < this_rq) { spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); spin_lock(&this_rq->lock); @@ -2352,10 +2349,11 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) double_lock_balance(busiest_rq, target_rq); /* Search for an sd spanning us and the target CPU. */ - for_each_domain(target_cpu, sd) + for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && cpu_isset(busiest_cpu, sd->span)) break; + } if (unlikely(sd == NULL)) goto out; @@ -2691,48 +2689,35 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq) resched_task(rq->idle); } -static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +/* + * Called with interrupt disabled and this_rq's runqueue locked. + */ +static void wake_sleeping_dependent(int this_cpu) { struct sched_domain *tmp, *sd = NULL; - cpumask_t sibling_map; int i; - for_each_domain(this_cpu, tmp) - if (tmp->flags & SD_SHARE_CPUPOWER) + for_each_domain(this_cpu, tmp) { + if (tmp->flags & SD_SHARE_CPUPOWER) { sd = tmp; + break; + } + } if (!sd) return; - /* - * Unlock the current runqueue because we have to lock in - * CPU order to avoid deadlocks. Caller knows that we might - * unlock. We keep IRQs disabled. 
- */ - spin_unlock(&this_rq->lock); - - sibling_map = sd->span; - - for_each_cpu_mask(i, sibling_map) - spin_lock(&cpu_rq(i)->lock); - /* - * We clear this CPU from the mask. This both simplifies the - * inner loop and keps this_rq locked when we exit: - */ - cpu_clear(this_cpu, sibling_map); - - for_each_cpu_mask(i, sibling_map) { + for_each_cpu_mask(i, sd->span) { runqueue_t *smt_rq = cpu_rq(i); + if (i == this_cpu) + continue; + if (unlikely(!spin_trylock(&smt_rq->lock))) + continue; + wakeup_busy_runqueue(smt_rq); + spin_unlock(&smt_rq->lock); } - - for_each_cpu_mask(i, sibling_map) - spin_unlock(&cpu_rq(i)->lock); - /* - * We exit with this_cpu's rq still held and IRQs - * still disabled: - */ } /* @@ -2745,52 +2730,46 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) return p->time_slice * (100 - sd->per_cpu_gain) / 100; } -static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +/* + * To minimise lock contention and not have to drop this_rq's runlock we only + * trylock the sibling runqueues and bypass those runqueues if we fail to + * acquire their lock. As we only trylock the normal locking order does not + * need to be obeyed. + */ +static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p) { struct sched_domain *tmp, *sd = NULL; - cpumask_t sibling_map; - prio_array_t *array; int ret = 0, i; - task_t *p; - for_each_domain(this_cpu, tmp) - if (tmp->flags & SD_SHARE_CPUPOWER) + /* kernel/rt threads do not participate in dependent sleeping */ + if (!p->mm || rt_task(p)) + return 0; + + for_each_domain(this_cpu, tmp) { + if (tmp->flags & SD_SHARE_CPUPOWER) { sd = tmp; + break; + } + } if (!sd) return 0; - /* - * The same locking rules and details apply as for - * wake_sleeping_dependent(): - */ - spin_unlock(&this_rq->lock); - sibling_map = sd->span; - for_each_cpu_mask(i, sibling_map) - spin_lock(&cpu_rq(i)->lock); - cpu_clear(this_cpu, sibling_map); + for_each_cpu_mask(i, sd->span) { + runqueue_t *smt_rq; + task_t *smt_curr; - /* - * Establish next task to be run - it might have gone away because - * we released the runqueue lock above: - */ - if (!this_rq->nr_running) - goto out_unlock; - array = this_rq->active; - if (!array->nr_active) - array = this_rq->expired; - BUG_ON(!array->nr_active); + if (i == this_cpu) + continue; - p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, - task_t, run_list); + smt_rq = cpu_rq(i); + if (unlikely(!spin_trylock(&smt_rq->lock))) + continue; - for_each_cpu_mask(i, sibling_map) { - runqueue_t *smt_rq = cpu_rq(i); - task_t *smt_curr = smt_rq->curr; + smt_curr = smt_rq->curr; - /* Kernel threads do not participate in dependent sleeping */ - if (!p->mm || !smt_curr->mm || rt_task(p)) - goto check_smt_task; + if (!smt_curr->mm) + goto unlock; /* * If a user task with lower static priority than the @@ -2808,49 +2787,24 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) if ((jiffies % DEF_TIMESLICE) > (sd->per_cpu_gain * DEF_TIMESLICE / 100)) ret = 1; - } else + } else { if (smt_curr->static_prio < p->static_prio && !TASK_PREEMPTS_CURR(p, smt_rq) && smt_slice(smt_curr, sd) > task_timeslice(p)) ret = 1; - -check_smt_task: - if ((!smt_curr->mm && smt_curr != smt_rq->idle) || - rt_task(smt_curr)) - continue; - if (!p->mm) { - wakeup_busy_runqueue(smt_rq); - continue; - } - - /* - * Reschedule a lower priority task on the SMT sibling for - * it to be put to sleep, or wake it up if it has been put to - * sleep for priority reasons to see if it should run now. 
- */ - if (rt_task(p)) { - if ((jiffies % DEF_TIMESLICE) > - (sd->per_cpu_gain * DEF_TIMESLICE / 100)) - resched_task(smt_curr); - } else { - if (TASK_PREEMPTS_CURR(p, smt_rq) && - smt_slice(p, sd) > task_timeslice(smt_curr)) - resched_task(smt_curr); - else - wakeup_busy_runqueue(smt_rq); } +unlock: + spin_unlock(&smt_rq->lock); } -out_unlock: - for_each_cpu_mask(i, sibling_map) - spin_unlock(&cpu_rq(i)->lock); return ret; } #else -static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +static inline void wake_sleeping_dependent(int this_cpu) { } -static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq, + task_t *p) { return 0; } @@ -2972,32 +2926,13 @@ need_resched_nonpreemptible: cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { -go_idle: idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; rq->expired_timestamp = 0; - wake_sleeping_dependent(cpu, rq); - /* - * wake_sleeping_dependent() might have released - * the runqueue, so break out if we got new - * tasks meanwhile: - */ - if (!rq->nr_running) - goto switch_tasks; - } - } else { - if (dependent_sleeper(cpu, rq)) { - next = rq->idle; + wake_sleeping_dependent(cpu); goto switch_tasks; } - /* - * dependent_sleeper() releases and reacquires the runqueue - * lock, hence go into the idle loop if the rq went - * empty meanwhile: - */ - if (unlikely(!rq->nr_running)) - goto go_idle; } array = rq->active; @@ -3035,6 +2970,8 @@ go_idle: } } next->sleep_type = SLEEP_NORMAL; + if (dependent_sleeper(cpu, rq, next)) + next = rq->idle; switch_tasks: if (next == rq->idle) schedstat_inc(rq, sched_goidle); @@ -6144,7 +6081,6 @@ void __init sched_init(void) rq->push_cpu = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); - rq->cpu = i; #endif atomic_set(&rq->nr_iowait, 0); @@ -6205,7 +6141,7 @@ void normalize_rt_tasks(void) runqueue_t *rq; read_lock_irq(&tasklist_lock); - for_each_process (p) { + for_each_process(p) { if (!rt_task(p)) continue; -- cgit From d444886e149a8413159da8b43ada2bc287e69b0c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 Jun 2006 02:54:29 -0700 Subject: [PATCH] sched: simplify bitmap definition Signed-off-by: Steven Rostedt Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 50a67edc358..582faed3d36 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -184,13 +184,11 @@ static unsigned int task_timeslice(task_t *p) * These are the runqueue data structures: */ -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) - typedef struct runqueue runqueue_t; struct prio_array { unsigned int nr_active; - unsigned long bitmap[BITMAP_SIZE]; + DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ struct list_head queue[MAX_PRIO]; }; -- cgit From 72d2854d4ebf5faf086ebccb2ed575c6b68a05dc Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 27 Jun 2006 02:54:30 -0700 Subject: [PATCH] sched: fix interactive ceiling code The relationship between INTERACTIVE_SLEEP and the ceiling is not perfect and not explicit enough. The sleep boost is not supposed to be any larger than without this code and the comment is not clear enough about what exactly it does, just the reason it does it. Fix it. 
There is a ceiling to the priority beyond which tasks that only ever sleep for very long periods cannot surpass. Fix it. Prevent the on-runqueue bonus logic from defeating the idle sleep logic. Opportunity to micro-optimise. Signed-off-by: Con Kolivas Signed-off-by: Mike Galbraith Acked-by: Ingo Molnar Signed-off-by: Ken Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 52 +++++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 582faed3d36..cb146219d53 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -686,33 +686,35 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) static int recalc_task_prio(task_t *p, unsigned long long now) { /* Caller must always ensure 'now >= p->timestamp' */ - unsigned long long __sleep_time = now - p->timestamp; - unsigned long sleep_time; + unsigned long sleep_time = now - p->timestamp; if (batch_task(p)) sleep_time = 0; - else { - if (__sleep_time > NS_MAX_SLEEP_AVG) - sleep_time = NS_MAX_SLEEP_AVG; - else - sleep_time = (unsigned long)__sleep_time; - } if (likely(sleep_time > 0)) { /* - * User tasks that sleep a long time are categorised as - * idle. They will only have their sleep_avg increased to a - * level that makes them just interactive priority to stay - * active yet prevent them suddenly becoming cpu hogs and - * starving other processes. + * This ceiling is set to the lowest priority that would allow + * a task to be reinserted into the active array on timeslice + * completion. */ - if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { - unsigned long ceiling; + unsigned long ceiling = INTERACTIVE_SLEEP(p); - ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - - DEF_TIMESLICE); - if (p->sleep_avg < ceiling) - p->sleep_avg = ceiling; + if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { + /* + * Prevents user tasks from achieving best priority + * with one single large enough sleep. + */ + p->sleep_avg = ceiling; + /* + * Using INTERACTIVE_SLEEP() as a ceiling places a + * nice(0) task 1ms sleep away from promotion, and + * gives it 700ms to round-robin with no chance of + * being demoted. This is more than generous, so + * mark this sleep as non-interactive to prevent the + * on-runqueue bonus logic from intervening should + * this task not receive cpu immediately. + */ + p->sleep_type = SLEEP_NONINTERACTIVE; } else { /* * Tasks waking from uninterruptible sleep are @@ -720,12 +722,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now) * are likely to be waiting on I/O */ if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) + if (p->sleep_avg >= ceiling) sleep_time = 0; else if (p->sleep_avg + sleep_time >= - INTERACTIVE_SLEEP(p)) { - p->sleep_avg = INTERACTIVE_SLEEP(p); - sleep_time = 0; + ceiling) { + p->sleep_avg = ceiling; + sleep_time = 0; } } @@ -739,9 +741,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now) */ p->sleep_avg += sleep_time; - if (p->sleep_avg > NS_MAX_SLEEP_AVG) - p->sleep_avg = NS_MAX_SLEEP_AVG; } + if (p->sleep_avg > NS_MAX_SLEEP_AVG) + p->sleep_avg = NS_MAX_SLEEP_AVG; } return effective_prio(p); -- cgit From cc94abfcbc9fed0048365ce1fb8dc81353408bf8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 Jun 2006 02:54:31 -0700 Subject: [PATCH] unnecessary long index i in sched Unless we expect to have more than 2G CPUs, there's no reason to have 'i' as a long long here. 
Signed-off-by: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index cb146219d53..235c421631d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1653,7 +1653,8 @@ unsigned long nr_uninterruptible(void) unsigned long long nr_context_switches(void) { - unsigned long long i, sum = 0; + int i; + unsigned long long sum = 0; for_each_possible_cpu(i) sum += cpu_rq(i)->nr_switches; -- cgit From efc30814a88bdbe2bfe4ac94de2eb089ad80bee3 Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Tue, 27 Jun 2006 02:54:32 -0700 Subject: [PATCH] sched: CPU hotplug race vs. set_cpus_allowed() There is a race between set_cpus_allowed() and move_task_off_dead_cpu(). __migrate_task() doesn't report any err code, so task can be left on its runqueue if its cpus_allowed mask changed so that dest_cpu is not longer a possible target. Also, chaning cpus_allowed mask requires rq->lock being held. Signed-off-by: Kirill Korotaev Acked-By: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 235c421631d..678335a8b39 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4412,13 +4412,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); * * So we race with normal scheduler movements, but that's OK, as long * as the task is no longer on this CPU. + * + * Returns non-zero if task was successfully migrated. */ -static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { runqueue_t *rq_dest, *rq_src; + int ret = 0; if (unlikely(cpu_is_offline(dest_cpu))) - return; + return ret; rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); @@ -4446,9 +4449,10 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); } - + ret = 1; out: double_rq_unlock(rq_src, rq_dest); + return ret; } /* @@ -4518,9 +4522,12 @@ wait_to_die: /* Figure out where task on dead CPU should go, use force if neccessary. */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) { + runqueue_t *rq; + unsigned long flags; int dest_cpu; cpumask_t mask; +restart: /* On same node? */ mask = node_to_cpumask(cpu_to_node(dead_cpu)); cpus_and(mask, mask, tsk->cpus_allowed); @@ -4532,8 +4539,10 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) /* No more Mr. Nice Guy. */ if (dest_cpu == NR_CPUS) { + rq = task_rq_lock(tsk, &flags); cpus_setall(tsk->cpus_allowed); dest_cpu = any_online_cpu(tsk->cpus_allowed); + task_rq_unlock(rq, &flags); /* * Don't tell them about moving exiting tasks or @@ -4545,7 +4554,8 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) "longer affine to cpu%d\n", tsk->pid, tsk->comm, dead_cpu); } - __migrate_task(tsk, dead_cpu, dest_cpu); + if (!__migrate_task(tsk, dead_cpu, dest_cpu)) + goto restart; } /* -- cgit From 2dd73a4f09beacadde827a032cf15fd8b1fa3d48 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Tue, 27 Jun 2006 02:54:34 -0700 Subject: [PATCH] sched: implement smpnice Problem: The introduction of separate run queues per CPU has brought with it "nice" enforcement problems that are best described by a simple example. 
For the sake of argument suppose that on a single CPU machine with a nice==19 hard spinner and a nice==0 hard spinner running that the nice==0 task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and 2 nice==0 hard spinners running. The user of this system would be entitled to expect that the nice==0 tasks each get 95% of a CPU and the nice==19 tasks only get 5% each. However, whether this expectation is met is pretty much down to luck as there are four equally likely distributions of the tasks to the CPUs that the load balancing code will consider to be balanced with loads of 2.0 for each CPU. Two of these distributions involve one nice==0 and one nice==19 task per CPU and in these circumstances the users expectations will be met. The other two distributions both involve both nice==0 tasks being on one CPU and both nice==19 being on the other CPU and each task will get 50% of a CPU and the user's expectations will not be met. Solution: The solution to this problem that is implemented in the attached patch is to use weighted loads when determining if the system is balanced and, when an imbalance is detected, to move an amount of weighted load between run queues (as opposed to a number of tasks) to restore the balance. Once again, the easiest way to explain why both of these measures are necessary is to use a simple example. Suppose that (in a slight variation of the above example) that we have a two CPU system with 4 nice==0 and 4 nice=19 hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and the 4 nice==19 tasks are on the other CPU. The weighted loads for the two CPUs would be 4.0 and 0.2 respectively and the load balancing code would move 2 tasks resulting in one CPU with a load of 2.0 and the other with load of 2.2. If this was considered to be a big enough imbalance to justify moving a task and that task was moved using the current move_tasks() then it would move the highest priority task that it found and this would result in one CPU with a load of 3.0 and the other with a load of 1.2 which would result in the movement of a task in the opposite direction and so on -- infinite loop. If, on the other hand, an amount of load to be moved is calculated from the imbalance (in this case 0.1) and move_tasks() skips tasks until it find ones whose contributions to the weighted load are less than this amount it would move two of the nice==19 tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with loads of 2.1 for each CPU. One of the advantages of this mechanism is that on a system where all tasks have nice==0 the load balancing calculations would be mathematically identical to the current load balancing code. Notes: struct task_struct: has a new field load_weight which (in a trade off of space for speed) stores the contribution that this task makes to a CPU's weighted load when it is runnable. struct runqueue: has a new field raw_weighted_load which is the sum of the load_weight values for the currently runnable tasks on this run queue. This field always needs to be updated when nr_running is updated so two new inline functions inc_nr_running() and dec_nr_running() have been created to make sure that this happens. This also offers a convenient way to optimize away this part of the smpnice mechanism when CONFIG_SMP is not defined. 
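To make the arithmetic concrete, here is a minimal stand-alone sketch of the weight calculation, assuming HZ=1000 and the 2.6.17-era scheduler constants; the macros are transcribed for illustration, not quoted from kernel headers:

#include <stdio.h>

#define HZ                  1000
#define DEF_TIMESLICE       (100 * HZ / 1000)   /* 100 ms */
#define MIN_TIMESLICE       (5 * HZ / 1000)
#define MAX_RT_PRIO         100
#define MAX_PRIO            (MAX_RT_PRIO + 40)
#define MAX_USER_PRIO       40
#define NICE_TO_PRIO(nice)  (MAX_RT_PRIO + (nice) + 20)
#define SCHED_LOAD_SCALE    128UL

static unsigned int scale_prio(unsigned int x, int prio)
{
	unsigned int ts = x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2);
	return ts > MIN_TIMESLICE ? ts : MIN_TIMESLICE;
}

static unsigned int static_prio_timeslice(int static_prio)
{
	if (static_prio < NICE_TO_PRIO(0))
		return scale_prio(DEF_TIMESLICE * 4, static_prio);
	return scale_prio(DEF_TIMESLICE, static_prio);
}

static unsigned long load_weight(int nice)
{
	return static_prio_timeslice(NICE_TO_PRIO(nice)) *
	       SCHED_LOAD_SCALE / DEF_TIMESLICE;
}

int main(void)
{
	/* the 2-CPU example above: 4 nice==0 tasks vs 4 nice==19 tasks */
	printf("nice  0 load_weight = %lu (1.00 * SCHED_LOAD_SCALE)\n",
	       load_weight(0));
	printf("nice 19 load_weight = %lu (~0.05 * SCHED_LOAD_SCALE)\n",
	       load_weight(19));
	printf("cpu0 raw_weighted_load = %lu (~4.0)\n", 4 * load_weight(0));
	printf("cpu1 raw_weighted_load = %lu (~0.2)\n", 4 * load_weight(19));
	return 0;
}
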
int try_to_wake_up(): in this function the value SCHED_LOAD_BALANCE is used to represent the load contribution of a single task in various calculations in the code that decides which CPU to put the waking task on. While this would be a valid on a system where the nice values for the runnable tasks were distributed evenly around zero it will lead to anomalous load balancing if the distribution is skewed in either direction. To overcome this problem SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task or by the average load_weight per task for the queue in question (as appropriate). int move_tasks(): The modifications to this function were complicated by the fact that active_load_balance() uses it to move exactly one task without checking whether an imbalance actually exists. This precluded the simple overloading of max_nr_move with max_load_move and necessitated the addition of the latter as an extra argument to the function. The internal implementation is then modified to move up to max_nr_move tasks and max_load_move of weighted load. This slightly complicates the code where move_tasks() is called and if ever active_load_balance() is changed to not use move_tasks() the implementation of move_tasks() should be simplified accordingly. struct sched_group *find_busiest_group(): Similar to try_to_wake_up(), there are places in this function where SCHED_LOAD_SCALE is used to represent the load contribution of a single task and the same issues are created. A similar solution is adopted except that it is now the average per task contribution to a group's load (as opposed to a run queue) that is required. As this value is not directly available from the group it is calculated on the fly as the queues in the groups are visited when determining the busiest group. A key change to this function is that it is no longer to scale down *imbalance on exit as move_tasks() uses the load in its scaled form. void set_user_nice(): has been modified to update the task's load_weight field when it's nice value and also to ensure that its run queue's raw_weighted_load field is updated if it was runnable. From: "Siddha, Suresh B" With smpnice, sched groups with highest priority tasks can mask the imbalance between the other sched groups with in the same domain. This patch fixes some of the listed down scenarios by not considering the sched groups which are lightly loaded. a) on a simple 4-way MP system, if we have one high priority and 4 normal priority tasks, with smpnice we would like to see the high priority task scheduled on one cpu, two other cpus getting one normal task each and the fourth cpu getting the remaining two normal tasks. but with current smpnice extra normal priority task keeps jumping from one cpu to another cpu having the normal priority task. This is because of the busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the cpu with high priority task in max_load calculations but including that in total and avg_load calcuations.. leading to max_load < avg_load and load balance between cpus running normal priority tasks(2 Vs 1) will always show imbalanace as one normal priority and the extra normal priority task will keep moving from one cpu to another cpu having normal priority task.. b) 4-way system with HT (8 logical processors). Package-P0 T0 has a highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal priority task each.. P2 and P3 are idle. With this patch, one of the normal priority tasks on P1 will be moved to P2 or P3.. 
c) With the current weighted smp nice calculations, it doesn't always make sense to look at the highest weighted runqueue in the busy group.. Consider a load balance scenario on a DP with HT system, with Package-0 containing one high priority and one low priority, Package-1 containing one low priority(with other thread being idle).. Package-1 thinks that it need to take the low priority thread from Package-0. And find_busiest_queue() returns the cpu thread with highest priority task.. And ultimately(with help of active load balance) we move high priority task to Package-1. And same continues with Package-0 now, moving high priority task from package-1 to package-0.. Even without the presence of active load balance, load balance will fail to balance the above scenario.. Fix find_busiest_queue to use "imbalance" when it is lightly loaded. [kernel@kolivas.org: sched: store weighted load on up] [kernel@kolivas.org: sched: add discrete weighted cpu load function] [suresh.b.siddha@intel.com: sched: remove dead code] Signed-off-by: Peter Williams Cc: "Siddha, Suresh B" Cc: "Chen, Kenneth W" Acked-by: Ingo Molnar Cc: Nick Piggin Signed-off-by: Con Kolivas Cc: John Hawkes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 284 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 219 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 678335a8b39..1847a4456a2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -168,15 +168,21 @@ */ #define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) + max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) -static unsigned int task_timeslice(task_t *p) +static unsigned int static_prio_timeslice(int static_prio) { - if (p->static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); + if (static_prio < NICE_TO_PRIO(0)) + return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); else - return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); + return SCALE_PRIO(DEF_TIMESLICE, static_prio); } + +static inline unsigned int task_timeslice(task_t *p) +{ + return static_prio_timeslice(p->static_prio); +} + #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ < (long long) (sd)->cache_hot_time) @@ -207,6 +213,7 @@ struct runqueue { * remote CPUs use both these fields when doing load calculation. */ unsigned long nr_running; + unsigned long raw_weighted_load; #ifdef CONFIG_SMP unsigned long cpu_load[3]; #endif @@ -661,6 +668,68 @@ static int effective_prio(task_t *p) return prio; } +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. 
+ */ + +/* + * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE + * If static_prio_timeslice() is ever changed to break this assumption then + * this code will need modification + */ +#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE +#define LOAD_WEIGHT(lp) \ + (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) +#define PRIO_TO_LOAD_WEIGHT(prio) \ + LOAD_WEIGHT(static_prio_timeslice(prio)) +#define RTPRIO_TO_LOAD_WEIGHT(rp) \ + (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) + +static void set_load_weight(task_t *p) +{ + if (rt_task(p)) { +#ifdef CONFIG_SMP + if (p == task_rq(p)->migration_thread) + /* + * The migration thread does the actual balancing. + * Giving its load any weight will skew balancing + * adversely. + */ + p->load_weight = 0; + else +#endif + p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); + } else + p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); +} + +static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p) +{ + rq->raw_weighted_load += p->load_weight; +} + +static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p) +{ + rq->raw_weighted_load -= p->load_weight; +} + +static inline void inc_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running++; + inc_raw_weighted_load(rq, p); +} + +static inline void dec_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running--; + dec_raw_weighted_load(rq, p); +} + /* * __activate_task - move a task to the runqueue. */ @@ -671,7 +740,7 @@ static void __activate_task(task_t *p, runqueue_t *rq) if (batch_task(p)) target = rq->expired; enqueue_task(p, target); - rq->nr_running++; + inc_nr_running(p, rq); } /* @@ -680,7 +749,7 @@ static void __activate_task(task_t *p, runqueue_t *rq) static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { enqueue_task_head(p, rq->active); - rq->nr_running++; + inc_nr_running(p, rq); } static int recalc_task_prio(task_t *p, unsigned long long now) @@ -804,7 +873,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) */ static void deactivate_task(struct task_struct *p, runqueue_t *rq) { - rq->nr_running--; + dec_nr_running(p, rq); dequeue_task(p, p->array); p->array = NULL; } @@ -859,6 +928,12 @@ inline int task_curr(const task_t *p) return cpu_curr(task_cpu(p)) == p; } +/* Used instead of source_load when we know the type == 0 */ +unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->raw_weighted_load; +} + #ifdef CONFIG_SMP typedef struct { struct list_head list; @@ -948,7 +1023,8 @@ void kick_process(task_t *p) } /* - * Return a low guess at the load of a migration-source cpu. + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. * * We want to under-estimate the load of migration sources, to * balance conservatively. @@ -956,24 +1032,36 @@ void kick_process(task_t *p) static inline unsigned long source_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); - unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) - return load_now; + return rq->raw_weighted_load; - return min(rq->cpu_load[type-1], load_now); + return min(rq->cpu_load[type-1], rq->raw_weighted_load); } /* - * Return a high guess at the load of a migration-target cpu + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value. 
*/ static inline unsigned long target_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); - unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) - return load_now; + return rq->raw_weighted_load; - return max(rq->cpu_load[type-1], load_now); + return max(rq->cpu_load[type-1], rq->raw_weighted_load); +} + +/* + * Return the average load per task on the cpu's run queue + */ +static inline unsigned long cpu_avg_load_per_task(int cpu) +{ + runqueue_t *rq = cpu_rq(cpu); + unsigned long n = rq->nr_running; + + return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; } /* @@ -1046,7 +1134,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) cpus_and(tmp, group->cpumask, p->cpus_allowed); for_each_cpu_mask(i, tmp) { - load = source_load(i, 0); + load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { min_load = load; @@ -1226,17 +1314,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) if (this_sd->flags & SD_WAKE_AFFINE) { unsigned long tl = this_load; + unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); + /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load * of the current CPU: */ if (sync) - tl -= SCHED_LOAD_SCALE; + tl -= current->load_weight; if ((tl <= load && - tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || - 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { + tl + target_load(cpu, idx) <= tl_per_task) || + 100*(tl + p->load_weight) <= imbalance*load) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1435,7 +1525,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) list_add_tail(&p->run_list, ¤t->run_list); p->array = current->array; p->array->nr_active++; - rq->nr_running++; + inc_nr_running(p, rq); } set_need_resched(); } else @@ -1802,9 +1892,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) { dequeue_task(p, src_array); - src_rq->nr_running--; + dec_nr_running(p, src_rq); set_task_cpu(p, this_cpu); - this_rq->nr_running++; + inc_nr_running(p, this_rq); enqueue_task(p, this_array); p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; @@ -1852,24 +1942,27 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, } /* - * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, - * as part of a balancing operation within "domain". Returns the number of - * tasks moved. + * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted + * load from busiest to this_rq, as part of a balancing operation within + * "domain". Returns the number of tasks moved. * * Called with both runqueues locked. 
*/ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle, int *all_pinned) + unsigned long max_nr_move, unsigned long max_load_move, + struct sched_domain *sd, enum idle_type idle, + int *all_pinned) { prio_array_t *array, *dst_array; struct list_head *head, *curr; int idx, pulled = 0, pinned = 0; + long rem_load_move; task_t *tmp; - if (max_nr_move == 0) + if (max_nr_move == 0 || max_load_move == 0) goto out; + rem_load_move = max_load_move; pinned = 1; /* @@ -1910,7 +2003,8 @@ skip_queue: curr = curr->prev; - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { + if (tmp->load_weight > rem_load_move || + !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { if (curr != head) goto skip_queue; idx++; @@ -1924,9 +2018,13 @@ skip_queue: pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); pulled++; + rem_load_move -= tmp->load_weight; - /* We only want to steal up to the prescribed number of tasks. */ - if (pulled < max_nr_move) { + /* + * We only want to steal up to the prescribed number of tasks + * and the prescribed amount of weighted load. + */ + if (pulled < max_nr_move && rem_load_move > 0) { if (curr != head) goto skip_queue; idx++; @@ -1947,7 +2045,7 @@ out: /* * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the number of tasks which should be + * domain. It calculates and returns the amount of weighted load which should be * moved to restore balance via the imbalance parameter. */ static struct sched_group * @@ -1957,9 +2055,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; unsigned long max_pull; + unsigned long busiest_load_per_task, busiest_nr_running; + unsigned long this_load_per_task, this_nr_running; int load_idx; max_load = this_load = total_load = total_pwr = 0; + busiest_load_per_task = busiest_nr_running = 0; + this_load_per_task = this_nr_running = 0; if (idle == NOT_IDLE) load_idx = sd->busy_idx; else if (idle == NEWLY_IDLE) @@ -1971,13 +2073,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long load; int local_group; int i; + unsigned long sum_nr_running, sum_weighted_load; local_group = cpu_isset(this_cpu, group->cpumask); /* Tally up the load of all CPUs in the group */ - avg_load = 0; + sum_weighted_load = sum_nr_running = avg_load = 0; for_each_cpu_mask(i, group->cpumask) { + runqueue_t *rq = cpu_rq(i); + if (*sd_idle && !idle_cpu(i)) *sd_idle = 0; @@ -1988,6 +2093,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, load = source_load(i, load_idx); avg_load += load; + sum_nr_running += rq->nr_running; + sum_weighted_load += rq->raw_weighted_load; } total_load += avg_load; @@ -1999,14 +2106,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (local_group) { this_load = avg_load; this = group; - } else if (avg_load > max_load) { + this_nr_running = sum_nr_running; + this_load_per_task = sum_weighted_load; + } else if (avg_load > max_load && + sum_nr_running > group->cpu_power / SCHED_LOAD_SCALE) { max_load = avg_load; busiest = group; + busiest_nr_running = sum_nr_running; + busiest_load_per_task = sum_weighted_load; } group = group->next; } while (group != sd->groups); - if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) + if (!busiest || this_load >= 
max_load || busiest_nr_running == 0) goto out_balanced; avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; @@ -2015,6 +2127,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, 100*max_load <= sd->imbalance_pct*this_load) goto out_balanced; + busiest_load_per_task /= busiest_nr_running; /* * We're trying to get all the cpus to the average_load, so we don't * want to push ourselves above the average load, nor do we wish to @@ -2026,21 +2139,50 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * by pulling tasks to us. Be careful of negative numbers as they'll * appear as very large values with unsigned longs. */ + if (max_load <= busiest_load_per_task) + goto out_balanced; + + /* + * In the presence of smp nice balancing, certain scenarios can have + * max load less than avg load(as we skip the groups at or below + * its cpu_power, while calculating max_load..) + */ + if (max_load < avg_load) { + *imbalance = 0; + goto small_imbalance; + } /* Don't want to pull so many tasks that a group would go idle */ - max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); + max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); /* How much load to actually move to equalise the imbalance */ *imbalance = min(max_pull * busiest->cpu_power, (avg_load - this_load) * this->cpu_power) / SCHED_LOAD_SCALE; - if (*imbalance < SCHED_LOAD_SCALE) { - unsigned long pwr_now = 0, pwr_move = 0; + /* + * if *imbalance is less than the average load per runnable task + * there is no gaurantee that any tasks will be moved so we'll have + * a think about bumping its value to force at least one task to be + * moved + */ + if (*imbalance < busiest_load_per_task) { + unsigned long pwr_now, pwr_move; unsigned long tmp; + unsigned int imbn; + +small_imbalance: + pwr_move = pwr_now = 0; + imbn = 2; + if (this_nr_running) { + this_load_per_task /= this_nr_running; + if (busiest_load_per_task > this_load_per_task) + imbn = 1; + } else + this_load_per_task = SCHED_LOAD_SCALE; - if (max_load - this_load >= SCHED_LOAD_SCALE*2) { - *imbalance = 1; + if (max_load - this_load >= busiest_load_per_task * imbn) { + *imbalance = busiest_load_per_task; return busiest; } @@ -2050,35 +2192,34 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * moving them. 
*/ - pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); - pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); + pwr_now += busiest->cpu_power * + min(busiest_load_per_task, max_load); + pwr_now += this->cpu_power * + min(this_load_per_task, this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; + tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; if (max_load > tmp) - pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, - max_load - tmp); + pwr_move += busiest->cpu_power * + min(busiest_load_per_task, max_load - tmp); /* Amount of load we'd add */ if (max_load*busiest->cpu_power < - SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) + busiest_load_per_task*SCHED_LOAD_SCALE) tmp = max_load*busiest->cpu_power/this->cpu_power; else - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; - pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); + tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; + pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ if (pwr_move <= pwr_now) goto out_balanced; - *imbalance = 1; - return busiest; + *imbalance = busiest_load_per_task; } - /* Get rid of the scaling factor, rounding down as we divide */ - *imbalance = *imbalance / SCHED_LOAD_SCALE; return busiest; out_balanced: @@ -2091,18 +2232,21 @@ out_balanced: * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static runqueue_t *find_busiest_queue(struct sched_group *group, - enum idle_type idle) + enum idle_type idle, unsigned long imbalance) { - unsigned long load, max_load = 0; - runqueue_t *busiest = NULL; + unsigned long max_load = 0; + runqueue_t *busiest = NULL, *rqi; int i; for_each_cpu_mask(i, group->cpumask) { - load = source_load(i, 0); + rqi = cpu_rq(i); + + if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance) + continue; - if (load > max_load) { - max_load = load; - busiest = cpu_rq(i); + if (rqi->raw_weighted_load > max_load) { + max_load = rqi->raw_weighted_load; + busiest = rqi; } } @@ -2115,6 +2259,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group, */ #define MAX_PINNED_INTERVAL 512 +#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0) /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. 
@@ -2142,7 +2287,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, goto out_balanced; } - busiest = find_busiest_queue(group, idle); + busiest = find_busiest_queue(group, idle, imbalance); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -2162,6 +2307,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, */ double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, + minus_1_or_zero(busiest->nr_running), imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); @@ -2265,7 +2411,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, goto out_balanced; } - busiest = find_busiest_queue(group, NEWLY_IDLE); + busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance); if (!busiest) { schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); goto out_balanced; @@ -2280,6 +2426,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, + minus_1_or_zero(busiest->nr_running), imbalance, sd, NEWLY_IDLE, NULL); spin_unlock(&busiest->lock); } @@ -2361,7 +2508,8 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) schedstat_inc(sd, alb_cnt); - if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) + if (move_tasks(target_rq, target_cpu, busiest_rq, 1, + RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); @@ -2389,7 +2537,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, struct sched_domain *sd; int i; - this_load = this_rq->nr_running * SCHED_LOAD_SCALE; + this_load = this_rq->raw_weighted_load; /* Update our load */ for (i = 0; i < 3; i++) { unsigned long new_load = this_load; @@ -3441,17 +3589,21 @@ void set_user_nice(task_t *p, long nice) goto out_unlock; } array = p->array; - if (array) + if (array) { dequeue_task(p, array); + dec_raw_weighted_load(rq, p); + } old_prio = p->prio; new_prio = NICE_TO_PRIO(nice); delta = new_prio - old_prio; p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p); p->prio += delta; if (array) { enqueue_task(p, array); + inc_raw_weighted_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -3587,6 +3739,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) if (policy == SCHED_BATCH) p->sleep_avg = 0; } + set_load_weight(p); } /** @@ -6106,6 +6259,7 @@ void __init sched_init(void) } } + set_load_weight(&init_task); /* * The boot idle thread does lazy MMU switching as well: */ -- cgit From 50ddd96917e4548b3813bfb5dd6f97f052b652bd Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Tue, 27 Jun 2006 02:54:36 -0700 Subject: [PATCH] sched: modify move_tasks() to improve load balancing outcomes Problem: The move_tasks() function is designed to move UP TO the amount of load it is asked to move and in doing this it skips over tasks looking for ones whose load weights are less than or equal to the remaining load to be moved. 
This is (in general) a good thing but it has the unfortunate result of breaking one of the original load balancer's good points: namely, that (within the limits imposed by the active/expired array model and the fact the expired is processed first) it moves high priority tasks before low priority ones and this means there's a good chance (see active/expired problem for why it's only a chance) that the highest priority task on the queue but not actually on the CPU will be moved to the other CPU where (as a high priority task) it may preempt the current task. Solution: Modify move_tasks() so that high priority tasks are not skipped when moving them will make them the highest priority task on their new run queue. Signed-off-by: Peter Williams Cc: Ingo Molnar Cc: "Siddha, Suresh B" Cc: "Chen, Kenneth W" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1847a4456a2..b4dab63c6db 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1955,7 +1955,7 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, { prio_array_t *array, *dst_array; struct list_head *head, *curr; - int idx, pulled = 0, pinned = 0; + int idx, pulled = 0, pinned = 0, this_min_prio; long rem_load_move; task_t *tmp; @@ -1964,6 +1964,7 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, rem_load_move = max_load_move; pinned = 1; + this_min_prio = this_rq->curr->prio; /* * We first consider expired tasks. Those will likely not be @@ -2003,7 +2004,12 @@ skip_queue: curr = curr->prev; - if (tmp->load_weight > rem_load_move || + /* + * To help distribute high priority tasks accross CPUs we don't + * skip a task if it will be the highest priority task (i.e. smallest + * prio value) on its new queue regardless of its load weight + */ + if ((idx >= this_min_prio && tmp->load_weight > rem_load_move) || !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { if (curr != head) goto skip_queue; @@ -2025,6 +2031,8 @@ skip_queue: * and the prescribed amount of weighted load. */ if (pulled < max_nr_move && rem_load_move > 0) { + if (idx < this_min_prio) + this_min_prio = idx; if (curr != head) goto skip_queue; idx++; -- cgit From 615052dc3bf96278a843a64d3d1eea03532028c3 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Tue, 27 Jun 2006 02:54:37 -0700 Subject: [PATCH] sched: Avoid unnecessarily moving highest priority task move_tasks() Problem: To help distribute high priority tasks evenly across the available CPUs move_tasks() does not, under some circumstances, skip tasks whose load weight is bigger than the designated amount. Because the highest priority task on the busiest queue may be on the expired array it may be moved as a result of this mechanism. Apart from not being the most desirable way to redistribute the high priority tasks (we'd rather move the second highest priority task), there is a risk that this could set up a loop with this task bouncing backwards and forwards between the two queues. (This latter possibility can be demonstrated by running a nice==-20 CPU bound task on an otherwise quiet 2 CPU system.) Solution: Modify the mechanism so that it does not override skip for the highest priority task on the CPU. Of course, if there are more than one tasks at the highest priority then it will allow the override for one of them as this is a desirable redistribution of high priority tasks. 
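As a rough illustration of the skip decision this solution arrives at, here is a hedged, standalone sketch with simplified types (not the actual kernel hunk -- the real change is inside move_tasks() in the diff below):

static int skip_for_load(long task_weight, long rem_load_move,
			 int idx, int this_best_prio,
			 int busiest_best_prio, int best_prio_seen)
{
	/* Normally skip a task whose weight exceeds the remaining budget. */
	int skip = task_weight > rem_load_move;

	/*
	 * Override the skip if the task would become the highest priority
	 * task on its new queue, but never for the single highest priority
	 * task on the busiest queue (best_prio_seen still clear), so that
	 * task is not bounced back and forth between the two queues.
	 */
	if (skip && idx < this_best_prio)
		skip = !best_prio_seen && idx == busiest_best_prio;

	return skip;
}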
Signed-off-by: Peter Williams Cc: Ingo Molnar Cc: "Siddha, Suresh B" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b4dab63c6db..0ec84f57695 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1941,6 +1941,7 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, return 1; } +#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) /* * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted * load from busiest to this_rq, as part of a balancing operation within @@ -1955,7 +1956,9 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, { prio_array_t *array, *dst_array; struct list_head *head, *curr; - int idx, pulled = 0, pinned = 0, this_min_prio; + int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio; + int busiest_best_prio_seen; + int skip_for_load; /* skip the task based on weighted load issues */ long rem_load_move; task_t *tmp; @@ -1964,7 +1967,16 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, rem_load_move = max_load_move; pinned = 1; - this_min_prio = this_rq->curr->prio; + this_best_prio = rq_best_prio(this_rq); + busiest_best_prio = rq_best_prio(busiest); + /* + * Enable handling of the case where there is more than one task + * with the best priority. If the current running task is one + * of those with prio==busiest_best_prio we know it won't be moved + * and therefore it's safe to override the skip (based on load) of + * any task we find with that prio. + */ + busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio; /* * We first consider expired tasks. Those will likely not be @@ -2009,8 +2021,12 @@ skip_queue: * skip a task if it will be the highest priority task (i.e. smallest * prio value) on its new queue regardless of its load weight */ - if ((idx >= this_min_prio && tmp->load_weight > rem_load_move) || + skip_for_load = tmp->load_weight > rem_load_move; + if (skip_for_load && idx < this_best_prio) + skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio; + if (skip_for_load || !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { + busiest_best_prio_seen |= idx == busiest_best_prio; if (curr != head) goto skip_queue; idx++; @@ -2031,8 +2047,8 @@ skip_queue: * and the prescribed amount of weighted load. */ if (pulled < max_nr_move && rem_load_move > 0) { - if (idx < this_min_prio) - this_min_prio = idx; + if (idx < this_best_prio) + this_best_prio = idx; if (curr != head) goto skip_queue; idx++; -- cgit From 51888ca25a03125e742ef84d4ddfd74e139707a0 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Tue, 27 Jun 2006 02:54:38 -0700 Subject: [PATCH] sched_domain: handle kmalloc failure Try to handle mem allocation failures in build_sched_domains by bailing out and cleaning up thus-far allocated memory. The patch has a direct consequence that we disable load balancing completely (even at sibling level) upon *any* memory allocation failure. 
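A hedged sketch of the bail-out-and-unwind shape this gives build_sched_domains() (illustrative names and sizes, kernel headers omitted; the real cleanup is done by free_sched_groups() as shown in the diff below):

static int build_example(void)
{
	void *groups = NULL, *allnodes = NULL;

	groups = kzalloc(128, GFP_KERNEL);
	if (!groups)
		goto error;

	allnodes = kzalloc(256, GFP_KERNEL);
	if (!allnodes)
		goto error;

	/* ... build the domains, stash the pointers for later teardown ... */
	return 0;

error:
	/* Free whatever was allocated before the failure, then report it. */
	kfree(allnodes);
	kfree(groups);
	return -ENOMEM;
}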
[Lee.Schermerhorn@hp.com: bugfix] Signed-off-by: Srivatsa Vaddagir Cc: Nick Piggin Cc: Ingo Molnar Cc: "Siddha, Suresh B" Signed-off-by: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 139 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 78 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0ec84f57695..77a2ec55ef7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5820,11 +5820,56 @@ next_sg: } #endif +/* Free memory allocated for various sched_group structures */ +static void free_sched_groups(const cpumask_t *cpu_map) +{ +#ifdef CONFIG_NUMA + int i; + int cpu; + + for_each_cpu_mask(cpu, *cpu_map) { + struct sched_group *sched_group_allnodes + = sched_group_allnodes_bycpu[cpu]; + struct sched_group **sched_group_nodes + = sched_group_nodes_bycpu[cpu]; + + if (sched_group_allnodes) { + kfree(sched_group_allnodes); + sched_group_allnodes_bycpu[cpu] = NULL; + } + + if (!sched_group_nodes) + continue; + + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + struct sched_group *oldsg, *sg = sched_group_nodes[i]; + + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) + continue; + + if (sg == NULL) + continue; + sg = sg->next; +next_sg: + oldsg = sg; + sg = sg->next; + kfree(oldsg); + if (oldsg != sched_group_nodes[i]) + goto next_sg; + } + kfree(sched_group_nodes); + sched_group_nodes_bycpu[cpu] = NULL; + } +#endif +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -void build_sched_domains(const cpumask_t *cpu_map) +static int build_sched_domains(const cpumask_t *cpu_map) { int i; #ifdef CONFIG_NUMA @@ -5834,11 +5879,11 @@ void build_sched_domains(const cpumask_t *cpu_map) /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, + sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, GFP_ATOMIC); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); - return; + return -ENOMEM; } sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; #endif @@ -5864,7 +5909,7 @@ void build_sched_domains(const cpumask_t *cpu_map) if (!sched_group_allnodes) { printk(KERN_WARNING "Can not alloc allnodes sched group\n"); - break; + goto error; } sched_group_allnodes_bycpu[i] = sched_group_allnodes; @@ -5978,23 +6023,20 @@ void build_sched_domains(const cpumask_t *cpu_map) cpus_and(domainspan, domainspan, *cpu_map); sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + if (!sg) { + printk(KERN_WARNING "Can not alloc domain group for " + "node %d\n", i); + goto error; + } sched_group_nodes[i] = sg; for_each_cpu_mask(j, nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); sd->groups = sg; - if (sd->groups == NULL) { - /* Turn off balancing if we have no groups */ - sd->flags = 0; - } - } - if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", i); - continue; } sg->cpu_power = 0; sg->cpumask = nodemask; + sg->next = sg; cpus_or(covered, covered, nodemask); prev = sg; @@ -6017,15 +6059,15 @@ void build_sched_domains(const cpumask_t *cpu_map) if (!sg) { printk(KERN_WARNING "Can not alloc domain group for node %d\n", j); - break; + goto error; } sg->cpu_power = 0; sg->cpumask = tmp; + sg->next = prev->next; cpus_or(covered, covered, tmp); prev->next = sg; prev = sg; } - prev->next = sched_group_nodes[i]; } #endif @@ 
-6088,13 +6130,22 @@ void build_sched_domains(const cpumask_t *cpu_map) * Tune cache-hot values: */ calibrate_migration_costs(cpu_map); + + return 0; + +#ifdef CONFIG_NUMA +error: + free_sched_groups(cpu_map); + return -ENOMEM; +#endif } /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. */ -static void arch_init_sched_domains(const cpumask_t *cpu_map) +static int arch_init_sched_domains(const cpumask_t *cpu_map) { cpumask_t cpu_default_map; + int err; /* * Setup mask for cpus without special case scheduling requirements. @@ -6103,51 +6154,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map) */ cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); - build_sched_domains(&cpu_default_map); + err = build_sched_domains(&cpu_default_map); + + return err; } static void arch_destroy_sched_domains(const cpumask_t *cpu_map) { -#ifdef CONFIG_NUMA - int i; - int cpu; - - for_each_cpu_mask(cpu, *cpu_map) { - struct sched_group *sched_group_allnodes - = sched_group_allnodes_bycpu[cpu]; - struct sched_group **sched_group_nodes - = sched_group_nodes_bycpu[cpu]; - - if (sched_group_allnodes) { - kfree(sched_group_allnodes); - sched_group_allnodes_bycpu[cpu] = NULL; - } - - if (!sched_group_nodes) - continue; - - for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); - struct sched_group *oldsg, *sg = sched_group_nodes[i]; - - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) - continue; - - if (sg == NULL) - continue; - sg = sg->next; -next_sg: - oldsg = sg; - sg = sg->next; - kfree(oldsg); - if (oldsg != sched_group_nodes[i]) - goto next_sg; - } - kfree(sched_group_nodes); - sched_group_nodes_bycpu[cpu] = NULL; - } -#endif + free_sched_groups(cpu_map); } /* @@ -6172,9 +6186,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) * correct sched domains * Call with hotplug lock held */ -void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) +int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) { cpumask_t change_map; + int err = 0; cpus_and(*partition1, *partition1, cpu_online_map); cpus_and(*partition2, *partition2, cpu_online_map); @@ -6183,9 +6198,11 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) /* Detach sched domains from all of the affected cpus */ detach_destroy_domains(&change_map); if (!cpus_empty(*partition1)) - build_sched_domains(partition1); - if (!cpus_empty(*partition2)) - build_sched_domains(partition2); + err = build_sched_domains(partition1); + if (!err && !cpus_empty(*partition2)) + err = build_sched_domains(partition2); + + return err; } #ifdef CONFIG_HOTPLUG_CPU -- cgit From d3a5aa9858cc9cecc3aadac7311d376c7c9e101a Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Tue, 27 Jun 2006 02:54:39 -0700 Subject: [PATCH] sched_domai: Don't use GFP_ATOMIC Replace GFP_ATOMIC allocation for sched_group_nodes with GFP_KERNEL based allocation. 
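For context, a minimal hedged illustration of why the flag matters (not part of the patch; headers omitted): domain setup runs in process context, so it can afford the stronger allocation mode.

static struct sched_group **alloc_group_nodes(void)
{
	/*
	 * build_sched_domains() runs in process context with the hotplug
	 * lock held, so it may use GFP_KERNEL, which can sleep and reclaim
	 * memory. GFP_ATOMIC is only needed where sleeping is forbidden
	 * and is more likely to fail under memory pressure.
	 */
	return kzalloc(sizeof(struct sched_group *) * MAX_NUMNODES,
		       GFP_KERNEL);
}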
Signed-off-by: Srivatsa Vaddagiri Cc: Ingo Molnar Cc: "Siddha, Suresh B" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 77a2ec55ef7..e93c75ffdc8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5880,7 +5880,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) * Allocate the per-node list of sched groups */ sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, - GFP_ATOMIC); + GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); return -ENOMEM; -- cgit From 15f0b676a482fb4067cfe25de417c417dda3440a Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Tue, 27 Jun 2006 02:54:40 -0700 Subject: [PATCH] sched_domai: Use kmalloc_node The sched group structures used to represent various nodes need to be allocated from respective nodes (as suggested here also: http://uwsg.ucs.indiana.edu/hypermail/linux/kernel/0603.3/0051.html) Signed-off-by: Srivatsa Vaddagiri Cc: Nick Piggin Cc: Ingo Molnar Cc: "Siddha, Suresh B" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e93c75ffdc8..ee4211bd40c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6022,7 +6022,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) domainspan = sched_domain_node_span(i); cpus_and(domainspan, domainspan, *cpu_map); - sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING "Can not alloc domain group for " "node %d\n", i); @@ -6055,7 +6055,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (cpus_empty(tmp)) continue; - sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + sg = kmalloc_node(sizeof(struct sched_group), + GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING "Can not alloc domain group for node %d\n", j); -- cgit From 369381694ddcf03f1de403501c8b97099b5109ec Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Tue, 27 Jun 2006 02:54:41 -0700 Subject: [PATCH] sched_domai: Allocate sched_group structures dynamically As explained here: http://marc.theaimsgroup.com/?l=linux-kernel&m=114327539012323&w=2 there is a problem with sharing sched_group structures between two separate sched_group structures for different sched_domains. The patch has been tested and found to avoid the kernel lockup problem described in above URL. 
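The shape of the change, as a hedged before/after sketch (simplified; the full hunks follow in the diff below): statically allocated group arrays that ended up shared between different sched domains are replaced by per-cpu pointers that each build allocates and that the error/teardown paths free.

/* Before: one static array, implicitly shared between domain rebuilds. */
static struct sched_group sched_group_phys[NR_CPUS];

/* After: allocated per build_sched_domains() call, freed on error/teardown. */
static struct sched_group *sched_group_phys_bycpu[NR_CPUS];

	sched_group_phys = kmalloc(sizeof(struct sched_group) * NR_CPUS,
				   GFP_KERNEL);
	if (!sched_group_phys)
		goto error;
	sched_group_phys_bycpu[i] = sched_group_phys;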
Signed-off-by: Srivatsa Vaddagiri Cc: Nick Piggin Cc: Ingo Molnar Cc: "Siddha, Suresh B" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 48 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ee4211bd40c..122b75584a1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5747,7 +5747,7 @@ static int cpu_to_cpu_group(int cpu) #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct sched_domain, core_domains); -static struct sched_group sched_group_core[NR_CPUS]; +static struct sched_group *sched_group_core_bycpu[NR_CPUS]; #endif #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) @@ -5763,7 +5763,7 @@ static int cpu_to_core_group(int cpu) #endif static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static struct sched_group sched_group_phys[NR_CPUS]; +static struct sched_group *sched_group_phys_bycpu[NR_CPUS]; static int cpu_to_phys_group(int cpu) { #if defined(CONFIG_SCHED_MC) @@ -5823,9 +5823,9 @@ next_sg: /* Free memory allocated for various sched_group structures */ static void free_sched_groups(const cpumask_t *cpu_map) { + int cpu; #ifdef CONFIG_NUMA int i; - int cpu; for_each_cpu_mask(cpu, *cpu_map) { struct sched_group *sched_group_allnodes @@ -5863,6 +5863,18 @@ next_sg: sched_group_nodes_bycpu[cpu] = NULL; } #endif + for_each_cpu_mask(cpu, *cpu_map) { + if (sched_group_phys_bycpu[cpu]) { + kfree(sched_group_phys_bycpu[cpu]); + sched_group_phys_bycpu[cpu] = NULL; + } +#ifdef CONFIG_SCHED_MC + if (sched_group_core_bycpu[cpu]) { + kfree(sched_group_core_bycpu[cpu]); + sched_group_core_bycpu[cpu] = NULL; + } +#endif + } } /* @@ -5872,6 +5884,10 @@ next_sg: static int build_sched_domains(const cpumask_t *cpu_map) { int i; + struct sched_group *sched_group_phys = NULL; +#ifdef CONFIG_SCHED_MC + struct sched_group *sched_group_core = NULL; +#endif #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; struct sched_group *sched_group_allnodes = NULL; @@ -5930,6 +5946,18 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpus_and(sd->span, sd->span, *cpu_map); #endif + if (!sched_group_phys) { + sched_group_phys + = kmalloc(sizeof(struct sched_group) * NR_CPUS, + GFP_KERNEL); + if (!sched_group_phys) { + printk (KERN_WARNING "Can not alloc phys sched" + "group\n"); + goto error; + } + sched_group_phys_bycpu[i] = sched_group_phys; + } + p = sd; sd = &per_cpu(phys_domains, i); group = cpu_to_phys_group(i); @@ -5939,6 +5967,18 @@ static int build_sched_domains(const cpumask_t *cpu_map) sd->groups = &sched_group_phys[group]; #ifdef CONFIG_SCHED_MC + if (!sched_group_core) { + sched_group_core + = kmalloc(sizeof(struct sched_group) * NR_CPUS, + GFP_KERNEL); + if (!sched_group_core) { + printk (KERN_WARNING "Can not alloc core sched" + "group\n"); + goto error; + } + sched_group_core_bycpu[i] = sched_group_core; + } + p = sd; sd = &per_cpu(core_domains, i); group = cpu_to_core_group(i); @@ -6134,11 +6174,9 @@ static int build_sched_domains(const cpumask_t *cpu_map) return 0; -#ifdef CONFIG_NUMA error: free_sched_groups(cpu_map); return -ENOMEM; -#endif } /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. 
-- cgit From 5c45bf279d378d436ce45825c0f136696c7b6109 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Tue, 27 Jun 2006 02:54:42 -0700 Subject: [PATCH] sched: mc/smt power savings sched policy sysfs entries 'sched_mc_power_savings' and 'sched_smt_power_savings' in /sys/devices/system/cpu/ control the MC/SMT power savings policy for the scheduler. Based on the values (1-enable, 0-disable) for these controls, sched groups cpu power will be determined for different domains. When power savings policy is enabled and under light load conditions, scheduler will minimize the physical packages/cpu cores carrying the load and thus conserving power(with a perf impact based on the workload characteristics... see OLS 2005 CMP kernel scheduler paper for more details..) Signed-off-by: Suresh Siddha Cc: Ingo Molnar Cc: Nick Piggin Cc: Con Kolivas Cc: "Chen, Kenneth W" Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 240 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 215 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 122b75584a1..54fa282657c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1162,6 +1162,11 @@ static int sched_balance_self(int cpu, int flag) struct sched_domain *tmp, *sd = NULL; for_each_domain(cpu, tmp) { + /* + * If power savings logic is enabled for a domain, stop there. + */ + if (tmp->flags & SD_POWERSAVINGS_BALANCE) + break; if (tmp->flags & flag) sd = tmp; } @@ -2082,6 +2087,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long busiest_load_per_task, busiest_nr_running; unsigned long this_load_per_task, this_nr_running; int load_idx; +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + int power_savings_balance = 1; + unsigned long leader_nr_running = 0, min_load_per_task = 0; + unsigned long min_nr_running = ULONG_MAX; + struct sched_group *group_min = NULL, *group_leader = NULL; +#endif max_load = this_load = total_load = total_pwr = 0; busiest_load_per_task = busiest_nr_running = 0; @@ -2094,7 +2105,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, load_idx = sd->idle_idx; do { - unsigned long load; + unsigned long load, group_capacity; int local_group; int i; unsigned long sum_nr_running, sum_weighted_load; @@ -2127,18 +2138,76 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* Adjust by relative CPU power of the group */ avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + group_capacity = group->cpu_power / SCHED_LOAD_SCALE; + if (local_group) { this_load = avg_load; this = group; this_nr_running = sum_nr_running; this_load_per_task = sum_weighted_load; } else if (avg_load > max_load && - sum_nr_running > group->cpu_power / SCHED_LOAD_SCALE) { + sum_nr_running > group_capacity) { max_load = avg_load; busiest = group; busiest_nr_running = sum_nr_running; busiest_load_per_task = sum_weighted_load; } + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + /* + * Busy processors will not participate in power savings + * balance. 
+ */ + if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + goto group_next; + + /* + * If the local group is idle or completely loaded + * no need to do power savings balance at this domain + */ + if (local_group && (this_nr_running >= group_capacity || + !this_nr_running)) + power_savings_balance = 0; + + /* + * If a group is already running at full capacity or idle, + * don't include that group in power savings calculations + */ + if (!power_savings_balance || sum_nr_running >= group_capacity + || !sum_nr_running) + goto group_next; + + /* + * Calculate the group which has the least non-idle load. + * This is the group from where we need to pick up the load + * for saving power + */ + if ((sum_nr_running < min_nr_running) || + (sum_nr_running == min_nr_running && + first_cpu(group->cpumask) < + first_cpu(group_min->cpumask))) { + group_min = group; + min_nr_running = sum_nr_running; + min_load_per_task = sum_weighted_load / + sum_nr_running; + } + + /* + * Calculate the group which is almost near its + * capacity but still has some space to pick up some load + * from other group and save more power + */ + if (sum_nr_running <= group_capacity - 1) + if (sum_nr_running > leader_nr_running || + (sum_nr_running == leader_nr_running && + first_cpu(group->cpumask) > + first_cpu(group_leader->cpumask))) { + group_leader = group; + leader_nr_running = sum_nr_running; + } + +group_next: +#endif group = group->next; } while (group != sd->groups); @@ -2247,7 +2316,16 @@ small_imbalance: return busiest; out_balanced: +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + goto ret; + if (this == group_leader && group_leader != group_min) { + *imbalance = min_load_per_task; + return group_min; + } +ret: +#endif *imbalance = 0; return NULL; } @@ -2300,7 +2378,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, int active_balance = 0; int sd_idle = 0; - if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) + if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && + !sched_smt_power_savings) sd_idle = 1; schedstat_inc(sd, lb_cnt[idle]); @@ -2389,7 +2468,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, sd->balance_interval *= 2; } - if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) + if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !sched_smt_power_savings) return -1; return nr_moved; @@ -2404,7 +2484,7 @@ out_one_pinned: (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) return -1; return 0; } @@ -2425,7 +2505,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, int nr_moved = 0; int sd_idle = 0; - if (sd->flags & SD_SHARE_CPUPOWER) + if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) sd_idle = 1; schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); @@ -2466,7 +2546,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, out_balanced: schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) return -1; sd->nr_balance_failed = 0; return 0; @@ -5732,6 +5812,7 @@ static cpumask_t sched_domain_node_span(int node) } #endif +int sched_smt_power_savings = 0, sched_mc_power_savings = 0; /* * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we * can switch it on easily if 
needed. @@ -6113,37 +6194,72 @@ static int build_sched_domains(const cpumask_t *cpu_map) #endif /* Calculate CPU power for physical packages and nodes */ +#ifdef CONFIG_SCHED_SMT for_each_cpu_mask(i, *cpu_map) { - int power; struct sched_domain *sd; -#ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); - power = SCHED_LOAD_SCALE; - sd->groups->cpu_power = power; + sd->groups->cpu_power = SCHED_LOAD_SCALE; + } #endif #ifdef CONFIG_SCHED_MC + for_each_cpu_mask(i, *cpu_map) { + int power; + struct sched_domain *sd; sd = &per_cpu(core_domains, i); - power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) + if (sched_smt_power_savings) + power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); + else + power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) * SCHED_LOAD_SCALE / 10; sd->groups->cpu_power = power; + } +#endif + for_each_cpu_mask(i, *cpu_map) { + struct sched_domain *sd; +#ifdef CONFIG_SCHED_MC sd = &per_cpu(phys_domains, i); + if (i != first_cpu(sd->groups->cpumask)) + continue; - /* - * This has to be < 2 * SCHED_LOAD_SCALE - * Lets keep it SCHED_LOAD_SCALE, so that - * while calculating NUMA group's cpu_power - * we can simply do - * numa_group->cpu_power += phys_group->cpu_power; - * - * See "only add power once for each physical pkg" - * comment below - */ - sd->groups->cpu_power = SCHED_LOAD_SCALE; + sd->groups->cpu_power = 0; + if (sched_mc_power_savings || sched_smt_power_savings) { + int j; + + for_each_cpu_mask(j, sd->groups->cpumask) { + struct sched_domain *sd1; + sd1 = &per_cpu(core_domains, j); + /* + * for each core we will add once + * to the group in physical domain + */ + if (j != first_cpu(sd1->groups->cpumask)) + continue; + + if (sched_smt_power_savings) + sd->groups->cpu_power += sd1->groups->cpu_power; + else + sd->groups->cpu_power += SCHED_LOAD_SCALE; + } + } else + /* + * This has to be < 2 * SCHED_LOAD_SCALE + * Lets keep it SCHED_LOAD_SCALE, so that + * while calculating NUMA group's cpu_power + * we can simply do + * numa_group->cpu_power += phys_group->cpu_power; + * + * See "only add power once for each physical pkg" + * comment below + */ + sd->groups->cpu_power = SCHED_LOAD_SCALE; #else + int power; sd = &per_cpu(phys_domains, i); - power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * - (cpus_weight(sd->groups->cpumask)-1) / 10; + if (sched_smt_power_savings) + power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); + else + power = SCHED_LOAD_SCALE; sd->groups->cpu_power = power; #endif } @@ -6244,6 +6360,80 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) return err; } +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +int arch_reinit_sched_domains(void) +{ + int err; + + lock_cpu_hotplug(); + detach_destroy_domains(&cpu_online_map); + err = arch_init_sched_domains(&cpu_online_map); + unlock_cpu_hotplug(); + + return err; +} + +static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) +{ + int ret; + + if (buf[0] != '0' && buf[0] != '1') + return -EINVAL; + + if (smt) + sched_smt_power_savings = (buf[0] == '1'); + else + sched_mc_power_savings = (buf[0] == '1'); + + ret = arch_reinit_sched_domains(); + + return ret ? 
ret : count; +} + +int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +{ + int err = 0; +#ifdef CONFIG_SCHED_SMT + if (smt_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_smt_power_savings.attr); +#endif +#ifdef CONFIG_SCHED_MC + if (!err && mc_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_mc_power_savings.attr); +#endif + return err; +} +#endif + +#ifdef CONFIG_SCHED_MC +static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) +{ + return sprintf(page, "%u\n", sched_mc_power_savings); +} +static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count) +{ + return sched_power_savings_store(buf, count, 0); +} +SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, + sched_mc_power_savings_store); +#endif + +#ifdef CONFIG_SCHED_SMT +static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) +{ + return sprintf(page, "%u\n", sched_smt_power_savings); +} +static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count) +{ + return sched_power_savings_store(buf, count, 1); +} +SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, + sched_smt_power_savings_store); +#endif + + #ifdef CONFIG_HOTPLUG_CPU /* * Force a reinitialization of the sched domains hierarchy. The domains -- cgit From 9fea80e4d984d411aa188baa31225c273ebe0fe6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 27 Jun 2006 02:54:42 -0700 Subject: [PATCH] sched: uninline task_rq_lock() Saves 543 bytes from sched.o (gcc 3.3.3). Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Cc: Nick Piggin Cc: Con Kolivas Cc: Peter Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 54fa282657c..19c0d5d16fe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -359,7 +359,7 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. */ -static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) +static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) __acquires(rq->lock) { struct runqueue *rq; -- cgit From 66e5393a78b3fcca63e7748e38221dcca61c4aab Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 Jun 2006 02:54:44 -0700 Subject: [PATCH] BUG() if setscheduler is called from interrupt context Thomas Gleixner is adding the call to a rtmutex function in setscheduler. This call grabs a spin_lock that is not always protected by interrupts disabled. So this means that setscheduler cant be called from interrupt context. To prevent this from happening in the future, this patch adds a BUG_ON(in_interrupt()) in that function. (Thanks to akpm for this suggestion). 
Signed-off-by: Steven Rostedt Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 19c0d5d16fe..15abf083324 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3862,6 +3862,8 @@ int sched_setscheduler(struct task_struct *p, int policy, unsigned long flags; runqueue_t *rq; + /* may grab non-irq protected spin_locks */ + BUG_ON(in_interrupt()); recheck: /* double check policy once rq lock held */ if (policy < 0) -- cgit From e2970f2fb6950183a34e8545faa093eb49d186e1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jun 2006 02:54:47 -0700 Subject: [PATCH] pi-futex: futex code cleanups We are pleased to announce "lightweight userspace priority inheritance" (PI) support for futexes. The following patchset and glibc patch implements it, ontop of the robust-futexes patchset which is included in 2.6.16-mm1. We are calling it lightweight for 3 reasons: - in the user-space fastpath a PI-enabled futex involves no kernel work (or any other PI complexity) at all. No registration, no extra kernel calls - just pure fast atomic ops in userspace. - in the slowpath (in the lock-contention case), the system call and scheduling pattern is in fact better than that of normal futexes, due to the 'integrated' nature of FUTEX_LOCK_PI. [more about that further down] - the in-kernel PI implementation is streamlined around the mutex abstraction, with strict rules that keep the implementation relatively simple: only a single owner may own a lock (i.e. no read-write lock support), only the owner may unlock a lock, no recursive locking, etc. Priority Inheritance - why, oh why??? ------------------------------------- Many of you heard the horror stories about the evil PI code circling Linux for years, which makes no real sense at all and is only used by buggy applications and which has horrible overhead. Some of you have dreaded this very moment, when someone actually submits working PI code ;-) So why would we like to see PI support for futexes? We'd like to see it done purely for technological reasons. We dont think it's a buggy concept, we think it's useful functionality to offer to applications, which functionality cannot be achieved in other ways. We also think it's the right thing to do, and we think we've got the right arguments and the right numbers to prove that. We also believe that we can address all the counter-arguments as well. For these reasons (and the reasons outlined below) we are submitting this patch-set for upstream kernel inclusion. What are the benefits of PI? The short reply: ---------------- User-space PI helps achieving/improving determinism for user-space applications. In the best-case, it can help achieve determinism and well-bound latencies. Even in the worst-case, PI will improve the statistical distribution of locking related application delays. The longer reply: ----------------- Firstly, sharing locks between multiple tasks is a common programming technique that often cannot be replaced with lockless algorithms. As we can see it in the kernel [which is a quite complex program in itself], lockless structures are rather the exception than the norm - the current ratio of lockless vs. locky code for shared data structures is somewhere between 1:10 and 1:100. Lockless is hard, and the complexity of lockless algorithms often endangers to ability to do robust reviews of said code. I.e. 
critical RT apps often choose lock structures to protect critical data structures, instead of lockless algorithms. Furthermore, there are cases (like shared hardware, or other resource limits) where lockless access is mathematically impossible. Media players (such as Jack) are an example of reasonable application design with multiple tasks (with multiple priority levels) sharing short-held locks: for example, a highprio audio playback thread is combined with medium-prio construct-audio-data threads and low-prio display-colory-stuff threads. Add video and decoding to the mix and we've got even more priority levels. So once we accept that synchronization objects (locks) are an unavoidable fact of life, and once we accept that multi-task userspace apps have a very fair expectation of being able to use locks, we've got to think about how to offer the option of a deterministic locking implementation to user-space. Most of the technical counter-arguments against doing priority inheritance only apply to kernel-space locks. But user-space locks are different, there we cannot disable interrupts or make the task non-preemptible in a critical section, so the 'use spinlocks' argument does not apply (user-space spinlocks have the same priority inversion problems as other user-space locking constructs). Fact is, pretty much the only technique that currently enables good determinism for userspace locks (such as futex-based pthread mutexes) is priority inheritance: Currently (without PI), if a high-prio and a low-prio task shares a lock [this is a quite common scenario for most non-trivial RT applications], even if all critical sections are coded carefully to be deterministic (i.e. all critical sections are short in duration and only execute a limited number of instructions), the kernel cannot guarantee any deterministic execution of the high-prio task: any medium-priority task could preempt the low-prio task while it holds the shared lock and executes the critical section, and could delay it indefinitely. Implementation: --------------- As mentioned before, the userspace fastpath of PI-enabled pthread mutexes involves no kernel work at all - they behave quite similarly to normal futex-based locks: a 0 value means unlocked, and a value==TID means locked. (This is the same method as used by list-based robust futexes.) Userspace uses atomic ops to lock/unlock these mutexes without entering the kernel. To handle the slowpath, we have added two new futex ops: FUTEX_LOCK_PI FUTEX_UNLOCK_PI If the lock-acquire fastpath fails, [i.e. an atomic transition from 0 to TID fails], then FUTEX_LOCK_PI is called. The kernel does all the remaining work: if there is no futex-queue attached to the futex address yet then the code looks up the task that owns the futex [it has put its own TID into the futex value], and attaches a 'PI state' structure to the futex-queue. The pi_state includes an rt-mutex, which is a PI-aware, kernel-based synchronization object. The 'other' task is made the owner of the rt-mutex, and the FUTEX_WAITERS bit is atomically set in the futex value. Then this task tries to lock the rt-mutex, on which it blocks. Once it returns, it has the mutex acquired, and it sets the futex value to its own TID and returns. Userspace has no other work to perform - it now owns the lock, and futex value contains FUTEX_WAITERS|TID. If the unlock side fastpath succeeds, [i.e. userspace manages to do a TID -> 0 atomic transition of the futex value], then no kernel work is triggered. 
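A hedged userspace sketch of the fastpath just described (illustrative only, using modern GCC __atomic builtins for brevity; real glibc code also handles the FUTEX_WAITERS bit, error codes and robust lists): acquiring is an atomic 0 -> TID transition, releasing a TID -> 0 transition, and only when either compare-and-swap fails does the caller fall back to FUTEX_LOCK_PI / FUTEX_UNLOCK_PI.

/* Returns 0 on fastpath success; nonzero means "go to the kernel". */
static int pi_lock_fast(unsigned int *futex, unsigned int tid)
{
	unsigned int expected = 0;

	/* 0 -> TID: uncontended acquire, no kernel involvement at all. */
	if (__atomic_compare_exchange_n(futex, &expected, tid, 0,
					__ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
		return 0;
	return 1;	/* contended: caller would issue FUTEX_LOCK_PI */
}

static int pi_unlock_fast(unsigned int *futex, unsigned int tid)
{
	unsigned int expected = tid;

	/* TID -> 0: only succeeds while no waiter has set FUTEX_WAITERS. */
	if (__atomic_compare_exchange_n(futex, &expected, 0, 0,
					__ATOMIC_RELEASE, __ATOMIC_RELAXED))
		return 0;
	return 1;	/* waiters present: caller would issue FUTEX_UNLOCK_PI */
}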
If the unlock fastpath fails (because the FUTEX_WAITERS bit is set), then FUTEX_UNLOCK_PI is called, and the kernel unlocks the futex on the behalf of userspace - and it also unlocks the attached pi_state->rt_mutex and thus wakes up any potential waiters. Note that under this approach, contrary to other PI-futex approaches, there is no prior 'registration' of a PI-futex. [which is not quite possible anyway, due to existing ABI properties of pthread mutexes.] Also, under this scheme, 'robustness' and 'PI' are two orthogonal properties of futexes, and all four combinations are possible: futex, robust-futex, PI-futex, robust+PI-futex. glibc support: -------------- Ulrich Drepper and Jakub Jelinek have written glibc support for PI-futexes (and robust futexes), enabling robust and PI (PTHREAD_PRIO_INHERIT) POSIX mutexes. (PTHREAD_PRIO_PROTECT support will be added later on too, no additional kernel changes are needed for that). [NOTE: The glibc patch is obviously inofficial and unsupported without matching upstream kernel functionality.] the patch-queue and the glibc patch can also be downloaded from: http://redhat.com/~mingo/PI-futex-patches/ Many thanks go to the people who helped us create this kernel feature: Steven Rostedt, Esben Nielsen, Benedikt Spranger, Daniel Walker, John Cooper, Arjan van de Ven, Oleg Nesterov and others. Credits for related prior projects goes to Dirk Grambow, Inaky Perez-Gonzalez, Bill Huey and many others. Clean up the futex code, before adding more features to it: - use u32 as the futex field type - that's the ABI - use __user and pointers to u32 instead of unsigned long - code style / comment style cleanups - rename hash-bucket name from 'bh' to 'hb'. I checked the pre and post futex.o object files to make sure this patch has no code effects. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Arjan van de Ven Cc: Ulrich Drepper Cc: Jakub Jelinek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 245 ++++++++++++++++++++++++++------------------------ kernel/futex_compat.c | 3 +- 2 files changed, 131 insertions(+), 117 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e1a380c77a5..50356fb5d72 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -63,7 +63,7 @@ union futex_key { int offset; } shared; struct { - unsigned long uaddr; + unsigned long address; struct mm_struct *mm; int offset; } private; @@ -87,13 +87,13 @@ struct futex_q { struct list_head list; wait_queue_head_t waiters; - /* Which hash list lock to use. */ + /* Which hash list lock to use: */ spinlock_t *lock_ptr; - /* Key which the futex is hashed on. */ + /* Key which the futex is hashed on: */ union futex_key key; - /* For fd, sigio sent using these. */ + /* For fd, sigio sent using these: */ int fd; struct file *filp; }; @@ -144,8 +144,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) * * Should be called with ¤t->mm->mmap_sem but NOT any spinlocks. */ -static int get_futex_key(unsigned long uaddr, union futex_key *key) +static int get_futex_key(u32 __user *uaddr, union futex_key *key) { + unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct page *page; @@ -154,16 +155,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) /* * The futex address must be "naturally" aligned. 
*/ - key->both.offset = uaddr % PAGE_SIZE; + key->both.offset = address % PAGE_SIZE; if (unlikely((key->both.offset % sizeof(u32)) != 0)) return -EINVAL; - uaddr -= key->both.offset; + address -= key->both.offset; /* * The futex is hashed differently depending on whether * it's in a shared or private mapping. So check vma first. */ - vma = find_extend_vma(mm, uaddr); + vma = find_extend_vma(mm, address); if (unlikely(!vma)) return -EFAULT; @@ -184,7 +185,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) */ if (likely(!(vma->vm_flags & VM_MAYSHARE))) { key->private.mm = mm; - key->private.uaddr = uaddr; + key->private.address = address; return 0; } @@ -194,7 +195,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) key->shared.inode = vma->vm_file->f_dentry->d_inode; key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ if (likely(!(vma->vm_flags & VM_NONLINEAR))) { - key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) + key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff); return 0; } @@ -205,7 +206,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) * from swap. But that's a lot of code to duplicate here * for a rare case, so we simply fetch the page. */ - err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); + err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); if (err >= 0) { key->shared.pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -246,12 +247,12 @@ static void drop_key_refs(union futex_key *key) } } -static inline int get_futex_value_locked(int *dest, int __user *from) +static inline int get_futex_value_locked(u32 *dest, u32 __user *from) { int ret; inc_preempt_count(); - ret = __copy_from_user_inatomic(dest, from, sizeof(int)); + ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); dec_preempt_count(); return ret ? 
-EFAULT : 0; @@ -288,12 +289,12 @@ static void wake_futex(struct futex_q *q) * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ -static int futex_wake(unsigned long uaddr, int nr_wake) +static int futex_wake(u32 __user *uaddr, int nr_wake) { - union futex_key key; - struct futex_hash_bucket *bh; - struct list_head *head; + struct futex_hash_bucket *hb; struct futex_q *this, *next; + struct list_head *head; + union futex_key key; int ret; down_read(¤t->mm->mmap_sem); @@ -302,9 +303,9 @@ static int futex_wake(unsigned long uaddr, int nr_wake) if (unlikely(ret != 0)) goto out; - bh = hash_futex(&key); - spin_lock(&bh->lock); - head = &bh->chain; + hb = hash_futex(&key); + spin_lock(&hb->lock); + head = &hb->chain; list_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { @@ -314,7 +315,7 @@ static int futex_wake(unsigned long uaddr, int nr_wake) } } - spin_unlock(&bh->lock); + spin_unlock(&hb->lock); out: up_read(¤t->mm->mmap_sem); return ret; @@ -324,10 +325,12 @@ out: * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ -static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) +static int +futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, + int nr_wake, int nr_wake2, int op) { union futex_key key1, key2; - struct futex_hash_bucket *bh1, *bh2; + struct futex_hash_bucket *hb1, *hb2; struct list_head *head; struct futex_q *this, *next; int ret, op_ret, attempt = 0; @@ -342,27 +345,29 @@ retryfull: if (unlikely(ret != 0)) goto out; - bh1 = hash_futex(&key1); - bh2 = hash_futex(&key2); + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); retry: - if (bh1 < bh2) - spin_lock(&bh1->lock); - spin_lock(&bh2->lock); - if (bh1 > bh2) - spin_lock(&bh1->lock); + if (hb1 < hb2) + spin_lock(&hb1->lock); + spin_lock(&hb2->lock); + if (hb1 > hb2) + spin_lock(&hb1->lock); - op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); + op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { - int dummy; + u32 dummy; - spin_unlock(&bh1->lock); - if (bh1 != bh2) - spin_unlock(&bh2->lock); + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); #ifndef CONFIG_MMU - /* we don't get EFAULT from MMU faults if we don't have an MMU, - * but we might get them from range checking */ + /* + * we don't get EFAULT from MMU faults if we don't have an MMU, + * but we might get them from range checking + */ ret = op_ret; goto out; #endif @@ -372,23 +377,26 @@ retry: goto out; } - /* futex_atomic_op_inuser needs to both read and write + /* + * futex_atomic_op_inuser needs to both read and write * *(int __user *)uaddr2, but we can't modify it * non-atomically. Therefore, if get_user below is not * enough, we need to handle the fault ourselves, while - * still holding the mmap_sem. */ + * still holding the mmap_sem. + */ if (attempt++) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; + unsigned long address = (unsigned long)uaddr2; ret = -EFAULT; if (attempt >= 2 || - !(vma = find_vma(mm, uaddr2)) || - vma->vm_start > uaddr2 || + !(vma = find_vma(mm, address)) || + vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) goto out; - switch (handle_mm_fault(mm, vma, uaddr2, 1)) { + switch (handle_mm_fault(mm, vma, address, 1)) { case VM_FAULT_MINOR: current->min_flt++; break; @@ -401,18 +409,20 @@ retry: goto retry; } - /* If we would have faulted, release mmap_sem, - * fault it in and start all over again. 
*/ + /* + * If we would have faulted, release mmap_sem, + * fault it in and start all over again. + */ up_read(¤t->mm->mmap_sem); - ret = get_user(dummy, (int __user *)uaddr2); + ret = get_user(dummy, uaddr2); if (ret) return ret; goto retryfull; } - head = &bh1->chain; + head = &hb1->chain; list_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key1)) { @@ -423,7 +433,7 @@ retry: } if (op_ret > 0) { - head = &bh2->chain; + head = &hb2->chain; op_ret = 0; list_for_each_entry_safe(this, next, head, list) { @@ -436,9 +446,9 @@ retry: ret += op_ret; } - spin_unlock(&bh1->lock); - if (bh1 != bh2) - spin_unlock(&bh2->lock); + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); out: up_read(¤t->mm->mmap_sem); return ret; @@ -448,11 +458,11 @@ out: * Requeue all waiters hashed on one physical page to another * physical page. */ -static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, - int nr_wake, int nr_requeue, int *valp) +static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, + int nr_wake, int nr_requeue, u32 *cmpval) { union futex_key key1, key2; - struct futex_hash_bucket *bh1, *bh2; + struct futex_hash_bucket *hb1, *hb2; struct list_head *head1; struct futex_q *this, *next; int ret, drop_count = 0; @@ -467,68 +477,69 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, if (unlikely(ret != 0)) goto out; - bh1 = hash_futex(&key1); - bh2 = hash_futex(&key2); + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); - if (bh1 < bh2) - spin_lock(&bh1->lock); - spin_lock(&bh2->lock); - if (bh1 > bh2) - spin_lock(&bh1->lock); + if (hb1 < hb2) + spin_lock(&hb1->lock); + spin_lock(&hb2->lock); + if (hb1 > hb2) + spin_lock(&hb1->lock); - if (likely(valp != NULL)) { - int curval; + if (likely(cmpval != NULL)) { + u32 curval; - ret = get_futex_value_locked(&curval, (int __user *)uaddr1); + ret = get_futex_value_locked(&curval, uaddr1); if (unlikely(ret)) { - spin_unlock(&bh1->lock); - if (bh1 != bh2) - spin_unlock(&bh2->lock); + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); - /* If we would have faulted, release mmap_sem, fault + /* + * If we would have faulted, release mmap_sem, fault * it in and start all over again. */ up_read(¤t->mm->mmap_sem); - ret = get_user(curval, (int __user *)uaddr1); + ret = get_user(curval, uaddr1); if (!ret) goto retry; return ret; } - if (curval != *valp) { + if (curval != *cmpval) { ret = -EAGAIN; goto out_unlock; } } - head1 = &bh1->chain; + head1 = &hb1->chain; list_for_each_entry_safe(this, next, head1, list) { if (!match_futex (&this->key, &key1)) continue; if (++ret <= nr_wake) { wake_futex(this); } else { - list_move_tail(&this->list, &bh2->chain); - this->lock_ptr = &bh2->lock; + list_move_tail(&this->list, &hb2->chain); + this->lock_ptr = &hb2->lock; this->key = key2; get_key_refs(&key2); drop_count++; if (ret - nr_wake >= nr_requeue) break; - /* Make sure to stop if key1 == key2 */ - if (head1 == &bh2->chain && head1 != &next->list) + /* Make sure to stop if key1 == key2: */ + if (head1 == &hb2->chain && head1 != &next->list) head1 = &this->list; } } out_unlock: - spin_unlock(&bh1->lock); - if (bh1 != bh2) - spin_unlock(&bh2->lock); + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); /* drop_key_refs() must be called outside the spinlocks. 
*/ while (--drop_count >= 0) @@ -543,7 +554,7 @@ out: static inline struct futex_hash_bucket * queue_lock(struct futex_q *q, int fd, struct file *filp) { - struct futex_hash_bucket *bh; + struct futex_hash_bucket *hb; q->fd = fd; q->filp = filp; @@ -551,23 +562,23 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) init_waitqueue_head(&q->waiters); get_key_refs(&q->key); - bh = hash_futex(&q->key); - q->lock_ptr = &bh->lock; + hb = hash_futex(&q->key); + q->lock_ptr = &hb->lock; - spin_lock(&bh->lock); - return bh; + spin_lock(&hb->lock); + return hb; } -static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) +static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { - list_add_tail(&q->list, &bh->chain); - spin_unlock(&bh->lock); + list_add_tail(&q->list, &hb->chain); + spin_unlock(&hb->lock); } static inline void -queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) +queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) { - spin_unlock(&bh->lock); + spin_unlock(&hb->lock); drop_key_refs(&q->key); } @@ -579,16 +590,17 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) /* The key must be already stored in q->key. */ static void queue_me(struct futex_q *q, int fd, struct file *filp) { - struct futex_hash_bucket *bh; - bh = queue_lock(q, fd, filp); - __queue_me(q, bh); + struct futex_hash_bucket *hb; + + hb = queue_lock(q, fd, filp); + __queue_me(q, hb); } /* Return 1 if we were still queued (ie. 0 means we were woken) */ static int unqueue_me(struct futex_q *q) { - int ret = 0; spinlock_t *lock_ptr; + int ret = 0; /* In the common case we don't take the spinlock, which is nice. */ retry: @@ -622,12 +634,13 @@ static int unqueue_me(struct futex_q *q) return ret; } -static int futex_wait(unsigned long uaddr, int val, unsigned long time) +static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) { DECLARE_WAITQUEUE(wait, current); - int ret, curval; + struct futex_hash_bucket *hb; struct futex_q q; - struct futex_hash_bucket *bh; + u32 uval; + int ret; retry: down_read(¤t->mm->mmap_sem); @@ -636,7 +649,7 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) if (unlikely(ret != 0)) goto out_release_sem; - bh = queue_lock(&q, -1, NULL); + hb = queue_lock(&q, -1, NULL); /* * Access the page AFTER the futex is queued. @@ -658,31 +671,31 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) * We hold the mmap semaphore, so the mapping cannot have changed * since we looked it up in get_futex_key. */ - - ret = get_futex_value_locked(&curval, (int __user *)uaddr); + ret = get_futex_value_locked(&uval, uaddr); if (unlikely(ret)) { - queue_unlock(&q, bh); + queue_unlock(&q, hb); - /* If we would have faulted, release mmap_sem, fault it in and + /* + * If we would have faulted, release mmap_sem, fault it in and * start all over again. */ up_read(¤t->mm->mmap_sem); - ret = get_user(curval, (int __user *)uaddr); + ret = get_user(uval, uaddr); if (!ret) goto retry; return ret; } - if (curval != val) { + if (uval != val) { ret = -EWOULDBLOCK; - queue_unlock(&q, bh); + queue_unlock(&q, hb); goto out_release_sem; } /* Only actually queue if *uaddr contained val. 
*/ - __queue_me(&q, bh); + __queue_me(&q, hb); /* * Now the futex is queued and we have checked the data, we @@ -720,8 +733,10 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) return 0; if (time == 0) return -ETIMEDOUT; - /* We expect signal_pending(current), but another thread may - * have handled it for us already. */ + /* + * We expect signal_pending(current), but another thread may + * have handled it for us already. + */ return -EINTR; out_release_sem: @@ -735,6 +750,7 @@ static int futex_close(struct inode *inode, struct file *filp) unqueue_me(q); kfree(q); + return 0; } @@ -766,7 +782,7 @@ static struct file_operations futex_fops = { * Signal allows caller to avoid the race which would occur if they * set the sigio stuff up afterwards. */ -static int futex_fd(unsigned long uaddr, int signal) +static int futex_fd(u32 __user *uaddr, int signal) { struct futex_q *q; struct file *filp; @@ -937,7 +953,7 @@ retry: goto retry; if (uval & FUTEX_WAITERS) - futex_wake((unsigned long)uaddr, 1); + futex_wake(uaddr, 1); } return 0; } @@ -999,8 +1015,8 @@ void exit_robust_list(struct task_struct *curr) } } -long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, - unsigned long uaddr2, int val2, int val3) +long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, + u32 __user *uaddr2, u32 val2, u32 val3) { int ret; @@ -1031,13 +1047,13 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, } -asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, +asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, struct timespec __user *utime, u32 __user *uaddr2, - int val3) + u32 val3) { struct timespec t; unsigned long timeout = MAX_SCHEDULE_TIMEOUT; - int val2 = 0; + u32 val2 = 0; if (utime && (op == FUTEX_WAIT)) { if (copy_from_user(&t, utime, sizeof(t)) != 0) @@ -1050,10 +1066,9 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, * requeue parameter in 'utime' if op == FUTEX_REQUEUE. */ if (op >= FUTEX_REQUEUE) - val2 = (int) (unsigned long) utime; + val2 = (u32) (unsigned long) utime; - return do_futex((unsigned long)uaddr, op, val, timeout, - (unsigned long)uaddr2, val2, val3); + return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); } static int futexfs_get_sb(struct file_system_type *fs_type, diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 1ab6a0ea3d1..7e57c31670a 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -139,6 +139,5 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, if (op >= FUTEX_REQUEUE) val2 = (int) (unsigned long) utime; - return do_futex((unsigned long)uaddr, op, val, timeout, - (unsigned long)uaddr2, val2, val3); + return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); } -- cgit From b29739f902ee76a05493fb7d2303490fc75364f4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jun 2006 02:54:51 -0700 Subject: [PATCH] pi-futex: scheduler support for pi Add framework to boost/unboost the priority of RT tasks. This consists of: - caching the 'normal' priority in ->normal_prio - providing a functions to set/get the priority of the task - make sched_setscheduler() aware of boosting The effective_prio() cleanups also fix a priority-calculation bug pointed out by Andrey Gelman, in set_user_nice(). 
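As a rough illustration of the boosting rule this framework implements (the kernel does this on task_struct via rt_mutex_getprio()/rt_mutex_setprio(); the struct and function names below are made up for the example): the effective ->prio tracks ->normal_prio unless a queued PI waiter has a numerically lower, i.e. higher, priority.

	#include <stdio.h>

	/* Illustrative stand-ins for the relevant task_struct fields. */
	struct model_task {
		int normal_prio;	/* priority from policy/static_prio, ignoring PI */
		int prio;		/* effective priority the scheduler acts on */
	};

	/*
	 * Mirrors the rule of rt_mutex_getprio(): with PI waiters queued,
	 * the effective priority is the minimum (= highest) of normal_prio
	 * and the top waiter's priority; lower numbers mean higher priority.
	 */
	static int model_getprio(const struct model_task *t,
				 int has_pi_waiters, int top_waiter_prio)
	{
		if (!has_pi_waiters || top_waiter_prio >= t->normal_prio)
			return t->normal_prio;
		return top_waiter_prio;
	}

	int main(void)
	{
		struct model_task t = { .normal_prio = 120, .prio = 120 };

		/* An RT waiter at prio 50 blocks on a lock we hold: boost. */
		t.prio = model_getprio(&t, 1, 50);
		printf("boosted prio:   %d\n", t.prio);

		/* The waiter is gone: unboost back to normal_prio. */
		t.prio = model_getprio(&t, 0, 0);
		printf("unboosted prio: %d\n", t.prio);

		return 0;
	}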
has_rt_policy() fix: Peter Williams Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Arjan van de Ven Cc: Andrey Gelman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 189 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 160 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 15abf083324..08431f07a99 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -354,6 +354,25 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ +/* + * __task_rq_lock - lock the runqueue a given task resides on. + * Must be called interrupts disabled. + */ +static inline runqueue_t *__task_rq_lock(task_t *p) + __acquires(rq->lock) +{ + struct runqueue *rq; + +repeat_lock_task: + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock(&rq->lock); + goto repeat_lock_task; + } + return rq; +} + /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without @@ -375,6 +394,12 @@ repeat_lock_task: return rq; } +static inline void __task_rq_unlock(runqueue_t *rq) + __releases(rq->lock) +{ + spin_unlock(&rq->lock); +} + static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) __releases(rq->lock) { @@ -638,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) } /* - * effective_prio - return the priority that is based on the static + * __normal_prio - return the priority that is based on the static * priority but is modified by bonuses/penalties. * * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] @@ -651,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) * * Both properties are important to certain workloads. */ -static int effective_prio(task_t *p) + +static inline int __normal_prio(task_t *p) { int bonus, prio; - if (rt_task(p)) - return p->prio; - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; prio = p->static_prio - bonus; @@ -692,7 +715,7 @@ static int effective_prio(task_t *p) static void set_load_weight(task_t *p) { - if (rt_task(p)) { + if (has_rt_policy(p)) { #ifdef CONFIG_SMP if (p == task_rq(p)->migration_thread) /* @@ -730,6 +753,44 @@ static inline void dec_nr_running(task_t *p, runqueue_t *rq) dec_raw_weighted_load(rq, p); } +/* + * Calculate the expected normal priority: i.e. priority + * without taking RT-inheritance into account. Might be + * boosted by interactivity modifiers. Changes upon fork, + * setprio syscalls, and whenever the interactivity + * estimator recalculates. + */ +static inline int normal_prio(task_t *p) +{ + int prio; + + if (has_rt_policy(p)) + prio = MAX_RT_PRIO-1 - p->rt_priority; + else + prio = __normal_prio(p); + return prio; +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks, or might be boosted by + * interactivity modifiers. Will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ +static int effective_prio(task_t *p) +{ + p->normal_prio = normal_prio(p); + /* + * If we are RT tasks or we were boosted to RT priority, + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ + if (!rt_prio(p->prio)) + return p->normal_prio; + return p->prio; +} + /* * __activate_task - move a task to the runqueue. 
*/ @@ -752,6 +813,10 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) inc_nr_running(p, rq); } +/* + * Recalculate p->normal_prio and p->prio after having slept, + * updating the sleep-average too: + */ static int recalc_task_prio(task_t *p, unsigned long long now) { /* Caller must always ensure 'now >= p->timestamp' */ @@ -1448,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags) * event cannot wake it up and insert it on the runqueue either. */ p->state = TASK_RUNNING; + + /* + * Make sure we do not leak PI boosting priority to the child: + */ + p->prio = current->normal_prio; + INIT_LIST_HEAD(&p->run_list); p->array = NULL; #ifdef CONFIG_SCHEDSTATS @@ -1527,6 +1598,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) __activate_task(p, rq); else { p->prio = current->prio; + p->normal_prio = current->normal_prio; list_add_tail(&p->run_list, ¤t->run_list); p->array = current->array; p->array->nr_active++; @@ -3668,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) EXPORT_SYMBOL(sleep_on_timeout); +#ifdef CONFIG_RT_MUTEXES + +/* + * rt_mutex_setprio - set the current priority of a task + * @p: task + * @prio: prio value (kernel-internal form) + * + * This function changes the 'effective' priority of a task. It does + * not touch ->normal_prio like __setscheduler(). + * + * Used by the rt_mutex code to implement priority inheritance logic. + */ +void rt_mutex_setprio(task_t *p, int prio) +{ + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + int oldprio; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = task_rq_lock(p, &flags); + + oldprio = p->prio; + array = p->array; + if (array) + dequeue_task(p, array); + p->prio = prio; + + if (array) { + /* + * If changing to an RT priority then queue it + * in the active array! 
+ */ + if (rt_task(p)) + array = rq->active; + enqueue_task(p, array); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + task_rq_unlock(rq, &flags); +} + +#endif + void set_user_nice(task_t *p, long nice) { unsigned long flags; prio_array_t *array; runqueue_t *rq; - int old_prio, new_prio, delta; + int old_prio, delta; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -3688,7 +3813,7 @@ void set_user_nice(task_t *p, long nice) * it wont have any effect on scheduling until the task is * not SCHED_NORMAL/SCHED_BATCH: */ - if (rt_task(p)) { + if (has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } @@ -3698,12 +3823,11 @@ void set_user_nice(task_t *p, long nice) dec_raw_weighted_load(rq, p); } - old_prio = p->prio; - new_prio = NICE_TO_PRIO(nice); - delta = new_prio - old_prio; p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); - p->prio += delta; + old_prio = p->prio; + p->prio = effective_prio(p); + delta = p->prio - old_prio; if (array) { enqueue_task(p, array); @@ -3718,7 +3842,6 @@ void set_user_nice(task_t *p, long nice) out_unlock: task_rq_unlock(rq, &flags); } - EXPORT_SYMBOL(set_user_nice); /* @@ -3833,16 +3956,14 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) BUG_ON(p->array); p->policy = policy; p->rt_priority = prio; - if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { - p->prio = MAX_RT_PRIO-1 - p->rt_priority; - } else { - p->prio = p->static_prio; - /* - * SCHED_BATCH tasks are treated as perpetual CPU hogs: - */ - if (policy == SCHED_BATCH) - p->sleep_avg = 0; - } + p->normal_prio = normal_prio(p); + /* we are holding p->pi_lock already */ + p->prio = rt_mutex_getprio(p); + /* + * SCHED_BATCH tasks are treated as perpetual CPU hogs: + */ + if (policy == SCHED_BATCH) + p->sleep_avg = 0; set_load_weight(p); } @@ -3911,15 +4032,21 @@ recheck: retval = security_task_setscheduler(p, policy, param); if (retval) return retval; + /* + * make sure no PI-waiters arrive (or leave) while we are + * changing the priority of the task: + */ + spin_lock_irqsave(&p->pi_lock, flags); /* * To be able to change p->policy safely, the apropriate * runqueue lock must be held. 
*/ - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } array = p->array; @@ -3940,7 +4067,9 @@ recheck: } else if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); + return 0; } EXPORT_SYMBOL_GPL(sched_setscheduler); @@ -4575,7 +4704,7 @@ void __devinit init_idle(task_t *idle, int cpu) idle->timestamp = sched_clock(); idle->sleep_avg = 0; idle->array = NULL; - idle->prio = MAX_PRIO; + idle->prio = idle->normal_prio = MAX_PRIO; idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); set_task_cpu(idle, cpu); @@ -6582,7 +6711,8 @@ void normalize_rt_tasks(void) if (!rt_task(p)) continue; - rq = task_rq_lock(p, &flags); + spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_rq_lock(p); array = p->array; if (array) @@ -6593,7 +6723,8 @@ void normalize_rt_tasks(void) resched_task(rq->curr); } - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); } read_unlock_irq(&tasklist_lock); } -- cgit From 23f78d4a03c53cbd75d87a795378ea540aa08c86 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jun 2006 02:54:53 -0700 Subject: [PATCH] pi-futex: rt mutex core Core functions for the rt-mutex subsystem. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/fork.c | 16 + kernel/rtmutex.c | 904 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rtmutex.h | 29 ++ kernel/rtmutex_common.h | 93 +++++ kernel/sysctl.c | 15 + 6 files changed, 1058 insertions(+) create mode 100644 kernel/rtmutex.c create mode 100644 kernel/rtmutex.h create mode 100644 kernel/rtmutex_common.h (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 752bd7d383a..21df9a338ff 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_FUTEX) += futex.o ifeq ($(CONFIG_COMPAT),y) obj-$(CONFIG_FUTEX) += futex_compat.o endif +obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o diff --git a/kernel/fork.c b/kernel/fork.c index 9b4e54ef022..b664a081fff 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -104,6 +104,7 @@ static kmem_cache_t *mm_cachep; void free_task(struct task_struct *tsk) { free_thread_info(tsk->thread_info); + rt_mutex_debug_task_free(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -913,6 +914,19 @@ asmlinkage long sys_set_tid_address(int __user *tidptr) return current->pid; } +static inline void rt_mutex_init_task(struct task_struct *p) +{ +#ifdef CONFIG_RT_MUTEXES + spin_lock_init(&p->pi_lock); + plist_head_init(&p->pi_waiters, &p->pi_lock); + p->pi_blocked_on = NULL; +# ifdef CONFIG_DEBUG_RT_MUTEXES + spin_lock_init(&p->held_list_lock); + INIT_LIST_HEAD(&p->held_list_head); +# endif +#endif +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. 
@@ -1034,6 +1048,8 @@ static task_t *copy_process(unsigned long clone_flags, mpol_fix_fork_child_flag(p); #endif + rt_mutex_init_task(p); + #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c new file mode 100644 index 00000000000..937a474fae9 --- /dev/null +++ b/kernel/rtmutex.c @@ -0,0 +1,904 @@ +/* + * RT-Mutexes: simple blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner. + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner + * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt + * Copyright (C) 2006 Esben Nielsen + */ +#include +#include +#include +#include + +#include "rtmutex_common.h" + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# include "rtmutex-debug.h" +#else +# include "rtmutex.h" +#endif + +/* + * lock->owner state tracking: + * + * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 + * are used to keep track of the "owner is pending" and "lock has + * waiters" state. + * + * owner bit1 bit0 + * NULL 0 0 lock is free (fast acquire possible) + * NULL 0 1 invalid state + * NULL 1 0 Transitional State* + * NULL 1 1 invalid state + * taskpointer 0 0 lock is held (fast release possible) + * taskpointer 0 1 task is pending owner + * taskpointer 1 0 lock is held and has waiters + * taskpointer 1 1 task is pending owner and lock has more waiters + * + * Pending ownership is assigned to the top (highest priority) + * waiter of the lock, when the lock is released. The thread is woken + * up and can now take the lock. Until the lock is taken (bit 0 + * cleared) a competing higher priority thread can steal the lock + * which puts the woken up thread back on the waiters list. + * + * The fast atomic compare exchange based acquire and release is only + * possible when bit 0 and 1 of lock->owner are 0. + * + * (*) There's a small time where the owner can be NULL and the + * "lock has waiters" bit is set. This can happen when grabbing the lock. + * To prevent a cmpxchg of the owner releasing the lock, we need to set this + * bit before looking at the lock, hence the reason this is a transitional + * state. 
+ */ + +static void +rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, + unsigned long mask) +{ + unsigned long val = (unsigned long)owner | mask; + + if (rt_mutex_has_waiters(lock)) + val |= RT_MUTEX_HAS_WAITERS; + + lock->owner = (struct task_struct *)val; +} + +static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); +} + +static void fixup_rt_mutex_waiters(struct rt_mutex *lock) +{ + if (!rt_mutex_has_waiters(lock)) + clear_rt_mutex_waiters(lock); +} + +/* + * We can speed up the acquire/release, if the architecture + * supports cmpxchg and if there's no debugging state to be set up + */ +#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) +# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + unsigned long owner, *p = (unsigned long *) &lock->owner; + + do { + owner = *p; + } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); +} +#else +# define rt_mutex_cmpxchg(l,c,n) (0) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); +} +#endif + +/* + * Calculate task priority from the waiter list priority + * + * Return task->normal_prio when the waiter list is empty or when + * the waiter is not allowed to do priority boosting + */ +int rt_mutex_getprio(struct task_struct *task) +{ + if (likely(!task_has_pi_waiters(task))) + return task->normal_prio; + + return min(task_top_pi_waiter(task)->pi_list_entry.prio, + task->normal_prio); +} + +/* + * Adjust the priority of a task, after its pi_waiters got modified. + * + * This can be both boosting and unboosting. task->pi_lock must be held. + */ +static void __rt_mutex_adjust_prio(struct task_struct *task) +{ + int prio = rt_mutex_getprio(task); + + if (task->prio != prio) + rt_mutex_setprio(task, prio); +} + +/* + * Adjust task priority (undo boosting). Called from the exit path of + * rt_mutex_slowunlock() and rt_mutex_slowlock(). + * + * (Note: We do this outside of the protection of lock->wait_lock to + * allow the lock to be taken while or before we readjust the priority + * of task. We do not use the spin_xx_mutex() variants here as we are + * outside of the debug path.) + */ +static void rt_mutex_adjust_prio(struct task_struct *task) +{ + unsigned long flags; + + spin_lock_irqsave(&task->pi_lock, flags); + __rt_mutex_adjust_prio(task); + spin_unlock_irqrestore(&task->pi_lock, flags); +} + +/* + * Max number of times we'll walk the boosting chain: + */ +int max_lock_depth = 1024; + +/* + * Adjust the priority chain. Also used for deadlock detection. + * Decreases task's usage by one - may thus free the task. + * Returns 0 or -EDEADLK. + */ +static int rt_mutex_adjust_prio_chain(task_t *task, + int deadlock_detect, + struct rt_mutex *orig_lock, + struct rt_mutex_waiter *orig_waiter + __IP_DECL__) +{ + struct rt_mutex *lock; + struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; + int detect_deadlock, ret = 0, depth = 0; + unsigned long flags; + + detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, + deadlock_detect); + + /* + * The (de)boosting is a step by step approach with a lot of + * pitfalls. We want this to be preemptible and we want hold a + * maximum of two locks per step. So we have to check + * carefully whether things change under us. 
+ */ + again: + if (++depth > max_lock_depth) { + static int prev_max; + + /* + * Print this only once. If the admin changes the limit, + * print a new message when reaching the limit again. + */ + if (prev_max != max_lock_depth) { + prev_max = max_lock_depth; + printk(KERN_WARNING "Maximum lock depth %d reached " + "task: %s (%d)\n", max_lock_depth, + current->comm, current->pid); + } + put_task_struct(task); + + return deadlock_detect ? -EDEADLK : 0; + } + retry: + /* + * Task can not go away as we did a get_task() before ! + */ + spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; + /* + * Check whether the end of the boosting chain has been + * reached or the state of the chain has changed while we + * dropped the locks. + */ + if (!waiter || !waiter->task) + goto out_unlock_pi; + + if (top_waiter && (!task_has_pi_waiters(task) || + top_waiter != task_top_pi_waiter(task))) + goto out_unlock_pi; + + /* + * When deadlock detection is off then we check, if further + * priority adjustment is necessary. + */ + if (!detect_deadlock && waiter->list_entry.prio == task->prio) + goto out_unlock_pi; + + lock = waiter->lock; + if (!spin_trylock(&lock->wait_lock)) { + spin_unlock_irqrestore(&task->pi_lock, flags); + cpu_relax(); + goto retry; + } + + /* Deadlock detection */ + if (lock == orig_lock || rt_mutex_owner(lock) == current) { + debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); + spin_unlock(&lock->wait_lock); + ret = deadlock_detect ? -EDEADLK : 0; + goto out_unlock_pi; + } + + top_waiter = rt_mutex_top_waiter(lock); + + /* Requeue the waiter */ + plist_del(&waiter->list_entry, &lock->wait_list); + waiter->list_entry.prio = task->prio; + plist_add(&waiter->list_entry, &lock->wait_list); + + /* Release the task */ + spin_unlock_irqrestore(&task->pi_lock, flags); + put_task_struct(task); + + /* Grab the next task */ + task = rt_mutex_owner(lock); + spin_lock_irqsave(&task->pi_lock, flags); + + if (waiter == rt_mutex_top_waiter(lock)) { + /* Boost the owner */ + plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); + waiter->pi_list_entry.prio = waiter->list_entry.prio; + plist_add(&waiter->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + + } else if (top_waiter == waiter) { + /* Deboost the owner */ + plist_del(&waiter->pi_list_entry, &task->pi_waiters); + waiter = rt_mutex_top_waiter(lock); + waiter->pi_list_entry.prio = waiter->list_entry.prio; + plist_add(&waiter->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + } + + get_task_struct(task); + spin_unlock_irqrestore(&task->pi_lock, flags); + + top_waiter = rt_mutex_top_waiter(lock); + spin_unlock(&lock->wait_lock); + + if (!detect_deadlock && waiter != top_waiter) + goto out_put_task; + + goto again; + + out_unlock_pi: + spin_unlock_irqrestore(&task->pi_lock, flags); + out_put_task: + put_task_struct(task); + return ret; +} + +/* + * Optimization: check if we can steal the lock from the + * assigned pending owner [which might not have taken the + * lock yet]: + */ +static inline int try_to_steal_lock(struct rt_mutex *lock) +{ + struct task_struct *pendowner = rt_mutex_owner(lock); + struct rt_mutex_waiter *next; + unsigned long flags; + + if (!rt_mutex_owner_pending(lock)) + return 0; + + if (pendowner == current) + return 1; + + spin_lock_irqsave(&pendowner->pi_lock, flags); + if (current->prio >= pendowner->prio) { + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + return 0; + } + + /* + * Check if a waiter is enqueued on the pending owners + * 
pi_waiters list. Remove it and readjust pending owners + * priority. + */ + if (likely(!rt_mutex_has_waiters(lock))) { + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + return 1; + } + + /* No chain handling, pending owner is not blocked on anything: */ + next = rt_mutex_top_waiter(lock); + plist_del(&next->pi_list_entry, &pendowner->pi_waiters); + __rt_mutex_adjust_prio(pendowner); + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + + /* + * We are going to steal the lock and a waiter was + * enqueued on the pending owners pi_waiters queue. So + * we have to enqueue this waiter into + * current->pi_waiters list. This covers the case, + * where current is boosted because it holds another + * lock and gets unboosted because the booster is + * interrupted, so we would delay a waiter with higher + * priority as current->normal_prio. + * + * Note: in the rare case of a SCHED_OTHER task changing + * its priority and thus stealing the lock, next->task + * might be current: + */ + if (likely(next->task != current)) { + spin_lock_irqsave(¤t->pi_lock, flags); + plist_add(&next->pi_list_entry, ¤t->pi_waiters); + __rt_mutex_adjust_prio(current); + spin_unlock_irqrestore(¤t->pi_lock, flags); + } + return 1; +} + +/* + * Try to take an rt-mutex + * + * This fails + * - when the lock has a real owner + * - when a different pending owner exists and has higher priority than current + * + * Must be called with lock->wait_lock held. + */ +static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__) +{ + /* + * We have to be careful here if the atomic speedups are + * enabled, such that, when + * - no other waiter is on the lock + * - the lock has been released since we did the cmpxchg + * the lock can be released or taken while we are doing the + * checks and marking the lock with RT_MUTEX_HAS_WAITERS. + * + * The atomic acquire/release aware variant of + * mark_rt_mutex_waiters uses a cmpxchg loop. After setting + * the WAITERS bit, the atomic release / acquire can not + * happen anymore and lock->wait_lock protects us from the + * non-atomic case. + * + * Note, that this might set lock->owner = + * RT_MUTEX_HAS_WAITERS in the case the lock is not contended + * any more. This is fixed up when we take the ownership. + * This is the transitional state explained at the top of this file. + */ + mark_rt_mutex_waiters(lock); + + if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) + return 0; + + /* We got the lock. */ + debug_rt_mutex_lock(lock __IP__); + + rt_mutex_set_owner(lock, current, 0); + + rt_mutex_deadlock_account_lock(lock, current); + + return 1; +} + +/* + * Task blocks on lock. + * + * Prepare waiter and propagate pi chain + * + * This must be called with lock->wait_lock held. 
+ */ +static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + int detect_deadlock + __IP_DECL__) +{ + struct rt_mutex_waiter *top_waiter = waiter; + task_t *owner = rt_mutex_owner(lock); + int boost = 0, res; + unsigned long flags; + + spin_lock_irqsave(¤t->pi_lock, flags); + __rt_mutex_adjust_prio(current); + waiter->task = current; + waiter->lock = lock; + plist_node_init(&waiter->list_entry, current->prio); + plist_node_init(&waiter->pi_list_entry, current->prio); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) + top_waiter = rt_mutex_top_waiter(lock); + plist_add(&waiter->list_entry, &lock->wait_list); + + current->pi_blocked_on = waiter; + + spin_unlock_irqrestore(¤t->pi_lock, flags); + + if (waiter == rt_mutex_top_waiter(lock)) { + spin_lock_irqsave(&owner->pi_lock, flags); + plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); + plist_add(&waiter->pi_list_entry, &owner->pi_waiters); + + __rt_mutex_adjust_prio(owner); + if (owner->pi_blocked_on) { + boost = 1; + get_task_struct(owner); + } + spin_unlock_irqrestore(&owner->pi_lock, flags); + } + else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { + spin_lock_irqsave(&owner->pi_lock, flags); + if (owner->pi_blocked_on) { + boost = 1; + get_task_struct(owner); + } + spin_unlock_irqrestore(&owner->pi_lock, flags); + } + if (!boost) + return 0; + + spin_unlock(&lock->wait_lock); + + res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, + waiter __IP__); + + spin_lock(&lock->wait_lock); + + return res; +} + +/* + * Wake up the next waiter on the lock. + * + * Remove the top waiter from the current tasks waiter list and from + * the lock waiter list. Set it as pending owner. Then wake it up. + * + * Called with lock->wait_lock held. + */ +static void wakeup_next_waiter(struct rt_mutex *lock) +{ + struct rt_mutex_waiter *waiter; + struct task_struct *pendowner; + unsigned long flags; + + spin_lock_irqsave(¤t->pi_lock, flags); + + waiter = rt_mutex_top_waiter(lock); + plist_del(&waiter->list_entry, &lock->wait_list); + + /* + * Remove it from current->pi_waiters. We do not adjust a + * possible priority boost right now. We execute wakeup in the + * boosted mode and go back to normal after releasing + * lock->wait_lock. + */ + plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); + pendowner = waiter->task; + waiter->task = NULL; + + rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); + + spin_unlock_irqrestore(¤t->pi_lock, flags); + + /* + * Clear the pi_blocked_on variable and enqueue a possible + * waiter into the pi_waiters list of the pending owner. This + * prevents that in case the pending owner gets unboosted a + * waiter with higher priority than pending-owner->normal_prio + * is blocked on the unboosted (pending) owner. 
+ */ + spin_lock_irqsave(&pendowner->pi_lock, flags); + + WARN_ON(!pendowner->pi_blocked_on); + WARN_ON(pendowner->pi_blocked_on != waiter); + WARN_ON(pendowner->pi_blocked_on->lock != lock); + + pendowner->pi_blocked_on = NULL; + + if (rt_mutex_has_waiters(lock)) { + struct rt_mutex_waiter *next; + + next = rt_mutex_top_waiter(lock); + plist_add(&next->pi_list_entry, &pendowner->pi_waiters); + } + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + + wake_up_process(pendowner); +} + +/* + * Remove a waiter from a lock + * + * Must be called with lock->wait_lock held + */ +static void remove_waiter(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter __IP_DECL__) +{ + int first = (waiter == rt_mutex_top_waiter(lock)); + int boost = 0; + task_t *owner = rt_mutex_owner(lock); + unsigned long flags; + + spin_lock_irqsave(¤t->pi_lock, flags); + plist_del(&waiter->list_entry, &lock->wait_list); + waiter->task = NULL; + current->pi_blocked_on = NULL; + spin_unlock_irqrestore(¤t->pi_lock, flags); + + if (first && owner != current) { + + spin_lock_irqsave(&owner->pi_lock, flags); + + plist_del(&waiter->pi_list_entry, &owner->pi_waiters); + + if (rt_mutex_has_waiters(lock)) { + struct rt_mutex_waiter *next; + + next = rt_mutex_top_waiter(lock); + plist_add(&next->pi_list_entry, &owner->pi_waiters); + } + __rt_mutex_adjust_prio(owner); + + if (owner->pi_blocked_on) { + boost = 1; + get_task_struct(owner); + } + spin_unlock_irqrestore(&owner->pi_lock, flags); + } + + WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + + if (!boost) + return; + + spin_unlock(&lock->wait_lock); + + rt_mutex_adjust_prio_chain(owner, 0, lock, NULL __IP__); + + spin_lock(&lock->wait_lock); +} + +/* + * Slow path lock function: + */ +static int __sched +rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock __IP_DECL__) +{ + struct rt_mutex_waiter waiter; + int ret = 0; + + debug_rt_mutex_init_waiter(&waiter); + waiter.task = NULL; + + spin_lock(&lock->wait_lock); + + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock __IP__)) { + spin_unlock(&lock->wait_lock); + return 0; + } + + set_current_state(state); + + /* Setup the timer, when timeout != NULL */ + if (unlikely(timeout)) + hrtimer_start(&timeout->timer, timeout->timer.expires, + HRTIMER_ABS); + + for (;;) { + /* Try to acquire the lock: */ + if (try_to_take_rt_mutex(lock __IP__)) + break; + + /* + * TASK_INTERRUPTIBLE checks for signals and + * timeout. Ignored otherwise. + */ + if (unlikely(state == TASK_INTERRUPTIBLE)) { + /* Signal pending? */ + if (signal_pending(current)) + ret = -EINTR; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + if (ret) + break; + } + + /* + * waiter.task is NULL the first time we come here and + * when we have been woken up by the previous owner + * but the lock got stolen by a higher prio task. + */ + if (!waiter.task) { + ret = task_blocks_on_rt_mutex(lock, &waiter, + detect_deadlock __IP__); + /* + * If we got woken up by the owner then start loop + * all over without going into schedule to try + * to get the lock now: + */ + if (unlikely(!waiter.task)) + continue; + + if (unlikely(ret)) + break; + } + spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(&waiter); + + schedule(); + + spin_lock(&lock->wait_lock); + set_current_state(state); + } + + set_current_state(TASK_RUNNING); + + if (unlikely(waiter.task)) + remove_waiter(lock, &waiter __IP__); + + /* + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. 
We might have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + + spin_unlock(&lock->wait_lock); + + /* Remove pending timer: */ + if (unlikely(timeout)) + hrtimer_cancel(&timeout->timer); + + /* + * Readjust priority, when we did not get the lock. We might + * have been the pending owner and boosted. Since we did not + * take the lock, the PI boost has to go. + */ + if (unlikely(ret)) + rt_mutex_adjust_prio(current); + + debug_rt_mutex_free_waiter(&waiter); + + return ret; +} + +/* + * Slow path try-lock function: + */ +static inline int +rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__) +{ + int ret = 0; + + spin_lock(&lock->wait_lock); + + if (likely(rt_mutex_owner(lock) != current)) { + + ret = try_to_take_rt_mutex(lock __IP__); + /* + * try_to_take_rt_mutex() sets the lock waiters + * bit unconditionally. Clean this up. + */ + fixup_rt_mutex_waiters(lock); + } + + spin_unlock(&lock->wait_lock); + + return ret; +} + +/* + * Slow path to release a rt-mutex: + */ +static void __sched +rt_mutex_slowunlock(struct rt_mutex *lock) +{ + spin_lock(&lock->wait_lock); + + debug_rt_mutex_unlock(lock); + + rt_mutex_deadlock_account_unlock(current); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + spin_unlock(&lock->wait_lock); + return; + } + + wakeup_next_waiter(lock); + + spin_unlock(&lock->wait_lock); + + /* Undo pi boosting if necessary: */ + rt_mutex_adjust_prio(current); +} + +/* + * debug aware fast / slowpath lock,trylock,unlock + * + * The atomic acquire/release ops are compiled away, when either the + * architecture does not support cmpxchg or when debugging is enabled. + */ +static inline int +rt_mutex_fastlock(struct rt_mutex *lock, int state, + int detect_deadlock, + int (*slowfn)(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock __IP_DECL__)) +{ + if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 0; + } else + return slowfn(lock, state, NULL, detect_deadlock __RET_IP__); +} + +static inline int +rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, int detect_deadlock, + int (*slowfn)(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock __IP_DECL__)) +{ + if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 0; + } else + return slowfn(lock, state, timeout, detect_deadlock __RET_IP__); +} + +static inline int +rt_mutex_fasttrylock(struct rt_mutex *lock, + int (*slowfn)(struct rt_mutex *lock __IP_DECL__)) +{ + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 1; + } + return slowfn(lock __RET_IP__); +} + +static inline void +rt_mutex_fastunlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) + rt_mutex_deadlock_account_unlock(current); + else + slowfn(lock); +} + +/** + * rt_mutex_lock - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + */ +void __sched rt_mutex_lock(struct rt_mutex *lock) +{ + might_sleep(); + + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock); + +/** + * rt_mutex_lock_interruptible - lock a rt_mutex interruptible + * + * @lock: the rt_mutex to be locked + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + 
* -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, + int detect_deadlock) +{ + might_sleep(); + + return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, + detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + +/** + * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible + * the timeout structure is provided + * by the caller + * + * @lock: the rt_mutex to be locked + * @timeout: timeout structure or NULL (no timeout) + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + * -ETIMEOUT when the timeout expired + * -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int +rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, + int detect_deadlock) +{ + might_sleep(); + + return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, + detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); + +/** + * rt_mutex_trylock - try to lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * + * Returns 1 on success and 0 on contention + */ +int __sched rt_mutex_trylock(struct rt_mutex *lock) +{ + return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); +} +EXPORT_SYMBOL_GPL(rt_mutex_trylock); + +/** + * rt_mutex_unlock - unlock a rt_mutex + * + * @lock: the rt_mutex to be unlocked + */ +void __sched rt_mutex_unlock(struct rt_mutex *lock) +{ + rt_mutex_fastunlock(lock, rt_mutex_slowunlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_unlock); + +/*** + * rt_mutex_destroy - mark a mutex unusable + * @lock: the mutex to be destroyed + * + * This function marks the mutex uninitialized, and any subsequent + * use of the mutex is forbidden. The mutex must not be locked when + * this function is called. + */ +void rt_mutex_destroy(struct rt_mutex *lock) +{ + WARN_ON(rt_mutex_is_locked(lock)); +#ifdef CONFIG_DEBUG_RT_MUTEXES + lock->magic = NULL; +#endif +} + +EXPORT_SYMBOL_GPL(rt_mutex_destroy); + +/** + * __rt_mutex_init - initialize the rt lock + * + * @lock: the rt lock to be initialized + * + * Initialize the rt lock to unlocked state. + * + * Initializing of a locked rt lock is not allowed + */ +void __rt_mutex_init(struct rt_mutex *lock, const char *name) +{ + lock->owner = NULL; + spin_lock_init(&lock->wait_lock); + plist_head_init(&lock->wait_list, &lock->wait_lock); + + debug_rt_mutex_init(lock, name); +} +EXPORT_SYMBOL_GPL(__rt_mutex_init); diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h new file mode 100644 index 00000000000..1e0fca13ff7 --- /dev/null +++ b/kernel/rtmutex.h @@ -0,0 +1,29 @@ +/* + * RT-Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * This file contains macros used solely by rtmutex.c. + * Non-debug version. 
+ */ + +#define __IP_DECL__ +#define __IP__ +#define __RET_IP__ +#define rt_mutex_deadlock_check(l) (0) +#define rt_mutex_deadlock_account_lock(m, t) do { } while (0) +#define rt_mutex_deadlock_account_unlock(l) do { } while (0) +#define debug_rt_mutex_init_waiter(w) do { } while (0) +#define debug_rt_mutex_free_waiter(w) do { } while (0) +#define debug_rt_mutex_lock(l) do { } while (0) +#define debug_rt_mutex_proxy_lock(l,p) do { } while (0) +#define debug_rt_mutex_proxy_unlock(l) do { } while (0) +#define debug_rt_mutex_unlock(l) do { } while (0) +#define debug_rt_mutex_init(m, n) do { } while (0) +#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) +#define debug_rt_mutex_print_deadlock(w) do { } while (0) +#define debug_rt_mutex_detect_deadlock(w,d) (d) +#define debug_rt_mutex_reset_waiter(w) do { } while (0) diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h new file mode 100644 index 00000000000..50eed60eb08 --- /dev/null +++ b/kernel/rtmutex_common.h @@ -0,0 +1,93 @@ +/* + * RT Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * This file contains the private data structure and API definitions. + */ + +#ifndef __KERNEL_RTMUTEX_COMMON_H +#define __KERNEL_RTMUTEX_COMMON_H + +#include + +/* + * This is the control structure for tasks blocked on a rt_mutex, + * which is allocated on the kernel stack on of the blocked task. + * + * @list_entry: pi node to enqueue into the mutex waiters list + * @pi_list_entry: pi node to enqueue into the mutex owner waiters list + * @task: task reference to the blocked task + */ +struct rt_mutex_waiter { + struct plist_node list_entry; + struct plist_node pi_list_entry; + struct task_struct *task; + struct rt_mutex *lock; +#ifdef CONFIG_DEBUG_RT_MUTEXES + unsigned long ip; + pid_t deadlock_task_pid; + struct rt_mutex *deadlock_lock; +#endif +}; + +/* + * Various helpers to access the waiters-plist: + */ +static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +{ + return !plist_head_empty(&lock->wait_list); +} + +static inline struct rt_mutex_waiter * +rt_mutex_top_waiter(struct rt_mutex *lock) +{ + struct rt_mutex_waiter *w; + + w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, + list_entry); + BUG_ON(w->lock != lock); + + return w; +} + +static inline int task_has_pi_waiters(struct task_struct *p) +{ + return !plist_head_empty(&p->pi_waiters); +} + +static inline struct rt_mutex_waiter * +task_top_pi_waiter(struct task_struct *p) +{ + return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, + pi_list_entry); +} + +/* + * lock->owner state tracking: + */ +#define RT_MUTEX_OWNER_PENDING 1UL +#define RT_MUTEX_HAS_WAITERS 2UL +#define RT_MUTEX_OWNER_MASKALL 3UL + +static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) +{ + return (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); +} + +static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) +{ + return (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); +} + +static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) +{ + return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; +} + +#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f54afed8426..93a2c539864 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -133,6 +133,10 @@ extern int acct_parm[]; extern int 
no_unaligned_warning; #endif +#ifdef CONFIG_RT_MUTEXES +extern int max_lock_depth; +#endif + static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **); static int proc_doutsstring(ctl_table *table, int write, struct file *filp, @@ -688,6 +692,17 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_RT_MUTEXES + { + .ctl_name = KERN_MAX_LOCK_DEPTH, + .procname = "max_lock_depth", + .data = &max_lock_depth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { .ctl_name = 0 } }; -- cgit From e7eebaf6a81b956c989f184ee4b27277c88f8afe Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jun 2006 02:54:55 -0700 Subject: [PATCH] pi-futex: rt mutex debug Runtime debugging functionality for rt-mutexes. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/exit.c | 1 + kernel/rtmutex-debug.c | 513 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rtmutex-debug.h | 37 ++++ 4 files changed, 552 insertions(+) create mode 100644 kernel/rtmutex-debug.c create mode 100644 kernel/rtmutex-debug.h (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 21df9a338ff..f9c92d34cde 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -17,6 +17,7 @@ ifeq ($(CONFIG_COMPAT),y) obj-$(CONFIG_FUTEX) += futex_compat.o endif obj-$(CONFIG_RT_MUTEXES) += rtmutex.o +obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o diff --git a/kernel/exit.c b/kernel/exit.c index 304ef637be6..3e8a0282e9a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -929,6 +929,7 @@ fastcall NORET_TYPE void do_exit(long code) * If DEBUG_MUTEXES is on, make sure we are holding no locks: */ mutex_debug_check_no_locks_held(tsk); + rt_mutex_debug_check_no_locks_held(tsk); if (tsk->io_context) exit_io_context(); diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c new file mode 100644 index 00000000000..4aa8a2c9f45 --- /dev/null +++ b/kernel/rtmutex-debug.c @@ -0,0 +1,513 @@ +/* + * RT-Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006 Timesys Corp., Thomas Gleixner + * + * This code is based on the rt.c implementation in the preempt-rt tree. 
+ * Portions of said code are + * + * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Copyright (C) 2006 Esben Nielsen + * Copyright (C) 2006 Kihon Technologies Inc., + * Steven Rostedt + * + * See rt.c in preempt-rt for proper credits and further information + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rtmutex_common.h" + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# include "rtmutex-debug.h" +#else +# include "rtmutex.h" +#endif + +# define TRACE_WARN_ON(x) WARN_ON(x) +# define TRACE_BUG_ON(x) BUG_ON(x) + +# define TRACE_OFF() \ +do { \ + if (rt_trace_on) { \ + rt_trace_on = 0; \ + console_verbose(); \ + if (spin_is_locked(¤t->pi_lock)) \ + spin_unlock(¤t->pi_lock); \ + if (spin_is_locked(¤t->held_list_lock)) \ + spin_unlock(¤t->held_list_lock); \ + } \ +} while (0) + +# define TRACE_OFF_NOLOCK() \ +do { \ + if (rt_trace_on) { \ + rt_trace_on = 0; \ + console_verbose(); \ + } \ +} while (0) + +# define TRACE_BUG_LOCKED() \ +do { \ + TRACE_OFF(); \ + BUG(); \ +} while (0) + +# define TRACE_WARN_ON_LOCKED(c) \ +do { \ + if (unlikely(c)) { \ + TRACE_OFF(); \ + WARN_ON(1); \ + } \ +} while (0) + +# define TRACE_BUG_ON_LOCKED(c) \ +do { \ + if (unlikely(c)) \ + TRACE_BUG_LOCKED(); \ +} while (0) + +#ifdef CONFIG_SMP +# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) +#else +# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) +#endif + +/* + * deadlock detection flag. We turn it off when we detect + * the first problem because we dont want to recurse back + * into the tracing code when doing error printk or + * executing a BUG(): + */ +int rt_trace_on = 1; + +void deadlock_trace_off(void) +{ + rt_trace_on = 0; +} + +static void printk_task(task_t *p) +{ + if (p) + printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); + else + printk(""); +} + +static void printk_task_short(task_t *p) +{ + if (p) + printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio); + else + printk(""); +} + +static void printk_lock(struct rt_mutex *lock, int print_owner) +{ + if (lock->name) + printk(" [%p] {%s}\n", + lock, lock->name); + else + printk(" [%p] {%s:%d}\n", + lock, lock->file, lock->line); + + if (print_owner && rt_mutex_owner(lock)) { + printk(".. ->owner: %p\n", lock->owner); + printk(".. held by: "); + printk_task(rt_mutex_owner(lock)); + printk("\n"); + } + if (rt_mutex_owner(lock)) { + printk("... 
acquired at: "); + print_symbol("%s\n", lock->acquire_ip); + } +} + +static void printk_waiter(struct rt_mutex_waiter *w) +{ + printk("-------------------------\n"); + printk("| waiter struct %p:\n", w); + printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", + w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next, + w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next, + w->list_entry.prio); + printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", + w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next, + w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next, + w->pi_list_entry.prio); + printk("\n| lock:\n"); + printk_lock(w->lock, 1); + printk("| w->ti->task:\n"); + printk_task(w->task); + printk("| blocked at: "); + print_symbol("%s\n", w->ip); + printk("-------------------------\n"); +} + +static void show_task_locks(task_t *p) +{ + switch (p->state) { + case TASK_RUNNING: printk("R"); break; + case TASK_INTERRUPTIBLE: printk("S"); break; + case TASK_UNINTERRUPTIBLE: printk("D"); break; + case TASK_STOPPED: printk("T"); break; + case EXIT_ZOMBIE: printk("Z"); break; + case EXIT_DEAD: printk("X"); break; + default: printk("?"); break; + } + printk_task(p); + if (p->pi_blocked_on) { + struct rt_mutex *lock = p->pi_blocked_on->lock; + + printk(" blocked on:"); + printk_lock(lock, 1); + } else + printk(" (not blocked)\n"); +} + +void rt_mutex_show_held_locks(task_t *task, int verbose) +{ + struct list_head *curr, *cursor = NULL; + struct rt_mutex *lock; + task_t *t; + unsigned long flags; + int count = 0; + + if (!rt_trace_on) + return; + + if (verbose) { + printk("------------------------------\n"); + printk("| showing all locks held by: | ("); + printk_task_short(task); + printk("):\n"); + printk("------------------------------\n"); + } + +next: + spin_lock_irqsave(&task->held_list_lock, flags); + list_for_each(curr, &task->held_list_head) { + if (cursor && curr != cursor) + continue; + lock = list_entry(curr, struct rt_mutex, held_list_entry); + t = rt_mutex_owner(lock); + WARN_ON(t != task); + count++; + cursor = curr->next; + spin_unlock_irqrestore(&task->held_list_lock, flags); + + printk("\n#%03d: ", count); + printk_lock(lock, 0); + goto next; + } + spin_unlock_irqrestore(&task->held_list_lock, flags); + + printk("\n"); +} + +void rt_mutex_show_all_locks(void) +{ + task_t *g, *p; + int count = 10; + int unlock = 1; + + printk("\n"); + printk("----------------------\n"); + printk("| showing all tasks: |\n"); + printk("----------------------\n"); + + /* + * Here we try to get the tasklist_lock as hard as possible, + * if not successful after 2 seconds we ignore it (but keep + * trying). This is to enable a debug printout even if a + * tasklist_lock-holding task deadlocks or crashes. + */ +retry: + if (!read_trylock(&tasklist_lock)) { + if (count == 10) + printk("hm, tasklist_lock locked, retrying... 
"); + if (count) { + count--; + printk(" #%d", 10-count); + mdelay(200); + goto retry; + } + printk(" ignoring it.\n"); + unlock = 0; + } + if (count != 10) + printk(" locked it.\n"); + + do_each_thread(g, p) { + show_task_locks(p); + if (!unlock) + if (read_trylock(&tasklist_lock)) + unlock = 1; + } while_each_thread(g, p); + + printk("\n"); + + printk("-----------------------------------------\n"); + printk("| showing all locks held in the system: |\n"); + printk("-----------------------------------------\n"); + + do_each_thread(g, p) { + rt_mutex_show_held_locks(p, 0); + if (!unlock) + if (read_trylock(&tasklist_lock)) + unlock = 1; + } while_each_thread(g, p); + + + printk("=============================================\n\n"); + + if (unlock) + read_unlock(&tasklist_lock); +} + +void rt_mutex_debug_check_no_locks_held(task_t *task) +{ + struct rt_mutex_waiter *w; + struct list_head *curr; + struct rt_mutex *lock; + + if (!rt_trace_on) + return; + if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) { + printk("BUG: PI priority boost leaked!\n"); + printk_task(task); + printk("\n"); + } + if (list_empty(&task->held_list_head)) + return; + + spin_lock(&task->pi_lock); + plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) { + TRACE_OFF(); + + printk("hm, PI interest held at exit time? Task:\n"); + printk_task(task); + printk_waiter(w); + return; + } + spin_unlock(&task->pi_lock); + + list_for_each(curr, &task->held_list_head) { + lock = list_entry(curr, struct rt_mutex, held_list_entry); + + printk("BUG: %s/%d, lock held at task exit time!\n", + task->comm, task->pid); + printk_lock(lock, 1); + if (rt_mutex_owner(lock) != task) + printk("exiting task is not even the owner??\n"); + } +} + +int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len) +{ + const void *to = from + len; + struct list_head *curr; + struct rt_mutex *lock; + unsigned long flags; + void *lock_addr; + + if (!rt_trace_on) + return 0; + + spin_lock_irqsave(¤t->held_list_lock, flags); + list_for_each(curr, ¤t->held_list_head) { + lock = list_entry(curr, struct rt_mutex, held_list_entry); + lock_addr = lock; + if (lock_addr < from || lock_addr >= to) + continue; + TRACE_OFF(); + + printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", + current->comm, current->pid, lock, from, to); + dump_stack(); + printk_lock(lock, 1); + if (rt_mutex_owner(lock) != current) + printk("freeing task is not even the owner??\n"); + return 1; + } + spin_unlock_irqrestore(¤t->held_list_lock, flags); + + return 0; +} + +void rt_mutex_debug_task_free(struct task_struct *task) +{ + WARN_ON(!plist_head_empty(&task->pi_waiters)); + WARN_ON(task->pi_blocked_on); +} + +/* + * We fill out the fields in the waiter to store the information about + * the deadlock. We print when we return. act_waiter can be NULL in + * case of a remove waiter operation. 
+ */ +void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, + struct rt_mutex *lock) +{ + struct task_struct *task; + + if (!rt_trace_on || detect || !act_waiter) + return; + + task = rt_mutex_owner(act_waiter->lock); + if (task && task != current) { + act_waiter->deadlock_task_pid = task->pid; + act_waiter->deadlock_lock = lock; + } +} + +void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) +{ + struct task_struct *task; + + if (!waiter->deadlock_lock || !rt_trace_on) + return; + + task = find_task_by_pid(waiter->deadlock_task_pid); + if (!task) + return; + + TRACE_OFF_NOLOCK(); + + printk("\n============================================\n"); + printk( "[ BUG: circular locking deadlock detected! ]\n"); + printk( "--------------------------------------------\n"); + printk("%s/%d is deadlocking current task %s/%d\n\n", + task->comm, task->pid, current->comm, current->pid); + + printk("\n1) %s/%d is trying to acquire this lock:\n", + current->comm, current->pid); + printk_lock(waiter->lock, 1); + + printk("... trying at: "); + print_symbol("%s\n", waiter->ip); + + printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid); + printk_lock(waiter->deadlock_lock, 1); + + rt_mutex_show_held_locks(current, 1); + rt_mutex_show_held_locks(task, 1); + + printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); + show_stack(task, NULL); + printk("\n%s/%d's [current] stackdump:\n\n", + current->comm, current->pid); + dump_stack(); + rt_mutex_show_all_locks(); + printk("[ turning off deadlock detection." + "Please report this trace. ]\n\n"); + local_irq_disable(); +} + +void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__) +{ + unsigned long flags; + + if (rt_trace_on) { + TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); + + spin_lock_irqsave(¤t->held_list_lock, flags); + list_add_tail(&lock->held_list_entry, ¤t->held_list_head); + spin_unlock_irqrestore(¤t->held_list_lock, flags); + + lock->acquire_ip = ip; + } +} + +void debug_rt_mutex_unlock(struct rt_mutex *lock) +{ + unsigned long flags; + + if (rt_trace_on) { + TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); + TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); + + spin_lock_irqsave(¤t->held_list_lock, flags); + list_del_init(&lock->held_list_entry); + spin_unlock_irqrestore(¤t->held_list_lock, flags); + } +} + +void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, + struct task_struct *powner __IP_DECL__) +{ + unsigned long flags; + + if (rt_trace_on) { + TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); + + spin_lock_irqsave(&powner->held_list_lock, flags); + list_add_tail(&lock->held_list_entry, &powner->held_list_head); + spin_unlock_irqrestore(&powner->held_list_lock, flags); + + lock->acquire_ip = ip; + } +} + +void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) +{ + unsigned long flags; + + if (rt_trace_on) { + struct task_struct *owner = rt_mutex_owner(lock); + + TRACE_WARN_ON_LOCKED(!owner); + TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); + + spin_lock_irqsave(&owner->held_list_lock, flags); + list_del_init(&lock->held_list_entry); + spin_unlock_irqrestore(&owner->held_list_lock, flags); + } +} + +void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +{ + memset(waiter, 0x11, sizeof(*waiter)); + plist_node_init(&waiter->list_entry, MAX_PRIO); + plist_node_init(&waiter->pi_list_entry, MAX_PRIO); +} + +void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) +{ + 
TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); + TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + TRACE_WARN_ON(waiter->task); + memset(waiter, 0x22, sizeof(*waiter)); +} + +void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) +{ + void *addr = lock; + + if (rt_trace_on) { + rt_mutex_debug_check_no_locks_freed(addr, + sizeof(struct rt_mutex)); + INIT_LIST_HEAD(&lock->held_list_entry); + lock->name = name; + } +} + +void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task) +{ +} + +void rt_mutex_deadlock_account_unlock(struct task_struct *task) +{ +} + diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h new file mode 100644 index 00000000000..7612fbc62d7 --- /dev/null +++ b/kernel/rtmutex-debug.h @@ -0,0 +1,37 @@ +/* + * RT-Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * This file contains macros used solely by rtmutex.c. Debug version. + */ + +#define __IP_DECL__ , unsigned long ip +#define __IP__ , ip +#define __RET_IP__ , (unsigned long)__builtin_return_address(0) + +extern void +rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); +extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); +extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); +extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); +extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); +extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__); +extern void debug_rt_mutex_unlock(struct rt_mutex *lock); +extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, + struct task_struct *powner __IP_DECL__); +extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); +extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, + struct rt_mutex *lock); +extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); +# define debug_rt_mutex_reset_waiter(w) \ + do { (w)->deadlock_lock = NULL; } while (0) + +static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, + int detect) +{ + return (waiter != NULL); +} -- cgit From 61a87122869b6340a63b6f9f84097d3688604b90 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 27 Jun 2006 02:54:56 -0700 Subject: [PATCH] pi-futex: rt mutex tester RT-mutex tester: scriptable tester for rt mutexes, which allows userspace scripting of mutex unit-tests (and dynamic tests as well), using the actual rt-mutex implementation of the kernel. 
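A rough idea of how a userspace script might drive the tester (illustrative sketch only, not part of this patch; it assumes the sysdev class registered by the tester shows up as /sys/devices/system/rttest/rttest<N>/ and uses the opcode numbering from the test_opcodes enum, e.g. 3 = lock, 8 = unlock):

/* hypothetical userspace driver for the rt-mutex tester sysfs interface */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int rttest_cmd(int thread, const char *cmd)
{
	char path[128];
	int fd, ret;

	/* "command" takes "opcode:data", e.g. "3:0" = lock test mutex 0 */
	snprintf(path, sizeof(path),
		 "/sys/devices/system/rttest/rttest%d/command", thread);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ret = write(fd, cmd, strlen(cmd)) < 0 ? -1 : 0;
	close(fd);
	return ret;
}

int main(void)
{
	rttest_cmd(0, "3:0");	/* test thread 0: lock mutex 0 */
	rttest_cmd(1, "3:0");	/* test thread 1: block on the same mutex */
	rttest_cmd(0, "8:0");	/* test thread 0: unlock, thread 1 acquires */
	return 0;
}

The matching per-thread "status" attribute can then be read back to check the resulting lock and priority state.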
[akpm@osdl.org: fixlet] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/rtmutex-tester.c | 436 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rtmutex.c | 3 +- kernel/rtmutex_common.h | 22 +++ 4 files changed, 461 insertions(+), 1 deletion(-) create mode 100644 kernel/rtmutex-tester.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index f9c92d34cde..82fb182f6f6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_FUTEX) += futex_compat.o endif obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o +obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c new file mode 100644 index 00000000000..fe211ba3a5b --- /dev/null +++ b/kernel/rtmutex-tester.c @@ -0,0 +1,436 @@ +/* + * RT-Mutex-tester: scriptable tester for rt mutexes + * + * started by Thomas Gleixner: + * + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rtmutex.h" + +#define MAX_RT_TEST_THREADS 8 +#define MAX_RT_TEST_MUTEXES 8 + +static spinlock_t rttest_lock; +static atomic_t rttest_event; + +struct test_thread_data { + int opcode; + int opdata; + int mutexes[MAX_RT_TEST_MUTEXES]; + int bkl; + int event; + struct sys_device sysdev; +}; + +static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; +static task_t *threads[MAX_RT_TEST_THREADS]; +static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; + +enum test_opcodes { + RTTEST_NOP = 0, + RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ + RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ + RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ + RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ + RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ + RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ + RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ + RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ + RTTEST_LOCKBKL, /* 9 Lock BKL */ + RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ + RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ + RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ + RTTEST_RESET = 99, /* 99 Reset all pending operations */ +}; + +static int handle_op(struct test_thread_data *td, int lockwakeup) +{ + struct sched_param schedpar; + int i, id, ret = -EINVAL; + + switch(td->opcode) { + + case RTTEST_NOP: + return 0; + + case RTTEST_SCHEDOT: + schedpar.sched_priority = 0; + ret = sched_setscheduler(current, SCHED_NORMAL, &schedpar); + if (!ret) + set_user_nice(current, 0); + return ret; + + case RTTEST_SCHEDRT: + schedpar.sched_priority = td->opdata; + return sched_setscheduler(current, SCHED_FIFO, &schedpar); + + case RTTEST_LOCKCONT: + td->mutexes[td->opdata] = 1; + td->event = atomic_add_return(1, &rttest_event); + return 0; + + case RTTEST_RESET: + for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { + if (td->mutexes[i] == 4) { + rt_mutex_unlock(&mutexes[i]); + td->mutexes[i] = 0; + } + } + + if (!lockwakeup && td->bkl == 4) { + unlock_kernel(); + td->bkl = 0; + } + return 0; + + case RTTEST_RESETEVENT: + atomic_set(&rttest_event, 0); + return 0; + + default: + if 
(lockwakeup) + return ret; + } + + switch(td->opcode) { + + case RTTEST_LOCK: + case RTTEST_LOCKNOWAIT: + id = td->opdata; + if (id < 0 || id >= MAX_RT_TEST_MUTEXES) + return ret; + + td->mutexes[id] = 1; + td->event = atomic_add_return(1, &rttest_event); + rt_mutex_lock(&mutexes[id]); + td->event = atomic_add_return(1, &rttest_event); + td->mutexes[id] = 4; + return 0; + + case RTTEST_LOCKINT: + case RTTEST_LOCKINTNOWAIT: + id = td->opdata; + if (id < 0 || id >= MAX_RT_TEST_MUTEXES) + return ret; + + td->mutexes[id] = 1; + td->event = atomic_add_return(1, &rttest_event); + ret = rt_mutex_lock_interruptible(&mutexes[id], 0); + td->event = atomic_add_return(1, &rttest_event); + td->mutexes[id] = ret ? 0 : 4; + return ret ? -EINTR : 0; + + case RTTEST_UNLOCK: + id = td->opdata; + if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) + return ret; + + td->event = atomic_add_return(1, &rttest_event); + rt_mutex_unlock(&mutexes[id]); + td->event = atomic_add_return(1, &rttest_event); + td->mutexes[id] = 0; + return 0; + + case RTTEST_LOCKBKL: + if (td->bkl) + return 0; + td->bkl = 1; + lock_kernel(); + td->bkl = 4; + return 0; + + case RTTEST_UNLOCKBKL: + if (td->bkl != 4) + break; + unlock_kernel(); + td->bkl = 0; + return 0; + + default: + break; + } + return ret; +} + +/* + * Schedule replacement for rtsem_down(). Only called for threads with + * PF_MUTEX_TESTER set. + * + * This allows us to have finegrained control over the event flow. + * + */ +void schedule_rt_mutex_test(struct rt_mutex *mutex) +{ + int tid, op, dat; + struct test_thread_data *td; + + /* We have to lookup the task */ + for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { + if (threads[tid] == current) + break; + } + + BUG_ON(tid == MAX_RT_TEST_THREADS); + + td = &thread_data[tid]; + + op = td->opcode; + dat = td->opdata; + + switch (op) { + case RTTEST_LOCK: + case RTTEST_LOCKINT: + case RTTEST_LOCKNOWAIT: + case RTTEST_LOCKINTNOWAIT: + if (mutex != &mutexes[dat]) + break; + + if (td->mutexes[dat] != 1) + break; + + td->mutexes[dat] = 2; + td->event = atomic_add_return(1, &rttest_event); + break; + + case RTTEST_LOCKBKL: + default: + break; + } + + schedule(); + + + switch (op) { + case RTTEST_LOCK: + case RTTEST_LOCKINT: + if (mutex != &mutexes[dat]) + return; + + if (td->mutexes[dat] != 2) + return; + + td->mutexes[dat] = 3; + td->event = atomic_add_return(1, &rttest_event); + break; + + case RTTEST_LOCKNOWAIT: + case RTTEST_LOCKINTNOWAIT: + if (mutex != &mutexes[dat]) + return; + + if (td->mutexes[dat] != 2) + return; + + td->mutexes[dat] = 1; + td->event = atomic_add_return(1, &rttest_event); + return; + + case RTTEST_LOCKBKL: + return; + default: + return; + } + + td->opcode = 0; + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + if (td->opcode > 0) { + int ret; + + set_current_state(TASK_RUNNING); + ret = handle_op(td, 1); + set_current_state(TASK_INTERRUPTIBLE); + if (td->opcode == RTTEST_LOCKCONT) + break; + td->opcode = ret; + } + + /* Wait for the next command to be executed */ + schedule(); + } + + /* Restore previous command and data */ + td->opcode = op; + td->opdata = dat; +} + +static int test_func(void *data) +{ + struct test_thread_data *td = data; + int ret; + + current->flags |= PF_MUTEX_TESTER; + allow_signal(SIGHUP); + + for(;;) { + + set_current_state(TASK_INTERRUPTIBLE); + + if (td->opcode > 0) { + set_current_state(TASK_RUNNING); + ret = handle_op(td, 0); + set_current_state(TASK_INTERRUPTIBLE); + td->opcode = ret; + } + + /* Wait for the next command to be executed */ + 
schedule(); + + if (signal_pending(current)) + flush_signals(current); + + if(kthread_should_stop()) + break; + } + return 0; +} + +/** + * sysfs_test_command - interface for test commands + * @dev: thread reference + * @buf: command for actual step + * @count: length of buffer + * + * command syntax: + * + * opcode:data + */ +static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, + size_t count) +{ + struct test_thread_data *td; + char cmdbuf[32]; + int op, dat, tid; + + td = container_of(dev, struct test_thread_data, sysdev); + tid = td->sysdev.id; + + /* strings from sysfs write are not 0 terminated! */ + if (count >= sizeof(cmdbuf)) + return -EINVAL; + + /* strip of \n: */ + if (buf[count-1] == '\n') + count--; + if (count < 1) + return -EINVAL; + + memcpy(cmdbuf, buf, count); + cmdbuf[count] = 0; + + if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) + return -EINVAL; + + switch (op) { + case RTTEST_SIGNAL: + send_sig(SIGHUP, threads[tid], 0); + break; + + default: + if (td->opcode > 0) + return -EBUSY; + td->opdata = dat; + td->opcode = op; + wake_up_process(threads[tid]); + } + + return count; +} + +/** + * sysfs_test_status - sysfs interface for rt tester + * @dev: thread to query + * @buf: char buffer to be filled with thread status info + */ +static ssize_t sysfs_test_status(struct sys_device *dev, char *buf) +{ + struct test_thread_data *td; + char *curr = buf; + task_t *tsk; + int i; + + td = container_of(dev, struct test_thread_data, sysdev); + tsk = threads[td->sysdev.id]; + + spin_lock(&rttest_lock); + + curr += sprintf(curr, + "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", + td->opcode, td->event, tsk->state, + (MAX_RT_PRIO - 1) - tsk->prio, + (MAX_RT_PRIO - 1) - tsk->normal_prio, + tsk->pi_blocked_on, td->bkl); + + for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) + curr += sprintf(curr, "%d", td->mutexes[i]); + + spin_unlock(&rttest_lock); + + curr += sprintf(curr, ", T: %p, R: %p\n", tsk, + mutexes[td->sysdev.id].owner); + + return curr - buf; +} + +static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); +static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); + +static struct sysdev_class rttest_sysclass = { + set_kset_name("rttest"), +}; + +static int init_test_thread(int id) +{ + thread_data[id].sysdev.cls = &rttest_sysclass; + thread_data[id].sysdev.id = id; + + threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); + if (IS_ERR(threads[id])) + return PTR_ERR(threads[id]); + + return sysdev_register(&thread_data[id].sysdev); +} + +static int init_rttest(void) +{ + int ret, i; + + spin_lock_init(&rttest_lock); + + for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) + rt_mutex_init(&mutexes[i]); + + ret = sysdev_class_register(&rttest_sysclass); + if (ret) + return ret; + + for (i = 0; i < MAX_RT_TEST_THREADS; i++) { + ret = init_test_thread(i); + if (ret) + break; + ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); + if (ret) + break; + ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); + if (ret) + break; + } + + printk("Initializing RT-Tester: %s\n", ret ? 
"Failed" : "OK" ); + + return ret; +} + +device_initcall(init_rttest); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 937a474fae9..39c8ca0cf52 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -640,7 +640,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, debug_rt_mutex_print_deadlock(&waiter); - schedule(); + if (waiter.task) + schedule_rt_mutex(lock); spin_lock(&lock->wait_lock); set_current_state(state); diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 50eed60eb08..e068024eeff 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -14,6 +14,28 @@ #include +/* + * The rtmutex in kernel tester is independent of rtmutex debugging. We + * call schedule_rt_mutex_test() instead of schedule() for the tasks which + * belong to the tester. That way we can delay the wakeup path of those + * threads to provoke lock stealing and testing of complex boosting scenarios. + */ +#ifdef CONFIG_RT_MUTEX_TESTER + +extern void schedule_rt_mutex_test(struct rt_mutex *lock); + +#define schedule_rt_mutex(_lock) \ + do { \ + if (!(current->flags & PF_MUTEX_TESTER)) \ + schedule(); \ + else \ + schedule_rt_mutex_test(_lock); \ + } while (0) + +#else +# define schedule_rt_mutex(_lock) schedule() +#endif + /* * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. -- cgit From 0cdbee9920fb37eb2dc49b860c2b28862d647adc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jun 2006 02:54:57 -0700 Subject: [PATCH] pi-futex: rt mutex futex api Add proxy-locking rt-mutex functionality needed by pi-futexes. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rtmutex.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'kernel') diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 39c8ca0cf52..3fc0f0680ca 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -903,3 +903,58 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name) debug_rt_mutex_init(lock, name); } EXPORT_SYMBOL_GPL(__rt_mutex_init); + +/** + * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a + * proxy owner + * + * @lock: the rt_mutex to be locked + * @proxy_owner:the task to set as owner + * + * No locking. Caller has to do serializing itself + * Special API call for PI-futex support + */ +void rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner) +{ + __rt_mutex_init(lock, NULL); + debug_rt_mutex_proxy_lock(lock, proxy_owner __RET_IP__); + rt_mutex_set_owner(lock, proxy_owner, 0); + rt_mutex_deadlock_account_lock(lock, proxy_owner); +} + +/** + * rt_mutex_proxy_unlock - release a lock on behalf of owner + * + * @lock: the rt_mutex to be locked + * + * No locking. Caller has to do serializing itself + * Special API call for PI-futex support + */ +void rt_mutex_proxy_unlock(struct rt_mutex *lock, + struct task_struct *proxy_owner) +{ + debug_rt_mutex_proxy_unlock(lock); + rt_mutex_set_owner(lock, NULL, 0); + rt_mutex_deadlock_account_unlock(proxy_owner); +} + +/** + * rt_mutex_next_owner - return the next owner of the lock + * + * @lock: the rt lock query + * + * Returns the next owner of the lock or NULL + * + * Caller has to serialize against other accessors to the lock + * itself. 
+ * + * Special API call for PI-futex support + */ +struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) +{ + if (!rt_mutex_has_waiters(lock)) + return NULL; + + return rt_mutex_top_waiter(lock)->task; +} -- cgit From c87e2837be82df479a6bae9f155c43516d2feebc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jun 2006 02:54:58 -0700 Subject: [PATCH] pi-futex: futex_lock_pi/futex_unlock_pi support This adds the actual pi-futex implementation, based on rt-mutexes. [dino@in.ibm.com: fix an oops-causing race] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Arjan van de Ven Signed-off-by: Dinakar Guniguntala Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 + kernel/fork.c | 3 + kernel/futex.c | 829 +++++++++++++++++++++++++++++++++++++++++++++--- kernel/futex_compat.c | 11 +- kernel/rtmutex_common.h | 8 + 5 files changed, 818 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 3e8a0282e9a..ab06b9f88f6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -925,6 +925,14 @@ fastcall NORET_TYPE void do_exit(long code) mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; #endif + /* + * This must happen late, after the PID is not + * hashed anymore: + */ + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); + if (unlikely(current->pi_state_cache)) + kfree(current->pi_state_cache); /* * If DEBUG_MUTEXES is on, make sure we are holding no locks: */ diff --git a/kernel/fork.c b/kernel/fork.c index b664a081fff..628198a4f28 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1092,6 +1092,9 @@ static task_t *copy_process(unsigned long clone_flags, #ifdef CONFIG_COMPAT p->compat_robust_list = NULL; #endif + INIT_LIST_HEAD(&p->pi_state_list); + p->pi_state_cache = NULL; + /* * sigaltstack should be cleared when sharing the same VM */ diff --git a/kernel/futex.c b/kernel/futex.c index 50356fb5d72..b305b7f8dad 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -12,6 +12,10 @@ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved * Thanks to Thomas Gleixner for suggestions, analysis and fixes. * + * PI-futex support started by Ingo Molnar and Thomas Gleixner + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006 Timesys Corp., Thomas Gleixner + * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. @@ -46,6 +50,8 @@ #include #include +#include "rtmutex_common.h" + #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* @@ -74,6 +80,27 @@ union futex_key { } both; }; +/* + * Priority Inheritance state: + */ +struct futex_pi_state { + /* + * list of 'owned' pi_state instances - these have to be + * cleaned up in do_exit() if the task exits prematurely: + */ + struct list_head list; + + /* + * The PI object: + */ + struct rt_mutex pi_mutex; + + struct task_struct *owner; + atomic_t refcount; + + union futex_key key; +}; + /* * We use this hashed waitqueue instead of a normal wait_queue_t, so * we can wake only the relevant ones (hashed queues may be shared). @@ -96,6 +123,10 @@ struct futex_q { /* For fd, sigio sent using these: */ int fd; struct file *filp; + + /* Optional priority inheritance state: */ + struct futex_pi_state *pi_state; + struct task_struct *task; }; /* @@ -258,6 +289,232 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from) return ret ? -EFAULT : 0; } +/* + * Fault handling. 
Called with current->mm->mmap_sem held. + */ +static int futex_handle_fault(unsigned long address, int attempt) +{ + struct vm_area_struct * vma; + struct mm_struct *mm = current->mm; + + if (attempt >= 2 || !(vma = find_vma(mm, address)) || + vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) + return -EFAULT; + + switch (handle_mm_fault(mm, vma, address, 1)) { + case VM_FAULT_MINOR: + current->min_flt++; + break; + case VM_FAULT_MAJOR: + current->maj_flt++; + break; + default: + return -EFAULT; + } + return 0; +} + +/* + * PI code: + */ +static int refill_pi_state_cache(void) +{ + struct futex_pi_state *pi_state; + + if (likely(current->pi_state_cache)) + return 0; + + pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); + + if (!pi_state) + return -ENOMEM; + + memset(pi_state, 0, sizeof(*pi_state)); + INIT_LIST_HEAD(&pi_state->list); + /* pi_mutex gets initialized later */ + pi_state->owner = NULL; + atomic_set(&pi_state->refcount, 1); + + current->pi_state_cache = pi_state; + + return 0; +} + +static struct futex_pi_state * alloc_pi_state(void) +{ + struct futex_pi_state *pi_state = current->pi_state_cache; + + WARN_ON(!pi_state); + current->pi_state_cache = NULL; + + return pi_state; +} + +static void free_pi_state(struct futex_pi_state *pi_state) +{ + if (!atomic_dec_and_test(&pi_state->refcount)) + return; + + /* + * If pi_state->owner is NULL, the owner is most probably dying + * and has cleaned up the pi_state already + */ + if (pi_state->owner) { + spin_lock_irq(&pi_state->owner->pi_lock); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); + + rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); + } + + if (current->pi_state_cache) + kfree(pi_state); + else { + /* + * pi_state->list is already empty. + * clear pi_state->owner. + * refcount is at 0 - put it back to 1. + */ + pi_state->owner = NULL; + atomic_set(&pi_state->refcount, 1); + current->pi_state_cache = pi_state; + } +} + +/* + * Look up the task based on what TID userspace gave us. + * We dont trust it. + */ +static struct task_struct * futex_find_get_task(pid_t pid) +{ + struct task_struct *p; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (!p) + goto out_unlock; + if ((current->euid != p->euid) && (current->euid != p->uid)) { + p = NULL; + goto out_unlock; + } + if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) { + p = NULL; + goto out_unlock; + } + get_task_struct(p); +out_unlock: + read_unlock(&tasklist_lock); + + return p; +} + +/* + * This task is holding PI mutexes at exit time => bad. + * Kernel cleans up PI-state, but userspace is likely hosed. + * (Robust-futex cleanup is separate and might save the day for userspace.) 
+ */ +void exit_pi_state_list(struct task_struct *curr) +{ + struct futex_hash_bucket *hb; + struct list_head *next, *head = &curr->pi_state_list; + struct futex_pi_state *pi_state; + union futex_key key; + + /* + * We are a ZOMBIE and nobody can enqueue itself on + * pi_state_list anymore, but we have to be careful + * versus waiters unqueueing themselfs + */ + spin_lock_irq(&curr->pi_lock); + while (!list_empty(head)) { + + next = head->next; + pi_state = list_entry(next, struct futex_pi_state, list); + key = pi_state->key; + spin_unlock_irq(&curr->pi_lock); + + hb = hash_futex(&key); + spin_lock(&hb->lock); + + spin_lock_irq(&curr->pi_lock); + if (head->next != next) { + spin_unlock(&hb->lock); + continue; + } + + list_del_init(&pi_state->list); + + WARN_ON(pi_state->owner != curr); + + pi_state->owner = NULL; + spin_unlock_irq(&curr->pi_lock); + + rt_mutex_unlock(&pi_state->pi_mutex); + + spin_unlock(&hb->lock); + + spin_lock_irq(&curr->pi_lock); + } + spin_unlock_irq(&curr->pi_lock); +} + +static int +lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) +{ + struct futex_pi_state *pi_state = NULL; + struct futex_q *this, *next; + struct list_head *head; + struct task_struct *p; + pid_t pid; + + head = &hb->chain; + + list_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &me->key)) { + /* + * Another waiter already exists - bump up + * the refcount and return its pi_state: + */ + pi_state = this->pi_state; + atomic_inc(&pi_state->refcount); + me->pi_state = pi_state; + + return 0; + } + } + + /* + * We are the first waiter - try to look up the real owner and + * attach the new pi_state to it: + */ + pid = uval & FUTEX_TID_MASK; + p = futex_find_get_task(pid); + if (!p) + return -ESRCH; + + pi_state = alloc_pi_state(); + + /* + * Initialize the pi_mutex in locked state and make 'p' + * the owner of it: + */ + rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); + + /* Store the key for possible exit cleanups: */ + pi_state->key = me->key; + + spin_lock_irq(&p->pi_lock); + list_add(&pi_state->list, &p->pi_state_list); + pi_state->owner = p; + spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); + + me->pi_state = pi_state; + + return 0; +} + /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. @@ -285,6 +542,70 @@ static void wake_futex(struct futex_q *q) q->lock_ptr = NULL; } +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) +{ + struct task_struct *new_owner; + struct futex_pi_state *pi_state = this->pi_state; + u32 curval, newval; + + if (!pi_state) + return -EINVAL; + + new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); + + /* + * This happens when we have stolen the lock and the original + * pending owner did not enqueue itself back on the rt_mutex. + * Thats not a tragedy. We know that way, that a lock waiter + * is on the fly. We make the futex_q waiter the pending owner. + */ + if (!new_owner) + new_owner = this->task; + + /* + * We pass it to the next owner. (The WAITERS bit is always + * kept enabled while there is PI state around. We must also + * preserve the owner died bit.) 
+ */ + newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + + if (curval == -EFAULT) + return -EFAULT; + if (curval != uval) + return -EINVAL; + + list_del_init(&pi_state->owner->pi_state_list); + list_add(&pi_state->list, &new_owner->pi_state_list); + pi_state->owner = new_owner; + rt_mutex_unlock(&pi_state->pi_mutex); + + return 0; +} + +static int unlock_futex_pi(u32 __user *uaddr, u32 uval) +{ + u32 oldval; + + /* + * There is no waiter, so we unlock the futex. The owner died + * bit has not to be preserved here. We are the owner: + */ + inc_preempt_count(); + oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); + dec_preempt_count(); + + if (oldval == -EFAULT) + return oldval; + if (oldval != uval) + return -EAGAIN; + + return 0; +} + /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: @@ -309,6 +630,8 @@ static int futex_wake(u32 __user *uaddr, int nr_wake) list_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { + if (this->pi_state) + return -EINVAL; wake_futex(this); if (++ret >= nr_wake) break; @@ -385,27 +708,9 @@ retry: * still holding the mmap_sem. */ if (attempt++) { - struct vm_area_struct * vma; - struct mm_struct *mm = current->mm; - unsigned long address = (unsigned long)uaddr2; - - ret = -EFAULT; - if (attempt >= 2 || - !(vma = find_vma(mm, address)) || - vma->vm_start > address || - !(vma->vm_flags & VM_WRITE)) + if (futex_handle_fault((unsigned long)uaddr2, + attempt)) goto out; - - switch (handle_mm_fault(mm, vma, address, 1)) { - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - default: - goto out; - } goto retry; } @@ -572,6 +877,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { list_add_tail(&q->list, &hb->chain); + q->task = current; spin_unlock(&hb->lock); } @@ -626,6 +932,9 @@ static int unqueue_me(struct futex_q *q) } WARN_ON(list_empty(&q->list)); list_del(&q->list); + + BUG_ON(q->pi_state); + spin_unlock(lock_ptr); ret = 1; } @@ -634,16 +943,36 @@ static int unqueue_me(struct futex_q *q) return ret; } +/* + * PI futexes can not be requeued and must remove themself from the + * hash bucket. The hash bucket lock is held on entry and dropped here. + */ +static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) +{ + WARN_ON(list_empty(&q->list)); + list_del(&q->list); + + BUG_ON(!q->pi_state); + free_pi_state(q->pi_state); + q->pi_state = NULL; + + spin_unlock(&hb->lock); + + drop_key_refs(&q->key); +} + static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) { - DECLARE_WAITQUEUE(wait, current); + struct task_struct *curr = current; + DECLARE_WAITQUEUE(wait, curr); struct futex_hash_bucket *hb; struct futex_q q; u32 uval; int ret; + q.pi_state = NULL; retry: - down_read(¤t->mm->mmap_sem); + down_read(&curr->mm->mmap_sem); ret = get_futex_key(uaddr, &q.key); if (unlikely(ret != 0)) @@ -680,7 +1009,7 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) * If we would have faulted, release mmap_sem, fault it in and * start all over again. 
*/ - up_read(¤t->mm->mmap_sem); + up_read(&curr->mm->mmap_sem); ret = get_user(uval, uaddr); @@ -688,11 +1017,9 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) goto retry; return ret; } - if (uval != val) { - ret = -EWOULDBLOCK; - queue_unlock(&q, hb); - goto out_release_sem; - } + ret = -EWOULDBLOCK; + if (uval != val) + goto out_unlock_release_sem; /* Only actually queue if *uaddr contained val. */ __queue_me(&q, hb); @@ -700,8 +1027,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) /* * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. - */ - up_read(¤t->mm->mmap_sem); + */ + up_read(&curr->mm->mmap_sem); /* * There might have been scheduling since the queue_me(), as we @@ -739,8 +1066,415 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) */ return -EINTR; + out_unlock_release_sem: + queue_unlock(&q, hb); + out_release_sem: + up_read(&curr->mm->mmap_sem); + return ret; +} + +/* + * Userspace tried a 0 -> TID atomic transition of the futex value + * and failed. The kernel side here does the whole locking operation: + * if there are waiters then it will block, it does PI, etc. (Due to + * races the kernel might see a 0 value of the futex too.) + */ +static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, + struct hrtimer_sleeper *to) +{ + struct task_struct *curr = current; + struct futex_hash_bucket *hb; + u32 uval, newval, curval; + struct futex_q q; + int ret, attempt = 0; + + if (refill_pi_state_cache()) + return -ENOMEM; + + q.pi_state = NULL; + retry: + down_read(&curr->mm->mmap_sem); + + ret = get_futex_key(uaddr, &q.key); + if (unlikely(ret != 0)) + goto out_release_sem; + + hb = queue_lock(&q, -1, NULL); + + retry_locked: + /* + * To avoid races, we attempt to take the lock here again + * (by doing a 0 -> TID atomic cmpxchg), while holding all + * the locks. It will most likely not succeed. + */ + newval = current->pid; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); + dec_preempt_count(); + + if (unlikely(curval == -EFAULT)) + goto uaddr_faulted; + + /* We own the lock already */ + if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { + if (!detect && 0) + force_sig(SIGKILL, current); + ret = -EDEADLK; + goto out_unlock_release_sem; + } + + /* + * Surprise - we got the lock. Just return + * to userspace: + */ + if (unlikely(!curval)) + goto out_unlock_release_sem; + + uval = curval; + newval = uval | FUTEX_WAITERS; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + + if (unlikely(curval == -EFAULT)) + goto uaddr_faulted; + if (unlikely(curval != uval)) + goto retry_locked; + + /* + * We dont have the lock. Look up the PI state (or create it if + * we are the first waiter): + */ + ret = lookup_pi_state(uval, hb, &q); + + if (unlikely(ret)) { + /* + * There were no waiters and the owner task lookup + * failed. When the OWNER_DIED bit is set, then we + * know that this is a robust futex and we actually + * take the lock. This is safe as we are protected by + * the hash bucket lock. We also set the waiters bit + * unconditionally here, to simplify glibc handling of + * multiple tasks racing to acquire the lock and + * cleanup the problems which were left by the dead + * owner. 
+ */ + if (curval & FUTEX_OWNER_DIED) { + uval = newval; + newval = current->pid | + FUTEX_OWNER_DIED | FUTEX_WAITERS; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, + uval, newval); + dec_preempt_count(); + + if (unlikely(curval == -EFAULT)) + goto uaddr_faulted; + if (unlikely(curval != uval)) + goto retry_locked; + ret = 0; + } + goto out_unlock_release_sem; + } + + /* + * Only actually queue now that the atomic ops are done: + */ + __queue_me(&q, hb); + + /* + * Now the futex is queued and we have checked the data, we + * don't want to hold mmap_sem while we sleep. + */ + up_read(&curr->mm->mmap_sem); + + WARN_ON(!q.pi_state); + /* + * Block on the PI mutex: + */ + if (!trylock) + ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); + else { + ret = rt_mutex_trylock(&q.pi_state->pi_mutex); + /* Fixup the trylock return value: */ + ret = ret ? 0 : -EWOULDBLOCK; + } + + down_read(&curr->mm->mmap_sem); + hb = queue_lock(&q, -1, NULL); + + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case. + */ + if (!ret && q.pi_state->owner != curr) { + u32 newtid = current->pid | FUTEX_WAITERS; + + /* Owner died? */ + if (q.pi_state->owner != NULL) { + spin_lock_irq(&q.pi_state->owner->pi_lock); + list_del_init(&q.pi_state->list); + spin_unlock_irq(&q.pi_state->owner->pi_lock); + } else + newtid |= FUTEX_OWNER_DIED; + + q.pi_state->owner = current; + + spin_lock_irq(¤t->pi_lock); + list_add(&q.pi_state->list, ¤t->pi_state_list); + spin_unlock_irq(¤t->pi_lock); + + /* Unqueue and drop the lock */ + unqueue_me_pi(&q, hb); + up_read(&curr->mm->mmap_sem); + /* + * We own it, so we have to replace the pending owner + * TID. This must be atomic as we have preserve the + * owner died bit here. + */ + ret = get_user(uval, uaddr); + while (!ret) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + curval = futex_atomic_cmpxchg_inatomic(uaddr, + uval, newval); + if (curval == -EFAULT) + ret = -EFAULT; + if (curval == uval) + break; + uval = curval; + } + } else { + /* + * Catch the rare case, where the lock was released + * when we were on the way back before we locked + * the hash bucket. + */ + if (ret && q.pi_state->owner == curr) { + if (rt_mutex_trylock(&q.pi_state->pi_mutex)) + ret = 0; + } + /* Unqueue and drop the lock */ + unqueue_me_pi(&q, hb); + up_read(&curr->mm->mmap_sem); + } + + if (!detect && ret == -EDEADLK && 0) + force_sig(SIGKILL, current); + + return ret; + + out_unlock_release_sem: + queue_unlock(&q, hb); + + out_release_sem: + up_read(&curr->mm->mmap_sem); + return ret; + + uaddr_faulted: + /* + * We have to r/w *(int __user *)uaddr, but we can't modify it + * non-atomically. Therefore, if get_user below is not + * enough, we need to handle the fault ourselves, while + * still holding the mmap_sem. 
+ */ + if (attempt++) { + if (futex_handle_fault((unsigned long)uaddr, attempt)) + goto out_unlock_release_sem; + + goto retry_locked; + } + + queue_unlock(&q, hb); + up_read(&curr->mm->mmap_sem); + + ret = get_user(uval, uaddr); + if (!ret && (uval != -EFAULT)) + goto retry; + + return ret; +} + +/* + * Restart handler + */ +static long futex_lock_pi_restart(struct restart_block *restart) +{ + struct hrtimer_sleeper timeout, *to = NULL; + int ret; + + restart->fn = do_no_restart_syscall; + + if (restart->arg2 || restart->arg3) { + to = &timeout; + hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); + hrtimer_init_sleeper(to, current); + to->timer.expires.tv64 = ((u64)restart->arg1 << 32) | + (u64) restart->arg0; + } + + pr_debug("lock_pi restart: %p, %d (%d)\n", + (u32 __user *)restart->arg0, current->pid); + + ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1, + 0, to); + + if (ret != -EINTR) + return ret; + + restart->fn = futex_lock_pi_restart; + + /* The other values are filled in */ + return -ERESTART_RESTARTBLOCK; +} + +/* + * Called from the syscall entry below. + */ +static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, + long nsec, int trylock) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct restart_block *restart; + int ret; + + if (sec != MAX_SCHEDULE_TIMEOUT) { + to = &timeout; + hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); + hrtimer_init_sleeper(to, current); + to->timer.expires = ktime_set(sec, nsec); + } + + ret = do_futex_lock_pi(uaddr, detect, trylock, to); + + if (ret != -EINTR) + return ret; + + pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid); + + restart = ¤t_thread_info()->restart_block; + restart->fn = futex_lock_pi_restart; + restart->arg0 = (unsigned long) uaddr; + restart->arg1 = detect; + if (to) { + restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF; + restart->arg3 = to->timer.expires.tv64 >> 32; + } else + restart->arg2 = restart->arg3 = 0; + + return -ERESTART_RESTARTBLOCK; +} + +/* + * Userspace attempted a TID -> 0 atomic transition, and failed. + * This is the in-kernel slowpath: we look up the PI state (if any), + * and do the rt-mutex unlock. + */ +static int futex_unlock_pi(u32 __user *uaddr) +{ + struct futex_hash_bucket *hb; + struct futex_q *this, *next; + u32 uval; + struct list_head *head; + union futex_key key; + int ret, attempt = 0; + +retry: + if (get_user(uval, uaddr)) + return -EFAULT; + /* + * We release only a lock we actually own: + */ + if ((uval & FUTEX_TID_MASK) != current->pid) + return -EPERM; + /* + * First take all the futex related locks: + */ + down_read(¤t->mm->mmap_sem); + + ret = get_futex_key(uaddr, &key); + if (unlikely(ret != 0)) + goto out; + + hb = hash_futex(&key); + spin_lock(&hb->lock); + +retry_locked: + /* + * To avoid races, try to do the TID -> 0 atomic transition + * again. 
If it succeeds then we can return without waking + * anyone else up: + */ + inc_preempt_count(); + uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); + dec_preempt_count(); + + if (unlikely(uval == -EFAULT)) + goto pi_faulted; + /* + * Rare case: we managed to release the lock atomically, + * no need to wake anyone else up: + */ + if (unlikely(uval == current->pid)) + goto out_unlock; + + /* + * Ok, other tasks may need to be woken up - check waiters + * and do the wakeup if necessary: + */ + head = &hb->chain; + + list_for_each_entry_safe(this, next, head, list) { + if (!match_futex (&this->key, &key)) + continue; + ret = wake_futex_pi(uaddr, uval, this); + /* + * The atomic access to the futex value + * generated a pagefault, so retry the + * user-access and the wakeup: + */ + if (ret == -EFAULT) + goto pi_faulted; + goto out_unlock; + } + /* + * No waiters - kernel unlocks the futex: + */ + ret = unlock_futex_pi(uaddr, uval); + if (ret == -EFAULT) + goto pi_faulted; + +out_unlock: + spin_unlock(&hb->lock); +out: + up_read(¤t->mm->mmap_sem); + + return ret; + +pi_faulted: + /* + * We have to r/w *(int __user *)uaddr, but we can't modify it + * non-atomically. Therefore, if get_user below is not + * enough, we need to handle the fault ourselves, while + * still holding the mmap_sem. + */ + if (attempt++) { + if (futex_handle_fault((unsigned long)uaddr, attempt)) + goto out_unlock; + + goto retry_locked; + } + + spin_unlock(&hb->lock); up_read(¤t->mm->mmap_sem); + + ret = get_user(uval, uaddr); + if (!ret && (uval != -EFAULT)) + goto retry; + return ret; } @@ -819,6 +1553,7 @@ static int futex_fd(u32 __user *uaddr, int signal) err = -ENOMEM; goto error; } + q->pi_state = NULL; down_read(¤t->mm->mmap_sem); err = get_futex_key(uaddr, &q->key); @@ -856,7 +1591,7 @@ error: * Implementation: user-space maintains a per-thread list of locks it * is holding. Upon do_exit(), the kernel carefully walks this list, * and marks all locks that are owned by this thread with the - * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is + * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is * always manipulated with the lock held, so the list is private and * per-thread. Userspace also maintains a per-thread 'list_op_pending' * field, to allow the kernel to clean up if the thread dies after @@ -931,7 +1666,7 @@ err_unlock: */ int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) { - u32 uval; + u32 uval, nval; retry: if (get_user(uval, uaddr)) @@ -948,8 +1683,12 @@ retry: * thread-death.) The rest of the cleanup is done in * userspace. 
*/ - if (futex_atomic_cmpxchg_inatomic(uaddr, uval, - uval | FUTEX_OWNER_DIED) != uval) + nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, + uval | FUTEX_OWNER_DIED); + if (nval == -EFAULT) + return -1; + + if (nval != uval) goto retry; if (uval & FUTEX_WAITERS) @@ -994,7 +1733,7 @@ void exit_robust_list(struct task_struct *curr) while (entry != &head->list) { /* * A pending lock might already be on the list, so - * dont process it twice: + * don't process it twice: */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, @@ -1040,6 +1779,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, case FUTEX_WAKE_OP: ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); break; + case FUTEX_LOCK_PI: + ret = futex_lock_pi(uaddr, val, timeout, val2, 0); + break; + case FUTEX_UNLOCK_PI: + ret = futex_unlock_pi(uaddr); + break; + case FUTEX_TRYLOCK_PI: + ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); + break; default: ret = -ENOSYS; } @@ -1055,17 +1803,22 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout = MAX_SCHEDULE_TIMEOUT; u32 val2 = 0; - if (utime && (op == FUTEX_WAIT)) { + if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { if (copy_from_user(&t, utime, sizeof(t)) != 0) return -EFAULT; if (!timespec_valid(&t)) return -EINVAL; - timeout = timespec_to_jiffies(&t) + 1; + if (op == FUTEX_WAIT) + timeout = timespec_to_jiffies(&t) + 1; + else { + timeout = t.tv_sec; + val2 = t.tv_nsec; + } } /* * requeue parameter in 'utime' if op == FUTEX_REQUEUE. */ - if (op >= FUTEX_REQUEUE) + if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) val2 = (u32) (unsigned long) utime; return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 7e57c31670a..d1d92b441fb 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -129,14 +129,19 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout = MAX_SCHEDULE_TIMEOUT; int val2 = 0; - if (utime && (op == FUTEX_WAIT)) { + if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { if (get_compat_timespec(&t, utime)) return -EFAULT; if (!timespec_valid(&t)) return -EINVAL; - timeout = timespec_to_jiffies(&t) + 1; + if (op == FUTEX_WAIT) + timeout = timespec_to_jiffies(&t) + 1; + else { + timeout = t.tv_sec; + val2 = t.tv_nsec; + } } - if (op >= FUTEX_REQUEUE) + if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) val2 = (int) (unsigned long) utime; return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index e068024eeff..9c75856e791 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -112,4 +112,12 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; } +/* + * PI-futex support (proxy locking functions, etc.): + */ +extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); +extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner); +extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, + struct task_struct *proxy_owner); #endif -- cgit From e74c69f46d93d29eea0ad8647863d1c6488f0f55 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 27 Jun 2006 02:55:00 -0700 Subject: [PATCH] Drop tasklist lock in do_sched_setscheduler There is no need to hold tasklist_lock across the setscheduler call, when we pin the task structure with get_task_struct(). 
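Roughly, the resulting flow looks like the sketch below (simplified, parameter copying and error paths trimmed; it assumes sched.c's usual find_process_by_pid() helper):

	/*
	 * Pin the task with get_task_struct() so tasklist_lock can be
	 * dropped before the actual sched_setscheduler() call:
	 */
	read_lock_irq(&tasklist_lock);
	p = find_process_by_pid(pid);
	if (!p) {
		read_unlock_irq(&tasklist_lock);
		return -ESRCH;
	}
	get_task_struct(p);			/* pin p across the unlock */
	read_unlock_irq(&tasklist_lock);
	retval = sched_setscheduler(p, policy, &lparam);
	put_task_struct(p);			/* drop the reference again */
	return retval;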
Interrupts are disabled in setscheduler anyway and the permission checks do not need interrupts disabled. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Cc: Steven Rostedt Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 08431f07a99..7a30addfd23 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4091,8 +4091,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) read_unlock_irq(&tasklist_lock); return -ESRCH; } - retval = sched_setscheduler(p, policy, &lparam); + get_task_struct(p); read_unlock_irq(&tasklist_lock); + retval = sched_setscheduler(p, policy, &lparam); + put_task_struct(p); return retval; } -- cgit From 0bafd214e4ba55dc1fb81a3031d0249292f1bc05 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 27 Jun 2006 02:55:01 -0700 Subject: [PATCH] rtmutex: Modify rtmutex-tester to test the setscheduler propagation Make test suite setscheduler calls asynchronously. Remove the waits in the test cases and add a new testcase to verify the correctness of the setscheduler priority propagation. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rtmutex-tester.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index fe211ba3a5b..e82c2f84824 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -46,7 +46,7 @@ enum test_opcodes { RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - RTTEST_LOCKBKL, /* 9 Lock BKL */ + RTTEST_LOCKBKL, /* 9 Lock BKL */ RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ @@ -55,7 +55,6 @@ enum test_opcodes { static int handle_op(struct test_thread_data *td, int lockwakeup) { - struct sched_param schedpar; int i, id, ret = -EINVAL; switch(td->opcode) { @@ -63,17 +62,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) case RTTEST_NOP: return 0; - case RTTEST_SCHEDOT: - schedpar.sched_priority = 0; - ret = sched_setscheduler(current, SCHED_NORMAL, &schedpar); - if (!ret) - set_user_nice(current, 0); - return ret; - - case RTTEST_SCHEDRT: - schedpar.sched_priority = td->opdata; - return sched_setscheduler(current, SCHED_FIFO, &schedpar); - case RTTEST_LOCKCONT: td->mutexes[td->opdata] = 1; td->event = atomic_add_return(1, &rttest_event); @@ -310,9 +298,10 @@ static int test_func(void *data) static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, size_t count) { + struct sched_param schedpar; struct test_thread_data *td; char cmdbuf[32]; - int op, dat, tid; + int op, dat, tid, ret; td = container_of(dev, struct test_thread_data, sysdev); tid = td->sysdev.id; @@ -334,6 +323,21 @@ static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, return -EINVAL; switch (op) { + case RTTEST_SCHEDOT: + schedpar.sched_priority = 0; + ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); + if (ret) + return ret; + set_user_nice(current, 0); + break; + + case RTTEST_SCHEDRT: + schedpar.sched_priority = dat; + ret = 
sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); + if (ret) + return ret; + break; + case RTTEST_SIGNAL: send_sig(SIGHUP, threads[tid], 0); break; -- cgit From 95e02ca9bb5324360e7dea1ea1c563036d84a5e6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 27 Jun 2006 02:55:02 -0700 Subject: [PATCH] rtmutex: Propagate priority settings into PI lock chains When the priority of a task, which is blocked on a lock, changes we must propagate this change into the PI lock chain. Therefor the chain walk code is changed to get rid of the references to current to avoid false positives in the deadlock detector, as setscheduler might be called by a task which holds the lock on which the task whose priority is changed is blocked. Also add some comments about the get/put_task_struct usage to avoid confusion. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rtmutex.c | 42 ++++++++++++++++++++++++++++++++++++------ kernel/sched.c | 2 ++ 2 files changed, 38 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 3fc0f0680ca..45d61016da5 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -160,7 +160,8 @@ int max_lock_depth = 1024; static int rt_mutex_adjust_prio_chain(task_t *task, int deadlock_detect, struct rt_mutex *orig_lock, - struct rt_mutex_waiter *orig_waiter + struct rt_mutex_waiter *orig_waiter, + struct task_struct *top_task __IP_DECL__) { struct rt_mutex *lock; @@ -189,7 +190,7 @@ static int rt_mutex_adjust_prio_chain(task_t *task, prev_max = max_lock_depth; printk(KERN_WARNING "Maximum lock depth %d reached " "task: %s (%d)\n", max_lock_depth, - current->comm, current->pid); + top_task->comm, top_task->pid); } put_task_struct(task); @@ -229,7 +230,7 @@ static int rt_mutex_adjust_prio_chain(task_t *task, } /* Deadlock detection */ - if (lock == orig_lock || rt_mutex_owner(lock) == current) { + if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); spin_unlock(&lock->wait_lock); ret = deadlock_detect ? -EDEADLK : 0; @@ -433,6 +434,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) { boost = 1; + /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(owner); } spin_unlock_irqrestore(&owner->pi_lock, flags); @@ -441,6 +443,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, spin_lock_irqsave(&owner->pi_lock, flags); if (owner->pi_blocked_on) { boost = 1; + /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(owner); } spin_unlock_irqrestore(&owner->pi_lock, flags); @@ -450,8 +453,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, spin_unlock(&lock->wait_lock); - res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, - waiter __IP__); + res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, + current __IP__); spin_lock(&lock->wait_lock); @@ -552,6 +555,7 @@ static void remove_waiter(struct rt_mutex *lock, if (owner->pi_blocked_on) { boost = 1; + /* gets dropped in rt_mutex_adjust_prio_chain()! 
*/ get_task_struct(owner); } spin_unlock_irqrestore(&owner->pi_lock, flags); @@ -564,11 +568,36 @@ static void remove_waiter(struct rt_mutex *lock, spin_unlock(&lock->wait_lock); - rt_mutex_adjust_prio_chain(owner, 0, lock, NULL __IP__); + rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current __IP__); spin_lock(&lock->wait_lock); } +/* + * Recheck the pi chain, in case we got a priority setting + * + * Called from sched_setscheduler + */ +void rt_mutex_adjust_pi(struct task_struct *task) +{ + struct rt_mutex_waiter *waiter; + unsigned long flags; + + spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; + if (!waiter || waiter->list_entry.prio == task->prio) { + spin_unlock_irqrestore(&task->pi_lock, flags); + return; + } + + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(task); + spin_unlock_irqrestore(&task->pi_lock, flags); + + rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task __RET_IP__); +} + /* * Slow path lock function: */ @@ -636,6 +665,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, if (unlikely(ret)) break; } + spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(&waiter); diff --git a/kernel/sched.c b/kernel/sched.c index 7a30addfd23..2629c1711fd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4070,6 +4070,8 @@ recheck: __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); + rt_mutex_adjust_pi(p); + return 0; } EXPORT_SYMBOL_GPL(sched_setscheduler); -- cgit From 59e0e0ace7d33e8c0c125042f153f80fcc56b39e Mon Sep 17 00:00:00 2001 From: Sebastien Dugue Date: Tue, 27 Jun 2006 02:55:03 -0700 Subject: [PATCH] futex_requeue() optimization In futex_requeue(), when the 2 futexes keys hash to the same bucket, there is no need to move the futex_q to the end of the bucket list. Signed-off-by: Sebastien Dugue Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b305b7f8dad..6c91f938005 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -827,17 +827,20 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, if (++ret <= nr_wake) { wake_futex(this); } else { - list_move_tail(&this->list, &hb2->chain); - this->lock_ptr = &hb2->lock; + /* + * If key1 and key2 hash to the same bucket, no need to + * requeue. 
+ */ + if (likely(head1 != &hb2->chain)) { + list_move_tail(&this->list, &hb2->chain); + this->lock_ptr = &hb2->lock; + } this->key = key2; get_key_refs(&key2); drop_count++; if (ret - nr_wake >= nr_requeue) break; - /* Make sure to stop if key1 == key2: */ - if (head1 == &hb2->chain && head1 != &next->list) - head1 = &this->list; } } -- cgit From 9a66a53f558efc2619a438278d2919b3c9a7f673 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Tue, 27 Jun 2006 02:55:05 -0700 Subject: [PATCH] Remove redundant NULL checks before [kv]free - in kernel/ Remove redundant kfree NULL checks from kernel/ Signed-off-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index bdfb580a067..dc5e3f01efe 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -658,8 +658,7 @@ static void audit_log_task_context(struct audit_buffer *ab) return; error_path: - if (ctx) - kfree(ctx); + kfree(ctx); audit_panic("error in audit_log_task_context"); return; } -- cgit From c0ce7d0886cf0c2579c604eac41a7e125bc0e96d Mon Sep 17 00:00:00 2001 From: David Wilder Date: Fri, 23 Jun 2006 15:29:34 -0700 Subject: [POWERPC] Add the use of the firmware soft-reset-nmi to kdump. With this patch, kdump uses the firmware soft-reset NMI for two purposes: 1) Initiate the kdump (take a crash dump) by issuing a soft-reset. 2) Break a CPU out of a deadlock condition that is detected during kdump processing. When a soft-reset is initiated, each CPU will enter system_reset_exception() and set its corresponding bit in the global bit-array cpus_in_sr, then call die(). When die() finds the CPU's bit set in cpus_in_sr, crash_kexec() is called to initiate a crash dump. The first CPU to enter crash_kexec() is called the "crashing CPU". All other CPUs are "secondary CPUs". The secondary CPUs pass through to crash_kexec_secondary() and sleep. The crashing CPU waits for all CPUs to enter via soft-reset, then boots the kdump kernel (see crash_soft_reset_check()). When the system crashes due to a panic or exception, crash_kexec() is called by panic() or die(). The crashing CPU sends an IPI to all other CPUs to notify them of the pending shutdown. If a CPU is in a deadlock or hung state with interrupts disabled, the IPI will not be delivered, and the result is that the kdump kernel is not booted. This problem is solved with the use of a firmware-generated soft-reset. After the crashing_cpu has issued the IPI, it waits 10 seconds for all CPUs to enter crash_ipi_callback(). A CPU signifies its entry to crash_ipi_callback() by setting its corresponding bit in the cpus_in_crash bit array. After 10 seconds, if one or more CPUs have not set their bit in cpus_in_crash, we assume those CPUs are deadlocked. The operator is then prompted to generate a soft-reset to break the deadlock. Each CPU enters the soft-reset handler as described above. Two conditions must be handled at this point: 1) The system crashed because the operator generated a soft-reset. 2) The system had crashed before the soft-reset was generated (in the case of a panic or oops). The first CPU to enter crash_kexec() uses the state of the kexec_lock to distinguish these two cases. If kexec_lock is already held, condition 2 is true and crash_kexec_secondary() is called; otherwise this CPU is flagged as the crashing CPU, the kexec_lock is acquired, and crash_kexec() proceeds as described above.
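The "first CPU to take kexec_lock becomes the crashing CPU" decision described above is nothing more than an atomic exchange; the kexec.c hunk further down reads locked = xchg(&kexec_lock, 1). A minimal userspace analogue of that first-one-wins pattern follows — the names are made up and C11 atomics plus pthreads stand in for the kernel primitives, so this is a sketch of the idea, not kernel code:

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int kexec_lock;	/* 0 = free, 1 = held */

	static void *soft_reset_entry(void *arg)
	{
		long cpu = (long)arg;

		/* Only the first thread to flip the lock sees the old value 0. */
		if (atomic_exchange(&kexec_lock, 1) == 0)
			printf("cpu %ld: crashing CPU, would boot the kdump kernel\n", cpu);
		else
			printf("cpu %ld: secondary, would wait for the shutdown\n", cpu);
		return NULL;
	}

	int main(void)
	{
		pthread_t t[4];
		long i;

		for (i = 0; i < 4; i++)
			pthread_create(&t[i], NULL, soft_reset_entry, (void *)i);
		for (i = 0; i < 4; i++)
			pthread_join(t[i], NULL);
		return 0;
	}

The kernel code releases the lock again with xchg(&kexec_lock, 0) once the crash path is done, which the sketch leaves out.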
Each additional CPU responding to the soft-reset passes through crash_kexec() to crash_kexec_secondary(). All secondary CPUs call crash_ipi_callback(), readying themselves for the shutdown. When ready, they clear their bit in cpus_in_sr. The crashing CPU waits in crash_kexec_secondary() until all other CPUs have cleared their bits in cpus_in_sr. The kexec kernel boot is then started. Signed-off-by: Haren Myneni Signed-off-by: David Wilder Signed-off-by: Paul Mackerras --- kernel/kexec.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 58f0f382597..50087ecf337 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1042,7 +1042,6 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry, void crash_kexec(struct pt_regs *regs) { - struct kimage *image; int locked; @@ -1056,12 +1055,11 @@ void crash_kexec(struct pt_regs *regs) */ locked = xchg(&kexec_lock, 1); if (!locked) { - image = xchg(&kexec_crash_image, NULL); - if (image) { + if (kexec_crash_image) { struct pt_regs fixed_regs; crash_setup_regs(&fixed_regs, regs); machine_crash_shutdown(&fixed_regs); - machine_kexec(image); + machine_kexec(kexec_crash_image); } xchg(&kexec_lock, 0); } -- cgit From f71d20e961474dde77e6558396efb93d6ac80a4b Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 28 Jun 2006 04:26:45 -0700 Subject: [PATCH] Add EXPORT_UNUSED_SYMBOL and EXPORT_UNUSED_SYMBOL_GPL Temporarily add EXPORT_UNUSED_SYMBOL and EXPORT_UNUSED_SYMBOL_GPL. These will be used as a transition measure for symbols that aren't used in the kernel and are on the way out. When a module uses such a symbol, a warning is printk'd at modprobe time. The main reason for removing unused exports is size: each export takes roughly between 100 and 150 bytes of kernel space in the binary. This patch gives users the option to immediately get this size gain via a config option. Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 10e5b872adf..03b738172a8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1,4 +1,4 @@ -/* Rewritten by Rusty Russell, on the backs of many others... +/* Copyright (C) 2002 Richard Henderson Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
@@ -122,9 +122,17 @@ extern const struct kernel_symbol __start___ksymtab_gpl[]; extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const struct kernel_symbol __start___ksymtab_gpl_future[]; extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; +extern const struct kernel_symbol __start___ksymtab_unused[]; +extern const struct kernel_symbol __stop___ksymtab_unused[]; +extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; +extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; +extern const struct kernel_symbol __start___ksymtab_gpl_future[]; +extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; extern const unsigned long __start___kcrctab[]; extern const unsigned long __start___kcrctab_gpl[]; extern const unsigned long __start___kcrctab_gpl_future[]; +extern const unsigned long __start___kcrctab_unused[]; +extern const unsigned long __start___kcrctab_unused_gpl[]; #ifndef CONFIG_MODVERSIONS #define symversion(base, idx) NULL @@ -144,6 +152,17 @@ static const struct kernel_symbol *lookup_symbol(const char *name, return NULL; } +static void printk_unused_warning(const char *name) +{ + printk(KERN_WARNING "Symbol %s is marked as UNUSED, " + "however this module is using it.\n", name); + printk(KERN_WARNING "This symbol will go away in the future.\n"); + printk(KERN_WARNING "Please evalute if this is the right api to use, " + "and if it really is, submit a report the linux kernel " + "mailinglist together with submitting your code for " + "inclusion.\n"); +} + /* Find a symbol, return value, crc and module which owns it */ static unsigned long __find_symbol(const char *name, struct module **owner, @@ -186,6 +205,25 @@ static unsigned long __find_symbol(const char *name, return ks->value; } + ks = lookup_symbol(name, __start___ksymtab_unused, + __stop___ksymtab_unused); + if (ks) { + printk_unused_warning(name); + *crc = symversion(__start___kcrctab_unused, + (ks - __start___ksymtab_unused)); + return ks->value; + } + + if (gplok) + ks = lookup_symbol(name, __start___ksymtab_unused_gpl, + __stop___ksymtab_unused_gpl); + if (ks) { + printk_unused_warning(name); + *crc = symversion(__start___kcrctab_unused_gpl, + (ks - __start___ksymtab_unused_gpl)); + return ks->value; + } + /* Now try modules. 
*/ list_for_each_entry(mod, &modules, list) { *owner = mod; @@ -204,6 +242,23 @@ static unsigned long __find_symbol(const char *name, return ks->value; } } + ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms); + if (ks) { + printk_unused_warning(name); + *crc = symversion(mod->unused_crcs, (ks - mod->unused_syms)); + return ks->value; + } + + if (gplok) { + ks = lookup_symbol(name, mod->unused_gpl_syms, + mod->unused_gpl_syms + mod->num_unused_gpl_syms); + if (ks) { + printk_unused_warning(name); + *crc = symversion(mod->unused_gpl_crcs, + (ks - mod->unused_gpl_syms)); + return ks->value; + } + } ks = lookup_symbol(name, mod->gpl_future_syms, (mod->gpl_future_syms + mod->num_gpl_future_syms)); @@ -1407,6 +1462,8 @@ static struct module *load_module(void __user *umod, exportindex, modindex, obsparmindex, infoindex, gplindex, crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, gplfuturecrcindex, unwindex = 0; + unsigned int unusedindex, unusedcrcindex, unusedgplindex, + unusedgplcrcindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -1487,9 +1544,13 @@ static struct module *load_module(void __user *umod, exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); + unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused"); + unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl"); crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); + unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused"); + unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl"); setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); @@ -1638,14 +1699,27 @@ static struct module *load_module(void __user *umod, mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / sizeof(*mod->gpl_future_syms); + mod->num_unused_syms = sechdrs[unusedindex].sh_size / + sizeof(*mod->unused_syms); + mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / + sizeof(*mod->unused_gpl_syms); mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; if (gplfuturecrcindex) mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; + mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; + if (unusedcrcindex) + mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; + mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; + if (unusedgplcrcindex) + mod->unused_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr; + #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !crcindex) || (mod->num_gpl_syms && !gplcrcindex) || - (mod->num_gpl_future_syms && !gplfuturecrcindex)) { + (mod->num_gpl_future_syms && !gplfuturecrcindex) || + (mod->num_unused_syms && !unusedcrcindex) || + (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { printk(KERN_WARNING "%s: No versions for exported symbols." 
" Tainting kernel.\n", mod->name); add_taint(TAINT_FORCED_MODULE); -- cgit From 84860f9979804cfd97638ce0ec9d583daf338e0d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 28 Jun 2006 04:26:46 -0700 Subject: [PATCH] load_module() cleanup Undo bizarre declaration in load_module(). Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 03b738172a8..99c022ac3d2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1458,12 +1458,27 @@ static struct module *load_module(void __user *umod, Elf_Ehdr *hdr; Elf_Shdr *sechdrs; char *secstrings, *args, *modmagic, *strtab = NULL; - unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, - exportindex, modindex, obsparmindex, infoindex, gplindex, - crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, - gplfuturecrcindex, unwindex = 0; - unsigned int unusedindex, unusedcrcindex, unusedgplindex, - unusedgplcrcindex; + unsigned int i; + unsigned int symindex = 0; + unsigned int strindex = 0; + unsigned int setupindex; + unsigned int exindex; + unsigned int exportindex; + unsigned int modindex; + unsigned int obsparmindex; + unsigned int infoindex; + unsigned int gplindex; + unsigned int crcindex; + unsigned int gplcrcindex; + unsigned int versindex; + unsigned int pcpuindex; + unsigned int gplfutureindex; + unsigned int gplfuturecrcindex; + unsigned int unwindex = 0; + unsigned int unusedindex; + unsigned int unusedcrcindex; + unsigned int unusedgplindex; + unsigned int unusedgplcrcindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ -- cgit From d1bef4ed5faf7d9872337b33c4269e45ae1bf960 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:24:36 -0700 Subject: [PATCH] genirq: rename desc->handler to desc->chip This patch-queue improves the generic IRQ layer to be truly generic, by adding various abstractions and features to it, without impacting existing functionality. While the queue can be best described as "fix and improve everything in the generic IRQ layer that we could think of", and thus it consists of many smaller features and lots of cleanups, the one feature that stands out most is the new 'irq chip' abstraction. The irq-chip abstraction is about describing and coding and IRQ controller driver by mapping its raw hardware capabilities [and quirks, if needed] in a straightforward way, without having to think about "IRQ flow" (level/edge/etc.) type of details. This stands in contrast with the current 'irq-type' model of genirq architectures, which 'mixes' raw hardware capabilities with 'flow' details. The patchset supports both types of irq controller designs at once, and converts i386 and x86_64 to the new irq-chip design. As a bonus side-effect of the irq-chip approach, chained interrupt controllers (master/slave PIC constructs, etc.) are now supported by design as well. The end result of this patchset intends to be simpler architecture-level code and more consolidation between architectures. We reused many bits of code and many concepts from Russell King's ARM IRQ layer, the merging of which was one of the motivations for this patchset. This patch: rename desc->handler to desc->chip. Originally i did not want to do this, because it's a big patch. 
But having both "desc->handler", "desc->handle_irq" and "action->handler" caused a large degree of confusion and made the code appear alot less clean than it truly is. I have also attempted a dual approach as well by introducing a desc->chip alias - but that just wasnt robust enough and broke frequently. So lets get over with this quickly. The conversion was done automatically via scripts and converts all the code in the kernel. This renaming patch is the first one amongst the patches, so that the remaining patches can stay flexible and can be merged and split up without having some big monolithic patch act as a merge barrier. [akpm@osdl.org: build fix] [akpm@osdl.org: another build fix] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/autoprobe.c | 10 +++++----- kernel/irq/handle.c | 14 +++++++------- kernel/irq/manage.c | 24 ++++++++++++------------ kernel/irq/migration.c | 8 ++++---- kernel/irq/proc.c | 6 +++--- kernel/irq/spurious.c | 4 ++-- 6 files changed, 33 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 3467097ca61..6f1e68a46cb 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -41,7 +41,7 @@ unsigned long probe_irq_on(void) spin_lock_irq(&desc->lock); if (!irq_desc[i].action) - irq_desc[i].handler->startup(i); + irq_desc[i].chip->startup(i); spin_unlock_irq(&desc->lock); } @@ -59,7 +59,7 @@ unsigned long probe_irq_on(void) spin_lock_irq(&desc->lock); if (!desc->action) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; - if (desc->handler->startup(i)) + if (desc->chip->startup(i)) desc->status |= IRQ_PENDING; } spin_unlock_irq(&desc->lock); @@ -85,7 +85,7 @@ unsigned long probe_irq_on(void) /* It triggered already - consider it spurious. */ if (!(status & IRQ_WAITING)) { desc->status = status & ~IRQ_AUTODETECT; - desc->handler->shutdown(i); + desc->chip->shutdown(i); } else if (i < 32) val |= 1 << i; @@ -128,7 +128,7 @@ unsigned int probe_irq_mask(unsigned long val) mask |= 1 << i; desc->status = status & ~IRQ_AUTODETECT; - desc->handler->shutdown(i); + desc->chip->shutdown(i); } spin_unlock_irq(&desc->lock); } @@ -173,7 +173,7 @@ int probe_irq_off(unsigned long val) nr_irqs++; } desc->status = status & ~IRQ_AUTODETECT; - desc->handler->shutdown(i); + desc->chip->shutdown(i); } spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 0f653011710..f9d95705a4a 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -31,7 +31,7 @@ irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, - .handler = &no_irq_type, + .chip = &no_irq_type, .lock = SPIN_LOCK_UNLOCKED } }; @@ -118,16 +118,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) /* * No locking required for CPU-local interrupts: */ - if (desc->handler->ack) - desc->handler->ack(irq); + if (desc->chip->ack) + desc->chip->ack(irq); action_ret = handle_IRQ_event(irq, regs, desc->action); - desc->handler->end(irq); + desc->chip->end(irq); return 1; } spin_lock(&desc->lock); - if (desc->handler->ack) - desc->handler->ack(irq); + if (desc->chip->ack) + desc->chip->ack(irq); /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested @@ -187,7 +187,7 @@ out: * The ->end() handler has to deal with interrupts which got * disabled while the handler was running. 
*/ - desc->handler->end(irq); + desc->chip->end(irq); spin_unlock(&desc->lock); return 1; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1279e349953..31ee1f3bfcf 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -69,7 +69,7 @@ void disable_irq_nosync(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); if (!desc->depth++) { desc->status |= IRQ_DISABLED; - desc->handler->disable(irq); + desc->chip->disable(irq); } spin_unlock_irqrestore(&desc->lock, flags); } @@ -131,9 +131,9 @@ void enable_irq(unsigned int irq) desc->status = status; if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { desc->status = status | IRQ_REPLAY; - hw_resend_irq(desc->handler,irq); + hw_resend_irq(desc->chip,irq); } - desc->handler->enable(irq); + desc->chip->enable(irq); /* fall-through */ } default: @@ -178,7 +178,7 @@ int setup_irq(unsigned int irq, struct irqaction * new) if (irq >= NR_IRQS) return -EINVAL; - if (desc->handler == &no_irq_type) + if (desc->chip == &no_irq_type) return -ENOSYS; /* * Some drivers like serial.c use request_irq() heavily, @@ -230,10 +230,10 @@ int setup_irq(unsigned int irq, struct irqaction * new) desc->depth = 0; desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | IRQ_WAITING | IRQ_INPROGRESS); - if (desc->handler->startup) - desc->handler->startup(irq); + if (desc->chip->startup) + desc->chip->startup(irq); else - desc->handler->enable(irq); + desc->chip->enable(irq); } spin_unlock_irqrestore(&desc->lock,flags); @@ -295,16 +295,16 @@ void free_irq(unsigned int irq, void *dev_id) /* Currently used only by UML, might disappear one day.*/ #ifdef CONFIG_IRQ_RELEASE_METHOD - if (desc->handler->release) - desc->handler->release(irq, dev_id); + if (desc->chip->release) + desc->chip->release(irq, dev_id); #endif if (!desc->action) { desc->status |= IRQ_DISABLED; - if (desc->handler->shutdown) - desc->handler->shutdown(irq); + if (desc->chip->shutdown) + desc->chip->shutdown(irq); else - desc->handler->disable(irq); + desc->chip->disable(irq); } spin_unlock_irqrestore(&desc->lock,flags); unregister_handler_proc(irq, action); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index a12d00eb5e7..d978c87bca9 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -33,7 +33,7 @@ void move_native_irq(int irq) if (unlikely(cpus_empty(pending_irq_cpumask[irq]))) return; - if (!desc->handler->set_affinity) + if (!desc->chip->set_affinity) return; assert_spin_locked(&desc->lock); @@ -51,12 +51,12 @@ void move_native_irq(int irq) */ if (likely(!cpus_empty(tmp))) { if (likely(!(desc->status & IRQ_DISABLED))) - desc->handler->disable(irq); + desc->chip->disable(irq); - desc->handler->set_affinity(irq,tmp); + desc->chip->set_affinity(irq,tmp); if (likely(!(desc->status & IRQ_DISABLED))) - desc->handler->enable(irq); + desc->chip->enable(irq); } cpus_clear(pending_irq_cpumask[irq]); } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index afacd6f585f..90fe05f23e6 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -37,7 +37,7 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { set_balance_irq_affinity(irq, mask_val); irq_affinity[irq] = mask_val; - irq_desc[irq].handler->set_affinity(irq, mask_val); + irq_desc[irq].chip->set_affinity(irq, mask_val); } #endif @@ -59,7 +59,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, unsigned int irq = (int)(long)data, full_count = count, err; cpumask_t new_value, tmp; - if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) + if 
(!irq_desc[irq].chip->set_affinity || no_irq_affinity) return -EIO; err = cpumask_parse(buffer, count, new_value); @@ -122,7 +122,7 @@ void register_irq_proc(unsigned int irq) char name [MAX_NAMELEN]; if (!root_irq_dir || - (irq_desc[irq].handler == &no_irq_type) || + (irq_desc[irq].chip == &no_irq_type) || irq_dir[irq]) return; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index b2fb3c18d06..ea3ceed362d 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -81,7 +81,7 @@ static int misrouted_irq(int irq, struct pt_regs *regs) * IRQ controller clean up too */ if(work) - desc->handler->end(i); + desc->chip->end(i); spin_unlock(&desc->lock); } /* So the caller can adjust the irq error counts */ @@ -166,7 +166,7 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, */ printk(KERN_EMERG "Disabling IRQ #%d\n", irq); desc->status |= IRQ_DISABLED; - desc->handler->disable(irq); + desc->chip->disable(irq); } desc->irqs_unhandled = 0; } -- cgit From 74ffd553a3a7fbae34be70b751852d5b6fe5acac Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:24:37 -0700 Subject: [PATCH] genirq: sem2mutex probe_sem -> probing_active Convert the irq auto-probing semaphore to a mutex. (This allows us to find probing API usage bugs sooner, via the mutex debugging code.) Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/autoprobe.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 6f1e68a46cb..d1f1e6f10a2 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -16,7 +16,7 @@ * comes in on to an unassigned handler will get stuck with * "IRQ_WAITING" cleared and the interrupt disabled. */ -static DECLARE_MUTEX(probe_sem); +static DEFINE_MUTEX(probing_active); /** * probe_irq_on - begin an interrupt autodetect @@ -31,7 +31,7 @@ unsigned long probe_irq_on(void) irq_desc_t *desc; unsigned int i; - down(&probe_sem); + mutex_lock(&probing_active); /* * something may have generated an irq long ago and we want to * flush such a longstanding irq before considering it as spurious. @@ -132,7 +132,7 @@ unsigned int probe_irq_mask(unsigned long val) } spin_unlock_irq(&desc->lock); } - up(&probe_sem); + mutex_unlock(&probing_active); return mask & val; } @@ -177,10 +177,11 @@ int probe_irq_off(unsigned long val) } spin_unlock_irq(&desc->lock); } - up(&probe_sem); + mutex_unlock(&probing_active); if (nr_irqs > 1) irq_found = -irq_found; + return irq_found; } -- cgit From a53da52fd743fd637637572838c0a7af23a2d038 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:24:38 -0700 Subject: [PATCH] genirq: cleanup: merge irq_affinity[] into irq_desc[] Consolidation: remove the irq_affinity[NR_IRQS] array and move it into the irq_desc[NR_IRQS].affinity field. [akpm@osdl.org: sparc64 build fix] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 5 ++++- kernel/irq/manage.c | 2 -- kernel/irq/proc.c | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index f9d95705a4a..cc786aaf30d 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -32,7 +32,10 @@ irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... 
NR_IRQS-1] = { .status = IRQ_DISABLED, .chip = &no_irq_type, - .lock = SPIN_LOCK_UNLOCKED + .lock = SPIN_LOCK_UNLOCKED, +#ifdef CONFIG_SMP + .affinity = CPU_MASK_ALL +#endif } }; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 31ee1f3bfcf..c53662edc73 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -16,8 +16,6 @@ #ifdef CONFIG_SMP -cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; - #if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE) cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; #endif diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 90fe05f23e6..847b98a611e 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -36,7 +36,7 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { set_balance_irq_affinity(irq, mask_val); - irq_affinity[irq] = mask_val; + irq_desc[irq].affinity = mask_val; irq_desc[irq].chip->set_affinity(irq, mask_val); } #endif @@ -44,7 +44,7 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) static int irq_affinity_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); + int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity); if (count - len < 2) return -EINVAL; -- cgit From a8553acd6c14e827078779c0a0ee1c18f27b2403 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:24:38 -0700 Subject: [PATCH] genirq: cleanup: remove irq_descp() Cleanup: remove irq_descp() - explicit use of irq_desc[] is shorter and more readable. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/migration.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index d978c87bca9..b4a4354d03d 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -15,7 +15,7 @@ void set_pending_irq(unsigned int irq, cpumask_t mask) void move_native_irq(int irq) { cpumask_t tmp; - irq_desc_t *desc = irq_descp(irq); + irq_desc_t *desc = irq_desc + irq; if (likely(!desc->move_irq)) return; -- cgit From 2e60bbb6d50de654d8e68f115161e27878b5e72d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:24:39 -0700 Subject: [PATCH] genirq: cleanup: remove fastcall Now that i386 defaults to regparm, explicit uses of fastcall are not needed anymore. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index cc786aaf30d..6b313ccf0ed 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -79,8 +79,8 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) /* * Have got an event to handle: */ -fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, - struct irqaction *action) +irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, + struct irqaction *action) { irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; -- cgit From 06fcb0c6fb3aae9570a32ac3b72a8222563baa69 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:24:40 -0700 Subject: [PATCH] genirq: cleanup: misc code cleanups Assorted code cleanups to the generic IRQ code. 
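The manage.c hunks below pass through request_irq() and free_irq(), whose prototypes in this era still take a struct pt_regs * in the handler and use the SA_* flag namespace. For context, here is a minimal driver-style sketch of that calling convention; the IRQ number, device name, and handler body are invented, so this illustrates the API shown in the diff rather than any code from the patch:

	#include <linux/module.h>
	#include <linux/interrupt.h>

	#define DEMO_IRQ	17	/* made-up interrupt line */

	static irqreturn_t demo_isr(int irq, void *dev_id, struct pt_regs *regs)
	{
		/* Would check and acknowledge the (imaginary) device here. */
		return IRQ_HANDLED;
	}

	static int __init demo_init(void)
	{
		/* SA_SHIRQ: share the line; dev_id must then be non-NULL. */
		return request_irq(DEMO_IRQ, demo_isr, SA_SHIRQ, "demo", &demo_isr);
	}

	static void __exit demo_exit(void)
	{
		free_irq(DEMO_IRQ, &demo_isr);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");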
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/autoprobe.c | 16 +++++++--------- kernel/irq/handle.c | 4 ++-- kernel/irq/manage.c | 29 ++++++++++++----------------- kernel/irq/spurious.c | 23 +++++++++++++---------- 4 files changed, 34 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index d1f1e6f10a2..d6eab98a317 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -27,7 +27,7 @@ static DEFINE_MUTEX(probing_active); */ unsigned long probe_irq_on(void) { - unsigned long val; + unsigned long mask; irq_desc_t *desc; unsigned int i; @@ -40,8 +40,8 @@ unsigned long probe_irq_on(void) desc = irq_desc + i; spin_lock_irq(&desc->lock); - if (!irq_desc[i].action) - irq_desc[i].chip->startup(i); + if (!desc->action) + desc->chip->startup(i); spin_unlock_irq(&desc->lock); } @@ -73,11 +73,11 @@ unsigned long probe_irq_on(void) /* * Now filter out any obviously spurious interrupts */ - val = 0; + mask = 0; for (i = 0; i < NR_IRQS; i++) { - irq_desc_t *desc = irq_desc + i; unsigned int status; + desc = irq_desc + i; spin_lock_irq(&desc->lock); status = desc->status; @@ -88,14 +88,13 @@ unsigned long probe_irq_on(void) desc->chip->shutdown(i); } else if (i < 32) - val |= 1 << i; + mask |= 1 << i; } spin_unlock_irq(&desc->lock); } - return val; + return mask; } - EXPORT_SYMBOL(probe_irq_on); /** @@ -184,6 +183,5 @@ int probe_irq_off(unsigned long val) return irq_found; } - EXPORT_SYMBOL(probe_irq_off); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6b313ccf0ed..f9c33a86cbd 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -18,7 +18,7 @@ * Linux has a controller-independent interrupt architecture. * Every controller has a 'controller-template', that is used * by the main code to do the right thing. Each driver-visible - * interrupt source is transparently wired to the apropriate + * interrupt source is transparently wired to the appropriate * controller. Thus drivers need not be aware of the * interrupt-controller. * @@ -111,7 +111,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) { irq_desc_t *desc = irq_desc + irq; - struct irqaction * action; + struct irqaction *action; unsigned int status; kstat_this_cpu.irqs[irq]++; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c53662edc73..261906ebdf0 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -40,7 +40,6 @@ void synchronize_irq(unsigned int irq) while (desc->status & IRQ_INPROGRESS) cpu_relax(); } - EXPORT_SYMBOL(synchronize_irq); #endif @@ -71,7 +70,6 @@ void disable_irq_nosync(unsigned int irq) } spin_unlock_irqrestore(&desc->lock, flags); } - EXPORT_SYMBOL(disable_irq_nosync); /** @@ -97,7 +95,6 @@ void disable_irq(unsigned int irq) if (desc->action) synchronize_irq(irq); } - EXPORT_SYMBOL(disable_irq); /** @@ -139,7 +136,6 @@ void enable_irq(unsigned int irq) } spin_unlock_irqrestore(&desc->lock, flags); } - EXPORT_SYMBOL(enable_irq); /* @@ -166,7 +162,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. 
*/ -int setup_irq(unsigned int irq, struct irqaction * new) +int setup_irq(unsigned int irq, struct irqaction *new) { struct irq_desc *desc = irq_desc + irq; struct irqaction *old, **p; @@ -198,9 +194,10 @@ int setup_irq(unsigned int irq, struct irqaction * new) /* * The following block of code has to be executed atomically */ - spin_lock_irqsave(&desc->lock,flags); + spin_lock_irqsave(&desc->lock, flags); p = &desc->action; - if ((old = *p) != NULL) { + old = *p; + if (old) { /* Can't share interrupts unless both agree to */ if (!(old->flags & new->flags & SA_SHIRQ)) goto mismatch; @@ -233,7 +230,7 @@ int setup_irq(unsigned int irq, struct irqaction * new) else desc->chip->enable(irq); } - spin_unlock_irqrestore(&desc->lock,flags); + spin_unlock_irqrestore(&desc->lock, flags); new->irq = irq; register_irq_proc(irq); @@ -276,10 +273,10 @@ void free_irq(unsigned int irq, void *dev_id) return; desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock,flags); + spin_lock_irqsave(&desc->lock, flags); p = &desc->action; for (;;) { - struct irqaction * action = *p; + struct irqaction *action = *p; if (action) { struct irqaction **pp = p; @@ -304,7 +301,7 @@ void free_irq(unsigned int irq, void *dev_id) else desc->chip->disable(irq); } - spin_unlock_irqrestore(&desc->lock,flags); + spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); /* Make sure it's not being used on another CPU */ @@ -312,12 +309,11 @@ void free_irq(unsigned int irq, void *dev_id) kfree(action); return; } - printk(KERN_ERR "Trying to free free IRQ%d\n",irq); - spin_unlock_irqrestore(&desc->lock,flags); + printk(KERN_ERR "Trying to free free IRQ%d\n", irq); + spin_unlock_irqrestore(&desc->lock, flags); return; } } - EXPORT_SYMBOL(free_irq); /** @@ -351,9 +347,9 @@ EXPORT_SYMBOL(free_irq); */ int request_irq(unsigned int irq, irqreturn_t (*handler)(int, void *, struct pt_regs *), - unsigned long irqflags, const char * devname, void *dev_id) + unsigned long irqflags, const char *devname, void *dev_id) { - struct irqaction * action; + struct irqaction *action; int retval; /* @@ -388,6 +384,5 @@ int request_irq(unsigned int irq, return retval; } - EXPORT_SYMBOL(request_irq); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index ea3ceed362d..5eae7bf3c34 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -16,22 +16,20 @@ static int irqfixup __read_mostly; /* * Recovery handler for misrouted interrupts. 
*/ - static int misrouted_irq(int irq, struct pt_regs *regs) { int i; - irq_desc_t *desc; int ok = 0; int work = 0; /* Did we do work for a real IRQ */ - for(i = 1; i < NR_IRQS; i++) { + for (i = 1; i < NR_IRQS; i++) { + struct irq_desc *desc = irq_desc + i; struct irqaction *action; if (i == irq) /* Already tried */ continue; - desc = &irq_desc[i]; + spin_lock(&desc->lock); - action = desc->action; /* Already running on another processor */ if (desc->status & IRQ_INPROGRESS) { /* @@ -45,7 +43,9 @@ static int misrouted_irq(int irq, struct pt_regs *regs) } /* Honour the normal IRQ locking */ desc->status |= IRQ_INPROGRESS; + action = desc->action; spin_unlock(&desc->lock); + while (action) { /* Only shared IRQ handlers are safe to call */ if (action->flags & SA_SHIRQ) { @@ -62,9 +62,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs) /* * While we were looking for a fixup someone queued a real - * IRQ clashing with our walk + * IRQ clashing with our walk: */ - while ((desc->status & IRQ_PENDING) && action) { /* * Perform real IRQ processing for the IRQ we deferred @@ -80,7 +79,7 @@ static int misrouted_irq(int irq, struct pt_regs *regs) * If we did actual work for the real IRQ line we must let the * IRQ controller clean up too */ - if(work) + if (work) desc->chip->end(i); spin_unlock(&desc->lock); } @@ -113,6 +112,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) } dump_stack(); printk(KERN_ERR "handlers:\n"); + action = desc->action; while (action) { printk(KERN_ERR "[<%p>]", action->handler); @@ -123,7 +123,8 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) } } -static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) +static void +report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) { static int count = 100; @@ -134,7 +135,7 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio } void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, - struct pt_regs *regs) + struct pt_regs *regs) { if (unlikely(action_ret != IRQ_HANDLED)) { desc->irqs_unhandled++; @@ -177,6 +178,7 @@ int __init noirqdebug_setup(char *str) { noirqdebug = 1; printk(KERN_INFO "IRQ lockup detection disabled\n"); + return 1; } @@ -187,6 +189,7 @@ static int __init irqfixup_setup(char *str) irqfixup = 1; printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); printk(KERN_WARNING "This may impact system performance.\n"); + return 1; } -- cgit From 34ffdb7233d5847808d2b63ca6761dac3af9c942 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:24:40 -0700 Subject: [PATCH] genirq: cleanup: reduce irq_desc_t use, mark it obsolete Cleanup: remove irq_desc_t use from the generic IRQ code, and mark it obsolete. 
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/autoprobe.c | 6 +++--- kernel/irq/handle.c | 4 ++-- kernel/irq/manage.c | 6 +++--- kernel/irq/migration.c | 4 ++-- kernel/irq/spurious.c | 9 +++++---- 5 files changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index d6eab98a317..5c988bba401 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -27,8 +27,8 @@ static DEFINE_MUTEX(probing_active); */ unsigned long probe_irq_on(void) { + struct irq_desc *desc; unsigned long mask; - irq_desc_t *desc; unsigned int i; mutex_lock(&probing_active); @@ -116,7 +116,7 @@ unsigned int probe_irq_mask(unsigned long val) mask = 0; for (i = 0; i < NR_IRQS; i++) { - irq_desc_t *desc = irq_desc + i; + struct irq_desc *desc = irq_desc + i; unsigned int status; spin_lock_irq(&desc->lock); @@ -159,7 +159,7 @@ int probe_irq_off(unsigned long val) int i, irq_found = 0, nr_irqs = 0; for (i = 0; i < NR_IRQS; i++) { - irq_desc_t *desc = irq_desc + i; + struct irq_desc *desc = irq_desc + i; unsigned int status; spin_lock_irq(&desc->lock); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index f9c33a86cbd..8eda1005d10 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -28,7 +28,7 @@ * * Controller mappings for all interrupt sources: */ -irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { +struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, .chip = &no_irq_type, @@ -110,7 +110,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, */ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) { - irq_desc_t *desc = irq_desc + irq; + struct irq_desc *desc = irq_desc + irq; struct irqaction *action; unsigned int status; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 261906ebdf0..6a6f1d3dd39 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -57,7 +57,7 @@ EXPORT_SYMBOL(synchronize_irq); */ void disable_irq_nosync(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + struct irq_desc *desc = irq_desc + irq; unsigned long flags; if (irq >= NR_IRQS) @@ -86,7 +86,7 @@ EXPORT_SYMBOL(disable_irq_nosync); */ void disable_irq(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + struct irq_desc *desc = irq_desc + irq; if (irq >= NR_IRQS) return; @@ -109,7 +109,7 @@ EXPORT_SYMBOL(disable_irq); */ void enable_irq(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + struct irq_desc *desc = irq_desc + irq; unsigned long flags; if (irq >= NR_IRQS) diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index b4a4354d03d..a571c3abb79 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -3,7 +3,7 @@ void set_pending_irq(unsigned int irq, cpumask_t mask) { - irq_desc_t *desc = irq_desc + irq; + struct irq_desc *desc = irq_desc + irq; unsigned long flags; spin_lock_irqsave(&desc->lock, flags); @@ -14,8 +14,8 @@ void set_pending_irq(unsigned int irq, cpumask_t mask) void move_native_irq(int irq) { + struct irq_desc *desc = irq_desc + irq; cpumask_t tmp; - irq_desc_t *desc = irq_desc + irq; if (likely(!desc->move_irq)) return; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 5eae7bf3c34..3a0a6212330 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -99,7 +99,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs) */ static void -__report_bad_irq(unsigned int irq, irq_desc_t 
*desc, irqreturn_t action_ret) +__report_bad_irq(unsigned int irq, struct irq_desc *desc, + irqreturn_t action_ret) { struct irqaction *action; @@ -124,7 +125,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) } static void -report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) +report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { static int count = 100; @@ -134,8 +135,8 @@ report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) } } -void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, - struct pt_regs *regs) +void note_interrupt(unsigned int irq, struct irq_desc *desc, + irqreturn_t action_ret, struct pt_regs *regs) { if (unlikely(action_ret != IRQ_HANDLED)) { desc->irqs_unhandled++; -- cgit From 4a733ee12618cf3ec25cbc337a5e0ba3ad5d7fb6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:24:42 -0700 Subject: [PATCH] genirq: cleanup: merge irq_dir[], smp_affinity_entry[] into irq_desc[] Consolidation: remove the irq_dir[NR_IRQS] and the smp_affinity_entry[NR_IRQS] arrays and move them into the irq_desc[] array. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/proc.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 847b98a611e..f60b85b61e8 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -12,15 +12,10 @@ #include "internals.h" -static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; +static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP -/* - * The /proc/irq//smp_affinity values: - */ -static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; - #ifdef CONFIG_GENERIC_PENDING_IRQ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { @@ -102,7 +97,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) { char name [MAX_NAMELEN]; - if (!irq_dir[irq] || action->dir || !action->name || + if (!irq_desc[irq].dir || action->dir || !action->name || !name_unique(irq, action)) return; @@ -110,7 +105,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) snprintf(name, MAX_NAMELEN, "%s", action->name); /* create /proc/irq/1234/handler/ */ - action->dir = proc_mkdir(name, irq_dir[irq]); + action->dir = proc_mkdir(name, irq_desc[irq].dir); } #undef MAX_NAMELEN @@ -123,21 +118,21 @@ void register_irq_proc(unsigned int irq) if (!root_irq_dir || (irq_desc[irq].chip == &no_irq_type) || - irq_dir[irq]) + irq_desc[irq].dir) return; memset(name, 0, MAX_NAMELEN); sprintf(name, "%d", irq); /* create /proc/irq/1234 */ - irq_dir[irq] = proc_mkdir(name, root_irq_dir); + irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); #ifdef CONFIG_SMP { struct proc_dir_entry *entry; /* create /proc/irq//smp_affinity */ - entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); + entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); if (entry) { entry->nlink = 1; @@ -145,7 +140,6 @@ void register_irq_proc(unsigned int irq) entry->read_proc = irq_affinity_read_proc; entry->write_proc = irq_affinity_write_proc; } - smp_affinity_entry[irq] = entry; } #endif } @@ -155,7 +149,7 @@ void register_irq_proc(unsigned int irq) void unregister_handler_proc(unsigned int irq, struct irqaction *action) { if (action->dir) - remove_proc_entry(action->dir->name, irq_dir[irq]); + remove_proc_entry(action->dir->name, 
irq_desc[irq].dir); } void init_irq_proc(void) -- cgit From cd916d31cc31273eca8a620fae02b7bf7f577559 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:24:42 -0700 Subject: [PATCH] genirq: cleanup: merge pending_irq_cpumask[] into irq_desc[] Consolidation: remove the pending_irq_cpumask[NR_IRQS] array and move it into the irq_desc[NR_IRQS].pending_mask field. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 4 ---- kernel/irq/migration.c | 8 ++++---- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6a6f1d3dd39..ca9b5d36abe 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -16,10 +16,6 @@ #ifdef CONFIG_SMP -#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE) -cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; -#endif - /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index a571c3abb79..a57ebe9fa6f 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -8,7 +8,7 @@ void set_pending_irq(unsigned int irq, cpumask_t mask) spin_lock_irqsave(&desc->lock, flags); desc->move_irq = 1; - pending_irq_cpumask[irq] = mask; + irq_desc[irq].pending_mask = mask; spin_unlock_irqrestore(&desc->lock, flags); } @@ -30,7 +30,7 @@ void move_native_irq(int irq) desc->move_irq = 0; - if (unlikely(cpus_empty(pending_irq_cpumask[irq]))) + if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) return; if (!desc->chip->set_affinity) @@ -38,7 +38,7 @@ void move_native_irq(int irq) assert_spin_locked(&desc->lock); - cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map); + cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map); /* * If there was a valid mask to work with, please @@ -58,5 +58,5 @@ void move_native_irq(int irq) if (likely(!(desc->status & IRQ_DISABLED))) desc->chip->enable(irq); } - cpus_clear(pending_irq_cpumask[irq]); + cpus_clear(irq_desc[irq].pending_mask); } -- cgit From 0d7012a968d006e277eb0fe20edd7a9b5563c2b7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:24:43 -0700 Subject: [PATCH] genirq: cleanup: turn ARCH_HAS_IRQ_PER_CPU into CONFIG_IRQ_PER_CPU Cleanup: change ARCH_HAS_IRQ_PER_CPU into a Kconfig method. 
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ca9b5d36abe..8389d1817fe 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -198,7 +198,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) if (!(old->flags & new->flags & SA_SHIRQ)) goto mismatch; -#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) +#if defined(CONFIG_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) /* All handlers must agree on per-cpuness */ if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU)) goto mismatch; @@ -213,7 +213,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) } *p = new; -#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) +#if defined(CONFIG_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) if (new->flags & SA_PERCPU_IRQ) desc->status |= IRQ_PER_CPU; #endif -- cgit From 096c8131c573ed37939dc3f1440221c92c87e74b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2006 02:24:44 -0700 Subject: [PATCH] genirq: debug: better debug printout in enable_irq() Make enable_irq() debug printouts user-readable. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8389d1817fe..76c8eda6729 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -114,6 +114,7 @@ void enable_irq(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); switch (desc->depth) { case 0: + printk(KERN_WARNING "Unablanced enable_irq(%d)\n", irq); WARN_ON(1); break; case 1: { -- cgit From c0ad90a32fb60f4129d0e24dfd5fd7128e2e09f2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:24:44 -0700 Subject: [PATCH] genirq: add ->retrigger() irq op to consolidate hw_irq_resend() Add ->retrigger() irq op to consolidate hw_irq_resend() implementations. (Most architectures had it defined to NOP anyway.) NOTE: ia64 needs testing. i386 and x86_64 tested. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 76c8eda6729..19b438e09f1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -123,7 +123,8 @@ void enable_irq(unsigned int irq) desc->status = status; if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { desc->status = status | IRQ_REPLAY; - hw_resend_irq(desc->chip,irq); + if (desc->chip && desc->chip->retrigger) + desc->chip->retrigger(irq); } desc->chip->enable(irq); /* fall-through */ -- cgit From 8d28bc751bb9ad479e33964d5d9eedfe5fb488a5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:24:46 -0700 Subject: [PATCH] genirq: doc: handle_IRQ_event() and __do_IRQ() comments Document handle_IRQ_event() and __do_IRQ(). 
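The comments added by this patch use the kernel-doc format: a /** opener, the function name with a one-line summary, one @parameter line per argument, then a free-form body. Purely as a reference for that format, a hypothetical handler documented the same way (the function itself is invented):

	/**
	 * demo_isr - handle one interrupt from the demo device
	 * @irq: the interrupt number being serviced
	 * @dev_id: cookie passed to request_irq()
	 * @regs: CPU registers at interrupt time
	 *
	 * Checks whether the demo device raised this interrupt and
	 * acknowledges it. Returns IRQ_HANDLED if it was ours,
	 * IRQ_NONE otherwise.
	 */
	static irqreturn_t demo_isr(int irq, void *dev_id, struct pt_regs *regs)
	{
		return IRQ_HANDLED;
	}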
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 8eda1005d10..7fc7bc33d49 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -76,8 +76,13 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) return IRQ_NONE; } -/* - * Have got an event to handle: +/** + * handle_IRQ_event - irq action chain handler + * @irq: the interrupt number + * @regs: pointer to a register structure + * @action: the interrupt action chain for this irq + * + * Handles the action chain of an irq event */ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, struct irqaction *action) @@ -103,10 +108,17 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, return retval; } -/* - * do_IRQ handles all normal device IRQ's (the special +/** + * __do_IRQ - original all in one highlevel IRQ handler + * @irq: the interrupt number + * @regs: pointer to a register structure + * + * __do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific * handlers). + * + * This is the original x86 implementation which is used for every + * interrupt type. */ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) { -- cgit From 77a5afecdb15e65034ab8390b46b824c186c62a8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:24:46 -0700 Subject: [PATCH] genirq: cleanup: no_irq_type cleanups Clean up no_irq_type: share the NOP functions where possible, and properly name the ack_bad() function. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 7fc7bc33d49..402fa3aec1e 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -40,32 +40,37 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { }; /* - * Generic 'no controller' code + * What should we do if we get a hw irq event on an illegal vector? + * Each architecture has to answer this themself. */ -static void end_none(unsigned int irq) { } -static void enable_none(unsigned int irq) { } -static void disable_none(unsigned int irq) { } -static void shutdown_none(unsigned int irq) { } -static unsigned int startup_none(unsigned int irq) { return 0; } - -static void ack_none(unsigned int irq) +static void ack_bad(unsigned int irq) { - /* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themself. 
- */ ack_bad_irq(irq); } +/* + * NOP functions + */ +static void noop(unsigned int irq) +{ +} + +static unsigned int noop_ret(unsigned int irq) +{ + return 0; +} + +/* + * Generic no controller implementation + */ struct hw_interrupt_type no_irq_type = { - .typename = "none", - .startup = startup_none, - .shutdown = shutdown_none, - .enable = enable_none, - .disable = disable_none, - .ack = ack_none, - .end = end_none, - .set_affinity = NULL + .typename = "none", + .startup = noop_ret, + .shutdown = noop, + .enable = noop, + .disable = noop, + .ack = ack_bad, + .end = noop, }; /* -- cgit From a4633adcdbc15ac51afcd0e1395de58cee27cf92 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2006 02:24:48 -0700 Subject: [PATCH] genirq: add genirq sw IRQ-retrigger Enable platforms that do not have a hardware-assisted hardirq-resend mechanism to resend them via a softirq-driven IRQ emulation mechanism. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/Makefile | 2 +- kernel/irq/manage.c | 10 +------ kernel/irq/resend.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 10 deletions(-) create mode 100644 kernel/irq/resend.c (limited to 'kernel') diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 9f77f50d814..627ace98d4a 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,5 +1,5 @@ -obj-y := handle.o manage.o spurious.o +obj-y := handle.o manage.o spurious.o resend.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 19b438e09f1..cffde484389 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -118,15 +118,7 @@ void enable_irq(unsigned int irq) WARN_ON(1); break; case 1: { - unsigned int status = desc->status & ~IRQ_DISABLED; - - desc->status = status; - if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { - desc->status = status | IRQ_REPLAY; - if (desc->chip && desc->chip->retrigger) - desc->chip->retrigger(irq); - } - desc->chip->enable(irq); + check_irq_resend(desc, irq); /* fall-through */ } default: diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c new file mode 100644 index 00000000000..096b102fb39 --- /dev/null +++ b/kernel/irq/resend.c @@ -0,0 +1,78 @@ +/* + * linux/kernel/irq/resend.c + * + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner + * + * This file contains the IRQ-resend code + * + * If the interrupt is waiting to be processed, we try to re-run it. + * We can't directly run it from here since the caller might be in an + * interrupt-protected region. Not all irq controller chips can + * retrigger interrupts at the hardware level, so in those cases + * we allow the resending of IRQs via a tasklet. 
+ */ + +#include +#include +#include +#include + +#include "internals.h" + +#ifdef CONFIG_HARDIRQS_SW_RESEND + +/* Bitmap to handle software resend of interrupts: */ +static DECLARE_BITMAP(irqs_resend, NR_IRQS); + +/* + * Run software resends of IRQ's + */ +static void resend_irqs(unsigned long arg) +{ + struct irq_desc *desc; + int irq; + + while (!bitmap_empty(irqs_resend, NR_IRQS)) { + irq = find_first_bit(irqs_resend, NR_IRQS); + clear_bit(irq, irqs_resend); + desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock, flags); + desc->handle_irq(irq, desc, NULL); + spin_unlock_irqrestore(&desc->lock, flags); + } +} + +/* Tasklet to handle resend: */ +static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); + +#endif + +/* + * IRQ resend + * + * Is called with interrupts disabled and desc->lock held. + */ +void check_irq_resend(struct irq_desc *desc, unsigned int irq) +{ + unsigned int status = desc->status; + + /* + * Make sure the interrupt is enabled, before resending it: + */ + desc->chip->enable(irq); + + if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { + desc->status &= ~IRQ_PENDING; + desc->status = status | IRQ_REPLAY; + + if (!desc->chip || !desc->chip->retrigger || + !desc->chip->retrigger(irq)) { +#ifdef CONFIG_HARDIRQS_SW_RESEND + /* Set it pending and activate the softirq: */ + set_bit(irq, irqs_resend); + tasklet_schedule(&resend_tasklet); +#endif + } + } +} -- cgit From 3418d72404e35eb19e7995cbf3e7a76ba8fefbce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2006 02:24:49 -0700 Subject: [PATCH] genirq: add IRQ_NOPROBE support Introduce IRQ_NOPROBE: enables platforms to control chip-probing. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/autoprobe.c | 4 ++-- kernel/irq/manage.c | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 5c988bba401..ed98c7d46cf 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -40,7 +40,7 @@ unsigned long probe_irq_on(void) desc = irq_desc + i; spin_lock_irq(&desc->lock); - if (!desc->action) + if (!desc->action && !(desc->status & IRQ_NOPROBE)) desc->chip->startup(i); spin_unlock_irq(&desc->lock); } @@ -57,7 +57,7 @@ unsigned long probe_irq_on(void) desc = irq_desc + i; spin_lock_irq(&desc->lock); - if (!desc->action) { + if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; if (desc->chip->startup(i)) desc->status |= IRQ_PENDING; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index cffde484389..90a944a7fad 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -118,6 +118,10 @@ void enable_irq(unsigned int irq) WARN_ON(1); break; case 1: { + unsigned int status = desc->status & ~IRQ_DISABLED; + + /* Prevent probing on this irq: */ + desc->status = status | IRQ_NOPROBE; check_irq_resend(desc, irq); /* fall-through */ } -- cgit From 6550c775cb5ee94c132d93d84de3bb23f0abf37b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2006 02:24:49 -0700 Subject: [PATCH] genirq: add IRQ_NOREQUEST support Enable platforms to disable request_irq() for certain interrupts. 
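A rough userspace sketch of how such per-IRQ status bits gate the generic paths; the flag values, structure and helper names here are invented for the example, only the checks follow the pattern of the patches above:

#include <stdio.h>

/* Simplified status bits modelled on IRQ_NOPROBE / IRQ_NOREQUEST;
 * the values are arbitrary for this sketch. */
#define MY_IRQ_NOPROBE    0x1
#define MY_IRQ_NOREQUEST  0x2

struct my_irq_desc {
        unsigned int status;
        int has_action;
};

/* Autoprobing skips descriptors the platform marked NOPROBE. */
static int may_probe(const struct my_irq_desc *desc)
{
        return !desc->has_action && !(desc->status & MY_IRQ_NOPROBE);
}

/* A request_irq()-style entry point refuses NOREQUEST lines. */
static int may_request(const struct my_irq_desc *desc)
{
        return !(desc->status & MY_IRQ_NOREQUEST);
}

int main(void)
{
        struct my_irq_desc timer = { MY_IRQ_NOPROBE | MY_IRQ_NOREQUEST, 1 };
        struct my_irq_desc spare = { 0, 0 };

        printf("timer: probe=%d request=%d\n", may_probe(&timer), may_request(&timer));
        printf("spare: probe=%d request=%d\n", may_probe(&spare), may_request(&spare));
        return 0;
}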
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 90a944a7fad..cae900a849c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -141,7 +141,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) { struct irqaction *action; - if (irq >= NR_IRQS) + if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST) return 0; action = irq_desc[irq].action; @@ -356,6 +356,8 @@ int request_irq(unsigned int irq, return -EINVAL; if (irq >= NR_IRQS) return -EINVAL; + if (irq_desc[irq].status & IRQ_NOREQUEST) + return -EINVAL; if (!handler) return -EINVAL; -- cgit From 94d39e1f6e8132ea982a1d61acbe0423d3d14365 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2006 02:24:50 -0700 Subject: [PATCH] genirq: add IRQ_NOAUTOEN support Enable platforms to disable the automatic enabling of freshly set up irqs. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 1 + kernel/irq/manage.c | 18 +++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 402fa3aec1e..9b398d52f1b 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -32,6 +32,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, .chip = &no_irq_type, + .depth = 1, .lock = SPIN_LOCK_UNLOCKED, #ifdef CONFIG_SMP .affinity = CPU_MASK_ALL diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index cae900a849c..9ea18879fb6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -216,13 +216,17 @@ int setup_irq(unsigned int irq, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif if (!shared) { - desc->depth = 0; - desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | - IRQ_WAITING | IRQ_INPROGRESS); - if (desc->chip->startup) - desc->chip->startup(irq); - else - desc->chip->enable(irq); + desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | + IRQ_INPROGRESS); + + if (!(desc->status & IRQ_NOAUTOEN)) { + desc->depth = 0; + desc->status &= ~IRQ_DISABLED; + if (desc->chip->startup) + desc->chip->startup(irq); + else + desc->chip->enable(irq); + } } spin_unlock_irqrestore(&desc->lock, flags); -- cgit From a34db9b28a1c63317e1d6f1080a12d711579e7d0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:24:50 -0700 Subject: [PATCH] genirq: update copyrights Update/add copyrights in the generic IRQ code. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 6 +++++- kernel/irq/manage.c | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 9b398d52f1b..bddcb8f5fea 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -1,9 +1,13 @@ /* * linux/kernel/irq/handle.c * - * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King * * This file contains the core interrupt handling code. 
+ * + * Detailed information is available in Documentation/DocBook/genericirq + * */ #include diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9ea18879fb6..1a2e7663096 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1,7 +1,8 @@ /* * linux/kernel/irq/manage.c * - * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006 Thomas Gleixner * * This file contains driver APIs to the irq subsystem. */ -- cgit From 6a6de9ef5850d063c3d3fb50784bfe3a6d0712c6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2006 02:24:51 -0700 Subject: [PATCH] genirq: core Core genirq support: add the irq-chip and irq-flow abstractions. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/autoprobe.c | 9 ++++++++- kernel/irq/handle.c | 10 ++++++++++ kernel/irq/internals.h | 6 ++++++ kernel/irq/manage.c | 14 ++++++++++++++ kernel/irq/resend.c | 4 ++-- kernel/irq/spurious.c | 1 + 6 files changed, 41 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index ed98c7d46cf..cfdb63eb5c9 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -40,8 +40,15 @@ unsigned long probe_irq_on(void) desc = irq_desc + i; spin_lock_irq(&desc->lock); - if (!desc->action && !(desc->status & IRQ_NOPROBE)) + if (!desc->action && !(desc->status & IRQ_NOPROBE)) { + /* + * Some chips need to know about probing in + * progress: + */ + if (desc->chip->set_type) + desc->chip->set_type(i, IRQ_TYPE_PROBE); desc->chip->startup(i); + } spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index bddcb8f5fea..a04b516afa5 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -18,6 +18,16 @@ #include "internals.h" +/** + * handle_bad_irq - handle spurious and unhandled irqs + */ +void fastcall +handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +{ + kstat_this_cpu.irqs[irq]++; + ack_bad_irq(irq); +} + /* * Linux has a controller-independent interrupt architecture. * Every controller has a 'controller-template', that is used diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 46feba63026..2ba8ae3c8e9 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -4,6 +4,12 @@ extern int noirqdebug; +/* Set default functions for irq_chip structures: */ +extern void irq_chip_set_defaults(struct irq_chip *chip); + +/* Set default handler: */ +extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); + #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq); extern void register_handler_proc(unsigned int irq, struct irqaction *action); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1a2e7663096..b61784ee78b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -153,6 +153,17 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) return !action; } +void compat_irq_chip_set_default_handler(struct irq_desc *desc) +{ + /* + * If the architecture still has not overriden + * the flow handler then zap the default. This + * should catch incorrect flow-type setting. + */ + if (desc->handle_irq == &handle_bad_irq) + desc->handle_irq = NULL; +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. 
@@ -217,6 +228,9 @@ int setup_irq(unsigned int irq, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif if (!shared) { + irq_chip_set_defaults(desc->chip); + compat_irq_chip_set_default_handler(desc); + desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_INPROGRESS); diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 096b102fb39..872f91ba2ce 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -37,9 +37,9 @@ static void resend_irqs(unsigned long arg) irq = find_first_bit(irqs_resend, NR_IRQS); clear_bit(irq, irqs_resend); desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock, flags); + local_irq_disable(); desc->handle_irq(irq, desc, NULL); - spin_unlock_irqrestore(&desc->lock, flags); + local_irq_enable(); } } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 3a0a6212330..ca187b83f89 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -168,6 +168,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, */ printk(KERN_EMERG "Disabling IRQ #%d\n", irq); desc->status |= IRQ_DISABLED; + desc->depth = 1; desc->chip->disable(irq); } desc->irqs_unhandled = 0; -- cgit From dd87eb3a24c4527741122713e223d74b85d43c85 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2006 02:24:53 -0700 Subject: [PATCH] genirq: add irq-chip support Enable platforms to use the irq-chip and irq-flow abstractions: allow setting of the chip, the type and provide highlevel handlers for common irq-flows. [rostedt@goodmis.org: misroute-irq: Don't call desc->chip->end because of edge interrupts] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Cc: Benjamin Herrenschmidt Signed-off-by: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/Makefile | 2 +- kernel/irq/chip.c | 525 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/irq/spurious.c | 2 +- 3 files changed, 527 insertions(+), 2 deletions(-) create mode 100644 kernel/irq/chip.c (limited to 'kernel') diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 627ace98d4a..1dab0ac3f79 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,5 +1,5 @@ -obj-y := handle.o manage.o spurious.o resend.o +obj-y := handle.o manage.o spurious.o resend.o chip.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c new file mode 100644 index 00000000000..8736f2ca8a3 --- /dev/null +++ b/kernel/irq/chip.c @@ -0,0 +1,525 @@ +/* + * linux/kernel/irq/chip.c + * + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King + * + * This file contains the core interrupt handling code, for irq-chip + * based architectures. 
+ * + * Detailed information is available in Documentation/DocBook/genericirq + */ + +#include +#include +#include +#include + +#include "internals.h" + +/** + * set_irq_chip - set the irq chip for an irq + * @irq: irq number + * @chip: pointer to irq chip description structure + */ +int set_irq_chip(unsigned int irq, struct irq_chip *chip) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq); + WARN_ON(1); + return -EINVAL; + } + + if (!chip) + chip = &no_irq_chip; + + desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock, flags); + irq_chip_set_defaults(chip); + desc->chip = chip; + /* + * For compatibility only: + */ + desc->chip = chip; + spin_unlock_irqrestore(&desc->lock, flags); + + return 0; +} +EXPORT_SYMBOL(set_irq_chip); + +/** + * set_irq_type - set the irq type for an irq + * @irq: irq number + * @type: interrupt type - see include/linux/interrupt.h + */ +int set_irq_type(unsigned int irq, unsigned int type) +{ + struct irq_desc *desc; + unsigned long flags; + int ret = -ENXIO; + + if (irq >= NR_IRQS) { + printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); + return -ENODEV; + } + + desc = irq_desc + irq; + if (desc->chip->set_type) { + spin_lock_irqsave(&desc->lock, flags); + ret = desc->chip->set_type(irq, type); + spin_unlock_irqrestore(&desc->lock, flags); + } + return ret; +} +EXPORT_SYMBOL(set_irq_type); + +/** + * set_irq_data - set irq type data for an irq + * @irq: Interrupt number + * @data: Pointer to interrupt specific data + * + * Set the hardware irq controller data for an irq + */ +int set_irq_data(unsigned int irq, void *data) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR + "Trying to install controller data for IRQ%d\n", irq); + return -EINVAL; + } + + desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock, flags); + desc->handler_data = data; + spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} +EXPORT_SYMBOL(set_irq_data); + +/** + * set_irq_chip_data - set irq chip data for an irq + * @irq: Interrupt number + * @data: Pointer to chip specific data + * + * Set the hardware irq chip data for an irq + */ +int set_irq_chip_data(unsigned int irq, void *data) +{ + struct irq_desc *desc = irq_desc + irq; + unsigned long flags; + + if (irq >= NR_IRQS || !desc->chip) { + printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); + return -EINVAL; + } + + spin_lock_irqsave(&desc->lock, flags); + desc->chip_data = data; + spin_unlock_irqrestore(&desc->lock, flags); + + return 0; +} +EXPORT_SYMBOL(set_irq_chip_data); + +/* + * default enable function + */ +static void default_enable(unsigned int irq) +{ + struct irq_desc *desc = irq_desc + irq; + + desc->chip->unmask(irq); + desc->status &= ~IRQ_MASKED; +} + +/* + * default disable function + */ +static void default_disable(unsigned int irq) +{ + struct irq_desc *desc = irq_desc + irq; + + if (!(desc->status & IRQ_DELAYED_DISABLE)) + irq_desc[irq].chip->mask(irq); +} + +/* + * default startup function + */ +static unsigned int default_startup(unsigned int irq) +{ + irq_desc[irq].chip->enable(irq); + + return 0; +} + +/* + * Fixup enable/disable function pointers + */ +void irq_chip_set_defaults(struct irq_chip *chip) +{ + if (!chip->enable) + chip->enable = default_enable; + if (!chip->disable) + chip->disable = default_disable; + if (!chip->startup) + chip->startup = default_startup; + if (!chip->shutdown) + chip->shutdown = chip->disable; + if (!chip->name) + 
chip->name = chip->typename; +} + +static inline void mask_ack_irq(struct irq_desc *desc, int irq) +{ + if (desc->chip->mask_ack) + desc->chip->mask_ack(irq); + else { + desc->chip->mask(irq); + desc->chip->ack(irq); + } +} + +/** + * handle_simple_irq - Simple and software-decoded IRQs. + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * @regs: pointer to a register structure + * + * Simple interrupts are either sent from a demultiplexing interrupt + * handler or come from hardware, where no interrupt hardware control + * is necessary. + * + * Note: The caller is expected to handle the ack, clear, mask and + * unmask issues if necessary. + */ +void fastcall +handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +{ + struct irqaction *action; + irqreturn_t action_ret; + const unsigned int cpu = smp_processor_id(); + + spin_lock(&desc->lock); + + if (unlikely(desc->status & IRQ_INPROGRESS)) + goto out_unlock; + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + kstat_cpu(cpu).irqs[irq]++; + + action = desc->action; + if (unlikely(!action || (desc->status & IRQ_DISABLED))) + goto out_unlock; + + desc->status |= IRQ_INPROGRESS; + spin_unlock(&desc->lock); + + action_ret = handle_IRQ_event(irq, regs, action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret, regs); + + spin_lock(&desc->lock); + desc->status &= ~IRQ_INPROGRESS; +out_unlock: + spin_unlock(&desc->lock); +} + +/** + * handle_level_irq - Level type irq handler + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * @regs: pointer to a register structure + * + * Level type interrupts are active as long as the hardware line has + * the active level. This may require to mask the interrupt and unmask + * it after the associated handler has acknowledged the device, so the + * interrupt line is back to inactive. + */ +void fastcall +handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +{ + unsigned int cpu = smp_processor_id(); + struct irqaction *action; + irqreturn_t action_ret; + + spin_lock(&desc->lock); + mask_ack_irq(desc, irq); + + if (unlikely(desc->status & IRQ_INPROGRESS)) + goto out; + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + kstat_cpu(cpu).irqs[irq]++; + + /* + * If its disabled or no action available + * keep it masked and get out of here + */ + action = desc->action; + if (unlikely(!action || (desc->status & IRQ_DISABLED))) + goto out; + + desc->status |= IRQ_INPROGRESS; + spin_unlock(&desc->lock); + + action_ret = handle_IRQ_event(irq, regs, action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret, regs); + + spin_lock(&desc->lock); + desc->status &= ~IRQ_INPROGRESS; +out: + if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) + desc->chip->unmask(irq); + spin_unlock(&desc->lock); +} + +/** + * handle_fastack_irq - irq handler for transparent controllers + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * @regs: pointer to a register structure + * + * Only a single callback will be issued to the chip: an ->ack() + * call when the interrupt has been serviced. This enables support + * for modern forms of interrupt handlers, which handle the flow + * details in hardware, transparently. 
+ */ +void fastcall +handle_fastack_irq(unsigned int irq, struct irq_desc *desc, + struct pt_regs *regs) +{ + unsigned int cpu = smp_processor_id(); + struct irqaction *action; + irqreturn_t action_ret; + + spin_lock(&desc->lock); + + if (unlikely(desc->status & IRQ_INPROGRESS)) + goto out; + + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + kstat_cpu(cpu).irqs[irq]++; + + /* + * If its disabled or no action available + * keep it masked and get out of here + */ + action = desc->action; + if (unlikely(!action || (desc->status & IRQ_DISABLED))) + goto out; + + desc->status |= IRQ_INPROGRESS; + spin_unlock(&desc->lock); + + action_ret = handle_IRQ_event(irq, regs, action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret, regs); + + spin_lock(&desc->lock); + desc->status &= ~IRQ_INPROGRESS; +out: + if (!(desc->status & IRQ_DISABLED)) + desc->chip->ack(irq); + else + desc->chip->mask(irq); + + spin_unlock(&desc->lock); +} + +/** + * handle_edge_irq - edge type IRQ handler + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * @regs: pointer to a register structure + * + * Interrupt occures on the falling and/or rising edge of a hardware + * signal. The occurence is latched into the irq controller hardware + * and must be acked in order to be reenabled. After the ack another + * interrupt can happen on the same source even before the first one + * is handled by the assosiacted event handler. If this happens it + * might be necessary to disable (mask) the interrupt depending on the + * controller hardware. This requires to reenable the interrupt inside + * of the loop which handles the interrupts which have arrived while + * the handler was running. If all pending interrupts are handled, the + * loop is left. + */ +void fastcall +handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +{ + const unsigned int cpu = smp_processor_id(); + + spin_lock(&desc->lock); + + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + + /* + * If we're currently running this IRQ, or its disabled, + * we shouldn't process the IRQ. Mark it pending, handle + * the necessary masking and go out + */ + if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || + !desc->action)) { + desc->status |= (IRQ_PENDING | IRQ_MASKED); + mask_ack_irq(desc, irq); + goto out_unlock; + } + + kstat_cpu(cpu).irqs[irq]++; + + /* Start handling the irq */ + desc->chip->ack(irq); + + /* Mark the IRQ currently in progress.*/ + desc->status |= IRQ_INPROGRESS; + + do { + struct irqaction *action = desc->action; + irqreturn_t action_ret; + + if (unlikely(!action)) { + desc->chip->mask(irq); + goto out_unlock; + } + + /* + * When another irq arrived while we were handling + * one, we could have masked the irq. + * Renable it, if it was not disabled in meantime. 
+ */ + if (unlikely((desc->status & + (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == + (IRQ_PENDING | IRQ_MASKED))) { + desc->chip->unmask(irq); + desc->status &= ~IRQ_MASKED; + } + + desc->status &= ~IRQ_PENDING; + spin_unlock(&desc->lock); + action_ret = handle_IRQ_event(irq, regs, action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret, regs); + spin_lock(&desc->lock); + + } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); + + desc->status &= ~IRQ_INPROGRESS; +out_unlock: + spin_unlock(&desc->lock); +} + +#ifdef CONFIG_SMP +/** + * handle_percpu_IRQ - Per CPU local irq handler + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * @regs: pointer to a register structure + * + * Per CPU interrupts on SMP machines without locking requirements + */ +void fastcall +handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +{ + irqreturn_t action_ret; + + kstat_this_cpu.irqs[irq]++; + + if (desc->chip->ack) + desc->chip->ack(irq); + + action_ret = handle_IRQ_event(irq, regs, desc->action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret, regs); + + if (desc->chip->eoi) + desc->chip->eoi(irq); +} + +#endif /* CONFIG_SMP */ + +void +__set_irq_handler(unsigned int irq, + void fastcall (*handle)(unsigned int, irq_desc_t *, + struct pt_regs *), + int is_chained) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR + "Trying to install type control for IRQ%d\n", irq); + return; + } + + desc = irq_desc + irq; + + if (!handle) + handle = handle_bad_irq; + + if (is_chained && desc->chip == &no_irq_chip) + printk(KERN_WARNING "Trying to install " + "chained interrupt type for IRQ%d\n", irq); + + spin_lock_irqsave(&desc->lock, flags); + + /* Uninstall? 
*/ + if (handle == handle_bad_irq) { + if (desc->chip != &no_irq_chip) { + desc->chip->mask(irq); + desc->chip->ack(irq); + } + desc->status |= IRQ_DISABLED; + desc->depth = 1; + } + desc->handle_irq = handle; + + if (handle != handle_bad_irq && is_chained) { + desc->status &= ~IRQ_DISABLED; + desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; + desc->depth = 0; + desc->chip->unmask(irq); + } + spin_unlock_irqrestore(&desc->lock, flags); +} + +void +set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, + void fastcall (*handle)(unsigned int, + struct irq_desc *, + struct pt_regs *)) +{ + set_irq_chip(irq, chip); + __set_irq_handler(irq, handle, 0); +} + +/* + * Get a descriptive string for the highlevel handler, for + * /proc/interrupts output: + */ +const char * +handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *, + struct pt_regs *)) +{ + if (handle == handle_level_irq) + return "level "; + if (handle == handle_fastack_irq) + return "level "; + if (handle == handle_edge_irq) + return "edge "; + if (handle == handle_simple_irq) + return "simple"; +#ifdef CONFIG_SMP + if (handle == handle_percpu_irq) + return "percpu"; +#endif + if (handle == handle_bad_irq) + return "bad "; + + return NULL; +} diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index ca187b83f89..b483deed311 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -79,7 +79,7 @@ static int misrouted_irq(int irq, struct pt_regs *regs) * If we did actual work for the real IRQ line we must let the * IRQ controller clean up too */ - if (work) + if (work && desc->chip && desc->chip->end) desc->chip->end(i); spin_unlock(&desc->lock); } -- cgit From 7a55713ab456d267815fd5ca3c3d0fd14301f306 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:24:54 -0700 Subject: [PATCH] genirq: add handle_bad_irq() Handle bad IRQ vectors via the irqchip mechanism. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/autoprobe.c | 8 ++++++++ kernel/irq/handle.c | 1 + 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index cfdb63eb5c9..533068cfb60 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -11,6 +11,8 @@ #include #include +#include "internals.h" + /* * Autodetection depends on the fact that any interrupt that * comes in on to an unassigned handler will get stuck with @@ -41,6 +43,12 @@ unsigned long probe_irq_on(void) spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { + /* + * An old-style architecture might still have + * the handle_bad_irq handler there: + */ + compat_irq_chip_set_default_handler(desc); + /* * Some chips need to know about probing in * progress: diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a04b516afa5..3a2dbcc9e21 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -46,6 +46,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, .chip = &no_irq_type, + .handle_irq = handle_bad_irq, .depth = 1, .lock = SPIN_LOCK_UNLOCKED, #ifdef CONFIG_SMP -- cgit From ba9a2331bae5da8f65be3722b9e2d210f1987857 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2006 02:24:55 -0700 Subject: [PATCH] genirq: add irq-wake (power-management) support Enable platforms to set the irq-wake (power-management) properties of an IRQ. 
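The generic helper simply forwards to an optional chip callback and reports -ENXIO when the chip cannot do wakeup control. A minimal standalone sketch of that delegation pattern, using simplified stand-in types (my_chip, my_desc) rather than the kernel structures:

#include <stdio.h>
#include <errno.h>

/* Simplified chip with an optional wakeup callback. */
struct my_chip {
        int (*set_wake)(unsigned int irq, unsigned int on);
};

struct my_desc {
        struct my_chip *chip;
};

/* Generic helper: forward to the chip if it supports wakeup control. */
static int my_set_irq_wake(struct my_desc *desc, unsigned int irq, unsigned int on)
{
        if (desc->chip && desc->chip->set_wake)
                return desc->chip->set_wake(irq, on);
        return -ENXIO;
}

static int gpio_set_wake(unsigned int irq, unsigned int on)
{
        printf("irq %u wake %s\n", irq, on ? "enabled" : "disabled");
        return 0;
}

int main(void)
{
        struct my_chip gpio = { gpio_set_wake };
        struct my_chip plain = { NULL };
        struct my_desc d1 = { &gpio }, d2 = { &plain };

        printf("gpio:  %d\n", my_set_irq_wake(&d1, 10, 1));
        printf("plain: %d\n", my_set_irq_wake(&d2, 11, 1));
        return 0;
}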
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b61784ee78b..3ed7aee8486 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -133,6 +133,27 @@ void enable_irq(unsigned int irq) } EXPORT_SYMBOL(enable_irq); +/** + * set_irq_wake - control irq power management wakeup + * @irq: interrupt to control + * @on: enable/disable power management wakeup + * + * Enable/disable power management wakeup mode + */ +int set_irq_wake(unsigned int irq, unsigned int on) +{ + struct irq_desc *desc = irq_desc + irq; + unsigned long flags; + int ret = -ENXIO; + + spin_lock_irqsave(&desc->lock, flags); + if (desc->chip->set_wake) + ret = desc->chip->set_wake(irq, on); + spin_unlock_irqrestore(&desc->lock, flags); + return ret; +} +EXPORT_SYMBOL(set_irq_wake); + /* * Internal function that tells the architecture code whether a * particular irq has been exclusively allocated or is available -- cgit From e76de9f8eb67b7acc1cc6f28c4be8583adf0a90c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2006 02:24:56 -0700 Subject: [PATCH] genirq: add SA_TRIGGER support Enable drivers to request an IRQ with a given irq-flow (trigger/polarity) setting. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3ed7aee8486..627d401c297 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -225,8 +225,14 @@ int setup_irq(unsigned int irq, struct irqaction *new) p = &desc->action; old = *p; if (old) { - /* Can't share interrupts unless both agree to */ - if (!(old->flags & new->flags & SA_SHIRQ)) + /* + * Can't share interrupts unless both agree to and are + * the same type (level, edge, polarity). So both flag + * fields must have SA_SHIRQ set and the bits which + * set the trigger type must match. + */ + if (!((old->flags & new->flags) & SA_SHIRQ) || + ((old->flags ^ new->flags) & SA_TRIGGER_MASK)) goto mismatch; #if defined(CONFIG_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) @@ -250,7 +256,22 @@ int setup_irq(unsigned int irq, struct irqaction *new) #endif if (!shared) { irq_chip_set_defaults(desc->chip); - compat_irq_chip_set_default_handler(desc); + + /* Setup the type (level, edge polarity) if configured: */ + if (new->flags & SA_TRIGGER_MASK) { + if (desc->chip && desc->chip->set_type) + desc->chip->set_type(irq, + new->flags & SA_TRIGGER_MASK); + else + /* + * SA_TRIGGER_* but the PIC does not support + * multiple flow-types? + */ + printk(KERN_WARNING "setup_irq(%d) SA_TRIGGER" + "set. No set_type function available\n", + irq); + } else + compat_irq_chip_set_default_handler(desc); desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_INPROGRESS); @@ -262,7 +283,9 @@ int setup_irq(unsigned int irq, struct irqaction *new) desc->chip->startup(irq); else desc->chip->enable(irq); - } + } else + /* Undo nested disables: */ + desc->depth = 1; } spin_unlock_irqrestore(&desc->lock, flags); -- cgit From f1c2662cbc6a0a9772655649bdf579803d33470b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:24:57 -0700 Subject: [PATCH] genirq: cleanup: no_irq_type -> no_irq_chip rename Rename no_irq_type to no_irq_chip. 
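The "none" chip used for unclaimed interrupts boils down to one shared no-op plugged into every slot that needs no work, with only the acknowledge path reporting the stray vector. A compact standalone sketch of that shape, with simplified field and function names rather than the kernel's:

#include <stdio.h>

/* Simplified controller operations table. */
struct my_chip {
        const char *name;
        unsigned int (*startup)(unsigned int irq);
        void (*shutdown)(unsigned int irq);
        void (*enable)(unsigned int irq);
        void (*disable)(unsigned int irq);
        void (*ack)(unsigned int irq);
        void (*end)(unsigned int irq);
};

/* Shared NOP helpers instead of one stub per slot. */
static void noop(unsigned int irq) { }
static unsigned int noop_ret(unsigned int irq) { return 0; }

/* Only the ack path complains, mirroring the ack_bad() idea. */
static void ack_bad(unsigned int irq)
{
        fprintf(stderr, "unexpected IRQ %u\n", irq);
}

static struct my_chip no_controller = {
        .name     = "none",
        .startup  = noop_ret,
        .shutdown = noop,
        .enable   = noop,
        .disable  = noop,
        .ack      = ack_bad,
        .end      = noop,
};

int main(void)
{
        no_controller.enable(3);        /* silently ignored */
        no_controller.ack(3);           /* reports the stray vector */
        return 0;
}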
Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 6 +++--- kernel/irq/manage.c | 2 +- kernel/irq/proc.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3a2dbcc9e21..01fc7f79d74 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -45,7 +45,7 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, - .chip = &no_irq_type, + .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, .lock = SPIN_LOCK_UNLOCKED, @@ -79,8 +79,8 @@ static unsigned int noop_ret(unsigned int irq) /* * Generic no controller implementation */ -struct hw_interrupt_type no_irq_type = { - .typename = "none", +struct irq_chip no_irq_chip = { + .name = "none", .startup = noop_ret, .shutdown = noop, .enable = noop, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 627d401c297..9eb1d518ee1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -199,7 +199,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) if (irq >= NR_IRQS) return -EINVAL; - if (desc->chip == &no_irq_type) + if (desc->chip == &no_irq_chip) return -ENOSYS; /* * Some drivers like serial.c use request_irq() heavily, diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index f60b85b61e8..607c7809ad0 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -117,7 +117,7 @@ void register_irq_proc(unsigned int irq) char name [MAX_NAMELEN]; if (!root_irq_dir || - (irq_desc[irq].chip == &no_irq_type) || + (irq_desc[irq].chip == &no_irq_chip) || irq_desc[irq].dir) return; -- cgit From 43f7775944e40221827e4b3aec43824aa4c4e4a9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:24:58 -0700 Subject: [PATCH] genirq: more verbose debugging on unexpected IRQ vectors One frequent sign of IRQ handling bugs is the appearance of unexpected vectors. Print out all the IRQ state in that case. We dont want this patch upstream, but it is useful during initial testing. 
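The extra debugging relies on the preprocessor stringizing trick to print the name of every status flag that is set. A self-contained sketch of the same idea, with made-up flag bits and plain printf() in place of printk():

#include <stdio.h>

#define F_DISABLED   0x01
#define F_PENDING    0x02
#define F_MASKED     0x04

struct my_desc {
        unsigned int status;
        unsigned int depth;
};

/* Print the flag name whenever the corresponding bit is set;
 * #f turns the macro argument into the string that gets printed. */
#define P(f) do { if (desc->status & f) printf("%12s set\n", #f); } while (0)

static void dump_desc(unsigned int irq, const struct my_desc *desc)
{
        printf("irq %u, depth %u, status %#x\n", irq, desc->depth, desc->status);
        P(F_DISABLED);
        P(F_PENDING);
        P(F_MASKED);
}

#undef P

int main(void)
{
        struct my_desc d = { F_DISABLED | F_PENDING, 1 };

        dump_desc(7, &d);
        return 0;
}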
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 2 ++ kernel/irq/internals.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 01fc7f79d74..5a360dd4331 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -24,6 +24,7 @@ void fastcall handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) { + print_irq_desc(irq, desc); kstat_this_cpu.irqs[irq]++; ack_bad_irq(irq); } @@ -61,6 +62,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { */ static void ack_bad(unsigned int irq) { + print_irq_desc(irq, irq_desc + irq); ack_bad_irq(irq); } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 2ba8ae3c8e9..08a849a2244 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -22,3 +22,43 @@ static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif +/* + * Debugging printout: + */ + +#include + +#define P(f) if (desc->status & f) printk("%14s set\n", #f) + +static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", + irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); + printk("->handle_irq(): %p, ", desc->handle_irq); + print_symbol("%s\n", (unsigned long)desc->handle_irq); + printk("->chip(): %p, ", desc->chip); + print_symbol("%s\n", (unsigned long)desc->chip); + printk("->action(): %p\n", desc->action); + if (desc->action) { + printk("->action->handler(): %p, ", desc->action->handler); + print_symbol("%s\n", (unsigned long)desc->action->handler); + } + + P(IRQ_INPROGRESS); + P(IRQ_DISABLED); + P(IRQ_PENDING); + P(IRQ_REPLAY); + P(IRQ_AUTODETECT); + P(IRQ_WAITING); + P(IRQ_LEVEL); + P(IRQ_MASKED); +#ifdef CONFIG_IRQ_PER_CPU + P(IRQ_PER_CPU); +#endif + P(IRQ_NOPROBE); + P(IRQ_NOREQUEST); + P(IRQ_NOAUTOEN); +} + +#undef P + -- cgit From 98bb244b685eb2a297aa60fa2e5c0631f95828e1 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 29 Jun 2006 02:25:01 -0700 Subject: [PATCH] genirq: fasteoi handler: handle interrupt disabling Note when a disable interrupt happened with the fasteoi handler as well so that delayed disable can be implemented with fasteoi-type controllers. Signed-off-by: Benjamin Herrenschmidt Acked-by: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/chip.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 8736f2ca8a3..a99047a324e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -311,10 +311,13 @@ handle_fastack_irq(unsigned int irq, struct irq_desc *desc, * keep it masked and get out of here */ action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!action || (desc->status & IRQ_DISABLED))) { + desc->status |= IRQ_PENDING; goto out; + } desc->status |= IRQ_INPROGRESS; + desc->status &= ~IRQ_PENDING; spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, regs, action); -- cgit From 47c2a3aa4475d27073dd3c7e183fcc13f495c8f5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 29 Jun 2006 02:25:03 -0700 Subject: [PATCH] genirq: add chip->eoi(), fastack -> fasteoi Clean up the fastack concept by turning it into fasteoi and introducing the ->eoi() method for chips. 
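In rough terms the two flows differ only in which chip callbacks bracket the device handler: the level flow keeps the line masked across the handler and unmasks afterwards, while the fasteoi flow runs the handler and issues a single end-of-interrupt. A toy standalone sketch of that difference, with the chip callbacks reduced to printouts:

#include <stdio.h>

/* Stand-ins for the chip callbacks involved in the two flows. */
static void chip_mask_ack(void) { printf("  mask+ack\n"); }
static void chip_unmask(void)   { printf("  unmask\n"); }
static void chip_eoi(void)      { printf("  eoi\n"); }
static void run_handler(void)   { printf("  device handler\n"); }

/* Level flow: mask+ack up front, unmask after the handler ran. */
static void level_flow(void)
{
        printf("level:\n");
        chip_mask_ack();
        run_handler();
        chip_unmask();
}

/* Fast-EOI flow: the hardware tracks the flow, one eoi at the end. */
static void fasteoi_flow(void)
{
        printf("fasteoi:\n");
        run_handler();
        chip_eoi();
}

int main(void)
{
        level_flow();
        fasteoi_flow();
        return 0;
}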
This also allows the cleanup of an i386 EOI quirk - now the quirk is cleanly separated from the pure ACK implementation. Signed-off-by: Ingo Molnar Cc: Benjamin Herrenschmidt Cc: Roland Dreier Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/chip.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a99047a324e..4a0952d9458 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -280,18 +280,18 @@ out: } /** - * handle_fastack_irq - irq handler for transparent controllers + * handle_fasteoi_irq - irq handler for transparent controllers * @irq: the interrupt number * @desc: the interrupt description structure for this irq * @regs: pointer to a register structure * - * Only a single callback will be issued to the chip: an ->ack() + * Only a single callback will be issued to the chip: an ->eoi() * call when the interrupt has been serviced. This enables support * for modern forms of interrupt handlers, which handle the flow * details in hardware, transparently. */ void fastcall -handle_fastack_irq(unsigned int irq, struct irq_desc *desc, +handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) { unsigned int cpu = smp_processor_id(); @@ -327,10 +327,7 @@ handle_fastack_irq(unsigned int irq, struct irq_desc *desc, spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; out: - if (!(desc->status & IRQ_DISABLED)) - desc->chip->ack(irq); - else - desc->chip->mask(irq); + desc->chip->eoi(irq); spin_unlock(&desc->lock); } @@ -510,19 +507,19 @@ handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *, struct pt_regs *)) { if (handle == handle_level_irq) - return "level "; - if (handle == handle_fastack_irq) - return "level "; + return "level "; + if (handle == handle_fasteoi_irq) + return "fasteoi"; if (handle == handle_edge_irq) - return "edge "; + return "edge "; if (handle == handle_simple_irq) - return "simple"; + return "simple "; #ifdef CONFIG_SMP if (handle == handle_percpu_irq) - return "percpu"; + return "percpu "; #endif if (handle == handle_bad_irq) - return "bad "; + return "bad "; return NULL; } -- cgit From c7bdb545d23026b18be53289fd866d1ac07f5f8c Mon Sep 17 00:00:00 2001 From: Darrel Goeddel Date: Tue, 27 Jun 2006 13:26:11 -0700 Subject: [NETLINK]: Encapsulate eff_cap usage within security framework. This patch encapsulates the usage of eff_cap (in netlink_skb_params) within the security framework by extending security_netlink_recv to include a required capability parameter and converting all direct usage of eff_caps outside of the lsm modules to use the interface. It also updates the SELinux implementation of the security_netlink_send and security_netlink_recv hooks to take advantage of the sid in the netlink_skb_params struct. This also enables SELinux to perform auditing of netlink capability checks. Please apply, for 2.6.18 if possible. Signed-off-by: Darrel Goeddel Signed-off-by: Stephen Smalley Acked-by: James Morris Signed-off-by: David S. Miller --- kernel/audit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 82443fb433e..d417ca1db79 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -445,7 +445,7 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi, * Check for appropriate CAP_AUDIT_ capabilities on incoming audit * control messages. 
*/ -static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) +static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) { int err = 0; @@ -459,13 +459,13 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) case AUDIT_DEL: case AUDIT_DEL_RULE: case AUDIT_SIGNAL_INFO: - if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) + if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) err = -EPERM; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: - if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) + if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) err = -EPERM; break; default: /* bad msg */ @@ -488,7 +488,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) char *ctx; u32 len; - err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); + err = audit_netlink_ok(skb, msg_type); if (err) return err; -- cgit From 80f7228b59e4bbe9d840af3ff0f2fe480d6e7c79 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 30 Jun 2006 18:27:16 +0200 Subject: typo fixes: occuring -> occurring Signed-off-by: Adrian Bunk --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1535af3a912..ee8c6dfd925 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1064,7 +1064,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) } /* - * Frequency meter - How fast is some event occuring? + * Frequency meter - How fast is some event occurring? * * These routines manage a digitally filtered, constant time based, * event frequency meter. There are four routines: -- cgit From e02169b682bc448ccdc819dc8639ed34a23cedd8 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Fri, 30 Jun 2006 18:59:59 +0200 Subject: remove obsolete swsusp_encrypt Remove SWSUSP_ENCRYPT config option; it is no longer implemented. Signed-off-by: Pavel Machek Signed-off-by: Adrian Bunk --- kernel/power/Kconfig | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 857b4fa0912..ae44a70aae8 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -100,18 +100,6 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. -config SWSUSP_ENCRYPT - bool "Encrypt suspend image" - depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y) - default "" - ---help--- - To prevent data gathering from swap after resume you can encrypt - the suspend image with a temporary key that is deleted on - resume. - - Note that the temporary key is stored unencrypted on disk while the - system is suspended. 
- config SUSPEND_SMP bool depends on HOTPLUG_CPU && X86 && PM -- cgit From 6ab3d5624e172c553004ecc862bfeac16d9d68b7 Mon Sep 17 00:00:00 2001 From: Jörn Engel Date: Fri, 30 Jun 2006 19:25:36 +0200 Subject: Remove obsolete #include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jörn Engel Signed-off-by: Adrian Bunk --- kernel/acct.c | 1 - kernel/configs.c | 1 - kernel/cpuset.c | 1 - kernel/exec_domain.c | 1 - kernel/exit.c | 1 - kernel/fork.c | 1 - kernel/irq/manage.c | 1 - kernel/kmod.c | 1 - kernel/ksysfs.c | 1 - kernel/module.c | 1 - kernel/panic.c | 1 - kernel/params.c | 1 - kernel/printk.c | 1 - kernel/profile.c | 1 - kernel/resource.c | 1 - kernel/signal.c | 1 - kernel/spinlock.c | 1 - kernel/sys.c | 1 - kernel/sysctl.c | 1 - kernel/wait.c | 1 - 20 files changed, 20 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 126ca43d5d2..f18e0b8df3e 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -43,7 +43,6 @@ * a struct file opened for write. Fixed. 2/6/2000, AV. */ -#include #include #include #include diff --git a/kernel/configs.c b/kernel/configs.c index 009e1ebdcb8..f9e31974f4a 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -23,7 +23,6 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include #include #include #include diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ee8c6dfd925..c232dc07743 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -18,7 +18,6 @@ * distribution for more details. */ -#include #include #include #include diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index c01cead2cfd..3c2eaea66b1 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -7,7 +7,6 @@ * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org) */ -#include #include #include #include diff --git a/kernel/exit.c b/kernel/exit.c index ab06b9f88f6..7f7ef225855 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -4,7 +4,6 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ -#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 628198a4f28..9064bf9e131 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -11,7 +11,6 @@ * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' */ -#include #include #include #include diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9eb1d518ee1..b7117e81ac5 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -7,7 +7,6 @@ * This file contains driver APIs to the irq subsystem. */ -#include #include #include #include diff --git a/kernel/kmod.c b/kernel/kmod.c index 20a997c73c3..1b7157af051 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -20,7 +20,6 @@ */ #define __KERNEL_SYSCALLS__ -#include #include #include #include diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 9e28478a17a..e0ffe4ab091 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -8,7 +8,6 @@ * */ -#include #include #include #include diff --git a/kernel/module.c b/kernel/module.c index 99c022ac3d2..281172f01e9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -16,7 +16,6 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include #include #include #include diff --git a/kernel/panic.c b/kernel/panic.c index cc2a4c9c36a..ab13f0f668b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -8,7 +8,6 @@ * This function is used through-out the kernel (including mm and fs) * to indicate a major problem. 
*/ -#include #include #include #include diff --git a/kernel/params.c b/kernel/params.c index af43ecdc8d9..91aea7aa532 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -15,7 +15,6 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include #include #include #include diff --git a/kernel/printk.c b/kernel/printk.c index 95b7fe17f12..39ae24d2a41 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -26,7 +26,6 @@ #include #include #include /* For in_interrupt() */ -#include #include #include #include diff --git a/kernel/profile.c b/kernel/profile.c index 5a730fdb1a2..d5bd75e7501 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -13,7 +13,6 @@ * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 */ -#include #include #include #include diff --git a/kernel/resource.c b/kernel/resource.c index bf1130d81b7..129cf046e56 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -7,7 +7,6 @@ * Arbitrary resource management. */ -#include #include #include #include diff --git a/kernel/signal.c b/kernel/signal.c index 52adf53929f..ecb610d0dc6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -10,7 +10,6 @@ * to allow signals to be sent reliably. */ -#include #include #include #include diff --git a/kernel/spinlock.c b/kernel/spinlock.c index d1b810782bc..b31e54eadf5 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -9,7 +9,6 @@ * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) */ -#include #include #include #include diff --git a/kernel/sys.c b/kernel/sys.c index 2d5179c67ce..dbb3b9c7ea6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -4,7 +4,6 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ -#include #include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 93a2c539864..6399c91e7b9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -18,7 +18,6 @@ * Removed it and replaced it with older style, 03/23/00, Bill Wendling */ -#include #include #include #include diff --git a/kernel/wait.c b/kernel/wait.c index 791681cfea9..5985d866531 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -3,7 +3,6 @@ * * (C) 2004 William Irwin, Oracle */ -#include #include #include #include -- cgit From 34aa1330f9b3c5783d269851d467326525207422 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:37 -0700 Subject: [PATCH] zoned vm counters: zone_reclaim: remove /proc/sys/vm/zone_reclaim_interval The zone_reclaim_interval was necessary because we were not able to determine how many unmapped pages exist in a zone. Therefore we had to scan in intervals to figure out if any pages were unmapped. With the zoned counters and NR_ANON_PAGES we now know the number of pagecache pages and the number of mapped pages in a zone. So we can simply skip the reclaim if there is an insufficient number of unmapped pages. We use SWAP_CLUSTER_MAX as the boundary. Drop all support for /proc/sys/vm/zone_reclaim_interval. 
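The decision the zoned counters enable can be sketched in standalone C; the counter fields and the threshold constant below are placeholders patterned on the pagecache/mapped counts and SWAP_CLUSTER_MAX, not the exact kernel expressions:

#include <stdio.h>

#define MY_SWAP_CLUSTER_MAX 32  /* placeholder for the real constant */

/* Simplified per-zone counters standing in for the zoned VM counters. */
struct my_zone {
        unsigned long pagecache_pages;
        unsigned long mapped_pages;
};

/* Skip reclaim unless enough unmapped pagecache could be freed. */
static int worth_reclaiming(const struct my_zone *z)
{
        unsigned long unmapped = 0;

        if (z->pagecache_pages > z->mapped_pages)
                unmapped = z->pagecache_pages - z->mapped_pages;

        return unmapped > MY_SWAP_CLUSTER_MAX;
}

int main(void)
{
        struct my_zone busy = { 4096, 4090 };
        struct my_zone idle = { 4096, 128 };

        printf("busy zone: reclaim=%d\n", worth_reclaiming(&busy));
        printf("idle zone: reclaim=%d\n", worth_reclaiming(&idle));
        return 0;
}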
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 93a2c539864..ee0db45e243 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -933,15 +933,6 @@ static ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, - { - .ctl_name = VM_ZONE_RECLAIM_INTERVAL, - .procname = "zone_reclaim_interval", - .data = &zone_reclaim_interval, - .maxlen = sizeof(zone_reclaim_interval), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, - }, #endif #ifdef CONFIG_X86_32 { -- cgit From 8f95dc58d0505516f5cc212a966aea2f2cdb5e44 Mon Sep 17 00:00:00 2001 From: David Quigley Date: Fri, 30 Jun 2006 01:55:47 -0700 Subject: [PATCH] SELinux: add security hook call to kill_proc_info_as_uid This patch adds a call to the extended security_task_kill hook introduced by the prior patch to the kill_proc_info_as_uid function so that these signals can be properly mediated by security modules. It also updates the existing hook call in check_kill_permission. Signed-off-by: David Quigley Signed-off-by: James Morris Cc: Stephen Smalley Cc: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 52adf53929f..477d11adf3d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -584,7 +584,7 @@ static int check_kill_permission(int sig, struct siginfo *info, && !capable(CAP_KILL)) return error; - error = security_task_kill(t, info, sig); + error = security_task_kill(t, info, sig, 0); if (!error) audit_signal_info(sig, t); /* Let audit system see the signal */ return error; @@ -1107,7 +1107,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) /* like kill_proc_info(), but doesn't use uid/euid of "current" */ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, - uid_t uid, uid_t euid) + uid_t uid, uid_t euid, u32 secid) { int ret = -EINVAL; struct task_struct *p; @@ -1127,6 +1127,9 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, ret = -EPERM; goto out_unlock; } + ret = security_task_kill(p, info, sig, secid); + if (ret) + goto out_unlock; if (sig && p->sighand) { unsigned long flags; spin_lock_irqsave(&p->sighand->siglock, flags); -- cgit From e7b384043e27bed4f23b108481b99c518dd01a01 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 30 Jun 2006 01:56:00 -0700 Subject: [PATCH] cond_resched() fix Fix a bug identified by Zou Nan hai : If the system is in state SYSTEM_BOOTING, and need_resched() is true, cond_resched() returns true even though it didn't reschedule. Consequently need_resched() remains true and JBD locks up. Fix that by teaching cond_resched() to only return true if it really did call schedule(). cond_resched_lock() and cond_resched_softirq() have a problem too. If we're in SYSTEM_BOOTING state and need_resched() is true, these functions will drop the lock and will then try to call schedule(), but the SYSTEM_BOOTING state will prevent schedule() from being called. So on return, need_resched() will still be true, but cond_resched_lock() has to return 1 to tell the caller that the lock was dropped. The caller will probably lock up. Bottom line: if these functions dropped the lock, they _must_ call schedule() to clear need_resched(). Make it so. Also, uninline __cond_resched(). 
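The invariant being restored here — report success only when a reschedule really happened, so callers do not loop forever on need_resched() — can be modelled outside the kernel as follows; the flags and helpers are simulated, not the scheduler's:

#include <stdio.h>

/* Simulated state standing in for need_resched()/system_state. */
static int need_resched_flag = 1;
static int booting = 1;

static int resched_legal(void)
{
        return !booting;        /* e.g. no rescheduling while still booting */
}

static void do_schedule(void)
{
        need_resched_flag = 0;  /* scheduling clears the flag */
}

/* Return 1 only if we actually rescheduled; otherwise the caller must
 * not assume the need-resched condition was cleared. */
static int my_cond_resched(void)
{
        if (need_resched_flag && resched_legal()) {
                do_schedule();
                return 1;
        }
        return 0;
}

int main(void)
{
        printf("while booting: %d (flag still %d)\n",
               my_cond_resched(), need_resched_flag);
        booting = 0;
        printf("after boot:    %d (flag now %d)\n",
               my_cond_resched(), need_resched_flag);
        return 0;
}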
It's largeish, and slowpath. Acked-by: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2629c1711fd..d5e37072ea5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4386,7 +4386,16 @@ asmlinkage long sys_sched_yield(void) return 0; } -static inline void __cond_resched(void) +static inline int __resched_legal(void) +{ + if (unlikely(preempt_count())) + return 0; + if (unlikely(system_state != SYSTEM_RUNNING)) + return 0; + return 1; +} + +static void __cond_resched(void) { #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP __might_sleep(__FILE__, __LINE__); @@ -4396,10 +4405,6 @@ static inline void __cond_resched(void) * PREEMPT_ACTIVE, which could trigger a second * cond_resched() call. */ - if (unlikely(preempt_count())) - return; - if (unlikely(system_state != SYSTEM_RUNNING)) - return; do { add_preempt_count(PREEMPT_ACTIVE); schedule(); @@ -4409,13 +4414,12 @@ static inline void __cond_resched(void) int __sched cond_resched(void) { - if (need_resched()) { + if (need_resched() && __resched_legal()) { __cond_resched(); return 1; } return 0; } - EXPORT_SYMBOL(cond_resched); /* @@ -4436,7 +4440,7 @@ int cond_resched_lock(spinlock_t *lock) ret = 1; spin_lock(lock); } - if (need_resched()) { + if (need_resched() && __resched_legal()) { _raw_spin_unlock(lock); preempt_enable_no_resched(); __cond_resched(); @@ -4445,14 +4449,13 @@ int cond_resched_lock(spinlock_t *lock) } return ret; } - EXPORT_SYMBOL(cond_resched_lock); int __sched cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (need_resched()) { + if (need_resched() && __resched_legal()) { __local_bh_enable(); __cond_resched(); local_bh_disable(); @@ -4460,10 +4463,8 @@ int __sched cond_resched_softirq(void) } return 0; } - EXPORT_SYMBOL(cond_resched_softirq); - /** * yield - yield the current processor to other threads. * -- cgit From 5adc8a6adc91c4c85a64c75a70a619fffc924817 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Wed, 14 Jun 2006 18:45:21 -0400 Subject: [PATCH] add rule filterkey Add support for a rule key, which can be used to tie audit records to audit rules. This is useful when a watched file is accessed through a link or symlink, as well as for general audit log analysis. Because this patch uses a string key instead of an integer key, there is a bit of extra overhead to do the kstrdup() when a rule fires. However, we're also allocating memory for the audit record buffer, so it's probably not that significant. I went ahead with a string key because it seems more user-friendly. Note that the user must ensure that filterkeys are unique. The kernel only checks for duplicate rules. 
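A standalone sketch of the rule/record tie-in, using plain strdup() and printf() in place of kstrdup() and the audit log, and made-up type names: when a rule carrying a key matches, the key string is duplicated into the per-event context and emitted with the record.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct my_rule {
        char *filterkey;        /* optional, set by the admin */
};

struct my_context {
        char *filterkey;        /* copy taken when the rule fires */
};

/* On a rule match, remember the key so the record can carry it. */
static void rule_matched(struct my_context *ctx, const struct my_rule *rule)
{
        if (rule->filterkey && !ctx->filterkey)
                ctx->filterkey = strdup(rule->filterkey);
}

static void log_record(const struct my_context *ctx)
{
        printf("... key=%s\n", ctx->filterkey ? ctx->filterkey : "(null)");
}

int main(void)
{
        struct my_rule rule = { "shadow-watch" };
        struct my_context ctx = { NULL };

        rule_matched(&ctx, &rule);
        log_record(&ctx);
        free(ctx.filterkey);
        return 0;
}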
Signed-off-by: Amy Griffis --- kernel/audit.h | 1 + kernel/auditfilter.c | 95 ++++++++++++++++++++++++++++++++-------------------- kernel/auditsc.c | 15 +++++++++ 3 files changed, 75 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 8323e4132a3..6aa33b848cf 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -81,6 +81,7 @@ struct audit_krule { u32 mask[AUDIT_BITMASK_SIZE]; u32 buflen; /* for data alloc on list rules */ u32 field_count; + char *filterkey; /* ties events to rules */ struct audit_field *fields; struct audit_field *inode_f; /* quick access to an inode field */ struct audit_watch *watch; /* associated watch */ diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 4c99d2c586e..e98db08fc6d 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -141,6 +141,7 @@ static inline void audit_free_rule(struct audit_entry *e) selinux_audit_rule_free(f->se_rule); } kfree(e->rule.fields); + kfree(e->rule.filterkey); kfree(e); } @@ -511,6 +512,16 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, if (err) goto exit_free; break; + case AUDIT_FILTERKEY: + err = -EINVAL; + if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) + goto exit_free; + str = audit_unpack_string(&bufp, &remain, f->val); + if (IS_ERR(str)) + goto exit_free; + entry->rule.buflen += f->val; + entry->rule.filterkey = str; + break; default: goto exit_free; } @@ -612,6 +623,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->buflen += data->values[i] = audit_pack_string(&bufp, krule->watch->path); break; + case AUDIT_FILTERKEY: + data->buflen += data->values[i] = + audit_pack_string(&bufp, krule->filterkey); + break; default: data->values[i] = f->val; } @@ -651,6 +666,11 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) if (strcmp(a->watch->path, b->watch->path)) return 1; break; + case AUDIT_FILTERKEY: + /* both filterkeys exist based on above type compare */ + if (strcmp(a->filterkey, b->filterkey)) + return 1; + break; default: if (a->fields[i].val != b->fields[i].val) return 1; @@ -730,6 +750,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, u32 fcount = old->field_count; struct audit_entry *entry; struct audit_krule *new; + char *fk; int i, err = 0; entry = audit_init_entry(fcount); @@ -760,6 +781,13 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, case AUDIT_SE_CLR: err = audit_dupe_selinux_field(&new->fields[i], &old->fields[i]); + break; + case AUDIT_FILTERKEY: + fk = kstrdup(old->filterkey, GFP_KERNEL); + if (unlikely(!fk)) + err = -ENOMEM; + else + new->filterkey = fk; } if (err) { audit_free_rule(entry); @@ -1245,6 +1273,34 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) skb_queue_tail(q, skb); } +/* Log rule additions and removals */ +static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, + struct audit_krule *rule, int res) +{ + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + if (!ab) + return; + audit_log_format(ab, "auid=%u", loginuid); + if (sid) { + char *ctx = NULL; + u32 len; + if (selinux_ctxid_to_string(sid, &ctx, &len)) + audit_log_format(ab, " ssid=%u", sid); + else + audit_log_format(ab, " subj=%s", ctx); + kfree(ctx); + } + audit_log_format(ab, " %s rule key=", action); + if (rule->filterkey) + audit_log_untrustedstring(ab, rule->filterkey); + else + audit_log_format(ab, "(null)"); + 
audit_log_format(ab, " list=%d res=%d", rule->listnr, res); + audit_log_end(ab); +} + /** * audit_receive_filter - apply all rules to the specified message type * @type: audit message type @@ -1304,24 +1360,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_add_rule(entry, &audit_filter_list[entry->rule.listnr]); - - if (sid) { - char *ctx = NULL; - u32 len; - if (selinux_ctxid_to_string(sid, &ctx, &len)) { - /* Maybe call audit_panic? */ - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "auid=%u ssid=%u add rule to list=%d res=%d", - loginuid, sid, entry->rule.listnr, !err); - } else - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "auid=%u subj=%s add rule to list=%d res=%d", - loginuid, ctx, entry->rule.listnr, !err); - kfree(ctx); - } else - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "auid=%u add rule to list=%d res=%d", - loginuid, entry->rule.listnr, !err); + audit_log_rule_change(loginuid, sid, "add", &entry->rule, !err); if (err) audit_free_rule(entry); @@ -1337,24 +1376,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_del_rule(entry, &audit_filter_list[entry->rule.listnr]); - - if (sid) { - char *ctx = NULL; - u32 len; - if (selinux_ctxid_to_string(sid, &ctx, &len)) { - /* Maybe call audit_panic? */ - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "auid=%u ssid=%u remove rule from list=%d res=%d", - loginuid, sid, entry->rule.listnr, !err); - } else - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "auid=%u subj=%s remove rule from list=%d res=%d", - loginuid, ctx, entry->rule.listnr, !err); - kfree(ctx); - } else - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "auid=%u remove rule from list=%d res=%d", - loginuid, entry->rule.listnr, !err); + audit_log_rule_change(loginuid, sid, "remove", &entry->rule, + !err); audit_free_rule(entry); break; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index dc5e3f01efe..31665785516 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -186,6 +186,7 @@ struct audit_context { int auditable; /* 1 if record should be written */ int name_count; struct audit_names names[AUDIT_NAMES]; + char * filterkey; /* key for rule that triggered record */ struct dentry * pwd; struct vfsmount * pwdmnt; struct audit_context *previous; /* For nested syscalls */ @@ -348,11 +349,17 @@ static int audit_filter_rules(struct task_struct *tsk, if (ctx) result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); break; + case AUDIT_FILTERKEY: + /* ignore this field for filtering */ + result = 1; + break; } if (!result) return 0; } + if (rule->filterkey) + ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; @@ -627,6 +634,7 @@ static inline void audit_free_context(struct audit_context *context) } audit_free_names(context); audit_free_aux(context); + kfree(context->filterkey); kfree(context); context = previous; } while (context); @@ -735,6 +743,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->euid, context->suid, context->fsuid, context->egid, context->sgid, context->fsgid, tty); audit_log_task_info(ab, tsk); + if (context->filterkey) { + audit_log_format(ab, " key="); + audit_log_untrustedstring(ab, context->filterkey); + } else + audit_log_format(ab, " key=(null)"); audit_log_end(ab); for (aux = context->aux; aux; aux = aux->next) { @@ -1060,6 +1073,8 @@ void 
audit_syscall_exit(int valid, long return_code) } else { audit_free_names(context); audit_free_aux(context); + kfree(context->filterkey); + context->filterkey = NULL; tsk->audit_context = context; } } -- cgit From 3a6b9f85c641a3b89420b0c8150ed377526a1fe1 Mon Sep 17 00:00:00 2001 From: Darrel Goeddel Date: Thu, 29 Jun 2006 16:56:39 -0500 Subject: [PATCH] audit: rename AUDIT_SE_* constants This patch renames some audit constant definitions and adds additional definitions used by the following patch. The renaming avoids ambiguity with respect to the new definitions. Signed-off-by: Darrel Goeddel include/linux/audit.h | 15 ++++++++---- kernel/auditfilter.c | 50 ++++++++++++++++++++--------------------- kernel/auditsc.c | 10 ++++---- security/selinux/ss/services.c | 32 +++++++++++++------------- 4 files changed, 56 insertions(+), 51 deletions(-) Signed-off-by: Al Viro --- kernel/auditfilter.c | 50 +++++++++++++++++++++++++------------------------- kernel/auditsc.c | 10 +++++----- 2 files changed, 30 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index e98db08fc6d..40a9931a13e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -470,11 +470,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_ARG2: case AUDIT_ARG3: break; - case AUDIT_SE_USER: - case AUDIT_SE_ROLE: - case AUDIT_SE_TYPE: - case AUDIT_SE_SEN: - case AUDIT_SE_CLR: + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: str = audit_unpack_string(&bufp, &remain, f->val); if (IS_ERR(str)) goto exit_free; @@ -611,11 +611,11 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->fields[i] = f->type; data->fieldflags[i] = f->op; switch(f->type) { - case AUDIT_SE_USER: - case AUDIT_SE_ROLE: - case AUDIT_SE_TYPE: - case AUDIT_SE_SEN: - case AUDIT_SE_CLR: + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: data->buflen += data->values[i] = audit_pack_string(&bufp, f->se_str); break; @@ -654,11 +654,11 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) return 1; switch(a->fields[i].type) { - case AUDIT_SE_USER: - case AUDIT_SE_ROLE: - case AUDIT_SE_TYPE: - case AUDIT_SE_SEN: - case AUDIT_SE_CLR: + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) return 1; break; @@ -774,11 +774,11 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, * the originals will all be freed when the old rule is freed. 
*/ for (i = 0; i < fcount; i++) { switch (new->fields[i].type) { - case AUDIT_SE_USER: - case AUDIT_SE_ROLE: - case AUDIT_SE_TYPE: - case AUDIT_SE_SEN: - case AUDIT_SE_CLR: + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: err = audit_dupe_selinux_field(&new->fields[i], &old->fields[i]); break; @@ -1537,11 +1537,11 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule) for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; switch (f->type) { - case AUDIT_SE_USER: - case AUDIT_SE_ROLE: - case AUDIT_SE_TYPE: - case AUDIT_SE_SEN: - case AUDIT_SE_CLR: + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: return 1; } } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 31665785516..1d24fade17e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -321,11 +321,11 @@ static int audit_filter_rules(struct task_struct *tsk, if (ctx) result = audit_comparator(ctx->loginuid, f->op, f->val); break; - case AUDIT_SE_USER: - case AUDIT_SE_ROLE: - case AUDIT_SE_TYPE: - case AUDIT_SE_SEN: - case AUDIT_SE_CLR: + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: /* NOTE: this may return negative values indicating a temporary error. We simply treat this as a match for now to avoid losing information that -- cgit From 6e5a2d1d32596850a0ebf7fb3e54c0d69901dabd Mon Sep 17 00:00:00 2001 From: Darrel Goeddel Date: Thu, 29 Jun 2006 16:57:08 -0500 Subject: [PATCH] audit: support for object context filters This patch introduces object audit filters based on the elements of the SELinux context. Signed-off-by: Darrel Goeddel Acked-by: Stephen Smalley kernel/auditfilter.c | 25 +++++++++++++++++++++++++ kernel/auditsc.c | 40 ++++++++++++++++++++++++++++++++++++++++ security/selinux/ss/services.c | 18 +++++++++++++++++- 3 files changed, 82 insertions(+), 1 deletion(-) Signed-off-by: Al Viro --- kernel/auditfilter.c | 25 +++++++++++++++++++++++++ kernel/auditsc.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 40a9931a13e..7f2ea8b84a2 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -475,6 +475,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_USER: + case AUDIT_OBJ_ROLE: + case AUDIT_OBJ_TYPE: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: str = audit_unpack_string(&bufp, &remain, f->val); if (IS_ERR(str)) goto exit_free; @@ -616,6 +621,11 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_USER: + case AUDIT_OBJ_ROLE: + case AUDIT_OBJ_TYPE: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: data->buflen += data->values[i] = audit_pack_string(&bufp, f->se_str); break; @@ -659,6 +669,11 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_USER: + case AUDIT_OBJ_ROLE: + case AUDIT_OBJ_TYPE: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) return 1; break; @@ -779,6 +794,11 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, case AUDIT_SUBJ_TYPE: 
case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_USER: + case AUDIT_OBJ_ROLE: + case AUDIT_OBJ_TYPE: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: err = audit_dupe_selinux_field(&new->fields[i], &old->fields[i]); break; @@ -1542,6 +1562,11 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule) case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_USER: + case AUDIT_OBJ_ROLE: + case AUDIT_OBJ_TYPE: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: return 1; } } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1d24fade17e..ae40ac8c39e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -342,6 +342,46 @@ static int audit_filter_rules(struct task_struct *tsk, ctx); } break; + case AUDIT_OBJ_USER: + case AUDIT_OBJ_ROLE: + case AUDIT_OBJ_TYPE: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: + /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR + also applies here */ + if (f->se_rule) { + /* Find files that match */ + if (name) { + result = selinux_audit_rule_match( + name->osid, f->type, f->op, + f->se_rule, ctx); + } else if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (selinux_audit_rule_match( + ctx->names[j].osid, + f->type, f->op, + f->se_rule, ctx)) { + ++result; + break; + } + } + } + /* Find ipc objects that match */ + if (ctx) { + struct audit_aux_data *aux; + for (aux = ctx->aux; aux; + aux = aux->next) { + if (aux->type == AUDIT_IPC) { + struct audit_aux_data_ipcctl *axi = (void *)aux; + if (selinux_audit_rule_match(axi->osid, f->type, f->op, f->se_rule, ctx)) { + ++result; + break; + } + } + } + } + } + break; case AUDIT_ARG0: case AUDIT_ARG1: case AUDIT_ARG2: -- cgit From b915543b46a2aa599fdd2169e51bcfd88812a12b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 1 Jul 2006 03:56:16 -0400 Subject: [PATCH] audit syscall classes Allow to tie upper bits of syscall bitmap in audit rules to kernel-defined sets of syscalls. Infrastructure, a couple of classes (with 32bit counterparts for biarch targets) and actual tie-in on i386, amd64 and ia64. Signed-off-by: Al Viro --- kernel/auditfilter.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 7f2ea8b84a2..5b4e16276ca 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -279,6 +279,29 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len, return 0; } +static __u32 *classes[AUDIT_SYSCALL_CLASSES]; + +int __init audit_register_class(int class, unsigned *list) +{ + __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); + if (!p) + return -ENOMEM; + while (*list != ~0U) { + unsigned n = *list++; + if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) { + kfree(p); + return -EINVAL; + } + p[AUDIT_WORD(n)] |= AUDIT_BIT(n); + } + if (class >= AUDIT_SYSCALL_CLASSES || classes[class]) { + kfree(p); + return -EINVAL; + } + classes[class] = p; + return 0; +} + /* Common user-space to kernel rule translation. 
*/ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) { @@ -322,6 +345,22 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) for (i = 0; i < AUDIT_BITMASK_SIZE; i++) entry->rule.mask[i] = rule->mask[i]; + for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) { + int bit = AUDIT_BITMASK_SIZE * 32 - i - 1; + __u32 *p = &entry->rule.mask[AUDIT_WORD(bit)]; + __u32 *class; + + if (!(*p & AUDIT_BIT(bit))) + continue; + *p &= ~AUDIT_BIT(bit); + class = classes[i]; + if (class) { + int j; + for (j = 0; j < AUDIT_BITMASK_SIZE; j++) + entry->rule.mask[j] |= class[j]; + } + } + return entry; exit_err: -- cgit From a99e4e413e1ab9f3c567b5519f5557afd786dc62 Mon Sep 17 00:00:00 2001 From: Vernon Mauery Date: Sat, 1 Jul 2006 04:35:42 -0700 Subject: [PATCH] pi-futex: fix mm_struct memory leak lock_queue was getting called essentially twice in a row and was continually incrementing the mm_count ref count, thus causing a memory leak. Dinakar Guniguntala provided a proper fix for the problem that simply grabs the spinlock for the hash bucket queue rather than calling lock_queue. The second time we do a queue_lock in futex_lock_pi, we really only need to take the hash bucket lock. Signed-off-by: Dinakar Guniguntala Signed-off-by: Vernon Mauery Acked-by: Paul E. McKenney Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6c91f938005..22aa3c16ca7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1208,7 +1208,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, } down_read(&curr->mm->mmap_sem); - hb = queue_lock(&q, -1, NULL); + spin_lock(q.lock_ptr); /* * Got the lock. We might not be the anticipated owner if we -- cgit From ed6f7b10e657b98b4ba89385d02852c8bdf3980e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 1 Jul 2006 04:35:46 -0700 Subject: [PATCH] pi-futex: futex_wake() lockup fix Fix futex_wake() exit condition bug when handling the robust-list with PI futexes on them. (reported by Ulrich Drepper, debugged by the lock validator.) Signed-off-by: Ingo Molnar Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 22aa3c16ca7..15caf93e4a4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -630,8 +630,10 @@ static int futex_wake(u32 __user *uaddr, int nr_wake) list_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { - if (this->pi_state) - return -EINVAL; + if (this->pi_state) { + ret = -EINVAL; + break; + } wake_futex(this); if (++ret >= nr_wake) break; -- cgit From 17311c03c3e2c16d64d9e8cb2a3f45be2e2f8d3b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Sat, 1 Jul 2006 04:35:44 -0700 Subject: [PATCH] IRQ: Use SA_PERCPU_IRQ, not IRQ_PER_CPU, for irqaction.flags IRQ_PER_CPU is a bit in the struct irq_desc "status" field, not in the struct irqaction "flags", so the previous code checked the wrong bit. SA_PERCPU_IRQ is only used by drivers/char/mmtimer.c for SGI ia64 boxes. 
Signed-off-by: Bjorn Helgaas Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b7117e81ac5..e3a122931e1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -236,7 +236,8 @@ int setup_irq(unsigned int irq, struct irqaction *new) #if defined(CONFIG_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) /* All handlers must agree on per-cpuness */ - if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU)) + if ((old->flags & SA_PERCPU_IRQ) != + (new->flags & SA_PERCPU_IRQ)) goto mismatch; #endif -- cgit From e8c4b9d003e72199a705fb5a40fcd2487fa16933 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Sat, 1 Jul 2006 04:35:45 -0700 Subject: [PATCH] IRQ: warning message cleanup Make warnings more consistent. Signed-off-by: Bjorn Helgaas Cc: Thomas Gleixner Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e3a122931e1..fcce5181e45 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -114,7 +114,7 @@ void enable_irq(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); switch (desc->depth) { case 0: - printk(KERN_WARNING "Unablanced enable_irq(%d)\n", irq); + printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); WARN_ON(1); break; case 1: { @@ -267,9 +267,10 @@ int setup_irq(unsigned int irq, struct irqaction *new) * SA_TRIGGER_* but the PIC does not support * multiple flow-types? */ - printk(KERN_WARNING "setup_irq(%d) SA_TRIGGER" - "set. No set_type function available\n", - irq); + printk(KERN_WARNING "No SA_TRIGGER set_type " + "function for IRQ %d (%s)\n", irq, + desc->chip ? desc->chip->name : + "unknown"); } else compat_irq_chip_set_default_handler(desc); @@ -299,7 +300,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) mismatch: spin_unlock_irqrestore(&desc->lock, flags); if (!(new->flags & SA_PROBEIRQ)) { - printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__); + printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); dump_stack(); } return -EBUSY; @@ -366,7 +367,7 @@ void free_irq(unsigned int irq, void *dev_id) kfree(action); return; } - printk(KERN_ERR "Trying to free free IRQ%d\n", irq); + printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); spin_unlock_irqrestore(&desc->lock, flags); return; } -- cgit From a2166abd06e7a9fd34eb18b7b27da18c6146e6ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 Jul 2006 22:30:07 +0100 Subject: [ARM] 3679/1: ARM: Make ARM dyntick implementation work with genirq Patch from Thomas Gleixner From: Thomas Gleixner Make the ARM dyntick implementation work with the generic irq code. This hopefully goes away once we consolidated the dyntick implementations. 
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Russell King --- kernel/irq/handle.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 5a360dd4331..961b8759173 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -16,6 +16,10 @@ #include #include +#if defined(CONFIG_NO_IDLE_HZ) && defined(CONFIG_ARM) +#include +#endif + #include "internals.h" /** @@ -113,6 +117,15 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; +#if defined(CONFIG_NO_IDLE_HZ) && defined(CONFIG_ARM) + if (!(action->flags & SA_TIMER) && system_timer->dyn_tick != NULL) { + write_seqlock(&xtime_lock); + if (system_timer->dyn_tick->state & DYN_TICK_ENABLED) + system_timer->dyn_tick->handler(irq, 0, regs); + write_sequnlock(&xtime_lock); + } +#endif + if (!(action->flags & SA_INTERRUPT)) local_irq_enable(); -- cgit From f8b5473fcbddbfde827ecf82aa0e81fa2a878220 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 Jul 2006 22:30:08 +0100 Subject: [ARM] 3690/1: genirq: Introduce and make use of dummy irq chip Patch from Thomas Gleixner From: Thomas Gleixner ARM has a couple of really dumb interrupt controllers. Implement a generic one and fixup the ARM migration. ARM reused the no_irq_chip for this purpose, but this does not work out for platforms which are not converted to the new interrupt type handling model. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Russell King --- kernel/irq/chip.c | 15 ++++++++++++--- kernel/irq/handle.c | 16 ++++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 4a0952d9458..54105bdfe20 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -462,9 +462,18 @@ __set_irq_handler(unsigned int irq, if (!handle) handle = handle_bad_irq; - if (is_chained && desc->chip == &no_irq_chip) - printk(KERN_WARNING "Trying to install " - "chained interrupt type for IRQ%d\n", irq); + if (desc->chip == &no_irq_chip) { + printk(KERN_WARNING "Trying to install %sinterrupt handler " + "for IRQ%d\n", is_chained ? "chained " : " ", irq); + /* + * Some ARM implementations install a handler for really dumb + * interrupt hardware without setting an irq_chip. This worked + * with the ARM no_irq_chip but the check in setup_irq would + * prevent us to setup the interrupt at all. Switch it to + * dummy_irq_chip for easy transition. + */ + desc->chip = &dummy_irq_chip; + } spin_lock_irqsave(&desc->lock, flags); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 961b8759173..e71266c3803 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -95,6 +95,22 @@ struct irq_chip no_irq_chip = { .end = noop, }; +/* + * Generic dummy implementation which can be used for + * real dumb interrupt sources + */ +struct irq_chip dummy_irq_chip = { + .name = "dummy", + .startup = noop_ret, + .shutdown = noop, + .enable = noop, + .disable = noop, + .ack = noop, + .mask = noop, + .unmask = noop, + .end = noop, +}; + /* * Special, empty irq handler: */ -- cgit From 3cca53b02a5bab0f407b1add2f84c22c20243a79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 Jul 2006 19:29:31 -0700 Subject: [PATCH] irq-flags: generic irq: Use the new IRQF_ constants Signed-off-by: Thomas Gleixner Cc: Ingo Molnar Cc: "David S. 
Miller" Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 4 ++-- kernel/irq/manage.c | 38 +++++++++++++++++++------------------- kernel/irq/spurious.c | 4 ++-- 3 files changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 5a360dd4331..6070e046469 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -113,7 +113,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; - if (!(action->flags & SA_INTERRUPT)) + if (!(action->flags & IRQF_DISABLED)) local_irq_enable(); do { @@ -124,7 +124,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, action = action->next; } while (action); - if (status & SA_SAMPLE_RANDOM) + if (status & IRQF_SAMPLE_RANDOM) add_interrupt_randomness(irq); local_irq_disable(); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index fcce5181e45..fede5fa351d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -167,7 +167,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) action = irq_desc[irq].action; if (action) - if (irqflags & action->flags & SA_SHIRQ) + if (irqflags & action->flags & IRQF_SHARED) action = NULL; return !action; @@ -205,7 +205,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) * so we have to be careful not to interfere with a * running system. */ - if (new->flags & SA_SAMPLE_RANDOM) { + if (new->flags & IRQF_SAMPLE_RANDOM) { /* * This function might sleep, we want to call it first, * outside of the atomic block. @@ -227,17 +227,17 @@ int setup_irq(unsigned int irq, struct irqaction *new) /* * Can't share interrupts unless both agree to and are * the same type (level, edge, polarity). So both flag - * fields must have SA_SHIRQ set and the bits which + * fields must have IRQF_SHARED set and the bits which * set the trigger type must match. */ - if (!((old->flags & new->flags) & SA_SHIRQ) || - ((old->flags ^ new->flags) & SA_TRIGGER_MASK)) + if (!((old->flags & new->flags) & IRQF_SHARED) || + ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) goto mismatch; -#if defined(CONFIG_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) +#if defined(CONFIG_IRQ_PER_CPU) && defined(IRQF_PERCPU) /* All handlers must agree on per-cpuness */ - if ((old->flags & SA_PERCPU_IRQ) != - (new->flags & SA_PERCPU_IRQ)) + if ((old->flags & IRQF_PERCPU) != + (new->flags & IRQF_PERCPU)) goto mismatch; #endif @@ -250,24 +250,24 @@ int setup_irq(unsigned int irq, struct irqaction *new) } *p = new; -#if defined(CONFIG_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) - if (new->flags & SA_PERCPU_IRQ) +#if defined(CONFIG_IRQ_PER_CPU) && defined(IRQF_PERCPU) + if (new->flags & IRQF_PERCPU) desc->status |= IRQ_PER_CPU; #endif if (!shared) { irq_chip_set_defaults(desc->chip); /* Setup the type (level, edge polarity) if configured: */ - if (new->flags & SA_TRIGGER_MASK) { + if (new->flags & IRQF_TRIGGER_MASK) { if (desc->chip && desc->chip->set_type) desc->chip->set_type(irq, - new->flags & SA_TRIGGER_MASK); + new->flags & IRQF_TRIGGER_MASK); else /* - * SA_TRIGGER_* but the PIC does not support + * IRQF_TRIGGER_* but the PIC does not support * multiple flow-types? */ - printk(KERN_WARNING "No SA_TRIGGER set_type " + printk(KERN_WARNING "No IRQF_TRIGGER set_type " "function for IRQ %d (%s)\n", irq, desc->chip ? 
desc->chip->name : "unknown"); @@ -299,7 +299,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) mismatch: spin_unlock_irqrestore(&desc->lock, flags); - if (!(new->flags & SA_PROBEIRQ)) { + if (!(new->flags & IRQF_PROBE_SHARED)) { printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); dump_stack(); } @@ -398,9 +398,9 @@ EXPORT_SYMBOL(free_irq); * * Flags: * - * SA_SHIRQ Interrupt is shared - * SA_INTERRUPT Disable local interrupts while processing - * SA_SAMPLE_RANDOM The interrupt can be used for entropy + * IRQF_SHARED Interrupt is shared + * IRQF_DISABLED Disable local interrupts while processing + * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy * */ int request_irq(unsigned int irq, @@ -416,7 +416,7 @@ int request_irq(unsigned int irq, * which interrupt is which (messes up the interrupt freeing * logic etc). */ - if ((irqflags & SA_SHIRQ) && !dev_id) + if ((irqflags & IRQF_SHARED) && !dev_id) return -EINVAL; if (irq >= NR_IRQS) return -EINVAL; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index b483deed311..417e98092cf 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -36,7 +36,7 @@ static int misrouted_irq(int irq, struct pt_regs *regs) * Already running: If it is shared get the other * CPU to go looking for our mystery interrupt too */ - if (desc->action && (desc->action->flags & SA_SHIRQ)) + if (desc->action && (desc->action->flags & IRQF_SHARED)) desc->status |= IRQ_PENDING; spin_unlock(&desc->lock); continue; @@ -48,7 +48,7 @@ static int misrouted_irq(int irq, struct pt_regs *regs) while (action) { /* Only shared IRQ handlers are safe to call */ - if (action->flags & SA_SHIRQ) { + if (action->flags & IRQF_SHARED) { if (action->handler(i, action->dev_id, regs) == IRQ_HANDLED) ok = 1; -- cgit From d061daa0e3abdddc28e21a37c8ac4536dedbf239 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 3 Jul 2006 02:18:48 +0200 Subject: [PATCH] genirq: ARM dyntick cleanup Linus: "The hacks in kernel/irq/handle.c are really horrid. REALLY horrid." They are indeed. Move the dyntick quirks to ARM where they belong. Signed-off-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6d8b3011496..aeb6e391276 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -16,10 +16,6 @@ #include #include -#if defined(CONFIG_NO_IDLE_HZ) && defined(CONFIG_ARM) -#include -#endif - #include "internals.h" /** @@ -133,14 +129,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; -#if defined(CONFIG_NO_IDLE_HZ) && defined(CONFIG_ARM) - if (!(action->flags & SA_TIMER) && system_timer->dyn_tick != NULL) { - write_seqlock(&xtime_lock); - if (system_timer->dyn_tick->state & DYN_TICK_ENABLED) - system_timer->dyn_tick->handler(irq, 0, regs); - write_sequnlock(&xtime_lock); - } -#endif + handle_dynamic_tick(action); if (!(action->flags & IRQF_DISABLED)) local_irq_enable(); -- cgit From 284c66806eb6df7f5c66d298681f1abe81a5a9ab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 3 Jul 2006 02:20:32 +0200 Subject: [PATCH] genirq:fixup missing SA_PERCPU replacement The irqflags consolidation converted SA_PERCPU_IRQ to IRQF_PERCPU but did not define the new constant. 
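A short illustration of why the leftover defined() test matters (the constant names are the real ones, the surrounding fragment is simplified): defined() only sees macros, so while IRQF_PERCPU is not #define'd the whole guarded block silently drops out of the build instead of failing loudly, and the per-CPU agreement check never runs. The fix below keys the block off CONFIG_IRQ_PER_CPU alone.

    #if defined(CONFIG_IRQ_PER_CPU) && defined(IRQF_PERCPU)
    	/* never compiled while IRQF_PERCPU is missing or is not a macro */
    	if ((old->flags & IRQF_PERCPU) != (new->flags & IRQF_PERCPU))
    		goto mismatch;
    #endif
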
Signed-off-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index fede5fa351d..c911c6ec4dd 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -234,7 +234,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) goto mismatch; -#if defined(CONFIG_IRQ_PER_CPU) && defined(IRQF_PERCPU) +#if defined(CONFIG_IRQ_PER_CPU) /* All handlers must agree on per-cpuness */ if ((old->flags & IRQF_PERCPU) != (new->flags & IRQF_PERCPU)) @@ -250,7 +250,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) } *p = new; -#if defined(CONFIG_IRQ_PER_CPU) && defined(IRQF_PERCPU) +#if defined(CONFIG_IRQ_PER_CPU) if (new->flags & IRQF_PERCPU) desc->status |= IRQ_PER_CPU; #endif -- cgit
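As a final illustration of the renamed flags in use, a hypothetical driver snippet follows (device name, IRQ field and handler body are invented; the three-argument handler signature is the one in use at the time of these patches):

    #include <linux/interrupt.h>

    struct foo_dev {
    	int irq;
    	/* ... device state ... */
    };

    static irqreturn_t foo_interrupt(int irq, void *dev_id, struct pt_regs *regs)
    {
    	/* struct foo_dev *dev = dev_id;  ...ack the device, do the work... */
    	return IRQ_HANDLED;
    }

    static int foo_setup(struct foo_dev *dev)
    {
    	/*
    	 * IRQF_SHARED requires a non-NULL dev_id (request_irq rejects the
    	 * combination, as seen in the manage.c hunk above) so free_irq()
    	 * can tell the handlers on a shared line apart; IRQF_SAMPLE_RANDOM
    	 * feeds interrupt timing into the entropy pool.
    	 */
    	return request_irq(dev->irq, foo_interrupt,
    			   IRQF_SHARED | IRQF_SAMPLE_RANDOM,
    			   "foo", dev);
    }
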