From db4315d6f53edc2cc0b0b06fce1beffebb119c71 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 5 Feb 2008 00:48:13 +0100 Subject: timer_list: print relative expiry time signed Relative expiry time can get negative, so it should be signed. Signed-off-by: Pavel Machek Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index d3d94c1a0fd..67fe8fc21fb 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -65,9 +65,9 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); #endif SEQ_printf(m, "\n"); - SEQ_printf(m, " # expires at %Lu nsecs [in %Lu nsecs]\n", + SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n", (unsigned long long)ktime_to_ns(timer->expires), - (unsigned long long)(ktime_to_ns(timer->expires) - now)); + (long long)(ktime_to_ns(timer->expires) - now)); } static void -- cgit From b0abcfc14605b2a8c686bd8e193ab05b01a7980b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 18 Feb 2008 18:23:16 -0500 Subject: Audit: use == not = in if statements Clearly this was supposed to be an == not an = in the if statement. This patch also causes us to stop processing execve args once we have failed rather than continuing to loop on failure over and over and over. Signed-off-by: Eric Paris Acked-by: Al Viro Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ac6d9b23b01..2087d6de67e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1000,9 +1000,10 @@ static int audit_log_single_execve_arg(struct audit_context *context, * for strings that are too long, we should not have created * any. */ - if (unlikely((len = -1) || len > MAX_ARG_STRLEN - 1)) { + if (unlikely((len == -1) || len > MAX_ARG_STRLEN - 1)) { WARN_ON(1); send_sig(SIGKILL, current, 0); + return -1; } /* walk the whole argument looking for non-ascii chars */ @@ -1020,6 +1021,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, if (ret) { WARN_ON(1); send_sig(SIGKILL, current, 0); + return -1; } buf[to_send] = '\0'; has_cntl = audit_string_contains_control(buf, to_send); @@ -1083,6 +1085,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, if (ret) { WARN_ON(1); send_sig(SIGKILL, current, 0); + return -1; } buf[to_send] = '\0'; -- cgit From 188fd89d539d899bfca2bc83534e5508e0161139 Mon Sep 17 00:00:00 2001 From: "S.Caglar Onur" Date: Thu, 14 Feb 2008 17:36:51 +0200 Subject: genirq: spurious.c: use time_* macros The functions time_before, time_before_eq, time_after, and time_after_eq are more robust for comparing jiffies against other values. So the following patch uses the time_after() macro, defined in linux/jiffies.h, which handles jiffies wrapping correctly. Signed-off-by: S.Caglar Onur Acked-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index a6b2bc831dd..088dabbf2d6 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -6,6 +6,7 @@ * This file contains spurious interrupt handling.
*/ +#include <linux/jiffies.h> #include <linux/irq.h> #include <linux/module.h> #include <linux/kallsyms.h> @@ -179,7 +180,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, * otherwise the couter becomes a doomsday timer for otherwise * working systems */ - if (jiffies - desc->last_unhandled > HZ/10) + if (time_after(jiffies, desc->last_unhandled + HZ/10)) desc->irqs_unhandled = 1; else desc->irqs_unhandled++; -- cgit From 89d694b9dbe769ca1004e01db0ca43964806a611 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Feb 2008 18:25:17 +0100 Subject: genirq: do not leave interrupts enabled on free_irq The default_disable() function was changed in commit: 76d2160147f43f982dfe881404cfde9fd0a9da21 genirq: do not mask interrupts by default It removed the mask function in favour of the default delayed interrupt disabling. Unfortunately this also broke the shutdown in free_irq() when the last handler is removed from the interrupt for those architectures which rely on the default implementations. Now we can end up with an enabled interrupt line after the last handler was removed, which can result in spurious interrupts. Fix this by adding a default_shutdown function, which is only installed when the irqchip implementation provides neither a shutdown nor a disable function. [@stable: affected versions: .21 - .24 ] Pointed-out-by: Michael Hennerich Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: stable@kernel.org Tested-by: Michael Hennerich --- kernel/irq/chip.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index cc54c627635..fdb3fbe2b0c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -245,6 +245,17 @@ static unsigned int default_startup(unsigned int irq) return 0; } +/* + * default shutdown function + */ +static void default_shutdown(unsigned int irq) +{ + struct irq_desc *desc = irq_desc + irq; + + desc->chip->mask(irq); + desc->status |= IRQ_MASKED; +} + /* * Fixup enable/disable function pointers */ @@ -256,8 +267,15 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->disable = default_disable; if (!chip->startup) chip->startup = default_startup; + /* + * We use chip->disable, when the user provided its own. When + * we have default_disable set for chip->disable, then we need + * to use default_shutdown, otherwise the irq line is not + * disabled on free_irq(): + */ if (!chip->shutdown) - chip->shutdown = chip->disable; + chip->shutdown = chip->disable != default_disable ? + chip->disable : default_shutdown; if (!chip->name) chip->name = chip->typename; if (!chip->end) -- cgit From 8a235efad548abd2ab5ebea45a9ffa750c814375 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 20 Feb 2008 01:47:44 +0100 Subject: Hibernation: Handle DEBUG_PAGEALLOC on x86 Make hibernation work with CONFIG_DEBUG_PAGEALLOC set on x86, by checking if the pages to be copied are marked as present in the kernel mapping and temporarily marking them as present if that's not the case. No functional modifications are introduced if CONFIG_DEBUG_PAGEALLOC is unset. Signed-off-by: Rafael J.
Wysocki Signed-off-by: Len Brown --- kernel/power/snapshot.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 95250d7c8d9..72a020cabb4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -875,8 +875,8 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } #endif /* CONFIG_HIGHMEM */ /** - * saveable - Determine whether a non-highmem page should be included in - * the suspend image. + * saveable_page - Determine whether a non-highmem page should be included + * in the suspend image. * * We should save the page if it isn't Nosave, and is not in the range * of pages statically defined as 'unsaveable', and it isn't a part of @@ -897,7 +897,8 @@ static struct page *saveable_page(unsigned long pfn) if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) return NULL; - if (PageReserved(page) && pfn_is_nosave(pfn)) + if (PageReserved(page) + && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; return page; @@ -938,6 +939,25 @@ static inline void do_copy_page(long *dst, long *src) *dst++ = *src++; } + +/** + * safe_copy_page - check if the page we are going to copy is marked as + * present in the kernel page tables (this always is the case if + * CONFIG_DEBUG_PAGEALLOC is not set and in that case + * kernel_page_present() always returns 'true'). + */ +static void safe_copy_page(void *dst, struct page *s_page) +{ + if (kernel_page_present(s_page)) { + do_copy_page(dst, page_address(s_page)); + } else { + kernel_map_pages(s_page, 1, 1); + do_copy_page(dst, page_address(s_page)); + kernel_map_pages(s_page, 1, 0); + } +} + + #ifdef CONFIG_HIGHMEM static inline struct page * page_is_saveable(struct zone *zone, unsigned long pfn) @@ -946,8 +966,7 @@ page_is_saveable(struct zone *zone, unsigned long pfn) saveable_highmem_page(pfn) : saveable_page(pfn); } -static inline void -copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { struct page *s_page, *d_page; void *src, *dst; @@ -961,29 +980,26 @@ copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) kunmap_atomic(src, KM_USER0); kunmap_atomic(dst, KM_USER1); } else { - src = page_address(s_page); if (PageHighMem(d_page)) { /* Page pointed to by src may contain some kernel * data modified by kmap_atomic() */ - do_copy_page(buffer, src); + safe_copy_page(buffer, s_page); dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); memcpy(dst, buffer, PAGE_SIZE); kunmap_atomic(dst, KM_USER0); } else { - dst = page_address(d_page); - do_copy_page(dst, src); + safe_copy_page(page_address(d_page), s_page); } } } #else #define page_is_saveable(zone, pfn) saveable_page(pfn) -static inline void -copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { - do_copy_page(page_address(pfn_to_page(dst_pfn)), - page_address(pfn_to_page(src_pfn))); + safe_copy_page(page_address(pfn_to_page(dst_pfn)), + pfn_to_page(src_pfn)); } #endif /* CONFIG_HIGHMEM */ -- cgit From 120fc3d77acfd91f3521737a440d42839c475982 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 21 Feb 2008 00:33:20 +0100 Subject: modules: do not try to add sysfs attributes if !CONFIG_SYSFS Thanks to Alexey for the testing and the fix of the fix. 
Cc: Alexey Dobriyan Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 92595bad381..901cd6ac2f1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -987,12 +987,11 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, return ret; } - /* * /sys/module/foo/sections stuff * J. Corbet */ -#ifdef CONFIG_KALLSYMS +#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) static ssize_t module_sect_show(struct module_attribute *mattr, struct module *mod, char *buf) { @@ -1188,7 +1187,7 @@ static inline void add_notes_attrs(struct module *mod, unsigned int nsect, static inline void remove_notes_attrs(struct module *mod) { } -#endif /* CONFIG_KALLSYMS */ +#endif #ifdef CONFIG_SYSFS int module_add_modinfo_attrs(struct module *mod) @@ -1231,9 +1230,7 @@ void module_remove_modinfo_attrs(struct module *mod) } kfree(mod->modinfo_attrs); } -#endif -#ifdef CONFIG_SYSFS int mod_sysfs_init(struct module *mod) { int err; -- cgit From 3a2d5b700132f35401f1d9e22fe3c2cab02c2549 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 23 Feb 2008 19:13:25 +0100 Subject: PM: Introduce PM_EVENT_HIBERNATE callback state During the last step of hibernation in the "platform" mode (with the help of ACPI) we use the suspend code, including the devices' ->suspend() methods, to prepare the system for entering the ACPI S4 system sleep state. But at least for some devices the operations performed by the ->suspend() callback in that case must be different from its operations during regular suspend. For this reason, introduce the new PM event type PM_EVENT_HIBERNATE and pass it to the device drivers' ->suspend() methods during the last phase of hibernation, so that they can distinguish this case and handle it as appropriate. Modify the drivers that handle PM_EVENT_SUSPEND in a special way and need to handle PM_EVENT_HIBERNATE in the same way. These changes are necessary to fix a hibernation regression related to the i915 driver (ref. http://lkml.org/lkml/2008/2/22/488). Signed-off-by: Rafael J. 
Wysocki Acked-by: Pavel Machek Tested-by: Jeff Chua Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 859a8e59773..14a656cdc65 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -391,7 +391,7 @@ int hibernation_platform_enter(void) goto Close; suspend_console(); - error = device_suspend(PMSG_SUSPEND); + error = device_suspend(PMSG_HIBERNATE); if (error) goto Resume_console; @@ -404,7 +404,7 @@ int hibernation_platform_enter(void) goto Finish; local_irq_disable(); - error = device_power_down(PMSG_SUSPEND); + error = device_power_down(PMSG_HIBERNATE); if (!error) { hibernation_ops->enter(); /* We should never get here */ -- cgit From de4fc64f0f2a4efbaad3e7c1e1e05a28f69b45e5 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 23 Feb 2008 15:23:33 -0800 Subject: markers: fix sparse warnings in markers.c char can be unsigned kernel/marker.c:64:20: error: dubious one-bit signed bitfield kernel/marker.c:65:14: error: dubious one-bit signed bitfield Signed-off-by: Harvey Harrison Acked-by: Mathieu Desnoyers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/marker.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index c4c2cd8b61f..50effc01d9a 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -61,8 +61,8 @@ struct marker_entry { int refcount; /* Number of times armed. 0 if disarmed. */ struct rcu_head rcu; void *oldptr; - char rcu_pending:1; - char ptype:1; + unsigned char rcu_pending:1; + unsigned char ptype:1; char name[0]; /* Contains name'\0'format'\0' */ }; -- cgit From 3e4ab747efa8e78562ec6782b08bbf21a00aba1b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Feb 2008 15:23:55 -0800 Subject: futex: fix init order When the futex init code fails to initialize the futex pseudo file system it returns early without initializing the hash queues. Should the boot succeed, a futex syscall which tries to enqueue a waiter on the hashqueue will crash due to the uninitialized plist heads. Initialize the hash queues before the filesystem. Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Lennert Buytenhek Cc: Riku Voipio Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 221f2128a43..c21f667c63f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2145,8 +2145,14 @@ static struct file_system_type futex_fs_type = { static int __init init(void) { - int i = register_filesystem(&futex_fs_type); + int i; + for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { + plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); + spin_lock_init(&futex_queues[i].lock); + } + + i = register_filesystem(&futex_fs_type); if (i) return i; @@ -2156,10 +2162,6 @@ static int __init init(void) return PTR_ERR(futex_mnt); } - for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { - plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); - spin_lock_init(&futex_queues[i].lock); - } return 0; } __initcall(init); -- cgit From a0c1e9073ef7428a14309cba010633a6cd6719ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Feb 2008 15:23:57 -0800 Subject: futex: runtime enable pi and robust functionality Not all architectures implement futex_atomic_cmpxchg_inatomic().
The default implementation returns -ENOSYS, which is currently not handled inside the futex guts. Futex PI calls and robust list exits with a held futex result in an endless loop in the futex code on architectures which have no support. Fixing up every place where futex_atomic_cmpxchg_inatomic() is called would add a fair amount of extra if/else constructs to the already complex code. It is also not possible to disable the robust feature before user space tries to register robust lists. Compile time disabling is not a good idea either, as there are already architectures with runtime detection of futex_atomic_cmpxchg_inatomic support. Detect the functionality at runtime instead by calling cmpxchg_futex_value_locked() with a NULL pointer from the futex initialization code. This is guaranteed to fail, but the call of futex_atomic_cmpxchg_inatomic() happens with pagefaults disabled. On architectures which use the asm-generic implementation or have runtime CPU feature detection, a -ENOSYS return value disables the PI/robust features. On architectures with a working implementation the call returns -EFAULT and the PI/robust features are enabled. The relevant syscalls return -ENOSYS and the robust list exit code is blocked when the detection fails. Fixes http://lkml.org/lkml/2008/2/11/149 Originally reported by: Lennert Buytenhek Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Lennert Buytenhek Cc: Riku Voipio Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 38 ++++++++++++++++++++++++++++++++++---- kernel/futex_compat.c | 9 +++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index c21f667c63f..06968cd7920 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -60,6 +60,8 @@ #include "rtmutex_common.h" +int __read_mostly futex_cmpxchg_enabled; + #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ?
4 : 8) /* @@ -469,6 +471,8 @@ void exit_pi_state_list(struct task_struct *curr) struct futex_hash_bucket *hb; union futex_key key; + if (!futex_cmpxchg_enabled) + return; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful @@ -1870,6 +1874,8 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len) { + if (!futex_cmpxchg_enabled) + return -ENOSYS; /* * The kernel knows only one size for now: */ @@ -1894,6 +1900,9 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, struct robust_list_head __user *head; unsigned long ret; + if (!futex_cmpxchg_enabled) + return -ENOSYS; + if (!pid) head = current->robust_list; else { @@ -1997,6 +2006,9 @@ void exit_robust_list(struct task_struct *curr) unsigned long futex_offset; int rc; + if (!futex_cmpxchg_enabled) + return; + /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): @@ -2051,7 +2063,7 @@ void exit_robust_list(struct task_struct *curr) long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { - int ret; + int ret = -ENOSYS; int cmd = op & FUTEX_CMD_MASK; struct rw_semaphore *fshared = NULL; @@ -2083,13 +2095,16 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); break; case FUTEX_LOCK_PI: - ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); + if (futex_cmpxchg_enabled) + ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); break; case FUTEX_UNLOCK_PI: - ret = futex_unlock_pi(uaddr, fshared); + if (futex_cmpxchg_enabled) + ret = futex_unlock_pi(uaddr, fshared); break; case FUTEX_TRYLOCK_PI: - ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); + if (futex_cmpxchg_enabled) + ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); break; default: ret = -ENOSYS; @@ -2145,8 +2160,23 @@ static struct file_system_type futex_fs_type = { static int __init init(void) { + u32 curval; int i; + /* + * This will fail and we want it. Some arch implementations do + * runtime detection of the futex_atomic_cmpxchg_inatomic() + * functionality. We want to know that before we call in any + * of the complex code paths. Also we want to prevent + * registration of robust lists in that case. NULL is + * guaranteed to fault and we get -EFAULT on functional + * implementation, the non functional ones will return + * -ENOSYS. 
+ */ + curval = cmpxchg_futex_value_locked(NULL, 0, 0); + if (curval == -EFAULT) + futex_cmpxchg_enabled = 1; + for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); spin_lock_init(&futex_queues[i].lock); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 7d5e4b016f3..ff90f049f8f 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -54,6 +54,9 @@ void compat_exit_robust_list(struct task_struct *curr) compat_long_t futex_offset; int rc; + if (!futex_cmpxchg_enabled) + return; + /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): @@ -115,6 +118,9 @@ asmlinkage long compat_sys_set_robust_list(struct compat_robust_list_head __user *head, compat_size_t len) { + if (!futex_cmpxchg_enabled) + return -ENOSYS; + if (unlikely(len != sizeof(*head))) return -EINVAL; @@ -130,6 +136,9 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, struct compat_robust_list_head __user *head; unsigned long ret; + if (!futex_cmpxchg_enabled) + return -ENOSYS; + if (!pid) head = current->compat_robust_list; else { -- cgit From 43627582799db317e966ecb0002c2c3c9805ec0f Mon Sep 17 00:00:00 2001 From: Srinivasa Ds Date: Sat, 23 Feb 2008 15:24:04 -0800 Subject: kprobes: refuse kprobe insertion on add/sub_preempt_count() Kprobes makes use of preempt_disable() and preempt_enable_noresched(), and these functions in turn call add/sub_preempt_count(). So we need to prevent users from inserting probes into these functions. This patch disallows probing of add/sub_preempt_count(). Signed-off-by: Srinivasa DS Acked-by: Ananth N Mavinakayanahalli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f28f19e65b5..c4bc8c21095 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3766,7 +3766,7 @@ void scheduler_tick(void) #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) -void add_preempt_count(int val) +void __kprobes add_preempt_count(int val) { /* * Underflow? @@ -3782,7 +3782,7 @@ void add_preempt_count(int val) } EXPORT_SYMBOL(add_preempt_count); -void sub_preempt_count(int val) +void __kprobes sub_preempt_count(int val) { /* * Underflow? -- cgit From a043e3b2c63445512c5592cbe3c8694f3c655e81 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 23 Feb 2008 15:24:09 -0800 Subject: cgroup: fix comments fix: - comments about need_forkexit_callback - comments about release agent - typo and comment style, etc. Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 142 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 79 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4766bb65e4d..36066d8a491 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -113,9 +113,9 @@ static int root_count; #define dummytop (&rootnode.top_cgroup) /* This flag indicates whether tasks in the fork and exit paths should - * take callback_mutex and check for fork/exit handlers to call. This - * avoids us having to do extra work in the fork/exit path if none of the - * subsystems need to be called. + * check for fork/exit handlers to call. This avoids us having to do + * extra work in the fork/exit path if none of the subsystems need to + * be called.
*/ static int need_forkexit_callback; @@ -307,7 +307,6 @@ static inline void put_css_set_taskexit(struct css_set *cg) * template: location in which to build the desired set of subsystem * state objects for the new cgroup group */ - static struct css_set *find_existing_css_set( struct css_set *oldcg, struct cgroup *cgrp, @@ -354,7 +353,6 @@ static struct css_set *find_existing_css_set( * and chains them on tmp through their cgrp_link_list fields. Returns 0 on * success or a negative error */ - static int allocate_cg_links(int count, struct list_head *tmp) { struct cg_cgroup_link *link; @@ -396,7 +394,6 @@ static void free_cg_links(struct list_head *tmp) * substituted into the appropriate hierarchy. Must be called with * cgroup_mutex held */ - static struct css_set *find_css_set( struct css_set *oldcg, struct cgroup *cgrp) { @@ -507,8 +504,8 @@ static struct css_set *find_css_set( * critical pieces of code here. The exception occurs on cgroup_exit(), * when a task in a notify_on_release cgroup exits. Then cgroup_mutex * is taken, and if the cgroup count is zero, a usermode call made - * to /sbin/cgroup_release_agent with the name of the cgroup (path - * relative to the root of cgroup file system) as the argument. + * to the release agent with the name of the cgroup (path relative to + * the root of cgroup file system) as the argument. * * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all @@ -521,7 +518,7 @@ static struct css_set *find_css_set( * * The need for this exception arises from the action of * cgroup_attach_task(), which overwrites one tasks cgroup pointer with - * another. It does so using cgroup_mutexe, however there are + * another. It does so using cgroup_mutex, however there are * several performance critical places that need to reference * task->cgroup without the expense of grabbing a system global * mutex. Therefore except as noted below, when dereferencing or, as @@ -537,7 +534,6 @@ static struct css_set *find_css_set( * cgroup_lock - lock out any changes to cgroup structures * */ - void cgroup_lock(void) { mutex_lock(&cgroup_mutex); @@ -548,7 +544,6 @@ void cgroup_lock(void) * * Undo the lock taken in a previous cgroup_lock() call. */ - void cgroup_unlock(void) { mutex_unlock(&cgroup_mutex); @@ -590,7 +585,6 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) * Call subsys's pre_destroy handler. * This is called before css refcnt check. */ - static void cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; @@ -600,7 +594,6 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp) return; } - static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -1129,8 +1122,13 @@ static inline struct cftype *__d_cft(struct dentry *dentry) return dentry->d_fsdata; } -/* - * Called with cgroup_mutex held. Writes path of cgroup into buf. +/** + * cgroup_path - generate the path of a cgroup + * @cgrp: the cgroup in question + * @buf: the buffer to write the path into + * @buflen: the length of the buffer + * + * Called with cgroup_mutex held. Writes path of cgroup into buf. * Returns 0 on success, -errno on error. 
*/ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) @@ -1188,11 +1186,13 @@ static void get_first_subsys(const struct cgroup *cgrp, *subsys_id = test_ss->subsys_id; } -/* - * Attach task 'tsk' to cgroup 'cgrp' +/** + * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' + * @cgrp: the cgroup the task is attaching to + * @tsk: the task to be attached * - * Call holding cgroup_mutex. May take task_lock of - * the task 'pid' during call. + * Call holding cgroup_mutex. May take task_lock of + * the task 'tsk' during call. */ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { @@ -1293,7 +1293,6 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) } /* The various types of files and directories in a cgroup file system */ - enum cgroup_filetype { FILE_ROOT, FILE_DIR, @@ -1584,12 +1583,11 @@ static int cgroup_create_file(struct dentry *dentry, int mode, } /* - * cgroup_create_dir - create a directory for an object. - * cgrp: the cgroup we create the directory for. - * It must have a valid ->parent field - * And we are going to fill its ->dentry field. - * dentry: dentry of the new cgroup - * mode: mode to set on new directory. + * cgroup_create_dir - create a directory for an object. + * @cgrp: the cgroup we create the directory for. It must have a valid + * ->parent field. And we are going to fill its ->dentry field. + * @dentry: dentry of the new cgroup + * @mode: mode to set on new directory. */ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, int mode) @@ -1651,8 +1649,12 @@ int cgroup_add_files(struct cgroup *cgrp, return 0; } -/* Count the number of tasks in a cgroup. */ - +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + * + * Return the number of tasks in the cgroup. + */ int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; @@ -1962,12 +1964,13 @@ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) } /** - * Build and fill cgroupstats so that taskstats can export it to user - * space. - * + * cgroupstats_build - build and fill cgroupstats * @stats: cgroupstats to fill information into * @dentry: A dentry entry belonging to the cgroup for which stats have * been requested. + * + * Build and fill cgroupstats so that taskstats can export it to user + * space. */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { @@ -2199,14 +2202,13 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, } /* - * cgroup_create - create a cgroup - * parent: cgroup that will be parent of the new cgroup. - * name: name of the new cgroup. Will be strcpy'ed. - * mode: mode to set on new inode + * cgroup_create - create a cgroup + * @parent: cgroup that will be parent of the new cgroup + * @dentry: dentry of the new cgroup + * @mode: mode to set on new inode * - * Must be called with the mutex on the parent inode held + * Must be called with the mutex on the parent inode held */ - static long cgroup_create(struct cgroup *parent, struct dentry *dentry, int mode) { @@ -2349,13 +2351,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) parent = cgrp->parent; root = cgrp->root; sb = root->sb; + /* - * Call pre_destroy handlers of subsys + * Call pre_destroy handlers of subsys. Notify subsystems + * that rmdir() request comes. */ cgroup_call_pre_destroy(cgrp); - /* - * Notify subsyses that rmdir() request comes. 
- */ if (cgroup_has_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); @@ -2431,8 +2432,10 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss) } /** - * cgroup_init_early - initialize cgroups at system boot, and - * initialize any subsystems that request early init. + * cgroup_init_early - cgroup initialization at system boot + * + * Initialize cgroups at system boot, and initialize any + * subsystems that request early init. */ int __init cgroup_init_early(void) { @@ -2474,8 +2477,10 @@ int __init cgroup_init_early(void) } /** - * cgroup_init - register cgroup filesystem and /proc file, and - * initialize any subsystems that didn't request early init. + * cgroup_init - cgroup initialization + * + * Register cgroup filesystem and /proc file, and initialize + * any subsystems that didn't request early init. */ int __init cgroup_init(void) { @@ -2618,7 +2623,7 @@ static struct file_operations proc_cgroupstats_operations = { /** * cgroup_fork - attach newly forked task to its parents cgroup. - * @tsk: pointer to task_struct of forking parent process. + * @child: pointer to task_struct of forking parent process. * * Description: A task inherits its parent's cgroup at fork(). * @@ -2642,9 +2647,12 @@ void cgroup_fork(struct task_struct *child) } /** - * cgroup_fork_callbacks - called on a new task very soon before - * adding it to the tasklist. No need to take any locks since no-one - * can be operating on this task + * cgroup_fork_callbacks - run fork callbacks + * @child: the new task + * + * Called on a new task very soon before adding it to the + * tasklist. No need to take any locks since no-one can + * be operating on this task. */ void cgroup_fork_callbacks(struct task_struct *child) { @@ -2659,11 +2667,14 @@ void cgroup_fork_callbacks(struct task_struct *child) } /** - * cgroup_post_fork - called on a new task after adding it to the - * task list. Adds the task to the list running through its css_set - * if necessary. Has to be after the task is visible on the task list - * in case we race with the first call to cgroup_iter_start() - to - * guarantee that the new task ends up on its list. */ + * cgroup_post_fork - called on a new task after adding it to the task list + * @child: the task in question + * + * Adds the task to the list running through its css_set if necessary. + * Has to be after the task is visible on the task list in case we race + * with the first call to cgroup_iter_start() - to guarantee that the + * new task ends up on its list. + */ void cgroup_post_fork(struct task_struct *child) { if (use_task_css_set_links) { @@ -2676,6 +2687,7 @@ void cgroup_post_fork(struct task_struct *child) /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process + * @run_callback: run exit callbacks? * * Description: Detach cgroup from @tsk and release it. * @@ -2706,7 +2718,6 @@ void cgroup_post_fork(struct task_struct *child) * top_cgroup isn't going away, and either task has PF_EXITING set, * which wards off any cgroup_attach_task() attempts, or task is a failed * fork, never visible to cgroup_attach_task. 
- * */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { @@ -2743,9 +2754,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } /** - * cgroup_clone - duplicate the current cgroup in the hierarchy - * that the given subsystem is attached to, and move this task into - * the new child + * cgroup_clone - clone the cgroup the given subsystem is attached to + * @tsk: the task to be moved + * @subsys: the given subsystem + * + * Duplicate the current cgroup in the hierarchy that the given + * subsystem is attached to, and move this task into the new + * child. */ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) { @@ -2858,9 +2873,12 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) return ret; } -/* - * See if "cgrp" is a descendant of the current task's cgroup in - * the appropriate hierarchy +/** + * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp + * @cgrp: the cgroup in question + * + * See if @cgrp is a descendant of the current task's cgroup in + * the appropriate hierarchy. * * If we are sending in dummytop, then presumably we are creating * the top cgroup in the subsystem. @@ -2939,9 +2957,7 @@ void __css_put(struct cgroup_subsys_state *css) * release agent task. We don't bother to wait because the caller of * this routine has no use for the exit status of the release agent * task, so no sense holding our caller up for that. - * */ - static void cgroup_release_agent(struct work_struct *work) { BUG_ON(work != &release_agent_work); -- cgit From f777073848ba3708d68d87e43f104f83316187d7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 23 Feb 2008 15:24:10 -0800 Subject: cgroup: fix memory leak in cgroup_get_sb() opts.release_agent is not kfree()ed in all necessary places. Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 36066d8a491..947fe3b2218 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -954,8 +954,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type, } root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) + if (!root) { + if (opts.release_agent) + kfree(opts.release_agent); return -ENOMEM; + } init_cgroup_root(root); root->subsys_bits = opts.subsys_bits; -- cgit From 8d53d55d27754508e58e9ac18a4a445b110434bf Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 23 Feb 2008 15:24:11 -0800 Subject: cgroup: fix subsys bitops Cgroup uses unsigned long for subsys bitops, not unsigned long long. Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 947fe3b2218..84125936172 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -319,7 +319,7 @@ static struct css_set *find_existing_css_set( /* Built the set of subsystem state objects that we want to * see in the new css_set */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - if (root->subsys_bits & (1ull << i)) { + if (root->subsys_bits & (1UL << i)) { /* Subsystem is in this hierarchy. 
So we want * the subsystem state from the new * cgroup */ @@ -689,7 +689,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, added_bits = final_bits & ~root->actual_subsys_bits; /* Check that any added subsystems are currently free */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - unsigned long long bit = 1ull << i; + unsigned long bit = 1UL << i; struct cgroup_subsys *ss = subsys[i]; if (!(bit & added_bits)) continue; -- cgit From 68db38f1537a44097e264f28bda751d6b919cd53 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 23 Feb 2008 15:24:11 -0800 Subject: cgroup: remove duplicate code in find_css_set() The list head res->tasks gets initialized twice in find_css_set(). Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 84125936172..2aa408201aa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -470,7 +470,6 @@ static struct css_set *find_css_set( /* Link this cgroup group into the list */ list_add(&res->list, &init_css_set.list); css_set_count++; - INIT_LIST_HEAD(&res->tasks); write_unlock(&css_set_lock); return res; -- cgit From bc231d2a048010d5e0b49ac7fddbfa822fc41109 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 23 Feb 2008 15:24:12 -0800 Subject: cgroup: remove dead code in cgroup_get_rootdir() Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2aa408201aa..d8abe996e00 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -919,7 +919,6 @@ static int cgroup_get_rootdir(struct super_block *sb) if (!inode) return -ENOMEM; - inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_op = &cgroup_dir_inode_operations; /* directories start off with i_nlink == 2 (for "." entry) */ -- cgit From 04e2f1741d235ba599037734878d72e57cb302b5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 23 Feb 2008 18:05:03 -0800 Subject: Add memory barrier semantics to wake_up() & co Oleg Nesterov and others have pointed out that on some architectures, the traditional sequence of set_current_state(TASK_INTERRUPTIBLE); if (CONDITION) return; schedule(); is racy wrt another CPU doing CONDITION = 1; wake_up_process(p); because while set_current_state() has a memory barrier separating setting of the TASK_INTERRUPTIBLE state from reading of the CONDITION variable, there is no such memory barrier on the wakeup side. Now, wake_up_process() does actually take a spinlock before it reads and sets the task state on the waking side, and on x86 (and many other architectures) that spinlock is in fact equivalent to a memory barrier, but that is not generally guaranteed. The write that sets CONDITION could move into the critical region protected by the runqueue spinlock. However, adding a smp_wmb() to before the spinlock should now order the writing of CONDITION wrt the lock itself, which in turn is ordered wrt the accesses within the spinlock (which includes the reading of the old state). This should thus close the race (which probably has never been seen in practice, but since smp_wmb() is a no-op on x86, it's not like this will make anything worse either on the most common architecture where the spinlock already gave the required protection). 
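To make the ordering concrete, here is a minimal sketch of the two sides of the race (illustrative only, not part of the patch; CONDITION stands for any shared wakeup flag):

	/* Sleeper: set_current_state() implies smp_mb(), ordering the
	 * TASK_INTERRUPTIBLE store before the CONDITION read. */
	set_current_state(TASK_INTERRUPTIBLE);
	if (CONDITION)
		return;
	schedule();

	/* Waker: without a barrier, the CONDITION store could drift into
	 * the runqueue-lock critical section and be reordered after the
	 * read of the sleeper's old task state. */
	CONDITION = 1;
	wake_up_process(p);	/* try_to_wake_up() now issues smp_wmb()
				 * before taking the runqueue lock */

Because the smp_wmb() is placed inside try_to_wake_up() itself, every wakeup site gets the required ordering without callers having to add barriers individually.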
Acked-by: Oleg Nesterov Acked-by: Dmitry Adamushko Cc: Andrew Morton Cc: Nick Piggin Signed-off-by: Linus Torvalds --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c4bc8c21095..b387a8de26a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1831,6 +1831,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) long old_state; struct rq *rq; + smp_wmb(); rq = task_rq_lock(p, &flags); old_state = p->state; if (!(old_state & state)) -- cgit From 6892b75e60557a48c01d57ba320419a9e2ce9846 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 13 Feb 2008 14:02:36 +0100 Subject: sched: make early bootup sched_clock() use safer do not call sched_clock() too early. Not only might rq->idle not be set up - but pure per-cpu data might not be accessible either. this solves an ia64 early bootup hang with CONFIG_PRINTK_TIME=y. Tested-by: Tony Luck Acked-by: Tony Luck Acked-by: David S. Miller Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b387a8de26a..7286ccb0108 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -668,6 +668,8 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; */ unsigned int sysctl_sched_rt_period = 1000000; +static __read_mostly int scheduler_running; + /* * part of the period that we allow rt tasks to run in us. * default: 0.95s @@ -689,14 +691,16 @@ unsigned long long cpu_clock(int cpu) unsigned long flags; struct rq *rq; - local_irq_save(flags); - rq = cpu_rq(cpu); /* * Only call sched_clock() if the scheduler has already been * initialized (some code might call cpu_clock() very early): */ - if (rq->idle) - update_rq_clock(rq); + if (unlikely(!scheduler_running)) + return 0; + + local_irq_save(flags); + rq = cpu_rq(cpu); + update_rq_clock(rq); now = rq->clock; local_irq_restore(flags); @@ -7284,6 +7288,8 @@ void __init sched_init(void) * During early bootup we pretend to be a normal task: */ current->sched_class = &fair_sched_class; + + scheduler_running = 1; } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -- cgit From 70eee74b70c1a8485ec5f2bafa13dbc66fab6e02 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 22 Feb 2008 13:25:53 +0530 Subject: sched: remove duplicate code from sched_fair.c pick_task_entity() duplicates existing code. This functionality can be easily obtained using rb_last(). Avoid code duplication by using rb_last(). 
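For reference, rb_last() simply descends to the rightmost node, which is exactly what the removed loop open-coded; since CFS keeps entities sorted by vruntime in this rbtree, the rightmost node is the entity with the largest vruntime. A sketch of the equivalence (simplified from the lib/rbtree.c implementation, for illustration):

	struct rb_node *rb_last(struct rb_root *root)
	{
		struct rb_node *n = root->rb_node;

		if (!n)
			return NULL;
		while (n->rb_right)	/* keep following right children */
			n = n->rb_right;
		return n;
	}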
Signed-off-by: Balbir Singh Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6c091d6e159..7abad50d935 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -202,16 +202,13 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; - struct sched_entity *se = NULL; - struct rb_node *parent; - - while (*link) { - parent = *link; - se = rb_entry(parent, struct sched_entity, run_node); - link = &parent->rb_right; - } + struct rb_node *last; + struct sched_entity *se; + last = rb_last(&cfs_rq->tasks_timeline); + if (!last) + return NULL; + se = rb_entry(last, struct sched_entity, run_node); return se; } -- cgit From 7eee3e677d6e2e9007afcd7d79b0715525aa552e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 22 Feb 2008 10:32:21 +0100 Subject: sched: clean up __pick_last_entity() a bit Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7abad50d935..c8e6492c592 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -202,14 +202,12 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { - struct rb_node *last; - struct sched_entity *se; + struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); - last = rb_last(&cfs_rq->tasks_timeline); if (!last) return NULL; - se = rb_entry(last, struct sched_entity, run_node); - return se; + return rb_entry(last, struct sched_entity, run_node); } /************************************************************** -- cgit From 67ca7bde2e9d3516b5ae0188330ad1059ac03f38 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 15 Feb 2008 09:56:36 -0800 Subject: sched: fix signedness warnings in sched.c Unsigned long values are always assigned to switch_count, make it unsigned long. kernel/sched.c:3897:15: warning: incorrect type in assignment (different signedness) kernel/sched.c:3897:15: expected long *switch_count kernel/sched.c:3897:15: got unsigned long * kernel/sched.c:3921:16: warning: incorrect type in assignment (different signedness) kernel/sched.c:3921:16: expected long *switch_count kernel/sched.c:3921:16: got unsigned long * Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7286ccb0108..f06950c8a6c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3889,7 +3889,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev) asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; - long *switch_count; + unsigned long *switch_count; struct rq *rq; int cpu; -- cgit From 1481197b50114d7212d659d41cb97f31a8934883 Mon Sep 17 00:00:00 2001 From: Dale Farnsworth Date: Mon, 25 Feb 2008 23:03:02 +0100 Subject: lockdep: include all lock classes in all_lock_classes Add each lock class to the all_lock_classes list when it is first registered. Previously, lock classes were added to all_lock_classes when the lock class was first used. Since one of the uses of the list is to find unused locks, this didn't work well.
Signed-off-by: Dale Farnsworth Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3574379f4d6..81a4e4a3f08 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -779,6 +779,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * parallel walking of the hash-list safe: */ list_add_tail_rcu(&class->hash_entry, hash_head); + /* + * Add it to the global list of classes: + */ + list_add_tail_rcu(&class->lock_entry, &all_lock_classes); if (verbose(class)) { graph_unlock(); @@ -2282,10 +2286,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; break; case LOCK_USED: - /* - * Add it to the global list of classes: - */ - list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); debug_atomic_dec(&nr_unused_locks); break; default: -- cgit From cf3680b90c7842cf91ed857ac4528f4e057da366 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Feb 2008 10:32:07 +0900 Subject: printk: fix possible printk overrun printk recursion detection prepends a message to printk_buf and offsets printk_buf when the actual message is printed, but it forgets to trim the buffer length accordingly. This can result in overrun in extreme cases. Fix it. [ mingo@elte.hu: bug was introduced by me via: commit 32a76006683f7b28ae3cc491da37716e002f198e Author: Ingo Molnar Date: Fri Jan 25 21:07:58 2008 +0100 printk: make printk more robust by not allowing recursion ] Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index bee36100f11..9adc2a473e6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -666,7 +666,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) } /* Emit the output into the temporary buffer */ printed_len += vscnprintf(printk_buf + printed_len, - sizeof(printk_buf), fmt, args); + sizeof(printk_buf) - printed_len, fmt, args); /* * Copy the output into log_buf. If the caller didn't provide -- cgit From 2232c2d8e0a6a31061dec311f3d1cf7624bc14f1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Feb 2008 18:46:50 +0100 Subject: rcu: add support for dynamic ticks and preempt rcu The PREEMPT-RCU can get stuck if a CPU goes idle and NO_HZ is set. The idle CPU will not progress the RCU through its grace period and a synchronize_rcu may get stuck. Without this patch I have a box that will not boot when PREEMPT_RCU and NO_HZ are set. That same box boots fine with this patch. This patch comes from the -rt kernel where it has been tested for several months. Signed-off-by: Steven Rostedt Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 224 ++++++++++++++++++++++++++++++++++++++++++++++- kernel/softirq.c | 1 + kernel/time/tick-sched.c | 3 + 3 files changed, 224 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 987cfb7ade8..c7c52096df4 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -23,6 +23,10 @@ * to Suparna Bhattacharya for pushing me completely away * from atomic instructions on the read side. * + * - Added handling of Dynamic Ticks + * Copyright 2007 - Paul E.
Mckenney + * - Steven Rostedt + * * Papers: http://www.rdrop.com/users/paulmck/RCU * * Design Document: http://lwn.net/Articles/253651/ @@ -409,6 +413,212 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp) } } +#ifdef CONFIG_NO_HZ + +DEFINE_PER_CPU(long, dynticks_progress_counter) = 1; +static DEFINE_PER_CPU(long, rcu_dyntick_snapshot); +static DEFINE_PER_CPU(int, rcu_update_flag); + +/** + * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. + * + * If the CPU was idle with dynamic ticks active, this updates the + * dynticks_progress_counter to let the RCU handling know that the + * CPU is active. + */ +void rcu_irq_enter(void) +{ + int cpu = smp_processor_id(); + + if (per_cpu(rcu_update_flag, cpu)) + per_cpu(rcu_update_flag, cpu)++; + + /* + * Only update if we are coming from a stopped ticks mode + * (dynticks_progress_counter is even). + */ + if (!in_interrupt() && + (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) { + /* + * The following might seem like we could have a race + * with NMI/SMIs. But this really isn't a problem. + * Here we do a read/modify/write, and the race happens + * when an NMI/SMI comes in after the read and before + * the write. But NMI/SMIs will increment this counter + * twice before returning, so the zero bit will not + * be corrupted by the NMI/SMI which is the most important + * part. + * + * The only thing is that we would bring back the counter + * to a postion that it was in during the NMI/SMI. + * But the zero bit would be set, so the rest of the + * counter would again be ignored. + * + * On return from the IRQ, the counter may have the zero + * bit be 0 and the counter the same as the return from + * the NMI/SMI. If the state machine was so unlucky to + * see that, it still doesn't matter, since all + * RCU read-side critical sections on this CPU would + * have already completed. + */ + per_cpu(dynticks_progress_counter, cpu)++; + /* + * The following memory barrier ensures that any + * rcu_read_lock() primitives in the irq handler + * are seen by other CPUs to follow the above + * increment to dynticks_progress_counter. This is + * required in order for other CPUs to correctly + * determine when it is safe to advance the RCU + * grace-period state machine. + */ + smp_mb(); /* see above block comment. */ + /* + * Since we can't determine the dynamic tick mode from + * the dynticks_progress_counter after this routine, + * we use a second flag to acknowledge that we came + * from an idle state with ticks stopped. + */ + per_cpu(rcu_update_flag, cpu)++; + /* + * If we take an NMI/SMI now, they will also increment + * the rcu_update_flag, and will not update the + * dynticks_progress_counter on exit. That is for + * this IRQ to do. + */ + } +} + +/** + * rcu_irq_exit - Called from exiting Hard irq context. + * + * If the CPU was idle with dynamic ticks active, update the + * dynticks_progress_counter to put let the RCU handling be + * aware that the CPU is going back to idle with no ticks. + */ +void rcu_irq_exit(void) +{ + int cpu = smp_processor_id(); + + /* + * rcu_update_flag is set if we interrupted the CPU + * when it was idle with ticks stopped. + * Once this occurs, we keep track of interrupt nesting + * because a NMI/SMI could also come in, and we still + * only want the IRQ that started the increment of the + * dynticks_progress_counter to be the one that modifies + * it on exit. 
+ */ + if (per_cpu(rcu_update_flag, cpu)) { + if (--per_cpu(rcu_update_flag, cpu)) + return; + + /* This must match the interrupt nesting */ + WARN_ON(in_interrupt()); + + /* + * If an NMI/SMI happens now we are still + * protected by the dynticks_progress_counter being odd. + */ + + /* + * The following memory barrier ensures that any + * rcu_read_unlock() primitives in the irq handler + * are seen by other CPUs to preceed the following + * increment to dynticks_progress_counter. This + * is required in order for other CPUs to determine + * when it is safe to advance the RCU grace-period + * state machine. + */ + smp_mb(); /* see above block comment. */ + per_cpu(dynticks_progress_counter, cpu)++; + WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1); + } +} + +static void dyntick_save_progress_counter(int cpu) +{ + per_cpu(rcu_dyntick_snapshot, cpu) = + per_cpu(dynticks_progress_counter, cpu); +} + +static inline int +rcu_try_flip_waitack_needed(int cpu) +{ + long curr; + long snap; + + curr = per_cpu(dynticks_progress_counter, cpu); + snap = per_cpu(rcu_dyntick_snapshot, cpu); + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + + /* + * If the CPU remained in dynticks mode for the entire time + * and didn't take any interrupts, NMIs, SMIs, or whatever, + * then it cannot be in the middle of an rcu_read_lock(), so + * the next rcu_read_lock() it executes must use the new value + * of the counter. So we can safely pretend that this CPU + * already acknowledged the counter. + */ + + if ((curr == snap) && ((curr & 0x1) == 0)) + return 0; + + /* + * If the CPU passed through or entered a dynticks idle phase with + * no active irq handlers, then, as above, we can safely pretend + * that this CPU already acknowledged the counter. + */ + + if ((curr - snap) > 2 || (snap & 0x1) == 0) + return 0; + + /* We need this CPU to explicitly acknowledge the counter flip. */ + + return 1; +} + +static inline int +rcu_try_flip_waitmb_needed(int cpu) +{ + long curr; + long snap; + + curr = per_cpu(dynticks_progress_counter, cpu); + snap = per_cpu(rcu_dyntick_snapshot, cpu); + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + + /* + * If the CPU remained in dynticks mode for the entire time + * and didn't take any interrupts, NMIs, SMIs, or whatever, + * then it cannot have executed an RCU read-side critical section + * during that time, so there is no need for it to execute a + * memory barrier. + */ + + if ((curr == snap) && ((curr & 0x1) == 0)) + return 0; + + /* + * If the CPU either entered or exited an outermost interrupt, + * SMI, NMI, or whatever handler, then we know that it executed + * a memory barrier when doing so. So we don't need another one. + */ + if (curr != snap) + return 0; + + /* We need the CPU to execute a memory barrier. */ + + return 1; +} + +#else /* !CONFIG_NO_HZ */ + +# define dyntick_save_progress_counter(cpu) do { } while (0) +# define rcu_try_flip_waitack_needed(cpu) (1) +# define rcu_try_flip_waitmb_needed(cpu) (1) + +#endif /* CONFIG_NO_HZ */ + /* * Get here when RCU is idle. Decide whether we need to * move out of idle state, and return non-zero if so. @@ -447,8 +657,10 @@ rcu_try_flip_idle(void) /* Now ask each CPU for acknowledgement of the flip. 
*/ - for_each_cpu_mask(cpu, rcu_cpu_online_map) + for_each_cpu_mask(cpu, rcu_cpu_online_map) { per_cpu(rcu_flip_flag, cpu) = rcu_flipped; + dyntick_save_progress_counter(cpu); + } return 1; } @@ -464,7 +676,8 @@ rcu_try_flip_waitack(void) RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); for_each_cpu_mask(cpu, rcu_cpu_online_map) - if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { + if (rcu_try_flip_waitack_needed(cpu) && + per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); return 0; } @@ -509,8 +722,10 @@ rcu_try_flip_waitzero(void) smp_mb(); /* ^^^^^^^^^^^^ */ /* Call for a memory barrier from each CPU. */ - for_each_cpu_mask(cpu, rcu_cpu_online_map) + for_each_cpu_mask(cpu, rcu_cpu_online_map) { per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; + dyntick_save_progress_counter(cpu); + } RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); return 1; @@ -528,7 +743,8 @@ rcu_try_flip_waitmb(void) RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); for_each_cpu_mask(cpu, rcu_cpu_online_map) - if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { + if (rcu_try_flip_waitmb_needed(cpu) && + per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); return 0; } diff --git a/kernel/softirq.c b/kernel/softirq.c index 5b3aea5f471..31e9f2a4792 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -313,6 +313,7 @@ void irq_exit(void) /* Make sure that timer wheel updates are propagated */ if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) tick_nohz_stop_sched_tick(); + rcu_irq_exit(); #endif preempt_enable_no_resched(); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fa9bb73dbdb..2968298f8f3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -282,6 +282,7 @@ void tick_nohz_stop_sched_tick(void) ts->idle_tick = ts->sched_timer.expires; ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; + rcu_enter_nohz(); } /* @@ -375,6 +376,8 @@ void tick_nohz_restart_sched_tick(void) return; } + rcu_exit_nohz(); + /* Update jiffies first */ select_nohz_load_balancer(0); now = ktime_get(); -- cgit From 7be2a03e3174cee3a3cdcdf17db357470f51caff Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Fri, 8 Feb 2008 15:41:13 +0100 Subject: softlockup: fix task state setting kthread_stop() can be called when a 'watchdog' thread is executing after kthread_should_stop() but before set_task_state(TASK_INTERRUPTIBLE). Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 7c2da88db4e..01b6522fd92 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -216,26 +216,27 @@ static int watchdog(void *__bind_cpu) /* initialize timestamp */ touch_softlockup_watchdog(); + set_current_state(TASK_INTERRUPTIBLE); /* * Run briefly once per second to reset the softlockup timestamp. * If this gets delayed for more than 60 seconds then the * debug-printout triggers in softlockup_tick(). 
*/ while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); touch_softlockup_watchdog(); schedule(); if (kthread_should_stop()) break; - if (this_cpu != check_cpu) - continue; - - if (sysctl_hung_task_timeout_secs) - check_hung_uninterruptible_tasks(this_cpu); + if (this_cpu == check_cpu) { + if (sysctl_hung_task_timeout_secs) + check_hung_uninterruptible_tasks(this_cpu); + } + set_current_state(TASK_INTERRUPTIBLE); } + __set_current_state(TASK_RUNNING); return 0; } -- cgit From ae778869ae4549628b9e83efe958c3aaa63ed1b9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Feb 2008 16:21:10 -0800 Subject: rcupreempt: fix hibernate/resume in the presence of PREEMPT_RCU and hotplug This fixes an oops encountered when doing hibernate/resume in the presence of PREEMPT_RCU. The problem was that the code failed to disable preemption when accessing a per-CPU variable. This is OK when called from code that already has preemption disabled, but such is not the case from the suspend/resume code path. Reported-by: Dave Young Tested-by: Dave Young Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index c7c52096df4..845abcd472b 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -918,8 +918,9 @@ void rcu_offline_cpu(int cpu) * fix. */ + local_irq_save(flags); rdp = RCU_DATA_ME(); - spin_lock_irqsave(&rdp->lock, flags); + spin_lock(&rdp->lock); *rdp->nexttail = list; if (list) rdp->nexttail = tail; -- cgit From c9e71002aacc9821e99531dcc130db88bbc8ad05 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Feb 2008 11:51:07 -0800 Subject: rcupreempt: remove never-migrates assumption from rcu_process_callbacks() This patch fixes a potentially invalid access to a per-CPU variable in rcu_process_callbacks(). This per-CPU access needs to be done in such a way as to guarantee that the code using it cannot move to some other CPU before all uses of the value accessed have completed. Even though this code is currently only invoked from softirq context, which currently cannot migrate to some other CPU, life would be better if this code did not silently make such an assumption. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 845abcd472b..e9517014b57 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -952,9 +952,11 @@ static void rcu_process_callbacks(struct softirq_action *unused) { unsigned long flags; struct rcu_head *next, *list; - struct rcu_data *rdp = RCU_DATA_ME(); + struct rcu_data *rdp; - spin_lock_irqsave(&rdp->lock, flags); + local_irq_save(flags); + rdp = RCU_DATA_ME(); + spin_lock(&rdp->lock); list = rdp->donelist; if (list == NULL) { spin_unlock_irqrestore(&rdp->lock, flags); -- cgit
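Both rcupreempt fixes above enforce the same rule: pin the task to its CPU before taking the address of per-CPU data, so that the pointer and the subsequent lock refer to the same CPU. A minimal sketch of the resulting pattern (illustrative; the names mirror the rcupreempt.c code quoted above):

	unsigned long flags;
	struct rcu_data *rdp;

	local_irq_save(flags);		/* no preemption or migration from here */
	rdp = RCU_DATA_ME();		/* per-CPU pointer is now stable */
	spin_lock(&rdp->lock);
	/* ... operate on rdp->donelist ... */
	spin_unlock_irqrestore(&rdp->lock, flags);

Resolving the per-CPU pointer first and taking the lock afterwards, as the old code did, leaves a window in which the task can migrate and then lock another CPU's rcu_data.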