From 788084aba2ab7348257597496befcbccabdc98a3 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 31 Jul 2009 12:54:11 -0400 Subject: Security/SELinux: seperate lsm specific mmap_min_addr Currently SELinux enforcement of controls on the ability to map low memory is determined by the mmap_min_addr tunable. This patch causes SELinux to ignore the tunable and instead use a seperate Kconfig option specific to how much space the LSM should protect. The tunable will now only control the need for CAP_SYS_RAWIO and SELinux permissions will always protect the amount of low memory designated by CONFIG_LSM_MMAP_MIN_ADDR. This allows users who need to disable the mmap_min_addr controls (usual reason being they run WINE as a non-root user) to do so and still have SELinux controls preventing confined domains (like a web server) from being able to map some area of low memory. Signed-off-by: Eric Paris Signed-off-by: James Morris --- kernel/sysctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 98e02328c67..58be76017fd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -1306,10 +1307,10 @@ static struct ctl_table vm_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "mmap_min_addr", - .data = &mmap_min_addr, - .maxlen = sizeof(unsigned long), + .data = &dac_mmap_min_addr, + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &mmap_min_addr_handler, }, #ifdef CONFIG_NUMA { -- cgit From de809347aeef0a68c04576c464414d0e4dce59fc Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Mon, 17 Aug 2009 05:43:01 -0400 Subject: timers: Drop write permission on /proc/timer_list /proc/timer_list and /proc/slabinfo are not supposed to be written, so there should be no write permissions on it. Signed-off-by: WANG Cong Cc: Pekka Enberg Cc: Vegard Nossum Cc: Eduard - Gabriel Munteanu Cc: linux-mm@kvack.org Cc: Christoph Lameter Cc: David Rientjes Cc: Amerigo Wang Cc: Matt Mackall Cc: Arjan van de Ven LKML-Reference: <20090817094525.6355.88682.sendpatchset@localhost.localdomain> Signed-off-by: Ingo Molnar --- kernel/time/timer_list.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index a999b92a127..fddd69d16e0 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -286,7 +286,7 @@ static int __init init_timer_list_procfs(void) { struct proc_dir_entry *pe; - pe = proc_create("timer_list", 0644, NULL, &timer_list_fops); + pe = proc_create("timer_list", 0444, NULL, &timer_list_fops); if (!pe) return -ENOMEM; return 0; -- cgit From f2d84b65b9778e8a35dd904f7d3993f0a60c9756 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 7 Aug 2009 18:55:48 +0800 Subject: ftrace: Unify effect of writing to trace_options and option/* "echo noglobal-clock > trace_options" can be used to change trace clock but "echo 0 > options/global-clock" can't. The flag toggling will be silently accepted without actually changing the clock callback. We can fix it by using set_tracer_flags() in trace_options_core_write(). Changelog: v1->v2: Simplified switch() after Li Zefan 's suggestion Signed-off-by: Zhao Lei Cc: Steven Rostedt Cc: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c22b40f8f57..8c358395d33 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3896,17 +3896,9 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret < 0) return ret; - switch (val) { - case 0: - trace_flags &= ~(1 << index); - break; - case 1: - trace_flags |= 1 << index; - break; - - default: + if (val != 0 && val != 1) return -EINVAL; - } + set_tracer_flags(1 << index, val); *ppos += cnt; -- cgit From 69ab849439b506cd8dd2879527fdb64d95dd5211 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Aug 2009 14:07:16 +0200 Subject: genirq: Wake up irq thread after action has been installed The wake_up_process() of the new irq thread in __setup_irq() is too early as the irqaction is not yet fully initialized especially action->irq is not yet set. The interrupt thread might dereference the wrong irq descriptor. Move the wakeup after the action is installed and action->irq has been set. Reported-by: Michael Buesch Signed-off-by: Thomas Gleixner Tested-by: Michael Buesch --- kernel/irq/manage.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d222515a5a0..0ec9ed83173 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -607,7 +607,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ get_task_struct(t); new->thread = t; - wake_up_process(t); } /* @@ -690,6 +689,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) (int)(new->flags & IRQF_TRIGGER_MASK)); } + new->irq = irq; *old_ptr = new; /* Reset broken irq detection when installing new handler */ @@ -707,7 +707,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) spin_unlock_irqrestore(&desc->lock, flags); - new->irq = irq; + /* + * Strictly no need to wake it up, but hung_task complains + * when no hard interrupt wakes the thread up. + */ + if (new->thread) + wake_up_process(new->thread); + register_irq_proc(irq, desc); new->dir = NULL; register_handler_proc(irq, new); -- cgit From 0753ba01e126020bf0f8150934903b48935b697d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 18 Aug 2009 14:11:10 -0700 Subject: mm: revert "oom: move oom_adj value" The commit 2ff05b2b (oom: move oom_adj value) moveed the oom_adj value to the mm_struct. It was a very good first step for sanitize OOM. However Paul Menage reported the commit makes regression to his job scheduler. Current OOM logic can kill OOM_DISABLED process. Why? His program has the code of similar to the following. ... set_oom_adj(OOM_DISABLE); /* The job scheduler never killed by oom */ ... if (vfork() == 0) { set_oom_adj(0); /* Invoked child can be killed */ execve("foo-bar-cmd"); } .... vfork() parent and child are shared the same mm_struct. then above set_oom_adj(0) doesn't only change oom_adj for vfork() child, it's also change oom_adj for vfork() parent. Then, vfork() parent (job scheduler) lost OOM immune and it was killed. Actually, fork-setting-exec idiom is very frequently used in userland program. We must not break this assumption. Then, this patch revert commit 2ff05b2b and related commit. Reverted commit list --------------------- - commit 2ff05b2b4e (oom: move oom_adj value from task_struct to mm_struct) - commit 4d8b9135c3 (oom: avoid unnecessary mm locking and scanning for OOM_DISABLE) - commit 8123681022 (oom: only oom kill exiting tasks with attached memory) - commit 933b787b57 (mm: copy over oom_adj value at fork time) Signed-off-by: KOSAKI Motohiro Cc: Paul Menage Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Nick Piggin Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 021e1138556..144326b7af5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -426,7 +426,6 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; - mm->oom_adj = (current->mm) ? current->mm->oom_adj : 0; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); -- cgit From eda1e328556565e211b7450250e40d6de751563a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 11 Aug 2009 17:29:04 +0200 Subject: tracing: handle broken names in ftrace filter If one filter item (for set_ftrace_filter and set_ftrace_notrace) is being setup by more than 1 consecutive writes (FTRACE_ITER_CONT flag), it won't be handled corretly. I used following program to test/verify: [snip] #include #include #include #include #include int main(int argc, char **argv) { int fd, i; char *file = argv[1]; if (-1 == (fd = open(file, O_WRONLY))) { perror("open failed"); return -1; } for(i = 0; i < (argc - 2); i++) { int len = strlen(argv[2+i]); int cnt, off = 0; while(len) { cnt = write(fd, argv[2+i] + off, len); len -= cnt; off += cnt; } } close(fd); return 0; } [snip] before change: sh-4.0# echo > ./set_ftrace_filter sh-4.0# /test ./set_ftrace_filter "sys" "_open " sh-4.0# cat ./set_ftrace_filter #### all functions enabled #### sh-4.0# after change: sh-4.0# echo > ./set_ftrace_notrace sh-4.0# test ./set_ftrace_notrace "sys" "_open " sh-4.0# cat ./set_ftrace_notrace sys_open sh-4.0# Signed-off-by: Jiri Olsa LKML-Reference: <20090811152904.GA26065@jolsa.lab.eng.brq.redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1e1d23c2630..25edd5cc593 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2278,7 +2278,11 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, read++; cnt--; - if (!(iter->flags & ~FTRACE_ITER_CONT)) { + /* + * If the parser haven't finished with the last write, + * continue reading the user input without skipping spaces. + */ + if (!(iter->flags & FTRACE_ITER_CONT)) { /* skip white space */ while (cnt && isspace(ch)) { ret = get_user(ch, ubuf++); @@ -2288,8 +2292,9 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, cnt--; } + /* only spaces were written */ if (isspace(ch)) { - file->f_pos += read; + *ppos += read; ret = read; goto out; } @@ -2319,12 +2324,12 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, if (ret) goto out; iter->buffer_idx = 0; - } else + } else { iter->flags |= FTRACE_ITER_CONT; + iter->buffer[iter->buffer_idx++] = ch; + } - - file->f_pos += read; - + *ppos += read; ret = read; out: mutex_unlock(&ftrace_regex_lock); -- cgit From f833bab87fca5c3ce13778421b1365845843b976 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 17 Aug 2009 14:34:59 -0700 Subject: clockevent: Prevent dead lock on clockevents_lock Currently clockevents_notify() is called with interrupts enabled at some places and interrupts disabled at some other places. This results in a deadlock in this scenario. cpu A holds clockevents_lock in clockevents_notify() with irqs enabled cpu B waits for clockevents_lock in clockevents_notify() with irqs disabled cpu C doing set_mtrr() which will try to rendezvous of all the cpus. This will result in C and A come to the rendezvous point and waiting for B. B is stuck forever waiting for the spinlock and thus not reaching the rendezvous point. Fix the clockevents code so that clockevents_lock is taken with interrupts disabled and thus avoid the above deadlock. Also call lapic_timer_propagate_broadcast() on the destination cpu so that we avoid calling smp_call_function() in the clockevents notifier chain. This issue left us wondering if we need to change the MTRR rendezvous logic to use stop machine logic (instead of smp_call_function) or add a check in spinlock debug code to see if there are other spinlocks which gets taken under both interrupts enabled/disabled conditions. Signed-off-by: Suresh Siddha Signed-off-by: Venkatesh Pallipadi Cc: "Pallipadi Venkatesh" Cc: "Brown Len" LKML-Reference: <1250544899.2709.210.camel@sbs-t61.sc.intel.com> Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 16 ++++++++++------ kernel/time/tick-broadcast.c | 7 +++---- 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index a6dcd67b041..620b58abdc3 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -137,11 +137,12 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, */ int clockevents_register_notifier(struct notifier_block *nb) { + unsigned long flags; int ret; - spin_lock(&clockevents_lock); + spin_lock_irqsave(&clockevents_lock, flags); ret = raw_notifier_chain_register(&clockevents_chain, nb); - spin_unlock(&clockevents_lock); + spin_unlock_irqrestore(&clockevents_lock, flags); return ret; } @@ -178,16 +179,18 @@ static void clockevents_notify_released(void) */ void clockevents_register_device(struct clock_event_device *dev) { + unsigned long flags; + BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); BUG_ON(!dev->cpumask); - spin_lock(&clockevents_lock); + spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); clockevents_notify_released(); - spin_unlock(&clockevents_lock); + spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_register_device); @@ -235,8 +238,9 @@ void clockevents_exchange_device(struct clock_event_device *old, void clockevents_notify(unsigned long reason, void *arg) { struct list_head *node, *tmp; + unsigned long flags; - spin_lock(&clockevents_lock); + spin_lock_irqsave(&clockevents_lock, flags); clockevents_do_notify(reason, arg); switch (reason) { @@ -251,7 +255,7 @@ void clockevents_notify(unsigned long reason, void *arg) default: break; } - spin_unlock(&clockevents_lock); + spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_notify); #endif diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 877dbedc311..c2ec25087a3 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -205,11 +205,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) * Powerstate information: The system enters/leaves a state, where * affected devices might stop */ -static void tick_do_broadcast_on_off(void *why) +static void tick_do_broadcast_on_off(unsigned long *reason) { struct clock_event_device *bc, *dev; struct tick_device *td; - unsigned long flags, *reason = why; + unsigned long flags; int cpu, bc_stopped; spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -276,8 +276,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu) printk(KERN_ERR "tick-broadcast: ignoring broadcast for " "offline CPU #%d\n", *oncpu); else - smp_call_function_single(*oncpu, tick_do_broadcast_on_off, - &reason, 1); + tick_do_broadcast_on_off(&reason); } /* -- cgit From 4ab6c08336535f8c8e42cf45d7adeda882eff06e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 Aug 2009 14:29:24 -0700 Subject: clone(): fix race between copy_process() and de_thread() Spotted by Hiroshi Shimamoto who also provided the test-case below. copy_process() uses signal->count as a reference counter, but it is not. This test case #include #include #include #include #include #include void *null_thread(void *p) { for (;;) sleep(1); return NULL; } void *exec_thread(void *p) { execl("/bin/true", "/bin/true", NULL); return null_thread(p); } int main(int argc, char **argv) { for (;;) { pid_t pid; int ret, status; pid = fork(); if (pid < 0) break; if (!pid) { pthread_t tid; pthread_create(&tid, NULL, exec_thread, NULL); for (;;) pthread_create(&tid, NULL, null_thread, NULL); } do { ret = waitpid(pid, &status, 0); } while (ret == -1 && errno == EINTR); } return 0; } quickly creates an unkillable task. If copy_process(CLONE_THREAD) races with de_thread() copy_signal()->atomic(signal->count) breaks the signal->notify_count logic, and the execing thread can hang forever in kernel space. Change copy_process() to increment count/live only when we know for sure we can't fail. In this case the forked thread will take care of its reference to signal correctly. If copy_process() fails, check CLONE_THREAD flag. If it it set - do nothing, the counters were not changed and current belongs to the same thread group. If it is not set, ->signal must be released in any case (and ->count must be == 1), the forked child is the only thread in the thread group. We need more cleanups here, in particular signal->count should not be used by de_thread/__exit_signal at all. This patch only fixes the bug. Reported-by: Hiroshi Shimamoto Tested-by: Hiroshi Shimamoto Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: KAMEZAWA Hiroyuki Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 144326b7af5..e6c04d462ab 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -815,11 +815,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { struct signal_struct *sig; - if (clone_flags & CLONE_THREAD) { - atomic_inc(¤t->signal->count); - atomic_inc(¤t->signal->live); + if (clone_flags & CLONE_THREAD) return 0; - } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; @@ -877,16 +874,6 @@ void __cleanup_signal(struct signal_struct *sig) kmem_cache_free(signal_cachep, sig); } -static void cleanup_signal(struct task_struct *tsk) -{ - struct signal_struct *sig = tsk->signal; - - atomic_dec(&sig->live); - - if (atomic_dec_and_test(&sig->count)) - __cleanup_signal(sig); -} - static void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; @@ -1239,6 +1226,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (clone_flags & CLONE_THREAD) { + atomic_inc(¤t->signal->count); + atomic_inc(¤t->signal->live); p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); } @@ -1282,7 +1271,8 @@ bad_fork_cleanup_mm: if (p->mm) mmput(p->mm); bad_fork_cleanup_signal: - cleanup_signal(p); + if (!(clone_flags & CLONE_THREAD)) + __cleanup_signal(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: -- cgit From 7d1d16e416e61aeef8655d542f8e4a4fc6e808e4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 26 Aug 2009 22:02:54 +0930 Subject: module: fix BUG_ON() for powerpc (and other function descriptor archs) The rarely-used symbol_put_addr() needs to use dereference_function_descriptor on powerpc. Reported-by: Paul Mackerras Signed-off-by: Rusty Russell --- kernel/module.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index fd141140355..07c80e68a6c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -909,16 +909,18 @@ void __symbol_put(const char *symbol) } EXPORT_SYMBOL(__symbol_put); +/* Note this assumes addr is a function, which it currently always is. */ void symbol_put_addr(void *addr) { struct module *modaddr; + unsigned long a = (unsigned long)dereference_function_descriptor(addr); - if (core_kernel_text((unsigned long)addr)) + if (core_kernel_text(a)) return; /* module_text_address is safe here: we're supposed to have reference * to module from symbol_get, so it can't go away. */ - modaddr = __module_text_address((unsigned long)addr); + modaddr = __module_text_address(a); BUG_ON(!modaddr); module_put(modaddr); } -- cgit From 1b364bf438cf337a3818aee77d68c0713f3e1fc4 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Wed, 26 Aug 2009 22:04:12 +0930 Subject: module: workaround duplicate section names The root cause is a duplicate section name (.text); is this legal? [ Amerigo Wang: "AFAIK, yes." ] However, there's a problem with commit 6d76013381ed28979cd122eb4b249a88b5e384fa in that if you fail to allocate a mod->sect_attrs (in this case it's null because of the duplication), it still gets used without checking in add_notes_attrs() This should fix it [ This patch leaves other problems, particularly the sections directory, but recent parisc toolchains seem to produce these modules and this prevents a crash and is a minimal change -- RR ] Signed-off-by: James Bottomley Signed-off-by: Rusty Russell Tested-by: Helge Deller Signed-off-by: Linus Torvalds --- kernel/module.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 07c80e68a6c..eccb561dd8a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2355,7 +2355,8 @@ static noinline struct module *load_module(void __user *umod, if (err < 0) goto unlink; add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); + if (mod->sect_attrs) + add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); /* Get rid of temporary copy */ vfree(hdr); -- cgit From 6bb56347f5162d1a7cb1dc461023360781ecd4c0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 28 Aug 2009 13:44:53 +0200 Subject: perf_counters: Increase paranoia level Per-cpu counters are an ASLR information leak as they show the execution other tasks do. Increase the paranoia level to 1, which disallows per-cpu counters. (they still allow counting/profiling of own tasks - and admin can profile everything.) Acked-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f274e195988..7d4bb83b78c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -50,7 +50,7 @@ static atomic_t nr_task_counters __read_mostly; * 1 - disallow cpu counters to unpriv * 2 - disallow kernel profiling to unpriv */ -int sysctl_perf_counter_paranoid __read_mostly; +int sysctl_perf_counter_paranoid __read_mostly = 1; static inline bool perf_paranoid_cpu(void) { -- cgit From eced1dfcfcf6b0a35e925d73916a9d8e36ab5457 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Aug 2009 17:10:47 +0200 Subject: perf_counter: Fix /0 bug in swcounters We have a race in the swcounter stuff where we can start counting a counter that has never been enabled, this leads to a /0 situation. The below avoids the /0 but doesn't close the race, this would need a new counter state. The race is due to perf_swcounter_is_counting() which cannot discern between disabled due to scheduled out, and disabled for any other reason. Such a crash has been seen by Ingo: [ 967.092372] divide error: 0000 [#1] SMP [ 967.096499] last sysfs file: /sys/devices/system/cpu/cpu15/cache/index2/shared_cpu_map [ 967.104846] CPU 5 [ 967.106965] Modules linked in: [ 967.110169] Pid: 3351, comm: hackbench Not tainted 2.6.31-rc8-tip-01158-gd940a54-dirty #1568 X8DTN [ 967.119456] RIP: 0010:[] [] perf_swcounter_ctx_event+0x127/0x1af [ 967.129137] RSP: 0018:ffff8801a95abd70 EFLAGS: 00010046 [ 967.134699] RAX: 0000000000000002 RBX: ffff8801bd645c00 RCX: 0000000000000002 [ 967.142162] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8801bd645d40 [ 967.149584] RBP: ffff8801a95abdb0 R08: 0000000000000001 R09: ffff8801a95abe00 [ 967.157042] R10: 0000000000000037 R11: ffff8801aa1245f8 R12: ffff8801a95abe00 [ 967.164481] R13: ffff8801a95abe00 R14: ffff8801aa1c0e78 R15: 0000000000000001 [ 967.171953] FS: 0000000000000000(0000) GS:ffffc90000a00000(0063) knlGS:00000000f7f486c0 [ 967.180406] CS: 0010 DS: 002b ES: 002b CR0: 000000008005003b [ 967.186374] CR2: 000000004822c0ac CR3: 00000001b19a2000 CR4: 00000000000006e0 [ 967.193770] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 967.201224] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 967.208692] Process hackbench (pid: 3351, threadinfo ffff8801a95aa000, task ffff8801a96b0000) [ 967.217607] Stack: [ 967.219711] 0000000000000000 0000000000000037 0000000200000001 ffffc90000a1107c [ 967.227296] <0> ffff8801a95abe00 0000000000000001 0000000000000001 0000000000000037 [ 967.235333] <0> ffff8801a95abdf0 ffffffff810c0c20 0000000200a14f30 ffff8801a95abe40 [ 967.243532] Call Trace: [ 967.246103] [] do_perf_swcounter_event+0xde/0xec [ 967.252635] [] perf_tpcounter_event+0x79/0x7b [ 967.258957] [] ftrace_profile_sched_switch+0xc0/0xcb [ 967.265791] [] schedule+0x429/0x4c4 [ 967.271156] [] int_careful+0xd/0x14 Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1251472247.17617.74.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7d4bb83b78c..d7cbc579fc8 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -4066,6 +4066,7 @@ perf_counter_alloc(struct perf_counter_attr *attr, hwc->sample_period = attr->sample_period; if (attr->freq && attr->sample_freq) hwc->sample_period = 1; + hwc->last_period = hwc->sample_period; atomic64_set(&hwc->period_left, hwc->sample_period); -- cgit