From 41719b03091911028116155deddc5eedf8c45e37 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Jan 2009 15:36:26 +0100 Subject: mutex: preemption fixes The problem is that dropping the spinlock right before schedule is a voluntary preemption point and can cause a schedule, right after which we schedule again. Fix this inefficiency by keeping preemption disabled until we schedule, do this by explicity disabling preemption and providing a schedule() variant that assumes preemption is already disabled. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cae9b81a1f..9f0b372cfa6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -328,6 +328,7 @@ extern signed long schedule_timeout(signed long timeout); extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); +asmlinkage void __schedule(void); asmlinkage void schedule(void); struct nsproxy; -- cgit From 0d66bf6d3514b35eb6897629059443132992dbd7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 12 Jan 2009 14:01:47 +0100 Subject: mutex: implement adaptive spinning Change mutex contention behaviour such that it will sometimes busy wait on acquisition - moving its behaviour closer to that of spinlocks. This concept got ported to mainline from the -rt tree, where it was originally implemented for rtmutexes by Steven Rostedt, based on work by Gregory Haskins. Testing with Ingo's test-mutex application (http://lkml.org/lkml/2006/1/8/50) gave a 345% boost for VFS scalability on my testbox: # ./test-mutex-shm V 16 10 | grep "^avg ops" avg ops/sec: 296604 # ./test-mutex-shm V 16 10 | grep "^avg ops" avg ops/sec: 85870 The key criteria for the busy wait is that the lock owner has to be running on a (different) cpu. The idea is that as long as the owner is running, there is a fair chance it'll release the lock soon, and thus we'll be better off spinning instead of blocking/scheduling. Since regular mutexes (as opposed to rtmutexes) do not atomically track the owner, we add the owner in a non-atomic fashion and deal with the races in the slowpath. Furthermore, to ease the testing of the performance impact of this new code, there is means to disable this behaviour runtime (without having to reboot the system), when scheduler debugging is enabled (CONFIG_SCHED_DEBUG=y), by issuing the following command: # echo NO_OWNER_SPIN > /debug/sched_features This command re-enables spinning again (this is also the default): # echo OWNER_SPIN > /debug/sched_features Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9f0b372cfa6..c34b137cd1e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -330,6 +330,7 @@ extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void __schedule(void); asmlinkage void schedule(void); +extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); struct nsproxy; struct user_namespace; -- cgit From e162b39a368f0401e41b558f430c354d12a85b37 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 15 Jan 2009 11:08:40 -0800 Subject: softlockup: decouple hung tasks check from softlockup detection Decoupling allows: * hung tasks check to happen at very low priority * hung tasks check and softlockup to be enabled/disabled independently at compile and/or run-time * individual panic settings to be enabled disabled independently at compile and/or run-time * softlockup threshold to be reduced without increasing hung tasks poll frequency (hung task check is expensive relative to softlock watchdog) * hung task check to be zero over-head when disabled at run-time Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- include/linux/sched.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 54cbabf3b87..f2f94d53230 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -297,9 +297,6 @@ extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; -extern unsigned long sysctl_hung_task_check_count; -extern unsigned long sysctl_hung_task_timeout_secs; -extern unsigned long sysctl_hung_task_warnings; extern int softlockup_thresh; #else static inline void softlockup_tick(void) @@ -316,6 +313,15 @@ static inline void touch_all_softlockup_watchdogs(void) } #endif +#ifdef CONFIG_DETECT_HUNG_TASK +extern unsigned int sysctl_hung_task_panic; +extern unsigned long sysctl_hung_task_check_count; +extern unsigned long sysctl_hung_task_timeout_secs; +extern unsigned long sysctl_hung_task_warnings; +extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos); +#endif /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) @@ -1236,7 +1242,7 @@ struct task_struct { /* ipc stuff */ struct sysv_sem sysvsem; #endif -#ifdef CONFIG_DETECT_SOFTLOCKUP +#ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ unsigned long last_switch_timestamp; unsigned long last_switch_count; -- cgit From 7e49fcce1bdadd723ae6a0b3b324c4daced61563 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Jan 2009 19:01:40 -0500 Subject: trace, lockdep: manual preempt count adding for local_bh_disable Impact: fix to preempt trace triggering lockdep check_flag failure In local_bh_disable, the use of add_preempt_count causes the preempt tracer to start recording the time preemption is off. But because it already modified the preempt_count to show softirqs disabled, and before it called the lockdep code to handle this, it causes a state that lockdep can not handle. The preempt tracer will reset the ring buffer on start of a trace, and the ring buffer reset code does a spin_lock_irqsave. This calls into lockdep and lockdep will fail when it detects the invalid state of having softirqs disabled but the internal current->softirqs_enabled is still set. The fix is to manually add the SOFTIRQ_OFFSET to preempt count and call the preempt tracer code outside the lockdep critical area. Thanks to Peter Zijlstra for suggesting this solution. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cae9b81a1f..33085b88f87 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -137,6 +137,8 @@ extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern unsigned long get_parent_ip(unsigned long addr); + struct seq_file; struct cfs_rq; struct task_group; -- cgit From 5e54f5986a579b8445aa1d5ad3435c2cf7568bed Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Fri, 30 Jan 2009 15:29:54 -0800 Subject: softlockup: remove unused definition for spawn_softlockup_task The definition of spawn_softlockup_task in sched.h became unnecessary once it was converted to the early_initcall() interface. Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index f2f94d53230..2a2811c6239 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -302,9 +302,6 @@ extern int softlockup_thresh; static inline void softlockup_tick(void) { } -static inline void spawn_softlockup_task(void) -{ -} static inline void touch_softlockup_watchdog(void) { } -- cgit From 17406b82d621930cca8ccc1272cdac9a7dae8e40 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Fri, 6 Feb 2009 15:37:47 -0800 Subject: softlockup: remove timestamp checking from hung_task Impact: saves sizeof(long) bytes per task_struct By guaranteeing that sysctl_hung_task_timeout_secs have elapsed between tasklist scans we can avoid using timestamps. Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2a2811c6239..e0d723fea9f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1241,7 +1241,6 @@ struct task_struct { #endif #ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ - unsigned long last_switch_timestamp; unsigned long last_switch_count; #endif /* CPU-specific state of this task */ -- cgit From cf40bd16fdad42c053040bcd3988f5fdedbb6c57 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 21 Jan 2009 08:12:39 +0100 Subject: lockdep: annotate reclaim context (__GFP_NOFS) Here is another version, with the incremental patch rolled up, and added reclaim context annotation to kswapd, and allocation tracing to slab allocators (which may only ever reach the page allocator in rare cases, so it is good to put annotations here too). Haven't tested this version as such, but it should be getting closer to merge worthy ;) -- After noticing some code in mm/filemap.c accidentally perform a __GFP_FS allocation when it should not have been, I thought it might be a good idea to try to catch this kind of thing with lockdep. I coded up a little idea that seems to work. Unfortunately the system has to actually be in __GFP_FS page reclaim, then take the lock, before it will mark it. But at least that might still be some orders of magnitude more common (and more debuggable) than an actual deadlock condition, so we have some improvement I hope (the concept is no less complete than discovery of a lock's interrupt contexts). I guess we could even do the same thing with __GFP_IO (normal reclaim), and even GFP_NOIO locks too... but filesystems will have the most locks and fiddly code paths, so let's start there and see how it goes. It *seems* to work. I did a quick test. ================================= [ INFO: inconsistent lock state ] 2.6.28-rc6-00007-ged31348-dirty #26 --------------------------------- inconsistent {in-reclaim-W} -> {ov-reclaim-W} usage. modprobe/8526 [HC0[0]:SC0[0]:HE1:SE1] takes: (testlock){--..}, at: [] brd_init+0x55/0x216 [brd] {in-reclaim-W} state was registered at: [] __lock_acquire+0x75b/0x1a60 [] lock_acquire+0x91/0xc0 [] mutex_lock_nested+0xb1/0x310 [] brd_init+0x2b/0x216 [brd] [] _stext+0x3b/0x170 [] sys_init_module+0xaf/0x1e0 [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff irq event stamp: 3929 hardirqs last enabled at (3929): [] mutex_lock_nested+0x285/0x310 hardirqs last disabled at (3928): [] mutex_lock_nested+0x59/0x310 softirqs last enabled at (3732): [] sk_filter+0x83/0xe0 softirqs last disabled at (3730): [] sk_filter+0x16/0xe0 other info that might help us debug this: 1 lock held by modprobe/8526: #0: (testlock){--..}, at: [] brd_init+0x55/0x216 [brd] stack backtrace: Pid: 8526, comm: modprobe Not tainted 2.6.28-rc6-00007-ged31348-dirty #26 Call Trace: [] print_usage_bug+0x193/0x1d0 [] mark_lock+0xaf0/0xca0 [] mark_held_locks+0x55/0xc0 [] ? brd_init+0x0/0x216 [brd] [] trace_reclaim_fs+0x2a/0x60 [] __alloc_pages_internal+0x475/0x580 [] ? mutex_lock_nested+0x26e/0x310 [] ? brd_init+0x0/0x216 [brd] [] brd_init+0x6a/0x216 [brd] [] ? brd_init+0x0/0x216 [brd] [] _stext+0x3b/0x170 [] ? mutex_unlock+0x9/0x10 [] ? __mutex_unlock_slowpath+0x10d/0x180 [] ? trace_hardirqs_on_caller+0x12c/0x190 [] sys_init_module+0xaf/0x1e0 [] system_call_fastpath+0x16/0x1b Signed-off-by: Nick Piggin Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4efb552aca4..b00a77f4999 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1313,6 +1313,7 @@ struct task_struct { int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; + gfp_t lockdep_reclaim_gfp; #endif /* journalling filesystem info */ -- cgit From 3aa551c9b4c40018f0e261a178e3d25478dc04a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 23 Mar 2009 18:28:15 +0100 Subject: genirq: add threaded interrupt handler support Add support for threaded interrupt handlers: A device driver can request that its main interrupt handler runs in a thread. To achive this the device driver requests the interrupt with request_threaded_irq() and provides additionally to the handler a thread function. The handler function is called in hard interrupt context and needs to check whether the interrupt originated from the device. If the interrupt originated from the device then the handler can either return IRQ_HANDLED or IRQ_WAKE_THREAD. IRQ_HANDLED is returned when no further action is required. IRQ_WAKE_THREAD causes the genirq code to invoke the threaded (main) handler. When IRQ_WAKE_THREAD is returned handler must have disabled the interrupt on the device level. This is mandatory for shared interrupt handlers, but we need to do it as well for obscure x86 hardware where disabling an interrupt on the IO_APIC level redirects the interrupt to the legacy PIC interrupt lines. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 46d680643f8..38b77b0f56e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1292,6 +1292,11 @@ struct task_struct { /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ spinlock_t alloc_lock; +#ifdef CONFIG_GENERIC_HARDIRQS + /* IRQ handler threads */ + struct irqaction *irqaction; +#endif + /* Protection of the PI data structures: */ spinlock_t pi_lock; -- cgit From 8aef2d2856158a36c295a8d1288281e4839bff13 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 01:10:15 -0400 Subject: function-graph: ignore times across schedule Impact: more accurate timings The current method of function graph tracing does not take into account the time spent when a task is not running. This shows functions that call schedule have increased costs: 3) + 18.664 us | } ------------------------------------------ 3) -0 => kblockd-123 ------------------------------------------ 3) | finish_task_switch() { 3) 1.441 us | _spin_unlock_irq(); 3) 3.966 us | } 3) ! 2959.433 us | } 3) ! 2961.465 us | } This patch uses the tracepoint in the scheduling context switch to account for time that has elapsed while a task is scheduled out. Now we see: ------------------------------------------ 3) -0 => edac-po-1067 ------------------------------------------ 3) | finish_task_switch() { 3) 0.685 us | _spin_unlock_irq(); 3) 2.331 us | } 3) + 41.439 us | } 3) + 42.663 us | } Signed-off-by: Steven Rostedt --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 89cd308cc7a..471e36d3012 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1409,6 +1409,8 @@ struct task_struct { int curr_ret_stack; /* Stack of return addresses for return function tracing */ struct ftrace_ret_stack *ret_stack; + /* time stamp for last schedule */ + unsigned long long ftrace_timestamp; /* * Number of functions that haven't been traced * because of depth overrun. -- cgit From 5ad4e53bd5406ee214ddc5a41f03f779b8b2d526 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2009 19:50:06 -0400 Subject: Get rid of indirect include of fs_struct.h Don't pull it in sched.h; very few files actually need it and those can include directly. sched.h itself only needs forward declaration of struct fs_struct; Signed-off-by: Al Viro --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 29df6374d2d..b4e065ea0de 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -68,7 +68,7 @@ struct sched_param { #include #include #include -#include +#include #include #include #include @@ -97,6 +97,7 @@ struct futex_pi_state; struct robust_list_head; struct bio; struct bts_tracer; +struct fs_struct; /* * List of flags we want to share for kernel threads, -- cgit From 9de1581e75ba9d7979766d4ab6d56f0f2d87f7c6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 31 Mar 2009 15:19:29 -0700 Subject: get_mm_hiwater_xxx: trivial, s/define/inline/ Andrew pointed out get_mm_hiwater_xxx() evaluate "mm" argument thrice/twice, make them inline. Signed-off-by: Oleg Nesterov Cc: Hugh Dickins Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 29df6374d2d..481fad3a9b4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -391,8 +391,15 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); (mm)->hiwater_vm = (mm)->total_vm; \ } while (0) -#define get_mm_hiwater_rss(mm) max((mm)->hiwater_rss, get_mm_rss(mm)) -#define get_mm_hiwater_vm(mm) max((mm)->hiwater_vm, (mm)->total_vm) +static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) +{ + return max(mm->hiwater_rss, get_mm_rss(mm)); +} + +static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) +{ + return max(mm->hiwater_vm, mm->total_vm); +} extern void set_dumpable(struct mm_struct *mm, int value); extern int get_dumpable(struct mm_struct *mm); -- cgit From 6f2c55b843836d26528c56a0968689accaedbc67 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 2 Apr 2009 16:56:59 -0700 Subject: Simplify copy_thread() First argument unused since 2.3.11. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 481fad3a9b4..9186f8c5d5f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1975,7 +1975,8 @@ extern void mm_release(struct task_struct *, struct mm_struct *); /* Allocate a new mm structure and copy contents from tsk->mm */ extern struct mm_struct *dup_mm(struct task_struct *tsk); -extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); +extern int copy_thread(unsigned long, unsigned long, unsigned long, + struct task_struct *, struct pt_regs *); extern void flush_thread(void); extern void exit_thread(void); -- cgit From 39c626ae47c469abdfd30c6e42eff884931380d6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:18 -0700 Subject: forget_original_parent: split out the un-ptrace part By discussion with Roland. - Rename ptrace_exit() to exit_ptrace(), and change it to do all the necessary work with ->ptraced list by its own. - Move this code from exit.c to ptrace.c - Update the comment in ptrace_detach() to explain the rechecking of the child->ptrace. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Cc: "Metzger, Markus T" Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9186f8c5d5f..b47c94e7560 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2061,6 +2061,11 @@ static inline int thread_group_empty(struct task_struct *p) #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) +static inline int task_detached(struct task_struct *p) +{ + return p->exit_signal == -1; +} + /* * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also -- cgit From 6dda81f4384b94930826eded254d8c16f89a9248 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:35 -0700 Subject: pids: document task_pgrp/task_session is not safe without tasklist/rcu Even if task == current, it is not safe to dereference the result of task_pgrp/task_session. We can race with another thread which changes the special pid via setpgid/setsid. Document this. The next 2 patches give an example of the unsafe usage, we have more bad users. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Oleg Nesterov Cc: Louis Rilling Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b47c94e7560..722dd313bf8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1489,6 +1489,11 @@ static inline struct pid *task_tgid(struct task_struct *task) return task->group_leader->pids[PIDTYPE_PID].pid; } +/* + * Without tasklist or rcu lock it is not safe to dereference + * the result of task_pgrp/task_session even if task == current, + * we can race with another thread doing sys_setsid/sys_setpgid. + */ static inline struct pid *task_pgrp(struct task_struct *task) { return task->group_leader->pids[PIDTYPE_PGID].pid; -- cgit From 52ee2dfdd4f51cf422ea6a96a0846dc94244aa37 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:38 -0700 Subject: pids: refactor vnr/nr_ns helpers to make them safe Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy. task_pid_nr_ns(task) needs rcu/tasklist depending on task == current. As for "special" pids, vnr/nr_ns helpers always need rcu. However, if task != current, they are unsafe even under rcu lock, we can't trust task->group_leader without the special checks. And almost every helper has a callsite which needs a fix. Also, it is a bit annoying that the implementations of, say, task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical". This patch introduces the new helper, __task_pid_nr_ns(), which is always safe to use, and turns all other helpers into the trivial wrappers. After this I'll send another patch which converts task_tgid_xxx() as well, they're are a bit special. Signed-off-by: Oleg Nesterov Cc: Louis Rilling Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 722dd313bf8..49df878a0ca 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1519,17 +1519,23 @@ struct pid_namespace; * * see also pid_nr() etc in include/linux/pid.h */ +pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + struct pid_namespace *ns); static inline pid_t task_pid_nr(struct task_struct *tsk) { return tsk->pid; } -pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_pid_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); +} static inline pid_t task_pid_vnr(struct task_struct *tsk) { - return pid_vnr(task_pid(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); } @@ -1551,11 +1557,15 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return tsk->signal->__pgrp; } -pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); +} static inline pid_t task_pgrp_vnr(struct task_struct *tsk) { - return pid_vnr(task_pgrp(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); } @@ -1564,14 +1574,17 @@ static inline pid_t task_session_nr(struct task_struct *tsk) return tsk->signal->__session; } -pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_session_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); +} static inline pid_t task_session_vnr(struct task_struct *tsk) { - return pid_vnr(task_session(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } - /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. -- cgit From 1b0f7ffd0ea27cd3a0b9ca04e3df9522048c32a3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:39 -0700 Subject: pids: kill signal_struct-> __pgrp/__session and friends We are wasting 2 words in signal_struct without any reason to implement task_pgrp_nr() and task_session_nr(). task_session_nr() has no callers since 2e2ba22ea4fd4bb85f0fa37c521066db6775cbef, we can remove it. task_pgrp_nr() is still (I believe wrongly) used in fs/autofsX and fs/coda. This patch reimplements task_pgrp_nr() via task_pgrp_nr_ns(), and kills __pgrp/__session and the related helpers. The change in drivers/char/tty_io.c is cosmetic, but hopefully makes sense anyway. Signed-off-by: Oleg Nesterov Acked-by: Alan Cox [tty parts] Cc: Cedric Le Goater Cc: Dave Hansen Cc: Eric Biederman Cc: Pavel Emelyanov Cc: Serge Hallyn Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 43 ++++++------------------------------------- 1 file changed, 6 insertions(+), 37 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 49df878a0ca..206ac003e8c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -547,25 +547,8 @@ struct signal_struct { struct list_head cpu_timers[3]; - /* job control IDs */ - - /* - * pgrp and session fields are deprecated. - * use the task_session_Xnr and task_pgrp_Xnr routines below - */ - - union { - pid_t pgrp __deprecated; - pid_t __pgrp; - }; - struct pid *tty_old_pgrp; - union { - pid_t session __deprecated; - pid_t __session; - }; - /* boolean value for session group leader */ int leader; @@ -1469,16 +1452,6 @@ static inline int rt_task(struct task_struct *p) return rt_prio(p->prio); } -static inline void set_task_session(struct task_struct *tsk, pid_t session) -{ - tsk->signal->__session = session; -} - -static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp) -{ - tsk->signal->__pgrp = pgrp; -} - static inline struct pid *task_pid(struct task_struct *task) { return task->pids[PIDTYPE_PID].pid; @@ -1552,11 +1525,6 @@ static inline pid_t task_tgid_vnr(struct task_struct *tsk) } -static inline pid_t task_pgrp_nr(struct task_struct *tsk) -{ - return tsk->signal->__pgrp; -} - static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -1569,11 +1537,6 @@ static inline pid_t task_pgrp_vnr(struct task_struct *tsk) } -static inline pid_t task_session_nr(struct task_struct *tsk) -{ - return tsk->signal->__session; -} - static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -1585,6 +1548,12 @@ static inline pid_t task_session_vnr(struct task_struct *tsk) return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } +/* obsolete, do not use */ +static inline pid_t task_pgrp_nr(struct task_struct *tsk) +{ + return task_pgrp_nr_ns(tsk, &init_pid_ns); +} + /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. -- cgit From e3c8ca8336707062f3f7cb1cd7e6b3c753baccdd Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Wed, 8 Apr 2009 19:45:12 -0500 Subject: sched: do not count frozen tasks toward load Freezing tasks via the cgroup freezer causes the load average to climb because the freezer's current implementation puts frozen tasks in uninterruptible sleep (D state). Some applications which perform job-scheduling functions consult the load average when making decisions. If a cgroup is frozen, the load average does not provide a useful measure of the system's utilization to such applications. This is especially inconvenient if the job scheduler employs the cgroup freezer as a mechanism for preempting low priority jobs. Contrast this with using SIGSTOP for the same purpose: the stopped tasks do not count toward system load. Change task_contributes_to_load() to return false if the task is frozen. This results in /proc/loadavg behavior that better meets users' expectations. Signed-off-by: Nathan Lynch Acked-by: Andrew Morton Acked-by: Nigel Cunningham Tested-by: Nigel Cunningham Cc: Cc: containers@lists.linux-foundation.org Cc: linux-pm@lists.linux-foundation.org Cc: Matt Helsley LKML-Reference: <20090408194512.47a99b95@manatee.lan> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 98e1fe51601..b4c38bc8049 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -205,7 +205,8 @@ extern unsigned long long time_sync_thresh; #define task_is_stopped_or_traced(task) \ ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #define task_contributes_to_load(task) \ - ((task->state & TASK_UNINTERRUPTIBLE) != 0) + ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ + (task->flags & PF_FROZEN) == 0) #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) -- cgit