From 8bc719d3cab8414938f9ea6e33b58d8810d18068 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 25 Sep 2006 23:31:20 -0700 Subject: [PATCH] out of memory notifier Add a notifer chain to the out of memory killer. If one of the registered callbacks could release some memory, do not kill the process but return and retry the allocation that forced the oom killer to run. The purpose of the notifier is to add a safety net in the presence of memory ballooners. If the resource manager inflated the balloon to a size where memory allocations can not be satisfied anymore, it is better to deflate the balloon a bit instead of killing processes. The implementation for the s390 ballooner is included. [akpm@osdl.org: cleanups] Signed-off-by: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'mm/oom_kill.c') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b9af136e5cf..7d056843fa2 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include int sysctl_panic_on_oom; /* #define DEBUG */ @@ -306,6 +308,20 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, return oom_kill_task(p, message); } +static BLOCKING_NOTIFIER_HEAD(oom_notify_list); + +int register_oom_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&oom_notify_list, nb); +} +EXPORT_SYMBOL_GPL(register_oom_notifier); + +int unregister_oom_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&oom_notify_list, nb); +} +EXPORT_SYMBOL_GPL(unregister_oom_notifier); + /** * out_of_memory - kill the "best" process when we run out of memory * @@ -318,6 +334,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) { struct task_struct *p; unsigned long points = 0; + unsigned long freed = 0; + + blocking_notifier_call_chain(&oom_notify_list, 0, &freed); + if (freed > 0) + /* Got some memory back in the last second. */ + return; if (printk_ratelimit()) { printk("oom-killer: gfp_mask=0x%x, order=%d\n", -- cgit From 7887a3da753e1ba8244556cc9a2b38c815bfe256 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 25 Sep 2006 23:31:29 -0700 Subject: [PATCH] oom: cpuset hint cpuset_excl_nodes_overlap does not always indicate that killing a task will not free any memory we for us. For example, we may be asking for an allocation from _anywhere_ in the machine, or the task in question may be pinning memory that is outside its cpuset. Fix this by just causing cpuset_excl_nodes_overlap to reduce the badness rather than disallow it. Signed-off-by: Nick Piggin Acked-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'mm/oom_kill.c') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7d056843fa2..4f815b06ac1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -128,6 +128,14 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) points /= 4; + /* + * If p's nodes don't overlap ours, it may still help to kill p + * because p may have allocated or otherwise mapped memory on + * this node before. However it will be less likely. + */ + if (!cpuset_excl_nodes_overlap(p)) + points /= 8; + /* * Adjust the score by oomkilladj. */ @@ -198,9 +206,6 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) continue; if (p->oomkilladj == OOM_DISABLE) continue; - /* If p's nodes don't overlap ours, it won't help to kill p. */ - if (!cpuset_excl_nodes_overlap(p)) - continue; /* * This is in the process of releasing memory so wait for it -- cgit From 50ec3bbffbe8a96347c54832d48110a5bc9e9ff8 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 25 Sep 2006 23:31:29 -0700 Subject: [PATCH] oom: handle current exiting If current *is* exiting, it should actually be allowed to access reserved memory rather than OOM kill something else. Can't do this via a straight check in page_alloc.c because that would allow multiple tasks to use up reserves. Instead cause current to OOM-kill itself which will mark it as TIF_MEMDIE. The current procedure of simply aborting the OOM-kill if a task is exiting can lead to OOM deadlocks. In the case of killing a PF_EXITING task, don't make a lot of noise about it. This becomes more important in future patches, where we can "kill" OOM_DISABLE tasks. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'mm/oom_kill.c') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 4f815b06ac1..0131bae2a16 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -210,11 +210,26 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) /* * This is in the process of releasing memory so wait for it * to finish before killing some other task by mistake. + * + * However, if p is the current task, we allow the 'kill' to + * go ahead if it is exiting: this will simply set TIF_MEMDIE, + * which will allow it to gain access to memory reserves in + * the process of exiting and releasing its resources. + * Otherwise we could get an OOM deadlock. */ releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || p->flags & PF_EXITING; - if (releasing && !(p->flags & PF_DEAD)) + if (releasing) { + /* PF_DEAD tasks have already released their mm */ + if (p->flags & PF_DEAD) + continue; + if (p->flags & PF_EXITING && p == current) { + chosen = p; + *ppoints = ULONG_MAX; + break; + } return ERR_PTR(-1UL); + } if (p->flags & PF_SWAPOFF) return p; @@ -248,8 +263,11 @@ static void __oom_kill_task(struct task_struct *p, const char *message) return; } task_unlock(p); - printk(KERN_ERR "%s: Killed process %d (%s).\n", + + if (message) { + printk(KERN_ERR "%s: Killed process %d (%s).\n", message, p->pid, p->comm); + } /* * We give our sacrificial lamb high priority and access to @@ -300,8 +318,17 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, struct task_struct *c; struct list_head *tsk; - printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and " - "children.\n", p->pid, p->comm, points); + /* + * If the task is already exiting, don't alarm the sysadmin or kill + * its children or threads, just set TIF_MEMDIE so it can die quickly + */ + if (p->flags & PF_EXITING) { + __oom_kill_task(p, NULL); + return 0; + } + + printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li" + " and children.\n", p->pid, p->comm, points); /* Try to kill a child first */ list_for_each(tsk, &p->children) { c = list_entry(tsk, struct task_struct, sibling); -- cgit From 4a3ede107e422a0c53d28024b0aa902ca22a8768 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 25 Sep 2006 23:31:30 -0700 Subject: [PATCH] oom: handle oom_disable exiting Having the oomkilladj == OOM_DISABLE check before the releasing check means that oomkilladj == OOM_DISABLE tasks exiting will not stop the OOM killer. Moving the test down will give the desired behaviour. Also: it will allow them to "OOM-kill" themselves if they are exiting. As per the previous patch, this is required to prevent OOM killer deadlocks (and they don't actually get killed, because they're already exiting -- they're simply allowed access to memory reserves). Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/oom_kill.c') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0131bae2a16..55a05f1ef76 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -204,8 +204,6 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) /* skip the init task with pid == 1 */ if (p->pid == 1) continue; - if (p->oomkilladj == OOM_DISABLE) - continue; /* * This is in the process of releasing memory so wait for it @@ -230,6 +228,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) } return ERR_PTR(-1UL); } + if (p->oomkilladj == OOM_DISABLE) + continue; if (p->flags & PF_SWAPOFF) return p; -- cgit From af5b912435de32fbede08cee949429823ed49781 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 25 Sep 2006 23:31:31 -0700 Subject: [PATCH] oom: swapoff tasks tweak PF_SWAPOFF processes currently cause select_bad_process to return straight away. Instead, give them high priority, so we will kill them first, however we also first ensure no parallel OOM kills are happening at the same time. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm/oom_kill.c') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 55a05f1ef76..f1aba7e7b76 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -59,6 +59,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) return 0; } + /* + * swapoff can easily use up all memory, so kill those first. + */ + if (p->flags & PF_SWAPOFF) + return ULONG_MAX; + /* * The memory size of the process is the basis for the badness. */ @@ -230,8 +236,6 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) } if (p->oomkilladj == OOM_DISABLE) continue; - if (p->flags & PF_SWAPOFF) - return p; points = badness(p, uptime.tv_sec); if (points > *ppoints || !chosen) { -- cgit From 5081dde33f7a61d28d9b185cc386f12cb837c7a4 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 25 Sep 2006 23:31:32 -0700 Subject: [PATCH] oom: kthread infinite loop fix Skip kernel threads, rather than having them return 0 from badness. Theoretically, badness might truncate all results to 0, thus a kernel thread might be picked first, causing an infinite loop. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/oom_kill.c') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f1aba7e7b76..12cd4735dc2 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -207,6 +207,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) unsigned long points; int releasing; + /* skip kernel threads */ + if (!p->mm) + continue; /* skip the init task with pid == 1 */ if (p->pid == 1) continue; -- cgit From b72f160443cb78b2f8addae6e331d2adaa70f869 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 25 Sep 2006 23:31:32 -0700 Subject: [PATCH] oom: more printk Print the name of the task invoking the OOM killer. Could make debugging easier. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm/oom_kill.c') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 12cd4735dc2..c5e38400058 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -381,8 +381,9 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) return; if (printk_ratelimit()) { - printk("oom-killer: gfp_mask=0x%x, order=%d\n", - gfp_mask, order); + printk(KERN_WARNING "%s invoked oom-killer: " + "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", + current->comm, gfp_mask, order, current->oomkilladj); dump_stack(); show_mem(); } -- cgit From 5a291b98b2116d669449885abef3000f747504b3 Mon Sep 17 00:00:00 2001 From: Ram Gupta Date: Mon, 25 Sep 2006 23:31:54 -0700 Subject: [PATCH] oom-kill: update comments to reflect current code Update the comments for __oom_kill_task() to reflect the code changes. Signed-off-by: Ram Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/oom_kill.c') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c5e38400058..f1c0ef1fd21 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -250,9 +250,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) } /** - * We must be careful though to never send SIGKILL a process with - * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that - * we select a process with CAP_SYS_RAW_IO set). + * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO + * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO + * set. */ static void __oom_kill_task(struct task_struct *p, const char *message) { -- cgit From 89fa30242facca249aead2aac03c4c69764f911c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:55 -0700 Subject: [PATCH] NUMA: Add zone_to_nid function There are many places where we need to determine the node of a zone. Currently we use a difficult to read sequence of pointer dereferencing. Put that into an inline function and use throughout VM. Maybe we can find a way to optimize the lookup in the future. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/oom_kill.c') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f1c0ef1fd21..bada3d03119 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -177,8 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) for (z = zonelist->zones; *z; z++) if (cpuset_zone_allowed(*z, gfp_mask)) - node_clear((*z)->zone_pgdat->node_id, - nodes); + node_clear(zone_to_nid(*z), nodes); else return CONSTRAINT_CPUSET; -- cgit