From 8bc719d3cab8414938f9ea6e33b58d8810d18068 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Mon, 25 Sep 2006 23:31:20 -0700
Subject: [PATCH] out of memory notifier

Add a notifier chain to the out of memory killer.  If one of the registered
callbacks could release some memory, do not kill the process but return and
retry the allocation that forced the oom killer to run.

The purpose of the notifier is to add a safety net in the presence of
memory ballooners.  If the resource manager inflated the balloon to a size
where memory allocations cannot be satisfied anymore, it is better to
deflate the balloon a bit instead of killing processes.

The implementation for the s390 ballooner is included.

[akpm@osdl.org: cleanups]
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'mm/oom_kill.c')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b9af136e5cf..7d056843fa2 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -21,6 +21,8 @@
 #include <linux/timex.h>
 #include <linux/jiffies.h>
 #include <linux/cpuset.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
 
 int sysctl_panic_on_oom;
 /* #define DEBUG */
@@ -306,6 +308,20 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
 	return oom_kill_task(p, message);
 }
 
+static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
+
+int register_oom_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&oom_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(register_oom_notifier);
+
+int unregister_oom_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_oom_notifier);
+
 /**
  * out_of_memory - kill the "best" process when we run out of memory
  *
@@ -318,6 +334,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 {
 	struct task_struct *p;
 	unsigned long points = 0;
+	unsigned long freed = 0;
+
+	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
+	if (freed > 0)
+		/* Got some memory back in the last second. */
+		return;
 
 	if (printk_ratelimit()) {
 		printk("oom-killer: gfp_mask=0x%x, order=%d\n",
-- 
cgit 


From 7887a3da753e1ba8244556cc9a2b38c815bfe256 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:29 -0700
Subject: [PATCH] oom: cpuset hint

cpuset_excl_nodes_overlap does not always indicate that killing a task will
not free any memory for us.  For example, we may be asking for an
allocation from _anywhere_ in the machine, or the task in question may be
pinning memory that is outside its cpuset.  Fix this by just causing
cpuset_excl_nodes_overlap to reduce the badness rather than disallow it.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'mm/oom_kill.c')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7d056843fa2..4f815b06ac1 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -128,6 +128,14 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
 		points /= 4;
 
+	/*
+	 * If p's nodes don't overlap ours, it may still help to kill p
+	 * because p may have allocated or otherwise mapped memory on
+	 * this node before. However it will be less likely.
+	 */
+	if (!cpuset_excl_nodes_overlap(p))
+		points /= 8;
+
 	/*
 	 * Adjust the score by oomkilladj.
 	 */
@@ -198,9 +206,6 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 			continue;
 		if (p->oomkilladj == OOM_DISABLE)
 			continue;
-		/* If p's nodes don't overlap ours, it won't help to kill p. */
-		if (!cpuset_excl_nodes_overlap(p))
-			continue;
 
 		/*
 		 * This is in the process of releasing memory so wait for it
-- 
cgit 


From 50ec3bbffbe8a96347c54832d48110a5bc9e9ff8 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:29 -0700
Subject: [PATCH] oom: handle current exiting

If current *is* exiting, it should actually be allowed to access reserved
memory rather than OOM kill something else.  Can't do this via a straight
check in page_alloc.c because that would allow multiple tasks to use up
reserves.  Instead cause current to OOM-kill itself which will mark it as
TIF_MEMDIE.

The current procedure of simply aborting the OOM-kill if a task is exiting can
lead to OOM deadlocks.

In the case of killing a PF_EXITING task, don't make a lot of noise about it.
This becomes more important in future patches, where we can "kill" OOM_DISABLE
tasks.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 35 +++++++++++++++++++++++++++++++----
 1 file changed, 31 insertions(+), 4 deletions(-)

(limited to 'mm/oom_kill.c')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4f815b06ac1..0131bae2a16 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -210,11 +210,26 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 		/*
 		 * This is in the process of releasing memory so wait for it
 		 * to finish before killing some other task by mistake.
+		 *
+		 * However, if p is the current task, we allow the 'kill' to
+		 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
+		 * which will allow it to gain access to memory reserves in
+		 * the process of exiting and releasing its resources.
+		 * Otherwise we could get an OOM deadlock.
 		 */
 		releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
 						p->flags & PF_EXITING;
-		if (releasing && !(p->flags & PF_DEAD))
+		if (releasing) {
+			/* PF_DEAD tasks have already released their mm */
+			if (p->flags & PF_DEAD)
+				continue;
+			if (p->flags & PF_EXITING && p == current) {
+				chosen = p;
+				*ppoints = ULONG_MAX;
+				break;
+			}
 			return ERR_PTR(-1UL);
+		}
 		if (p->flags & PF_SWAPOFF)
 			return p;
 
@@ -248,8 +263,11 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
 		return;
 	}
 	task_unlock(p);
-	printk(KERN_ERR "%s: Killed process %d (%s).\n",
+
+	if (message) {
+		printk(KERN_ERR "%s: Killed process %d (%s).\n",
 				message, p->pid, p->comm);
+	}
 
 	/*
 	 * We give our sacrificial lamb high priority and access to
@@ -300,8 +318,17 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
 	struct task_struct *c;
 	struct list_head *tsk;
 
-	printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and "
-		"children.\n", p->pid, p->comm, points);
+	/*
+	 * If the task is already exiting, don't alarm the sysadmin or kill
+	 * its children or threads, just set TIF_MEMDIE so it can die quickly
+	 */
+	if (p->flags & PF_EXITING) {
+		__oom_kill_task(p, NULL);
+		return 0;
+	}
+
+	printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li"
+			" and children.\n", p->pid, p->comm, points);
 	/* Try to kill a child first */
 	list_for_each(tsk, &p->children) {
 		c = list_entry(tsk, struct task_struct, sibling);
-- 
cgit 


From 4a3ede107e422a0c53d28024b0aa902ca22a8768 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:30 -0700
Subject: [PATCH] oom: handle oom_disable exiting

Having the oomkilladj == OOM_DISABLE check before the releasing check means
that oomkilladj == OOM_DISABLE tasks exiting will not stop the OOM killer.

Moving the test down will give the desired behaviour.  Also: it will allow
them to "OOM-kill" themselves if they are exiting.  As per the previous patch,
this is required to prevent OOM killer deadlocks (and they don't actually get
killed, because they're already exiting -- they're simply allowed access to
memory reserves).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm/oom_kill.c')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0131bae2a16..55a05f1ef76 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -204,8 +204,6 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 		/* skip the init task with pid == 1 */
 		if (p->pid == 1)
 			continue;
-		if (p->oomkilladj == OOM_DISABLE)
-			continue;
 
 		/*
 		 * This is in the process of releasing memory so wait for it
@@ -230,6 +228,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 			}
 			return ERR_PTR(-1UL);
 		}
+		if (p->oomkilladj == OOM_DISABLE)
+			continue;
 		if (p->flags & PF_SWAPOFF)
 			return p;
 
-- 
cgit 


From af5b912435de32fbede08cee949429823ed49781 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:31 -0700
Subject: [PATCH] oom: swapoff tasks tweak

PF_SWAPOFF processes currently cause select_bad_process to return straight
away.  Instead, give them high priority so that we kill them first, but only
after first ensuring that no parallel OOM kills are happening at the same time.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'mm/oom_kill.c')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 55a05f1ef76..f1aba7e7b76 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -59,6 +59,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 		return 0;
 	}
 
+	/*
+	 * swapoff can easily use up all memory, so kill those first.
+	 */
+	if (p->flags & PF_SWAPOFF)
+		return ULONG_MAX;
+
 	/*
 	 * The memory size of the process is the basis for the badness.
 	 */
@@ -230,8 +236,6 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 		}
 		if (p->oomkilladj == OOM_DISABLE)
 			continue;
-		if (p->flags & PF_SWAPOFF)
-			return p;
 
 		points = badness(p, uptime.tv_sec);
 		if (points > *ppoints || !chosen) {
-- 
cgit 


From 5081dde33f7a61d28d9b185cc386f12cb837c7a4 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:32 -0700
Subject: [PATCH] oom: kthread infinite loop fix

Skip kernel threads, rather than having them return 0 from badness.
Theoretically, badness might truncate all results to 0, thus a kernel thread
might be picked first, causing an infinite loop.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'mm/oom_kill.c')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f1aba7e7b76..12cd4735dc2 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -207,6 +207,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 		unsigned long points;
 		int releasing;
 
+		/* skip kernel threads */
+		if (!p->mm)
+			continue;
 		/* skip the init task with pid == 1 */
 		if (p->pid == 1)
 			continue;
-- 
cgit 


From b72f160443cb78b2f8addae6e331d2adaa70f869 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:32 -0700
Subject: [PATCH] oom: more printk

Print the name of the task invoking the OOM killer.  Could make debugging
easier.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm/oom_kill.c')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 12cd4735dc2..c5e38400058 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -381,8 +381,9 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 		return;
 
 	if (printk_ratelimit()) {
-		printk("oom-killer: gfp_mask=0x%x, order=%d\n",
-			gfp_mask, order);
+		printk(KERN_WARNING "%s invoked oom-killer: "
+			"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
+			current->comm, gfp_mask, order, current->oomkilladj);
 		dump_stack();
 		show_mem();
 	}
-- 
cgit 


From 5a291b98b2116d669449885abef3000f747504b3 Mon Sep 17 00:00:00 2001
From: Ram Gupta <ram.gupta5@gmail.com>
Date: Mon, 25 Sep 2006 23:31:54 -0700
Subject: [PATCH] oom-kill: update comments to reflect current code

Update the comments for __oom_kill_task() to reflect the code changes.

Signed-off-by: Ram Gupta <r.gupta@astronautics.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm/oom_kill.c')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c5e38400058..f1c0ef1fd21 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -250,9 +250,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 }
 
 /**
- * We must be careful though to never send SIGKILL a process with
- * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
- * we select a process with CAP_SYS_RAW_IO set).
+ * Send SIGKILL to the selected  process irrespective of  CAP_SYS_RAW_IO
+ * flag though it's unlikely that  we select a process with CAP_SYS_RAW_IO
+ * set.
  */
 static void __oom_kill_task(struct task_struct *p, const char *message)
 {
-- 
cgit 


From 89fa30242facca249aead2aac03c4c69764f911c Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:55 -0700
Subject: [PATCH] NUMA: Add zone_to_nid function

There are many places where we need to determine the node of a zone.
Currently we use a difficult to read sequence of pointer dereferencing.
Put that into an inline function and use throughout VM.  Maybe we can find
a way to optimize the lookup in the future.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'mm/oom_kill.c')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f1c0ef1fd21..bada3d03119 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -177,8 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
 
 	for (z = zonelist->zones; *z; z++)
 		if (cpuset_zone_allowed(*z, gfp_mask))
-			node_clear((*z)->zone_pgdat->node_id,
-					nodes);
+			node_clear(zone_to_nid(*z), nodes);
 		else
 			return CONSTRAINT_CPUSET;
 
-- 
cgit