author    Kyle McMartin <kyle@redhat.com>    2010-12-02 11:57:44 -0500
committer Kyle McMartin <kyle@redhat.com>    2010-12-02 11:57:44 -0500
commit    254585788fce648441c41bf2854ac0899f3e2b1a (patch)
tree      721f31cb7920dd3c1a3e116d0f4dfef24e88d7d1
parent    8077b629b4f8dedcaafadce6fc40e50de6e1f920 (diff)
kswapd fixes from mmotm
(queuing them here since it's unclear if they'll go into .37 or wait for .38)
-rw-r--r--  kernel.spec                                                                                  10
-rw-r--r--  mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch             389
-rw-r--r--  mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch   167
3 files changed, 566 insertions(+), 0 deletions(-)
diff --git a/kernel.spec b/kernel.spec
index b01ba32a2..473b6e29d 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -705,6 +705,9 @@ Patch12401: debug-tty-print-dev-name.patch
Patch12402: tty-ldisc-fix-open-flag-handling.patch
Patch12403: tty-open-hangup-race-fixup.patch
+Patch12410: mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch
+Patch12411: mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch
+
%endif
BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root
@@ -1300,6 +1303,10 @@ ApplyPatch debug-tty-print-dev-name.patch
ApplyPatch tty-ldisc-fix-open-flag-handling.patch
ApplyPatch tty-open-hangup-race-fixup.patch
+# backport some fixes for kswapd from mmotm, rhbz#649694
+ApplyPatch mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch
+ApplyPatch mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch
+
# END OF PATCH APPLICATIONS
%endif
@@ -1912,6 +1919,9 @@ fi
# || ||
%changelog
+* Thu Dec 02 2010 Kyle McMartin <kyle@redhat.com>
+- Grab some of Mel's fixes from -mmotm to hopefully sort out #649694.
+
* Wed Dec 01 2010 Kyle McMartin <kyle@redhat.com> 2.6.37-0.rc4.git1.1
- Linux 2.6.37-rc4-git1
- Pull in DRM fixes that are queued for -rc5 [3074adc8]
diff --git a/mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch b/mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch
new file mode 100644
index 000000000..561c5897e
--- /dev/null
+++ b/mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch
@@ -0,0 +1,389 @@
+From df43fae25437d7bc7dfff72599c1e825038b67cf Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mel@csn.ul.ie>
+Date: Wed, 24 Nov 2010 22:18:23 -0500
+Subject: [PATCH 1/2] mm: page allocator: Adjust the per-cpu counter threshold when memory is low
+
+Commit aa45484 ("calculate a better estimate of NR_FREE_PAGES when memory
+is low") noted that watermarks were based on the vmstat NR_FREE_PAGES. To
+avoid synchronization overhead, these counters are maintained on a per-cpu
+basis and drained both periodically and when a counter's delta rises above a
+threshold. On large CPU systems, the difference between the estimate and
+real value of NR_FREE_PAGES can be very high. The system can get into a
+case where pages are allocated far below the min watermark potentially
+causing livelock issues. The commit solved the problem by taking a better
+reading of NR_FREE_PAGES when memory was low.
+
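+As a rough worked example (illustrative numbers, not from the report): each
+per-cpu delta can hold up to stat_threshold pages before being folded into
+the global counter, so with 64 online CPUs and the normal maximum threshold
+of 125, the NR_FREE_PAGES estimate can drift by as much as 64 * 125 = 8000
+pages (about 31MB with 4KB pages) from the true value - enough to mask a
+breached min watermark on a smaller zone.
+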
+Unfortunately, as reported by Shaohua Li, this accurate reading can consume a
+large amount of CPU time on systems with many sockets due to cache line
+bouncing. This patch takes a different approach. For large machines
+where counter drift might be unsafe and while kswapd is awake, the per-cpu
+thresholds for the target pgdat are reduced to limit the level of drift to
+what should be a safe level. This incurs a performance penalty in heavy
+memory pressure by a factor that depends on the workload and the machine
+but the machine should function correctly without accidentally exhausting
+all memory on a node. There is an additional cost when kswapd wakes and
+sleeps, but the event is not expected to be frequent; in Shaohua's test
+case, at least, only one sleep and wake event was recorded.
+
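+As a sketch of the arithmetic (illustrative numbers): the pressure threshold
+is (low_wmark - min_wmark) / num_online_cpus, clamped to the range [1, 125].
+For a zone with a 4096-page gap between the low and min watermarks on a
+64-CPU machine, each per-cpu delta is limited to 4096 / 64 = 64 pages, so
+the worst-case aggregate drift (64 CPUs * 64 pages = 4096 pages) is bounded
+by the watermark gap and cannot silently breach the min watermark.
+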
+To ensure that kswapd wakes up, a safe version of zone_watermark_ok() is
+introduced that takes a more accurate reading of NR_FREE_PAGES when called
+from wakeup_kswapd, when deciding whether it is really safe to go back to
+sleep in sleeping_prematurely() and when deciding if a zone is really
+balanced or not in balance_pgdat(). We are still using an expensive
+function but limiting how often it is called.
+
+When the test case is reproduced, the time spent in the watermark
+functions is reduced. The following report shows the percentage of time
+cumulatively spent in the functions zone_nr_free_pages(),
+zone_watermark_ok(), __zone_watermark_ok(), zone_watermark_ok_safe(),
+zone_page_state_snapshot() and zone_page_state().
+
+vanilla 11.6615%
+disable-threshold 0.2584%
+
+Reported-by: Shaohua Li <shaohua.li@intel.com>
+Signed-off-by: Mel Gorman <mel@csn.ul.ie>
+Reviewed-by: Christoph Lameter <cl@linux.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+[http://userweb.kernel.org/~akpm/mmotm/broken-out/mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch]
+---
+ include/linux/mmzone.h | 10 ++-----
+ include/linux/vmstat.h | 5 +++
+ mm/mmzone.c | 21 ---------------
+ mm/page_alloc.c | 35 +++++++++++++++++++-----
+ mm/vmscan.c | 23 +++++++++-------
+ mm/vmstat.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++-
+ 6 files changed, 115 insertions(+), 47 deletions(-)
+
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
+index 3984c4e..8d789d7 100644
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -448,12 +448,6 @@ static inline int zone_is_oom_locked(const struct zone *zone)
+ return test_bit(ZONE_OOM_LOCKED, &zone->flags);
+ }
+
+-#ifdef CONFIG_SMP
+-unsigned long zone_nr_free_pages(struct zone *zone);
+-#else
+-#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
+-#endif /* CONFIG_SMP */
+-
+ /*
+ * The "priority" of VM scanning is how much of the queues we will scan in one
+ * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
+@@ -651,7 +645,9 @@ typedef struct pglist_data {
+ extern struct mutex zonelists_mutex;
+ void build_all_zonelists(void *data);
+ void wakeup_kswapd(struct zone *zone, int order);
+-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
++bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
++ int classzone_idx, int alloc_flags);
++bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags);
+ enum memmap_context {
+ MEMMAP_EARLY,
+diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
+index eaaea37..e4cc21c 100644
+--- a/include/linux/vmstat.h
++++ b/include/linux/vmstat.h
+@@ -254,6 +254,8 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
+ extern void __dec_zone_state(struct zone *, enum zone_stat_item);
+
+ void refresh_cpu_vm_stats(int);
++void reduce_pgdat_percpu_threshold(pg_data_t *pgdat);
++void restore_pgdat_percpu_threshold(pg_data_t *pgdat);
+ #else /* CONFIG_SMP */
+
+ /*
+@@ -298,6 +300,9 @@ static inline void __dec_zone_page_state(struct page *page,
+ #define dec_zone_page_state __dec_zone_page_state
+ #define mod_zone_page_state __mod_zone_page_state
+
++static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { }
++static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { }
++
+ static inline void refresh_cpu_vm_stats(int cpu) { }
+ #endif
+
+diff --git a/mm/mmzone.c b/mm/mmzone.c
+index e35bfb8..f5b7d17 100644
+--- a/mm/mmzone.c
++++ b/mm/mmzone.c
+@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
+ return 1;
+ }
+ #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
+-
+-#ifdef CONFIG_SMP
+-/* Called when a more accurate view of NR_FREE_PAGES is needed */
+-unsigned long zone_nr_free_pages(struct zone *zone)
+-{
+- unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
+-
+- /*
+- * While kswapd is awake, it is considered the zone is under some
+- * memory pressure. Under pressure, there is a risk that
+- * per-cpu-counter-drift will allow the min watermark to be breached
+- * potentially causing a live-lock. While kswapd is awake and
+- * free pages are low, get a better estimate for free pages
+- */
+- if (nr_free_pages < zone->percpu_drift_mark &&
+- !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
+- return zone_page_state_snapshot(zone, NR_FREE_PAGES);
+-
+- return nr_free_pages;
+-}
+-#endif /* CONFIG_SMP */
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index f12ad18..0286150 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1454,24 +1454,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+ #endif /* CONFIG_FAIL_PAGE_ALLOC */
+
+ /*
+- * Return 1 if free pages are above 'mark'. This takes into account the order
++ * Return true if free pages are above 'mark'. This takes into account the order
+ * of the allocation.
+ */
+-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+- int classzone_idx, int alloc_flags)
++static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
++ int classzone_idx, int alloc_flags, long free_pages)
+ {
+ /* free_pages may go negative - that's OK */
+ long min = mark;
+- long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
+ int o;
+
++ free_pages -= (1 << order) - 1;
+ if (alloc_flags & ALLOC_HIGH)
+ min -= min / 2;
+ if (alloc_flags & ALLOC_HARDER)
+ min -= min / 4;
+
+ if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+- return 0;
++ return false;
+ for (o = 0; o < order; o++) {
+ /* At the next order, this order's pages become unavailable */
+ free_pages -= z->free_area[o].nr_free << o;
+@@ -1480,9 +1480,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ min >>= 1;
+
+ if (free_pages <= min)
+- return 0;
++ return false;
+ }
+- return 1;
++ return true;
++}
++
++bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
++ int classzone_idx, int alloc_flags)
++{
++ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
++ zone_page_state(z, NR_FREE_PAGES));
++}
++
++bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
++ int classzone_idx, int alloc_flags)
++{
++ long free_pages = zone_page_state(z, NR_FREE_PAGES);
++
++ if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
++ free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
++
++ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
++ free_pages);
+ }
+
+ #ifdef CONFIG_NUMA
+@@ -2436,7 +2455,7 @@ void show_free_areas(void)
+ " all_unreclaimable? %s"
+ "\n",
+ zone->name,
+- K(zone_nr_free_pages(zone)),
++ K(zone_page_state(zone, NR_FREE_PAGES)),
+ K(min_wmark_pages(zone)),
+ K(low_wmark_pages(zone)),
+ K(high_wmark_pages(zone)),
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index c5dfabf..3e71cb1 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2082,7 +2082,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
+ if (zone->all_unreclaimable)
+ continue;
+
+- if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
++ if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
+ 0, 0))
+ return 1;
+ }
+@@ -2169,7 +2169,7 @@ loop_again:
+ shrink_active_list(SWAP_CLUSTER_MAX, zone,
+ &sc, priority, 0);
+
+- if (!zone_watermark_ok(zone, order,
++ if (!zone_watermark_ok_safe(zone, order,
+ high_wmark_pages(zone), 0, 0)) {
+ end_zone = i;
+ break;
+@@ -2215,7 +2215,7 @@ loop_again:
+ * We put equal pressure on every zone, unless one
+ * zone has way too many pages free already.
+ */
+- if (!zone_watermark_ok(zone, order,
++ if (!zone_watermark_ok_safe(zone, order,
+ 8*high_wmark_pages(zone), end_zone, 0))
+ shrink_zone(priority, zone, &sc);
+ reclaim_state->reclaimed_slab = 0;
+@@ -2236,7 +2236,7 @@ loop_again:
+ total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
+ sc.may_writepage = 1;
+
+- if (!zone_watermark_ok(zone, order,
++ if (!zone_watermark_ok_safe(zone, order,
+ high_wmark_pages(zone), end_zone, 0)) {
+ all_zones_ok = 0;
+ /*
+@@ -2244,7 +2244,7 @@ loop_again:
+ * means that we have a GFP_ATOMIC allocation
+ * failure risk. Hurry up!
+ */
+- if (!zone_watermark_ok(zone, order,
++ if (!zone_watermark_ok_safe(zone, order,
+ min_wmark_pages(zone), end_zone, 0))
+ has_under_min_watermark_zone = 1;
+ }
+@@ -2378,7 +2378,9 @@ static int kswapd(void *p)
+ */
+ if (!sleeping_prematurely(pgdat, order, remaining)) {
+ trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
++ restore_pgdat_percpu_threshold(pgdat);
+ schedule();
++ reduce_pgdat_percpu_threshold(pgdat);
+ } else {
+ if (remaining)
+ count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
+@@ -2417,16 +2419,17 @@ void wakeup_kswapd(struct zone *zone, int order)
+ if (!populated_zone(zone))
+ return;
+
+- pgdat = zone->zone_pgdat;
+- if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
++ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ return;
++ pgdat = zone->zone_pgdat;
+ if (pgdat->kswapd_max_order < order)
+ pgdat->kswapd_max_order = order;
+- trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
+- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+- return;
+ if (!waitqueue_active(&pgdat->kswapd_wait))
+ return;
++ if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
++ return;
++
++ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
+ wake_up_interruptible(&pgdat->kswapd_wait);
+ }
+
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index 355a9e6..4d7faeb 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -81,6 +81,30 @@ EXPORT_SYMBOL(vm_stat);
+
+ #ifdef CONFIG_SMP
+
++static int calculate_pressure_threshold(struct zone *zone)
++{
++ int threshold;
++ int watermark_distance;
++
++ /*
++ * As vmstats are not up to date, there is drift between the estimated
++ * and real values. For high thresholds and a high number of CPUs, it
++ * is possible for the min watermark to be breached while the estimated
++ * value looks fine. The pressure threshold is a reduced value such
++ * that even the maximum amount of drift will not accidentally breach
++ * the min watermark
++ */
++ watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
++ threshold = max(1, (int)(watermark_distance / num_online_cpus()));
++
++ /*
++ * Maximum threshold is 125
++ */
++ threshold = min(125, threshold);
++
++ return threshold;
++}
++
+ static int calculate_threshold(struct zone *zone)
+ {
+ int threshold;
+@@ -159,6 +183,48 @@ static void refresh_zone_stat_thresholds(void)
+ }
+ }
+
++void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
++{
++ struct zone *zone;
++ int cpu;
++ int threshold;
++ int i;
++
++ get_online_cpus();
++ for (i = 0; i < pgdat->nr_zones; i++) {
++ zone = &pgdat->node_zones[i];
++ if (!zone->percpu_drift_mark)
++ continue;
++
++ threshold = calculate_pressure_threshold(zone);
++ for_each_online_cpu(cpu)
++ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
++ = threshold;
++ }
++ put_online_cpus();
++}
++
++void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
++{
++ struct zone *zone;
++ int cpu;
++ int threshold;
++ int i;
++
++ get_online_cpus();
++ for (i = 0; i < pgdat->nr_zones; i++) {
++ zone = &pgdat->node_zones[i];
++ if (!zone->percpu_drift_mark)
++ continue;
++
++ threshold = calculate_threshold(zone);
++ for_each_online_cpu(cpu)
++ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
++ = threshold;
++ }
++ put_online_cpus();
++}
++
+ /*
+ * For use when we know that interrupts are disabled.
+ */
+@@ -826,7 +892,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
+ "\n scanned %lu"
+ "\n spanned %lu"
+ "\n present %lu",
+- zone_nr_free_pages(zone),
++ zone_page_state(zone, NR_FREE_PAGES),
+ min_wmark_pages(zone),
+ low_wmark_pages(zone),
+ high_wmark_pages(zone),
+--
+1.7.3.2
+
diff --git a/mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch b/mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch
new file mode 100644
index 000000000..058b1399a
--- /dev/null
+++ b/mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch
@@ -0,0 +1,167 @@
+From 82e3d4969144377d13da97d511e849e8cf3e6dcc Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mel@csn.ul.ie>
+Date: Wed, 24 Nov 2010 22:24:24 -0500
+Subject: [PATCH 2/2] mm: vmstat: Use a single setter function and callback for adjusting percpu thresholds
+
+reduce_pgdat_percpu_threshold() and restore_pgdat_percpu_threshold() exist
+to adjust the per-cpu vmstat thresholds while kswapd is awake to avoid
+errors due to counter drift. The functions duplicate some code so this
+patch replaces them with a single set_pgdat_percpu_threshold() that takes
+a callback function to calculate the desired threshold as a parameter.
+
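+As a sketch, the kswapd sleep/wake path in mm/vmscan.c then reads:
+
+	/* restore the larger, cheaper thresholds before sleeping */
+	set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
+	schedule();
+	/* tighten the thresholds again while the node is under pressure */
+	set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
+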
+Signed-off-by: Mel Gorman <mel@csn.ul.ie>
+Reviewed-by: Christoph Lameter <cl@linux.com>
+Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+[the various mmotm patches updating this were rolled up. --kyle]
+[http://userweb.kernel.org/~akpm/mmotm/broken-out/mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds-fix-set_pgdat_percpu_threshold-dont-use-for_each_online_cpu.patch]
+---
+ include/linux/vmstat.h | 10 ++++++----
+ mm/vmscan.c | 19 +++++++++++++++++--
+ mm/vmstat.c | 36 +++++++-----------------------------
+ 3 files changed, 30 insertions(+), 35 deletions(-)
+
+diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
+index e4cc21c..833e676 100644
+--- a/include/linux/vmstat.h
++++ b/include/linux/vmstat.h
+@@ -254,8 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
+ extern void __dec_zone_state(struct zone *, enum zone_stat_item);
+
+ void refresh_cpu_vm_stats(int);
+-void reduce_pgdat_percpu_threshold(pg_data_t *pgdat);
+-void restore_pgdat_percpu_threshold(pg_data_t *pgdat);
++
++int calculate_pressure_threshold(struct zone *zone);
++int calculate_normal_threshold(struct zone *zone);
++void set_pgdat_percpu_threshold(pg_data_t *pgdat,
++ int (*calculate_pressure)(struct zone *));
+ #else /* CONFIG_SMP */
+
+ /*
+@@ -300,8 +303,7 @@ static inline void __dec_zone_page_state(struct page *page,
+ #define dec_zone_page_state __dec_zone_page_state
+ #define mod_zone_page_state __mod_zone_page_state
+
+-static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+-static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { }
++#define set_pgdat_percpu_threshold(pgdat, callback) { }
+
+ static inline void refresh_cpu_vm_stats(int cpu) { }
+ #endif
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index 3e71cb1..ba39948 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2378,9 +2378,24 @@ static int kswapd(void *p)
+ */
+ if (!sleeping_prematurely(pgdat, order, remaining)) {
+ trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+- restore_pgdat_percpu_threshold(pgdat);
++
++ /*
++ * vmstat counters are not perfectly
++ * accurate and the estimated value
++ * for counters such as NR_FREE_PAGES
++ * can deviate from the true value by
++ * nr_online_cpus * threshold. To
++ * avoid the zone watermarks being
++ * breached while under pressure, we
++ * reduce the per-cpu vmstat threshold
++ * while kswapd is awake and restore
++ * them before going back to sleep.
++ */
++ set_pgdat_percpu_threshold(pgdat,
++ calculate_normal_threshold);
+ schedule();
+- reduce_pgdat_percpu_threshold(pgdat);
++ set_pgdat_percpu_threshold(pgdat,
++ calculate_pressure_threshold);
+ } else {
+ if (remaining)
+ count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index 4d7faeb..511c2c0 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -81,7 +81,7 @@ EXPORT_SYMBOL(vm_stat);
+
+ #ifdef CONFIG_SMP
+
+-static int calculate_pressure_threshold(struct zone *zone)
++int calculate_pressure_threshold(struct zone *zone)
+ {
+ int threshold;
+ int watermark_distance;
+@@ -105,7 +105,7 @@ static int calculate_pressure_threshold(struct zone *zone)
+ return threshold;
+ }
+
+-static int calculate_threshold(struct zone *zone)
++int calculate_normal_threshold(struct zone *zone)
+ {
+ int threshold;
+ int mem; /* memory in 128 MB units */
+@@ -164,7 +164,7 @@ static void refresh_zone_stat_thresholds(void)
+ for_each_populated_zone(zone) {
+ unsigned long max_drift, tolerate_drift;
+
+- threshold = calculate_threshold(zone);
++ threshold = calculate_normal_threshold(zone);
+
+ for_each_online_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+@@ -183,46 +183,24 @@ static void refresh_zone_stat_thresholds(void)
+ }
+ }
+
+-void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
++void set_pgdat_percpu_threshold(pg_data_t *pgdat,
++ int (*calculate_pressure)(struct zone *))
+ {
+ struct zone *zone;
+ int cpu;
+ int threshold;
+ int i;
+
+- get_online_cpus();
+- for (i = 0; i < pgdat->nr_zones; i++) {
+- zone = &pgdat->node_zones[i];
+- if (!zone->percpu_drift_mark)
+- continue;
+-
+- threshold = calculate_pressure_threshold(zone);
+- for_each_online_cpu(cpu)
+- per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+- = threshold;
+- }
+- put_online_cpus();
+-}
+-
+-void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
+-{
+- struct zone *zone;
+- int cpu;
+- int threshold;
+- int i;
+-
+- get_online_cpus();
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ zone = &pgdat->node_zones[i];
+ if (!zone->percpu_drift_mark)
+ continue;
+
+- threshold = calculate_threshold(zone);
+- for_each_online_cpu(cpu)
++ threshold = (*calculate_pressure)(zone);
++ for_each_possible_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+ }
+- put_online_cpus();
+ }
+
+ /*
+--
+1.7.3.2
+