author     Kyle McMartin <kyle@redhat.com>    2010-12-02 11:57:44 -0500
committer  Kyle McMartin <kyle@redhat.com>    2010-12-02 11:57:44 -0500
commit     254585788fce648441c41bf2854ac0899f3e2b1a (patch)
tree       721f31cb7920dd3c1a3e116d0f4dfef24e88d7d1
parent     8077b629b4f8dedcaafadce6fc40e50de6e1f920 (diff)
kswapd fixes from mmotm
(queuing them here since it's unclear if they'll go into .37 or wait for
.38)
3 files changed, 566 insertions, 0 deletions
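
For context on what the two backported patches below do (an illustration added here, not part of the commit): NR_FREE_PAGES is kept as per-cpu deltas that are only folded into the global counter once a delta crosses a per-cpu threshold, so the global value can drift from the truth by roughly threshold * num_online_cpus pages per zone. The first patch shrinks that threshold while kswapd is awake so the worst-case drift stays inside the gap between the low and min watermarks. Below is a minimal standalone C sketch of that arithmetic, with made-up watermark numbers and CPU count; the formula mirrors calculate_pressure_threshold() from the patch.

/*
 * Illustration only -- not part of this commit.  Worst-case NR_FREE_PAGES
 * drift is roughly (per-cpu threshold * online CPUs), so the pressure
 * threshold is sized to keep that drift inside low_wmark - min_wmark.
 */
#include <stdio.h>

static int pressure_threshold(long low_wmark, long min_wmark, int online_cpus)
{
	long watermark_distance = low_wmark - min_wmark;
	long threshold = watermark_distance / online_cpus;

	if (threshold < 1)
		threshold = 1;
	if (threshold > 125)		/* same cap as the patch uses */
		threshold = 125;
	return (int)threshold;
}

int main(void)
{
	/* Hypothetical zone watermarks (in pages) on a 64-CPU machine. */
	long min_wmark = 11234, low_wmark = 14042;
	int cpus = 64;
	int t = pressure_threshold(low_wmark, min_wmark, cpus);

	printf("watermark gap:                   %ld pages\n", low_wmark - min_wmark);
	printf("worst-case drift, threshold 125: %d pages\n", 125 * cpus);
	printf("reduced threshold:               %d pages/cpu\n", t);
	printf("worst-case drift after patch:    %d pages\n", t * cpus);
	return 0;
}
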
diff --git a/kernel.spec b/kernel.spec
index b01ba32a2..473b6e29d 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -705,6 +705,9 @@
 Patch12401: debug-tty-print-dev-name.patch
 Patch12402: tty-ldisc-fix-open-flag-handling.patch
 Patch12403: tty-open-hangup-race-fixup.patch
+Patch12410: mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch
+Patch12411: mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch
+
 %endif
 
 BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root
@@ -1300,6 +1303,10 @@
 ApplyPatch debug-tty-print-dev-name.patch
 ApplyPatch tty-ldisc-fix-open-flag-handling.patch
 ApplyPatch tty-open-hangup-race-fixup.patch
+# backport some fixes for kswapd from mmotm, rhbz#649694
+ApplyPatch mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch
+ApplyPatch mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch
+
 # END OF PATCH APPLICATIONS
 
 %endif
@@ -1912,6 +1919,9 @@ fi
 #                                      ||----w |
 #                                      ||     ||
 %changelog
+* Thu Dec 02 2010 Kyle McMartin <kyle@redhat.com>
+- Grab some of Mel's fixes from -mmotm to hopefully sort out #649694.
+
 * Wed Dec 01 2010 Kyle McMartin <kyle@redhat.com> 2.6.37-0.rc4.git1.1
 - Linux 2.6.37-rc4-git1
 - Pull in DRM fixes that are queued for -rc5 [3074adc8]
diff --git a/mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch b/mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch
new file mode 100644
index 000000000..561c5897e
--- /dev/null
+++ b/mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch
@@ -0,0 +1,389 @@
+From df43fae25437d7bc7dfff72599c1e825038b67cf Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mel@csn.ul.ie>
+Date: Wed, 24 Nov 2010 22:18:23 -0500
+Subject: [PATCH 1/2] mm: page allocator: Adjust the per-cpu counter threshold when memory is low
+
+Commit aa45484 ("calculate a better estimate of NR_FREE_PAGES when memory
+is low") noted that watermarks were based on the vmstat NR_FREE_PAGES. To
+avoid synchronization overhead, these counters are maintained on a per-cpu
+basis and drained both periodically and when a threshold is above a
+threshold. On large CPU systems, the difference between the estimate and
+real value of NR_FREE_PAGES can be very high. The system can get into a
+case where pages are allocated far below the min watermark potentially
+causing livelock issues. The commit solved the problem by taking a better
+reading of NR_FREE_PAGES when memory was low.
+
+Unfortately, as reported by Shaohua Li this accurate reading can consume a
+large amount of CPU time on systems with many sockets due to cache line
+bouncing. This patch takes a different approach. For large machines
+where counter drift might be unsafe and while kswapd is awake, the per-cpu
+thresholds for the target pgdat are reduced to limit the level of drift to
+what should be a safe level. This incurs a performance penalty in heavy
+memory pressure by a factor that depends on the workload and the machine
+but the machine should function correctly without accidentally exhausting
+all memory on a node. There is an additional cost when kswapd wakes and
+sleeps but the event is not expected to be frequent - in Shaohua's test
+case, there was one recorded sleep and wake event at least.
+
+To ensure that kswapd wakes up, a safe version of zone_watermark_ok() is
+introduced that takes a more accurate reading of NR_FREE_PAGES when called
+from wakeup_kswapd, when deciding whether it is really safe to go back to
+sleep in sleeping_prematurely() and when deciding if a zone is really
+balanced or not in balance_pgdat(). We are still using an expensive
+function but limiting how often it is called.
+
+When the test case is reproduced, the time spent in the watermark
+functions is reduced. The following report is on the percentage of time
+spent cumulatively spent in the functions zone_nr_free_pages(),
+zone_watermark_ok(), __zone_watermark_ok(), zone_watermark_ok_safe(),
+zone_page_state_snapshot(), zone_page_state().
+
+vanilla                      11.6615%
+disable-threshold             0.2584%
+
+Reported-by: Shaohua Li <shaohua.li@intel.com>
+Signed-off-by: Mel Gorman <mel@csn.ul.ie>
+Reviewed-by: Christoph Lameter <cl@linux.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+[http://userweb.kernel.org/~akpm/mmotm/broken-out/mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch]
+---
+ include/linux/mmzone.h |   10 ++-----
+ include/linux/vmstat.h |    5 +++
+ mm/mmzone.c            |   21 ---------------
+ mm/page_alloc.c        |   35 +++++++++++++++++++-----
+ mm/vmscan.c            |   23 +++++++++-------
+ mm/vmstat.c            |   68 +++++++++++++++++++++++++++++++++++++++++++++++-
+ 6 files changed, 115 insertions(+), 47 deletions(-)
+
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
+index 3984c4e..8d789d7 100644
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -448,12 +448,6 @@ static inline int zone_is_oom_locked(const struct zone *zone)
+ 	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
+ }
+ 
+-#ifdef CONFIG_SMP
+-unsigned long zone_nr_free_pages(struct zone *zone);
+-#else
+-#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
+-#endif /* CONFIG_SMP */
+-
+ /*
+  * The "priority" of VM scanning is how much of the queues we will scan in one
+  * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
+@@ -651,7 +645,9 @@ typedef struct pglist_data {
+ extern struct mutex zonelists_mutex;
+ void build_all_zonelists(void *data);
+ void wakeup_kswapd(struct zone *zone, int order);
+-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
++bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
++		int classzone_idx, int alloc_flags);
++bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+ 		int classzone_idx, int alloc_flags);
+ enum memmap_context {
+ 	MEMMAP_EARLY,
+diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
+index eaaea37..e4cc21c 100644
+--- a/include/linux/vmstat.h
++++ b/include/linux/vmstat.h
+@@ -254,6 +254,8 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
+ extern void __dec_zone_state(struct zone *, enum zone_stat_item);
+ 
+ void refresh_cpu_vm_stats(int);
++void reduce_pgdat_percpu_threshold(pg_data_t *pgdat);
++void restore_pgdat_percpu_threshold(pg_data_t *pgdat);
+ #else /* CONFIG_SMP */
+ 
+ /*
+@@ -298,6 +300,9 @@ static inline void __dec_zone_page_state(struct page *page,
+ #define dec_zone_page_state __dec_zone_page_state
+ #define mod_zone_page_state __mod_zone_page_state
+ 
++static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { }
++static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { }
++
+ static inline void refresh_cpu_vm_stats(int cpu) { }
+ #endif
+ 
+diff --git a/mm/mmzone.c b/mm/mmzone.c
+index e35bfb8..f5b7d17 100644
+--- a/mm/mmzone.c
++++ b/mm/mmzone.c
+@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
+ 	return 1;
+ }
+ #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
+-
+-#ifdef CONFIG_SMP
+-/* Called when a more accurate view of NR_FREE_PAGES is needed */
+-unsigned long zone_nr_free_pages(struct zone *zone)
+-{
+-	unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
+-
+-	/*
+-	 * While kswapd is awake, it is considered the zone is under some
+-	 * memory pressure. Under pressure, there is a risk that
+-	 * per-cpu-counter-drift will allow the min watermark to be breached
+-	 * potentially causing a live-lock. While kswapd is awake and
+-	 * free pages are low, get a better estimate for free pages
+-	 */
+-	if (nr_free_pages < zone->percpu_drift_mark &&
+-			!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
+-		return zone_page_state_snapshot(zone, NR_FREE_PAGES);
+-
+-	return nr_free_pages;
+-}
+-#endif /* CONFIG_SMP */
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index f12ad18..0286150 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1454,24 +1454,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+ #endif /* CONFIG_FAIL_PAGE_ALLOC */
+ 
+ /*
+- * Return 1 if free pages are above 'mark'. This takes into account the order
++ * Return true if free pages are above 'mark'. This takes into account the order
+  * of the allocation.
+  */
+-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+-		      int classzone_idx, int alloc_flags)
++static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
++		      int classzone_idx, int alloc_flags, long free_pages)
+ {
+ 	/* free_pages my go negative - that's OK */
+ 	long min = mark;
+-	long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
+ 	int o;
+ 
++	free_pages -= (1 << order) + 1;
+ 	if (alloc_flags & ALLOC_HIGH)
+ 		min -= min / 2;
+ 	if (alloc_flags & ALLOC_HARDER)
+ 		min -= min / 4;
+ 
+ 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+-		return 0;
++		return false;
+ 	for (o = 0; o < order; o++) {
+ 		/* At the next order, this order's pages become unavailable */
+ 		free_pages -= z->free_area[o].nr_free << o;
+@@ -1480,9 +1480,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ 		min >>= 1;
+ 
+ 		if (free_pages <= min)
+-			return 0;
++			return false;
+ 	}
+-	return 1;
++	return true;
++}
++
++bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
++		      int classzone_idx, int alloc_flags)
++{
++	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
++					zone_page_state(z, NR_FREE_PAGES));
++}
++
++bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
++		      int classzone_idx, int alloc_flags)
++{
++	long free_pages = zone_page_state(z, NR_FREE_PAGES);
++
++	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
++		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
++
++	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
++								free_pages);
+ }
+ 
+ #ifdef CONFIG_NUMA
+@@ -2436,7 +2455,7 @@ void show_free_areas(void)
+ 			" all_unreclaimable? %s"
+ 			"\n",
+ 			zone->name,
+-			K(zone_nr_free_pages(zone)),
++			K(zone_page_state(zone, NR_FREE_PAGES)),
+ 			K(min_wmark_pages(zone)),
+ 			K(low_wmark_pages(zone)),
+ 			K(high_wmark_pages(zone)),
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index c5dfabf..3e71cb1 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2082,7 +2082,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
+ 		if (zone->all_unreclaimable)
+ 			continue;
+ 
+-		if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
++		if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
+ 								0, 0))
+ 			return 1;
+ 	}
+@@ -2169,7 +2169,7 @@ loop_again:
+ 				shrink_active_list(SWAP_CLUSTER_MAX, zone,
+ 							&sc, priority, 0);
+ 
+-			if (!zone_watermark_ok(zone, order,
++			if (!zone_watermark_ok_safe(zone, order,
+ 					high_wmark_pages(zone), 0, 0)) {
+ 				end_zone = i;
+ 				break;
+@@ -2215,7 +2215,7 @@ loop_again:
+ 			 * We put equal pressure on every zone, unless one
+ 			 * zone has way too many pages free already.
+ 			 */
+-			if (!zone_watermark_ok(zone, order,
++			if (!zone_watermark_ok_safe(zone, order,
+ 					8*high_wmark_pages(zone), end_zone, 0))
+ 				shrink_zone(priority, zone, &sc);
+ 			reclaim_state->reclaimed_slab = 0;
+@@ -2236,7 +2236,7 @@ loop_again:
+ 			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
+ 				sc.may_writepage = 1;
+ 
+-			if (!zone_watermark_ok(zone, order,
++			if (!zone_watermark_ok_safe(zone, order,
+ 					high_wmark_pages(zone), end_zone, 0)) {
+ 				all_zones_ok = 0;
+ 				/*
+@@ -2244,7 +2244,7 @@ loop_again:
+ 				 * means that we have a GFP_ATOMIC allocation
+ 				 * failure risk. Hurry up!
+ 				 */
+-				if (!zone_watermark_ok(zone, order,
++				if (!zone_watermark_ok_safe(zone, order,
+ 					    min_wmark_pages(zone), end_zone, 0))
+ 					has_under_min_watermark_zone = 1;
+ 			}
+@@ -2378,7 +2378,9 @@ static int kswapd(void *p)
+ 			 */
+ 			if (!sleeping_prematurely(pgdat, order, remaining)) {
+ 				trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
++				restore_pgdat_percpu_threshold(pgdat);
+ 				schedule();
++				reduce_pgdat_percpu_threshold(pgdat);
+ 			} else {
+ 				if (remaining)
+ 					count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
+@@ -2417,16 +2419,17 @@ void wakeup_kswapd(struct zone *zone, int order)
+ 	if (!populated_zone(zone))
+ 		return;
+ 
+-	pgdat = zone->zone_pgdat;
+-	if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
++	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ 		return;
++	pgdat = zone->zone_pgdat;
+ 	if (pgdat->kswapd_max_order < order)
+ 		pgdat->kswapd_max_order = order;
+-	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
+-	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+-		return;
+ 	if (!waitqueue_active(&pgdat->kswapd_wait))
+ 		return;
++	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
++		return;
++
++	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
+ 	wake_up_interruptible(&pgdat->kswapd_wait);
+ }
+ 
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index 355a9e6..4d7faeb 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -81,6 +81,30 @@ EXPORT_SYMBOL(vm_stat);
+ 
+ #ifdef CONFIG_SMP
+ 
++static int calculate_pressure_threshold(struct zone *zone)
++{
++	int threshold;
++	int watermark_distance;
++
++	/*
++	 * As vmstats are not up to date, there is drift between the estimated
++	 * and real values. For high thresholds and a high number of CPUs, it
++	 * is possible for the min watermark to be breached while the estimated
++	 * value looks fine. The pressure threshold is a reduced value such
++	 * that even the maximum amount of drift will not accidentally breach
++	 * the min watermark
++	 */
++	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
++	threshold = max(1, (int)(watermark_distance / num_online_cpus()));
++
++	/*
++	 * Maximum threshold is 125
++	 */
++	threshold = min(125, threshold);
++
++	return threshold;
++}
++
+ static int calculate_threshold(struct zone *zone)
+ {
+ 	int threshold;
+@@ -159,6 +183,48 @@ static void refresh_zone_stat_thresholds(void)
+ 	}
+ }
+ 
++void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
++{
++	struct zone *zone;
++	int cpu;
++	int threshold;
++	int i;
++
++	get_online_cpus();
++	for (i = 0; i < pgdat->nr_zones; i++) {
++		zone = &pgdat->node_zones[i];
++		if (!zone->percpu_drift_mark)
++			continue;
++
++		threshold = calculate_pressure_threshold(zone);
++		for_each_online_cpu(cpu)
++			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
++							= threshold;
++	}
++	put_online_cpus();
++}
++
++void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
++{
++	struct zone *zone;
++	int cpu;
++	int threshold;
++	int i;
++
++	get_online_cpus();
++	for (i = 0; i < pgdat->nr_zones; i++) {
++		zone = &pgdat->node_zones[i];
++		if (!zone->percpu_drift_mark)
++			continue;
++
++		threshold = calculate_threshold(zone);
++		for_each_online_cpu(cpu)
++			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
++							= threshold;
++	}
++	put_online_cpus();
++}
++
+ /*
+  * For use when we know that interrupts are disabled.
+  */
+@@ -826,7 +892,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
+ 		   "\n        scanned  %lu"
+ 		   "\n        spanned  %lu"
+ 		   "\n        present  %lu",
+-		   zone_nr_free_pages(zone),
++		   zone_page_state(zone, NR_FREE_PAGES),
+ 		   min_wmark_pages(zone),
+ 		   low_wmark_pages(zone),
+ 		   high_wmark_pages(zone),
+-- 
+1.7.3.2
+
diff --git a/mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch b/mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch
new file mode 100644
index 000000000..058b1399a
--- /dev/null
+++ b/mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch
@@ -0,0 +1,167 @@
+From 82e3d4969144377d13da97d511e849e8cf3e6dcc Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mel@csn.ul.ie>
+Date: Wed, 24 Nov 2010 22:24:24 -0500
+Subject: [PATCH 2/2] mm: vmstat: Use a single setter function and callback for adjusting percpu thresholds
+
+reduce_pgdat_percpu_threshold() and restore_pgdat_percpu_threshold() exist
+to adjust the per-cpu vmstat thresholds while kswapd is awake to avoid
+errors due to counter drift. The functions duplicate some code so this
+patch replaces them with a single set_pgdat_percpu_threshold() that takes
+a callback function to calculate the desired threshold as a parameter.
+
+Signed-off-by: Mel Gorman <mel@csn.ul.ie>
+Reviewed-by: Christoph Lameter <cl@linux.com>
+Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+[the various mmotm patches updating this were rolled up. --kyle]
+[http://userweb.kernel.org/~akpm/mmotm/broken-out/mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds-fix-set_pgdat_percpu_threshold-dont-use-for_each_online_cpu.patch]
+---
+ include/linux/vmstat.h |   10 ++++++----
+ mm/vmscan.c            |   19 +++++++++++++++++--
+ mm/vmstat.c            |   36 +++++++----------------------------
+ 3 files changed, 30 insertions(+), 35 deletions(-)
+
+diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
+index e4cc21c..833e676 100644
+--- a/include/linux/vmstat.h
++++ b/include/linux/vmstat.h
+@@ -254,8 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
+ extern void __dec_zone_state(struct zone *, enum zone_stat_item);
+ 
+ void refresh_cpu_vm_stats(int);
+-void reduce_pgdat_percpu_threshold(pg_data_t *pgdat);
+-void restore_pgdat_percpu_threshold(pg_data_t *pgdat);
++
++int calculate_pressure_threshold(struct zone *zone);
++int calculate_normal_threshold(struct zone *zone);
++void set_pgdat_percpu_threshold(pg_data_t *pgdat,
++				int (*calculate_pressure)(struct zone *));
+ #else /* CONFIG_SMP */
+ 
+ /*
+@@ -300,8 +303,7 @@ static inline void __dec_zone_page_state(struct page *page,
+ #define dec_zone_page_state __dec_zone_page_state
+ #define mod_zone_page_state __mod_zone_page_state
+ 
+-static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+-static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { }
++#define set_pgdat_percpu_threshold(pgdat, callback) { }
+ 
+ static inline void refresh_cpu_vm_stats(int cpu) { }
+ #endif
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index 3e71cb1..ba39948 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2378,9 +2378,24 @@ static int kswapd(void *p)
+ 			 */
+ 			if (!sleeping_prematurely(pgdat, order, remaining)) {
+ 				trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+-				restore_pgdat_percpu_threshold(pgdat);
++
++				/*
++				 * vmstat counters are not perfectly
++				 * accurate and the estimated value
++				 * for counters such as NR_FREE_PAGES
++				 * can deviate from the true value by
++				 * nr_online_cpus * threshold. To
++				 * avoid the zone watermarks being
++				 * breached while under pressure, we
++				 * reduce the per-cpu vmstat threshold
++				 * while kswapd is awake and restore
++				 * them before going back to sleep.
++				 */
++				set_pgdat_percpu_threshold(pgdat,
++					calculate_normal_threshold);
+ 				schedule();
+-				reduce_pgdat_percpu_threshold(pgdat);
++				set_pgdat_percpu_threshold(pgdat,
++					calculate_pressure_threshold);
+ 			} else {
+ 				if (remaining)
+ 					count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index 4d7faeb..511c2c0 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -81,7 +81,7 @@ EXPORT_SYMBOL(vm_stat);
+ 
+ #ifdef CONFIG_SMP
+ 
+-static int calculate_pressure_threshold(struct zone *zone)
++int calculate_pressure_threshold(struct zone *zone)
+ {
+ 	int threshold;
+ 	int watermark_distance;
+@@ -105,7 +105,7 @@ static int calculate_pressure_threshold(struct zone *zone)
+ 	return threshold;
+ }
+ 
+-static int calculate_threshold(struct zone *zone)
++int calculate_normal_threshold(struct zone *zone)
+ {
+ 	int threshold;
+ 	int mem;	/* memory in 128 MB units */
+@@ -164,7 +164,7 @@ static void refresh_zone_stat_thresholds(void)
+ 	for_each_populated_zone(zone) {
+ 		unsigned long max_drift, tolerate_drift;
+ 
+-		threshold = calculate_threshold(zone);
++		threshold = calculate_normal_threshold(zone);
+ 
+ 		for_each_online_cpu(cpu)
+ 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+@@ -183,46 +183,24 @@ static void refresh_zone_stat_thresholds(void)
+ 	}
+ }
+ 
+-void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
++void set_pgdat_percpu_threshold(pg_data_t *pgdat,
++				int (*calculate_pressure)(struct zone *))
+ {
+ 	struct zone *zone;
+ 	int cpu;
+ 	int threshold;
+ 	int i;
+ 
+-	get_online_cpus();
+-	for (i = 0; i < pgdat->nr_zones; i++) {
+-		zone = &pgdat->node_zones[i];
+-		if (!zone->percpu_drift_mark)
+-			continue;
+-
+-		threshold = calculate_pressure_threshold(zone);
+-		for_each_online_cpu(cpu)
+-			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+-							= threshold;
+-	}
+-	put_online_cpus();
+-}
+-
+-void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
+-{
+-	struct zone *zone;
+-	int cpu;
+-	int threshold;
+-	int i;
+-
+-	get_online_cpus();
+ 	for (i = 0; i < pgdat->nr_zones; i++) {
+ 		zone = &pgdat->node_zones[i];
+ 		if (!zone->percpu_drift_mark)
+ 			continue;
+ 
+-		threshold = calculate_threshold(zone);
+-		for_each_online_cpu(cpu)
++		threshold = (*calculate_pressure)(zone);
++		for_each_possible_cpu(cpu)
+ 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ 							= threshold;
+ 	}
+-	put_online_cpus();
+ }
+ 
+ /*
+-- 
+1.7.3.2
+
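
As a footnote on the second patch's shape (again an illustration, not kernel code): instead of two near-duplicate reduce/restore functions, one setter walks a node's zones and applies whichever threshold calculation the caller passes in. The sketch below uses simplified stand-ins (struct fake_zone, set_thresholds, a fixed CPU count) rather than the kernel's types.

/*
 * Illustration only: the "single setter + callback" pattern that
 * set_pgdat_percpu_threshold() uses in the patch above, with toy data.
 */
#include <stdio.h>

struct fake_zone {
	const char *name;
	long min_wmark, low_wmark;
	int stat_threshold;		/* stand-in for the per-cpu field */
};

static int normal_threshold(struct fake_zone *z)
{
	(void)z;
	return 125;			/* pretend: memory-size based value */
}

static int pressure_threshold(struct fake_zone *z)
{
	long t = (z->low_wmark - z->min_wmark) / 64;	/* say, 64 CPUs */

	if (t < 1)
		t = 1;
	if (t > 125)
		t = 125;
	return (int)t;
}

/* One setter, parameterised by the threshold calculation. */
static void set_thresholds(struct fake_zone *zones, int nr,
			   int (*calc)(struct fake_zone *))
{
	for (int i = 0; i < nr; i++)
		zones[i].stat_threshold = calc(&zones[i]);
}

int main(void)
{
	struct fake_zone zones[] = {
		{ "DMA32",  2048,  2560, 0 },
		{ "Normal", 11234, 14042, 0 },
	};

	set_thresholds(zones, 2, pressure_threshold);	/* kswapd wakes up */
	printf("%s while kswapd is awake: %d\n", zones[1].name, zones[1].stat_threshold);

	set_thresholds(zones, 2, normal_threshold);	/* kswapd goes back to sleep */
	printf("%s while kswapd sleeps:   %d\n", zones[1].name, zones[1].stat_threshold);
	return 0;
}

Keeping one copy of the loop and passing the calculation as a function pointer is the same trade the patch description makes: kswapd picks the pressure calculation on wake-up and the normal one before sleeping.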