summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/memory.c5
-rw-r--r--mm/memory_hotplug.c1
-rw-r--r--mm/mempolicy.c30
-rw-r--r--mm/page_alloc.c17
-rw-r--r--mm/rmap.c3
-rw-r--r--mm/slab.c96
-rw-r--r--mm/swap.c28
-rw-r--r--mm/vmscan.c21
8 files changed, 154 insertions, 47 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 9abc6008544..85e80a57db2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -623,11 +623,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
(*zap_work)--;
continue;
}
+
+ (*zap_work) -= PAGE_SIZE;
+
if (pte_present(ptent)) {
struct page *page;
- (*zap_work) -= PAGE_SIZE;
-
page = vm_normal_page(vma, addr, ptent);
if (unlikely(details) && page) {
/*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a918f77f02f..1fe76d963ac 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -130,6 +130,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
onlined_pages++;
}
zone->present_pages += onlined_pages;
+ zone->zone_pgdat->node_present_pages += onlined_pages;
setup_per_zone_pages_min();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d80fa7d8f72..b21869a39f0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -330,9 +330,19 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
int err;
struct vm_area_struct *first, *vma, *prev;
- /* Clear the LRU lists so pages can be isolated */
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+ /* Must have swap device for migration */
+ if (nr_swap_pages <= 0)
+ return ERR_PTR(-ENODEV);
+
+ /*
+ * Clear the LRU lists so pages can be isolated.
+ * Note that pages may be moved off the LRU after we have
+ * drained them. Those pages will fail to migrate like other
+ * pages that may be busy.
+ */
lru_add_drain_all();
+ }
first = find_vma(mm, start);
if (!first)
@@ -748,7 +758,7 @@ long do_mbind(unsigned long start, unsigned long len,
MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
|| mode > MPOL_MAX)
return -EINVAL;
- if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+ if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
if (start & ~PAGE_MASK)
@@ -942,20 +952,20 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
*/
if ((current->euid != task->suid) && (current->euid != task->uid) &&
(current->uid != task->suid) && (current->uid != task->uid) &&
- !capable(CAP_SYS_ADMIN)) {
+ !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out;
}
task_nodes = cpuset_mems_allowed(task);
/* Is the user allowed to access the target nodes? */
- if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+ if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out;
}
err = do_migrate_pages(mm, &old, &new,
- capable(CAP_SYS_ADMIN) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
+ capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
mmput(mm);
return err;
@@ -1789,6 +1799,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty)
cond_resched();
}
+#ifdef CONFIG_HUGETLB_PAGE
static void check_huge_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct numa_maps *md)
@@ -1814,6 +1825,13 @@ static void check_huge_range(struct vm_area_struct *vma,
gather_stats(page, md, pte_dirty(*ptep));
}
}
+#else
+static inline void check_huge_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct numa_maps *md)
+{
+}
+#endif
int show_numa_map(struct seq_file *m, void *v)
{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 791690d7d3f..234bd4895d1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -590,21 +590,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
}
#ifdef CONFIG_NUMA
-/* Called from the slab reaper to drain remote pagesets */
-void drain_remote_pages(void)
+/*
+ * Called from the slab reaper to drain pagesets on a particular node that
+ * belong to the currently executing processor.
+ */
+void drain_node_pages(int nodeid)
{
- struct zone *zone;
- int i;
+ int i, z;
unsigned long flags;
local_irq_save(flags);
- for_each_zone(zone) {
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
struct per_cpu_pageset *pset;
- /* Do not drain local pagesets */
- if (zone->zone_pgdat->node_id == numa_node_id())
- continue;
-
pset = zone_pcp(zone, smp_processor_id());
for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
struct per_cpu_pages *pcp;
diff --git a/mm/rmap.c b/mm/rmap.c
index d8ce5ff6145..67f0e20b101 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -537,9 +537,6 @@ void page_add_new_anon_rmap(struct page *page,
*/
void page_add_file_rmap(struct page *page)
{
- BUG_ON(PageAnon(page));
- BUG_ON(!pfn_valid(page_to_pfn(page)));
-
if (atomic_inc_and_test(&page->_mapcount))
__inc_page_state(nr_mapped);
}
diff --git a/mm/slab.c b/mm/slab.c
index f2e92dc1c9c..d0bd7f07ab0 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -789,6 +789,47 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, char *
dump_stack();
}
+#ifdef CONFIG_NUMA
+/*
+ * Special reaping functions for NUMA systems called from cache_reap().
+ * These take care of doing round robin flushing of alien caches (containing
+ * objects freed on different nodes from which they were allocated) and the
+ * flushing of remote pcps by calling drain_node_pages.
+ */
+static DEFINE_PER_CPU(unsigned long, reap_node);
+
+static void init_reap_node(int cpu)
+{
+ int node;
+
+ node = next_node(cpu_to_node(cpu), node_online_map);
+ if (node == MAX_NUMNODES)
+ node = 0;
+
+ __get_cpu_var(reap_node) = node;
+}
+
+static void next_reap_node(void)
+{
+ int node = __get_cpu_var(reap_node);
+
+ /*
+ * Also drain per cpu pages on remote zones
+ */
+ if (node != numa_node_id())
+ drain_node_pages(node);
+
+ node = next_node(node, node_online_map);
+ if (unlikely(node >= MAX_NUMNODES))
+ node = first_node(node_online_map);
+ __get_cpu_var(reap_node) = node;
+}
+
+#else
+#define init_reap_node(cpu) do { } while (0)
+#define next_reap_node(void) do { } while (0)
+#endif
+
/*
* Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
* via the workqueue/eventd.
@@ -806,6 +847,7 @@ static void __devinit start_cpu_timer(int cpu)
* at that time.
*/
if (keventd_up() && reap_work->func == NULL) {
+ init_reap_node(cpu);
INIT_WORK(reap_work, cache_reap, NULL);
schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
}
@@ -884,6 +926,23 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
}
}
+/*
+ * Called from cache_reap() to regularly drain alien caches round robin.
+ */
+static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+{
+ int node = __get_cpu_var(reap_node);
+
+ if (l3->alien) {
+ struct array_cache *ac = l3->alien[node];
+ if (ac && ac->avail) {
+ spin_lock_irq(&ac->lock);
+ __drain_alien_cache(cachep, ac, node);
+ spin_unlock_irq(&ac->lock);
+ }
+ }
+}
+
static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
{
int i = 0;
@@ -902,6 +961,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **al
#else
#define drain_alien_cache(cachep, alien) do { } while (0)
+#define reap_alien(cachep, l3) do { } while (0)
static inline struct array_cache **alloc_alien_cache(int node, int limit)
{
@@ -1124,6 +1184,7 @@ void __init kmem_cache_init(void)
struct cache_sizes *sizes;
struct cache_names *names;
int i;
+ int order;
for (i = 0; i < NUM_INIT_LISTS; i++) {
kmem_list3_init(&initkmem_list3[i]);
@@ -1167,11 +1228,15 @@ void __init kmem_cache_init(void)
cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
- cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0,
- &left_over, &cache_cache.num);
+ for (order = 0; order < MAX_ORDER; order++) {
+ cache_estimate(order, cache_cache.buffer_size,
+ cache_line_size(), 0, &left_over, &cache_cache.num);
+ if (cache_cache.num)
+ break;
+ }
if (!cache_cache.num)
BUG();
-
+ cache_cache.gfporder = order;
cache_cache.colour = left_over / cache_cache.colour_off;
cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
sizeof(struct slab), cache_line_size());
@@ -1648,6 +1713,14 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
left_over = remainder;
/*
+ * A VFS-reclaimable slab tends to have most allocations
+ * as GFP_NOFS and we really don't want to have to be allocating
+ * higher-order pages when we are unable to shrink dcache.
+ */
+ if (flags & SLAB_RECLAIM_ACCOUNT)
+ break;
+
+ /*
* Large number of objects is good, but very large slabs are
* currently bad for the gfp()s.
*/
@@ -1869,17 +1942,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
size = ALIGN(size, align);
- if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
- /*
- * A VFS-reclaimable slab tends to have most allocations
- * as GFP_NOFS and we really don't want to have to be allocating
- * higher-order pages when we are unable to shrink dcache.
- */
- cachep->gfporder = 0;
- cache_estimate(cachep->gfporder, size, align, flags,
- &left_over, &cachep->num);
- } else
- left_over = calculate_slab_order(cachep, size, align, flags);
+ left_over = calculate_slab_order(cachep, size, align, flags);
if (!cachep->num) {
printk("kmem_cache_create: couldn't create cache %s.\n", name);
@@ -3494,8 +3557,7 @@ static void cache_reap(void *unused)
check_irq_on();
l3 = searchp->nodelists[numa_node_id()];
- if (l3->alien)
- drain_alien_cache(searchp, l3->alien);
+ reap_alien(searchp, l3);
spin_lock_irq(&l3->list_lock);
drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3545,7 +3607,7 @@ static void cache_reap(void *unused)
}
check_irq_on();
mutex_unlock(&cache_chain_mutex);
- drain_remote_pages();
+ next_reap_node();
/* Setup the next iteration */
schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
}
diff --git a/mm/swap.c b/mm/swap.c
index cce3dda59c5..b524ea90bdd 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -393,7 +393,8 @@ void pagevec_strip(struct pagevec *pvec)
struct page *page = pvec->pages[i];
if (PagePrivate(page) && !TestSetPageLocked(page)) {
- try_to_release_page(page, 0);
+ if (PagePrivate(page))
+ try_to_release_page(page, 0);
unlock_page(page);
}
}
@@ -489,13 +490,34 @@ void percpu_counter_mod(struct percpu_counter *fbc, long amount)
if (count >= FBC_BATCH || count <= -FBC_BATCH) {
spin_lock(&fbc->lock);
fbc->count += count;
+ *pcount = 0;
spin_unlock(&fbc->lock);
- count = 0;
+ } else {
+ *pcount = count;
}
- *pcount = count;
put_cpu();
}
EXPORT_SYMBOL(percpu_counter_mod);
+
+/*
+ * Add up all the per-cpu counts, return the result. This is a more accurate
+ * but much slower version of percpu_counter_read_positive()
+ */
+long percpu_counter_sum(struct percpu_counter *fbc)
+{
+ long ret;
+ int cpu;
+
+ spin_lock(&fbc->lock);
+ ret = fbc->count;
+ for_each_cpu(cpu) {
+ long *pcount = per_cpu_ptr(fbc->counters, cpu);
+ ret += *pcount;
+ }
+ spin_unlock(&fbc->lock);
+ return ret < 0 ? 0 : ret;
+}
+EXPORT_SYMBOL(percpu_counter_sum);
#endif
/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b0af7593d01..4fe7e3aa02e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -700,7 +700,7 @@ int migrate_page_remove_references(struct page *newpage,
* the page.
*/
if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
- return 1;
+ return -EAGAIN;
/*
* Establish swap ptes for anonymous pages or destroy pte
@@ -721,13 +721,15 @@ int migrate_page_remove_references(struct page *newpage,
* If the page was not migrated then the PageSwapCache bit
* is still set and the operation may continue.
*/
- try_to_unmap(page, 1);
+ if (try_to_unmap(page, 1) == SWAP_FAIL)
+ /* A vma has VM_LOCKED set -> Permanent failure */
+ return -EPERM;
/*
* Give up if we were unable to remove all mappings.
*/
if (page_mapcount(page))
- return 1;
+ return -EAGAIN;
write_lock_irq(&mapping->tree_lock);
@@ -738,7 +740,7 @@ int migrate_page_remove_references(struct page *newpage,
if (!page_mapping(page) || page_count(page) != nr_refs ||
*radix_pointer != page) {
write_unlock_irq(&mapping->tree_lock);
- return 1;
+ return -EAGAIN;
}
/*
@@ -813,10 +815,14 @@ EXPORT_SYMBOL(migrate_page_copy);
*/
int migrate_page(struct page *newpage, struct page *page)
{
+ int rc;
+
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
- if (migrate_page_remove_references(newpage, page, 2))
- return -EAGAIN;
+ rc = migrate_page_remove_references(newpage, page, 2);
+
+ if (rc)
+ return rc;
migrate_page_copy(newpage, page);
@@ -1883,7 +1889,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
if (!(gfp_mask & __GFP_WAIT) ||
zone->all_unreclaimable ||
- atomic_read(&zone->reclaim_in_progress) > 0)
+ atomic_read(&zone->reclaim_in_progress) > 0 ||
+ (p->flags & PF_MEMALLOC))
return 0;
node_id = zone->zone_pgdat->node_id;