Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c         25
-rw-r--r--  mm/internal.h         2
-rw-r--r--  mm/memory-failure.c   6
-rw-r--r--  mm/mmu_notifier.c    45
-rw-r--r--  mm/page_alloc.c       4
-rw-r--r--  mm/sparse.c           3
-rw-r--r--  mm/vmscan.c          38
7 files changed, 91 insertions, 32 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e198831276a..19558df4032 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2393,6 +2393,22 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
{
mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
__unmap_hugepage_range(vma, start, end, ref_page);
+ /*
+ * Clear this flag so that x86's huge_pmd_share page_table_shareable
+ * test will fail on a vma being torn down, and not grab a page table
+ * on its way out. We're lucky that the flag has such an appropriate
+ * name, and can in fact be safely cleared here. We could clear it
+ * before the __unmap_hugepage_range above, but all that's necessary
+ * is to clear it before releasing the i_mmap_mutex below.
+ *
+ * This works because in the contexts this is called, the VMA is
+ * going to be destroyed. It is not vulnerable to madvise(DONTNEED)
+ * because madvise is not supported on hugetlbfs. The same applies
+ * for direct IO. unmap_hugepage_range() is only being called just
+ * before free_pgtables() so clearing VM_MAYSHARE will not cause
+ * surprises later.
+ */
+ vma->vm_flags &= ~VM_MAYSHARE;
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
}
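
For illustration, a minimal sketch of what the sharing side sees once the teardown path above has cleared VM_MAYSHARE under i_mmap_mutex: x86's huge_pmd_share() walks the mapping's VMAs with the same mutex held, so a dying VMA is rejected before its page table can gain a new reference. The helper below is a hypothetical, simplified stand-in for the VM_MAYSHARE part of page_table_shareable(), not code from arch/x86/mm/hugetlbpage.c (the real test is stricter).

/*
 * Sketch only: the check a sharer would make with i_mmap_mutex held.
 */
static bool hugetlb_vma_shareable(struct vm_area_struct *svma)
{
	/*
	 * A VMA in unmap_hugepage_range() has already dropped
	 * VM_MAYSHARE inside the same i_mmap_mutex critical section,
	 * so it fails this test and its page table is never
	 * re-referenced on the way out.
	 */
	return !!(svma->vm_flags & VM_MAYSHARE);
}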
@@ -2959,9 +2975,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
}
}
spin_unlock(&mm->page_table_lock);
- mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+ /*
+ * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+ * may have cleared our pud entry and done put_page on the page table:
+ * once we release i_mmap_mutex, another task can do the final put_page
+ * and that page table be reused and filled with junk.
+ */
flush_tlb_range(vma, start, end);
+ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
}
int hugetlb_reserve_pages(struct inode *inode,
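
As a further sketch (annotated for clarity, not the kernel source), the ordering rule the second hunk enforces at the end of hugetlb_change_protection() is:

	spin_unlock(&mm->page_table_lock);
	/*
	 * 1. Flush stale TLB entries while i_mmap_mutex still keeps the
	 *    (possibly shared) page table alive.
	 */
	flush_tlb_range(vma, start, end);
	/*
	 * 2. Only then release i_mmap_mutex: huge_pmd_unshare() may have
	 *    left the page table with a single reference, and the final
	 *    put_page() by another task can free and reuse it as soon as
	 *    the mutex is dropped.
	 */
	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);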
diff --git a/mm/internal.h b/mm/internal.h
index 2ba87fbfb75..8052379a55a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -347,3 +347,5 @@ extern u32 hwpoison_filter_enable;
extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long,
unsigned long, unsigned long);
+
+extern void set_pageblock_order(void);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index de4ce705845..6de0d613bbe 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1433,8 +1433,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
/* Keep page count to indicate a given hugepage is isolated. */
list_add(&hpage->lru, &pagelist);
- ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
- true);
+ ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false,
+ MIGRATE_SYNC);
if (ret) {
struct page *page1, *page2;
list_for_each_entry_safe(page1, page2, &pagelist, lru)
@@ -1563,7 +1563,7 @@ int soft_offline_page(struct page *page, int flags)
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
- 0, MIGRATE_SYNC);
+ false, MIGRATE_SYNC);
if (ret) {
putback_lru_pages(&pagelist);
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 9a611d3a184..862b60822d9 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -33,6 +33,24 @@
void __mmu_notifier_release(struct mm_struct *mm)
{
struct mmu_notifier *mn;
+ struct hlist_node *n;
+
+ /*
+ * RCU here will block mmu_notifier_unregister until
+ * ->release returns.
+ */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
+ /*
+ * if ->release runs before mmu_notifier_unregister it
+ * must be handled as it's the only way for the driver
+ * to flush all existing sptes and stop the driver
+ * from establishing any more sptes before all the
+ * pages in the mm are freed.
+ */
+ if (mn->ops->release)
+ mn->ops->release(mn, mm);
+ rcu_read_unlock();
spin_lock(&mm->mmu_notifier_mm->lock);
while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
* mmu_notifier_unregister to return.
*/
hlist_del_init_rcu(&mn->hlist);
- /*
- * RCU here will block mmu_notifier_unregister until
- * ->release returns.
- */
- rcu_read_lock();
- spin_unlock(&mm->mmu_notifier_mm->lock);
- /*
- * if ->release runs before mmu_notifier_unregister it
- * must be handled as it's the only way for the driver
- * to flush all existing sptes and stop the driver
- * from establishing any more sptes before all the
- * pages in the mm are freed.
- */
- if (mn->ops->release)
- mn->ops->release(mn, mm);
- rcu_read_unlock();
- spin_lock(&mm->mmu_notifier_mm->lock);
}
spin_unlock(&mm->mmu_notifier_mm->lock);
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
{
BUG_ON(atomic_read(&mm->mm_count) <= 0);
- spin_lock(&mm->mmu_notifier_mm->lock);
if (!hlist_unhashed(&mn->hlist)) {
- hlist_del_rcu(&mn->hlist);
-
/*
* RCU here will force exit_mmap to wait ->release to finish
* before freeing the pages.
*/
rcu_read_lock();
- spin_unlock(&mm->mmu_notifier_mm->lock);
+
/*
* exit_mmap will block in mmu_notifier_release to
* guarantee ->release is called before freeing the
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
if (mn->ops->release)
mn->ops->release(mn, mm);
rcu_read_unlock();
- } else
+
+ spin_lock(&mm->mmu_notifier_mm->lock);
+ hlist_del_rcu(&mn->hlist);
spin_unlock(&mm->mmu_notifier_mm->lock);
+ }
/*
* Wait any running method to finish, of course including
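
Read together, the hunks above give both release paths the same shape. A condensed sketch of __mmu_notifier_release() after the change (assumed simplifications, not the patched file):

static void mmu_notifier_release_sketch(struct mm_struct *mm)
{
	struct mmu_notifier *mn;
	struct hlist_node *n;

	/*
	 * rcu_read_lock() holds off mmu_notifier_unregister() until
	 * every ->release has returned.
	 */
	rcu_read_lock();
	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
		if (mn->ops->release)
			mn->ops->release(mn, mm);	/* driver flushes its sptes */
	rcu_read_unlock();

	/*
	 * List surgery happens only under the spinlock, never wrapped
	 * around a ->release callback as the old code did.
	 */
	spin_lock(&mm->mmu_notifier_mm->lock);
	while (!hlist_empty(&mm->mmu_notifier_mm->list)) {
		mn = hlist_entry(mm->mmu_notifier_mm->list.first,
				 struct mmu_notifier, hlist);
		hlist_del_init_rcu(&mn->hlist);
	}
	spin_unlock(&mm->mmu_notifier_mm->lock);
}

mmu_notifier_unregister() mirrors this ordering: ->release under rcu_read_lock() first, then hlist_del_rcu() under the spinlock, with the RCU synchronization that follows the truncated "Wait any running method to finish" comment still guaranteeing no callback is in flight when the notifier can be freed.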
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4a4f9219683..201b50813b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4301,7 +4301,7 @@ static inline void setup_usemap(struct pglist_data *pgdat,
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-static inline void __init set_pageblock_order(void)
+void __init set_pageblock_order(void)
{
unsigned int order;
@@ -4329,7 +4329,7 @@ static inline void __init set_pageblock_order(void)
* include/linux/pageblock-flags.h for the values of pageblock_order based on
* the kernel config
*/
-static inline void set_pageblock_order(void)
+void __init set_pageblock_order(void)
{
}
diff --git a/mm/sparse.c b/mm/sparse.c
index c7bb952400c..950981fd07c 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -493,6 +493,9 @@ void __init sparse_init(void)
struct page **map_map;
#endif
+ /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
+ set_pageblock_order();
+
/*
* map is using big page (aka 2M in x86 64 bit)
* usemap is less one page (aka 24 bytes)
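
The call has to happen this early because the per-section usemap allocated by sparse_init() is sized from pageblock_order. A condensed sketch of that dependency (assumed, simplified from include/linux/mmzone.h and mm/sparse.c):

/* One NR_PAGEBLOCK_BITS-wide flags field per pageblock in a section. */
#define SECTION_BLOCKFLAGS_BITS \
	((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)

static unsigned long usemap_size_sketch(void)
{
	/*
	 * With HUGETLB_PAGE_SIZE_VARIABLE, pageblock_order is a variable.
	 * If set_pageblock_order() only ran later, during zone setup, the
	 * usemaps allocated here would be sized with the default
	 * pageblock_order of 0 instead of the huge-page-based one.
	 */
	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
}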
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 66e431060c0..f0f8ebbb224 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -720,9 +720,41 @@ static unsigned long shrink_page_list(struct list_head *page_list,
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
if (PageWriteback(page)) {
- nr_writeback++;
- unlock_page(page);
- goto keep;
+ /*
+ * memcg doesn't have any dirty pages throttling so we
+ * could easily OOM just because too many pages are in
+ * writeback and there is nothing else to reclaim.
+ *
+ * Check __GFP_IO, certainly because a loop driver
+ * thread might enter reclaim, and deadlock if it waits
+ * on a page for which it is needed to do the write
+ * (loop masks off __GFP_IO|__GFP_FS for this reason);
+ * but more thought would probably show more reasons.
+ *
+ * Don't require __GFP_FS, since we're not going into
+ * the FS, just waiting on its writeback completion.
+ * Worryingly, ext4, gfs2 and xfs allocate pages with
+ * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
+ * testing may_enter_fs here is liable to OOM on them.
+ */
+ if (global_reclaim(sc) ||
+ !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
+ /*
+ * This is slightly racy - end_page_writeback()
+ * might have just cleared PageReclaim, then
+ * setting PageReclaim here ends up interpreted
+ * as PageReadahead - but that does not matter
+ * enough to care. What we do want is for this
+ * page to have PageReclaim set next time memcg
+ * reclaim reaches the tests above, so it will
+ * then wait_on_page_writeback() to avoid OOM;
+ * and it's also appropriate in global reclaim.
+ */
+ SetPageReclaim(page);
+ nr_writeback++;
+ goto keep_locked;
+ }
+ wait_on_page_writeback(page);
}
references = page_check_references(page, sc);
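
For clarity, the branch added above amounts to the following decision, pulled out into an assumed helper name (not code from mm/vmscan.c):

/* Should memcg reclaim stall on this writeback page instead of skipping it? */
static bool should_stall_on_writeback(struct page *page, struct scan_control *sc)
{
	/* Global reclaim is throttled elsewhere; never sleep here. */
	if (global_reclaim(sc))
		return false;
	/* Stall only on the second encounter, flagged by PageReclaim. */
	if (!PageReclaim(page))
		return false;
	/* And only when the caller may block on I/O (loop masks __GFP_IO off). */
	if (!(sc->gfp_mask & __GFP_IO))
		return false;
	return true;		/* wait_on_page_writeback() instead of OOMing */
}

When it returns false, the page is marked PageReclaim, nr_writeback is bumped and the page is kept locked, exactly as in the hunk above.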