diff options
Diffstat (limited to 'mm-thp-fix-pmd_bad-triggering.patch')
-rw-r--r-- | mm-thp-fix-pmd_bad-triggering.patch | 447 |
1 files changed, 0 insertions, 447 deletions
diff --git a/mm-thp-fix-pmd_bad-triggering.patch b/mm-thp-fix-pmd_bad-triggering.patch deleted file mode 100644 index 8e1a77cd7..000000000 --- a/mm-thp-fix-pmd_bad-triggering.patch +++ /dev/null @@ -1,447 +0,0 @@ -In some cases it may happen that pmd_none_or_clear_bad() is called -with the mmap_sem hold in read mode. In those cases the huge page -faults can allocate hugepmds under pmd_none_or_clear_bad() and that -can trigger a false positive from pmd_bad() that will not like to see -a pmd materializing as trans huge. - -It's not khugepaged the problem, khugepaged holds the mmap_sem in -write mode (and all those sites must hold the mmap_sem in read mode to -prevent pagetables to go away from under them, during code review it -seems vm86 mode on 32bit kernels requires that too unless it's -restricted to 1 thread per process or UP builds). The race is only -with the huge pagefaults that can convert a pmd_none() into a -pmd_trans_huge(). - -Effectively all these pmd_none_or_clear_bad() sites running with -mmap_sem in read mode are somewhat speculative with the page faults, -and the result is always undefined when they run simultaneously. This -is probably why it wasn't common to run into this. For example if the -madvise(MADV_DONTNEED) runs zap_page_range() shortly before the page -fault, the hugepage will not be zapped, if the page fault runs first -it will be zapped. - -Altering pmd_bad() not to error out if it finds hugepmds won't be -enough to fix this, because zap_pmd_range would then proceed to call -zap_pte_range (which would be incorrect if the pmd become a -pmd_trans_huge()). - -The simplest way to fix this is to read the pmd in the local stack -(regardless of what we read, no need of actual CPU barriers, only -compiler barrier needed), and be sure it is not changing under the -code that computes its value. Even if the real pmd is changing under -the value we hold on the stack, we don't care. If we actually end up -in zap_pte_range it means the pmd was not none already and it was not -huge, and it can't become huge from under us (khugepaged locking -explained above). - -All we need is to enforce that there is no way anymore that in a code -path like below, pmd_trans_huge can be false, but -pmd_none_or_clear_bad can run into a hugepmd. The overhead of a -barrier() is just a compiler tweak and should not be measurable (I -only added it for THP builds). I don't exclude different compiler -versions may have prevented the race too by caching the value of *pmd -on the stack (that hasn't been verified, but it wouldn't be impossible -considering pmd_none_or_clear_bad, pmd_bad, pmd_trans_huge, pmd_none -are all inlines and there's no external function called in between -pmd_trans_huge and pmd_none_or_clear_bad). - - if (pmd_trans_huge(*pmd)) { - if (next-addr != HPAGE_PMD_SIZE) { - VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); - split_huge_page_pmd(vma->vm_mm, pmd); - } else if (zap_huge_pmd(tlb, vma, pmd, addr)) - continue; - /* fall through */ - } - if (pmd_none_or_clear_bad(pmd)) - -Because this race condition could be exercised without special -privileges this was reported in CVE-2012-1179. - -The race was identified and fully explained by Ulrich who debugged it. -I'm quoting his accurate explanation below, for reference. - -====== start quote ======= - mapcount 0 page_mapcount 1 - kernel BUG at mm/huge_memory.c:1384! - -At some point prior to the panic, a "bad pmd ..." message similar to the -following is logged on the console: - - mm/memory.c:145: bad pmd ffff8800376e1f98(80000000314000e7). - -The "bad pmd ..." message is logged by pmd_clear_bad() before it clears -the page's PMD table entry. - - 143 void pmd_clear_bad(pmd_t *pmd) - 144 { --> 145 pmd_ERROR(*pmd); - 146 pmd_clear(pmd); - 147 } - -After the PMD table entry has been cleared, there is an inconsistency -between the actual number of PMD table entries that are mapping the page -and the page's map count (_mapcount field in struct page). When the page -is subsequently reclaimed, __split_huge_page() detects this inconsistency. - - 1381 if (mapcount != page_mapcount(page)) - 1382 printk(KERN_ERR "mapcount %d page_mapcount %d\n", - 1383 mapcount, page_mapcount(page)); --> 1384 BUG_ON(mapcount != page_mapcount(page)); - -The root cause of the problem is a race of two threads in a multithreaded -process. Thread B incurs a page fault on a virtual address that has never -been accessed (PMD entry is zero) while Thread A is executing an madvise() -system call on a virtual address within the same 2 MB (huge page) range. - - virtual address space - .---------------------. - | | - | | - .-|---------------------| - | | | - | | |<-- B(fault) - | | | - 2 MB | |/////////////////////|-. - huge < |/////////////////////| > A(range) - page | |/////////////////////|-' - | | | - | | | - '-|---------------------| - | | - | | - '---------------------' - -- Thread A is executing an madvise(..., MADV_DONTNEED) system call - on the virtual address range "A(range)" shown in the picture. - -sys_madvise - // Acquire the semaphore in shared mode. - down_read(¤t->mm->mmap_sem) - ... - madvise_vma - switch (behavior) - case MADV_DONTNEED: - madvise_dontneed - zap_page_range - unmap_vmas - unmap_page_range - zap_pud_range - zap_pmd_range - // - // Assume that this huge page has never been accessed. - // I.e. content of the PMD entry is zero (not mapped). - // - if (pmd_trans_huge(*pmd)) { - // We don't get here due to the above assumption. - } - // - // Assume that Thread B incurred a page fault and - .---------> // sneaks in here as shown below. - | // - | if (pmd_none_or_clear_bad(pmd)) - | { - | if (unlikely(pmd_bad(*pmd))) - | pmd_clear_bad - | { - | pmd_ERROR - | // Log "bad pmd ..." message here. - | pmd_clear - | // Clear the page's PMD entry. - | // Thread B incremented the map count - | // in page_add_new_anon_rmap(), but - | // now the page is no longer mapped - | // by a PMD entry (-> inconsistency). - | } - | } - | - v -- Thread B is handling a page fault on virtual address "B(fault)" shown - in the picture. - -... -do_page_fault - __do_page_fault - // Acquire the semaphore in shared mode. - down_read_trylock(&mm->mmap_sem) - ... - handle_mm_fault - if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) - // We get here due to the above assumption (PMD entry is zero). - do_huge_pmd_anonymous_page - alloc_hugepage_vma - // Allocate a new transparent huge page here. - ... - __do_huge_pmd_anonymous_page - ... - spin_lock(&mm->page_table_lock) - ... - page_add_new_anon_rmap - // Here we increment the page's map count (starts at -1). - atomic_set(&page->_mapcount, 0) - set_pmd_at - // Here we set the page's PMD entry which will be cleared - // when Thread A calls pmd_clear_bad(). - ... - spin_unlock(&mm->page_table_lock) - -The mmap_sem does not prevent the race because both threads are acquiring -it in shared mode (down_read). Thread B holds the page_table_lock while -the page's map count and PMD table entry are updated. However, Thread A -does not synchronize on that lock. -====== end quote ======= - -Reported-by: Ulrich Obergfell <uobergfe@redhat.com> -Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> ---- - arch/x86/kernel/vm86_32.c | 2 + - fs/proc/task_mmu.c | 9 ++++++ - include/asm-generic/pgtable.h | 57 +++++++++++++++++++++++++++++++++++++++++ - mm/memcontrol.c | 4 +++ - mm/memory.c | 14 ++++++++-- - mm/mempolicy.c | 2 +- - mm/mincore.c | 2 +- - mm/pagewalk.c | 2 +- - mm/swapfile.c | 4 +-- - 9 files changed, 87 insertions(+), 9 deletions(-) - -diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c -index b466cab..328cb37 100644 ---- a/arch/x86/kernel/vm86_32.c -+++ b/arch/x86/kernel/vm86_32.c -@@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) - spinlock_t *ptl; - int i; - -+ down_write(&mm->mmap_sem); - pgd = pgd_offset(mm, 0xA0000); - if (pgd_none_or_clear_bad(pgd)) - goto out; -@@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) - } - pte_unmap_unlock(pte, ptl); - out: -+ up_write(&mm->mmap_sem); - flush_tlb(); - } - -diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c -index 7dcd2a2..3efa725 100644 ---- a/fs/proc/task_mmu.c -+++ b/fs/proc/task_mmu.c -@@ -409,6 +409,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - } else { - spin_unlock(&walk->mm->page_table_lock); - } -+ -+ if (pmd_trans_unstable(pmd)) -+ return 0; - /* - * The mmap_sem held all the way back in m_start() is what - * keeps khugepaged out of here and from collapsing things -@@ -507,6 +510,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, - struct page *page; - - split_huge_page_pmd(walk->mm, pmd); -+ if (pmd_trans_unstable(pmd)) -+ return 0; - - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - for (; addr != end; pte++, addr += PAGE_SIZE) { -@@ -670,6 +675,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - int err = 0; - - split_huge_page_pmd(walk->mm, pmd); -+ if (pmd_trans_unstable(pmd)) -+ return 0; - - /* find the first VMA at or above 'addr' */ - vma = find_vma(walk->mm, addr); -@@ -961,6 +968,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, - spin_unlock(&walk->mm->page_table_lock); - } - -+ if (pmd_trans_unstable(pmd)) -+ return 0; - orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); - do { - struct page *page = can_gather_numa_stats(*pte, md->vma, addr); -diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h -index 76bff2b..10f8291 100644 ---- a/include/asm-generic/pgtable.h -+++ b/include/asm-generic/pgtable.h -@@ -443,6 +443,63 @@ static inline int pmd_write(pmd_t pmd) - #endif /* __HAVE_ARCH_PMD_WRITE */ - #endif - -+/* -+ * This function is meant to be used by sites walking pagetables with -+ * the mmap_sem hold in read mode to protect against MADV_DONTNEED and -+ * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd -+ * into a null pmd and the transhuge page fault can convert a null pmd -+ * into an hugepmd or into a regular pmd (if the hugepage allocation -+ * fails). While holding the mmap_sem in read mode the pmd becomes -+ * stable and stops changing under us only if it's not null and not a -+ * transhuge pmd. When those races occurs and this function makes a -+ * difference vs the standard pmd_none_or_clear_bad, the result is -+ * undefined so behaving like if the pmd was none is safe (because it -+ * can return none anyway). The compiler level barrier() is critically -+ * important to compute the two checks atomically on the same pmdval. -+ */ -+static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) -+{ -+ /* depend on compiler for an atomic pmd read */ -+ pmd_t pmdval = *pmd; -+ /* -+ * The barrier will stabilize the pmdval in a register or on -+ * the stack so that it will stop changing under the code. -+ */ -+#ifdef CONFIG_TRANSPARENT_HUGEPAGE -+ barrier(); -+#endif -+ if (pmd_none(pmdval)) -+ return 1; -+ if (unlikely(pmd_bad(pmdval))) { -+ if (!pmd_trans_huge(pmdval)) -+ pmd_clear_bad(pmd); -+ return 1; -+ } -+ return 0; -+} -+ -+/* -+ * This is a noop if Transparent Hugepage Support is not built into -+ * the kernel. Otherwise it is equivalent to -+ * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in -+ * places that already verified the pmd is not none and they want to -+ * walk ptes while holding the mmap sem in read mode (write mode don't -+ * need this). If THP is not enabled, the pmd can't go away under the -+ * code even if MADV_DONTNEED runs, but if THP is enabled we need to -+ * run a pmd_trans_unstable before walking the ptes after -+ * split_huge_page_pmd returns (because it may have run when the pmd -+ * become null, but then a page fault can map in a THP and not a -+ * regular page). -+ */ -+static inline int pmd_trans_unstable(pmd_t *pmd) -+{ -+#ifdef CONFIG_TRANSPARENT_HUGEPAGE -+ return pmd_none_or_trans_huge_or_clear_bad(pmd); -+#else -+ return 0; -+#endif -+} -+ - #endif /* !__ASSEMBLY__ */ - - #endif /* _ASM_GENERIC_PGTABLE_H */ -diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index d0e57a3..67b0578 100644 ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c -@@ -5193,6 +5193,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, - spinlock_t *ptl; - - split_huge_page_pmd(walk->mm, pmd); -+ if (pmd_trans_unstable(pmd)) -+ return 0; - - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - for (; addr != end; pte++, addr += PAGE_SIZE) -@@ -5355,6 +5357,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, - spinlock_t *ptl; - - split_huge_page_pmd(walk->mm, pmd); -+ if (pmd_trans_unstable(pmd)) -+ return 0; - retry: - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - for (; addr != end; addr += PAGE_SIZE) { -diff --git a/mm/memory.c b/mm/memory.c -index fa2f04e..e3090fc 100644 ---- a/mm/memory.c -+++ b/mm/memory.c -@@ -1251,12 +1251,20 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, - VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); - split_huge_page_pmd(vma->vm_mm, pmd); - } else if (zap_huge_pmd(tlb, vma, pmd, addr)) -- continue; -+ goto next; - /* fall through */ - } -- if (pmd_none_or_clear_bad(pmd)) -- continue; -+ /* -+ * Here there can be other concurrent MADV_DONTNEED or -+ * trans huge page faults running, and if the pmd is -+ * none or trans huge it can change under us. This is -+ * because MADV_DONTNEED holds the mmap_sem in read -+ * mode. -+ */ -+ if (pmd_none_or_trans_huge_or_clear_bad(pmd)) -+ goto next; - next = zap_pte_range(tlb, vma, pmd, addr, next, details); -+ next: - cond_resched(); - } while (pmd++, addr = next, addr != end); - -diff --git a/mm/mempolicy.c b/mm/mempolicy.c -index 47296fe..0a37570 100644 ---- a/mm/mempolicy.c -+++ b/mm/mempolicy.c -@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, - do { - next = pmd_addr_end(addr, end); - split_huge_page_pmd(vma->vm_mm, pmd); -- if (pmd_none_or_clear_bad(pmd)) -+ if (pmd_none_or_trans_huge_or_clear_bad(pmd)) - continue; - if (check_pte_range(vma, pmd, addr, next, nodes, - flags, private)) -diff --git a/mm/mincore.c b/mm/mincore.c -index 636a868..936b4ce 100644 ---- a/mm/mincore.c -+++ b/mm/mincore.c -@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, - } - /* fall through */ - } -- if (pmd_none_or_clear_bad(pmd)) -+ if (pmd_none_or_trans_huge_or_clear_bad(pmd)) - mincore_unmapped_range(vma, addr, next, vec); - else - mincore_pte_range(vma, pmd, addr, next, vec); -diff --git a/mm/pagewalk.c b/mm/pagewalk.c -index 2f5cf10..aa9701e 100644 ---- a/mm/pagewalk.c -+++ b/mm/pagewalk.c -@@ -59,7 +59,7 @@ again: - continue; - - split_huge_page_pmd(walk->mm, pmd); -- if (pmd_none_or_clear_bad(pmd)) -+ if (pmd_none_or_trans_huge_or_clear_bad(pmd)) - goto again; - err = walk_pte_range(pmd, addr, next, walk); - if (err) -diff --git a/mm/swapfile.c b/mm/swapfile.c -index d999f09..f31b29d 100644 ---- a/mm/swapfile.c -+++ b/mm/swapfile.c -@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, - pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); -- if (unlikely(pmd_trans_huge(*pmd))) -- continue; -- if (pmd_none_or_clear_bad(pmd)) -+ if (pmd_none_or_trans_huge_or_clear_bad(pmd)) - continue; - ret = unuse_pte_range(vma, pmd, addr, next, entry, page); - if (ret) - --- -To unsubscribe, send a message with 'unsubscribe linux-mm' in -the body to majordomo@kvack.org. For more info on Linux MM, -see: http://www.linux-mm.org/ . -Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/ -Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
\ No newline at end of file |