summaryrefslogtreecommitdiffstats
path: root/mm-thp-fix-pmd_bad-triggering.patch
diff options
context:
space:
mode:
Diffstat (limited to 'mm-thp-fix-pmd_bad-triggering.patch')
-rw-r--r--mm-thp-fix-pmd_bad-triggering.patch447
1 files changed, 447 insertions, 0 deletions
diff --git a/mm-thp-fix-pmd_bad-triggering.patch b/mm-thp-fix-pmd_bad-triggering.patch
new file mode 100644
index 000000000..8e1a77cd7
--- /dev/null
+++ b/mm-thp-fix-pmd_bad-triggering.patch
@@ -0,0 +1,447 @@
+In some cases it may happen that pmd_none_or_clear_bad() is called
+with the mmap_sem held in read mode. In those cases the huge page
+faults can allocate hugepmds under pmd_none_or_clear_bad() and that
+can trigger a false positive from pmd_bad(), which does not expect to see
+a pmd materializing as trans huge.
+
+khugepaged is not the problem: khugepaged holds the mmap_sem in
+write mode (and all those sites must hold the mmap_sem in read mode to
+prevent pagetables from going away from under them; during code review it
+seems vm86 mode on 32bit kernels requires that too unless it's
+restricted to 1 thread per process or UP builds). The race is only
+with the huge pagefaults that can convert a pmd_none() into a
+pmd_trans_huge().
+
+Effectively all these pmd_none_or_clear_bad() sites running with
+mmap_sem in read mode are somewhat speculative with the page faults,
+and the result is always undefined when they run simultaneously. This
+is probably why it wasn't common to run into this. For example if the
+madvise(MADV_DONTNEED) runs zap_page_range() shortly before the page
+fault, the hugepage will not be zapped, if the page fault runs first
+it will be zapped.
+
+Altering pmd_bad() not to error out if it finds hugepmds won't be
+enough to fix this, because zap_pmd_range would then proceed to call
+zap_pte_range (which would be incorrect if the pmd became a
+pmd_trans_huge()).
+
+The simplest way to fix this is to read the pmd in the local stack
+(regardless of what we read, no need for actual CPU barriers, only a
+compiler barrier is needed), and be sure it is not changing under the
+code that computes its value. Even if the real pmd is changing under
+the value we hold on the stack, we don't care. If we actually end up
+in zap_pte_range it means the pmd was not none already and it was not
+huge, and it can't become huge from under us (khugepaged locking
+explained above).
+
+All we need is to enforce that there is no way anymore that in a code
+path like below, pmd_trans_huge can be false, but
+pmd_none_or_clear_bad can run into a hugepmd. The overhead of a
+barrier() is just a compiler tweak and should not be measurable (I
+only added it for THP builds). I don't exclude different compiler
+versions may have prevented the race too by caching the value of *pmd
+on the stack (that hasn't been verified, but it wouldn't be impossible
+considering pmd_none_or_clear_bad, pmd_bad, pmd_trans_huge, pmd_none
+are all inlines and there's no external function called in between
+pmd_trans_huge and pmd_none_or_clear_bad).
+
+ if (pmd_trans_huge(*pmd)) {
+ if (next-addr != HPAGE_PMD_SIZE) {
+ VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
+ split_huge_page_pmd(vma->vm_mm, pmd);
+ } else if (zap_huge_pmd(tlb, vma, pmd, addr))
+ continue;
+ /* fall through */
+ }
+ if (pmd_none_or_clear_bad(pmd))
+
+Because this race condition could be exercised without special
+privileges, this was reported in CVE-2012-1179.
+
+The race was identified and fully explained by Ulrich who debugged it.
+I'm quoting his accurate explanation below, for reference.
+
+====== start quote =======
+ mapcount 0 page_mapcount 1
+ kernel BUG at mm/huge_memory.c:1384!
+
+At some point prior to the panic, a "bad pmd ..." message similar to the
+following is logged on the console:
+
+ mm/memory.c:145: bad pmd ffff8800376e1f98(80000000314000e7).
+
+The "bad pmd ..." message is logged by pmd_clear_bad() before it clears
+the page's PMD table entry.
+
+ 143 void pmd_clear_bad(pmd_t *pmd)
+ 144 {
+-> 145 pmd_ERROR(*pmd);
+ 146 pmd_clear(pmd);
+ 147 }
+
+After the PMD table entry has been cleared, there is an inconsistency
+between the actual number of PMD table entries that are mapping the page
+and the page's map count (_mapcount field in struct page). When the page
+is subsequently reclaimed, __split_huge_page() detects this inconsistency.
+
+ 1381 if (mapcount != page_mapcount(page))
+ 1382 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
+ 1383 mapcount, page_mapcount(page));
+-> 1384 BUG_ON(mapcount != page_mapcount(page));
+
+The root cause of the problem is a race of two threads in a multithreaded
+process. Thread B incurs a page fault on a virtual address that has never
+been accessed (PMD entry is zero) while Thread A is executing an madvise()
+system call on a virtual address within the same 2 MB (huge page) range.
+
+ virtual address space
+ .---------------------.
+ | |
+ | |
+ .-|---------------------|
+ | | |
+ | | |<-- B(fault)
+ | | |
+ 2 MB | |/////////////////////|-.
+ huge < |/////////////////////| > A(range)
+ page | |/////////////////////|-'
+ | | |
+ | | |
+ '-|---------------------|
+ | |
+ | |
+ '---------------------'
+
+- Thread A is executing an madvise(..., MADV_DONTNEED) system call
+ on the virtual address range "A(range)" shown in the picture.
+
+sys_madvise
+ // Acquire the semaphore in shared mode.
+ down_read(&current->mm->mmap_sem)
+ ...
+ madvise_vma
+ switch (behavior)
+ case MADV_DONTNEED:
+ madvise_dontneed
+ zap_page_range
+ unmap_vmas
+ unmap_page_range
+ zap_pud_range
+ zap_pmd_range
+ //
+ // Assume that this huge page has never been accessed.
+ // I.e. content of the PMD entry is zero (not mapped).
+ //
+ if (pmd_trans_huge(*pmd)) {
+ // We don't get here due to the above assumption.
+ }
+ //
+ // Assume that Thread B incurred a page fault and
+ .---------> // sneaks in here as shown below.
+ | //
+ | if (pmd_none_or_clear_bad(pmd))
+ | {
+ | if (unlikely(pmd_bad(*pmd)))
+ | pmd_clear_bad
+ | {
+ | pmd_ERROR
+ | // Log "bad pmd ..." message here.
+ | pmd_clear
+ | // Clear the page's PMD entry.
+ | // Thread B incremented the map count
+ | // in page_add_new_anon_rmap(), but
+ | // now the page is no longer mapped
+ | // by a PMD entry (-> inconsistency).
+ | }
+ | }
+ |
+ v
+- Thread B is handling a page fault on virtual address "B(fault)" shown
+ in the picture.
+
+...
+do_page_fault
+ __do_page_fault
+ // Acquire the semaphore in shared mode.
+ down_read_trylock(&mm->mmap_sem)
+ ...
+ handle_mm_fault
+ if (pmd_none(*pmd) && transparent_hugepage_enabled(vma))
+ // We get here due to the above assumption (PMD entry is zero).
+ do_huge_pmd_anonymous_page
+ alloc_hugepage_vma
+ // Allocate a new transparent huge page here.
+ ...
+ __do_huge_pmd_anonymous_page
+ ...
+ spin_lock(&mm->page_table_lock)
+ ...
+ page_add_new_anon_rmap
+ // Here we increment the page's map count (starts at -1).
+ atomic_set(&page->_mapcount, 0)
+ set_pmd_at
+ // Here we set the page's PMD entry which will be cleared
+ // when Thread A calls pmd_clear_bad().
+ ...
+ spin_unlock(&mm->page_table_lock)
+
+The mmap_sem does not prevent the race because both threads are acquiring
+it in shared mode (down_read). Thread B holds the page_table_lock while
+the page's map count and PMD table entry are updated. However, Thread A
+does not synchronize on that lock.
+====== end quote =======
+
+Reported-by: Ulrich Obergfell <uobergfe@redhat.com>
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+---
+ arch/x86/kernel/vm86_32.c | 2 +
+ fs/proc/task_mmu.c | 9 ++++++
+ include/asm-generic/pgtable.h | 57 +++++++++++++++++++++++++++++++++++++++++
+ mm/memcontrol.c | 4 +++
+ mm/memory.c | 14 ++++++++--
+ mm/mempolicy.c | 2 +-
+ mm/mincore.c | 2 +-
+ mm/pagewalk.c | 2 +-
+ mm/swapfile.c | 4 +--
+ 9 files changed, 87 insertions(+), 9 deletions(-)
+
+diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
+index b466cab..328cb37 100644
+--- a/arch/x86/kernel/vm86_32.c
++++ b/arch/x86/kernel/vm86_32.c
+@@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
+ spinlock_t *ptl;
+ int i;
+
++ down_write(&mm->mmap_sem);
+ pgd = pgd_offset(mm, 0xA0000);
+ if (pgd_none_or_clear_bad(pgd))
+ goto out;
+@@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
+ }
+ pte_unmap_unlock(pte, ptl);
+ out:
++ up_write(&mm->mmap_sem);
+ flush_tlb();
+ }
+
+diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
+index 7dcd2a2..3efa725 100644
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -409,6 +409,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ } else {
+ spin_unlock(&walk->mm->page_table_lock);
+ }
++
++ if (pmd_trans_unstable(pmd))
++ return 0;
+ /*
+ * The mmap_sem held all the way back in m_start() is what
+ * keeps khugepaged out of here and from collapsing things
+@@ -507,6 +510,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
+ struct page *page;
+
+ split_huge_page_pmd(walk->mm, pmd);
++ if (pmd_trans_unstable(pmd))
++ return 0;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+@@ -670,6 +675,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ int err = 0;
+
+ split_huge_page_pmd(walk->mm, pmd);
++ if (pmd_trans_unstable(pmd))
++ return 0;
+
+ /* find the first VMA at or above 'addr' */
+ vma = find_vma(walk->mm, addr);
+@@ -961,6 +968,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
+ spin_unlock(&walk->mm->page_table_lock);
+ }
+
++ if (pmd_trans_unstable(pmd))
++ return 0;
+ orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ do {
+ struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
+diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
+index 76bff2b..10f8291 100644
+--- a/include/asm-generic/pgtable.h
++++ b/include/asm-generic/pgtable.h
+@@ -443,6 +443,63 @@ static inline int pmd_write(pmd_t pmd)
+ #endif /* __HAVE_ARCH_PMD_WRITE */
+ #endif
+
++/*
++ * This function is meant to be used by sites walking pagetables with
++ * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
++ * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd
++ * into a null pmd and the transhuge page fault can convert a null pmd
++ * into an hugepmd or into a regular pmd (if the hugepage allocation
++ * fails). While holding the mmap_sem in read mode the pmd becomes
++ * stable and stops changing under us only if it's not null and not a
++ * transhuge pmd. When those races occurs and this function makes a
++ * difference vs the standard pmd_none_or_clear_bad, the result is
++ * undefined so behaving like if the pmd was none is safe (because it
++ * can return none anyway). The compiler level barrier() is critically
++ * important to compute the two checks atomically on the same pmdval.
++ */
++static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
++{
++ /* depend on compiler for an atomic pmd read */
++ pmd_t pmdval = *pmd;
++ /*
++ * The barrier will stabilize the pmdval in a register or on
++ * the stack so that it will stop changing under the code.
++ */
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++ barrier();
++#endif
++ if (pmd_none(pmdval))
++ return 1;
++ if (unlikely(pmd_bad(pmdval))) {
++ if (!pmd_trans_huge(pmdval))
++ pmd_clear_bad(pmd);
++ return 1;
++ }
++ return 0;
++}
++
++/*
++ * This is a noop if Transparent Hugepage Support is not built into
++ * the kernel. Otherwise it is equivalent to
++ * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in
++ * places that already verified the pmd is not none and they want to
++ * walk ptes while holding the mmap sem in read mode (write mode don't
++ * need this). If THP is not enabled, the pmd can't go away under the
++ * code even if MADV_DONTNEED runs, but if THP is enabled we need to
++ * run a pmd_trans_unstable before walking the ptes after
++ * split_huge_page_pmd returns (because it may have run when the pmd
++ * become null, but then a page fault can map in a THP and not a
++ * regular page).
++ */
++static inline int pmd_trans_unstable(pmd_t *pmd)
++{
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++ return pmd_none_or_trans_huge_or_clear_bad(pmd);
++#else
++ return 0;
++#endif
++}
++
+ #endif /* !__ASSEMBLY__ */
+
+ #endif /* _ASM_GENERIC_PGTABLE_H */
+diff --git a/mm/memcontrol.c b/mm/memcontrol.c
+index d0e57a3..67b0578 100644
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -5193,6 +5193,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
+ spinlock_t *ptl;
+
+ split_huge_page_pmd(walk->mm, pmd);
++ if (pmd_trans_unstable(pmd))
++ return 0;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE)
+@@ -5355,6 +5357,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
+ spinlock_t *ptl;
+
+ split_huge_page_pmd(walk->mm, pmd);
++ if (pmd_trans_unstable(pmd))
++ return 0;
+ retry:
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; addr += PAGE_SIZE) {
+diff --git a/mm/memory.c b/mm/memory.c
+index fa2f04e..e3090fc 100644
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -1251,12 +1251,20 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
+ VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
+ split_huge_page_pmd(vma->vm_mm, pmd);
+ } else if (zap_huge_pmd(tlb, vma, pmd, addr))
+- continue;
++ goto next;
+ /* fall through */
+ }
+- if (pmd_none_or_clear_bad(pmd))
+- continue;
++ /*
++ * Here there can be other concurrent MADV_DONTNEED or
++ * trans huge page faults running, and if the pmd is
++ * none or trans huge it can change under us. This is
++ * because MADV_DONTNEED holds the mmap_sem in read
++ * mode.
++ */
++ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
++ goto next;
+ next = zap_pte_range(tlb, vma, pmd, addr, next, details);
++ next:
+ cond_resched();
+ } while (pmd++, addr = next, addr != end);
+
+diff --git a/mm/mempolicy.c b/mm/mempolicy.c
+index 47296fe..0a37570 100644
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+ do {
+ next = pmd_addr_end(addr, end);
+ split_huge_page_pmd(vma->vm_mm, pmd);
+- if (pmd_none_or_clear_bad(pmd))
++ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+ continue;
+ if (check_pte_range(vma, pmd, addr, next, nodes,
+ flags, private))
+diff --git a/mm/mincore.c b/mm/mincore.c
+index 636a868..936b4ce 100644
+--- a/mm/mincore.c
++++ b/mm/mincore.c
+@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+ }
+ /* fall through */
+ }
+- if (pmd_none_or_clear_bad(pmd))
++ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+ mincore_unmapped_range(vma, addr, next, vec);
+ else
+ mincore_pte_range(vma, pmd, addr, next, vec);
+diff --git a/mm/pagewalk.c b/mm/pagewalk.c
+index 2f5cf10..aa9701e 100644
+--- a/mm/pagewalk.c
++++ b/mm/pagewalk.c
+@@ -59,7 +59,7 @@ again:
+ continue;
+
+ split_huge_page_pmd(walk->mm, pmd);
+- if (pmd_none_or_clear_bad(pmd))
++ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+ goto again;
+ err = walk_pte_range(pmd, addr, next, walk);
+ if (err)
+diff --git a/mm/swapfile.c b/mm/swapfile.c
+index d999f09..f31b29d 100644
+--- a/mm/swapfile.c
++++ b/mm/swapfile.c
+@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+- if (unlikely(pmd_trans_huge(*pmd)))
+- continue;
+- if (pmd_none_or_clear_bad(pmd))
++ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+ continue;
+ ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+ if (ret)
+
+--
+To unsubscribe, send a message with 'unsubscribe linux-mm' in
+the body to majordomo@kvack.org. For more info on Linux MM,
+see: http://www.linux-mm.org/ .
+Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
+Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> \ No newline at end of file