summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJustin M. Forbes <jforbes@redhat.com>2016-04-28 14:26:27 -0500
committerJustin M. Forbes <jforbes@redhat.com>2016-04-28 14:26:27 -0500
commit2f85fe0f9e1e46f919df492c65c4ccf4ff056c45 (patch)
tree3d12a2126fe71b3c22deedd3413c90b9cb6a4d7a
parentf9f9100e3cd9d1eb19240a7abd295bcf9655eeb2 (diff)
downloadkernel-2f85fe0f9e1e46f919df492c65c4ccf4ff056c45.tar.gz
kernel-2f85fe0f9e1e46f919df492c65c4ccf4ff056c45.tar.xz
kernel-2f85fe0f9e1e46f919df492c65c4ccf4ff056c45.zip
Fix KVM with THP corruption (rhbz 1331092)
-rw-r--r--kernel.spec6
-rw-r--r--mm-thp-kvm-fix-memory-corruption-in-KVM-with-THP-ena.patch126
2 files changed, 132 insertions, 0 deletions
diff --git a/kernel.spec b/kernel.spec
index a9a092a72..240812438 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -617,6 +617,9 @@ Patch701: antenna_select.patch
# Stop splashing crap about broken firmware BGRT
Patch702: x86-efi-bgrt-Switch-all-pr_err-to-pr_debug-for-inval.patch
+#rhbz 1331092
+Patch703: mm-thp-kvm-fix-memory-corruption-in-KVM-with-THP-ena.patch
+
# END OF PATCH DEFINITIONS
%endif
@@ -2142,6 +2145,9 @@ fi
#
#
%changelog
+* Thu Apr 28 2016 Justin M. Forbes <jforbes@fedoraproject.org>
+- Fix KVM with THP corruption (rhbz 1331092)
+
* Thu Apr 28 2016 Josh Boyer <jwboyer@fedoraproject.org> - 4.6.0-0.rc5.git2.1
- Linux v4.6-rc5-89-gb75a2bf899b6
diff --git a/mm-thp-kvm-fix-memory-corruption-in-KVM-with-THP-ena.patch b/mm-thp-kvm-fix-memory-corruption-in-KVM-with-THP-ena.patch
new file mode 100644
index 000000000..2f90ec8e5
--- /dev/null
+++ b/mm-thp-kvm-fix-memory-corruption-in-KVM-with-THP-ena.patch
@@ -0,0 +1,126 @@
+From 94f984ff563d1777652b822d7a282cacc1e481c2 Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli <aarcange@redhat.com>
+Date: Wed, 27 Apr 2016 12:04:46 -0500
+Subject: [PATCH] mm: thp: kvm: fix memory corruption in KVM with THP enabled
+
+After the THP refcounting change, obtaining a compound page from
+get_user_pages() no longer allows us to assume the entire compound
+page is immediately mappable from a secondary MMU.
+
+A secondary MMU doesn't want to call get_user_pages() more than once
+for each compound page, in order to know if it can map the whole
+compound page. So a secondary MMU needs to know from a single
+get_user_pages() invocation when it can map immediately the entire
+compound page to avoid a flood of unnecessary secondary MMU faults and
+spurious atomic_inc()/atomic_dec() (pages don't have to be pinned by
+MMU notifier users).
+
+Ideally instead of the page->_mapcount < 0 check, get_user_pages()
+should return the granularity of the "page" mapping in the "mm" passed
+to get_user_pages(). However it's a non-trivial change to pass the "pmd"
+status belonging to the "mm" walked by get_user_pages up the stack (up
+to the caller of get_user_pages). So the fix just checks if there is
+not a single pte mapping on the page returned by get_user_pages, and
+in turn if the caller can assume that the whole compound page is
+mapped in the current "mm" (in a pmd_trans_huge()). In such case the
+entire compound page is safe to map into the secondary MMU without
+additional get_user_pages() calls on the surrounding tail/head
+pages. In addition to being faster, not having to run other
+get_user_pages() calls also reduces the memory footprint of the
+secondary MMU fault in case the pmd split happened as result of memory
+pressure.
+
+Without this fix after a MADV_DONTNEED (like invoked by QEMU during
+postcopy live migration or ballooning) or after generic swapping (with
+a failure in split_huge_page() that would only result in pmd splitting
+and not a physical page split), KVM would map the whole compound page
+into the shadow pagetables, even though regular faults or userfaults (like
+UFFDIO_COPY) may map regular pages into the primary MMU as result of
+the pte faults, leading to the guest mode and userland mode going out
+of sync and not working on the same memory at all times.
+
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+---
+ arch/arm/kvm/mmu.c | 2 +-
+ arch/x86/kvm/mmu.c | 4 ++--
+ include/linux/page-flags.h | 22 ++++++++++++++++++++++
+ 3 files changed, 25 insertions(+), 3 deletions(-)
+
+diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
+index aba61fd..8dafe97 100644
+--- a/arch/arm/kvm/mmu.c
++++ b/arch/arm/kvm/mmu.c
+@@ -997,7 +997,7 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
+ kvm_pfn_t pfn = *pfnp;
+ gfn_t gfn = *ipap >> PAGE_SHIFT;
+
+- if (PageTransCompound(pfn_to_page(pfn))) {
++ if (PageTransCompoundMap(pfn_to_page(pfn))) {
+ unsigned long mask;
+ /*
+ * The address we faulted on is backed by a transparent huge
+diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
+index 1e7a49b..3a371f7 100644
+--- a/arch/x86/kvm/mmu.c
++++ b/arch/x86/kvm/mmu.c
+@@ -2767,7 +2767,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
+ */
+ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
+ level == PT_PAGE_TABLE_LEVEL &&
+- PageTransCompound(pfn_to_page(pfn)) &&
++ PageTransCompoundMap(pfn_to_page(pfn)) &&
+ !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
+ unsigned long mask;
+ /*
+@@ -4621,7 +4621,7 @@ restart:
+ */
+ if (sp->role.direct &&
+ !kvm_is_reserved_pfn(pfn) &&
+- PageTransCompound(pfn_to_page(pfn))) {
++ PageTransCompoundMap(pfn_to_page(pfn))) {
+ drop_spte(kvm, sptep);
+ need_tlb_flush = 1;
+ goto restart;
+diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
+index 19724e6..522bd6d 100644
+--- a/include/linux/page-flags.h
++++ b/include/linux/page-flags.h
+@@ -517,6 +517,27 @@ static inline int PageTransCompound(struct page *page)
+ }
+
+ /*
++ * PageTransCompoundMap is the same as PageTransCompound, but it also
++ * guarantees the primary MMU has the entire compound page mapped
++ * through pmd_trans_huge, which in turn guarantees the secondary MMUs
++ * can also map the entire compound page. This allows the secondary
++ * MMUs to call get_user_pages() only once for each compound page and
++ * to immediately map the entire compound page with a single secondary
++ * MMU fault. If there will be a pmd split later, the secondary MMUs
++ * will get an update through the MMU notifier invalidation through
++ * split_huge_pmd().
++ *
++ * Unlike PageTransCompound, this is safe to be called only while
++ * split_huge_pmd() cannot run from under us, like if protected by the
++ * MMU notifier, otherwise it may result in page->_mapcount < 0 false
++ * positives.
++ */
++static inline int PageTransCompoundMap(struct page *page)
++{
++ return PageTransCompound(page) && atomic_read(&page->_mapcount) < 0;
++}
++
++/*
+ * PageTransTail returns true for both transparent huge pages
+ * and hugetlbfs pages, so it should only be called when it's known
+ * that hugetlbfs pages aren't involved.
+@@ -559,6 +580,7 @@ static inline int TestClearPageDoubleMap(struct page *page)
+ #else
+ TESTPAGEFLAG_FALSE(TransHuge)
+ TESTPAGEFLAG_FALSE(TransCompound)
++TESTPAGEFLAG_FALSE(TransCompoundMap)
+ TESTPAGEFLAG_FALSE(TransTail)
+ TESTPAGEFLAG_FALSE(DoubleMap)
+ TESTSETFLAG_FALSE(DoubleMap)
+--
+2.7.4
+