diff options
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- | mm/memory.c | 111 |
1 files changed, 99 insertions, 12 deletions
diff --git a/mm/memory.c b/mm/memory.c index bbab1e37055..2302d228fe0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -311,6 +311,21 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) if (!new) return -ENOMEM; + /* + * Ensure all pte setup (eg. pte page lock and page clearing) are + * visible before the pte is made visible to other CPUs by being + * put into page tables. + * + * The other side of the story is the pointer chasing in the page + * table walking code (when walking the page table without locking; + * ie. most of the time). Fortunately, these data accesses consist + * of a chain of data-dependent loads, meaning most CPUs (alpha + * being the notable exception) will already guarantee loads are + * seen in-order. See the alpha page table accessors for the + * smp_read_barrier_depends() barriers in page table walking code. + */ + smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ + spin_lock(&mm->page_table_lock); if (!pmd_present(*pmd)) { /* Has another populated it ? */ mm->nr_ptes++; @@ -329,6 +344,8 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) if (!new) return -ENOMEM; + smp_wmb(); /* See comment in __pte_alloc */ + spin_lock(&init_mm.page_table_lock); if (!pmd_present(*pmd)) { /* Has another populated it ? */ pmd_populate_kernel(&init_mm, pmd, new); @@ -969,7 +986,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto no_page_table; pmd = pmd_offset(pud, address); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + if (pmd_none(*pmd)) goto no_page_table; if (pmd_huge(*pmd)) { @@ -978,18 +995,19 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto out; } + if (unlikely(pmd_bad(*pmd))) + goto no_page_table; + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!ptep) - goto out; pte = *ptep; if (!pte_present(pte)) - goto unlock; + goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; page = vm_normal_page(vma, address, pte); if (unlikely(!page)) - goto unlock; + goto bad_page; if (flags & FOLL_GET) get_page(page); @@ -1004,6 +1022,15 @@ unlock: out: return page; +bad_page: + pte_unmap_unlock(ptep, ptl); + return ERR_PTR(-EFAULT); + +no_page: + pte_unmap_unlock(ptep, ptl); + if (!pte_none(pte)) + return page; + /* Fall through to ZERO_PAGE handling */ no_page_table: /* * When core dumping an enormous anonymous area that nobody @@ -1018,6 +1045,26 @@ no_page_table: return page; } +/* Can we do the FOLL_ANON optimization? */ +static inline int use_zero_page(struct vm_area_struct *vma) +{ + /* + * We don't want to optimize FOLL_ANON for make_pages_present() + * when it tries to page in a VM_LOCKED region. As to VM_SHARED, + * we want to get the page from the page tables to make sure + * that we serialize and update with any other user of that + * mapping. + */ + if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) + return 0; + /* + * And if we have a fault or a nopfn routine, it's not an + * anonymous region. + */ + return !vma->vm_ops || + (!vma->vm_ops->fault && !vma->vm_ops->nopfn); +} + int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) @@ -1092,8 +1139,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, foll_flags = FOLL_TOUCH; if (pages) foll_flags |= FOLL_GET; - if (!write && !(vma->vm_flags & VM_LOCKED) && - (!vma->vm_ops || !vma->vm_ops->fault)) + if (!write && use_zero_page(vma)) foll_flags |= FOLL_ANON; do { @@ -1105,7 +1151,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * be processed until returning to user space. */ if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) - return -ENOMEM; + return i ? i : -ENOMEM; if (write) foll_flags |= FOLL_WRITE; @@ -1139,6 +1185,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, cond_resched(); } + if (IS_ERR(page)) + return i ? i : PTR_ERR(page); if (pages) { pages[i] = page; @@ -1649,8 +1697,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *dirty_page = NULL; old_page = vm_normal_page(vma, address, orig_pte); - if (!old_page) + if (!old_page) { + /* + * VM_MIXEDMAP !pfn_valid() case + * + * We should not cow pages in a shared writeable mapping. + * Just mark the pages writable as we can't do any dirty + * accounting on raw pfn maps. + */ + if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == + (VM_WRITE|VM_SHARED)) + goto reuse; goto gotten; + } /* * Take out anonymous pages first, anonymous shared vmas are @@ -1703,6 +1762,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } if (reuse) { +reuse: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -1737,7 +1797,6 @@ gotten: page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { - page_remove_rmap(old_page, vma); if (!PageAnon(old_page)) { dec_mm_counter(mm, file_rss); inc_mm_counter(mm, anon_rss); @@ -1759,6 +1818,32 @@ gotten: lru_cache_add_active(new_page); page_add_new_anon_rmap(new_page, vma, address); + if (old_page) { + /* + * Only after switching the pte to the new page may + * we remove the mapcount here. Otherwise another + * process may come and find the rmap count decremented + * before the pte is switched to the new page, and + * "reuse" the old page writing into it while our pte + * here still points into it and can be read by other + * threads. + * + * The critical issue is to order this + * page_remove_rmap with the ptp_clear_flush above. + * Those stores are ordered by (if nothing else,) + * the barrier present in the atomic_add_negative + * in page_remove_rmap. + * + * Then the TLB flush in ptep_clear_flush ensures that + * no process can access the old page before the + * decremented mapcount is visible. And the old page + * cannot be reused until after the decremented + * mapcount is visible. So transitively, TLBs to + * old page will be flushed before it can be reused. + */ + page_remove_rmap(old_page, vma); + } + /* Free the old page.. */ new_page = old_page; ret |= VM_FAULT_WRITE; @@ -2275,8 +2360,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, vmf.flags = flags; vmf.page = NULL; - BUG_ON(vma->vm_flags & VM_PFNMAP); - ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; @@ -2616,6 +2699,8 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) if (!new) return -ENOMEM; + smp_wmb(); /* See comment in __pte_alloc */ + spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) /* Another has populated it */ pud_free(mm, new); @@ -2637,6 +2722,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) if (!new) return -ENOMEM; + smp_wmb(); /* See comment in __pte_alloc */ + spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_4LEVEL_HACK if (pud_present(*pud)) /* Another has populated it */ |