diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 6 | ||||
-rw-r--r-- | mm/bootmem.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 2 | ||||
-rw-r--r-- | mm/fremap.c | 52 | ||||
-rw-r--r-- | mm/hugetlb.c | 6 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memory.c | 404 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 2 | ||||
-rw-r--r-- | mm/mempolicy.c | 16 | ||||
-rw-r--r-- | mm/mmap.c | 15 | ||||
-rw-r--r-- | mm/mprotect.c | 8 | ||||
-rw-r--r-- | mm/mremap.c | 2 | ||||
-rw-r--r-- | mm/msync.c | 12 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/page-writeback.c | 1 | ||||
-rw-r--r-- | mm/page_alloc.c | 327 | ||||
-rw-r--r-- | mm/rmap.c | 60 | ||||
-rw-r--r-- | mm/slab.c | 57 | ||||
-rw-r--r-- | mm/swap.c | 3 | ||||
-rw-r--r-- | mm/thrash.c | 10 | ||||
-rw-r--r-- | mm/truncate.c | 6 | ||||
-rw-r--r-- | mm/vmscan.c | 35 |
22 files changed, 604 insertions, 426 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index ae9ce6b73e8..21eb51d4da8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -125,12 +125,10 @@ comment "Memory hotplug is currently incompatible with Software Suspend" # space can be handled with less contention: split it at this NR_CPUS. # Default to 4 for wider testing, though 8 might be more appropriate. # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. -# PA-RISC's debug spinlock_t is too large for the 32-bit struct page. -# ARM26 and SPARC32 and PPC64 may use one page for multiple page tables. +# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. # config SPLIT_PTLOCK_CPUS int default "4096" if ARM && !CPU_CACHE_VIPT - default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT - default "4096" if ARM26 || SPARC32 || PPC64 + default "4096" if PARISC && !PA20 default "4" diff --git a/mm/bootmem.c b/mm/bootmem.c index e8c567177dc..16b9465eb4e 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -204,6 +204,8 @@ restart_scan: unsigned long j; i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i); i = ALIGN(i, incr); + if (i >= eidx) + break; if (test_bit(i, bdata->node_bootmem_map)) continue; for (j = i + 1; j < i + areasize; ++j) { diff --git a/mm/filemap.c b/mm/filemap.c index 5d6e4c2000d..33a28bfde15 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -134,7 +134,7 @@ static int sync_page(void *word) struct address_space *mapping; struct page *page; - page = container_of((page_flags_t *)word, struct page, flags); + page = container_of((unsigned long *)word, struct page, flags); /* * page_mapping() is being called without PG_locked held. diff --git a/mm/fremap.c b/mm/fremap.c index d862be3bc3e..9f381e58bf4 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -27,24 +27,20 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page = NULL; if (pte_present(pte)) { - unsigned long pfn = pte_pfn(pte); - flush_cache_page(vma, addr, pfn); + flush_cache_page(vma, addr, pte_pfn(pte)); pte = ptep_clear_flush(vma, addr, ptep); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, pte, addr); - goto out; + page = vm_normal_page(vma, addr, pte); + if (page) { + if (pte_dirty(pte)) + set_page_dirty(page); + page_remove_rmap(page); + page_cache_release(page); } - page = pfn_to_page(pfn); - if (pte_dirty(pte)) - set_page_dirty(page); - page_remove_rmap(page); - page_cache_release(page); } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear(mm, addr, ptep); } -out: return !!page; } @@ -59,22 +55,10 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, pgoff_t size; int err = -ENOMEM; pte_t *pte; - pmd_t *pmd; - pud_t *pud; - pgd_t *pgd; pte_t pte_val; spinlock_t *ptl; - BUG_ON(vma->vm_flags & VM_RESERVED); - - pgd = pgd_offset(mm, addr); - pud = pud_alloc(mm, pgd, addr); - if (!pud) - goto out; - pmd = pmd_alloc(mm, pud, addr); - if (!pmd) - goto out; - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; @@ -116,22 +100,10 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, { int err = -ENOMEM; pte_t *pte; - pmd_t *pmd; - pud_t *pud; - pgd_t *pgd; pte_t pte_val; spinlock_t *ptl; - BUG_ON(vma->vm_flags & VM_RESERVED); - - pgd = pgd_offset(mm, addr); - pud = pud_alloc(mm, pgd, addr); - if (!pud) - goto out; - pmd = pmd_alloc(mm, pud, addr); - if (!pmd) - goto out; - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; @@ -204,12 +176,10 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, * Make sure the vma is shared, that it supports prefaulting, * and that the remapped range is valid and fully within * the single existing vma. vm_private_data is used as a - * swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED - * or VM_LOCKED, but VM_LOCKED could be revoked later on). + * swapout cursor in a VM_NONLINEAR vma. */ if (vma && (vma->vm_flags & VM_SHARED) && - (!vma->vm_private_data || - (vma->vm_flags & (VM_NONLINEAR|VM_RESERVED))) && + (!vma->vm_private_data || (vma->vm_flags & VM_NONLINEAR)) && vma->vm_ops && vma->vm_ops->populate && end > start && start >= vma->vm_start && end <= vma->vm_end) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 728e9bda12e..3e52df7c471 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,6 +22,10 @@ unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; static unsigned int free_huge_pages_node[MAX_NUMNODES]; + +/* + * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages + */ static DEFINE_SPINLOCK(hugetlb_lock); static void enqueue_huge_page(struct page *page) @@ -61,8 +65,10 @@ static struct page *alloc_fresh_huge_page(void) HUGETLB_PAGE_ORDER); nid = (nid + 1) % num_online_nodes(); if (page) { + spin_lock(&hugetlb_lock); nr_huge_pages++; nr_huge_pages_node[page_to_nid(page)]++; + spin_unlock(&hugetlb_lock); } return page; } diff --git a/mm/madvise.c b/mm/madvise.c index 17aaf3e1644..2b7cf0400a2 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma, unsigned long start, unsigned long end) { *prev = vma; - if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED)) + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) return -EINVAL; if (unlikely(vma->vm_flags & VM_NONLINEAR)) { diff --git a/mm/memory.c b/mm/memory.c index 0f60baf6f69..d8dde07a365 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -333,9 +333,9 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) } /* - * This function is called to print an error when a pte in a - * !VM_RESERVED region is found pointing to an invalid pfn (which - * is an error. + * This function is called to print an error when a bad pte + * is found. For example, we might have a PFN-mapped pte in + * a region that doesn't allow it. * * The calling function must still handle the error. */ @@ -349,6 +349,66 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) dump_stack(); } +static inline int is_cow_mapping(unsigned int flags) +{ + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; +} + +/* + * This function gets the "struct page" associated with a pte. + * + * NOTE! Some mappings do not have "struct pages". A raw PFN mapping + * will have each page table entry just pointing to a raw page frame + * number, and as far as the VM layer is concerned, those do not have + * pages associated with them - even if the PFN might point to memory + * that otherwise is perfectly fine and has a "struct page". + * + * The way we recognize those mappings is through the rules set up + * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, + * and the vm_pgoff will point to the first PFN mapped: thus every + * page that is a raw mapping will always honor the rule + * + * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) + * + * and if that isn't true, the page has been COW'ed (in which case it + * _does_ have a "struct page" associated with it even if it is in a + * VM_PFNMAP range). + */ +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + if (vma->vm_flags & VM_PFNMAP) { + unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + + /* + * Add some anal sanity checks for now. Eventually, + * we should just do "return pfn_to_page(pfn)", but + * in the meantime we check that we get a valid pfn, + * and that the resulting page looks ok. + * + * Remove this test eventually! + */ + if (unlikely(!pfn_valid(pfn))) { + print_bad_pte(vma, pte, addr); + return NULL; + } + + /* + * NOTE! We still have PageReserved() pages in the page + * tables. + * + * The PAGE_ZERO() pages and various VDSO mappings can + * cause them to exist. + */ + return pfn_to_page(pfn); +} + /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range @@ -363,7 +423,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; struct page *page; - unsigned long pfn; /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { @@ -381,28 +440,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_set_pte; } - /* If the region is VM_RESERVED, the mapping is not - * mapped via rmap - duplicate the pte as is. - */ - if (vm_flags & VM_RESERVED) - goto out_set_pte; - - pfn = pte_pfn(pte); - /* If the pte points outside of valid memory but - * the region is not VM_RESERVED, we have a problem. - */ - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, pte, addr); - goto out_set_pte; /* try to do something sane */ - } - - page = pfn_to_page(pfn); - /* * If it's a COW mapping, write protect it both * in the parent and the child */ - if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) { + if (is_cow_mapping(vm_flags)) { ptep_set_wrprotect(src_mm, addr, src_pte); pte = *src_pte; } @@ -414,9 +456,13 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); - get_page(page); - page_dup_rmap(page); - rss[!!PageAnon(page)]++; + + page = vm_normal_page(vma, addr, pte); + if (page) { + get_page(page); + page_dup_rmap(page); + rss[!!PageAnon(page)]++; + } out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); @@ -528,7 +574,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_RESERVED))) { + if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { if (!vma->anon_vma) return 0; } @@ -549,10 +595,10 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, return 0; } -static void zap_pte_range(struct mmu_gather *tlb, +static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, - struct zap_details *details) + long *zap_work, struct zap_details *details) { struct mm_struct *mm = tlb->mm; pte_t *pte; @@ -563,17 +609,16 @@ static void zap_pte_range(struct mmu_gather *tlb, pte = pte_offset_map_lock(mm, pmd, addr, &ptl); do { pte_t ptent = *pte; - if (pte_none(ptent)) + if (pte_none(ptent)) { + (*zap_work)--; continue; + } if (pte_present(ptent)) { - struct page *page = NULL; - if (!(vma->vm_flags & VM_RESERVED)) { - unsigned long pfn = pte_pfn(ptent); - if (unlikely(!pfn_valid(pfn))) - print_bad_pte(vma, ptent, addr); - else - page = pfn_to_page(pfn); - } + struct page *page; + + (*zap_work) -= PAGE_SIZE; + + page = vm_normal_page(vma, addr, ptent); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to @@ -624,16 +669,18 @@ static void zap_pte_range(struct mmu_gather *tlb, if (!pte_file(ptent)) free_swap_and_cache(pte_to_swp_entry(ptent)); pte_clear_full(mm, addr, pte, tlb->fullmm); - } while (pte++, addr += PAGE_SIZE, addr != end); + } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); add_mm_rss(mm, file_rss, anon_rss); pte_unmap_unlock(pte - 1, ptl); + + return addr; } -static inline void zap_pmd_range(struct mmu_gather *tlb, +static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - struct zap_details *details) + long *zap_work, struct zap_details *details) { pmd_t *pmd; unsigned long next; @@ -641,16 +688,21 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_clear_bad(pmd)) { + (*zap_work)--; continue; - zap_pte_range(tlb, vma, pmd, addr, next, details); - } while (pmd++, addr = next, addr != end); + } + next = zap_pte_range(tlb, vma, pmd, addr, next, + zap_work, details); + } while (pmd++, addr = next, (addr != end && *zap_work > 0)); + + return addr; } -static inline void zap_pud_range(struct mmu_gather *tlb, +static inline unsigned long zap_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - struct zap_details *details) + long *zap_work, struct zap_details *details) { pud_t *pud; unsigned long next; @@ -658,15 +710,21 @@ static inline void zap_pud_range(struct mmu_gather *tlb, pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) + if (pud_none_or_clear_bad(pud)) { + (*zap_work)--; continue; - zap_pmd_range(tlb, vma, pud, addr, next, details); - } while (pud++, addr = next, addr != end); + } + next = zap_pmd_range(tlb, vma, pud, addr, next, + zap_work, details); + } while (pud++, addr = next, (addr != end && *zap_work > 0)); + + return addr; } -static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, +static unsigned long unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, unsigned long end, - struct zap_details *details) + long *zap_work, struct zap_details *details) { pgd_t *pgd; unsigned long next; @@ -679,11 +737,16 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) + if (pgd_none_or_clear_bad(pgd)) { + (*zap_work)--; continue; - zap_pud_range(tlb, vma, pgd, addr, next, details); - } while (pgd++, addr = next, addr != end); + } + next = zap_pud_range(tlb, vma, pgd, addr, next, + zap_work, details); + } while (pgd++, addr = next, (addr != end && *zap_work > 0)); tlb_end_vma(tlb, vma); + + return addr; } #ifdef CONFIG_PREEMPT @@ -724,7 +787,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details) { - unsigned long zap_bytes = ZAP_BLOCK_SIZE; + long zap_work = ZAP_BLOCK_SIZE; unsigned long tlb_start = 0; /* For tlb_finish_mmu */ int tlb_start_valid = 0; unsigned long start = start_addr; @@ -745,27 +808,25 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, *nr_accounted += (end - start) >> PAGE_SHIFT; while (start != end) { - unsigned long block; - if (!tlb_start_valid) { tlb_start = start; tlb_start_valid = 1; } - if (is_vm_hugetlb_page(vma)) { - block = end - start; + if (unlikely(is_vm_hugetlb_page(vma))) { unmap_hugepage_range(vma, start, end); - } else { - block = min(zap_bytes, end - start); - unmap_page_range(*tlbp, vma, start, - start + block, details); + zap_work -= (end - start) / + (HPAGE_SIZE / PAGE_SIZE); + start = end; + } else + start = unmap_page_range(*tlbp, vma, + start, end, &zap_work, details); + + if (zap_work > 0) { + BUG_ON(start != end); + break; } - start += block; - zap_bytes -= block; - if ((long)zap_bytes > 0) - continue; - tlb_finish_mmu(*tlbp, tlb_start, start); if (need_resched() || @@ -779,7 +840,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); tlb_start_valid = 0; - zap_bytes = ZAP_BLOCK_SIZE; + zap_work = ZAP_BLOCK_SIZE; } } out: @@ -813,7 +874,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, /* * Do a quick page-table lookup for a single page. */ -struct page *follow_page(struct mm_struct *mm, unsigned long address, +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { pgd_t *pgd; @@ -821,8 +882,8 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address, pmd_t *pmd; pte_t *ptep, pte; spinlock_t *ptl; - unsigned long pfn; struct page *page; + struct mm_struct *mm = vma->vm_mm; page = follow_huge_addr(mm, address, flags & FOLL_WRITE); if (!IS_ERR(page)) { @@ -858,11 +919,10 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address, goto unlock; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; - pfn = pte_pfn(pte); - if (!pfn_valid(pfn)) + page = vm_normal_page(vma, address, pte); + if (unlikely(!page)) goto unlock; - page = pfn_to_page(pfn); if (flags & FOLL_GET) get_page(page); if (flags & FOLL_TOUCH) { @@ -935,8 +995,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i ? : -EFAULT; } if (pages) { - pages[i] = pte_page(*pte); - get_page(pages[i]); + struct page *page = vm_normal_page(gate_vma, start, *pte); + pages[i] = page; + if (page) + get_page(page); } pte_unmap(pte); if (vmas) @@ -947,7 +1009,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } - if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED)) + if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) || !(vm_flags & vma->vm_flags)) return i ? : -EFAULT; @@ -971,7 +1033,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, foll_flags |= FOLL_WRITE; cond_resched(); - while (!(page = follow_page(mm, start, foll_flags))) { + while (!(page = follow_page(vma, start, foll_flags))) { int ret; ret = __handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE); @@ -1091,6 +1153,86 @@ int zeromap_page_range(struct vm_area_struct *vma, return err; } +pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) +{ + pgd_t * pgd = pgd_offset(mm, addr); + pud_t * pud = pud_alloc(mm, pgd, addr); + if (pud) { + pmd_t * pmd = pmd_alloc(mm, pud, addr); + if (pmd) + return pte_alloc_map_lock(mm, pmd, addr, ptl); + } + return NULL; +} + +/* + * This is the old fallback for page remapping. + * + * For historical reasons, it only allows reserved pages. Only + * old drivers should use this, and they needed to mark their + * pages reserved for the old functions anyway. + */ +static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) +{ + int retval; + pte_t *pte; + spinlock_t *ptl; + + retval = -EINVAL; + if (PageAnon(page)) + goto out; + retval = -ENOMEM; + flush_dcache_page(page); + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + goto out; + retval = -EBUSY; + if (!pte_none(*pte)) + goto out_unlock; + + /* Ok, finally just insert the thing.. */ + get_page(page); + inc_mm_counter(mm, file_rss); + page_add_file_rmap(page); + set_pte_at(mm, addr, pte, mk_pte(page, prot)); + + retval = 0; +out_unlock: + pte_unmap_unlock(pte, ptl); +out: + return retval; +} + +/* + * This allows drivers to insert individual pages they've allocated + * into a user vma. + * + * The page has to be a nice clean _individual_ kernel allocation. + * If you allocate a compound page, you need to have marked it as + * such (__GFP_COMP), or manually just split the page up yourself + * (which is mainly an issue of doing "set_page_count(page, 1)" for + * each sub-page, and then freeing them one by one when you free + * them rather than freeing it as a compound page). + * + * NOTE! Traditionally this was done with "remap_pfn_range()" which + * took an arbitrary page protection parameter. This doesn't allow + * that. Your vma protection will have to be set up correctly, which + * means that if you want a shared writable mapping, you'd better + * ask for a shared writable mapping! + * + * The page does not need to be reserved. + */ +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) +{ + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + if (!page_count(page)) + return -EINVAL; + vma->vm_flags |= VM_INSERTPAGE; + return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_page); + /* * maps a range of physical memory into the requested pages. the old * mappings are removed. any references to nonexistent pages results @@ -1170,10 +1312,26 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). - * VM_RESERVED tells the core MM not to "manage" these pages - * (e.g. refcount, mapcount, try to swap them out). + * VM_RESERVED is specified all over the place, because + * in 2.4 it kept swapout's vma scan off this vma; but + * in 2.6 the LRU scan won't even find its pages, so this + * flag means no more than count its pages in reserved_vm, + * and omit it from core dump, even when VM_IO turned off. + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. + * + * There's a horrible special case to handle copy-on-write + * behaviour that some programs depend on. We mark the "original" + * un-COW'ed pages by matching them up with "vma->vm_pgoff". */ - vma->vm_flags |= VM_IO | VM_RESERVED; + if (is_cow_mapping(vma->vm_flags)) { + if (addr != vma->vm_start || end != vma->vm_end) + return -EINVAL; + vma->vm_pgoff = pfn; + } + + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -1228,6 +1386,33 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va) +{ + /* + * If the source page was a PFN mapping, we don't have + * a "struct page" for it. We do a best-effort copy by + * just copying from the original user address. If that + * fails, we just zero-fill it. Live with it. + */ + if (unlikely(!src)) { + void *kaddr = kmap_atomic(dst, KM_USER0); + void __user *uaddr = (void __user *)(va & PAGE_MASK); + + /* + * This really shouldn't fail, because the page is there + * in the page tables. But it might just be unreadable, + * in which case we just give up and fill the result with + * zeroes. + */ + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) + memset(kaddr, 0, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + return; + + } + copy_user_highpage(dst, src, va); +} + /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address @@ -1251,27 +1436,18 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl, pte_t orig_pte) { struct page *old_page, *new_page; - unsigned long pfn = pte_pfn(orig_pte); pte_t entry; int ret = VM_FAULT_MINOR; - BUG_ON(vma->vm_flags & VM_RESERVED); - - if (unlikely(!pfn_valid(pfn))) { - /* - * Page table corrupted: show pte and kill process. - */ - print_bad_pte(vma, orig_pte, address); - ret = VM_FAULT_OOM; - goto unlock; - } - old_page = pfn_to_page(pfn); + old_page = vm_normal_page(vma, address, orig_pte); + if (!old_page) + goto gotten; if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { int reuse = can_share_swap_page(old_page); unlock_page(old_page); if (reuse) { - flush_cache_page(vma, address, pfn); + flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); ptep_set_access_flags(vma, address, page_table, entry, 1); @@ -1286,6 +1462,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * Ok, we need to copy. Oh, well.. */ page_cache_get(old_page); +gotten: pte_unmap_unlock(page_table, ptl); if (unlikely(anon_vma_prepare(vma))) @@ -1298,7 +1475,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!new_page) goto oom; - copy_user_highpage(new_page, old_page, address); + cow_user_page(new_page, old_page, address); } /* @@ -1306,12 +1483,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, */ page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { - page_remove_rmap(old_page); - if (!PageAnon(old_page)) { + if (old_page) { + page_remove_rmap(old_page); + if (!PageAnon(old_page)) { + dec_mm_counter(mm, file_rss); + inc_mm_counter(mm, anon_rss); + } + } else inc_mm_counter(mm, anon_rss); - dec_mm_counter(mm, file_rss); - } - flush_cache_page(vma, address, pfn); + flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); ptep_establish(vma, address, page_table, entry); @@ -1324,13 +1504,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, new_page = old_page; ret |= VM_FAULT_WRITE; } - page_cache_release(new_page); - page_cache_release(old_page); + if (new_page) + page_cache_release(new_page); + if (old_page) + page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); return ret; oom: - page_cache_release(old_page); + if (old_page) + page_cache_release(old_page); return VM_FAULT_OOM; } @@ -1828,6 +2011,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, int anon = 0; pte_unmap(page_table); + BUG_ON(vma->vm_flags & VM_PFNMAP); if (vma->vm_file) { mapping = vma->vm_file->f_mapping; @@ -1903,7 +2087,7 @@ retry: inc_mm_counter(mm, anon_rss); lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); - } else if (!(vma->vm_flags & VM_RESERVED)) { + } else { inc_mm_counter(mm, file_rss); page_add_file_rmap(new_page); } @@ -2080,6 +2264,12 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) spin_unlock(&mm->page_table_lock); return 0; } +#else +/* Workaround for gcc 2.96 */ +int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + return 0; +} #endif /* __PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_PMD_FOLDED @@ -2108,6 +2298,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) spin_unlock(&mm->page_table_lock); return 0; } +#else +/* Workaround for gcc 2.96 */ +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + return 0; +} #endif /* __PAGETABLE_PMD_FOLDED */ int make_pages_present(unsigned long addr, unsigned long end) @@ -2182,7 +2378,7 @@ static int __init gate_vma_init(void) gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_page_prot = PAGE_READONLY; - gate_vma.vm_flags = VM_RESERVED; + gate_vma.vm_flags = 0; return 0; } __initcall(gate_vma_init); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 431a64f021c..f6d4af8af8a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -104,7 +104,7 @@ static void grow_pgdat_span(struct pglist_data *pgdat, pgdat->node_start_pfn = start_pfn; if (end_pfn > old_pgdat_end_pfn) - pgdat->node_spanned_pages = end_pfn - pgdat->node_spanned_pages; + pgdat->node_spanned_pages = end_pfn - pgdat->node_start_pfn; } int online_pages(unsigned long pfn, unsigned long nr_pages) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5abc57c2b8b..72f402cc9c9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -161,6 +161,10 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) switch (mode) { case MPOL_INTERLEAVE: policy->v.nodes = *nodes; + if (nodes_weight(*nodes) == 0) { + kmem_cache_free(policy_cache, policy); + return ERR_PTR(-EINVAL); + } break; case MPOL_PREFERRED: policy->v.preferred_node = first_node(*nodes); @@ -189,17 +193,15 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { - unsigned long pfn; + struct page *page; unsigned int nid; if (!pte_present(*pte)) continue; - pfn = pte_pfn(*pte); - if (!pfn_valid(pfn)) { - print_bad_pte(vma, *pte, addr); + page = vm_normal_page(vma, addr, *pte); + if (!page) continue; - } - nid = pfn_to_nid(pfn); + nid = page_to_nid(page); if (!node_isset(nid, *nodes)) break; } while (pte++, addr += PAGE_SIZE, addr != end); @@ -269,8 +271,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); - if (first->vm_flags & VM_RESERVED) - return ERR_PTR(-EACCES); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { if (!vma->vm_next && vma->vm_end < end) diff --git a/mm/mmap.c b/mm/mmap.c index 6c997b15960..64ba4dbcb7d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -611,7 +611,7 @@ again: remove_next = 1 + (end > next->vm_end); * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. */ -#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED) +#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) @@ -1076,17 +1076,6 @@ munmap_back: error = file->f_op->mmap(file, vma); if (error) goto unmap_and_free_vma; - if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED)) - == (VM_WRITE | VM_RESERVED)) { - printk(KERN_WARNING "program %s is using MAP_PRIVATE, " - "PROT_WRITE mmap of VM_RESERVED memory, which " - "is deprecated. Please report this to " - "linux-kernel@vger.kernel.org\n",current->comm); - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); - error = -EACCES; - goto unmap_and_free_vma; - } } else if (vm_flags & VM_SHARED) { error = shmem_zero_setup(vma); if (error) @@ -1501,7 +1490,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un * PA-RISC uses this for its stack; IA64 for its Register Backing Store. * vma is the last one with address > vma->vm_end. Have to extend vma. */ -#ifdef CONFIG_STACK_GROWSUP +#ifndef CONFIG_IA64 static inline #endif int expand_upwards(struct vm_area_struct *vma, unsigned long address) diff --git a/mm/mprotect.c b/mm/mprotect.c index 17a2b52b753..653b8571c1e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -124,14 +124,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * a MAP_NORESERVE private mapping to writable will now reserve. */ if (newflags & VM_WRITE) { - if (oldflags & VM_RESERVED) { - BUG_ON(oldflags & VM_WRITE); - printk(KERN_WARNING "program %s is using MAP_PRIVATE, " - "PROT_WRITE mprotect of VM_RESERVED memory, " - "which is deprecated. Please report this to " - "linux-kernel@vger.kernel.org\n",current->comm); - return -EACCES; - } if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { charged = nrpages; if (security_vm_enough_memory(charged)) diff --git a/mm/mremap.c b/mm/mremap.c index b535438c363..ddaeee9a0b6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -323,7 +323,7 @@ unsigned long do_mremap(unsigned long addr, /* We can't remap across vm area boundaries */ if (old_len > vma->vm_end - addr) goto out; - if (vma->vm_flags & VM_DONTEXPAND) { + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { if (new_len > old_len) goto out; } diff --git a/mm/msync.c b/mm/msync.c index 0e040e9c39d..1b5b6f662dc 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -27,7 +27,6 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, again: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { - unsigned long pfn; struct page *page; if (progress >= 64) { @@ -40,13 +39,9 @@ again: continue; if (!pte_maybe_dirty(*pte)) continue; - pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, *pte, addr); + page = vm_normal_page(vma, addr, *pte); + if (!page) continue; - } - page = pfn_to_page(pfn); - if (ptep_clear_flush_dirty(vma, addr, pte) || page_test_and_clear_dirty(page)) set_page_dirty(page); @@ -97,9 +92,8 @@ static void msync_page_range(struct vm_area_struct *vma, /* For hugepages we can't go walking the page table normally, * but that's ok, hugetlbfs is memory based, so we don't need * to do anything more on an msync(). - * Can't do anything with VM_RESERVED regions either. */ - if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED)) + if (vma->vm_flags & VM_HUGETLB) return; BUG_ON(addr >= end); diff --git a/mm/nommu.c b/mm/nommu.c index 6deb6ab3d6a..c1196812876 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1045,7 +1045,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL(find_vma); -struct page *follow_page(struct mm_struct *mm, unsigned long address, +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags) { return NULL; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 74138c9a22b..0166ea15c9e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -750,6 +750,7 @@ int clear_page_dirty_for_io(struct page *page) } return TestClearPageDirty(page); } +EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ff81b5c6551..fe14a8c87fc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -60,8 +60,11 @@ long nr_swap_pages; * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA + * + * TBD: should special case ZONE_DMA32 machines here - in those we normally + * don't need any ZONE_NORMAL reservation */ -int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; +int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 }; EXPORT_SYMBOL(totalram_pages); @@ -72,7 +75,7 @@ EXPORT_SYMBOL(totalram_pages); struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; EXPORT_SYMBOL(zone_table); -static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; +static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; int min_free_kbytes = 1024; unsigned long __initdata nr_kernel_pages; @@ -124,7 +127,7 @@ static void bad_page(const char *function, struct page *page) printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", function, current->comm, page); printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", - (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags, + (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, page->mapping, page_mapcount(page), page_count(page)); printk(KERN_EMERG "Backtrace:\n"); dump_stack(); @@ -137,18 +140,13 @@ static void bad_page(const char *function, struct page *page) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_reserved ); + 1 << PG_writeback ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; add_taint(TAINT_BAD_PAGE); } -#ifndef CONFIG_HUGETLB_PAGE -#define prep_compound_page(page, order) do { } while (0) -#define destroy_compound_page(page, order) do { } while (0) -#else /* * Higher-order pages are called "compound pages". They are structured thusly: * @@ -202,7 +200,6 @@ static void destroy_compound_page(struct page *page, unsigned long order) ClearPageCompound(p); } } -#endif /* CONFIG_HUGETLB_PAGE */ /* * function for dealing with page's order in buddy system. @@ -337,7 +334,7 @@ static inline void __free_pages_bulk (struct page *page, zone->free_area[order].nr_free++; } -static inline void free_pages_check(const char *function, struct page *page) +static inline int free_pages_check(const char *function, struct page *page) { if ( page_mapcount(page) || page->mapping != NULL || @@ -355,6 +352,12 @@ static inline void free_pages_check(const char *function, struct page *page) bad_page(function, page); if (PageDirty(page)) __ClearPageDirty(page); + /* + * For now, we report if PG_reserved was found set, but do not + * clear it, and do not free the page. But we shall soon need + * to do more, for when the ZERO_PAGE count wraps negative. + */ + return PageReserved(page); } /* @@ -394,11 +397,10 @@ void __free_pages_ok(struct page *page, unsigned int order) { LIST_HEAD(list); int i; + int reserved = 0; arch_free_page(page, order); - mod_page_state(pgfree, 1 << order); - #ifndef CONFIG_MMU if (order > 0) for (i = 1 ; i < (1 << order) ; ++i) @@ -406,8 +408,12 @@ void __free_pages_ok(struct page *page, unsigned int order) #endif for (i = 0 ; i < (1 << order) ; ++i) - free_pages_check(__FUNCTION__, page + i); + reserved += free_pages_check(__FUNCTION__, page + i); + if (reserved) + return; + list_add(&page->lru, &list); + mod_page_state(pgfree, 1 << order); kernel_map_pages(page, 1<<order, 0); free_pages_bulk(page_zone(page), 1, &list, order); } @@ -465,7 +471,7 @@ void set_page_refs(struct page *page, int order) /* * This page is about to be returned from the page allocator */ -static void prep_new_page(struct page *page, int order) +static int prep_new_page(struct page *page, int order) { if ( page_mapcount(page) || page->mapping != NULL || @@ -483,12 +489,20 @@ static void prep_new_page(struct page *page, int order) 1 << PG_reserved ))) bad_page(__FUNCTION__, page); + /* + * For now, we report if PG_reserved was found set, but do not + * clear it, and do not allocate the page: as a safety net. + */ + if (PageReserved(page)) + return 1; + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked | 1 << PG_mappedtodisk); set_page_private(page, 0); set_page_refs(page, order); kernel_map_pages(page, 1 << order, 1); + return 0; } /* @@ -671,11 +685,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) arch_free_page(page, 0); - kernel_map_pages(page, 1, 0); - inc_page_state(pgfree); if (PageAnon(page)) page->mapping = NULL; - free_pages_check(__FUNCTION__, page); + if (free_pages_check(__FUNCTION__, page)) + return; + + inc_page_state(pgfree); + kernel_map_pages(page, 1, 0); + pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); list_add(&page->lru, &pcp->list); @@ -714,12 +731,14 @@ static struct page * buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) { unsigned long flags; - struct page *page = NULL; + struct page *page; int cold = !!(gfp_flags & __GFP_COLD); +again: if (order == 0) { struct per_cpu_pages *pcp; + page = NULL; pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); if (pcp->count <= pcp->low) @@ -732,9 +751,7 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) } local_irq_restore(flags); put_cpu(); - } - - if (page == NULL) { + } else { spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order); spin_unlock_irqrestore(&zone->lock, flags); @@ -743,7 +760,8 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) if (page != NULL) { BUG_ON(bad_range(zone, page)); mod_page_state_zone(zone, pgalloc, 1 << order); - prep_new_page(page, order); + if (prep_new_page(page, order)) + goto again; if (gfp_flags & __GFP_ZERO) prep_zero_page(page, order, gfp_flags); @@ -754,20 +772,28 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) return page; } +#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ +#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ +#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ +#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ + /* * Return 1 if free pages are above 'mark'. This takes into account the order * of the allocation. */ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int can_try_harder, gfp_t gfp_high) + int classzone_idx, int alloc_flags) { /* free_pages my go negative - that's OK */ long min = mark, free_pages = z->free_pages - (1 << order) + 1; int o; - if (gfp_high) + if (alloc_flags & ALLOC_HIGH) min -= min / 2; - if (can_try_harder) + if (alloc_flags & ALLOC_HARDER) min -= min / 4; if (free_pages <= min + z->lowmem_reserve[classzone_idx]) @@ -785,14 +811,47 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, return 1; } -static inline int -should_reclaim_zone(struct zone *z, gfp_t gfp_mask) +/* + * get_page_from_freeliest goes through the zonelist trying to allocate + * a page. + */ +static struct page * +get_page_from_freelist(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, int alloc_flags) { - if (!z->reclaim_pages) - return 0; - if (gfp_mask & __GFP_NORECLAIM) - return 0; - return 1; + struct zone **z = zonelist->zones; + struct page *page = NULL; + int classzone_idx = zone_idx(*z); + + /* + * Go through the zonelist once, looking for a zone with enough free. + * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + */ + do { + if ((alloc_flags & ALLOC_CPUSET) && + !cpuset_zone_allowed(*z, gfp_mask)) + continue; + + if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { + unsigned long mark; + if (alloc_flags & ALLOC_WMARK_MIN) + mark = (*z)->pages_min; + else if (alloc_flags & ALLOC_WMARK_LOW) + mark = (*z)->pages_low; + else + mark = (*z)->pages_high; + if (!zone_watermark_ok(*z, order, mark, + classzone_idx, alloc_flags)) + continue; + } + + page = buffered_rmqueue(*z, order, gfp_mask); + if (page) { + zone_statistics(zonelist, *z); + break; + } + } while (*(++z) != NULL); + return page; } /* @@ -803,105 +862,76 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist) { const gfp_t wait = gfp_mask & __GFP_WAIT; - struct zone **zones, *z; + struct zone **z; struct page *page; struct reclaim_state reclaim_state; struct task_struct *p = current; - int i; - int classzone_idx; int do_retry; - int can_try_harder; + int alloc_flags; int did_some_progress; might_sleep_if(wait); - /* - * The caller may dip into page reserves a bit more if the caller - * cannot run direct reclaim, or is the caller has realtime scheduling - * policy - */ - can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait; - - zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ +restart: + z = zonelist->zones; /* the list of zones suitable for gfp_mask */ - if (unlikely(zones[0] == NULL)) { + if (unlikely(*z == NULL)) { /* Should this ever happen?? */ return NULL; } - classzone_idx = zone_idx(zones[0]); + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); + if (page) + goto got_pg; + + do { + wakeup_kswapd(*z, order); + } while (*(++z)); -restart: /* - * Go through the zonelist once, looking for a zone with enough free. - * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + * OK, we're below the kswapd watermark and have kicked background + * reclaim. Now things get more complex, so set up alloc_flags according + * to how we want to proceed. + * + * The caller may dip into page reserves a bit more if the caller + * cannot run direct reclaim, or if the caller has realtime scheduling + * policy. */ - for (i = 0; (z = zones[i]) != NULL; i++) { - int do_reclaim = should_reclaim_zone(z, gfp_mask); - - if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) - continue; - - /* - * If the zone is to attempt early page reclaim then this loop - * will try to reclaim pages and check the watermark a second - * time before giving up and falling back to the next zone. - */ -zone_reclaim_retry: - if (!zone_watermark_ok(z, order, z->pages_low, - classzone_idx, 0, 0)) { - if (!do_reclaim) - continue; - else { - zone_reclaim(z, gfp_mask, order); - /* Only try reclaim once */ - do_reclaim = 0; - goto zone_reclaim_retry; - } - } - - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; - } - - for (i = 0; (z = zones[i]) != NULL; i++) - wakeup_kswapd(z, order); + alloc_flags = ALLOC_WMARK_MIN; + if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) + alloc_flags |= ALLOC_HARDER; + if (gfp_mask & __GFP_HIGH) + alloc_flags |= ALLOC_HIGH; + if (wait) + alloc_flags |= ALLOC_CPUSET; /* * Go through the zonelist again. Let __GFP_HIGH and allocations - * coming from realtime tasks to go deeper into reserves + * coming from realtime tasks go deeper into reserves. * * This is the last chance, in general, before the goto nopage. * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ - for (i = 0; (z = zones[i]) != NULL; i++) { - if (!zone_watermark_ok(z, order, z->pages_min, - classzone_idx, can_try_harder, - gfp_mask & __GFP_HIGH)) - continue; - - if (wait && !cpuset_zone_allowed(z, gfp_mask)) - continue; - - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; - } + page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); + if (page) + goto got_pg; /* This allocation should allow future memory freeing. */ if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) { if (!(gfp_mask & __GFP_NOMEMALLOC)) { +nofail_alloc: /* go through the zonelist yet again, ignoring mins */ - for (i = 0; (z = zones[i]) != NULL; i++) { - if (!cpuset_zone_allowed(z, gfp_mask)) - continue; - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; + page = get_page_from_freelist(gfp_mask, order, + zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET); + if (page) + goto got_pg; + if (gfp_mask & __GFP_NOFAIL) { + blk_congestion_wait(WRITE, HZ/50); + goto nofail_alloc; } } goto nopage; @@ -919,7 +949,7 @@ rebalance: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zones, gfp_mask); + did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); p->reclaim_state = NULL; p->flags &= ~PF_MEMALLOC; @@ -927,19 +957,10 @@ rebalance: cond_resched(); if (likely(did_some_progress)) { - for (i = 0; (z = zones[i]) != NULL; i++) { - if (!zone_watermark_ok(z, order, z->pages_min, - classzone_idx, can_try_harder, - gfp_mask & __GFP_HIGH)) - continue; - - if (!cpuset_zone_allowed(z, gfp_mask)) - continue; - - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; - } + page = get_page_from_freelist(gfp_mask, order, + zonelist, alloc_flags); + if (page) + goto got_pg; } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { /* * Go through the zonelist yet one more time, keep @@ -947,18 +968,10 @@ rebalance: * a parallel oom killing, we must fail if we're still * under heavy pressure. */ - for (i = 0; (z = zones[i]) != NULL; i++) { - if (!zone_watermark_ok(z, order, z->pages_high, - classzone_idx, 0, 0)) - continue; - - if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) - continue; - - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; - } + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); + if (page) + goto got_pg; out_of_memory(gfp_mask, order); goto restart; @@ -991,9 +1004,7 @@ nopage: dump_stack(); show_mem(); } - return NULL; got_pg: - zone_statistics(zonelist, z); return page; } @@ -1330,7 +1341,7 @@ void show_free_areas(void) } else printk("\n"); - for_each_cpu(cpu) { + for_each_online_cpu(cpu) { struct per_cpu_pageset *pageset; pageset = zone_pcp(zone, cpu); @@ -1441,6 +1452,10 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli zone = pgdat->node_zones + ZONE_NORMAL; if (zone->present_pages) zonelist->zones[j++] = zone; + case ZONE_DMA32: + zone = pgdat->node_zones + ZONE_DMA32; + if (zone->present_pages) + zonelist->zones[j++] = zone; case ZONE_DMA: zone = pgdat->node_zones + ZONE_DMA; if (zone->present_pages) @@ -1455,6 +1470,8 @@ static inline int highest_zone(int zone_bits) int res = ZONE_NORMAL; if (zone_bits & (__force int)__GFP_HIGHMEM) res = ZONE_HIGHMEM; + if (zone_bits & (__force int)__GFP_DMA32) + res = ZONE_DMA32; if (zone_bits & (__force int)__GFP_DMA) res = ZONE_DMA; return res; @@ -1755,16 +1772,16 @@ static int __devinit zone_batchsize(struct zone *zone) batch = 1; /* - * We will be trying to allcoate bigger chunks of contiguous - * memory of the order of fls(batch). This should result in - * better cache coloring. + * Clamp the batch to a 2^n - 1 value. Having a power + * of 2 value was found to be more likely to have + * suboptimal cache aliasing properties in some cases. * - * A sanity check also to ensure that batch is still in limits. + * For example if 2 tasks are alternately allocating + * batches of pages, one task can end up with a lot + * of pages of one half of the possible page colors + * and the other with pages of the other colors. */ - batch = (1 << fls(batch + batch/2)); - - if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2)) - batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2); + batch = (1 << (fls(batch + batch/2)-1)) - 1; return batch; } @@ -1866,11 +1883,10 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, if (process_zones(cpu)) ret = NOTIFY_BAD; break; -#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: case CPU_DEAD: free_zone_pagesets(cpu); break; -#endif default: break; } @@ -1880,7 +1896,7 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, static struct notifier_block pageset_notifier = { &pageset_cpuup_callback, NULL, 0 }; -void __init setup_per_cpu_pageset() +void __init setup_per_cpu_pageset(void) { int err; @@ -1975,7 +1991,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat, if (zholes_size) realsize -= zholes_size[j]; - if (j == ZONE_DMA || j == ZONE_NORMAL) + if (j < ZONE_HIGHMEM) nr_kernel_pages += realsize; nr_all_pages += realsize; @@ -2417,13 +2433,18 @@ void setup_per_zone_pages_min(void) } for_each_zone(zone) { + unsigned long tmp; spin_lock_irqsave(&zone->lru_lock, flags); + tmp = (pages_min * zone->present_pages) / lowmem_pages; if (is_highmem(zone)) { /* - * Often, highmem doesn't need to reserve any pages. - * But the pages_min/low/high values are also used for - * batching up page reclaim activity so we need a - * decent value here. + * __GFP_HIGH and PF_MEMALLOC allocations usually don't + * need highmem pages, so cap pages_min to a small + * value here. + * + * The (pages_high-pages_low) and (pages_low-pages_min) + * deltas controls asynch page reclaim, and so should + * not be capped for highmem. */ int min_pages; @@ -2434,19 +2455,15 @@ void setup_per_zone_pages_min(void) min_pages = 128; zone->pages_min = min_pages; } else { - /* if it's a lowmem zone, reserve a number of pages + /* + * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->pages_min = (pages_min * zone->present_pages) / - lowmem_pages; + zone->pages_min = tmp; } - /* - * When interpreting these watermarks, just keep in mind that: - * zone->pages_min == (zone->pages_min * 4) / 4; - */ - zone->pages_low = (zone->pages_min * 5) / 4; - zone->pages_high = (zone->pages_min * 6) / 4; + zone->pages_low = zone->pages_min + tmp / 4; + zone->pages_high = zone->pages_min + tmp / 2; spin_unlock_irqrestore(&zone->lru_lock, flags); } } diff --git a/mm/rmap.c b/mm/rmap.c index 914d04b98be..f853c6def15 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -225,7 +225,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) /* * At what user virtual address is page expected in vma? checking that the - * page matches the vma: currently only used by unuse_process, on anon pages. + * page matches the vma: currently only used on anon pages, by unuse_vma; */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { @@ -234,7 +234,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) (void *)page->mapping - PAGE_MAPPING_ANON) return -EFAULT; } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { - if (vma->vm_file->f_mapping != page->mapping) + if (!vma->vm_file || + vma->vm_file->f_mapping != page->mapping) return -EFAULT; } else return -EFAULT; @@ -289,7 +290,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, * repeatedly from either page_referenced_anon or page_referenced_file. */ static int page_referenced_one(struct page *page, - struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token) + struct vm_area_struct *vma, unsigned int *mapcount) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -310,7 +311,7 @@ static int page_referenced_one(struct page *page, /* Pretend the page is referenced if the task has the swap token and is in the middle of a page fault. */ - if (mm != current->mm && !ignore_token && has_swap_token(mm) && + if (mm != current->mm && has_swap_token(mm) && rwsem_is_locked(&mm->mmap_sem)) referenced++; @@ -320,7 +321,7 @@ out: return referenced; } -static int page_referenced_anon(struct page *page, int ignore_token) +static int page_referenced_anon(struct page *page) { unsigned int mapcount; struct anon_vma *anon_vma; @@ -333,8 +334,7 @@ static int page_referenced_anon(struct page *page, int ignore_token) mapcount = page_mapcount(page); list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - referenced += page_referenced_one(page, vma, &mapcount, - ignore_token); + referenced += page_referenced_one(page, vma, &mapcount); if (!mapcount) break; } @@ -353,7 +353,7 @@ static int page_referenced_anon(struct page *page, int ignore_token) * * This function is only called from page_referenced for object-based pages. */ -static int page_referenced_file(struct page *page, int ignore_token) +static int page_referenced_file(struct page *page) { unsigned int mapcount; struct address_space *mapping = page->mapping; @@ -391,8 +391,7 @@ static int page_referenced_file(struct page *page, int ignore_token) referenced++; break; } - referenced += page_referenced_one(page, vma, &mapcount, - ignore_token); + referenced += page_referenced_one(page, vma, &mapcount); if (!mapcount) break; } @@ -409,13 +408,10 @@ static int page_referenced_file(struct page *page, int ignore_token) * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. */ -int page_referenced(struct page *page, int is_locked, int ignore_token) +int page_referenced(struct page *page, int is_locked) { int referenced = 0; - if (!swap_token_default_timeout) - ignore_token = 1; - if (page_test_and_clear_young(page)) referenced++; @@ -424,15 +420,14 @@ int page_referenced(struct page *page, int is_locked, int ignore_token) if (page_mapped(page) && page->mapping) { if (PageAnon(page)) - referenced += page_referenced_anon(page, ignore_token); + referenced += page_referenced_anon(page); else if (is_locked) - referenced += page_referenced_file(page, ignore_token); + referenced += page_referenced_file(page); else if (TestSetPageLocked(page)) referenced++; else { if (page->mapping) - referenced += page_referenced_file(page, - ignore_token); + referenced += page_referenced_file(page); unlock_page(page); } } @@ -529,10 +524,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) * If the page is mlock()d, we cannot swap it out. * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. - * - * Pages belonging to VM_RESERVED regions should not happen here. */ - if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || + if ((vma->vm_flags & VM_LOCKED) || ptep_clear_flush_young(vma, address, pte)) { ret = SWAP_FAIL; goto out_unmap; @@ -613,7 +606,6 @@ static void try_to_unmap_cluster(unsigned long cursor, struct page *page; unsigned long address; unsigned long end; - unsigned long pfn; address = (vma->vm_start + cursor) & CLUSTER_MASK; end = address + CLUSTER_SIZE; @@ -642,21 +634,14 @@ static void try_to_unmap_cluster(unsigned long cursor, for (; address < end; pte++, address += PAGE_SIZE) { if (!pte_present(*pte)) continue; - - pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, *pte, address); - continue; - } - - page = pfn_to_page(pfn); - BUG_ON(PageAnon(page)); + page = vm_normal_page(vma, address, *pte); + BUG_ON(!page || PageAnon(page)); if (ptep_clear_flush_young(vma, address, pte)) continue; /* Nuke the page table entry. */ - flush_cache_page(vma, address, pfn); + flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush(vma, address, pte); /* If nonlinear, store the file page offset in the pte. */ @@ -727,7 +712,7 @@ static int try_to_unmap_file(struct page *page) list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) + if (vma->vm_flags & VM_LOCKED) continue; cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) @@ -761,7 +746,7 @@ static int try_to_unmap_file(struct page *page) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) + if (vma->vm_flags & VM_LOCKED) continue; cursor = (unsigned long) vma->vm_private_data; while ( cursor < max_nl_cursor && @@ -783,11 +768,8 @@ static int try_to_unmap_file(struct page *page) * in locked vmas). Reset cursor on all unreserved nonlinear * vmas, now forgetting on which ones it had fallen behind. */ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { - if (!(vma->vm_flags & VM_RESERVED)) - vma->vm_private_data = NULL; - } + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + vma->vm_private_data = NULL; out: spin_unlock(&mapping->i_mmap_lock); return ret; diff --git a/mm/slab.c b/mm/slab.c index e291f5e1afb..e5ec26e0c46 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -434,7 +434,7 @@ struct kmem_cache { /* Optimization question: fewer reaps means less * probability for unnessary cpucache drain/refill cycles. * - * OTHO the cpuarrays can contain lots of objects, + * OTOH the cpuarrays can contain lots of objects, * which could lock up otherwise freeable slabs. */ #define REAPTIMEOUT_CPUC (2*HZ) @@ -565,14 +565,29 @@ static void **dbg_userword(kmem_cache_t *cachep, void *objp) #define BREAK_GFP_ORDER_LO 0 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; -/* Macros for storing/retrieving the cachep and or slab from the +/* Functions for storing/retrieving the cachep and or slab from the * global 'mem_map'. These are used to find the slab an obj belongs to. * With kfree(), these are used to find the cache which an obj belongs to. */ -#define SET_PAGE_CACHE(pg,x) ((pg)->lru.next = (struct list_head *)(x)) -#define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->lru.next) -#define SET_PAGE_SLAB(pg,x) ((pg)->lru.prev = (struct list_head *)(x)) -#define GET_PAGE_SLAB(pg) ((struct slab *)(pg)->lru.prev) +static inline void page_set_cache(struct page *page, struct kmem_cache *cache) +{ + page->lru.next = (struct list_head *)cache; +} + +static inline struct kmem_cache *page_get_cache(struct page *page) +{ + return (struct kmem_cache *)page->lru.next; +} + +static inline void page_set_slab(struct page *page, struct slab *slab) +{ + page->lru.prev = (struct list_head *)slab; +} + +static inline struct slab *page_get_slab(struct page *page) +{ + return (struct slab *)page->lru.prev; +} /* These are the default caches for kmalloc. Custom caches can have other sizes. */ struct cache_sizes malloc_sizes[] = { @@ -1190,11 +1205,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid) int i; flags |= cachep->gfpflags; - if (likely(nodeid == -1)) { - page = alloc_pages(flags, cachep->gfporder); - } else { - page = alloc_pages_node(nodeid, flags, cachep->gfporder); - } + page = alloc_pages_node(nodeid, flags, cachep->gfporder); if (!page) return NULL; addr = page_address(page); @@ -1368,7 +1379,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) /* Print some data about the neighboring objects, if they * exist: */ - struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp)); + struct slab *slabp = page_get_slab(virt_to_page(objp)); int objnr; objnr = (objp-slabp->s_mem)/cachep->objsize; @@ -2138,8 +2149,8 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) i = 1 << cachep->gfporder; page = virt_to_page(objp); do { - SET_PAGE_CACHE(page, cachep); - SET_PAGE_SLAB(page, slabp); + page_set_cache(page, cachep); + page_set_slab(page, slabp); page++; } while (--i); } @@ -2269,14 +2280,14 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, kfree_debugcheck(objp); page = virt_to_page(objp); - if (GET_PAGE_CACHE(page) != cachep) { + if (page_get_cache(page) != cachep) { printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n", - GET_PAGE_CACHE(page),cachep); + page_get_cache(page),cachep); printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); - printk(KERN_ERR "%p is %s.\n", GET_PAGE_CACHE(page), GET_PAGE_CACHE(page)->name); + printk(KERN_ERR "%p is %s.\n", page_get_cache(page), page_get_cache(page)->name); WARN_ON(1); } - slabp = GET_PAGE_SLAB(page); + slabp = page_get_slab(page); if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { @@ -2628,7 +2639,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n struct slab *slabp; unsigned int objnr; - slabp = GET_PAGE_SLAB(virt_to_page(objp)); + slabp = page_get_slab(virt_to_page(objp)); l3 = cachep->nodelists[node]; list_del(&slabp->list); objnr = (objp - slabp->s_mem) / cachep->objsize; @@ -2744,7 +2755,7 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp) #ifdef CONFIG_NUMA { struct slab *slabp; - slabp = GET_PAGE_SLAB(virt_to_page(objp)); + slabp = page_get_slab(virt_to_page(objp)); if (unlikely(slabp->nodeid != numa_node_id())) { struct array_cache *alien = NULL; int nodeid = slabp->nodeid; @@ -2830,7 +2841,7 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) page = virt_to_page(ptr); if (unlikely(!PageSlab(page))) goto out; - if (unlikely(GET_PAGE_CACHE(page) != cachep)) + if (unlikely(page_get_cache(page) != cachep)) goto out; return 1; out: @@ -3026,7 +3037,7 @@ void kfree(const void *objp) return; local_irq_save(flags); kfree_debugcheck(objp); - c = GET_PAGE_CACHE(virt_to_page(objp)); + c = page_get_cache(virt_to_page(objp)); __cache_free(c, (void*)objp); local_irq_restore(flags); } @@ -3596,7 +3607,7 @@ unsigned int ksize(const void *objp) if (unlikely(objp == NULL)) return 0; - return obj_reallen(GET_PAGE_CACHE(virt_to_page(objp))); + return obj_reallen(page_get_cache(virt_to_page(objp))); } diff --git a/mm/swap.c b/mm/swap.c index d09cf7f03e7..73d351439ef 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -34,8 +34,6 @@ /* How many pages do we try to swap or page in/out together? */ int page_cluster; -#ifdef CONFIG_HUGETLB_PAGE - void put_page(struct page *page) { if (unlikely(PageCompound(page))) { @@ -52,7 +50,6 @@ void put_page(struct page *page) __page_cache_release(page); } EXPORT_SYMBOL(put_page); -#endif /* * Writeback is about to end against a page which has been marked for immediate diff --git a/mm/thrash.c b/mm/thrash.c index eff3c18c33a..f4c560b4a2b 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -57,14 +57,17 @@ void grab_swap_token(void) /* We have the token. Let others know we still need it. */ if (has_swap_token(current->mm)) { current->mm->recent_pagein = 1; + if (unlikely(!swap_token_default_timeout)) + disable_swap_token(); return; } if (time_after(jiffies, swap_token_check)) { - /* Can't get swapout protection if we exceed our RSS limit. */ - // if (current->mm->rss > current->mm->rlimit_rss) - // return; + if (!swap_token_default_timeout) { + swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; + return; + } /* ... or if we recently held the token. */ if (time_before(jiffies, current->mm->swap_token_time)) @@ -95,6 +98,7 @@ void __put_swap_token(struct mm_struct *mm) { spin_lock(&swap_token_lock); if (likely(mm == swap_token_mm)) { + mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL; swap_token_mm = &init_mm; swap_token_check = jiffies; } diff --git a/mm/truncate.c b/mm/truncate.c index 29c18f68dc3..9173ab50060 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -282,8 +282,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * Zap the rest of the file in one hit. */ unmap_mapping_range(mapping, - page_index << PAGE_CACHE_SHIFT, - (end - page_index + 1) + (loff_t)page_index<<PAGE_CACHE_SHIFT, + (loff_t)(end - page_index + 1) << PAGE_CACHE_SHIFT, 0); did_range_unmap = 1; @@ -292,7 +292,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * Just zap this page */ unmap_mapping_range(mapping, - page_index << PAGE_CACHE_SHIFT, + (loff_t)page_index<<PAGE_CACHE_SHIFT, PAGE_CACHE_SIZE, 0); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 135bf8ca96e..b0cd81c32de 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -201,13 +201,25 @@ static int shrink_slab(unsigned long scanned, gfp_t gfp_mask, list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; unsigned long total_scan; + unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask); delta = (4 * scanned) / shrinker->seeks; - delta *= (*shrinker->shrinker)(0, gfp_mask); + delta *= max_pass; do_div(delta, lru_pages + 1); shrinker->nr += delta; - if (shrinker->nr < 0) - shrinker->nr = LONG_MAX; /* It wrapped! */ + if (shrinker->nr < 0) { + printk(KERN_ERR "%s: nr=%ld\n", + __FUNCTION__, shrinker->nr); + shrinker->nr = max_pass; + } + + /* + * Avoid risking looping forever due to too large nr value: + * never try to free more than twice the estimate number of + * freeable entries. + */ + if (shrinker->nr > max_pass * 2) + shrinker->nr = max_pass * 2; total_scan = shrinker->nr; shrinker->nr = 0; @@ -407,7 +419,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) if (PageWriteback(page)) goto keep_locked; - referenced = page_referenced(page, 1, sc->priority <= 0); + referenced = page_referenced(page, 1); /* In active use or really unfreeable? Activate it. */ if (referenced && page_mapping_inuse(page)) goto activate_locked; @@ -756,7 +768,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) if (page_mapped(page)) { if (!reclaim_mapped || (total_swap_pages == 0 && PageAnon(page)) || - page_referenced(page, 0, sc->priority <= 0)) { + page_referenced(page, 0)) { list_add(&page->lru, &l_active); continue; } @@ -960,6 +972,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) sc.nr_reclaimed = 0; sc.priority = priority; sc.swap_cluster_max = SWAP_CLUSTER_MAX; + if (!priority) + disable_swap_token(); shrink_caches(zones, &sc); shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); if (reclaim_state) { @@ -1056,6 +1070,10 @@ loop_again: int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; + /* The swap token gets in the way of swapout... */ + if (!priority) + disable_swap_token(); + all_zones_ok = 1; if (nr_pages == 0) { @@ -1074,7 +1092,7 @@ loop_again: continue; if (!zone_watermark_ok(zone, order, - zone->pages_high, 0, 0, 0)) { + zone->pages_high, 0, 0)) { end_zone = i; goto scan; } @@ -1111,7 +1129,7 @@ scan: if (nr_pages == 0) { /* Not software suspend */ if (!zone_watermark_ok(zone, order, - zone->pages_high, end_zone, 0, 0)) + zone->pages_high, end_zone, 0)) all_zones_ok = 0; } zone->temp_priority = priority; @@ -1259,7 +1277,7 @@ void wakeup_kswapd(struct zone *zone, int order) return; pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0)) + if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) return; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; @@ -1360,6 +1378,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) sc.nr_reclaimed = 0; /* scan at the highest priority */ sc.priority = 0; + disable_swap_token(); if (nr_pages > SWAP_CLUSTER_MAX) sc.swap_cluster_max = nr_pages; |