From 19770b32609b6bf97a3dece2529089494cbfc549 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Apr 2008 02:12:18 -0700 Subject: mm: filter based on a nodemask as well as a gfp_mask The MPOL_BIND policy creates a zonelist that is used for allocations controlled by that mempolicy. As the per-node zonelist is already being filtered based on a zone id, this patch adds a version of __alloc_pages() that takes a nodemask for further filtering. This eliminates the need for MPOL_BIND to create a custom zonelist. A positive benefit of this is that allocations using MPOL_BIND now use the local node's distance-ordered zonelist instead of a custom node-id-ordered zonelist. I.e., pages will be allocated from the closest allowed node with available memory. [Lee.Schermerhorn@hp.com: Mempolicy: update stale documentation and comments] [Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask] [Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask rework] Signed-off-by: Mel Gorman Acked-by: Christoph Lameter Signed-off-by: Lee Schermerhorn Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 4 +-- include/linux/gfp.h | 4 +++ include/linux/mempolicy.h | 19 ++++++----- include/linux/mmzone.h | 80 +++++++++++++++++++++++++++++------------------ 4 files changed, 68 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 726761e2400..038578362b4 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -26,7 +26,7 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); void cpuset_update_task_memory_state(void); -int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl); +int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask); extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask); @@ -103,7 +103,7 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) static inline void cpuset_init_current_mems_allowed(void) {} static inline void cpuset_update_task_memory_state(void) {} -static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) +static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { return 1; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index e1c6064cb6c..898aa9d5b6c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -182,6 +182,10 @@ static inline void arch_alloc_page(struct page *page, int order) { } extern struct page *__alloc_pages(gfp_t, unsigned int, struct zonelist *); +extern struct page * +__alloc_pages_nodemask(gfp_t, unsigned int, + struct zonelist *, nodemask_t *nodemask); + static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 69160dc32d4..b8b3da7a331 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -54,19 +54,20 @@ struct mm_struct; * mmap_sem. * * Freeing policy: - * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd. - * All other policies don't have any external state. mpol_free() handles this. + * Mempolicy objects are reference counted. A mempolicy will be freed when + * mpol_free() decrements the reference count to zero. * * Copying policy objects: - * For MPOL_BIND the zonelist must be always duplicated. mpol_clone() does this. + * mpol_copy() allocates a new mempolicy and copies the specified mempolicy + * to the new storage. The reference count of the new object is initialized + * to 1, representing the caller of mpol_copy(). */ struct mempolicy { atomic_t refcnt; short policy; /* See MPOL_* above */ union { - struct zonelist *zonelist; /* bind */ short preferred_node; /* preferred */ - nodemask_t nodes; /* interleave */ + nodemask_t nodes; /* interleave/bind */ /* undefined for default */ } v; nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */ @@ -151,7 +152,8 @@ extern void mpol_fix_fork_child_flag(struct task_struct *p); extern struct mempolicy default_policy; extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, - unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol); + unsigned long addr, gfp_t gfp_flags, + struct mempolicy **mpol, nodemask_t **nodemask); extern unsigned slab_node(struct mempolicy *policy); extern enum zone_type policy_zone; @@ -239,8 +241,11 @@ static inline void mpol_fix_fork_child_flag(struct task_struct *p) } static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, - unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol) + unsigned long addr, gfp_t gfp_flags, + struct mempolicy **mpol, nodemask_t **nodemask) { + *mpol = NULL; + *nodemask = NULL; return node_zonelist(0, gfp_flags); } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d34b4c29001..498d6ceff2f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -749,36 +749,60 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) #endif /* CONFIG_NUMA */ } -static inline void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) -{ - zoneref->zone = zone; - zoneref->zone_idx = zone_idx(zone); -} +/** + * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point + * @z - The cursor used as a starting point for the search + * @highest_zoneidx - The zone index of the highest zone to return + * @nodes - An optional nodemask to filter the zonelist with + * @zone - The first suitable zone found is returned via this parameter + * + * This function returns the next zone at or below a given zone index that is + * within the allowed nodemask using a cursor as the starting point for the + * search. The zoneref returned is a cursor that is used as the next starting + * point for future calls to next_zones_zonelist(). + */ +struct zoneref *next_zones_zonelist(struct zoneref *z, + enum zone_type highest_zoneidx, + nodemask_t *nodes, + struct zone **zone); -/* Returns the first zone at or below highest_zoneidx in a zonelist */ +/** + * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist + * @zonelist - The zonelist to search for a suitable zone + * @highest_zoneidx - The zone index of the highest zone to return + * @nodes - An optional nodemask to filter the zonelist with + * @zone - The first suitable zone found is returned via this parameter + * + * This function returns the first zone at or below a given zone index that is + * within the allowed nodemask. The zoneref returned is a cursor that can be + * used to iterate the zonelist with next_zones_zonelist. The cursor should + * not be used by the caller as it does not match the value of the zone + * returned. + */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, - enum zone_type highest_zoneidx) + enum zone_type highest_zoneidx, + nodemask_t *nodes, + struct zone **zone) { - struct zoneref *z; - - /* Find the first suitable zone to use for the allocation */ - z = zonelist->_zonerefs; - while (zonelist_zone_idx(z) > highest_zoneidx) - z++; - - return z; + return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes, + zone); } -/* Returns the next zone at or below highest_zoneidx in a zonelist */ -static inline struct zoneref *next_zones_zonelist(struct zoneref *z, - enum zone_type highest_zoneidx) -{ - /* Find the next suitable zone to use for the allocation */ - while (zonelist_zone_idx(z) > highest_zoneidx) - z++; - - return z; -} +/** + * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask + * @zone - The current zone in the iterator + * @z - The current pointer within zonelist->zones being iterated + * @zlist - The zonelist being iterated + * @highidx - The zone index of the highest zone to return + * @nodemask - Nodemask allowed by the allocator + * + * This iterator iterates though all zones at or below a given zone index and + * within a given nodemask + */ +#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ + for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \ + zone; \ + z = next_zones_zonelist(z, highidx, nodemask, &zone)) \ /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index @@ -790,11 +814,7 @@ static inline struct zoneref *next_zones_zonelist(struct zoneref *z, * This iterator iterates though all zones at or below a given zone index. */ #define for_each_zone_zonelist(zone, z, zlist, highidx) \ - for (z = first_zones_zonelist(zlist, highidx), \ - zone = zonelist_zone(z++); \ - zone; \ - z = next_zones_zonelist(z, highidx), \ - zone = zonelist_zone(z++)) + for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) #ifdef CONFIG_SPARSEMEM #include -- cgit