summaryrefslogtreecommitdiffstats
path: root/0001-mm-CONFIG_NR_ZONES_EXTENDED.patch
blob: 44ef3662b41406c5ac26488d55ba9805ec2268db (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
From 8b368e8e961944105945fbe36f3f264252bfd19a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 25 Feb 2016 01:02:30 +0000
Subject: [PATCH] mm: CONFIG_NR_ZONES_EXTENDED

ZONE_DEVICE (merged in 4.3) and ZONE_CMA (proposed) are examples of new mm
zones that are bumping up against the current maximum limit of 4 zones,
i.e.  2 bits in page->flags.  When adding a zone this equation still needs
to be satisified:

    SECTIONS_WIDTH + ZONES_WIDTH + NODES_SHIFT + LAST_CPUPID_SHIFT
	  <= BITS_PER_LONG - NR_PAGEFLAGS

ZONE_DEVICE currently tries to satisfy this equation by requiring that
ZONE_DMA be disabled, but this is untenable given generic kernels want to
support ZONE_DEVICE and ZONE_DMA simultaneously.  ZONE_CMA would like to
increase the amount of memory covered per section, but that limits the
minimum granularity at which consecutive memory ranges can be added via
devm_memremap_pages().

The trade-off of what is acceptable to sacrifice depends heavily on the
platform.  For example, ZONE_CMA is targeted for 32-bit platforms where
page->flags is constrained, but those platforms likely do not care about
the minimum granularity of memory hotplug.  A big iron machine with 1024
numa nodes can likely sacrifice ZONE_DMA where a general purpose
distribution kernel can not.

CONFIG_NR_ZONES_EXTENDED is a configuration symbol that gets selected when
the number of configured zones exceeds 4.  It documents the configuration
symbols and definitions that get modified when ZONES_WIDTH is greater than
2.

For now, it steals a bit from NODES_SHIFT.  Later on it can be used to
document the definitions that get modified when a 32-bit configuration
wants more zone bits.

Note that GFP_ZONE_TABLE poses an interesting constraint since
include/linux/gfp.h gets included by the 32-bit portion of a 64-bit build.
We need to be careful to only build the table for zones that have a
corresponding gfp_t flag.  GFP_ZONES_SHIFT is introduced for this purpose.
This patch does not attempt to solve the problem of adding a new zone
that also has a corresponding GFP_ flag.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=110931
Fixes: 033fbae988fc ("mm: ZONE_DEVICE for "device memory"")
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Mark <markk@clara.co.uk>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/Kconfig                  |  6 ++++--
 include/linux/gfp.h               | 33 ++++++++++++++++++++-------------
 include/linux/page-flags-layout.h |  2 ++
 mm/Kconfig                        |  7 +++++--
 4 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3fef519..b94704a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1409,8 +1409,10 @@ config NUMA_EMU
 
 config NODES_SHIFT
 	int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
-	range 1 10
-	default "10" if MAXSMP
+	range 1 10 if !NR_ZONES_EXTENDED
+	range 1 9 if NR_ZONES_EXTENDED
+	default "10" if MAXSMP && !NR_ZONES_EXTENDED
+	default "9" if MAXSMP && NR_ZONES_EXTENDED
 	default "6" if X86_64
 	default "3"
 	depends on NEED_MULTIPLE_NODES
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index af1f2b2..d201d8a 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -329,22 +329,29 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
  *       0xe    => BAD (MOVABLE+DMA32+HIGHMEM)
  *       0xf    => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
  *
- * ZONES_SHIFT must be <= 2 on 32 bit platforms.
+ * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms.
  */
 
-#if 16 * ZONES_SHIFT > BITS_PER_LONG
-#error ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
+#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4
+/* ZONE_DEVICE is not a valid GFP zone specifier */
+#define GFP_ZONES_SHIFT 2
+#else
+#define GFP_ZONES_SHIFT ZONES_SHIFT
+#endif
+
+#if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG
+#error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
 #endif
 
 #define GFP_ZONE_TABLE ( \
-	(ZONE_NORMAL << 0 * ZONES_SHIFT)				      \
-	| (OPT_ZONE_DMA << ___GFP_DMA * ZONES_SHIFT)			      \
-	| (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * ZONES_SHIFT)		      \
-	| (OPT_ZONE_DMA32 << ___GFP_DMA32 * ZONES_SHIFT)		      \
-	| (ZONE_NORMAL << ___GFP_MOVABLE * ZONES_SHIFT)			      \
-	| (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * ZONES_SHIFT)	      \
-	| (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * ZONES_SHIFT)   \
-	| (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * ZONES_SHIFT)   \
+	(ZONE_NORMAL << 0 * GFP_ZONES_SHIFT)					\
+	| (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT)			\
+	| (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT)		\
+	| (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT)		      	\
+	| (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT)			\
+	| (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT)	\
+	| (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)	\
+	| (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)	\
 )
 
 /*
@@ -369,8 +376,8 @@ static inline enum zone_type gfp_zone(gfp_t flags)
 	enum zone_type z;
 	int bit = (__force int) (flags & GFP_ZONEMASK);
 
-	z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) &
-					 ((1 << ZONES_SHIFT) - 1);
+	z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) &
+					 ((1 << GFP_ZONES_SHIFT) - 1);
 	VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
 	return z;
 }
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index da52366..77b078c 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -17,6 +17,8 @@
 #define ZONES_SHIFT 1
 #elif MAX_NR_ZONES <= 4
 #define ZONES_SHIFT 2
+#elif MAX_NR_ZONES <= 8
+#define ZONES_SHIFT 3
 #else
 #error ZONES_SHIFT -- too many zones configured adjust calculation
 #endif
diff --git a/mm/Kconfig b/mm/Kconfig
index 031a329..7826216 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -652,8 +652,6 @@ config IDLE_PAGE_TRACKING
 
 config ZONE_DEVICE
 	bool "Device memory (pmem, etc...) hotplug support"
-	default !ZONE_DMA
-	depends on !ZONE_DMA
 	depends on MEMORY_HOTPLUG
 	depends on MEMORY_HOTREMOVE
 	depends on X86_64 #arch_add_memory() comprehends device memory
@@ -667,5 +665,10 @@ config ZONE_DEVICE
 
 	  If FS_DAX is enabled, then say Y.
 
+config NR_ZONES_EXTENDED
+	bool
+	default n if !64BIT
+	default y if ZONE_DEVICE && ZONE_DMA && ZONE_DMA32
+
 config FRAME_VECTOR
 	bool
-- 
2.5.0