dma-debug-account-for-cachelines-and-read-only-mappings.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298

Bugzilla: 1062833
Upstream-status: Submitted for 3.14

While debug_dma_assert_idle() checks if a given *page* is actively
undergoing dma the valid granularity of a dma mapping is a *cacheline*.
Sander's testing shows that the warning message "DMA-API: exceeded 7
overlapping mappings of pfn..." is falsely triggering.  The test is
simply mapping multiple cachelines in a given page.

Ultimately we want overlap tracking to be valid as it is a real api
violation, so we need to track active mappings by cachelines.  Update
the active dma tracking to use the page-frame-relative cacheline of the
mapping as the key, and update debug_dma_assert_idle() to check for all
possible mapped cachelines for a given page.

However, the need to track active mappings is only relevant when the
dma-mapping is writable by the device.  In fact it is fairly standard
for read-only mappings to have hundreds or thousands of overlapping
mappings at once.  Limiting the overlap tracking to writable
(!DMA_TO_DEVICE) eliminates this class of false-positive overlap
reports.

Note, the radix gang lookup is sub-optimal.  It would be best if it
stopped fetching entries once the search passed a page boundary.
Nevertheless, this implementation does not perturb the original net_dma
failing case.  That is to say the extra overhead does not show up in
terms of making the failing case pass due to a timing change.

References:
http://marc.info/?l=linux-netdev&m=139232263419315&w=2
http://marc.info/?l=linux-netdev&m=139217088107122&w=2

Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
Reported-by: Dave Jones <davej@redhat.com>
Tested-by: Dave Jones <davej@redhat.com>
Tested-by: Sander Eikelenboom <linux@eikelenboom.it>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Francois Romieu <romieu@fr.zoreil.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Wei Liu <wei.liu2@citrix.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---

Changes in v2:
1/ replaced 'cln' acronym with 'cacheline'. Now 'cln' is only a local
   variable name

2/ renamed 'to_cln()' to 'to_cacheline_number()'

3/ made the storage for a 'cacheline number' phys_addr_t since...
   "unsigned long cln = page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT;"
   ...may drop bits on some archs/memory-models.

 lib/dma-debug.c |  131 ++++++++++++++++++++++++++++++++++++-------------------
 1 files changed, 85 insertions(+), 46 deletions(-)

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 2defd1308b04..98f2d7e91a91 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -424,111 +424,134 @@ void debug_dma_dump_mappings(struct device *dev)
 EXPORT_SYMBOL(debug_dma_dump_mappings);
 
 /*
- * For each page mapped (initial page in the case of
- * dma_alloc_coherent/dma_map_{single|page}, or each page in a
- * scatterlist) insert into this tree using the pfn as the key. At
+ * For each mapping (initial cacheline in the case of
+ * dma_alloc_coherent/dma_map_page, initial cacheline in each page of a
+ * scatterlist, or the cacheline specified in dma_map_single) insert
+ * into this tree using the cacheline as the key. At
  * dma_unmap_{single|sg|page} or dma_free_coherent delete the entry.  If
- * the pfn already exists at insertion time add a tag as a reference
+ * the entry already exists at insertion time add a tag as a reference
  * count for the overlapping mappings.  For now, the overlap tracking
- * just ensures that 'unmaps' balance 'maps' before marking the pfn
- * idle, but we should also be flagging overlaps as an API violation.
+ * just ensures that 'unmaps' balance 'maps' before marking the
+ * cacheline idle, but we should also be flagging overlaps as an API
+ * violation.
  *
  * Memory usage is mostly constrained by the maximum number of available
  * dma-debug entries in that we need a free dma_debug_entry before
- * inserting into the tree.  In the case of dma_map_{single|page} and
- * dma_alloc_coherent there is only one dma_debug_entry and one pfn to
- * track per event.  dma_map_sg(), on the other hand,
- * consumes a single dma_debug_entry, but inserts 'nents' entries into
- * the tree.
+ * inserting into the tree.  In the case of dma_map_page and
+ * dma_alloc_coherent there is only one dma_debug_entry and one
+ * dma_active_cacheline entry to track per event.  dma_map_sg(), on the
+ * other hand, consumes a single dma_debug_entry, but inserts 'nents'
+ * entries into the tree.
  *
  * At any time debug_dma_assert_idle() can be called to trigger a
- * warning if the given page is in the active set.
+ * warning if any cachelines in the given page are in the active set.
  */
-static RADIX_TREE(dma_active_pfn, GFP_NOWAIT);
+static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT);
 static DEFINE_SPINLOCK(radix_lock);
-#define ACTIVE_PFN_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1)
+#define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1)
+#define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT)
+#define CACHELINES_PER_PAGE (1 << CACHELINE_PER_PAGE_SHIFT)
 
-static int active_pfn_read_overlap(unsigned long pfn)
+static phys_addr_t to_cacheline_number(struct dma_debug_entry *entry)
+{
+	return (entry->pfn << CACHELINE_PER_PAGE_SHIFT) +
+		(entry->offset >> L1_CACHE_SHIFT);
+}
+
+static int active_cacheline_read_overlap(phys_addr_t cln)
 {
 	int overlap = 0, i;
 
 	for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--)
-		if (radix_tree_tag_get(&dma_active_pfn, pfn, i))
+		if (radix_tree_tag_get(&dma_active_cacheline, cln, i))
 			overlap |= 1 << i;
 	return overlap;
 }
 
-static int active_pfn_set_overlap(unsigned long pfn, int overlap)
+static int active_cacheline_set_overlap(phys_addr_t cln, int overlap)
 {
 	int i;
 
-	if (overlap > ACTIVE_PFN_MAX_OVERLAP || overlap < 0)
+	if (overlap > ACTIVE_CACHELINE_MAX_OVERLAP || overlap < 0)
 		return overlap;
 
 	for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--)
 		if (overlap & 1 << i)
-			radix_tree_tag_set(&dma_active_pfn, pfn, i);
+			radix_tree_tag_set(&dma_active_cacheline, cln, i);
 		else
-			radix_tree_tag_clear(&dma_active_pfn, pfn, i);
+			radix_tree_tag_clear(&dma_active_cacheline, cln, i);
 
 	return overlap;
 }
 
-static void active_pfn_inc_overlap(unsigned long pfn)
+static void active_cacheline_inc_overlap(phys_addr_t cln)
 {
-	int overlap = active_pfn_read_overlap(pfn);
+	int overlap = active_cacheline_read_overlap(cln);
 
-	overlap = active_pfn_set_overlap(pfn, ++overlap);
+	overlap = active_cacheline_set_overlap(cln, ++overlap);
 
 	/* If we overflowed the overlap counter then we're potentially
 	 * leaking dma-mappings.  Otherwise, if maps and unmaps are
 	 * balanced then this overflow may cause false negatives in
-	 * debug_dma_assert_idle() as the pfn may be marked idle
+	 * debug_dma_assert_idle() as the cacheline may be marked idle
 	 * prematurely.
 	 */
-	WARN_ONCE(overlap > ACTIVE_PFN_MAX_OVERLAP,
-		  "DMA-API: exceeded %d overlapping mappings of pfn %lx\n",
-		  ACTIVE_PFN_MAX_OVERLAP, pfn);
+	WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
+		  "DMA-API: exceeded %d overlapping mappings of cacheline %pa\n",
+		  ACTIVE_CACHELINE_MAX_OVERLAP, &cln);
 }
 
-static int active_pfn_dec_overlap(unsigned long pfn)
+static int active_cacheline_dec_overlap(phys_addr_t cln)
 {
-	int overlap = active_pfn_read_overlap(pfn);
+	int overlap = active_cacheline_read_overlap(cln);
 
-	return active_pfn_set_overlap(pfn, --overlap);
+	return active_cacheline_set_overlap(cln, --overlap);
 }
 
-static int active_pfn_insert(struct dma_debug_entry *entry)
+static int active_cacheline_insert(struct dma_debug_entry *entry)
 {
+	phys_addr_t cln = to_cacheline_number(entry);
 	unsigned long flags;
 	int rc;
 
+	/* If the device is not writing memory then we don't have any
+	 * concerns about the cpu consuming stale data.  This mitigates
+	 * legitimate usages of overlapping mappings.
+	 */
+	if (entry->direction == DMA_TO_DEVICE)
+		return 0;
+
 	spin_lock_irqsave(&radix_lock, flags);
-	rc = radix_tree_insert(&dma_active_pfn, entry->pfn, entry);
+	rc = radix_tree_insert(&dma_active_cacheline, cln, entry);
 	if (rc == -EEXIST)
-		active_pfn_inc_overlap(entry->pfn);
+		active_cacheline_inc_overlap(cln);
 	spin_unlock_irqrestore(&radix_lock, flags);
 
 	return rc;
 }
 
-static void active_pfn_remove(struct dma_debug_entry *entry)
+static void active_cacheline_remove(struct dma_debug_entry *entry)
 {
+	phys_addr_t cln = to_cacheline_number(entry);
 	unsigned long flags;
 
+	/* ...mirror the insert case */
+	if (entry->direction == DMA_TO_DEVICE)
+		return;
+
 	spin_lock_irqsave(&radix_lock, flags);
 	/* since we are counting overlaps the final put of the
-	 * entry->pfn will occur when the overlap count is 0.
-	 * active_pfn_dec_overlap() returns -1 in that case
+	 * cacheline will occur when the overlap count is 0.
+	 * active_cacheline_dec_overlap() returns -1 in that case
 	 */
-	if (active_pfn_dec_overlap(entry->pfn) < 0)
-		radix_tree_delete(&dma_active_pfn, entry->pfn);
+	if (active_cacheline_dec_overlap(cln) < 0)
+		radix_tree_delete(&dma_active_cacheline, cln);
 	spin_unlock_irqrestore(&radix_lock, flags);
 }
 
 /**
  * debug_dma_assert_idle() - assert that a page is not undergoing dma
- * @page: page to lookup in the dma_active_pfn tree
+ * @page: page to lookup in the dma_active_cacheline tree
  *
  * Place a call to this routine in cases where the cpu touching the page
  * before the dma completes (page is dma_unmapped) will lead to data
@@ -536,22 +559,38 @@ static void active_pfn_remove(struct dma_debug_entry *entry)
  */
 void debug_dma_assert_idle(struct page *page)
 {
+	static struct dma_debug_entry *ents[CACHELINES_PER_PAGE];
+	struct dma_debug_entry *entry = NULL;
+	void **results = (void **) &ents;
+	unsigned int nents, i;
 	unsigned long flags;
-	struct dma_debug_entry *entry;
+	phys_addr_t cln;
 
 	if (!page)
 		return;
 
+	cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT;
 	spin_lock_irqsave(&radix_lock, flags);
-	entry = radix_tree_lookup(&dma_active_pfn, page_to_pfn(page));
+	nents = radix_tree_gang_lookup(&dma_active_cacheline, results, cln,
+				       CACHELINES_PER_PAGE);
+	for (i = 0; i < nents; i++) {
+		phys_addr_t ent_cln = to_cacheline_number(ents[i]);
+
+		if (ent_cln == cln) {
+			entry = ents[i];
+			break;
+		} else if (ent_cln >= cln + CACHELINES_PER_PAGE)
+			break;
+	}
 	spin_unlock_irqrestore(&radix_lock, flags);
 
 	if (!entry)
 		return;
 
+	cln = to_cacheline_number(entry);
 	err_printk(entry->dev, entry,
-		   "DMA-API: cpu touching an active dma mapped page "
-		   "[pfn=0x%lx]\n", entry->pfn);
+		   "DMA-API: cpu touching an active dma mapped cacheline [cln=%pa]\n",
+		   &cln);
 }
 
 /*
@@ -568,9 +607,9 @@ static void add_dma_entry(struct dma_debug_entry *entry)
 	hash_bucket_add(bucket, entry);
 	put_hash_bucket(bucket, &flags);
 
-	rc = active_pfn_insert(entry);
+	rc = active_cacheline_insert(entry);
 	if (rc == -ENOMEM) {
-		pr_err("DMA-API: pfn tracking ENOMEM, dma-debug disabled\n");
+		pr_err("DMA-API: cacheline tracking ENOMEM, dma-debug disabled\n");
 		global_disable = true;
 	}
 
@@ -631,7 +670,7 @@ static void dma_entry_free(struct dma_debug_entry *entry)
 {
 	unsigned long flags;
 
-	active_pfn_remove(entry);
+	active_cacheline_remove(entry);
 
 	/*
 	 * add to beginning of the list - this way the entries are