summaryrefslogtreecommitdiffstats
path: root/kernfs-kernfs_notify-must-be-usable-from-non-sleepable-contexts.patch
blob: 83bacc342c42a72df2954f3dae1801d0ca9719e4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
Bugzilla: 1113805
Upstream-status: queued for 3.16                                                                                                                                                                                                                                                               
Delivered-To: jwboyer@gmail.com
Received: by 10.76.6.212 with SMTP id d20csp61002oaa;
        Tue, 1 Jul 2014 13:41:10 -0700 (PDT)
X-Received: by 10.70.36.203 with SMTP id s11mr216865pdj.30.1404247270266;
        Tue, 01 Jul 2014 13:41:10 -0700 (PDT)
Return-Path: <htejun@gmail.com>
Received: from bastion.fedoraproject.org (bastion02.fedoraproject.org. [209.132.181.3])
        by mx.google.com with ESMTP id kk10si24969097pbd.238.2014.07.01.13.41.09
        for <jwboyer@gmail.com>;
        Tue, 01 Jul 2014 13:41:10 -0700 (PDT)
Received-SPF: softfail (google.com: domain of transitioning htejun@gmail.com does not designate 209.132.181.3 as permitted sender) client-ip=209.132.181.3;
Authentication-Results: mx.google.com;
       spf=softfail (google.com: domain of transitioning htejun@gmail.com does not designate 209.132.181.3 as permitted sender) smtp.mail=htejun@gmail.com;
       dkim=pass header.i=@gmail.com
Received: by bastion02.phx2.fedoraproject.org (Postfix)
	id B10D940254; Tue,  1 Jul 2014 20:41:09 +0000 (UTC)
Delivered-To: jwboyer@fedoraproject.org
Received: from mx1.redhat.com (ext-mx11.extmail.prod.ext.phx2.redhat.com [10.5.110.16])
	by bastion02.phx2.fedoraproject.org (Postfix) with ESMTP id B85F34023B
	for <jwboyer@fedoraproject.org>; Tue,  1 Jul 2014 20:41:08 +0000 (UTC)
Received: from mail-qg0-f54.google.com (mail-qg0-f54.google.com [209.85.192.54])
	by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id s61Kf6SO012731
	(version=TLSv1/SSLv3 cipher=RC4-SHA bits=128 verify=FAIL)
	for <jwboyer@fedoraproject.org>; Tue, 1 Jul 2014 16:41:07 -0400
Received: by mail-qg0-f54.google.com with SMTP id q107so4001033qgd.13
        for <jwboyer@fedoraproject.org>; Tue, 01 Jul 2014 13:41:06 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=gmail.com; s=20120113;
        h=sender:date:from:to:cc:subject:message-id:references:mime-version
         :content-type:content-disposition:in-reply-to:user-agent;
        bh=6XKZNT2j9E5fPfYRAGlZGjGhRFI/mJ2H4GG/9wx0YJ8=;
        b=y5rarhV+vMKAm+ta99eylQ+byg9AD0erza6YL9fdzP/Zn1StotASphxOFR5kgPTecY
         QDAy7UTOje/NG0GtGLPU7r1rIT77mUNCSYhALFpG6yUHQmMlX5Lo+BuXQSOJBoIGefcv
         mLwG3YwsHrmE8JEov4fJvU67VO2ZPq2GW1iTASOea3qn/f6595XqdIUHzpHjvmz/aX9R
         nxAxkE+VGEZDm1Rp72rlwSSh2MJ5k3fbprLW8Ec41NyV2u5mehYHTgEf5DCu4JodmlHY
         j94xfjqvm7HIqNtKGB3aTsNl2L7+dGDZ0dwbdo2/WRINOhGkPuNnJ5wqS9YaDDaQG4ND
         HQ0w==
X-Received: by 10.140.22.134 with SMTP id 6mr70449172qgn.4.1404247266223;
        Tue, 01 Jul 2014 13:41:06 -0700 (PDT)
Received: from htj.dyndns.org (207-38-225-25.c3-0.43d-ubr1.qens-43d.ny.cable.rcn.com. [207.38.225.25])
        by mx.google.com with ESMTPSA id q12sm39024393qad.40.2014.07.01.13.41.04
        for <multiple recipients>
        (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128);
        Tue, 01 Jul 2014 13:41:05 -0700 (PDT)
Sender: Tejun Heo <htejun@gmail.com>
Date: Tue, 1 Jul 2014 16:41:03 -0400
From: Tejun Heo <tj@kernel.org>
To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jens Axboe <axboe@kernel.dk>, "Michael S. Tsirkin" <mst@redhat.com>,
        Christoph Hellwig <hch@lst.de>, Josh Boyer <jwboyer@fedoraproject.org>,
        Rusty Russell <rusty@rustcorp.com.au>,
        virtualization@lists.linux-foundation.org,
        "Linux-Kernel@Vger. Kernel. Org" <linux-kernel@vger.kernel.org>,
        Brian Lane <bcl@redhat.com>, John McCutchan <john@johnmccutchan.com>,
        Robert Love <rlove@rlove.org>, Eric Paris <eparis@parisplace.org>
Subject: [PATCH driver-core-linus] kernfs: kernfs_notify() must be useable
 from non-sleepable contexts
Message-ID: <20140701204103.GA12459@htj.dyndns.org>
References: <CA+5PVA4h2hQJLrAm4eEZ8oUgGiAjq2y5eAyBcJWxK_FiQXnuQg@mail.gmail.com>
 <20140629082637.GA23942@redhat.com>
 <20140629193222.GA7030@lst.de>
 <20140629204710.GB11100@redhat.com>
 <53B07D48.60003@kernel.dk>
 <20140630201741.GA20853@htj.dyndns.org>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20140630201741.GA20853@htj.dyndns.org>
User-Agent: Mutt/1.5.23 (2014-03-12)
X-RedHat-Spam-Score: -2.31  (BAYES_00,DCC_REPUT_00_12,DKIM_SIGNED,DKIM_VALID,FREEMAIL_FROM,RCVD_IN_DNSWL_NONE,SPF_PASS)
X-Scanned-By: MIMEDefang 2.68 on 10.5.110.16

d911d9874801 ("kernfs: make kernfs_notify() trigger inotify events
too") added fsnotify triggering to kernfs_notify() which requires a
sleepable context.  There are already existing users of
kernfs_notify() which invoke it from an atomic context and in general
it's silly to require a sleepable context for triggering a
notification.

The following is an invalid context bug triggerd by md invoking
sysfs_notify() from IO completion path.

 BUG: sleeping function called from invalid context at kernel/locking/mutex.c:586
 in_atomic(): 1, irqs_disabled(): 1, pid: 0, name: swapper/1
 2 locks held by swapper/1/0:
  #0:  (&(&vblk->vq_lock)->rlock){-.-...}, at: [<ffffffffa0039042>] virtblk_done+0x42/0xe0 [virtio_blk]
  #1:  (&(&bitmap->counts.lock)->rlock){-.....}, at: [<ffffffff81633718>] bitmap_endwrite+0x68/0x240
 irq event stamp: 33518
 hardirqs last  enabled at (33515): [<ffffffff8102544f>] default_idle+0x1f/0x230
 hardirqs last disabled at (33516): [<ffffffff818122ed>] common_interrupt+0x6d/0x72
 softirqs last  enabled at (33518): [<ffffffff810a1272>] _local_bh_enable+0x22/0x50
 softirqs last disabled at (33517): [<ffffffff810a29e0>] irq_enter+0x60/0x80
 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.16.0-0.rc2.git2.1.fc21.x86_64 #1
 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
  0000000000000000 f90db13964f4ee05 ffff88007d403b80 ffffffff81807b4c
  0000000000000000 ffff88007d403ba8 ffffffff810d4f14 0000000000000000
  0000000000441800 ffff880078fa1780 ffff88007d403c38 ffffffff8180caf2
 Call Trace:
  <IRQ>  [<ffffffff81807b4c>] dump_stack+0x4d/0x66
  [<ffffffff810d4f14>] __might_sleep+0x184/0x240
  [<ffffffff8180caf2>] mutex_lock_nested+0x42/0x440
  [<ffffffff812d76a0>] kernfs_notify+0x90/0x150
  [<ffffffff8163377c>] bitmap_endwrite+0xcc/0x240
  [<ffffffffa00de863>] close_write+0x93/0xb0 [raid1]
  [<ffffffffa00df029>] r1_bio_write_done+0x29/0x50 [raid1]
  [<ffffffffa00e0474>] raid1_end_write_request+0xe4/0x260 [raid1]
  [<ffffffff813acb8b>] bio_endio+0x6b/0xa0
  [<ffffffff813b46c4>] blk_update_request+0x94/0x420
  [<ffffffff813bf0ea>] blk_mq_end_io+0x1a/0x70
  [<ffffffffa00392c2>] virtblk_request_done+0x32/0x80 [virtio_blk]
  [<ffffffff813c0648>] __blk_mq_complete_request+0x88/0x120
  [<ffffffff813c070a>] blk_mq_complete_request+0x2a/0x30
  [<ffffffffa0039066>] virtblk_done+0x66/0xe0 [virtio_blk]
  [<ffffffffa002535a>] vring_interrupt+0x3a/0xa0 [virtio_ring]
  [<ffffffff81116177>] handle_irq_event_percpu+0x77/0x340
  [<ffffffff8111647d>] handle_irq_event+0x3d/0x60
  [<ffffffff81119436>] handle_edge_irq+0x66/0x130
  [<ffffffff8101c3e4>] handle_irq+0x84/0x150
  [<ffffffff818146ad>] do_IRQ+0x4d/0xe0
  [<ffffffff818122f2>] common_interrupt+0x72/0x72
  <EOI>  [<ffffffff8105f706>] ? native_safe_halt+0x6/0x10
  [<ffffffff81025454>] default_idle+0x24/0x230
  [<ffffffff81025f9f>] arch_cpu_idle+0xf/0x20
  [<ffffffff810f5adc>] cpu_startup_entry+0x37c/0x7b0
  [<ffffffff8104df1b>] start_secondary+0x25b/0x300

This patch fixes it by punting the notification delivery through a
work item.  This ends up adding an extra pointer to kernfs_elem_attr
enlarging kernfs_node by a pointer, which is not ideal but not a very
big deal either.  If this turns out to be an actual issue, we can move
kernfs_elem_attr->size to kernfs_node->iattr later.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Josh Boyer <jwboyer@fedoraproject.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
---
 fs/kernfs/file.c       |   69 +++++++++++++++++++++++++++++++++++++++----------
 include/linux/kernfs.h |    1 
 2 files changed, 56 insertions(+), 14 deletions(-)

--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -39,6 +39,19 @@ struct kernfs_open_node {
 	struct list_head	files; /* goes through kernfs_open_file.list */
 };
 
+/*
+ * kernfs_notify() may be called from any context and bounces notifications
+ * through a work item.  To minimize space overhead in kernfs_node, the
+ * pending queue is implemented as a singly linked list of kernfs_nodes.
+ * The list is terminated with the self pointer so that whether a
+ * kernfs_node is on the list or not can be determined by testing the next
+ * pointer for NULL.
+ */
+#define KERNFS_NOTIFY_EOL			((void *)&kernfs_notify_list)
+
+static DEFINE_SPINLOCK(kernfs_notify_lock);
+static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
+
 static struct kernfs_open_file *kernfs_of(struct file *file)
 {
 	return ((struct seq_file *)file->private_data)->private;
@@ -783,24 +796,25 @@ static unsigned int kernfs_fop_poll(stru
 	return DEFAULT_POLLMASK|POLLERR|POLLPRI;
 }
 
-/**
- * kernfs_notify - notify a kernfs file
- * @kn: file to notify
- *
- * Notify @kn such that poll(2) on @kn wakes up.
- */
-void kernfs_notify(struct kernfs_node *kn)
+static void kernfs_notify_workfn(struct work_struct *work)
 {
-	struct kernfs_root *root = kernfs_root(kn);
+	struct kernfs_node *kn;
 	struct kernfs_open_node *on;
 	struct kernfs_super_info *info;
-	unsigned long flags;
-
-	if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
+repeat:
+	/* pop one off the notify_list */
+	spin_lock_irq(&kernfs_notify_lock);
+	kn = kernfs_notify_list;
+	if (kn == KERNFS_NOTIFY_EOL) {
+		spin_unlock_irq(&kernfs_notify_lock);
 		return;
+	}
+	kernfs_notify_list = kn->attr.notify_next;
+	kn->attr.notify_next = NULL;
+	spin_unlock_irq(&kernfs_notify_lock);
 
 	/* kick poll */
-	spin_lock_irqsave(&kernfs_open_node_lock, flags);
+	spin_lock_irq(&kernfs_open_node_lock);
 
 	on = kn->attr.open;
 	if (on) {
@@ -808,12 +822,12 @@ void kernfs_notify(struct kernfs_node *k
 		wake_up_interruptible(&on->poll);
 	}
 
-	spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+	spin_unlock_irq(&kernfs_open_node_lock);
 
 	/* kick fsnotify */
 	mutex_lock(&kernfs_mutex);
 
-	list_for_each_entry(info, &root->supers, node) {
+	list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
 		struct inode *inode;
 		struct dentry *dentry;
 
@@ -833,6 +847,33 @@ void kernfs_notify(struct kernfs_node *k
 	}
 
 	mutex_unlock(&kernfs_mutex);
+	kernfs_put(kn);
+	goto repeat;
+}
+
+/**
+ * kernfs_notify - notify a kernfs file
+ * @kn: file to notify
+ *
+ * Notify @kn such that poll(2) on @kn wakes up.  Maybe be called from any
+ * context.
+ */
+void kernfs_notify(struct kernfs_node *kn)
+{
+	static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn);
+	unsigned long flags;
+
+	if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
+		return;
+
+	spin_lock_irqsave(&kernfs_notify_lock, flags);
+	if (!kn->attr.notify_next) {
+		kernfs_get(kn);
+		kn->attr.notify_next = kernfs_notify_list;
+		kernfs_notify_list = kn;
+		schedule_work(&kernfs_notify_work);
+	}
+	spin_unlock_irqrestore(&kernfs_notify_lock, flags);
 }
 EXPORT_SYMBOL_GPL(kernfs_notify);
 
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -91,6 +91,7 @@ struct kernfs_elem_attr {
 	const struct kernfs_ops	*ops;
 	struct kernfs_open_node	*open;
 	loff_t			size;
+	struct kernfs_node	*notify_next;	/* for kernfs_notify() */
 };
 
 /*