1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
|
From: Dmitry Monakhov <dmonakhov@xxxxxxxx>
Date: Thu, 9 Oct 2014 15:14:47 +0400
Subject: [PATCH] ext4: fix race between write and fcntl(F_SETFL)
O_DIRECT flags can be toggeled via fcntl(F_SETFL).
But this value checked twice inside ext4_file_write_iter() and __generic_file_write()
which result in BUG_ON (see typical stack trace below)
In order to fix this we have to use our own copy of __generic_file_write and
pass o_direct status explicitly.
TESTCASE: xfstest:generic/326 (http://patchwork.ozlabs.org/patch/397949/)
kernel BUG at fs/ext4/inode.c:2960!
invalid opcode: 0000 [#1] SMP
Modules linked in: brd iTCO_wdt lpc_ich mfd_core igb ptp dm_mirror dm_region_hash dm_log dm_mod
CPU: 6 PID: 5505 Comm: aio-dio-fcntl-r Not tainted 3.17.0-rc2-00176-gff5c017 #161
Hardware name: Intel Corporation W2600CR/W2600CR, BIOS SE5C600.86B.99.99.x028.061320111235 06/13/2011
task: ffff88080e95a7c0 ti: ffff88080f908000 task.ti: ffff88080f908000
RIP: 0010:[<ffffffff811fabf2>] [<ffffffff811fabf2>] ext4_direct_IO+0x162/0x3d0
RSP: 0018:ffff88080f90bb58 EFLAGS: 00010246
RAX: 0000000000000400 RBX: ffff88080fdb2a28 RCX: 00000000a802c818
RDX: 0000040000080000 RSI: ffff88080d8aeb80 RDI: 0000000000000001
RBP: ffff88080f90bbc8 R08: 0000000000000000 R09: 0000000000001581
R10: 0000000000000000 R11: 0000000000000000 R12: ffff88080d8aeb80
R13: ffff88080f90bbf8 R14: ffff88080fdb28c8 R15: ffff88080fdb2a28
FS: 00007f23b2055700(0000) GS:ffff880818400000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f23b2045000 CR3: 000000080cedf000 CR4: 00000000000407e0
Stack:
ffff88080f90bb98 0000000000000000 7ffffffffffffffe ffff88080fdb2c30
0000000000000200 0000000000000200 0000000000000001 0000000000000200
ffff88080f90bbc8 ffff88080fdb2c30 ffff88080f90be08 0000000000000200
Call Trace:
[<ffffffff8112ca9d>] generic_file_direct_write+0xed/0x180
[<ffffffff8112f2b2>] __generic_file_write_iter+0x222/0x370
[<ffffffff811f495b>] ext4_file_write_iter+0x34b/0x400
[<ffffffff811bd709>] ? aio_run_iocb+0x239/0x410
[<ffffffff811bd709>] ? aio_run_iocb+0x239/0x410
[<ffffffff810990e5>] ? local_clock+0x25/0x30
[<ffffffff810abd94>] ? __lock_acquire+0x274/0x700
[<ffffffff811f4610>] ? ext4_unwritten_wait+0xb0/0xb0
[<ffffffff811bd756>] aio_run_iocb+0x286/0x410
[<ffffffff810990e5>] ? local_clock+0x25/0x30
[<ffffffff810ac359>] ? lock_release_holdtime+0x29/0x190
[<ffffffff811bc05b>] ? lookup_ioctx+0x4b/0xf0
[<ffffffff811bde3b>] do_io_submit+0x55b/0x740
[<ffffffff811bdcaa>] ? do_io_submit+0x3ca/0x740
[<ffffffff811be030>] SyS_io_submit+0x10/0x20
[<ffffffff815ce192>] system_call_fastpath+0x16/0x1b
Code: 01 48 8b 80 f0 01 00 00 48 8b 18 49 8b 45 10 0f 85 f1 01 00 00 48 03 45 c8 48 3b 43 48 0f 8f e3 01 00 00 49 83 7c 24 18 00 75 04 <0f> 0b eb fe f0 ff 83 ec 01 00 00 49 8b 44 24 18 8b 00 85 c0 89
RIP [<ffffffff811fabf2>] ext4_direct_IO+0x162/0x3d0
RSP <ffff88080f90bb58>
Upstream-status: Submitted but likely not accepted
Bugzilla: 1152608
Reported-by: Sasha Levin <sasha.levin@xxxxxxxxxx>
Signed-off-by: Dmitry Monakhov <dmonakhov@xxxxxxxxxx>
---
fs/ext4/file.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 95 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index aca7b24a4432..8477eb259809 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -88,6 +88,100 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
return 0;
}
+/**
+ * copy of __generic_file_write_iter with explicit O_DIRECT status
+ * @iocb: IO state structure (file, offset, etc.)
+ * @from: iov_iter with data to write
+ * @direct: perform O_DIRECT IO
+ */
+static ssize_t
+__ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from, int direct)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ loff_t pos = iocb->ki_pos;
+ ssize_t written = 0;
+ ssize_t err;
+ ssize_t status;
+ size_t count = iov_iter_count(from);
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = mapping->backing_dev_info;
+ err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+ if (err)
+ goto out;
+
+ if (count == 0)
+ goto out;
+
+ iov_iter_truncate(from, count);
+
+ err = file_remove_suid(file);
+ if (err)
+ goto out;
+
+ err = file_update_time(file);
+ if (err)
+ goto out;
+
+ /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
+ if (unlikely(direct)) {
+ loff_t endbyte;
+
+ written = generic_file_direct_write(iocb, from, pos);
+ if (written < 0 || written == count)
+ goto out;
+
+ /*
+ * direct-io write to a hole: fall through to buffered I/O
+ * for completing the rest of the request.
+ */
+ pos += written;
+ count -= written;
+
+ status = generic_perform_write(file, from, pos);
+ /*
+ * If generic_perform_write() returned a synchronous error
+ * then we want to return the number of bytes which were
+ * direct-written, or the error code if that was zero. Note
+ * that this differs from normal direct-io semantics, which
+ * will return -EFOO even if some bytes were written.
+ */
+ if (unlikely(status < 0)) {
+ err = status;
+ goto out;
+ }
+ iocb->ki_pos = pos + status;
+ /*
+ * We need to ensure that the page cache pages are written to
+ * disk and invalidated to preserve the expected O_DIRECT
+ * semantics.
+ */
+ endbyte = pos + status - 1;
+ err = filemap_write_and_wait_range(file->f_mapping, pos,
+ endbyte);
+ if (err == 0) {
+ written += status;
+ invalidate_mapping_pages(mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
+ } else {
+ /*
+ * We don't know how much we wrote, so just return
+ * the number of bytes which were direct-written
+ */
+ }
+ } else {
+ written = generic_perform_write(file, from, pos);
+ if (likely(written >= 0))
+ iocb->ki_pos = pos + written;
+ }
+out:
+ current->backing_dev_info = NULL;
+ return written ? written : err;
+}
+
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
@@ -172,7 +266,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
}
- ret = __generic_file_write_iter(iocb, from);
+ ret = __ext4_file_write_iter(iocb, from, o_direct);
mutex_unlock(&inode->i_mutex);
if (ret > 0) {
--
1.9.3
|