From 35c80d5f400f68f2eccf3069d1c068e154bde9c9 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 15 Apr 2009 13:22:38 -0400 Subject: Add block_write_full_page_endio for passing endio handler block_write_full_page doesn't allow the caller to control what happens when the IO is over. This adds a new call named block_write_full_page_endio so the buffer head end_io handler can be provided by the caller. This will be used by the ext3 data=guarded mode to do i_size updates in a workqueue based end_io handler. end_buffer_async_write is also exported so it can be called to do the dirty work of managing page writeback for the higher level end_io handler. Signed-off-by: Chris Mason Acked-by: Theodore Tso Acked-by: Jan Kara Signed-off-by: Linus Torvalds --- fs/buffer.c | 45 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index ff8bb1f2333..b3e5be7514f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -360,7 +360,7 @@ still_busy: * Completion handler for block_write_full_page() - pages which are unlocked * during I/O, and which have PageWriteback cleared upon I/O completion. */ -static void end_buffer_async_write(struct buffer_head *bh, int uptodate) +void end_buffer_async_write(struct buffer_head *bh, int uptodate) { char b[BDEVNAME_SIZE]; unsigned long flags; @@ -438,11 +438,17 @@ static void mark_buffer_async_read(struct buffer_head *bh) set_buffer_async_read(bh); } -void mark_buffer_async_write(struct buffer_head *bh) +void mark_buffer_async_write_endio(struct buffer_head *bh, + bh_end_io_t *handler) { - bh->b_end_io = end_buffer_async_write; + bh->b_end_io = handler; set_buffer_async_write(bh); } + +void mark_buffer_async_write(struct buffer_head *bh) +{ + mark_buffer_async_write_endio(bh, end_buffer_async_write); +} EXPORT_SYMBOL(mark_buffer_async_write); @@ -1615,7 +1621,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata); * unplugging the device queue. */ static int __block_write_full_page(struct inode *inode, struct page *page, - get_block_t *get_block, struct writeback_control *wbc) + get_block_t *get_block, struct writeback_control *wbc, + bh_end_io_t *handler) { int err; sector_t block; @@ -1700,7 +1707,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, continue; } if (test_clear_buffer_dirty(bh)) { - mark_buffer_async_write(bh); + mark_buffer_async_write_endio(bh, handler); } else { unlock_buffer(bh); } @@ -1753,7 +1760,7 @@ recover: if (buffer_mapped(bh) && buffer_dirty(bh) && !buffer_delay(bh)) { lock_buffer(bh); - mark_buffer_async_write(bh); + mark_buffer_async_write_endio(bh, handler); } else { /* * The buffer may have been set dirty during @@ -2679,7 +2686,8 @@ int nobh_writepage(struct page *page, get_block_t *get_block, out: ret = mpage_writepage(page, get_block, wbc); if (ret == -EAGAIN) - ret = __block_write_full_page(inode, page, get_block, wbc); + ret = __block_write_full_page(inode, page, get_block, wbc, + end_buffer_async_write); return ret; } EXPORT_SYMBOL(nobh_writepage); @@ -2837,9 +2845,10 @@ out: /* * The generic ->writepage function for buffer-backed address_spaces + * this form passes in the end_io handler used to finish the IO. */ -int block_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) +int block_write_full_page_endio(struct page *page, get_block_t *get_block, + struct writeback_control *wbc, bh_end_io_t *handler) { struct inode * const inode = page->mapping->host; loff_t i_size = i_size_read(inode); @@ -2848,7 +2857,8 @@ int block_write_full_page(struct page *page, get_block_t *get_block, /* Is the page fully inside i_size? */ if (page->index < end_index) - return __block_write_full_page(inode, page, get_block, wbc); + return __block_write_full_page(inode, page, get_block, wbc, + handler); /* Is the page fully outside i_size? (truncate in progress) */ offset = i_size & (PAGE_CACHE_SIZE-1); @@ -2871,9 +2881,20 @@ int block_write_full_page(struct page *page, get_block_t *get_block, * writes to that region are not written out to the file." */ zero_user_segment(page, offset, PAGE_CACHE_SIZE); - return __block_write_full_page(inode, page, get_block, wbc); + return __block_write_full_page(inode, page, get_block, wbc, handler); } +/* + * The generic ->writepage function for buffer-backed address_spaces + */ +int block_write_full_page(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) +{ + return block_write_full_page_endio(page, get_block, wbc, + end_buffer_async_write); +} + + sector_t generic_block_bmap(struct address_space *mapping, sector_t block, get_block_t *get_block) { @@ -3342,9 +3363,11 @@ EXPORT_SYMBOL(block_read_full_page); EXPORT_SYMBOL(block_sync_page); EXPORT_SYMBOL(block_truncate_page); EXPORT_SYMBOL(block_write_full_page); +EXPORT_SYMBOL(block_write_full_page_endio); EXPORT_SYMBOL(cont_write_begin); EXPORT_SYMBOL(end_buffer_read_sync); EXPORT_SYMBOL(end_buffer_write_sync); +EXPORT_SYMBOL(end_buffer_async_write); EXPORT_SYMBOL(file_fsync); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_cont_expand_simple); -- cgit From d110271e1f4140a9fb06d968b1afe9ca56a6064e Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Wed, 25 Mar 2009 15:11:36 -0600 Subject: sysfs: don't use global workqueue in sysfs_schedule_callback() A sysfs attribute using sysfs_schedule_callback() to commit suicide may end up calling device_unregister(), which will eventually call a driver's ->remove function. Drivers may call flush_scheduled_work() in their shutdown routines, in which case lockdep will complain with something like the following: ============================================= [ INFO: possible recursive locking detected ] 2.6.29-rc8-kk #1 --------------------------------------------- events/4/56 is trying to acquire lock: (events){--..}, at: [] flush_workqueue+0x0/0xa0 but task is already holding lock: (events){--..}, at: [] run_workqueue+0x108/0x230 other info that might help us debug this: 3 locks held by events/4/56: #0: (events){--..}, at: [] run_workqueue+0x108/0x230 #1: (&ss->work){--..}, at: [] run_workqueue+0x108/0x230 #2: (pci_remove_rescan_mutex){--..}, at: [] remove_callback+0x21/0x40 stack backtrace: Pid: 56, comm: events/4 Not tainted 2.6.29-rc8-kk #1 Call Trace: [] validate_chain+0xb7d/0x1260 [] __lock_acquire+0x42e/0xa40 [] lock_acquire+0x58/0x80 [] ? flush_workqueue+0x0/0xa0 [] flush_workqueue+0x4d/0xa0 [] ? flush_workqueue+0x0/0xa0 [] flush_scheduled_work+0x10/0x20 [] e1000_remove+0x55/0xfe [e1000e] [] ? sysfs_schedule_callback_work+0x0/0x50 [] pci_device_remove+0x32/0x70 [] __device_release_driver+0x59/0x90 [] device_release_driver+0x2b/0x40 [] bus_remove_device+0xa6/0x120 [] device_del+0x12b/0x190 [] device_unregister+0x26/0x70 [] pci_stop_dev+0x49/0x60 [] pci_remove_bus_device+0x40/0xc0 [] remove_callback+0x29/0x40 [] sysfs_schedule_callback_work+0x1f/0x50 [] run_workqueue+0x15a/0x230 [] ? run_workqueue+0x108/0x230 [] worker_thread+0x9f/0x100 [] ? autoremove_wake_function+0x0/0x40 [] ? worker_thread+0x0/0x100 [] kthread+0x4d/0x80 [] child_rip+0xa/0x20 [] ? restore_args+0x0/0x30 [] ? kthread+0x0/0x80 [] ? child_rip+0x0/0x20 Although we know that the device_unregister path will never acquire a lock that a driver might try to acquire in its ->remove, in general we should never attempt to flush a workqueue from within the same workqueue, and lockdep rightly complains. So as long as sysfs attributes cannot commit suicide directly and we are stuck with this callback mechanism, put the sysfs callbacks on their own workqueue instead of the global one. This has the side benefit that if a suicidal sysfs attribute kicks off a long chain of ->remove callbacks, we no longer induce a long delay on the global queue. This also fixes a missing module_put in the error path introduced by sysfs-only-allow-one-scheduled-removal-callback-per-kobj.patch. We never destroy the workqueue, but I'm not sure that's a problem. Reported-by: Kenji Kaneshige Tested-by: Kenji Kaneshige Signed-off-by: Alex Chiang Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 289c43a4726..979e9379fb5 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -667,6 +667,7 @@ struct sysfs_schedule_callback_struct { struct work_struct work; }; +static struct workqueue_struct *sysfs_workqueue; static DEFINE_MUTEX(sysfs_workq_mutex); static LIST_HEAD(sysfs_workq); static void sysfs_schedule_callback_work(struct work_struct *work) @@ -715,11 +716,20 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), mutex_lock(&sysfs_workq_mutex); list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) if (ss->kobj == kobj) { + module_put(owner); mutex_unlock(&sysfs_workq_mutex); return -EAGAIN; } mutex_unlock(&sysfs_workq_mutex); + if (sysfs_workqueue == NULL) { + sysfs_workqueue = create_workqueue("sysfsd"); + if (sysfs_workqueue == NULL) { + module_put(owner); + return -ENOMEM; + } + } + ss = kmalloc(sizeof(*ss), GFP_KERNEL); if (!ss) { module_put(owner); @@ -735,7 +745,7 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), mutex_lock(&sysfs_workq_mutex); list_add_tail(&ss->workq_list, &sysfs_workq); mutex_unlock(&sysfs_workq_mutex); - schedule_work(&ss->work); + queue_work(sysfs_workqueue, &ss->work); return 0; } EXPORT_SYMBOL_GPL(sysfs_schedule_callback); -- cgit From 1af3557abdef34ee036a6de4cb79e24468544b8d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 9 Apr 2009 13:53:22 +0900 Subject: sysfs: sysfs poll keep the poll rule of regular file. Currently, following test programs don't finished. % ruby -e ' Thread.new { sleep } File.read("/sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies") ' strace expose the reason. ... open("/sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies", O_RDONLY|O_LARGEFILE) = 3 ioctl(3, SNDCTL_TMR_TIMEBASE or TCGETS, 0xbf9fa6b8) = -1 ENOTTY (Inappropriate ioctl for device) fstat64(3, {st_mode=S_IFREG|0444, st_size=4096, ...}) = 0 _llseek(3, 0, [0], SEEK_CUR) = 0 select(4, [3], NULL, NULL, NULL) = 1 (in [3]) read(3, "1400000 1300000 1200000 1100000 1"..., 4096) = 62 select(4, [3], NULL, NULL, NULL Because Ruby (the scripting language) VM assume select system-call against regular file don't block. it because SUSv3 says "Regular files shall always poll TRUE for reading and writing". see http://www.opengroup.org/onlinepubs/009695399/functions/poll.html it seems valid assumption. But sysfs_poll() don't keep this rule although sysfs file can read and write always. This patch restore proper poll behavior to sysfs. /sys/block/md*/md/sync_action polling application and another sysfs updating sensitive application still can use POLLERR and POLLPRI. Cc: Neil Brown Signed-off-by: KOSAKI Motohiro Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 979e9379fb5..b1606e07b7a 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -446,11 +446,11 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait) if (buffer->event != atomic_read(&od->event)) goto trigger; - return 0; + return DEFAULT_POLLMASK; trigger: buffer->needs_read_fill = 1; - return POLLERR|POLLPRI; + return DEFAULT_POLLMASK|POLLERR|POLLPRI; } void sysfs_notify_dirent(struct sysfs_dirent *sd) -- cgit From 31b07093c44a7a442394d44423e21d783f5523b8 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 9 Apr 2009 13:57:59 +0900 Subject: proc: mounts_poll() make consistent to mdstat_poll In recently sysfs_poll discussion, Neil Brown pointed out /proc/mounts also should be fixed. SUSv3 says "Regular files shall always poll TRUE for reading and writing". see http://www.opengroup.org/onlinepubs/009695399/functions/poll.html Then, mounts_poll()'s default should be "POLLIN | POLLRDNORM". it mean always readable. In addition, event trigger should use "POLLERR | POLLPRI" instead POLLERR. it makes consistent to mdstat_poll() and sysfs_poll(). and, select(2) can handle POLLPRI easily. Reported-by: Neil Brown Signed-off-by: KOSAKI Motohiro Cc: Ram Pai Cc: Miklos Szeredi Cc: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/proc/base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index f71559784bf..aa763ab0077 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -648,14 +648,14 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) { struct proc_mounts *p = file->private_data; struct mnt_namespace *ns = p->ns; - unsigned res = 0; + unsigned res = POLLIN | POLLRDNORM; poll_wait(file, &ns->poll, wait); spin_lock(&vfsmount_lock); if (p->event != ns->event) { p->event = ns->event; - res = POLLERR; + res |= POLLERR | POLLPRI; } spin_unlock(&vfsmount_lock); -- cgit From b80901bbf599553f483b9509f2dce416b938aae8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 16 Apr 2009 19:09:55 -0700 Subject: splice: fix new kernel-doc warnings splice: fix kernel-doc warnings Warning(fs/splice.c:617): bad line: Warning(fs/splice.c:722): No description found for parameter 'sd' Warning(fs/splice.c:722): Excess function parameter 'pipe' description in 'splice_from_pipe_begin' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- fs/splice.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 5384a90665d..666953d59a3 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -614,7 +614,6 @@ static void wakeup_pipe_writers(struct pipe_inode_info *pipe) * @actor: handler that splices the data * * Description: - * This function loops over the pipe and calls @actor to do the * actual moving of a single struct pipe_buffer to the desired * destination. It returns when there's no more buffers left in @@ -711,7 +710,7 @@ EXPORT_SYMBOL(splice_from_pipe_next); /** * splice_from_pipe_begin - start splicing from pipe - * @pipe: pipe to splice from + * @sd: information about the splice operation * * Description: * This function should be called before a loop containing -- cgit From d29a2e943867bfa48f72ee6e99723a1b29fe6f7e Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 17 Apr 2009 12:22:35 +0100 Subject: vfat: Note the NLS requirement Close bug #4754. Stop people getting into a situation where they can't get their FAT filesystems to mount as they expect. Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- fs/fat/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index d0a69ff2537..182f9ffe2b5 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -95,3 +95,6 @@ config FAT_DEFAULT_IOCHARSET Note that "utf8" is not recommended for FAT filesystems. If unsure, you shouldn't set "utf8" here. See for more information. + + Enable any character sets you need in File Systems/Native Language + Support. -- cgit From 6566abdbd0566fc1b5950c9f87ef57c7443d6fa8 Mon Sep 17 00:00:00 2001 From: Matt Kraai Date: Fri, 17 Apr 2009 12:56:38 +0100 Subject: AFS: Guard afs_file_readpage_read_complete() definition with CONFIG_AFS_FSCACHE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_AFS_FSCACHE is not defined, the following warning is displayed when fs/afs/file.c is compiled: fs/afs/file.c:111: warning: ‘afs_file_readpage_read_complete’ defined but not used This occurs because all calls to this function are guarded by CONFIG_AFS_FSCACHE. Thus, guard its definition as well. Signed-off-by: Matt Kraai Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/afs/file.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/afs/file.c b/fs/afs/file.c index 7a1d942ef68..0149dab365e 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -102,6 +102,7 @@ int afs_release(struct inode *inode, struct file *file) return 0; } +#ifdef CONFIG_AFS_FSCACHE /* * deal with notification that a page was read from the cache */ @@ -117,6 +118,7 @@ static void afs_file_readpage_read_complete(struct page *page, SetPageUptodate(page); unlock_page(page); } +#endif /* * AFS read page from file, directory or symlink -- cgit