From 35c80d5f400f68f2eccf3069d1c068e154bde9c9 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 15 Apr 2009 13:22:38 -0400
Subject: Add block_write_full_page_endio for passing endio handler

block_write_full_page doesn't allow the caller to control what happens
when the IO is over.  This adds a new call named block_write_full_page_endio
so the buffer head end_io handler can be provided by the caller.

This will be used by the ext3 data=guarded mode to do i_size updates in
a workqueue based end_io handler.  end_buffer_async_write is also
exported so it can be called to do the dirty work of managing page
writeback for the higher level end_io handler.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Acked-by: Theodore Tso <tytso@mit.edu>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 45 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 34 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index ff8bb1f2333..b3e5be7514f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -360,7 +360,7 @@ still_busy:
  * Completion handler for block_write_full_page() - pages which are unlocked
  * during I/O, and which have PageWriteback cleared upon I/O completion.
  */
-static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
+void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 {
 	char b[BDEVNAME_SIZE];
 	unsigned long flags;
@@ -438,11 +438,17 @@ static void mark_buffer_async_read(struct buffer_head *bh)
 	set_buffer_async_read(bh);
 }
 
-void mark_buffer_async_write(struct buffer_head *bh)
+void mark_buffer_async_write_endio(struct buffer_head *bh,
+				   bh_end_io_t *handler)
 {
-	bh->b_end_io = end_buffer_async_write;
+	bh->b_end_io = handler;
 	set_buffer_async_write(bh);
 }
+
+void mark_buffer_async_write(struct buffer_head *bh)
+{
+	mark_buffer_async_write_endio(bh, end_buffer_async_write);
+}
 EXPORT_SYMBOL(mark_buffer_async_write);
 
 
@@ -1615,7 +1621,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
  * unplugging the device queue.
  */
 static int __block_write_full_page(struct inode *inode, struct page *page,
-			get_block_t *get_block, struct writeback_control *wbc)
+			get_block_t *get_block, struct writeback_control *wbc,
+			bh_end_io_t *handler)
 {
 	int err;
 	sector_t block;
@@ -1700,7 +1707,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 			continue;
 		}
 		if (test_clear_buffer_dirty(bh)) {
-			mark_buffer_async_write(bh);
+			mark_buffer_async_write_endio(bh, handler);
 		} else {
 			unlock_buffer(bh);
 		}
@@ -1753,7 +1760,7 @@ recover:
 		if (buffer_mapped(bh) && buffer_dirty(bh) &&
 		    !buffer_delay(bh)) {
 			lock_buffer(bh);
-			mark_buffer_async_write(bh);
+			mark_buffer_async_write_endio(bh, handler);
 		} else {
 			/*
 			 * The buffer may have been set dirty during
@@ -2679,7 +2686,8 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
 out:
 	ret = mpage_writepage(page, get_block, wbc);
 	if (ret == -EAGAIN)
-		ret = __block_write_full_page(inode, page, get_block, wbc);
+		ret = __block_write_full_page(inode, page, get_block, wbc,
+					      end_buffer_async_write);
 	return ret;
 }
 EXPORT_SYMBOL(nobh_writepage);
@@ -2837,9 +2845,10 @@ out:
 
 /*
  * The generic ->writepage function for buffer-backed address_spaces
+ * this form passes in the end_io handler used to finish the IO.
  */
-int block_write_full_page(struct page *page, get_block_t *get_block,
-			struct writeback_control *wbc)
+int block_write_full_page_endio(struct page *page, get_block_t *get_block,
+			struct writeback_control *wbc, bh_end_io_t *handler)
 {
 	struct inode * const inode = page->mapping->host;
 	loff_t i_size = i_size_read(inode);
@@ -2848,7 +2857,8 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
 
 	/* Is the page fully inside i_size? */
 	if (page->index < end_index)
-		return __block_write_full_page(inode, page, get_block, wbc);
+		return __block_write_full_page(inode, page, get_block, wbc,
+					       handler);
 
 	/* Is the page fully outside i_size? (truncate in progress) */
 	offset = i_size & (PAGE_CACHE_SIZE-1);
@@ -2871,9 +2881,20 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
 	 * writes to that region are not written out to the file."
 	 */
 	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
-	return __block_write_full_page(inode, page, get_block, wbc);
+	return __block_write_full_page(inode, page, get_block, wbc, handler);
 }
 
+/*
+ * The generic ->writepage function for buffer-backed address_spaces
+ */
+int block_write_full_page(struct page *page, get_block_t *get_block,
+			struct writeback_control *wbc)
+{
+	return block_write_full_page_endio(page, get_block, wbc,
+					   end_buffer_async_write);
+}
+
+
 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
 			    get_block_t *get_block)
 {
@@ -3342,9 +3363,11 @@ EXPORT_SYMBOL(block_read_full_page);
 EXPORT_SYMBOL(block_sync_page);
 EXPORT_SYMBOL(block_truncate_page);
 EXPORT_SYMBOL(block_write_full_page);
+EXPORT_SYMBOL(block_write_full_page_endio);
 EXPORT_SYMBOL(cont_write_begin);
 EXPORT_SYMBOL(end_buffer_read_sync);
 EXPORT_SYMBOL(end_buffer_write_sync);
+EXPORT_SYMBOL(end_buffer_async_write);
 EXPORT_SYMBOL(file_fsync);
 EXPORT_SYMBOL(generic_block_bmap);
 EXPORT_SYMBOL(generic_cont_expand_simple);
-- 
cgit 


From d110271e1f4140a9fb06d968b1afe9ca56a6064e Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Wed, 25 Mar 2009 15:11:36 -0600
Subject: sysfs: don't use global workqueue in sysfs_schedule_callback()

A sysfs attribute using sysfs_schedule_callback() to commit suicide
may end up calling device_unregister(), which will eventually call
a driver's ->remove function.

Drivers may call flush_scheduled_work() in their shutdown routines,
in which case lockdep will complain with something like the following:

  =============================================
  [ INFO: possible recursive locking detected ]
  2.6.29-rc8-kk #1
  ---------------------------------------------
  events/4/56 is trying to acquire lock:
  (events){--..}, at: [<ffffffff80257fc0>] flush_workqueue+0x0/0xa0

  but task is already holding lock:
  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230

  other info that might help us debug this:
  3 locks held by events/4/56:
  #0:  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
  #1:  (&ss->work){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
  #2:  (pci_remove_rescan_mutex){--..}, at: [<ffffffff803c10d1>] remove_callback+0x21/0x40

  stack backtrace:
  Pid: 56, comm: events/4 Not tainted 2.6.29-rc8-kk #1
  Call Trace:
  [<ffffffff8026dfcd>] validate_chain+0xb7d/0x1260
  [<ffffffff8026eade>] __lock_acquire+0x42e/0xa40
  [<ffffffff8026f148>] lock_acquire+0x58/0x80
  [<ffffffff80257fc0>] ? flush_workqueue+0x0/0xa0
  [<ffffffff8025800d>] flush_workqueue+0x4d/0xa0
  [<ffffffff80257fc0>] ? flush_workqueue+0x0/0xa0
  [<ffffffff80258070>] flush_scheduled_work+0x10/0x20
  [<ffffffffa0144065>] e1000_remove+0x55/0xfe [e1000e]
  [<ffffffff8033ee30>] ? sysfs_schedule_callback_work+0x0/0x50
  [<ffffffff803bfeb2>] pci_device_remove+0x32/0x70
  [<ffffffff80441da9>] __device_release_driver+0x59/0x90
  [<ffffffff80441edb>] device_release_driver+0x2b/0x40
  [<ffffffff804419d6>] bus_remove_device+0xa6/0x120
  [<ffffffff8043e46b>] device_del+0x12b/0x190
  [<ffffffff8043e4f6>] device_unregister+0x26/0x70
  [<ffffffff803ba969>] pci_stop_dev+0x49/0x60
  [<ffffffff803baab0>] pci_remove_bus_device+0x40/0xc0
  [<ffffffff803c10d9>] remove_callback+0x29/0x40
  [<ffffffff8033ee4f>] sysfs_schedule_callback_work+0x1f/0x50
  [<ffffffff8025769a>] run_workqueue+0x15a/0x230
  [<ffffffff80257648>] ? run_workqueue+0x108/0x230
  [<ffffffff8025846f>] worker_thread+0x9f/0x100
  [<ffffffff8025bce0>] ? autoremove_wake_function+0x0/0x40
  [<ffffffff802583d0>] ? worker_thread+0x0/0x100
  [<ffffffff8025b89d>] kthread+0x4d/0x80
  [<ffffffff8020d4ba>] child_rip+0xa/0x20
  [<ffffffff8020cebc>] ? restore_args+0x0/0x30
  [<ffffffff8025b850>] ? kthread+0x0/0x80
  [<ffffffff8020d4b0>] ? child_rip+0x0/0x20

Although we know that the device_unregister path will never acquire
a lock that a driver might try to acquire in its ->remove, in general
we should never attempt to flush a workqueue from within the same
workqueue, and lockdep rightly complains.

So as long as sysfs attributes cannot commit suicide directly and we
are stuck with this callback mechanism, put the sysfs callbacks on
their own workqueue instead of the global one.

This has the side benefit that if a suicidal sysfs attribute kicks
off a long chain of ->remove callbacks, we no longer induce a long
delay on the global queue.

This also fixes a missing module_put in the error path introduced
by sysfs-only-allow-one-scheduled-removal-callback-per-kobj.patch.

We never destroy the workqueue, but I'm not sure that's a
problem.

Reported-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Tested-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/file.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 289c43a4726..979e9379fb5 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -667,6 +667,7 @@ struct sysfs_schedule_callback_struct {
 	struct work_struct	work;
 };
 
+static struct workqueue_struct *sysfs_workqueue;
 static DEFINE_MUTEX(sysfs_workq_mutex);
 static LIST_HEAD(sysfs_workq);
 static void sysfs_schedule_callback_work(struct work_struct *work)
@@ -715,11 +716,20 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
 	mutex_lock(&sysfs_workq_mutex);
 	list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list)
 		if (ss->kobj == kobj) {
+			module_put(owner);
 			mutex_unlock(&sysfs_workq_mutex);
 			return -EAGAIN;
 		}
 	mutex_unlock(&sysfs_workq_mutex);
 
+	if (sysfs_workqueue == NULL) {
+		sysfs_workqueue = create_workqueue("sysfsd");
+		if (sysfs_workqueue == NULL) {
+			module_put(owner);
+			return -ENOMEM;
+		}
+	}
+
 	ss = kmalloc(sizeof(*ss), GFP_KERNEL);
 	if (!ss) {
 		module_put(owner);
@@ -735,7 +745,7 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
 	mutex_lock(&sysfs_workq_mutex);
 	list_add_tail(&ss->workq_list, &sysfs_workq);
 	mutex_unlock(&sysfs_workq_mutex);
-	schedule_work(&ss->work);
+	queue_work(sysfs_workqueue, &ss->work);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
-- 
cgit 


From 1af3557abdef34ee036a6de4cb79e24468544b8d Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Thu, 9 Apr 2009 13:53:22 +0900
Subject: sysfs: sysfs poll keep the poll rule of regular file.

Currently, the following test program never finishes.

% ruby -e '
Thread.new { sleep }
File.read("/sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies")
'

strace exposes the reason.

...
open("/sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies", O_RDONLY|O_LARGEFILE) = 3
ioctl(3, SNDCTL_TMR_TIMEBASE or TCGETS, 0xbf9fa6b8) = -1 ENOTTY (Inappropriate ioctl for device)
fstat64(3, {st_mode=S_IFREG|0444, st_size=4096, ...}) = 0
_llseek(3, 0, [0], SEEK_CUR)            = 0
select(4, [3], NULL, NULL, NULL)        = 1 (in [3])
read(3, "1400000 1300000 1200000 1100000 1"..., 4096) = 62
select(4, [3], NULL, NULL, NULL


This happens because the Ruby (the scripting language) VM assumes that a
select system call on a regular file does not block.  It assumes so
because SUSv3 says "Regular files shall always poll TRUE for reading and
writing" (see
http://www.opengroup.org/onlinepubs/009695399/functions/poll.html).
This seems to be a valid assumption.

But sysfs_poll() does not follow this rule, although sysfs files can
always be read and written.

This patch restores proper poll behavior to sysfs.  Applications that
poll /sys/block/md*/md/sync_action, and other applications that are
sensitive to sysfs updates, can still use POLLERR and POLLPRI.

Cc: Neil Brown <neilb@suse.de>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 979e9379fb5..b1606e07b7a 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -446,11 +446,11 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 	if (buffer->event != atomic_read(&od->event))
 		goto trigger;
 
-	return 0;
+	return DEFAULT_POLLMASK;
 
  trigger:
 	buffer->needs_read_fill = 1;
-	return POLLERR|POLLPRI;
+	return DEFAULT_POLLMASK|POLLERR|POLLPRI;
 }
 
 void sysfs_notify_dirent(struct sysfs_dirent *sd)
-- 
cgit 


From 31b07093c44a7a442394d44423e21d783f5523b8 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Thu, 9 Apr 2009 13:57:59 +0900
Subject: proc: mounts_poll() make consistent to mdstat_poll

In the recent sysfs_poll discussion, Neil Brown pointed out that
/proc/mounts should also be fixed.

SUSv3 says "Regular files shall always poll TRUE for reading and
writing".  see
http://www.opengroup.org/onlinepubs/009695399/functions/poll.html

So mounts_poll()'s default should be "POLLIN | POLLRDNORM", which means
the file is always readable.

In addition, the event trigger should use "POLLERR | POLLPRI" instead of
POLLERR alone.  This makes it consistent with mdstat_poll() and
sysfs_poll(), and select(2) can handle POLLPRI easily.


Reported-by: Neil Brown <neilb@suse.de>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Miklos Szeredi <mszeredi@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/proc/base.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index f71559784bf..aa763ab0077 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -648,14 +648,14 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
 {
 	struct proc_mounts *p = file->private_data;
 	struct mnt_namespace *ns = p->ns;
-	unsigned res = 0;
+	unsigned res = POLLIN | POLLRDNORM;
 
 	poll_wait(file, &ns->poll, wait);
 
 	spin_lock(&vfsmount_lock);
 	if (p->event != ns->event) {
 		p->event = ns->event;
-		res = POLLERR;
+		res |= POLLERR | POLLPRI;
 	}
 	spin_unlock(&vfsmount_lock);
 
-- 
cgit 


From b80901bbf599553f483b9509f2dce416b938aae8 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 16 Apr 2009 19:09:55 -0700
Subject: splice: fix new kernel-doc warnings

splice: fix kernel-doc warnings

  Warning(fs/splice.c:617): bad line:
  Warning(fs/splice.c:722): No description found for parameter 'sd'
  Warning(fs/splice.c:722): Excess function parameter 'pipe' description in 'splice_from_pipe_begin'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/splice.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 5384a90665d..666953d59a3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -614,7 +614,6 @@ static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
  * @actor:	handler that splices the data
  *
  * Description:
-
  *    This function loops over the pipe and calls @actor to do the
  *    actual moving of a single struct pipe_buffer to the desired
  *    destination.  It returns when there's no more buffers left in
@@ -711,7 +710,7 @@ EXPORT_SYMBOL(splice_from_pipe_next);
 
 /**
  * splice_from_pipe_begin - start splicing from pipe
- * @pipe:	pipe to splice from
+ * @sd:		information about the splice operation
  *
  * Description:
  *    This function should be called before a loop containing
-- 
cgit 


From d29a2e943867bfa48f72ee6e99723a1b29fe6f7e Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Fri, 17 Apr 2009 12:22:35 +0100
Subject: vfat: Note the NLS requirement

Close bug #4754. Stop people getting into a situation where they can't
get their FAT filesystems to mount as they expect.

Signed-off-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/Kconfig | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index d0a69ff2537..182f9ffe2b5 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -95,3 +95,6 @@ config FAT_DEFAULT_IOCHARSET
 	  Note that "utf8" is not recommended for FAT filesystems.
 	  If unsure, you shouldn't set "utf8" here.
 	  See <file:Documentation/filesystems/vfat.txt> for more information.
+
+	  Enable any character sets you need in File Systems/Native Language
+	  Support.
-- 
cgit 


From 6566abdbd0566fc1b5950c9f87ef57c7443d6fa8 Mon Sep 17 00:00:00 2001
From: Matt Kraai <kraai@ftbfs.org>
Date: Fri, 17 Apr 2009 12:56:38 +0100
Subject: AFS: Guard afs_file_readpage_read_complete() definition with
 CONFIG_AFS_FSCACHE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If CONFIG_AFS_FSCACHE is not defined, the following warning is displayed when
fs/afs/file.c is compiled:

 fs/afs/file.c:111: warning: ‘afs_file_readpage_read_complete’ defined but not used

This occurs because all calls to this function are guarded by
CONFIG_AFS_FSCACHE.  Thus, guard its definition as well.

Signed-off-by: Matt Kraai <kraai@ftbfs.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/afs/file.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/afs/file.c b/fs/afs/file.c
index 7a1d942ef68..0149dab365e 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -102,6 +102,7 @@ int afs_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+#ifdef CONFIG_AFS_FSCACHE
 /*
  * deal with notification that a page was read from the cache
  */
@@ -117,6 +118,7 @@ static void afs_file_readpage_read_complete(struct page *page,
 		SetPageUptodate(page);
 	unlock_page(page);
 }
+#endif
 
 /*
  * AFS read page from file, directory or symlink
-- 
cgit