summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorAnton Arapov <anton@redhat.com>2012-10-29 11:32:50 +0100
committerAnton Arapov <anton@redhat.com>2012-10-29 11:32:50 +0100
commit7f6dc119ae2a77a23a3c07ebefba6fd7df65a74c (patch)
tree109fdbb7cff60b946bdadae15f5de052d96abae8 /fs
parent4f87faf0a40754afe619316ef1e5980a1811d883 (diff)
downloadkernel-uprobes-f18.zip
kernel-uprobes-f18.tar.gz
kernel-uprobes-f18.tar.xz
fedora kernel: 1c1fad82422e58b20bf06bb371c38154b4b965dbv3.6.3-4f18
Signed-off-by: Anton Arapov <anton@redhat.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/autofs4/root.c6
-rw-r--r--fs/binfmt_elf.c19
-rw-r--r--fs/btrfs/qgroup.c8
-rw-r--r--fs/buffer.c13
-rw-r--r--fs/ceph/export.c18
-rw-r--r--fs/cifs/cifs_unicode.c24
-rw-r--r--fs/cifs/connect.c9
-rw-r--r--fs/cifs/dir.c7
-rw-r--r--fs/dcache.c12
-rw-r--r--fs/debugfs/file.c76
-rw-r--r--fs/ext4/ext4.h39
-rw-r--r--fs/ext4/extents.c158
-rw-r--r--fs/ext4/file.c6
-rw-r--r--fs/ext4/fsync.c83
-rw-r--r--fs/ext4/indirect.c18
-rw-r--r--fs/ext4/inode.c76
-rw-r--r--fs/ext4/move_extent.c182
-rw-r--r--fs/ext4/namei.c2
-rw-r--r--fs/ext4/page-io.c176
-rw-r--r--fs/ext4/resize.c26
-rw-r--r--fs/ext4/super.c5
-rw-r--r--fs/fs-writeback.c1
-rw-r--r--fs/gfs2/export.c4
-rw-r--r--fs/isofs/export.c2
-rw-r--r--fs/jbd/commit.c45
-rw-r--r--fs/jbd/transaction.c64
-rw-r--r--fs/jbd2/journal.c5
-rw-r--r--fs/jffs2/super.c4
-rw-r--r--fs/jffs2/wbuf.c8
-rw-r--r--fs/lockd/mon.c86
-rw-r--r--fs/lockd/netns.h4
-rw-r--r--fs/lockd/svc.c1
-rw-r--r--fs/lockd/svclock.c3
-rw-r--r--fs/namei.c2
-rw-r--r--fs/namespace.c10
-rw-r--r--fs/nfs/blocklayout/blocklayout.c177
-rw-r--r--fs/nfs/blocklayout/blocklayout.h1
-rw-r--r--fs/nfs/client.c1
-rw-r--r--fs/nfs/nfs4proc.c9
-rw-r--r--fs/nfsd/nfs4idmap.c2
-rw-r--r--fs/nfsd/nfs4state.c19
-rw-r--r--fs/nilfs2/file.c1
-rw-r--r--fs/proc/page.c8
-rw-r--r--fs/proc/proc_sysctl.c5
-rw-r--r--fs/reiserfs/inode.c6
-rw-r--r--fs/xfs/xfs_buf.c5
-rw-r--r--fs/xfs/xfs_buf.h41
-rw-r--r--fs/xfs/xfs_export.c3
-rw-r--r--fs/xfs/xfs_super.c1
49 files changed, 931 insertions, 550 deletions
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e7396cf..91b1165 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -392,10 +392,12 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
ino->flags |= AUTOFS_INF_PENDING;
spin_unlock(&sbi->fs_lock);
status = autofs4_mount_wait(dentry);
- if (status)
- return ERR_PTR(status);
spin_lock(&sbi->fs_lock);
ino->flags &= ~AUTOFS_INF_PENDING;
+ if (status) {
+ spin_unlock(&sbi->fs_lock);
+ return ERR_PTR(status);
+ }
}
done:
if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 1b52956..0225fdd 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1696,30 +1696,19 @@ static int elf_note_info_init(struct elf_note_info *info)
return 0;
info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
if (!info->psinfo)
- goto notes_free;
+ return 0;
info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
if (!info->prstatus)
- goto psinfo_free;
+ return 0;
info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
if (!info->fpu)
- goto prstatus_free;
+ return 0;
#ifdef ELF_CORE_COPY_XFPREGS
info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
if (!info->xfpu)
- goto fpu_free;
+ return 0;
#endif
return 1;
-#ifdef ELF_CORE_COPY_XFPREGS
- fpu_free:
- kfree(info->fpu);
-#endif
- prstatus_free:
- kfree(info->prstatus);
- psinfo_free:
- kfree(info->psinfo);
- notes_free:
- kfree(info->notes);
- return 0;
}
static int fill_note_info(struct elfhdr *elf, int phdrs,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 38b42e7..b650155 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1371,10 +1371,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
if (srcid) {
srcgroup = find_qgroup_rb(fs_info, srcid);
- if (!srcgroup) {
- ret = -EINVAL;
+ if (!srcgroup)
goto unlock;
- }
dstgroup->rfer = srcgroup->rfer - level_size;
dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size;
srcgroup->excl = level_size;
@@ -1383,10 +1381,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
qgroup_dirty(fs_info, srcgroup);
}
- if (!inherit) {
- ret = -EINVAL;
+ if (!inherit)
goto unlock;
- }
i_qgroups = (u64 *)(inherit + 1);
for (i = 0; i < inherit->num_qgroups; ++i) {
diff --git a/fs/buffer.c b/fs/buffer.c
index 58e2e7b..b5f0442 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2312,12 +2312,6 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
loff_t size;
int ret;
- /*
- * Update file times before taking page lock. We may end up failing the
- * fault so this update may be superfluous but who really cares...
- */
- file_update_time(vma->vm_file);
-
lock_page(page);
size = i_size_read(inode);
if ((page->mapping != inode->i_mapping) ||
@@ -2355,6 +2349,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
sb_start_pagefault(sb);
+
+ /*
+ * Update file times before taking page lock. We may end up failing the
+ * fault so this update may be superfluous but who really cares...
+ */
+ file_update_time(vma->vm_file);
+
ret = __block_page_mkwrite(vma, vmf, get_block);
sb_end_pagefault(sb);
return block_page_mkwrite_return(ret);
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 8e1b60e..02ce909 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -99,7 +99,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
* FIXME: we should try harder by querying the mds for the ino.
*/
static struct dentry *__fh_to_dentry(struct super_block *sb,
- struct ceph_nfs_fh *fh)
+ struct ceph_nfs_fh *fh, int fh_len)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
struct inode *inode;
@@ -107,6 +107,9 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
struct ceph_vino vino;
int err;
+ if (fh_len < sizeof(*fh) / 4)
+ return ERR_PTR(-ESTALE);
+
dout("__fh_to_dentry %llx\n", fh->ino);
vino.ino = fh->ino;
vino.snap = CEPH_NOSNAP;
@@ -150,7 +153,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
* convert connectable fh to dentry
*/
static struct dentry *__cfh_to_dentry(struct super_block *sb,
- struct ceph_nfs_confh *cfh)
+ struct ceph_nfs_confh *cfh, int fh_len)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
struct inode *inode;
@@ -158,6 +161,9 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
struct ceph_vino vino;
int err;
+ if (fh_len < sizeof(*cfh) / 4)
+ return ERR_PTR(-ESTALE);
+
dout("__cfh_to_dentry %llx (%llx/%x)\n",
cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
@@ -207,9 +213,11 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type)
{
if (fh_type == 1)
- return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
+ return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw,
+ fh_len);
else
- return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
+ return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw,
+ fh_len);
}
/*
@@ -230,6 +238,8 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
if (fh_type == 1)
return ERR_PTR(-ESTALE);
+ if (fh_len < sizeof(*cfh) / 4)
+ return ERR_PTR(-ESTALE);
pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
cfh->parent_name_hash);
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 7dab9c0..71d5d0a 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -203,6 +203,27 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
int i;
wchar_t wchar_to; /* needed to quiet sparse */
+ /* special case for utf8 to handle no plane0 chars */
+ if (!strcmp(codepage->charset, "utf8")) {
+ /*
+ * convert utf8 -> utf16, we assume we have enough space
+ * as caller should have assumed conversion does not overflow
+ * in destination len is length in wchar_t units (16bits)
+ */
+ i = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN,
+ (wchar_t *) to, len);
+
+ /* if success terminate and exit */
+ if (i >= 0)
+ goto success;
+ /*
+ * if fails fall back to UCS encoding as this
+ * function should not return negative values
+ * currently can fail only if source contains
+ * invalid encoded characters
+ */
+ }
+
for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
charlen = codepage->char2uni(from, len, &wchar_to);
if (charlen < 1) {
@@ -215,6 +236,7 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
put_unaligned_le16(wchar_to, &to[i]);
}
+success:
put_unaligned_le16(0, &to[i]);
return i;
}
@@ -328,7 +350,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
}
ctoUTF16_out:
- return i;
+ return j;
}
#ifdef CONFIG_CIFS_SMB2
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 6df6fa1..b39bb4a 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -67,6 +67,7 @@ enum {
/* Mount options that take no arguments */
Opt_user_xattr, Opt_nouser_xattr,
Opt_forceuid, Opt_noforceuid,
+ Opt_forcegid, Opt_noforcegid,
Opt_noblocksend, Opt_noautotune,
Opt_hard, Opt_soft, Opt_perm, Opt_noperm,
Opt_mapchars, Opt_nomapchars, Opt_sfu,
@@ -118,6 +119,8 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_nouser_xattr, "nouser_xattr" },
{ Opt_forceuid, "forceuid" },
{ Opt_noforceuid, "noforceuid" },
+ { Opt_forcegid, "forcegid" },
+ { Opt_noforcegid, "noforcegid" },
{ Opt_noblocksend, "noblocksend" },
{ Opt_noautotune, "noautotune" },
{ Opt_hard, "hard" },
@@ -1190,6 +1193,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
case Opt_noforceuid:
override_uid = 0;
break;
+ case Opt_forcegid:
+ override_gid = 1;
+ break;
+ case Opt_noforcegid:
+ override_gid = 0;
+ break;
case Opt_noblocksend:
vol->noblocksnd = 1;
break;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 781025b..cfd0e07 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -392,7 +392,12 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
* in network traffic in the other paths.
*/
if (!(oflags & O_CREAT)) {
- struct dentry *res = cifs_lookup(inode, direntry, 0);
+ struct dentry *res;
+
+ if (!direntry->d_inode)
+ return -ENOENT;
+
+ res = cifs_lookup(inode, direntry, 0);
if (IS_ERR(res))
return PTR_ERR(res);
diff --git a/fs/dcache.c b/fs/dcache.c
index 672241c..69fc57d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -389,7 +389,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
* Inform try_to_ascend() that we are no longer attached to the
* dentry tree
*/
- dentry->d_flags |= DCACHE_DISCONNECTED;
+ dentry->d_flags |= DCACHE_DENTRY_KILLED;
if (parent)
spin_unlock(&parent->d_lock);
dentry_iput(dentry);
@@ -1048,7 +1048,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq
* or deletion
*/
if (new != old->d_parent ||
- (old->d_flags & DCACHE_DISCONNECTED) ||
+ (old->d_flags & DCACHE_DENTRY_KILLED) ||
(!locked && read_seqretry(&rename_lock, seq))) {
spin_unlock(&new->d_lock);
new = NULL;
@@ -1134,6 +1134,8 @@ positive:
return 1;
rename_retry:
+ if (locked)
+ goto again;
locked = 1;
write_seqlock(&rename_lock);
goto again;
@@ -1141,7 +1143,7 @@ rename_retry:
EXPORT_SYMBOL(have_submounts);
/*
- * Search the dentry child list for the specified parent,
+ * Search the dentry child list of the specified parent,
* and move any unused dentries to the end of the unused
* list for prune_dcache(). We descend to the next level
* whenever the d_subdirs list is non-empty and continue
@@ -1236,6 +1238,8 @@ out:
rename_retry:
if (found)
return found;
+ if (locked)
+ goto again;
locked = 1;
write_seqlock(&rename_lock);
goto again;
@@ -3037,6 +3041,8 @@ resume:
return;
rename_retry:
+ if (locked)
+ goto again;
locked = 1;
write_seqlock(&rename_lock);
goto again;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 2340f69..c5ca6ae 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -526,73 +526,51 @@ struct array_data {
u32 elements;
};
-static int u32_array_open(struct inode *inode, struct file *file)
-{
- file->private_data = NULL;
- return nonseekable_open(inode, file);
-}
-
-static size_t format_array(char *buf, size_t bufsize, const char *fmt,
- u32 *array, u32 array_size)
+static size_t u32_format_array(char *buf, size_t bufsize,
+ u32 *array, int array_size)
{
size_t ret = 0;
- u32 i;
- for (i = 0; i < array_size; i++) {
+ while (--array_size >= 0) {
size_t len;
+ char term = array_size ? ' ' : '\n';
- len = snprintf(buf, bufsize, fmt, array[i]);
- len++; /* ' ' or '\n' */
+ len = snprintf(buf, bufsize, "%u%c", *array++, term);
ret += len;
- if (buf) {
- buf += len;
- bufsize -= len;
- buf[-1] = (i == array_size-1) ? '\n' : ' ';
- }
+ buf += len;
+ bufsize -= len;
}
-
- ret++; /* \0 */
- if (buf)
- *buf = '\0';
-
return ret;
}
-static char *format_array_alloc(const char *fmt, u32 *array,
- u32 array_size)
+static int u32_array_open(struct inode *inode, struct file *file)
{
- size_t len = format_array(NULL, 0, fmt, array, array_size);
- char *ret;
-
- ret = kmalloc(len, GFP_KERNEL);
- if (ret == NULL)
- return NULL;
+ struct array_data *data = inode->i_private;
+ int size, elements = data->elements;
+ char *buf;
+
+ /*
+ * Max size:
+ * - 10 digits + ' '/'\n' = 11 bytes per number
+ * - terminating NUL character
+ */
+ size = elements*11;
+ buf = kmalloc(size+1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ buf[size] = 0;
+
+ file->private_data = buf;
+ u32_format_array(buf, size, data->array, data->elements);
- format_array(ret, len, fmt, array, array_size);
- return ret;
+ return nonseekable_open(inode, file);
}
static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
loff_t *ppos)
{
- struct inode *inode = file->f_path.dentry->d_inode;
- struct array_data *data = inode->i_private;
- size_t size;
-
- if (*ppos == 0) {
- if (file->private_data) {
- kfree(file->private_data);
- file->private_data = NULL;
- }
-
- file->private_data = format_array_alloc("%u", data->array,
- data->elements);
- }
-
- size = 0;
- if (file->private_data)
- size = strlen(file->private_data);
+ size_t size = strlen(file->private_data);
return simple_read_from_buffer(buf, len, ppos,
file->private_data, size);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c3411d4..7f13292 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -186,7 +186,6 @@ struct mpage_da_data {
#define EXT4_IO_END_ERROR 0x0002
#define EXT4_IO_END_QUEUED 0x0004
#define EXT4_IO_END_DIRECT 0x0008
-#define EXT4_IO_END_IN_FSYNC 0x0010
struct ext4_io_page {
struct page *p_page;
@@ -912,9 +911,7 @@ struct ext4_inode_info {
struct list_head i_completed_io_list;
spinlock_t i_completed_io_lock;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
- /* current io_end structure for async DIO write*/
- ext4_io_end_t *cur_aio_dio;
- atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
+ atomic_t i_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -1328,10 +1325,20 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
io_end->flag |= EXT4_IO_END_UNWRITTEN;
- atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+ atomic_inc(&EXT4_I(inode)->i_unwritten);
}
}
+static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
+{
+ return inode->i_private;
+}
+
+static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
+{
+ inode->i_private = io;
+}
+
/*
* Inode dynamic state flags
*/
@@ -1345,6 +1352,8 @@ enum {
EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
EXT4_STATE_NEWENTRY, /* File just added to dir */
EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
+ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
+ nolocking */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1932,7 +1941,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
/* fsync.c */
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_completed_IO(struct inode *);
+extern int ext4_flush_unwritten_io(struct inode *);
/* hash.c */
extern int ext4fs_dirhash(const char *name, int len, struct
@@ -2352,6 +2361,7 @@ extern const struct file_operations ext4_dir_operations;
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
+extern void ext4_unwritten_wait(struct inode *inode);
/* namei.c */
extern const struct inode_operations ext4_dir_inode_operations;
@@ -2400,11 +2410,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* page-io.c */
extern int __init ext4_init_pageio(void);
+extern void ext4_add_complete_io(ext4_io_end_t *io_end);
extern void ext4_exit_pageio(void);
extern void ext4_ioend_wait(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern int ext4_end_io_nolock(ext4_io_end_t *io);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
@@ -2452,6 +2462,21 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
+/*
+ * Disable DIO read nolock optimization, so new dioreaders will be forced
+ * to grab i_mutex
+ */
+static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
+{
+ ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+ smp_mb();
+}
+static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
+{
+ smp_mb();
+ ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+}
+
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
/* For ioend & aio unwritten conversion wait queues */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index aabbb3f..ee0d61c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -52,6 +52,9 @@
#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
+#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */
+#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */
+
static __le32 ext4_extent_block_csum(struct inode *inode,
struct ext4_extent_header *eh)
{
@@ -2572,7 +2575,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
struct ext4_ext_path *path = NULL;
ext4_fsblk_t partial_cluster = 0;
handle_t *handle;
- int i = 0, err;
+ int i = 0, err = 0;
ext_debug("truncate since %u to %u\n", start, end);
@@ -2604,12 +2607,16 @@ again:
return PTR_ERR(path);
}
depth = ext_depth(inode);
+ /* Leaf not may not exist only if inode has no blocks at all */
ex = path[depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
- path = NULL;
- goto cont;
+ if (depth) {
+ EXT4_ERROR_INODE(inode,
+ "path[%d].p_hdr == NULL",
+ depth);
+ err = -EIO;
+ }
+ goto out;
}
ee_block = le32_to_cpu(ex->ee_block);
@@ -2641,8 +2648,6 @@ again:
goto out;
}
}
-cont:
-
/*
* We start scanning from right side, freeing all the blocks
* after i_size and walking into the tree depth-wise.
@@ -2895,6 +2900,9 @@ static int ext4_split_extent_at(handle_t *handle,
unsigned int ee_len, depth;
int err = 0;
+ BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
+ (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
+
ext_debug("ext4_split_extents_at: inode %lu, logical"
"block %llu\n", inode->i_ino, (unsigned long long)split);
@@ -2953,7 +2961,14 @@ static int ext4_split_extent_at(handle_t *handle,
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
- err = ext4_ext_zeroout(inode, &orig_ex);
+ if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
+ if (split_flag & EXT4_EXT_DATA_VALID1)
+ err = ext4_ext_zeroout(inode, ex2);
+ else
+ err = ext4_ext_zeroout(inode, ex);
+ } else
+ err = ext4_ext_zeroout(inode, &orig_ex);
+
if (err)
goto fix_extent_len;
/* update the extent length and mark as initialized */
@@ -3006,12 +3021,13 @@ static int ext4_split_extent(handle_t *handle,
uninitialized = ext4_ext_is_uninitialized(ex);
if (map->m_lblk + map->m_len < ee_block + ee_len) {
- split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
- EXT4_EXT_MAY_ZEROOUT : 0;
+ split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
if (uninitialized)
split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
EXT4_EXT_MARK_UNINIT2;
+ if (split_flag & EXT4_EXT_DATA_VALID2)
+ split_flag1 |= EXT4_EXT_DATA_VALID1;
err = ext4_split_extent_at(handle, inode, path,
map->m_lblk + map->m_len, split_flag1, flags1);
if (err)
@@ -3024,8 +3040,8 @@ static int ext4_split_extent(handle_t *handle,
return PTR_ERR(path);
if (map->m_lblk >= ee_block) {
- split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
- EXT4_EXT_MAY_ZEROOUT : 0;
+ split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT |
+ EXT4_EXT_DATA_VALID2);
if (uninitialized)
split_flag1 |= EXT4_EXT_MARK_UNINIT1;
if (split_flag & EXT4_EXT_MARK_UNINIT2)
@@ -3303,26 +3319,47 @@ static int ext4_split_unwritten_extents(handle_t *handle,
split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
split_flag |= EXT4_EXT_MARK_UNINIT2;
-
+ if (flags & EXT4_GET_BLOCKS_CONVERT)
+ split_flag |= EXT4_EXT_DATA_VALID2;
flags |= EXT4_GET_BLOCKS_PRE_IO;
return ext4_split_extent(handle, inode, path, map, split_flag, flags);
}
static int ext4_convert_unwritten_extents_endio(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path *path)
+ struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path)
{
struct ext4_extent *ex;
+ ext4_lblk_t ee_block;
+ unsigned int ee_len;
int depth;
int err = 0;
depth = ext_depth(inode);
ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
"block %llu, max_blocks %u\n", inode->i_ino,
- (unsigned long long)le32_to_cpu(ex->ee_block),
- ext4_ext_get_actual_len(ex));
+ (unsigned long long)ee_block, ee_len);
+
+ /* If extent is larger than requested then split is required */
+ if (ee_block != map->m_lblk || ee_len > map->m_len) {
+ err = ext4_split_unwritten_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT);
+ if (err < 0)
+ goto out;
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out;
+ }
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ }
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -3600,7 +3637,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
{
int ret = 0;
int err = 0;
- ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ ext4_io_end_t *io = ext4_inode_aio(inode);
ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
"block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -3615,6 +3652,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
ret = ext4_split_unwritten_extents(handle, inode, map,
path, flags);
+ if (ret <= 0)
+ goto out;
/*
* Flag the inode(non aio case) or end_io struct (aio case)
* that this IO needs to conversion to written when IO is
@@ -3630,7 +3669,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
}
/* IO end_io complete, convert the filled extent to written */
if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
- ret = ext4_convert_unwritten_extents_endio(handle, inode,
+ ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
path);
if (ret >= 0) {
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -3858,8 +3897,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
- ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ ext4_io_end_t *io = ext4_inode_aio(inode);
ext4_lblk_t cluster_offset;
+ int set_unwritten = 0;
ext_debug("blocks %u/%u requested for inode %lu\n",
map->m_lblk, map->m_len, inode->i_ino);
@@ -4082,13 +4122,8 @@ got_allocated_blocks:
* For non asycn direct IO case, flag the inode state
* that we need to perform conversion when IO is done.
*/
- if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
- if (io)
- ext4_set_io_unwritten_flag(inode, io);
- else
- ext4_set_inode_state(inode,
- EXT4_STATE_DIO_UNWRITTEN);
- }
+ if ((flags & EXT4_GET_BLOCKS_PRE_IO))
+ set_unwritten = 1;
if (ext4_should_dioread_nolock(inode))
map->m_flags |= EXT4_MAP_UNINIT;
}
@@ -4100,6 +4135,15 @@ got_allocated_blocks:
if (!err)
err = ext4_ext_insert_extent(handle, inode, path,
&newex, flags);
+
+ if (!err && set_unwritten) {
+ if (io)
+ ext4_set_io_unwritten_flag(inode, io);
+ else
+ ext4_set_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN);
+ }
+
if (err && free_on_err) {
int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
@@ -4241,7 +4285,7 @@ void ext4_ext_truncate(struct inode *inode)
* finish any pending end_io work so we won't run the risk of
* converting any truncated blocks to initialized later
*/
- ext4_flush_completed_IO(inode);
+ ext4_flush_unwritten_io(inode);
/*
* probably first extent we're gonna free will be last in block
@@ -4401,6 +4445,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
*/
if (len <= EXT_UNINIT_MAX_LEN << blkbits)
flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
+
+ /* Prevent race condition between unwritten */
+ ext4_flush_unwritten_io(inode);
retry:
while (ret >= 0 && ret < max_blocks) {
map.m_lblk = map.m_lblk + ret;
@@ -4769,9 +4816,32 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
loff_t first_page_offset, last_page_offset;
int credits, err = 0;
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ err = filemap_write_and_wait_range(mapping,
+ offset, offset + length - 1);
+
+ if (err)
+ return err;
+ }
+
+ mutex_lock(&inode->i_mutex);
+ /* It's not possible punch hole on append only file */
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+ err = -EPERM;
+ goto out_mutex;
+ }
+ if (IS_SWAPFILE(inode)) {
+ err = -ETXTBSY;
+ goto out_mutex;
+ }
+
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
- return 0;
+ goto out_mutex;
/*
* If the hole extends beyond i_size, set the hole
@@ -4789,31 +4859,25 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
first_page_offset = first_page << PAGE_CACHE_SHIFT;
last_page_offset = last_page << PAGE_CACHE_SHIFT;
- /*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- err = filemap_write_and_wait_range(mapping,
- offset, offset + length - 1);
-
- if (err)
- return err;
- }
-
/* Now release the pages */
if (last_page_offset > first_page_offset) {
truncate_pagecache_range(inode, first_page_offset,
last_page_offset - 1);
}
- /* finish any pending end_io work */
- ext4_flush_completed_IO(inode);
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ err = ext4_flush_unwritten_io(inode);
+ if (err)
+ goto out_dio;
+ inode_dio_wait(inode);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out_dio;
+ }
err = ext4_orphan_add(handle, inode);
if (err)
@@ -4907,6 +4971,10 @@ out:
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
return err;
}
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3b0e3bd..ca6f07a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,11 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
return 0;
}
-static void ext4_aiodio_wait(struct inode *inode)
+void ext4_unwritten_wait(struct inode *inode)
{
wait_queue_head_t *wq = ext4_ioend_wq(inode);
- wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
+ wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
}
/*
@@ -116,7 +116,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
"performance will be poor.",
inode->i_ino, current->comm);
mutex_lock(ext4_aio_mutex(inode));
- ext4_aiodio_wait(inode);
+ ext4_unwritten_wait(inode);
}
BUG_ON(iocb->ki_pos != pos);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 2a1dcea..76051c6 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,87 +34,6 @@
#include <trace/events/ext4.h>
-static void dump_completed_IO(struct inode * inode)
-{
-#ifdef EXT4FS_DEBUG
- struct list_head *cur, *before, *after;
- ext4_io_end_t *io, *io0, *io1;
- unsigned long flags;
-
- if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
- ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
- return;
- }
-
- ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
- cur = &io->list;
- before = cur->prev;
- io0 = container_of(before, ext4_io_end_t, list);
- after = cur->next;
- io1 = container_of(after, ext4_io_end_t, list);
-
- ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
- io, inode->i_ino, io0, io1);
- }
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-#endif
-}
-
-/*
- * This function is called from ext4_sync_file().
- *
- * When IO is completed, the work to convert unwritten extents to
- * written is queued on workqueue but may not get immediately
- * scheduled. When fsync is called, we need to ensure the
- * conversion is complete before fsync returns.
- * The inode keeps track of a list of pending/completed IO that
- * might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents for completed IO
- * to written.
- * The function return the number of pending IOs on success.
- */
-int ext4_flush_completed_IO(struct inode *inode)
-{
- ext4_io_end_t *io;
- struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long flags;
- int ret = 0;
- int ret2 = 0;
-
- dump_completed_IO(inode);
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- while (!list_empty(&ei->i_completed_io_list)){
- io = list_entry(ei->i_completed_io_list.next,
- ext4_io_end_t, list);
- list_del_init(&io->list);
- io->flag |= EXT4_IO_END_IN_FSYNC;
- /*
- * Calling ext4_end_io_nolock() to convert completed
- * IO to written.
- *
- * When ext4_sync_file() is called, run_queue() may already
- * about to flush the work corresponding to this io structure.
- * It will be upset if it founds the io structure related
- * to the work-to-be schedule is freed.
- *
- * Thus we need to keep the io structure still valid here after
- * conversion finished. The io structure has a flag to
- * avoid double converting from both fsync and background work
- * queue work.
- */
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- ret = ext4_end_io_nolock(io);
- if (ret < 0)
- ret2 = ret;
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- io->flag &= ~EXT4_IO_END_IN_FSYNC;
- }
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- return (ret2 < 0) ? ret2 : 0;
-}
-
/*
* If we're not journaling and this is a just-created file, we have to
* sync our parent directory (if it was freshly created) since
@@ -219,7 +138,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (inode->i_sb->s_flags & MS_RDONLY)
goto out;
- ret = ext4_flush_completed_IO(inode);
+ ret = ext4_flush_unwritten_io(inode);
if (ret < 0)
goto out;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 830e1b2..792e388 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -807,16 +807,30 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
retry:
if (rw == READ && ext4_should_dioread_nolock(inode)) {
- if (unlikely(!list_empty(&ei->i_completed_io_list))) {
+ if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
mutex_lock(&inode->i_mutex);
- ext4_flush_completed_IO(inode);
+ ext4_flush_unwritten_io(inode);
mutex_unlock(&inode->i_mutex);
}
+ /*
+ * Nolock dioread optimization may be dynamically disabled
+ * via ext4_inode_block_unlocked_dio(). Check inode's state
+ * while holding extra i_dio_count ref.
+ */
+ atomic_inc(&inode->i_dio_count);
+ smp_mb();
+ if (unlikely(ext4_test_inode_state(inode,
+ EXT4_STATE_DIOREAD_LOCK))) {
+ inode_dio_done(inode);
+ goto locked;
+ }
ret = __blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext4_get_block, NULL, NULL, 0);
+ inode_dio_done(inode);
} else {
+locked:
ret = blockdev_direct_IO(rw, iocb, inode, iov,
offset, nr_segs, ext4_get_block);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index dff171c..6f0e7b6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2463,6 +2463,16 @@ static int ext4_nonda_switch(struct super_block *sb)
free_blocks = EXT4_C2B(sbi,
percpu_counter_read_positive(&sbi->s_freeclusters_counter));
dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
+ /*
+ * Start pushing delalloc when 1/2 of free blocks are dirty.
+ */
+ if (dirty_blocks && (free_blocks < 2 * dirty_blocks) &&
+ !writeback_in_progress(sb->s_bdi) &&
+ down_read_trylock(&sb->s_umount)) {
+ writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
+ up_read(&sb->s_umount);
+ }
+
if (2 * free_blocks < 3 * dirty_blocks ||
free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
/*
@@ -2471,13 +2481,6 @@ static int ext4_nonda_switch(struct super_block *sb)
*/
return 1;
}
- /*
- * Even if we don't switch but are nearing capacity,
- * start pushing delalloc when 1/2 of free blocks are dirty.
- */
- if (free_blocks < 2 * dirty_blocks)
- writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
-
return 0;
}
@@ -2879,9 +2882,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
ext4_io_end_t *io_end = iocb->private;
- struct workqueue_struct *wq;
- unsigned long flags;
- struct ext4_inode_info *ei;
/* if not async direct IO or dio with 0 bytes write, just return */
if (!io_end || !size)
@@ -2910,24 +2910,14 @@ out:
io_end->iocb = iocb;
io_end->result = ret;
}
- wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
- /* Add the io_end to per-inode completed aio dio list*/
- ei = EXT4_I(io_end->inode);
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &ei->i_completed_io_list);
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ ext4_add_complete_io(io_end);
}
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
{
ext4_io_end_t *io_end = bh->b_private;
- struct workqueue_struct *wq;
struct inode *inode;
- unsigned long flags;
if (!test_clear_buffer_uninit(bh) || !io_end)
goto out;
@@ -2946,15 +2936,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
*/
inode = io_end->inode;
ext4_set_io_unwritten_flag(inode, io_end);
-
- /* Add the io_end to per-inode completed io list*/
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
- wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ ext4_add_complete_io(io_end);
out:
bh->b_private = NULL;
bh->b_end_io = NULL;
@@ -3029,6 +3011,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
overwrite = *((int *)iocb->private);
if (overwrite) {
+ atomic_inc(&inode->i_dio_count);
down_read(&EXT4_I(inode)->i_data_sem);
mutex_unlock(&inode->i_mutex);
}
@@ -3054,7 +3037,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* hook to the iocb.
*/
iocb->private = NULL;
- EXT4_I(inode)->cur_aio_dio = NULL;
+ ext4_inode_aio_set(inode, NULL);
if (!is_sync_kiocb(iocb)) {
ext4_io_end_t *io_end =
ext4_init_io_end(inode, GFP_NOFS);
@@ -3071,7 +3054,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* is a unwritten extents needs to be converted
* when IO is completed.
*/
- EXT4_I(inode)->cur_aio_dio = iocb->private;
+ ext4_inode_aio_set(inode, io_end);
}
if (overwrite)
@@ -3091,7 +3074,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
NULL,
DIO_LOCKING);
if (iocb->private)
- EXT4_I(inode)->cur_aio_dio = NULL;
+ ext4_inode_aio_set(inode, NULL);
/*
* The io_end structure takes a reference to the inode,
* that structure needs to be destroyed and the
@@ -3126,6 +3109,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
retake_lock:
/* take i_mutex locking again if we do a ovewrite dio */
if (overwrite) {
+ inode_dio_done(inode);
up_read(&EXT4_I(inode)->i_data_sem);
mutex_lock(&inode->i_mutex);
}
@@ -4052,6 +4036,7 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
int err = 0, rc, block;
+ int need_datasync = 0;
uid_t i_uid;
gid_t i_gid;
@@ -4102,7 +4087,10 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_file_acl_high =
cpu_to_le16(ei->i_file_acl >> 32);
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
- ext4_isize_set(raw_inode, ei->i_disksize);
+ if (ei->i_disksize != ext4_isize(raw_inode)) {
+ ext4_isize_set(raw_inode, ei->i_disksize);
+ need_datasync = 1;
+ }
if (ei->i_disksize > 0x7fffffffULL) {
struct super_block *sb = inode->i_sb;
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -4155,7 +4143,7 @@ static int ext4_do_update_inode(handle_t *handle,
err = rc;
ext4_clear_inode_state(inode, EXT4_STATE_NEW);
- ext4_update_inode_fsync_trans(handle, inode, 0);
+ ext4_update_inode_fsync_trans(handle, inode, need_datasync);
out_brelse:
brelse(bh);
ext4_std_error(inode->i_sb, err);
@@ -4298,7 +4286,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- inode_dio_wait(inode);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4347,8 +4334,17 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- if (attr->ia_size != i_size_read(inode))
+ if (attr->ia_size != i_size_read(inode)) {
truncate_setsize(inode, attr->ia_size);
+ /* Inode size will be reduced, wait for dio in flight.
+ * Temporarily disable dioread_nolock to prevent
+ * livelock. */
+ if (orphan) {
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ ext4_inode_resume_unlocked_dio(inode);
+ }
+ }
ext4_truncate(inode);
}
@@ -4727,6 +4723,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
jbd2_journal_lock_updates(journal);
/*
@@ -4746,6 +4746,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
+ ext4_inode_resume_unlocked_dio(inode);
/* Finally we can mark the inode as dirty. */
@@ -4780,6 +4781,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
int retries = 0;
sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
!ext4_should_journal_data(inode) &&
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c5826c6..b3ab3c5 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -141,55 +141,21 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
}
/**
- * mext_check_null_inode - NULL check for two inodes
- *
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
- */
-static int
-mext_check_null_inode(struct inode *inode1, struct inode *inode2,
- const char *function, unsigned int line)
-{
- int ret = 0;
-
- if (inode1 == NULL) {
- __ext4_error(inode2->i_sb, function, line,
- "Both inodes should not be NULL: "
- "inode1 NULL inode2 %lu", inode2->i_ino);
- ret = -EIO;
- } else if (inode2 == NULL) {
- __ext4_error(inode1->i_sb, function, line,
- "Both inodes should not be NULL: "
- "inode1 %lu inode2 NULL", inode1->i_ino);
- ret = -EIO;
- }
- return ret;
-}
-
-/**
* double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
*
- * @orig_inode: original inode structure
- * @donor_inode: donor inode structure
- * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
- * i_ino order.
+ * Acquire write lock of i_data_sem of the two inodes
*/
static void
-double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
+double_down_write_data_sem(struct inode *first, struct inode *second)
{
- struct inode *first = orig_inode, *second = donor_inode;
+ if (first < second) {
+ down_write(&EXT4_I(first)->i_data_sem);
+ down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
+ } else {
+ down_write(&EXT4_I(second)->i_data_sem);
+ down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING);
- /*
- * Use the inode number to provide the stable locking order instead
- * of its address, because the C language doesn't guarantee you can
- * compare pointers that don't come from the same array.
- */
- if (donor_inode->i_ino < orig_inode->i_ino) {
- first = donor_inode;
- second = orig_inode;
}
-
- down_write(&EXT4_I(first)->i_data_sem);
- down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
}
/**
@@ -969,14 +935,6 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
- /* Files should be in the same ext4 FS */
- if (orig_inode->i_sb != donor_inode->i_sb) {
- ext4_debug("ext4 move extent: The argument files "
- "should be in same FS [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }
-
/* Ext4 move extent supports only extent based file */
if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: orig file is not extents "
@@ -1072,35 +1030,19 @@ mext_check_arguments(struct inode *orig_inode,
* @inode1: the inode structure
* @inode2: the inode structure
*
- * Lock two inodes' i_mutex by i_ino order.
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
+ * Lock two inodes' i_mutex
*/
-static int
+static void
mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
{
- int ret = 0;
-
- BUG_ON(inode1 == NULL && inode2 == NULL);
-
- ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
- if (ret < 0)
- goto out;
-
- if (inode1 == inode2) {
- mutex_lock(&inode1->i_mutex);
- goto out;
- }
-
- if (inode1->i_ino < inode2->i_ino) {
+ BUG_ON(inode1 == inode2);
+ if (inode1 < inode2) {
mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
} else {
mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
}
-
-out:
- return ret;
}
/**
@@ -1109,28 +1051,13 @@ out:
* @inode1: the inode that is released first
* @inode2: the inode that is released second
*
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
*/
-static int
+static void
mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
{
- int ret = 0;
-
- BUG_ON(inode1 == NULL && inode2 == NULL);
-
- ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
- if (ret < 0)
- goto out;
-
- if (inode1)
- mutex_unlock(&inode1->i_mutex);
-
- if (inode2 && inode2 != inode1)
- mutex_unlock(&inode2->i_mutex);
-
-out:
- return ret;
+ mutex_unlock(&inode1->i_mutex);
+ mutex_unlock(&inode2->i_mutex);
}
/**
@@ -1187,16 +1114,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
ext4_lblk_t rest_blocks;
pgoff_t orig_page_offset = 0, seq_end_page;
- int ret1, ret2, depth, last_extent = 0;
+ int ret, depth, last_extent = 0;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
int data_offset_in_page;
int block_len_in_page;
int uninit;
- /* orig and donor should be different file */
- if (orig_inode->i_ino == donor_inode->i_ino) {
+ if (orig_inode->i_sb != donor_inode->i_sb) {
+ ext4_debug("ext4 move extent: The argument files "
+ "should be in same FS [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* orig and donor should be different inodes */
+ if (orig_inode == donor_inode) {
ext4_debug("ext4 move extent: The argument files should not "
- "be same file [ino:orig %lu, donor %lu]\n",
+ "be same inode [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
@@ -1208,18 +1142,27 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
-
+ /* TODO: This is non obvious task to swap blocks for inodes with full
+ jornaling enabled */
+ if (ext4_should_journal_data(orig_inode) ||
+ ext4_should_journal_data(donor_inode)) {
+ return -EINVAL;
+ }
/* Protect orig and donor inodes against a truncate */
- ret1 = mext_inode_double_lock(orig_inode, donor_inode);
- if (ret1 < 0)
- return ret1;
+ mext_inode_double_lock(orig_inode, donor_inode);
+
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(orig_inode);
+ ext4_inode_block_unlocked_dio(donor_inode);
+ inode_dio_wait(orig_inode);
+ inode_dio_wait(donor_inode);
/* Protect extent tree against block allocations via delalloc */
double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
- ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
+ ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
donor_start, &len);
- if (ret1)
+ if (ret)
goto out;
file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
@@ -1227,13 +1170,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
if (file_end < block_end)
len -= block_end - file_end;
- ret1 = get_ext_path(orig_inode, block_start, &orig_path);
- if (ret1)
+ ret = get_ext_path(orig_inode, block_start, &orig_path);
+ if (ret)
goto out;
/* Get path structure to check the hole */
- ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
- if (ret1)
+ ret = get_ext_path(orig_inode, block_start, &holecheck_path);
+ if (ret)
goto out;
depth = ext_depth(orig_inode);
@@ -1252,13 +1195,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
last_extent = mext_next_extent(orig_inode,
holecheck_path, &ext_cur);
if (last_extent < 0) {
- ret1 = last_extent;
+ ret = last_extent;
goto out;
}
last_extent = mext_next_extent(orig_inode, orig_path,
&ext_dummy);
if (last_extent < 0) {
- ret1 = last_extent;
+ ret = last_extent;
goto out;
}
seq_start = le32_to_cpu(ext_cur->ee_block);
@@ -1272,7 +1215,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
if (le32_to_cpu(ext_cur->ee_block) > block_end) {
ext4_debug("ext4 move extent: The specified range of file "
"may be the hole\n");
- ret1 = -EINVAL;
+ ret = -EINVAL;
goto out;
}
@@ -1292,7 +1235,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
last_extent = mext_next_extent(orig_inode, holecheck_path,
&ext_cur);
if (last_extent < 0) {
- ret1 = last_extent;
+ ret = last_extent;
break;
}
add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1349,18 +1292,18 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
orig_page_offset,
data_offset_in_page,
block_len_in_page, uninit,
- &ret1);
+ &ret);
/* Count how many blocks we have exchanged */
*moved_len += block_len_in_page;
- if (ret1 < 0)
+ if (ret < 0)
break;
if (*moved_len > len) {
EXT4_ERROR_INODE(orig_inode,
"We replaced blocks too much! "
"sum of replaced: %llu requested: %llu",
*moved_len, len);
- ret1 = -EIO;
+ ret = -EIO;
break;
}
@@ -1374,22 +1317,22 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
}
double_down_write_data_sem(orig_inode, donor_inode);
- if (ret1 < 0)
+ if (ret < 0)
break;
/* Decrease buffer counter */
if (holecheck_path)
ext4_ext_drop_refs(holecheck_path);
- ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
- if (ret1)
+ ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
+ if (ret)
break;
depth = holecheck_path->p_depth;
/* Decrease buffer counter */
if (orig_path)
ext4_ext_drop_refs(orig_path);
- ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
- if (ret1)
+ ret = get_ext_path(orig_inode, seq_start, &orig_path);
+ if (ret)
break;
ext_cur = holecheck_path[depth].p_ext;
@@ -1412,12 +1355,9 @@ out:
kfree(holecheck_path);
}
double_up_write_data_sem(orig_inode, donor_inode);
- ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
-
- if (ret1)
- return ret1;
- else if (ret2)
- return ret2;
+ ext4_inode_resume_unlocked_dio(orig_inode);
+ ext4_inode_resume_unlocked_dio(donor_inode);
+ mext_inode_double_unlock(orig_inode, donor_inode);
- return 0;
+ return ret;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2a42cc0..d13873d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2149,9 +2149,7 @@ retry:
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
-#ifdef CONFIG_EXT4_FS_XATTR
inode->i_op = &ext4_special_inode_operations;
-#endif
err = ext4_add_nondir(handle, dentry, inode);
}
ext4_journal_stop(handle);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dcdeef1..68e896e 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -71,6 +71,9 @@ void ext4_free_io_end(ext4_io_end_t *io)
int i;
BUG_ON(!io);
+ BUG_ON(!list_empty(&io->list));
+ BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
+
if (io->page)
put_page(io->page);
for (i = 0; i < io->num_io_pages; i++)
@@ -81,13 +84,8 @@ void ext4_free_io_end(ext4_io_end_t *io)
kmem_cache_free(io_end_cachep, io);
}
-/*
- * check a range of space and convert unwritten extents to written.
- *
- * Called with inode->i_mutex; we depend on this when we manipulate
- * io->flag, since we could otherwise race with ext4_flush_completed_IO()
- */
-int ext4_end_io_nolock(ext4_io_end_t *io)
+/* check a range of space and convert unwritten extents to written. */
+static int ext4_end_io(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
loff_t offset = io->offset;
@@ -106,63 +104,136 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
"(inode %lu, offset %llu, size %zd, error %d)",
inode->i_ino, offset, size, ret);
}
-
if (io->iocb)
aio_complete(io->iocb, io->result, 0);
if (io->flag & EXT4_IO_END_DIRECT)
inode_dio_done(inode);
/* Wake up anyone waiting on unwritten extent conversion */
- if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
+ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
wake_up_all(ext4_ioend_wq(io->inode));
return ret;
}
-/*
- * work on completed aio dio IO, to convert unwritten extents to extents
- */
-static void ext4_end_io_work(struct work_struct *work)
+static void dump_completed_IO(struct inode *inode)
+{
+#ifdef EXT4FS_DEBUG
+ struct list_head *cur, *before, *after;
+ ext4_io_end_t *io, *io0, *io1;
+ unsigned long flags;
+
+ if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
+ ext4_debug("inode %lu completed_io list is empty\n",
+ inode->i_ino);
+ return;
+ }
+
+ ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
+ list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
+ cur = &io->list;
+ before = cur->prev;
+ io0 = container_of(before, ext4_io_end_t, list);
+ after = cur->next;
+ io1 = container_of(after, ext4_io_end_t, list);
+
+ ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+ io, inode->i_ino, io0, io1);
+ }
+#endif
+}
+
+/* Add the io_end to per-inode completed end_io list. */
+void ext4_add_complete_io(ext4_io_end_t *io_end)
{
- ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
- struct inode *inode = io->inode;
- struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long flags;
+ struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+ struct workqueue_struct *wq;
+ unsigned long flags;
+
+ BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (io->flag & EXT4_IO_END_IN_FSYNC)
- goto requeue;
- if (list_empty(&io->list)) {
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- goto free;
+ if (list_empty(&ei->i_completed_io_list)) {
+ io_end->flag |= EXT4_IO_END_QUEUED;
+ queue_work(wq, &io_end->work);
}
+ list_add_tail(&io_end->list, &ei->i_completed_io_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
- if (!mutex_trylock(&inode->i_mutex)) {
- bool was_queued;
-requeue:
- was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
- io->flag |= EXT4_IO_END_QUEUED;
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- /*
- * Requeue the work instead of waiting so that the work
- * items queued after this can be processed.
- */
- queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
- /*
- * To prevent the ext4-dio-unwritten thread from keeping
- * requeueing end_io requests and occupying cpu for too long,
- * yield the cpu if it sees an end_io request that has already
- * been requeued.
- */
- if (was_queued)
- yield();
- return;
+static int ext4_do_flush_completed_IO(struct inode *inode,
+ ext4_io_end_t *work_io)
+{
+ ext4_io_end_t *io;
+ struct list_head unwritten, complete, to_free;
+ unsigned long flags;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int err, ret = 0;
+
+ INIT_LIST_HEAD(&complete);
+ INIT_LIST_HEAD(&to_free);
+
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ dump_completed_IO(inode);
+ list_replace_init(&ei->i_completed_io_list, &unwritten);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+ while (!list_empty(&unwritten)) {
+ io = list_entry(unwritten.next, ext4_io_end_t, list);
+ BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
+ list_del_init(&io->list);
+
+ err = ext4_end_io(io);
+ if (unlikely(!ret && err))
+ ret = err;
+
+ list_add_tail(&io->list, &complete);
+ }
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ while (!list_empty(&complete)) {
+ io = list_entry(complete.next, ext4_io_end_t, list);
+ io->flag &= ~EXT4_IO_END_UNWRITTEN;
+ /* end_io context can not be destroyed now because it still
+ * used by queued worker. Worker thread will destroy it later */
+ if (io->flag & EXT4_IO_END_QUEUED)
+ list_del_init(&io->list);
+ else
+ list_move(&io->list, &to_free);
+ }
+ /* If we are called from worker context, it is time to clear queued
+ * flag, and destroy it's end_io if it was converted already */
+ if (work_io) {
+ work_io->flag &= ~EXT4_IO_END_QUEUED;
+ if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
+ list_add_tail(&work_io->list, &to_free);
}
- list_del_init(&io->list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- (void) ext4_end_io_nolock(io);
- mutex_unlock(&inode->i_mutex);
-free:
- ext4_free_io_end(io);
+
+ while (!list_empty(&to_free)) {
+ io = list_entry(to_free.next, ext4_io_end_t, list);
+ list_del_init(&io->list);
+ ext4_free_io_end(io);
+ }
+ return ret;
+}
+
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ */
+static void ext4_end_io_work(struct work_struct *work)
+{
+ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+ ext4_do_flush_completed_IO(io->inode, io);
+}
+
+int ext4_flush_unwritten_io(struct inode *inode)
+{
+ int ret;
+ WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&
+ !(inode->i_state & I_FREEING));
+ ret = ext4_do_flush_completed_IO(inode, NULL);
+ ext4_unwritten_wait(inode);
+ return ret;
}
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -195,9 +266,7 @@ static void buffer_io_error(struct buffer_head *bh)
static void ext4_end_bio(struct bio *bio, int error)
{
ext4_io_end_t *io_end = bio->bi_private;
- struct workqueue_struct *wq;
struct inode *inode;
- unsigned long flags;
int i;
sector_t bi_sector = bio->bi_sector;
@@ -255,14 +324,7 @@ static void ext4_end_bio(struct bio *bio, int error)
return;
}
- /* Add the io_end to per-inode completed io list*/
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
- wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ ext4_add_complete_io(io_end);
}
void ext4_io_submit(struct ext4_io_submit *io)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 41f6ef6..0be1789 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -200,8 +200,11 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
* be a partial of a flex group.
*
* @sb: super block of fs to which the groups belongs
+ *
+ * Returns 0 on a successful allocation of the metadata blocks in the
+ * block group.
*/
-static void ext4_alloc_group_tables(struct super_block *sb,
+static int ext4_alloc_group_tables(struct super_block *sb,
struct ext4_new_flex_group_data *flex_gd,
int flexbg_size)
{
@@ -226,6 +229,8 @@ static void ext4_alloc_group_tables(struct super_block *sb,
(last_group & ~(flexbg_size - 1))));
next_group:
group = group_data[0].group;
+ if (src_group >= group_data[0].group + flex_gd->count)
+ return -ENOSPC;
start_blk = ext4_group_first_block_no(sb, src_group);
last_blk = start_blk + group_data[src_group - group].blocks_count;
@@ -235,7 +240,6 @@ next_group:
start_blk += overhead;
- BUG_ON(src_group >= group_data[0].group + flex_gd->count);
/* We collect contiguous blocks as much as possible. */
src_group++;
for (; src_group <= last_group; src_group++)
@@ -300,6 +304,7 @@ next_group:
group_data[i].free_blocks_count);
}
}
+ return 0;
}
static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
@@ -451,6 +456,9 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
gdblocks = ext4_bg_num_gdb(sb, group);
start = ext4_group_first_block_no(sb, group);
+ if (!ext4_bg_has_super(sb, group))
+ goto handle_itb;
+
/* Copy all of the GDT blocks into the backup in this group */
for (j = 0, block = start + 1; j < gdblocks; j++, block++) {
struct buffer_head *gdb;
@@ -493,6 +501,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
goto out;
}
+handle_itb:
/* Initialize group tables of the grop @group */
if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
goto handle_bb;
@@ -1349,13 +1358,15 @@ exit_journal:
err = err2;
if (!err) {
- int i;
+ int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+ int gdb_num_end = ((group + flex_gd->count - 1) /
+ EXT4_DESC_PER_BLOCK(sb));
+
update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
sizeof(struct ext4_super_block));
- for (i = 0; i < flex_gd->count; i++, group++) {
+ for (; gdb_num <= gdb_num_end; gdb_num++) {
struct buffer_head *gdb_bh;
- int gdb_num;
- gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb);
+
gdb_bh = sbi->s_group_desc[gdb_num];
update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
gdb_bh->b_size);
@@ -1729,7 +1740,8 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
*/
while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
flexbg_size)) {
- ext4_alloc_group_tables(sb, flex_gd, flexbg_size);
+ if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0)
+ break;
err = ext4_flex_group_add(sb, resize_inode, flex_gd);
if (unlikely(err))
break;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c6e0cb3..d3c9ffa 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -956,11 +956,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->jinode = NULL;
INIT_LIST_HEAD(&ei->i_completed_io_list);
spin_lock_init(&ei->i_completed_io_lock);
- ei->cur_aio_dio = NULL;
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
- atomic_set(&ei->i_aiodio_unwritten, 0);
+ atomic_set(&ei->i_unwritten, 0);
return &ei->vfs_inode;
}
@@ -1735,7 +1734,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
static const char *token2str(int token)
{
- static const struct match_token *t;
+ const struct match_token *t;
for (t = tokens; t->token != Opt_err; t++)
if (t->token == token && !strchr(t->pattern, '='))
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index be3efc4..5602d73 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -63,6 +63,7 @@ int writeback_in_progress(struct backing_dev_info *bdi)
{
return test_bit(BDI_writeback_running, &bdi->state);
}
+EXPORT_SYMBOL(writeback_in_progress);
static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index e8ed6d4..4767774 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -161,6 +161,8 @@ static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
case GFS2_SMALL_FH_SIZE:
case GFS2_LARGE_FH_SIZE:
case GFS2_OLD_FH_SIZE:
+ if (fh_len < GFS2_SMALL_FH_SIZE)
+ return NULL;
this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32;
this.no_formal_ino |= be32_to_cpu(fh[1]);
this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32;
@@ -180,6 +182,8 @@ static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid,
switch (fh_type) {
case GFS2_LARGE_FH_SIZE:
case GFS2_OLD_FH_SIZE:
+ if (fh_len < GFS2_LARGE_FH_SIZE)
+ return NULL;
parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32;
parent.no_formal_ino |= be32_to_cpu(fh[5]);
parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32;
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index 1d38044..2b4f235 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -175,7 +175,7 @@ static struct dentry *isofs_fh_to_parent(struct super_block *sb,
{
struct isofs_fid *ifid = (struct isofs_fid *)fid;
- if (fh_type != 2)
+ if (fh_len < 2 || fh_type != 2)
return NULL;
return isofs_export_iget(sb,
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 52c15c7..86b39b1 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -86,7 +86,12 @@ nope:
static void release_data_buffer(struct buffer_head *bh)
{
if (buffer_freed(bh)) {
+ WARN_ON_ONCE(buffer_dirty(bh));
clear_buffer_freed(bh);
+ clear_buffer_mapped(bh);
+ clear_buffer_new(bh);
+ clear_buffer_req(bh);
+ bh->b_bdev = NULL;
release_buffer_page(bh);
} else
put_bh(bh);
@@ -866,17 +871,35 @@ restart_loop:
* there's no point in keeping a checkpoint record for
* it. */
- /* A buffer which has been freed while still being
- * journaled by a previous transaction may end up still
- * being dirty here, but we want to avoid writing back
- * that buffer in the future after the "add to orphan"
- * operation been committed, That's not only a performance
- * gain, it also stops aliasing problems if the buffer is
- * left behind for writeback and gets reallocated for another
- * use in a different page. */
- if (buffer_freed(bh) && !jh->b_next_transaction) {
- clear_buffer_freed(bh);
- clear_buffer_jbddirty(bh);
+ /*
+ * A buffer which has been freed while still being journaled by
+ * a previous transaction.
+ */
+ if (buffer_freed(bh)) {
+ /*
+ * If the running transaction is the one containing
+ * "add to orphan" operation (b_next_transaction !=
+ * NULL), we have to wait for that transaction to
+ * commit before we can really get rid of the buffer.
+ * So just clear b_modified to not confuse transaction
+ * credit accounting and refile the buffer to
+ * BJ_Forget of the running transaction. If the just
+ * committed transaction contains "add to orphan"
+ * operation, we can completely invalidate the buffer
+ * now. We are rather throughout in that since the
+ * buffer may be still accessible when blocksize <
+ * pagesize and it is attached to the last partial
+ * page.
+ */
+ jh->b_modified = 0;
+ if (!jh->b_next_transaction) {
+ clear_buffer_freed(bh);
+ clear_buffer_jbddirty(bh);
+ clear_buffer_mapped(bh);
+ clear_buffer_new(bh);
+ clear_buffer_req(bh);
+ bh->b_bdev = NULL;
+ }
}
if (buffer_jbddirty(bh)) {
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index febc10d..78b7f84 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1843,15 +1843,16 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
* We're outside-transaction here. Either or both of j_running_transaction
* and j_committing_transaction may be NULL.
*/
-static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
+static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
+ int partial_page)
{
transaction_t *transaction;
struct journal_head *jh;
int may_free = 1;
- int ret;
BUFFER_TRACE(bh, "entry");
+retry:
/*
* It is safe to proceed here without the j_list_lock because the
* buffers cannot be stolen by try_to_free_buffers as long as we are
@@ -1879,10 +1880,18 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* clear the buffer dirty bit at latest at the moment when the
* transaction marking the buffer as freed in the filesystem
* structures is committed because from that moment on the
- * buffer can be reallocated and used by a different page.
+ * block can be reallocated and used by a different page.
* Since the block hasn't been freed yet but the inode has
* already been added to orphan list, it is safe for us to add
* the buffer to BJ_Forget list of the newest transaction.
+ *
+ * Also we have to clear buffer_mapped flag of a truncated buffer
+ * because the buffer_head may be attached to the page straddling
+ * i_size (can happen only when blocksize < pagesize) and thus the
+ * buffer_head can be reused when the file is extended again. So we end
+ * up keeping around invalidated buffers attached to transactions'
+ * BJ_Forget list just to stop checkpointing code from cleaning up
+ * the transaction this buffer was modified in.
*/
transaction = jh->b_transaction;
if (transaction == NULL) {
@@ -1909,13 +1918,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* committed, the buffer won't be needed any
* longer. */
JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
- ret = __dispose_buffer(jh,
+ may_free = __dispose_buffer(jh,
journal->j_running_transaction);
- journal_put_journal_head(jh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- spin_unlock(&journal->j_state_lock);
- return ret;
+ goto zap_buffer;
} else {
/* There is no currently-running transaction. So the
* orphan record which we wrote for this file must have
@@ -1923,13 +1928,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* the committing transaction, if it exists. */
if (journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "give to committing trans");
- ret = __dispose_buffer(jh,
+ may_free = __dispose_buffer(jh,
journal->j_committing_transaction);
- journal_put_journal_head(jh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- spin_unlock(&journal->j_state_lock);
- return ret;
+ goto zap_buffer;
} else {
/* The orphan record's transaction has
* committed. We can cleanse this buffer */
@@ -1950,10 +1951,24 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
}
/*
* The buffer is committing, we simply cannot touch
- * it. So we just set j_next_transaction to the
- * running transaction (if there is one) and mark
- * buffer as freed so that commit code knows it should
- * clear dirty bits when it is done with the buffer.
+ * it. If the page is straddling i_size we have to wait
+ * for commit and try again.
+ */
+ if (partial_page) {
+ tid_t tid = journal->j_committing_transaction->t_tid;
+
+ journal_put_journal_head(jh);
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ spin_unlock(&journal->j_state_lock);
+ log_wait_commit(journal, tid);
+ goto retry;
+ }
+ /*
+ * OK, buffer won't be reachable after truncate. We just set
+ * j_next_transaction to the running transaction (if there is
+ * one) and mark buffer as freed so that commit code knows it
+ * should clear dirty bits when it is done with the buffer.
*/
set_buffer_freed(bh);
if (journal->j_running_transaction && buffer_jbddirty(bh))
@@ -1976,6 +1991,14 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
}
zap_buffer:
+ /*
+ * This is tricky. Although the buffer is truncated, it may be reused
+ * if blocksize < pagesize and it is attached to the page straddling
+ * EOF. Since the buffer might have been added to BJ_Forget list of the
+ * running transaction, journal_get_write_access() won't clear
+ * b_modified and credit accounting gets confused. So clear b_modified
+ * here. */
+ jh->b_modified = 0;
journal_put_journal_head(jh);
zap_buffer_no_jh:
spin_unlock(&journal->j_list_lock);
@@ -2024,7 +2047,8 @@ void journal_invalidatepage(journal_t *journal,
if (offset <= curr_off) {
/* This block is wholly outside the truncation point */
lock_buffer(bh);
- may_free &= journal_unmap_buffer(journal, bh);
+ may_free &= journal_unmap_buffer(journal, bh,
+ offset > 0);
unlock_buffer(bh);
}
curr_off = next_off;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e149b99..484b8d1 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1354,6 +1354,11 @@ static void jbd2_mark_journal_empty(journal_t *journal)
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
read_lock(&journal->j_state_lock);
+ /* Is it already empty? */
+ if (sb->s_start == 0) {
+ read_unlock(&journal->j_state_lock);
+ return;
+ }
jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
journal->j_tail_sequence);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 61ea413..1224d6b 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -100,6 +100,10 @@ static int jffs2_sync_fs(struct super_block *sb, int wait)
{
struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
+ cancel_delayed_work_sync(&c->wbuf_dwork);
+#endif
+
mutex_lock(&c->alloc_sem);
jffs2_flush_wbuf_pad(c);
mutex_unlock(&c->alloc_sem);
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 6f4529d..a6597d6 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1044,10 +1044,10 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
ops.datbuf = NULL;
ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
- if (ret || ops.oobretlen != ops.ooblen) {
+ if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) {
pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n",
jeb->offset, ops.ooblen, ops.oobretlen, ret);
- if (!ret)
+ if (!ret || mtd_is_bitflip(ret))
ret = -EIO;
return ret;
}
@@ -1086,10 +1086,10 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
ops.datbuf = NULL;
ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
- if (ret || ops.oobretlen != ops.ooblen) {
+ if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) {
pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n",
jeb->offset, ops.ooblen, ops.oobretlen, ret);
- if (!ret)
+ if (!ret || mtd_is_bitflip(ret))
ret = -EIO;
return ret;
}
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 7ef14b3..e4fb3ba 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -7,7 +7,6 @@
*/
#include <linux/types.h>
-#include <linux/utsname.h>
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/slab.h>
@@ -19,6 +18,8 @@
#include <asm/unaligned.h>
+#include "netns.h"
+
#define NLMDBG_FACILITY NLMDBG_MONITOR
#define NSM_PROGRAM 100024
#define NSM_VERSION 1
@@ -40,6 +41,7 @@ struct nsm_args {
u32 proc;
char *mon_name;
+ char *nodename;
};
struct nsm_res {
@@ -70,7 +72,7 @@ static struct rpc_clnt *nsm_create(struct net *net)
};
struct rpc_create_args args = {
.net = net,
- .protocol = XPRT_TRANSPORT_UDP,
+ .protocol = XPRT_TRANSPORT_TCP,
.address = (struct sockaddr *)&sin,
.addrsize = sizeof(sin),
.servername = "rpc.statd",
@@ -83,10 +85,54 @@ static struct rpc_clnt *nsm_create(struct net *net)
return rpc_create(&args);
}
-static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
- struct net *net)
+static struct rpc_clnt *nsm_client_get(struct net *net)
{
+ static DEFINE_MUTEX(nsm_create_mutex);
struct rpc_clnt *clnt;
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
+
+ spin_lock(&ln->nsm_clnt_lock);
+ if (ln->nsm_users) {
+ ln->nsm_users++;
+ clnt = ln->nsm_clnt;
+ spin_unlock(&ln->nsm_clnt_lock);
+ goto out;
+ }
+ spin_unlock(&ln->nsm_clnt_lock);
+
+ mutex_lock(&nsm_create_mutex);
+ clnt = nsm_create(net);
+ if (!IS_ERR(clnt)) {
+ ln->nsm_clnt = clnt;
+ smp_wmb();
+ ln->nsm_users = 1;
+ }
+ mutex_unlock(&nsm_create_mutex);
+out:
+ return clnt;
+}
+
+static void nsm_client_put(struct net *net)
+{
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
+ struct rpc_clnt *clnt = ln->nsm_clnt;
+ int shutdown = 0;
+
+ spin_lock(&ln->nsm_clnt_lock);
+ if (ln->nsm_users) {
+ if (--ln->nsm_users)
+ ln->nsm_clnt = NULL;
+ shutdown = !ln->nsm_users;
+ }
+ spin_unlock(&ln->nsm_clnt_lock);
+
+ if (shutdown)
+ rpc_shutdown_client(clnt);
+}
+
+static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
+ struct rpc_clnt *clnt)
+{
int status;
struct nsm_args args = {
.priv = &nsm->sm_priv,
@@ -94,31 +140,24 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
.vers = 3,
.proc = NLMPROC_NSM_NOTIFY,
.mon_name = nsm->sm_mon_name,
+ .nodename = clnt->cl_nodename,
};
struct rpc_message msg = {
.rpc_argp = &args,
.rpc_resp = res,
};
- clnt = nsm_create(net);
- if (IS_ERR(clnt)) {
- status = PTR_ERR(clnt);
- dprintk("lockd: failed to create NSM upcall transport, "
- "status=%d\n", status);
- goto out;
- }
+ BUG_ON(clnt == NULL);
memset(res, 0, sizeof(*res));
msg.rpc_proc = &clnt->cl_procinfo[proc];
- status = rpc_call_sync(clnt, &msg, 0);
+ status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
if (status < 0)
dprintk("lockd: NSM upcall RPC failed, status=%d\n",
status);
else
status = 0;
- rpc_shutdown_client(clnt);
- out:
return status;
}
@@ -138,6 +177,7 @@ int nsm_monitor(const struct nlm_host *host)
struct nsm_handle *nsm = host->h_nsmhandle;
struct nsm_res res;
int status;
+ struct rpc_clnt *clnt;
dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
@@ -150,7 +190,15 @@ int nsm_monitor(const struct nlm_host *host)
*/
nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
- status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host->net);
+ clnt = nsm_client_get(host->net);
+ if (IS_ERR(clnt)) {
+ status = PTR_ERR(clnt);
+ dprintk("lockd: failed to create NSM upcall transport, "
+ "status=%d, net=%p\n", status, host->net);
+ return status;
+ }
+
+ status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, clnt);
if (unlikely(res.status != 0))
status = -EIO;
if (unlikely(status < 0)) {
@@ -182,9 +230,11 @@ void nsm_unmonitor(const struct nlm_host *host)
if (atomic_read(&nsm->sm_count) == 1
&& nsm->sm_monitored && !nsm->sm_sticky) {
+ struct lockd_net *ln = net_generic(host->net, lockd_net_id);
+
dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
- status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host->net);
+ status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, ln->nsm_clnt);
if (res.status != 0)
status = -EIO;
if (status < 0)
@@ -192,6 +242,8 @@ void nsm_unmonitor(const struct nlm_host *host)
nsm->sm_name);
else
nsm->sm_monitored = 0;
+
+ nsm_client_put(host->net);
}
}
@@ -430,7 +482,7 @@ static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
{
__be32 *p;
- encode_nsm_string(xdr, utsname()->nodename);
+ encode_nsm_string(xdr, argp->nodename);
p = xdr_reserve_space(xdr, 4 + 4 + 4);
*p++ = cpu_to_be32(argp->prog);
*p++ = cpu_to_be32(argp->vers);
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 4eee248..5010b55 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -12,6 +12,10 @@ struct lockd_net {
struct delayed_work grace_period_end;
struct lock_manager lockd_manager;
struct list_head grace_list;
+
+ spinlock_t nsm_clnt_lock;
+ unsigned int nsm_users;
+ struct rpc_clnt *nsm_clnt;
};
extern int lockd_net_id;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 31a63f8..7e35587 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -596,6 +596,7 @@ static int lockd_init_net(struct net *net)
INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
INIT_LIST_HEAD(&ln->grace_list);
+ spin_lock_init(&ln->nsm_clnt_lock);
return 0;
}
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index fb1a2be..8d80c99 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -289,7 +289,6 @@ static void nlmsvc_free_block(struct kref *kref)
dprintk("lockd: freeing block %p...\n", block);
/* Remove block from file's list of blocks */
- mutex_lock(&file->f_mutex);
list_del_init(&block->b_flist);
mutex_unlock(&file->f_mutex);
@@ -303,7 +302,7 @@ static void nlmsvc_free_block(struct kref *kref)
static void nlmsvc_release_block(struct nlm_block *block)
{
if (block != NULL)
- kref_put(&block->b_count, nlmsvc_free_block);
+ kref_put_mutex(&block->b_count, nlmsvc_free_block, &block->b_file->f_mutex);
}
/*
diff --git a/fs/namei.c b/fs/namei.c
index dd1ed1b..81bd546 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -692,9 +692,9 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd)
if (parent->i_uid == inode->i_uid)
return 0;
+ audit_log_link_denied("follow_link", link);
path_put_conditional(link, nd);
path_put(&nd->path);
- audit_log_link_denied("follow_link", link);
return -EACCES;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index 4d31f73..7bdf790 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1886,8 +1886,14 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
return err;
err = -EINVAL;
- if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(real_mount(path->mnt)))
- goto unlock;
+ if (unlikely(!check_mnt(real_mount(path->mnt)))) {
+ /* that's acceptable only for automounts done in private ns */
+ if (!(mnt_flags & MNT_SHRINKABLE))
+ goto unlock;
+ /* ... and for those we'd better have mountpoint still alive */
+ if (!real_mount(path->mnt)->mnt_ns)
+ goto unlock;
+ }
/* Refuse the same filesystem on the same mount point */
err = -EBUSY;
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index dd392ed..f3d16ad 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -162,25 +162,39 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
return bio;
}
-static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
sector_t isect, struct page *page,
struct pnfs_block_extent *be,
void (*end_io)(struct bio *, int err),
- struct parallel_io *par)
+ struct parallel_io *par,
+ unsigned int offset, int len)
{
+ isect = isect + (offset >> SECTOR_SHIFT);
+ dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
+ npg, rw, (unsigned long long)isect, offset, len);
retry:
if (!bio) {
bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
if (!bio)
return ERR_PTR(-ENOMEM);
}
- if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+ if (bio_add_page(bio, page, len, offset) < len) {
bio = bl_submit_bio(rw, bio);
goto retry;
}
return bio;
}
+static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+ sector_t isect, struct page *page,
+ struct pnfs_block_extent *be,
+ void (*end_io)(struct bio *, int err),
+ struct parallel_io *par)
+{
+ return do_add_page_to_bio(bio, npg, rw, isect, page, be,
+ end_io, par, 0, PAGE_CACHE_SIZE);
+}
+
/* This is basically copied from mpage_end_io_read */
static void bl_end_io_read(struct bio *bio, int err)
{
@@ -461,6 +475,106 @@ map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
return;
}
+static void
+bl_read_single_end_io(struct bio *bio, int error)
+{
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct page *page = bvec->bv_page;
+
+ /* Only one page in bvec */
+ unlock_page(page);
+}
+
+static int
+bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
+ unsigned int offset, unsigned int len)
+{
+ struct bio *bio;
+ struct page *shadow_page;
+ sector_t isect;
+ char *kaddr, *kshadow_addr;
+ int ret = 0;
+
+ dprintk("%s: offset %u len %u\n", __func__, offset, len);
+
+ shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (shadow_page == NULL)
+ return -ENOMEM;
+
+ bio = bio_alloc(GFP_NOIO, 1);
+ if (bio == NULL)
+ return -ENOMEM;
+
+ isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
+ (offset / SECTOR_SIZE);
+
+ bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+ bio->bi_bdev = be->be_mdev;
+ bio->bi_end_io = bl_read_single_end_io;
+
+ lock_page(shadow_page);
+ if (bio_add_page(bio, shadow_page,
+ SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
+ unlock_page(shadow_page);
+ bio_put(bio);
+ return -EIO;
+ }
+
+ submit_bio(READ, bio);
+ wait_on_page_locked(shadow_page);
+ if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
+ ret = -EIO;
+ } else {
+ kaddr = kmap_atomic(page);
+ kshadow_addr = kmap_atomic(shadow_page);
+ memcpy(kaddr + offset, kshadow_addr + offset, len);
+ kunmap_atomic(kshadow_addr);
+ kunmap_atomic(kaddr);
+ }
+ __free_page(shadow_page);
+ bio_put(bio);
+
+ return ret;
+}
+
+static int
+bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
+ unsigned int dirty_offset, unsigned int dirty_len,
+ bool full_page)
+{
+ int ret = 0;
+ unsigned int start, end;
+
+ if (full_page) {
+ start = 0;
+ end = PAGE_CACHE_SIZE;
+ } else {
+ start = round_down(dirty_offset, SECTOR_SIZE);
+ end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
+ }
+
+ dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
+ if (!be) {
+ zero_user_segments(page, start, dirty_offset,
+ dirty_offset + dirty_len, end);
+ if (start == 0 && end == PAGE_CACHE_SIZE &&
+ trylock_page(page)) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+ return ret;
+ }
+
+ if (start != dirty_offset)
+ ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
+
+ if (!ret && (dirty_offset + dirty_len < end))
+ ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
+ end - dirty_offset - dirty_len);
+
+ return ret;
+}
+
/* Given an unmapped page, zero it or read in page for COW, page is locked
* by caller.
*/
@@ -494,7 +608,6 @@ init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
SetPageUptodate(page);
cleanup:
- bl_put_extent(cow_read);
if (bh)
free_buffer_head(bh);
if (ret) {
@@ -566,6 +679,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
struct parallel_io *par = NULL;
loff_t offset = wdata->args.offset;
size_t count = wdata->args.count;
+ unsigned int pg_offset, pg_len, saved_len;
struct page **pages = wdata->args.pages;
struct page *page;
pgoff_t index;
@@ -674,10 +788,11 @@ next_page:
if (!extent_length) {
/* We've used up the previous extent */
bl_put_extent(be);
+ bl_put_extent(cow_read);
bio = bl_submit_bio(WRITE, bio);
/* Get the next one */
be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
- isect, NULL);
+ isect, &cow_read);
if (!be || !is_writable(be, isect)) {
header->pnfs_error = -EINVAL;
goto out;
@@ -694,7 +809,26 @@ next_page:
extent_length = be->be_length -
(isect - be->be_f_offset);
}
- if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+
+ dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
+ pg_offset = offset & ~PAGE_CACHE_MASK;
+ if (pg_offset + count > PAGE_CACHE_SIZE)
+ pg_len = PAGE_CACHE_SIZE - pg_offset;
+ else
+ pg_len = count;
+
+ saved_len = pg_len;
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
+ !bl_is_sector_init(be->be_inval, isect)) {
+ ret = bl_read_partial_page_sync(pages[i], cow_read,
+ pg_offset, pg_len, true);
+ if (ret) {
+ dprintk("%s bl_read_partial_page_sync fail %d\n",
+ __func__, ret);
+ header->pnfs_error = ret;
+ goto out;
+ }
+
ret = bl_mark_sectors_init(be->be_inval, isect,
PAGE_CACHE_SECTORS);
if (unlikely(ret)) {
@@ -703,15 +837,35 @@ next_page:
header->pnfs_error = ret;
goto out;
}
+
+ /* Expand to full page write */
+ pg_offset = 0;
+ pg_len = PAGE_CACHE_SIZE;
+ } else if ((pg_offset & (SECTOR_SIZE - 1)) ||
+ (pg_len & (SECTOR_SIZE - 1))){
+ /* ahh, nasty case. We have to do sync full sector
+ * read-modify-write cycles.
+ */
+ unsigned int saved_offset = pg_offset;
+ ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
+ pg_len, false);
+ pg_offset = round_down(pg_offset, SECTOR_SIZE);
+ pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
+ - pg_offset;
}
- bio = bl_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
+
+
+ bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
isect, pages[i], be,
- bl_end_io_write, par);
+ bl_end_io_write, par,
+ pg_offset, pg_len);
if (IS_ERR(bio)) {
header->pnfs_error = PTR_ERR(bio);
bio = NULL;
goto out;
}
+ offset += saved_len;
+ count -= saved_len;
isect += PAGE_CACHE_SECTORS;
last_isect = isect;
extent_length -= PAGE_CACHE_SECTORS;
@@ -729,17 +883,16 @@ next_page:
}
write_done:
- wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
- if (count < wdata->res.count) {
- wdata->res.count = count;
- }
+ wdata->res.count = wdata->args.count;
out:
bl_put_extent(be);
+ bl_put_extent(cow_read);
bl_submit_bio(WRITE, bio);
put_parallel(par);
return PNFS_ATTEMPTED;
out_mds:
bl_put_extent(be);
+ bl_put_extent(cow_read);
kfree(par);
return PNFS_NOT_ATTEMPTED;
}
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 0335069..39bb51a 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -41,6 +41,7 @@
#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
struct block_mount_id {
spinlock_t bm_lock; /* protects list */
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 9969444..0e7cd89 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -855,7 +855,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
if (server->wsize > NFS_MAX_FILE_IO_SIZE)
server->wsize = NFS_MAX_FILE_IO_SIZE;
server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- server->pnfs_blksize = fsinfo->blksize;
server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1e50326..d5a0cf1 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1774,7 +1774,11 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
* informs us the stateid is unrecognized. */
if (status != -NFS4ERR_BAD_STATEID)
nfs41_free_stateid(server, stateid);
+ nfs_remove_bad_delegation(state->inode);
+ write_seqlock(&state->seqlock);
+ nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+ write_sequnlock(&state->seqlock);
clear_bit(NFS_DELEGATED_STATE, &state->flags);
}
}
@@ -3362,8 +3366,11 @@ static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, s
nfs_fattr_init(fsinfo->fattr);
error = nfs4_do_fsinfo(server, fhandle, fsinfo);
- if (error == 0)
+ if (error == 0) {
+ /* block layout checks this! */
+ server->pnfs_blksize = fsinfo->blksize;
set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype);
+ }
return error;
}
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index fdc91a6..ccfe0d0 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -598,7 +598,7 @@ numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namel
/* Just to make sure it's null-terminated: */
memcpy(buf, name, namelen);
buf[namelen] = '\0';
- ret = kstrtouint(name, 10, id);
+ ret = kstrtouint(buf, 10, id);
return ret == 0;
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index cc894ed..5b3224c 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1223,10 +1223,26 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2)
return true;
}
+/*
+ * RFC 3530 language requires clid_inuse be returned when the
+ * "principal" associated with a requests differs from that previously
+ * used. We use uid, gid's, and gss principal string as our best
+ * approximation. We also don't want to allow non-gss use of a client
+ * established using gss: in theory cr_principal should catch that
+ * change, but in practice cr_principal can be null even in the gss case
+ * since gssd doesn't always pass down a principal string.
+ */
+static bool is_gss_cred(struct svc_cred *cr)
+{
+ /* Is cr_flavor one of the gss "pseudoflavors"?: */
+ return (cr->cr_flavor > RPC_AUTH_MAXFLAVOR);
+}
+
+
static bool
same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
{
- if ((cr1->cr_flavor != cr2->cr_flavor)
+ if ((is_gss_cred(cr1) != is_gss_cred(cr2))
|| (cr1->cr_uid != cr2->cr_uid)
|| (cr1->cr_gid != cr2->cr_gid)
|| !groups_equal(cr1->cr_group_info, cr2->cr_group_info))
@@ -3766,6 +3782,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
nfsd4_close_open_stateid(stp);
+ release_last_closed_stateid(oo);
oo->oo_last_closed_stid = stp;
if (list_empty(&oo->oo_owner.so_stateids)) {
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index a4d56ac..5b387a4 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -116,6 +116,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (unlikely(ret))
goto out;
+ file_update_time(vma->vm_file);
ret = __block_page_mkwrite(vma, vmf, nilfs_get_block);
if (ret) {
nilfs_transaction_abort(inode->i_sb);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7fcd0d6..b8730d9 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -115,7 +115,13 @@ u64 stable_page_flags(struct page *page)
u |= 1 << KPF_COMPOUND_TAIL;
if (PageHuge(page))
u |= 1 << KPF_HUGE;
- else if (PageTransCompound(page))
+ /*
+ * PageTransCompound can be true for non-huge compound pages (slab
+ * pages or pages allocated by drivers with __GFP_COMP) because it
+ * just checks PG_head/PG_tail, so we need to check PageLRU to make
+ * sure a given page is a thp, not a non-huge compound page.
+ */
+ else if (PageTransCompound(page) && PageLRU(compound_trans_head(page)))
u |= 1 << KPF_THP;
/*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index dfafeb2..eb7cc91 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -462,9 +462,6 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
err = ERR_PTR(-ENOMEM);
inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
- if (h)
- sysctl_head_finish(h);
-
if (!inode)
goto out;
@@ -473,6 +470,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
d_add(dentry, inode);
out:
+ if (h)
+ sysctl_head_finish(h);
sysctl_head_finish(head);
return err;
}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 855da58..63ce6be 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1573,8 +1573,10 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
reiserfs_warning(sb, "reiserfs-13077",
"nfsd/reiserfs, fhtype=%d, len=%d - odd",
fh_type, fh_len);
- fh_type = 5;
+ fh_type = fh_len;
}
+ if (fh_len < 2)
+ return NULL;
return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1],
(fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0);
@@ -1583,6 +1585,8 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type)
{
+ if (fh_type > fh_len)
+ fh_type = fh_len;
if (fh_type < 4)
return NULL;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index d7a9dd7..933b793 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -96,6 +96,7 @@ xfs_buf_lru_add(
atomic_inc(&bp->b_hold);
list_add_tail(&bp->b_lru, &btp->bt_lru);
btp->bt_lru_nr++;
+ bp->b_lru_flags &= ~_XBF_LRU_DISPOSE;
}
spin_unlock(&btp->bt_lru_lock);
}
@@ -154,7 +155,8 @@ xfs_buf_stale(
struct xfs_buftarg *btp = bp->b_target;
spin_lock(&btp->bt_lru_lock);
- if (!list_empty(&bp->b_lru)) {
+ if (!list_empty(&bp->b_lru) &&
+ !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) {
list_del_init(&bp->b_lru);
btp->bt_lru_nr--;
atomic_dec(&bp->b_hold);
@@ -1501,6 +1503,7 @@ xfs_buftarg_shrink(
*/
list_move(&bp->b_lru, &dispose);
btp->bt_lru_nr--;
+ bp->b_lru_flags |= _XBF_LRU_DISPOSE;
}
spin_unlock(&btp->bt_lru_lock);
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index d03b73b..7c0b6a0 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -38,27 +38,28 @@ typedef enum {
XBRW_ZERO = 3, /* Zero target memory */
} xfs_buf_rw_t;
-#define XBF_READ (1 << 0) /* buffer intended for reading from device */
-#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
-#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
-#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
-#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
-#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
+#define XBF_READ (1 << 0) /* buffer intended for reading from device */
+#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
+#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
+#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
+#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
+#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
/* I/O hints for the BIO layer */
-#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */
-#define XBF_FUA (1 << 11)/* force cache write through mode */
-#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */
+#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */
+#define XBF_FUA (1 << 11)/* force cache write through mode */
+#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */
/* flags used only as arguments to access routines */
-#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */
-#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */
+#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */
+#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */
/* flags used only internally */
-#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
-#define _XBF_KMEM (1 << 21)/* backed by heap memory */
-#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
-#define _XBF_COMPOUND (1 << 23)/* compound buffer */
+#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
+#define _XBF_KMEM (1 << 21)/* backed by heap memory */
+#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
+#define _XBF_COMPOUND (1 << 23)/* compound buffer */
+#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */
typedef unsigned int xfs_buf_flags_t;
@@ -72,12 +73,13 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_SYNCIO, "SYNCIO" }, \
{ XBF_FUA, "FUA" }, \
{ XBF_FLUSH, "FLUSH" }, \
- { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\
+ { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\
{ XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
- { _XBF_COMPOUND, "COMPOUND" }
+ { _XBF_COMPOUND, "COMPOUND" }, \
+ { _XBF_LRU_DISPOSE, "LRU_DISPOSE" }
typedef struct xfs_buftarg {
dev_t bt_dev;
@@ -124,7 +126,12 @@ typedef struct xfs_buf {
xfs_buf_flags_t b_flags; /* status flags */
struct semaphore b_sema; /* semaphore for lockables */
+ /*
+ * concurrent access to b_lru and b_lru_flags are protected by
+ * bt_lru_lock and not by b_sema
+ */
struct list_head b_lru; /* lru list */
+ xfs_buf_flags_t b_lru_flags; /* internal lru status flags */
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag; /* contains rbtree root */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 4267922..8c6d1d7 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -189,6 +189,9 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid;
struct inode *inode = NULL;
+ if (fh_len < xfs_fileid_length(fileid_type))
+ return NULL;
+
switch (fileid_type) {
case FILEID_INO32_GEN_PARENT:
inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino,
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bdaf4cb..19e2380 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -919,6 +919,7 @@ xfs_fs_put_super(
struct xfs_mount *mp = XFS_M(sb);
xfs_filestream_unmount(mp);
+ cancel_delayed_work_sync(&mp->m_sync_work);
xfs_unmountfs(mp);
xfs_syncd_stop(mp);
xfs_freesb(mp);