diff options
author | Anton Arapov <anton@redhat.com> | 2012-06-08 12:58:00 +0200 |
---|---|---|
committer | Anton Arapov <anton@redhat.com> | 2012-06-08 12:58:00 +0200 |
commit | 6792a3f47a2e42d7164292bf7f1a55cfc4c91652 (patch) | |
tree | b90c002bfbbeaec92f5d8a2383dcabf6524016f7 /fs | |
parent | fe2895d3d55146cac65b273c0f83e2c7e543cd0e (diff) | |
download | kernel-uprobes-6792a3f47a2e42d7164292bf7f1a55cfc4c91652.tar.gz kernel-uprobes-6792a3f47a2e42d7164292bf7f1a55cfc4c91652.tar.xz kernel-uprobes-6792a3f47a2e42d7164292bf7f1a55cfc4c91652.zip |
fedora kernel: b920e9b748c595f970bf80ede7832d39f8d567dav3.4.1-2
Signed-off-by: Anton Arapov <anton@redhat.com>
Diffstat (limited to 'fs')
539 files changed, 26539 insertions, 13592 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 1964f98e74b..b85efa77394 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -594,21 +594,21 @@ static int __init init_v9fs(void) int err; pr_info("Installing v9fs 9p2000 file system support\n"); /* TODO: Setup list of registered trasnport modules */ - err = register_filesystem(&v9fs_fs_type); - if (err < 0) { - pr_err("Failed to register filesystem\n"); - return err; - } err = v9fs_cache_register(); if (err < 0) { pr_err("Failed to register v9fs for caching\n"); - goto out_fs_unreg; + return err; } err = v9fs_sysfs_init(); if (err < 0) { pr_err("Failed to register with sysfs\n"); + goto out_cache; + } + err = register_filesystem(&v9fs_fs_type); + if (err < 0) { + pr_err("Failed to register filesystem\n"); goto out_sysfs_cleanup; } @@ -617,8 +617,8 @@ static int __init init_v9fs(void) out_sysfs_cleanup: v9fs_sysfs_cleanup(); -out_fs_unreg: - unregister_filesystem(&v9fs_fs_type); +out_cache: + v9fs_cache_unregister(); return err; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 7b0cd87b07c..8c92a9ba833 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -155,9 +155,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; } - root = d_alloc_root(inode); + root = d_make_root(inode); if (!root) { - iput(inode); retval = -ENOMEM; goto release_sb; } @@ -260,7 +259,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) if (v9fs_proto_dotl(v9ses)) { res = p9_client_statfs(fid, &rs); if (res == 0) { - buf->f_type = V9FS_MAGIC; + buf->f_type = rs.type; buf->f_bsize = rs.bsize; buf->f_blocks = rs.blocks; buf->f_bfree = rs.bfree; diff --git a/fs/Kconfig b/fs/Kconfig index d621f02a3f9..f95ae3a027f 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -4,6 +4,10 @@ menu "File systems" +# Use unaligned word dcache accesses +config DCACHE_WORD_ACCESS + bool + if BLOCK source "fs/ext2/Kconfig" @@ -210,6 +214,7 @@ source "fs/minix/Kconfig" source "fs/omfs/Kconfig" source "fs/hpfs/Kconfig" source "fs/qnx4/Kconfig" +source "fs/qnx6/Kconfig" source "fs/romfs/Kconfig" source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 93804d4d66e..2fb97793467 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -102,6 +102,7 @@ obj-$(CONFIG_UBIFS_FS) += ubifs/ obj-$(CONFIG_AFFS_FS) += affs/ obj-$(CONFIG_ROMFS_FS) += romfs/ obj-$(CONFIG_QNX4FS_FS) += qnx4/ +obj-$(CONFIG_QNX6FS_FS) += qnx6/ obj-$(CONFIG_AUTOFS4_FS) += autofs4/ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 8e3b36ace30..06fdcc9382c 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -483,10 +483,9 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_d_op = &adfs_dentry_operations; root = adfs_iget(sb, &root_obj); - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { int i; - iput(root); for (i = 0; i < asb->s_map_size; i++) brelse(asb->s_map[i].dm_bh); kfree(asb->s_map); diff --git a/fs/affs/super.c b/fs/affs/super.c index 8ba73fed796..0782653a05a 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -473,7 +473,7 @@ got_root: root_inode = affs_iget(sb, root_block); if (IS_ERR(root_inode)) { ret = PTR_ERR(root_inode); - goto out_error_noinode; + goto out_error; } if (AFFS_SB(sb)->s_flags & SF_INTL) @@ -481,7 +481,7 @@ got_root: else sb->s_d_op = &affs_dentry_operations; - sb->s_root = d_alloc_root(root_inode); + sb->s_root = d_make_root(root_inode); if (!sb->s_root) { printk(KERN_ERR "AFFS: Get root inode failed\n"); goto out_error; @@ -494,9 +494,6 @@ got_root: * Begin the cascaded cleanup ... */ out_error: - if (root_inode) - iput(root_inode); -out_error_noinode: kfree(sbi->s_bitmap); affs_brelse(root_bh); kfree(sbi->s_prefix); diff --git a/fs/afs/file.c b/fs/afs/file.c index 14d89fa58fe..8f6e9234d56 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -251,7 +251,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping, ASSERT(key != NULL); vnode = AFS_FS_I(mapping->host); - if (vnode->flags & AFS_VNODE_DELETED) { + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { _leave(" = -ESTALE"); return -ESTALE; } diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 2f213d109c2..b960ff05ea0 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -365,10 +365,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, _debug("extract data"); if (call->count > 0) { page = call->reply3; - buffer = kmap_atomic(page, KM_USER0); + buffer = kmap_atomic(page); ret = afs_extract_data(call, skb, last, buffer, call->count); - kunmap_atomic(buffer, KM_USER0); + kunmap_atomic(buffer); switch (ret) { case 0: break; case -EAGAIN: return 0; @@ -411,9 +411,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, if (call->count < PAGE_SIZE) { _debug("clear"); page = call->reply3; - buffer = kmap_atomic(page, KM_USER0); + buffer = kmap_atomic(page); memset(buffer + call->count, 0, PAGE_SIZE - call->count); - kunmap_atomic(buffer, KM_USER0); + kunmap_atomic(buffer); } _leave(" = 0 [done]"); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 8f4ce2658b7..298cf8919ec 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -200,9 +200,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) if (PageError(page)) goto error; - buf = kmap_atomic(page, KM_USER0); + buf = kmap_atomic(page); memcpy(devname, buf, size); - kunmap_atomic(buf, KM_USER0); + kunmap_atomic(buf); page_cache_release(page); page = NULL; } diff --git a/fs/afs/super.c b/fs/afs/super.c index 983ec59fc80..f02b31e7e64 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -301,7 +301,6 @@ static int afs_fill_super(struct super_block *sb, { struct afs_super_info *as = sb->s_fs_info; struct afs_fid fid; - struct dentry *root = NULL; struct inode *inode = NULL; int ret; @@ -327,18 +326,16 @@ static int afs_fill_super(struct super_block *sb, set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); ret = -ENOMEM; - root = d_alloc_root(inode); - if (!root) + sb->s_root = d_make_root(inode); + if (!sb->s_root) goto error; sb->s_d_op = &afs_fs_dentry_operations; - sb->s_root = root; _leave(" = 0"); return 0; error: - iput(inode); _leave(" = %d", ret); return ret; } @@ -13,7 +13,7 @@ #include <linux/errno.h> #include <linux/time.h> #include <linux/aio_abi.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/syscalls.h> #include <linux/backing-dev.h> #include <linux/uio.h> @@ -93,9 +93,8 @@ static void aio_free_ring(struct kioctx *ctx) put_page(info->ring_pages[i]); if (info->mmap_size) { - down_write(&ctx->mm->mmap_sem); - do_munmap(ctx->mm, info->mmap_base, info->mmap_size); - up_write(&ctx->mm->mmap_sem); + BUG_ON(ctx->mm != current->mm); + vm_munmap(info->mmap_base, info->mmap_size); } if (info->ring_pages && info->ring_pages != info->internal_pages) @@ -160,7 +159,7 @@ static int aio_setup_ring(struct kioctx *ctx) info->nr = nr_events; /* trusted copy */ - ring = kmap_atomic(info->ring_pages[0], KM_USER0); + ring = kmap_atomic(info->ring_pages[0]); ring->nr = nr_events; /* user copy */ ring->id = ctx->user_id; ring->head = ring->tail = 0; @@ -168,47 +167,38 @@ static int aio_setup_ring(struct kioctx *ctx) ring->compat_features = AIO_RING_COMPAT_FEATURES; ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; ring->header_length = sizeof(struct aio_ring); - kunmap_atomic(ring, KM_USER0); + kunmap_atomic(ring); return 0; } /* aio_ring_event: returns a pointer to the event at the given index from - * kmap_atomic(, km). Release the pointer with put_aio_ring_event(); + * kmap_atomic(). Release the pointer with put_aio_ring_event(); */ #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr, km) ({ \ +#define aio_ring_event(info, nr) ({ \ unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ struct io_event *__event; \ __event = kmap_atomic( \ - (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ + (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \ __event += pos % AIO_EVENTS_PER_PAGE; \ __event; \ }) -#define put_aio_ring_event(event, km) do { \ +#define put_aio_ring_event(event) do { \ struct io_event *__event = (event); \ (void)__event; \ - kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ + kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \ } while(0) static void ctx_rcu_free(struct rcu_head *head) { struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); - unsigned nr_events = ctx->max_reqs; - kmem_cache_free(kioctx_cachep, ctx); - - if (nr_events) { - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - nr_events > aio_nr); - aio_nr -= nr_events; - spin_unlock(&aio_nr_lock); - } } /* __put_ioctx @@ -217,13 +207,19 @@ static void ctx_rcu_free(struct rcu_head *head) */ static void __put_ioctx(struct kioctx *ctx) { + unsigned nr_events = ctx->max_reqs; BUG_ON(ctx->reqs_active); - cancel_delayed_work(&ctx->wq); - cancel_work_sync(&ctx->wq.work); + cancel_delayed_work_sync(&ctx->wq); aio_free_ring(ctx); mmdrop(ctx->mm); ctx->mm = NULL; + if (nr_events) { + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - nr_events > aio_nr); + aio_nr -= nr_events; + spin_unlock(&aio_nr_lock); + } pr_debug("__put_ioctx: freeing %p\n", ctx); call_rcu(&ctx->rcu_head, ctx_rcu_free); } @@ -247,7 +243,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) { struct mm_struct *mm; struct kioctx *ctx; - int did_sync = 0; + int err = -ENOMEM; /* Prevent overflows */ if ((nr_events > (0x10000000U / sizeof(struct io_event))) || @@ -256,7 +252,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) return ERR_PTR(-EINVAL); } - if ((unsigned long)nr_events > aio_max_nr) + if (!nr_events || (unsigned long)nr_events > aio_max_nr) return ERR_PTR(-EAGAIN); ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); @@ -280,25 +276,14 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) goto out_freectx; /* limit the number of system wide aios */ - do { - spin_lock_bh(&aio_nr_lock); - if (aio_nr + nr_events > aio_max_nr || - aio_nr + nr_events < aio_nr) - ctx->max_reqs = 0; - else - aio_nr += ctx->max_reqs; - spin_unlock_bh(&aio_nr_lock); - if (ctx->max_reqs || did_sync) - break; - - /* wait for rcu callbacks to have completed before giving up */ - synchronize_rcu(); - did_sync = 1; - ctx->max_reqs = nr_events; - } while (1); - - if (ctx->max_reqs == 0) + spin_lock(&aio_nr_lock); + if (aio_nr + nr_events > aio_max_nr || + aio_nr + nr_events < aio_nr) { + spin_unlock(&aio_nr_lock); goto out_cleanup; + } + aio_nr += ctx->max_reqs; + spin_unlock(&aio_nr_lock); /* now link into global list. */ spin_lock(&mm->ioctx_lock); @@ -310,27 +295,27 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) return ctx; out_cleanup: - __put_ioctx(ctx); - return ERR_PTR(-EAGAIN); - + err = -EAGAIN; + aio_free_ring(ctx); out_freectx: mmdrop(mm); kmem_cache_free(kioctx_cachep, ctx); - ctx = ERR_PTR(-ENOMEM); - - dprintk("aio: error allocating ioctx %p\n", ctx); - return ctx; + dprintk("aio: error allocating ioctx %d\n", err); + return ERR_PTR(err); } -/* aio_cancel_all +/* kill_ctx * Cancels all outstanding aio requests on an aio context. Used * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ -static void aio_cancel_all(struct kioctx *ctx) +static void kill_ctx(struct kioctx *ctx) { int (*cancel)(struct kiocb *, struct io_event *); + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); struct io_event res; + spin_lock_irq(&ctx->ctx_lock); ctx->dead = 1; while (!list_empty(&ctx->active_reqs)) { @@ -346,15 +331,7 @@ static void aio_cancel_all(struct kioctx *ctx) spin_lock_irq(&ctx->ctx_lock); } } - spin_unlock_irq(&ctx->ctx_lock); -} - -static void wait_for_all_aios(struct kioctx *ctx) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - spin_lock_irq(&ctx->ctx_lock); if (!ctx->reqs_active) goto out; @@ -404,19 +381,24 @@ void exit_aio(struct mm_struct *mm) ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); hlist_del_rcu(&ctx->list); - aio_cancel_all(ctx); - - wait_for_all_aios(ctx); - /* - * Ensure we don't leave the ctx on the aio_wq - */ - cancel_work_sync(&ctx->wq.work); + kill_ctx(ctx); if (1 != atomic_read(&ctx->users)) printk(KERN_DEBUG "exit_aio:ioctx still alive: %d %d %d\n", atomic_read(&ctx->users), ctx->dead, ctx->reqs_active); + /* + * We don't need to bother with munmap() here - + * exit_mmap(mm) is coming and it'll unmap everything. + * Since aio_free_ring() uses non-zero ->mmap_size + * as indicator that it needs to unmap the area, + * just set it to 0; aio_free_ring() is the only + * place that uses ->mmap_size, so it's safe. + * That way we get all munmap done to current->mm - + * all other callers have ctx->mm == current->mm. + */ + ctx->ring_info.mmap_size = 0; put_ioctx(ctx); } } @@ -920,7 +902,7 @@ static void aio_kick_handler(struct work_struct *work) unuse_mm(mm); set_fs(oldfs); /* - * we're in a worker thread already, don't use queue_delayed_work, + * we're in a worker thread already; no point using non-zero delay */ if (requeue) queue_delayed_work(aio_wq, &ctx->wq, 0); @@ -1019,10 +1001,10 @@ int aio_complete(struct kiocb *iocb, long res, long res2) if (kiocbIsCancelled(iocb)) goto put_rq; - ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); + ring = kmap_atomic(info->ring_pages[0]); tail = info->tail; - event = aio_ring_event(info, tail, KM_IRQ0); + event = aio_ring_event(info, tail); if (++tail >= info->nr) tail = 0; @@ -1043,8 +1025,8 @@ int aio_complete(struct kiocb *iocb, long res, long res2) info->tail = tail; ring->tail = tail; - put_aio_ring_event(event, KM_IRQ0); - kunmap_atomic(ring, KM_IRQ1); + put_aio_ring_event(event); + kunmap_atomic(ring); pr_debug("added to ring %p at [%lu]\n", iocb, tail); @@ -1089,7 +1071,7 @@ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) unsigned long head; int ret = 0; - ring = kmap_atomic(info->ring_pages[0], KM_USER0); + ring = kmap_atomic(info->ring_pages[0]); dprintk("in aio_read_evt h%lu t%lu m%lu\n", (unsigned long)ring->head, (unsigned long)ring->tail, (unsigned long)ring->nr); @@ -1101,18 +1083,18 @@ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) head = ring->head % info->nr; if (head != ring->tail) { - struct io_event *evp = aio_ring_event(info, head, KM_USER1); + struct io_event *evp = aio_ring_event(info, head); *ent = *evp; head = (head + 1) % info->nr; smp_mb(); /* finish reading the event before updatng the head */ ring->head = head; ret = 1; - put_aio_ring_event(evp, KM_USER1); + put_aio_ring_event(evp); } spin_unlock(&info->ring_lock); out: - kunmap_atomic(ring, KM_USER0); + kunmap_atomic(ring); dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, (unsigned long)ring->head, (unsigned long)ring->tail); return ret; @@ -1290,8 +1272,7 @@ static void io_destroy(struct kioctx *ioctx) if (likely(!was_dead)) put_ioctx(ioctx); /* twice for the list */ - aio_cancel_all(ioctx); - wait_for_all_aios(ioctx); + kill_ctx(ioctx); /* * Wake up any waiters. The setting of ctx->dead must be seen @@ -1299,7 +1280,6 @@ static void io_destroy(struct kioctx *ioctx) * locking done by the above calls to ensure this consistency. */ wake_up_all(&ioctx->wait); - put_ioctx(ioctx); /* once for the lookup */ } /* sys_io_setup: @@ -1336,11 +1316,9 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); - if (!ret) { - put_ioctx(ioctx); - return 0; - } - io_destroy(ioctx); + if (ret) + io_destroy(ioctx); + put_ioctx(ioctx); } out: @@ -1358,6 +1336,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { io_destroy(ioctx); + put_ioctx(ioctx); return 0; } pr_debug("EINVAL: io_destroy: invalid context id\n"); @@ -1477,6 +1456,10 @@ static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) if (ret < 0) goto out; + ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret); + if (ret < 0) + goto out; + kiocb->ki_nr_segs = kiocb->ki_nbytes; kiocb->ki_cur_seg = 0; /* ki_nbytes/left now reflect bytes instead of segs */ @@ -1488,11 +1471,17 @@ out: return ret; } -static ssize_t aio_setup_single_vector(struct kiocb *kiocb) +static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb) { + int bytes; + + bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left); + if (bytes < 0) + return bytes; + kiocb->ki_iovec = &kiocb->ki_inline_vec; kiocb->ki_iovec->iov_base = kiocb->ki_buf; - kiocb->ki_iovec->iov_len = kiocb->ki_left; + kiocb->ki_iovec->iov_len = bytes; kiocb->ki_nr_segs = 1; kiocb->ki_cur_seg = 0; return 0; @@ -1517,10 +1506,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf, kiocb->ki_left))) break; - ret = security_file_permission(file, MAY_READ); - if (unlikely(ret)) - break; - ret = aio_setup_single_vector(kiocb); + ret = aio_setup_single_vector(READ, file, kiocb); if (ret) break; ret = -EINVAL; @@ -1535,10 +1521,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf, kiocb->ki_left))) break; - ret = security_file_permission(file, MAY_WRITE); - if (unlikely(ret)) - break; - ret = aio_setup_single_vector(kiocb); + ret = aio_setup_single_vector(WRITE, file, kiocb); if (ret) break; ret = -EINVAL; @@ -1549,9 +1532,6 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_READ))) break; - ret = security_file_permission(file, MAY_READ); - if (unlikely(ret)) - break; ret = aio_setup_vectored_rw(READ, kiocb, compat); if (ret) break; @@ -1563,9 +1543,6 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_WRITE))) break; - ret = security_file_permission(file, MAY_WRITE); - if (unlikely(ret)) - break; ret = aio_setup_vectored_rw(WRITE, kiocb, compat); if (ret) break; diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index f11e43ed907..28d39fb84ae 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -39,19 +39,6 @@ static const struct dentry_operations anon_inodefs_dentry_operations = { .d_dname = anon_inodefs_dname, }; -static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_pseudo(fs_type, "anon_inode:", NULL, - &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC); -} - -static struct file_system_type anon_inode_fs_type = { - .name = "anon_inodefs", - .mount = anon_inodefs_mount, - .kill_sb = kill_anon_super, -}; - /* * nop .set_page_dirty method so that people can use .page_mkwrite on * anon inodes. @@ -65,6 +52,62 @@ static const struct address_space_operations anon_aops = { .set_page_dirty = anon_set_page_dirty, }; +/* + * A single inode exists for all anon_inode files. Contrary to pipes, + * anon_inode inodes have no associated per-instance data, so we need + * only allocate one of them. + */ +static struct inode *anon_inode_mkinode(struct super_block *s) +{ + struct inode *inode = new_inode_pseudo(s); + + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_ino = get_next_ino(); + inode->i_fop = &anon_inode_fops; + + inode->i_mapping->a_ops = &anon_aops; + + /* + * Mark the inode dirty from the very beginning, + * that way it will never be moved to the dirty + * list because mark_inode_dirty() will think + * that it already _is_ on the dirty list. + */ + inode->i_state = I_DIRTY; + inode->i_mode = S_IRUSR | S_IWUSR; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_flags |= S_PRIVATE; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + return inode; +} + +static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct dentry *root; + root = mount_pseudo(fs_type, "anon_inode:", NULL, + &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC); + if (!IS_ERR(root)) { + struct super_block *s = root->d_sb; + anon_inode_inode = anon_inode_mkinode(s); + if (IS_ERR(anon_inode_inode)) { + dput(root); + deactivate_locked_super(s); + root = ERR_CAST(anon_inode_inode); + } + } + return root; +} + +static struct file_system_type anon_inode_fs_type = { + .name = "anon_inodefs", + .mount = anon_inodefs_mount, + .kill_sb = kill_anon_super, +}; + /** * anon_inode_getfile - creates a new file instance by hooking it up to an * anonymous inode, and a dentry that describe the "class" @@ -180,38 +223,6 @@ err_put_unused_fd: } EXPORT_SYMBOL_GPL(anon_inode_getfd); -/* - * A single inode exists for all anon_inode files. Contrary to pipes, - * anon_inode inodes have no associated per-instance data, so we need - * only allocate one of them. - */ -static struct inode *anon_inode_mkinode(void) -{ - struct inode *inode = new_inode_pseudo(anon_inode_mnt->mnt_sb); - - if (!inode) - return ERR_PTR(-ENOMEM); - - inode->i_ino = get_next_ino(); - inode->i_fop = &anon_inode_fops; - - inode->i_mapping->a_ops = &anon_aops; - - /* - * Mark the inode dirty from the very beginning, - * that way it will never be moved to the dirty - * list because mark_inode_dirty() will think - * that it already _is_ on the dirty list. - */ - inode->i_state = I_DIRTY; - inode->i_mode = S_IRUSR | S_IWUSR; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_flags |= S_PRIVATE; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - return inode; -} - static int __init anon_inode_init(void) { int error; @@ -224,16 +235,8 @@ static int __init anon_inode_init(void) error = PTR_ERR(anon_inode_mnt); goto err_unregister_filesystem; } - anon_inode_inode = anon_inode_mkinode(); - if (IS_ERR(anon_inode_inode)) { - error = PTR_ERR(anon_inode_inode); - goto err_mntput; - } - return 0; -err_mntput: - kern_unmount(anon_inode_mnt); err_unregister_filesystem: unregister_filesystem(&anon_inode_fs_type); err_exit: diff --git a/fs/attr.c b/fs/attr.c index 95053ad8abc..73f69a6ce9e 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -5,7 +5,7 @@ * changes by Thomas Schoebel-Theuer */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/time.h> #include <linux/mm.h> #include <linux/string.h> diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index d06d95a5c63..aa9103f8f01 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -230,7 +230,7 @@ static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) fdt = files_fdtable(files); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); - FD_SET(fd, fdt->close_on_exec); + __set_close_on_exec(fd, fdt); spin_unlock(&files->file_lock); } diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c index c038727b405..cddc74b9cdb 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs4/init.c @@ -31,11 +31,11 @@ static int __init init_autofs4_fs(void) { int err; + autofs_dev_ioctl_init(); + err = register_filesystem(&autofs_fs_type); if (err) - return err; - - autofs_dev_ioctl_init(); + autofs_dev_ioctl_exit(); return err; } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 9ef53a68ea0..6e488ebe778 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -245,12 +245,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) if (!ino) goto fail_free; root_inode = autofs4_get_inode(s, S_IFDIR | 0755); - if (!root_inode) - goto fail_ino; - - root = d_alloc_root(root_inode); + root = d_make_root(root_inode); if (!root) - goto fail_iput; + goto fail_ino; pipe = NULL; root->d_fsdata = ino; @@ -315,9 +312,6 @@ fail_fput: fail_dput: dput(root); goto fail_free; -fail_iput: - printk("autofs: get root dentry failed\n"); - iput(root_inode); fail_ino: kfree(ino); fail_free: diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index f624cd0ff84..da8876d38a7 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -91,7 +91,7 @@ static int autofs4_write(struct autofs_sb_info *sbi, return (bytes > 0); } - + static void autofs4_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq, int type) diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 22e9a78872f..37268c5bb98 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -9,7 +9,7 @@ */ #include <linux/fs.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/stat.h> #include <linux/time.h> #include <linux/namei.h> diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 6e6d536767f..e18da23d42b 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -852,9 +852,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent) ret = PTR_ERR(root); goto unacquire_priv_sbp; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - iput(root); befs_error(sb, "get root inode failed"); goto unacquire_priv_sbp; } diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index b0391bc402b..e23dc7c8b88 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -367,9 +367,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) ret = PTR_ERR(inode); goto out2; } - s->s_root = d_alloc_root(inode); + s->s_root = d_make_root(inode); if (!s->s_root) { - iput(inode); ret = -ENOMEM; goto out2; } diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 1ff94054d35..d146e181d10 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -26,7 +26,6 @@ #include <linux/coredump.h> #include <linux/slab.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> #include <asm/a.out-core.h> @@ -51,9 +50,7 @@ static int set_brk(unsigned long start, unsigned long end) end = PAGE_ALIGN(end); if (end > start) { unsigned long addr; - down_write(¤t->mm->mmap_sem); - addr = do_brk(start, end - start); - up_write(¤t->mm->mmap_sem); + addr = vm_brk(start, end - start); if (BAD_ADDR(addr)) return addr; } @@ -267,7 +264,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) } install_exec_creds(bprm); - current->flags &= ~PF_FORKNOEXEC; if (N_MAGIC(ex) == OMAGIC) { unsigned long text_addr, map_size; @@ -282,9 +278,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) pos = 32; map_size = ex.a_text+ex.a_data; #endif - down_write(¤t->mm->mmap_sem); - error = do_brk(text_addr & PAGE_MASK, map_size); - up_write(¤t->mm->mmap_sem); + error = vm_brk(text_addr & PAGE_MASK, map_size); if (error != (text_addr & PAGE_MASK)) { send_sig(SIGKILL, current, 0); return error; @@ -315,9 +309,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { loff_t pos = fd_offset; - down_write(¤t->mm->mmap_sem); - do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - up_write(¤t->mm->mmap_sem); + vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); bprm->file->f_op->read(bprm->file, (char __user *)N_TXTADDR(ex), ex.a_text+ex.a_data, &pos); @@ -327,24 +319,20 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) goto beyond_if; } - down_write(¤t->mm->mmap_sem); - error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, + error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, fd_offset); - up_write(¤t->mm->mmap_sem); if (error != N_TXTADDR(ex)) { send_sig(SIGKILL, current, 0); return error; } - down_write(¤t->mm->mmap_sem); - error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, + error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, fd_offset + ex.a_text); - up_write(¤t->mm->mmap_sem); if (error != N_DATADDR(ex)) { send_sig(SIGKILL, current, 0); return error; @@ -414,9 +402,7 @@ static int load_aout_library(struct file *file) "N_TXTOFF is not page aligned. Please convert library: %s\n", file->f_path.dentry->d_name.name); } - down_write(¤t->mm->mmap_sem); - do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - up_write(¤t->mm->mmap_sem); + vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); file->f_op->read(file, (char __user *)start_addr, ex.a_text + ex.a_data, &pos); @@ -427,12 +413,10 @@ static int load_aout_library(struct file *file) goto out; } /* Now use mmap to map the library into memory. */ - down_write(¤t->mm->mmap_sem); - error = do_mmap(file, start_addr, ex.a_text + ex.a_data, + error = vm_mmap(file, start_addr, ex.a_text + ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, N_TXTOFF(ex)); - up_write(¤t->mm->mmap_sem); retval = error; if (error != start_addr) goto out; @@ -440,9 +424,7 @@ static int load_aout_library(struct file *file) len = PAGE_ALIGN(ex.a_text + ex.a_data); bss = ex.a_text + ex.a_data + ex.a_bss; if (bss > len) { - down_write(¤t->mm->mmap_sem); - error = do_brk(start_addr + len, bss - len); - up_write(¤t->mm->mmap_sem); + error = vm_brk(start_addr + len, bss - len); retval = error; if (error != start_addr + len) goto out; @@ -454,7 +436,8 @@ out: static int __init init_aout_binfmt(void) { - return register_binfmt(&aout_format); + register_binfmt(&aout_format); + return 0; } static void __exit exit_aout_binfmt(void) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index fa7483fc4a4..16f73541707 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -35,6 +35,7 @@ #include <asm/uaccess.h> #include <asm/param.h> #include <asm/page.h> +#include <asm/exec.h> static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); static int load_elf_library(struct file *); @@ -81,9 +82,7 @@ static int set_brk(unsigned long start, unsigned long end) end = ELF_PAGEALIGN(end); if (end > start) { unsigned long addr; - down_write(¤t->mm->mmap_sem); - addr = do_brk(start, end - start); - up_write(¤t->mm->mmap_sem); + addr = vm_brk(start, end - start); if (BAD_ADDR(addr)) return addr; } @@ -513,9 +512,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); /* Map the last of the bss segment */ - down_write(¤t->mm->mmap_sem); - error = do_brk(elf_bss, last_bss - elf_bss); - up_write(¤t->mm->mmap_sem); + error = vm_brk(elf_bss, last_bss - elf_bss); if (BAD_ADDR(error)) goto out_close; } @@ -711,17 +708,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval) goto out_free_dentry; -#ifdef CONFIG_X86_32 - /* - * Turn off the CS limit completely if exec-shield disabled or - * NX active: - */ - if (disable_nx || executable_stack != EXSTACK_DISABLE_X || (__supported_pte_mask & _PAGE_NX)) - arch_add_exec_range(current->mm, -1); -#endif - /* OK, This is the point of no return */ - current->flags &= ~PF_FORKNOEXEC; current->mm->def_flags = def_flags; /* Do this immediately, since STACK_TOP as used in setup_arg_pages @@ -943,7 +930,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ install_exec_creds(bprm); - current->flags &= ~PF_FORKNOEXEC; retval = create_elf_tables(bprm, &loc->elf_ex, load_addr, interp_load_addr); if (retval < 0) { @@ -972,10 +958,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) and some applications "depend" upon this behavior. Since we do not have the power to recompile these, we emulate the SVr4 behavior. Sigh. */ - down_write(¤t->mm->mmap_sem); - error = do_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC, + error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE, 0); - up_write(¤t->mm->mmap_sem); } #ifdef ELF_PLAT_INIT @@ -1060,8 +1044,7 @@ static int load_elf_library(struct file *file) eppnt++; /* Now use mmap to map the library into memory. */ - down_write(¤t->mm->mmap_sem); - error = do_mmap(file, + error = vm_mmap(file, ELF_PAGESTART(eppnt->p_vaddr), (eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr)), @@ -1069,7 +1052,6 @@ static int load_elf_library(struct file *file) MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, (eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr))); - up_write(¤t->mm->mmap_sem); if (error != ELF_PAGESTART(eppnt->p_vaddr)) goto out_free_ph; @@ -1082,11 +1064,8 @@ static int load_elf_library(struct file *file) len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr + ELF_MIN_ALIGN - 1); bss = eppnt->p_memsz + eppnt->p_vaddr; - if (bss > len) { - down_write(¤t->mm->mmap_sem); - do_brk(len, bss - len); - up_write(¤t->mm->mmap_sem); - } + if (bss > len) + vm_brk(len, bss - len); error = 0; out_free_ph: @@ -1104,6 +1083,29 @@ out: */ /* + * The purpose of always_dump_vma() is to make sure that special kernel mappings + * that are useful for post-mortem analysis are included in every core dump. + * In that way we ensure that the core dump is fully interpretable later + * without matching up the same kernel and hardware config to see what PC values + * meant. These special mappings include - vDSO, vsyscall, and other + * architecture specific mappings + */ +static bool always_dump_vma(struct vm_area_struct *vma) +{ + /* Any vsyscall mappings? */ + if (vma == get_gate_vma(vma->vm_mm)) + return true; + /* + * arch_vma_name() returns non-NULL for special architecture mappings, + * such as vDSO sections. + */ + if (arch_vma_name(vma)) + return true; + + return false; +} + +/* * Decide what to dump of a segment, part, all or none. */ static unsigned long vma_dump_size(struct vm_area_struct *vma, @@ -1111,10 +1113,13 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, { #define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) - /* The vma can be set up to tell us the answer directly. */ - if (vma->vm_flags & VM_ALWAYSDUMP) + /* always dump the vdso and vsyscall sections */ + if (always_dump_vma(vma)) goto whole; + if (vma->vm_flags & VM_NODUMP) + return 0; + /* Hugetlb memory check */ if (vma->vm_flags & VM_HUGETLB) { if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) @@ -1399,6 +1404,22 @@ static void do_thread_regset_writeback(struct task_struct *task, regset->writeback(task, regset, 1); } +#ifndef PR_REG_SIZE +#define PR_REG_SIZE(S) sizeof(S) +#endif + +#ifndef PRSTATUS_SIZE +#define PRSTATUS_SIZE(S) sizeof(S) +#endif + +#ifndef PR_REG_PTR +#define PR_REG_PTR(S) (&((S)->pr_reg)) +#endif + +#ifndef SET_PR_FPVALID +#define SET_PR_FPVALID(S, V) ((S)->pr_fpvalid = (V)) +#endif + static int fill_thread_core_info(struct elf_thread_core_info *t, const struct user_regset_view *view, long signr, size_t *total) @@ -1413,11 +1434,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, */ fill_prstatus(&t->prstatus, t->task, signr); (void) view->regsets[0].get(t->task, &view->regsets[0], - 0, sizeof(t->prstatus.pr_reg), - &t->prstatus.pr_reg, NULL); + 0, PR_REG_SIZE(t->prstatus.pr_reg), + PR_REG_PTR(&t->prstatus), NULL); fill_note(&t->notes[0], "CORE", NT_PRSTATUS, - sizeof(t->prstatus), &t->prstatus); + PRSTATUS_SIZE(t->prstatus), &t->prstatus); *total += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); @@ -1447,7 +1468,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, regset->core_note_type, size, data); else { - t->prstatus.pr_fpvalid = 1; + SET_PR_FPVALID(&t->prstatus, 1); fill_note(&t->notes[i], "CORE", NT_PRFPREG, size, data); } @@ -2086,7 +2107,8 @@ out: static int __init init_elf_binfmt(void) { - return register_binfmt(&elf_format); + register_binfmt(&elf_format); + return 0; } static void __exit exit_elf_binfmt(void) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 30745f459fa..d390a0fffc6 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -39,6 +39,7 @@ #include <asm/uaccess.h> #include <asm/param.h> #include <asm/pgalloc.h> +#include <asm/exec.h> typedef char *elf_caddr_t; @@ -91,7 +92,8 @@ static struct linux_binfmt elf_fdpic_format = { static int __init init_elf_fdpic_binfmt(void) { - return register_binfmt(&elf_fdpic_format); + register_binfmt(&elf_fdpic_format); + return 0; } static void __exit exit_elf_fdpic_binfmt(void) @@ -334,8 +336,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, current->mm->context.exec_fdpic_loadmap = 0; current->mm->context.interp_fdpic_loadmap = 0; - current->flags &= ~PF_FORKNOEXEC; - #ifdef CONFIG_MMU elf_fdpic_arch_lay_out_mm(&exec_params, &interp_params, @@ -390,21 +390,17 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC)) stack_prot |= PROT_EXEC; - down_write(¤t->mm->mmap_sem); - current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot, + current->mm->start_brk = vm_mmap(NULL, 0, stack_size, stack_prot, MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED | MAP_GROWSDOWN, 0); if (IS_ERR_VALUE(current->mm->start_brk)) { - up_write(¤t->mm->mmap_sem); retval = current->mm->start_brk; current->mm->start_brk = 0; goto error_kill; } - up_write(¤t->mm->mmap_sem); - current->mm->brk = current->mm->start_brk; current->mm->context.end_brk = current->mm->start_brk; current->mm->context.end_brk += @@ -413,7 +409,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, #endif install_exec_creds(bprm); - current->flags &= ~PF_FORKNOEXEC; if (create_elf_fdpic_tables(bprm, current->mm, &exec_params, &interp_params) < 0) goto error_kill; @@ -956,10 +951,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) mflags |= MAP_EXECUTABLE; - down_write(&mm->mmap_sem); - maddr = do_mmap(NULL, load_addr, top - base, + maddr = vm_mmap(NULL, load_addr, top - base, PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0); - up_write(&mm->mmap_sem); if (IS_ERR_VALUE(maddr)) return (int) maddr; @@ -1097,10 +1090,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, /* create the mapping */ disp = phdr->p_vaddr & ~PAGE_MASK; - down_write(&mm->mmap_sem); - maddr = do_mmap(file, maddr, phdr->p_memsz + disp, prot, flags, + maddr = vm_mmap(file, maddr, phdr->p_memsz + disp, prot, flags, phdr->p_offset - disp); - up_write(&mm->mmap_sem); kdebug("mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx", loop, phdr->p_memsz + disp, prot, flags, @@ -1144,10 +1135,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, unsigned long xmaddr; flags |= MAP_FIXED | MAP_ANONYMOUS; - down_write(&mm->mmap_sem); - xmaddr = do_mmap(NULL, xaddr, excess - excess1, + xmaddr = vm_mmap(NULL, xaddr, excess - excess1, prot, flags, 0); - up_write(&mm->mmap_sem); kdebug("mmap[%d] <anon>" " ad=%lx sz=%lx pr=%x fl=%x of=0 --> %08lx", diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index b8e8b0acf9b..2790c7e1912 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -100,7 +100,8 @@ static struct linux_binfmt em86_format = { static int __init init_em86_binfmt(void) { - return register_binfmt(&em86_format); + register_binfmt(&em86_format); + return 0; } static void __exit exit_em86_binfmt(void) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 1bffbe0ed77..6b2daf99fab 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -15,7 +15,7 @@ * JAN/99 -- coded full program relocation (gerg@snapgear.com) */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/mm.h> @@ -37,7 +37,6 @@ #include <linux/syscalls.h> #include <asm/byteorder.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/unaligned.h> #include <asm/cacheflush.h> @@ -543,10 +542,8 @@ static int load_flat_file(struct linux_binprm * bprm, */ DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n"); - down_write(¤t->mm->mmap_sem); - textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, + textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_EXECUTABLE, 0); - up_write(¤t->mm->mmap_sem); if (!textpos || IS_ERR_VALUE(textpos)) { if (!textpos) textpos = (unsigned long) -ENOMEM; @@ -557,10 +554,8 @@ static int load_flat_file(struct linux_binprm * bprm, len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); len = PAGE_ALIGN(len); - down_write(¤t->mm->mmap_sem); - realdatastart = do_mmap(0, 0, len, + realdatastart = vm_mmap(0, 0, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); - up_write(¤t->mm->mmap_sem); if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) { if (!realdatastart) @@ -604,10 +599,8 @@ static int load_flat_file(struct linux_binprm * bprm, len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); len = PAGE_ALIGN(len); - down_write(¤t->mm->mmap_sem); - textpos = do_mmap(0, 0, len, + textpos = vm_mmap(0, 0, len, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); - up_write(¤t->mm->mmap_sem); if (!textpos || IS_ERR_VALUE(textpos)) { if (!textpos) @@ -902,7 +895,6 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) libinfo.lib_list[j].start_data:UNLOADED_LIB; install_exec_creds(bprm); - current->flags &= ~PF_FORKNOEXEC; set_binfmt(&flat_format); @@ -950,7 +942,8 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) static int __init init_flat_binfmt(void) { - return register_binfmt(&flat_format); + register_binfmt(&flat_format); + return 0; } /****************************************************************************/ diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index a9198dfd5f8..613aa061823 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -19,6 +19,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> +#include <linux/magic.h> #include <linux/binfmts.h> #include <linux/slab.h> #include <linux/ctype.h> @@ -699,7 +700,7 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent) [3] = {"register", &bm_register_operations, S_IWUSR}, /* last one */ {""} }; - int err = simple_fill_super(sb, 0x42494e4d, bm_files); + int err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files); if (!err) sb->s_op = &s_ops; return err; @@ -726,11 +727,8 @@ static struct file_system_type bm_fs_type = { static int __init init_misc_binfmt(void) { int err = register_filesystem(&bm_fs_type); - if (!err) { - err = insert_binfmt(&misc_format); - if (err) - unregister_filesystem(&bm_fs_type); - } + if (!err) + insert_binfmt(&misc_format); return err; } diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 396a9884591..d3b8c1f6315 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -105,7 +105,8 @@ static struct linux_binfmt script_format = { static int __init init_script_binfmt(void) { - return register_binfmt(&script_format); + register_binfmt(&script_format); + return 0; } static void __exit exit_script_binfmt(void) diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index cc8560f6c9b..4517aaff61b 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -147,10 +147,8 @@ static int map_som_binary(struct file *file, code_size = SOM_PAGEALIGN(hpuxhdr->exec_tsize); current->mm->start_code = code_start; current->mm->end_code = code_start + code_size; - down_write(¤t->mm->mmap_sem); - retval = do_mmap(file, code_start, code_size, prot, + retval = vm_mmap(file, code_start, code_size, prot, flags, SOM_PAGESTART(hpuxhdr->exec_tfile)); - up_write(¤t->mm->mmap_sem); if (retval < 0 && retval > -1024) goto out; @@ -158,20 +156,16 @@ static int map_som_binary(struct file *file, data_size = SOM_PAGEALIGN(hpuxhdr->exec_dsize); current->mm->start_data = data_start; current->mm->end_data = bss_start = data_start + data_size; - down_write(¤t->mm->mmap_sem); - retval = do_mmap(file, data_start, data_size, + retval = vm_mmap(file, data_start, data_size, prot | PROT_WRITE, flags, SOM_PAGESTART(hpuxhdr->exec_dfile)); - up_write(¤t->mm->mmap_sem); if (retval < 0 && retval > -1024) goto out; som_brk = bss_start + SOM_PAGEALIGN(hpuxhdr->exec_bsize); current->mm->start_brk = current->mm->brk = som_brk; - down_write(¤t->mm->mmap_sem); - retval = do_mmap(NULL, bss_start, som_brk - bss_start, + retval = vm_mmap(NULL, bss_start, som_brk - bss_start, prot | PROT_WRITE, MAP_FIXED | MAP_PRIVATE, 0); - up_write(¤t->mm->mmap_sem); if (retval > 0 || retval < -1024) retval = 0; out: @@ -225,7 +219,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) goto out_free; /* OK, This is the point of no return */ - current->flags &= ~PF_FORKNOEXEC; current->personality = PER_HPUX; setup_new_exec(bprm); @@ -289,7 +282,8 @@ static int load_som_library(struct file *f) static int __init init_som_binfmt(void) { - return register_binfmt(&som_format); + register_binfmt(&som_format); + return 0; } static void __exit exit_som_binfmt(void) diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index c2183f3917c..e85c04b9f61 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -357,7 +357,7 @@ static void bio_integrity_generate(struct bio *bio) bix.sector_size = bi->sector_size; bio_for_each_segment(bv, bio, i) { - void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + void *kaddr = kmap_atomic(bv->bv_page); bix.data_buf = kaddr + bv->bv_offset; bix.data_size = bv->bv_len; bix.prot_buf = prot_buf; @@ -371,7 +371,7 @@ static void bio_integrity_generate(struct bio *bio) total += sectors * bi->tuple_size; BUG_ON(total > bio->bi_integrity->bip_size); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); } } @@ -498,7 +498,7 @@ static int bio_integrity_verify(struct bio *bio) bix.sector_size = bi->sector_size; bio_for_each_segment(bv, bio, i) { - void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + void *kaddr = kmap_atomic(bv->bv_page); bix.data_buf = kaddr + bv->bv_offset; bix.data_size = bv->bv_len; bix.prot_buf = prot_buf; @@ -507,7 +507,7 @@ static int bio_integrity_verify(struct bio *bio) ret = bi->verify_fn(&bix); if (ret) { - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); return ret; } @@ -517,7 +517,7 @@ static int bio_integrity_verify(struct bio *bio) total += sectors * bi->tuple_size; BUG_ON(total > bio->bi_integrity->bip_size); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); } return ret; @@ -22,7 +22,7 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/mempool.h> #include <linux/workqueue.h> #include <scsi/sg.h> /* for struct sg_iovec */ @@ -505,9 +505,14 @@ EXPORT_SYMBOL(bio_clone); int bio_get_nr_vecs(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); - return min_t(unsigned, + int nr_pages; + + nr_pages = min_t(unsigned, queue_max_segments(q), queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1); + + return min_t(unsigned, nr_pages, BIO_MAX_PAGES); + } EXPORT_SYMBOL(bio_get_nr_vecs); diff --git a/fs/block_dev.c b/fs/block_dev.c index 5e9f198f771..ba11c30f302 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -16,6 +16,7 @@ #include <linux/blkdev.h> #include <linux/module.h> #include <linux/blkpg.h> +#include <linux/magic.h> #include <linux/buffer_head.h> #include <linux/swap.h> #include <linux/pagevec.h> @@ -69,7 +70,7 @@ static void bdev_inode_switch_bdi(struct inode *inode, spin_unlock(&dst->wb.list_lock); } -static sector_t max_block(struct block_device *bdev) +sector_t blkdev_max_block(struct block_device *bdev) { sector_t retval = ~((sector_t)0); loff_t sz = i_size_read(bdev->bd_inode); @@ -109,7 +110,7 @@ void invalidate_bdev(struct block_device *bdev) /* 99% of the time, we don't need to flush the cleancache on the bdev. * But, for the strange corners, lets be cautious */ - cleancache_flush_inode(mapping); + cleancache_invalidate_inode(mapping); } EXPORT_SYMBOL(invalidate_bdev); @@ -162,7 +163,7 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { - if (iblock >= max_block(I_BDEV(inode))) { + if (iblock >= blkdev_max_block(I_BDEV(inode))) { if (create) return -EIO; @@ -184,7 +185,7 @@ static int blkdev_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { - sector_t end_block = max_block(I_BDEV(inode)); + sector_t end_block = blkdev_max_block(I_BDEV(inode)); unsigned long max_blocks = bh->b_size >> inode->i_blkbits; if ((iblock + max_blocks) > end_block) { @@ -506,7 +507,7 @@ static const struct super_operations bdev_sops = { static struct dentry *bd_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576); + return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC); } static struct file_system_type bd_type = { diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 0cc20b35c1c..42704149b72 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -171,11 +171,11 @@ out: spin_unlock_irqrestore(&workers->lock, flags); } -static noinline int run_ordered_completions(struct btrfs_workers *workers, +static noinline void run_ordered_completions(struct btrfs_workers *workers, struct btrfs_work *work) { if (!workers->ordered) - return 0; + return; set_bit(WORK_DONE_BIT, &work->flags); @@ -213,7 +213,6 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, } spin_unlock(&workers->order_lock); - return 0; } static void put_worker(struct btrfs_worker_thread *worker) @@ -399,7 +398,7 @@ again: /* * this will wait for all the worker threads to shutdown */ -int btrfs_stop_workers(struct btrfs_workers *workers) +void btrfs_stop_workers(struct btrfs_workers *workers) { struct list_head *cur; struct btrfs_worker_thread *worker; @@ -427,7 +426,6 @@ int btrfs_stop_workers(struct btrfs_workers *workers) put_worker(worker); } spin_unlock_irq(&workers->lock); - return 0; } /* @@ -615,14 +613,14 @@ found: * it was taken from. It is intended for use with long running work functions * that make some progress and want to give the cpu up for others. */ -int btrfs_requeue_work(struct btrfs_work *work) +void btrfs_requeue_work(struct btrfs_work *work) { struct btrfs_worker_thread *worker = work->worker; unsigned long flags; int wake = 0; if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) - goto out; + return; spin_lock_irqsave(&worker->lock, flags); if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) @@ -649,9 +647,6 @@ int btrfs_requeue_work(struct btrfs_work *work) if (wake) wake_up_process(worker->task); spin_unlock_irqrestore(&worker->lock, flags); -out: - - return 0; } void btrfs_set_work_high_prio(struct btrfs_work *work) diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index f34cc31fa3c..063698b90ce 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -111,9 +111,9 @@ struct btrfs_workers { void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); int btrfs_start_workers(struct btrfs_workers *workers); -int btrfs_stop_workers(struct btrfs_workers *workers); +void btrfs_stop_workers(struct btrfs_workers *workers); void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, struct btrfs_workers *async_starter); -int btrfs_requeue_work(struct btrfs_work *work); +void btrfs_requeue_work(struct btrfs_work *work); void btrfs_set_work_high_prio(struct btrfs_work *work); #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 56136d9046f..bcec0675023 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -22,6 +22,7 @@ #include "ulist.h" #include "transaction.h" #include "delayed-ref.h" +#include "locking.h" /* * this structure records all encountered refs on the way up to the root @@ -893,18 +894,22 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, s64 bytes_left = size - 1; struct extent_buffer *eb = eb_in; struct btrfs_key found_key; + int leave_spinning = path->leave_spinning; if (bytes_left >= 0) dest[bytes_left] = '\0'; + path->leave_spinning = 1; while (1) { len = btrfs_inode_ref_name_len(eb, iref); bytes_left -= len; if (bytes_left >= 0) read_extent_buffer(eb, dest + bytes_left, (unsigned long)(iref + 1), len); - if (eb != eb_in) + if (eb != eb_in) { + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); + } ret = inode_ref_info(parent, 0, fs_root, path, &found_key); if (ret > 0) ret = -ENOENT; @@ -919,8 +924,11 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, slot = path->slots[0]; eb = path->nodes[0]; /* make sure we can use eb after releasing the path */ - if (eb != eb_in) + if (eb != eb_in) { atomic_inc(&eb->refs); + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + } btrfs_release_path(path); iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); @@ -931,6 +939,7 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, } btrfs_release_path(path); + path->leave_spinning = leave_spinning; if (ret) return ERR_PTR(ret); @@ -1247,7 +1256,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, struct btrfs_path *path, iterate_irefs_t *iterate, void *ctx) { - int ret; + int ret = 0; int slot; u32 cur; u32 len; @@ -1259,7 +1268,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, struct btrfs_inode_ref *iref; struct btrfs_key found_key; - while (1) { + while (!ret) { + path->leave_spinning = 1; ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, &found_key); if (ret < 0) @@ -1275,6 +1285,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, eb = path->nodes[0]; /* make sure we can use eb after releasing the path */ atomic_inc(&eb->refs); + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); item = btrfs_item_nr(eb, slot); @@ -1288,13 +1300,12 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, (unsigned long long)found_key.objectid, (unsigned long long)fs_root->objectid); ret = iterate(parent, iref, eb, ctx); - if (ret) { - free_extent_buffer(eb); + if (ret) break; - } len = sizeof(*iref) + name_len; iref = (struct btrfs_inode_ref *)((char *)iref + len); } + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); } @@ -1359,12 +1370,6 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) inode_to_path, ipath); } -/* - * allocates space to return multiple file system paths for an inode. - * total_bytes to allocate are passed, note that space usable for actual path - * information will be total_bytes - sizeof(struct inode_fs_paths). - * the returned pointer must be freed with free_ipath() in the end. - */ struct btrfs_data_container *init_data_container(u32 total_bytes) { struct btrfs_data_container *data; @@ -1420,5 +1425,8 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, void free_ipath(struct inode_fs_paths *ipath) { + if (!ipath) + return; + kfree(ipath->fspath); kfree(ipath); } diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index d986824bb2b..c053e90f200 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -89,7 +89,6 @@ #include "disk-io.h" #include "transaction.h" #include "extent_io.h" -#include "disk-io.h" #include "volumes.h" #include "print-tree.h" #include "locking.h" diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d02c27cd14c..86eff48dab7 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -120,10 +120,10 @@ static int check_compressed_csum(struct inode *inode, page = cb->compressed_pages[i]; csum = ~(u32)0; - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE); btrfs_csum_final(csum, (char *)&csum); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); if (csum != *cb_sum) { printk(KERN_INFO "btrfs csum failed ino %llu " @@ -226,8 +226,8 @@ out: * Clear the writeback bits on all of the file * pages for a compressed write */ -static noinline int end_compressed_writeback(struct inode *inode, u64 start, - unsigned long ram_size) +static noinline void end_compressed_writeback(struct inode *inode, u64 start, + unsigned long ram_size) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; @@ -253,7 +253,6 @@ static noinline int end_compressed_writeback(struct inode *inode, u64 start, index += ret; } /* the inode may be gone now */ - return 0; } /* @@ -392,20 +391,21 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, */ atomic_inc(&cb->pending_bios); ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { ret = btrfs_csum_one_bio(root, inode, bio, start, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } ret = btrfs_map_bio(root, WRITE, bio, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(bio); bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + BUG_ON(!bio); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); @@ -421,15 +421,15 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_get(bio); ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { ret = btrfs_csum_one_bio(root, inode, bio, start, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } ret = btrfs_map_bio(root, WRITE, bio, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(bio); return 0; @@ -497,7 +497,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * sure they map to this compressed extent on disk. */ set_page_extent_mapped(page); - lock_extent(tree, last_offset, end, GFP_NOFS); + lock_extent(tree, last_offset, end); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, last_offset, PAGE_CACHE_SIZE); @@ -507,7 +507,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || (em->block_start >> 9) != cb->orig_bio->bi_sector) { free_extent_map(em); - unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_extent(tree, last_offset, end); unlock_page(page); page_cache_release(page); break; @@ -521,10 +521,10 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (zero_offset) { int zeros; zeros = PAGE_CACHE_SIZE - zero_offset; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + zero_offset, 0, zeros); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); } } @@ -535,7 +535,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, nr_pages++; page_cache_release(page); } else { - unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_extent(tree, last_offset, end); unlock_page(page); page_cache_release(page); break; @@ -662,7 +662,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_get(comp_bio); ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ /* * inc the count before we submit the bio so @@ -675,19 +675,20 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } sums += (comp_bio->bi_size + root->sectorsize - 1) / root->sectorsize; ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(comp_bio); comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + BUG_ON(!comp_bio); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; @@ -698,15 +699,15 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_get(comp_bio); ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(comp_bio); return 0; @@ -734,7 +735,7 @@ struct btrfs_compress_op *btrfs_compress_op[] = { &btrfs_lzo_compress, }; -int __init btrfs_init_compress(void) +void __init btrfs_init_compress(void) { int i; @@ -744,7 +745,6 @@ int __init btrfs_init_compress(void) atomic_set(&comp_alloc_workspace[i], 0); init_waitqueue_head(&comp_workspace_wait[i]); } - return 0; } /* @@ -993,9 +993,9 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, bytes = min(PAGE_CACHE_SIZE - *pg_offset, PAGE_CACHE_SIZE - buf_offset); bytes = min(bytes, working_bytes); - kaddr = kmap_atomic(page_out, KM_USER0); + kaddr = kmap_atomic(page_out); memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); flush_dcache_page(page_out); *pg_offset += bytes; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index a12059f4f0f..9afb0a62ae8 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -19,7 +19,7 @@ #ifndef __BTRFS_COMPRESSION_ #define __BTRFS_COMPRESSION_ -int btrfs_init_compress(void); +void btrfs_init_compress(void); void btrfs_exit_compress(void); int btrfs_compress_pages(int type, struct address_space *mapping, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0639a555e16..4106264fbc6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -36,7 +36,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); -static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, +static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); struct btrfs_path *btrfs_alloc_path(void) @@ -156,10 +156,23 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) { struct extent_buffer *eb; - rcu_read_lock(); - eb = rcu_dereference(root->node); - extent_buffer_get(eb); - rcu_read_unlock(); + while (1) { + rcu_read_lock(); + eb = rcu_dereference(root->node); + + /* + * RCU really hurts here, we could free up the root node because + * it was cow'ed but we may not get the new root node yet so do + * the inc_not_zero dance and if it doesn't work then + * synchronize_rcu and try again. + */ + if (atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + break; + } + rcu_read_unlock(); + synchronize_rcu(); + } return eb; } @@ -207,10 +220,12 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) */ static void add_root_to_dirty_list(struct btrfs_root *root) { + spin_lock(&root->fs_info->trans_lock); if (root->track_dirty && list_empty(&root->dirty_list)) { list_add(&root->dirty_list, &root->fs_info->dirty_cowonly_roots); } + spin_unlock(&root->fs_info->trans_lock); } /* @@ -331,8 +346,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (btrfs_block_can_be_shared(root, buf)) { ret = btrfs_lookup_extent_info(trans, root, buf->start, buf->len, &refs, &flags); - BUG_ON(ret); - BUG_ON(refs == 0); + if (ret) + return ret; + if (refs == 0) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + return ret; + } } else { refs = 1; if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || @@ -351,14 +371,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { ret = btrfs_inc_ref(trans, root, buf, 1, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_dec_ref(trans, root, buf, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_inc_ref(trans, root, cow, 1, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; } else { @@ -368,14 +388,15 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1, 1); else ret = btrfs_inc_ref(trans, root, cow, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } if (new_flags != 0) { ret = btrfs_set_disk_extent_flags(trans, root, buf->start, buf->len, new_flags, 0); - BUG_ON(ret); + if (ret) + return ret; } } else { if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { @@ -384,9 +405,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1, 1); else ret = btrfs_inc_ref(trans, root, cow, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, buf, 1, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } clean_tree_block(trans, root, buf); *last_ref = 1; @@ -415,7 +436,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, { struct btrfs_disk_key disk_key; struct extent_buffer *cow; - int level; + int level, ret; int last_ref = 0; int unlock_orig = 0; u64 parent_start; @@ -467,7 +488,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_fsid(cow), BTRFS_FSID_SIZE); - update_ref_for_cow(trans, root, buf, cow, &last_ref); + ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } if (root->ref_cows) btrfs_reloc_cow_block(trans, root, buf, cow); @@ -504,7 +529,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, } if (unlock_orig) btrfs_tree_unlock(buf); - free_extent_buffer(buf); + free_extent_buffer_stale(buf); btrfs_mark_buffer_dirty(cow); *cow_ret = cow; return 0; @@ -700,7 +725,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, cur = btrfs_find_tree_block(root, blocknr, blocksize); if (cur) - uptodate = btrfs_buffer_uptodate(cur, gen); + uptodate = btrfs_buffer_uptodate(cur, gen, 0); else uptodate = 0; if (!cur || !uptodate) { @@ -934,7 +959,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); - BUG_ON(!child); + if (!child) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + goto enospc; + } + btrfs_tree_lock(child); btrfs_set_lock_blocking(child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child); @@ -959,7 +989,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1, 0); /* once for the root ptr */ - free_extent_buffer(mid); + free_extent_buffer_stale(mid); return 0; } if (btrfs_header_nritems(mid) > @@ -1010,13 +1040,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { clean_tree_block(trans, root, right); btrfs_tree_unlock(right); - wret = del_ptr(trans, root, path, level + 1, pslot + - 1); - if (wret) - ret = wret; + del_ptr(trans, root, path, level + 1, pslot + 1); root_sub_used(root, right->len); btrfs_free_tree_block(trans, root, right, 0, 1, 0); - free_extent_buffer(right); + free_extent_buffer_stale(right); right = NULL; } else { struct btrfs_disk_key right_key; @@ -1035,7 +1062,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, * otherwise we would have pulled some pointers from the * right */ - BUG_ON(!left); + if (!left) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + goto enospc; + } wret = balance_node_right(trans, root, mid, left); if (wret < 0) { ret = wret; @@ -1051,12 +1082,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { clean_tree_block(trans, root, mid); btrfs_tree_unlock(mid); - wret = del_ptr(trans, root, path, level + 1, pslot); - if (wret) - ret = wret; + del_ptr(trans, root, path, level + 1, pslot); root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1, 0); - free_extent_buffer(mid); + free_extent_buffer_stale(mid); mid = NULL; } else { /* update the parent key to reflect our changes */ @@ -1331,7 +1360,12 @@ static noinline int reada_for_balance(struct btrfs_root *root, block1 = btrfs_node_blockptr(parent, slot - 1); gen = btrfs_node_ptr_generation(parent, slot - 1); eb = btrfs_find_tree_block(root, block1, blocksize); - if (eb && btrfs_buffer_uptodate(eb, gen)) + /* + * if we get -eagain from btrfs_buffer_uptodate, we + * don't want to return eagain here. That will loop + * forever + */ + if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block1 = 0; free_extent_buffer(eb); } @@ -1339,7 +1373,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); eb = btrfs_find_tree_block(root, block2, blocksize); - if (eb && btrfs_buffer_uptodate(eb, gen)) + if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block2 = 0; free_extent_buffer(eb); } @@ -1382,7 +1416,8 @@ static noinline int reada_for_balance(struct btrfs_root *root, * if lowest_unlock is 1, level 0 won't be unlocked */ static noinline void unlock_up(struct btrfs_path *path, int level, - int lowest_unlock) + int lowest_unlock, int min_write_lock_level, + int *write_lock_level) { int i; int skip_level = level; @@ -1414,6 +1449,11 @@ static noinline void unlock_up(struct btrfs_path *path, int level, if (i >= lowest_unlock && i > skip_level && path->locks[i]) { btrfs_tree_unlock_rw(t, path->locks[i]); path->locks[i] = 0; + if (write_lock_level && + i > min_write_lock_level && + i <= *write_lock_level) { + *write_lock_level = i - 1; + } } } } @@ -1471,8 +1511,9 @@ read_block_for_search(struct btrfs_trans_handle *trans, tmp = btrfs_find_tree_block(root, blocknr, blocksize); if (tmp) { - if (btrfs_buffer_uptodate(tmp, 0)) { - if (btrfs_buffer_uptodate(tmp, gen)) { + /* first we do an atomic uptodate check */ + if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) { + if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { /* * we found an up to date block without * sleeping, return @@ -1490,8 +1531,9 @@ read_block_for_search(struct btrfs_trans_handle *trans, free_extent_buffer(tmp); btrfs_set_path_blocking(p); + /* now we're allowed to do a blocking uptodate check */ tmp = read_tree_block(root, blocknr, blocksize, gen); - if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) { *eb_ret = tmp; return 0; } @@ -1526,7 +1568,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, * and give up so that our caller doesn't loop forever * on our EAGAINs. */ - if (!btrfs_buffer_uptodate(tmp, 0)) + if (!btrfs_buffer_uptodate(tmp, 0, 0)) ret = -EIO; free_extent_buffer(tmp); } @@ -1637,6 +1679,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root /* everything at write_lock_level or lower must be write locked */ int write_lock_level = 0; u8 lowest_level = 0; + int min_write_lock_level; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -1664,6 +1707,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (cow && (p->keep_locks || p->lowest_level)) write_lock_level = BTRFS_MAX_LEVEL; + min_write_lock_level = write_lock_level; + again: /* * we try very hard to do read locks on the root @@ -1795,7 +1840,8 @@ cow_done: goto again; } - unlock_up(p, level, lowest_unlock); + unlock_up(p, level, lowest_unlock, + min_write_lock_level, &write_lock_level); if (level == lowest_level) { if (dec) @@ -1857,7 +1903,8 @@ cow_done: } } if (!p->search_for_split) - unlock_up(p, level, lowest_unlock); + unlock_up(p, level, lowest_unlock, + min_write_lock_level, &write_lock_level); goto done; } } @@ -1881,15 +1928,12 @@ done: * fixing up pointers when a given leaf/node is not in slot 0 of the * higher levels * - * If this fails to write a tree block, it returns -1, but continues - * fixing up the blocks in ram so the tree is consistent. */ -static int fixup_low_keys(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_disk_key *key, int level) +static void fixup_low_keys(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, int level) { int i; - int ret = 0; struct extent_buffer *t; for (i = level; i < BTRFS_MAX_LEVEL; i++) { @@ -1902,7 +1946,6 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, if (tslot != 0) break; } - return ret; } /* @@ -1911,9 +1954,9 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, * This function isn't completely safe. It's the caller's responsibility * that the new key won't break the order */ -int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key) +void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key) { struct btrfs_disk_key disk_key; struct extent_buffer *eb; @@ -1923,13 +1966,11 @@ int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, slot = path->slots[0]; if (slot > 0) { btrfs_item_key(eb, &disk_key, slot - 1); - if (comp_keys(&disk_key, new_key) >= 0) - return -1; + BUG_ON(comp_keys(&disk_key, new_key) >= 0); } if (slot < btrfs_header_nritems(eb) - 1) { btrfs_item_key(eb, &disk_key, slot + 1); - if (comp_keys(&disk_key, new_key) <= 0) - return -1; + BUG_ON(comp_keys(&disk_key, new_key) <= 0); } btrfs_cpu_key_to_disk(&disk_key, new_key); @@ -1937,7 +1978,6 @@ int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(eb); if (slot == 0) fixup_low_keys(trans, root, path, &disk_key, 1); - return 0; } /* @@ -2140,12 +2180,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * * slot and level indicate where you want the key to go, and * blocknr is the block the key points to. - * - * returns zero on success and < 0 on any error */ -static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, struct btrfs_disk_key - *key, u64 bytenr, int slot, int level) +static void insert_ptr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, u64 bytenr, + int slot, int level) { struct extent_buffer *lower; int nritems; @@ -2155,8 +2194,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); BUG_ON(slot > nritems); - if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) - BUG(); + BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); if (slot != nritems) { memmove_extent_buffer(lower, btrfs_node_key_ptr_offset(slot + 1), @@ -2169,7 +2207,6 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_node_ptr_generation(lower, slot, trans->transid); btrfs_set_header_nritems(lower, nritems + 1); btrfs_mark_buffer_dirty(lower); - return 0; } /* @@ -2190,7 +2227,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; int mid; int ret; - int wret; u32 c_nritems; c = path->nodes[level]; @@ -2247,11 +2283,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); - wret = insert_ptr(trans, root, path, &disk_key, split->start, - path->slots[level + 1] + 1, - level + 1); - if (wret) - ret = wret; + insert_ptr(trans, root, path, &disk_key, split->start, + path->slots[level + 1] + 1, level + 1); if (path->slots[level] >= mid) { path->slots[level] -= mid; @@ -2320,6 +2353,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *upper = path->nodes[1]; + struct btrfs_map_token token; struct btrfs_disk_key disk_key; int slot; u32 i; @@ -2331,6 +2365,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, u32 data_end; u32 this_item_size; + btrfs_init_map_token(&token); + if (empty) nr = 0; else @@ -2408,8 +2444,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - push_space -= btrfs_item_size(right, item); - btrfs_set_item_offset(right, item, push_space); + push_space -= btrfs_token_item_size(right, item, &token); + btrfs_set_token_item_offset(right, item, push_space, &token); } left_nritems -= push_items; @@ -2537,9 +2573,11 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, u32 old_left_nritems; u32 nr; int ret = 0; - int wret; u32 this_item_size; u32 old_left_item_size; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); if (empty) nr = min(right_nritems, max_slot); @@ -2600,9 +2638,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, item = btrfs_item_nr(left, i); - ioff = btrfs_item_offset(left, item); - btrfs_set_item_offset(left, item, - ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); + ioff = btrfs_token_item_offset(left, item, &token); + btrfs_set_token_item_offset(left, item, + ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size), + &token); } btrfs_set_header_nritems(left, old_left_nritems + push_items); @@ -2632,8 +2671,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - push_space = push_space - btrfs_item_size(right, item); - btrfs_set_item_offset(right, item, push_space); + push_space = push_space - btrfs_token_item_size(right, + item, &token); + btrfs_set_token_item_offset(right, item, push_space, &token); } btrfs_mark_buffer_dirty(left); @@ -2643,9 +2683,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, clean_tree_block(trans, root, right); btrfs_item_key(right, &disk_key, 0); - wret = fixup_low_keys(trans, root, path, &disk_key, 1); - if (wret) - ret = wret; + fixup_low_keys(trans, root, path, &disk_key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { @@ -2716,7 +2754,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root path->nodes[1], slot - 1, &left); if (ret) { /* we hit -ENOSPC, but it isn't fatal here */ - ret = 1; + if (ret == -ENOSPC) + ret = 1; goto out; } @@ -2738,22 +2777,21 @@ out: /* * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. - * - * returns 0 if all went well and < 0 on failure. */ -static noinline int copy_for_split(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *l, - struct extent_buffer *right, - int slot, int mid, int nritems) +static noinline void copy_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) { int data_copy_size; int rt_data_off; int i; - int ret = 0; - int wret; struct btrfs_disk_key disk_key; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); @@ -2775,17 +2813,15 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, struct btrfs_item *item = btrfs_item_nr(right, i); u32 ioff; - ioff = btrfs_item_offset(right, item); - btrfs_set_item_offset(right, item, ioff + rt_data_off); + ioff = btrfs_token_item_offset(right, item, &token); + btrfs_set_token_item_offset(right, item, + ioff + rt_data_off, &token); } btrfs_set_header_nritems(l, mid); - ret = 0; btrfs_item_key(right, &disk_key, 0); - wret = insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; + insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); btrfs_mark_buffer_dirty(right); btrfs_mark_buffer_dirty(l); @@ -2803,8 +2839,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, } BUG_ON(path->slots[0] < 0); - - return ret; } /* @@ -2993,12 +3027,8 @@ again: if (split == 0) { if (mid <= slot) { btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; - + insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3006,29 +3036,21 @@ again: path->slots[1] += 1; } else { btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, - right->start, + insert_ptr(trans, root, path, &disk_key, right->start, path->slots[1], 1); - if (wret) - ret = wret; btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; path->slots[0] = 0; - if (path->slots[1] == 0) { - wret = fixup_low_keys(trans, root, - path, &disk_key, 1); - if (wret) - ret = wret; - } + if (path->slots[1] == 0) + fixup_low_keys(trans, root, path, + &disk_key, 1); } btrfs_mark_buffer_dirty(right); return ret; } - ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); - BUG_ON(ret); + copy_for_split(trans, root, path, l, right, slot, mid, nritems); if (split == 2) { BUG_ON(num_doubles != 0); @@ -3036,7 +3058,7 @@ again: goto again; } - return ret; + return 0; push_for_double: push_for_double_split(trans, root, path, data_size); @@ -3238,11 +3260,9 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - ret = setup_items_for_insert(trans, root, path, new_key, &item_size, - item_size, item_size + - sizeof(struct btrfs_item), 1); - BUG_ON(ret); - + setup_items_for_insert(trans, root, path, new_key, &item_size, + item_size, item_size + + sizeof(struct btrfs_item), 1); leaf = path->nodes[0]; memcpy_extent_buffer(leaf, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -3257,10 +3277,10 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, * off the end of the item or if we shift the item to chop bytes off * the front. */ -int btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u32 new_size, int from_end) +void btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; @@ -3271,13 +3291,16 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, unsigned int old_size; unsigned int size_diff; int i; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; slot = path->slots[0]; old_size = btrfs_item_size_nr(leaf, slot); if (old_size == new_size) - return 0; + return; nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(root, leaf); @@ -3297,8 +3320,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff + size_diff); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff + size_diff, &token); } /* shift the data */ @@ -3350,15 +3374,14 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, btrfs_print_leaf(root, leaf); BUG(); } - return 0; } /* * make the item pointed to by the path bigger, data_size is the new size. */ -int btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - u32 data_size) +void btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u32 data_size) { int slot; struct extent_buffer *leaf; @@ -3368,6 +3391,9 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, unsigned int old_data; unsigned int old_size; int i; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; @@ -3397,8 +3423,9 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - data_size); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - data_size, &token); } /* shift the data */ @@ -3416,7 +3443,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, btrfs_print_leaf(root, leaf); BUG(); } - return 0; } /* @@ -3441,6 +3467,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, unsigned int data_end; struct btrfs_disk_key disk_key; struct btrfs_key found_key; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); for (i = 0; i < nr; i++) { if (total_size + data_size[i] + sizeof(struct btrfs_item) > @@ -3506,8 +3535,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - total_data); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - total_data, &token); } /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), @@ -3534,9 +3564,10 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(leaf, slot + i); - btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + btrfs_set_token_item_offset(leaf, item, + data_end - data_size[i], &token); data_end -= data_size[i]; - btrfs_set_item_size(leaf, item, data_size[i]); + btrfs_set_token_item_size(leaf, item, data_size[i], &token); } btrfs_set_header_nritems(leaf, nritems + nr); btrfs_mark_buffer_dirty(leaf); @@ -3544,7 +3575,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, ret = 0; if (slot == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); - ret = fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(trans, root, path, &disk_key, 1); } if (btrfs_leaf_free_space(root, leaf) < 0) { @@ -3562,19 +3593,21 @@ out: * to save stack depth by doing the bulk of the work in a function * that doesn't call btrfs_search_slot */ -int setup_items_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr) +void setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr) { struct btrfs_item *item; int i; u32 nritems; unsigned int data_end; struct btrfs_disk_key disk_key; - int ret; struct extent_buffer *leaf; int slot; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; slot = path->slots[0]; @@ -3606,8 +3639,9 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - total_data); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - total_data, &token); } /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), @@ -3626,17 +3660,17 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(leaf, slot + i); - btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + btrfs_set_token_item_offset(leaf, item, + data_end - data_size[i], &token); data_end -= data_size[i]; - btrfs_set_item_size(leaf, item, data_size[i]); + btrfs_set_token_item_size(leaf, item, data_size[i], &token); } btrfs_set_header_nritems(leaf, nritems + nr); - ret = 0; if (slot == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); - ret = fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(trans, root, path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(leaf); @@ -3645,7 +3679,6 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_print_leaf(root, leaf); BUG(); } - return ret; } /* @@ -3672,16 +3705,14 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, if (ret == 0) return -EEXIST; if (ret < 0) - goto out; + return ret; slot = path->slots[0]; BUG_ON(slot < 0); - ret = setup_items_for_insert(trans, root, path, cpu_key, data_size, + setup_items_for_insert(trans, root, path, cpu_key, data_size, total_data, total_size, nr); - -out: - return ret; + return 0; } /* @@ -3717,13 +3748,11 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root * the tree should have been previously balanced so the deletion does not * empty a node. */ -static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot) +static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; - int ret = 0; - int wret; nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { @@ -3743,12 +3772,9 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - wret = fixup_low_keys(trans, root, path, &disk_key, level + 1); - if (wret) - ret = wret; + fixup_low_keys(trans, root, path, &disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); - return ret; } /* @@ -3761,17 +3787,13 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, * The path must have already been setup for deleting the leaf, including * all the proper balancing. path->nodes[1] must be locked. */ -static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *leaf) +static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) { - int ret; - WARN_ON(btrfs_header_generation(leaf) != trans->transid); - ret = del_ptr(trans, root, path, 1, path->slots[1]); - if (ret) - return ret; + del_ptr(trans, root, path, 1, path->slots[1]); /* * btrfs_free_extent is expensive, we want to make sure we @@ -3781,8 +3803,9 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); + extent_buffer_get(leaf); btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); - return 0; + free_extent_buffer_stale(leaf); } /* * delete the item at the leaf level in path. If that empties @@ -3799,6 +3822,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, int wret; int i; u32 nritems; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); @@ -3820,8 +3846,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff + dsize); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff + dsize, &token); } memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), @@ -3839,8 +3866,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } else { btrfs_set_path_blocking(path); clean_tree_block(trans, root, leaf); - ret = btrfs_del_leaf(trans, root, path, leaf); - BUG_ON(ret); + btrfs_del_leaf(trans, root, path, leaf); } } else { int used = leaf_space_used(leaf, 0, nritems); @@ -3848,10 +3874,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - wret = fixup_low_keys(trans, root, path, - &disk_key, 1); - if (wret) - ret = wret; + fixup_low_keys(trans, root, path, &disk_key, 1); } /* delete the leaf if it is mostly empty */ @@ -3879,9 +3902,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; - ret = btrfs_del_leaf(trans, root, path, leaf); - BUG_ON(ret); + btrfs_del_leaf(trans, root, path, leaf); free_extent_buffer(leaf); + ret = 0; } else { /* if we're still in the path, make sure * we're dirty. Otherwise, one of the @@ -4029,7 +4052,7 @@ again: tmp = btrfs_find_tree_block(root, blockptr, btrfs_level_size(root, level - 1)); - if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) { free_extent_buffer(tmp); break; } @@ -4059,18 +4082,18 @@ find_next_key: path->slots[level] = slot; if (level == path->lowest_level) { ret = 0; - unlock_up(path, level, 1); + unlock_up(path, level, 1, 0, NULL); goto out; } btrfs_set_path_blocking(path); cur = read_node_slot(root, cur, slot); - BUG_ON(!cur); + BUG_ON(!cur); /* -ENOMEM */ btrfs_tree_read_lock(cur); path->locks[level - 1] = BTRFS_READ_LOCK; path->nodes[level - 1] = cur; - unlock_up(path, level, 1); + unlock_up(path, level, 1, 0, NULL); btrfs_clear_path_blocking(path, NULL, 0); } out: @@ -4152,7 +4175,8 @@ next: struct extent_buffer *cur; cur = btrfs_find_tree_block(root, blockptr, btrfs_level_size(root, level - 1)); - if (!cur || !btrfs_buffer_uptodate(cur, gen)) { + if (!cur || + btrfs_buffer_uptodate(cur, gen, 1) <= 0) { slot++; if (cur) free_extent_buffer(cur); @@ -4306,7 +4330,7 @@ again: } ret = 0; done: - unlock_up(path, 0, 1); + unlock_up(path, 0, 1, 0, NULL); path->leave_spinning = old_spinning; if (!old_spinning) btrfs_set_path_blocking(path); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 91128d7e9d3..8fd72331d60 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -48,6 +48,8 @@ struct btrfs_ordered_sum; #define BTRFS_MAGIC "_BHRfS_M" +#define BTRFS_MAX_MIRRORS 2 + #define BTRFS_MAX_LEVEL 8 #define BTRFS_COMPAT_EXTENT_TREE_V0 @@ -138,6 +140,12 @@ struct btrfs_ordered_sum; #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 /* + * the max metadata block size. This limit is somewhat artificial, + * but the memmove costs go through the roof for larger blocks. + */ +#define BTRFS_MAX_METADATA_BLOCKSIZE 65536 + +/* * we can actually store much bigger names, but lets not confuse the rest * of linux */ @@ -461,6 +469,19 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) #define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) +/* + * some patches floated around with a second compression method + * lets save that incompat here for when they do get in + * Note we don't actually support it, we're just reserving the + * number + */ +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4) + +/* + * older kernels tried to do bigger metadata blocks, but the + * code was pretty buggy. Lets not let them try anymore. + */ +#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL @@ -468,6 +489,7 @@ struct btrfs_super_block { (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) /* @@ -829,6 +851,21 @@ struct btrfs_csum_item { */ #define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) +#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ + BTRFS_AVAIL_ALLOC_BIT_SINGLE) + +static inline u64 chunk_to_extended(u64 flags) +{ + if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0) + flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + return flags; +} +static inline u64 extended_to_chunk(u64 flags) +{ + return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; +} + struct btrfs_block_group_item { __le64 used; __le64 chunk_objectid; @@ -1041,7 +1078,7 @@ struct btrfs_fs_info { * is required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; - unsigned long mount_opt:21; + unsigned long mount_opt; unsigned long compress_type:4; u64 max_inline; u64 alloc_start; @@ -1503,6 +1540,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) #define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) +#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1526,6 +1564,17 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) +struct btrfs_map_token { + struct extent_buffer *eb; + char *kaddr; + unsigned long offset; +}; + +static inline void btrfs_init_map_token (struct btrfs_map_token *token) +{ + memset(token, 0, sizeof(*token)); +} + /* some macros to generate set/get funcs for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: @@ -1549,20 +1598,22 @@ struct btrfs_ioctl_defrag_range_args { #ifndef BTRFS_SETGET_FUNCS #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token); \ +void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); #endif #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(struct extent_buffer *eb) \ { \ - type *p = page_address(eb->first_page); \ + type *p = page_address(eb->pages[0]); \ u##bits res = le##bits##_to_cpu(p->member); \ return res; \ } \ static inline void btrfs_set_##name(struct extent_buffer *eb, \ u##bits val) \ { \ - type *p = page_address(eb->first_page); \ + type *p = page_address(eb->pages[0]); \ p->member = cpu_to_le##bits(val); \ } @@ -2466,8 +2517,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, - u64 data); + struct btrfs_key *ins, u64 data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref, int for_cow); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2484,8 +2534,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root); +void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, @@ -2548,8 +2598,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root, u64 num_bytes); int btrfs_set_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); -int btrfs_set_block_group_rw(struct btrfs_root *root, - struct btrfs_block_group_cache *cache); +void btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_root *root, @@ -2568,9 +2618,9 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); -int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key); +void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, @@ -2590,12 +2640,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid); int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf); -int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, u32 data_size); -int btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u32 new_size, int from_end); +void btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u32 data_size); +void btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -2629,10 +2680,10 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, return btrfs_del_items(trans, root, path, path->slots[0], 1); } -int setup_items_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr); +void setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, @@ -2659,9 +2710,9 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) } int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); -void btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref, - int for_reloc); +int __must_check btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + int update_ref, int for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, @@ -2687,24 +2738,6 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->super_for_commit); kfree(fs_info); } -/** - * profile_is_valid - tests whether a given profile is valid and reduced - * @flags: profile to validate - * @extended: if true @flags is treated as an extended profile - */ -static inline int profile_is_valid(u64 flags, int extended) -{ - u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK; - - flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; - if (extended) - mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - if (flags & mask) - return 0; - /* true if zero or exactly one bit set */ - return (flags & (~flags + 1)) == flags; -} /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, @@ -2723,9 +2756,10 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item); -int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, struct btrfs_root_item - *item); +int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_root_item *item); int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct btrfs_root_item *item, struct btrfs_key *key); int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); @@ -2909,7 +2943,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root); void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); -int btrfs_invalidate_inodes(struct btrfs_root *root); +void btrfs_invalidate_inodes(struct btrfs_root *root); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_root *root); int btrfs_prealloc_file_range(struct inode *inode, int mode, @@ -2961,13 +2995,41 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); /* super.c */ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); +void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...); void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno); + unsigned int line, int errno, const char *fmt, ...); + +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *function, + unsigned int line, int errno); + +#define btrfs_abort_transaction(trans, root, errno) \ +do { \ + __btrfs_abort_transaction(trans, root, __func__, \ + __LINE__, errno); \ +} while (0) #define btrfs_std_error(fs_info, errno) \ do { \ if ((errno)) \ - __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\ + __btrfs_std_error((fs_info), __func__, \ + __LINE__, (errno), NULL); \ +} while (0) + +#define btrfs_error(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_std_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ +} while (0) + +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); + +#define btrfs_panic(fs_info, errno, fmt, args...) \ +do { \ + struct btrfs_fs_info *_i = (fs_info); \ + __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args); \ + BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)); \ } while (0) /* acl.c */ @@ -3003,16 +3065,17 @@ void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve); -void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, +int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); /* scrub.c */ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly); -int btrfs_scrub_pause(struct btrfs_root *root); -int btrfs_scrub_pause_super(struct btrfs_root *root); -int btrfs_scrub_continue(struct btrfs_root *root); -int btrfs_scrub_continue_super(struct btrfs_root *root); +void btrfs_scrub_pause(struct btrfs_root *root); +void btrfs_scrub_pause_super(struct btrfs_root *root); +void btrfs_scrub_continue(struct btrfs_root *root); +void btrfs_scrub_continue_super(struct btrfs_root *root); +int __btrfs_scrub_cancel(struct btrfs_fs_info *info); int btrfs_scrub_cancel(struct btrfs_root *root); int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index fe4cd0f1cef..03e3748d84d 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -115,6 +115,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) return NULL; } +/* Will return either the node or PTR_ERR(-ENOMEM) */ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( struct inode *inode) { @@ -836,10 +837,8 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, btrfs_clear_path_blocking(path, NULL, 0); /* insert the keys of the items */ - ret = setup_items_for_insert(trans, root, path, keys, data_size, - total_data_size, total_size, nitems); - if (ret) - goto error; + setup_items_for_insert(trans, root, path, keys, data_size, + total_data_size, total_size, nitems); /* insert the dir index items */ slot = path->slots[0]; @@ -1108,16 +1107,25 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, return 0; } -/* Called when committing the transaction. */ +/* + * Called when committing the transaction. + * Returns 0 on success. + * Returns < 0 on error and returns with an aborted transaction with any + * outstanding delayed items cleaned up. + */ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + struct btrfs_root *curr_root = root; struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_path *path; struct btrfs_block_rsv *block_rsv; int ret = 0; + if (trans->aborted) + return -EIO; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1130,17 +1138,18 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, curr_node = btrfs_first_delayed_node(delayed_root); while (curr_node) { - root = curr_node->root; - ret = btrfs_insert_delayed_items(trans, path, root, + curr_root = curr_node->root; + ret = btrfs_insert_delayed_items(trans, path, curr_root, curr_node); if (!ret) - ret = btrfs_delete_delayed_items(trans, path, root, - curr_node); + ret = btrfs_delete_delayed_items(trans, path, + curr_root, curr_node); if (!ret) - ret = btrfs_update_delayed_inode(trans, root, path, - curr_node); + ret = btrfs_update_delayed_inode(trans, curr_root, + path, curr_node); if (ret) { btrfs_release_delayed_node(curr_node); + btrfs_abort_transaction(trans, root, ret); break; } @@ -1151,6 +1160,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, btrfs_free_path(path); trans->block_rsv = block_rsv; + return ret; } @@ -1371,6 +1381,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root) btrfs_wq_run_delayed_node(delayed_root, root, 0); } +/* Will return 0 or -ENOMEM */ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct inode *dir, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 66e4f29505a..69f22e3ab3b 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -420,7 +420,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * this does all the dirty work in terms of maintaining the correct * overall modification count. */ -static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, +static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, @@ -487,20 +487,19 @@ static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(ref); + kfree(head_ref); } else { delayed_refs->num_heads++; delayed_refs->num_heads_ready++; delayed_refs->num_entries++; trans->delayed_ref_updates++; } - return 0; } /* * helper to insert a delayed tree ref into the rbtree. */ -static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, +static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, @@ -549,18 +548,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(ref); + kfree(full_ref); } else { delayed_refs->num_entries++; trans->delayed_ref_updates++; } - return 0; } /* * helper to insert a delayed data ref into the rbtree. */ -static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, +static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, @@ -611,12 +609,11 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(ref); + kfree(full_ref); } else { delayed_refs->num_entries++; trans->delayed_ref_updates++; } - return 0; } /* @@ -634,7 +631,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; - int ret; BUG_ON(extent_op && extent_op->is_data); ref = kmalloc(sizeof(*ref), GFP_NOFS); @@ -656,14 +652,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, num_bytes, action, 0); - BUG_ON(ret); - ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, + add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, level, action, for_cow); - BUG_ON(ret); if (!need_ref_seq(for_cow, ref_root) && waitqueue_active(&delayed_refs->seq_wait)) wake_up(&delayed_refs->seq_wait); @@ -685,7 +679,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; - int ret; BUG_ON(extent_op && !extent_op->is_data); ref = kmalloc(sizeof(*ref), GFP_NOFS); @@ -707,14 +700,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, num_bytes, action, 1); - BUG_ON(ret); - ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, + add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, action, for_cow); - BUG_ON(ret); if (!need_ref_seq(for_cow, ref_root) && waitqueue_active(&delayed_refs->seq_wait)) wake_up(&delayed_refs->seq_wait); @@ -729,7 +720,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, { struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; - int ret; head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); if (!head_ref) @@ -740,10 +730,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, num_bytes, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); - BUG_ON(ret); if (waitqueue_active(&delayed_refs->seq_wait)) wake_up(&delayed_refs->seq_wait); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 31d84e78129..c1a074d0696 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -49,9 +49,8 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle di = btrfs_match_dir_item_name(root, path, name, name_len); if (di) return ERR_PTR(-EEXIST); - ret = btrfs_extend_item(trans, root, path, data_size); - } - if (ret < 0) + btrfs_extend_item(trans, root, path, data_size); + } else if (ret < 0) return ERR_PTR(ret); WARN_ON(ret > 0); leaf = path->nodes[0]; @@ -116,6 +115,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, * 'location' is the key to stuff into the directory item, 'type' is the * type of the inode we're pointing to, and 'index' is the sequence number * to use for the second index (if one is created). + * Will return 0 or -ENOMEM */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, @@ -383,8 +383,8 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_len - (ptr + sub_item_len - start)); - ret = btrfs_truncate_item(trans, root, path, - item_len - sub_item_len, 1); + btrfs_truncate_item(trans, root, path, + item_len - sub_item_len, 1); } return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 534266fe505..a7ffc88a7db 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -48,20 +48,19 @@ static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); -static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, +static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only); -static int btrfs_destroy_ordered_operations(struct btrfs_root *root); -static int btrfs_destroy_ordered_extents(struct btrfs_root *root); +static void btrfs_destroy_ordered_operations(struct btrfs_root *root); +static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); -static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); -static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root); +static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); static int btrfs_destroy_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); static int btrfs_destroy_pinned_extent(struct btrfs_root *root, struct extent_io_tree *pinned_extents); -static int btrfs_cleanup_transaction(struct btrfs_root *root); /* * end_io_wq structs are used to do processing in task context when an IO is @@ -99,6 +98,7 @@ struct async_submit_bio { */ u64 bio_offset; struct btrfs_work work; + int error; }; /* @@ -323,7 +323,8 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, * in the wrong place. */ static int verify_parent_transid(struct extent_io_tree *io_tree, - struct extent_buffer *eb, u64 parent_transid) + struct extent_buffer *eb, u64 parent_transid, + int atomic) { struct extent_state *cached_state = NULL; int ret; @@ -331,9 +332,12 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (!parent_transid || btrfs_header_generation(eb) == parent_transid) return 0; + if (atomic) + return -EAGAIN; + lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, - 0, &cached_state, GFP_NOFS); - if (extent_buffer_uptodate(io_tree, eb, cached_state) && + 0, &cached_state); + if (extent_buffer_uptodate(eb) && btrfs_header_generation(eb) == parent_transid) { ret = 0; goto out; @@ -344,7 +348,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, (unsigned long long)parent_transid, (unsigned long long)btrfs_header_generation(eb)); ret = 1; - clear_extent_buffer_uptodate(io_tree, eb, &cached_state); + clear_extent_buffer_uptodate(eb); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state, GFP_NOFS); @@ -360,9 +364,11 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, u64 start, u64 parent_transid) { struct extent_io_tree *io_tree; + int failed = 0; int ret; int num_copies = 0; int mirror_num = 0; + int failed_mirror = 0; clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; @@ -370,9 +376,9 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, ret = read_extent_buffer_pages(io_tree, eb, start, WAIT_COMPLETE, btree_get_extent, mirror_num); - if (!ret && - !verify_parent_transid(io_tree, eb, parent_transid)) - return ret; + if (!ret && !verify_parent_transid(io_tree, eb, + parent_transid, 0)) + break; /* * This buffer's crc is fine, but its contents are corrupted, so @@ -380,18 +386,30 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, * any less wrong. */ if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) - return ret; + break; num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, eb->start, eb->len); if (num_copies == 1) - return ret; + break; + + if (!failed_mirror) { + failed = 1; + failed_mirror = eb->read_mirror; + } mirror_num++; + if (mirror_num == failed_mirror) + mirror_num++; + if (mirror_num > num_copies) - return ret; + break; } - return -EIO; + + if (failed && !ret) + repair_eb_io_failure(root, eb, failed_mirror); + + return ret; } /* @@ -404,50 +422,27 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) struct extent_io_tree *tree; u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 found_start; - unsigned long len; struct extent_buffer *eb; - int ret; tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) { - WARN_ON(1); - goto out; - } - if (!page->private) { - WARN_ON(1); - goto out; - } - len = page->private >> 2; - WARN_ON(len == 0); - - eb = alloc_extent_buffer(tree, start, len, page); - if (eb == NULL) { - WARN_ON(1); - goto out; - } - ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, - btrfs_header_generation(eb)); - BUG_ON(ret); - WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)); - + eb = (struct extent_buffer *)page->private; + if (page != eb->pages[0]) + return 0; found_start = btrfs_header_bytenr(eb); if (found_start != start) { WARN_ON(1); - goto err; + return 0; } - if (eb->first_page != page) { + if (eb->pages[0] != page) { WARN_ON(1); - goto err; + return 0; } if (!PageUptodate(page)) { WARN_ON(1); - goto err; + return 0; } csum_tree_block(root, eb, 0); -err: - free_extent_buffer(eb); -out: return 0; } @@ -537,34 +532,75 @@ static noinline int check_leaf(struct btrfs_root *root, return 0; } +struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree, + struct page *page, int max_walk) +{ + struct extent_buffer *eb; + u64 start = page_offset(page); + u64 target = start; + u64 min_start; + + if (start < max_walk) + min_start = 0; + else + min_start = start - max_walk; + + while (start >= min_start) { + eb = find_extent_buffer(tree, start, 0); + if (eb) { + /* + * we found an extent buffer and it contains our page + * horray! + */ + if (eb->start <= target && + eb->start + eb->len > target) + return eb; + + /* we found an extent buffer that wasn't for us */ + free_extent_buffer(eb); + return NULL; + } + if (start == 0) + break; + start -= PAGE_CACHE_SIZE; + } + return NULL; +} + static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state) + struct extent_state *state, int mirror) { struct extent_io_tree *tree; u64 found_start; int found_level; - unsigned long len; struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; int ret = 0; + int reads_done; - tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) - goto out; if (!page->private) goto out; - len = page->private >> 2; - WARN_ON(len == 0); + tree = &BTRFS_I(page->mapping->host)->io_tree; + eb = (struct extent_buffer *)page->private; + + /* the pending IO might have been the only thing that kept this buffer + * in memory. Make sure we have a ref for all this other checks + */ + extent_buffer_get(eb); + + reads_done = atomic_dec_and_test(&eb->io_pages); + if (!reads_done) + goto err; - eb = alloc_extent_buffer(tree, start, len, page); - if (eb == NULL) { + eb->read_mirror = mirror; + if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { ret = -EIO; - goto out; + goto err; } found_start = btrfs_header_bytenr(eb); - if (found_start != start) { + if (found_start != eb->start) { printk_ratelimited(KERN_INFO "btrfs bad tree block start " "%llu %llu\n", (unsigned long long)found_start, @@ -572,13 +608,6 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, ret = -EIO; goto err; } - if (eb->first_page != page) { - printk(KERN_INFO "btrfs bad first page %lu %lu\n", - eb->first_page->index, page->index); - WARN_ON(1); - ret = -EIO; - goto err; - } if (check_tree_block_fsid(root, eb)) { printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", (unsigned long long)eb->start); @@ -606,48 +635,31 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, ret = -EIO; } - end = min_t(u64, eb->len, PAGE_CACHE_SIZE); - end = eb->start + end - 1; + if (!ret) + set_extent_buffer_uptodate(eb); err: if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); btree_readahead_hook(root, eb, eb->start, ret); } + if (ret) + clear_extent_buffer_uptodate(eb); free_extent_buffer(eb); out: return ret; } -static int btree_io_failed_hook(struct bio *failed_bio, - struct page *page, u64 start, u64 end, - int mirror_num, struct extent_state *state) +static int btree_io_failed_hook(struct page *page, int failed_mirror) { - struct extent_io_tree *tree; - unsigned long len; struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) - goto out; - if (!page->private) - goto out; - - len = page->private >> 2; - WARN_ON(len == 0); - - eb = alloc_extent_buffer(tree, start, len, page); - if (eb == NULL) - goto out; - - if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { - clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + eb = (struct extent_buffer *)page->private; + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + eb->read_mirror = failed_mirror; + if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) btree_readahead_hook(root, eb, eb->start, -EIO); - } - free_extent_buffer(eb); - -out: return -EIO; /* we fixed nothing */ } @@ -719,11 +731,14 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; + int ret; async = container_of(work, struct async_submit_bio, work); - async->submit_bio_start(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags, - async->bio_offset); + ret = async->submit_bio_start(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags, + async->bio_offset); + if (ret) + async->error = ret; } static void run_one_async_done(struct btrfs_work *work) @@ -744,6 +759,12 @@ static void run_one_async_done(struct btrfs_work *work) waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); + /* If an error occured we just want to clean up the bio and move on */ + if (async->error) { + bio_endio(async->bio, async->error); + return; + } + async->submit_bio_done(async->inode, async->rw, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); @@ -785,6 +806,8 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio_flags = bio_flags; async->bio_offset = bio_offset; + async->error = 0; + atomic_inc(&fs_info->nr_async_submits); if (rw & REQ_SYNC) @@ -806,15 +829,18 @@ static int btree_csum_one_bio(struct bio *bio) struct bio_vec *bvec = bio->bi_io_vec; int bio_index = 0; struct btrfs_root *root; + int ret = 0; WARN_ON(bio->bi_vcnt <= 0); while (bio_index < bio->bi_vcnt) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; - csum_dirty_buffer(root, bvec->bv_page); + ret = csum_dirty_buffer(root, bvec->bv_page); + if (ret) + break; bio_index++; bvec++; } - return 0; + return ret; } static int __btree_submit_bio_start(struct inode *inode, int rw, @@ -826,8 +852,7 @@ static int __btree_submit_bio_start(struct inode *inode, int rw, * when we're called for a write, we're already in the async * submission context. Just jump into btrfs_map_bio */ - btree_csum_one_bio(bio); - return 0; + return btree_csum_one_bio(bio); } static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, @@ -847,15 +872,16 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, { int ret; - ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, - bio, 1); - BUG_ON(ret); - if (!(rw & REQ_WRITE)) { + /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads */ + ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, + bio, 1); + if (ret) + return ret; return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 0); } @@ -893,34 +919,6 @@ static int btree_migratepage(struct address_space *mapping, } #endif -static int btree_writepage(struct page *page, struct writeback_control *wbc) -{ - struct extent_io_tree *tree; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct extent_buffer *eb; - int was_dirty; - - tree = &BTRFS_I(page->mapping->host)->io_tree; - if (!(current->flags & PF_MEMALLOC)) { - return extent_write_full_page(tree, page, - btree_get_extent, wbc); - } - - redirty_page_for_writepage(wbc, page); - eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE); - WARN_ON(!eb); - - was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - if (!was_dirty) { - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE; - spin_unlock(&root->fs_info->delalloc_lock); - } - free_extent_buffer(eb); - - unlock_page(page); - return 0; -} static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) @@ -940,7 +938,7 @@ static int btree_writepages(struct address_space *mapping, if (num_dirty < thresh) return 0; } - return extent_writepages(tree, mapping, btree_get_extent, wbc); + return btree_write_cache_pages(mapping, wbc); } static int btree_readpage(struct file *file, struct page *page) @@ -952,16 +950,8 @@ static int btree_readpage(struct file *file, struct page *page) static int btree_releasepage(struct page *page, gfp_t gfp_flags) { - struct extent_io_tree *tree; - struct extent_map_tree *map; - int ret; - if (PageWriteback(page) || PageDirty(page)) return 0; - - tree = &BTRFS_I(page->mapping->host)->io_tree; - map = &BTRFS_I(page->mapping->host)->extent_tree; - /* * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing * slab allocation from alloc_extent_state down the callchain where @@ -969,18 +959,7 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) */ gfp_flags &= ~GFP_SLAB_BUG_MASK; - ret = try_release_extent_state(map, tree, page, gfp_flags); - if (!ret) - return 0; - - ret = try_release_extent_buffer(tree, page); - if (ret == 1) { - ClearPagePrivate(page); - set_page_private(page, 0); - page_cache_release(page); - } - - return ret; + return try_release_extent_buffer(page, gfp_flags); } static void btree_invalidatepage(struct page *page, unsigned long offset) @@ -998,15 +977,28 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) } } +static int btree_set_page_dirty(struct page *page) +{ + struct extent_buffer *eb; + + BUG_ON(!PagePrivate(page)); + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(!atomic_read(&eb->refs)); + btrfs_assert_tree_locked(eb); + return __set_page_dirty_nobuffers(page); +} + static const struct address_space_operations btree_aops = { .readpage = btree_readpage, - .writepage = btree_writepage, .writepages = btree_writepages, .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, #ifdef CONFIG_MIGRATION .migratepage = btree_migratepage, #endif + .set_page_dirty = btree_set_page_dirty, }; int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -1049,7 +1041,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { free_extent_buffer(buf); return -EIO; - } else if (extent_buffer_uptodate(io_tree, buf, NULL)) { + } else if (extent_buffer_uptodate(buf)) { *eb = buf; } else { free_extent_buffer(buf); @@ -1074,20 +1066,20 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, struct extent_buffer *eb; eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, - bytenr, blocksize, NULL); + bytenr, blocksize); return eb; } int btrfs_write_tree_block(struct extent_buffer *buf) { - return filemap_fdatawrite_range(buf->first_page->mapping, buf->start, + return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, buf->start + buf->len - 1); } int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { - return filemap_fdatawait_range(buf->first_page->mapping, + return filemap_fdatawait_range(buf->pages[0]->mapping, buf->start, buf->start + buf->len - 1); } @@ -1102,17 +1094,13 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, return NULL; ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); - - if (ret == 0) - set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); return buf; } -int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf) +void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf) { - struct inode *btree_inode = root->fs_info->btree_inode; if (btrfs_header_generation(buf) == root->fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); @@ -1121,23 +1109,27 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, spin_lock(&root->fs_info->delalloc_lock); if (root->fs_info->dirty_metadata_bytes >= buf->len) root->fs_info->dirty_metadata_bytes -= buf->len; - else - WARN_ON(1); + else { + spin_unlock(&root->fs_info->delalloc_lock); + btrfs_panic(root->fs_info, -EOVERFLOW, + "Can't clear %lu bytes from " + " dirty_mdatadata_bytes (%lu)", + buf->len, + root->fs_info->dirty_metadata_bytes); + } spin_unlock(&root->fs_info->delalloc_lock); } /* ugh, clear_extent_buffer_dirty needs to lock the page */ btrfs_set_lock_blocking(buf); - clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, - buf); + clear_extent_buffer_dirty(buf); } - return 0; } -static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, - u32 stripesize, struct btrfs_root *root, - struct btrfs_fs_info *fs_info, - u64 objectid) +static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + u32 stripesize, struct btrfs_root *root, + struct btrfs_fs_info *fs_info, + u64 objectid) { root->node = NULL; root->commit_root = NULL; @@ -1189,13 +1181,12 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->defrag_running = 0; root->root_key.objectid = objectid; root->anon_dev = 0; - return 0; } -static int find_and_setup_root(struct btrfs_root *tree_root, - struct btrfs_fs_info *fs_info, - u64 objectid, - struct btrfs_root *root) +static int __must_check find_and_setup_root(struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info, + u64 objectid, + struct btrfs_root *root) { int ret; u32 blocksize; @@ -1208,14 +1199,15 @@ static int find_and_setup_root(struct btrfs_root *tree_root, &root->root_item, &root->root_key); if (ret > 0) return -ENOENT; - BUG_ON(ret); + else if (ret < 0) + return ret; generation = btrfs_root_generation(&root->root_item); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->commit_root = NULL; root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); - if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { + if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) { free_extent_buffer(root->node); root->node = NULL; return -EIO; @@ -1377,7 +1369,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); root->commit_root = btrfs_root_node(root); - BUG_ON(!root->node); + BUG_ON(!root->node); /* -ENOMEM */ out: if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { root->ref_cows = 1; @@ -1513,41 +1505,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) return 0; } -static int bio_ready_for_csum(struct bio *bio) -{ - u64 length = 0; - u64 buf_len = 0; - u64 start = 0; - struct page *page; - struct extent_io_tree *io_tree = NULL; - struct bio_vec *bvec; - int i; - int ret; - - bio_for_each_segment(bvec, bio, i) { - page = bvec->bv_page; - if (page->private == EXTENT_PAGE_PRIVATE) { - length += bvec->bv_len; - continue; - } - if (!page->private) { - length += bvec->bv_len; - continue; - } - length = bvec->bv_len; - buf_len = page->private >> 2; - start = page_offset(page) + bvec->bv_offset; - io_tree = &BTRFS_I(page->mapping->host)->io_tree; - } - /* are we fully contained in this bio? */ - if (buf_len <= length) - return 1; - - ret = extent_range_uptodate(io_tree, start + length, - start + buf_len - 1); - return ret; -} - /* * called by the kthread helper functions to finally call the bio end_io * functions. This is where read checksum verification actually happens @@ -1563,17 +1520,6 @@ static void end_workqueue_fn(struct btrfs_work *work) bio = end_io_wq->bio; fs_info = end_io_wq->info; - /* metadata bio reads are special because the whole tree block must - * be checksummed at once. This makes sure the entire block is in - * ram and up to date before trying to verify things. For - * blocksize <= pagesize, it is basically a noop - */ - if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata && - !bio_ready_for_csum(bio)) { - btrfs_queue_worker(&fs_info->endio_meta_workers, - &end_io_wq->work); - return; - } error = end_io_wq->error; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; @@ -1614,9 +1560,10 @@ static int transaction_kthread(void *arg) u64 transid; unsigned long now; unsigned long delay; - int ret; + bool cannot_commit; do { + cannot_commit = false; delay = HZ * 30; vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); @@ -1638,11 +1585,14 @@ static int transaction_kthread(void *arg) transid = cur->transid; spin_unlock(&root->fs_info->trans_lock); + /* If the file system is aborted, this will always fail. */ trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + cannot_commit = true; + goto sleep; + } if (transid == trans->transid) { - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); + btrfs_commit_transaction(trans, root); } else { btrfs_end_transaction(trans, root); } @@ -1653,7 +1603,8 @@ sleep: if (!try_to_freeze()) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop() && - !btrfs_transaction_blocked(root->fs_info)) + (!btrfs_transaction_blocked(root->fs_info) || + cannot_commit)) schedule_timeout(delay); __set_current_state(TASK_RUNNING); } @@ -2042,6 +1993,7 @@ int open_ctree(struct super_block *sb, RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, fs_info->btree_inode->i_mapping); + BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0; extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; @@ -2084,6 +2036,7 @@ int open_ctree(struct super_block *sb, __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); + invalidate_bdev(fs_devices->latest_bdev); bh = btrfs_read_dev_super(fs_devices->latest_bdev); if (!bh) { err = -EINVAL; @@ -2104,7 +2057,12 @@ int open_ctree(struct super_block *sb, /* check FS state, whether FS is broken. */ fs_info->fs_state |= btrfs_super_flags(disk_super); - btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + if (ret) { + printk(KERN_ERR "btrfs: superblock contains fatal errors\n"); + err = ret; + goto fail_alloc; + } /* * run through our array of backup supers and setup @@ -2135,10 +2093,55 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } + if (btrfs_super_leafsize(disk_super) != + btrfs_super_nodesize(disk_super)) { + printk(KERN_ERR "BTRFS: couldn't mount because metadata " + "blocksizes don't match. node %d leaf %d\n", + btrfs_super_nodesize(disk_super), + btrfs_super_leafsize(disk_super)); + err = -EINVAL; + goto fail_alloc; + } + if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: couldn't mount because metadata " + "blocksize (%d) was too large\n", + btrfs_super_leafsize(disk_super)); + err = -EINVAL; + goto fail_alloc; + } + features = btrfs_super_incompat_flags(disk_super); features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + + /* + * flag our filesystem as having big metadata blocks if + * they are bigger than the page size + */ + if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) { + if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) + printk(KERN_INFO "btrfs flagging fs with big metadata feature\n"); + features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; + } + + nodesize = btrfs_super_nodesize(disk_super); + leafsize = btrfs_super_leafsize(disk_super); + sectorsize = btrfs_super_sectorsize(disk_super); + stripesize = btrfs_super_stripesize(disk_super); + + /* + * mixed block groups end up with duplicate but slightly offset + * extent buffers for the same range. It leads to corruptions + */ + if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && + (sectorsize != leafsize)) { + printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes " + "are not allowed for mixed block groups on %s\n", + sb->s_id); + goto fail_alloc; + } + btrfs_set_super_incompat_flags(disk_super, features); features = btrfs_super_compat_ro_flags(disk_super) & @@ -2242,10 +2245,6 @@ int open_ctree(struct super_block *sb, fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 4 * 1024 * 1024 / PAGE_CACHE_SIZE); - nodesize = btrfs_super_nodesize(disk_super); - leafsize = btrfs_super_leafsize(disk_super); - sectorsize = btrfs_super_sectorsize(disk_super); - stripesize = btrfs_super_stripesize(disk_super); tree_root->nodesize = nodesize; tree_root->leafsize = leafsize; tree_root->sectorsize = sectorsize; @@ -2260,9 +2259,9 @@ int open_ctree(struct super_block *sb, goto fail_sb_buffer; } - if (sectorsize < PAGE_SIZE) { - printk(KERN_WARNING "btrfs: Incompatible sector size " - "found on %s\n", sb->s_id); + if (sectorsize != PAGE_SIZE) { + printk(KERN_WARNING "btrfs: Incompatible sector size(%lu) " + "found on %s\n", (unsigned long)sectorsize, sb->s_id); goto fail_sb_buffer; } @@ -2285,7 +2284,7 @@ int open_ctree(struct super_block *sb, chunk_root->node = read_tree_block(chunk_root, btrfs_super_chunk_root(disk_super), blocksize, generation); - BUG_ON(!chunk_root->node); + BUG_ON(!chunk_root->node); /* -ENOMEM */ if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", sb->s_id); @@ -2425,21 +2424,31 @@ retry_root_backup: log_tree_root->node = read_tree_block(tree_root, bytenr, blocksize, generation + 1); + /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); - BUG_ON(ret); + if (ret) { + btrfs_error(tree_root->fs_info, ret, + "Failed to recover log tree"); + free_extent_buffer(log_tree_root->node); + kfree(log_tree_root); + goto fail_trans_kthread; + } if (sb->s_flags & MS_RDONLY) { - ret = btrfs_commit_super(tree_root); - BUG_ON(ret); + ret = btrfs_commit_super(tree_root); + if (ret) + goto fail_trans_kthread; } } ret = btrfs_find_orphan_roots(tree_root); - BUG_ON(ret); + if (ret) + goto fail_trans_kthread; if (!(sb->s_flags & MS_RDONLY)) { ret = btrfs_cleanup_fs_roots(fs_info); - BUG_ON(ret); + if (ret) { + } ret = btrfs_recover_relocation(tree_root); if (ret < 0) { @@ -2859,6 +2868,8 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) if (total_errors > max_errors) { printk(KERN_ERR "btrfs: %d errors while writing supers\n", total_errors); + + /* This shouldn't happen. FUA is masked off if unsupported */ BUG(); } @@ -2875,9 +2886,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) } mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { - printk(KERN_ERR "btrfs: %d errors while writing supers\n", - total_errors); - BUG(); + btrfs_error(root->fs_info, -EIO, + "%d errors while writing supers", total_errors); + return -EIO; } return 0; } @@ -2891,7 +2902,20 @@ int write_ctree_super(struct btrfs_trans_handle *trans, return ret; } -int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +/* Kill all outstanding I/O */ +void btrfs_abort_devices(struct btrfs_root *root) +{ + struct list_head *head; + struct btrfs_device *dev; + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + head = &root->fs_info->fs_devices->devices; + list_for_each_entry_rcu(dev, head, dev_list) { + blk_abort_queue(dev->bdev->bd_disk->queue); + } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); +} + +void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, @@ -2904,7 +2928,6 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) __btrfs_remove_free_space_cache(root->free_ino_pinned); __btrfs_remove_free_space_cache(root->free_ino_ctl); free_fs_root(root); - return 0; } static void free_fs_root(struct btrfs_root *root) @@ -2921,7 +2944,7 @@ static void free_fs_root(struct btrfs_root *root) kfree(root); } -static int del_fs_roots(struct btrfs_fs_info *fs_info) +static void del_fs_roots(struct btrfs_fs_info *fs_info) { int ret; struct btrfs_root *gang[8]; @@ -2950,7 +2973,6 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info) for (i = 0; i < ret; i++) btrfs_free_fs_root(fs_info, gang[i]); } - return 0; } int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) @@ -2999,14 +3021,21 @@ int btrfs_commit_super(struct btrfs_root *root) if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); + if (ret) + return ret; /* run commit again to drop the original snapshot */ trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; ret = btrfs_write_and_wait_transaction(NULL, root); - BUG_ON(ret); + if (ret) { + btrfs_error(root->fs_info, ret, + "Failed to sync btree inode to disk."); + return ret; + } ret = write_ctree_super(NULL, root, 0); return ret; @@ -3119,33 +3148,32 @@ int close_ctree(struct btrfs_root *root) return 0; } -int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, + int atomic) { int ret; - struct inode *btree_inode = buf->first_page->mapping->host; + struct inode *btree_inode = buf->pages[0]->mapping->host; - ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf, - NULL); + ret = extent_buffer_uptodate(buf); if (!ret) return ret; ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, - parent_transid); + parent_transid, atomic); + if (ret == -EAGAIN) + return ret; return !ret; } int btrfs_set_buffer_uptodate(struct extent_buffer *buf) { - struct inode *btree_inode = buf->first_page->mapping->host; - return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, - buf); + return set_extent_buffer_uptodate(buf); } void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { - struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; u64 transid = btrfs_header_generation(buf); - struct inode *btree_inode = root->fs_info->btree_inode; int was_dirty; btrfs_assert_tree_locked(buf); @@ -3157,8 +3185,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) (unsigned long long)root->fs_info->generation); WARN_ON(1); } - was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, - buf); + was_dirty = set_extent_buffer_dirty(buf); if (!was_dirty) { spin_lock(&root->fs_info->delalloc_lock); root->fs_info->dirty_metadata_bytes += buf->len; @@ -3212,12 +3239,8 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) { - struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; - int ret; - ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); - if (ret == 0) - set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); - return ret; + struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; + return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); } static int btree_lock_page_hook(struct page *page, void *data, @@ -3225,17 +3248,21 @@ static int btree_lock_page_hook(struct page *page, void *data, { struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_buffer *eb; - unsigned long len; - u64 bytenr = page_offset(page); - if (page->private == EXTENT_PAGE_PRIVATE) + /* + * We culled this eb but the page is still hanging out on the mapping, + * carry on. + */ + if (!PagePrivate(page)) goto out; - len = page->private >> 2; - eb = find_extent_buffer(io_tree, bytenr, len); - if (!eb) + eb = (struct extent_buffer *)page->private; + if (!eb) { + WARN_ON(1); + goto out; + } + if (page != eb->pages[0]) goto out; if (!btrfs_try_tree_write_lock(eb)) { @@ -3254,7 +3281,6 @@ static int btree_lock_page_hook(struct page *page, void *data, } btrfs_tree_unlock(eb); - free_extent_buffer(eb); out: if (!trylock_page(page)) { flush_fn(data); @@ -3263,15 +3289,23 @@ out: return 0; } -static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, +static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { + if (btrfs_super_csum_type(fs_info->super_copy) >= ARRAY_SIZE(btrfs_csum_sizes)) { + printk(KERN_ERR "btrfs: unsupported checksum algorithm\n"); + return -EINVAL; + } + if (read_only) - return; + return 0; - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { printk(KERN_WARNING "warning: mount fs with errors, " "running btrfsck is recommended\n"); + } + + return 0; } int btrfs_error_commit_super(struct btrfs_root *root) @@ -3293,7 +3327,7 @@ int btrfs_error_commit_super(struct btrfs_root *root) return ret; } -static int btrfs_destroy_ordered_operations(struct btrfs_root *root) +static void btrfs_destroy_ordered_operations(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; struct list_head splice; @@ -3315,11 +3349,9 @@ static int btrfs_destroy_ordered_operations(struct btrfs_root *root) spin_unlock(&root->fs_info->ordered_extent_lock); mutex_unlock(&root->fs_info->ordered_operations_mutex); - - return 0; } -static int btrfs_destroy_ordered_extents(struct btrfs_root *root) +static void btrfs_destroy_ordered_extents(struct btrfs_root *root) { struct list_head splice; struct btrfs_ordered_extent *ordered; @@ -3351,12 +3383,10 @@ static int btrfs_destroy_ordered_extents(struct btrfs_root *root) } spin_unlock(&root->fs_info->ordered_extent_lock); - - return 0; } -static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_root *root) +int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root) { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; @@ -3365,6 +3395,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, delayed_refs = &trans->delayed_refs; +again: spin_lock(&delayed_refs->lock); if (delayed_refs->num_entries == 0) { spin_unlock(&delayed_refs->lock); @@ -3386,6 +3417,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_delayed_ref_head *head; head = btrfs_delayed_node_to_head(ref); + spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); kfree(head->extent_op); delayed_refs->num_heads--; @@ -3393,8 +3425,9 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, delayed_refs->num_heads_ready--; list_del_init(&head->cluster); mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(ref); + goto again; } - spin_unlock(&delayed_refs->lock); btrfs_put_delayed_ref(ref); @@ -3407,7 +3440,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, return ret; } -static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) { struct btrfs_pending_snapshot *snapshot; struct list_head splice; @@ -3425,11 +3458,9 @@ static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) kfree(snapshot); } - - return 0; } -static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root) +static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; struct list_head splice; @@ -3449,8 +3480,6 @@ static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root) } spin_unlock(&root->fs_info->delalloc_lock); - - return 0; } static int btrfs_destroy_marked_extents(struct btrfs_root *root, @@ -3541,13 +3570,43 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, return 0; } -static int btrfs_cleanup_transaction(struct btrfs_root *root) +void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, + struct btrfs_root *root) +{ + btrfs_destroy_delayed_refs(cur_trans, root); + btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, + cur_trans->dirty_pages.dirty_bytes); + + /* FIXME: cleanup wait for commit */ + cur_trans->in_commit = 1; + cur_trans->blocked = 1; + if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) + wake_up(&root->fs_info->transaction_blocked_wait); + + cur_trans->blocked = 0; + if (waitqueue_active(&root->fs_info->transaction_wait)) + wake_up(&root->fs_info->transaction_wait); + + cur_trans->commit_done = 1; + if (waitqueue_active(&cur_trans->commit_wait)) + wake_up(&cur_trans->commit_wait); + + btrfs_destroy_pending_snapshots(cur_trans); + + btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages, + EXTENT_DIRTY); + + /* + memset(cur_trans, 0, sizeof(*cur_trans)); + kmem_cache_free(btrfs_transaction_cachep, cur_trans); + */ +} + +int btrfs_cleanup_transaction(struct btrfs_root *root) { struct btrfs_transaction *t; LIST_HEAD(list); - WARN_ON(1); - mutex_lock(&root->fs_info->transaction_kthread_mutex); spin_lock(&root->fs_info->trans_lock); @@ -3612,6 +3671,17 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) return 0; } +static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state) +{ + struct super_block *sb = page->mapping->host->i_sb; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + btrfs_error(fs_info, -EIO, + "Error occured while writing out btree at %llu", start); + return -EIO; +} + static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, @@ -3619,4 +3689,5 @@ static struct extent_io_ops btree_extent_io_ops = { .submit_bio_hook = btree_submit_bio_hook, /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, + .writepage_io_failed_hook = btree_writepage_io_failed_hook, }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index e4bc4741319..ab1830aaf0e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -44,8 +44,8 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); -int clean_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf); +void clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf); int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); @@ -64,9 +64,10 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); -int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); +void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); -int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, + int atomic); int btrfs_set_buffer_uptodate(struct extent_buffer *buf); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); @@ -85,6 +86,10 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_cleanup_transaction(struct btrfs_root *root); +void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, + struct btrfs_root *root); +void btrfs_abort_devices(struct btrfs_root *root); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 5f77166fd01..e887ee62b6d 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -193,7 +193,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child) if (ret < 0) goto fail; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Key with offset of -1 found */ if (path->slots[0] == 0) { ret = -ENOENT; goto fail; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f6f584fd549..49fd7b66d57 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -245,7 +245,7 @@ static int exclude_super_stripes(struct btrfs_root *root, cache->bytes_super += stripe_len; ret = add_excluded_extent(root, cache->key.objectid, stripe_len); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -253,13 +253,13 @@ static int exclude_super_stripes(struct btrfs_root *root, ret = btrfs_rmap_block(&root->fs_info->mapping_tree, cache->key.objectid, bytenr, 0, &logical, &nr, &stripe_len); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ while (nr--) { cache->bytes_super += stripe_len; ret = add_excluded_extent(root, logical[nr], stripe_len); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } kfree(logical); @@ -321,7 +321,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, total_added += size; ret = btrfs_add_free_space(block_group, start, size); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or logic error */ start = extent_end + 1; } else { break; @@ -332,7 +332,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, size = end - start; total_added += size; ret = btrfs_add_free_space(block_group, start, size); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or logic error */ } return total_added; @@ -474,7 +474,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, int ret = 0; caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); - BUG_ON(!caching_ctl); + if (!caching_ctl) + return -ENOMEM; INIT_LIST_HEAD(&caching_ctl->list); mutex_init(&caching_ctl->mutex); @@ -528,9 +529,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, * allocate blocks for the tree root we can't do the fast caching since * we likely hold important locks. */ - if (trans && (!trans->transaction->in_commit) && - (root && root != root->fs_info->tree_root) && - btrfs_test_opt(root, SPACE_CACHE)) { + if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); @@ -982,7 +981,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, ret = btrfs_next_leaf(root, path); if (ret < 0) return ret; - BUG_ON(ret > 0); + BUG_ON(ret > 0); /* Corruption */ leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, @@ -1008,9 +1007,9 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, new_size + extra_size, 1); if (ret < 0) return ret; - BUG_ON(ret); + BUG_ON(ret); /* Corruption */ - ret = btrfs_extend_item(trans, root, path, new_size); + btrfs_extend_item(trans, root, path, new_size); leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -1478,7 +1477,11 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, err = ret; goto out; } - BUG_ON(ret); + if (ret && !insert) { + err = -ENOENT; + goto out; + } + BUG_ON(ret); /* Corruption */ leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); @@ -1592,13 +1595,13 @@ out: * helper to add new inline back ref */ static noinline_for_stack -int setup_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add, - struct btrfs_delayed_extent_op *extent_op) +void setup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -1608,7 +1611,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans, u64 refs; int size; int type; - int ret; leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -1617,7 +1619,7 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans, type = extent_ref_type(parent, owner); size = btrfs_extent_inline_ref_size(type); - ret = btrfs_extend_item(trans, root, path, size); + btrfs_extend_item(trans, root, path, size); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); @@ -1652,7 +1654,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans, btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } btrfs_mark_buffer_dirty(leaf); - return 0; } static int lookup_extent_backref(struct btrfs_trans_handle *trans, @@ -1687,12 +1688,12 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, * helper to update/remove inline back ref */ static noinline_for_stack -int update_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op) +void update_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_mod, + struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -1703,7 +1704,6 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans, u32 item_size; int size; int type; - int ret; u64 refs; leaf = path->nodes[0]; @@ -1745,10 +1745,9 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans, memmove_extent_buffer(leaf, ptr, ptr + size, end - ptr - size); item_size -= size; - ret = btrfs_truncate_item(trans, root, path, item_size, 1); + btrfs_truncate_item(trans, root, path, item_size, 1); } btrfs_mark_buffer_dirty(leaf); - return 0; } static noinline_for_stack @@ -1768,13 +1767,13 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, root_objectid, owner, offset, 1); if (ret == 0) { BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); - ret = update_inline_extent_backref(trans, root, path, iref, - refs_to_add, extent_op); + update_inline_extent_backref(trans, root, path, iref, + refs_to_add, extent_op); } else if (ret == -ENOENT) { - ret = setup_inline_extent_backref(trans, root, path, iref, - parent, root_objectid, - owner, offset, refs_to_add, - extent_op); + setup_inline_extent_backref(trans, root, path, iref, parent, + root_objectid, owner, offset, + refs_to_add, extent_op); + ret = 0; } return ret; } @@ -1804,12 +1803,12 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_extent_inline_ref *iref, int refs_to_drop, int is_data) { - int ret; + int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); if (iref) { - ret = update_inline_extent_backref(trans, root, path, iref, - -refs_to_drop, NULL); + update_inline_extent_backref(trans, root, path, iref, + -refs_to_drop, NULL); } else if (is_data) { ret = remove_extent_data_ref(trans, root, path, refs_to_drop); } else { @@ -1835,6 +1834,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, bytenr, &num_bytes, &bbio, 0); + /* Error condition is -ENOMEM */ if (!ret) { struct btrfs_bio_stripe *stripe = bbio->stripes; int i; @@ -1850,7 +1850,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, if (!ret) discarded_bytes += stripe->length; else if (ret != -EOPNOTSUPP) - break; + break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ /* * Just in case we get back EOPNOTSUPP for some reason, @@ -1869,6 +1869,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, return ret; } +/* Can return -ENOMEM */ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -1944,7 +1945,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ret = insert_extent_backref(trans, root->fs_info->extent_root, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, root, ret); out: btrfs_free_path(path); return err; @@ -2031,6 +2033,9 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, int ret; int err = 0; + if (trans->aborted) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2128,7 +2133,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op, int insert_reserved) { - int ret; + int ret = 0; + + if (trans->aborted) + return 0; + if (btrfs_delayed_ref_is_head(node)) { struct btrfs_delayed_ref_head *head; /* @@ -2146,11 +2155,10 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, ret = btrfs_del_csums(trans, root, node->bytenr, node->num_bytes); - BUG_ON(ret); } } mutex_unlock(&head->mutex); - return 0; + return ret; } if (node->type == BTRFS_TREE_BLOCK_REF_KEY || @@ -2197,6 +2205,10 @@ again: return NULL; } +/* + * Returns 0 on success or if called with an already aborted transaction. + * Returns -ENOMEM or -EIO on failure and will abort the transaction. + */ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct list_head *cluster) @@ -2285,9 +2297,14 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ret = run_delayed_extent_op(trans, root, ref, extent_op); - BUG_ON(ret); kfree(extent_op); + if (ret) { + printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); + spin_lock(&delayed_refs->lock); + return ret; + } + goto next; } @@ -2308,11 +2325,17 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ret = run_one_delayed_ref(trans, root, ref, extent_op, must_insert_reserved); - BUG_ON(ret); btrfs_put_delayed_ref(ref); kfree(extent_op); count++; + + if (ret) { + printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); + spin_lock(&delayed_refs->lock); + return ret; + } + next: do_chunk_alloc(trans, root->fs_info->extent_root, 2 * 1024 * 1024, @@ -2347,6 +2370,9 @@ static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, * 0, which means to process everything in the tree at the start * of the run (but not newly added entries), or it can be some target * number you'd like to process. + * + * Returns 0 on success or if called with an aborted transaction + * Returns <0 on error and aborts the transaction */ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count) @@ -2362,6 +2388,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long num_refs = 0; int consider_waiting; + /* We'll clean this up in btrfs_cleanup_transaction */ + if (trans->aborted) + return 0; + if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; @@ -2419,7 +2449,11 @@ again: } ret = run_clustered_refs(trans, root, &cluster); - BUG_ON(ret < 0); + if (ret < 0) { + spin_unlock(&delayed_refs->lock); + btrfs_abort_transaction(trans, root, ret); + return ret; + } count -= min_t(unsigned long, ret, count); @@ -2584,7 +2618,7 @@ static noinline int check_committed_ref(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ ret = -ENOENT; if (path->slots[0] == 0) @@ -2738,7 +2772,6 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, } return 0; fail: - BUG(); return ret; } @@ -2767,7 +2800,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); if (ret < 0) goto fail; - BUG_ON(ret); + BUG_ON(ret); /* Corruption */ leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -2775,8 +2808,10 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); fail: - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); return ret; + } return 0; } @@ -2949,7 +2984,8 @@ again: if (last == 0) { err = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(err); + if (err) /* File system offline */ + goto out; } cache = btrfs_lookup_first_block_group(root->fs_info, last); @@ -2976,7 +3012,9 @@ again: last = cache->key.objectid + cache->key.offset; err = write_one_cache_group(trans, root, path, cache); - BUG_ON(err); + if (err) /* File system offline */ + goto out; + btrfs_put_block_group(cache); } @@ -2989,7 +3027,8 @@ again: if (last == 0) { err = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(err); + if (err) /* File system offline */ + goto out; } cache = btrfs_lookup_first_block_group(root->fs_info, last); @@ -3014,20 +3053,21 @@ again: continue; } - btrfs_write_out_cache(root, trans, cache, path); + err = btrfs_write_out_cache(root, trans, cache, path); /* * If we didn't have an error then the cache state is still * NEED_WRITE, so we can set it to WRITTEN. */ - if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) + if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) cache->disk_cache_state = BTRFS_DC_WRITTEN; last = cache->key.objectid + cache->key.offset; btrfs_put_block_group(cache); } +out: btrfs_free_path(path); - return 0; + return err; } int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -3098,11 +3138,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { - u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; - - /* chunk -> extended profile */ - if (extra_flags == 0) - extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits |= extra_flags; @@ -3113,6 +3150,34 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } /* + * returns target flags in extended format or 0 if restripe for this + * chunk_type is not in progress + * + * should be called with either volume_mutex or balance_lock held + */ +static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + u64 target = 0; + + if (!bctl) + return 0; + + if (flags & BTRFS_BLOCK_GROUP_DATA && + bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && + bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if (flags & BTRFS_BLOCK_GROUP_METADATA && + bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + return target; +} + +/* * @flags: available profiles in extended format (see ctree.h) * * Returns reduced profile in chunk format. If profile changing is in @@ -3128,31 +3193,19 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) */ u64 num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; + u64 target; - /* pick restriper's target profile if it's available */ + /* + * see if restripe for this chunk_type is in progress, if so + * try to reduce to the target profile + */ spin_lock(&root->fs_info->balance_lock); - if (root->fs_info->balance_ctl) { - struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; - u64 tgt = 0; - - if ((flags & BTRFS_BLOCK_GROUP_DATA) && - (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->data.target)) { - tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; - } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) && - (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->sys.target)) { - tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; - } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) && - (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->meta.target)) { - tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; - } - - if (tgt) { + target = get_restripe_target(root->fs_info, flags); + if (target) { + /* pick target profile only if it's already available */ + if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { spin_unlock(&root->fs_info->balance_lock); - flags = tgt; - goto out; + return extended_to_chunk(target); } } spin_unlock(&root->fs_info->balance_lock); @@ -3180,10 +3233,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) flags &= ~BTRFS_BLOCK_GROUP_RAID0; } -out: - /* extended -> chunk profile */ - flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; - return flags; + return extended_to_chunk(flags); } static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) @@ -3312,8 +3362,7 @@ commit_trans: } data_sinfo->bytes_may_use += bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)(unsigned long)data_sinfo, - bytes, 1); + data_sinfo->flags, bytes, 1); spin_unlock(&data_sinfo->lock); return 0; @@ -3334,8 +3383,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)(unsigned long)data_sinfo, - bytes, 0); + data_sinfo->flags, bytes, 0); spin_unlock(&data_sinfo->lock); } @@ -3396,6 +3444,50 @@ static int should_alloc_chunk(struct btrfs_root *root, return 1; } +static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) +{ + u64 num_dev; + + if (type & BTRFS_BLOCK_GROUP_RAID10 || + type & BTRFS_BLOCK_GROUP_RAID0) + num_dev = root->fs_info->fs_devices->rw_devices; + else if (type & BTRFS_BLOCK_GROUP_RAID1) + num_dev = 2; + else + num_dev = 1; /* DUP or single */ + + /* metadata for updaing devices and chunk tree */ + return btrfs_calc_trans_metadata_size(root, num_dev + 1); +} + +static void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type) +{ + struct btrfs_space_info *info; + u64 left; + u64 thresh; + + info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + spin_lock(&info->lock); + left = info->total_bytes - info->bytes_used - info->bytes_pinned - + info->bytes_reserved - info->bytes_readonly; + spin_unlock(&info->lock); + + thresh = get_system_chunk_thresh(root, type); + if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { + printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n", + left, thresh, type); + dump_space_info(info, 0, 0); + } + + if (left < thresh) { + u64 flags; + + flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); + btrfs_alloc_chunk(trans, root, flags); + } +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force) @@ -3405,15 +3497,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, int wait_for_alloc = 0; int ret = 0; - BUG_ON(!profile_is_valid(flags, 0)); - space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { ret = update_space_info(extent_root->fs_info, flags, 0, 0, &space_info); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } - BUG_ON(!space_info); + BUG_ON(!space_info); /* Logic error */ again: spin_lock(&space_info->lock); @@ -3468,6 +3558,12 @@ again: force_metadata_allocation(fs_info); } + /* + * Check if we have enough space in SYSTEM chunk because we may need + * to update devices. + */ + check_system_chunk(trans, extent_root, flags); + ret = btrfs_alloc_chunk(trans, extent_root, flags); if (ret < 0 && ret != -ENOSPC) goto out; @@ -3675,9 +3771,8 @@ again: */ if (current->journal_info) return -EAGAIN; - ret = wait_event_interruptible(space_info->wait, - !space_info->flush); - /* Must have been interrupted, return */ + ret = wait_event_killable(space_info->wait, !space_info->flush); + /* Must have been killed, return */ if (ret) return -EINTR; @@ -3700,9 +3795,7 @@ again: if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)(unsigned long)space_info, - orig_bytes, 1); + "space_info", space_info->flags, orig_bytes, 1); ret = 0; } else { /* @@ -3771,9 +3864,7 @@ again: if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)(unsigned long)space_info, - orig_bytes, 1); + "space_info", space_info->flags, orig_bytes, 1); ret = 0; } else { wait_ordered = true; @@ -3836,8 +3927,9 @@ out: return ret; } -static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static struct btrfs_block_rsv *get_block_rsv( + const struct btrfs_trans_handle *trans, + const struct btrfs_root *root) { struct btrfs_block_rsv *block_rsv = NULL; @@ -3918,8 +4010,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); space_info->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)(unsigned long)space_info, - num_bytes, 0); + space_info->flags, num_bytes, 0); space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -4123,8 +4214,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = calc_global_metadata_size(fs_info); - spin_lock(&block_rsv->lock); spin_lock(&sinfo->lock); + spin_lock(&block_rsv->lock); block_rsv->size = num_bytes; @@ -4137,21 +4228,21 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->reserved += num_bytes; sinfo->bytes_may_use += num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)(unsigned long)sinfo, num_bytes, 1); + sinfo->flags, num_bytes, 1); } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)(unsigned long)sinfo, num_bytes, 0); + sinfo->flags, num_bytes, 0); sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; } - spin_unlock(&sinfo->lock); spin_unlock(&block_rsv->lock); + spin_unlock(&sinfo->lock); } static void init_global_block_rsv(struct btrfs_fs_info *fs_info) @@ -4198,12 +4289,12 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, return; trace_btrfs_space_reservation(root->fs_info, "transaction", - (u64)(unsigned long)trans, - trans->bytes_reserved, 0); + trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } +/* Can only return 0 or -ENOSPC */ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode) { @@ -4540,7 +4631,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, while (total) { cache = btrfs_lookup_block_group(info, bytenr); if (!cache) - return -1; + return -ENOENT; if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) @@ -4643,7 +4734,7 @@ int btrfs_pin_extent(struct btrfs_root *root, struct btrfs_block_group_cache *cache; cache = btrfs_lookup_block_group(root->fs_info, bytenr); - BUG_ON(!cache); + BUG_ON(!cache); /* Logic error */ pin_down_extent(root, cache, bytenr, num_bytes, reserved); @@ -4661,7 +4752,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *cache; cache = btrfs_lookup_block_group(root->fs_info, bytenr); - BUG_ON(!cache); + BUG_ON(!cache); /* Logic error */ /* * pull in the free space cache (if any) so that our pin @@ -4706,6 +4797,7 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, { struct btrfs_space_info *space_info = cache->space_info; int ret = 0; + spin_lock(&space_info->lock); spin_lock(&cache->lock); if (reserve != RESERVE_FREE) { @@ -4716,9 +4808,8 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, space_info->bytes_reserved += num_bytes; if (reserve == RESERVE_ALLOC) { trace_btrfs_space_reservation(cache->fs_info, - "space_info", - (u64)(unsigned long)space_info, - num_bytes, 0); + "space_info", space_info->flags, + num_bytes, 0); space_info->bytes_may_use -= num_bytes; } } @@ -4734,7 +4825,7 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, return ret; } -int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, +void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4764,7 +4855,6 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, up_write(&fs_info->extent_commit_sem); update_global_block_rsv(fs_info); - return 0; } static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) @@ -4779,7 +4869,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) if (cache) btrfs_put_block_group(cache); cache = btrfs_lookup_block_group(fs_info, start); - BUG_ON(!cache); + BUG_ON(!cache); /* Logic error */ } len = cache->key.objectid + cache->key.offset - start; @@ -4816,6 +4906,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, u64 end; int ret; + if (trans->aborted) + return 0; + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) unpin = &fs_info->freed_extents[1]; else @@ -4901,7 +4994,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, is_data); - BUG_ON(ret); + if (ret) + goto abort; btrfs_release_path(path); path->leave_spinning = 1; @@ -4919,10 +5013,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); } - BUG_ON(ret); + if (ret < 0) + goto abort; extent_slot = path->slots[0]; } - } else { + } else if (ret == -ENOENT) { btrfs_print_leaf(extent_root, path->nodes[0]); WARN_ON(1); printk(KERN_ERR "btrfs unable to find ref byte nr %llu " @@ -4932,6 +5027,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, (unsigned long long)root_objectid, (unsigned long long)owner_objectid, (unsigned long long)owner_offset); + } else { + goto abort; } leaf = path->nodes[0]; @@ -4941,7 +5038,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(found_extent || extent_slot != path->slots[0]); ret = convert_extent_item_v0(trans, extent_root, path, owner_objectid, 0); - BUG_ON(ret < 0); + if (ret < 0) + goto abort; btrfs_release_path(path); path->leave_spinning = 1; @@ -4958,7 +5056,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, (unsigned long long)bytenr); btrfs_print_leaf(extent_root, path->nodes[0]); } - BUG_ON(ret); + if (ret < 0) + goto abort; extent_slot = path->slots[0]; leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, extent_slot); @@ -4995,7 +5094,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, is_data); - BUG_ON(ret); + if (ret) + goto abort; } } else { if (found_extent) { @@ -5012,23 +5112,27 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); - BUG_ON(ret); + if (ret) + goto abort; btrfs_release_path(path); if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); - BUG_ON(ret); - } else { - invalidate_mapping_pages(info->btree_inode->i_mapping, - bytenr >> PAGE_CACHE_SHIFT, - (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); + if (ret) + goto abort; } ret = update_block_group(trans, root, bytenr, num_bytes, 0); - BUG_ON(ret); + if (ret) + goto abort; } +out: btrfs_free_path(path); return ret; + +abort: + btrfs_abort_transaction(trans, extent_root, ret); + goto out; } /* @@ -5124,7 +5228,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, parent, root->root_key.objectid, btrfs_header_level(buf), BTRFS_DROP_DELAYED_REF, NULL, for_cow); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } if (!last_ref) @@ -5158,6 +5262,7 @@ out: btrfs_put_block_group(cache); } +/* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int for_cow) @@ -5179,14 +5284,12 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, num_bytes, parent, root_objectid, (int)owner, BTRFS_DROP_DELAYED_REF, NULL, for_cow); - BUG_ON(ret); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, offset, BTRFS_DROP_DELAYED_REF, NULL, for_cow); - BUG_ON(ret); } return ret; } @@ -5243,28 +5346,34 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) return 0; } -static int get_block_group_index(struct btrfs_block_group_cache *cache) +static int __get_block_group_index(u64 flags) { int index; - if (cache->flags & BTRFS_BLOCK_GROUP_RAID10) + + if (flags & BTRFS_BLOCK_GROUP_RAID10) index = 0; - else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1) + else if (flags & BTRFS_BLOCK_GROUP_RAID1) index = 1; - else if (cache->flags & BTRFS_BLOCK_GROUP_DUP) + else if (flags & BTRFS_BLOCK_GROUP_DUP) index = 2; - else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0) + else if (flags & BTRFS_BLOCK_GROUP_RAID0) index = 3; else index = 4; + return index; } +static int get_block_group_index(struct btrfs_block_group_cache *cache) +{ + return __get_block_group_index(cache->flags); +} + enum btrfs_loop_type { - LOOP_FIND_IDEAL = 0, - LOOP_CACHING_NOWAIT = 1, - LOOP_CACHING_WAIT = 2, - LOOP_ALLOC_CHUNK = 3, - LOOP_NO_EMPTY_SIZE = 4, + LOOP_CACHING_NOWAIT = 0, + LOOP_CACHING_WAIT = 1, + LOOP_ALLOC_CHUNK = 2, + LOOP_NO_EMPTY_SIZE = 3, }; /* @@ -5278,7 +5387,6 @@ enum btrfs_loop_type { static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *orig_root, u64 num_bytes, u64 empty_size, - u64 search_start, u64 search_end, u64 hint_byte, struct btrfs_key *ins, u64 data) { @@ -5287,6 +5395,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; struct btrfs_block_group_cache *used_block_group; + u64 search_start = 0; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; int done_chunk_alloc = 0; @@ -5300,8 +5409,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, bool failed_alloc = false; bool use_cluster = true; bool have_caching_bg = false; - u64 ideal_cache_percent = 0; - u64 ideal_cache_offset = 0; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -5351,7 +5458,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, empty_cluster = 0; if (search_start == hint_byte) { -ideal_cache: block_group = btrfs_lookup_block_group(root->fs_info, search_start); used_block_group = block_group; @@ -5363,8 +5469,7 @@ ideal_cache: * picked out then we don't care that the block group is cached. */ if (block_group && block_group_bits(block_group, data) && - (block_group->cached != BTRFS_CACHE_NO || - search_start == ideal_cache_offset)) { + block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || block_group->ro) { @@ -5418,44 +5523,13 @@ search: have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { - u64 free_percent; - found_uncached_bg = true; ret = cache_block_group(block_group, trans, - orig_root, 1); - if (block_group->cached == BTRFS_CACHE_FINISHED) - goto alloc; - - free_percent = btrfs_block_group_used(&block_group->item); - free_percent *= 100; - free_percent = div64_u64(free_percent, - block_group->key.offset); - free_percent = 100 - free_percent; - if (free_percent > ideal_cache_percent && - likely(!block_group->ro)) { - ideal_cache_offset = block_group->key.objectid; - ideal_cache_percent = free_percent; - } - - /* - * The caching workers are limited to 2 threads, so we - * can queue as much work as we care to. - */ - if (loop > LOOP_FIND_IDEAL) { - ret = cache_block_group(block_group, trans, - orig_root, 0); - BUG_ON(ret); - } - - /* - * If loop is set for cached only, try the next block - * group. - */ - if (loop == LOOP_FIND_IDEAL) - goto loop; + orig_root, 0); + BUG_ON(ret < 0); + ret = 0; } -alloc: if (unlikely(block_group->ro)) goto loop; @@ -5606,11 +5680,6 @@ unclustered_alloc: } checks: search_start = stripe_align(root, offset); - /* move on to the next group */ - if (search_start + num_bytes >= search_end) { - btrfs_add_free_space(used_block_group, offset, num_bytes); - goto loop; - } /* move on to the next group */ if (search_start + num_bytes > @@ -5661,9 +5730,7 @@ loop: if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) goto search; - /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for - * for them to make caching progress. Also - * determine the best possible bg to cache + /* * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching @@ -5673,50 +5740,17 @@ loop: */ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { index = 0; - if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { - found_uncached_bg = false; - loop++; - if (!ideal_cache_percent) - goto search; - - /* - * 1 of the following 2 things have happened so far - * - * 1) We found an ideal block group for caching that - * is mostly full and will cache quickly, so we might - * as well wait for it. - * - * 2) We searched for cached only and we didn't find - * anything, and we didn't start any caching kthreads - * either, so chances are we will loop through and - * start a couple caching kthreads, and then come back - * around and just wait for them. This will be slower - * because we will have 2 caching kthreads reading at - * the same time when we could have just started one - * and waited for it to get far enough to give us an - * allocation, so go ahead and go to the wait caching - * loop. - */ - loop = LOOP_CACHING_WAIT; - search_start = ideal_cache_offset; - ideal_cache_percent = 0; - goto ideal_cache; - } else if (loop == LOOP_FIND_IDEAL) { - /* - * Didn't find a uncached bg, wait on anything we find - * next. - */ - loop = LOOP_CACHING_WAIT; - goto search; - } - loop++; - if (loop == LOOP_ALLOC_CHUNK) { if (allowed_chunk_alloc) { ret = do_chunk_alloc(trans, root, num_bytes + 2 * 1024 * 1024, data, CHUNK_ALLOC_LIMITED); + if (ret < 0) { + btrfs_abort_transaction(trans, + root, ret); + goto out; + } allowed_chunk_alloc = 0; if (ret == 1) done_chunk_alloc = 1; @@ -5745,6 +5779,7 @@ loop: } else if (ins->objectid) { ret = 0; } +out: return ret; } @@ -5798,12 +5833,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, - u64 data) + struct btrfs_key *ins, u64 data) { bool final_tried = false; int ret; - u64 search_start = 0; data = btrfs_get_alloc_profile(root, data); again: @@ -5811,23 +5844,31 @@ again: * the only place that sets empty_size is btrfs_realloc_node, which * is not called recursively on allocations */ - if (empty_size || root->ref_cows) + if (empty_size || root->ref_cows) { ret = do_chunk_alloc(trans, root->fs_info->extent_root, num_bytes + 2 * 1024 * 1024, data, CHUNK_ALLOC_NO_FORCE); + if (ret < 0 && ret != -ENOSPC) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + } WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, - search_start, search_end, hint_byte, - ins, data); + hint_byte, ins, data); if (ret == -ENOSPC) { if (!final_tried) { num_bytes = num_bytes >> 1; num_bytes = num_bytes & ~(root->sectorsize - 1); num_bytes = max(num_bytes, min_alloc_size); - do_chunk_alloc(trans, root->fs_info->extent_root, + ret = do_chunk_alloc(trans, root->fs_info->extent_root, num_bytes, data, CHUNK_ALLOC_FORCE); + if (ret < 0 && ret != -ENOSPC) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } if (num_bytes == min_alloc_size) final_tried = true; goto again; @@ -5838,7 +5879,8 @@ again: printk(KERN_ERR "btrfs allocation failed flags %llu, " "wanted %llu\n", (unsigned long long)data, (unsigned long long)num_bytes); - dump_space_info(sinfo, num_bytes, 1); + if (sinfo) + dump_space_info(sinfo, num_bytes, 1); } } @@ -5917,7 +5959,10 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); - BUG_ON(ret); + if (ret) { + btrfs_free_path(path); + return ret; + } leaf = path->nodes[0]; extent_item = btrfs_item_ptr(leaf, path->slots[0], @@ -5947,7 +5992,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_free_path(path); ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); - if (ret) { + if (ret) { /* -ENOENT, logic error */ printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, (unsigned long long)ins->offset); @@ -5978,7 +6023,10 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); - BUG_ON(ret); + if (ret) { + btrfs_free_path(path); + return ret; + } leaf = path->nodes[0]; extent_item = btrfs_item_ptr(leaf, path->slots[0], @@ -6008,7 +6056,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_free_path(path); ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); - if (ret) { + if (ret) { /* -ENOENT, logic error */ printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, (unsigned long long)ins->offset); @@ -6056,28 +6104,28 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, if (!caching_ctl) { BUG_ON(!block_group_cache_done(block_group)); ret = btrfs_remove_free_space(block_group, start, num_bytes); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } else { mutex_lock(&caching_ctl->mutex); if (start >= caching_ctl->progress) { ret = add_excluded_extent(root, start, num_bytes); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } else if (start + num_bytes <= caching_ctl->progress) { ret = btrfs_remove_free_space(block_group, start, num_bytes); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } else { num_bytes = caching_ctl->progress - start; ret = btrfs_remove_free_space(block_group, start, num_bytes); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ start = caching_ctl->progress; num_bytes = ins->objectid + ins->offset - caching_ctl->progress; ret = add_excluded_extent(root, start, num_bytes); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } mutex_unlock(&caching_ctl->mutex); @@ -6086,7 +6134,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_update_reserved_bytes(block_group, ins->offset, RESERVE_ALLOC_NO_ACCOUNT); - BUG_ON(ret); + BUG_ON(ret); /* logic error */ btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); @@ -6107,6 +6155,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); + clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); btrfs_set_buffer_uptodate(buf); @@ -6214,7 +6263,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return ERR_CAST(block_rsv); ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, - empty_size, hint, (u64)-1, &ins, 0); + empty_size, hint, &ins, 0); if (ret) { unuse_block_rsv(root->fs_info, block_rsv, blocksize); return ERR_PTR(ret); @@ -6222,7 +6271,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize, level); - BUG_ON(IS_ERR(buf)); + BUG_ON(IS_ERR(buf)); /* -ENOMEM */ if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) @@ -6234,7 +6283,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_delayed_extent_op *extent_op; extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); + BUG_ON(!extent_op); /* -ENOMEM */ if (key) memcpy(&extent_op->key, key, sizeof(extent_op->key)); else @@ -6249,7 +6298,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, extent_op, for_cow); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } return buf; } @@ -6319,7 +6368,9 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, /* We don't lock the tree block, it's OK to be racy here */ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, &refs, &flags); - BUG_ON(ret); + /* We don't care about errors in readahead. */ + if (ret < 0) + continue; BUG_ON(refs == 0); if (wc->stage == DROP_REFERENCE) { @@ -6386,7 +6437,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, eb->start, eb->len, &wc->refs[level], &wc->flags[level]); - BUG_ON(ret); + BUG_ON(ret == -ENOMEM); + if (ret) + return ret; BUG_ON(wc->refs[level] == 0); } @@ -6405,12 +6458,12 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, if (!(wc->flags[level] & flag)) { BUG_ON(!path->locks[level]); ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, root, eb->start, eb->len, flag, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } @@ -6482,7 +6535,11 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, &wc->refs[level - 1], &wc->flags[level - 1]); - BUG_ON(ret); + if (ret < 0) { + btrfs_tree_unlock(next); + return ret; + } + BUG_ON(wc->refs[level - 1] == 0); *lookup_info = 0; @@ -6511,7 +6568,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, goto skip; } - if (!btrfs_buffer_uptodate(next, generation)) { + if (!btrfs_buffer_uptodate(next, generation, 0)) { btrfs_tree_unlock(next); free_extent_buffer(next); next = NULL; @@ -6551,7 +6608,7 @@ skip: ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, root->root_key.objectid, level - 1, 0, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } btrfs_tree_unlock(next); free_extent_buffer(next); @@ -6609,7 +6666,10 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, eb->start, eb->len, &wc->refs[level], &wc->flags[level]); - BUG_ON(ret); + if (ret < 0) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + return ret; + } BUG_ON(wc->refs[level] == 0); if (wc->refs[level] == 1) { btrfs_tree_unlock_rw(eb, path->locks[level]); @@ -6629,7 +6689,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, else ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } /* make block locked assertion in clean_tree_block happy */ if (!path->locks[level] && @@ -6738,7 +6798,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * also make sure backrefs for the shared block and all lower level * blocks are properly updated. */ -void btrfs_drop_snapshot(struct btrfs_root *root, +int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int update_ref, int for_reloc) { @@ -6766,7 +6826,10 @@ void btrfs_drop_snapshot(struct btrfs_root *root, } trans = btrfs_start_transaction(tree_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } if (block_rsv) trans->block_rsv = block_rsv; @@ -6791,7 +6854,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root, path->lowest_level = 0; if (ret < 0) { err = ret; - goto out_free; + goto out_end_trans; } WARN_ON(ret > 0); @@ -6811,7 +6874,10 @@ void btrfs_drop_snapshot(struct btrfs_root *root, path->nodes[level]->len, &wc->refs[level], &wc->flags[level]); - BUG_ON(ret); + if (ret < 0) { + err = ret; + goto out_end_trans; + } BUG_ON(wc->refs[level] == 0); if (level == root_item->drop_level) @@ -6862,26 +6928,40 @@ void btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, tree_root, ret); + err = ret; + goto out_end_trans; + } btrfs_end_transaction_throttle(trans, tree_root); trans = btrfs_start_transaction(tree_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } if (block_rsv) trans->block_rsv = block_rsv; } } btrfs_release_path(path); - BUG_ON(err); + if (err) + goto out_end_trans; ret = btrfs_del_root(trans, tree_root, &root->root_key); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, tree_root, ret); + goto out_end_trans; + } if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_find_last_root(tree_root, root->root_key.objectid, NULL, NULL); - BUG_ON(ret < 0); - if (ret > 0) { + if (ret < 0) { + btrfs_abort_transaction(trans, tree_root, ret); + err = ret; + goto out_end_trans; + } else if (ret > 0) { /* if we fail to delete the orphan item this time * around, it'll get picked up the next time. * @@ -6899,14 +6979,15 @@ void btrfs_drop_snapshot(struct btrfs_root *root, free_extent_buffer(root->commit_root); kfree(root); } -out_free: +out_end_trans: btrfs_end_transaction_throttle(trans, tree_root); +out_free: kfree(wc); btrfs_free_path(path); out: if (err) btrfs_std_error(root->fs_info, err); - return; + return err; } /* @@ -6983,31 +7064,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) { u64 num_devices; - u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; - - if (root->fs_info->balance_ctl) { - struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; - u64 tgt = 0; - - /* pick restriper's target profile and return */ - if (flags & BTRFS_BLOCK_GROUP_DATA && - bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; - } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && - bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; - } else if (flags & BTRFS_BLOCK_GROUP_METADATA && - bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; - } + u64 stripped; - if (tgt) { - /* extended -> chunk profile */ - tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; - return tgt; - } - } + /* + * if restripe for this chunk_type is on pick target profile and + * return, otherwise do the usual balance + */ + stripped = get_restripe_target(root->fs_info, flags); + if (stripped) + return extended_to_chunk(stripped); /* * we add in the count of missing devices because we want @@ -7017,6 +7082,9 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; + stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + if (num_devices == 1) { stripped |= BTRFS_BLOCK_GROUP_DUP; stripped = flags & ~stripped; @@ -7029,7 +7097,6 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) if (flags & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) return stripped | BTRFS_BLOCK_GROUP_DUP; - return flags; } else { /* they already had raid on here, just return */ if (flags & stripped) @@ -7042,9 +7109,9 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) if (flags & BTRFS_BLOCK_GROUP_DUP) return stripped | BTRFS_BLOCK_GROUP_RAID1; - /* turn single device chunks into raid0 */ - return stripped | BTRFS_BLOCK_GROUP_RAID0; + /* this is drive concat, leave it alone */ } + return flags; } @@ -7103,12 +7170,16 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, BUG_ON(cache->ro); trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); alloc_flags = update_block_group_flags(root, cache->flags); - if (alloc_flags != cache->flags) - do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, - CHUNK_ALLOC_FORCE); + if (alloc_flags != cache->flags) { + ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + CHUNK_ALLOC_FORCE); + if (ret < 0) + goto out; + } ret = set_block_group_ro(cache, 0); if (!ret) @@ -7188,7 +7259,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) return free_bytes; } -int btrfs_set_block_group_rw(struct btrfs_root *root, +void btrfs_set_block_group_rw(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { struct btrfs_space_info *sinfo = cache->space_info; @@ -7204,7 +7275,6 @@ int btrfs_set_block_group_rw(struct btrfs_root *root, cache->ro = 0; spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); - return 0; } /* @@ -7222,6 +7292,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) u64 min_free; u64 dev_min = 1; u64 dev_nr = 0; + u64 target; int index; int full = 0; int ret = 0; @@ -7262,13 +7333,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) /* * ok we don't have enough space, but maybe we have free space on our * devices to allocate new chunks for relocation, so loop through our - * alloc devices and guess if we have enough space. However, if we - * were marked as full, then we know there aren't enough chunks, and we - * can just return. + * alloc devices and guess if we have enough space. if this block + * group is going to be restriped, run checks against the target + * profile instead of the current one. */ ret = -1; - if (full) - goto out; /* * index: @@ -7278,7 +7347,20 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * 3: raid0 * 4: single */ - index = get_block_group_index(block_group); + target = get_restripe_target(root->fs_info, block_group->flags); + if (target) { + index = __get_block_group_index(extended_to_chunk(target)); + } else { + /* + * this is just a balance, so if we were marked as full + * we know there is no space for a new chunk + */ + if (full) + goto out; + + index = get_block_group_index(block_group); + } + if (index == 0) { dev_min = 4; /* Divide by 2 */ @@ -7572,7 +7654,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), &space_info); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ cache->space_info = space_info; spin_lock(&cache->space_info->lock); cache->space_info->bytes_readonly += cache->bytes_super; @@ -7581,7 +7663,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) __link_block_group(space_info, cache); ret = btrfs_add_block_group_cache(root->fs_info, cache); - BUG_ON(ret); + BUG_ON(ret); /* Logic error */ set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) @@ -7663,7 +7745,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ update_global_block_rsv(root->fs_info); spin_lock(&cache->space_info->lock); @@ -7673,11 +7755,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, __link_block_group(cache->space_info, cache); ret = btrfs_add_block_group_cache(root->fs_info, cache); - BUG_ON(ret); + BUG_ON(ret); /* Logic error */ ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, sizeof(cache->item)); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + return ret; + } set_avail_alloc_bits(extent_root->fs_info, type); @@ -7686,11 +7771,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { - u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; - - /* chunk -> extended profile */ - if (extra_flags == 0) - extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits &= ~extra_flags; @@ -7758,7 +7840,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, inode = lookup_free_space_inode(tree_root, block_group, path); if (!IS_ERR(inode)) { ret = btrfs_orphan_add(trans, inode); - BUG_ON(ret); + if (ret) { + btrfs_add_delayed_iput(inode); + goto out; + } clear_nlink(inode); /* One for the block groups ref */ spin_lock(&block_group->lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a55fbe6252d..c9018a05036 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -19,6 +19,7 @@ #include "btrfs_inode.h" #include "volumes.h" #include "check-integrity.h" +#include "locking.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -53,6 +54,13 @@ struct extent_page_data { unsigned int sync_io:1; }; +static noinline void flush_write_bio(void *data); +static inline struct btrfs_fs_info * +tree_fs_info(struct extent_io_tree *tree) +{ + return btrfs_sb(tree->mapping->host->i_sb); +} + int __init extent_io_init(void) { extent_state_cache = kmem_cache_create("extent_state", @@ -136,6 +144,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) #endif atomic_set(&state->refs, 1); init_waitqueue_head(&state->wq); + trace_alloc_extent_state(state, mask, _RET_IP_); return state; } @@ -153,6 +162,7 @@ void free_extent_state(struct extent_state *state) list_del(&state->leak_list); spin_unlock_irqrestore(&leak_lock, flags); #endif + trace_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); } } @@ -392,20 +402,28 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, return 0; } +static struct extent_state *next_state(struct extent_state *state) +{ + struct rb_node *next = rb_next(&state->rb_node); + if (next) + return rb_entry(next, struct extent_state, rb_node); + else + return NULL; +} + /* * utility function to clear some bits in an extent state struct. - * it will optionally wake up any one waiting on this state (wake == 1), or - * forcibly remove the state from the tree (delete == 1). + * it will optionally wake up any one waiting on this state (wake == 1) * * If no bits are set on the state struct after clearing things, the * struct is freed and removed from the tree */ -static int clear_state_bit(struct extent_io_tree *tree, - struct extent_state *state, - int *bits, int wake) +static struct extent_state *clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, + int *bits, int wake) { + struct extent_state *next; int bits_to_clear = *bits & ~EXTENT_CTLBITS; - int ret = state->state & bits_to_clear; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; @@ -417,6 +435,7 @@ static int clear_state_bit(struct extent_io_tree *tree, if (wake) wake_up(&state->wq); if (state->state == 0) { + next = next_state(state); if (state->tree) { rb_erase(&state->rb_node, &tree->state); state->tree = NULL; @@ -426,8 +445,9 @@ static int clear_state_bit(struct extent_io_tree *tree, } } else { merge_state(tree, state); + next = next_state(state); } - return ret; + return next; } static struct extent_state * @@ -439,6 +459,13 @@ alloc_extent_state_atomic(struct extent_state *prealloc) return prealloc; } +void extent_io_tree_panic(struct extent_io_tree *tree, int err) +{ + btrfs_panic(tree_fs_info(tree), err, "Locking error: " + "Extent tree was modified by another " + "thread while locked."); +} + /* * clear some bits on a range in the tree. This may require splitting * or inserting elements in the tree, so the gfp mask is used to @@ -449,8 +476,7 @@ alloc_extent_state_atomic(struct extent_state *prealloc) * * the range [start, end] is inclusive. * - * This takes the tree lock, and returns < 0 on error, > 0 if any of the - * bits were already set, or zero if none of the bits were already set. + * This takes the tree lock, and returns 0 on success and < 0 on error. */ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, int wake, int delete, @@ -460,11 +486,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct extent_state *cached; struct extent_state *prealloc = NULL; - struct rb_node *next_node; struct rb_node *node; u64 last_end; int err; - int set = 0; int clear = 0; if (delete) @@ -513,14 +537,11 @@ hit_next: WARN_ON(state->end < start); last_end = state->end; - if (state->end < end && !need_resched()) - next_node = rb_next(&state->rb_node); - else - next_node = NULL; - /* the state doesn't have the wanted bits, go ahead */ - if (!(state->state & bits)) + if (!(state->state & bits)) { + state = next_state(state); goto next; + } /* * | ---- desired range ---- | @@ -542,12 +563,14 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; if (err) goto out; if (state->end <= end) { - set |= clear_state_bit(tree, state, &bits, wake); + clear_state_bit(tree, state, &bits, wake); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -564,26 +587,25 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + if (wake) wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, &bits, wake); + clear_state_bit(tree, prealloc, &bits, wake); prealloc = NULL; goto out; } - set |= clear_state_bit(tree, state, &bits, wake); + state = clear_state_bit(tree, state, &bits, wake); next: if (last_end == (u64)-1) goto out; start = last_end + 1; - if (start <= end && next_node) { - state = rb_entry(next_node, struct extent_state, - rb_node); + if (start <= end && state && !need_resched()) goto hit_next; - } goto search_again; out: @@ -591,7 +613,7 @@ out: if (prealloc) free_extent_state(prealloc); - return set; + return 0; search_again: if (start > end) @@ -602,8 +624,8 @@ search_again: goto again; } -static int wait_on_state(struct extent_io_tree *tree, - struct extent_state *state) +static void wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) __releases(tree->lock) __acquires(tree->lock) { @@ -613,7 +635,6 @@ static int wait_on_state(struct extent_io_tree *tree, schedule(); spin_lock(&tree->lock); finish_wait(&state->wq, &wait); - return 0; } /* @@ -621,7 +642,7 @@ static int wait_on_state(struct extent_io_tree *tree, * The range [start, end] is inclusive. * The tree lock is taken by this function */ -int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) { struct extent_state *state; struct rb_node *node; @@ -658,7 +679,6 @@ again: } out: spin_unlock(&tree->lock); - return 0; } static void set_state_bits(struct extent_io_tree *tree, @@ -706,9 +726,10 @@ static void uncache_state(struct extent_state **cached_ptr) * [start, end] is inclusive This takes the tree lock. */ -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask) +static int __must_check +__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -742,8 +763,10 @@ again: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = insert_state(tree, prealloc, start, end, &bits); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; - BUG_ON(err == -EEXIST); goto out; } state = rb_entry(node, struct extent_state, rb_node); @@ -809,7 +832,9 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; if (err) goto out; @@ -846,12 +871,9 @@ hit_next: */ err = insert_state(tree, prealloc, start, this_end, &bits); - BUG_ON(err == -EEXIST); - if (err) { - free_extent_state(prealloc); - prealloc = NULL; - goto out; - } + if (err) + extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); prealloc = NULL; start = this_end + 1; @@ -873,7 +895,8 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); set_state_bits(tree, prealloc, &bits); cache_state(prealloc, cached_state); @@ -900,6 +923,15 @@ search_again: goto again; } +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, + u64 *failed_start, struct extent_state **cached_state, + gfp_t mask) +{ + return __set_extent_bit(tree, start, end, bits, 0, failed_start, + cached_state, mask); +} + + /** * convert_extent - convert all bits in a given range from one bit to another * @tree: the io tree to search @@ -946,7 +978,8 @@ again: } err = insert_state(tree, prealloc, start, end, &bits); prealloc = NULL; - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); goto out; } state = rb_entry(node, struct extent_state, rb_node); @@ -1002,7 +1035,8 @@ hit_next: goto out; } err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); prealloc = NULL; if (err) goto out; @@ -1041,12 +1075,8 @@ hit_next: */ err = insert_state(tree, prealloc, start, this_end, &bits); - BUG_ON(err == -EEXIST); - if (err) { - free_extent_state(prealloc); - prealloc = NULL; - goto out; - } + if (err) + extent_io_tree_panic(tree, err); prealloc = NULL; start = this_end + 1; goto search_again; @@ -1065,7 +1095,8 @@ hit_next: } err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); set_state_bits(tree, prealloc, &bits); clear_state_bit(tree, prealloc, &clear_bits, 0); @@ -1095,14 +1126,14 @@ search_again: int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, + return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, NULL, mask); } int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask) { - return set_extent_bit(tree, start, end, bits, 0, NULL, + return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); } @@ -1117,7 +1148,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE, - 0, NULL, cached_state, mask); + NULL, cached_state, mask); } int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, @@ -1131,7 +1162,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask); } @@ -1139,7 +1170,7 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, - NULL, cached_state, mask); + cached_state, mask); } static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -1155,42 +1186,40 @@ static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, * us if waiting is desired. */ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, struct extent_state **cached_state, gfp_t mask) + int bits, struct extent_state **cached_state) { int err; u64 failed_start; while (1) { - err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, - EXTENT_LOCKED, &failed_start, - cached_state, mask); - if (err == -EEXIST && (mask & __GFP_WAIT)) { + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + EXTENT_LOCKED, &failed_start, + cached_state, GFP_NOFS); + if (err == -EEXIST) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; - } else { + } else break; - } WARN_ON(start > end); } return err; } -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) { - return lock_extent_bits(tree, start, end, 0, NULL, mask); + return lock_extent_bits(tree, start, end, 0, NULL); } -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) { int err; u64 failed_start; - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, mask); + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, + &failed_start, NULL, GFP_NOFS); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, NULL, mask); + EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); return 0; } return 1; @@ -1203,10 +1232,10 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, mask); } -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) { return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, - mask); + GFP_NOFS); } /* @@ -1220,7 +1249,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) while (index <= end_index) { page = find_get_page(tree->mapping, index); - BUG_ON(!page); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ set_page_writeback(page); page_cache_release(page); index++; @@ -1343,9 +1372,9 @@ out: return found; } -static noinline int __unlock_for_delalloc(struct inode *inode, - struct page *locked_page, - u64 start, u64 end) +static noinline void __unlock_for_delalloc(struct inode *inode, + struct page *locked_page, + u64 start, u64 end) { int ret; struct page *pages[16]; @@ -1355,7 +1384,7 @@ static noinline int __unlock_for_delalloc(struct inode *inode, int i; if (index == locked_page->index && end_index == index) - return 0; + return; while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, @@ -1370,7 +1399,6 @@ static noinline int __unlock_for_delalloc(struct inode *inode, index += ret; cond_resched(); } - return 0; } static noinline int lock_delalloc_pages(struct inode *inode, @@ -1500,11 +1528,10 @@ again: goto out_failed; } } - BUG_ON(ret); + BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ /* step three, lock the state bits for the whole range */ - lock_extent_bits(tree, delalloc_start, delalloc_end, - 0, &cached_state, GFP_NOFS); + lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, @@ -1761,39 +1788,34 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, * helper function to set a given page up to date if all the * extents in the tree for that page are up to date */ -static int check_page_uptodate(struct extent_io_tree *tree, - struct page *page) +static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) { u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) SetPageUptodate(page); - return 0; } /* * helper function to unlock a page if all the extents in the tree * for that page are unlocked */ -static int check_page_locked(struct extent_io_tree *tree, - struct page *page) +static void check_page_locked(struct extent_io_tree *tree, struct page *page) { u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) unlock_page(page); - return 0; } /* * helper function to end page writeback if all the extents * in the tree for that page are done with writeback */ -static int check_page_writeback(struct extent_io_tree *tree, - struct page *page) +static void check_page_writeback(struct extent_io_tree *tree, + struct page *page) { end_page_writeback(page); - return 0; } /* @@ -1912,6 +1934,26 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, return 0; } +int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, + int mirror_num) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + u64 start = eb->start; + unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); + int ret = 0; + + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, + start, p, mirror_num); + if (ret) + break; + start += PAGE_CACHE_SIZE; + } + + return ret; +} + /* * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record @@ -2141,6 +2183,10 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, } bio = bio_alloc(GFP_NOFS, 1); + if (!bio) { + free_io_failure(inode, failrec, 0); + return -EIO; + } bio->bi_private = state; bio->bi_end_io = failed_bio->bi_end_io; bio->bi_sector = failrec->logical >> 9; @@ -2258,6 +2304,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) u64 start; u64 end; int whole_page; + int mirror; int ret; if (err) @@ -2296,17 +2343,22 @@ static void end_bio_extent_readpage(struct bio *bio, int err) } spin_unlock(&tree->lock); + mirror = (int)(unsigned long)bio->bi_bdev; if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { ret = tree->ops->readpage_end_io_hook(page, start, end, - state); + state, mirror); if (ret) uptodate = 0; else clean_io_failure(start, page); } - if (!uptodate) { - int failed_mirror; - failed_mirror = (int)(unsigned long)bio->bi_bdev; + + if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook(page, mirror); + if (!ret && !err && + test_bit(BIO_UPTODATE, &bio->bi_flags)) + uptodate = 1; + } else if (!uptodate) { /* * The generic bio_readpage_error handles errors the * following way: If possible, new read requests are @@ -2317,10 +2369,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err) * can't handle the error it will return -EIO and we * remain responsible for that page. */ - ret = bio_readpage_error(bio, page, start, end, - failed_mirror, NULL); + ret = bio_readpage_error(bio, page, start, end, mirror, NULL); if (ret == 0) { -error_handled: uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) @@ -2328,16 +2378,9 @@ error_handled: uncache_state(&cached); continue; } - if (tree->ops && tree->ops->readpage_io_failed_hook) { - ret = tree->ops->readpage_io_failed_hook( - bio, page, start, end, - failed_mirror, state); - if (ret == 0) - goto error_handled; - } } - if (uptodate) { + if (uptodate && tree->track_uptodate) { set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); } @@ -2386,8 +2429,12 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, return bio; } -static int submit_one_bio(int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags) +/* + * Since writes are async, they will only return -ENOMEM. + * Reads can return the full range of I/O error conditions. + */ +static int __must_check submit_one_bio(int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) { int ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -2413,6 +2460,19 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, return ret; } +static int merge_bio(struct extent_io_tree *tree, struct page *page, + unsigned long offset, size_t size, struct bio *bio, + unsigned long bio_flags) +{ + int ret = 0; + if (tree->ops && tree->ops->merge_bio_hook) + ret = tree->ops->merge_bio_hook(page, offset, size, bio, + bio_flags); + BUG_ON(ret < 0); + return ret; + +} + static int submit_extent_page(int rw, struct extent_io_tree *tree, struct page *page, sector_t sector, size_t size, unsigned long offset, @@ -2441,12 +2501,12 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, sector; if (prev_bio_flags != bio_flags || !contig || - (tree->ops && tree->ops->merge_bio_hook && - tree->ops->merge_bio_hook(page, offset, page_size, bio, - bio_flags)) || + merge_bio(tree, page, offset, page_size, bio, bio_flags) || bio_add_page(bio, page, page_size, offset) < page_size) { ret = submit_one_bio(rw, bio, mirror_num, prev_bio_flags); + if (ret < 0) + return ret; bio = NULL; } else { return 0; @@ -2473,25 +2533,31 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, return ret; } -void set_page_extent_mapped(struct page *page) +void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) { if (!PagePrivate(page)) { SetPagePrivate(page); page_cache_get(page); - set_page_private(page, EXTENT_PAGE_PRIVATE); + set_page_private(page, (unsigned long)eb); + } else { + WARN_ON(page->private != (unsigned long)eb); } } -static void set_page_extent_head(struct page *page, unsigned long len) +void set_page_extent_mapped(struct page *page) { - WARN_ON(!PagePrivate(page)); - set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, EXTENT_PAGE_PRIVATE); + } } /* * basic readpage implementation. Locked extent state structs are inserted * into the tree that are removed when the IO is done (by the end_io * handlers) + * XXX JDM: This needs looking at to ensure proper page locking */ static int __extent_read_full_page(struct extent_io_tree *tree, struct page *page, @@ -2531,11 +2597,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree, end = page_end; while (1) { - lock_extent(tree, start, end, GFP_NOFS); + lock_extent(tree, start, end); ordered = btrfs_lookup_ordered_extent(inode, start); if (!ordered) break; - unlock_extent(tree, start, end, GFP_NOFS); + unlock_extent(tree, start, end); btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); } @@ -2546,10 +2612,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree, if (zero_offset) { iosize = PAGE_CACHE_SIZE - zero_offset; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + zero_offset, 0, iosize); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); } } while (cur <= end) { @@ -2558,10 +2624,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree, struct extent_state *cached = NULL; iosize = PAGE_CACHE_SIZE - pg_offset; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + pg_offset, 0, iosize); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, cur + iosize - 1, @@ -2572,7 +2638,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, end - cur + 1, 0); if (IS_ERR_OR_NULL(em)) { SetPageError(page); - unlock_extent(tree, cur, end, GFP_NOFS); + unlock_extent(tree, cur, end); break; } extent_offset = cur - em->start; @@ -2607,10 +2673,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree, char *userpage; struct extent_state *cached = NULL; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + pg_offset, 0, iosize); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); @@ -2624,7 +2690,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -2634,7 +2700,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, */ if (block_start == EXTENT_MAP_INLINE) { SetPageError(page); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -2654,6 +2720,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag); + BUG_ON(ret == -ENOMEM); nr++; *bio_flags = this_bio_flag; } @@ -2756,10 +2823,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (page->index == end_index) { char *userpage; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + pg_offset, 0, PAGE_CACHE_SIZE - pg_offset); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); flush_dcache_page(page); } pg_offset = 0; @@ -2795,7 +2862,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_end, &page_started, &nr_written); - BUG_ON(ret); + /* File system has been set read-only */ + if (ret) { + SetPageError(page); + goto done; + } /* * delalloc_end is already one less than the total * length, so we don't subtract one from @@ -2968,6 +3039,275 @@ done_unlocked: return 0; } +static int eb_wait(void *word) +{ + io_schedule(); + return 0; +} + +static void wait_on_extent_buffer_writeback(struct extent_buffer *eb) +{ + wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, + TASK_UNINTERRUPTIBLE); +} + +static int lock_extent_buffer_for_io(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct extent_page_data *epd) +{ + unsigned long i, num_pages; + int flush = 0; + int ret = 0; + + if (!btrfs_try_tree_write_lock(eb)) { + flush = 1; + flush_write_bio(epd); + btrfs_tree_lock(eb); + } + + if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { + btrfs_tree_unlock(eb); + if (!epd->sync_io) + return 0; + if (!flush) { + flush_write_bio(epd); + flush = 1; + } + while (1) { + wait_on_extent_buffer_writeback(eb); + btrfs_tree_lock(eb); + if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) + break; + btrfs_tree_unlock(eb); + } + } + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + spin_lock(&fs_info->delalloc_lock); + if (fs_info->dirty_metadata_bytes >= eb->len) + fs_info->dirty_metadata_bytes -= eb->len; + else + WARN_ON(1); + spin_unlock(&fs_info->delalloc_lock); + ret = 1; + } + + btrfs_tree_unlock(eb); + + if (!ret) + return ret; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + + if (!trylock_page(p)) { + if (!flush) { + flush_write_bio(epd); + flush = 1; + } + lock_page(p); + } + } + + return ret; +} + +static void end_extent_buffer_writeback(struct extent_buffer *eb) +{ + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_clear_bit(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); +} + +static void end_bio_extent_buffer_writepage(struct bio *bio, int err) +{ + int uptodate = err == 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_buffer *eb; + int done; + + do { + struct page *page = bvec->bv_page; + + bvec--; + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + done = atomic_dec_and_test(&eb->io_pages); + + if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + ClearPageUptodate(page); + SetPageError(page); + } + + end_page_writeback(page); + + if (!done) + continue; + + end_extent_buffer_writeback(eb); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); + +} + +static int write_one_eb(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct writeback_control *wbc, + struct extent_page_data *epd) +{ + struct block_device *bdev = fs_info->fs_devices->latest_bdev; + u64 offset = eb->start; + unsigned long i, num_pages; + int rw = (epd->sync_io ? WRITE_SYNC : WRITE); + int ret; + + clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + atomic_set(&eb->io_pages, num_pages); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + + clear_page_dirty_for_io(p); + set_page_writeback(p); + ret = submit_extent_page(rw, eb->tree, p, offset >> 9, + PAGE_CACHE_SIZE, 0, bdev, &epd->bio, + -1, end_bio_extent_buffer_writepage, + 0, 0, 0); + if (ret) { + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + SetPageError(p); + if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) + end_extent_buffer_writeback(eb); + ret = -EIO; + break; + } + offset += PAGE_CACHE_SIZE; + update_nr_written(p, wbc, 1); + unlock_page(p); + } + + if (unlikely(ret)) { + for (; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + unlock_page(p); + } + } + + return ret; +} + +int btree_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; + struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; + struct extent_buffer *eb, *prev_eb = NULL; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + }; + int ret = 0; + int done = 0; + int nr_to_write_done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int tag; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + scanned = 1; + } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; +retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); + while (!done && !nr_to_write_done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (!PagePrivate(page)) + continue; + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + break; + } + + eb = (struct extent_buffer *)page->private; + if (!eb) { + WARN_ON(1); + continue; + } + + if (eb == prev_eb) + continue; + + if (!atomic_inc_not_zero(&eb->refs)) { + WARN_ON(1); + continue; + } + + prev_eb = eb; + ret = lock_extent_buffer_for_io(eb, fs_info, &epd); + if (!ret) { + free_extent_buffer(eb); + continue; + } + + ret = write_one_eb(eb, fs_info, wbc, &epd); + if (ret) { + done = 1; + free_extent_buffer(eb); + break; + } + free_extent_buffer(eb); + + /* + * the filesystem may choose to bump up nr_to_write. + * We have to make sure to honor the new nr_to_write + * at any time + */ + nr_to_write_done = wbc->nr_to_write <= 0; + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + flush_write_bio(&epd); + return ret; +} + /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write @@ -3099,10 +3439,14 @@ retry: static void flush_epd_write_bio(struct extent_page_data *epd) { if (epd->bio) { + int rw = WRITE; + int ret; + if (epd->sync_io) - submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); - else - submit_one_bio(WRITE, epd->bio, 0, 0); + rw = WRITE_SYNC; + + ret = submit_one_bio(rw, epd->bio, 0, 0); + BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; } } @@ -3219,7 +3563,7 @@ int extent_readpages(struct extent_io_tree *tree, } BUG_ON(!list_empty(pages)); if (bio) - submit_one_bio(READ, bio, 0, bio_flags); + return submit_one_bio(READ, bio, 0, bio_flags); return 0; } @@ -3240,7 +3584,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); + lock_extent_bits(tree, start, end, 0, &cached_state); wait_on_page_writeback(page); clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | @@ -3454,7 +3798,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, - &cached_state, GFP_NOFS); + &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent, get_extent); @@ -3548,26 +3892,7 @@ out: inline struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i) { - struct page *p; - struct address_space *mapping; - - if (i == 0) - return eb->first_page; - i += eb->start >> PAGE_CACHE_SHIFT; - mapping = eb->first_page->mapping; - if (!mapping) - return NULL; - - /* - * extent_buffer_page is only called after pinning the page - * by increasing the reference count. So we know the page must - * be in the radix tree. - */ - rcu_read_lock(); - p = radix_tree_lookup(&mapping->page_tree, i); - rcu_read_unlock(); - - return p; + return eb->pages[i]; } inline unsigned long num_extent_pages(u64 start, u64 len) @@ -3576,6 +3901,19 @@ inline unsigned long num_extent_pages(u64 start, u64 len) (start >> PAGE_CACHE_SHIFT); } +static void __free_extent_buffer(struct extent_buffer *eb) +{ +#if LEAK_DEBUG + unsigned long flags; + spin_lock_irqsave(&leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + if (eb->pages && eb->pages != eb->inline_pages) + kfree(eb->pages); + kmem_cache_free(extent_buffer_cache, eb); +} + static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len, @@ -3591,6 +3929,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return NULL; eb->start = start; eb->len = len; + eb->tree = tree; rwlock_init(&eb->lock); atomic_set(&eb->write_locks, 0); atomic_set(&eb->read_locks, 0); @@ -3607,20 +3946,32 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, list_add(&eb->leak_list, &buffers); spin_unlock_irqrestore(&leak_lock, flags); #endif + spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); + atomic_set(&eb->io_pages, 0); + + if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { + struct page **pages; + int num_pages = (len + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + pages = kzalloc(num_pages, mask); + if (!pages) { + __free_extent_buffer(eb); + return NULL; + } + eb->pages = pages; + } else { + eb->pages = eb->inline_pages; + } return eb; } -static void __free_extent_buffer(struct extent_buffer *eb) +static int extent_buffer_under_io(struct extent_buffer *eb) { -#if LEAK_DEBUG - unsigned long flags; - spin_lock_irqsave(&leak_lock, flags); - list_del(&eb->leak_list); - spin_unlock_irqrestore(&leak_lock, flags); -#endif - kmem_cache_free(extent_buffer_cache, eb); + return (atomic_read(&eb->io_pages) || + test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || + test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } /* @@ -3632,8 +3983,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, unsigned long index; struct page *page; - if (!eb->first_page) - return; + BUG_ON(extent_buffer_under_io(eb)); index = num_extent_pages(eb->start, eb->len); if (start_idx >= index) @@ -3642,8 +3992,34 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, do { index--; page = extent_buffer_page(eb, index); - if (page) + if (page) { + spin_lock(&page->mapping->private_lock); + /* + * We do this since we'll remove the pages after we've + * removed the eb from the radix tree, so we could race + * and have this page now attached to the new eb. So + * only clear page_private if it's still connected to + * this eb. + */ + if (PagePrivate(page) && + page->private == (unsigned long)eb) { + BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(PageDirty(page)); + BUG_ON(PageWriteback(page)); + /* + * We need to make sure we haven't be attached + * to a new eb. + */ + ClearPagePrivate(page); + set_page_private(page, 0); + /* One for the page private */ + page_cache_release(page); + } + spin_unlock(&page->mapping->private_lock); + + /* One for when we alloced the page */ page_cache_release(page); + } } while (index != start_idx); } @@ -3656,9 +4032,50 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) __free_extent_buffer(eb); } +static void check_buffer_tree_ref(struct extent_buffer *eb) +{ + /* the ref bit is tricky. We have to make sure it is set + * if we have the buffer dirty. Otherwise the + * code to free a buffer can end up dropping a dirty + * page + * + * Once the ref bit is set, it won't go away while the + * buffer is dirty or in writeback, and it also won't + * go away while we have the reference count on the + * eb bumped. + * + * We can't just set the ref bit without bumping the + * ref on the eb because free_extent_buffer might + * see the ref bit and try to clear it. If this happens + * free_extent_buffer might end up dropping our original + * ref by mistake and freeing the page before we are able + * to add one more ref. + * + * So bump the ref count first, then set the bit. If someone + * beat us to it, drop the ref we added. + */ + if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + atomic_inc(&eb->refs); + if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + } +} + +static void mark_extent_buffer_accessed(struct extent_buffer *eb) +{ + unsigned long num_pages, i; + + check_buffer_tree_ref(eb); + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + mark_page_accessed(p); + } +} + struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len, - struct page *page0) + u64 start, unsigned long len) { unsigned long num_pages = num_extent_pages(start, len); unsigned long i; @@ -3674,7 +4091,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_page_accessed(eb->first_page); + mark_extent_buffer_accessed(eb); return eb; } rcu_read_unlock(); @@ -3683,32 +4100,44 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (!eb) return NULL; - if (page0) { - eb->first_page = page0; - i = 1; - index++; - page_cache_get(page0); - mark_page_accessed(page0); - set_page_extent_mapped(page0); - set_page_extent_head(page0, len); - uptodate = PageUptodate(page0); - } else { - i = 0; - } - for (; i < num_pages; i++, index++) { + for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, GFP_NOFS); if (!p) { WARN_ON(1); goto free_eb; } - set_page_extent_mapped(p); - mark_page_accessed(p); - if (i == 0) { - eb->first_page = p; - set_page_extent_head(p, len); - } else { - set_page_private(p, EXTENT_PAGE_PRIVATE); + + spin_lock(&mapping->private_lock); + if (PagePrivate(p)) { + /* + * We could have already allocated an eb for this page + * and attached one so lets see if we can get a ref on + * the existing eb, and if we can we know it's good and + * we can just return that one, else we know we can just + * overwrite page->private. + */ + exists = (struct extent_buffer *)p->private; + if (atomic_inc_not_zero(&exists->refs)) { + spin_unlock(&mapping->private_lock); + unlock_page(p); + page_cache_release(p); + mark_extent_buffer_accessed(exists); + goto free_eb; + } + + /* + * Do this so attach doesn't complain and we need to + * drop the ref the old guy had. + */ + ClearPagePrivate(p); + WARN_ON(PageDirty(p)); + page_cache_release(p); } + attach_extent_buffer_page(eb, p); + spin_unlock(&mapping->private_lock); + WARN_ON(PageDirty(p)); + mark_page_accessed(p); + eb->pages[i] = p; if (!PageUptodate(p)) uptodate = 0; @@ -3716,12 +4145,10 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, * see below about how we avoid a nasty race with release page * and why we unlock later */ - if (i != 0) - unlock_page(p); } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - +again: ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); if (ret) goto free_eb; @@ -3731,14 +4158,21 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (ret == -EEXIST) { exists = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - /* add one reference for the caller */ - atomic_inc(&exists->refs); + if (!atomic_inc_not_zero(&exists->refs)) { + spin_unlock(&tree->buffer_lock); + radix_tree_preload_end(); + exists = NULL; + goto again; + } spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); + mark_extent_buffer_accessed(exists); goto free_eb; } /* add one reference for the tree */ - atomic_inc(&eb->refs); + spin_lock(&eb->refs_lock); + check_buffer_tree_ref(eb); + spin_unlock(&eb->refs_lock); spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); @@ -3751,18 +4185,22 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, * after the extent buffer is in the radix tree so * it doesn't get lost */ - set_page_extent_mapped(eb->first_page); - set_page_extent_head(eb->first_page, eb->len); - if (!page0) - unlock_page(eb->first_page); + SetPageChecked(eb->pages[0]); + for (i = 1; i < num_pages; i++) { + p = extent_buffer_page(eb, i); + ClearPageChecked(p); + unlock_page(p); + } + unlock_page(eb->pages[0]); return eb; free_eb: - if (eb->first_page && !page0) - unlock_page(eb->first_page); + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) + unlock_page(eb->pages[i]); + } - if (!atomic_dec_and_test(&eb->refs)) - return exists; + WARN_ON(!atomic_dec_and_test(&eb->refs)); btrfs_release_extent_buffer(eb); return exists; } @@ -3776,7 +4214,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_page_accessed(eb->first_page); + mark_extent_buffer_accessed(eb); return eb; } rcu_read_unlock(); @@ -3784,19 +4222,71 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, return NULL; } +static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) +{ + struct extent_buffer *eb = + container_of(head, struct extent_buffer, rcu_head); + + __free_extent_buffer(eb); +} + +/* Expects to have eb->eb_lock already held */ +static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask) +{ + WARN_ON(atomic_read(&eb->refs) == 0); + if (atomic_dec_and_test(&eb->refs)) { + struct extent_io_tree *tree = eb->tree; + + spin_unlock(&eb->refs_lock); + + spin_lock(&tree->buffer_lock); + radix_tree_delete(&tree->buffer, + eb->start >> PAGE_CACHE_SHIFT); + spin_unlock(&tree->buffer_lock); + + /* Should be safe to release our pages at this point */ + btrfs_release_extent_buffer_page(eb, 0); + + call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); + return; + } + spin_unlock(&eb->refs_lock); +} + void free_extent_buffer(struct extent_buffer *eb) { if (!eb) return; - if (!atomic_dec_and_test(&eb->refs)) + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) == 2 && + test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && + !extent_buffer_under_io(eb) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + + /* + * I know this is terrible, but it's temporary until we stop tracking + * the uptodate bits and such for the extent buffers. + */ + release_extent_buffer(eb, GFP_ATOMIC); +} + +void free_extent_buffer_stale(struct extent_buffer *eb) +{ + if (!eb) return; - WARN_ON(1); + spin_lock(&eb->refs_lock); + set_bit(EXTENT_BUFFER_STALE, &eb->bflags); + + if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + release_extent_buffer(eb, GFP_NOFS); } -int clear_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb) +void clear_extent_buffer_dirty(struct extent_buffer *eb) { unsigned long i; unsigned long num_pages; @@ -3812,10 +4302,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, lock_page(page); WARN_ON(!PagePrivate(page)); - set_page_extent_mapped(page); - if (i == 0) - set_page_extent_head(page, eb->len); - clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); if (!PageDirty(page)) { @@ -3827,24 +4313,29 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, ClearPageError(page); unlock_page(page); } - return 0; + WARN_ON(atomic_read(&eb->refs) == 0); } -int set_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb) +int set_extent_buffer_dirty(struct extent_buffer *eb) { unsigned long i; unsigned long num_pages; int was_dirty = 0; + check_buffer_tree_ref(eb); + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + WARN_ON(atomic_read(&eb->refs) == 0); + WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + for (i = 0; i < num_pages; i++) - __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); + set_page_dirty(extent_buffer_page(eb, i)); return was_dirty; } -static int __eb_straddles_pages(u64 start, u64 len) +static int range_straddles_pages(u64 start, u64 len) { if (len < PAGE_CACHE_SIZE) return 1; @@ -3855,25 +4346,14 @@ static int __eb_straddles_pages(u64 start, u64 len) return 0; } -static int eb_straddles_pages(struct extent_buffer *eb) -{ - return __eb_straddles_pages(eb->start, eb->len); -} - -int clear_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state **cached_state) +int clear_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; unsigned long num_pages; - num_pages = num_extent_pages(eb->start, eb->len); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - - clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - cached_state, GFP_NOFS); - + num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) @@ -3882,27 +4362,16 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, return 0; } -int set_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb) +int set_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; unsigned long num_pages; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); - - if (eb_straddles_pages(eb)) { - set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - NULL, GFP_NOFS); - } for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || - ((i == num_pages - 1) && - ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { - check_page_uptodate(tree, page); - continue; - } SetPageUptodate(page); } return 0; @@ -3917,7 +4386,7 @@ int extent_range_uptodate(struct extent_io_tree *tree, int uptodate; unsigned long index; - if (__eb_straddles_pages(start, end - start + 1)) { + if (range_straddles_pages(start, end - start + 1)) { ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); if (ret) @@ -3939,35 +4408,9 @@ int extent_range_uptodate(struct extent_io_tree *tree, return pg_uptodate; } -int extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state *cached_state) +int extent_buffer_uptodate(struct extent_buffer *eb) { - int ret = 0; - unsigned long num_pages; - unsigned long i; - struct page *page; - int pg_uptodate = 1; - - if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - return 1; - - if (eb_straddles_pages(eb)) { - ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, cached_state); - if (ret) - return ret; - } - - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - if (!PageUptodate(page)) { - pg_uptodate = 0; - break; - } - } - return pg_uptodate; + return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); } int read_extent_buffer_pages(struct extent_io_tree *tree, @@ -3981,21 +4424,14 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, int ret = 0; int locked_pages = 0; int all_uptodate = 1; - int inc_all_pages = 0; unsigned long num_pages; + unsigned long num_reads = 0; struct bio *bio = NULL; unsigned long bio_flags = 0; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; - if (eb_straddles_pages(eb)) { - if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, NULL)) { - return 0; - } - } - if (start) { WARN_ON(start < eb->start); start_i = (start >> PAGE_CACHE_SHIFT) - @@ -4014,8 +4450,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, lock_page(page); } locked_pages++; - if (!PageUptodate(page)) + if (!PageUptodate(page)) { + num_reads++; all_uptodate = 0; + } } if (all_uptodate) { if (start_i == 0) @@ -4023,20 +4461,12 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, goto unlock_exit; } + clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + eb->read_mirror = 0; + atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); - - WARN_ON(!PagePrivate(page)); - - set_page_extent_mapped(page); - if (i == 0) - set_page_extent_head(page, eb->len); - - if (inc_all_pages) - page_cache_get(page); if (!PageUptodate(page)) { - if (start_i == 0) - inc_all_pages = 1; ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, @@ -4048,8 +4478,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } } - if (bio) - submit_one_bio(READ, bio, mirror_num, bio_flags); + if (bio) { + err = submit_one_bio(READ, bio, mirror_num, bio_flags); + if (err) + return err; + } if (ret || wait != WAIT_COMPLETE) return ret; @@ -4061,8 +4494,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ret = -EIO; } - if (!ret) - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); return ret; unlock_exit: @@ -4304,15 +4735,20 @@ static void copy_pages(struct page *dst_page, struct page *src_page, { char *dst_kaddr = page_address(dst_page); char *src_kaddr; + int must_memmove = 0; if (dst_page != src_page) { src_kaddr = page_address(src_page); } else { src_kaddr = dst_kaddr; - BUG_ON(areas_overlap(src_off, dst_off, len)); + if (areas_overlap(src_off, dst_off, len)) + must_memmove = 1; } - memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); + if (must_memmove) + memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); + else + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); } void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, @@ -4382,7 +4818,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, "len %lu len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } - if (!areas_overlap(src_offset, dst_offset, len)) { + if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; } @@ -4408,47 +4844,48 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } } -static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) -{ - struct extent_buffer *eb = - container_of(head, struct extent_buffer, rcu_head); - - btrfs_release_extent_buffer(eb); -} - -int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) +int try_release_extent_buffer(struct page *page, gfp_t mask) { - u64 start = page_offset(page); struct extent_buffer *eb; - int ret = 1; - spin_lock(&tree->buffer_lock); - eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (!eb) { - spin_unlock(&tree->buffer_lock); - return ret; + /* + * We need to make sure noboody is attaching this page to an eb right + * now. + */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&page->mapping->private_lock); + return 1; } - if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - ret = 0; - goto out; - } + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); /* - * set @eb->refs to 0 if it is already 1, and then release the @eb. - * Or go back. + * This is a little awful but should be ok, we need to make sure that + * the eb doesn't disappear out from under us while we're looking at + * this page. */ - if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { - ret = 0; - goto out; + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + spin_unlock(&eb->refs_lock); + spin_unlock(&page->mapping->private_lock); + return 0; } + spin_unlock(&page->mapping->private_lock); - radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); -out: - spin_unlock(&tree->buffer_lock); + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; - /* at this point we can safely release the extent buffer */ - if (atomic_read(&eb->refs) == 0) - call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); - return ret; + /* + * If tree ref isn't set then we know the ref on this eb is a real ref, + * so just return, this page will likely be freed soon anyway. + */ + if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + return 0; + } + release_extent_buffer(eb, mask); + + return 1; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index cecc3518c12..b516c3b8dec 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -35,6 +35,10 @@ #define EXTENT_BUFFER_DIRTY 2 #define EXTENT_BUFFER_CORRUPT 3 #define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ +#define EXTENT_BUFFER_TREE_REF 5 +#define EXTENT_BUFFER_STALE 6 +#define EXTENT_BUFFER_WRITEBACK 7 +#define EXTENT_BUFFER_IOERR 8 /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 @@ -54,6 +58,7 @@ #define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 struct extent_state; +struct btrfs_root; typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, struct bio *bio, int mirror_num, @@ -69,14 +74,12 @@ struct extent_io_ops { size_t size, struct bio *bio, unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); - int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, int failed_mirror, - struct extent_state *state); + int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, struct extent_state *state); int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, - struct extent_state *state); + struct extent_state *state, int mirror); int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); void (*set_bit_hook)(struct inode *inode, struct extent_state *state, @@ -97,6 +100,7 @@ struct extent_io_tree { struct radix_tree_root buffer; struct address_space *mapping; u64 dirty_bytes; + int track_uptodate; spinlock_t lock; spinlock_t buffer_lock; struct extent_io_ops *ops; @@ -119,16 +123,21 @@ struct extent_state { struct list_head leak_list; }; +#define INLINE_EXTENT_BUFFER_PAGES 16 +#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE) struct extent_buffer { u64 start; unsigned long len; unsigned long map_start; unsigned long map_len; - struct page *first_page; unsigned long bflags; + struct extent_io_tree *tree; + spinlock_t refs_lock; + atomic_t refs; + atomic_t io_pages; + int read_mirror; struct list_head leak_list; struct rcu_head rcu_head; - atomic_t refs; pid_t lock_owner; /* count of read lock holders on the extent buffer */ @@ -152,6 +161,9 @@ struct extent_buffer { * to unlock */ wait_queue_head_t read_lock_wq; + wait_queue_head_t lock_wq; + struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES]; + struct page **pages; }; static inline void extent_set_compress_type(unsigned long *bio_flags, @@ -178,18 +190,17 @@ void extent_io_tree_init(struct extent_io_tree *tree, int try_release_extent_mapping(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); -int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page); +int try_release_extent_buffer(struct page *page, gfp_t mask); int try_release_extent_state(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, struct extent_state **cached, gfp_t mask); -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); + int bits, struct extent_state **cached); +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached, gfp_t mask); -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); @@ -210,7 +221,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive_bits, u64 *failed_start, + int bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); @@ -240,6 +251,8 @@ int extent_writepages(struct extent_io_tree *tree, struct address_space *mapping, get_extent_t *get_extent, struct writeback_control *wbc); +int btree_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc); int extent_readpages(struct extent_io_tree *tree, struct address_space *mapping, struct list_head *pages, unsigned nr_pages, @@ -251,11 +264,11 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len, - struct page *page0); + u64 start, unsigned long len); struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); void free_extent_buffer(struct extent_buffer *eb); +void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_NONE 0 #define WAIT_COMPLETE 1 #define WAIT_PAGE_LOCK 2 @@ -287,19 +300,12 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len); void memset_extent_buffer(struct extent_buffer *eb, char c, unsigned long start, unsigned long len); -int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); -int clear_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb); -int set_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb); -int set_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb); -int clear_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state **cached_state); -int extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state *cached_state); +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); +void clear_extent_buffer_dirty(struct extent_buffer *eb); +int set_extent_buffer_dirty(struct extent_buffer *eb); +int set_extent_buffer_uptodate(struct extent_buffer *eb); +int clear_extent_buffer_uptodate(struct extent_buffer *eb); +int extent_buffer_uptodate(struct extent_buffer *eb); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long min_len, char **map, unsigned long *map_start, @@ -320,4 +326,6 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, u64 length, u64 logical, struct page *page, int mirror_num); int end_extent_writepage(struct page *page, int err, u64 start, u64 end); +int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, + int mirror_num); #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c7fb3a4247d..5d158d32023 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -25,10 +25,12 @@ #include "transaction.h" #include "print-tree.h" -#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ +#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ size) - 1)) +#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE)) + #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ sizeof(struct btrfs_sector_sum) * \ @@ -59,7 +61,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, sizeof(*item)); if (ret < 0) goto out; - BUG_ON(ret); + BUG_ON(ret); /* Can't happen */ leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -284,6 +286,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_ordered_sum *sums; struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; + LIST_HEAD(tmplist); unsigned long offset; int ret; size_t size; @@ -358,7 +361,10 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), GFP_NOFS); - BUG_ON(!sums); + if (!sums) { + ret = -ENOMEM; + goto fail; + } sector_sum = sums->sums; sums->bytenr = start; @@ -380,12 +386,19 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, offset += csum_size; sector_sum++; } - list_add_tail(&sums->list, list); + list_add_tail(&sums->list, &tmplist); } path->slots[0]++; } ret = 0; fail: + while (ret < 0 && !list_empty(&tmplist)) { + sums = list_entry(&tmplist, struct btrfs_ordered_sum, list); + list_del(&sums->list); + kfree(sums); + } + list_splice_tail(&tmplist, list); + btrfs_free_path(path); return ret; } @@ -420,7 +433,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, offset = page_offset(bvec->bv_page) + bvec->bv_offset; ordered = btrfs_lookup_ordered_extent(inode, offset); - BUG_ON(!ordered); + BUG_ON(!ordered); /* Logic error */ sums->bytenr = ordered->start; while (bio_index < bio->bi_vcnt) { @@ -439,21 +452,21 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), GFP_NOFS); - BUG_ON(!sums); + BUG_ON(!sums); /* -ENOMEM */ sector_sum = sums->sums; sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, offset); - BUG_ON(!ordered); + BUG_ON(!ordered); /* Logic error */ sums->bytenr = ordered->start; } - data = kmap_atomic(bvec->bv_page, KM_USER0); + data = kmap_atomic(bvec->bv_page); sector_sum->sum = ~(u32)0; sector_sum->sum = btrfs_csum_data(root, data + bvec->bv_offset, sector_sum->sum, bvec->bv_len); - kunmap_atomic(data, KM_USER0); + kunmap_atomic(data); btrfs_csum_final(sector_sum->sum, (char *)§or_sum->sum); sector_sum->bytenr = disk_bytenr; @@ -483,18 +496,17 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, * This calls btrfs_truncate_item with the correct args based on the * overlap, and fixes up the key as required. */ -static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *key, - u64 bytenr, u64 len) +static noinline void truncate_one_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u64 bytenr, u64 len) { struct extent_buffer *leaf; u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); u64 csum_end; u64 end_byte = bytenr + len; u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; - int ret; leaf = path->nodes[0]; csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; @@ -510,7 +522,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, */ u32 new_size = (bytenr - key->offset) >> blocksize_bits; new_size *= csum_size; - ret = btrfs_truncate_item(trans, root, path, new_size, 1); + btrfs_truncate_item(trans, root, path, new_size, 1); } else if (key->offset >= bytenr && csum_end > end_byte && end_byte > key->offset) { /* @@ -522,15 +534,13 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, u32 new_size = (csum_end - end_byte) >> blocksize_bits; new_size *= csum_size; - ret = btrfs_truncate_item(trans, root, path, new_size, 0); + btrfs_truncate_item(trans, root, path, new_size, 0); key->offset = end_byte; - ret = btrfs_set_item_key_safe(trans, root, path, key); - BUG_ON(ret); + btrfs_set_item_key_safe(trans, root, path, key); } else { BUG(); } - return 0; } /* @@ -635,13 +645,14 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, * item changed size or key */ ret = btrfs_split_item(trans, root, path, &key, offset); - BUG_ON(ret && ret != -EAGAIN); + if (ret && ret != -EAGAIN) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } key.offset = end_byte - 1; } else { - ret = truncate_one_csum(trans, root, path, - &key, bytenr, len); - BUG_ON(ret); + truncate_one_csum(trans, root, path, &key, bytenr, len); if (key.offset < bytenr) break; } @@ -772,7 +783,7 @@ again: if (diff != csum_size) goto insert; - ret = btrfs_extend_item(trans, root, path, diff); + btrfs_extend_item(trans, root, path, diff); goto csum; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e8d06b6b919..53bf2d764bb 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -452,7 +452,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split = alloc_extent_map(); if (!split2) split2 = alloc_extent_map(); - BUG_ON(!split || !split2); + BUG_ON(!split || !split2); /* -ENOMEM */ write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -494,7 +494,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->flags = flags; split->compress_type = em->compress_type; ret = add_extent_mapping(em_tree, split); - BUG_ON(ret); + BUG_ON(ret); /* Logic error */ free_extent_map(split); split = split2; split2 = NULL; @@ -520,7 +520,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, } ret = add_extent_mapping(em_tree, split); - BUG_ON(ret); + BUG_ON(ret); /* Logic error */ free_extent_map(split); split = NULL; } @@ -567,6 +567,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, int extent_type; int recow; int ret; + int modify_tree = -1; if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); @@ -575,10 +576,13 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, if (!path) return -ENOMEM; + if (start >= BTRFS_I(inode)->disk_i_size) + modify_tree = 0; + while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, - search_start, -1); + search_start, modify_tree); if (ret < 0) break; if (ret > 0 && path->slots[0] > 0 && search_start == start) { @@ -634,7 +638,8 @@ next_slot: } search_start = max(key.offset, start); - if (recow) { + if (recow || !modify_tree) { + modify_tree = -1; btrfs_release_path(path); continue; } @@ -679,7 +684,7 @@ next_slot: root->root_key.objectid, new_key.objectid, start - extent_offset, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ *hint_byte = disk_bytenr; } key.offset = start; @@ -754,7 +759,7 @@ next_slot: root->root_key.objectid, key.objectid, key.offset - extent_offset, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ inode_sub_bytes(inode, extent_end - key.offset); *hint_byte = disk_bytenr; @@ -770,7 +775,10 @@ next_slot: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } del_nr = 0; del_slot = 0; @@ -782,11 +790,13 @@ next_slot: BUG_ON(1); } - if (del_nr > 0) { + if (!ret && del_nr > 0) { ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, root, ret); } +out: btrfs_free_path(path); return ret; } @@ -944,7 +954,10 @@ again: btrfs_release_path(path); goto again; } - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0] - 1, @@ -963,7 +976,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (split == start) { key.offset = start; @@ -990,7 +1003,7 @@ again: ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } other_start = 0; other_end = start; @@ -1007,7 +1020,7 @@ again: ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } if (del_nr == 0) { fi = btrfs_item_ptr(leaf, path->slots[0], @@ -1025,7 +1038,10 @@ again: btrfs_mark_buffer_dirty(leaf); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - BUG_ON(ret); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } } out: btrfs_free_path(path); @@ -1105,8 +1121,7 @@ again: if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, 0, &cached_state, - GFP_NOFS); + start_pos, last_pos - 1, 0, &cached_state); ordered = btrfs_lookup_first_ordered_extent(inode, last_pos - 1); if (ordered && @@ -1638,7 +1653,7 @@ static long btrfs_fallocate(struct file *file, int mode, * transaction */ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, 0, &cached_state, GFP_NOFS); + locked_end, 0, &cached_state); ordered = btrfs_lookup_first_ordered_extent(inode, alloc_end - 1); if (ordered && @@ -1667,7 +1682,13 @@ static long btrfs_fallocate(struct file *file, int mode, em = btrfs_get_extent(inode, NULL, 0, cur_offset, alloc_end - cur_offset, 0); - BUG_ON(IS_ERR_OR_NULL(em)); + if (IS_ERR_OR_NULL(em)) { + if (!em) + ret = -ENOMEM; + else + ret = PTR_ERR(em); + break; + } last_byte = min(extent_map_end(em), alloc_end); actual_end = min_t(u64, extent_map_end(em), offset + len); last_byte = (last_byte + mask) & ~mask; @@ -1737,7 +1758,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) return -ENXIO; lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, - &cached_state, GFP_NOFS); + &cached_state); /* * Delalloc is such a pain. If we have a hole and we have pending diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 710ea380c7e..202008ec367 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -230,11 +230,13 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, if (ret) { trans->block_rsv = rsv; - WARN_ON(1); + btrfs_abort_transaction(trans, root, ret); return ret; } ret = btrfs_update_inode(trans, root, inode); + if (ret) + btrfs_abort_transaction(trans, root, ret); trans->block_rsv = rsv; return ret; @@ -746,13 +748,6 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, u64 used = btrfs_block_group_used(&block_group->item); /* - * If we're unmounting then just return, since this does a search on the - * normal root and not the commit root and we could deadlock. - */ - if (btrfs_fs_closing(fs_info)) - return 0; - - /* * If this block group has been marked to be cleared for one reason or * another then we can't trust the on disk cache, so just return. */ @@ -766,6 +761,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, path = btrfs_alloc_path(); if (!path) return 0; + path->search_commit_root = 1; + path->skip_locking = 1; inode = lookup_free_space_inode(root, block_group, path); if (IS_ERR(inode)) { @@ -869,7 +866,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, io_ctl_prepare_pages(&io_ctl, inode, 0); lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - 0, &cached_state, GFP_NOFS); + 0, &cached_state); node = rb_first(&ctl->free_space_offset); if (!node && cluster) { @@ -1068,7 +1065,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, spin_unlock(&block_group->lock); ret = 0; #ifdef DEBUG - printk(KERN_ERR "btrfs: failed to write free space cace " + printk(KERN_ERR "btrfs: failed to write free space cache " "for block group %llu\n", block_group->key.objectid); #endif } @@ -1948,14 +1945,14 @@ again: */ ret = btrfs_add_free_space(block_group, old_start, offset - old_start); - WARN_ON(ret); + WARN_ON(ret); /* -ENOMEM */ goto out; } ret = remove_from_bitmap(ctl, info, &offset, &bytes); if (ret == -EAGAIN) goto again; - BUG_ON(ret); + BUG_ON(ret); /* logic error */ out_lock: spin_unlock(&ctl->tree_lock); out: @@ -2346,7 +2343,7 @@ again: rb_erase(&entry->offset_index, &ctl->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 1); - BUG_ON(ret); + BUG_ON(ret); /* -EEXIST; Logic error */ trace_btrfs_setup_cluster(block_group, cluster, total_found * block_group->sectorsize, 1); @@ -2439,7 +2436,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); total_size += entry->bytes; - BUG_ON(ret); + BUG_ON(ret); /* -EEXIST; Logic error */ } while (node && entry != last); cluster->max_size = max_extent; @@ -2830,6 +2827,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) int ret; ret = search_bitmap(ctl, entry, &offset, &count); + /* Logic error; Should be empty if it can't find anything */ BUG_ON(ret); ino = offset; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index baa74f3db69..a13cf1a96c7 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -19,6 +19,7 @@ #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "print-tree.h" static int find_name_in_backref(struct btrfs_path *path, const char *name, int name_len, struct btrfs_inode_ref **ref_ret) @@ -128,13 +129,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); - ret = btrfs_truncate_item(trans, root, path, + btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1); out: btrfs_free_path(path); return ret; } +/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, @@ -165,7 +167,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, goto out; old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - ret = btrfs_extend_item(trans, root, path, ins_len); + btrfs_extend_item(trans, root, path, ins_len); ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index ee15d88b33d..b1a1c929ba8 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root) tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", root->root_key.objectid); - BUG_ON(IS_ERR(tsk)); + BUG_ON(IS_ERR(tsk)); /* -ENOMEM */ } int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) @@ -271,7 +271,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) break; info = rb_entry(n, struct btrfs_free_space, offset_index); - BUG_ON(info->bitmap); + BUG_ON(info->bitmap); /* Logic error */ if (info->offset > root->cache_progress) goto free; @@ -439,17 +439,16 @@ int btrfs_save_ino_cache(struct btrfs_root *root, if (ret) goto out; trace_btrfs_space_reservation(root->fs_info, "ino_cache", - (u64)(unsigned long)trans, - trans->bytes_reserved, 1); + trans->transid, trans->bytes_reserved, 1); again: inode = lookup_free_ino_inode(root, path); - if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { + if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) { ret = PTR_ERR(inode); goto out_release; } if (IS_ERR(inode)) { - BUG_ON(retry); + BUG_ON(retry); /* Logic error */ retry = true; ret = create_free_ino_inode(root, trans, path); @@ -460,12 +459,17 @@ again: BTRFS_I(inode)->generation = 0; ret = btrfs_update_inode(trans, root, inode); - WARN_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_put; + } if (i_size_read(inode) > 0) { ret = btrfs_truncate_free_space_cache(root, trans, path, inode); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto out_put; + } } spin_lock(&root->cache_lock); @@ -502,8 +506,7 @@ out_put: iput(inode); out_release: trace_btrfs_space_reservation(root->fs_info, "ino_cache", - (u64)(unsigned long)trans, - trans->bytes_reserved, 0); + trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: trans->block_rsv = rsv; @@ -532,7 +535,7 @@ static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) goto error; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ if (path->slots[0] > 0) { slot = path->slots[0] - 1; l = path->nodes[0]; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 892b34785cc..61b16c641ce 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -150,7 +150,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, inode_add_bytes(inode, size); ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); - BUG_ON(ret); if (ret) { err = ret; goto fail; @@ -173,9 +172,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_CACHE_SIZE); - kaddr = kmap_atomic(cpage, KM_USER0); + kaddr = kmap_atomic(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); i++; ptr += cur_size; @@ -187,10 +186,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, page = find_get_page(inode->i_mapping, start >> PAGE_CACHE_SHIFT); btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); offset = start & (PAGE_CACHE_SIZE - 1); write_extent_buffer(leaf, kaddr + offset, ptr, size); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); page_cache_release(page); } btrfs_mark_buffer_dirty(leaf); @@ -206,9 +205,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, * could end up racing with unlink. */ BTRFS_I(inode)->disk_i_size = inode->i_size; - btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, inode); - return 0; + return ret; fail: btrfs_free_path(path); return err; @@ -250,14 +249,18 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, ret = btrfs_drop_extents(trans, inode, start, aligned_end, &hint_byte, 1); - BUG_ON(ret); + if (ret) + return ret; if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); ret = insert_inline_extent(trans, root, inode, start, inline_len, compressed_size, compress_type, compressed_pages); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; @@ -293,7 +296,7 @@ static noinline int add_async_extent(struct async_cow *cow, struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); - BUG_ON(!async_extent); + BUG_ON(!async_extent); /* -ENOMEM */ async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; @@ -344,8 +347,9 @@ static noinline int compress_file_range(struct inode *inode, int will_compress; int compress_type = root->fs_info->compress_type; - /* if this is a small write inside eof, kick off a defragbot */ - if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024) + /* if this is a small write inside eof, kick off a defrag */ + if ((end - start + 1) < 16 * 1024 && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); actual_end = min_t(u64, isize, end + 1); @@ -422,10 +426,10 @@ again: * sending it down to disk */ if (offset) { - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); } will_compress = 1; } @@ -433,7 +437,11 @@ again: cont: if (start == 0) { trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto cleanup_and_out; + } trans->block_rsv = &root->fs_info->delalloc_block_rsv; /* lets try to make an inline extent */ @@ -450,11 +458,11 @@ cont: total_compressed, compress_type, pages); } - if (ret == 0) { + if (ret <= 0) { /* - * inline extent creation worked, we don't need - * to create any more async work items. Unlock - * and free up our temp pages. + * inline extent creation worked or returned error, + * we don't need to create any more async work items. + * Unlock and free up our temp pages. */ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, @@ -547,7 +555,7 @@ cleanup_and_bail_uncompressed: } out: - return 0; + return ret; free_pages_out: for (i = 0; i < nr_pages_ret; i++) { @@ -557,6 +565,20 @@ free_pages_out: kfree(pages); goto out; + +cleanup_and_out: + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_DIRTY | + EXTENT_CLEAR_DELALLOC | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + if (!trans || IS_ERR(trans)) + btrfs_error(root->fs_info, ret, "Failed to join transaction"); + else + btrfs_abort_transaction(trans, root, ret); + goto free_pages_out; } /* @@ -597,7 +619,7 @@ retry: lock_extent(io_tree, async_extent->start, async_extent->start + - async_extent->ram_size - 1, GFP_NOFS); + async_extent->ram_size - 1); /* allocate blocks */ ret = cow_file_range(inode, async_cow->locked_page, @@ -606,6 +628,8 @@ retry: async_extent->ram_size - 1, &page_started, &nr_written, 0); + /* JDM XXX */ + /* * if page_started, cow_file_range inserted an * inline extent and took care of all the unlocking @@ -625,18 +649,21 @@ retry: } lock_extent(io_tree, async_extent->start, - async_extent->start + async_extent->ram_size - 1, - GFP_NOFS); + async_extent->start + async_extent->ram_size - 1); trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_reserve_extent(trans, root, + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + } else { + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + ret = btrfs_reserve_extent(trans, root, async_extent->compressed_size, async_extent->compressed_size, - 0, alloc_hint, - (u64)-1, &ins, 1); - btrfs_end_transaction(trans, root); + 0, alloc_hint, &ins, 1); + if (ret) + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + } if (ret) { int i; @@ -649,8 +676,10 @@ retry: async_extent->pages = NULL; unlock_extent(io_tree, async_extent->start, async_extent->start + - async_extent->ram_size - 1, GFP_NOFS); - goto retry; + async_extent->ram_size - 1); + if (ret == -ENOSPC) + goto retry; + goto out_free; /* JDM: Requeue? */ } /* @@ -662,7 +691,7 @@ retry: async_extent->ram_size - 1, 0); em = alloc_extent_map(); - BUG_ON(!em); + BUG_ON(!em); /* -ENOMEM */ em->start = async_extent->start; em->len = async_extent->ram_size; em->orig_start = em->start; @@ -694,7 +723,7 @@ retry: ins.offset, BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ /* * clear dirty, set writeback and unlock the pages. @@ -716,13 +745,17 @@ retry: ins.offset, async_extent->pages, async_extent->nr_pages); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ alloc_hint = ins.objectid + ins.offset; kfree(async_extent); cond_resched(); } - - return 0; + ret = 0; +out: + return ret; +out_free: + kfree(async_extent); + goto out; } static u64 get_extent_allocation_hint(struct inode *inode, u64 start, @@ -791,7 +824,18 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(btrfs_is_free_space_inode(root, inode)); trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + return PTR_ERR(trans); + } trans->block_rsv = &root->fs_info->delalloc_block_rsv; num_bytes = (end - start + blocksize) & ~(blocksize - 1); @@ -800,7 +844,8 @@ static noinline int cow_file_range(struct inode *inode, ret = 0; /* if this is a small write inside eof, kick off defrag */ - if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024) + if (num_bytes < 64 * 1024 && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(trans, inode); if (start == 0) { @@ -821,8 +866,10 @@ static noinline int cow_file_range(struct inode *inode, *nr_written = *nr_written + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; *page_started = 1; - ret = 0; goto out; + } else if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out_unlock; } } @@ -838,11 +885,14 @@ static noinline int cow_file_range(struct inode *inode, cur_alloc_size = disk_num_bytes; ret = btrfs_reserve_extent(trans, root, cur_alloc_size, root->sectorsize, 0, alloc_hint, - (u64)-1, &ins, 1); - BUG_ON(ret); + &ins, 1); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out_unlock; + } em = alloc_extent_map(); - BUG_ON(!em); + BUG_ON(!em); /* -ENOMEM */ em->start = start; em->orig_start = em->start; ram_size = ins.offset; @@ -868,13 +918,16 @@ static noinline int cow_file_range(struct inode *inode, cur_alloc_size = ins.offset; ret = btrfs_add_ordered_extent(inode, start, ins.objectid, ram_size, cur_alloc_size, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_unlock; + } } if (disk_num_bytes < cur_alloc_size) @@ -899,11 +952,23 @@ static noinline int cow_file_range(struct inode *inode, alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; } -out: ret = 0; +out: btrfs_end_transaction(trans, root); return ret; +out_unlock: + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + + goto out; } /* @@ -969,7 +1034,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, 1, 0, NULL, GFP_NOFS); while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); - BUG_ON(!async_cow); + BUG_ON(!async_cow); /* -ENOMEM */ async_cow->inode = inode; async_cow->root = root; async_cow->locked_page = locked_page; @@ -1060,7 +1125,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 disk_bytenr; u64 num_bytes; int extent_type; - int ret; + int ret, err; int type; int nocow; int check_prev = 1; @@ -1078,7 +1143,11 @@ static noinline int run_delalloc_nocow(struct inode *inode, else trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; cow_start = (u64)-1; @@ -1086,7 +1155,10 @@ static noinline int run_delalloc_nocow(struct inode *inode, while (1) { ret = btrfs_lookup_file_extent(trans, root, path, ino, cur_offset, 0); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto error; + } if (ret > 0 && path->slots[0] > 0 && check_prev) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, @@ -1100,8 +1172,10 @@ next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) - BUG_ON(1); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto error; + } if (ret > 0) break; leaf = path->nodes[0]; @@ -1189,7 +1263,10 @@ out_check: ret = cow_file_range(inode, locked_page, cow_start, found_key.offset - 1, page_started, nr_written, 1); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error; + } cow_start = (u64)-1; } @@ -1198,7 +1275,7 @@ out_check: struct extent_map_tree *em_tree; em_tree = &BTRFS_I(inode)->extent_tree; em = alloc_extent_map(); - BUG_ON(!em); + BUG_ON(!em); /* -ENOMEM */ em->start = cur_offset; em->orig_start = em->start; em->len = num_bytes; @@ -1224,13 +1301,16 @@ out_check: ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, num_bytes, num_bytes, type); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, cur_offset, num_bytes); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error; + } } extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, @@ -1249,18 +1329,23 @@ out_check: if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, end, page_started, nr_written, 1); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error; + } } +error: if (nolock) { - ret = btrfs_end_transaction_nolock(trans, root); - BUG_ON(ret); + err = btrfs_end_transaction_nolock(trans, root); } else { - ret = btrfs_end_transaction(trans, root); - BUG_ON(ret); + err = btrfs_end_transaction(trans, root); } + if (!ret) + ret = err; + btrfs_free_path(path); - return 0; + return ret; } /* @@ -1425,10 +1510,11 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, map_length = length; ret = btrfs_map_block(map_tree, READ, logical, &map_length, NULL, 0); - + /* Will always return 0 or 1 with map_multi == NULL */ + BUG_ON(ret < 0); if (map_length < length + size) return 1; - return ret; + return 0; } /* @@ -1448,7 +1534,7 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, int ret = 0; ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ return 0; } @@ -1479,14 +1565,16 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; int skip_sum; + int metadata = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; if (btrfs_is_free_space_inode(root, inode)) - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2); - else - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - BUG_ON(ret); + metadata = 2; + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); + if (ret) + return ret; if (!(rw & REQ_WRITE)) { if (bio_flags & EXTENT_BIO_COMPRESSED) { @@ -1571,7 +1659,7 @@ again: page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, - &cached_state, GFP_NOFS); + &cached_state); /* already ordered? We're done */ if (PagePrivate2(page)) @@ -1675,13 +1763,15 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, */ ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, &hint, 0); - BUG_ON(ret); + if (ret) + goto out; ins.objectid = btrfs_ino(inode); ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); - BUG_ON(ret); + if (ret) + goto out; leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -1709,10 +1799,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_alloc_reserved_file_extent(trans, root, root->root_key.objectid, btrfs_ino(inode), file_pos, &ins); - BUG_ON(ret); +out: btrfs_free_path(path); - return 0; + return ret; } /* @@ -1740,35 +1830,41 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) end - start + 1); if (!ret) return 0; - BUG_ON(!ordered_extent); + BUG_ON(!ordered_extent); /* Logic error */ nolock = btrfs_is_free_space_inode(root, inode); if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { - BUG_ON(!list_empty(&ordered_extent->list)); + BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret) { if (nolock) trans = btrfs_join_transaction_nolock(root); else trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode_fallback(trans, root, inode); - BUG_ON(ret); + if (ret) /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, root, ret); } goto out; } lock_extent_bits(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, - 0, &cached_state, GFP_NOFS); + 0, &cached_state); if (nolock) trans = btrfs_join_transaction_nolock(root); else trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out_unlock; + } trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) @@ -1779,7 +1875,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len); - BUG_ON(ret); } else { BUG_ON(root == root->fs_info->tree_root); ret = insert_reserved_file_extent(trans, inode, @@ -1793,11 +1888,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ordered_extent->file_offset, ordered_extent->len); - BUG_ON(ret); } unlock_extent_cached(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, &cached_state, GFP_NOFS); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); @@ -1805,7 +1903,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { ret = btrfs_update_inode_fallback(trans, root, inode); - BUG_ON(ret); + if (ret) { /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, root, ret); + goto out; + } } ret = 0; out: @@ -1824,6 +1925,11 @@ out: btrfs_put_ordered_extent(ordered_extent); return 0; +out_unlock: + unlock_extent_cached(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, &cached_state, GFP_NOFS); + goto out; } static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, @@ -1841,7 +1947,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, * extent_io.c will try to find good copies for us. */ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state) + struct extent_state *state, int mirror) { size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); struct inode *inode = page->mapping->host; @@ -1873,7 +1979,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, } else { ret = get_state_private(io_tree, start, &private); } - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); if (ret) goto zeroit; @@ -1882,7 +1988,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, if (csum != private) goto zeroit; - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); good: return 0; @@ -1894,7 +2000,7 @@ zeroit: (unsigned long long)private); memset(kaddr + offset, 1, end - start + 1); flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); if (private == 0) return 0; return -EIO; @@ -1905,6 +2011,8 @@ struct delayed_iput { struct inode *inode; }; +/* JDM: If this is fs-wide, why can't we add a pointer to + * btrfs_inode instead and avoid the allocation? */ void btrfs_add_delayed_iput(struct inode *inode) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; @@ -2051,20 +2159,27 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) /* grab metadata reservation from transaction handle */ if (reserve) { ret = btrfs_orphan_reserve_metadata(trans, inode); - BUG_ON(ret); + BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */ } /* insert an orphan item to track this unlinked/truncated file */ if (insert >= 1) { ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); - BUG_ON(ret && ret != -EEXIST); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + ret = 0; } /* insert an orphan item to track subvolume contains orphan files */ if (insert >= 2) { ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); - BUG_ON(ret); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } } return 0; } @@ -2094,7 +2209,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) if (trans && delete_item) { ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ } if (release_rsv) @@ -2228,7 +2343,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) } ret = btrfs_del_orphan_item(trans, root, found_key.objectid); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ btrfs_end_transaction(trans, root); continue; } @@ -2610,16 +2725,22 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, printk(KERN_INFO "btrfs failed to delete reference to %.*s, " "inode %llu parent %llu\n", name_len, name, (unsigned long long)ino, (unsigned long long)dir_ino); + btrfs_abort_transaction(trans, root, ret); goto err; } ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto err; + } ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dir_ino); - BUG_ON(ret != 0 && ret != -ENOENT); + if (ret != 0 && ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + goto err; + } ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); @@ -2777,7 +2898,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, err = ret; goto out; } - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ if (check_path_shared(root, path)) goto out; btrfs_release_path(path); @@ -2810,7 +2931,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, err = PTR_ERR(ref); goto out; } - BUG_ON(!ref); + BUG_ON(!ref); /* Logic error */ if (check_path_shared(root, path)) goto out; index = btrfs_inode_ref_index(path->nodes[0], ref); @@ -2917,23 +3038,42 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, name_len, -1); - BUG_ON(IS_ERR_OR_NULL(di)); + if (IS_ERR_OR_NULL(di)) { + if (!di) + ret = -ENOENT; + else + ret = PTR_ERR(di); + goto out; + } leaf = path->nodes[0]; btrfs_dir_item_key_to_cpu(leaf, di, &key); WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ret = btrfs_delete_one_dir_name(trans, root, path, di); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } btrfs_release_path(path); ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, objectid, root->root_key.objectid, dir_ino, &index, name, name_len); if (ret < 0) { - BUG_ON(ret != -ENOENT); + if (ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } di = btrfs_search_dir_index_item(root, path, dir_ino, name, name_len); - BUG_ON(IS_ERR_OR_NULL(di)); + if (IS_ERR_OR_NULL(di)) { + if (!di) + ret = -ENOENT; + else + ret = PTR_ERR(di); + btrfs_abort_transaction(trans, root, ret); + goto out; + } leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); @@ -2943,15 +3083,19 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } btrfs_i_size_write(dir, dir->i_size - name_len * 2); dir->i_mtime = dir->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, dir); - BUG_ON(ret); - + if (ret) + btrfs_abort_transaction(trans, root, ret); +out: btrfs_free_path(path); - return 0; + return ret; } static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) @@ -3161,8 +3305,8 @@ search_again: } size = btrfs_file_extent_calc_inline_size(size); - ret = btrfs_truncate_item(trans, root, path, - size, 1); + btrfs_truncate_item(trans, root, path, + size, 1); } else if (root->ref_cows) { inode_sub_bytes(inode, item_end + 1 - found_key.offset); @@ -3210,7 +3354,11 @@ delete: ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, + root, ret); + goto error; + } pending_del_nr = 0; } btrfs_release_path(path); @@ -3223,8 +3371,10 @@ out: if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, root, ret); } +error: btrfs_free_path(path); return err; } @@ -3282,8 +3432,7 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, - GFP_NOFS); + lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); set_page_extent_mapped(page); ordered = btrfs_lookup_ordered_extent(inode, page_start); @@ -3359,7 +3508,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) btrfs_wait_ordered_range(inode, hole_start, block_end - hole_start); lock_extent_bits(io_tree, hole_start, block_end - 1, 0, - &cached_state, GFP_NOFS); + &cached_state); ordered = btrfs_lookup_ordered_extent(inode, hole_start); if (!ordered) break; @@ -3372,7 +3521,10 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) while (1) { em = btrfs_get_extent(inode, NULL, 0, cur_offset, block_end - cur_offset, 0); - BUG_ON(IS_ERR_OR_NULL(em)); + if (IS_ERR(em)) { + err = PTR_ERR(em); + break; + } last_byte = min(extent_map_end(em), block_end); last_byte = (last_byte + mask) & ~mask; if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { @@ -3389,7 +3541,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) cur_offset + hole_size, &hint_byte, 1); if (err) { - btrfs_update_inode(trans, root, inode); + btrfs_abort_transaction(trans, root, err); btrfs_end_transaction(trans, root); break; } @@ -3399,7 +3551,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) 0, hole_size, 0, hole_size, 0, 0, 0); if (err) { - btrfs_update_inode(trans, root, inode); + btrfs_abort_transaction(trans, root, err); btrfs_end_transaction(trans, root); break; } @@ -3779,7 +3931,7 @@ static void inode_tree_del(struct inode *inode) } } -int btrfs_invalidate_inodes(struct btrfs_root *root) +void btrfs_invalidate_inodes(struct btrfs_root *root) { struct rb_node *node; struct rb_node *prev; @@ -3839,7 +3991,6 @@ again: node = rb_next(node); } spin_unlock(&root->inode_lock); - return 0; } static int btrfs_init_locked_inode(struct inode *inode, void *p) @@ -3918,7 +4069,7 @@ static struct inode *new_simple_dir(struct super_block *s, BTRFS_I(inode)->dummy_inode = 1; inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; - inode->i_op = &simple_dir_inode_operations; + inode->i_op = &btrfs_dir_ro_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; @@ -3989,14 +4140,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) static int btrfs_dentry_delete(const struct dentry *dentry) { struct btrfs_root *root; + struct inode *inode = dentry->d_inode; - if (!dentry->d_inode && !IS_ROOT(dentry)) - dentry = dentry->d_parent; + if (!inode && !IS_ROOT(dentry)) + inode = dentry->d_parent->d_inode; - if (dentry->d_inode) { - root = BTRFS_I(dentry->d_inode)->root; + if (inode) { + root = BTRFS_I(inode)->root; if (btrfs_root_refs(&root->root_item) == 0) return 1; + + if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return 1; } return 0; } @@ -4037,7 +4192,6 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, struct btrfs_path *path; struct list_head ins_list; struct list_head del_list; - struct qstr q; int ret; struct extent_buffer *leaf; int slot; @@ -4128,7 +4282,6 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, while (di_cur < di_total) { struct btrfs_key location; - struct dentry *tmp; if (verify_dir_item(root, leaf, di)) break; @@ -4149,35 +4302,15 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; btrfs_dir_item_key_to_cpu(leaf, di, &location); - q.name = name_ptr; - q.len = name_len; - q.hash = full_name_hash(q.name, q.len); - tmp = d_lookup(filp->f_dentry, &q); - if (!tmp) { - struct btrfs_key *newkey; - - newkey = kzalloc(sizeof(struct btrfs_key), - GFP_NOFS); - if (!newkey) - goto no_dentry; - tmp = d_alloc(filp->f_dentry, &q); - if (!tmp) { - kfree(newkey); - dput(tmp); - goto no_dentry; - } - memcpy(newkey, &location, - sizeof(struct btrfs_key)); - tmp->d_fsdata = newkey; - tmp->d_flags |= DCACHE_NEED_LOOKUP; - d_rehash(tmp); - dput(tmp); - } else { - dput(tmp); - } -no_dentry: + /* is this a reference to our own snapshot? If so - * skip it + * skip it. + * + * In contrast to old kernels, we insert the snapshot's + * dir item and dir index after it has been created, so + * we won't find a reference to our own snapshot. We + * still keep the following code for backward + * compatibility. */ if (location.type == BTRFS_ROOT_ITEM_KEY && location.objectid == root->root_key.objectid) { @@ -4581,18 +4714,26 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, parent_ino, index); } - if (ret == 0) { - ret = btrfs_insert_dir_item(trans, root, name, name_len, - parent_inode, &key, - btrfs_inode_type(inode), index); - if (ret) - goto fail_dir_item; + /* Nothing to clean up yet */ + if (ret) + return ret; - btrfs_i_size_write(parent_inode, parent_inode->i_size + - name_len * 2); - parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; - ret = btrfs_update_inode(trans, root, parent_inode); + ret = btrfs_insert_dir_item(trans, root, name, name_len, + parent_inode, &key, + btrfs_inode_type(inode), index); + if (ret == -EEXIST) + goto fail_dir_item; + else if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; } + + btrfs_i_size_write(parent_inode, parent_inode->i_size + + name_len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, parent_inode); + if (ret) + btrfs_abort_transaction(trans, root, ret); return ret; fail_dir_item: @@ -4806,7 +4947,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } else { struct dentry *parent = dentry->d_parent; err = btrfs_update_inode(trans, root, inode); - BUG_ON(err); + if (err) + goto fail; d_instantiate(dentry, inode); btrfs_log_new_name(trans, inode, NULL, parent); } @@ -4937,12 +5079,12 @@ static noinline int uncompress_inline(struct btrfs_path *path, ret = btrfs_decompress(compress_type, tmp, page, extent_offset, inline_size, max_size); if (ret) { - char *kaddr = kmap_atomic(page, KM_USER0); + char *kaddr = kmap_atomic(page); unsigned long copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, max_size - extent_offset); memset(kaddr + pg_offset, 0, copy_size); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); } kfree(tmp); return 0; @@ -5137,7 +5279,7 @@ again: ret = uncompress_inline(path, inode, page, pg_offset, extent_offset, item); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } else { map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, @@ -5252,6 +5394,7 @@ out: free_extent_map(em); return ERR_PTR(err); } + BUG_ON(!em); /* Error is always set */ return em; } @@ -5414,7 +5557,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, alloc_hint = get_extent_allocation_hint(inode, start, len); ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, - alloc_hint, (u64)-1, &ins, 1); + alloc_hint, &ins, 1); if (ret) { em = ERR_PTR(ret); goto out; @@ -5602,7 +5745,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, free_extent_map(em); /* DIO will do one hole at a time, so just unlock a sector */ unlock_extent(&BTRFS_I(inode)->io_tree, start, - start + root->sectorsize - 1, GFP_NOFS); + start + root->sectorsize - 1); return 0; } @@ -5719,11 +5862,11 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) unsigned long flags; local_irq_save(flags); - kaddr = kmap_atomic(page, KM_IRQ0); + kaddr = kmap_atomic(page); csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, csum, bvec->bv_len); btrfs_csum_final(csum, (char *)&csum); - kunmap_atomic(kaddr, KM_IRQ0); + kunmap_atomic(kaddr); local_irq_restore(flags); flush_dcache_page(bvec->bv_page); @@ -5743,7 +5886,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) } while (bvec <= bvec_end); unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, - dip->logical_offset + dip->bytes - 1, GFP_NOFS); + dip->logical_offset + dip->bytes - 1); bio->bi_private = dip->private; kfree(dip->csums); @@ -5794,7 +5937,7 @@ again: lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, ordered->file_offset + ordered->len - 1, 0, - &cached_state, GFP_NOFS); + &cached_state); if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { ret = btrfs_mark_extent_written(trans, inode, @@ -5868,7 +6011,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, int ret; struct btrfs_root *root = BTRFS_I(inode)->root; ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ return 0; } @@ -6209,7 +6352,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, while (1) { lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - 0, &cached_state, GFP_NOFS); + 0, &cached_state); /* * We're concerned with the entire range that we're going to be * doing DIO to, so we need to make sure theres no ordered @@ -6233,7 +6376,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, if (writing) { write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING; ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - EXTENT_DELALLOC, 0, NULL, &cached_state, + EXTENT_DELALLOC, NULL, &cached_state, GFP_NOFS); if (ret) { clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, @@ -6363,8 +6506,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) btrfs_releasepage(page, GFP_NOFS); return; } - lock_extent_bits(tree, page_start, page_end, 0, &cached_state, - GFP_NOFS); + lock_extent_bits(tree, page_start, page_end, 0, &cached_state); ordered = btrfs_lookup_ordered_extent(page->mapping->host, page_offset(page)); if (ordered) { @@ -6386,8 +6528,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) } btrfs_put_ordered_extent(ordered); cached_state = NULL; - lock_extent_bits(tree, page_start, page_end, 0, &cached_state, - GFP_NOFS); + lock_extent_bits(tree, page_start, page_end, 0, &cached_state); } clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | @@ -6462,8 +6603,7 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, - GFP_NOFS); + lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); set_page_extent_mapped(page); /* @@ -6737,10 +6877,9 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, new_root, inode); - BUG_ON(err); iput(inode); - return 0; + return err; } struct inode *btrfs_alloc_inode(struct super_block *sb) @@ -6783,6 +6922,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_map_tree_init(&ei->extent_tree); extent_io_tree_init(&ei->io_tree, &inode->i_data); extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); + ei->io_tree.track_uptodate = 1; + ei->io_failure_tree.track_uptodate = 1; mutex_init(&ei->log_mutex); mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); @@ -7072,7 +7213,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!ret) ret = btrfs_update_inode(trans, root, old_inode); } - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } if (new_inode) { new_inode->i_ctime = CURRENT_TIME; @@ -7090,11 +7234,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dentry->d_name.name, new_dentry->d_name.len); } - BUG_ON(ret); - if (new_inode->i_nlink == 0) { + if (!ret && new_inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, new_dentry->d_inode); BUG_ON(ret); } + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } } fixup_inode_flags(new_dir, old_inode); @@ -7102,7 +7249,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, ret = btrfs_add_link(trans, new_dir, old_inode, new_dentry->d_name.name, new_dentry->d_name.len, 0, index); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { struct dentry *parent = new_dentry->d_parent; @@ -7315,7 +7465,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, - 0, *alloc_hint, (u64)-1, &ins, 1); + 0, *alloc_hint, &ins, 1); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); @@ -7327,7 +7477,12 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, ins.offset, ins.offset, ins.offset, 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + if (own_trans) + btrfs_end_transaction(trans, root); + break; + } btrfs_drop_extent_cache(inode, cur_offset, cur_offset + ins.offset -1, 0); @@ -7349,7 +7504,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); + + if (ret) { + btrfs_abort_transaction(trans, root, ret); + if (own_trans) + btrfs_end_transaction(trans, root); + break; + } if (own_trans) btrfs_end_transaction(trans, root); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1b36f1932ef..14f8e1faa46 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -425,22 +425,37 @@ static noinline int create_subvol(struct btrfs_root *root, key.offset = (u64)-1; new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); - BUG_ON(IS_ERR(new_root)); + if (IS_ERR(new_root)) { + btrfs_abort_transaction(trans, root, PTR_ERR(new_root)); + ret = PTR_ERR(new_root); + goto fail; + } btrfs_record_root_in_trans(trans, new_root); ret = btrfs_create_subvol_root(trans, new_root, new_dirid); + if (ret) { + /* We potentially lose an unused inode item here */ + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + /* * insert the directory item */ ret = btrfs_set_inode_index(dir, &index); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } ret = btrfs_insert_dir_item(trans, root, name, namelen, dir, &key, BTRFS_FT_DIR, index); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto fail; + } btrfs_i_size_write(dir, dir->i_size + namelen * 2); ret = btrfs_update_inode(trans, root, dir); @@ -769,6 +784,31 @@ none: return -ENOENT; } +/* + * Validaty check of prev em and next em: + * 1) no prev/next em + * 2) prev/next em is an hole/inline extent + */ +static int check_adjacent_extents(struct inode *inode, struct extent_map *em) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *prev = NULL, *next = NULL; + int ret = 0; + + read_lock(&em_tree->lock); + prev = lookup_extent_mapping(em_tree, em->start - 1, (u64)-1); + next = lookup_extent_mapping(em_tree, em->start + em->len, (u64)-1); + read_unlock(&em_tree->lock); + + if ((!prev || prev->block_start >= EXTENT_MAP_LAST_BYTE) && + (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)) + ret = 1; + free_extent_map(prev); + free_extent_map(next); + + return ret; +} + static int should_defrag_range(struct inode *inode, u64 start, u64 len, int thresh, u64 *last_len, u64 *skip, u64 *defrag_end) @@ -797,17 +837,25 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, if (!em) { /* get the big lock and read metadata off disk */ - lock_extent(io_tree, start, start + len - 1, GFP_NOFS); + lock_extent(io_tree, start, start + len - 1); em = btrfs_get_extent(inode, NULL, 0, start, len, 0); - unlock_extent(io_tree, start, start + len - 1, GFP_NOFS); + unlock_extent(io_tree, start, start + len - 1); if (IS_ERR(em)) return 0; } /* this will cover holes, and inline extents */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { ret = 0; + goto out; + } + + /* If we have nothing to merge with us, just skip. */ + if (check_adjacent_extents(inode, em)) { + ret = 0; + goto out; + } /* * we hit a real extent, if it is big don't bother defragging it again @@ -815,6 +863,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh) ret = 0; +out: /* * last_len ends up being a counter of how many bytes we've defragged. * every time we choose not to defrag an extent, we reset *last_len @@ -856,6 +905,7 @@ static int cluster_pages_for_defrag(struct inode *inode, u64 isize = i_size_read(inode); u64 page_start; u64 page_end; + u64 page_cnt; int ret; int i; int i_done; @@ -864,19 +914,21 @@ static int cluster_pages_for_defrag(struct inode *inode, struct extent_io_tree *tree; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - if (isize == 0) - return 0; file_end = (isize - 1) >> PAGE_CACHE_SHIFT; + if (!isize || start_index > file_end) + return 0; + + page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); ret = btrfs_delalloc_reserve_space(inode, - num_pages << PAGE_CACHE_SHIFT); + page_cnt << PAGE_CACHE_SHIFT); if (ret) return ret; i_done = 0; tree = &BTRFS_I(inode)->io_tree; /* step one, lock all the pages */ - for (i = 0; i < num_pages; i++) { + for (i = 0; i < page_cnt; i++) { struct page *page; again: page = find_or_create_page(inode->i_mapping, @@ -887,10 +939,10 @@ again: page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; while (1) { - lock_extent(tree, page_start, page_end, GFP_NOFS); + lock_extent(tree, page_start, page_end); ordered = btrfs_lookup_ordered_extent(inode, page_start); - unlock_extent(tree, page_start, page_end, GFP_NOFS); + unlock_extent(tree, page_start, page_end); if (!ordered) break; @@ -898,6 +950,15 @@ again: btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); lock_page(page); + /* + * we unlocked the page above, so we need check if + * it was released or not. + */ + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } } if (!PageUptodate(page)) { @@ -911,15 +972,6 @@ again: } } - isize = i_size_read(inode); - file_end = (isize - 1) >> PAGE_CACHE_SHIFT; - if (!isize || page->index > file_end) { - /* whoops, we blew past eof, skip this page */ - unlock_page(page); - page_cache_release(page); - break; - } - if (page->mapping != inode->i_mapping) { unlock_page(page); page_cache_release(page); @@ -946,19 +998,18 @@ again: page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE; lock_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, 0, &cached_state, - GFP_NOFS); + page_start, page_end - 1, 0, &cached_state); clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, GFP_NOFS); - if (i_done != num_pages) { + if (i_done != page_cnt) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, - (num_pages - i_done) << PAGE_CACHE_SHIFT); + (page_cnt - i_done) << PAGE_CACHE_SHIFT); } @@ -983,7 +1034,7 @@ out: unlock_page(pages[i]); page_cache_release(pages[i]); } - btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT); + btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT); return ret; } @@ -1089,12 +1140,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (!(inode->i_sb->s_flags & MS_ACTIVE)) break; - if (!newer_than && - !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, - extent_thresh, - &last_len, &skip, - &defrag_end)) { + if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, extent_thresh, + &last_len, &skip, &defrag_end)) { unsigned long next; /* * the should_defrag function tells us how much to skip @@ -1123,17 +1171,24 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index += max_cluster; } + mutex_lock(&inode->i_mutex); ret = cluster_pages_for_defrag(inode, pages, i, cluster); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&inode->i_mutex); goto out_ra; + } defrag_count += ret; balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); + mutex_unlock(&inode->i_mutex); if (newer_than) { if (newer_off == (u64)-1) break; + if (ret > 0) + i += ret; + newer_off = max(newer_off + 1, (u64)i << PAGE_CACHE_SHIFT); @@ -1966,7 +2021,11 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dest->root_key.objectid, dentry->d_name.name, dentry->d_name.len); - BUG_ON(ret); + if (ret) { + err = ret; + btrfs_abort_transaction(trans, root, ret); + goto out_end_trans; + } btrfs_record_root_in_trans(trans, dest); @@ -1979,11 +2038,16 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, dest->root_key.objectid); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + err = ret; + goto out_end_trans; + } } - +out_end_trans: ret = btrfs_end_transaction(trans, root); - BUG_ON(ret); + if (ret && !err) + err = ret; inode->i_flags |= S_DEAD; out_up_write: up_write(&root->fs_info->subvol_sem); @@ -2198,7 +2262,10 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) di_args->bytes_used = dev->bytes_used; di_args->total_bytes = dev->total_bytes; memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); - strncpy(di_args->path, dev->name, sizeof(di_args->path)); + if (dev->name) + strncpy(di_args->path, dev->name, sizeof(di_args->path)); + else + di_args->path[0] = '\0'; out: if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) @@ -2326,13 +2393,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, another, and lock file content */ while (1) { struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + lock_extent(&BTRFS_I(src)->io_tree, off, off+len); ordered = btrfs_lookup_first_ordered_extent(src, off+len); if (!ordered && !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len, EXTENT_DELALLOC, 0, NULL)) break; - unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); if (ordered) btrfs_put_ordered_extent(ordered); btrfs_wait_ordered_range(src, off, len); @@ -2447,11 +2514,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, new_key.offset, new_key.offset + datal, &hint_byte, 1); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, + ret); + btrfs_end_transaction(trans, root); + goto out; + } ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, + ret); + btrfs_end_transaction(trans, root); + goto out; + } leaf = path->nodes[0]; slot = path->slots[0]; @@ -2478,7 +2555,15 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_ino(inode), new_key.offset - datao, 0); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, + root, + ret); + btrfs_end_transaction(trans, + root); + goto out; + + } } } else if (type == BTRFS_FILE_EXTENT_INLINE) { u64 skip = 0; @@ -2503,11 +2588,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, new_key.offset, new_key.offset + datal, &hint_byte, 1); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, + ret); + btrfs_end_transaction(trans, root); + goto out; + } ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, + ret); + btrfs_end_transaction(trans, root); + goto out; + } if (skip) { u32 start = @@ -2541,8 +2636,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_i_size_write(inode, endoff); ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); - btrfs_end_transaction(trans, root); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + ret = btrfs_end_transaction(trans, root); } next: btrfs_release_path(path); @@ -2551,7 +2650,7 @@ next: ret = 0; out: btrfs_release_path(path); - unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); out_unlock: mutex_unlock(&src->i_mutex); mutex_unlock(&inode->i_mutex); diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 4f69028a68c..086e6bdae1c 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -252,7 +252,7 @@ struct btrfs_data_container { struct btrfs_ioctl_ino_path_args { __u64 inum; /* in */ - __u32 size; /* in */ + __u64 size; /* in */ __u64 reserved[4]; /* struct btrfs_data_container *fspath; out */ __u64 fspath; /* out */ @@ -260,7 +260,7 @@ struct btrfs_ioctl_ino_path_args { struct btrfs_ioctl_logical_ino_args { __u64 logical; /* in */ - __u32 size; /* in */ + __u64 size; /* in */ __u64 reserved[4]; /* struct btrfs_data_container *inodes; out */ __u64 inodes; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 5e178d8f716..272f911203f 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -208,7 +208,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) * take a spinning write lock. This will wait for both * blocking readers or writers */ -int btrfs_tree_lock(struct extent_buffer *eb) +void btrfs_tree_lock(struct extent_buffer *eb) { again: wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); @@ -230,13 +230,12 @@ again: atomic_inc(&eb->spinning_writers); atomic_inc(&eb->write_locks); eb->lock_owner = current->pid; - return 0; } /* * drop a spinning or a blocking write lock. */ -int btrfs_tree_unlock(struct extent_buffer *eb) +void btrfs_tree_unlock(struct extent_buffer *eb) { int blockers = atomic_read(&eb->blocking_writers); @@ -255,7 +254,6 @@ int btrfs_tree_unlock(struct extent_buffer *eb) atomic_dec(&eb->spinning_writers); write_unlock(&eb->lock); } - return 0; } void btrfs_assert_tree_locked(struct extent_buffer *eb) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 17247ddb81a..ca52681e5f4 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -24,8 +24,8 @@ #define BTRFS_WRITE_LOCK_BLOCKING 3 #define BTRFS_READ_LOCK_BLOCKING 4 -int btrfs_tree_lock(struct extent_buffer *eb); -int btrfs_tree_unlock(struct extent_buffer *eb); +void btrfs_tree_lock(struct extent_buffer *eb); +void btrfs_tree_unlock(struct extent_buffer *eb); int btrfs_try_spin_lock(struct extent_buffer *eb); void btrfs_tree_read_lock(struct extent_buffer *eb); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index a178f5ebea7..743b86fa4fc 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -411,9 +411,9 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in, bytes = min_t(unsigned long, destlen, out_len - start_byte); - kaddr = kmap_atomic(dest_page, KM_USER0); + kaddr = kmap_atomic(dest_page); memcpy(kaddr, workspace->buf + start_byte, bytes); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); out: return ret; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a1c94042530..bbf6d0d9aeb 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -59,6 +59,14 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, return NULL; } +static void ordered_data_tree_panic(struct inode *inode, int errno, + u64 offset) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset " + "%llu\n", (unsigned long long)offset); +} + /* * look for a given offset in the tree, and if it can't be found return the * first lesser offset @@ -207,7 +215,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, spin_lock(&tree->lock); node = tree_insert(&tree->tree, file_offset, &entry->rb_node); - BUG_ON(node); + if (node) + ordered_data_tree_panic(inode, -EEXIST, file_offset); spin_unlock(&tree->lock); spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); @@ -215,7 +224,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, &BTRFS_I(inode)->root->fs_info->ordered_extents); spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); - BUG_ON(node); return 0; } @@ -249,9 +257,9 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, * when an ordered extent is finished. If the list covers more than one * ordered extent, it is split across multiples. */ -int btrfs_add_ordered_sum(struct inode *inode, - struct btrfs_ordered_extent *entry, - struct btrfs_ordered_sum *sum) +void btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum) { struct btrfs_ordered_inode_tree *tree; @@ -259,7 +267,6 @@ int btrfs_add_ordered_sum(struct inode *inode, spin_lock(&tree->lock); list_add_tail(&sum->list, &entry->list); spin_unlock(&tree->lock); - return 0; } /* @@ -384,7 +391,7 @@ out: * used to drop a reference on an ordered extent. This will free * the extent if the last reference is dropped */ -int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) +void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) { struct list_head *cur; struct btrfs_ordered_sum *sum; @@ -400,7 +407,6 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) } kfree(entry); } - return 0; } /* @@ -408,8 +414,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) * and you must wake_up entry->wait. You must hold the tree lock * while you call this function. */ -static int __btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) +static void __btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) { struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -436,35 +442,30 @@ static int __btrfs_remove_ordered_extent(struct inode *inode, list_del_init(&BTRFS_I(inode)->ordered_operations); } spin_unlock(&root->fs_info->ordered_extent_lock); - - return 0; } /* * remove an ordered extent from the tree. No references are dropped * but any waiters are woken. */ -int btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) +void btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) { struct btrfs_ordered_inode_tree *tree; - int ret; tree = &BTRFS_I(inode)->ordered_tree; spin_lock(&tree->lock); - ret = __btrfs_remove_ordered_extent(inode, entry); + __btrfs_remove_ordered_extent(inode, entry); spin_unlock(&tree->lock); wake_up(&entry->wait); - - return ret; } /* * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -int btrfs_wait_ordered_extents(struct btrfs_root *root, - int nocow_only, int delay_iput) +void btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput) { struct list_head splice; struct list_head *cur; @@ -512,7 +513,6 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, spin_lock(&root->fs_info->ordered_extent_lock); } spin_unlock(&root->fs_info->ordered_extent_lock); - return 0; } /* @@ -525,7 +525,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, * extra check to make sure the ordered operation list really is empty * before we return */ -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) { struct btrfs_inode *btrfs_inode; struct inode *inode; @@ -573,8 +573,6 @@ again: spin_unlock(&root->fs_info->ordered_extent_lock); mutex_unlock(&root->fs_info->ordered_operations_mutex); - - return 0; } /* @@ -609,7 +607,7 @@ void btrfs_start_ordered_extent(struct inode *inode, /* * Used to wait on ordered extents across a large range of bytes. */ -int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) { u64 end; u64 orig_end; @@ -664,7 +662,6 @@ again: schedule_timeout(1); goto again; } - return 0; } /* @@ -948,9 +945,8 @@ out: * If trans is not null, we'll do a friendly check for a transaction that * is already flushing things and force the IO down ourselves. */ -int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode) +void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) { u64 last_mod; @@ -961,7 +957,7 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, * commit, we can safely return without doing anything */ if (last_mod < root->fs_info->last_trans_committed) - return 0; + return; /* * the transaction is already committing. Just start the IO and @@ -969,7 +965,7 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, */ if (trans && root->fs_info->running_transaction->blocked) { btrfs_wait_ordered_range(inode, 0, (u64)-1); - return 0; + return; } spin_lock(&root->fs_info->ordered_extent_lock); @@ -978,6 +974,4 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, &root->fs_info->ordered_operations); } spin_unlock(&root->fs_info->ordered_extent_lock); - - return 0; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index ff1f69aa188..c355ad4dc1a 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -138,8 +138,8 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) t->last = NULL; } -int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); -int btrfs_remove_ordered_extent(struct inode *inode, +void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); +void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry); int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, @@ -154,14 +154,14 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type, int compress_type); -int btrfs_add_ordered_sum(struct inode *inode, - struct btrfs_ordered_extent *entry, - struct btrfs_ordered_sum *sum); +void btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, u64 file_offset); void btrfs_start_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry, int wait); -int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); +void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, @@ -170,10 +170,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); -int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode); -int btrfs_wait_ordered_extents(struct btrfs_root *root, - int nocow_only, int delay_iput); +void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); +void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); +void btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput); #endif diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index f8be250963a..24cad1695af 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -58,7 +58,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; - if (ret) { + if (ret) { /* JDM: Really? */ ret = -ENOENT; goto out; } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 22db04550f6..ac5d0108588 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -54,7 +54,6 @@ * than the 2 started one after another. */ -#define MAX_MIRRORS 2 #define MAX_IN_FLIGHT 6 struct reada_extctl { @@ -71,7 +70,7 @@ struct reada_extent { struct list_head extctl; struct kref refcnt; spinlock_t lock; - struct reada_zone *zones[MAX_MIRRORS]; + struct reada_zone *zones[BTRFS_MAX_MIRRORS]; int nzones; struct btrfs_device *scheduled_for; }; @@ -84,7 +83,8 @@ struct reada_zone { spinlock_t lock; int locked; struct btrfs_device *device; - struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */ + struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl + * self */ int ndevs; struct kref refcnt; }; @@ -250,14 +250,12 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, struct btrfs_bio *bbio) { int ret; - int looped = 0; struct reada_zone *zone; struct btrfs_block_group_cache *cache = NULL; u64 start; u64 end; int i; -again: zone = NULL; spin_lock(&fs_info->reada_lock); ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, @@ -274,9 +272,6 @@ again: spin_unlock(&fs_info->reada_lock); } - if (looped) - return NULL; - cache = btrfs_lookup_block_group(fs_info, logical); if (!cache) return NULL; @@ -307,13 +302,15 @@ again: ret = radix_tree_insert(&dev->reada_zones, (unsigned long)(zone->end >> PAGE_CACHE_SHIFT), zone); - spin_unlock(&fs_info->reada_lock); - if (ret) { + if (ret == -EEXIST) { kfree(zone); - looped = 1; - goto again; + ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, + logical >> PAGE_CACHE_SHIFT, 1); + if (ret == 1) + kref_get(&zone->refcnt); } + spin_unlock(&fs_info->reada_lock); return zone; } @@ -323,26 +320,26 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, struct btrfs_key *top, int level) { int ret; - int looped = 0; struct reada_extent *re = NULL; + struct reada_extent *re_exist = NULL; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct btrfs_bio *bbio = NULL; struct btrfs_device *dev; + struct btrfs_device *prev_dev; u32 blocksize; u64 length; int nzones = 0; int i; unsigned long index = logical >> PAGE_CACHE_SHIFT; -again: spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, index); if (re) kref_get(&re->refcnt); spin_unlock(&fs_info->reada_lock); - if (re || looped) + if (re) return re; re = kzalloc(sizeof(*re), GFP_NOFS); @@ -365,9 +362,9 @@ again: if (ret || !bbio || length < blocksize) goto error; - if (bbio->num_stripes > MAX_MIRRORS) { + if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { printk(KERN_ERR "btrfs readahead: more than %d copies not " - "supported", MAX_MIRRORS); + "supported", BTRFS_MAX_MIRRORS); goto error; } @@ -398,16 +395,31 @@ again: /* insert extent in reada_tree + all per-device trees, all or nothing */ spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&fs_info->reada_tree, index, re); + if (ret == -EEXIST) { + re_exist = radix_tree_lookup(&fs_info->reada_tree, index); + BUG_ON(!re_exist); + kref_get(&re_exist->refcnt); + spin_unlock(&fs_info->reada_lock); + goto error; + } if (ret) { spin_unlock(&fs_info->reada_lock); - if (ret != -ENOMEM) { - /* someone inserted the extent in the meantime */ - looped = 1; - } goto error; } + prev_dev = NULL; for (i = 0; i < nzones; ++i) { dev = bbio->stripes[i].dev; + if (dev == prev_dev) { + /* + * in case of DUP, just add the first zone. As both + * are on the same device, there's nothing to gain + * from adding both. + * Also, it wouldn't work, as the tree is per device + * and adding would fail with EEXIST + */ + continue; + } + prev_dev = dev; ret = radix_tree_insert(&dev->reada_extents, index, re); if (ret) { while (--i >= 0) { @@ -450,9 +462,7 @@ error: } kfree(bbio); kfree(re); - if (looped) - goto again; - return NULL; + return re_exist; } static void reada_kref_dummy(struct kref *kr) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8c1aae2c845..646ee21bb03 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -326,6 +326,19 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) return NULL; } +void backref_tree_panic(struct rb_node *rb_node, int errno, + u64 bytenr) +{ + + struct btrfs_fs_info *fs_info = NULL; + struct backref_node *bnode = rb_entry(rb_node, struct backref_node, + rb_node); + if (bnode->root) + fs_info = bnode->root->fs_info; + btrfs_panic(fs_info, errno, "Inconsistency in backref cache " + "found at offset %llu\n", (unsigned long long)bytenr); +} + /* * walk up backref nodes until reach node presents tree root */ @@ -452,7 +465,8 @@ static void update_backref_node(struct backref_cache *cache, rb_erase(&node->rb_node, &cache->rb_root); node->bytenr = bytenr; rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, bytenr); } /* @@ -999,7 +1013,8 @@ next: if (!cowonly) { rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, node->bytenr); list_add_tail(&node->lower, &cache->leaves); } @@ -1034,7 +1049,9 @@ next: if (!cowonly) { rb_node = tree_insert(&cache->rb_root, upper->bytenr, &upper->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, + upper->bytenr); } list_add_tail(&edge->list[UPPER], &upper->lower); @@ -1180,7 +1197,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, rb_node = tree_insert(&cache->rb_root, new_node->bytenr, &new_node->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, new_node->bytenr); if (!new_node->lowest) { list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { @@ -1203,14 +1221,15 @@ fail: /* * helper to add 'address of tree root -> reloc tree' mapping */ -static int __add_reloc_root(struct btrfs_root *root) +static int __must_check __add_reloc_root(struct btrfs_root *root) { struct rb_node *rb_node; struct mapping_node *node; struct reloc_control *rc = root->fs_info->reloc_ctl; node = kmalloc(sizeof(*node), GFP_NOFS); - BUG_ON(!node); + if (!node) + return -ENOMEM; node->bytenr = root->node->start; node->data = root; @@ -1219,7 +1238,12 @@ static int __add_reloc_root(struct btrfs_root *root) rb_node = tree_insert(&rc->reloc_root_tree.rb_root, node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); - BUG_ON(rb_node); + if (rb_node) { + kfree(node); + btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found " + "for start=%llu while inserting into relocation " + "tree\n"); + } list_add_tail(&root->root_list, &rc->reloc_roots); return 0; @@ -1252,9 +1276,12 @@ static int __update_reloc_root(struct btrfs_root *root, int del) rb_node = tree_insert(&rc->reloc_root_tree.rb_root, node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, node->bytenr); } else { + spin_lock(&root->fs_info->trans_lock); list_del_init(&root->root_list); + spin_unlock(&root->fs_info->trans_lock); kfree(node); } return 0; @@ -1334,6 +1361,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *reloc_root; struct reloc_control *rc = root->fs_info->reloc_ctl; int clear_rsv = 0; + int ret; if (root->reloc_root) { reloc_root = root->reloc_root; @@ -1353,7 +1381,8 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, if (clear_rsv) trans->block_rsv = NULL; - __add_reloc_root(reloc_root); + ret = __add_reloc_root(reloc_root); + BUG_ON(ret < 0); root->reloc_root = reloc_root; return 0; } @@ -1577,15 +1606,14 @@ int replace_file_extents(struct btrfs_trans_handle *trans, WARN_ON(!IS_ALIGNED(end, root->sectorsize)); end--; ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, - GFP_NOFS); + key.offset, end); if (!ret) continue; btrfs_drop_extent_cache(inode, key.offset, end, 1); unlock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, GFP_NOFS); + key.offset, end); } } @@ -1956,9 +1984,9 @@ static int invalidate_extent_cache(struct btrfs_root *root, } /* the lock_extent waits for readpage to complete */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, start, end); btrfs_drop_extent_cache(inode, start, end, 1); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end); } return 0; } @@ -2246,7 +2274,8 @@ again: } else { list_del_init(&reloc_root->root_list); } - btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); + ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); + BUG_ON(ret < 0); } if (found) { @@ -2862,12 +2891,12 @@ int prealloc_file_extent_cluster(struct inode *inode, else end = cluster->end - offset; - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, start, end); num_bytes = end + 1 - start; ret = btrfs_prealloc_file_range(inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end); if (ret) break; nr++; @@ -2899,7 +2928,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, start, end); while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); @@ -2910,7 +2939,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, } btrfs_drop_extent_cache(inode, start, end, 0); } - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end); return ret; } @@ -2990,8 +3019,7 @@ static int relocate_file_extent_cluster(struct inode *inode, page_start = (u64)page->index << PAGE_CACHE_SHIFT; page_end = page_start + PAGE_CACHE_SIZE - 1; - lock_extent(&BTRFS_I(inode)->io_tree, - page_start, page_end, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); set_page_extent_mapped(page); @@ -3007,7 +3035,7 @@ static int relocate_file_extent_cluster(struct inode *inode, set_page_dirty(page); unlock_extent(&BTRFS_I(inode)->io_tree, - page_start, page_end, GFP_NOFS); + page_start, page_end); unlock_page(page); page_cache_release(page); @@ -3154,7 +3182,8 @@ static int add_tree_block(struct reloc_control *rc, block->key_ready = 0; rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, block->bytenr); return 0; } @@ -3426,7 +3455,9 @@ static int find_data_references(struct reloc_control *rc, block->key_ready = 1; rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, + block->bytenr); } if (counted) added = 1; @@ -3782,7 +3813,7 @@ restart: ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5); if (ret < 0) { - if (ret != -EAGAIN) { + if (ret != -ENOSPC) { err = ret; WARN_ON(1); break; @@ -4073,10 +4104,11 @@ out: static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) { struct btrfs_trans_handle *trans; - int ret; + int ret, err; trans = btrfs_start_transaction(root->fs_info->tree_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); memset(&root->root_item.drop_progress, 0, sizeof(root->root_item.drop_progress)); @@ -4084,11 +4116,11 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) btrfs_set_root_refs(&root->root_item, 0); ret = btrfs_update_root(trans, root->fs_info->tree_root, &root->root_key, &root->root_item); - BUG_ON(ret); - ret = btrfs_end_transaction(trans, root->fs_info->tree_root); - BUG_ON(ret); - return 0; + err = btrfs_end_transaction(trans, root->fs_info->tree_root); + if (err) + return err; + return ret; } /* @@ -4156,7 +4188,11 @@ int btrfs_recover_relocation(struct btrfs_root *root) err = ret; goto out; } - mark_garbage_root(reloc_root); + ret = mark_garbage_root(reloc_root); + if (ret < 0) { + err = ret; + goto out; + } } } @@ -4202,13 +4238,19 @@ int btrfs_recover_relocation(struct btrfs_root *root) fs_root = read_fs_root(root->fs_info, reloc_root->root_key.offset); - BUG_ON(IS_ERR(fs_root)); + if (IS_ERR(fs_root)) { + err = PTR_ERR(fs_root); + goto out_free; + } - __add_reloc_root(reloc_root); + err = __add_reloc_root(reloc_root); + BUG_ON(err < 0); /* -ENOMEM or logic error */ fs_root->reloc_root = reloc_root; } - btrfs_commit_transaction(trans, rc->extent_root); + err = btrfs_commit_transaction(trans, rc->extent_root); + if (err) + goto out_free; merge_reloc_roots(rc); @@ -4218,7 +4260,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (IS_ERR(trans)) err = PTR_ERR(trans); else - btrfs_commit_transaction(trans, rc->extent_root); + err = btrfs_commit_transaction(trans, rc->extent_root); out_free: kfree(rc); out: @@ -4267,6 +4309,8 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, disk_bytenr + len - 1, &list, 0); + if (ret) + goto out; while (!list_empty(&list)) { sums = list_entry(list.next, struct btrfs_ordered_sum, list); @@ -4284,6 +4328,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) btrfs_add_ordered_sum(inode, ordered, sums); } +out: btrfs_put_ordered_extent(ordered); return ret; } @@ -4380,7 +4425,7 @@ void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, * called after snapshot is created. migrate block reservation * and create reloc root for the newly created snapshot */ -void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, +int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending) { struct btrfs_root *root = pending->root; @@ -4390,7 +4435,7 @@ void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, int ret; if (!root->reloc_root) - return; + return 0; rc = root->fs_info->reloc_ctl; rc->merging_rsv_size += rc->nodes_relocated; @@ -4399,18 +4444,21 @@ void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_block_rsv_migrate(&pending->block_rsv, rc->block_rsv, rc->nodes_relocated); - BUG_ON(ret); + if (ret) + return ret; } new_root = pending->snap; reloc_root = create_reloc_root(trans, root->reloc_root, new_root->root_key.objectid); + if (IS_ERR(reloc_root)) + return PTR_ERR(reloc_root); - __add_reloc_root(reloc_root); + ret = __add_reloc_root(reloc_root); + BUG_ON(ret < 0); new_root->reloc_root = reloc_root; - if (rc->create_reloc_tree) { + if (rc->create_reloc_tree) ret = clone_backref_node(trans, rc, root, reloc_root); - BUG_ON(ret); - } + return ret; } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index f4099904565..24fb8ce4e07 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -93,10 +93,14 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root unsigned long ptr; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, root, key, path, 0, 1); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); goto out; + } if (ret != 0) { btrfs_print_leaf(root, path->nodes[0]); @@ -116,13 +120,10 @@ out: return ret; } -int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, struct btrfs_root_item - *item) +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_root_item *item) { - int ret; - ret = btrfs_insert_item(trans, root, key, item, sizeof(*item)); - return ret; + return btrfs_insert_item(trans, root, key, item, sizeof(*item)); } /* @@ -384,6 +385,8 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root, * * For a back ref the root_id is the id of the subvol or snapshot and * ref_id is the id of the tree referencing it. + * + * Will return 0, -ENOMEM, or anything from the CoW path */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root, @@ -407,7 +410,11 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, sizeof(*ref) + name_len); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, tree_root, ret); + btrfs_free_path(path); + return ret; + } leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b9b84cdfc35..2f3d6f917fb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -36,37 +36,30 @@ * Future enhancements: * - In case an unrepairable extent is encountered, track which files are * affected and report them - * - In case of a read error on files with nodatasum, map the file and read - * the extent to trigger a writeback of the good copy * - track and record media errors, throw out bad devices * - add a mode to also read unallocated space */ -struct scrub_bio; -struct scrub_page; +struct scrub_block; struct scrub_dev; -static void scrub_bio_end_io(struct bio *bio, int err); -static void scrub_checksum(struct btrfs_work *work); -static int scrub_checksum_data(struct scrub_dev *sdev, - struct scrub_page *spag, void *buffer); -static int scrub_checksum_tree_block(struct scrub_dev *sdev, - struct scrub_page *spag, u64 logical, - void *buffer); -static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer); -static int scrub_fixup_check(struct scrub_bio *sbio, int ix); -static void scrub_fixup_end_io(struct bio *bio, int err); -static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, - struct page *page); -static void scrub_fixup(struct scrub_bio *sbio, int ix); #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ #define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ +#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ struct scrub_page { + struct scrub_block *sblock; + struct page *page; + struct block_device *bdev; u64 flags; /* extent flags */ u64 generation; - int mirror_num; - int have_csum; + u64 logical; + u64 physical; + struct { + unsigned int mirror_num:8; + unsigned int have_csum:1; + unsigned int io_error:1; + }; u8 csum[BTRFS_CSUM_SIZE]; }; @@ -77,12 +70,25 @@ struct scrub_bio { int err; u64 logical; u64 physical; - struct scrub_page spag[SCRUB_PAGES_PER_BIO]; - u64 count; + struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; + int page_count; int next_free; struct btrfs_work work; }; +struct scrub_block { + struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; + int page_count; + atomic_t outstanding_pages; + atomic_t ref_count; /* free mem on transition to zero */ + struct scrub_dev *sdev; + struct { + unsigned int header_error:1; + unsigned int checksum_error:1; + unsigned int no_io_error_seen:1; + }; +}; + struct scrub_dev { struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; struct btrfs_device *dev; @@ -96,6 +102,10 @@ struct scrub_dev { struct list_head csum_list; atomic_t cancel_req; int readonly; + int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ + u32 sectorsize; + u32 nodesize; + u32 leafsize; /* * statistics */ @@ -124,6 +134,43 @@ struct scrub_warning { int scratch_bufsize; }; + +static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); +static int scrub_setup_recheck_block(struct scrub_dev *sdev, + struct btrfs_mapping_tree *map_tree, + u64 length, u64 logical, + struct scrub_block *sblock); +static int scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size); +static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, + int is_metadata, int have_csum, + const u8 *csum, u64 generation, + u16 csum_size); +static void scrub_complete_bio_end_io(struct bio *bio, int err); +static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int force_write); +static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int page_num, int force_write); +static int scrub_checksum_data(struct scrub_block *sblock); +static int scrub_checksum_tree_block(struct scrub_block *sblock); +static int scrub_checksum_super(struct scrub_block *sblock); +static void scrub_block_get(struct scrub_block *sblock); +static void scrub_block_put(struct scrub_block *sblock); +static int scrub_add_page_to_bio(struct scrub_dev *sdev, + struct scrub_page *spage); +static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, + u64 physical, u64 flags, u64 gen, int mirror_num, + u8 *csum, int force); +static void scrub_bio_end_io(struct bio *bio, int err); +static void scrub_bio_end_io_worker(struct btrfs_work *work); +static void scrub_block_complete(struct scrub_block *sblock); + + static void scrub_free_csums(struct scrub_dev *sdev) { while (!list_empty(&sdev->csum_list)) { @@ -135,23 +182,6 @@ static void scrub_free_csums(struct scrub_dev *sdev) } } -static void scrub_free_bio(struct bio *bio) -{ - int i; - struct page *last_page = NULL; - - if (!bio) - return; - - for (i = 0; i < bio->bi_vcnt; ++i) { - if (bio->bi_io_vec[i].bv_page == last_page) - continue; - last_page = bio->bi_io_vec[i].bv_page; - __free_page(last_page); - } - bio_put(bio); -} - static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) { int i; @@ -159,13 +189,23 @@ static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) if (!sdev) return; + /* this can happen when scrub is cancelled */ + if (sdev->curr != -1) { + struct scrub_bio *sbio = sdev->bios[sdev->curr]; + + for (i = 0; i < sbio->page_count; i++) { + BUG_ON(!sbio->pagev[i]); + BUG_ON(!sbio->pagev[i]->page); + scrub_block_put(sbio->pagev[i]->sblock); + } + bio_put(sbio->bio); + } + for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { struct scrub_bio *sbio = sdev->bios[i]; if (!sbio) break; - - scrub_free_bio(sbio->bio); kfree(sbio); } @@ -179,11 +219,16 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) struct scrub_dev *sdev; int i; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + int pages_per_bio; + pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, + bio_get_nr_vecs(dev->bdev)); sdev = kzalloc(sizeof(*sdev), GFP_NOFS); if (!sdev) goto nomem; sdev->dev = dev; + sdev->pages_per_bio = pages_per_bio; + sdev->curr = -1; for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { struct scrub_bio *sbio; @@ -194,8 +239,8 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) sbio->index = i; sbio->sdev = sdev; - sbio->count = 0; - sbio->work.func = scrub_checksum; + sbio->page_count = 0; + sbio->work.func = scrub_bio_end_io_worker; if (i != SCRUB_BIOS_PER_DEV-1) sdev->bios[i]->next_free = i + 1; @@ -203,7 +248,9 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) sdev->bios[i]->next_free = -1; } sdev->first_free = 0; - sdev->curr = -1; + sdev->nodesize = dev->dev_root->nodesize; + sdev->leafsize = dev->dev_root->leafsize; + sdev->sectorsize = dev->dev_root->sectorsize; atomic_set(&sdev->in_flight, 0); atomic_set(&sdev->fixup_cnt, 0); atomic_set(&sdev->cancel_req, 0); @@ -294,10 +341,9 @@ err: return 0; } -static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, - int ix) +static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) { - struct btrfs_device *dev = sbio->sdev->dev; + struct btrfs_device *dev = sblock->sdev->dev; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; struct btrfs_path *path; struct btrfs_key found_key; @@ -316,8 +362,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); - swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9; - swarn.logical = sbio->logical + ix * PAGE_SIZE; + BUG_ON(sblock->page_count < 1); + swarn.sector = (sblock->pagev[0].physical) >> 9; + swarn.logical = sblock->pagev[0].logical; swarn.errstr = errstr; swarn.dev = dev; swarn.msg_bufsize = bufsize; @@ -342,7 +389,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, do { ret = tree_backref_for_extent(&ptr, eb, ei, item_size, &ref_root, &ref_level); - printk(KERN_WARNING "%s at logical %llu on dev %s, " + printk(KERN_WARNING + "btrfs: %s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " "%llu\n", errstr, swarn.logical, dev->name, (unsigned long long)swarn.sector, @@ -531,9 +579,9 @@ out: spin_lock(&sdev->stat_lock); ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR "btrfs: unable to fixup " - "(nodatasum) error at logical %llu\n", - fixup->logical); + printk_ratelimited(KERN_ERR + "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", + (unsigned long long)fixup->logical, sdev->dev->name); } btrfs_free_path(path); @@ -550,91 +598,168 @@ out: } /* - * scrub_recheck_error gets called when either verification of the page - * failed or the bio failed to read, e.g. with EIO. In the latter case, - * recheck_error gets called for every page in the bio, even though only - * one may be bad + * scrub_handle_errored_block gets called when either verification of the + * pages failed or the bio failed to read, e.g. with EIO. In the latter + * case, this function handles all pages in the bio, even though only one + * may be bad. + * The goal of this function is to repair the errored block by using the + * contents of one of the mirrors. */ -static int scrub_recheck_error(struct scrub_bio *sbio, int ix) +static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) { - struct scrub_dev *sdev = sbio->sdev; - u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + struct scrub_dev *sdev = sblock_to_check->sdev; + struct btrfs_fs_info *fs_info; + u64 length; + u64 logical; + u64 generation; + unsigned int failed_mirror_index; + unsigned int is_metadata; + unsigned int have_csum; + u8 *csum; + struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */ + struct scrub_block *sblock_bad; + int ret; + int mirror_index; + int page_num; + int success; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); + DEFAULT_RATELIMIT_BURST); + + BUG_ON(sblock_to_check->page_count < 1); + fs_info = sdev->dev->dev_root->fs_info; + length = sblock_to_check->page_count * PAGE_SIZE; + logical = sblock_to_check->pagev[0].logical; + generation = sblock_to_check->pagev[0].generation; + BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); + failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; + is_metadata = !(sblock_to_check->pagev[0].flags & + BTRFS_EXTENT_FLAG_DATA); + have_csum = sblock_to_check->pagev[0].have_csum; + csum = sblock_to_check->pagev[0].csum; - if (sbio->err) { - if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, - sbio->bio->bi_io_vec[ix].bv_page) == 0) { - if (scrub_fixup_check(sbio, ix) == 0) - return 0; - } - if (__ratelimit(&_rs)) - scrub_print_warning("i/o error", sbio, ix); - } else { - if (__ratelimit(&_rs)) - scrub_print_warning("checksum error", sbio, ix); + /* + * read all mirrors one after the other. This includes to + * re-read the extent or metadata block that failed (that was + * the cause that this fixup code is called) another time, + * page by page this time in order to know which pages + * caused I/O errors and which ones are good (for all mirrors). + * It is the goal to handle the situation when more than one + * mirror contains I/O errors, but the errors do not + * overlap, i.e. the data can be repaired by selecting the + * pages from those mirrors without I/O error on the + * particular pages. One example (with blocks >= 2 * PAGE_SIZE) + * would be that mirror #1 has an I/O error on the first page, + * the second page is good, and mirror #2 has an I/O error on + * the second page, but the first page is good. + * Then the first page of the first mirror can be repaired by + * taking the first page of the second mirror, and the + * second page of the second mirror can be repaired by + * copying the contents of the 2nd page of the 1st mirror. + * One more note: if the pages of one mirror contain I/O + * errors, the checksum cannot be verified. In order to get + * the best data for repairing, the first attempt is to find + * a mirror without I/O errors and with a validated checksum. + * Only if this is not possible, the pages are picked from + * mirrors with I/O errors without considering the checksum. + * If the latter is the case, at the end, the checksum of the + * repaired area is verified in order to correctly maintain + * the statistics. + */ + + sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS * + sizeof(*sblocks_for_recheck), + GFP_NOFS); + if (!sblocks_for_recheck) { + spin_lock(&sdev->stat_lock); + sdev->stat.malloc_errors++; + sdev->stat.read_errors++; + sdev->stat.uncorrectable_errors++; + spin_unlock(&sdev->stat_lock); + goto out; } - spin_lock(&sdev->stat_lock); - ++sdev->stat.read_errors; - spin_unlock(&sdev->stat_lock); + /* setup the context, map the logical blocks and alloc the pages */ + ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, + logical, sblocks_for_recheck); + if (ret) { + spin_lock(&sdev->stat_lock); + sdev->stat.read_errors++; + sdev->stat.uncorrectable_errors++; + spin_unlock(&sdev->stat_lock); + goto out; + } + BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); + sblock_bad = sblocks_for_recheck + failed_mirror_index; - scrub_fixup(sbio, ix); - return 1; -} + /* build and submit the bios for the failed mirror, check checksums */ + ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, + csum, generation, sdev->csum_size); + if (ret) { + spin_lock(&sdev->stat_lock); + sdev->stat.read_errors++; + sdev->stat.uncorrectable_errors++; + spin_unlock(&sdev->stat_lock); + goto out; + } -static int scrub_fixup_check(struct scrub_bio *sbio, int ix) -{ - int ret = 1; - struct page *page; - void *buffer; - u64 flags = sbio->spag[ix].flags; + if (!sblock_bad->header_error && !sblock_bad->checksum_error && + sblock_bad->no_io_error_seen) { + /* + * the error disappeared after reading page by page, or + * the area was part of a huge bio and other parts of the + * bio caused I/O errors, or the block layer merged several + * read requests into one and the error is caused by a + * different bio (usually one of the two latter cases is + * the cause) + */ + spin_lock(&sdev->stat_lock); + sdev->stat.unverified_errors++; + spin_unlock(&sdev->stat_lock); - page = sbio->bio->bi_io_vec[ix].bv_page; - buffer = kmap_atomic(page, KM_USER0); - if (flags & BTRFS_EXTENT_FLAG_DATA) { - ret = scrub_checksum_data(sbio->sdev, - sbio->spag + ix, buffer); - } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ret = scrub_checksum_tree_block(sbio->sdev, - sbio->spag + ix, - sbio->logical + ix * PAGE_SIZE, - buffer); - } else { - WARN_ON(1); + goto out; } - kunmap_atomic(buffer, KM_USER0); - return ret; -} + if (!sblock_bad->no_io_error_seen) { + spin_lock(&sdev->stat_lock); + sdev->stat.read_errors++; + spin_unlock(&sdev->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("i/o error", sblock_to_check); + } else if (sblock_bad->checksum_error) { + spin_lock(&sdev->stat_lock); + sdev->stat.csum_errors++; + spin_unlock(&sdev->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("checksum error", sblock_to_check); + } else if (sblock_bad->header_error) { + spin_lock(&sdev->stat_lock); + sdev->stat.verify_errors++; + spin_unlock(&sdev->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("checksum/header error", + sblock_to_check); + } -static void scrub_fixup_end_io(struct bio *bio, int err) -{ - complete((struct completion *)bio->bi_private); -} + if (sdev->readonly) + goto did_not_correct_error; -static void scrub_fixup(struct scrub_bio *sbio, int ix) -{ - struct scrub_dev *sdev = sbio->sdev; - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct btrfs_bio *bbio = NULL; - struct scrub_fixup_nodatasum *fixup; - u64 logical = sbio->logical + ix * PAGE_SIZE; - u64 length; - int i; - int ret; - DECLARE_COMPLETION_ONSTACK(complete); - - if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && - (sbio->spag[ix].have_csum == 0)) { - fixup = kzalloc(sizeof(*fixup), GFP_NOFS); - if (!fixup) - goto uncorrectable; - fixup->sdev = sdev; - fixup->logical = logical; - fixup->root = fs_info->extent_root; - fixup->mirror_num = sbio->spag[ix].mirror_num; + if (!is_metadata && !have_csum) { + struct scrub_fixup_nodatasum *fixup_nodatasum; + + /* + * !is_metadata and !have_csum, this means that the data + * might not be COW'ed, that it might be modified + * concurrently. The general strategy to work on the + * commit root does not help in the case when COW is not + * used. + */ + fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); + if (!fixup_nodatasum) + goto did_not_correct_error; + fixup_nodatasum->sdev = sdev; + fixup_nodatasum->logical = logical; + fixup_nodatasum->root = fs_info->extent_root; + fixup_nodatasum->mirror_num = failed_mirror_index + 1; /* * increment scrubs_running to prevent cancel requests from * completing as long as a fixup worker is running. we must also @@ -649,235 +774,533 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) atomic_inc(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); atomic_inc(&sdev->fixup_cnt); - fixup->work.func = scrub_fixup_nodatasum; - btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work); - return; + fixup_nodatasum->work.func = scrub_fixup_nodatasum; + btrfs_queue_worker(&fs_info->scrub_workers, + &fixup_nodatasum->work); + goto out; } - length = PAGE_SIZE; - ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, - &bbio, 0); - if (ret || !bbio || length < PAGE_SIZE) { - printk(KERN_ERR - "scrub_fixup: btrfs_map_block failed us for %llu\n", - (unsigned long long)logical); - WARN_ON(1); - kfree(bbio); - return; + /* + * now build and submit the bios for the other mirrors, check + * checksums + */ + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + if (mirror_index == failed_mirror_index) + continue; + + /* build and submit the bios, check checksums */ + ret = scrub_recheck_block(fs_info, + sblocks_for_recheck + mirror_index, + is_metadata, have_csum, csum, + generation, sdev->csum_size); + if (ret) + goto did_not_correct_error; } - if (bbio->num_stripes == 1) - /* there aren't any replicas */ - goto uncorrectable; + /* + * first try to pick the mirror which is completely without I/O + * errors and also does not have a checksum error. + * If one is found, and if a checksum is present, the full block + * that is known to contain an error is rewritten. Afterwards + * the block is known to be corrected. + * If a mirror is found which is completely correct, and no + * checksum is present, only those pages are rewritten that had + * an I/O error in the block to be repaired, since it cannot be + * determined, which copy of the other pages is better (and it + * could happen otherwise that a correct page would be + * overwritten by a bad one). + */ + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + struct scrub_block *sblock_other = sblocks_for_recheck + + mirror_index; + + if (!sblock_other->header_error && + !sblock_other->checksum_error && + sblock_other->no_io_error_seen) { + int force_write = is_metadata || have_csum; + + ret = scrub_repair_block_from_good_copy(sblock_bad, + sblock_other, + force_write); + if (0 == ret) + goto corrected_error; + } + } /* - * first find a good copy + * in case of I/O errors in the area that is supposed to be + * repaired, continue by picking good copies of those pages. + * Select the good pages from mirrors to rewrite bad pages from + * the area to fix. Afterwards verify the checksum of the block + * that is supposed to be repaired. This verification step is + * only done for the purpose of statistic counting and for the + * final scrub report, whether errors remain. + * A perfect algorithm could make use of the checksum and try + * all possible combinations of pages from the different mirrors + * until the checksum verification succeeds. For example, when + * the 2nd page of mirror #1 faces I/O errors, and the 2nd page + * of mirror #2 is readable but the final checksum test fails, + * then the 2nd page of mirror #3 could be tried, whether now + * the final checksum succeedes. But this would be a rare + * exception and is therefore not implemented. At least it is + * avoided that the good copy is overwritten. + * A more useful improvement would be to pick the sectors + * without I/O error based on sector sizes (512 bytes on legacy + * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one + * mirror could be repaired by taking 512 byte of a different + * mirror, even if other 512 byte sectors in the same PAGE_SIZE + * area are unreadable. */ - for (i = 0; i < bbio->num_stripes; ++i) { - if (i + 1 == sbio->spag[ix].mirror_num) - continue; - if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, - bbio->stripes[i].physical >> 9, - sbio->bio->bi_io_vec[ix].bv_page)) { - /* I/O-error, this is not a good copy */ + /* can only fix I/O errors from here on */ + if (sblock_bad->no_io_error_seen) + goto did_not_correct_error; + + success = 1; + for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + struct scrub_page *page_bad = sblock_bad->pagev + page_num; + + if (!page_bad->io_error) continue; + + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + struct scrub_block *sblock_other = sblocks_for_recheck + + mirror_index; + struct scrub_page *page_other = sblock_other->pagev + + page_num; + + if (!page_other->io_error) { + ret = scrub_repair_page_from_good_copy( + sblock_bad, sblock_other, page_num, 0); + if (0 == ret) { + page_bad->io_error = 0; + break; /* succeeded for this page */ + } + } } - if (scrub_fixup_check(sbio, ix) == 0) - break; + if (page_bad->io_error) { + /* did not find a mirror to copy the page from */ + success = 0; + } } - if (i == bbio->num_stripes) - goto uncorrectable; - if (!sdev->readonly) { - /* - * bi_io_vec[ix].bv_page now contains good data, write it back - */ - if (scrub_fixup_io(WRITE, sdev->dev->bdev, - (sbio->physical + ix * PAGE_SIZE) >> 9, - sbio->bio->bi_io_vec[ix].bv_page)) { - /* I/O-error, writeback failed, give up */ - goto uncorrectable; + if (success) { + if (is_metadata || have_csum) { + /* + * need to verify the checksum now that all + * sectors on disk are repaired (the write + * request for data to be repaired is on its way). + * Just be lazy and use scrub_recheck_block() + * which re-reads the data before the checksum + * is verified, but most likely the data comes out + * of the page cache. + */ + ret = scrub_recheck_block(fs_info, sblock_bad, + is_metadata, have_csum, csum, + generation, sdev->csum_size); + if (!ret && !sblock_bad->header_error && + !sblock_bad->checksum_error && + sblock_bad->no_io_error_seen) + goto corrected_error; + else + goto did_not_correct_error; + } else { +corrected_error: + spin_lock(&sdev->stat_lock); + sdev->stat.corrected_errors++; + spin_unlock(&sdev->stat_lock); + printk_ratelimited(KERN_ERR + "btrfs: fixed up error at logical %llu on dev %s\n", + (unsigned long long)logical, sdev->dev->name); } + } else { +did_not_correct_error: + spin_lock(&sdev->stat_lock); + sdev->stat.uncorrectable_errors++; + spin_unlock(&sdev->stat_lock); + printk_ratelimited(KERN_ERR + "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", + (unsigned long long)logical, sdev->dev->name); } - kfree(bbio); - spin_lock(&sdev->stat_lock); - ++sdev->stat.corrected_errors; - spin_unlock(&sdev->stat_lock); +out: + if (sblocks_for_recheck) { + for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; + mirror_index++) { + struct scrub_block *sblock = sblocks_for_recheck + + mirror_index; + int page_index; + + for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; + page_index++) + if (sblock->pagev[page_index].page) + __free_page( + sblock->pagev[page_index].page); + } + kfree(sblocks_for_recheck); + } - printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n", - (unsigned long long)logical); - return; + return 0; +} -uncorrectable: - kfree(bbio); - spin_lock(&sdev->stat_lock); - ++sdev->stat.uncorrectable_errors; - spin_unlock(&sdev->stat_lock); +static int scrub_setup_recheck_block(struct scrub_dev *sdev, + struct btrfs_mapping_tree *map_tree, + u64 length, u64 logical, + struct scrub_block *sblocks_for_recheck) +{ + int page_index; + int mirror_index; + int ret; + + /* + * note: the three members sdev, ref_count and outstanding_pages + * are not used (and not set) in the blocks that are used for + * the recheck procedure + */ + + page_index = 0; + while (length > 0) { + u64 sublen = min_t(u64, length, PAGE_SIZE); + u64 mapped_length = sublen; + struct btrfs_bio *bbio = NULL; + + /* + * with a length of PAGE_SIZE, each returned stripe + * represents one mirror + */ + ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, + &bbio, 0); + if (ret || !bbio || mapped_length < sublen) { + kfree(bbio); + return -EIO; + } - printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at " - "logical %llu\n", (unsigned long long)logical); + BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); + for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; + mirror_index++) { + struct scrub_block *sblock; + struct scrub_page *page; + + if (mirror_index >= BTRFS_MAX_MIRRORS) + continue; + + sblock = sblocks_for_recheck + mirror_index; + page = sblock->pagev + page_index; + page->logical = logical; + page->physical = bbio->stripes[mirror_index].physical; + /* for missing devices, bdev is NULL */ + page->bdev = bbio->stripes[mirror_index].dev->bdev; + page->mirror_num = mirror_index + 1; + page->page = alloc_page(GFP_NOFS); + if (!page->page) { + spin_lock(&sdev->stat_lock); + sdev->stat.malloc_errors++; + spin_unlock(&sdev->stat_lock); + return -ENOMEM; + } + sblock->page_count++; + } + kfree(bbio); + length -= sublen; + logical += sublen; + page_index++; + } + + return 0; } -static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, - struct page *page) +/* + * this function will check the on disk data for checksum errors, header + * errors and read I/O errors. If any I/O errors happen, the exact pages + * which are errored are marked as being bad. The goal is to enable scrub + * to take those pages that are not errored from all the mirrors so that + * the pages that are errored in the just handled mirror can be repaired. + */ +static int scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size) { - struct bio *bio = NULL; - int ret; - DECLARE_COMPLETION_ONSTACK(complete); + int page_num; - bio = bio_alloc(GFP_NOFS, 1); - bio->bi_bdev = bdev; - bio->bi_sector = sector; - bio_add_page(bio, page, PAGE_SIZE, 0); - bio->bi_end_io = scrub_fixup_end_io; - bio->bi_private = &complete; - btrfsic_submit_bio(rw, bio); + sblock->no_io_error_seen = 1; + sblock->header_error = 0; + sblock->checksum_error = 0; - /* this will also unplug the queue */ - wait_for_completion(&complete); + for (page_num = 0; page_num < sblock->page_count; page_num++) { + struct bio *bio; + int ret; + struct scrub_page *page = sblock->pagev + page_num; + DECLARE_COMPLETION_ONSTACK(complete); - ret = !test_bit(BIO_UPTODATE, &bio->bi_flags); - bio_put(bio); - return ret; + if (page->bdev == NULL) { + page->io_error = 1; + sblock->no_io_error_seen = 0; + continue; + } + + BUG_ON(!page->page); + bio = bio_alloc(GFP_NOFS, 1); + if (!bio) + return -EIO; + bio->bi_bdev = page->bdev; + bio->bi_sector = page->physical >> 9; + bio->bi_end_io = scrub_complete_bio_end_io; + bio->bi_private = &complete; + + ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); + if (PAGE_SIZE != ret) { + bio_put(bio); + return -EIO; + } + btrfsic_submit_bio(READ, bio); + + /* this will also unplug the queue */ + wait_for_completion(&complete); + + page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags); + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + sblock->no_io_error_seen = 0; + bio_put(bio); + } + + if (sblock->no_io_error_seen) + scrub_recheck_block_checksum(fs_info, sblock, is_metadata, + have_csum, csum, generation, + csum_size); + + return 0; } -static void scrub_bio_end_io(struct bio *bio, int err) +static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, + int is_metadata, int have_csum, + const u8 *csum, u64 generation, + u16 csum_size) { - struct scrub_bio *sbio = bio->bi_private; - struct scrub_dev *sdev = sbio->sdev; - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + int page_num; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + struct btrfs_root *root = fs_info->extent_root; + void *mapped_buffer; + + BUG_ON(!sblock->pagev[0].page); + if (is_metadata) { + struct btrfs_header *h; + + mapped_buffer = kmap_atomic(sblock->pagev[0].page); + h = (struct btrfs_header *)mapped_buffer; + + if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || + generation != le64_to_cpu(h->generation) || + memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || + memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, + BTRFS_UUID_SIZE)) + sblock->header_error = 1; + csum = h->csum; + } else { + if (!have_csum) + return; - sbio->err = err; - sbio->bio = bio; + mapped_buffer = kmap_atomic(sblock->pagev[0].page); + } - btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); + for (page_num = 0;;) { + if (page_num == 0 && is_metadata) + crc = btrfs_csum_data(root, + ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE, + crc, PAGE_SIZE - BTRFS_CSUM_SIZE); + else + crc = btrfs_csum_data(root, mapped_buffer, crc, + PAGE_SIZE); + + kunmap_atomic(mapped_buffer); + page_num++; + if (page_num >= sblock->page_count) + break; + BUG_ON(!sblock->pagev[page_num].page); + + mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, csum, csum_size)) + sblock->checksum_error = 1; } -static void scrub_checksum(struct btrfs_work *work) +static void scrub_complete_bio_end_io(struct bio *bio, int err) { - struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); - struct scrub_dev *sdev = sbio->sdev; - struct page *page; - void *buffer; - int i; - u64 flags; - u64 logical; - int ret; + complete((struct completion *)bio->bi_private); +} - if (sbio->err) { - ret = 0; - for (i = 0; i < sbio->count; ++i) - ret |= scrub_recheck_error(sbio, i); - if (!ret) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.unverified_errors; - spin_unlock(&sdev->stat_lock); - } +static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int force_write) +{ + int page_num; + int ret = 0; - sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); - sbio->bio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bio->bi_phys_segments = 0; - sbio->bio->bi_idx = 0; + for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + int ret_sub; - for (i = 0; i < sbio->count; i++) { - struct bio_vec *bi; - bi = &sbio->bio->bi_io_vec[i]; - bi->bv_offset = 0; - bi->bv_len = PAGE_SIZE; - } - goto out; + ret_sub = scrub_repair_page_from_good_copy(sblock_bad, + sblock_good, + page_num, + force_write); + if (ret_sub) + ret = ret_sub; } - for (i = 0; i < sbio->count; ++i) { - page = sbio->bio->bi_io_vec[i].bv_page; - buffer = kmap_atomic(page, KM_USER0); - flags = sbio->spag[i].flags; - logical = sbio->logical + i * PAGE_SIZE; - ret = 0; - if (flags & BTRFS_EXTENT_FLAG_DATA) { - ret = scrub_checksum_data(sdev, sbio->spag + i, buffer); - } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ret = scrub_checksum_tree_block(sdev, sbio->spag + i, - logical, buffer); - } else if (flags & BTRFS_EXTENT_FLAG_SUPER) { - BUG_ON(i); - (void)scrub_checksum_super(sbio, buffer); - } else { - WARN_ON(1); - } - kunmap_atomic(buffer, KM_USER0); - if (ret) { - ret = scrub_recheck_error(sbio, i); - if (!ret) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.unverified_errors; - spin_unlock(&sdev->stat_lock); - } + + return ret; +} + +static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int page_num, int force_write) +{ + struct scrub_page *page_bad = sblock_bad->pagev + page_num; + struct scrub_page *page_good = sblock_good->pagev + page_num; + + BUG_ON(sblock_bad->pagev[page_num].page == NULL); + BUG_ON(sblock_good->pagev[page_num].page == NULL); + if (force_write || sblock_bad->header_error || + sblock_bad->checksum_error || page_bad->io_error) { + struct bio *bio; + int ret; + DECLARE_COMPLETION_ONSTACK(complete); + + bio = bio_alloc(GFP_NOFS, 1); + if (!bio) + return -EIO; + bio->bi_bdev = page_bad->bdev; + bio->bi_sector = page_bad->physical >> 9; + bio->bi_end_io = scrub_complete_bio_end_io; + bio->bi_private = &complete; + + ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); + if (PAGE_SIZE != ret) { + bio_put(bio); + return -EIO; } + btrfsic_submit_bio(WRITE, bio); + + /* this will also unplug the queue */ + wait_for_completion(&complete); + bio_put(bio); } -out: - scrub_free_bio(sbio->bio); - sbio->bio = NULL; - spin_lock(&sdev->list_lock); - sbio->next_free = sdev->first_free; - sdev->first_free = sbio->index; - spin_unlock(&sdev->list_lock); - atomic_dec(&sdev->in_flight); - wake_up(&sdev->list_wait); + return 0; } -static int scrub_checksum_data(struct scrub_dev *sdev, - struct scrub_page *spag, void *buffer) +static void scrub_checksum(struct scrub_block *sblock) { + u64 flags; + int ret; + + BUG_ON(sblock->page_count < 1); + flags = sblock->pagev[0].flags; + ret = 0; + if (flags & BTRFS_EXTENT_FLAG_DATA) + ret = scrub_checksum_data(sblock); + else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = scrub_checksum_tree_block(sblock); + else if (flags & BTRFS_EXTENT_FLAG_SUPER) + (void)scrub_checksum_super(sblock); + else + WARN_ON(1); + if (ret) + scrub_handle_errored_block(sblock); +} + +static int scrub_checksum_data(struct scrub_block *sblock) +{ + struct scrub_dev *sdev = sblock->sdev; u8 csum[BTRFS_CSUM_SIZE]; + u8 *on_disk_csum; + struct page *page; + void *buffer; u32 crc = ~(u32)0; int fail = 0; struct btrfs_root *root = sdev->dev->dev_root; + u64 len; + int index; - if (!spag->have_csum) + BUG_ON(sblock->page_count < 1); + if (!sblock->pagev[0].have_csum) return 0; - crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE); + on_disk_csum = sblock->pagev[0].csum; + page = sblock->pagev[0].page; + buffer = kmap_atomic(page); + + len = sdev->sectorsize; + index = 0; + for (;;) { + u64 l = min_t(u64, len, PAGE_SIZE); + + crc = btrfs_csum_data(root, buffer, crc, l); + kunmap_atomic(buffer); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index].page); + page = sblock->pagev[index].page; + buffer = kmap_atomic(page); + } + btrfs_csum_final(crc, csum); - if (memcmp(csum, spag->csum, sdev->csum_size)) + if (memcmp(csum, on_disk_csum, sdev->csum_size)) fail = 1; - spin_lock(&sdev->stat_lock); - ++sdev->stat.data_extents_scrubbed; - sdev->stat.data_bytes_scrubbed += PAGE_SIZE; - if (fail) - ++sdev->stat.csum_errors; - spin_unlock(&sdev->stat_lock); - return fail; } -static int scrub_checksum_tree_block(struct scrub_dev *sdev, - struct scrub_page *spag, u64 logical, - void *buffer) +static int scrub_checksum_tree_block(struct scrub_block *sblock) { + struct scrub_dev *sdev = sblock->sdev; struct btrfs_header *h; struct btrfs_root *root = sdev->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; - u8 csum[BTRFS_CSUM_SIZE]; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u8 on_disk_csum[BTRFS_CSUM_SIZE]; + struct page *page; + void *mapped_buffer; + u64 mapped_size; + void *p; u32 crc = ~(u32)0; int fail = 0; int crc_fail = 0; + u64 len; + int index; + + BUG_ON(sblock->page_count < 1); + page = sblock->pagev[0].page; + mapped_buffer = kmap_atomic(page); + h = (struct btrfs_header *)mapped_buffer; + memcpy(on_disk_csum, h->csum, sdev->csum_size); /* * we don't use the getter functions here, as we * a) don't have an extent buffer and * b) the page is already kmapped */ - h = (struct btrfs_header *)buffer; - if (logical != le64_to_cpu(h->bytenr)) + if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) ++fail; - if (spag->generation != le64_to_cpu(h->generation)) + if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) ++fail; if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -887,51 +1310,90 @@ static int scrub_checksum_tree_block(struct scrub_dev *sdev, BTRFS_UUID_SIZE)) ++fail; - crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, - PAGE_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, csum); - if (memcmp(csum, h->csum, sdev->csum_size)) - ++crc_fail; + BUG_ON(sdev->nodesize != sdev->leafsize); + len = sdev->nodesize - BTRFS_CSUM_SIZE; + mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; + p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; + index = 0; + for (;;) { + u64 l = min_t(u64, len, mapped_size); - spin_lock(&sdev->stat_lock); - ++sdev->stat.tree_extents_scrubbed; - sdev->stat.tree_bytes_scrubbed += PAGE_SIZE; - if (crc_fail) - ++sdev->stat.csum_errors; - if (fail) - ++sdev->stat.verify_errors; - spin_unlock(&sdev->stat_lock); + crc = btrfs_csum_data(root, p, crc, l); + kunmap_atomic(mapped_buffer); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index].page); + page = sblock->pagev[index].page; + mapped_buffer = kmap_atomic(page); + mapped_size = PAGE_SIZE; + p = mapped_buffer; + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) + ++crc_fail; return fail || crc_fail; } -static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) +static int scrub_checksum_super(struct scrub_block *sblock) { struct btrfs_super_block *s; - u64 logical; - struct scrub_dev *sdev = sbio->sdev; + struct scrub_dev *sdev = sblock->sdev; struct btrfs_root *root = sdev->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; - u8 csum[BTRFS_CSUM_SIZE]; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u8 on_disk_csum[BTRFS_CSUM_SIZE]; + struct page *page; + void *mapped_buffer; + u64 mapped_size; + void *p; u32 crc = ~(u32)0; int fail = 0; + u64 len; + int index; - s = (struct btrfs_super_block *)buffer; - logical = sbio->logical; + BUG_ON(sblock->page_count < 1); + page = sblock->pagev[0].page; + mapped_buffer = kmap_atomic(page); + s = (struct btrfs_super_block *)mapped_buffer; + memcpy(on_disk_csum, s->csum, sdev->csum_size); - if (logical != le64_to_cpu(s->bytenr)) + if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) ++fail; - if (sbio->spag[0].generation != le64_to_cpu(s->generation)) + if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) ++fail; if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) ++fail; - crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, - PAGE_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, csum); - if (memcmp(csum, s->csum, sbio->sdev->csum_size)) + len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; + mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; + p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; + index = 0; + for (;;) { + u64 l = min_t(u64, len, mapped_size); + + crc = btrfs_csum_data(root, p, crc, l); + kunmap_atomic(mapped_buffer); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index].page); + page = sblock->pagev[index].page; + mapped_buffer = kmap_atomic(page); + mapped_size = PAGE_SIZE; + p = mapped_buffer; + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) ++fail; if (fail) { @@ -948,29 +1410,42 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) return fail; } -static int scrub_submit(struct scrub_dev *sdev) +static void scrub_block_get(struct scrub_block *sblock) +{ + atomic_inc(&sblock->ref_count); +} + +static void scrub_block_put(struct scrub_block *sblock) +{ + if (atomic_dec_and_test(&sblock->ref_count)) { + int i; + + for (i = 0; i < sblock->page_count; i++) + if (sblock->pagev[i].page) + __free_page(sblock->pagev[i].page); + kfree(sblock); + } +} + +static void scrub_submit(struct scrub_dev *sdev) { struct scrub_bio *sbio; if (sdev->curr == -1) - return 0; + return; sbio = sdev->bios[sdev->curr]; - sbio->err = 0; sdev->curr = -1; atomic_inc(&sdev->in_flight); btrfsic_submit_bio(READ, sbio->bio); - - return 0; } -static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num, - u8 *csum, int force) +static int scrub_add_page_to_bio(struct scrub_dev *sdev, + struct scrub_page *spage) { + struct scrub_block *sblock = spage->sblock; struct scrub_bio *sbio; - struct page *page; int ret; again: @@ -983,7 +1458,7 @@ again: if (sdev->curr != -1) { sdev->first_free = sdev->bios[sdev->curr]->next_free; sdev->bios[sdev->curr]->next_free = -1; - sdev->bios[sdev->curr]->count = 0; + sdev->bios[sdev->curr]->page_count = 0; spin_unlock(&sdev->list_lock); } else { spin_unlock(&sdev->list_lock); @@ -991,62 +1466,200 @@ again: } } sbio = sdev->bios[sdev->curr]; - if (sbio->count == 0) { + if (sbio->page_count == 0) { struct bio *bio; - sbio->physical = physical; - sbio->logical = logical; - bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); - if (!bio) - return -ENOMEM; + sbio->physical = spage->physical; + sbio->logical = spage->logical; + bio = sbio->bio; + if (!bio) { + bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); + if (!bio) + return -ENOMEM; + sbio->bio = bio; + } bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; bio->bi_bdev = sdev->dev->bdev; - bio->bi_sector = sbio->physical >> 9; + bio->bi_sector = spage->physical >> 9; sbio->err = 0; - sbio->bio = bio; - } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || - sbio->logical + sbio->count * PAGE_SIZE != logical) { - ret = scrub_submit(sdev); - if (ret) - return ret; + } else if (sbio->physical + sbio->page_count * PAGE_SIZE != + spage->physical || + sbio->logical + sbio->page_count * PAGE_SIZE != + spage->logical) { + scrub_submit(sdev); goto again; } - sbio->spag[sbio->count].flags = flags; - sbio->spag[sbio->count].generation = gen; - sbio->spag[sbio->count].have_csum = 0; - sbio->spag[sbio->count].mirror_num = mirror_num; - - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); - if (!ret) { - __free_page(page); - ret = scrub_submit(sdev); - if (ret) - return ret; + sbio->pagev[sbio->page_count] = spage; + ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); + if (ret != PAGE_SIZE) { + if (sbio->page_count < 1) { + bio_put(sbio->bio); + sbio->bio = NULL; + return -EIO; + } + scrub_submit(sdev); goto again; } - if (csum) { - sbio->spag[sbio->count].have_csum = 1; - memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); + scrub_block_get(sblock); /* one for the added page */ + atomic_inc(&sblock->outstanding_pages); + sbio->page_count++; + if (sbio->page_count == sdev->pages_per_bio) + scrub_submit(sdev); + + return 0; +} + +static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, + u64 physical, u64 flags, u64 gen, int mirror_num, + u8 *csum, int force) +{ + struct scrub_block *sblock; + int index; + + sblock = kzalloc(sizeof(*sblock), GFP_NOFS); + if (!sblock) { + spin_lock(&sdev->stat_lock); + sdev->stat.malloc_errors++; + spin_unlock(&sdev->stat_lock); + return -ENOMEM; } - ++sbio->count; - if (sbio->count == SCRUB_PAGES_PER_BIO || force) { + + /* one ref inside this function, plus one for each page later on */ + atomic_set(&sblock->ref_count, 1); + sblock->sdev = sdev; + sblock->no_io_error_seen = 1; + + for (index = 0; len > 0; index++) { + struct scrub_page *spage = sblock->pagev + index; + u64 l = min_t(u64, len, PAGE_SIZE); + + BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + spage->page = alloc_page(GFP_NOFS); + if (!spage->page) { + spin_lock(&sdev->stat_lock); + sdev->stat.malloc_errors++; + spin_unlock(&sdev->stat_lock); + while (index > 0) { + index--; + __free_page(sblock->pagev[index].page); + } + kfree(sblock); + return -ENOMEM; + } + spage->sblock = sblock; + spage->bdev = sdev->dev->bdev; + spage->flags = flags; + spage->generation = gen; + spage->logical = logical; + spage->physical = physical; + spage->mirror_num = mirror_num; + if (csum) { + spage->have_csum = 1; + memcpy(spage->csum, csum, sdev->csum_size); + } else { + spage->have_csum = 0; + } + sblock->page_count++; + len -= l; + logical += l; + physical += l; + } + + BUG_ON(sblock->page_count == 0); + for (index = 0; index < sblock->page_count; index++) { + struct scrub_page *spage = sblock->pagev + index; int ret; - ret = scrub_submit(sdev); - if (ret) + ret = scrub_add_page_to_bio(sdev, spage); + if (ret) { + scrub_block_put(sblock); return ret; + } } + if (force) + scrub_submit(sdev); + + /* last one frees, either here or in bio completion for last page */ + scrub_block_put(sblock); return 0; } +static void scrub_bio_end_io(struct bio *bio, int err) +{ + struct scrub_bio *sbio = bio->bi_private; + struct scrub_dev *sdev = sbio->sdev; + struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + + sbio->err = err; + sbio->bio = bio; + + btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); +} + +static void scrub_bio_end_io_worker(struct btrfs_work *work) +{ + struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); + struct scrub_dev *sdev = sbio->sdev; + int i; + + BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); + if (sbio->err) { + for (i = 0; i < sbio->page_count; i++) { + struct scrub_page *spage = sbio->pagev[i]; + + spage->io_error = 1; + spage->sblock->no_io_error_seen = 0; + } + } + + /* now complete the scrub_block items that have all pages completed */ + for (i = 0; i < sbio->page_count; i++) { + struct scrub_page *spage = sbio->pagev[i]; + struct scrub_block *sblock = spage->sblock; + + if (atomic_dec_and_test(&sblock->outstanding_pages)) + scrub_block_complete(sblock); + scrub_block_put(sblock); + } + + if (sbio->err) { + /* what is this good for??? */ + sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); + sbio->bio->bi_flags |= 1 << BIO_UPTODATE; + sbio->bio->bi_phys_segments = 0; + sbio->bio->bi_idx = 0; + + for (i = 0; i < sbio->page_count; i++) { + struct bio_vec *bi; + bi = &sbio->bio->bi_io_vec[i]; + bi->bv_offset = 0; + bi->bv_len = PAGE_SIZE; + } + } + + bio_put(sbio->bio); + sbio->bio = NULL; + spin_lock(&sdev->list_lock); + sbio->next_free = sdev->first_free; + sdev->first_free = sbio->index; + spin_unlock(&sdev->list_lock); + atomic_dec(&sdev->in_flight); + wake_up(&sdev->list_wait); +} + +static void scrub_block_complete(struct scrub_block *sblock) +{ + if (!sblock->no_io_error_seen) + scrub_handle_errored_block(sblock); + else + scrub_checksum(sblock); +} + static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, u8 *csum) { @@ -1054,7 +1667,6 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, int ret = 0; unsigned long i; unsigned long num_sectors; - u32 sectorsize = sdev->dev->dev_root->sectorsize; while (!list_empty(&sdev->csum_list)) { sum = list_first_entry(&sdev->csum_list, @@ -1072,7 +1684,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, if (!sum) return 0; - num_sectors = sum->len / sectorsize; + num_sectors = sum->len / sdev->sectorsize; for (i = 0; i < num_sectors; ++i) { if (sum->sums[i].bytenr == logical) { memcpy(csum, &sum->sums[i].sum, sdev->csum_size); @@ -1093,9 +1705,28 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, { int ret; u8 csum[BTRFS_CSUM_SIZE]; + u32 blocksize; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + blocksize = sdev->sectorsize; + spin_lock(&sdev->stat_lock); + sdev->stat.data_extents_scrubbed++; + sdev->stat.data_bytes_scrubbed += len; + spin_unlock(&sdev->stat_lock); + } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + BUG_ON(sdev->nodesize != sdev->leafsize); + blocksize = sdev->nodesize; + spin_lock(&sdev->stat_lock); + sdev->stat.tree_extents_scrubbed++; + sdev->stat.tree_bytes_scrubbed += len; + spin_unlock(&sdev->stat_lock); + } else { + blocksize = sdev->sectorsize; + BUG_ON(1); + } while (len) { - u64 l = min_t(u64, len, PAGE_SIZE); + u64 l = min_t(u64, len, blocksize); int have_csum = 0; if (flags & BTRFS_EXTENT_FLAG_DATA) { @@ -1104,8 +1735,8 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, if (have_csum == 0) ++sdev->stat.no_csum; } - ret = scrub_page(sdev, logical, l, physical, flags, gen, - mirror_num, have_csum ? csum : NULL, 0); + ret = scrub_pages(sdev, logical, l, physical, flags, gen, + mirror_num, have_csum ? csum : NULL, 0); if (ret) return ret; len -= l; @@ -1170,6 +1801,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (!path) return -ENOMEM; + /* + * work on commit root. The related disk blocks are static as + * long as COW is applied. This means, it is save to rewrite + * them to repair disk errors without any race conditions + */ path->search_commit_root = 1; path->skip_locking = 1; @@ -1516,15 +2152,18 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) struct btrfs_device *device = sdev->dev; struct btrfs_root *root = device->dev_root; + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + return -EIO; + gen = root->fs_info->last_trans_committed; for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) + if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) break; - ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr, - BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); + ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, + BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); if (ret) return ret; } @@ -1583,10 +2222,30 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, /* * check some assumptions */ - if (root->sectorsize != PAGE_SIZE || - root->sectorsize != root->leafsize || - root->sectorsize != root->nodesize) { - printk(KERN_ERR "btrfs_scrub: size assumptions fail\n"); + if (root->nodesize != root->leafsize) { + printk(KERN_ERR + "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", + root->nodesize, root->leafsize); + return -EINVAL; + } + + if (root->nodesize > BTRFS_STRIPE_LEN) { + /* + * in this case scrub is unable to calculate the checksum + * the way scrub is implemented. Do not handle this + * situation at all because it won't ever happen. + */ + printk(KERN_ERR + "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", + root->nodesize, BTRFS_STRIPE_LEN); + return -EINVAL; + } + + if (root->sectorsize != PAGE_SIZE) { + /* not supported for data w/o checksums */ + printk(KERN_ERR + "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", + root->sectorsize, (unsigned long long)PAGE_SIZE); return -EINVAL; } @@ -1656,7 +2315,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, return ret; } -int btrfs_scrub_pause(struct btrfs_root *root) +void btrfs_scrub_pause(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1671,34 +2330,28 @@ int btrfs_scrub_pause(struct btrfs_root *root) mutex_lock(&fs_info->scrub_lock); } mutex_unlock(&fs_info->scrub_lock); - - return 0; } -int btrfs_scrub_continue(struct btrfs_root *root) +void btrfs_scrub_continue(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; atomic_dec(&fs_info->scrub_pause_req); wake_up(&fs_info->scrub_pause_wait); - return 0; } -int btrfs_scrub_pause_super(struct btrfs_root *root) +void btrfs_scrub_pause_super(struct btrfs_root *root) { down_write(&root->fs_info->scrub_super_lock); - return 0; } -int btrfs_scrub_continue_super(struct btrfs_root *root) +void btrfs_scrub_continue_super(struct btrfs_root *root) { up_write(&root->fs_info->scrub_super_lock); - return 0; } -int btrfs_scrub_cancel(struct btrfs_root *root) +int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; mutex_lock(&fs_info->scrub_lock); if (!atomic_read(&fs_info->scrubs_running)) { @@ -1719,6 +2372,11 @@ int btrfs_scrub_cancel(struct btrfs_root *root) return 0; } +int btrfs_scrub_cancel(struct btrfs_root *root) +{ + return __btrfs_scrub_cancel(root->fs_info); +} + |