diff options
Diffstat (limited to 'fs/btrfs')
59 files changed, 53898 insertions, 0 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig new file mode 100644 index 00000000000..7bb3c020e57 --- /dev/null +++ b/fs/btrfs/Kconfig @@ -0,0 +1,31 @@ +config BTRFS_FS + tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" + depends on EXPERIMENTAL + select LIBCRC32C + select ZLIB_INFLATE + select ZLIB_DEFLATE + help + Btrfs is a new filesystem with extents, writable snapshotting, + support for multiple devices and many more features. + + Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET + FINALIZED. You should say N here unless you are interested in + testing Btrfs with non-critical data. + + To compile this file system support as a module, choose M here. The + module will be called btrfs. + + If unsure, say N. + +config BTRFS_FS_POSIX_ACL + bool "Btrfs POSIX Access Control Lists" + depends on BTRFS_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile new file mode 100644 index 00000000000..a35eb36b32f --- /dev/null +++ b/fs/btrfs/Makefile @@ -0,0 +1,10 @@ + +obj-$(CONFIG_BTRFS_FS) := btrfs.o + +btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + file-item.o inode-item.o inode-map.o disk-io.o \ + transaction.o inode.o file.o tree-defrag.o \ + extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ + extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ + export.o tree-log.o acl.o free-space-cache.o zlib.o \ + compression.o delayed-ref.o relocation.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c new file mode 100644 index 00000000000..9be949a231e --- /dev/null +++ b/fs/btrfs/acl.c @@ -0,0 +1,314 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> +#include <linux/posix_acl.h> +#include <linux/sched.h> + +#include "ctree.h" +#include "btrfs_inode.h" +#include "xattr.h" + +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + +static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) +{ + int size; + const char *name; + char *value = NULL; + struct posix_acl *acl; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + + size = __btrfs_getxattr(inode, name, "", 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = __btrfs_getxattr(inode, name, value, size); + if (size > 0) { + acl = posix_acl_from_xattr(value, size); + set_cached_acl(inode, type, acl); + } + kfree(value); + } else if (size == -ENOENT || size == -ENODATA || size == 0) { + /* FIXME, who returns -ENOENT? I think nobody */ + acl = NULL; + set_cached_acl(inode, type, acl); + } else { + acl = ERR_PTR(-EIO); + } + + return acl; +} + +static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) +{ + struct posix_acl *acl; + int ret = 0; + + acl = btrfs_get_acl(dentry->d_inode, type); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + ret = posix_acl_to_xattr(acl, value, size); + posix_acl_release(acl); + + return ret; +} + +/* + * Needs to be called with fs_mutex held + */ +static int btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, int type) +{ + int ret, size = 0; + const char *name; + char *value = NULL; + mode_t mode; + + if (acl) { + ret = posix_acl_valid(acl); + if (ret < 0) + return ret; + ret = 0; + } + + switch (type) { + case ACL_TYPE_ACCESS: + mode = inode->i_mode; + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + ret = posix_acl_equiv_mode(acl, &mode); + if (ret < 0) + return ret; + inode->i_mode = mode; + } + ret = 0; + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? -EINVAL : 0; + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return -EINVAL; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(acl, value, size); + if (ret < 0) + goto out; + } + + ret = __btrfs_setxattr(trans, inode, name, value, size, 0); +out: + kfree(value); + + if (!ret) + set_cached_acl(inode, type, acl); + + return ret; +} + +static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + int ret; + struct posix_acl *acl = NULL; + + if (!is_owner_or_cap(dentry->d_inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (acl == NULL) { + value = NULL; + size = 0; + } else if (IS_ERR(acl)) { + return PTR_ERR(acl); + } + } + + ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type); + + posix_acl_release(acl); + + return ret; +} + +int btrfs_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl; + int error = -EAGAIN; + + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + } + + return error; +} + +/* + * btrfs_init_acl is already generally called under fs_mutex, so the locking + * stuff has been fixed to work with that. If the locking stuff changes, we + * need to re-evaluate the acl locking stuff. + */ +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int ret = 0; + + /* this happens with subvols */ + if (!dir) + return 0; + + if (!S_ISLNK(inode->i_mode)) { + if (IS_POSIXACL(dir)) { + acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + + if (!acl) + inode->i_mode &= ~current_umask(); + } + + if (IS_POSIXACL(dir) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + ret = btrfs_set_acl(trans, inode, acl, + ACL_TYPE_DEFAULT); + if (ret) + goto failed; + } + clone = posix_acl_clone(acl, GFP_NOFS); + ret = -ENOMEM; + if (!clone) + goto failed; + + mode = inode->i_mode; + ret = posix_acl_create_masq(clone, &mode); + if (ret >= 0) { + inode->i_mode = mode; + if (ret > 0) { + /* we need an acl */ + ret = btrfs_set_acl(trans, inode, clone, + ACL_TYPE_ACCESS); + } + } + posix_acl_release(clone); + } +failed: + posix_acl_release(acl); + + return ret; +} + +int btrfs_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int ret = 0; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!IS_POSIXACL(inode)) + return 0; + + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + + ret = posix_acl_chmod_masq(clone, inode->i_mode); + if (!ret) + ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS); + + posix_acl_release(clone); + + return ret; +} + +struct xattr_handler btrfs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .get = btrfs_xattr_acl_get, + .set = btrfs_xattr_acl_set, +}; + +struct xattr_handler btrfs_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .get = btrfs_xattr_acl_get, + .set = btrfs_xattr_acl_set, +}; + +#else /* CONFIG_BTRFS_FS_POSIX_ACL */ + +int btrfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) +{ + return 0; +} + +#endif /* CONFIG_BTRFS_FS_POSIX_ACL */ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c new file mode 100644 index 00000000000..c0861e781cd --- /dev/null +++ b/fs/btrfs/async-thread.c @@ -0,0 +1,716 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kthread.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/freezer.h> +#include "async-thread.h" + +#define WORK_QUEUED_BIT 0 +#define WORK_DONE_BIT 1 +#define WORK_ORDER_DONE_BIT 2 +#define WORK_HIGH_PRIO_BIT 3 + +/* + * container for the kthread task pointer and the list of pending work + * One of these is allocated per thread. + */ +struct btrfs_worker_thread { + /* pool we belong to */ + struct btrfs_workers *workers; + + /* list of struct btrfs_work that are waiting for service */ + struct list_head pending; + struct list_head prio_pending; + + /* list of worker threads from struct btrfs_workers */ + struct list_head worker_list; + + /* kthread */ + struct task_struct *task; + + /* number of things on the pending list */ + atomic_t num_pending; + + /* reference counter for this struct */ + atomic_t refs; + + unsigned long sequence; + + /* protects the pending list. */ + spinlock_t lock; + + /* set to non-zero when this thread is already awake and kicking */ + int working; + + /* are we currently idle */ + int idle; +}; + +/* + * btrfs_start_workers uses kthread_run, which can block waiting for memory + * for a very long time. It will actually throttle on page writeback, + * and so it may not make progress until after our btrfs worker threads + * process all of the pending work structs in their queue + * + * This means we can't use btrfs_start_workers from inside a btrfs worker + * thread that is used as part of cleaning dirty memory, which pretty much + * involves all of the worker threads. + * + * Instead we have a helper queue who never has more than one thread + * where we scheduler thread start operations. This worker_start struct + * is used to contain the work and hold a pointer to the queue that needs + * another worker. + */ +struct worker_start { + struct btrfs_work work; + struct btrfs_workers *queue; +}; + +static void start_new_worker_func(struct btrfs_work *work) +{ + struct worker_start *start; + start = container_of(work, struct worker_start, work); + btrfs_start_workers(start->queue, 1); + kfree(start); +} + +static int start_new_worker(struct btrfs_workers *queue) +{ + struct worker_start *start; + int ret; + + start = kzalloc(sizeof(*start), GFP_NOFS); + if (!start) + return -ENOMEM; + + start->work.func = start_new_worker_func; + start->queue = queue; + ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); + if (ret) + kfree(start); + return ret; +} + +/* + * helper function to move a thread onto the idle list after it + * has finished some requests. + */ +static void check_idle_worker(struct btrfs_worker_thread *worker) +{ + if (!worker->idle && atomic_read(&worker->num_pending) < + worker->workers->idle_thresh / 2) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 1; + + /* the list may be empty if the worker is just starting */ + if (!list_empty(&worker->worker_list)) { + list_move(&worker->worker_list, + &worker->workers->idle_list); + } + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + +/* + * helper function to move a thread off the idle list after new + * pending work is added. + */ +static void check_busy_worker(struct btrfs_worker_thread *worker) +{ + if (worker->idle && atomic_read(&worker->num_pending) >= + worker->workers->idle_thresh) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; + + if (!list_empty(&worker->worker_list)) { + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + } + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + +static void check_pending_worker_creates(struct btrfs_worker_thread *worker) +{ + struct btrfs_workers *workers = worker->workers; + unsigned long flags; + + rmb(); + if (!workers->atomic_start_pending) + return; + + spin_lock_irqsave(&workers->lock, flags); + if (!workers->atomic_start_pending) + goto out; + + workers->atomic_start_pending = 0; + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) + goto out; + + workers->num_workers_starting += 1; + spin_unlock_irqrestore(&workers->lock, flags); + start_new_worker(workers); + return; + +out: + spin_unlock_irqrestore(&workers->lock, flags); +} + +static noinline int run_ordered_completions(struct btrfs_workers *workers, + struct btrfs_work *work) +{ + if (!workers->ordered) + return 0; + + set_bit(WORK_DONE_BIT, &work->flags); + + spin_lock(&workers->order_lock); + + while (1) { + if (!list_empty(&workers->prio_order_list)) { + work = list_entry(workers->prio_order_list.next, + struct btrfs_work, order_list); + } else if (!list_empty(&workers->order_list)) { + work = list_entry(workers->order_list.next, + struct btrfs_work, order_list); + } else { + break; + } + if (!test_bit(WORK_DONE_BIT, &work->flags)) + break; + + /* we are going to call the ordered done function, but + * we leave the work item on the list as a barrier so + * that later work items that are done don't have their + * functions called before this one returns + */ + if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) + break; + + spin_unlock(&workers->order_lock); + + work->ordered_func(work); + + /* now take the lock again and call the freeing code */ + spin_lock(&workers->order_lock); + list_del(&work->order_list); + work->ordered_free(work); + } + + spin_unlock(&workers->order_lock); + return 0; +} + +static void put_worker(struct btrfs_worker_thread *worker) +{ + if (atomic_dec_and_test(&worker->refs)) + kfree(worker); +} + +static int try_worker_shutdown(struct btrfs_worker_thread *worker) +{ + int freeit = 0; + + spin_lock_irq(&worker->lock); + spin_lock(&worker->workers->lock); + if (worker->workers->num_workers > 1 && + worker->idle && + !worker->working && + !list_empty(&worker->worker_list) && + list_empty(&worker->prio_pending) && + list_empty(&worker->pending) && + atomic_read(&worker->num_pending) == 0) { + freeit = 1; + list_del_init(&worker->worker_list); + worker->workers->num_workers--; + } + spin_unlock(&worker->workers->lock); + spin_unlock_irq(&worker->lock); + + if (freeit) + put_worker(worker); + return freeit; +} + +static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, + struct list_head *prio_head, + struct list_head *head) +{ + struct btrfs_work *work = NULL; + struct list_head *cur = NULL; + + if(!list_empty(prio_head)) + cur = prio_head->next; + + smp_mb(); + if (!list_empty(&worker->prio_pending)) + goto refill; + + if (!list_empty(head)) + cur = head->next; + + if (cur) + goto out; + +refill: + spin_lock_irq(&worker->lock); + list_splice_tail_init(&worker->prio_pending, prio_head); + list_splice_tail_init(&worker->pending, head); + + if (!list_empty(prio_head)) + cur = prio_head->next; + else if (!list_empty(head)) + cur = head->next; + spin_unlock_irq(&worker->lock); + + if (!cur) + goto out_fail; + +out: + work = list_entry(cur, struct btrfs_work, list); + +out_fail: + return work; +} + +/* + * main loop for servicing work items + */ +static int worker_loop(void *arg) +{ + struct btrfs_worker_thread *worker = arg; + struct list_head head; + struct list_head prio_head; + struct btrfs_work *work; + + INIT_LIST_HEAD(&head); + INIT_LIST_HEAD(&prio_head); + + do { +again: + while (1) { + + + work = get_next_work(worker, &prio_head, &head); + if (!work) + break; + + list_del(&work->list); + clear_bit(WORK_QUEUED_BIT, &work->flags); + + work->worker = worker; + + work->func(work); + + atomic_dec(&worker->num_pending); + /* + * unless this is an ordered work queue, + * 'work' was probably freed by func above. + */ + run_ordered_completions(worker->workers, work); + + check_pending_worker_creates(worker); + + } + + spin_lock_irq(&worker->lock); + check_idle_worker(worker); + + if (freezing(current)) { + worker->working = 0; + spin_unlock_irq(&worker->lock); + refrigerator(); + } else { + spin_unlock_irq(&worker->lock); + if (!kthread_should_stop()) { + cpu_relax(); + /* + * we've dropped the lock, did someone else + * jump_in? + */ + smp_mb(); + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) + continue; + + /* + * this short schedule allows more work to + * come in without the queue functions + * needing to go through wake_up_process() + * + * worker->working is still 1, so nobody + * is going to try and wake us up + */ + schedule_timeout(1); + smp_mb(); + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) + continue; + + if (kthread_should_stop()) + break; + + /* still no more work?, sleep for real */ + spin_lock_irq(&worker->lock); + set_current_state(TASK_INTERRUPTIBLE); + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) { + spin_unlock_irq(&worker->lock); + goto again; + } + + /* + * this makes sure we get a wakeup when someone + * adds something new to the queue + */ + worker->working = 0; + spin_unlock_irq(&worker->lock); + + if (!kthread_should_stop()) { + schedule_timeout(HZ * 120); + if (!worker->working && + try_worker_shutdown(worker)) { + return 0; + } + } + } + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +/* + * this will wait for all the worker threads to shutdown + */ +int btrfs_stop_workers(struct btrfs_workers *workers) +{ + struct list_head *cur; + struct btrfs_worker_thread *worker; + int can_stop; + + spin_lock_irq(&workers->lock); + list_splice_init(&workers->idle_list, &workers->worker_list); + while (!list_empty(&workers->worker_list)) { + cur = workers->worker_list.next; + worker = list_entry(cur, struct btrfs_worker_thread, + worker_list); + + atomic_inc(&worker->refs); + workers->num_workers -= 1; + if (!list_empty(&worker->worker_list)) { + list_del_init(&worker->worker_list); + put_worker(worker); + can_stop = 1; + } else + can_stop = 0; + spin_unlock_irq(&workers->lock); + if (can_stop) + kthread_stop(worker->task); + spin_lock_irq(&workers->lock); + put_worker(worker); + } + spin_unlock_irq(&workers->lock); + return 0; +} + +/* + * simple init on struct btrfs_workers + */ +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_helper) +{ + workers->num_workers = 0; + workers->num_workers_starting = 0; + INIT_LIST_HEAD(&workers->worker_list); + INIT_LIST_HEAD(&workers->idle_list); + INIT_LIST_HEAD(&workers->order_list); + INIT_LIST_HEAD(&workers->prio_order_list); + spin_lock_init(&workers->lock); + spin_lock_init(&workers->order_lock); + workers->max_workers = max; + workers->idle_thresh = 32; + workers->name = name; + workers->ordered = 0; + workers->atomic_start_pending = 0; + workers->atomic_worker_start = async_helper; +} + +/* + * starts new worker threads. This does not enforce the max worker + * count in case you need to temporarily go past it. + */ +static int __btrfs_start_workers(struct btrfs_workers *workers, + int num_workers) +{ + struct btrfs_worker_thread *worker; + int ret = 0; + int i; + + for (i = 0; i < num_workers; i++) { + worker = kzalloc(sizeof(*worker), GFP_NOFS); + if (!worker) { + ret = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->prio_pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); + + atomic_set(&worker->num_pending, 0); + atomic_set(&worker->refs, 1); + worker->workers = workers; + worker->task = kthread_run(worker_loop, worker, + "btrfs-%s-%d", workers->name, + workers->num_workers + i); + if (IS_ERR(worker->task)) { + ret = PTR_ERR(worker->task); + kfree(worker); + goto fail; + } + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->idle_list); + worker->idle = 1; + workers->num_workers++; + workers->num_workers_starting--; + WARN_ON(workers->num_workers_starting < 0); + spin_unlock_irq(&workers->lock); + } + return 0; +fail: + btrfs_stop_workers(workers); + return ret; +} + +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + spin_lock_irq(&workers->lock); + workers->num_workers_starting += num_workers; + spin_unlock_irq(&workers->lock); + return __btrfs_start_workers(workers, num_workers); +} + +/* + * run through the list and find a worker thread that doesn't have a lot + * to do right now. This can return null if we aren't yet at the thread + * count limit and all of the threads are busy. + */ +static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + struct list_head *next; + int enforce_min; + + enforce_min = (workers->num_workers + workers->num_workers_starting) < + workers->max_workers; + + /* + * if we find an idle thread, don't move it to the end of the + * idle list. This improves the chance that the next submission + * will reuse the same thread, and maybe catch it while it is still + * working + */ + if (!list_empty(&workers->idle_list)) { + next = workers->idle_list.next; + worker = list_entry(next, struct btrfs_worker_thread, + worker_list); + return worker; + } + if (enforce_min || list_empty(&workers->worker_list)) + return NULL; + + /* + * if we pick a busy task, move the task to the end of the list. + * hopefully this will keep things somewhat evenly balanced. + * Do the move in batches based on the sequence number. This groups + * requests submitted at roughly the same time onto the same worker. + */ + next = workers->worker_list.next; + worker = list_entry(next, struct btrfs_worker_thread, worker_list); + worker->sequence++; + + if (worker->sequence % workers->idle_thresh == 0) + list_move_tail(next, &workers->worker_list); + return worker; +} + +/* + * selects a worker thread to take the next job. This will either find + * an idle worker, start a new worker up to the max count, or just return + * one of the existing busy workers. + */ +static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + struct list_head *fallback; + +again: + spin_lock_irqsave(&workers->lock, flags); + worker = next_worker(workers); + + if (!worker) { + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) { + goto fallback; + } else if (workers->atomic_worker_start) { + workers->atomic_start_pending = 1; + goto fallback; + } else { + workers->num_workers_starting++; + spin_unlock_irqrestore(&workers->lock, flags); + /* we're below the limit, start another worker */ + __btrfs_start_workers(workers, 1); + goto again; + } + } + goto found; + +fallback: + fallback = NULL; + /* + * we have failed to find any workers, just + * return the first one we can find. + */ + if (!list_empty(&workers->worker_list)) + fallback = workers->worker_list.next; + if (!list_empty(&workers->idle_list)) + fallback = workers->idle_list.next; + BUG_ON(!fallback); + worker = list_entry(fallback, + struct btrfs_worker_thread, worker_list); +found: + /* + * this makes sure the worker doesn't exit before it is placed + * onto a busy/idle list + */ + atomic_inc(&worker->num_pending); + spin_unlock_irqrestore(&workers->lock, flags); + return worker; +} + +/* + * btrfs_requeue_work just puts the work item back on the tail of the list + * it was taken from. It is intended for use with long running work functions + * that make some progress and want to give the cpu up for others. + */ +int btrfs_requeue_work(struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker = work->worker; + unsigned long flags; + int wake = 0; + + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); + atomic_inc(&worker->num_pending); + + /* by definition we're busy, take ourselves off the idle + * list + */ + if (worker->idle) { + spin_lock(&worker->workers->lock); + worker->idle = 0; + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + spin_unlock(&worker->workers->lock); + } + if (!worker->working) { + wake = 1; + worker->working = 1; + } + + if (wake) + wake_up_process(worker->task); + spin_unlock_irqrestore(&worker->lock, flags); +out: + + return 0; +} + +void btrfs_set_work_high_prio(struct btrfs_work *work) +{ + set_bit(WORK_HIGH_PRIO_BIT, &work->flags); +} + +/* + * places a struct btrfs_work into the pending queue of one of the kthreads + */ +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + int wake = 0; + + /* don't requeue something already on a list */ + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + goto out; + + worker = find_worker(workers); + if (workers->ordered) { + /* + * you're not allowed to do ordered queues from an + * interrupt handler + */ + spin_lock(&workers->order_lock); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { + list_add_tail(&work->order_list, + &workers->prio_order_list); + } else { + list_add_tail(&work->order_list, &workers->order_list); + } + spin_unlock(&workers->order_lock); + } else { + INIT_LIST_HEAD(&work->order_list); + } + + spin_lock_irqsave(&worker->lock, flags); + + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); + check_busy_worker(worker); + + /* + * avoid calling into wake_up_process if this thread has already + * been kicked + */ + if (!worker->working) + wake = 1; + worker->working = 1; + + if (wake) + wake_up_process(worker->task); + spin_unlock_irqrestore(&worker->lock, flags); + +out: + return 0; +} diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h new file mode 100644 index 00000000000..5077746cf85 --- /dev/null +++ b/fs/btrfs/async-thread.h @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ASYNC_THREAD_ +#define __BTRFS_ASYNC_THREAD_ + +struct btrfs_worker_thread; + +/* + * This is similar to a workqueue, but it is meant to spread the operations + * across all available cpus instead of just the CPU that was used to + * queue the work. There is also some batching introduced to try and + * cut down on context switches. + * + * By default threads are added on demand up to 2 * the number of cpus. + * Changing struct btrfs_workers->max_workers is one way to prevent + * demand creation of kthreads. + * + * the basic model of these worker threads is to embed a btrfs_work + * structure in your own data struct, and use container_of in a + * work function to get back to your data struct. + */ +struct btrfs_work { + /* + * func should be set to the function you want called + * your work struct is passed as the only arg + * + * ordered_func must be set for work sent to an ordered work queue, + * and it is called to complete a given work item in the same + * order they were sent to the queue. + */ + void (*func)(struct btrfs_work *work); + void (*ordered_func)(struct btrfs_work *work); + void (*ordered_free)(struct btrfs_work *work); + + /* + * flags should be set to zero. It is used to make sure the + * struct is only inserted once into the list. + */ + unsigned long flags; + + /* don't touch these */ + struct btrfs_worker_thread *worker; + struct list_head list; + struct list_head order_list; +}; + +struct btrfs_workers { + /* current number of running workers */ + int num_workers; + + int num_workers_starting; + + /* max number of workers allowed. changed by btrfs_start_workers */ + int max_workers; + + /* once a worker has this many requests or fewer, it is idle */ + int idle_thresh; + + /* force completions in the order they were queued */ + int ordered; + + /* more workers required, but in an interrupt handler */ + int atomic_start_pending; + + /* + * are we allowed to sleep while starting workers or are we required + * to start them at a later time? If we can't sleep, this indicates + * which queue we need to use to schedule thread creation. + */ + struct btrfs_workers *atomic_worker_start; + + /* list with all the work threads. The workers on the idle thread + * may be actively servicing jobs, but they haven't yet hit the + * idle thresh limit above. + */ + struct list_head worker_list; + struct list_head idle_list; + + /* + * when operating in ordered mode, this maintains the list + * of work items waiting for completion + */ + struct list_head order_list; + struct list_head prio_order_list; + + /* lock for finding the next worker thread to queue on */ + spinlock_t lock; + + /* lock for the ordered lists */ + spinlock_t order_lock; + + /* extra name for this worker, used for current->name */ + char *name; +}; + +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); +int btrfs_stop_workers(struct btrfs_workers *workers); +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_starter); +int btrfs_requeue_work(struct btrfs_work *work); +void btrfs_set_work_high_prio(struct btrfs_work *work); +#endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h new file mode 100644 index 00000000000..3f1f50d9d91 --- /dev/null +++ b/fs/btrfs/btrfs_inode.h @@ -0,0 +1,170 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_I__ +#define __BTRFS_I__ + +#include "extent_map.h" +#include "extent_io.h" +#include "ordered-data.h" + +/* in memory btrfs inode */ +struct btrfs_inode { + /* which subvolume this inode belongs to */ + struct btrfs_root *root; + + /* key used to find this inode on disk. This is used by the code + * to read in roots of subvolumes + */ + struct btrfs_key location; + + /* the extent_tree has caches of all the extent mappings to disk */ + struct extent_map_tree extent_tree; + + /* the io_tree does range state (DIRTY, LOCKED etc) */ + struct extent_io_tree io_tree; + + /* special utility tree used to record which mirrors have already been + * tried when checksums fail for a given block + */ + struct extent_io_tree io_failure_tree; + + /* held while logging the inode in tree-log.c */ + struct mutex log_mutex; + + /* used to order data wrt metadata */ + struct btrfs_ordered_inode_tree ordered_tree; + + /* for keeping track of orphaned inodes */ + struct list_head i_orphan; + + /* list of all the delalloc inodes in the FS. There are times we need + * to write all the delalloc pages to disk, and this list is used + * to walk them all. + */ + struct list_head delalloc_inodes; + + /* + * list for tracking inodes that must be sent to disk before a + * rename or truncate commit + */ + struct list_head ordered_operations; + + /* node for the red-black tree that links inodes in subvolume root */ + struct rb_node rb_node; + + /* the space_info for where this inode's data allocations are done */ + struct btrfs_space_info *space_info; + + /* full 64 bit generation number, struct vfs_inode doesn't have a big + * enough field for this. + */ + u64 generation; + + /* sequence number for NFS changes */ + u64 sequence; + + /* + * transid of the trans_handle that last modified this inode + */ + u64 last_trans; + + /* + * log transid when this inode was last modified + */ + u64 last_sub_trans; + + /* + * transid that last logged this inode + */ + u64 logged_trans; + + /* total number of bytes pending delalloc, used by stat to calc the + * real block usage of the file + */ + u64 delalloc_bytes; + + /* total number of bytes that may be used for this inode for + * delalloc + */ + u64 reserved_bytes; + + /* + * the size of the file stored in the metadata on disk. data=ordered + * means the in-memory i_size might be larger than the size on disk + * because not all the blocks are written yet. + */ + u64 disk_i_size; + + /* flags field from the on disk inode */ + u32 flags; + + /* + * if this is a directory then index_cnt is the counter for the index + * number for new files that are created + */ + u64 index_cnt; + + /* the start of block group preferred for allocations. */ + u64 block_group; + + /* the fsync log has some corner cases that mean we have to check + * directories to see if any unlinks have been done before + * the directory was logged. See tree-log.c for all the + * details + */ + u64 last_unlink_trans; + + /* + * Counters to keep track of the number of extent item's we may use due + * to delalloc and such. outstanding_extents is the number of extent + * items we think we'll end up using, and reserved_extents is the number + * of extent items we've reserved metadata for. + */ + spinlock_t accounting_lock; + int reserved_extents; + int outstanding_extents; + + /* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. When it is set + * the btrfs file release call will add this inode to the + * ordered operations list so that we make sure to flush out any + * new data the application may have written before commit. + * + * yes, its silly to have a single bitflag, but we might grow more + * of these. + */ + unsigned ordered_data_close:1; + unsigned dummy_inode:1; + + struct inode vfs_inode; +}; + +static inline struct btrfs_inode *BTRFS_I(struct inode *inode) +{ + return container_of(inode, struct btrfs_inode, vfs_inode); +} + +static inline void btrfs_i_size_write(struct inode *inode, u64 size) +{ + i_size_write(inode, size); + BTRFS_I(inode)->disk_i_size = size; +} + +#endif diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h new file mode 100644 index 00000000000..7c4503ef6ef --- /dev/null +++ b/fs/btrfs/compat.h @@ -0,0 +1,7 @@ +#ifndef _COMPAT_H_ +#define _COMPAT_H_ + +#define btrfs_drop_nlink(inode) drop_nlink(inode) +#define btrfs_inc_nlink(inode) inc_nlink(inode) + +#endif /* _COMPAT_H_ */ diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c new file mode 100644 index 00000000000..a11a32058b5 --- /dev/null +++ b/fs/btrfs/compression.c @@ -0,0 +1,707 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/backing-dev.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/bit_spinlock.h> +#include <linux/pagevec.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "ordered-data.h" +#include "compression.h" +#include "extent_io.h" +#include "extent_map.h" + +struct compressed_bio { + /* number of bios pending for this compressed extent */ + atomic_t pending_bios; + + /* the pages with the compressed data on them */ + struct page **compressed_pages; + + /* inode that owns this data */ + struct inode *inode; + + /* starting offset in the inode for our pages */ + u64 start; + + /* number of bytes in the inode we're working on */ + unsigned long len; + + /* number of bytes on disk */ + unsigned long compressed_len; + + /* number of compressed pages in the array */ + unsigned long nr_pages; + + /* IO errors */ + int errors; + int mirror_num; + + /* for reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + + /* + * the start of a variable length array of checksums only + * used by reads + */ + u32 sums; +}; + +static inline int compressed_bio_size(struct btrfs_root *root, + unsigned long disk_size) +{ + u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + return sizeof(struct compressed_bio) + + ((disk_size + root->sectorsize - 1) / root->sectorsize) * + csum_size; +} + +static struct bio *compressed_bio_alloc(struct block_device *bdev, + u64 first_byte, gfp_t gfp_flags) +{ + struct bio *bio; + int nr_vecs; + + nr_vecs = bio_get_nr_vecs(bdev); + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_size = 0; + bio->bi_bdev = bdev; + bio->bi_sector = first_byte >> 9; + } + return bio; +} + +static int check_compressed_csum(struct inode *inode, + struct compressed_bio *cb, + u64 disk_start) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page *page; + unsigned long i; + char *kaddr; + u32 csum; + u32 *cb_sum = &cb->sums; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + return 0; + + for (i = 0; i < cb->nr_pages; i++) { + page = cb->compressed_pages[i]; + csum = ~(u32)0; + + kaddr = kmap_atomic(page, KM_USER0); + csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr, KM_USER0); + + if (csum != *cb_sum) { + printk(KERN_INFO "btrfs csum failed ino %lu " + "extent %llu csum %u " + "wanted %u mirror %d\n", inode->i_ino, + (unsigned long long)disk_start, + csum, *cb_sum, cb->mirror_num); + ret = -EIO; + goto fail; + } + cb_sum++; + + } + ret = 0; +fail: + return ret; +} + +/* when we finish reading compressed pages from the disk, we + * decompress them and then run the bio end_io routines on the + * decompressed pages (in the inode address space). + * + * This allows the checksumming and other IO error handling routines + * to work normally + * + * The compressed pages are freed here, and it must be run + * in process context + */ +static void end_compressed_bio_read(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + int ret; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + inode = cb->inode; + ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9); + if (ret) + goto csum_failed; + + /* ok, we're the last bio for this extent, lets start + * the decompression. + */ + tree = &BTRFS_I(inode)->io_tree; + ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, + cb->start, + cb->orig_bio->bi_io_vec, + cb->orig_bio->bi_vcnt, + cb->compressed_len); +csum_failed: + if (ret) + cb->errors = 1; + + /* release the compressed pages */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* do io completion on the original bio */ + if (cb->errors) { + bio_io_error(cb->orig_bio); + } else { + int bio_index = 0; + struct bio_vec *bvec = cb->orig_bio->bi_io_vec; + + /* + * we have verified the checksum already, set page + * checked so the end_io handlers know about it + */ + while (bio_index < cb->orig_bio->bi_vcnt) { + SetPageChecked(bvec->bv_page); + bvec++; + bio_index++; + } + bio_endio(cb->orig_bio, 0); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * Clear the writeback bits on all of the file + * pages for a compressed write + */ +static noinline int end_compressed_writeback(struct inode *inode, u64 start, + unsigned long ram_size) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; + struct page *pages[16]; + unsigned long nr_pages = end_index - index + 1; + int i; + int ret; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + nr_pages -= 1; + index += 1; + continue; + } + for (i = 0; i < ret; i++) { + end_page_writeback(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + } + /* the inode may be gone now */ + return 0; +} + +/* + * do the cleanup once all the compressed pages hit the disk. + * This will clear writeback on the file pages and free the compressed + * pages. + * + * This also calls the writeback end hooks for the file pages so that + * metadata and checksums can be updated in the file. + */ +static void end_compressed_bio_write(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + /* ok, we're the last bio for this extent, step one is to + * call back into the FS and do all the end_io operations + */ + inode = cb->inode; + tree = &BTRFS_I(inode)->io_tree; + cb->compressed_pages[0]->mapping = cb->inode->i_mapping; + tree->ops->writepage_end_io_hook(cb->compressed_pages[0], + cb->start, + cb->start + cb->len - 1, + NULL, 1); + cb->compressed_pages[0]->mapping = NULL; + + end_compressed_writeback(inode, cb->start, cb->len); + /* note, our inode could be gone now */ + + /* + * release the compressed pages, these came from alloc_page and + * are not attached to the inode at all + */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * worker function to build and submit bios for previously compressed pages. + * The corresponding pages in the inode should be marked for writeback + * and the compressed pages should have a reference on them for dropping + * when the IO is complete. + * + * This also checksums the file bytes and gets things ready for + * the end io hooks. + */ +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages) +{ + struct bio *bio = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct compressed_bio *cb; + unsigned long bytes_left; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int page_index = 0; + struct page *page; + u64 first_byte = disk_start; + struct block_device *bdev; + int ret; + + WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->start = start; + cb->len = len; + cb->mirror_num = 0; + cb->compressed_pages = compressed_pages; + cb->compressed_len = compressed_len; + cb->orig_bio = NULL; + cb->nr_pages = nr_pages; + + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + atomic_inc(&cb->pending_bios); + + /* create and submit bios for the compressed pages */ + bytes_left = compressed_len; + for (page_index = 0; page_index < cb->nr_pages; page_index++) { + page = compressed_pages[page_index]; + page->mapping = inode->i_mapping; + if (bio->bi_size) + ret = io_tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(bio); + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + } + if (bytes_left < PAGE_CACHE_SIZE) { + printk("bytes left %lu compress len %lu nr %lu\n", + bytes_left, cb->compressed_len, cb->nr_pages); + } + bytes_left -= PAGE_CACHE_SIZE; + first_byte += PAGE_CACHE_SIZE; + cond_resched(); + } + bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + return 0; +} + +static noinline int add_ra_bio_pages(struct inode *inode, + u64 compressed_end, + struct compressed_bio *cb) +{ + unsigned long end_index; + unsigned long page_index; + u64 last_offset; + u64 isize = i_size_read(inode); + int ret; + struct page *page; + unsigned long nr_pages = 0; + struct extent_map *em; + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + struct extent_map_tree *em_tree; + struct extent_io_tree *tree; + u64 end; + int misses = 0; + + page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page; + last_offset = (page_offset(page) + PAGE_CACHE_SIZE); + em_tree = &BTRFS_I(inode)->extent_tree; + tree = &BTRFS_I(inode)->io_tree; + + if (isize == 0) + return 0; + + end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + while (last_offset < compressed_end) { + page_index = last_offset >> PAGE_CACHE_SHIFT; + + if (page_index > end_index) + break; + + rcu_read_lock(); + page = radix_tree_lookup(&mapping->page_tree, page_index); + rcu_read_unlock(); + if (page) { + misses++; + if (misses > 4) + break; + goto next; + } + + page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS); + if (!page) + break; + + page->index = page_index; + /* + * what we want to do here is call add_to_page_cache_lru, + * but that isn't exported, so we reproduce it here + */ + if (add_to_page_cache(page, mapping, + page->index, GFP_NOFS)) { + page_cache_release(page); + goto next; + } + + /* open coding of lru_cache_add, also not exported */ + page_cache_get(page); + if (!pagevec_add(&pvec, page)) + __pagevec_lru_add_file(&pvec); + + end = last_offset + PAGE_CACHE_SIZE - 1; + /* + * at this point, we have a locked page in the page cache + * for these bytes in the file. But, we have to make + * sure they map to this compressed extent on disk. + */ + set_page_extent_mapped(page); + lock_extent(tree, last_offset, end, GFP_NOFS); + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, last_offset, + PAGE_CACHE_SIZE); + read_unlock(&em_tree->lock); + + if (!em || last_offset < em->start || + (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || + (em->block_start >> 9) != cb->orig_bio->bi_sector) { + free_extent_map(em); + unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + break; + } + free_extent_map(em); + + if (page->index == end_index) { + char *userpage; + size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + int zeros; + zeros = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + zero_offset, 0, zeros); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + } + } + + ret = bio_add_page(cb->orig_bio, page, + PAGE_CACHE_SIZE, 0); + + if (ret == PAGE_CACHE_SIZE) { + nr_pages++; + page_cache_release(page); + } else { + unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + break; + } +next: + last_offset += PAGE_CACHE_SIZE; + } + if (pagevec_count(&pvec)) + __pagevec_lru_add_file(&pvec); + return 0; +} + +/* + * for a compressed read, the bio we get passed has all the inode pages + * in it. We don't actually do IO on those pages but allocate new ones + * to hold the compressed pages on disk. + * + * bio->bi_sector points to the compressed extent on disk + * bio->bi_io_vec points to all of the inode pages + * bio->bi_vcnt is a count of pages + * + * After the compressed pages are read, we copy the bytes into the + * bio we were passed and then call the bio end_io calls + */ +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *em_tree; + struct compressed_bio *cb; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + unsigned long compressed_len; + unsigned long nr_pages; + unsigned long page_index; + struct page *page; + struct block_device *bdev; + struct bio *comp_bio; + u64 cur_disk_byte = (u64)bio->bi_sector << 9; + u64 em_len; + u64 em_start; + struct extent_map *em; + int ret; + u32 *sums; + + tree = &BTRFS_I(inode)->io_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + + /* we need the actual starting offset of this extent in the file */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, + page_offset(bio->bi_io_vec->bv_page), + PAGE_CACHE_SIZE); + read_unlock(&em_tree->lock); + + compressed_len = em->block_len; + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->mirror_num = mirror_num; + sums = &cb->sums; + + cb->start = em->orig_start; + em_len = em->len; + em_start = em->start; + + free_extent_map(em); + em = NULL; + + cb->len = uncompressed_len; + cb->compressed_len = compressed_len; + cb->orig_bio = bio; + + nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, + GFP_NOFS); + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + for (page_index = 0; page_index < nr_pages; page_index++) { + cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + } + cb->nr_pages = nr_pages; + + add_ra_bio_pages(inode, em_start + em_len, cb); + + /* include any pages we added in add_ra-bio_pages */ + uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + cb->len = uncompressed_len; + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + atomic_inc(&cb->pending_bios); + + for (page_index = 0; page_index < nr_pages; page_index++) { + page = cb->compressed_pages[page_index]; + page->mapping = inode->i_mapping; + page->index = em_start >> PAGE_CACHE_SHIFT; + + if (comp_bio->bi_size) + ret = tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + comp_bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + btrfs_lookup_bio_sums(root, inode, comp_bio, + sums); + } + sums += (comp_bio->bi_size + root->sectorsize - 1) / + root->sectorsize; + + ret = btrfs_map_bio(root, READ, comp_bio, + mirror_num, 0); + BUG_ON(ret); + + bio_put(comp_bio); + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, + GFP_NOFS); + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + + bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0); + } + cur_disk_byte += PAGE_CACHE_SIZE; + } + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) + btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + + ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); + BUG_ON(ret); + + bio_put(comp_bio); + return 0; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h new file mode 100644 index 00000000000..421f5b4aa71 --- /dev/null +++ b/fs/btrfs/compression.h @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_COMPRESSION_ +#define __BTRFS_COMPRESSION_ + +int btrfs_zlib_decompress(unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen); +int btrfs_zlib_compress_pages(struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); +int btrfs_zlib_decompress_biovec(struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen); +void btrfs_zlib_exit(void); +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages); +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); +#endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c new file mode 100644 index 00000000000..c4bc570a396 --- /dev/null +++ b/fs/btrfs/ctree.c @@ -0,0 +1,4388 @@ +/* + * Copyright (C) 2007,2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "locking.h" + +static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int level); +static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, int extend); +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src, int empty); +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst_buf, + struct extent_buffer *src_buf); +static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot); +static int setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr); + + +struct btrfs_path *btrfs_alloc_path(void) +{ + struct btrfs_path *path; + path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); + if (path) + path->reada = 1; + return path; +} + +/* + * set all locked nodes in the path to blocking locks. This should + * be done before scheduling + */ +noinline void btrfs_set_path_blocking(struct btrfs_path *p) +{ + int i; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (p->nodes[i] && p->locks[i]) + btrfs_set_lock_blocking(p->nodes[i]); + } +} + +/* + * reset all the locked nodes in the patch to spinning locks. + * + * held is used to keep lockdep happy, when lockdep is enabled + * we set held to a blocking lock before we go around and + * retake all the spinlocks in the path. You can safely use NULL + * for held + */ +noinline void btrfs_clear_path_blocking(struct btrfs_path *p, + struct extent_buffer *held) +{ + int i; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* lockdep really cares that we take all of these spinlocks + * in the right order. If any of the locks in the path are not + * currently blocking, it is going to complain. So, make really + * really sure by forcing the path to blocking before we clear + * the path blocking. + */ + if (held) + btrfs_set_lock_blocking(held); + btrfs_set_path_blocking(p); +#endif + + for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { + if (p->nodes[i] && p->locks[i]) + btrfs_clear_lock_blocking(p->nodes[i]); + } + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (held) + btrfs_clear_lock_blocking(held); +#endif +} + +/* this also releases the path */ +void btrfs_free_path(struct btrfs_path *p) +{ + btrfs_release_path(NULL, p); + kmem_cache_free(btrfs_path_cachep, p); +} + +/* + * path release drops references on the extent buffers in the path + * and it drops any locks held by this path + * + * It is safe to call this on paths that no locks or extent buffers held. + */ +noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) +{ + int i; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + p->slots[i] = 0; + if (!p->nodes[i]) + continue; + if (p->locks[i]) { + btrfs_tree_unlock(p->nodes[i]); + p->locks[i] = 0; + } + free_extent_buffer(p->nodes[i]); + p->nodes[i] = NULL; + } +} + +/* + * safely gets a reference on the root node of a tree. A lock + * is not taken, so a concurrent writer may put a different node + * at the root of the tree. See btrfs_lock_root_node for the + * looping required. + * + * The extent buffer returned by this has a reference taken, so + * it won't disappear. It may stop being the root of the tree + * at any time because there are no locks held. + */ +struct extent_buffer *btrfs_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + spin_lock(&root->node_lock); + eb = root->node; + extent_buffer_get(eb); + spin_unlock(&root->node_lock); + return eb; +} + +/* loop around taking references on and locking the root node of the + * tree until you end up with a lock on the root. A locked buffer + * is returned, with a reference held. + */ +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_lock(eb); + + spin_lock(&root->node_lock); + if (eb == root->node) { + spin_unlock(&root->node_lock); + break; + } + spin_unlock(&root->node_lock); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* cowonly root (everything not a reference counted cow subvolume), just get + * put onto a simple dirty list. transaction.c walks this to make sure they + * get properly updated on disk. + */ +static void add_root_to_dirty_list(struct btrfs_root *root) +{ + if (root->track_dirty && list_empty(&root->dirty_list)) { + list_add(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); + } +} + +/* + * used by snapshot creation to make a copy of a root for a tree with + * a given objectid. The buffer with the new root node is returned in + * cow_ret, and this func returns zero on success or a negative error code. + */ +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid) +{ + struct extent_buffer *cow; + u32 nritems; + int ret = 0; + int level; + struct btrfs_disk_key disk_key; + + WARN_ON(root->ref_cows && trans->transid != + root->fs_info->running_transaction->transid); + WARN_ON(root->ref_cows && trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); + if (level == 0) + btrfs_item_key(buf, &disk_key, 0); + else + btrfs_node_key(buf, &disk_key, 0); + + cow = btrfs_alloc_free_block(trans, root, buf->len, 0, + new_root_objectid, &disk_key, level, + buf->start, 0); + if (IS_ERR(cow)) + return PTR_ERR(cow); + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | + BTRFS_HEADER_FLAG_RELOC); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); + else + btrfs_set_header_owner(cow, new_root_objectid); + + write_extent_buffer(cow, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(cow), + BTRFS_FSID_SIZE); + + WARN_ON(btrfs_header_generation(buf) > trans->transid); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + + if (ret) + return ret; + + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +/* + * check if the tree block can be shared by multiple trees + */ +int btrfs_block_can_be_shared(struct btrfs_root *root, + struct extent_buffer *buf) +{ + /* + * Tree blocks not in refernece counted trees and tree roots + * are never shared. If a block was allocated after the last + * snapshot and the block was not allocated by tree relocation, + * we know the block is not shared. + */ + if (root->ref_cows && + buf != root->node && buf != root->commit_root && + (btrfs_header_generation(buf) <= + btrfs_root_last_snapshot(&root->root_item) || + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + return 1; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (root->ref_cows && + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + return 1; +#endif + return 0; +} + +static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *cow) +{ + u64 refs; + u64 owner; + u64 flags; + u64 new_flags = 0; + int ret; + + /* + * Backrefs update rules: + * + * Always use full backrefs for extent pointers in tree block + * allocated by tree relocation. + * + * If a shared tree block is no longer referenced by its owner + * tree (btrfs_header_owner(buf) == root->root_key.objectid), + * use full backrefs for extent pointers in tree block. + * + * If a tree block is been relocating + * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), + * use full backrefs for extent pointers in tree block. + * The reason for this is some operations (such as drop tree) + * are only allowed for blocks use full backrefs. + */ + + if (btrfs_block_can_be_shared(root, buf)) { + ret = btrfs_lookup_extent_info(trans, root, buf->start, + buf->len, &refs, &flags); + BUG_ON(ret); + BUG_ON(refs == 0); + } else { + refs = 1; + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; + else + flags = 0; + } + + owner = btrfs_header_owner(buf); + BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + + if (refs > 1) { + if ((owner == root->root_key.objectid || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { + ret = btrfs_inc_ref(trans, root, buf, 1); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_dec_ref(trans, root, buf, 0); + BUG_ON(ret); + ret = btrfs_inc_ref(trans, root, cow, 1); + BUG_ON(ret); + } + new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else { + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + BUG_ON(ret); + } + if (new_flags != 0) { + ret = btrfs_set_disk_extent_flags(trans, root, + buf->start, + buf->len, + new_flags, 0); + BUG_ON(ret); + } + } else { + if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + BUG_ON(ret); + ret = btrfs_dec_ref(trans, root, buf, 1); + BUG_ON(ret); + } + clean_tree_block(trans, root, buf); + } + return 0; +} + +/* + * does the dirty work in cow of a single block. The parent block (if + * supplied) is updated to point to the new cow copy. The new buffer is marked + * dirty and returned locked. If you modify the block it needs to be marked + * dirty again. + * + * search_start -- an allocation hint for the new block + * + * empty_size -- a hint that you plan on doing more cow. This is the size in + * bytes the allocator should try to find free next to the block it returns. + * This is just a hint and may be ignored by the allocator. + */ +static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *cow; + int level; + int unlock_orig = 0; + u64 parent_start; + + if (*cow_ret == buf) + unlock_orig = 1; + + btrfs_assert_tree_locked(buf); + + WARN_ON(root->ref_cows && trans->transid != + root->fs_info->running_transaction->transid); + WARN_ON(root->ref_cows && trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + + if (level == 0) + btrfs_item_key(buf, &disk_key, 0); + else + btrfs_node_key(buf, &disk_key, 0); + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent) + parent_start = parent->start; + else + parent_start = 0; + } else + parent_start = 0; + + cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, + root->root_key.objectid, &disk_key, + level, search_start, empty_size); + if (IS_ERR(cow)) + return PTR_ERR(cow); + + /* cow is set to blocking by btrfs_init_new_buffer */ + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | + BTRFS_HEADER_FLAG_RELOC); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); + else + btrfs_set_header_owner(cow, root->root_key.objectid); + + write_extent_buffer(cow, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(cow), + BTRFS_FSID_SIZE); + + update_ref_for_cow(trans, root, buf, cow); + + if (buf == root->node) { + WARN_ON(parent && parent != buf); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + parent_start = buf->start; + else + parent_start = 0; + + spin_lock(&root->node_lock); + root->node = cow; + extent_buffer_get(cow); + spin_unlock(&root->node_lock); + + btrfs_free_tree_block(trans, root, buf->start, buf->len, + parent_start, root->root_key.objectid, level); + free_extent_buffer(buf); + add_root_to_dirty_list(root); + } else { + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + parent_start = parent->start; + else + parent_start = 0; + + WARN_ON(trans->transid != btrfs_header_generation(parent)); + btrfs_set_node_blockptr(parent, parent_slot, + cow->start); + btrfs_set_node_ptr_generation(parent, parent_slot, + trans->transid); + btrfs_mark_buffer_dirty(parent); + btrfs_free_tree_block(trans, root, buf->start, buf->len, + parent_start, root->root_key.objectid, level); + } + if (unlock_orig) + btrfs_tree_unlock(buf); + free_extent_buffer(buf); + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +static inline int should_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf) +{ + if (btrfs_header_generation(buf) == trans->transid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && + !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + return 0; + return 1; +} + +/* + * cows a single block, see __btrfs_cow_block for the real work. + * This version of it has extra checks so that a block isn't cow'd more than + * once per transaction, as long as it hasn't been written yet + */ +noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret) +{ + u64 search_start; + int ret; + + if (trans->transaction != root->fs_info->running_transaction) { + printk(KERN_CRIT "trans %llu running %llu\n", + (unsigned long long)trans->transid, + (unsigned long long) + root->fs_info->running_transaction->transid); + WARN_ON(1); + } + if (trans->transid != root->fs_info->generation) { + printk(KERN_CRIT "trans %llu running %llu\n", + (unsigned long long)trans->transid, + (unsigned long long)root->fs_info->generation); + WARN_ON(1); + } + + if (!should_cow_block(trans, root, buf)) { + *cow_ret = buf; + return 0; + } + + search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + + if (parent) + btrfs_set_lock_blocking(parent); + btrfs_set_lock_blocking(buf); + + ret = __btrfs_cow_block(trans, root, buf, parent, + parent_slot, cow_ret, search_start, 0); + return ret; +} + +/* + * helper function for defrag to decide if two blocks pointed to by a + * node are actually close by + */ +static int close_blocks(u64 blocknr, u64 other, u32 blocksize) +{ + if (blocknr < other && other - (blocknr + blocksize) < 32768) + return 1; + if (blocknr > other && blocknr - (other + blocksize) < 32768) + return 1; + return 0; +} + +/* + * compare two keys in a memcmp fashion + */ +static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) +{ + struct btrfs_key k1; + + btrfs_disk_key_to_cpu(&k1, disk); + + return btrfs_comp_cpu_keys(&k1, k2); +} + +/* + * same as comp_keys only with two btrfs_key's + */ +int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) +{ + if (k1->objectid > k2->objectid) + return 1; + if (k1->objectid < k2->objectid) + return -1; + if (k1->type > k2->type) + return 1; + if (k1->type < k2->type) + return -1; + if (k1->offset > k2->offset) + return 1; + if (k1->offset < k2->offset) + return -1; + return 0; +} + +/* + * this is used by the defrag code to go through all the + * leaves pointed to by a node and reallocate them so that + * disk order is close to key order + */ +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, int cache_only, u64 *last_ret, + struct btrfs_key *progress) +{ + struct extent_buffer *cur; + u64 blocknr; + u64 gen; + u64 search_start = *last_ret; + u64 last_block = 0; + u64 other; + u32 parent_nritems; + int end_slot; + int i; + int err = 0; + int parent_level; + int uptodate; + u32 blocksize; + int progress_passed = 0; + struct btrfs_disk_key disk_key; + + parent_level = btrfs_header_level(parent); + if (cache_only && parent_level != 1) + return 0; + + if (trans->transaction != root->fs_info->running_transaction) + WARN_ON(1); + if (trans->transid != root->fs_info->generation) + WARN_ON(1); + + parent_nritems = btrfs_header_nritems(parent); + blocksize = btrfs_level_size(root, parent_level - 1); + end_slot = parent_nritems; + + if (parent_nritems == 1) + return 0; + + btrfs_set_lock_blocking(parent); + + for (i = start_slot; i < end_slot; i++) { + int close = 1; + + if (!parent->map_token) { + map_extent_buffer(parent, + btrfs_node_key_ptr_offset(i), + sizeof(struct btrfs_key_ptr), + &parent->map_token, &parent->kaddr, + &parent->map_start, &parent->map_len, + KM_USER1); + } + btrfs_node_key(parent, &disk_key, i); + if (!progress_passed && comp_keys(&disk_key, progress) < 0) + continue; + + progress_passed = 1; + blocknr = btrfs_node_blockptr(parent, i); + gen = btrfs_node_ptr_generation(parent, i); + if (last_block == 0) + last_block = blocknr; + + if (i > 0) { + other = btrfs_node_blockptr(parent, i - 1); + close = close_blocks(blocknr, other, blocksize); + } + if (!close && i < end_slot - 2) { + other = btrfs_node_blockptr(parent, i + 1); + close = close_blocks(blocknr, other, blocksize); + } + if (close) { + last_block = blocknr; + continue; + } + if (parent->map_token) { + unmap_extent_buffer(parent, parent->map_token, + KM_USER1); + parent->map_token = NULL; + } + + cur = btrfs_find_tree_block(root, blocknr, blocksize); + if (cur) + uptodate = btrfs_buffer_uptodate(cur, gen); + else + uptodate = 0; + if (!cur || !uptodate) { + if (cache_only) { + free_extent_buffer(cur); + continue; + } + if (!cur) { + cur = read_tree_block(root, blocknr, + blocksize, gen); + } else if (!uptodate) { + btrfs_read_buffer(cur, gen); + } + } + if (search_start == 0) + search_start = last_block; + + btrfs_tree_lock(cur); + btrfs_set_lock_blocking(cur); + err = __btrfs_cow_block(trans, root, cur, parent, i, + &cur, search_start, + min(16 * blocksize, + (end_slot - i) * blocksize)); + if (err) { + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + break; + } + search_start = cur->start; + last_block = cur->start; + *last_ret = search_start; + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + } + if (parent->map_token) { + unmap_extent_buffer(parent, parent->map_token, + KM_USER1); + parent->map_token = NULL; + } + return err; +} + +/* + * The leaf data grows from end-to-front in the node. + * this returns the address of the start of the last item, + * which is the stop of the leaf data stack + */ +static inline unsigned int leaf_data_end(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(root); + return btrfs_item_offset_nr(leaf, nr - 1); +} + +/* + * extra debugging checks to make sure all the items in a key are + * well formed and in the proper order + */ +static int check_node(struct btrfs_root *root, struct btrfs_path *path, + int level) +{ + struct extent_buffer *parent = NULL; + struct extent_buffer *node = path->nodes[level]; + struct btrfs_disk_key parent_key; + struct btrfs_disk_key node_key; + int parent_slot; + int slot; + struct btrfs_key cpukey; + u32 nritems = btrfs_header_nritems(node); + + if (path->nodes[level + 1]) + parent = path->nodes[level + 1]; + + slot = path->slots[level]; + BUG_ON(nritems == 0); + if (parent) { + parent_slot = path->slots[level + 1]; + btrfs_node_key(parent, &parent_key, parent_slot); + btrfs_node_key(node, &node_key, 0); + BUG_ON(memcmp(&parent_key, &node_key, + sizeof(struct btrfs_disk_key))); + BUG_ON(btrfs_node_blockptr(parent, parent_slot) != + btrfs_header_bytenr(node)); + } + BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root)); + if (slot != 0) { + btrfs_node_key_to_cpu(node, &cpukey, slot - 1); + btrfs_node_key(node, &node_key, slot); + BUG_ON(comp_keys(&node_key, &cpukey) <= 0); + } + if (slot < nritems - 1) { + btrfs_node_key_to_cpu(node, &cpukey, slot + 1); + btrfs_node_key(node, &node_key, slot); + BUG_ON(comp_keys(&node_key, &cpukey) >= 0); + } + return 0; +} + +/* + * extra checking to make sure all the items in a leaf are + * well formed and in the proper order + */ +static int check_leaf(struct btrfs_root *root, struct btrfs_path *path, + int level) +{ + struct extent_buffer *leaf = path->nodes[level]; + struct extent_buffer *parent = NULL; + int parent_slot; + struct btrfs_key cpukey; + struct btrfs_disk_key parent_key; + struct btrfs_disk_key leaf_key; + int slot = path->slots[0]; + + u32 nritems = btrfs_header_nritems(leaf); + + if (path->nodes[level + 1]) + parent = path->nodes[level + 1]; + + if (nritems == 0) + return 0; + + if (parent) { + parent_slot = path->slots[level + 1]; + btrfs_node_key(parent, &parent_key, parent_slot); + btrfs_item_key(leaf, &leaf_key, 0); + + BUG_ON(memcmp(&parent_key, &leaf_key, + sizeof(struct btrfs_disk_key))); + BUG_ON(btrfs_node_blockptr(parent, parent_slot) != + btrfs_header_bytenr(leaf)); + } + if (slot != 0 && slot < nritems - 1) { + btrfs_item_key(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1); + if (comp_keys(&leaf_key, &cpukey) <= 0) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad key\n", slot); + BUG_ON(1); + } + if (btrfs_item_offset_nr(leaf, slot - 1) != + btrfs_item_end_nr(leaf, slot)) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad\n", slot); + BUG_ON(1); + } + } + if (slot < nritems - 1) { + btrfs_item_key(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1); + BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0); + if (btrfs_item_offset_nr(leaf, slot) != + btrfs_item_end_nr(leaf, slot + 1)) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad\n", slot); + BUG_ON(1); + } + } + BUG_ON(btrfs_item_offset_nr(leaf, 0) + + btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root)); + return 0; +} + +static noinline int check_block(struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + return 0; + if (level == 0) + return check_leaf(root, path, level); + return check_node(root, path, level); +} + +/* + * search for key in the extent_buffer. The items start at offset p, + * and they are item_size apart. There are 'max' items in p. + * + * the slot in the array is returned via slot, and it points to + * the place where you would insert key if it is not found in + * the array. + * + * slot may point to max if the key is bigger than all of the keys + */ +static noinline int generic_bin_search(struct extent_buffer *eb, + unsigned long p, + int item_size, struct btrfs_key *key, + int max, int *slot) +{ + int low = 0; + int high = max; + int mid; + int ret; + struct btrfs_disk_key *tmp = NULL; + struct btrfs_disk_key unaligned; + unsigned long offset; + char *map_token = NULL; + char *kaddr = NULL; + unsigned long map_start = 0; + unsigned long map_len = 0; + int err; + + while (low < high) { + mid = (low + high) / 2; + offset = p + mid * item_size; + + if (!map_token || offset < map_start || + (offset + sizeof(struct btrfs_disk_key)) > + map_start + map_len) { + if (map_token) { + unmap_extent_buffer(eb, map_token, KM_USER0); + map_token = NULL; + } + + err = map_private_extent_buffer(eb, offset, + sizeof(struct btrfs_disk_key), + &map_token, &kaddr, + &map_start, &map_len, KM_USER0); + + if (!err) { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } else { + read_extent_buffer(eb, &unaligned, + offset, sizeof(unaligned)); + tmp = &unaligned; + } + + } else { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } + ret = comp_keys(tmp, key); + + if (ret < 0) + low = mid + 1; + else if (ret > 0) + high = mid; + else { + *slot = mid; + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); + return 0; + } + } + *slot = low; + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); + return 1; +} + +/* + * simple bin_search frontend that does the right thing for + * leaves vs nodes + */ +static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot) +{ + if (level == 0) { + return generic_bin_search(eb, + offsetof(struct btrfs_leaf, items), + sizeof(struct btrfs_item), + key, btrfs_header_nritems(eb), + slot); + } else { + return generic_bin_search(eb, + offsetof(struct btrfs_node, ptrs), + sizeof(struct btrfs_key_ptr), + key, btrfs_header_nritems(eb), + slot); + } + return -1; +} + +int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot) +{ + return bin_search(eb, key, level, slot); +} + +/* given a node and slot number, this reads the blocks it points to. The + * extent buffer is returned with a reference taken (but unlocked). + * NULL is returned on error. + */ +static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, + struct extent_buffer *parent, int slot) +{ + int level = btrfs_header_level(parent); + if (slot < 0) + return NULL; + if (slot >= btrfs_header_nritems(parent)) + return NULL; + + BUG_ON(level == 0); + + return read_tree_block(root, btrfs_node_blockptr(parent, slot), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(parent, slot)); +} + +/* + * node level balancing, used to make sure nodes are in proper order for + * item deletion. We balance from the top down, so we have to make sure + * that a deletion won't leave an node completely empty later on. + */ +static noinline int balance_level(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + int err_on_enospc = 0; + u64 orig_ptr; + + if (level == 0) + return 0; + + mid = path->nodes[level]; + + WARN_ON(!path->locks[level]); + WARN_ON(btrfs_header_generation(mid) != trans->transid); + + orig_ptr = btrfs_node_blockptr(mid, orig_slot); + + if (level < BTRFS_MAX_LEVEL - 1) + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + + /* + * deal with the case where there is only one pointer in the root + * by promoting the node below to a root + */ + if (!parent) { + struct extent_buffer *child; + + if (btrfs_header_nritems(mid) != 1) + return 0; + + /* promote the child to a root */ + child = read_node_slot(root, mid, 0); + BUG_ON(!child); + btrfs_tree_lock(child); + btrfs_set_lock_blocking(child); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child); + BUG_ON(ret); + + spin_lock(&root->node_lock); + root->node = child; + spin_unlock(&root->node_lock); + + add_root_to_dirty_list(root); + btrfs_tree_unlock(child); + + path->locks[level] = 0; + path->nodes[level] = NULL; + clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); + /* once for the path */ + free_extent_buffer(mid); + ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, + 0, root->root_key.objectid, level); + /* once for the root ptr */ + free_extent_buffer(mid); + return ret; + } + if (btrfs_header_nritems(mid) > + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) + return 0; + + if (btrfs_header_nritems(mid) < 2) + err_on_enospc = 1; + + left = read_node_slot(root, parent, pslot - 1); + if (left) { + btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + wret = btrfs_cow_block(trans, root, left, + parent, pslot - 1, &left); + if (wret) { + ret = wret; + goto enospc; + } + } + right = read_node_slot(root, parent, pslot + 1); + if (right) { + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + wret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, &right); + if (wret) { + ret = wret; + goto enospc; + } + } + + /* first, try to make some room in the middle buffer */ + if (left) { + orig_slot += btrfs_header_nritems(left); + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + if (btrfs_header_nritems(mid) < 2) + err_on_enospc = 1; + } + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + wret = push_node_left(trans, root, mid, right, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + if (btrfs_header_nritems(right) == 0) { + u64 bytenr = right->start; + u32 blocksize = right->len; + + clean_tree_block(trans, root, right); + btrfs_tree_unlock(right); + free_extent_buffer(right); + right = NULL; + wret = del_ptr(trans, root, path, level + 1, pslot + + 1); + if (wret) + ret = wret; + wret = btrfs_free_tree_block(trans, root, + bytenr, blocksize, 0, + root->root_key.objectid, + level); + if (wret) + ret = wret; + } else { + struct btrfs_disk_key right_key; + btrfs_node_key(right, &right_key, 0); + btrfs_set_node_key(parent, &right_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); + } + } + if (btrfs_header_nritems(mid) == 1) { + /* + * we're not allowed to leave a node with one item in the + * tree during a delete. A deletion from lower in the tree + * could try to delete the only pointer in this node. + * So, pull some keys from the left. + * There has to be a left pointer at this point because + * otherwise we would have pulled some pointers from the + * right + */ + BUG_ON(!left); + wret = balance_node_right(trans, root, mid, left); + if (wret < 0) { + ret = wret; + goto enospc; + } + if (wret == 1) { + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + } + BUG_ON(wret == 1); + } + if (btrfs_header_nritems(mid) == 0) { + /* we've managed to empty the middle node, drop it */ + u64 bytenr = mid->start; + u32 blocksize = mid->len; + + clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + mid = NULL; + wret = del_ptr(trans, root, path, level + 1, pslot); + if (wret) + ret = wret; + wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, + 0, root->root_key.objectid, level); + if (wret) + ret = wret; + } else { + /* update the parent key to reflect our changes */ + struct btrfs_disk_key mid_key; + btrfs_node_key(mid, &mid_key, 0); + btrfs_set_node_key(parent, &mid_key, pslot); + btrfs_mark_buffer_dirty(parent); + } + + /* update the path */ + if (left) { + if (btrfs_header_nritems(left) > orig_slot) { + extent_buffer_get(left); + /* left was locked after cow */ + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + if (mid) { + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } + } else { + orig_slot -= btrfs_header_nritems(left); + path->slots[level] = orig_slot; + } + } + /* double check we haven't messed things up */ + check_block(root, path, level); + if (orig_ptr != + btrfs_node_blockptr(path->nodes[level], path->slots[level])) + BUG(); +enospc: + if (right) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + if (left) { + if (path->nodes[level] != left) + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return ret; +} + +/* Node balancing for insertion. Here we only split or push nodes around + * when they are completely full. This is also done top down, so we + * have to be pessimistic. + */ +static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + u64 orig_ptr; + + if (level == 0) + return 1; + + mid = path->nodes[level]; + WARN_ON(btrfs_header_generation(mid) != trans->transid); + orig_ptr = btrfs_node_blockptr(mid, orig_slot); + + if (level < BTRFS_MAX_LEVEL - 1) + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + + if (!parent) + return 1; + + left = read_node_slot(root, parent, pslot - 1); + + /* first, try to make some room in the middle buffer */ + if (left) { + u32 left_nr; + + btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + + left_nr = btrfs_header_nritems(left); + if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, left, parent, + pslot - 1, &left); + if (ret) + wret = 1; + else { + wret = push_node_left(trans, root, + left, mid, 0); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + orig_slot += left_nr; + btrfs_node_key(mid, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot); + btrfs_mark_buffer_dirty(parent); + if (btrfs_header_nritems(left) > orig_slot) { + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + orig_slot -= + btrfs_header_nritems(left); + path->slots[level] = orig_slot; + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return 0; + } + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + right = read_node_slot(root, parent, pslot + 1); + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + u32 right_nr; + + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + + right_nr = btrfs_header_nritems(right); + if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, + &right); + if (ret) + wret = 1; + else { + wret = balance_node_right(trans, root, + right, mid); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(right, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); + + if (btrfs_header_nritems(mid) <= orig_slot) { + path->nodes[level] = right; + path->slots[level + 1] += 1; + path->slots[level] = orig_slot - + btrfs_header_nritems(mid); + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + } + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 1; +} + +/* + * readahead one full node of leaves, finding things that are close + * to the block in 'slot', and triggering ra on them. + */ +static void reada_for_search(struct btrfs_root *root, + struct btrfs_path *path, + int level, int slot, u64 objectid) +{ + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + u32 nritems; + u64 search; + u64 target; + u64 nread = 0; + int direction = path->reada; + struct extent_buffer *eb; + u32 nr; + u32 blocksize; + u32 nscan = 0; + + if (level != 1) + return; + + if (!path->nodes[level]) + return; + + node = path->nodes[level]; + + search = btrfs_node_blockptr(node, slot); + blocksize = btrfs_level_size(root, level - 1); + eb = btrfs_find_tree_block(root, search, blocksize); + if (eb) { + free_extent_buffer(eb); + return; + } + + target = search; + + nritems = btrfs_header_nritems(node); + nr = slot; + while (1) { + if (direction < 0) { + if (nr == 0) + break; + nr--; + } else if (direction > 0) { + nr++; + if (nr >= nritems) + break; + } + if (path->reada < 0 && objectid) { + btrfs_node_key(node, &disk_key, nr); + if (btrfs_disk_key_objectid(&disk_key) != objectid) + break; + } + search = btrfs_node_blockptr(node, nr); + if ((search <= target && target - search <= 65536) || + (search > target && search - target <= 65536)) { + readahead_tree_block(root, search, blocksize, + btrfs_node_ptr_generation(node, nr)); + nread += blocksize; + } + nscan++; + if ((nread > 65536 || nscan > 32)) + break; + } +} + +/* + * returns -EAGAIN if it had to drop the path, or zero if everything was in + * cache + */ +static noinline int reada_for_balance(struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + int slot; + int nritems; + struct extent_buffer *parent; + struct extent_buffer *eb; + u64 gen; + u64 block1 = 0; + u64 block2 = 0; + int ret = 0; + int blocksize; + + parent = path->nodes[level + 1]; + if (!parent) + return 0; + + nritems = btrfs_header_nritems(parent); + slot = path->slots[level + 1]; + blocksize = btrfs_level_size(root, level); + + if (slot > 0) { + block1 = btrfs_node_blockptr(parent, slot - 1); + gen = btrfs_node_ptr_generation(parent, slot - 1); + eb = btrfs_find_tree_block(root, block1, blocksize); + if (eb && btrfs_buffer_uptodate(eb, gen)) + block1 = 0; + free_extent_buffer(eb); + } + if (slot + 1 < nritems) { + block2 = btrfs_node_blockptr(parent, slot + 1); + gen = btrfs_node_ptr_generation(parent, slot + 1); + eb = btrfs_find_tree_block(root, block2, blocksize); + if (eb && btrfs_buffer_uptodate(eb, gen)) + block2 = 0; + free_extent_buffer(eb); + } + if (block1 || block2) { + ret = -EAGAIN; + + /* release the whole path */ + btrfs_release_path(root, path); + + /* read the blocks */ + if (block1) + readahead_tree_block(root, block1, blocksize, 0); + if (block2) + readahead_tree_block(root, block2, blocksize, 0); + + if (block1) { + eb = read_tree_block(root, block1, blocksize, 0); + free_extent_buffer(eb); + } + if (block2) { + eb = read_tree_block(root, block2, blocksize, 0); + free_extent_buffer(eb); + } + } + return ret; +} + + +/* + * when we walk down the tree, it is usually safe to unlock the higher layers + * in the tree. The exceptions are when our path goes through slot 0, because + * operations on the tree might require changing key pointers higher up in the + * tree. + * + * callers might also have set path->keep_locks, which tells this code to keep + * the lock if the path points to the last slot in the block. This is part of + * walking through the tree, and selecting the next slot in the higher block. + * + * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so + * if lowest_unlock is 1, level 0 won't be unlocked + */ +static noinline void unlock_up(struct btrfs_path *path, int level, + int lowest_unlock) +{ + int i; + int skip_level = level; + int no_skips = 0; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + break; + if (!path->locks[i]) + break; + if (!no_skips && path->slots[i] == 0) { + skip_level = i + 1; + continue; + } + if (!no_skips && path->keep_locks) { + u32 nritems; + t = path->nodes[i]; + nritems = btrfs_header_nritems(t); + if (nritems < 1 || path->slots[i] >= nritems - 1) { + skip_level = i + 1; + continue; + } + } + if (skip_level < i && i >= lowest_unlock) + no_skips = 1; + + t = path->nodes[i]; + if (i >= lowest_unlock && i > skip_level && path->locks[i]) { + btrfs_tree_unlock(t); + path->locks[i] = 0; + } + } +} + +/* + * This releases any locks held in the path starting at level and + * going all the way up to the root. + * + * btrfs_search_slot will keep the lock held on higher nodes in a few + * corner cases, such as COW of the block at slot zero in the node. This + * ignores those rules, and it should only be called when there are no + * more updates to be done higher up in the tree. + */ +noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) +{ + int i; + + if (path->keep_locks) + return; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + continue; + if (!path->locks[i]) + continue; + btrfs_tree_unlock(path->nodes[i]); + path->locks[i] = 0; + } +} + +/* + * helper function for btrfs_search_slot. The goal is to find a block + * in cache without setting the path to blocking. If we find the block + * we return zero and the path is unchanged. + * + * If we can't find the block, we set the path blocking and do some + * reada. -EAGAIN is returned and the search must be repeated. + */ +static int +read_block_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer **eb_ret, int level, int slot, + struct btrfs_key *key) +{ + u64 blocknr; + u64 gen; + u32 blocksize; + struct extent_buffer *b = *eb_ret; + struct extent_buffer *tmp; + int ret; + + blocknr = btrfs_node_blockptr(b, slot); + gen = btrfs_node_ptr_generation(b, slot); + blocksize = btrfs_level_size(root, level - 1); + + tmp = btrfs_find_tree_block(root, blocknr, blocksize); + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + /* + * we found an up to date block without sleeping, return + * right away + */ + *eb_ret = tmp; + return 0; + } + + /* + * reduce lock contention at high levels + * of the btree by dropping locks before + * we read. Don't release the lock on the current + * level because we need to walk this node to figure + * out which blocks to read. + */ + btrfs_unlock_up_safe(p, level + 1); + btrfs_set_path_blocking(p); + + if (tmp) + free_extent_buffer(tmp); + if (p->reada) + reada_for_search(root, p, level, slot, key->objectid); + + btrfs_release_path(NULL, p); + + ret = -EAGAIN; + tmp = read_tree_block(root, blocknr, blocksize, gen); + if (tmp) { + /* + * If the read above didn't mark this buffer up to date, + * it will never end up being up to date. Set ret to EIO now + * and give up so that our caller doesn't loop forever + * on our EAGAINs. + */ + if (!btrfs_buffer_uptodate(tmp, 0)) + ret = -EIO; + free_extent_buffer(tmp); + } + return ret; +} + +/* + * helper function for btrfs_search_slot. This does all of the checks + * for node-level blocks and does any balancing required based on + * the ins_len. + * + * If no extra work was required, zero is returned. If we had to + * drop the path, -EAGAIN is returned and btrfs_search_slot must + * start over + */ +static int +setup_nodes_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer *b, int level, int ins_len) +{ + int ret; + if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = split_node(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + } else if (ins_len < 0 && btrfs_header_nritems(b) < + BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = balance_level(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + if (!b) { + btrfs_release_path(NULL, p); + goto again; + } + BUG_ON(btrfs_header_nritems(b) == 1); + } + return 0; + +again: + ret = -EAGAIN; +done: + return ret; +} + +/* + * look for key in the tree. path is filled in with nodes along the way + * if key is found, we return zero and you can find the item in the leaf + * level of the path (level 0) + * + * If the key isn't found, the path points to the slot where it should + * be inserted, and 1 is returned. If there are other errors during the + * search a negative error number is returned. + * + * if ins_len > 0, nodes and leaves will be split as we walk down the + * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if + * possible) + */ +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow) +{ + struct extent_buffer *b; + int slot; + int ret; + int err; + int level; + int lowest_unlock = 1; + u8 lowest_level = 0; + + lowest_level = p->lowest_level; + WARN_ON(lowest_level && ins_len > 0); + WARN_ON(p->nodes[0] != NULL); + + if (ins_len < 0) + lowest_unlock = 2; + +again: + if (p->search_commit_root) { + b = root->commit_root; + extent_buffer_get(b); + if (!p->skip_locking) + btrfs_tree_lock(b); + } else { + if (p->skip_locking) + b = btrfs_root_node(root); + else + b = btrfs_lock_root_node(root); + } + + while (b) { + level = btrfs_header_level(b); + + /* + * setup the path here so we can release it under lock + * contention with the cow code + */ + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = 1; + + if (cow) { + /* + * if we don't really need to cow this block + * then we don't want to set the path blocking, + * so we test it here + */ + if (!should_cow_block(trans, root, b)) + goto cow_done; + + btrfs_set_path_blocking(p); + + err = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], &b); + if (err) { + free_extent_buffer(b); + ret = err; + goto done; + } + } +cow_done: + BUG_ON(!cow && ins_len); + if (level != btrfs_header_level(b)) + WARN_ON(1); + level = btrfs_header_level(b); + + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = 1; + + btrfs_clear_path_blocking(p, NULL); + + /* + * we have a lock on b and as long as we aren't changing + * the tree, there is no way to for the items in b to change. + * It is safe to drop the lock on our parent before we + * go through the expensive btree search on b. + * + * If cow is true, then we might be changing slot zero, + * which may require changing the parent. So, we can't + * drop the lock until after we know which slot we're + * operating on. + */ + if (!cow) + btrfs_unlock_up_safe(p, level + 1); + + ret = check_block(root, p, level); + if (ret) { + ret = -1; + goto done; + } + + ret = bin_search(b, key, level, &slot); + + if (level != 0) { + int dec = 0; + if (ret && slot > 0) { + dec = 1; + slot -= 1; + } + p->slots[level] = slot; + err = setup_nodes_for_search(trans, root, p, b, level, + ins_len); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + b = p->nodes[level]; + slot = p->slots[level]; + + unlock_up(p, level, lowest_unlock); + + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } + + err = read_block_for_search(trans, root, p, + &b, level, slot, key); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + + if (!p->skip_locking) { + btrfs_clear_path_blocking(p, NULL); + err = btrfs_try_spin_lock(b); + + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_lock(b); + btrfs_clear_path_blocking(p, b); + } + } + } else { + p->slots[level] = slot; + if (ins_len > 0 && + btrfs_leaf_free_space(root, b) < ins_len) { + btrfs_set_path_blocking(p); + err = split_leaf(trans, root, key, + p, ins_len, ret == 0); + btrfs_clear_path_blocking(p, NULL); + + BUG_ON(err > 0); + if (err) { + ret = err; + goto done; + } + } + if (!p->search_for_split) + unlock_up(p, level, lowest_unlock); + goto done; + } + } + ret = 1; +done: + /* + * we don't really know what they plan on doing with the path + * from here on, so for now just mark it as blocking + */ + if (!p->leave_spinning) + btrfs_set_path_blocking(p); + if (ret < 0) + btrfs_release_path(root, p); + return ret; +} + +/* + * adjust the pointers going up the tree, starting at level + * making sure the right key of each node is points to 'key'. + * This is used after shifting pointers to the left, so it stops + * fixing up pointers when a given leaf/node is not in slot 0 of the + * higher levels + * + * If this fails to write a tree block, it returns -1, but continues + * fixing up the blocks in ram so the tree is consistent. + */ +static int fixup_low_keys(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, int level) +{ + int i; + int ret = 0; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + int tslot = path->slots[i]; + if (!path->nodes[i]) + break; + t = path->nodes[i]; + btrfs_set_node_key(t, key, tslot); + btrfs_mark_buffer_dirty(path->nodes[i]); + if (tslot != 0) + break; + } + return ret; +} + +/* + * update item key. + * + * This function isn't completely safe. It's the caller's responsibility + * that the new key won't break the order + */ +int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *eb; + int slot; + + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot > 0) { + btrfs_item_key(eb, &disk_key, slot - 1); + if (comp_keys(&disk_key, new_key) >= 0) + return -1; + } + if (slot < btrfs_header_nritems(eb) - 1) { + btrfs_item_key(eb, &disk_key, slot + 1); + if (comp_keys(&disk_key, new_key) <= 0) + return -1; + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(eb, &disk_key, slot); + btrfs_mark_buffer_dirty(eb); + if (slot == 0) + fixup_low_keys(trans, root, path, &disk_key, 1); + return 0; +} + +/* + * try to push data from one node into the next node left in the + * tree. + * + * returns 0 if some ptrs were pushed left, < 0 if there was some horrible + * error, and > 0 if there was no room in the left hand block. + */ +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src, int empty) +{ + int push_items = 0; + int src_nritems; + int dst_nritems; + int ret = 0; + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + if (!empty && src_nritems <= 8) + return 1; + + if (push_items <= 0) + return 1; + + if (empty) { + push_items = min(src_nritems, push_items); + if (push_items < src_nritems) { + /* leave at least 8 pointers in the node if + * we aren't going to empty it + */ + if (src_nritems - push_items < 8) { + if (push_items <= 8) + return 1; + push_items -= 8; + } + } + } else + push_items = min(src_nritems - 8, push_items); + + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(dst_nritems), + btrfs_node_key_ptr_offset(0), + push_items * sizeof(struct btrfs_key_ptr)); + + if (push_items < src_nritems) { + memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(push_items), + (src_nritems - push_items) * + sizeof(struct btrfs_key_ptr)); + } + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); + + return ret; +} + +/* + * try to push data from one node into the next node right in the + * tree. + * + * returns 0 if some ptrs were pushed, < 0 if there was some horrible + * error, and > 0 if there was no room in the right hand block. + * + * this will only push up to 1/2 the contents of the left node over + */ +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst, + struct extent_buffer *src) +{ + int push_items = 0; + int max_push; + int src_nritems; + int dst_nritems; + int ret = 0; + + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + if (push_items <= 0) + return 1; + + if (src_nritems < 4) + return 1; + + max_push = src_nritems / 2 + 1; + /* don't try to empty the node */ + if (max_push >= src_nritems) + return 1; + + if (max_push < push_items) + push_items = max_push; + + memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), + btrfs_node_key_ptr_offset(0), + (dst_nritems) * + sizeof(struct btrfs_key_ptr)); + + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(src_nritems - push_items), + push_items * sizeof(struct btrfs_key_ptr)); + + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); + + return ret; +} + +/* + * helper function to insert a new root level in the tree. + * A new node is allocated, and a single item is inserted to + * point to the existing root + * + * returns zero on success or < 0 on failure. + */ +static noinline int insert_new_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + u64 lower_gen; + struct extent_buffer *lower; + struct extent_buffer *c; + struct extent_buffer *old; + struct btrfs_disk_key lower_key; + + BUG_ON(path->nodes[level]); + BUG_ON(path->nodes[level-1] != root->node); + + lower = path->nodes[level-1]; + if (level == 1) + btrfs_item_key(lower, &lower_key, 0); + else + btrfs_node_key(lower, &lower_key, 0); + + c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, + root->root_key.objectid, &lower_key, + level, root->node->start, 0); + if (IS_ERR(c)) + return PTR_ERR(c); + + memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_nritems(c, 1); + btrfs_set_header_level(c, level); + btrfs_set_header_bytenr(c, c->start); + btrfs_set_header_generation(c, trans->transid); + btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(c, root->root_key.objectid); + + write_extent_buffer(c, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(c), + BTRFS_FSID_SIZE); + + write_extent_buffer(c, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(c), + BTRFS_UUID_SIZE); + + btrfs_set_node_key(c, &lower_key, 0); + btrfs_set_node_blockptr(c, 0, lower->start); + lower_gen = btrfs_header_generation(lower); + WARN_ON(lower_gen != trans->transid); + + btrfs_set_node_ptr_generation(c, 0, lower_gen); + + btrfs_mark_buffer_dirty(c); + + spin_lock(&root->node_lock); + old = root->node; + root->node = c; + spin_unlock(&root->node_lock); + + /* the super has an extra ref to root->node */ + free_extent_buffer(old); + + add_root_to_dirty_list(root); + extent_buffer_get(c); + path->nodes[level] = c; + path->locks[level] = 1; + path->slots[level] = 0; + return 0; +} + +/* + * worker function to insert a single pointer in a node. + * the node should have enough room for the pointer already + * + * slot and level indicate where you want the key to go, and + * blocknr is the block the key points to. + * + * returns zero on success and < 0 on any error + */ +static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, struct btrfs_disk_key + *key, u64 bytenr, int slot, int level) +{ + struct extent_buffer *lower; + int nritems; + + BUG_ON(!path->nodes[level]); + lower = path->nodes[level]; + nritems = btrfs_header_nritems(lower); + BUG_ON(slot > nritems); + if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) + BUG(); + if (slot != nritems) { + memmove_extent_buffer(lower, + btrfs_node_key_ptr_offset(slot + 1), + btrfs_node_key_ptr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_key_ptr)); + } + btrfs_set_node_key(lower, key, slot); + btrfs_set_node_blockptr(lower, slot, bytenr); + WARN_ON(trans->transid == 0); + btrfs_set_node_ptr_generation(lower, slot, trans->transid); + btrfs_set_header_nritems(lower, nritems + 1); + btrfs_mark_buffer_dirty(lower); + return 0; +} + +/* + * split the node at the specified level in path in two. + * The path is corrected to point to the appropriate node after the split + * + * Before splitting this tries to make some room in the node by pushing + * left and right, if either one works, it returns right away. + * + * returns 0 on success and < 0 on failure + */ +static noinline int split_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *c; + struct extent_buffer *split; + struct btrfs_disk_key disk_key; + int mid; + int ret; + int wret; + u32 c_nritems; + + c = path->nodes[level]; + WARN_ON(btrfs_header_generation(c) != trans->transid); + if (c == root->node) { + /* trying to split the root, lets make a new one */ + ret = insert_new_root(trans, root, path, level + 1); + if (ret) + return ret; + } else { + ret = push_nodes_for_insert(trans, root, path, level); + c = path->nodes[level]; + if (!ret && btrfs_header_nritems(c) < + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) + return 0; + if (ret < 0) + return ret; + } + + c_nritems = btrfs_header_nritems(c); + mid = (c_nritems + 1) / 2; + btrfs_node_key(c, &disk_key, mid); + + split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, + root->root_key.objectid, + &disk_key, level, c->start, 0); + if (IS_ERR(split)) + return PTR_ERR(split); + + memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_level(split, btrfs_header_level(c)); + btrfs_set_header_bytenr(split, split->start); + btrfs_set_header_generation(split, trans->transid); + btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(split, root->root_key.objectid); + write_extent_buffer(split, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(split), + BTRFS_FSID_SIZE); + write_extent_buffer(split, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(split), + BTRFS_UUID_SIZE); + + + copy_extent_buffer(split, c, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(mid), + (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); + btrfs_set_header_nritems(split, c_nritems - mid); + btrfs_set_header_nritems(c, mid); + ret = 0; + + btrfs_mark_buffer_dirty(c); + btrfs_mark_buffer_dirty(split); + + wret = insert_ptr(trans, root, path, &disk_key, split->start, + path->slots[level + 1] + 1, + level + 1); + if (wret) + ret = wret; + + if (path->slots[level] >= mid) { + path->slots[level] -= mid; + btrfs_tree_unlock(c); + free_extent_buffer(c); + path->nodes[level] = split; + path->slots[level + 1] += 1; + } else { + btrfs_tree_unlock(split); + free_extent_buffer(split); + } + return ret; +} + +/* + * how many bytes are required to store the items in a leaf. start + * and nr indicate which items in the leaf to check. This totals up the + * space used both by the item structs and the item data + */ +static int leaf_space_used(struct extent_buffer *l, int start, int nr) +{ + int data_len; + int nritems = btrfs_header_nritems(l); + int end = min(nritems, start + nr) - 1; + + if (!nr) + return 0; + data_len = btrfs_item_end_nr(l, start); + data_len = data_len - btrfs_item_offset_nr(l, end); + data_len += sizeof(struct btrfs_item) * nr; + WARN_ON(data_len < 0); + return data_len; +} + +/* + * The space between the end of the leaf items and + * the start of the leaf data. IOW, how much room + * the leaf has left for both items and data + */ +noinline int btrfs_leaf_free_space(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + int nritems = btrfs_header_nritems(leaf); + int ret; + ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); + if (ret < 0) { + printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " + "used %d nritems %d\n", + ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), + leaf_space_used(leaf, 0, nritems), nritems); + } + return ret; +} + +static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size, int empty, + struct extent_buffer *right, + int free_space, u32 left_nritems) +{ + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *upper = path->nodes[1]; + struct btrfs_disk_key disk_key; + int slot; + u32 i; + int push_space = 0; + int push_items = 0; + struct btrfs_item *item; + u32 nr; + u32 right_nritems; + u32 data_end; + u32 this_item_size; + + if (empty) + nr = 0; + else + nr = 1; + + if (path->slots[0] >= left_nritems) + push_space += data_size; + + slot = path->slots[1]; + i = left_nritems - 1; + while (i >= nr) { + item = btrfs_item_nr(left, i); + + if (!empty && push_items > 0) { + if (path->slots[0] > i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(root, left); + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + if (!left->map_token) { + map_extent_buffer(left, (unsigned long)item, + sizeof(struct btrfs_item), + &left->map_token, &left->kaddr, + &left->map_start, &left->map_len, + KM_USER1); + } + + this_item_size = btrfs_item_size(left, item); + if (this_item_size + sizeof(*item) + push_space > free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(*item); + if (i == 0) + break; + i--; + } + if (left->map_token) { + unmap_extent_buffer(left, left->map_token, KM_USER1); + left->map_token = NULL; + } + + if (push_items == 0) + goto out_unlock; + + if (!empty && push_items == left_nritems) + WARN_ON(1); + + /* push left to right */ + right_nritems = btrfs_header_nritems(right); + + push_space = btrfs_item_end_nr(left, left_nritems - push_items); + push_space -= leaf_data_end(root, left); + + /* make room in the right data area */ + data_end = leaf_data_end(root, right); + memmove_extent_buffer(right, + btrfs_leaf_data(right) + data_end - push_space, + btrfs_leaf_data(right) + data_end, + BTRFS_LEAF_DATA_SIZE(root) - data_end); + + /* copy from the left data area */ + copy_extent_buffer(right, left, btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_SIZE(root) - push_space, + btrfs_leaf_data(left) + leaf_data_end(root, left), + push_space); + + memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), + btrfs_item_nr_offset(0), + right_nritems * sizeof(struct btrfs_item)); + + /* copy the items from left to right */ + copy_extent_buffer(right, left, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(left_nritems - push_items), + push_items * sizeof(struct btrfs_item)); + + /* update the item pointers */ + right_nritems += push_items; + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(root); + for (i = 0; i < right_nritems; i++) { + item = btrfs_item_nr(right, i); + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + push_space -= btrfs_item_size(right, item); + btrfs_set_item_offset(right, item, push_space); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + left_nritems -= push_items; + btrfs_set_header_nritems(left, left_nritems); + + if (left_nritems) + btrfs_mark_buffer_dirty(left); + btrfs_mark_buffer_dirty(right); + + btrfs_item_key(right, &disk_key, 0); + btrfs_set_node_key(upper, &disk_key, slot + 1); + btrfs_mark_buffer_dirty(upper); + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] >= left_nritems) { + path->slots[0] -= left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) + clean_tree_block(trans, root, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + +/* + * push some data in the path leaf to the right, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * returns 1 if the push failed because the other node didn't have enough + * room, 0 if everything worked out and < 0 if there were major errors. + */ +static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) +{ + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *right; + struct extent_buffer *upper; + int slot; + int free_space; + u32 left_nritems; + int ret; + + if (!path->nodes[1]) + return 1; + + slot = path->slots[1]; + upper = path->nodes[1]; + if (slot >= btrfs_header_nritems(upper) - 1) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + right = read_node_slot(root, upper, slot + 1); + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, right, upper, + slot + 1, &right); + if (ret) + goto out_unlock; + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + left_nritems = btrfs_header_nritems(left); + if (left_nritems == 0) + goto out_unlock; + + return __push_leaf_right(trans, root, path, data_size, empty, + right, free_space, left_nritems); +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + +/* + * push some data in the path leaf to the left, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + */ +static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int data_size, + int empty, struct extent_buffer *left, + int free_space, int right_nritems) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *right = path->nodes[0]; + int slot; + int i; + int push_space = 0; + int push_items = 0; + struct btrfs_item *item; + u32 old_left_nritems; + u32 nr; + int ret = 0; + int wret; + u32 this_item_size; + u32 old_left_item_size; + + slot = path->slots[1]; + + if (empty) + nr = right_nritems; + else + nr = right_nritems - 1; + + for (i = 0; i < nr; i++) { + item = btrfs_item_nr(right, i); + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + if (!empty && push_items > 0) { + if (path->slots[0] < i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(root, right); + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + this_item_size = btrfs_item_size(right, item); + if (this_item_size + sizeof(*item) + push_space > free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(*item); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + if (push_items == 0) { + ret = 1; + goto out; + } + if (!empty && push_items == btrfs_header_nritems(right)) + WARN_ON(1); + + /* push data from right to left */ + copy_extent_buffer(left, right, + btrfs_item_nr_offset(btrfs_header_nritems(left)), + btrfs_item_nr_offset(0), + push_items * sizeof(struct btrfs_item)); + + push_space = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_offset_nr(right, push_items - 1); + + copy_extent_buffer(left, right, btrfs_leaf_data(left) + + leaf_data_end(root, left) - push_space, + btrfs_leaf_data(right) + + btrfs_item_offset_nr(right, push_items - 1), + push_space); + old_left_nritems = btrfs_header_nritems(left); + BUG_ON(old_left_nritems <= 0); + + old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); + for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { + u32 ioff; + + item = btrfs_item_nr(left, i); + if (!left->map_token) { + map_extent_buffer(left, (unsigned long)item, + sizeof(struct btrfs_item), + &left->map_token, &left->kaddr, + &left->map_start, &left->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(left, item); + btrfs_set_item_offset(left, item, + ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); + } + btrfs_set_header_nritems(left, old_left_nritems + push_items); + if (left->map_token) { + unmap_extent_buffer(left, left->map_token, KM_USER1); + left->map_token = NULL; + } + + /* fixup right node */ + if (push_items > right_nritems) { + printk(KERN_CRIT "push items %d nr %u\n", push_items, + right_nritems); + WARN_ON(1); + } + + if (push_items < right_nritems) { + push_space = btrfs_item_offset_nr(right, push_items - 1) - + leaf_data_end(root, right); + memmove_extent_buffer(right, btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_SIZE(root) - push_space, + btrfs_leaf_data(right) + + leaf_data_end(root, right), push_space); + + memmove_extent_buffer(right, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(push_items), + (btrfs_header_nritems(right) - push_items) * + sizeof(struct btrfs_item)); + } + right_nritems -= push_items; + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(root); + for (i = 0; i < right_nritems; i++) { + item = btrfs_item_nr(right, i); + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + push_space = push_space - btrfs_item_size(right, item); + btrfs_set_item_offset(right, item, push_space); + } + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_mark_buffer_dirty(left); + if (right_nritems) + btrfs_mark_buffer_dirty(right); + + btrfs_item_key(right, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, &disk_key, 1); + if (wret) + ret = wret; + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] < push_items) { + path->slots[0] += old_left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) + clean_tree_block(trans, root, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = left; + path->slots[1] -= 1; + } else { + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->slots[0] -= push_items; + } + BUG_ON(path->slots[0] < 0); + return ret; +out: + btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; +} + +/* + * push some data in the path leaf to the left, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + */ +static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) +{ + struct extent_buffer *right = path->nodes[0]; + struct extent_buffer *left; + int slot; + int free_space; + u32 right_nritems; + int ret = 0; + + slot = path->slots[1]; + if (slot == 0) + return 1; + if (!path->nodes[1]) + return 1; + + right_nritems = btrfs_header_nritems(right); + if (right_nritems == 0) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + left = read_node_slot(root, path->nodes[1], slot - 1); + btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, left, + path->nodes[1], slot - 1, &left); + if (ret) { + /* we hit -ENOSPC, but it isn't fatal here */ + ret = 1; + goto out; + } + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + return __push_leaf_left(trans, root, path, data_size, + empty, left, free_space, right_nritems); +out: + btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; +} + +/* + * split the path's leaf in two, making sure there is at least data_size + * available for the resulting leaf level of the path. + * + * returns 0 if all went well and < 0 on failure. + */ +static noinline int copy_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) +{ + int data_copy_size; + int rt_data_off; + int i; + int ret = 0; + int wret; + struct btrfs_disk_key disk_key; + + nritems = nritems - mid; + btrfs_set_header_nritems(right, nritems); + data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); + + copy_extent_buffer(right, l, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(mid), + nritems * sizeof(struct btrfs_item)); + + copy_extent_buffer(right, l, + btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - + data_copy_size, btrfs_leaf_data(l) + + leaf_data_end(root, l), data_copy_size); + + rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_end_nr(l, mid); + + for (i = 0; i < nritems; i++) { + struct btrfs_item *item = btrfs_item_nr(right, i); + u32 ioff; + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(right, item); + btrfs_set_item_offset(right, item, ioff + rt_data_off); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_set_header_nritems(l, mid); + ret = 0; + btrfs_item_key(right, &disk_key, 0); + wret = insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(l); + BUG_ON(path->slots[0] != slot); + + if (mid <= slot) { + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] -= mid; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + + BUG_ON(path->slots[0] < 0); + + return ret; +} + +/* + * split the path's leaf in two, making sure there is at least data_size + * available for the resulting leaf level of the path. + * + * returns 0 if all went well and < 0 on failure. + */ +static noinline int split_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, + int extend) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *l; + u32 nritems; + int mid; + int slot; + struct extent_buffer *right; + int ret = 0; + int wret; + int split; + int num_doubles = 0; + + l = path->nodes[0]; + slot = path->slots[0]; + if (extend && data_size + btrfs_item_size_nr(l, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) + return -EOVERFLOW; + + /* first try to make some room by pushing left and right */ + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { + wret = push_leaf_right(trans, root, path, data_size, 0); + if (wret < 0) + return wret; + if (wret) { + wret = push_leaf_left(trans, root, path, data_size, 0); + if (wret < 0) + return wret; + } + l = path->nodes[0]; + + /* did the pushes work? */ + if (btrfs_leaf_free_space(root, l) >= data_size) + return 0; + } + + if (!path->nodes[1]) { + ret = insert_new_root(trans, root, path, 1); + if (ret) + return ret; + } +again: + split = 1; + l = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(l); + mid = (nritems + 1) / 2; + + if (mid <= slot) { + if (nritems == 1 || + leaf_space_used(l, mid, nritems - mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (slot >= nritems) { + split = 0; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + split = 2; + } + } + } + } else { + if (leaf_space_used(l, 0, mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (!extend && data_size && slot == 0) { + split = 0; + } else if ((extend || !data_size) && slot == 0) { + mid = 1; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + split = 2 ; + } + } + } + } + + if (split == 0) + btrfs_cpu_key_to_disk(&disk_key, ins_key); + else + btrfs_item_key(l, &disk_key, mid); + + right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + root->root_key.objectid, + &disk_key, 0, l->start, 0); + if (IS_ERR(right)) { + BUG_ON(1); + return PTR_ERR(right); + } + + memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(right, right->start); + btrfs_set_header_generation(right, trans->transid); + btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(right, root->root_key.objectid); + btrfs_set_header_level(right, 0); + write_extent_buffer(right, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(right), + BTRFS_FSID_SIZE); + + write_extent_buffer(right, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(right), + BTRFS_UUID_SIZE); + + if (split == 0) { + if (mid <= slot) { + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1] += 1; + } else { + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, + right->start, + path->slots[1], 1); + if (wret) + ret = wret; + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + if (path->slots[1] == 0) { + wret = fixup_low_keys(trans, root, + path, &disk_key, 1); + if (wret) + ret = wret; + } + } + btrfs_mark_buffer_dirty(right); + return ret; + } + + ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); + BUG_ON(ret); + + if (split == 2) { + BUG_ON(num_doubles != 0); + num_doubles++; + goto again; + } + + return ret; +} + +static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int ins_len) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + u64 extent_len = 0; + u32 item_size; + int ret; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_EXTENT_CSUM_KEY); + + if (btrfs_leaf_free_space(root, leaf) >= ins_len) + return 0; + + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_len = btrfs_file_extent_num_bytes(leaf, fi); + } + btrfs_release_path(root, path); + + path->keep_locks = 1; + path->search_for_split = 1; + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + path->search_for_split = 0; + if (ret < 0) + goto err; + + ret = -EAGAIN; + leaf = path->nodes[0]; + /* if our item isn't there or got smaller, return now */ + if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) + goto err; + + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) + goto err; + } + + btrfs_set_path_blocking(path); + ret = split_leaf(trans, root, &key, path, ins_len, 1); + BUG_ON(ret); + + path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); + return 0; +err: + path->keep_locks = 0; + return ret; +} + +static noinline int split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + struct btrfs_item *new_item; + int slot; + char *buf; + u32 nritems; + u32 item_size; + u32 orig_offset; + struct btrfs_disk_key disk_key; + + leaf = path->nodes[0]; + BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); + + btrfs_set_path_blocking(path); + + item = btrfs_item_nr(leaf, path->slots[0]); + orig_offset = btrfs_item_offset(leaf, item); + item_size = btrfs_item_size(leaf, item); + + buf = kmalloc(item_size, GFP_NOFS); + if (!buf) + return -ENOMEM; + + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, + path->slots[0]), item_size); + + slot = path->slots[0] + 1; + nritems = btrfs_header_nritems(leaf); + if (slot != nritems) { + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(leaf, &disk_key, slot); + + new_item = btrfs_item_nr(leaf, slot); + + btrfs_set_item_offset(leaf, new_item, orig_offset); + btrfs_set_item_size(leaf, new_item, item_size - split_offset); + + btrfs_set_item_offset(leaf, item, + orig_offset + item_size - split_offset); + btrfs_set_item_size(leaf, item, split_offset); + + btrfs_set_header_nritems(leaf, nritems + 1); + + /* write the data for the start of the original item */ + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + split_offset); + + /* write the data for the new item */ + write_extent_buffer(leaf, buf + split_offset, + btrfs_item_ptr_offset(leaf, slot), + item_size - split_offset); + btrfs_mark_buffer_dirty(leaf); + + BUG_ON(btrfs_leaf_free_space(root, leaf) < 0); + kfree(buf); + return 0; +} + +/* + * This function splits a single item into two items, + * giving 'new_key' to the new item and splitting the + * old one at split_offset (from the start of the item). + * + * The path may be released by this operation. After + * the split, the path is pointing to the old item. The + * new item is going to be in the same node as the old one. + * + * Note, the item being split must be smaller enough to live alone on + * a tree block with room for one extra struct btrfs_item + * + * This allows us to split the item in place, keeping a lock on the + * leaf the entire time. + */ +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + int ret; + ret = setup_leaf_for_split(trans, root, path, + sizeof(struct btrfs_item)); + if (ret) + return ret; + + ret = split_item(trans, root, path, new_key, split_offset); + return ret; +} + +/* + * This function duplicate a item, giving 'new_key' to the new item. + * It guarantees both items live in the same tree leaf and the new item + * is contiguous with the original item. + * + * This allows us to split file extent in place, keeping a lock on the + * leaf the entire time. + */ +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key) +{ + struct extent_buffer *leaf; + int ret; + u32 item_size; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ret = setup_leaf_for_split(trans, root, path, + item_size + sizeof(struct btrfs_item)); + if (ret) + return ret; + + path->slots[0]++; + ret = setup_items_for_insert(trans, root, path, new_key, &item_size, + item_size, item_size + + sizeof(struct btrfs_item), 1); + BUG_ON(ret); + + leaf = path->nodes[0]; + memcpy_extent_buffer(leaf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + btrfs_item_ptr_offset(leaf, path->slots[0] - 1), + item_size); + return 0; +} + +/* + * make the item pointed to by the path smaller. new_size indicates + * how small to make it, and from_end tells us if we just chop bytes + * off the end of the item or if we shift the item to chop bytes off + * the front. + */ +int btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end) +{ + int ret = 0; + int slot; + int slot_orig; + struct extent_buffer *leaf; + struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data_start; + unsigned int old_size; + unsigned int size_diff; + int i; + + slot_orig = path->slots[0]; + leaf = path->nodes[0]; + slot = path->slots[0]; + + old_size = btrfs_item_size_nr(leaf, slot); + if (old_size == new_size) + return 0; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + old_data_start = btrfs_item_offset_nr(leaf, slot); + + size_diff = old_size - new_size; + + BUG_ON(slot < 0); + BUG_ON(slot >= nritems); + + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + item = btrfs_item_nr(leaf, i); + + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + size_diff); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the data */ + if (from_end) { + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + size_diff, btrfs_leaf_data(leaf) + + data_end, old_data_start + new_size - data_end); + } else { + struct btrfs_disk_key disk_key; + u64 offset; + + btrfs_item_key(leaf, &disk_key, slot); + + if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { + unsigned long ptr; + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + fi = (struct btrfs_file_extent_item *)( + (unsigned long)fi - size_diff); + + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) { + ptr = btrfs_item_ptr_offset(leaf, slot); + memmove_extent_buffer(leaf, ptr, + (unsigned long)fi, + offsetof(struct btrfs_file_extent_item, + disk_bytenr)); + } + } + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + size_diff, btrfs_leaf_data(leaf) + + data_end, old_data_start - data_end); + + offset = btrfs_disk_key_offset(&disk_key); + btrfs_set_disk_key_offset(&disk_key, offset + size_diff); + btrfs_set_item_key(leaf, &disk_key, slot); + if (slot == 0) + fixup_low_keys(trans, root, path, &disk_key, 1); + } + + item = btrfs_item_nr(leaf, slot); + btrfs_set_item_size(leaf, item, new_size); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + return ret; +} + +/* + * make the item pointed to by the path bigger, data_size is the new size. + */ +int btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u32 data_size) +{ + int ret = 0; + int slot; + int slot_orig; + struct extent_buffer *leaf; + struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data; + unsigned int old_size; + int i; + + slot_orig = path->slots[0]; + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < data_size) { + btrfs_print_leaf(root, leaf); + BUG(); + } + slot = path->slots[0]; + old_data = btrfs_item_end_nr(leaf, slot); + + BUG_ON(slot < 0); + if (slot >= nritems) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d too large, nritems %d\n", + slot, nritems); + BUG_ON(1); + } + + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + item = btrfs_item_nr(leaf, i); + + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - data_size); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - data_size, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + + data_end = old_data; + old_size = btrfs_item_size_nr(leaf, slot); + item = btrfs_item_nr(leaf, slot); + btrfs_set_item_size(leaf, item, old_size + data_size); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + return ret; +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + * Returns the number of keys that were inserted. + */ +int btrfs_insert_some_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int ret = 0; + int slot; + int i; + u32 nritems; + u32 total_data = 0; + u32 total_size = 0; + unsigned int data_end; + struct btrfs_disk_key disk_key; + struct btrfs_key found_key; + + for (i = 0; i < nr; i++) { + if (total_size + data_size[i] + sizeof(struct btrfs_item) > + BTRFS_LEAF_DATA_SIZE(root)) { + break; + nr = i; + } + total_data += data_size[i]; + total_size += data_size[i] + sizeof(struct btrfs_item); + } + BUG_ON(nr == 0); + + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < total_size) { + for (i = nr; i >= 0; i--) { + total_data -= data_size[i]; + total_size -= data_size[i] + sizeof(struct btrfs_item); + if (total_size < btrfs_leaf_free_space(root, leaf)) + break; + } + nr = i; + } + + slot = path->slots[0]; + BUG_ON(slot < 0); + + if (slot != nritems) { + unsigned int old_data = btrfs_item_end_nr(leaf, slot); + + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* figure out how many keys we can insert in here */ + total_data = data_size[0]; + for (i = 1; i < nr; i++) { + if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0) + break; + total_data += data_size[i]; + } + nr = i; + + if (old_data < data_end) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + slot, old_data, data_end); + BUG_ON(1); + } + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + WARN_ON(leaf->map_token); + for (i = slot; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - total_data); + } + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - total_data, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + data_end = old_data; + } else { + /* + * this sucks but it has to be done, if we are inserting at + * the end of the leaf only insert 1 of the items, since we + * have no way of knowing whats on the next leaf and we'd have + * to drop our current locks to figure it out + */ + nr = 1; + } + + /* setup the item for the new data */ + for (i = 0; i < nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + btrfs_set_item_key(leaf, &disk_key, slot + i); + item = btrfs_item_nr(leaf, slot + i); + btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + data_end -= data_size[i]; + btrfs_set_item_size(leaf, item, data_size[i]); + } + btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (slot == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + ret = fixup_low_keys(trans, root, path, &disk_key, 1); + } + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } +out: + if (!ret) + ret = nr; + return ret; +} + +/* + * this is a helper for btrfs_insert_empty_items, the main goal here is + * to save stack depth by doing the bulk of the work in a function + * that doesn't call btrfs_search_slot + */ +static noinline_for_stack int +setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr) +{ + struct btrfs_item *item; + int i; + u32 nritems; + unsigned int data_end; + struct btrfs_disk_key disk_key; + int ret; + struct extent_buffer *leaf; + int slot; + + leaf = path->nodes[0]; + slot = path->slots[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < total_size) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "not enough freespace need %u have %d\n", + total_size, btrfs_leaf_free_space(root, leaf)); + BUG(); + } + + if (slot != nritems) { + unsigned int old_data = btrfs_item_end_nr(leaf, slot); + + if (old_data < data_end) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + slot, old_data, data_end); + BUG_ON(1); + } + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + WARN_ON(leaf->map_token); + for (i = slot; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - total_data); + } + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - total_data, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + data_end = old_data; + } + + /* setup the item for the new data */ + for (i = 0; i < nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + btrfs_set_item_key(leaf, &disk_key, slot + i); + item = btrfs_item_nr(leaf, slot + i); + btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + data_end -= data_size[i]; + btrfs_set_item_size(leaf, item, data_size[i]); + } + + btrfs_set_header_nritems(leaf, nritems + nr); + + ret = 0; + if (slot == 0) { + struct btrfs_disk_key disk_key; + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + ret = fixup_low_keys(trans, root, path, &disk_key, 1); + } + btrfs_unlock_up_safe(path, 1); + btrfs_mark_buffer_dirty(leaf); + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + return ret; +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + struct extent_buffer *leaf; + int ret = 0; + int slot; + int i; + u32 total_size = 0; + u32 total_data = 0; + + for (i = 0; i < nr; i++) + total_data += data_size[i]; + + total_size = total_data + (nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + slot = path->slots[0]; + BUG_ON(slot < 0); + + ret = setup_items_for_insert(trans, root, path, cpu_key, data_size, + total_data, total_size, nr); + +out: + return ret; +} + +/* + * Given a key and some data, insert an item into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *cpu_key, void *data, u32 + data_size) +{ + int ret = 0; + struct btrfs_path *path; + struct extent_buffer *leaf; + unsigned long ptr; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (!ret) { + leaf = path->nodes[0]; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, data, ptr, data_size); + btrfs_mark_buffer_dirty(leaf); + } + btrfs_free_path(path); + return ret; +} + +/* + * delete the pointer from a given node. + * + * the tree should have been previously balanced so the deletion does not + * empty a node. + */ +static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot) +{ + struct extent_buffer *parent = path->nodes[level]; + u32 nritems; + int ret = 0; + int wret; + + nritems = btrfs_header_nritems(parent); + if (slot != nritems - 1) { + memmove_extent_buffer(parent, + btrfs_node_key_ptr_offset(slot), + btrfs_node_key_ptr_offset(slot + 1), + sizeof(struct btrfs_key_ptr) * + (nritems - slot - 1)); + } + nritems--; + btrfs_set_header_nritems(parent, nritems); + if (nritems == 0 && parent == root->node) { + BUG_ON(btrfs_header_level(root->node) != 1); + /* just turn the root into a leaf and break */ + btrfs_set_header_level(root->node, 0); + } else if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(parent, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, &disk_key, level + 1); + if (wret) + ret = wret; + } + btrfs_mark_buffer_dirty(parent); + return ret; +} + +/* + * a helper function to delete the leaf pointed to by path->slots[1] and + * path->nodes[1]. + * + * This deletes the pointer in path->nodes[1] and frees the leaf + * block extent. zero is returned if it all worked out, < 0 otherwise. + * + * The path must have already been setup for deleting the leaf, including + * all the proper balancing. path->nodes[1] must be locked. + */ +static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) +{ + int ret; + + WARN_ON(btrfs_header_generation(leaf) != trans->transid); + ret = del_ptr(trans, root, path, 1, path->slots[1]); + if (ret) + return ret; + + /* + * btrfs_free_extent is expensive, we want to make sure we + * aren't holding any locks when we call it + */ + btrfs_unlock_up_safe(path, 0); + + ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, + 0, root->root_key.objectid, 0); + return ret; +} +/* + * delete the item at the leaf level in path. If that empties + * the leaf, remove it from the tree + */ +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int last_off; + int dsize = 0; + int ret = 0; + int wret; + int i; + u32 nritems; + + leaf = path->nodes[0]; + last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); + + for (i = 0; i < nr; i++) + dsize += btrfs_item_size_nr(leaf, slot + i); + + nritems = btrfs_header_nritems(leaf); + + if (slot + nr != nritems) { + int data_end = leaf_data_end(root, leaf); + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + dsize, + btrfs_leaf_data(leaf) + data_end, + last_off - data_end); + + for (i = slot + nr; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + dsize); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), + btrfs_item_nr_offset(slot + nr), + sizeof(struct btrfs_item) * + (nritems - slot - nr)); + } + btrfs_set_header_nritems(leaf, nritems - nr); + nritems -= nr; + + /* delete the leaf if we've emptied it */ + if (nritems == 0) { + if (leaf == root->node) { + btrfs_set_header_level(leaf, 0); + } else { + ret = btrfs_del_leaf(trans, root, path, leaf); + BUG_ON(ret); + } + } else { + int used = leaf_space_used(leaf, 0, nritems); + if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_item_key(leaf, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, + &disk_key, 1); + if (wret) + ret = wret; + } + + /* delete the leaf if it is mostly empty */ + if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) { + /* push_leaf_left fixes the path. + * make sure the path still points to our leaf + * for possible call to del_ptr below + */ + slot = path->slots[1]; + extent_buffer_get(leaf); + + btrfs_set_path_blocking(path); + wret = push_leaf_left(trans, root, path, 1, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + + if (path->nodes[0] == leaf && + btrfs_header_nritems(leaf)) { + wret = push_leaf_right(trans, root, path, 1, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + } + + if (btrfs_header_nritems(leaf) == 0) { + path->slots[1] = slot; + ret = btrfs_del_leaf(trans, root, path, leaf); + BUG_ON(ret); + free_extent_buffer(leaf); + } else { + /* if we're still in the path, make sure + * we're dirty. Otherwise, one of the + * push_leaf functions must have already + * dirtied this buffer + */ + if (path->nodes[0] == leaf) + btrfs_mark_buffer_dirty(leaf); + free_extent_buffer(leaf); + } + } else { + btrfs_mark_buffer_dirty(leaf); + } + } + return ret; +} + +/* + * search the tree again to find a leaf with lesser keys + * returns 0 if it found something or 1 if there are no lesser leaves. + * returns < 0 on io errors. + * + * This may release the path, and so you may lose any locks held at the + * time you call it. + */ +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_disk_key found_key; + int ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + + if (key.offset > 0) + key.offset--; + else if (key.type > 0) + key.type--; + else if (key.objectid > 0) + key.objectid--; + else + return 1; + + btrfs_release_path(root, path); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + btrfs_item_key(path->nodes[0], &found_key, 0); + ret = comp_keys(&found_key, &key); + if (ret < 0) + return 0; + return 1; +} + +/* + * A helper function to walk down the tree starting at min_key, and looking + * for nodes or leaves that are either in cache or have a minimum + * transaction id. This is used by the btree defrag code, and tree logging + * + * This does not cow, but it does stuff the starting key it finds back + * into min_key, so you can call btrfs_search_slot with cow=1 on the + * key and get a writable path. + * + * This does lock as it descends, and path->keep_locks should be set + * to 1 by the caller. + * + * This honors path->lowest_level to prevent descent past a given level + * of the tree. + * + * min_trans indicates the oldest transaction that you are interested + * in walking through. Any nodes or leaves older than min_trans are + * skipped over (without reading them). + * + * returns zero if something useful was found, < 0 on error and 1 if there + * was nothing in the tree that matched the search criteria. + */ +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_key *max_key, + struct btrfs_path *path, int cache_only, + u64 min_trans) +{ + struct extent_buffer *cur; + struct btrfs_key found_key; + int slot; + int sret; + u32 nritems; + int level; + int ret = 1; + + WARN_ON(!path->keep_locks); +again: + cur = btrfs_lock_root_node(root); + level = btrfs_header_level(cur); + WARN_ON(path->nodes[level]); + path->nodes[level] = cur; + path->locks[level] = 1; + + if (btrfs_header_generation(cur) < min_trans) { + ret = 1; + goto out; + } + while (1) { + nritems = btrfs_header_nritems(cur); + level = btrfs_header_level(cur); + sret = bin_search(cur, min_key, level, &slot); + + /* at the lowest level, we're done, setup the path and exit */ + if (level == path->lowest_level) { + if (slot >= nritems) + goto find_next_key; + ret = 0; + path->slots[level] = slot; + btrfs_item_key_to_cpu(cur, &found_key, slot); + goto out; + } + if (sret && slot > 0) + slot--; + /* + * check this node pointer against the cache_only and + * min_trans parameters. If it isn't in cache or is too + * old, skip to the next one. + */ + while (slot < nritems) { + u64 blockptr; + u64 gen; + struct extent_buffer *tmp; + struct btrfs_disk_key disk_key; + + blockptr = btrfs_node_blockptr(cur, slot); + gen = btrfs_node_ptr_generation(cur, slot); + if (gen < min_trans) { + slot++; + continue; + } + if (!cache_only) + break; + + if (max_key) { + btrfs_node_key(cur, &disk_key, slot); + if (comp_keys(&disk_key, max_key) >= 0) { + ret = 1; + goto out; + } + } + + tmp = btrfs_find_tree_block(root, blockptr, + btrfs_level_size(root, level - 1)); + + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + free_extent_buffer(tmp); + break; + } + if (tmp) + free_extent_buffer(tmp); + slot++; + } +find_next_key: + /* + * we didn't find a candidate key in this node, walk forward + * and find another one + */ + if (slot >= nritems) { + path->slots[level] = slot; + btrfs_set_path_blocking(path); + sret = btrfs_find_next_key(root, path, min_key, level, + cache_only, min_trans); + if (sret == 0) { + btrfs_release_path(root, path); + goto again; + } else { + goto out; + } + } + /* save our key for returning back */ + btrfs_node_key_to_cpu(cur, &found_key, slot); + path->slots[level] = slot; + if (level == path->lowest_level) { + ret = 0; + unlock_up(path, level, 1); + goto out; + } + btrfs_set_path_blocking(path); + cur = read_node_slot(root, cur, slot); + + btrfs_tree_lock(cur); + + path->locks[level - 1] = 1; + path->nodes[level - 1] = cur; + unlock_up(path, level, 1); + btrfs_clear_path_blocking(path, NULL); + } +out: + if (ret == 0) + memcpy(min_key, &found_key, sizeof(found_key)); + btrfs_set_path_blocking(path); + return ret; +} + +/* + * this is similar to btrfs_next_leaf, but does not try to preserve + * and fixup the path. It looks for and returns the next key in the + * tree based on the current path and the cache_only and min_trans + * parameters. + * + * 0 is returned if another key is found, < 0 if there are any errors + * and 1 is returned if there are no higher keys in the tree + * + * path->keep_locks should be set to 1 on the search made before + * calling this function. + */ +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int level, + int cache_only, u64 min_trans) +{ + int slot; + struct extent_buffer *c; + + WARN_ON(!path->keep_locks); + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + return 1; + + slot = path->slots[level] + 1; + c = path->nodes[level]; +next: + if (slot >= btrfs_header_nritems(c)) { + int ret; + int orig_lowest; + struct btrfs_key cur_key; + if (level + 1 >= BTRFS_MAX_LEVEL || + !path->nodes[level + 1]) + return 1; + + if (path->locks[level + 1]) { + level++; + continue; + } + + slot = btrfs_header_nritems(c) - 1; + if (level == 0) + btrfs_item_key_to_cpu(c, &cur_key, slot); + else + btrfs_node_key_to_cpu(c, &cur_key, slot); + + orig_lowest = path->lowest_level; + btrfs_release_path(root, path); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &cur_key, path, + 0, 0); + path->lowest_level = orig_lowest; + if (ret < 0) + return ret; + + c = path->nodes[level]; + slot = path->slots[level]; + if (ret == 0) + slot++; + goto next; + } + + if (level == 0) + btrfs_item_key_to_cpu(c, key, slot); + else { + u64 blockptr = btrfs_node_blockptr(c, slot); + u64 gen = btrfs_node_ptr_generation(c, slot); + + if (cache_only) { + struct extent_buffer *cur; + cur = btrfs_find_tree_block(root, blockptr, + btrfs_level_size(root, level - 1)); + if (!cur || !btrfs_buffer_uptodate(cur, gen)) { + slot++; + if (cur) + free_extent_buffer(cur); + goto next; + } + free_extent_buffer(cur); + } + if (gen < min_trans) { + slot++; + goto next; + } + btrfs_node_key_to_cpu(c, key, slot); + } + return 0; + } + return 1; +} + +/* + * search the tree again to find a leaf with greater keys + * returns 0 if it found something or 1 if there are no greater leaves. + * returns < 0 on io errors. + */ +int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + int slot; + int level; + struct extent_buffer *c; + struct extent_buffer *next; + struct btrfs_key key; + u32 nritems; + int ret; + int old_spinning = path->leave_spinning; + int force_blocking = 0; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (nritems == 0) + return 1; + + /* + * we take the blocks in an order that upsets lockdep. Using + * blocking mode is the only way around it. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + force_blocking = 1; +#endif + + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); +again: + level = 1; + next = NULL; + btrfs_release_path(root, path); + + path->keep_locks = 1; + + if (!force_blocking) + path->leave_spinning = 1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + path->keep_locks = 0; + + if (ret < 0) + return ret; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * by releasing the path above we dropped all our locks. A balance + * could have added more items next to the key that used to be + * at the very end of the block. So, check again here and + * advance the path if there are now more items available. + */ + if (nritems > 0 && path->slots[0] < nritems - 1) { + if (ret == 0) + path->slots[0]++; + ret = 0; + goto done; + } + + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) { + ret = 1; + goto done; + } + + slot = path->slots[level] + 1; + c = path->nodes[level]; + if (slot >= btrfs_header_nritems(c)) { + level++; + if (level == BTRFS_MAX_LEVEL) { + ret = 1; + goto done; + } + continue; + } + + if (next) { + btrfs_tree_unlock(next); + free_extent_buffer(next); + } + + next = c; + ret = read_block_for_search(NULL, root, path, &next, level, + slot, &key); + if (ret == -EAGAIN) + goto again; + + if (ret < 0) { + btrfs_release_path(root, path); + goto done; + } + + if (!path->skip_locking) { + ret = btrfs_try_spin_lock(next); + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(next); + if (!force_blocking) + btrfs_clear_path_blocking(path, next); + } + if (force_blocking) + btrfs_set_lock_blocking(next); + } + break; + } + path->slots[level] = slot; + while (1) { + level--; + c = path->nodes[level]; + if (path->locks[level]) + btrfs_tree_unlock(c); + + free_extent_buffer(c); + path->nodes[level] = next; + path->slots[level] = 0; + if (!path->skip_locking) + path->locks[level] = 1; + + if (!level) + break; + + ret = read_block_for_search(NULL, root, path, &next, level, + 0, &key); + if (ret == -EAGAIN) + goto again; + + if (ret < 0) { + btrfs_release_path(root, path); + goto done; + } + + if (!path->skip_locking) { + btrfs_assert_tree_locked(path->nodes[level]); + ret = btrfs_try_spin_lock(next); + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(next); + if (!force_blocking) + btrfs_clear_path_blocking(path, next); + } + if (force_blocking) + btrfs_set_lock_blocking(next); + } + } + ret = 0; +done: + unlock_up(path, 0, 1); + path->leave_spinning = old_spinning; + if (!old_spinning) + btrfs_set_path_blocking(path); + + return ret; +} + +/* + * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps + * searching until it gets past min_objectid or finds an item of 'type' + * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid < min_objectid) + break; + if (found_key.type == type) + return 0; + if (found_key.objectid == min_objectid && + found_key.type < type) + break; + } + return 1; +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h new file mode 100644 index 00000000000..2aa8ec6a098 --- /dev/null +++ b/fs/btrfs/ctree.h @@ -0,0 +1,2411 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_CTREE__ +#define __BTRFS_CTREE__ + +#include <linux/version.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/fs.h> +#include <linux/completion.h> +#include <linux/backing-dev.h> +#include <linux/wait.h> +#include <asm/kmap_types.h> +#include "extent_io.h" +#include "extent_map.h" +#include "async-thread.h" + +struct btrfs_trans_handle; +struct btrfs_transaction; +extern struct kmem_cache *btrfs_trans_handle_cachep; +extern struct kmem_cache *btrfs_transaction_cachep; +extern struct kmem_cache *btrfs_bit_radix_cachep; +extern struct kmem_cache *btrfs_path_cachep; +struct btrfs_ordered_sum; + +#define BTRFS_MAGIC "_BHRfS_M" + +#define BTRFS_MAX_LEVEL 8 + +#define BTRFS_COMPAT_EXTENT_TREE_V0 + +/* + * files bigger than this get some pre-flushing when they are added + * to the ordered operations list. That way we limit the total + * work done by the commit + */ +#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024) + +/* holds pointers to all of the tree roots */ +#define BTRFS_ROOT_TREE_OBJECTID 1ULL + +/* stores information about which extents are in use, and reference counts */ +#define BTRFS_EXTENT_TREE_OBJECTID 2ULL + +/* + * chunk tree stores translations from logical -> physical block numbering + * the super block points to the chunk tree + */ +#define BTRFS_CHUNK_TREE_OBJECTID 3ULL + +/* + * stores information about which areas of a given device are in use. + * one per device. The tree of tree roots points to the device tree + */ +#define BTRFS_DEV_TREE_OBJECTID 4ULL + +/* one per subvolume, storing files and directories */ +#define BTRFS_FS_TREE_OBJECTID 5ULL + +/* directory objectid inside the root tree */ +#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL + +/* holds checksums of all the data extents */ +#define BTRFS_CSUM_TREE_OBJECTID 7ULL + +/* orhpan objectid for tracking unlinked/truncated files */ +#define BTRFS_ORPHAN_OBJECTID -5ULL + +/* does write ahead logging to speed up fsyncs */ +#define BTRFS_TREE_LOG_OBJECTID -6ULL +#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL + +/* for space balancing */ +#define BTRFS_TREE_RELOC_OBJECTID -8ULL +#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL + +/* + * extent checksums all have this objectid + * this allows them to share the logging tree + * for fsyncs + */ +#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL + +/* dummy objectid represents multiple objectids */ +#define BTRFS_MULTIPLE_OBJECTIDS -255ULL + +/* + * All files have objectids in this range. + */ +#define BTRFS_FIRST_FREE_OBJECTID 256ULL +#define BTRFS_LAST_FREE_OBJECTID -256ULL +#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL + + +/* + * the device items go into the chunk tree. The key is in the form + * [ 1 BTRFS_DEV_ITEM_KEY device_id ] + */ +#define BTRFS_DEV_ITEMS_OBJECTID 1ULL + +#define BTRFS_BTREE_INODE_OBJECTID 1 + +#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 + +/* + * we can actually store much bigger names, but lets not confuse the rest + * of linux + */ +#define BTRFS_NAME_LEN 255 + +/* 32 bytes in various csum fields */ +#define BTRFS_CSUM_SIZE 32 + +/* csum types */ +#define BTRFS_CSUM_TYPE_CRC32 0 + +static int btrfs_csum_sizes[] = { 4, 0 }; + +/* four bytes for CRC32 */ +#define BTRFS_EMPTY_DIR_SIZE 0 + +#define BTRFS_FT_UNKNOWN 0 +#define BTRFS_FT_REG_FILE 1 +#define BTRFS_FT_DIR 2 +#define BTRFS_FT_CHRDEV 3 +#define BTRFS_FT_BLKDEV 4 +#define BTRFS_FT_FIFO 5 +#define BTRFS_FT_SOCK 6 +#define BTRFS_FT_SYMLINK 7 +#define BTRFS_FT_XATTR 8 +#define BTRFS_FT_MAX 9 + +/* + * The key defines the order in the tree, and so it also defines (optimal) + * block layout. + * + * objectid corresponds to the inode number. + * + * type tells us things about the object, and is a kind of stream selector. + * so for a given inode, keys with type of 1 might refer to the inode data, + * type of 2 may point to file data in the btree and type == 3 may point to + * extents. + * + * offset is the starting byte offset for this key in the stream. + * + * btrfs_disk_key is in disk byte order. struct btrfs_key is always + * in cpu native order. Otherwise they are identical and their sizes + * should be the same (ie both packed) + */ +struct btrfs_disk_key { + __le64 objectid; + u8 type; + __le64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_key { + u64 objectid; + u8 type; + u64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_mapping_tree { + struct extent_map_tree map_tree; +}; + +#define BTRFS_UUID_SIZE 16 +struct btrfs_dev_item { + /* the internal btrfs device id */ + __le64 devid; + + /* size of the device */ + __le64 total_bytes; + + /* bytes used */ + __le64 bytes_used; + + /* optimal io alignment for this device */ + __le32 io_align; + + /* optimal io width for this device */ + __le32 io_width; + + /* minimal io size for this device */ + __le32 sector_size; + + /* type and info about this device */ + __le64 type; + + /* expected generation for this device */ + __le64 generation; + + /* + * starting byte of this partition on the device, + * to allow for stripe alignment in the future + */ + __le64 start_offset; + + /* grouping information for allocation decisions */ + __le32 dev_group; + + /* seek speed 0-100 where 100 is fastest */ + u8 seek_speed; + + /* bandwidth 0-100 where 100 is fastest */ + u8 bandwidth; + + /* btrfs generated uuid for this device */ + u8 uuid[BTRFS_UUID_SIZE]; + + /* uuid of FS who owns this device */ + u8 fsid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_stripe { + __le64 devid; + __le64 offset; + u8 dev_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_chunk { + /* size of this chunk in bytes */ + __le64 length; + + /* objectid of the root referencing this chunk */ + __le64 owner; + + __le64 stripe_len; + __le64 type; + + /* optimal io alignment for this chunk */ + __le32 io_align; + + /* optimal io width for this chunk */ + __le32 io_width; + + /* minimal io size for this chunk */ + __le32 sector_size; + + /* 2^16 stripes is quite a lot, a second limit is the size of a single + * item in the btree + */ + __le16 num_stripes; + + /* sub stripes only matter for raid10 */ + __le16 sub_stripes; + struct btrfs_stripe stripe; + /* additional stripes go here */ +} __attribute__ ((__packed__)); + +static inline unsigned long btrfs_chunk_item_size(int num_stripes) +{ + BUG_ON(num_stripes == 0); + return sizeof(struct btrfs_chunk) + + sizeof(struct btrfs_stripe) * (num_stripes - 1); +} + +#define BTRFS_FSID_SIZE 16 +#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) +#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) +#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) +#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) + +#define BTRFS_BACKREF_REV_MAX 256 +#define BTRFS_BACKREF_REV_SHIFT 56 +#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ + BTRFS_BACKREF_REV_SHIFT) + +#define BTRFS_OLD_BACKREF_REV 0 +#define BTRFS_MIXED_BACKREF_REV 1 + +/* + * every tree block (leaf or node) starts with this header. + */ +struct btrfs_header { + /* these first four must match the super block */ + u8 csum[BTRFS_CSUM_SIZE]; + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + __le64 bytenr; /* which block this node is supposed to live in */ + __le64 flags; + + /* allowed to be different from the super from here on down */ + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + __le64 generation; + __le64 owner; + __le32 nritems; + u8 level; +} __attribute__ ((__packed__)); + +#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \ + sizeof(struct btrfs_header)) / \ + sizeof(struct btrfs_key_ptr)) +#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) +#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize)) +#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) - \ + sizeof(struct btrfs_file_extent_item)) +#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) -\ + sizeof(struct btrfs_dir_item)) + + +/* + * this is a very generous portion of the super block, giving us + * room to translate 14 chunks with 3 stripes each. + */ +#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 +#define BTRFS_LABEL_SIZE 256 + +/* + * the super block basically lists the main trees of the FS + * it currently lacks any block count etc etc + */ +struct btrfs_super_block { + u8 csum[BTRFS_CSUM_SIZE]; + /* the first 4 fields must match struct btrfs_header */ + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + __le64 bytenr; /* this block number */ + __le64 flags; + + /* allowed to be different from the btrfs_header from here own down */ + __le64 magic; + __le64 generation; + __le64 root; + __le64 chunk_root; + __le64 log_root; + + /* this will help find the new super based on the log root */ + __le64 log_root_transid; + __le64 total_bytes; + __le64 bytes_used; + __le64 root_dir_objectid; + __le64 num_devices; + __le32 sectorsize; + __le32 nodesize; + __le32 leafsize; + __le32 stripesize; + __le32 sys_chunk_array_size; + __le64 chunk_root_generation; + __le64 compat_flags; + __le64 compat_ro_flags; + __le64 incompat_flags; + __le16 csum_type; + u8 root_level; + u8 chunk_root_level; + u8 log_root_level; + struct btrfs_dev_item dev_item; + + char label[BTRFS_LABEL_SIZE]; + + /* future expansion */ + __le64 reserved[32]; + u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; +} __attribute__ ((__packed__)); + +/* + * Compat flags that we support. If any incompat flags are set other than the + * ones specified below then we will fail to mount + */ +#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) + +#define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF + +/* + * A leaf is full of items. offset and size tell us where to find + * the item in the leaf (relative to the start of the data area) + */ +struct btrfs_item { + struct btrfs_disk_key key; + __le32 offset; + __le32 size; +} __attribute__ ((__packed__)); + +/* + * leaves have an item area and a data area: + * [item0, item1....itemN] [free space] [dataN...data1, data0] + * + * The data is separate from the items to get the keys closer together + * during searches. + */ +struct btrfs_leaf { + struct btrfs_header header; + struct btrfs_item items[]; +} __attribute__ ((__packed__)); + +/* + * all non-leaf blocks are nodes, they hold only keys and pointers to + * other blocks + */ +struct btrfs_key_ptr { + struct btrfs_disk_key key; + __le64 blockptr; + __le64 generation; +} __attribute__ ((__packed__)); + +struct btrfs_node { + struct btrfs_header header; + struct btrfs_key_ptr ptrs[]; +} __attribute__ ((__packed__)); + +/* + * btrfs_paths remember the path taken from the root down to the leaf. + * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point + * to any other levels that are present. + * + * The slots array records the index of the item or block pointer + * used while walking the tree. + */ +struct btrfs_path { + struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; + int slots[BTRFS_MAX_LEVEL]; + /* if there is real range locking, this locks field will change */ + int locks[BTRFS_MAX_LEVEL]; + int reada; + /* keep some upper locks as we walk down */ + int lowest_level; + + /* + * set by btrfs_split_item, tells search_slot to keep all locks + * and to force calls to keep space in the nodes + */ + unsigned int search_for_split:1; + unsigned int keep_locks:1; + unsigned int skip_locking:1; + unsigned int leave_spinning:1; + unsigned int search_commit_root:1; +}; + +/* + * items in the extent btree are used to record the objectid of the + * owner of the block and the number of references + */ + +struct btrfs_extent_item { + __le64 refs; + __le64 generation; + __le64 flags; +} __attribute__ ((__packed__)); + +struct btrfs_extent_item_v0 { + __le32 refs; +} __attribute__ ((__packed__)); + +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \ + sizeof(struct btrfs_item)) + +#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0) +#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1) + +/* following flags only apply to tree blocks */ + +/* use full backrefs for extent pointers in the block */ +#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) + +struct btrfs_tree_block_info { + struct btrfs_disk_key key; + u8 level; +} __attribute__ ((__packed__)); + +struct btrfs_extent_data_ref { + __le64 root; + __le64 objectid; + __le64 offset; + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_shared_data_ref { + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_extent_inline_ref { + u8 type; + __le64 offset; +} __attribute__ ((__packed__)); + +/* old style backrefs item */ +struct btrfs_extent_ref_v0 { + __le64 root; + __le64 generation; + __le64 objectid; + __le32 count; +} __attribute__ ((__packed__)); + + +/* dev extents record free space on individual devices. The owner + * field points back to the chunk allocation mapping tree that allocated + * the extent. The chunk tree uuid field is a way to double check the owner + */ +struct btrfs_dev_extent { + __le64 chunk_tree; + __le64 chunk_objectid; + __le64 chunk_offset; + __le64 length; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_inode_ref { + __le64 index; + __le16 name_len; + /* name goes here */ +} __attribute__ ((__packed__)); + +struct btrfs_timespec { + __le64 sec; + __le32 nsec; +} __attribute__ ((__packed__)); + +enum btrfs_compression_type { + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LAST = 2, +}; + +struct btrfs_inode_item { + /* nfs style generation number */ + __le64 generation; + /* transid that last touched this inode */ + __le64 transid; + __le64 size; + __le64 nbytes; + __le64 block_group; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le64 rdev; + __le64 flags; + + /* modification sequence number for NFS */ + __le64 sequence; + + /* + * a little future expansion, for more than this we can + * just grow the inode item and version it + */ + __le64 reserved[4]; + struct btrfs_timespec atime; + struct btrfs_timespec ctime; + struct btrfs_timespec mtime; + struct btrfs_timespec otime; +} __attribute__ ((__packed__)); + +struct btrfs_dir_log_item { + __le64 end; +} __attribute__ ((__packed__)); + +struct btrfs_dir_item { + struct btrfs_disk_key location; + __le64 transid; + __le16 data_len; + __le16 name_len; + u8 type; +} __attribute__ ((__packed__)); + +struct btrfs_root_item { + struct btrfs_inode_item inode; + __le64 generation; + __le64 root_dirid; + __le64 bytenr; + __le64 byte_limit; + __le64 bytes_used; + __le64 last_snapshot; + __le64 flags; + __le32 refs; + struct btrfs_disk_key drop_progress; + u8 drop_level; + u8 level; +} __attribute__ ((__packed__)); + +/* + * this is used for both forward and backward root refs + */ +struct btrfs_root_ref { + __le64 dirid; + __le64 sequence; + __le16 name_len; +} __attribute__ ((__packed__)); + +#define BTRFS_FILE_EXTENT_INLINE 0 +#define BTRFS_FILE_EXTENT_REG 1 +#define BTRFS_FILE_EXTENT_PREALLOC 2 + +struct btrfs_file_extent_item { + /* + * transaction id that created this extent + */ + __le64 generation; + /* + * max number of bytes to hold this extent in ram + * when we split a compressed extent we can't know how big + * each of the resulting pieces will be. So, this is + * an upper limit on the size of the extent in ram instead of + * an exact limit. + */ + __le64 ram_bytes; + + /* + * 32 bits for the various ways we might encode the data, + * including compression and encryption. If any of these + * are set to something a given disk format doesn't understand + * it is treated like an incompat flag for reading and writing, + * but not for stat. + */ + u8 compression; + u8 encryption; + __le16 other_encoding; /* spare for later use */ + + /* are we inline data or a real extent? */ + u8 type; + + /* + * disk space consumed by the extent, checksum blocks are included + * in these numbers + */ + __le64 disk_bytenr; + __le64 disk_num_bytes; + /* + * the logical offset in file blocks (no csums) + * this extent record is for. This allows a file extent to point + * into the middle of an existing extent on disk, sharing it + * between two snapshots (useful if some bytes in the middle of the + * extent have changed + */ + __le64 offset; + /* + * the logical number of file blocks (no csums included). This + * always reflects the size uncompressed and without encoding. + */ + __le64 num_bytes; + +} __attribute__ ((__packed__)); + +struct btrfs_csum_item { + u8 csum; +} __attribute__ ((__packed__)); + +/* different types of block groups (and chunks) */ +#define BTRFS_BLOCK_GROUP_DATA (1 << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) +#define BTRFS_BLOCK_GROUP_DUP (1 << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) + +struct btrfs_block_group_item { + __le64 used; + __le64 chunk_objectid; + __le64 flags; +} __attribute__ ((__packed__)); + +struct btrfs_space_info { + u64 flags; + + u64 total_bytes; /* total bytes in the space */ + u64 bytes_used; /* total bytes used on disk */ + u64 bytes_pinned; /* total bytes pinned, will be freed when the + transaction finishes */ + u64 bytes_reserved; /* total bytes the allocator has reserved for + current allocations */ + u64 bytes_readonly; /* total bytes that are read only */ + u64 bytes_super; /* total bytes reserved for the super blocks */ + u64 bytes_root; /* the number of bytes needed to commit a + transaction */ + u64 bytes_may_use; /* number of bytes that may be used for + delalloc/allocations */ + u64 bytes_delalloc; /* number of bytes currently reserved for + delayed allocation */ + + int full; /* indicates that we cannot allocate any more + chunks for this space */ + int force_alloc; /* set if we need to force a chunk alloc for + this space */ + int force_delalloc; /* make people start doing filemap_flush until + we're under a threshold */ + + struct list_head list; + + /* for controlling how we free up space for allocations */ + wait_queue_head_t allocate_wait; + wait_queue_head_t flush_wait; + int allocating_chunk; + int flushing; + + /* for block groups in our same type */ + struct list_head block_groups; + spinlock_t lock; + struct rw_semaphore groups_sem; + atomic_t caching_threads; +}; + +/* + * free clusters are used to claim free space in relatively large chunks, + * allowing us to do less seeky writes. They are used for all metadata + * allocations and data allocations in ssd mode. + */ +struct btrfs_free_cluster { + spinlock_t lock; + spinlock_t refill_lock; + struct rb_root root; + + /* largest extent in this cluster */ + u64 max_size; + + /* first extent starting offset */ + u64 window_start; + + /* if this cluster simply points at a bitmap in the block group */ + bool points_to_bitmap; + + struct btrfs_block_group_cache *block_group; + /* + * when a cluster is allocated from a block group, we put the + * cluster onto a list in the block group so that it can + * be freed before the block group is freed. + */ + struct list_head block_group_list; +}; + +enum btrfs_caching_type { + BTRFS_CACHE_NO = 0, + BTRFS_CACHE_STARTED = 1, + BTRFS_CACHE_FINISHED = 2, +}; + +struct btrfs_caching_control { + struct list_head list; + struct mutex mutex; + wait_queue_head_t wait; + struct btrfs_block_group_cache *block_group; + u64 progress; + atomic_t count; +}; + +struct btrfs_block_group_cache { + struct btrfs_key key; + struct btrfs_block_group_item item; + struct btrfs_fs_info *fs_info; + spinlock_t lock; + u64 pinned; + u64 reserved; + u64 bytes_super; + u64 flags; + u64 sectorsize; + int extents_thresh; + int free_extents; + int total_bitmaps; + int ro; + int dirty; + + /* cache tracking stuff */ + int cached; + struct btrfs_caching_control *caching_ctl; + u64 last_byte_to_unpin; + + struct btrfs_space_info *space_info; + + /* free space cache stuff */ + spinlock_t tree_lock; + struct rb_root free_space_offset; + u64 free_space; + + /* block group cache stuff */ + struct rb_node cache_node; + + /* for block groups in the same raid type */ + struct list_head list; + + /* usage count */ + atomic_t count; + + /* List of struct btrfs_free_clusters for this block group. + * Today it will only have one thing on it, but that may change + */ + struct list_head cluster_list; +}; + +struct reloc_control; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_fs_info { + u8 fsid[BTRFS_FSID_SIZE]; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + struct btrfs_root *extent_root; + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; + struct btrfs_root *fs_root; + struct btrfs_root *csum_root; + + /* the log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; + + spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; + + /* block group cache stuff */ + spinlock_t block_group_cache_lock; + struct rb_root block_group_cache_tree; + + struct extent_io_tree freed_extents[2]; + struct extent_io_tree *pinned_extents; + + /* logical->physical extent mapping */ + struct btrfs_mapping_tree mapping_tree; + + u64 generation; + u64 last_trans_committed; + + /* + * this is updated to the current trans every time a full commit + * is required instead of the faster short fsync log commits + */ + u64 last_trans_log_full_commit; + u64 open_ioctl_trans; + unsigned long mount_opt; + u64 max_extent; + u64 max_inline; + u64 alloc_start; + struct btrfs_transaction *running_transaction; + wait_queue_head_t transaction_throttle; + wait_queue_head_t transaction_wait; + wait_queue_head_t async_submit_wait; + + struct btrfs_super_block super_copy; + struct btrfs_super_block super_for_commit; + struct block_device *__bdev; + struct super_block *sb; + struct inode *btree_inode; + struct backing_dev_info bdi; + struct mutex trans_mutex; + struct mutex tree_log_mutex; + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex chunk_mutex; + struct mutex volume_mutex; + /* + * this protects the ordered operations list only while we are + * processing all of the entries on it. This way we make + * sure the commit code doesn't find the list temporarily empty + * because another function happens to be doing non-waiting preflush + * before jumping into the main commit. + */ + struct mutex ordered_operations_mutex; + struct rw_semaphore extent_commit_sem; + + struct rw_semaphore cleanup_work_sem; + + struct rw_semaphore subvol_sem; + struct srcu_struct subvol_srcu; + + struct list_head trans_list; + struct list_head hashers; + struct list_head dead_roots; + struct list_head caching_block_groups; + + spinlock_t delayed_iput_lock; + struct list_head delayed_iputs; + + atomic_t nr_async_submits; + atomic_t async_submit_draining; + atomic_t nr_async_bios; + atomic_t async_delalloc_pages; + + /* + * this is used by the balancing code to wait for all the pending + * ordered extents + */ + spinlock_t ordered_extent_lock; + + /* + * all of the data=ordered extents pending writeback + * these can span multiple transactions and basically include + * every dirty data page that isn't from nodatacow + */ + struct list_head ordered_extents; + + /* + * all of the inodes that have delalloc bytes. It is possible for + * this list to be empty even when there is still dirty data=ordered + * extents waiting to finish IO. + */ + struct list_head delalloc_inodes; + + /* + * special rename and truncate targets that must be on disk before + * we're allowed to commit. This is basically the ext3 style + * data=ordered list. + */ + struct list_head ordered_operations; + + /* + * there is a pool of worker threads for checksumming during writes + * and a pool for checksumming after reads. This is because readers + * can run with FS locks held, and the writers may be waiting for + * those locks. We don't want ordering in the pending list to cause + * deadlocks, and so the two are serviced separately. + * + * A third pool does submit_bio to avoid deadlocking with the other + * two + */ + struct btrfs_workers generic_worker; + struct btrfs_workers workers; + struct btrfs_workers delalloc_workers; + struct btrfs_workers endio_workers; + struct btrfs_workers endio_meta_workers; + struct btrfs_workers endio_meta_write_workers; + struct btrfs_workers endio_write_workers; + struct btrfs_workers submit_workers; + struct btrfs_workers enospc_workers; + /* + * fixup workers take dirty pages that didn't properly go through + * the cow mechanism and make them safe to write. It happens + * for the sys_munmap function call path + */ + struct btrfs_workers fixup_workers; + struct task_struct *transaction_kthread; + struct task_struct *cleaner_kthread; + int thread_pool_size; + + struct kobject super_kobj; + struct completion kobj_unregister; + int do_barriers; + int closing; + int log_root_recovering; + + u64 total_pinned; + + /* protected by the delalloc lock, used to keep from writing + * metadata until there is a nice batch + */ + u64 dirty_metadata_bytes; + struct list_head dirty_cowonly_roots; + + struct btrfs_fs_devices *fs_devices; + + /* + * the space_info list is almost entirely read only. It only changes + * when we add a new raid type to the FS, and that happens + * very rarely. RCU is used to protect it. + */ + struct list_head space_info; + + struct reloc_control *reloc_ctl; + + spinlock_t delalloc_lock; + spinlock_t new_trans_lock; + u64 delalloc_bytes; + + /* data_alloc_cluster is only used in ssd mode */ + struct btrfs_free_cluster data_alloc_cluster; + + /* all metadata allocations go through this cluster */ + struct btrfs_free_cluster meta_alloc_cluster; + + spinlock_t ref_cache_lock; + u64 total_ref_cache_size; + + u64 avail_data_alloc_bits; + u64 avail_metadata_alloc_bits; + u64 avail_system_alloc_bits; + u64 data_alloc_profile; + u64 metadata_alloc_profile; + u64 system_alloc_profile; + + unsigned data_chunk_allocations; + unsigned metadata_ratio; + + void *bdev_holder; +}; + +/* + * in ram representation of the tree. extent_root is used for all allocations + * and for the extent tree extent_root root. + */ +struct btrfs_root { + struct extent_buffer *node; + + /* the node lock is held while changing the node pointer */ + spinlock_t node_lock; + + struct extent_buffer *commit_root; + struct btrfs_root *log_root; + struct btrfs_root *reloc_root; + + struct btrfs_root_item root_item; + struct btrfs_key root_key; + struct btrfs_fs_info *fs_info; + struct extent_io_tree dirty_log_pages; + + struct kobject root_kobj; + struct completion kobj_unregister; + struct mutex objectid_mutex; + + struct mutex log_mutex; + wait_queue_head_t log_writer_wait; + wait_queue_head_t log_commit_wait[2]; + atomic_t log_writers; + atomic_t log_commit[2]; + unsigned long log_transid; + unsigned long last_log_commit; + unsigned long log_batch; + pid_t log_start_pid; + bool log_multiple_pids; + + u64 objectid; + u64 last_trans; + + /* data allocations are done in sectorsize units */ + u32 sectorsize; + + /* node allocations are done in nodesize units */ + u32 nodesize; + + /* leaf allocations are done in leafsize units */ + u32 leafsize; + + u32 stripesize; + + u32 type; + + u64 highest_objectid; + int ref_cows; + int track_dirty; + int in_radix; + int clean_orphans; + + u64 defrag_trans_start; + struct btrfs_key defrag_progress; + struct btrfs_key defrag_max; + int defrag_running; + char *name; + int in_sysfs; + + /* the dirty list is only used by non-reference counted roots */ + struct list_head dirty_list; + + struct list_head root_list; + + spinlock_t list_lock; + struct list_head orphan_list; + + spinlock_t inode_lock; + /* red-black tree that keeps track of in-memory inodes */ + struct rb_root inode_tree; + + /* + * right now this just gets used so that a root has its own devid + * for stat. It may be used for more later + */ + struct super_block anon_super; +}; + +/* + * inode items have the data typically returned from stat and store other + * info about object characteristics. There is one for every file and dir in + * the FS + */ +#define BTRFS_INODE_ITEM_KEY 1 +#define BTRFS_INODE_REF_KEY 12 +#define BTRFS_XATTR_ITEM_KEY 24 +#define BTRFS_ORPHAN_ITEM_KEY 48 +/* reserve 2-15 close to the inode for later flexibility */ + +/* + * dir items are the name -> inode pointers in a directory. There is one + * for every name in a directory. + */ +#define BTRFS_DIR_LOG_ITEM_KEY 60 +#define BTRFS_DIR_LOG_INDEX_KEY 72 +#define BTRFS_DIR_ITEM_KEY 84 +#define BTRFS_DIR_INDEX_KEY 96 +/* + * extent data is for file data + */ +#define BTRFS_EXTENT_DATA_KEY 108 + +/* + * extent csums are stored in a separate tree and hold csums for + * an entire extent on disk. + */ +#define BTRFS_EXTENT_CSUM_KEY 128 + +/* + * root items point to tree roots. They are typically in the root + * tree used by the super block to find all the other trees + */ +#define BTRFS_ROOT_ITEM_KEY 132 + +/* + * root backrefs tie subvols and snapshots to the directory entries that + * reference them + */ +#define BTRFS_ROOT_BACKREF_KEY 144 + +/* + * root refs make a fast index for listing all of the snapshots and + * subvolumes referenced by a given root. They point directly to the + * directory item in the root that references the subvol + */ +#define BTRFS_ROOT_REF_KEY 156 + +/* + * extent items are in the extent map tree. These record which blocks + * are used, and how many references there are to each block + */ +#define BTRFS_EXTENT_ITEM_KEY 168 + +#define BTRFS_TREE_BLOCK_REF_KEY 176 + +#define BTRFS_EXTENT_DATA_REF_KEY 178 + +#define BTRFS_EXTENT_REF_V0_KEY 180 + +#define BTRFS_SHARED_BLOCK_REF_KEY 182 + +#define BTRFS_SHARED_DATA_REF_KEY 184 + +/* + * block groups give us hints into the extent allocation trees. Which + * blocks are free etc etc + */ +#define BTRFS_BLOCK_GROUP_ITEM_KEY 192 + +#define BTRFS_DEV_EXTENT_KEY 204 +#define BTRFS_DEV_ITEM_KEY 216 +#define BTRFS_CHUNK_ITEM_KEY 228 + +/* + * string items are for debugging. They just store a short string of + * data in the FS + */ +#define BTRFS_STRING_ITEM_KEY 253 + +#define BTRFS_MOUNT_NODATASUM (1 << 0) +#define BTRFS_MOUNT_NODATACOW (1 << 1) +#define BTRFS_MOUNT_NOBARRIER (1 << 2) +#define BTRFS_MOUNT_SSD (1 << 3) +#define BTRFS_MOUNT_DEGRADED (1 << 4) +#define BTRFS_MOUNT_COMPRESS (1 << 5) +#define BTRFS_MOUNT_NOTREELOG (1 << 6) +#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) +#define BTRFS_MOUNT_SSD_SPREAD (1 << 8) +#define BTRFS_MOUNT_NOSSD (1 << 9) +#define BTRFS_MOUNT_DISCARD (1 << 10) +#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) + +#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) +#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ + BTRFS_MOUNT_##opt) +/* + * Inode flags + */ +#define BTRFS_INODE_NODATASUM (1 << 0) +#define BTRFS_INODE_NODATACOW (1 << 1) +#define BTRFS_INODE_READONLY (1 << 2) +#define BTRFS_INODE_NOCOMPRESS (1 << 3) +#define BTRFS_INODE_PREALLOC (1 << 4) +#define BTRFS_INODE_SYNC (1 << 5) +#define BTRFS_INODE_IMMUTABLE (1 << 6) +#define BTRFS_INODE_APPEND (1 << 7) +#define BTRFS_INODE_NODUMP (1 << 8) +#define BTRFS_INODE_NOATIME (1 << 9) +#define BTRFS_INODE_DIRSYNC (1 << 10) + + +/* some macros to generate set/get funcs for the struct fields. This + * assumes there is a lefoo_to_cpu for every type, so lets make a simple + * one for u8: + */ +#define le8_to_cpu(v) (v) +#define cpu_to_le8(v) (v) +#define __le8 u8 + +#define read_eb_member(eb, ptr, type, member, result) ( \ + read_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define write_eb_member(eb, ptr, type, member, result) ( \ + write_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#ifndef BTRFS_SETGET_FUNCS +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); +#endif + +#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(struct extent_buffer *eb) \ +{ \ + type *p = kmap_atomic(eb->first_page, KM_USER0); \ + u##bits res = le##bits##_to_cpu(p->member); \ + kunmap_atomic(p, KM_USER0); \ + return res; \ +} \ +static inline void btrfs_set_##name(struct extent_buffer *eb, \ + u##bits val) \ +{ \ + type *p = kmap_atomic(eb->first_page, KM_USER0); \ + p->member = cpu_to_le##bits(val); \ + kunmap_atomic(p, KM_USER0); \ +} + +#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(type *s) \ +{ \ + return le##bits##_to_cpu(s->member); \ +} \ +static inline void btrfs_set_##name(type *s, u##bits val) \ +{ \ + s->member = cpu_to_le##bits(val); \ +} + +BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64); +BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); +BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); +BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); +BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, + start_offset, 64); +BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); +BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); +BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); +BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, + dev_group, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, + seek_speed, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, + bandwidth, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, + generation, 64); + +static inline char *btrfs_device_uuid(struct btrfs_dev_item *d) +{ + return (char *)d + offsetof(struct btrfs_dev_item, uuid); +} + +static inline char *btrfs_device_fsid(struct btrfs_dev_item *d) +{ + return (char *)d + offsetof(struct btrfs_dev_item, fsid); +} + +BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); +BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); +BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); +BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); +BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); +BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); + +static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) +{ + return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); +} + +BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, + stripe_len, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, + num_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, + sub_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); + +static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, + int nr) +{ + unsigned long offset = (unsigned long)c; + offset += offsetof(struct btrfs_chunk, stripe); + offset += nr * sizeof(struct btrfs_stripe); + return (struct btrfs_stripe *)offset; +} + +static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val); +} + +static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val); +} + +/* struct btrfs_block_group_item */ +BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); + +BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(disk_block_group_flags, + struct btrfs_block_group_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_flags, + struct btrfs_block_group_item, flags, 64); + +/* struct btrfs_inode_ref */ +BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); + +/* struct btrfs_inode_item */ +BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); +BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); +BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); +BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); +BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); + +static inline struct btrfs_timespec * +btrfs_inode_atime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, atime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_mtime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, mtime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_ctime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, ctime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_otime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, otime); + return (struct btrfs_timespec *)ptr; +} + +BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); + +/* struct btrfs_dev_extent */ +BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, + chunk_tree, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); + +static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) +{ + unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid); + return (u8 *)((unsigned long)dev + ptr); +} + +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); +BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); + +BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32); + + +BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); + +static inline void btrfs_tree_block_key(struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +static inline void btrfs_set_tree_block_key(struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, + root, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, + objectid, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, + offset, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, + count, 32); + +BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, + count, 32); + +BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, + type, 8); +BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, + offset, 64); + +static inline u32 btrfs_extent_inline_ref_size(int type) +{ + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + return sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_SHARED_DATA_REF_KEY) + return sizeof(struct btrfs_shared_data_ref) + + sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_EXTENT_DATA_REF_KEY) + return sizeof(struct btrfs_extent_data_ref) + + offsetof(struct btrfs_extent_inline_ref, offset); + BUG(); + return 0; +} + +BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64); +BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0, + generation, 64); +BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64); +BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32); + +/* struct btrfs_node */ +BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); + +static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_blockptr(struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline unsigned long btrfs_node_key_ptr_offset(int nr) +{ + return offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; +} + +void btrfs_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr); + +static inline void btrfs_set_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr; + ptr = btrfs_node_key_ptr_offset(nr); + write_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} + +/* struct btrfs_item */ +BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); + +static inline unsigned long btrfs_item_nr_offset(int nr) +{ + return offsetof(struct btrfs_leaf, items) + + sizeof(struct btrfs_item) * nr; +} + +static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb, + int nr) +{ + return (struct btrfs_item *)btrfs_item_nr_offset(nr); +} + +static inline u32 btrfs_item_end(struct extent_buffer *eb, + struct btrfs_item *item) +{ + return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); +} + +static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_end(eb, btrfs_item_nr(eb, nr)); +} + +static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_offset(eb, btrfs_item_nr(eb, nr)); +} + +static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_size(eb, btrfs_item_nr(eb, nr)); +} + +static inline void btrfs_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + read_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +static inline void btrfs_set_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + write_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); + +/* + * struct btrfs_root_ref + */ +BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); + +/* struct btrfs_dir_item */ +BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); + +static inline void btrfs_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +/* struct btrfs_disk_key */ +BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, + objectid, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, + struct btrfs_disk_key *disk) +{ + cpu->offset = le64_to_cpu(disk->offset); + cpu->type = disk->type; + cpu->objectid = le64_to_cpu(disk->objectid); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, + struct btrfs_key *cpu) +{ + disk->offset = cpu_to_le64(cpu->offset); + disk->type = cpu->type; + disk->objectid = cpu_to_le64(cpu->objectid); +} + +static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + btrfs_node_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + btrfs_item_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_key *key) +{ + struct btrfs_disk_key disk_key; + btrfs_dir_item_key(eb, item, &disk_key); + btrfs_disk_key_to_cpu(key, &disk_key); +} + + +static inline u8 btrfs_key_type(struct btrfs_key *key) +{ + return key->type; +} + +static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val) +{ + key->type = val; +} + +/* struct btrfs_header */ +BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); +BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); +BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); + +static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag) +{ + return (btrfs_header_flags(eb) & flag) == flag; +} + +static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + btrfs_set_header_flags(eb, flags | flag); + return (flags & flag) == flag; +} + +static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + btrfs_set_header_flags(eb, flags & ~flag); + return (flags & flag) == flag; +} + +static inline int btrfs_header_backref_rev(struct extent_buffer *eb) +{ + u64 flags = btrfs_header_flags(eb); + return flags >> BTRFS_BACKREF_REV_SHIFT; +} + +static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, + int rev) +{ + u64 flags = btrfs_header_flags(eb); + flags &= ~BTRFS_BACKREF_REV_MASK; + flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; + btrfs_set_header_flags(eb, flags); +} + +static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, fsid); + return (u8 *)ptr; +} + +static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid); + return (u8 *)ptr; +} + +static inline u8 *btrfs_super_fsid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_super_block, fsid); + return (u8 *)ptr; +} + +static inline u8 *btrfs_header_csum(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, csum); + return (u8 *)ptr; +} + +static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb) +{ + return NULL; +} + +static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb) +{ + return NULL; +} + +static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb) +{ + return NULL; +} + +static inline int btrfs_is_leaf(struct extent_buffer *eb) +{ + return btrfs_header_level(eb) == 0; +} + +/* struct btrfs_root_item */ +BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, + generation, 64); +BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); + +BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); +BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); +BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); +BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, + last_snapshot, 64); + +/* struct btrfs_super_block */ + +BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); +BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, + struct btrfs_super_block, sys_chunk_array_size, 32); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, + struct btrfs_super_block, chunk_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, + root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, + chunk_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, + log_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, + log_root_transid, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, + log_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, + sectorsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, + nodesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block, + leafsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, + stripesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, + root_dir_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, + num_devices, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, + incompat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, + csum_type, 16); + +static inline int btrfs_super_csum_size(struct btrfs_super_block *s) +{ + int t = btrfs_super_csum_type(s); + BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes)); + return btrfs_csum_sizes[t]; +} + +static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) +{ + return offsetof(struct btrfs_leaf, items); +} + +/* struct btrfs_file_extent_item */ +BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); + +static inline unsigned long +btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) +{ + unsigned long offset = (unsigned long)e; + offset += offsetof(struct btrfs_file_extent_item, disk_bytenr); + return offset; +} + +static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) +{ + return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize; +} + +BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, + disk_bytenr, 64); +BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, + disk_num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, + offset, 64); +BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, + num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, + ram_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, + compression, 8); +BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, + encryption, 8); +BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + other_encoding, 16); + +/* this returns the number of file bytes represented by the inline item. + * If an item is compressed, this is the uncompressed size + */ +static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, + struct btrfs_file_extent_item *e) +{ + return btrfs_file_extent_ram_bytes(eb, e); +} + +/* + * this returns the number of bytes used by the item on disk, minus the + * size of any extent headers. If a file is compressed on disk, this is + * the compressed size + */ +static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, + struct btrfs_item *e) +{ + unsigned long offset; + offset = offsetof(struct btrfs_file_extent_item, disk_bytenr); + return btrfs_item_size(eb, e) - offset; +} + +static inline struct btrfs_root *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline int btrfs_set_root_name(struct btrfs_root *root, + const char *name, int len) +{ + /* if we already have a name just free it */ + kfree(root->name); + + root->name = kmalloc(len+1, GFP_KERNEL); + if (!root->name) + return -ENOMEM; + + memcpy(root->name, name, len); + root->name[len] = '\0'; + + return 0; +} + +static inline u32 btrfs_level_size(struct btrfs_root *root, int level) +{ + if (level == 0) + return root->leafsize; + return root->nodesize; +} + +/* helper function to cast into the data area of the leaf. */ +#define btrfs_item_ptr(leaf, slot, type) \ + ((type *)(btrfs_leaf_data(leaf) + \ + btrfs_item_offset_nr(leaf, slot))) + +#define btrfs_item_ptr_offset(leaf, slot) \ + ((unsigned long)(btrfs_leaf_data(leaf) + \ + btrfs_item_offset_nr(leaf, slot))) + +static inline struct dentry *fdentry(struct file *file) +{ + return file->f_path.dentry; +} + +/* extent-tree.c */ +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count); +int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num, int reserved); +int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr); +int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, + u64 bytenr); +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +u64 btrfs_find_block_group(struct btrfs_root *root, + u64 search_start, u64 search_hint, int owner); +struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u32 blocksize, + u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 hint, u64 empty_size); +int btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize, + u64 parent, u64 root_objectid, int level); +struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize, + int level); +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, + u64 offset, struct btrfs_key *ins); +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins); +int btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data); +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 flags, + int is_data); +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset); + +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset); + +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); +int btrfs_free_block_groups(struct btrfs_fs_info *info); +int btrfs_read_block_groups(struct btrfs_root *root); +int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr); +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, + u64 size); +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 group_start); +int btrfs_prepare_block_group_relocation(struct btrfs_root *root, + struct btrfs_block_group_cache *group); + +u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); +void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); +void btrfs_clear_space_info_full(struct btrfs_fs_info *info); + +int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); +int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); +int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items); +int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items); +int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); +void btrfs_free_reserved_data_space(struct btrfs_root *root, + struct inode *inode, u64 bytes); +void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); +void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); +/* ctree.c */ +int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot); +int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type); +int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key); +struct extent_buffer *btrfs_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int lowest_level, + int cache_only, u64 min_trans); +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_key *max_key, + struct btrfs_path *path, int cache_only, + u64 min_trans); +int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret); +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid); +int btrfs_block_can_be_shared(struct btrfs_root *root, + struct extent_buffer *buf); +int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, u32 data_size); +int btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end); +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset); +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key); +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow); +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, int cache_only, u64 *last_ret, + struct btrfs_key *progress); +void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); +struct btrfs_path *btrfs_alloc_path(void); +void btrfs_free_path(struct btrfs_path *p); +void btrfs_set_path_blocking(struct btrfs_path *p); +void btrfs_unlock_up_safe(struct btrfs_path *p, int level); + +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr); +static inline int btrfs_del_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + return btrfs_del_items(trans, root, path, path->slots[0], 1); +} + +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, void *data, u32 data_size); +int btrfs_insert_some_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr); +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, int nr); + +static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u32 data_size) +{ + return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); +} + +int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref); +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent); +/* root-item.c */ +int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, + u64 root_id, u64 ref_id); +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 sequence, + const char *name, int name_len); +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, + const char *name, int name_len); +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key); +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item); +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item); +int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct + btrfs_root_item *item, struct btrfs_key *key); +int btrfs_search_root(struct btrfs_root *root, u64 search_start, + u64 *found_objectid); +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); +int btrfs_find_orphan_roots(struct btrfs_root *tree_root); +int btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); +/* dir-item.c */ +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + int name_len, u64 dir, + struct btrfs_key *location, u8 type, u64 index); +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, int name_len, + int mod); +struct btrfs_dir_item * +btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod); +struct btrfs_dir_item * +btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const char *name, int name_len); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len); +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di); +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len); +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod); + +/* orphan.c */ +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); + +/* inode-map.c */ +int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, + struct btrfs_root *fs_root, + u64 dirid, u64 *objectid); +int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid); + +/* inode-item.c */ +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index); +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index); +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod); + +/* file-item.c */ +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len); +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst); +int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding); +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 bytenr, int mod); +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums); +int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 file_start, int contig); +int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode, + u64 start, unsigned long len); +struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow); +int btrfs_csum_truncate(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u64 isize); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, + u64 end, struct list_head *list); +/* inode.c */ + +/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ +#if defined(ClearPageFsMisc) && !defined(ClearPageChecked) +#define ClearPageChecked ClearPageFsMisc +#define SetPageChecked SetPageFsMisc +#define PageChecked PageFsMisc +#endif + +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); +int btrfs_set_inode_index(struct inode *dir, u64 *index); +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len); +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index); +int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, u64 objectid, + const char *name, int name_len); +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 new_size, + u32 min_type); + +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); +int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc); +int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + struct btrfs_root *new_root, + u64 new_dirid, u64 alloc_hint); +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, unsigned long bio_flags); + +unsigned long btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, pgoff_t last_index); +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +int btrfs_readpage(struct file *file, struct page *page); +void btrfs_delete_inode(struct inode *inode); +void btrfs_put_inode(struct inode *inode); +int btrfs_write_inode(struct inode *inode, int wait); +void btrfs_dirty_inode(struct inode *inode); +struct inode *btrfs_alloc_inode(struct super_block *sb); +void btrfs_destroy_inode(struct inode *inode); +void btrfs_drop_inode(struct inode *inode); +int btrfs_init_cachep(void); +void btrfs_destroy_cachep(void); +long btrfs_ioctl_trans_end(struct file *file); +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root); +int btrfs_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to); +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t page_offset, u64 start, u64 end, + int create); +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); +int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); +void btrfs_orphan_cleanup(struct btrfs_root *root); +int btrfs_cont_expand(struct inode *inode, loff_t size); +int btrfs_invalidate_inodes(struct btrfs_root *root); +void btrfs_add_delayed_iput(struct inode *inode); +void btrfs_run_delayed_iputs(struct btrfs_root *root); +extern const struct dentry_operations btrfs_dentry_operations; + +/* ioctl.c */ +long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +void btrfs_update_iflags(struct inode *inode); +void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); + +/* file.c */ +int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); +int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned); +int btrfs_check_file(struct btrfs_root *root, struct inode *inode); +extern const struct file_operations btrfs_file_operations; +int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, + u64 start, u64 end, u64 *hint_byte, int drop_cache); +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct inode *inode, u64 start, u64 end); +int btrfs_release_file(struct inode *inode, struct file *file); + +/* tree-defrag.c */ +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int cache_only); + +/* sysfs.c */ +int btrfs_init_sysfs(void); +void btrfs_exit_sysfs(void); +int btrfs_sysfs_add_super(struct btrfs_fs_info *fs); +int btrfs_sysfs_add_root(struct btrfs_root *root); +void btrfs_sysfs_del_root(struct btrfs_root *root); +void btrfs_sysfs_del_super(struct btrfs_fs_info *root); + +/* xattr.c */ +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); + +/* super.c */ +u64 btrfs_parse_size(char *str); +int btrfs_parse_options(struct btrfs_root *root, char *options); +int btrfs_sync_fs(struct super_block *sb, int wait); + +/* acl.c */ +#ifdef CONFIG_BTRFS_FS_POSIX_ACL +int btrfs_check_acl(struct inode *inode, int mask); +#else +#define btrfs_check_acl NULL +#endif +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir); +int btrfs_acl_chmod(struct inode *inode); + +/* relocation.c */ +int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_root *root); +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); +#endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c new file mode 100644 index 00000000000..84e6781413b --- /dev/null +++ b/fs/btrfs/delayed-ref.c @@ -0,0 +1,919 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/sort.h> +#include "ctree.h" +#include "delayed-ref.h" +#include "transaction.h" + +/* + * delayed back reference update tracking. For subvolume trees + * we queue up extent allocations and backref maintenance for + * delayed processing. This avoids deep call chains where we + * add extents in the middle of btrfs_search_slot, and it allows + * us to buffer up frequently modified backrefs in an rb tree instead + * of hammering updates on the extent allocation tree. + */ + +/* + * compare two delayed tree backrefs with same bytenr and type + */ +static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, + struct btrfs_delayed_tree_ref *ref1) +{ + if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } + return 0; +} + +/* + * compare two delayed data backrefs with same bytenr and type + */ +static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, + struct btrfs_delayed_data_ref *ref1) +{ + if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + if (ref1->objectid < ref2->objectid) + return -1; + if (ref1->objectid > ref2->objectid) + return 1; + if (ref1->offset < ref2->offset) + return -1; + if (ref1->offset > ref2->offset) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } + return 0; +} + +/* + * entries in the rb tree are ordered by the byte number of the extent, + * type of the delayed backrefs and content of delayed backrefs. + */ +static int comp_entry(struct btrfs_delayed_ref_node *ref2, + struct btrfs_delayed_ref_node *ref1) +{ + if (ref1->bytenr < ref2->bytenr) + return -1; + if (ref1->bytenr > ref2->bytenr) + return 1; + if (ref1->is_head && ref2->is_head) + return 0; + if (ref2->is_head) + return -1; + if (ref1->is_head) + return 1; + if (ref1->type < ref2->type) + return -1; + if (ref1->type > ref2->type) + return 1; + if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || + ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { + return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), + btrfs_delayed_node_to_tree_ref(ref1)); + } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || + ref1->type == BTRFS_SHARED_DATA_REF_KEY) { + return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), + btrfs_delayed_node_to_data_ref(ref1)); + } + BUG(); + return 0; +} + +/* + * insert a new ref into the rbtree. This returns any existing refs + * for the same (bytenr,parent) tuple, or NULL if the new node was properly + * inserted. + */ +static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_node *entry; + struct btrfs_delayed_ref_node *ins; + int cmp; + + ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, + rb_node); + + cmp = comp_entry(entry, ins); + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * find an head entry based on bytenr. This returns the delayed ref + * head if it was able to find one, or NULL if nothing was in that spot + */ +static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, + u64 bytenr, + struct btrfs_delayed_ref_node **last) +{ + struct rb_node *n = root->rb_node; + struct btrfs_delayed_ref_node *entry; + int cmp; + + while (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + WARN_ON(!entry->in_tree); + if (last) + *last = entry; + + if (bytenr < entry->bytenr) + cmp = -1; + else if (bytenr > entry->bytenr) + cmp = 1; + else if (!btrfs_delayed_ref_is_head(entry)) + cmp = 1; + else + cmp = 0; + + if (cmp < 0) + n = n->rb_left; + else if (cmp > 0) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + assert_spin_locked(&delayed_refs->lock); + if (mutex_trylock(&head->mutex)) + return 0; + + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + mutex_lock(&head->mutex); + spin_lock(&delayed_refs->lock); + if (!head->node.in_tree) { + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return -EAGAIN; + } + btrfs_put_delayed_ref(&head->node); + return 0; +} + +int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, + struct list_head *cluster, u64 start) +{ + int count = 0; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *head; + + delayed_refs = &trans->transaction->delayed_refs; + if (start == 0) { + node = rb_first(&delayed_refs->root); + } else { + ref = NULL; + find_ref_head(&delayed_refs->root, start, &ref); + if (ref) { + struct btrfs_delayed_ref_node *tmp; + + node = rb_prev(&ref->rb_node); + while (node) { + tmp = rb_entry(node, + struct btrfs_delayed_ref_node, + rb_node); + if (tmp->bytenr < start) + break; + ref = tmp; + node = rb_prev(&ref->rb_node); + } + node = &ref->rb_node; + } else + node = rb_first(&delayed_refs->root); + } +again: + while (node && count < 32) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + if (btrfs_delayed_ref_is_head(ref)) { + head = btrfs_delayed_node_to_head(ref); + if (list_empty(&head->cluster)) { + list_add_tail(&head->cluster, cluster); + delayed_refs->run_delayed_start = + head->node.bytenr; + count++; + + WARN_ON(delayed_refs->num_heads_ready == 0); + delayed_refs->num_heads_ready--; + } else if (count) { + /* the goal of the clustering is to find extents + * that are likely to end up in the same extent + * leaf on disk. So, we don't want them spread + * all over the tree. Stop now if we've hit + * a head that was already in use + */ + break; + } + } + node = rb_next(node); + } + if (count) { + return 0; + } else if (start) { + /* + * we've gone to the end of the rbtree without finding any + * clusters. start from the beginning and try again + */ + start = 0; + node = rb_first(&delayed_refs->root); + goto again; + } + return 1; +} + +/* + * This checks to see if there are any delayed refs in the + * btree for a given bytenr. It returns one if it finds any + * and zero otherwise. + * + * If it only finds a head node, it returns 0. + * + * The idea is to use this when deciding if you can safely delete an + * extent from the extent allocation tree. There may be a pending + * ref in the rbtree that adds or removes references, so as long as this + * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent + * allocation tree. + */ +int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr) +{ + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *prev_node; + int ret = 0; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + ref = find_ref_head(&delayed_refs->root, bytenr, NULL); + if (ref) { + prev_node = rb_prev(&ref->rb_node); + if (!prev_node) + goto out; + ref = rb_entry(prev_node, struct btrfs_delayed_ref_node, + rb_node); + if (ref->bytenr == bytenr) + ret = 1; + } +out: + spin_unlock(&delayed_refs->lock); + return ret; +} + +/* + * helper function to lookup reference count and flags of extent. + * + * the head node for delayed ref is used to store the sum of all the + * reference count modifications queued up in the rbtree. the head + * node may also store the extent flags to set. This way you can check + * to see what the reference count and extent flags would be if all of + * the delayed refs are not processed. + */ +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags) +{ + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + struct btrfs_key key; + u32 item_size; + u64 num_refs; + u64 extent_flags; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + delayed_refs = &trans->transaction->delayed_refs; +again: + ret = btrfs_search_slot(trans, root->fs_info->extent_root, + &key, path, 0, 0); + if (ret < 0) + goto out; + + if (ret == 0) { + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (item_size >= sizeof(*ei)) { + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + extent_flags = btrfs_extent_flags(leaf, ei); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + num_refs = btrfs_extent_refs_v0(leaf, ei0); + /* FIXME: this isn't correct for data */ + extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; +#else + BUG(); +#endif + } + BUG_ON(num_refs == 0); + } else { + num_refs = 0; + extent_flags = 0; + ret = 0; + } + + spin_lock(&delayed_refs->lock); + ref = find_ref_head(&delayed_refs->root, bytenr, NULL); + if (ref) { + head = btrfs_delayed_node_to_head(ref); + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&ref->refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(root->fs_info->extent_root, path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(ref); + goto again; + } + if (head->extent_op && head->extent_op->update_flags) + extent_flags |= head->extent_op->flags_to_set; + else + BUG_ON(num_refs == 0); + + num_refs += ref->ref_mod; + mutex_unlock(&head->mutex); + } + WARN_ON(num_refs == 0); + if (refs) + *refs = num_refs; + if (flags) + *flags = extent_flags; +out: + spin_unlock(&delayed_refs->lock); + btrfs_free_path(path); + return ret; +} + +/* + * helper function to update an extent delayed ref in the + * rbtree. existing and update must both have the same + * bytenr and parent + * + * This may free existing if the update cancels out whatever + * operation it was doing. + */ +static noinline void +update_existing_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_node *existing, + struct btrfs_delayed_ref_node *update) +{ + if (update->action != existing->action) { + /* + * this is effectively undoing either an add or a + * drop. We decrement the ref_mod, and if it goes + * down to zero we just delete the entry without + * every changing the extent allocation tree. + */ + existing->ref_mod--; + if (existing->ref_mod == 0) { + rb_erase(&existing->rb_node, + &delayed_refs->root); + existing->in_tree = 0; + btrfs_put_delayed_ref(existing); + delayed_refs->num_entries--; + if (trans->delayed_ref_updates) + trans->delayed_ref_updates--; + } else { + WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || + existing->type == BTRFS_SHARED_BLOCK_REF_KEY); + } + } else { + WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || + existing->type == BTRFS_SHARED_BLOCK_REF_KEY); + /* + * the action on the existing ref matches + * the action on the ref we're trying to add. + * Bump the ref_mod by one so the backref that + * is eventually added/removed has the correct + * reference count + */ + existing->ref_mod += update->ref_mod; + } +} + +/* + * helper function to update the accounting in the head ref + * existing and update must have the same bytenr + */ +static noinline void +update_existing_head_ref(struct btrfs_delayed_ref_node *existing, + struct btrfs_delayed_ref_node *update) +{ + struct btrfs_delayed_ref_head *existing_ref; + struct btrfs_delayed_ref_head *ref; + + existing_ref = btrfs_delayed_node_to_head(existing); + ref = btrfs_delayed_node_to_head(update); + BUG_ON(existing_ref->is_data != ref->is_data); + + if (ref->must_insert_reserved) { + /* if the extent was freed and then + * reallocated before the delayed ref + * entries were processed, we can end up + * with an existing head ref without + * the must_insert_reserved flag set. + * Set it again here + */ + existing_ref->must_insert_reserved = ref->must_insert_reserved; + + /* + * update the num_bytes so we make sure the accounting + * is done correctly + */ + existing->num_bytes = update->num_bytes; + + } + + if (ref->extent_op) { + if (!existing_ref->extent_op) { + existing_ref->extent_op = ref->extent_op; + } else { + if (ref->extent_op->update_key) { + memcpy(&existing_ref->extent_op->key, + &ref->extent_op->key, + sizeof(ref->extent_op->key)); + existing_ref->extent_op->update_key = 1; + } + if (ref->extent_op->update_flags) { + existing_ref->extent_op->flags_to_set |= + ref->extent_op->flags_to_set; + existing_ref->extent_op->update_flags = 1; + } + kfree(ref->extent_op); + } + } + /* + * update the reference mod on the head to reflect this new operation + */ + existing->ref_mod += update->ref_mod; +} + +/* + * helper function to actually insert a head node into the rbtree. + * this does all the dirty work in terms of maintaining the correct + * overall modification count. + */ +static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, + int action, int is_data) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_ref_head *head_ref = NULL; + struct btrfs_delayed_ref_root *delayed_refs; + int count_mod = 1; + int must_insert_reserved = 0; + + /* + * the head node stores the sum of all the mods, so dropping a ref + * should drop the sum in the head node by one. + */ + if (action == BTRFS_UPDATE_DELAYED_HEAD) + count_mod = 0; + else if (action == BTRFS_DROP_DELAYED_REF) + count_mod = -1; + + /* + * BTRFS_ADD_DELAYED_EXTENT means that we need to update + * the reserved accounting when the extent is finally added, or + * if a later modification deletes the delayed ref without ever + * inserting the extent into the extent allocation tree. + * ref->must_insert_reserved is the flag used to record + * that accounting mods are required. + * + * Once we record must_insert_reserved, switch the action to + * BTRFS_ADD_DELAYED_REF because other special casing is not required. + */ + if (action == BTRFS_ADD_DELAYED_EXTENT) + must_insert_reserved = 1; + else + must_insert_reserved = 0; + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = count_mod; + ref->type = 0; + ref->action = 0; + ref->is_head = 1; + ref->in_tree = 1; + + head_ref = btrfs_delayed_node_to_head(ref); + head_ref->must_insert_reserved = must_insert_reserved; + head_ref->is_data = is_data; + + INIT_LIST_HEAD(&head_ref->cluster); + mutex_init(&head_ref->mutex); + + existing = tree_insert(&delayed_refs->root, &ref->rb_node); + + if (existing) { + update_existing_head_ref(existing, ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + delayed_refs->num_heads++; + delayed_refs->num_heads_ready++; + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * helper to insert a delayed tree ref into the rbtree. + */ +static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_tree_ref *full_ref; + struct btrfs_delayed_ref_root *delayed_refs; + + if (action == BTRFS_ADD_DELAYED_EXTENT) + action = BTRFS_ADD_DELAYED_REF; + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = 1; + ref->action = action; + ref->is_head = 0; + ref->in_tree = 1; + + full_ref = btrfs_delayed_node_to_tree_ref(ref); + if (parent) { + full_ref->parent = parent; + ref->type = BTRFS_SHARED_BLOCK_REF_KEY; + } else { + full_ref->root = ref_root; + ref->type = BTRFS_TREE_BLOCK_REF_KEY; + } + full_ref->level = level; + + existing = tree_insert(&delayed_refs->root, &ref->rb_node); + + if (existing) { + update_existing_ref(trans, delayed_refs, existing, ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * helper to insert a delayed data ref into the rbtree. + */ +static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, u64 owner, u64 offset, + int action) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_data_ref *full_ref; + struct btrfs_delayed_ref_root *delayed_refs; + + if (action == BTRFS_ADD_DELAYED_EXTENT) + action = BTRFS_ADD_DELAYED_REF; + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = 1; + ref->action = action; + ref->is_head = 0; + ref->in_tree = 1; + + full_ref = btrfs_delayed_node_to_data_ref(ref); + if (parent) { + full_ref->parent = parent; + ref->type = BTRFS_SHARED_DATA_REF_KEY; + } else { + full_ref->root = ref_root; + ref->type = BTRFS_EXTENT_DATA_REF_KEY; + } + full_ref->objectid = owner; + full_ref->offset = offset; + + existing = tree_insert(&delayed_refs->root, &ref->rb_node); + + if (existing) { + update_existing_ref(trans, delayed_refs, existing, ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * add a delayed tree ref. This does all of the accounting required + * to make sure the delayed ref is eventually processed before this + * transaction commits. + */ +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_tree_ref *ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + BUG_ON(extent_op && extent_op->is_data); + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) { + kfree(ref); + return -ENOMEM; + } + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, + action, 0); + BUG_ON(ret); + + ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, level, action); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} + +/* + * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. + */ +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, + u64 owner, u64 offset, int action, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_data_ref *ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + BUG_ON(extent_op && !extent_op->is_data); + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) { + kfree(ref); + return -ENOMEM; + } + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, + action, 1); + BUG_ON(ret); + + ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, owner, offset, action); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} + +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) + return -ENOMEM; + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, + num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + extent_op->is_data); + BUG_ON(ret); + + spin_unlock(&delayed_refs->lock); + return 0; +} + +/* + * this does a simple search for the head node for a given extent. + * It must be called with the delayed ref spinlock held, and it returns + * the head node if any where found, or NULL if not. + */ +struct btrfs_delayed_ref_head * +btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) +{ + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + ref = find_ref_head(&delayed_refs->root, bytenr, NULL); + if (ref) + return btrfs_delayed_node_to_head(ref); + return NULL; +} + +/* + * add a delayed ref to the tree. This does all of the accounting required + * to make sure the delayed ref is eventually processed before this + * transaction commits. + * + * The main point of this call is to add and remove a backreference in a single + * shot, taking the lock only once, and only searching for the head node once. + * + * It is the same as doing a ref add and delete in two separate calls. + */ +#if 0 +int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 orig_parent, + u64 parent, u64 orig_ref_root, u64 ref_root, + u64 orig_ref_generation, u64 ref_generation, + u64 owner_objectid, int pin) +{ + struct btrfs_delayed_ref *ref; + struct btrfs_delayed_ref *old_ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS); + if (!old_ref) { + kfree(ref); + return -ENOMEM; + } + + /* + * the parent = 0 case comes from cases where we don't actually + * know the parent yet. It will get updated later via a add/drop + * pair. + */ + if (parent == 0) + parent = bytenr; + if (orig_parent == 0) + orig_parent = bytenr; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) { + kfree(ref); + kfree(old_ref); + return -ENOMEM; + } + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes, + (u64)-1, 0, 0, 0, + BTRFS_UPDATE_DELAYED_HEAD, 0); + BUG_ON(ret); + + ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, ref_generation, + owner_objectid, BTRFS_ADD_DELAYED_REF, 0); + BUG_ON(ret); + + ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes, + orig_parent, orig_ref_root, + orig_ref_generation, owner_objectid, + BTRFS_DROP_DELAYED_REF, pin); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} +#endif diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h new file mode 100644 index 00000000000..f6fc67ddad3 --- /dev/null +++ b/fs/btrfs/delayed-ref.h @@ -0,0 +1,214 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#ifndef __DELAYED_REF__ +#define __DELAYED_REF__ + +/* these are the possible values of struct btrfs_delayed_ref->action */ +#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ +#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ +#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ +#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ + +struct btrfs_delayed_ref_node { + struct rb_node rb_node; + + /* the starting bytenr of the extent */ + u64 bytenr; + + /* the size of the extent */ + u64 num_bytes; + + /* ref count on this data structure */ + atomic_t refs; + + /* + * how many refs is this entry adding or deleting. For + * head refs, this may be a negative number because it is keeping + * track of the total mods done to the reference count. + * For individual refs, this will always be a positive number + * + * It may be more than one, since it is possible for a single + * parent to have more than one ref on an extent + */ + int ref_mod; + + unsigned int action:8; + unsigned int type:8; + /* is this node still in the rbtree? */ + unsigned int is_head:1; + unsigned int in_tree:1; +}; + +struct btrfs_delayed_extent_op { + struct btrfs_disk_key key; + u64 flags_to_set; + unsigned int update_key:1; + unsigned int update_flags:1; + unsigned int is_data:1; +}; + +/* + * the head refs are used to hold a lock on a given extent, which allows us + * to make sure that only one process is running the delayed refs + * at a time for a single extent. They also store the sum of all the + * reference count modifications we've queued up. + */ +struct btrfs_delayed_ref_head { + struct btrfs_delayed_ref_node node; + + /* + * the mutex is held while running the refs, and it is also + * held when checking the sum of reference modifications. + */ + struct mutex mutex; + + struct list_head cluster; + + struct btrfs_delayed_extent_op *extent_op; + /* + * when a new extent is allocated, it is just reserved in memory + * The actual extent isn't inserted into the extent allocation tree + * until the delayed ref is processed. must_insert_reserved is + * used to flag a delayed ref so the accounting can be updated + * when a full insert is done. + * + * It is possible the extent will be freed before it is ever + * inserted into the extent allocation tree. In this case + * we need to update the in ram accounting to properly reflect + * the free has happened. + */ + unsigned int must_insert_reserved:1; + unsigned int is_data:1; +}; + +struct btrfs_delayed_tree_ref { + struct btrfs_delayed_ref_node node; + union { + u64 root; + u64 parent; + }; + int level; +}; + +struct btrfs_delayed_data_ref { + struct btrfs_delayed_ref_node node; + union { + u64 root; + u64 parent; + }; + u64 objectid; + u64 offset; +}; + +struct btrfs_delayed_ref_root { + struct rb_root root; + + /* this spin lock protects the rbtree and the entries inside */ + spinlock_t lock; + + /* how many delayed ref updates we've queued, used by the + * throttling code + */ + unsigned long num_entries; + + /* total number of head nodes in tree */ + unsigned long num_heads; + + /* total number of head nodes ready for processing */ + unsigned long num_heads_ready; + + /* + * set when the tree is flushing before a transaction commit, + * used by the throttling code to decide if new updates need + * to be run right away + */ + int flushing; + + u64 run_delayed_start; +}; + +static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) +{ + WARN_ON(atomic_read(&ref->refs) == 0); + if (atomic_dec_and_test(&ref->refs)) { + WARN_ON(ref->in_tree); + kfree(ref); + } +} + +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, + u64 owner, u64 offset, int action, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op); + +struct btrfs_delayed_ref_head * +btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); +int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags); +int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 orig_parent, + u64 parent, u64 orig_ref_root, u64 ref_root, + u64 orig_ref_generation, u64 ref_generation, + u64 owner_objectid, int pin); +int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head); +int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, + struct list_head *cluster, u64 search_start); +/* + * a node might live in a head or a regular ref, this lets you + * test for the proper type to use. + */ +static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) +{ + return node->is_head; +} + +/* + * helper functions to cast a node into its container + */ +static inline struct btrfs_delayed_tree_ref * +btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_tree_ref, node); +} + +static inline struct btrfs_delayed_data_ref * +btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_data_ref, node); +} + +static inline struct btrfs_delayed_ref_head * +btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(!btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_ref_head, node); +} +#endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c new file mode 100644 index 00000000000..e9103b3baa4 --- /dev/null +++ b/fs/btrfs/dir-item.c @@ -0,0 +1,431 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "hash.h" +#include "transaction.h" + +/* + * insert a name into a directory, doing overflow properly if there is a hash + * collision. data_size indicates how big the item inserted should be. On + * success a struct btrfs_dir_item pointer is returned, otherwise it is + * an ERR_PTR. + * + * The name is not copied into the dir item, you have to do that yourself. + */ +static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle + *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, + u32 data_size, + const char *name, + int name_len) +{ + int ret; + char *ptr; + struct btrfs_item *item; + struct extent_buffer *leaf; + + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (ret == -EEXIST) { + struct btrfs_dir_item *di; + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) + return ERR_PTR(-EEXIST); + ret = btrfs_extend_item(trans, root, path, data_size); + WARN_ON(ret > 0); + } + if (ret < 0) + return ERR_PTR(ret); + WARN_ON(ret > 0); + leaf = path->nodes[0]; + item = btrfs_item_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr(leaf, path->slots[0], char); + BUG_ON(data_size > btrfs_item_size(leaf, item)); + ptr += btrfs_item_size(leaf, item) - data_size; + return (struct btrfs_dir_item *)ptr; +} + +/* + * xattrs work a lot like directories, this inserts an xattr item + * into the tree + */ +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len) +{ + int ret = 0; + struct btrfs_dir_item *dir_item; + unsigned long name_ptr, data_ptr; + struct btrfs_key key, location; + struct btrfs_disk_key disk_key; + struct extent_buffer *leaf; + u32 data_size; + + BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)); + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + + data_size = sizeof(*dir_item) + name_len + data_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + /* + * FIXME: at some point we should handle xattr's that are larger than + * what we can fit in our leaf. We set location to NULL b/c we arent + * pointing at anything else, that will change if we store the xattr + * data in a separate inode. + */ + BUG_ON(IS_ERR(dir_item)); + memset(&location, 0, sizeof(location)); + + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, &location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + btrfs_set_dir_data_len(leaf, dir_item, data_len); + name_ptr = (unsigned long)(dir_item + 1); + data_ptr = (unsigned long)((char *)name_ptr + name_len); + + write_extent_buffer(leaf, name, name_ptr, name_len); + write_extent_buffer(leaf, data, data_ptr, data_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + + return ret; +} + +/* + * insert a directory item in the tree, doing all the magic for + * both indexes. 'dir' indicates which objectid to insert it into, + * 'location' is the key to stuff into the directory item, 'type' is the + * type of the inode we're pointing to, and 'index' is the sequence number + * to use for the second index (if one is created). + */ +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, const char *name, int name_len, u64 dir, + struct btrfs_key *location, u8 type, u64 index) +{ + int ret = 0; + int ret2 = 0; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + struct btrfs_disk_key disk_key; + u32 data_size; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + + path = btrfs_alloc_path(); + path->leave_spinning = 1; + + data_size = sizeof(*dir_item) + name_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + if (IS_ERR(dir_item)) { + ret = PTR_ERR(dir_item); + if (ret == -EEXIST) + goto second_insert; + goto out; + } + + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_data_len(leaf, dir_item, 0); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + name_ptr = (unsigned long)(dir_item + 1); + + write_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + +second_insert: + /* FIXME, use some real flag for selecting the extra index */ + if (root == root->fs_info->tree_root) { + ret = 0; + goto out; + } + btrfs_release_path(root, path); + + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = index; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + if (IS_ERR(dir_item)) { + ret2 = PTR_ERR(dir_item); + goto out; + } + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_data_len(leaf, dir_item, 0); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + name_ptr = (unsigned long)(dir_item + 1); + write_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + if (ret) + return ret; + if (ret2) + return ret2; + return 0; +} + +/* + * lookup a directory item based on name. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + */ +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, int name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + + key.offset = btrfs_name_hash(name, name_len); + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + if (path->slots[0] == 0) + return NULL; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != dir || + btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY || + found_key.offset != key.offset) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +/* + * lookup a directory item based on index. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + * + * The name is used to make sure the index really points to the name you were + * looking for. + */ +struct btrfs_dir_item * +btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = objectid; + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return ERR_PTR(-ENOENT); + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +struct btrfs_dir_item * +btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const char *name, int name_len) +{ + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u32 nritems; + int ret; + + key.objectid = dirid; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ERR_PTR(ret); + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + + while (1) { + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) + break; + + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) + return di; + + path->slots[0]++; + } + return NULL; +} + +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + if (path->slots[0] == 0) + return NULL; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != dir || + btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY || + found_key.offset != key.offset) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +/* + * helper function to look at the directory item pointed to by 'path' + * this walks through all the entries in a dir item and finds one + * for a specific name. + */ +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len) +{ + struct btrfs_dir_item *dir_item; + unsigned long name_ptr; + u32 total_len; + u32 cur = 0; + u32 this_len; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + total_len = btrfs_item_size_nr(leaf, path->slots[0]); + while (cur < total_len) { + this_len = sizeof(*dir_item) + + btrfs_dir_name_len(leaf, dir_item) + + btrfs_dir_data_len(leaf, dir_item); + name_ptr = (unsigned long)(dir_item + 1); + + if (btrfs_dir_name_len(leaf, dir_item) == name_len && + memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) + return dir_item; + + cur += this_len; + dir_item = (struct btrfs_dir_item *)((char *)dir_item + + this_len); + } + return NULL; +} + +/* + * given a pointer into a directory item, delete it. This + * handles items that have more than one entry in them. + */ +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di) +{ + + struct extent_buffer *leaf; + u32 sub_item_len; + u32 item_len; + int ret = 0; + + leaf = path->nodes[0]; + sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di); + item_len = btrfs_item_size_nr(leaf, path->slots[0]); + if (sub_item_len == item_len) { + ret = btrfs_del_item(trans, root, path); + } else { + /* MARKER */ + unsigned long ptr = (unsigned long)di; + unsigned long start; + + start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_len - (ptr + sub_item_len - start)); + ret = btrfs_truncate_item(trans, root, path, + item_len - sub_item_len, 1); + } + return 0; +} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c new file mode 100644 index 00000000000..2b59201b955 --- /dev/null +++ b/fs/btrfs/disk-io.c @@ -0,0 +1,2614 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/blkdev.h> +#include <linux/scatterlist.h> +#include <linux/swap.h> +#include <linux/radix-tree.h> +#include <linux/writeback.h> +#include <linux/buffer_head.h> +#include <linux/workqueue.h> +#include <linux/kthread.h> +#include <linux/freezer.h> +#include <linux/crc32c.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "print-tree.h" +#include "async-thread.h" +#include "locking.h" +#include "tree-log.h" +#include "free-space-cache.h" + +static struct extent_io_ops btree_extent_io_ops; +static void end_workqueue_fn(struct btrfs_work *work); +static void free_fs_root(struct btrfs_root *root); + +static atomic_t btrfs_bdi_num = ATOMIC_INIT(0); + +/* + * end_io_wq structs are used to do processing in task context when an IO is + * complete. This is used during reads to verify checksums, and it is used + * by writes to insert metadata for new file extents after IO is complete. + */ +struct end_io_wq { + struct bio *bio; + bio_end_io_t *end_io; + void *private; + struct btrfs_fs_info *info; + int error; + int metadata; + struct list_head list; + struct btrfs_work work; +}; + +/* + * async submit bios are used to offload expensive checksumming + * onto the worker threads. They checksum file and metadata bios + * just before they are sent down the IO stack. + */ +struct async_submit_bio { + struct inode *inode; + struct bio *bio; + struct list_head list; + extent_submit_bio_hook_t *submit_bio_start; + extent_submit_bio_hook_t *submit_bio_done; + int rw; + int mirror_num; + unsigned long bio_flags; + struct btrfs_work work; +}; + +/* These are used to set the lockdep class on the extent buffer locks. + * The class is set by the readpage_end_io_hook after the buffer has + * passed csum validation but before the pages are unlocked. + * + * The lockdep class is also set by btrfs_init_new_buffer on freshly + * allocated blocks. + * + * The class is based on the level in the tree block, which allows lockdep + * to know that lower nodes nest inside the locks of higher nodes. + * + * We also add a check to make sure the highest level of the tree is + * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this + * code needs update as well. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# if BTRFS_MAX_LEVEL != 8 +# error +# endif +static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1]; +static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { + /* leaf */ + "btrfs-extent-00", + "btrfs-extent-01", + "btrfs-extent-02", + "btrfs-extent-03", + "btrfs-extent-04", + "btrfs-extent-05", + "btrfs-extent-06", + "btrfs-extent-07", + /* highest possible level */ + "btrfs-extent-08", +}; +#endif + +/* + * extents on the btree inode are pretty simple, there's one extent + * that covers the entire device + */ +static struct extent_map *btree_get_extent(struct inode *inode, + struct page *page, size_t page_offset, u64 start, u64 len, + int create) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + em->bdev = + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + read_unlock(&em_tree->lock); + goto out; + } + read_unlock(&em_tree->lock); + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } + em->start = 0; + em->len = (u64)-1; + em->block_len = (u64)-1; + em->block_start = 0; + em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + if (ret == -EEXIST) { + u64 failed_start = em->start; + u64 failed_len = em->len; + + free_extent_map(em); + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + ret = 0; + } else { + em = lookup_extent_mapping(em_tree, failed_start, + failed_len); + ret = -EIO; + } + } else if (ret) { + free_extent_map(em); + em = NULL; + } + write_unlock(&em_tree->lock); + + if (ret) + em = ERR_PTR(ret); +out: + return em; +} + +u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) +{ + return crc32c(seed, data, len); +} + +void btrfs_csum_final(u32 crc, char *result) +{ + *(__le32 *)result = ~cpu_to_le32(crc); +} + +/* + * compute the csum for a btree block, and either verify it or write it + * into the csum field of the block. + */ +static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, + int verify) +{ + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + char *result = NULL; + unsigned long len; + unsigned long cur_len; + unsigned long offset = BTRFS_CSUM_SIZE; + char *map_token = NULL; + char *kaddr; + unsigned long map_start; + unsigned long map_len; + int err; + u32 crc = ~(u32)0; + unsigned long inline_result; + + len = buf->len - offset; + while (len > 0) { + err = map_private_extent_buffer(buf, offset, 32, + &map_token, &kaddr, + &map_start, &map_len, KM_USER0); + if (err) + return 1; + cur_len = min(len, map_len - (offset - map_start)); + crc = btrfs_csum_data(root, kaddr + offset - map_start, + crc, cur_len); + len -= cur_len; + offset += cur_len; + unmap_extent_buffer(buf, map_token, KM_USER0); + } + if (csum_size > sizeof(inline_result)) { + result = kzalloc(csum_size * sizeof(char), GFP_NOFS); + if (!result) + return 1; + } else { + result = (char *)&inline_result; + } + + btrfs_csum_final(crc, result); + + if (verify) { + if (memcmp_extent_buffer(buf, result, 0, csum_size)) { + u32 val; + u32 found = 0; + memcpy(&found, result, csum_size); + + read_extent_buffer(buf, &val, 0, csum_size); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs: %s checksum verify " + "failed on %llu wanted %X found %X " + "level %d\n", + root->fs_info->sb->s_id, + (unsigned long long)buf->start, val, found, + btrfs_header_level(buf)); + } + if (result != (char *)&inline_result) + kfree(result); + return 1; + } + } else { + write_extent_buffer(buf, result, 0, csum_size); + } + if (result != (char *)&inline_result) + kfree(result); + return 0; +} + +/* + * we can't consider a given block up to date unless the transid of the + * block matches the transid in the parent node's pointer. This is how we + * detect blocks that either didn't get written at all or got written + * in the wrong place. + */ +static int verify_parent_transid(struct extent_io_tree *io_tree, + struct extent_buffer *eb, u64 parent_transid) +{ + int ret; + + if (!parent_transid || btrfs_header_generation(eb) == parent_transid) + return 0; + + lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); + if (extent_buffer_uptodate(io_tree, eb) && + btrfs_header_generation(eb) == parent_transid) { + ret = 0; + goto out; + } + if (printk_ratelimit()) { + printk("parent transid verify failed on %llu wanted %llu " + "found %llu\n", + (unsigned long long)eb->start, + (unsigned long long)parent_transid, + (unsigned long long)btrfs_header_generation(eb)); + } + ret = 1; + clear_extent_buffer_uptodate(io_tree, eb); +out: + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + return ret; +} + +/* + * helper to read a given tree block, doing retries as required when + * the checksums don't match and we have alternate mirrors to try. + */ +static int btree_read_extent_buffer_pages(struct btrfs_root *root, + struct extent_buffer *eb, + u64 start, u64 parent_transid) +{ + struct extent_io_tree *io_tree; + int ret; + int num_copies = 0; + int mirror_num = 0; + + io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + while (1) { + ret = read_extent_buffer_pages(io_tree, eb, start, 1, + btree_get_extent, mirror_num); + if (!ret && + !verify_parent_transid(io_tree, eb, parent_transid)) + return ret; + + num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, + eb->start, eb->len); + if (num_copies == 1) + return ret; + + mirror_num++; + if (mirror_num > num_copies) + return ret; + } + return -EIO; +} + +/* + * checksum a dirty tree block before IO. This has extra checks to make sure + * we only fill in the checksum field in the first page of a multi-page block + */ + +static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) +{ + struct extent_io_tree *tree; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 found_start; + int found_level; + unsigned long len; + struct extent_buffer *eb; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, + btrfs_header_generation(eb)); + BUG_ON(ret); + found_start = btrfs_header_bytenr(eb); + if (found_start != start) { + WARN_ON(1); + goto err; + } + if (eb->first_page != page) { + WARN_ON(1); + goto err; + } + if (!PageUptodate(page)) { + WARN_ON(1); + goto err; + } + found_level = btrfs_header_level(eb); + + csum_tree_block(root, eb, 0); +err: + free_extent_buffer(eb); +out: + return 0; +} + +static int check_tree_block_fsid(struct btrfs_root *root, + struct extent_buffer *eb) +{ + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + u8 fsid[BTRFS_UUID_SIZE]; + int ret = 1; + + read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb), + BTRFS_FSID_SIZE); + while (fs_devices) { + if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { + ret = 0; + break; + } + fs_devices = fs_devices->seed; + } + return ret; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) +{ + lockdep_set_class_and_name(&eb->lock, + &btrfs_eb_class[level], + btrfs_eb_name[level]); +} +#endif + +static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct extent_io_tree *tree; + u64 found_start; + int found_level; + unsigned long len; + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + int ret = 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + + found_start = btrfs_header_bytenr(eb); + if (found_start != start) { + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs bad tree block start " + "%llu %llu\n", + (unsigned long long)found_start, + (unsigned long long)eb->start); + } + ret = -EIO; + goto err; + } + if (eb->first_page != page) { + printk(KERN_INFO "btrfs bad first page %lu %lu\n", + eb->first_page->index, page->index); + WARN_ON(1); + ret = -EIO; + goto err; + } + if (check_tree_block_fsid(root, eb)) { + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs bad fsid on block %llu\n", + (unsigned long long)eb->start); + } + ret = -EIO; + goto err; + } + found_level = btrfs_header_level(eb); + + btrfs_set_buffer_lockdep_class(eb, found_level); + + ret = csum_tree_block(root, eb, 1); + if (ret) + ret = -EIO; + + end = min_t(u64, eb->len, PAGE_CACHE_SIZE); + end = eb->start + end - 1; +err: + free_extent_buffer(eb); +out: + return ret; +} + +static void end_workqueue_bio(struct bio *bio, int err) +{ + struct end_io_wq *end_io_wq = bio->bi_private; + struct btrfs_fs_info *fs_info; + + fs_info = end_io_wq->info; + end_io_wq->error = err; + end_io_wq->work.func = end_workqueue_fn; + end_io_wq->work.flags = 0; + + if (bio->bi_rw & (1 << BIO_RW)) { + if (end_io_wq->metadata) + btrfs_queue_worker(&fs_info->endio_meta_write_workers, + &end_io_wq->work); + else + btrfs_queue_worker(&fs_info->endio_write_workers, + &end_io_wq->work); + } else { + if (end_io_wq->metadata) + btrfs_queue_worker(&fs_info->endio_meta_workers, + &end_io_wq->work); + else + btrfs_queue_worker(&fs_info->endio_workers, + &end_io_wq->work); + } +} + +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + int metadata) +{ + struct end_io_wq *end_io_wq; + end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); + if (!end_io_wq) + return -ENOMEM; + + end_io_wq->private = bio->bi_private; + end_io_wq->end_io = bio->bi_end_io; + end_io_wq->info = info; + end_io_wq->error = 0; + end_io_wq->bio = bio; + end_io_wq->metadata = metadata; + + bio->bi_private = end_io_wq; + bio->bi_end_io = end_workqueue_bio; + return 0; +} + +unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) +{ + unsigned long limit = min_t(unsigned long, + info->workers.max_workers, + info->fs_devices->open_devices); + return 256 * limit; +} + +int btrfs_congested_async(struct btrfs_fs_info *info, int iodone) +{ + return atomic_read(&info->nr_async_bios) > + btrfs_async_submit_limit(info); +} + +static void run_one_async_start(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info; + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + fs_info = BTRFS_I(async->inode)->root->fs_info; + async->submit_bio_start(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags); +} + +static void run_one_async_done(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info; + struct async_submit_bio *async; + int limit; + + async = container_of(work, struct async_submit_bio, work); + fs_info = BTRFS_I(async->inode)->root->fs_info; + + limit = btrfs_async_submit_limit(fs_info); + limit = limit * 2 / 3; + + atomic_dec(&fs_info->nr_async_submits); + + if (atomic_read(&fs_info->nr_async_submits) < limit && + waitqueue_active(&fs_info->async_submit_wait)) + wake_up(&fs_info->async_submit_wait); + + async->submit_bio_done(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags); +} + +static void run_one_async_free(struct btrfs_work *work) +{ + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + kfree(async); +} + +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done) +{ + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->inode = inode; + async->rw = rw; + async->bio = bio; + async->mirror_num = mirror_num; + async->submit_bio_start = submit_bio_start; + async->submit_bio_done = submit_bio_done; + + async->work.func = run_one_async_start; + async->work.ordered_func = run_one_async_done; + async->work.ordered_free = run_one_async_free; + + async->work.flags = 0; + async->bio_flags = bio_flags; + + atomic_inc(&fs_info->nr_async_submits); + + if (rw & (1 << BIO_RW_SYNCIO)) + btrfs_set_work_high_prio(&async->work); + + btrfs_queue_worker(&fs_info->workers, &async->work); + + while (atomic_read(&fs_info->async_submit_draining) && + atomic_read(&fs_info->nr_async_submits)) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) == 0)); + } + + return 0; +} + +static int btree_csum_one_bio(struct bio *bio) +{ + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + struct btrfs_root *root; + + WARN_ON(bio->bi_vcnt <= 0); + while (bio_index < bio->bi_vcnt) { + root = BTRFS_I(bvec->bv_page->mapping->host)->root; + csum_dirty_buffer(root, bvec->bv_page); + bio_index++; + bvec++; + } + return 0; +} + +static int __btree_submit_bio_start(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + /* + * when we're called for a write, we're already in the async + * submission context. Just jump into btrfs_map_bio + */ + btree_csum_one_bio(bio); + return 0; +} + +static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + /* + * when we're called for a write, we're already in the async + * submission context. Just jump into btrfs_map_bio + */ + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); +} + +static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + int ret; + + ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, + bio, 1); + BUG_ON(ret); + + if (!(rw & (1 << BIO_RW))) { + /* + * called for a read, do the setup so that checksum validation + * can happen in the async kernel threads + */ + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); + } + + /* + * kthread helpers are used to submit writes so that checksumming + * can happen in parallel across all CPUs + */ + return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, 0, + __btree_submit_bio_start, + __btree_submit_bio_done); +} + +static int btree_writepage(struct page *page, struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct extent_buffer *eb; + int was_dirty; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (!(current->flags & PF_MEMALLOC)) { + return extent_write_full_page(tree, page, + btree_get_extent, wbc); + } + + redirty_page_for_writepage(wbc, page); + eb = btrfs_find_tree_block(root, page_offset(page), + PAGE_CACHE_SIZE); + WARN_ON(!eb); + + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + if (!was_dirty) { + spin_lock(&root->fs_info->delalloc_lock); + root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE; + spin_unlock(&root->fs_info->delalloc_lock); + } + free_extent_buffer(eb); + + unlock_page(page); + return 0; +} + +static int btree_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; + if (wbc->sync_mode == WB_SYNC_NONE) { + struct btrfs_root *root = BTRFS_I(mapping->host)->root; + u64 num_dirty; + unsigned long thresh = 32 * 1024 * 1024; + + if (wbc->for_kupdate) + return 0; + + /* this is a bit racy, but that's ok */ + num_dirty = root->fs_info->dirty_metadata_bytes; + if (num_dirty < thresh) + return 0; + } + return extent_writepages(tree, mapping, btree_get_extent, wbc); +} + +static int btree_readpage(struct file *file, struct page *page) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_read_full_page(tree, page, btree_get_extent); +} + +static int btree_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *map; + int ret; + + if (PageWriteback(page) || PageDirty(page)) + return 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + + ret = try_release_extent_state(map, tree, page, gfp_flags); + if (!ret) + return 0; + + ret = try_release_extent_buffer(tree, page); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } + + return ret; +} + +static void btree_invalidatepage(struct page *page, unsigned long offset) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + extent_invalidatepage(tree, page, offset); + btree_releasepage(page, GFP_NOFS); + if (PagePrivate(page)) { + printk(KERN_WARNING "btrfs warning page private not zero " + "on page %llu\n", (unsigned long long)page_offset(page)); + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } +} + +static const struct address_space_operations btree_aops = { + .readpage = btree_readpage, + .writepage = btree_writepage, + .writepages = btree_writepages, + .releasepage = btree_releasepage, + .invalidatepage = btree_invalidatepage, + .sync_page = block_sync_page, +}; + +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, + u64 parent_transid) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + int ret = 0; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return 0; + read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, + buf, 0, 0, btree_get_extent, 0); + free_extent_buffer(buf); + return ret; +} + +struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, + bytenr, blocksize, GFP_NOFS); + return eb; +} + +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + + eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, + bytenr, blocksize, NULL, GFP_NOFS); + return eb; +} + + +int btrfs_write_tree_block(struct extent_buffer *buf) +{ + return filemap_fdatawrite_range(buf->first_page->mapping, buf->start, + buf->start + buf->len - 1); +} + +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +{ + return filemap_fdatawait_range(buf->first_page->mapping, + buf->start, buf->start + buf->len - 1); +} + +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_io_tree *io_tree; + int ret; + + io_tree = &BTRFS_I(btree_inode)->io_tree; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return NULL; + + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + + if (ret == 0) + set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); + return buf; + +} + +int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + if (btrfs_header_generation(buf) == + root->fs_info->running_transaction->transid) { + btrfs_assert_tree_locked(buf); + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (root->fs_info->dirty_metadata_bytes >= buf->len) + root->fs_info->dirty_metadata_bytes -= buf->len; + else + WARN_ON(1); + spin_unlock(&root->fs_info->delalloc_lock); + } + + /* ugh, clear_extent_buffer_dirty needs to lock the page */ + btrfs_set_lock_blocking(buf); + clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, + buf); + } + return 0; +} + +static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + u32 stripesize, struct btrfs_root *root, + struct btrfs_fs_info *fs_info, + u64 objectid) +{ + root->node = NULL; + root->commit_root = NULL; + root->sectorsize = sectorsize; + root->nodesize = nodesize; + root->leafsize = leafsize; + root->stripesize = stripesize; + root->ref_cows = 0; + root->track_dirty = 0; + root->in_radix = 0; + root->clean_orphans = 0; + + root->fs_info = fs_info; + root->objectid = objectid; + root->last_trans = 0; + root->highest_objectid = 0; + root->name = NULL; + root->in_sysfs = 0; + root->inode_tree.rb_node = NULL; + + INIT_LIST_HEAD(&root->dirty_list); + INIT_LIST_HEAD(&root->orphan_list); + INIT_LIST_HEAD(&root->root_list); + spin_lock_init(&root->node_lock); + spin_lock_init(&root->list_lock); + spin_lock_init(&root->inode_lock); + mutex_init(&root->objectid_mutex); + mutex_init(&root->log_mutex); + init_waitqueue_head(&root->log_writer_wait); + init_waitqueue_head(&root->log_commit_wait[0]); + init_waitqueue_head(&root->log_commit_wait[1]); + atomic_set(&root->log_commit[0], 0); + atomic_set(&root->log_commit[1], 0); + atomic_set(&root->log_writers, 0); + root->log_batch = 0; + root->log_transid = 0; + root->last_log_commit = 0; + extent_io_tree_init(&root->dirty_log_pages, + fs_info->btree_inode->i_mapping, GFP_NOFS); + + memset(&root->root_key, 0, sizeof(root->root_key)); + memset(&root->root_item, 0, sizeof(root->root_item)); + memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); + memset(&root->root_kobj, 0, sizeof(root->root_kobj)); + root->defrag_trans_start = fs_info->generation; + init_completion(&root->kobj_unregister); + root->defrag_running = 0; + root->root_key.objectid = objectid; + root->anon_super.s_root = NULL; + root->anon_super.s_dev = 0; + INIT_LIST_HEAD(&root->anon_super.s_list); + INIT_LIST_HEAD(&root->anon_super.s_instances); + init_rwsem(&root->anon_super.s_umount); + + return 0; +} + +static int find_and_setup_root(struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info, + u64 objectid, + struct btrfs_root *root) +{ + int ret; + u32 blocksize; + u64 generation; + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, objectid); + ret = btrfs_find_last_root(tree_root, objectid, + &root->root_item, &root->root_key); + if (ret > 0) + return -ENOENT; + BUG_ON(ret); + + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + BUG_ON(!root->node); + root->commit_root = btrfs_root_node(root); + return 0; +} + +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct extent_buffer *eb; + struct btrfs_root *log_root_tree = fs_info->log_root_tree; + u64 start = 0; + u64 end = 0; + int ret; + + if (!log_root_tree) + return 0; + + while (1) { + ret = find_first_extent_bit(&log_root_tree->dirty_log_pages, + 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); + if (ret) + break; + + clear_extent_bits(&log_root_tree->dirty_log_pages, start, end, + EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); + } + eb = fs_info->log_root_tree->node; + + WARN_ON(btrfs_header_level(eb) != 0); + WARN_ON(btrfs_header_nritems(eb) != 0); + + ret = btrfs_free_reserved_extent(fs_info->tree_root, + eb->start, eb->len); + BUG_ON(ret); + + free_extent_buffer(eb); + kfree(fs_info->log_root_tree); + fs_info->log_root_tree = NULL; + return 0; +} + +static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct btrfs_root *tree_root = fs_info->tree_root; + struct extent_buffer *leaf; + + root = kzalloc(sizeof(*root), GFP_NOFS); + if (!root) + return ERR_PTR(-ENOMEM); + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, BTRFS_TREE_LOG_OBJECTID); + + root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + /* + * log trees do not get reference counted because they go away + * before a real commit is actually done. They do store pointers + * to file data extents, and those reference counts still get + * updated (along with back refs to the log tree). + */ + root->ref_cows = 0; + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); + if (IS_ERR(leaf)) { + kfree(root); + return ERR_CAST(leaf); + } + + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); + root->node = leaf; + + write_extent_buffer(root->node, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(root->node), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(root->node); + btrfs_tree_unlock(root->node); + return root; +} + +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *log_root; + + log_root = alloc_log_tree(trans, fs_info); + if (IS_ERR(log_root)) + return PTR_ERR(log_root); + WARN_ON(fs_info->log_root_tree); + fs_info->log_root_tree = log_root; + return 0; +} + +int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *log_root; + struct btrfs_inode_item *inode_item; + + log_root = alloc_log_tree(trans, root->fs_info); + if (IS_ERR(log_root)) + return PTR_ERR(log_root); + + log_root->last_trans = trans->transid; + log_root->root_key.offset = root->root_key.objectid; + + inode_item = &log_root->root_item.inode; + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nbytes = cpu_to_le64(root->leafsize); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + btrfs_set_root_node(&log_root->root_item, log_root->node); + + WARN_ON(root->log_root); + root->log_root = log_root; + root->log_transid = 0; + root->last_log_commit = 0; + return 0; +} + +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_key *location) +{ + struct btrfs_root *root; + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_path *path; + struct extent_buffer *l; + u64 generation; + u32 blocksize; + int ret = 0; + + root = kzalloc(sizeof(*root), GFP_NOFS); + if (!root) + return ERR_PTR(-ENOMEM); + if (location->offset == (u64)-1) { + ret = find_and_setup_root(tree_root, fs_info, + location->objectid, root); + if (ret) { + kfree(root); + return ERR_PTR(ret); + } + goto out; + } + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, location->objectid); + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); + if (ret == 0) { + l = path->nodes[0]; + read_extent_buffer(l, &root->root_item, + btrfs_item_ptr_offset(l, path->slots[0]), + sizeof(root->root_item)); + memcpy(&root->root_key, location, sizeof(*location)); + } + btrfs_free_path(path); + if (ret) { + if (ret > 0) + ret = -ENOENT; + return ERR_PTR(ret); + } + + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + root->commit_root = btrfs_root_node(root); + BUG_ON(!root->node); +out: + if (location->objectid != BTRFS_TREE_LOG_OBJECTID) + root->ref_cows = 1; + + return root; +} + +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid) +{ + struct btrfs_root *root; + + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID) + return fs_info->tree_root; + if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID) + return fs_info->extent_root; + + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)root_objectid); + return root; +} + +struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location) +{ + struct btrfs_root *root; + int ret; + + if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) + return fs_info->tree_root; + if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) + return fs_info->extent_root; + if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) + return fs_info->chunk_root; + if (location->objectid == BTRFS_DEV_TREE_OBJECTID) + return fs_info->dev_root; + if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) + return fs_info->csum_root; +again: + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)location->objectid); + spin_unlock(&fs_info->fs_roots_radix_lock); + if (root) + return root; + + ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); + if (ret == 0) + ret = -ENOENT; + if (ret < 0) + return ERR_PTR(ret); + + root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); + if (IS_ERR(root)) + return root; + + WARN_ON(btrfs_root_refs(&root->root_item) == 0); + set_anon_super(&root->anon_super, NULL); + + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + goto fail; + + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); + if (ret == 0) { + root->in_radix = 1; + root->clean_orphans = 1; + } + spin_unlock(&fs_info->fs_roots_radix_lock); + radix_tree_preload_end(); + if (ret) { + if (ret == -EEXIST) { + free_fs_root(root); + goto again; + } + goto fail; + } + + ret = btrfs_find_dead_roots(fs_info->tree_root, + root->root_key.objectid); + WARN_ON(ret); + return root; +fail: + free_fs_root(root); + return ERR_PTR(ret); +} + +struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + const char *name, int namelen) +{ + return btrfs_read_fs_root_no_name(fs_info, location); +#if 0 + struct btrfs_root *root; + int ret; + + root = btrfs_read_fs_root_no_name(fs_info, location); + if (!root) + return NULL; + + if (root->in_sysfs) + return root; + + ret = btrfs_set_root_name(root, name, namelen); + if (ret) { + free_extent_buffer(root->node); + kfree(root); + return ERR_PTR(ret); + } + + ret = btrfs_sysfs_add_root(root); + if (ret) { + free_extent_buffer(root->node); + kfree(root->name); + kfree(root); + return ERR_PTR(ret); + } + root->in_sysfs = 1; + return root; +#endif +} + +static int btrfs_congested_fn(void *congested_data, int bdi_bits) +{ + struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; + int ret = 0; + struct btrfs_device *device; + struct backing_dev_info *bdi; + + list_for_each_entry(device, &info->fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi && bdi_congested(bdi, bdi_bits)) { + ret = 1; + break; + } + } + return ret; +} + +/* + * this unplugs every device on the box, and it is only used when page + * is null + */ +static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ + struct btrfs_device *device; + struct btrfs_fs_info *info; + + info = (struct btrfs_fs_info *)bdi->unplug_io_data; + list_for_each_entry(device, &info->fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi->unplug_io_fn) + bdi->unplug_io_fn(bdi, page); + } +} + +static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ + struct inode *inode; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct address_space *mapping; + u64 offset; + + /* the generic O_DIRECT read code does this */ + if (1 || !page) { + __unplug_io_fn(bdi, page); + return; + } + + /* + * page->mapping may change at any time. Get a consistent copy + * and use that for everything below + */ + smp_mb(); + mapping = page->mapping; + if (!mapping) + return; + + inode = mapping->host; + + /* + * don't do the expensive searching for a small number of + * devices + */ + if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) { + __unplug_io_fn(bdi, page); + return; + } + + offset = page_offset(page); + + em_tree = &BTRFS_I(inode)->extent_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); + read_unlock(&em_tree->lock); + if (!em) { + __unplug_io_fn(bdi, page); + return; + } + + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + __unplug_io_fn(bdi, page); + return; + } + offset = offset - em->start; + btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree, + em->block_start + offset, page); + free_extent_map(em); +} + +/* + * If this fails, caller must call bdi_destroy() to get rid of the + * bdi again. + */ +static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) +{ + int err; + + bdi->name = "btrfs"; + bdi->capabilities = BDI_CAP_MAP_COPY; + err = bdi_init(bdi); + if (err) + return err; + + err = bdi_register(bdi, NULL, "btrfs-%d", + atomic_inc_return(&btrfs_bdi_num)); + if (err) { + bdi_destroy(bdi); + return err; + } + + bdi->ra_pages = default_backing_dev_info.ra_pages; + bdi->unplug_io_fn = btrfs_unplug_io_fn; + bdi->unplug_io_data = info; + bdi->congested_fn = btrfs_congested_fn; + bdi->congested_data = info; + return 0; +} + +static int bio_ready_for_csum(struct bio *bio) +{ + u64 length = 0; + u64 buf_len = 0; + u64 start = 0; + struct page *page; + struct extent_io_tree *io_tree = NULL; + struct btrfs_fs_info *info = NULL; + struct bio_vec *bvec; + int i; + int ret; + + bio_for_each_segment(bvec, bio, i) { + page = bvec->bv_page; + if (page->private == EXTENT_PAGE_PRIVATE) { + length += bvec->bv_len; + continue; + } + if (!page->private) { + length += bvec->bv_len; + continue; + } + length = bvec->bv_len; + buf_len = page->private >> 2; + start = page_offset(page) + bvec->bv_offset; + io_tree = &BTRFS_I(page->mapping->host)->io_tree; + info = BTRFS_I(page->mapping->host)->root->fs_info; + } + /* are we fully contained in this bio? */ + if (buf_len <= length) + return 1; + + ret = extent_range_uptodate(io_tree, start + length, + start + buf_len - 1); + return ret; +} + +/* + * called by the kthread helper functions to finally call the bio end_io + * functions. This is where read checksum verification actually happens + */ +static void end_workqueue_fn(struct btrfs_work *work) +{ + struct bio *bio; + struct end_io_wq *end_io_wq; + struct btrfs_fs_info *fs_info; + int error; + + end_io_wq = container_of(work, struct end_io_wq, work); + bio = end_io_wq->bio; + fs_info = end_io_wq->info; + + /* metadata bio reads are special because the whole tree block must + * be checksummed at once. This makes sure the entire block is in + * ram and up to date before trying to verify things. For + * blocksize <= pagesize, it is basically a noop + */ + if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata && + !bio_ready_for_csum(bio)) { + btrfs_queue_worker(&fs_info->endio_meta_workers, + &end_io_wq->work); + return; + } + error = end_io_wq->error; + bio->bi_private = end_io_wq->private; + bio->bi_end_io = end_io_wq->end_io; + kfree(end_io_wq); + bio_endio(bio, error); +} + +static int cleaner_kthread(void *arg) +{ + struct btrfs_root *root = arg; + + do { + smp_mb(); + if (root->fs_info->closing) + break; + + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + + if (!(root->fs_info->sb->s_flags & MS_RDONLY) && + mutex_trylock(&root->fs_info->cleaner_mutex)) { + btrfs_run_delayed_iputs(root); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + } + + if (freezing(current)) { + refrigerator(); + } else { + smp_mb(); + if (root->fs_info->closing) + break; + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +static int transaction_kthread(void *arg) +{ + struct btrfs_root *root = arg; + struct btrfs_trans_handle *trans; + struct btrfs_transaction *cur; + unsigned long now; + unsigned long delay; + int ret; + + do { + smp_mb(); + if (root->fs_info->closing) + break; + + delay = HZ * 30; + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + mutex_lock(&root->fs_info->trans_mutex); + cur = root->fs_info->running_transaction; + if (!cur) { + mutex_unlock(&root->fs_info->trans_mutex); + goto sleep; + } + + now = get_seconds(); + if (now < cur->start_time || now - cur->start_time < 30) { + mutex_unlock(&root->fs_info->trans_mutex); + delay = HZ * 5; + goto sleep; + } + mutex_unlock(&root->fs_info->trans_mutex); + trans = btrfs_start_transaction(root, 1); + ret = btrfs_commit_transaction(trans, root); + +sleep: + wake_up_process(root->fs_info->cleaner_kthread); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + + if (freezing(current)) { + refrigerator(); + } else { + if (root->fs_info->closing) + break; + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(delay); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options) +{ + u32 sectorsize; + u32 nodesize; + u32 leafsize; + u32 blocksize; + u32 stripesize; + u64 generation; + u64 features; + struct btrfs_key location; + struct buffer_head *bh; + struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), + GFP_NOFS); + struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *log_tree_root; + + int ret; + int err = -EINVAL; + + struct btrfs_super_block *disk_super; + + if (!extent_root || !tree_root || !fs_info || + !chunk_root || !dev_root || !csum_root) { + err = -ENOMEM; + goto fail; + } + + ret = init_srcu_struct(&fs_info->subvol_srcu); + if (ret) { + err = ret; + goto fail; + } + + ret = setup_bdi(fs_info, &fs_info->bdi); + if (ret) { + err = ret; + goto fail_srcu; + } + + fs_info->btree_inode = new_inode(sb); + if (!fs_info->btree_inode) { + err = -ENOMEM; + goto fail_bdi; + } + + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_LIST_HEAD(&fs_info->trans_list); + INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->delayed_iputs); + INIT_LIST_HEAD(&fs_info->hashers); + INIT_LIST_HEAD(&fs_info->delalloc_inodes); + INIT_LIST_HEAD(&fs_info->ordered_operations); + INIT_LIST_HEAD(&fs_info->caching_block_groups); + spin_lock_init(&fs_info->delalloc_lock); + spin_lock_init(&fs_info->new_trans_lock); + spin_lock_init(&fs_info->ref_cache_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->delayed_iput_lock); + + init_completion(&fs_info->kobj_unregister); + fs_info->tree_root = tree_root; + fs_info->extent_root = extent_root; + fs_info->csum_root = csum_root; + fs_info->chunk_root = chunk_root; + fs_info->dev_root = dev_root; + fs_info->fs_devices = fs_devices; + INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); + INIT_LIST_HEAD(&fs_info->space_info); + btrfs_mapping_init(&fs_info->mapping_tree); + atomic_set(&fs_info->nr_async_submits, 0); + atomic_set(&fs_info->async_delalloc_pages, 0); + atomic_set(&fs_info->async_submit_draining, 0); + atomic_set(&fs_info->nr_async_bios, 0); + fs_info->sb = sb; + fs_info->max_extent = (u64)-1; + fs_info->max_inline = 8192 * 1024; + fs_info->metadata_ratio = 0; + + fs_info->thread_pool_size = min_t(unsigned long, + num_online_cpus() + 2, 8); + + INIT_LIST_HEAD(&fs_info->ordered_extents); + spin_lock_init(&fs_info->ordered_extent_lock); + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = blksize_bits(4096); + sb->s_bdi = &fs_info->bdi; + + fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; + fs_info->btree_inode->i_nlink = 1; + /* + * we set the i_size on the btree inode to the max possible int. + * the real end of the address space is determined by all of + * the devices in the system + */ + fs_info->btree_inode->i_size = OFFSET_MAX; + fs_info->btree_inode->i_mapping->a_ops = &btree_aops; + fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; + + RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); + extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, + fs_info->btree_inode->i_mapping, + GFP_NOFS); + extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree, + GFP_NOFS); + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + + BTRFS_I(fs_info->btree_inode)->root = tree_root; + memset(&BTRFS_I(fs_info->btree_inode)->location, 0, + sizeof(struct btrfs_key)); + BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; + insert_inode_hash(fs_info->btree_inode); + + spin_lock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree.rb_node = NULL; + + extent_io_tree_init(&fs_info->freed_extents[0], + fs_info->btree_inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&fs_info->freed_extents[1], + fs_info->btree_inode->i_mapping, GFP_NOFS); + fs_info->pinned_extents = &fs_info->freed_extents[0]; + fs_info->do_barriers = 1; + + + mutex_init(&fs_info->trans_mutex); + mutex_init(&fs_info->ordered_operations_mutex); + mutex_init(&fs_info->tree_log_mutex); + mutex_init(&fs_info->chunk_mutex); + mutex_init(&fs_info->transaction_kthread_mutex); + mutex_init(&fs_info->cleaner_mutex); + mutex_init(&fs_info->volume_mutex); + init_rwsem(&fs_info->extent_commit_sem); + init_rwsem(&fs_info->cleanup_work_sem); + init_rwsem(&fs_info->subvol_sem); + + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); + btrfs_init_free_cluster(&fs_info->data_alloc_cluster); + + init_waitqueue_head(&fs_info->transaction_throttle); + init_waitqueue_head(&fs_info->transaction_wait); + init_waitqueue_head(&fs_info->async_submit_wait); + + __setup_root(4096, 4096, 4096, 4096, tree_root, + fs_info, BTRFS_ROOT_TREE_OBJECTID); + + + bh = btrfs_read_dev_super(fs_devices->latest_bdev); + if (!bh) + goto fail_iput; + + memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); + memcpy(&fs_info->super_for_commit, &fs_info->super_copy, + sizeof(fs_info->super_for_commit)); + brelse(bh); + + memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); + + disk_super = &fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + goto fail_iput; + + ret = btrfs_parse_options(tree_root, options); + if (ret) { + err = ret; + goto fail_iput; + } + + features = btrfs_super_incompat_flags(disk_super) & + ~BTRFS_FEATURE_INCOMPAT_SUPP; + if (features) { + printk(KERN_ERR "BTRFS: couldn't mount because of " + "unsupported optional features (%Lx).\n", + (unsigned long long)features); + err = -EINVAL; + goto fail_iput; + } + + features = btrfs_super_incompat_flags(disk_super); + if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { + features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; + btrfs_set_super_incompat_flags(disk_super, features); + } + + features = btrfs_super_compat_ro_flags(disk_super) & + ~BTRFS_FEATURE_COMPAT_RO_SUPP; + if (!(sb->s_flags & MS_RDONLY) && features) { + printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " + "unsupported option features (%Lx).\n", + (unsigned long long)features); + err = -EINVAL; + goto fail_iput; + } + + btrfs_init_workers(&fs_info->generic_worker, + "genwork", 1, NULL); + + btrfs_init_workers(&fs_info->workers, "worker", + fs_info->thread_pool_size, + &fs_info->generic_worker); + + btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", + fs_info->thread_pool_size, + &fs_info->generic_worker); + + btrfs_init_workers(&fs_info->submit_workers, "submit", + min_t(u64, fs_devices->num_devices, + fs_info->thread_pool_size), + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->enospc_workers, "enospc", + fs_info->thread_pool_size, + &fs_info->generic_worker); + + /* a higher idle thresh on the submit workers makes it much more + * likely that bios will be send down in a sane order to the + * devices + */ + fs_info->submit_workers.idle_thresh = 64; + + fs_info->workers.idle_thresh = 16; + fs_info->workers.ordered = 1; + + fs_info->delalloc_workers.idle_thresh = 2; + fs_info->delalloc_workers.ordered = 1; + + btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_workers, "endio", + fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", + fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_meta_write_workers, + "endio-meta-write", fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", + fs_info->thread_pool_size, + &fs_info->generic_worker); + + /* + * endios are largely parallel and should have a very + * low idle thresh + */ + fs_info->endio_workers.idle_thresh = 4; + fs_info->endio_meta_workers.idle_thresh = 4; + + fs_info->endio_write_workers.idle_thresh = 2; + fs_info->endio_meta_write_workers.idle_thresh = 2; + + btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->generic_worker, 1); + btrfs_start_workers(&fs_info->submit_workers, 1); + btrfs_start_workers(&fs_info->delalloc_workers, 1); + btrfs_start_workers(&fs_info->fixup_workers, 1); + btrfs_start_workers(&fs_info->endio_workers, 1); + btrfs_start_workers(&fs_info->endio_meta_workers, 1); + btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); + btrfs_start_workers(&fs_info->endio_write_workers, 1); + btrfs_start_workers(&fs_info->enospc_workers, 1); + + fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); + fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, + 4 * 1024 * 1024 / PAGE_CACHE_SIZE); + + nodesize = btrfs_super_nodesize(disk_super); + leafsize = btrfs_super_leafsize(disk_super); + sectorsize = btrfs_super_sectorsize(disk_super); + stripesize = btrfs_super_stripesize(disk_super); + tree_root->nodesize = nodesize; + tree_root->leafsize = leafsize; + tree_root->sectorsize = sectorsize; + tree_root->stripesize = stripesize; + + sb->s_blocksize = sectorsize; + sb->s_blocksize_bits = blksize_bits(sectorsize); + + if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, + sizeof(disk_super->magic))) { + printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); + goto fail_sb_buffer; + } + + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_sys_array(tree_root); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk(KERN_WARNING "btrfs: failed to read the system " + "array on %s\n", sb->s_id); + goto fail_sb_buffer; + } + + blocksize = btrfs_level_size(tree_root, + btrfs_super_chunk_root_level(disk_super)); + generation = btrfs_super_chunk_root_generation(disk_super); + + __setup_root(nodesize, leafsize, sectorsize, stripesize, + chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); + + chunk_root->node = read_tree_block(chunk_root, + btrfs_super_chunk_root(disk_super), + blocksize, generation); + BUG_ON(!chunk_root->node); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { + printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", + sb->s_id); + goto fail_chunk_root; + } + btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); + chunk_root->commit_root = btrfs_root_node(chunk_root); + + read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), + BTRFS_UUID_SIZE); + + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_chunk_tree(chunk_root); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", + sb->s_id); + goto fail_chunk_root; + } + + btrfs_close_extra_devices(fs_devices); + + blocksize = btrfs_level_size(tree_root, + btrfs_super_root_level(disk_super)); + generation = btrfs_super_generation(disk_super); + + tree_root->node = read_tree_block(tree_root, + btrfs_super_root(disk_super), + blocksize, generation); + if (!tree_root->node) + goto fail_chunk_root; + if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", + sb->s_id); + goto fail_tree_root; + } + btrfs_set_root_node(&tree_root->root_item, tree_root->node); + tree_root->commit_root = btrfs_root_node(tree_root); + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_EXTENT_TREE_OBJECTID, extent_root); + if (ret) + goto fail_tree_root; + extent_root->track_dirty = 1; + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_DEV_TREE_OBJECTID, dev_root); + if (ret) + goto fail_extent_root; + dev_root->track_dirty = 1; + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_CSUM_TREE_OBJECTID, csum_root); + if (ret) + goto fail_dev_root; + + csum_root->track_dirty = 1; + + btrfs_read_block_groups(extent_root); + + fs_info->generation = generation; + fs_info->last_trans_committed = generation; + fs_info->data_alloc_profile = (u64)-1; + fs_info->metadata_alloc_profile = (u64)-1; + fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, + "btrfs-cleaner"); + if (IS_ERR(fs_info->cleaner_kthread)) + goto fail_csum_root; + + fs_info->transaction_kthread = kthread_run(transaction_kthread, + tree_root, + "btrfs-transaction"); + if (IS_ERR(fs_info->transaction_kthread)) + goto fail_cleaner; + + if (!btrfs_test_opt(tree_root, SSD) && + !btrfs_test_opt(tree_root, NOSSD) && + !fs_info->fs_devices->rotating) { + printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD " + "mode\n"); + btrfs_set_opt(fs_info->mount_opt, SSD); + } + + if (btrfs_super_log_root(disk_super) != 0) { + u64 bytenr = btrfs_super_log_root(disk_super); + + if (fs_devices->rw_devices == 0) { + printk(KERN_WARNING "Btrfs log replay required " + "on RO media\n"); + err = -EIO; + goto fail_trans_kthread; + } + blocksize = + btrfs_level_size(tree_root, + btrfs_super_log_root_level(disk_super)); + + log_tree_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + + __setup_root(nodesize, leafsize, sectorsize, stripesize, + log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); + + log_tree_root->node = read_tree_block(tree_root, bytenr, + blocksize, + generation + 1); + ret = btrfs_recover_log_trees(log_tree_root); + BUG_ON(ret); + + if (sb->s_flags & MS_RDONLY) { + ret = btrfs_commit_super(tree_root); + BUG_ON(ret); + } + } + + ret = btrfs_find_orphan_roots(tree_root); + BUG_ON(ret); + + if (!(sb->s_flags & MS_RDONLY)) { + ret = btrfs_recover_relocation(tree_root); + if (ret < 0) { + printk(KERN_WARNING + "btrfs: failed to recover relocation\n"); + err = -EINVAL; + goto fail_trans_kthread; + } + } + + location.objectid = BTRFS_FS_TREE_OBJECTID; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = (u64)-1; + + fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); + if (!fs_info->fs_root) + goto fail_trans_kthread; + + if (!(sb->s_flags & MS_RDONLY)) { + down_read(&fs_info->cleanup_work_sem); + btrfs_orphan_cleanup(fs_info->fs_root); + up_read(&fs_info->cleanup_work_sem); + } + + return tree_root; + +fail_trans_kthread: + kthread_stop(fs_info->transaction_kthread); +fail_cleaner: + kthread_stop(fs_info->cleaner_kthread); + + /* + * make sure we're done with the btree inode before we stop our + * kthreads + */ + filemap_write_and_wait(fs_info->btree_inode->i_mapping); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + +fail_csum_root: + free_extent_buffer(csum_root->node); + free_extent_buffer(csum_root->commit_root); +fail_dev_root: + free_extent_buffer(dev_root->node); + free_extent_buffer(dev_root->commit_root); +fail_extent_root: + free_extent_buffer(extent_root->node); + free_extent_buffer(extent_root->commit_root); +fail_tree_root: + free_extent_buffer(tree_root->node); + free_extent_buffer(tree_root->commit_root); +fail_chunk_root: + free_extent_buffer(chunk_root->node); + free_extent_buffer(chunk_root->commit_root); +fail_sb_buffer: + btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->enospc_workers); +fail_iput: + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + iput(fs_info->btree_inode); + + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); +fail_bdi: + bdi_destroy(&fs_info->bdi); +fail_srcu: + cleanup_srcu_struct(&fs_info->subvol_srcu); +fail: + kfree(extent_root); + kfree(tree_root); + kfree(fs_info); + kfree(chunk_root); + kfree(dev_root); + kfree(csum_root); + return ERR_PTR(err); +} + +static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (uptodate) { + set_buffer_uptodate(bh); + } else { + if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { + printk(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + } + /* note, we dont' set_buffer_write_io_error because we have + * our own ways of dealing with the IO errors + */ + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) +{ + struct buffer_head *bh; + struct buffer_head *latest = NULL; + struct btrfs_super_block *super; + int i; + u64 transid = 0; + u64 bytenr; + + /* we would like to check all the supers, but that would make + * a btrfs mount succeed after a mkfs from a different FS. + * So, we need to add a special mount option to scan for + * later supers, using BTRFS_SUPER_MIRROR_MAX instead + */ + for (i = 0; i < 1; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + 4096 >= i_size_read(bdev->bd_inode)) + break; + bh = __bread(bdev, bytenr / 4096, 4096); + if (!bh) + continue; + + super = (struct btrfs_super_block *)bh->b_data; + if (btrfs_super_bytenr(super) != bytenr || + strncmp((char *)(&super->magic), BTRFS_MAGIC, + sizeof(super->magic))) { + brelse(bh); + continue; + } + + if (!latest || btrfs_super_generation(super) > transid) { + brelse(latest); + latest = bh; + transid = btrfs_super_generation(super); + } else { + brelse(bh); + } + } + return latest; +} + +/* + * this should be called twice, once with wait == 0 and + * once with wait == 1. When wait == 0 is done, all the buffer heads + * we write are pinned. + * + * They are released when wait == 1 is done. + * max_mirrors must be the same for both runs, and it indicates how + * many supers on this one device should be written. + * + * max_mirrors == 0 means to write them all. + */ +static int write_dev_supers(struct btrfs_device *device, + struct btrfs_super_block *sb, + int do_barriers, int wait, int max_mirrors) +{ + struct buffer_head *bh; + int i; + int ret; + int errors = 0; + u32 crc; + u64 bytenr; + int last_barrier = 0; + + if (max_mirrors == 0) + max_mirrors = BTRFS_SUPER_MIRROR_MAX; + + /* make sure only the last submit_bh does a barrier */ + if (do_barriers) { + for (i = 0; i < max_mirrors; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + device->total_bytes) + break; + last_barrier = i; + } + } + + for (i = 0; i < max_mirrors; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) + break; + + if (wait) { + bh = __find_get_block(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + BUG_ON(!bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + errors++; + + /* drop our reference */ + brelse(bh); + + /* drop the reference from the wait == 0 run */ + brelse(bh); + continue; + } else { + btrfs_set_super_bytenr(sb, bytenr); + + crc = ~(u32)0; + crc = btrfs_csum_data(NULL, (char *)sb + + BTRFS_CSUM_SIZE, crc, + BTRFS_SUPER_INFO_SIZE - + BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, sb->csum); + + /* + * one reference for us, and we leave it for the + * caller + */ + bh = __getblk(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); + + /* one reference for submit_bh */ + get_bh(bh); + + set_buffer_uptodate(bh); + lock_buffer(bh); + bh->b_end_io = btrfs_end_buffer_write_sync; + } + + if (i == last_barrier && do_barriers && device->barriers) { + ret = submit_bh(WRITE_BARRIER, bh); + if (ret == -EOPNOTSUPP) { + printk("btrfs: disabling barriers on dev %s\n", + device->name); + set_buffer_uptodate(bh); + device->barriers = 0; + /* one reference for submit_bh */ + get_bh(bh); + lock_buffer(bh); + ret = submit_bh(WRITE_SYNC, bh); + } + } else { + ret = submit_bh(WRITE_SYNC, bh); + } + + if (ret) + errors++; + } + return errors < i ? 0 : -1; +} + +int write_all_supers(struct btrfs_root *root, int max_mirrors) +{ + struct list_head *head; + struct btrfs_device *dev; + struct btrfs_super_block *sb; + struct btrfs_dev_item *dev_item; + int ret; + int do_barriers; + int max_errors; + int total_errors = 0; + u64 flags; + + max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + do_barriers = !btrfs_test_opt(root, NOBARRIER); + + sb = &root->fs_info->super_for_commit; + dev_item = &sb->dev_item; + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + head = &root->fs_info->fs_devices->devices; + list_for_each_entry(dev, head, dev_list) { + if (!dev->bdev) { + total_errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + btrfs_set_stack_device_generation(dev_item, 0); + btrfs_set_stack_device_type(dev_item, dev->type); + btrfs_set_stack_device_id(dev_item, dev->devid); + btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); + btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); + btrfs_set_stack_device_io_align(dev_item, dev->io_align); + btrfs_set_stack_device_io_width(dev_item, dev->io_width); + btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); + memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); + memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE); + + flags = btrfs_super_flags(sb); + btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); + + ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors); + if (ret) + total_errors++; + } + if (total_errors > max_errors) { + printk(KERN_ERR "btrfs: %d errors while writing supers\n", + total_errors); + BUG(); + } + + total_errors = 0; + list_for_each_entry(dev, head, dev_list) { + if (!dev->bdev) + continue; + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors); + if (ret) + total_errors++; + } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + if (total_errors > max_errors) { + printk(KERN_ERR "btrfs: %d errors while writing supers\n", + total_errors); + BUG(); + } + return 0; +} + +int write_ctree_super(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int max_mirrors) +{ + int ret; + + ret = write_all_supers(root, max_mirrors); + return ret; +} + +int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +{ + spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); + spin_unlock(&fs_info->fs_roots_radix_lock); + + if (btrfs_root_refs(&root->root_item) == 0) + synchronize_srcu(&fs_info->subvol_srcu); + + free_fs_root(root); + return 0; +} + +static void free_fs_root(struct btrfs_root *root) +{ + WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + if (root->anon_super.s_dev) { + down_write(&root->anon_super.s_umount); + kill_anon_super(&root->anon_super); + } + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root->name); + kfree(root); +} + +static int del_fs_roots(struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *gang[8]; + int i; + + while (!list_empty(&fs_info->dead_roots)) { + gang[0] = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&gang[0]->root_list); + + if (gang[0]->in_radix) { + btrfs_free_fs_root(fs_info, gang[0]); + } else { + free_extent_buffer(gang[0]->node); + free_extent_buffer(gang[0]->commit_root); + kfree(gang[0]); + } + } + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + btrfs_free_fs_root(fs_info, gang[i]); + } + return 0; +} + +int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) +{ + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i; + int ret; + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) + break; + + root_objectid = gang[ret - 1]->root_key.objectid + 1; + for (i = 0; i < ret; i++) { + root_objectid = gang[i]->root_key.objectid; + btrfs_orphan_cleanup(gang[i]); + } + root_objectid++; + } + return 0; +} + +int btrfs_commit_super(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_run_delayed_iputs(root); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + /* wait until ongoing cleanup work done */ + down_write(&root->fs_info->cleanup_work_sem); + up_write(&root->fs_info->cleanup_work_sem); + + trans = btrfs_start_transaction(root, 1); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + /* run commit again to drop the original snapshot */ + trans = btrfs_start_transaction(root, 1); + btrfs_commit_transaction(trans, root); + ret = btrfs_write_and_wait_transaction(NULL, root); + BUG_ON(ret); + + ret = write_ctree_super(NULL, root, 0); + return ret; +} + +int close_ctree(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + fs_info->closing = 1; + smp_mb(); + + kthread_stop(root->fs_info->transaction_kthread); + kthread_stop(root->fs_info->cleaner_kthread); + + if (!(fs_info->sb->s_flags & MS_RDONLY)) { + ret = btrfs_commit_super(root); + if (ret) + printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + } + + fs_info->closing = 2; + smp_mb(); + + if (fs_info->delalloc_bytes) { + printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", + (unsigned long long)fs_info->delalloc_bytes); + } + if (fs_info->total_ref_cache_size) { + printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", + (unsigned long long)fs_info->total_ref_cache_size); + } + + free_extent_buffer(fs_info->extent_root->node); + free_extent_buffer(fs_info->extent_root->commit_root); + free_extent_buffer(fs_info->tree_root->node); + free_extent_buffer(fs_info->tree_root->commit_root); + free_extent_buffer(root->fs_info->chunk_root->node); + free_extent_buffer(root->fs_info->chunk_root->commit_root); + free_extent_buffer(root->fs_info->dev_root->node); + free_extent_buffer(root->fs_info->dev_root->commit_root); + free_extent_buffer(root->fs_info->csum_root->node); + free_extent_buffer(root->fs_info->csum_root->commit_root); + + btrfs_free_block_groups(root->fs_info); + + del_fs_roots(fs_info); + + iput(fs_info->btree_inode); + + btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->enospc_workers); + + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + bdi_destroy(&fs_info->bdi); + cleanup_srcu_struct(&fs_info->subvol_srcu); + + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + return 0; +} + +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) +{ + int ret; + struct inode *btree_inode = buf->first_page->mapping->host; + + ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); + if (!ret) + return ret; + + ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, + parent_transid); + return !ret; +} + +int btrfs_set_buffer_uptodate(struct extent_buffer *buf) +{ + struct inode *btree_inode = buf->first_page->mapping->host; + return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, + buf); +} + +void btrfs_mark_buffer_dirty(struct extent_buffer *buf) +{ + struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + u64 transid = btrfs_header_generation(buf); + struct inode *btree_inode = root->fs_info->btree_inode; + int was_dirty; + + btrfs_assert_tree_locked(buf); + if (transid != root->fs_info->generation) { + printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " + "found %llu running %llu\n", + (unsigned long long)buf->start, + (unsigned long long)transid, + (unsigned long long)root->fs_info->generation); + WARN_ON(1); + } + was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, + buf); + if (!was_dirty) { + spin_lock(&root->fs_info->delalloc_lock); + root->fs_info->dirty_metadata_bytes += buf->len; + spin_unlock(&root->fs_info->delalloc_lock); + } +} + +void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +{ + /* + * looks as though older kernels can get into trouble with + * this code, they end up stuck in balance_dirty_pages forever + */ + u64 num_dirty; + unsigned long thresh = 32 * 1024 * 1024; + + if (current->flags & PF_MEMALLOC) + return; + + num_dirty = root->fs_info->dirty_metadata_bytes; + + if (num_dirty > thresh) { + balance_dirty_pages_ratelimited_nr( + root->fs_info->btree_inode->i_mapping, 1); + } + return; +} + +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) +{ + struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + int ret; + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + if (ret == 0) + set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); + return ret; +} + +int btree_lock_page_hook(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_buffer *eb; + unsigned long len; + u64 bytenr = page_offset(page); + + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + + len = page->private >> 2; + eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS); + if (!eb) + goto out; + + btrfs_tree_lock(eb); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (root->fs_info->dirty_metadata_bytes >= eb->len) + root->fs_info->dirty_metadata_bytes -= eb->len; + else + WARN_ON(1); + spin_unlock(&root->fs_info->delalloc_lock); + } + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); +out: + lock_page(page); + return 0; +} + +static struct extent_io_ops btree_extent_io_ops = { + .write_cache_pages_lock_hook = btree_lock_page_hook, + .readpage_end_io_hook = btree_readpage_end_io_hook, + .submit_bio_hook = btree_submit_bio_hook, + /* note we're sharing with inode.c for the merge bio hook */ + .merge_bio_hook = btrfs_merge_bio_hook, +}; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h new file mode 100644 index 00000000000..c958ecbc191 --- /dev/null +++ b/fs/btrfs/disk-io.h @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __DISKIO__ +#define __DISKIO__ + +#define BTRFS_SUPER_INFO_OFFSET (64 * 1024) +#define BTRFS_SUPER_INFO_SIZE 4096 + +#define BTRFS_SUPER_MIRROR_MAX 3 +#define BTRFS_SUPER_MIRROR_SHIFT 12 + +static inline u64 btrfs_sb_offset(int mirror) +{ + u64 start = 16 * 1024; + if (mirror) + return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror); + return BTRFS_SUPER_INFO_OFFSET; +} + +struct btrfs_device; +struct btrfs_fs_devices; + +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid); +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, + u64 parent_transid); +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize); +int clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf); +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options); +int close_ctree(struct btrfs_root *root); +int write_ctree_super(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int max_mirrors); +struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); +int btrfs_commit_super(struct btrfs_root *root); +struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize); +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid); +struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + const char *name, int namelen); +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_key *location); +struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location); +int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); +int btrfs_insert_dev_radix(struct btrfs_root *root, + struct block_device *bdev, + u64 device_id, + u64 block_start, + u64 num_blocks); +void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); +int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); +void btrfs_mark_buffer_dirty(struct extent_buffer *buf); +void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf); +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); +int btrfs_set_buffer_uptodate(struct extent_buffer *buf); +int wait_on_tree_block_writeback(struct btrfs_root *root, + struct extent_buffer *buf); +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); +u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); +void btrfs_csum_final(u32 crc, char *result); +int btrfs_open_device(struct btrfs_device *dev); +int btrfs_verify_block_csum(struct btrfs_root *root, + struct extent_buffer *buf); +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + int metadata); +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done); + +int btrfs_congested_async(struct btrfs_fs_info *info, int iodone); +unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); +int btrfs_write_tree_block(struct extent_buffer *buf); +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btree_lock_page_hook(struct page *page); + + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level); +#else +static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, + int level) +{ +} +#endif +#endif diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c new file mode 100644 index 00000000000..ba5c3fd5ab8 --- /dev/null +++ b/fs/btrfs/export.c @@ -0,0 +1,240 @@ +#include <linux/fs.h> +#include <linux/types.h> +#include "ctree.h" +#include "disk-io.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "export.h" +#include "compat.h" + +#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_root_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4) + +static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + int connectable) +{ + struct btrfs_fid *fid = (struct btrfs_fid *)fh; + struct inode *inode = dentry->d_inode; + int len = *max_len; + int type; + + if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || + (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) + return 255; + + len = BTRFS_FID_SIZE_NON_CONNECTABLE; + type = FILEID_BTRFS_WITHOUT_PARENT; + + fid->objectid = inode->i_ino; + fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->gen = inode->i_generation; + + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + u64 parent_root_id; + + spin_lock(&dentry->d_lock); + + parent = dentry->d_parent->d_inode; + fid->parent_objectid = BTRFS_I(parent)->location.objectid; + fid->parent_gen = parent->i_generation; + parent_root_id = BTRFS_I(parent)->root->objectid; + + spin_unlock(&dentry->d_lock); + + if (parent_root_id != fid->root_objectid) { + fid->parent_root_objectid = parent_root_id; + len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; + type = FILEID_BTRFS_WITH_PARENT_ROOT; + } else { + len = BTRFS_FID_SIZE_CONNECTABLE; + type = FILEID_BTRFS_WITH_PARENT; + } + } + + *max_len = len; + return type; +} + +static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation, + int check_generation) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; + struct btrfs_root *root; + struct dentry *dentry; + struct inode *inode; + struct btrfs_key key; + int index; + int err = 0; + + if (objectid < BTRFS_FIRST_FREE_OBJECTID) + return ERR_PTR(-ESTALE); + + key.objectid = root_objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto fail; + } + + if (btrfs_root_refs(&root->root_item) == 0) { + err = -ENOENT; + goto fail; + } + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + inode = btrfs_iget(sb, &key, root); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto fail; + } + + srcu_read_unlock(&fs_info->subvol_srcu, index); + + if (check_generation && generation != inode->i_generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + dentry = d_obtain_alias(inode); + if (!IS_ERR(dentry)) + dentry->d_op = &btrfs_dentry_operations; + return dentry; +fail: + srcu_read_unlock(&fs_info->subvol_srcu, index); + return ERR_PTR(err); +} + +static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE) + return NULL; + root_objectid = fid->root_objectid; + } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) + return NULL; + root_objectid = fid->parent_root_objectid; + } else + return NULL; + + objectid = fid->parent_objectid; + generation = fid->parent_gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); +} + +static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if ((fh_type != FILEID_BTRFS_WITH_PARENT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE) && + (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) && + (fh_type != FILEID_BTRFS_WITHOUT_PARENT || + fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE)) + return NULL; + + objectid = fid->objectid; + root_objectid = fid->root_objectid; + generation = fid->gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); +} + +static struct dentry *btrfs_get_parent(struct dentry *child) +{ + struct inode *dir = child->d_inode; + static struct dentry *dentry; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_root_ref *ref; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + + path = btrfs_alloc_path(); + + if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = root->root_key.objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + root = root->fs_info->tree_root; + } else { + key.objectid = dir->i_ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + + BUG_ON(ret == 0); + if (path->slots[0] == 0) { + ret = -ENOENT; + goto fail; + } + + path->slots[0]--; + leaf = path->nodes[0]; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != key.objectid || found_key.type != key.type) { + ret = -ENOENT; + goto fail; + } + + if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + key.objectid = btrfs_root_ref_dirid(leaf, ref); + } else { + key.objectid = found_key.offset; + } + btrfs_free_path(path); + + if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { + return btrfs_get_dentry(root->fs_info->sb, key.objectid, + found_key.offset, 0, 0); + } + + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); + if (!IS_ERR(dentry)) + dentry->d_op = &btrfs_dentry_operations; + return dentry; +fail: + btrfs_free_path(path); + return ERR_PTR(ret); +} + +const struct export_operations btrfs_export_ops = { + .encode_fh = btrfs_encode_fh, + .fh_to_dentry = btrfs_fh_to_dentry, + .fh_to_parent = btrfs_fh_to_parent, + .get_parent = btrfs_get_parent, +}; diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h new file mode 100644 index 00000000000..074348a9584 --- /dev/null +++ b/fs/btrfs/export.h @@ -0,0 +1,19 @@ +#ifndef BTRFS_EXPORT_H +#define BTRFS_EXPORT_H + +#include <linux/exportfs.h> + +extern const struct export_operations btrfs_export_ops; + +struct btrfs_fid { + u64 objectid; + u64 root_objectid; + u32 gen; + + u64 parent_objectid; + u32 parent_gen; + + u64 parent_root_objectid; +} __attribute__ ((packed)); + +#endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c new file mode 100644 index 00000000000..559f72489b3 --- /dev/null +++ b/fs/btrfs/extent-tree.c @@ -0,0 +1,7687 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/sort.h> +#include <linux/rcupdate.h> +#include <linux/kthread.h> +#include "compat.h" +#include "hash.h" +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "volumes.h" +#include "locking.h" +#include "free-space-cache.h" + +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc, + int mark_free); +static int update_reserved_extents(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve); +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extra_op); +static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, + struct extent_buffer *leaf, + struct btrfs_extent_item *ei); +static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, u64 owner, u64 offset, + struct btrfs_key *ins, int ref_mod); +static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins); +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force); +static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes, + int is_data, int reserved, + struct extent_buffer **must_clean); +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key); +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups); + +static noinline int +block_group_cache_done(struct btrfs_block_group_cache *cache) +{ + smp_mb(); + return cache->cached == BTRFS_CACHE_FINISHED; +} + +static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) +{ + return (cache->flags & bits) == bits; +} + +void btrfs_get_block_group(struct btrfs_block_group_cache *cache) +{ + atomic_inc(&cache->count); +} + +void btrfs_put_block_group(struct btrfs_block_group_cache *cache) +{ + if (atomic_dec_and_test(&cache->count)) + kfree(cache); +} + +/* + * this adds the block group to the fs_info rb tree for the block group + * cache + */ +static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, + struct btrfs_block_group_cache *block_group) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_block_group_cache *cache; + + spin_lock(&info->block_group_cache_lock); + p = &info->block_group_cache_tree.rb_node; + + while (*p) { + parent = *p; + cache = rb_entry(parent, struct btrfs_block_group_cache, + cache_node); + if (block_group->key.objectid < cache->key.objectid) { + p = &(*p)->rb_left; + } else if (block_group->key.objectid > cache->key.objectid) { + p = &(*p)->rb_right; + } else { + spin_unlock(&info->block_group_cache_lock); + return -EEXIST; + } + } + + rb_link_node(&block_group->cache_node, parent, p); + rb_insert_color(&block_group->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + + return 0; +} + +/* + * This will return the block group at or after bytenr if contains is 0, else + * it will return the block group that contains the bytenr + */ +static struct btrfs_block_group_cache * +block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, + int contains) +{ + struct btrfs_block_group_cache *cache, *ret = NULL; + struct rb_node *n; + u64 end, start; + + spin_lock(&info->block_group_cache_lock); + n = info->block_group_cache_tree.rb_node; + + while (n) { + cache = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + end = cache->key.objectid + cache->key.offset - 1; + start = cache->key.objectid; + + if (bytenr < start) { + if (!contains && (!ret || start < ret->key.objectid)) + ret = cache; + n = n->rb_left; + } else if (bytenr > start) { + if (contains && bytenr <= end) { + ret = cache; + break; + } + n = n->rb_right; + } else { + ret = cache; + break; + } + } + if (ret) + btrfs_get_block_group(ret); + spin_unlock(&info->block_group_cache_lock); + + return ret; +} + +static int add_excluded_extent(struct btrfs_root *root, + u64 start, u64 num_bytes) +{ + u64 end = start + num_bytes - 1; + set_extent_bits(&root->fs_info->freed_extents[0], + start, end, EXTENT_UPTODATE, GFP_NOFS); + set_extent_bits(&root->fs_info->freed_extents[1], + start, end, EXTENT_UPTODATE, GFP_NOFS); + return 0; +} + +static void free_excluded_extents(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + u64 start, end; + + start = cache->key.objectid; + end = start + cache->key.offset - 1; + + clear_extent_bits(&root->fs_info->freed_extents[0], + start, end, EXTENT_UPTODATE, GFP_NOFS); + clear_extent_bits(&root->fs_info->freed_extents[1], + start, end, EXTENT_UPTODATE, GFP_NOFS); +} + +static int exclude_super_stripes(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + u64 bytenr; + u64 *logical; + int stripe_len; + int i, nr, ret; + + if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { + stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; + cache->bytes_super += stripe_len; + ret = add_excluded_extent(root, cache->key.objectid, + stripe_len); + BUG_ON(ret); + } + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + ret = btrfs_rmap_block(&root->fs_info->mapping_tree, + cache->key.objectid, bytenr, + 0, &logical, &nr, &stripe_len); + BUG_ON(ret); + + while (nr--) { + cache->bytes_super += stripe_len; + ret = add_excluded_extent(root, logical[nr], + stripe_len); + BUG_ON(ret); + } + + kfree(logical); + } + return 0; +} + +static struct btrfs_caching_control * +get_caching_control(struct btrfs_block_group_cache *cache) +{ + struct btrfs_caching_control *ctl; + + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_STARTED) { + spin_unlock(&cache->lock); + return NULL; + } + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + spin_unlock(&cache->lock); + return ctl; +} + +static void put_caching_control(struct btrfs_caching_control *ctl) +{ + if (atomic_dec_and_test(&ctl->count)) + kfree(ctl); +} + +/* + * this is only called by cache_block_group, since we could have freed extents + * we need to check the pinned_extents for any extents that can't be used yet + * since their free space will be released as soon as the transaction commits. + */ +static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end) +{ + u64 extent_start, extent_end, size, total_added = 0; + int ret; + + while (start < end) { + ret = find_first_extent_bit(info->pinned_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY | EXTENT_UPTODATE); + if (ret) + break; + + if (extent_start <= start) { + start = extent_end + 1; + } else if (extent_start > start && extent_start < end) { + size = extent_start - start; + total_added += size; + ret = btrfs_add_free_space(block_group, start, + size); + BUG_ON(ret); + start = extent_end + 1; + } else { + break; + } + } + + if (start < end) { + size = end - start; + total_added += size; + ret = btrfs_add_free_space(block_group, start, size); + BUG_ON(ret); + } + + return total_added; +} + +static int caching_kthread(void *data) +{ + struct btrfs_block_group_cache *block_group = data; + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; + struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + u64 total_found = 0; + u64 last = 0; + u32 nritems; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + exclude_super_stripes(extent_root, block_group); + spin_lock(&block_group->space_info->lock); + block_group->space_info->bytes_super += block_group->bytes_super; + spin_unlock(&block_group->space_info->lock); + + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + + /* + * We don't want to deadlock with somebody trying to allocate a new + * extent for the extent root while also trying to search the extent + * root to add free space. So we skip locking and search the commit + * root, since its read-only + */ + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = 2; + + key.objectid = last; + key.offset = 0; + key.type = BTRFS_EXTENT_ITEM_KEY; +again: + mutex_lock(&caching_ctl->mutex); + /* need to make sure the commit_root doesn't disappear */ + down_read(&fs_info->extent_commit_sem); + + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto err; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + + while (1) { + smp_mb(); + if (fs_info->closing > 1) { + last = (u64)-1; + break; + } + + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + } else { + ret = find_next_key(path, 0, &key); + if (ret) + break; + + caching_ctl->progress = last; + btrfs_release_path(extent_root, path); + up_read(&fs_info->extent_commit_sem); + mutex_unlock(&caching_ctl->mutex); + if (btrfs_transaction_in_commit(fs_info)) + schedule_timeout(1); + else + cond_resched(); + goto again; + } + + if (key.objectid < block_group->key.objectid) { + path->slots[0]++; + continue; + } + + if (key.objectid >= block_group->key.objectid + + block_group->key.offset) + break; + + if (key.type == BTRFS_EXTENT_ITEM_KEY) { + total_found += add_new_free_space(block_group, + fs_info, last, + key.objectid); + last = key.objectid + key.offset; + + if (total_found > (1024 * 1024 * 2)) { + total_found = 0; + wake_up(&caching_ctl->wait); + } + } + path->slots[0]++; + } + ret = 0; + + total_found += add_new_free_space(block_group, fs_info, last, + block_group->key.objectid + + block_group->key.offset); + caching_ctl->progress = (u64)-1; + + spin_lock(&block_group->lock); + block_group->caching_ctl = NULL; + block_group->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&block_group->lock); + +err: + btrfs_free_path(path); + up_read(&fs_info->extent_commit_sem); + + free_excluded_extents(extent_root, block_group); + + mutex_unlock(&caching_ctl->mutex); + wake_up(&caching_ctl->wait); + + put_caching_control(caching_ctl); + atomic_dec(&block_group->space_info->caching_threads); + btrfs_put_block_group(block_group); + + return 0; +} + +static int cache_block_group(struct btrfs_block_group_cache *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_caching_control *caching_ctl; + struct task_struct *tsk; + int ret = 0; + + smp_mb(); + if (cache->cached != BTRFS_CACHE_NO) + return 0; + + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); + BUG_ON(!caching_ctl); + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->key.objectid; + /* one for caching kthread, one for caching block group list */ + atomic_set(&caching_ctl->count, 2); + + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + kfree(caching_ctl); + return 0; + } + cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_STARTED; + spin_unlock(&cache->lock); + + down_write(&fs_info->extent_commit_sem); + list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); + up_write(&fs_info->extent_commit_sem); + + atomic_inc(&cache->space_info->caching_threads); + btrfs_get_block_group(cache); + + tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", + cache->key.objectid); + if (IS_ERR(tsk)) { + ret = PTR_ERR(tsk); + printk(KERN_ERR "error running thread %d\n", ret); + BUG(); + } + + return ret; +} + +/* + * return the block group that starts at or after bytenr + */ +static struct btrfs_block_group_cache * +btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) +{ + struct btrfs_block_group_cache *cache; + + cache = block_group_cache_tree_search(info, bytenr, 0); + + return cache; +} + +/* + * return the block group that contains the given bytenr + */ +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, + u64 bytenr) +{ + struct btrfs_block_group_cache *cache; + + cache = block_group_cache_tree_search(info, bytenr, 1); + + return cache; +} + +static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, + u64 flags) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags == flags) { + rcu_read_unlock(); + return found; + } + } + rcu_read_unlock(); + return NULL; +} + +/* + * after adding space to the filesystem, we need to clear the full flags + * on all the space infos. + */ +void btrfs_clear_space_info_full(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) + found->full = 0; + rcu_read_unlock(); +} + +static u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +u64 btrfs_find_block_group(struct btrfs_root *root, + u64 search_start, u64 search_hint, int owner) +{ + struct btrfs_block_group_cache *cache; + u64 used; + u64 last = max(search_hint, search_start); + u64 group_start = 0; + int full_search = 0; + int factor = 9; + int wrapped = 0; +again: + while (1) { + cache = btrfs_lookup_first_block_group(root->fs_info, last); + if (!cache) + break; + + spin_lock(&cache->lock); + last = cache->key.objectid + cache->key.offset; + used = btrfs_block_group_used(&cache->item); + + if ((full_search || !cache->ro) && + block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) { + if (used + cache->pinned + cache->reserved < + div_factor(cache->key.offset, factor)) { + group_start = cache->key.objectid; + spin_unlock(&cache->lock); + btrfs_put_block_group(cache); + goto found; + } + } + spin_unlock(&cache->lock); + btrfs_put_block_group(cache); + cond_resched(); + } + if (!wrapped) { + last = search_start; + wrapped = 1; + goto again; + } + if (!full_search && factor < 10) { + last = search_start; + full_search = 1; + factor = 10; + goto again; + } +found: + return group_start; +} + +/* simple helper to search for an existing extent at a given offset */ +int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) +{ + int ret; + struct btrfs_key key; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + BUG_ON(!path); + key.objectid = start; + key.offset = len; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, + 0, 0); + btrfs_free_path(path); + return ret; +} + +/* + * Back reference rules. Back refs have three main goals: + * + * 1) differentiate between all holders of references to an extent so that + * when a reference is dropped we can make sure it was a valid reference + * before freeing the extent. + * + * 2) Provide enough information to quickly find the holders of an extent + * if we notice a given block is corrupted or bad. + * + * 3) Make it easy to migrate blocks for FS shrinking or storage pool + * maintenance. This is actually the same as #2, but with a slightly + * different use case. + * + * There are two kinds of back refs. The implicit back refs is optimized + * for pointers in non-shared tree blocks. For a given pointer in a block, + * back refs of this kind provide information about the block's owner tree + * and the pointer's key. These information allow us to find the block by + * b-tree searching. The full back refs is for pointers in tree blocks not + * referenced by their owner trees. The location of tree block is recorded + * in the back refs. Actually the full back refs is generic, and can be + * used in all cases the implicit back refs is used. The major shortcoming + * of the full back refs is its overhead. Every time a tree block gets + * COWed, we have to update back refs entry for all pointers in it. + * + * For a newly allocated tree block, we use implicit back refs for + * pointers in it. This means most tree related operations only involve + * implicit back refs. For a tree block created in old transaction, the + * only way to drop a reference to it is COW it. So we can detect the + * event that tree block loses its owner tree's reference and do the + * back refs conversion. + * + * When a tree block is COW'd through a tree, there are four cases: + * + * The reference count of the block is one and the tree is the block's + * owner tree. Nothing to do in this case. + * + * The reference count of the block is one and the tree is not the + * block's owner tree. In this case, full back refs is used for pointers + * in the block. Remove these full back refs, add implicit back refs for + * every pointers in the new block. + * + * The reference count of the block is greater than one and the tree is + * the block's owner tree. In this case, implicit back refs is used for + * pointers in the block. Add full back refs for every pointers in the + * block, increase lower level extents' reference counts. The original + * implicit back refs are entailed to the new block. + * + * The reference count of the block is greater than one and the tree is + * not the block's owner tree. Add implicit back refs for every pointer in + * the new block, increase lower level extents' reference count. + * + * Back Reference Key composing: + * + * The key objectid corresponds to the first byte in the extent, + * The key type is used to differentiate between types of back refs. + * There are different meanings of the key offset for different types + * of back refs. + * + * File extents can be referenced by: + * + * - multiple snapshots, subvolumes, or different generations in one subvol + * - different files inside a single subvolume + * - different offsets inside a file (bookend extents in file.c) + * + * The extent ref structure for the implicit back refs has fields for: + * + * - Objectid of the subvolume root + * - objectid of the file holding the reference + * - original offset in the file + * - how many bookend extents + * + * The key offset for the implicit back refs is hash of the first + * three fields. + * + * The extent ref structure for the full back refs has field for: + * + * - number of pointers in the tree leaf + * + * The key offset for the implicit back refs is the first byte of + * the tree leaf + * + * When a file extent is allocated, The implicit back refs is used. + * the fields are filled in: + * + * (root_key.objectid, inode objectid, offset in file, 1) + * + * When a file extent is removed file truncation, we find the + * corresponding implicit back refs and check the following fields: + * + * (btrfs_header_owner(leaf), inode objectid, offset in file) + * + * Btree extents can be referenced by: + * + * - Different subvolumes + * + * Both the implicit back refs and the full back refs for tree blocks + * only consist of key. The key offset for the implicit back refs is + * objectid of block's owner tree. The key offset for the full back refs + * is the first byte of parent block. + * + * When implicit back refs is used, information about the lowest key and + * level of the tree block are required. These information are stored in + * tree block info structure. + */ + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static int convert_extent_item_v0(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 owner, u32 extra_size) +{ + struct btrfs_extent_item *item; + struct btrfs_extent_item_v0 *ei0; + struct btrfs_extent_ref_v0 *ref0; + struct btrfs_tree_block_info *bi; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + u32 new_size = sizeof(*item); + u64 refs; + int ret; + + leaf = path->nodes[0]; + BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + refs = btrfs_extent_refs_v0(leaf, ei0); + + if (owner == (u64)-1) { + while (1) { + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + BUG_ON(ret > 0); + leaf = path->nodes[0]; + } + btrfs_item_key_to_cpu(leaf, &found_key, + path->slots[0]); + BUG_ON(key.objectid != found_key.objectid); + if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { + path->slots[0]++; + continue; + } + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + owner = btrfs_ref_objectid_v0(leaf, ref0); + break; + } + } + btrfs_release_path(root, path); + + if (owner < BTRFS_FIRST_FREE_OBJECTID) + new_size += sizeof(*bi); + + new_size -= sizeof(*ei0); + ret = btrfs_search_slot(trans, root, &key, path, + new_size + extra_size, 1); + if (ret < 0) + return ret; + BUG_ON(ret); + + ret = btrfs_extend_item(trans, root, path, new_size); + BUG_ON(ret); + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, item, refs); + /* FIXME: get real generation */ + btrfs_set_extent_generation(leaf, item, 0); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_set_extent_flags(leaf, item, + BTRFS_EXTENT_FLAG_TREE_BLOCK | + BTRFS_BLOCK_FLAG_FULL_BACKREF); + bi = (struct btrfs_tree_block_info *)(item + 1); + /* FIXME: get first key of the block */ + memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); + btrfs_set_tree_block_level(leaf, bi, (int)owner); + } else { + btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} +#endif + +static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) +{ + u32 high_crc = ~(u32)0; + u32 low_crc = ~(u32)0; + __le64 lenum; + + lenum = cpu_to_le64(root_objectid); + high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(owner); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(offset); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + + return ((u64)high_crc << 31) ^ (u64)low_crc; +} + +static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref) +{ + return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), + btrfs_extent_data_ref_objectid(leaf, ref), + btrfs_extent_data_ref_offset(leaf, ref)); +} + +static int match_extent_data_ref(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + u64 root_objectid, u64 owner, u64 offset) +{ + if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != owner || + btrfs_extent_data_ref_offset(leaf, ref) != offset) + return 0; + return 1; +} + +static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, + u64 owner, u64 offset) +{ + struct btrfs_key key; + struct btrfs_extent_data_ref *ref; + struct extent_buffer *leaf; + u32 nritems; + int ret; + int recow; + int err = -ENOENT; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + } +again: + recow = 0; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto fail; + } + + if (parent) { + if (!ret) + return 0; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + key.type = BTRFS_EXTENT_REF_V0_KEY; + btrfs_release_path(root, path); + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto fail; + } + if (!ret) + return 0; +#endif + goto fail; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + while (1) { + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + err = ret; + if (ret) + goto fail; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + recow = 1; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != bytenr || + key.type != BTRFS_EXTENT_DATA_REF_KEY) + goto fail; + + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) { + if (recow) { + btrfs_release_path(root, path); + goto again; + } + err = 0; + break; + } + path->slots[0]++; + } +fail: + return err; +} + +static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + u32 size; + u32 num_refs; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; + size = sizeof(struct btrfs_shared_data_ref); + } else { + key.type = BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + size = sizeof(struct btrfs_extent_data_ref); + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, size); + if (ret && ret != -EEXIST) + goto fail; + + leaf = path->nodes[0]; + if (parent) { + struct btrfs_shared_data_ref *ref; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + if (ret == 0) { + btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = btrfs_shared_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_shared_data_ref_count(leaf, ref, num_refs); + } + } else { + struct btrfs_extent_data_ref *ref; + while (ret == -EEXIST) { + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) + break; + btrfs_release_path(root, path); + key.offset++; + ret = btrfs_insert_empty_item(trans, root, path, &key, + size); + if (ret && ret != -EEXIST) + goto fail; + + leaf = path->nodes[0]; + } + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (ret == 0) { + btrfs_set_extent_data_ref_root(leaf, ref, + root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = btrfs_extent_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_extent_data_ref_count(leaf, ref, num_refs); + } + } + btrfs_mark_buffer_dirty(leaf); + ret = 0; +fail: + btrfs_release_path(root, path); + return ret; +} + +static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int refs_to_drop) +{ + struct btrfs_key key; + struct btrfs_extent_data_ref *ref1 = NULL; + struct btrfs_shared_data_ref *ref2 = NULL; + struct extent_buffer *leaf; + u32 num_refs = 0; + int ret = 0; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + num_refs = btrfs_ref_count_v0(leaf, ref0); +#endif + } else { + BUG(); + } + + BUG_ON(num_refs < refs_to_drop); + num_refs -= refs_to_drop; + + if (num_refs == 0) { + ret = btrfs_del_item(trans, root, path); + } else { + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); + else if (key.type == BTRFS_SHARED_DATA_REF_KEY) + btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + else { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + btrfs_set_ref_count_v0(leaf, ref0, num_refs); + } +#endif + btrfs_mark_buffer_dirty(leaf); + } + return ret; +} + +static noinline u32 extent_data_ref_count(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_data_ref *ref1; + struct btrfs_shared_data_ref *ref2; + u32 num_refs = 0; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (iref) { + if (btrfs_extent_inline_ref_type(leaf, iref) == + BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else { + ref2 = (struct btrfs_shared_data_ref *)(iref + 1); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); + } + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + num_refs = btrfs_ref_count_v0(leaf, ref0); +#endif + } else { + WARN_ON(1); + } + return num_refs; +} + +static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) +{ + struct btrfs_key key; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (ret == -ENOENT && parent) { + btrfs_release_path(root, path); + key.type = BTRFS_EXTENT_REF_V0_KEY; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; + } +#endif + return ret; +} + +static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) +{ + struct btrfs_key key; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + btrfs_release_path(root, path); + return ret; +} + +static inline int extent_ref_type(u64 parent, u64 owner) +{ + int type; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (parent > 0) + type = BTRFS_SHARED_BLOCK_REF_KEY; + else + type = BTRFS_TREE_BLOCK_REF_KEY; + } else { + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; + } + return type; +} + +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) + +{ + for (; level < BTRFS_MAX_LEVEL; level++) { + if (!path->nodes[level]) + break; + if (path->slots[level] + 1 >= + btrfs_header_nritems(path->nodes[level])) + continue; + if (level == 0) + btrfs_item_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + else + btrfs_node_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + return 0; + } + return 1; +} + +/* + * look for inline back ref. if back ref is found, *ref_ret is set + * to the address of inline back ref, and 0 is returned. + * + * if back ref isn't found, *ref_ret is set to the address where it + * should be inserted, and -ENOENT is returned. + * + * if insert is true and there are too many inline back refs, the path + * points to the extent item, and -EAGAIN is returned. + * + * NOTE: inline back refs are ordered in the same way that back ref + * items in the tree are ordered. + */ +static noinline_for_stack +int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref **ref_ret, + u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int insert) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + u64 flags; + u64 item_size; + unsigned long ptr; + unsigned long end; + int extra_size; + int type; + int want; + int ret; + int err = 0; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + want = extent_ref_type(parent, owner); + if (insert) { + extra_size = btrfs_extent_inline_ref_size(want); + path->keep_locks = 1; + } else + extra_size = -1; + ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); + if (ret < 0) { + err = ret; + goto out; + } + BUG_ON(ret); + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + if (!insert) { + err = -ENOENT; + goto out; + } + ret = convert_extent_item_v0(trans, root, path, owner, + extra_size); + if (ret < 0) { + err = ret; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + } +#endif + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + item_size; + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ptr += sizeof(struct btrfs_tree_block_info); + BUG_ON(ptr > end); + } else { + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); + } + + err = -ENOENT; + while (1) { + if (ptr >= end) { + WARN_ON(ptr > end); + break; + } + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + if (want < type) + break; + if (want > type) { + ptr += btrfs_extent_inline_ref_size(type); + continue; + } + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + struct btrfs_extent_data_ref *dref; + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + if (match_extent_data_ref(leaf, dref, root_objectid, + owner, offset)) { + err = 0; + break; + } + if (hash_extent_data_ref_item(leaf, dref) < + hash_extent_data_ref(root_objectid, owner, offset)) + break; + } else { + u64 ref_offset; + ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); + if (parent > 0) { + if (parent == ref_offset) { + err = 0; + break; + } + if (ref_offset < parent) + break; + } else { + if (root_objectid == ref_offset) { + err = 0; + break; + } + if (ref_offset < root_objectid) + break; + } + } + ptr += btrfs_extent_inline_ref_size(type); + } + if (err == -ENOENT && insert) { + if (item_size + extra_size >= + BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { + err = -EAGAIN; + goto out; + } + /* + * To add new inline back ref, we have to make sure + * there is no corresponding back ref item. + * For simplicity, we just do not add new inline back + * ref if there is any kind of item for this block + */ + if (find_next_key(path, 0, &key) == 0 && + key.objectid == bytenr && + key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { + err = -EAGAIN; + goto out; + } + } + *ref_ret = (struct btrfs_extent_inline_ref *)ptr; +out: + if (insert) { + path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); + } + return err; +} + +/* + * helper to add new inline back ref + */ +static noinline_for_stack +int setup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + unsigned long ptr; + unsigned long end; + unsigned long item_offset; + u64 refs; + int size; + int type; + int ret; + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + item_offset = (unsigned long)iref - (unsigned long)ei; + + type = extent_ref_type(parent, owner); + size = btrfs_extent_inline_ref_size(type); + + ret = btrfs_extend_item(trans, root, path, size); + BUG_ON(ret); + + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + refs += refs_to_add; + btrfs_set_extent_refs(leaf, ei, refs); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + + ptr = (unsigned long)ei + item_offset; + end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); + if (ptr < end - size) + memmove_extent_buffer(leaf, ptr + size, ptr, + end - size - ptr); + + iref = (struct btrfs_extent_inline_ref *)ptr; + btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + struct btrfs_extent_data_ref *dref; + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, dref, owner); + btrfs_set_extent_data_ref_offset(leaf, dref, offset); + btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_shared_data_ref *sref; + sref = (struct btrfs_shared_data_ref *)(iref + 1); + btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} + +static int lookup_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref **ref_ret, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) +{ + int ret; + + ret = lookup_inline_extent_backref(trans, root, path, ref_ret, + bytenr, num_bytes, parent, + root_objectid, owner, offset, 0); + if (ret != -ENOENT) + return ret; + + btrfs_release_path(root, path); + *ref_ret = NULL; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, + root_objectid); + } else { + ret = lookup_extent_data_ref(trans, root, path, bytenr, parent, + root_objectid, owner, offset); + } + return ret; +} + +/* + * helper to update/remove inline back ref + */ +static noinline_for_stack +int update_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_mod, + struct btrfs_delayed_extent_op *extent_op) +{ + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_data_ref *dref = NULL; + struct btrfs_shared_data_ref *sref = NULL; + unsigned long ptr; + unsigned long end; + u32 item_size; + int size; + int type; + int ret; + u64 refs; + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); + refs += refs_to_mod; + btrfs_set_extent_refs(leaf, ei, refs); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + + type = btrfs_extent_inline_ref_type(leaf, iref); + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + refs = btrfs_extent_data_ref_count(leaf, dref); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + sref = (struct btrfs_shared_data_ref *)(iref + 1); + refs = btrfs_shared_data_ref_count(leaf, sref); + } else { + refs = 1; + BUG_ON(refs_to_mod != -1); + } + + BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); + refs += refs_to_mod; + + if (refs > 0) { + if (type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, dref, refs); + else + btrfs_set_shared_data_ref_count(leaf, sref, refs); + } else { + size = btrfs_extent_inline_ref_size(type); + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + if (ptr + size < end) + memmove_extent_buffer(leaf, ptr, ptr + size, + end - ptr - size); + item_size -= size; + ret = btrfs_truncate_item(trans, root, path, item_size, 1); + BUG_ON(ret); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} + +static noinline_for_stack +int insert_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_extent_inline_ref *iref; + int ret; + + ret = lookup_inline_extent_backref(trans, root, path, &iref, + bytenr, num_bytes, parent, + root_objectid, owner, offset, 1); + if (ret == 0) { + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); + ret = update_inline_extent_backref(trans, root, path, iref, + refs_to_add, extent_op); + } else if (ret == -ENOENT) { + ret = setup_inline_extent_backref(trans, root, path, iref, + parent, root_objectid, + owner, offset, refs_to_add, + extent_op); + } + return ret; +} + +static int insert_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add) +{ + int ret; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + BUG_ON(refs_to_add != 1); + ret = insert_tree_block_ref(trans, root, path, bytenr, + parent, root_objectid); + } else { + ret = insert_extent_data_ref(trans, root, path, bytenr, + parent, root_objectid, + owner, offset, refs_to_add); + } + return ret; +} + +static int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_drop, int is_data) +{ + int ret; + + BUG_ON(!is_data && refs_to_drop != 1); + if (iref) { + ret = update_inline_extent_backref(trans, root, path, iref, + -refs_to_drop, NULL); + } else if (is_data) { + ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + } else { + ret = btrfs_del_item(trans, root, path); + } + return ret; +} + +static void btrfs_issue_discard(struct block_device *bdev, + u64 start, u64 len) +{ + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, + DISCARD_FL_BARRIER); +} + +static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ + int ret; + u64 map_length = num_bytes; + struct btrfs_multi_bio *multi = NULL; + + if (!btrfs_test_opt(root, DISCARD)) + return 0; + + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, + bytenr, &map_length, &multi, 0); + if (!ret) { + struct btrfs_bio_stripe *stripe = multi->stripes; + int i; + + if (map_length > num_bytes) + map_length = num_bytes; + + for (i = 0; i < multi->num_stripes; i++, stripe++) { + btrfs_issue_discard(stripe->dev->bdev, + stripe->physical, + map_length); + } + kfree(multi); + } + + return ret; +} + +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) +{ + int ret; + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && + root_objectid == BTRFS_TREE_LOG_OBJECTID); + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + parent, root_objectid, (int)owner, + BTRFS_ADD_DELAYED_REF, NULL); + } else { + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + parent, root_objectid, owner, offset, + BTRFS_ADD_DELAYED_REF, NULL); + } + return ret; +} + +static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_extent_item *item; + u64 refs; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + path->leave_spinning = 1; + /* this will setup the path even if it fails to insert the back ref */ + ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, + path, bytenr, num_bytes, parent, + root_objectid, owner, offset, + refs_to_add, extent_op); + if (ret == 0) + goto out; + + if (ret != -EAGAIN) { + err = ret; + goto out; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, item); + btrfs_set_extent_refs(leaf, item, refs + refs_to_add); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, item); + + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root->fs_info->extent_root, path); + + path->reada = 1; + path->leave_spinning = 1; + + /* now insert the actual backref */ + ret = insert_extent_backref(trans, root->fs_info->extent_root, + path, bytenr, parent, root_objectid, + owner, offset, refs_to_add); + BUG_ON(ret); +out: + btrfs_free_path(path); + return err; +} + +static int run_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) +{ + int ret = 0; + struct btrfs_delayed_data_ref *ref; + struct btrfs_key ins; + u64 parent = 0; + u64 ref_root = 0; + u64 flags = 0; + + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ref = btrfs_delayed_node_to_data_ref(node); + if (node->type == BTRFS_SHARED_DATA_REF_KEY) + parent = ref->parent; + else + ref_root = ref->root; + + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + if (extent_op) { + BUG_ON(extent_op->update_key); + flags |= extent_op->flags_to_set; + } + ret = alloc_reserved_file_extent(trans, root, + parent, ref_root, flags, + ref->objectid, ref->offset, + &ins, node->ref_mod); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, + ref_root, ref->objectid, + ref->offset, node->ref_mod, + extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, root, node->bytenr, + node->num_bytes, parent, + ref_root, ref->objectid, + ref->offset, node->ref_mod, + extent_op); + } else { + BUG(); + } + return ret; +} + +static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, + struct extent_buffer *leaf, + struct btrfs_extent_item *ei) +{ + u64 flags = btrfs_extent_flags(leaf, ei); + if (extent_op->update_flags) { + flags |= extent_op->flags_to_set; + btrfs_set_extent_flags(leaf, ei, flags); + } + + if (extent_op->update_key) { + struct btrfs_tree_block_info *bi; + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); + bi = (struct btrfs_tree_block_info *)(ei + 1); + btrfs_set_tree_block_key(leaf, bi, &extent_op->key); + } +} + +static int run_delayed_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + u32 item_size; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = node->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = node->num_bytes; + + path->reada = 1; + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, + path, 0, 1); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + err = -EIO; + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + ret = convert_extent_item_v0(trans, root->fs_info->extent_root, + path, (u64)-1, 0); + if (ret < 0) { + err = ret; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + } +#endif + BUG_ON(item_size < sizeof(*ei)); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + __run_delayed_extent_op(extent_op, leaf, ei); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return err; +} + +static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) +{ + int ret = 0; + struct btrfs_delayed_tree_ref *ref; + struct btrfs_key ins; + u64 parent = 0; + u64 ref_root = 0; + + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ref = btrfs_delayed_node_to_tree_ref(node); + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) + parent = ref->parent; + else + ref_root = ref->root; + + BUG_ON(node->ref_mod != 1); + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + BUG_ON(!extent_op || !extent_op->update_flags || + !extent_op->update_key); + ret = alloc_reserved_tree_block(trans, root, + parent, ref_root, + extent_op->flags_to_set, + &extent_op->key, + ref->level, &ins); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, + ref->level, 0, 1, extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, + ref->level, 0, 1, extent_op); + } else { + BUG(); + } + return ret; +} + + +/* helper function to actually process a single delayed ref entry */ +static int run_one_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) +{ + int ret; + if (btrfs_delayed_ref_is_head(node)) { + struct btrfs_delayed_ref_head *head; + /* + * we've hit the end of the chain and we were supposed + * to insert this extent into the tree. But, it got + * deleted before we ever needed to insert it, so all + * we have to do is clean up the accounting + */ + BUG_ON(extent_op); + head = btrfs_delayed_node_to_head(node); + if (insert_reserved) { + int mark_free = 0; + struct extent_buffer *must_clean = NULL; + + ret = pin_down_bytes(trans, root, NULL, + node->bytenr, node->num_bytes, + head->is_data, 1, &must_clean); + if (ret > 0) + mark_free = 1; + + if (must_clean) { + clean_tree_block(NULL, root, must_clean); + btrfs_tree_unlock(must_clean); + free_extent_buffer(must_clean); + } + if (head->is_data) { + ret = btrfs_del_csums(trans, root, + node->bytenr, + node->num_bytes); + BUG_ON(ret); + } + if (mark_free) { + ret = btrfs_free_reserved_extent(root, + node->bytenr, + node->num_bytes); + BUG_ON(ret); + } + } + mutex_unlock(&head->mutex); + return 0; + } + + if (node->type == BTRFS_TREE_BLOCK_REF_KEY || + node->type == BTRFS_SHARED_BLOCK_REF_KEY) + ret = run_delayed_tree_ref(trans, root, node, extent_op, + insert_reserved); + else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) + ret = run_delayed_data_ref(trans, root, node, extent_op, + insert_reserved); + else + BUG(); + return ret; +} + +static noinline struct btrfs_delayed_ref_node * +select_delayed_ref(struct btrfs_delayed_ref_head *head) +{ + struct rb_node *node; + struct btrfs_delayed_ref_node *ref; + int action = BTRFS_ADD_DELAYED_REF; +again: + /* + * select delayed ref of type BTRFS_ADD_DELAYED_REF first. + * this prevents ref count from going down to zero when + * there still are pending delayed ref. + */ + node = rb_prev(&head->node.rb_node); + while (1) { + if (!node) + break; + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + if (ref->bytenr != head->node.bytenr) + break; + if (ref->action == action) + return ref; + node = rb_prev(node); + } + if (action == BTRFS_ADD_DELAYED_REF) { + action = BTRFS_DROP_DELAYED_REF; + goto again; + } + return NULL; +} + +static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct list_head *cluster) +{ + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *locked_ref = NULL; + struct btrfs_delayed_extent_op *extent_op; + int ret; + int count = 0; + int must_insert_reserved = 0; + + delayed_refs = &trans->transaction->delayed_refs; + while (1) { + if (!locked_ref) { + /* pick a new head ref from the cluster list */ + if (list_empty(cluster)) + break; + + locked_ref = list_entry(cluster->next, + struct btrfs_delayed_ref_head, cluster); + + /* grab the lock that says we are going to process + * all the refs for this head */ + ret = btrfs_delayed_ref_lock(trans, locked_ref); + + /* + * we may have dropped the spin lock to get the head + * mutex lock, and that might have given someone else + * time to free the head. If that's true, it has been + * removed from our list and we can move on. + */ + if (ret == -EAGAIN) { + locked_ref = NULL; + count++; + continue; + } + } + + /* + * record the must insert reserved flag before we + * drop the spin lock. + */ + must_insert_reserved = locked_ref->must_insert_reserved; + locked_ref->must_insert_reserved = 0; + + extent_op = locked_ref->extent_op; + locked_ref->extent_op = NULL; + + /* + * locked_ref is the head node, so we have to go one + * node back for any delayed ref updates + */ + ref = select_delayed_ref(locked_ref); + if (!ref) { + /* All delayed refs have been processed, Go ahead + * and send the head node to run_one_delayed_ref, + * so that any accounting fixes can happen + */ + ref = &locked_ref->node; + + if (extent_op && must_insert_reserved) { + kfree(extent_op); + extent_op = NULL; + } + + if (extent_op) { + spin_unlock(&delayed_refs->lock); + + ret = run_delayed_extent_op(trans, root, + ref, extent_op); + BUG_ON(ret); + kfree(extent_op); + + cond_resched(); + spin_lock(&delayed_refs->lock); + continue; + } + + list_del_init(&locked_ref->cluster); + locked_ref = NULL; + } + + ref->in_tree = 0; + rb_erase(&ref->rb_node, &delayed_refs->root); + delayed_refs->num_entries--; + + spin_unlock(&delayed_refs->lock); + + ret = run_one_delayed_ref(trans, root, ref, extent_op, + must_insert_reserved); + BUG_ON(ret); + + btrfs_put_delayed_ref(ref); + kfree(extent_op); + count++; + + cond_resched(); + spin_lock(&delayed_refs->lock); + } + return count; +} + +/* + * this starts processing the delayed reference count updates and + * extent insertions we have queued up so far. count can be + * 0, which means to process everything in the tree at the start + * of the run (but not newly added entries), or it can be some target + * number you'd like to process. + */ +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct list_head cluster; + int ret; + int run_all = count == (unsigned long)-1; + int run_most = 0; + + if (root == root->fs_info->extent_root) + root = root->fs_info->tree_root; + + delayed_refs = &trans->transaction->delayed_refs; + INIT_LIST_HEAD(&cluster); +again: + spin_lock(&delayed_refs->lock); + if (count == 0) { + count = delayed_refs->num_entries * 2; + run_most = 1; + } + while (1) { + if (!(run_all || run_most) && + delayed_refs->num_heads_ready < 64) + break; + + /* + * go find something we can process in the rbtree. We start at + * the beginning of the tree, and then build a cluster + * of refs to process starting at the first one we are able to + * lock + */ + ret = btrfs_find_ref_cluster(trans, &cluster, + delayed_refs->run_delayed_start); + if (ret) + break; + + ret = run_clustered_refs(trans, root, &cluster); + BUG_ON(ret < 0); + + count -= min_t(unsigned long, ret, count); + + if (count == 0) + break; + } + + if (run_all) { + node = rb_first(&delayed_refs->root); + if (!node) + goto out; + count = (unsigned long)-1; + + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + if (btrfs_delayed_ref_is_head(ref)) { + struct btrfs_delayed_ref_head *head; + + head = btrfs_delayed_node_to_head(ref); + atomic_inc(&ref->refs); + + spin_unlock(&delayed_refs->lock); + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + + btrfs_put_delayed_ref(ref); + cond_resched(); + goto again; + } + node = rb_next(node); + } + spin_unlock(&delayed_refs->lock); + schedule_timeout(1); + goto again; + } +out: + spin_unlock(&delayed_refs->lock); + return 0; +} + +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 flags, + int is_data) +{ + struct btrfs_delayed_extent_op *extent_op; + int ret; + + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + if (!extent_op) + return -ENOMEM; + + extent_op->flags_to_set = flags; + extent_op->update_flags = 1; + extent_op->update_key = 0; + extent_op->is_data = is_data ? 1 : 0; + + ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); + if (ret) + kfree(extent_op); + return ret; +} + +static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_data_ref *data_ref; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + int ret = 0; + + ret = -ENOENT; + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (!head) + goto out; + + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(root->fs_info->extent_root, path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return -EAGAIN; + } + + node = rb_prev(&head->node.rb_node); + if (!node) + goto out_unlock; + + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + + if (ref->bytenr != bytenr) + goto out_unlock; + + ret = 1; + if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) + goto out_unlock; + + data_ref = btrfs_delayed_node_to_data_ref(ref); + + node = rb_prev(node); + if (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + if (ref->bytenr == bytenr) + goto out_unlock; + } + + if (data_ref->root != root->root_key.objectid || + data_ref->objectid != objectid || data_ref->offset != offset) + goto out_unlock; + + ret = 0; +out_unlock: + mutex_unlock(&head->mutex); +out: + spin_unlock(&delayed_refs->lock); + return ret; +} + +static noinline int check_committed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct extent_buffer *leaf; + struct btrfs_extent_data_ref *ref; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_item *ei; + struct btrfs_key key; + u32 item_size; + int ret; + + key.objectid = bytenr; + key.offset = (u64)-1; + key.type = BTRFS_EXTENT_ITEM_KEY; + + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + ret = -ENOENT; + if (path->slots[0] == 0) + goto out; + + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) + goto out; + + ret = 1; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + goto out; + } +#endif + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + + if (item_size != sizeof(*ei) + + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) + goto out; + + if (btrfs_extent_generation(leaf, ei) <= + btrfs_root_last_snapshot(&root->root_item)) + goto out; + + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + if (btrfs_extent_inline_ref_type(leaf, iref) != + BTRFS_EXTENT_DATA_REF_KEY) + goto out; + + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + if (btrfs_extent_refs(leaf, ei) != + btrfs_extent_data_ref_count(leaf, ref) || + btrfs_extent_data_ref_root(leaf, ref) != + root->root_key.objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != objectid || + btrfs_extent_data_ref_offset(leaf, ref) != offset) + goto out; + + ret = 0; +out: + return ret; +} + +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_path *path; + int ret; + int ret2; + + path = btrfs_alloc_path(); + if (!path) + return -ENOENT; + + do { + ret = check_committed_ref(trans, root, path, objectid, + offset, bytenr); + if (ret && ret != -ENOENT) + goto out; + + ret2 = check_delayed_ref(trans, root, path, objectid, + offset, bytenr); + } while (ret2 == -EAGAIN); + + if (ret2 && ret2 != -ENOENT) { + ret = ret2; + goto out; + } + + if (ret != -ENOENT || ret2 != -ENOENT) + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +#if 0 +int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, u32 nr_extents) +{ + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + u64 root_gen; + u32 nritems; + int i; + int level; + int ret = 0; + int shared = 0; + + if (!root->ref_cows) + return 0; + + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + shared = 0; + root_gen = root->root_key.offset; + } else { + shared = 1; + root_gen = trans->transid - 1; + } + + level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); + + if (level == 0) { + struct btrfs_leaf_ref *ref; + struct btrfs_extent_info *info; + + ref = btrfs_alloc_leaf_ref(root, nr_extents); + if (!ref) { + ret = -ENOMEM; + goto out; + } + + ref->root_gen = root_gen; + ref->bytenr = buf->start; + ref->owner = btrfs_header_owner(buf); + ref->generation = btrfs_header_generation(buf); + ref->nritems = nr_extents; + info = ref->extents; + + for (i = 0; nr_extents > 0 && i < nritems; i++) { + u64 disk_bytenr; + btrfs_item_key_to_cpu(buf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (disk_bytenr == 0) + continue; + + info->bytenr = disk_bytenr; + info->num_bytes = + btrfs_file_extent_disk_num_bytes(buf, fi); + info->objectid = key.objectid; + info->offset = key.offset; + info++; + } + + ret = btrfs_add_leaf_ref(root, ref, shared); + if (ret == -EEXIST && shared) { + struct btrfs_leaf_ref *old; + old = btrfs_lookup_leaf_ref(root, ref->bytenr); + BUG_ON(!old); + btrfs_remove_leaf_ref(root, old); + btrfs_free_leaf_ref(root, old); + ret = btrfs_add_leaf_ref(root, ref, shared); + } + WARN_ON(ret); + btrfs_free_leaf_ref(root, ref); + } +out: + return ret; +} + +/* when a block goes through cow, we update the reference counts of + * everything that block points to. The internal pointers of the block + * can be in just about any order, and it is likely to have clusters of + * things that are close together and clusters of things that are not. + * + * To help reduce the seeks that come with updating all of these reference + * counts, sort them by byte number before actual updates are done. + * + * struct refsort is used to match byte number to slot in the btree block. + * we sort based on the byte number and then use the slot to actually + * find the item. + * + * struct refsort is smaller than strcut btrfs_item and smaller than + * struct btrfs_key_ptr. Since we're currently limited to the page size + * for a btree block, there's no way for a kmalloc of refsorts for a + * single node to be bigger than a page. + */ +struct refsort { + u64 bytenr; + u32 slot; +}; + +/* + * for passing into sort() + */ +static int refsort_cmp(const void *a_void, const void *b_void) +{ + const struct refsort *a = a_void; + const struct refsort *b = b_void; + + if (a->bytenr < b->bytenr) + return -1; + if (a->bytenr > b->bytenr) + return 1; + return 0; +} +#endif + +static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + int full_backref, int inc) +{ + u64 bytenr; + u64 num_bytes; + u64 parent; + u64 ref_root; + u32 nritems; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int level; + int ret = 0; + int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, + u64, u64, u64, u64, u64, u64); + + ref_root = btrfs_header_owner(buf); + nritems = btrfs_header_nritems(buf); + level = btrfs_header_level(buf); + + if (!root->ref_cows && level == 0) + return 0; + + if (inc) + process_func = btrfs_inc_extent_ref; + else + process_func = btrfs_free_extent; + + if (full_backref) + parent = buf->start; + else + parent = 0; + + for (i = 0; i < nritems; i++) { + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; + + num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + key.offset -= btrfs_file_extent_offset(buf, fi); + ret = process_func(trans, root, bytenr, num_bytes, + parent, ref_root, key.objectid, + key.offset); + if (ret) + goto fail; + } else { + bytenr = btrfs_node_blockptr(buf, i); + num_bytes = btrfs_level_size(root, level - 1); + ret = process_func(trans, root, bytenr, num_bytes, + parent, ref_root, level - 1, 0); + if (ret) + goto fail; + } + } + return 0; +fail: + BUG(); + return ret; +} + +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) +{ + return __btrfs_mod_ref(trans, root, buf, full_backref, 1); +} + +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) +{ + return __btrfs_mod_ref(trans, root, buf, full_backref, 0); +} + +static int write_one_cache_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_block_group_cache *cache) +{ + int ret; + struct btrfs_root *extent_root = root->fs_info->extent_root; + unsigned long bi; + struct extent_buffer *leaf; + + ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); + if (ret < 0) + goto fail; + BUG_ON(ret); + + leaf = path->nodes[0]; + bi = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(extent_root, path); +fail: + if (ret) + return ret; + return 0; + +} + +static struct btrfs_block_group_cache * +next_block_group(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + struct rb_node *node; + spin_lock(&root->fs_info->block_group_cache_lock); + node = rb_next(&cache->cache_node); + btrfs_put_block_group(cache); + if (node) { + cache = rb_entry(node, struct btrfs_block_group_cache, + cache_node); + btrfs_get_block_group(cache); + } else + cache = NULL; + spin_unlock(&root->fs_info->block_group_cache_lock); + return cache; +} + +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache; + int err = 0; + struct btrfs_path *path; + u64 last = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + if (last == 0) { + err = btrfs_run_delayed_refs(trans, root, + (unsigned long)-1); + BUG_ON(err); + } + + cache = btrfs_lookup_first_block_group(root->fs_info, last); + while (cache) { + if (cache->dirty) + break; + cache = next_block_group(root, cache); + } + if (!cache) { + if (last == 0) + break; + last = 0; + continue; + } + + cache->dirty = 0; + last = cache->key.objectid + cache->key.offset; + + err = write_one_cache_group(trans, root, path, cache); + BUG_ON(err); + btrfs_put_block_group(cache); + } + + btrfs_free_path(path); + return 0; +} + +int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_block_group_cache *block_group; + int readonly = 0; + + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + if (!block_group || block_group->ro) + readonly = 1; + if (block_group) + btrfs_put_block_group(block_group); + return readonly; +} + +static int update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + struct btrfs_space_info **space_info) +{ + struct btrfs_space_info *found; + + found = __find_space_info(info, flags); + if (found) { + spin_lock(&found->lock); + found->total_bytes += total_bytes; + found->bytes_used += bytes_used; + found->full = 0; + spin_unlock(&found->lock); + *space_info = found; + return 0; + } + found = kzalloc(sizeof(*found), GFP_NOFS); + if (!found) + return -ENOMEM; + + INIT_LIST_HEAD(&found->block_groups); + init_rwsem(&found->groups_sem); + spin_lock_init(&found->lock); + found->flags = flags; + found->total_bytes = total_bytes; + found->bytes_used = bytes_used; + found->bytes_pinned = 0; + found->bytes_reserved = 0; + found->bytes_readonly = 0; + found->bytes_delalloc = 0; + found->full = 0; + found->force_alloc = 0; + *space_info = found; + list_add_rcu(&found->list, &info->space_info); + atomic_set(&found->caching_threads, 0); + return 0; +} + +static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP); + if (extra_flags) { + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; + } +} + +static void set_block_group_readonly(struct btrfs_block_group_cache *cache) +{ + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + if (!cache->ro) { + cache->space_info->bytes_readonly += cache->key.offset - + btrfs_block_group_used(&cache->item); + cache->ro = 1; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); +} + +u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) +{ + u64 num_devices = root->fs_info->fs_devices->rw_devices; + + if (num_devices == 1) + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); + if (num_devices < 4) + flags &= ~BTRFS_BLOCK_GROUP_RAID10; + + if ((flags & BTRFS_BLOCK_GROUP_DUP) && + (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10))) { + flags &= ~BTRFS_BLOCK_GROUP_DUP; + } + + if ((flags & BTRFS_BLOCK_GROUP_RAID1) && + (flags & BTRFS_BLOCK_GROUP_RAID10)) { + flags &= ~BTRFS_BLOCK_GROUP_RAID1; + } + + if ((flags & BTRFS_BLOCK_GROUP_RAID0) && + ((flags & BTRFS_BLOCK_GROUP_RAID1) | + (flags & BTRFS_BLOCK_GROUP_RAID10) | + (flags & BTRFS_BLOCK_GROUP_DUP))) + flags &= ~BTRFS_BLOCK_GROUP_RAID0; + return flags; +} + +static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) +{ + struct btrfs_fs_info *info = root->fs_info; + u64 alloc_profile; + + if (data) { + alloc_profile = info->avail_data_alloc_bits & + info->data_alloc_profile; + data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; + } else if (root == root->fs_info->chunk_root) { + alloc_profile = info->avail_system_alloc_bits & + info->system_alloc_profile; + data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; + } else { + alloc_profile = info->avail_metadata_alloc_bits & + info->metadata_alloc_profile; + data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; + } + + return btrfs_reduce_alloc_profile(root, data); +} + +void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) +{ + u64 alloc_target; + + alloc_target = btrfs_get_alloc_profile(root, 1); + BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, + alloc_target); +} + +static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) +{ + u64 num_bytes; + int level; + + level = BTRFS_MAX_LEVEL - 2; + /* + * NOTE: these calculations are absolutely the worst possible case. + * This assumes that _every_ item we insert will require a new leaf, and + * that the tree has grown to its maximum level size. + */ + + /* + * for every item we insert we could insert both an extent item and a + * extent ref item. Then for ever item we insert, we will need to cow + * both the original leaf, plus the leaf to the left and right of it. + * + * Unless we are talking about the extent root, then we just want the + * number of items * 2, since we just need the extent item plus its ref. + */ + if (root == root->fs_info->extent_root) + num_bytes = num_items * 2; + else + num_bytes = (num_items + (2 * num_items)) * 3; + + /* + * num_bytes is total number of leaves we could need times the leaf + * size, and then for every leaf we could end up cow'ing 2 nodes per + * level, down to the leaf level. + */ + num_bytes = (num_bytes * root->leafsize) + + (num_bytes * (level * 2)) * root->nodesize; + + return num_bytes; +} + +/* + * Unreserve metadata space for delalloc. If we have less reserved credits than + * we have extents, this function does nothing. + */ +int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 alloc_target; + bool bug = false; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root->fs_info->extent_root, + num_items); + + spin_lock(&meta_sinfo->lock); + spin_lock(&BTRFS_I(inode)->accounting_lock); + if (BTRFS_I(inode)->reserved_extents <= + BTRFS_I(inode)->outstanding_extents) { + spin_unlock(&BTRFS_I(inode)->accounting_lock); + spin_unlock(&meta_sinfo->lock); + return 0; + } + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + BTRFS_I(inode)->reserved_extents--; + BUG_ON(BTRFS_I(inode)->reserved_extents < 0); + + if (meta_sinfo->bytes_delalloc < num_bytes) { + bug = true; + meta_sinfo->bytes_delalloc = 0; + } else { + meta_sinfo->bytes_delalloc -= num_bytes; + } + spin_unlock(&meta_sinfo->lock); + + BUG_ON(bug); + + return 0; +} + +static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) +{ + u64 thresh; + + thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use; + + thresh = meta_sinfo->total_bytes - thresh; + thresh *= 80; + do_div(thresh, 100); + if (thresh <= meta_sinfo->bytes_delalloc) + meta_sinfo->force_delalloc = 1; + else + meta_sinfo->force_delalloc = 0; +} + +struct async_flush { + struct btrfs_root *root; + struct btrfs_space_info *info; + struct btrfs_work work; +}; + +static noinline void flush_delalloc_async(struct btrfs_work *work) +{ + struct async_flush *async; + struct btrfs_root *root; + struct btrfs_space_info *info; + + async = container_of(work, struct async_flush, work); + root = async->root; + info = async->info; + + btrfs_start_delalloc_inodes(root, 0); + wake_up(&info->flush_wait); + btrfs_wait_ordered_extents(root, 0, 0); + + spin_lock(&info->lock); + info->flushing = 0; + spin_unlock(&info->lock); + wake_up(&info->flush_wait); + + kfree(async); +} + +static void wait_on_flush(struct btrfs_space_info *info) +{ + DEFINE_WAIT(wait); + u64 used; + + while (1) { + prepare_to_wait(&info->flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + spin_lock(&info->lock); + if (!info->flushing) { + spin_unlock(&info->lock); + break; + } + + used = info->bytes_used + info->bytes_reserved + + info->bytes_pinned + info->bytes_readonly + + info->bytes_super + info->bytes_root + + info->bytes_may_use + info->bytes_delalloc; + if (used < info->total_bytes) { + spin_unlock(&info->lock); + break; + } + spin_unlock(&info->lock); + schedule(); + } + finish_wait(&info->flush_wait, &wait); +} + +static void flush_delalloc(struct btrfs_root *root, + struct btrfs_space_info *info) +{ + struct async_flush *async; + bool wait = false; + + spin_lock(&info->lock); + + if (!info->flushing) { + info->flushing = 1; + init_waitqueue_head(&info->flush_wait); + } else { + wait = true; + } + + spin_unlock(&info->lock); + + if (wait) { + wait_on_flush(info); + return; + } + + async = kzalloc(sizeof(*async), GFP_NOFS); + if (!async) + goto flush; + + async->root = root; + async->info = info; + async->work.func = flush_delalloc_async; + + btrfs_queue_worker(&root->fs_info->enospc_workers, + &async->work); + wait_on_flush(info); + return; + +flush: + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0, 0); + + spin_lock(&info->lock); + info->flushing = 0; + spin_unlock(&info->lock); + wake_up(&info->flush_wait); +} + +static int maybe_allocate_chunk(struct btrfs_root *root, + struct btrfs_space_info *info) +{ + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_trans_handle *trans; + bool wait = false; + int ret = 0; + u64 min_metadata; + u64 free_space; + + free_space = btrfs_super_total_bytes(disk_super); + /* + * we allow the metadata to grow to a max of either 10gb or 5% of the + * space in the volume. + */ + min_metadata = min((u64)10 * 1024 * 1024 * 1024, + div64_u64(free_space * 5, 100)); + if (info->total_bytes >= min_metadata) { + spin_unlock(&info->lock); + return 0; + } + + if (info->full) { + spin_unlock(&info->lock); + return 0; + } + + if (!info->allocating_chunk) { + info->force_alloc = 1; + info->allocating_chunk = 1; + init_waitqueue_head(&info->allocate_wait); + } else { + wait = true; + } + + spin_unlock(&info->lock); + + if (wait) { + wait_event(info->allocate_wait, + !info->allocating_chunk); + return 1; + } + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto out; + } + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + 4096 + 2 * 1024 * 1024, + info->flags, 0); + btrfs_end_transaction(trans, root); + if (ret) + goto out; +out: + spin_lock(&info->lock); + info->allocating_chunk = 0; + spin_unlock(&info->lock); + wake_up(&info->allocate_wait); + + if (ret) + return 0; + return 1; +} + +/* + * Reserve metadata space for delalloc. + */ +int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 used; + u64 alloc_target; + int flushed = 0; + int force_delalloc; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root->fs_info->extent_root, + num_items); +again: + spin_lock(&meta_sinfo->lock); + + force_delalloc = meta_sinfo->force_delalloc; + + if (unlikely(!meta_sinfo->bytes_root)) + meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + + if (!flushed) + meta_sinfo->bytes_delalloc += num_bytes; + + used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + + if (used > meta_sinfo->total_bytes) { + flushed++; + + if (flushed == 1) { + if (maybe_allocate_chunk(root, meta_sinfo)) + goto again; + flushed++; + } else { + spin_unlock(&meta_sinfo->lock); + } + + if (flushed == 2) { + filemap_flush(inode->i_mapping); + goto again; + } else if (flushed == 3) { + flush_delalloc(root, meta_sinfo); + goto again; + } + spin_lock(&meta_sinfo->lock); + meta_sinfo->bytes_delalloc -= num_bytes; + spin_unlock(&meta_sinfo->lock); + printk(KERN_ERR "enospc, has %d, reserved %d\n", + BTRFS_I(inode)->outstanding_extents, + BTRFS_I(inode)->reserved_extents); + dump_space_info(meta_sinfo, 0, 0); + return -ENOSPC; + } + + BTRFS_I(inode)->reserved_extents++; + check_force_delalloc(meta_sinfo); + spin_unlock(&meta_sinfo->lock); + + if (!flushed && force_delalloc) + filemap_flush(inode->i_mapping); + + return 0; +} + +/* + * unreserve num_items number of items worth of metadata space. This needs to + * be paired with btrfs_reserve_metadata_space. + * + * NOTE: if you have the option, run this _AFTER_ you do a + * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref + * oprations which will result in more used metadata, so we want to make sure we + * can do that without issue. + */ +int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 alloc_target; + bool bug = false; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root, num_items); + + spin_lock(&meta_sinfo->lock); + if (meta_sinfo->bytes_may_use < num_bytes) { + bug = true; + meta_sinfo->bytes_may_use = 0; + } else { + meta_sinfo->bytes_may_use -= num_bytes; + } + spin_unlock(&meta_sinfo->lock); + + BUG_ON(bug); + + return 0; +} + +/* + * Reserve some metadata space for use. We'll calculate the worste case number + * of bytes that would be needed to modify num_items number of items. If we + * have space, fantastic, if not, you get -ENOSPC. Please call + * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of + * items you reserved, since whatever metadata you needed should have already + * been allocated. + * + * This will commit the transaction to make more space if we don't have enough + * metadata space. THe only time we don't do this is if we're reserving space + * inside of a transaction, then we will just return -ENOSPC and it is the + * callers responsibility to handle it properly. + */ +int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 used; + u64 alloc_target; + int retries = 0; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root, num_items); +again: + spin_lock(&meta_sinfo->lock); + + if (unlikely(!meta_sinfo->bytes_root)) + meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + + if (!retries) + meta_sinfo->bytes_may_use += num_bytes; + + used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + + if (used > meta_sinfo->total_bytes) { + retries++; + if (retries == 1) { + if (maybe_allocate_chunk(root, meta_sinfo)) + goto again; + retries++; + } else { + spin_unlock(&meta_sinfo->lock); + } + + if (retries == 2) { + flush_delalloc(root, meta_sinfo); + goto again; + } + spin_lock(&meta_sinfo->lock); + meta_sinfo->bytes_may_use -= num_bytes; + spin_unlock(&meta_sinfo->lock); + + dump_space_info(meta_sinfo, 0, 0); + return -ENOSPC; + } + + check_force_delalloc(meta_sinfo); + spin_unlock(&meta_sinfo->lock); + + return 0; +} + +/* + * This will check the space that the inode allocates from to make sure we have + * enough space for bytes. + */ +int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + int ret = 0, committed = 0; + + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + data_sinfo = BTRFS_I(inode)->space_info; + if (!data_sinfo) + goto alloc; + +again: + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + if (data_sinfo->total_bytes - data_sinfo->bytes_used - + data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - + data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - + data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) { + struct btrfs_trans_handle *trans; + + /* + * if we don't have enough free bytes in this space then we need + * to alloc a new chunk. + */ + if (!data_sinfo->full) { + u64 alloc_target; + + data_sinfo->force_alloc = 1; + spin_unlock(&data_sinfo->lock); +alloc: + alloc_target = btrfs_get_alloc_profile(root, 1); + trans = btrfs_start_transaction(root, 1); + if (!trans) + return -ENOMEM; + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + bytes + 2 * 1024 * 1024, + alloc_target, 0); + btrfs_end_transaction(trans, root); + if (ret) + return ret; + + if (!data_sinfo) { + btrfs_set_inode_space_info(root, inode); + data_sinfo = BTRFS_I(inode)->space_info; + } + goto again; + } + spin_unlock(&data_sinfo->lock); + + /* commit the current transaction and try again */ + if (!committed && !root->fs_info->open_ioctl_trans) { + committed = 1; + trans = btrfs_join_transaction(root, 1); + if (!trans) + return -ENOMEM; + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + goto again; + } + + printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" + ", %llu bytes_used, %llu bytes_reserved, " + "%llu bytes_pinned, %llu bytes_readonly, %llu may use " + "%llu total\n", (unsigned long long)bytes, + (unsigned long long)data_sinfo->bytes_delalloc, + (unsigned long long)data_sinfo->bytes_used, + (unsigned long long)data_sinfo->bytes_reserved, + (unsigned long long)data_sinfo->bytes_pinned, + (unsigned long long)data_sinfo->bytes_readonly, + (unsigned long long)data_sinfo->bytes_may_use, + (unsigned long long)data_sinfo->total_bytes); + return -ENOSPC; + } + data_sinfo->bytes_may_use += bytes; + BTRFS_I(inode)->reserved_bytes += bytes; + spin_unlock(&data_sinfo->lock); + + return 0; +} + +/* + * if there was an error for whatever reason after calling + * btrfs_check_data_free_space, call this so we can cleanup the counters. + */ +void btrfs_free_reserved_data_space(struct btrfs_root *root, + struct inode *inode, u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + data_sinfo = BTRFS_I(inode)->space_info; + spin_lock(&data_sinfo->lock); + data_sinfo->bytes_may_use -= bytes; + BTRFS_I(inode)->reserved_bytes -= bytes; + spin_unlock(&data_sinfo->lock); +} + +/* called when we are adding a delalloc extent to the inode's io_tree */ +void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, + u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + + /* get the space info for where this inode will be storing its data */ + data_sinfo = BTRFS_I(inode)->space_info; + + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + data_sinfo->bytes_delalloc += bytes; + + /* + * we are adding a delalloc extent without calling + * btrfs_check_data_free_space first. This happens on a weird + * writepage condition, but shouldn't hurt our accounting + */ + if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) { + data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes; + BTRFS_I(inode)->reserved_bytes = 0; + } else { + data_sinfo->bytes_may_use -= bytes; + BTRFS_I(inode)->reserved_bytes -= bytes; + } + + spin_unlock(&data_sinfo->lock); +} + +/* called when we are clearing an delalloc extent from the inode's io_tree */ +void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes) +{ + struct btrfs_space_info *info; + + info = BTRFS_I(inode)->space_info; + + spin_lock(&info->lock); + info->bytes_delalloc -= bytes; + spin_unlock(&info->lock); +} + +static void force_metadata_allocation(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + found->force_alloc = 1; + } + rcu_read_unlock(); +} + +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force) +{ + struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = extent_root->fs_info; + u64 thresh; + int ret = 0; + + mutex_lock(&fs_info->chunk_mutex); + + flags = btrfs_reduce_alloc_profile(extent_root, flags); + + space_info = __find_space_info(extent_root->fs_info, flags); + if (!space_info) { + ret = update_space_info(extent_root->fs_info, flags, + 0, 0, &space_info); + BUG_ON(ret); + } + BUG_ON(!space_info); + + spin_lock(&space_info->lock); + if (space_info->force_alloc) + force = 1; + if (space_info->full) { + spin_unlock(&space_info->lock); + goto out; + } + + thresh = space_info->total_bytes - space_info->bytes_readonly; + thresh = div_factor(thresh, 8); + if (!force && + (space_info->bytes_used + space_info->bytes_pinned + + space_info->bytes_reserved + alloc_bytes) < thresh) { + spin_unlock(&space_info->lock); + goto out; + } + spin_unlock(&space_info->lock); + + /* + * if we're doing a data chunk, go ahead and make sure that + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ + if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) + force_metadata_allocation(fs_info); + } + + ret = btrfs_alloc_chunk(trans, extent_root, flags); + spin_lock(&space_info->lock); + if (ret) + space_info->full = 1; + space_info->force_alloc = 0; + spin_unlock(&space_info->lock); +out: + mutex_unlock(&extent_root->fs_info->chunk_mutex); + return ret; +} + +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc, + int mark_free) +{ + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *info = root->fs_info; + u64 total = num_bytes; + u64 old_val; + u64 byte_in_group; + + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + old_val = btrfs_super_bytes_used(&info->super_copy); + if (alloc) + old_val += num_bytes; + else + old_val -= num_bytes; + btrfs_set_super_bytes_used(&info->super_copy, old_val); + spin_unlock(&info->delalloc_lock); + + while (total) { + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) + return -1; + byte_in_group = bytenr - cache->key.objectid; + WARN_ON(byte_in_group > cache->key.offset); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->dirty = 1; + old_val = btrfs_block_group_used(&cache->item); + num_bytes = min(total, cache->key.offset - byte_in_group); + if (alloc) { + old_val += num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + cache->reserved -= num_bytes; + cache->space_info->bytes_used += num_bytes; + cache->space_info->bytes_reserved -= num_bytes; + if (cache->ro) + cache->space_info->bytes_readonly -= num_bytes; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + } else { + old_val -= num_bytes; + cache->space_info->bytes_used -= num_bytes; + if (cache->ro) + cache->space_info->bytes_readonly += num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + if (mark_free) { + int ret; + + ret = btrfs_discard_extent(root, bytenr, + num_bytes); + WARN_ON(ret); + + ret = btrfs_add_free_space(cache, bytenr, + num_bytes); + WARN_ON(ret); + } + } + btrfs_put_block_group(cache); + total -= num_bytes; + bytenr += num_bytes; + } + return 0; +} + +static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) +{ + struct btrfs_block_group_cache *cache; + u64 bytenr; + + cache = btrfs_lookup_first_block_group(root->fs_info, search_start); + if (!cache) + return 0; + + bytenr = cache->key.objectid; + btrfs_put_block_group(cache); + + return bytenr; +} + +/* + * this function must be called within transaction + */ +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int reserved) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(fs_info, bytenr); + BUG_ON(!cache); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + if (reserved) { + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + btrfs_put_block_group(cache); + + set_extent_dirty(fs_info->pinned_extents, + bytenr, bytenr + num_bytes - 1, GFP_NOFS); + return 0; +} + +static int update_reserved_extents(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve) +{ + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + if (reserve) { + cache->reserved += num_bytes; + cache->space_info->bytes_reserved += num_bytes; + } else { + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + return 0; +} + +int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_caching_control *next; + struct btrfs_caching_control *caching_ctl; + struct btrfs_block_group_cache *cache; + + down_write(&fs_info->extent_commit_sem); + + list_for_each_entry_safe(caching_ctl, next, + &fs_info->caching_block_groups, list) { + cache = caching_ctl->block_group; + if (block_group_cache_done(cache)) { + cache->last_byte_to_unpin = (u64)-1; + list_del_init(&caching_ctl->list); + put_caching_control(caching_ctl); + } else { + cache->last_byte_to_unpin = caching_ctl->progress; + } + } + + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) + fs_info->pinned_extents = &fs_info->freed_extents[1]; + else + fs_info->pinned_extents = &fs_info->freed_extents[0]; + + up_write(&fs_info->extent_commit_sem); + return 0; +} + +static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache = NULL; + u64 len; + + while (start <= end) { + if (!cache || + start >= cache->key.objectid + cache->key.offset) { + if (cache) + btrfs_put_block_group(cache); + cache = btrfs_lookup_block_group(fs_info, start); + BUG_ON(!cache); + } + + len = cache->key.objectid + cache->key.offset - start; + len = min(len, end + 1 - start); + + if (start < cache->last_byte_to_unpin) { + len = min(len, cache->last_byte_to_unpin - start); + btrfs_add_free_space(cache, start, len); + } + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned -= len; + cache->space_info->bytes_pinned -= len; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + start += len; + } + + if (cache) + btrfs_put_block_group(cache); + return 0; +} + +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *unpin; + u64 start; + u64 end; + int ret; + + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) + unpin = &fs_info->freed_extents[1]; + else + unpin = &fs_info->freed_extents[0]; + + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + ret = btrfs_discard_extent(root, start, end + 1 - start); + + clear_extent_dirty(unpin, start, end, GFP_NOFS); + unpin_extent_range(root, start, end); + cond_resched(); + } + + return ret; +} + +static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes, + int is_data, int reserved, + struct extent_buffer **must_clean) +{ + int err = 0; + struct extent_buffer *buf; + + if (is_data) + goto pinit; + + /* + * discard is sloooow, and so triggering discards on + * individual btree blocks isn't a good plan. Just + * pin everything in discard mode. + */ + if (btrfs_test_opt(root, DISCARD)) + goto pinit; + + buf = btrfs_find_tree_block(root, bytenr, num_bytes); + if (!buf) + goto pinit; + + /* we can reuse a block if it hasn't been written + * and it is from this transaction. We can't + * reuse anything from the tree log root because + * it has tiny sub-transactions. + */ + if (btrfs_buffer_uptodate(buf, 0) && + btrfs_try_tree_lock(buf)) { + u64 header_owner = btrfs_header_owner(buf); + u64 header_transid = btrfs_header_generation(buf); + if (header_owner != BTRFS_TREE_LOG_OBJECTID && + header_transid == trans->transid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + *must_clean = buf; + return 1; + } + btrfs_tree_unlock(buf); + } + free_extent_buffer(buf); +pinit: + if (path) + btrfs_set_path_blocking(path); + /* unlocks the pinned mutex */ + btrfs_pin_extent(root, bytenr, num_bytes, reserved); + + BUG_ON(err < 0); + return 0; +} + +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_root *extent_root = info->extent_root; + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + int ret; + int is_data; + int extent_slot = 0; + int found_extent = 0; + int num_to_del = 1; + u32 item_size; + u64 refs; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + path->leave_spinning = 1; + + is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; + BUG_ON(!is_data && refs_to_drop != 1); + + ret = lookup_extent_backref(trans, extent_root, path, &iref, + bytenr, num_bytes, parent, + root_objectid, owner_objectid, + owner_offset); + if (ret == 0) { + extent_slot = path->slots[0]; + while (extent_slot >= 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + extent_slot); + if (key.objectid != bytenr) + break; + if (key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) { + found_extent = 1; + break; + } + if (path->slots[0] - extent_slot > 5) + break; + extent_slot--; + } +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); + if (found_extent && item_size < sizeof(*ei)) + found_extent = 0; +#endif + if (!found_extent) { + BUG_ON(iref); + ret = remove_extent_backref(trans, extent_root, path, + NULL, refs_to_drop, + is_data); + BUG_ON(ret); + btrfs_release_path(extent_root, path); + path->leave_spinning = 1; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + ret = btrfs_search_slot(trans, extent_root, + &key, path, -1, 1); + if (ret) { + printk(KERN_ERR "umm, got %d back from search" + ", was looking for %llu\n", ret, + (unsigned long long)bytenr); + btrfs_print_leaf(extent_root, path->nodes[0]); + } + BUG_ON(ret); + extent_slot = path->slots[0]; + } + } else { + btrfs_print_leaf(extent_root, path->nodes[0]); + WARN_ON(1); + printk(KERN_ERR "btrfs unable to find ref byte nr %llu " + "parent %llu root %llu owner %llu offset %llu\n", + (unsigned long long)bytenr, + (unsigned long long)parent, + (unsigned long long)root_objectid, + (unsigned long long)owner_objectid, + (unsigned long long)owner_offset); + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, extent_slot); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + BUG_ON(found_extent || extent_slot != path->slots[0]); + ret = convert_extent_item_v0(trans, extent_root, path, + owner_objectid, 0); + BUG_ON(ret < 0); + + btrfs_release_path(extent_root, path); + path->leave_spinning = 1; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + ret = btrfs_search_slot(trans, extent_root, &key, path, + -1, 1); + if (ret) { + printk(KERN_ERR "umm, got %d back from search" + ", was looking for %llu\n", ret, + (unsigned long long)bytenr); + btrfs_print_leaf(extent_root, path->nodes[0]); + } + BUG_ON(ret); + extent_slot = path->slots[0]; + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, extent_slot); + } +#endif + BUG_ON(item_size < sizeof(*ei)); + ei = btrfs_item_ptr(leaf, extent_slot, + struct btrfs_extent_item); + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + struct btrfs_tree_block_info *bi; + BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); + bi = (struct btrfs_tree_block_info *)(ei + 1); + WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); + } + + refs = btrfs_extent_refs(leaf, ei); + BUG_ON(refs < refs_to_drop); + refs -= refs_to_drop; + + if (refs > 0) { + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + /* + * In the case of inline back ref, reference count will + * be updated by remove_extent_backref + */ + if (iref) { + BUG_ON(!found_extent); + } else { + btrfs_set_extent_refs(leaf, ei, refs); + btrfs_mark_buffer_dirty(leaf); + } + if (found_extent) { + ret = remove_extent_backref(trans, extent_root, path, + iref, refs_to_drop, + is_data); + BUG_ON(ret); + } + } else { + int mark_free = 0; + struct extent_buffer *must_clean = NULL; + + if (found_extent) { + BUG_ON(is_data && refs_to_drop != + extent_data_ref_count(root, path, iref)); + if (iref) { + BUG_ON(path->slots[0] != extent_slot); + } else { + BUG_ON(path->slots[0] != extent_slot + 1); + path->slots[0] = extent_slot; + num_to_del = 2; + } + } + + ret = pin_down_bytes(trans, root, path, bytenr, + num_bytes, is_data, 0, &must_clean); + if (ret > 0) + mark_free = 1; + BUG_ON(ret < 0); + /* + * it is going to be very rare for someone to be waiting + * on the block we're freeing. del_items might need to + * schedule, so rather than get fancy, just force it + * to blocking here + */ + if (must_clean) + btrfs_set_lock_blocking(must_clean); + + ret = btrfs_del_items(trans, extent_root, path, path->slots[0], + num_to_del); + BUG_ON(ret); + btrfs_release_path(extent_root, path); + + if (must_clean) { + clean_tree_block(NULL, root, must_clean); + btrfs_tree_unlock(must_clean); + free_extent_buffer(must_clean); + } + + if (is_data) { + ret = btrfs_del_csums(trans, root, bytenr, num_bytes); + BUG_ON(ret); + } else { + invalidate_mapping_pages(info->btree_inode->i_mapping, + bytenr >> PAGE_CACHE_SHIFT, + (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); + } + + ret = update_block_group(trans, root, bytenr, num_bytes, 0, + mark_free); + BUG_ON(ret); + } + btrfs_free_path(path); + return ret; +} + +/* + * when we free an extent, it is possible (and likely) that we free the last + * delayed ref for that extent as well. This searches the delayed ref tree for + * a given extent, and if there are no other delayed refs to be processed, it + * removes it from the tree. + */ +static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct rb_node *node; + int ret; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (!head) + goto out; + + node = rb_prev(&head->node.rb_node); + if (!node) + goto out; + + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + + /* there are still entries for this ref, we can't drop it */ + if (ref->bytenr == bytenr) + goto out; + + if (head->extent_op) { + if (!head->must_insert_reserved) + goto out; + kfree(head->extent_op); + head->extent_op = NULL; + } + + /* + * waiting for the lock here would deadlock. If someone else has it + * locked they are already in the process of dropping it anyway + */ + if (!mutex_trylock(&head->mutex)) + goto out; + + /* + * at this point we have a head with no other entries. Go + * ahead and process it. + */ + head->node.in_tree = 0; + rb_erase(&head->node.rb_node, &delayed_refs->root); + + delayed_refs->num_entries--; + + /* + * we don't take a ref on the node because we're removing it from the + * tree, so we just steal the ref the tree was holding. + */ + delayed_refs->num_heads--; + if (list_empty(&head->cluster)) + delayed_refs->num_heads_ready--; + + list_del_init(&head->cluster); + spin_unlock(&delayed_refs->lock); + + ret = run_one_delayed_ref(trans, root->fs_info->tree_root, + &head->node, head->extent_op, + head->must_insert_reserved); + BUG_ON(ret); + btrfs_put_delayed_ref(&head->node); + return 0; +out: + spin_unlock(&delayed_refs->lock); + return 0; +} + +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) +{ + int ret; + + /* + * tree log blocks never actually go into the extent allocation + * tree, just update pinning info and exit early. + */ + if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { + WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); + /* unlocks the pinned mutex */ + btrfs_pin_extent(root, bytenr, num_bytes, 1); + ret = 0; + } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + parent, root_objectid, (int)owner, + BTRFS_DROP_DELAYED_REF, NULL); + BUG_ON(ret); + ret = check_ref_cleanup(trans, root, bytenr); + BUG_ON(ret); + } else { + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + parent, root_objectid, owner, + offset, BTRFS_DROP_DELAYED_REF, NULL); + BUG_ON(ret); + } + return ret; +} + +int btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize, + u64 parent, u64 root_objectid, int level) +{ + u64 used; + spin_lock(&root->node_lock); + used = btrfs_root_used(&root->root_item) - blocksize; + btrfs_set_root_used(&root->root_item, used); + spin_unlock(&root->node_lock); + + return btrfs_free_extent(trans, root, bytenr, blocksize, + parent, root_objectid, level, 0); +} + +static u64 stripe_align(struct btrfs_root *root, u64 val) +{ + u64 mask = ((u64)root->stripesize - 1); + u64 ret = (val + mask) & ~mask; + return ret; +} + +/* + * when we wait for progress in the block group caching, its because + * our allocation attempt failed at least once. So, we must sleep + * and let some progress happen before we try again. + * + * This function will sleep at least once waiting for new free space to + * show up, and then it will check the block group free space numbers + * for our min num_bytes. Another option is to have it go ahead + * and look in the rbtree for a free extent of a given size, but this + * is a good start. + */ +static noinline int +wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, + u64 num_bytes) +{ + struct btrfs_caching_control *caching_ctl; + DEFINE_WAIT(wait); + + caching_ctl = get_caching_control(cache); + if (!caching_ctl) + return 0; + + wait_event(caching_ctl->wait, block_group_cache_done(cache) || + (cache->free_space >= num_bytes)); + + put_caching_control(caching_ctl); + return 0; +} + +static noinline int +wait_block_group_cache_done(struct btrfs_block_group_cache *cache) +{ + struct btrfs_caching_control *caching_ctl; + DEFINE_WAIT(wait); + + caching_ctl = get_caching_control(cache); + if (!caching_ctl) + return 0; + + wait_event(caching_ctl->wait, block_group_cache_done(cache)); + + put_caching_control(caching_ctl); + return 0; +} + +enum btrfs_loop_type { + LOOP_FIND_IDEAL = 0, + LOOP_CACHING_NOWAIT = 1, + LOOP_CACHING_WAIT = 2, + LOOP_ALLOC_CHUNK = 3, + LOOP_NO_EMPTY_SIZE = 4, +}; + +/* + * walks the btree of allocated extents and find a hole of a given size. + * The key ins is changed to record the hole: + * ins->objectid == block start + * ins->flags = BTRFS_EXTENT_ITEM_KEY + * ins->offset == number of blocks + * Any available blocks before search_start are skipped. + */ +static noinline int find_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *orig_root, + u64 num_bytes, u64 empty_size, + u64 search_start, u64 search_end, + u64 hint_byte, struct btrfs_key *ins, + u64 exclude_start, u64 exclude_nr, + int data) +{ + int ret = 0; + struct btrfs_root *root = orig_root->fs_info->extent_root; + struct btrfs_free_cluster *last_ptr = NULL; + struct btrfs_block_group_cache *block_group = NULL; + int empty_cluster = 2 * 1024 * 1024; + int allowed_chunk_alloc = 0; + int done_chunk_alloc = 0; + struct btrfs_space_info *space_info; + int last_ptr_loop = 0; + int loop = 0; + bool found_uncached_bg = false; + bool failed_cluster_refill = false; + bool failed_alloc = false; + u64 ideal_cache_percent = 0; + u64 ideal_cache_offset = 0; + + WARN_ON(num_bytes < root->sectorsize); + btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); + ins->objectid = 0; + ins->offset = 0; + + space_info = __find_space_info(root->fs_info, data); + + if (orig_root->ref_cows || empty_size) + allowed_chunk_alloc = 1; + + if (data & BTRFS_BLOCK_GROUP_METADATA) { + last_ptr = &root->fs_info->meta_alloc_cluster; + if (!btrfs_test_opt(root, SSD)) + empty_cluster = 64 * 1024; + } + + if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { + last_ptr = &root->fs_info->data_alloc_cluster; + } + + if (last_ptr) { + spin_lock(&last_ptr->lock); + if (last_ptr->block_group) + hint_byte = last_ptr->window_start; + spin_unlock(&last_ptr->lock); + } + + search_start = max(search_start, first_logical_byte(root, 0)); + search_start = max(search_start, hint_byte); + + if (!last_ptr) + empty_cluster = 0; + + if (search_start == hint_byte) { +ideal_cache: + block_group = btrfs_lookup_block_group(root->fs_info, + search_start); + /* + * we don't want to use the block group if it doesn't match our + * allocation bits, or if its not cached. + * + * However if we are re-searching with an ideal block group + * picked out then we don't care that the block group is cached. + */ + if (block_group && block_group_bits(block_group, data) && + (block_group->cached != BTRFS_CACHE_NO || + search_start == ideal_cache_offset)) { + down_read(&space_info->groups_sem); + if (list_empty(&block_group->list) || + block_group->ro) { + /* + * someone is removing this block group, + * we can't jump into the have_block_group + * target because our list pointers are not + * valid + */ + btrfs_put_block_group(block_group); + up_read(&space_info->groups_sem); + } else { + goto have_block_group; + } + } else if (block_group) { + btrfs_put_block_group(block_group); + } + } +search: + down_read(&space_info->groups_sem); + list_for_each_entry(block_group, &space_info->block_groups, list) { + u64 offset; + int cached; + + btrfs_get_block_group(block_group); + search_start = block_group->key.objectid; + +have_block_group: + if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + u64 free_percent; + + free_percent = btrfs_block_group_used(&block_group->item); + free_percent *= 100; + free_percent = div64_u64(free_percent, + block_group->key.offset); + free_percent = 100 - free_percent; + if (free_percent > ideal_cache_percent && + likely(!block_group->ro)) { + ideal_cache_offset = block_group->key.objectid; + ideal_cache_percent = free_percent; + } + + /* + * We only want to start kthread caching if we are at + * the point where we will wait for caching to make + * progress, or if our ideal search is over and we've + * found somebody to start caching. + */ + if (loop > LOOP_CACHING_NOWAIT || + (loop > LOOP_FIND_IDEAL && + atomic_read(&space_info->caching_threads) < 2)) { + ret = cache_block_group(block_group); + BUG_ON(ret); + } + found_uncached_bg = true; + + /* + * If loop is set for cached only, try the next block + * group. + */ + if (loop == LOOP_FIND_IDEAL) + goto loop; + } + + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) + found_uncached_bg = true; + + if (unlikely(block_group->ro)) + goto loop; + + /* + * Ok we want to try and use the cluster allocator, so lets look + * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will + * have tried the cluster allocator plenty of times at this + * point and not have found anything, so we are likely way too + * fragmented for the clustering stuff to find anything, so lets + * just skip it and let the allocator find whatever block it can + * find + */ + if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { + /* + * the refill lock keeps out other + * people trying to start a new cluster + */ + spin_lock(&last_ptr->refill_lock); + if (last_ptr->block_group && + (last_ptr->block_group->ro || + !block_group_bits(last_ptr->block_group, data))) { + offset = 0; + goto refill_cluster; + } + + offset = btrfs_alloc_from_cluster(block_group, last_ptr, + num_bytes, search_start); + if (offset) { + /* we have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); + goto checks; + } + + spin_lock(&last_ptr->lock); + /* + * whoops, this cluster doesn't actually point to + * this block group. Get a ref on the block + * group is does point to and try again + */ + if (!last_ptr_loop && last_ptr->block_group && + last_ptr->block_group != block_group) { + + btrfs_put_block_group(block_group); + block_group = last_ptr->block_group; + btrfs_get_block_group(block_group); + spin_unlock(&last_ptr->lock); + spin_unlock(&last_ptr->refill_lock); + + last_ptr_loop = 1; + search_start = block_group->key.objectid; + /* + * we know this block group is properly + * in the list because + * btrfs_remove_block_group, drops the + * cluster before it removes the block + * group from the list + */ + goto have_block_group; + } + spin_unlock(&last_ptr->lock); +refill_cluster: + /* + * this cluster didn't work out, free it and + * start over + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + + last_ptr_loop = 0; + + /* allocate a cluster in this block group */ + ret = btrfs_find_space_cluster(trans, root, + block_group, last_ptr, + offset, num_bytes, + empty_cluster + empty_size); + if (ret == 0) { + /* + * now pull our allocation out of this + * cluster + */ + offset = btrfs_alloc_from_cluster(block_group, + last_ptr, num_bytes, + search_start); + if (offset) { + /* we found one, proceed */ + spin_unlock(&last_ptr->refill_lock); + goto checks; + } + } else if (!cached && loop > LOOP_CACHING_NOWAIT + && !failed_cluster_refill) { + spin_unlock(&last_ptr->refill_lock); + + failed_cluster_refill = true; + wait_block_group_cache_progress(block_group, + num_bytes + empty_cluster + empty_size); + goto have_block_group; + } + + /* + * at this point we either didn't find a cluster + * or we weren't able to allocate a block from our + * cluster. Free the cluster we've been trying + * to use, and go to the next block group + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + spin_unlock(&last_ptr->refill_lock); + goto loop; + } + + offset = btrfs_find_space_for_alloc(block_group, search_start, + num_bytes, empty_size); + /* + * If we didn't find a chunk, and we haven't failed on this + * block group before, and this block group is in the middle of + * caching and we are ok with waiting, then go ahead and wait + * for progress to be made, and set failed_alloc to true. + * + * If failed_alloc is true then we've already waited on this + * block group once and should move on to the next block group. + */ + if (!offset && !failed_alloc && !cached && + loop > LOOP_CACHING_NOWAIT) { + wait_block_group_cache_progress(block_group, + num_bytes + empty_size); + failed_alloc = true; + goto have_block_group; + } else if (!offset) { + goto loop; + } +checks: + search_start = stripe_align(root, offset); + /* move on to the next group */ + if (search_start + num_bytes >= search_end) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; + } + + /* move on to the next group */ + if (search_start + num_bytes > + block_group->key.objectid + block_group->key.offset) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; + } + + if (exclude_nr > 0 && + (search_start + num_bytes > exclude_start && + search_start < exclude_start + exclude_nr)) { + search_start = exclude_start + exclude_nr; + + btrfs_add_free_space(block_group, offset, num_bytes); + /* + * if search_start is still in this block group + * then we just re-search this block group + */ + if (search_start >= block_group->key.objectid && + search_start < (block_group->key.objectid + + block_group->key.offset)) + goto have_block_group; + goto loop; + } + + ins->objectid = search_start; + ins->offset = num_bytes; + + if (offset < search_start) + btrfs_add_free_space(block_group, offset, + search_start - offset); + BUG_ON(offset > search_start); + + update_reserved_extents(block_group, num_bytes, 1); + + /* we are all good, lets return */ + break; +loop: + failed_cluster_refill = false; + failed_alloc = false; + btrfs_put_block_group(block_group); + } + up_read(&space_info->groups_sem); + + /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for + * for them to make caching progress. Also + * determine the best possible bg to cache + * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking + * caching kthreads as we move along + * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_ALLOC_CHUNK, force a chunk allocation and try again + * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try + * again + */ + if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && + (found_uncached_bg || empty_size || empty_cluster || + allowed_chunk_alloc)) { + if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { + found_uncached_bg = false; + loop++; + if (!ideal_cache_percent && + atomic_read(&space_info->caching_threads)) + goto search; + + /* + * 1 of the following 2 things have happened so far + * + * 1) We found an ideal block group for caching that + * is mostly full and will cache quickly, so we might + * as well wait for it. + * + * 2) We searched for cached only and we didn't find + * anything, and we didn't start any caching kthreads + * either, so chances are we will loop through and + * start a couple caching kthreads, and then come back + * around and just wait for them. This will be slower + * because we will have 2 caching kthreads reading at + * the same time when we could have just started one + * and waited for it to get far enough to give us an + * allocation, so go ahead and go to the wait caching + * loop. + */ + loop = LOOP_CACHING_WAIT; + search_start = ideal_cache_offset; + ideal_cache_percent = 0; + goto ideal_cache; + } else if (loop == LOOP_FIND_IDEAL) { + /* + * Didn't find a uncached bg, wait on anything we find + * next. + */ + loop = LOOP_CACHING_WAIT; + goto search; + } + + if (loop < LOOP_CACHING_WAIT) { + loop++; + goto search; + } + + if (loop == LOOP_ALLOC_CHUNK) { + empty_size = 0; + empty_cluster = 0; + } + + if (allowed_chunk_alloc) { + ret = do_chunk_alloc(trans, root, num_bytes + + 2 * 1024 * 1024, data, 1); + allowed_chunk_alloc = 0; + done_chunk_alloc = 1; + } else if (!done_chunk_alloc) { + space_info->force_alloc = 1; + } + + if (loop < LOOP_NO_EMPTY_SIZE) { + loop++; + goto search; + } + ret = -ENOSPC; + } else if (!ins->objectid) { + ret = -ENOSPC; + } + + /* we found what we needed */ + if (ins->objectid) { + if (!(data & BTRFS_BLOCK_GROUP_DATA)) + trans->block_group = block_group->key.objectid; + + btrfs_put_block_group(block_group); + ret = 0; + } + + return ret; +} + +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups) +{ + struct btrfs_block_group_cache *cache; + + spin_lock(&info->lock); + printk(KERN_INFO "space_info has %llu free, is %sfull\n", + (unsigned long long)(info->total_bytes - info->bytes_used - + info->bytes_pinned - info->bytes_reserved - + info->bytes_super), + (info->full) ? "" : "not "); + printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," + " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" + "\n", + (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_pinned, + (unsigned long long)info->bytes_delalloc, + (unsigned long long)info->bytes_may_use, + (unsigned long long)info->bytes_used, + (unsigned long long)info->bytes_root, + (unsigned long long)info->bytes_super, + (unsigned long long)info->bytes_reserved); + spin_unlock(&info->lock); + + if (!dump_block_groups) + return; + + down_read(&info->groups_sem); + list_for_each_entry(cache, &info->block_groups, list) { + spin_lock(&cache->lock); + printk(KERN_INFO "block group %llu has %llu bytes, %llu used " + "%llu pinned %llu reserved\n", + (unsigned long long)cache->key.objectid, + (unsigned long long)cache->key.offset, + (unsigned long long)btrfs_block_group_used(&cache->item), + (unsigned long long)cache->pinned, + (unsigned long long)cache->reserved); + btrfs_dump_free_space(cache, bytes); + spin_unlock(&cache->lock); + } + up_read(&info->groups_sem); +} + +int btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data) +{ + int ret; + u64 search_start = 0; + + data = btrfs_get_alloc_profile(root, data); +again: + /* + * the only place that sets empty_size is btrfs_realloc_node, which + * is not called recursively on allocations + */ + if (empty_size || root->ref_cows) + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes + 2 * 1024 * 1024, data, 0); + + WARN_ON(num_bytes < root->sectorsize); + ret = find_free_extent(trans, root, num_bytes, empty_size, + search_start, search_end, hint_byte, ins, + trans->alloc_exclude_start, + trans->alloc_exclude_nr, data); + + if (ret == -ENOSPC && num_bytes > min_alloc_size) { + num_bytes = num_bytes >> 1; + num_bytes = num_bytes & ~(root->sectorsize - 1); + num_bytes = max(num_bytes, min_alloc_size); + do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes, data, 1); + goto again; + } + if (ret == -ENOSPC) { + struct btrfs_space_info *sinfo; + + sinfo = __find_space_info(root->fs_info, data); + printk(KERN_ERR "btrfs allocation failed flags %llu, " + "wanted %llu\n", (unsigned long long)data, + (unsigned long long)num_bytes); + dump_space_info(sinfo, num_bytes, 1); + } + + return ret; +} + +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) +{ + struct btrfs_block_group_cache *cache; + int ret = 0; + + cache = btrfs_lookup_block_group(root->fs_info, start); + if (!cache) { + printk(KERN_ERR "Unable to find block group for %llu\n", + (unsigned long long)start); + return -ENOSPC; + } + + ret = btrfs_discard_extent(root, start, len); + + btrfs_add_free_space(cache, start, len); + update_reserved_extents(cache, len, 0); + btrfs_put_block_group(cache); + + return ret; +} + +static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, u64 owner, u64 offset, + struct btrfs_key *ins, int ref_mod) +{ + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_extent_item *extent_item; + struct btrfs_extent_inline_ref *iref; + struct btrfs_path *path; + struct extent_buffer *leaf; + int type; + u32 size; + + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; + + size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, + ins, size); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, extent_item, ref_mod); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_DATA); + + iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (parent > 0) { + struct btrfs_shared_data_ref *ref; + ref = (struct btrfs_shared_data_ref *)(iref + 1); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); + } else { + struct btrfs_extent_data_ref *ref; + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); + } + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + + ret = update_block_group(trans, root, ins->objectid, ins->offset, + 1, 0); + if (ret) { + printk(KERN_ERR "btrfs update block group failed for %llu " + "%llu\n", (unsigned long long)ins->objectid, + (unsigned long long)ins->offset); + BUG(); + } + return ret; +} + +static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins) +{ + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_extent_item *extent_item; + struct btrfs_tree_block_info *block_info; + struct btrfs_extent_inline_ref *iref; + struct btrfs_path *path; + struct extent_buffer *leaf; + u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, + ins, size); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, extent_item, 1); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); + block_info = (struct btrfs_tree_block_info *)(extent_item + 1); + + btrfs_set_tree_block_key(leaf, block_info, key); + btrfs_set_tree_block_level(leaf, block_info, level); + + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + if (parent > 0) { + BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_SHARED_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_TREE_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + ret = update_block_group(trans, root, ins->objectid, ins->offset, + 1, 0); + if (ret) { + printk(KERN_ERR "btrfs update block group failed for %llu " + "%llu\n", (unsigned long long)ins->objectid, + (unsigned long long)ins->offset); + BUG(); + } + return ret; +} + +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, + u64 offset, struct btrfs_key *ins) +{ + int ret; + + BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + + ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, + 0, root_objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT, NULL); + return ret; +} + +/* + * this is used by the tree logging recovery code. It records that + * an extent has been allocated and makes sure to clear the free + * space cache bits as well + */ +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins) +{ + int ret; + struct btrfs_block_group_cache *block_group; + struct btrfs_caching_control *caching_ctl; + u64 start = ins->objectid; + u64 num_bytes = ins->offset; + + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + cache_block_group(block_group); + caching_ctl = get_caching_control(block_group); + + if (!caching_ctl) { + BUG_ON(!block_group_cache_done(block_group)); + ret = btrfs_remove_free_space(block_group, start, num_bytes); + BUG_ON(ret); + } else { + mutex_lock(&caching_ctl->mutex); + + if (start >= caching_ctl->progress) { + ret = add_excluded_extent(root, start, num_bytes); + BUG_ON(ret); + } else if (start + num_bytes <= caching_ctl->progress) { + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + BUG_ON(ret); + } else { + num_bytes = caching_ctl->progress - start; + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + BUG_ON(ret); + + start = caching_ctl->progress; + num_bytes = ins->objectid + ins->offset - + caching_ctl->progress; + ret = add_excluded_extent(root, start, num_bytes); + BUG_ON(ret); + } + + mutex_unlock(&caching_ctl->mutex); + put_caching_control(caching_ctl); + } + + update_reserved_extents(block_group, ins->offset, 1); + btrfs_put_block_group(block_group); + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, + 0, owner, offset, ins, 1); + return ret; +} + +/* + * finds a free extent and does all the dirty work required for allocation + * returns the key for the extent through ins, and a tree buffer for + * the first block of the extent through buf. + * + * returns 0 if everything worked, non-zero otherwise. + */ +static int alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 empty_size, u64 hint_byte, u64 search_end, + struct btrfs_key *ins) +{ + int ret; + u64 flags = 0; + + ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes, + empty_size, hint_byte, search_end, + ins, 0); + if (ret) + return ret; + + if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent == 0) + parent = ins->objectid; + flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else + BUG_ON(parent > 0); + + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_delayed_extent_op *extent_op; + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + extent_op->update_key = 1; + extent_op->update_flags = 1; + extent_op->is_data = 0; + + ret = btrfs_add_delayed_tree_ref(trans, ins->objectid, + ins->offset, parent, root_objectid, + level, BTRFS_ADD_DELAYED_EXTENT, + extent_op); + BUG_ON(ret); + } + + if (root_objectid == root->root_key.objectid) { + u64 used; + spin_lock(&root->node_lock); + used = btrfs_root_used(&root->root_item) + num_bytes; + btrfs_set_root_used(&root->root_item, used); + spin_unlock(&root->node_lock); + } + return ret; +} + +struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize, + int level) +{ + struct extent_buffer *buf; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return ERR_PTR(-ENOMEM); + btrfs_set_header_generation(buf, trans->transid); + btrfs_set_buffer_lockdep_class(buf, level); + btrfs_tree_lock(buf); + clean_tree_block(trans, root, buf); + + btrfs_set_lock_blocking(buf); + btrfs_set_buffer_uptodate(buf); + + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + /* + * we allow two log transactions at a time, use different + * EXENT bit to differentiate dirty pages. + */ + if (root->log_transid % 2 == 0) + set_extent_dirty(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + else + set_extent_new(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + } else { + set_extent_dirty(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + } + trans->blocks_used++; + /* this returns a buffer locked for blocking */ + return buf; +} + +/* + * helper function to allocate a block for a given tree + * returns the tree buffer or NULL. + */ +struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u32 blocksize, + u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 hint, u64 empty_size) +{ + struct btrfs_key ins; + int ret; + struct extent_buffer *buf; + + ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, + key, level, empty_size, hint, (u64)-1, &ins); + if (ret) { + BUG_ON(ret > 0); + return ERR_PTR(ret); + } + + buf = btrfs_init_new_buffer(trans, root, ins.objectid, + blocksize, level); + return buf; +} + +struct walk_control { + u64 refs[BTRFS_MAX_LEVEL]; + u64 flags[BTRFS_MAX_LEVEL]; + struct btrfs_key update_progress; + int stage; + int level; + int shared_level; + int update_ref; + int keep_locks; + int reada_slot; + int reada_count; +}; + +#define DROP_REFERENCE 1 +#define UPDATE_BACKREF 2 + +static noinline void reada_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct walk_control *wc, + struct btrfs_path *path) +{ + u64 bytenr; + u64 generation; + u64 refs; + u64 flags; + u64 last = 0; + u32 nritems; + u32 blocksize; + struct btrfs_key key; + struct extent_buffer *eb; + int ret; + int slot; + int nread = 0; + + if (path->slots[wc->level] < wc->reada_slot) { + wc->reada_count = wc->reada_count * 2 / 3; + wc->reada_count = max(wc->reada_count, 2); + } else { + wc->reada_count = wc->reada_count * 3 / 2; + wc->reada_count = min_t(int, wc->reada_count, + BTRFS_NODEPTRS_PER_BLOCK(root)); + } + + eb = path->nodes[wc->level]; + nritems = btrfs_header_nritems(eb); + blocksize = btrfs_level_size(root, wc->level - 1); + + for (slot = path->slots[wc->level]; slot < nritems; slot++) { + if (nread >= wc->reada_count) + break; + + cond_resched(); + bytenr = btrfs_node_blockptr(eb, slot); + generation = btrfs_node_ptr_generation(eb, slot); + + if (slot == path->slots[wc->level]) + goto reada; + + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) + continue; + + /* We don't lock the tree block, it's OK to be racy here */ + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &refs, &flags); + BUG_ON(ret); + BUG_ON(refs == 0); + + if (wc->stage == DROP_REFERENCE) { + if (refs == 1) + goto reada; + + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; + if (!wc->update_ref || + generation <= root->root_key.offset) + continue; + btrfs_node_key_to_cpu(eb, &key, slot); + ret = btrfs_comp_cpu_keys(&key, + &wc->update_progress); + if (ret < 0) + continue; + } else { + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; + } +reada: + ret = readahead_tree_block(root, bytenr, blocksize, + generation); + if (ret) + break; + last = bytenr + blocksize; + nread++; + } + wc->reada_slot = slot; +} + +/* + * hepler to process tree block while walking down the tree. + * + * when wc->stage == UPDATE_BACKREF, this function updates + * back refs for pointers in the block. + * + * NOTE: return value 1 means we should stop walking down. + */ +static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int lookup_info) +{ + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; + int ret; + + if (wc->stage == UPDATE_BACKREF && + btrfs_header_owner(eb) != root->root_key.objectid) + return 1; + + /* + * when reference count of tree block is 1, it won't increase + * again. once full backref flag is set, we never clear it. + */ + if (lookup_info && + ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || + (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { + BUG_ON(!path->locks[level]); + ret = btrfs_lookup_extent_info(trans, root, + eb->start, eb->len, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret); + BUG_ON(wc->refs[level] == 0); + } + + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level] > 1) + return 1; + + if (path->locks[level] && !wc->keep_locks) { + btrfs_tree_unlock(eb); + path->locks[level] = 0; + } + return 0; + } + + /* wc->stage == UPDATE_BACKREF */ + if (!(wc->flags[level] & flag)) { + BUG_ON(!path->locks[level]); + ret = btrfs_inc_ref(trans, root, eb, 1); + BUG_ON(ret); + ret = btrfs_dec_ref(trans, root, eb, 0); + BUG_ON(ret); + ret = btrfs_set_disk_extent_flags(trans, root, eb->start, + eb->len, flag, 0); + BUG_ON(ret); + wc->flags[level] |= flag; + } + + /* + * the block is shared by multiple trees, so it's not good to + * keep the tree lock + */ + if (path->locks[level] && level > 0) { + btrfs_tree_unlock(eb); + path->locks[level] = 0; + } + return 0; +} + +/* + * hepler to process tree block pointer. + * + * when wc->stage == DROP_REFERENCE, this function checks + * reference count of the block pointed to. if the block + * is shared and we need update back refs for the subtree + * rooted at the block, this function changes wc->stage to + * UPDATE_BACKREF. if the block is shared and there is no + * need to update back, this function drops the reference + * to the block. + * + * NOTE: return value 1 means we should stop walking down. + */ +static noinline int do_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int *lookup_info) +{ + u64 bytenr; + u64 generation; + u64 parent; + u32 blocksize; + struct btrfs_key key; + struct extent_buffer *next; + int level = wc->level; + int reada = 0; + int ret = 0; + + generation = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); + /* + * if the lower level block was created before the snapshot + * was created, we know there is no need to update back refs + * for the subtree + */ + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) { + *lookup_info = 1; + return 1; + } + + bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); + blocksize = btrfs_level_size(root, level - 1); + + next = btrfs_find_tree_block(root, bytenr, blocksize); + if (!next) { + next = btrfs_find_create_tree_block(root, bytenr, blocksize); + reada = 1; + } + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &wc->refs[level - 1], + &wc->flags[level - 1]); + BUG_ON(ret); + BUG_ON(wc->refs[level - 1] == 0); + *lookup_info = 0; + + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level - 1] > 1) { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + + if (!wc->update_ref || + generation <= root->root_key.offset) + goto skip; + + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); + if (ret < 0) + goto skip; + + wc->stage = UPDATE_BACKREF; + wc->shared_level = level - 1; + } + } else { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + } + + if (!btrfs_buffer_uptodate(next, generation)) { + btrfs_tree_unlock(next); + free_extent_buffer(next); + next = NULL; + *lookup_info = 1; + } + + if (!next) { + if (reada && level == 1) + reada_walk_down(trans, root, wc, path); + next = read_tree_block(root, bytenr, blocksize, generation); + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + } + + level--; + BUG_ON(level != btrfs_header_level(next)); + path->nodes[level] = next; + path->slots[level] = 0; + path->locks[level] = 1; + wc->level = level; + if (wc->level == 1) + wc->reada_slot = 0; + return 0; +skip: + wc->refs[level - 1] = 0; + wc->flags[level - 1] = 0; + if (wc->stage == DROP_REFERENCE) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + parent = path->nodes[level]->start; + } else { + BUG_ON(root->root_key.objectid != + btrfs_header_owner(path->nodes[level])); + parent = 0; + } + + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, + root->root_key.objectid, level - 1, 0); + BUG_ON(ret); + } + btrfs_tree_unlock(next); + free_extent_buffer(next); + *lookup_info = 1; + return 1; +} + +/* + * hepler to process tree block while walking up the tree. + * + * when wc->stage == DROP_REFERENCE, this function drops + * reference count on the block. + * + * when wc->stage == UPDATE_BACKREF, this function changes + * wc->stage back to DROP_REFERENCE if we changed wc->stage + * to UPDATE_BACKREF previously while processing the block. + * + * NOTE: return value 1 means we should stop walking up. + */ +static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) +{ + int ret = 0; + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + u64 parent = 0; + + if (wc->stage == UPDATE_BACKREF) { + BUG_ON(wc->shared_level < level); + if (level < wc->shared_level) + goto out; + + ret = find_next_key(path, level + 1, &wc->update_progress); + if (ret > 0) + wc->update_ref = 0; + + wc->stage = DROP_REFERENCE; + wc->shared_level = -1; + path->slots[level] = 0; + + /* + * check reference count again if the block isn't locked. + * we should start walking down the tree again if reference + * count is one. + */ + if (!path->locks[level]) { + BUG_ON(level == 0); + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + path->locks[level] = 1; + + ret = btrfs_lookup_extent_info(trans, root, + eb->start, eb->len, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret); + BUG_ON(wc->refs[level] == 0); + if (wc->refs[level] == 1) { + btrfs_tree_unlock(eb); + path->locks[level] = 0; + return 1; + } + } + } + + /* wc->stage == DROP_REFERENCE */ + BUG_ON(wc->refs[level] > 1 && !path->locks[level]); + + if (wc->refs[level] == 1) { + if (level == 0) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + ret = btrfs_dec_ref(trans, root, eb, 1); + else + ret = btrfs_dec_ref(trans, root, eb, 0); + BUG_ON(ret); + } + /* make block locked assertion in clean_tree_block happy */ + if (!path->locks[level] && + btrfs_header_generation(eb) == trans->transid) { + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + path->locks[level] = 1; + } + clean_tree_block(trans, root, eb); + } + + if (eb == root->node) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = eb->start; + else + BUG_ON(root->root_key.objectid != + btrfs_header_owner(eb)); + } else { + if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = path->nodes[level + 1]->start; + else + BUG_ON(root->root_key.objectid != + btrfs_header_owner(path->nodes[level + 1])); + } + + ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, + root->root_key.objectid, level, 0); + BUG_ON(ret); +out: + wc->refs[level] = 0; + wc->flags[level] = 0; + return ret; +} + +static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) +{ + int level = wc->level; + int lookup_info = 1; + int ret; + + while (level >= 0) { + ret = walk_down_proc(trans, root, path, wc, lookup_info); + if (ret > 0) + break; + + if (level == 0) + break; + + if (path->slots[level] >= + btrfs_header_nritems(path->nodes[level])) + break; + + ret = do_walk_down(trans, root, path, wc, &lookup_info); + if (ret > 0) { + path->slots[level]++; + continue; + } + level = wc->level; + } + return 0; +} + +static noinline int walk_up_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int max_level) +{ + int level = wc->level; + int ret; + + path->slots[level] = btrfs_header_nritems(path->nodes[level]); + while (level < max_level && path->nodes[level]) { + wc->level = level; + if (path->slots[level] + 1 < + btrfs_header_nritems(path->nodes[level])) { + path->slots[level]++; + return 0; + } else { + ret = walk_up_proc(trans, root, path, wc); + if (ret > 0) + return 0; + + if (path->locks[level]) { + btrfs_tree_unlock(path->nodes[level]); + path->locks[level] = 0; + } + free_extent_buffer(path->nodes[level]); + path->nodes[level] = NULL; + level++; + } + } + return 1; +} + +/* + * drop a subvolume tree. + * + * this function traverses the tree freeing any blocks that only + * referenced by the tree. + * + * when a shared tree block is found. this function decreases its + * reference count by one. if update_ref is true, this function + * also make sure backrefs for the shared block and all lower level + * blocks are properly updated. + */ +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) +{ + struct btrfs_path *path; + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = root->fs_info->tree_root; + struct btrfs_root_item *root_item = &root->root_item; + struct walk_control *wc; + struct btrfs_key key; + int err = 0; + int ret; + int level; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + wc = kzalloc(sizeof(*wc), GFP_NOFS); + BUG_ON(!wc); + + trans = btrfs_start_transaction(tree_root, 1); + + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + level = btrfs_header_level(root->node); + path->nodes[level] = btrfs_lock_root_node(root); + btrfs_set_lock_blocking(path->nodes[level]); + path->slots[level] = 0; + path->locks[level] = 1; + memset(&wc->update_progress, 0, + sizeof(wc->update_progress)); + } else { + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + memcpy(&wc->update_progress, &key, + sizeof(wc->update_progress)); + + level = root_item->drop_level; + BUG_ON(level == 0); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + path->lowest_level = 0; + if (ret < 0) { + err = ret; + goto out; + } + WARN_ON(ret > 0); + + /* + * unlock our path, this is safe because only this + * function is allowed to delete this snapshot + */ + btrfs_unlock_up_safe(path, 0); + + level = btrfs_header_level(root->node); + while (1) { + btrfs_tree_lock(path->nodes[level]); + btrfs_set_lock_blocking(path->nodes[level]); + + ret = btrfs_lookup_extent_info(trans, root, + path->nodes[level]->start, + path->nodes[level]->len, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret); + BUG_ON(wc->refs[level] == 0); + + if (level == root_item->drop_level) + break; + + btrfs_tree_unlock(path->nodes[level]); + WARN_ON(wc->refs[level] != 1); + level--; + } + } + + wc->level = level; + wc->shared_level = -1; + wc->stage = DROP_REFERENCE; + wc->update_ref = update_ref; + wc->keep_locks = 0; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + + while (1) { + ret = walk_down_tree(trans, root, path, wc); + if (ret < 0) { + err = ret; + break; + } + + ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); + if (ret < 0) { + err = ret; + break; + } + + if (ret > 0) { + BUG_ON(wc->stage != DROP_REFERENCE); + break; + } + + if (wc->stage == DROP_REFERENCE) { + level = wc->level; + btrfs_node_key(path->nodes[level], + &root_item->drop_progress, + path->slots[level]); + root_item->drop_level = level; + } + + BUG_ON(wc->level == 0); + if (trans->transaction->in_commit || + trans->transaction->delayed_refs.flushing) { + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + root_item); + BUG_ON(ret); + + btrfs_end_transaction(trans, tree_root); + trans = btrfs_start_transaction(tree_root, 1); + } else { + unsigned long update; + update = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (update) + btrfs_run_delayed_refs(trans, tree_root, + update); + } + } + btrfs_release_path(root, path); + BUG_ON(err); + + ret = btrfs_del_root(trans, tree_root, &root->root_key); + BUG_ON(ret); + + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_find_last_root(tree_root, root->root_key.objectid, + NULL, NULL); + BUG_ON(ret < 0); + if (ret > 0) { + ret = btrfs_del_orphan_item(trans, tree_root, + root->root_key.objectid); + BUG_ON(ret); + } + } + + if (root->in_radix) { + btrfs_free_fs_root(tree_root->fs_info, root); + } else { + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root); + } +out: + btrfs_end_transaction(trans, tree_root); + kfree(wc); + btrfs_free_path(path); + return err; +} + +/* + * drop subtree rooted at tree block 'node'. + * + * NOTE: this function will unlock and release tree block 'node' + */ +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent) +{ + struct btrfs_path *path; + struct walk_control *wc; + int level; + int parent_level; + int ret = 0; + int wret; + + BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + wc = kzalloc(sizeof(*wc), GFP_NOFS); + BUG_ON(!wc); + + btrfs_assert_tree_locked(parent); + parent_level = btrfs_header_level(parent); + extent_buffer_get(parent); + path->nodes[parent_level] = parent; + path->slots[parent_level] = btrfs_header_nritems(parent); + + btrfs_assert_tree_locked(node); + level = btrfs_header_level(node); + path->nodes[level] = node; + path->slots[level] = 0; + path->locks[level] = 1; + + wc->refs[parent_level] = 1; + wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; + wc->level = level; + wc->shared_level = -1; + wc->stage = DROP_REFERENCE; + wc->update_ref = 0; + wc->keep_locks = 1; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + + while (1) { + wret = walk_down_tree(trans, root, path, wc); + if (wret < 0) { + ret = wret; + break; + } + + wret = walk_up_tree(trans, root, path, wc, parent_level); + if (wret < 0) + ret = wret; + if (wret != 0) + break; + } + + kfree(wc); + btrfs_free_path(path); + return ret; +} + +#if 0 +static unsigned long calc_ra(unsigned long start, unsigned long last, + unsigned long nr) +{ + return min(last, start + nr - 1); +} + +static noinline int relocate_inode_pages(struct inode *inode, u64 start, + u64 len) +{ + u64 page_start; + u64 page_end; + unsigned long first_index; + unsigned long last_index; + unsigned long i; + struct page *page; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct file_ra_state *ra; + struct btrfs_ordered_extent *ordered; + unsigned int total_read = 0; + unsigned int total_dirty = 0; + int ret = 0; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + + mutex_lock(&inode->i_mutex); + first_index = start >> PAGE_CACHE_SHIFT; + last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; + + /* make sure the dirty trick played by the caller work */ + ret = invalidate_inode_pages2_range(inode->i_mapping, + first_index, last_index); + if (ret) + goto out_unlock; + + file_ra_state_init(ra, inode->i_mapping); + + for (i = first_index ; i <= last_index; i++) { + if (total_read % ra->ra_pages == 0) { + btrfs_force_ra(inode->i_mapping, ra, NULL, i, + calc_ra(i, last_index, ra->ra_pages)); + } + total_read++; +again: + if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) + BUG_ON(1); + page = grab_cache_page(inode->i_mapping, i); + if (!page) { + ret = -ENOMEM; + goto out_unlock; + } + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + goto out_unlock; + } + } + wait_on_page_writeback(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + set_page_extent_mapped(page); + + if (i == first_index) + set_extent_bits(io_tree, page_start, page_end, + EXTENT_BOUNDARY, GFP_NOFS); + btrfs_set_extent_delalloc(inode, page_start, page_end); + + set_page_dirty(page); + total_dirty++; + + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + } + +out_unlock: + kfree(ra); + mutex_unlock(&inode->i_mutex); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); + return ret; +} + +static noinline int relocate_data_extent(struct inode *reloc_inode, + struct btrfs_key *extent_key, + u64 offset) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree; + struct extent_map *em; + u64 start = extent_key->objectid - offset; + u64 end = start + extent_key->offset - 1; + + em = alloc_extent_map(GFP_NOFS); + BUG_ON(!em || IS_ERR(em)); + + em->start = start; + em->len = extent_key->offset; + em->block_len = extent_key->offset; + em->block_start = extent_key->objectid; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + /* setup extent map to cheat btrfs_readpage */ + lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); + while (1) { + int ret; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(reloc_inode, start, end, 0); + } + unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); + + return relocate_inode_pages(reloc_inode, start, extent_key->offset); +} + +struct btrfs_ref_path { + u64 extent_start; + u64 nodes[BTRFS_MAX_LEVEL]; + u64 root_objectid; + u64 root_generation; + u64 owner_objectid; + u32 num_refs; + int lowest_level; + int current_level; + int shared_level; + + struct btrfs_key node_keys[BTRFS_MAX_LEVEL]; + u64 new_nodes[BTRFS_MAX_LEVEL]; +}; + +struct disk_extent { + u64 ram_bytes; + u64 disk_bytenr; + u64 disk_num_bytes; + u64 offset; + u64 num_bytes; + u8 compression; + u8 encryption; + u16 other_encoding; +}; + +static int is_cowonly_root(u64 root_objectid) +{ + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || + root_objectid == BTRFS_EXTENT_TREE_OBJECTID || + root_objectid == BTRFS_CHUNK_TREE_OBJECTID || + root_objectid == BTRFS_DEV_TREE_OBJECTID || + root_objectid == BTRFS_TREE_LOG_OBJECTID || + root_objectid == BTRFS_CSUM_TREE_OBJECTID) + return 1; + return 0; +} + +static noinline int __next_ref_path(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_ref_path *ref_path, + int first_time) +{ + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_extent_ref *ref; + struct btrfs_key key; + struct btrfs_key found_key; + u64 bytenr; + u32 nritems; + int level; + int ret = 1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (first_time) { + ref_path->lowest_level = -1; + ref_path->current_level = -1; + ref_path->shared_level = -1; + goto walk_up; + } +walk_down: + level = ref_path->current_level - 1; + while (level >= -1) { + u64 parent; + if (level < ref_path->lowest_level) + break; + + if (level >= 0) + bytenr = ref_path->nodes[level]; + else + bytenr = ref_path->extent_start; + BUG_ON(bytenr == 0); + + parent = ref_path->nodes[level + 1]; + ref_path->nodes[level + 1] = 0; + ref_path->current_level = level; + BUG_ON(parent == 0); + + key.objectid = bytenr; + key.offset = parent + 1; + key.type = BTRFS_EXTENT_REF_KEY; + + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret > 0) + goto next; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid == bytenr && + found_key.type == BTRFS_EXTENT_REF_KEY) { + if (level < ref_path->shared_level) + ref_path->shared_level = level; + goto found; + } +next: + level--; + btrfs_release_path(extent_root, path); + cond_resched(); + } + /* reached lowest level */ + ret = 1; + goto out; +walk_up: + level = ref_path->current_level; + while (level < BTRFS_MAX_LEVEL - 1) { + u64 ref_objectid; + + if (level >= 0) + bytenr = ref_path->nodes[level]; + else + bytenr = ref_path->extent_start; + + BUG_ON(bytenr == 0); + + key.objectid = bytenr; + key.offset = 0; + key.type = BTRFS_EXTENT_REF_KEY; + + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret > 0) { + /* the extent was freed by someone */ + if (ref_path->lowest_level == level) + goto out; + btrfs_release_path(extent_root, path); + goto walk_down; + } + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != bytenr || + found_key.type != BTRFS_EXTENT_REF_KEY) { + /* the extent was freed by someone */ + if (ref_path->lowest_level == level) { + ret = 1; + goto out; + } + btrfs_release_path(extent_root, path); + goto walk_down; + } +found: + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + ref_objectid = btrfs_ref_objectid(leaf, ref); + if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) { + if (first_time) { + level = (int)ref_objectid; + BUG_ON(level >= BTRFS_MAX_LEVEL); + ref_path->lowest_level = level; + ref_path->current_level = level; + ref_path->nodes[level] = bytenr; + } else { + WARN_ON(ref_objectid != level); + } + } else { + WARN_ON(level != -1); + } + first_time = 0; + + if (ref_path->lowest_level == level) { + ref_path->owner_objectid = ref_objectid; + ref_path->num_refs = btrfs_ref_num_refs(leaf, ref); + } + + /* + * the block is tree root or the block isn't in reference + * counted tree. + */ + if (found_key.objectid == found_key.offset || + is_cowonly_root(btrfs_ref_root(leaf, ref))) { + ref_path->root_objectid = btrfs_ref_root(leaf, ref); + ref_path->root_generation = + btrfs_ref_generation(leaf, ref); + if (level < 0) { + /* special reference from the tree log */ + ref_path->nodes[0] = found_key.offset; + ref_path->current_level = 0; + } + ret = 0; + goto out; + } + + level++; + BUG_ON(ref_path->nodes[level] != 0); + ref_path->nodes[level] = found_key.offset; + ref_path->current_level = level; + + /* + * the reference was created in the running transaction, + * no need to continue walking up. + */ + if (btrfs_ref_generation(leaf, ref) == trans->transid) { + ref_path->root_objectid = btrfs_ref_root(leaf, ref); + ref_path->root_generation = + btrfs_ref_generation(leaf, ref); + ret = 0; + goto out; + } + + btrfs_release_path(extent_root, path); + cond_resched(); + } + /* reached max tree level, but no tree root found. */ + BUG(); +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_first_ref_path(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_ref_path *ref_path, + u64 extent_start) +{ + memset(ref_path, 0, sizeof(*ref_path)); + ref_path->extent_start = extent_start; + + return __next_ref_path(trans, extent_root, ref_path, 1); +} + +static int btrfs_next_ref_path(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_ref_path *ref_path) +{ + return __next_ref_path(trans, extent_root, ref_path, 0); +} + +static noinline int get_new_locations(struct inode *reloc_inode, + struct btrfs_key *extent_key, + u64 offset, int no_fragment, + struct disk_extent **extents, + int *nr_extents) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct disk_extent *exts = *extents; + struct btrfs_key found_key; + u64 cur_pos; + u64 last_byte; + u32 nritems; + int nr = 0; + int max = *nr_extents; + int ret; + + WARN_ON(!no_fragment && *extents); + if (!exts) { + max = 1; + exts = kmalloc(sizeof(*exts) * max, GFP_NOFS); + if (!exts) + return -ENOMEM; + } + + path = btrfs_alloc_path(); + BUG_ON(!path); + + cur_pos = extent_key->objectid - offset; + last_byte = extent_key->objectid + extent_key->offset; + ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, + cur_pos, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.offset != cur_pos || + found_key.type != BTRFS_EXTENT_DATA_KEY || + found_key.objectid != reloc_inode->i_ino) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) != + BTRFS_FILE_EXTENT_REG || + btrfs_file_extent_disk_bytenr(leaf, fi) == 0) + break; + + if (nr == max) { + struct disk_extent *old = exts; + max *= 2; + exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); + memcpy(exts, old, sizeof(*exts) * nr); + if (old != *extents) + kfree(old); + } + + exts[nr].disk_bytenr = + btrfs_file_extent_disk_bytenr(leaf, fi); + exts[nr].disk_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, fi); + exts[nr].offset = btrfs_file_extent_offset(leaf, fi); + exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi); + exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + exts[nr].compression = btrfs_file_extent_compression(leaf, fi); + exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi); + exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf, + fi); + BUG_ON(exts[nr].offset > 0); + BUG_ON(exts[nr].compression || exts[nr].encryption); + BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes); + + cur_pos += exts[nr].num_bytes; + nr++; + + if (cur_pos + offset >= last_byte) + break; + + if (no_fragment) { + ret = 1; + goto out; + } + path->slots[0]++; + } + + BUG_ON(cur_pos + offset > last_byte); + if (cur_pos + offset < last_byte) { + ret = -ENOENT; + goto out; + } + ret = 0; +out: + btrfs_free_path(path); + if (ret) { + if (exts != *extents) + kfree(exts); + } else { + *extents = exts; + *nr_extents = nr; + } + return ret; +} + +static noinline int replace_one_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *extent_key, + struct btrfs_key *leaf_key, + struct btrfs_ref_path *ref_path, + struct disk_extent *new_extents, + int nr_extents) +{ + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct inode *inode = NULL; + struct btrfs_key key; + u64 lock_start = 0; + u64 lock_end = 0; + u64 num_bytes; + u64 ext_offset; + u64 search_end = (u64)-1; + u32 nritems; + int nr_scaned = 0; + int extent_locked = 0; + int extent_type; + int ret; + + memcpy(&key, leaf_key, sizeof(key)); + if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { + if (key.objectid < ref_path->owner_objectid || + (key.objectid == ref_path->owner_objectid && + key.type < BTRFS_EXTENT_DATA_KEY)) { + key.objectid = ref_path->owner_objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + } + } + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); +next: + if (extent_locked && ret > 0) { + /* + * the file extent item was modified by someone + * before the extent got locked. + */ + unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + extent_locked = 0; + } + + if (path->slots[0] >= nritems) { + if (++nr_scaned > 2) + break; + + BUG_ON(extent_locked); + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { + if ((key.objectid > ref_path->owner_objectid) || + (key.objectid == ref_path->owner_objectid && + key.type > BTRFS_EXTENT_DATA_KEY) || + key.offset >= search_end) + break; + } + + if (inode && key.objectid != inode->i_ino) { + BUG_ON(extent_locked); + btrfs_release_path(root, path); + mutex_unlock(&inode->i_mutex); + iput(inode); + inode = NULL; + continue; + } + + if (key.type != BTRFS_EXTENT_DATA_KEY) { + path->slots[0]++; + ret = 1; + goto next; + } + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + if ((extent_type != BTRFS_FILE_EXTENT_REG && + extent_type != BTRFS_FILE_EXTENT_PREALLOC) || + (btrfs_file_extent_disk_bytenr(leaf, fi) != + extent_key->objectid)) { + path->slots[0]++; + ret = 1; + goto next; + } + + num_bytes = btrfs_file_extent_num_bytes(leaf, fi); + ext_offset = btrfs_file_extent_offset(leaf, fi); + + if (search_end == (u64)-1) { + search_end = key.offset - ext_offset + + btrfs_file_extent_ram_bytes(leaf, fi); + } + + if (!extent_locked) { + lock_start = key.offset; + lock_end = lock_start + num_bytes - 1; + } else { + if (lock_start > key.offset || + lock_end + 1 < key.offset + num_bytes) { + unlock_extent(&BTRFS_I(inode)->io_tree, + lock_start, lock_end, GFP_NOFS); + extent_locked = 0; + } + } + + if (!inode) { + btrfs_release_path(root, path); + + inode = btrfs_iget_locked(root->fs_info->sb, + key.objectid, root); + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->location.objectid = + key.objectid; + BTRFS_I(inode)->location.type = + BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } + /* + * some code call btrfs_commit_transaction while + * holding the i_mutex, so we can't use mutex_lock + * here. + */ + if (is_bad_inode(inode) || + !mutex_trylock(&inode->i_mutex)) { + iput(inode); + inode = NULL; + key.offset = (u64)-1; + goto skip; + } + } + + if (!extent_locked) { + struct btrfs_ordered_extent *ordered; + + btrfs_release_path(root, path); + + lock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + lock_end); + if (ordered && + ordered->file_offset <= lock_end && + ordered->file_offset + ordered->len > lock_start) { + unlock_extent(&BTRFS_I(inode)->io_tree, + lock_start, lock_end, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + key.offset += num_bytes; + goto skip; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + + extent_locked = 1; + continue; + } + + if (nr_extents == 1) { + /* update extent pointer in place */ + btrfs_set_file_extent_disk_bytenr(leaf, fi, + new_extents[0].disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, + new_extents[0].disk_num_bytes); + btrfs_mark_buffer_dirty(leaf); + + btrfs_drop_extent_cache(inode, key.offset, + key.offset + num_bytes - 1, 0); + + ret = btrfs_inc_extent_ref(trans, root, + new_extents[0].disk_bytenr, + new_extents[0].disk_num_bytes, + leaf->start, + root->root_key.objectid, + trans->transid, + key.objectid); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, root, + extent_key->objectid, + extent_key->offset, + leaf->start, + btrfs_header_owner(leaf), + btrfs_header_generation(leaf), + key.objectid, 0); + BUG_ON(ret); + + btrfs_release_path(root, path); + key.offset += num_bytes; + } else { + BUG_ON(1); +#if 0 + u64 alloc_hint; + u64 extent_len; + int i; + /* + * drop old extent pointer at first, then insert the + * new pointers one bye one + */ + btrfs_release_path(root, path); + ret = btrfs_drop_extents(trans, root, inode, key.offset, + key.offset + num_bytes, + key.offset, &alloc_hint); + BUG_ON(ret); + + for (i = 0; i < nr_extents; i++) { + if (ext_offset >= new_extents[i].num_bytes) { + ext_offset -= new_extents[i].num_bytes; + continue; + } + extent_len = min(new_extents[i].num_bytes - + ext_offset, num_bytes); + + ret = btrfs_insert_empty_item(trans, root, + path, &key, + sizeof(*fi)); + BUG_ON(ret); + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_disk_bytenr(leaf, fi, + new_extents[i].disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, + new_extents[i].disk_num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, + new_extents[i].ram_bytes); + + btrfs_set_file_extent_compression(leaf, fi, + new_extents[i].compression); + btrfs_set_file_extent_encryption(leaf, fi, + new_extents[i].encryption); + btrfs_set_file_extent_other_encoding(leaf, fi, + new_extents[i].other_encoding); + + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_len); + ext_offset += new_extents[i].offset; + btrfs_set_file_extent_offset(leaf, fi, + ext_offset); + btrfs_mark_buffer_dirty(leaf); + + btrfs_drop_extent_cache(inode, key.offset, + key.offset + extent_len - 1, 0); + + ret = btrfs_inc_extent_ref(trans, root, + new_extents[i].disk_bytenr, + new_extents[i].disk_num_bytes, + leaf->start, + root->root_key.objectid, + trans->transid, key.objectid); + BUG_ON(ret); + btrfs_release_path(root, path); + + inode_add_bytes(inode, extent_len); + + ext_offset = 0; + num_bytes -= extent_len; + key.offset += extent_len; + + if (num_bytes == 0) + break; + } + BUG_ON(i >= nr_extents); +#endif + } + + if (extent_locked) { + unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + extent_locked = 0; + } +skip: + if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && + key.offset >= search_end) + break; + + cond_resched(); + } + ret = 0; +out: + btrfs_release_path(root, path); + if (inode) { + mutex_unlock(&inode->i_mutex); + if (extent_locked) { + unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + } + iput(inode); + } + return ret; +} + +int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, u64 orig_start) +{ + int level; + int ret; + + BUG_ON(btrfs_header_generation(buf) != trans->transid); + BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + + level = btrfs_header_level(buf); + if (level == 0) { + struct btrfs_leaf_ref *ref; + struct btrfs_leaf_ref *orig_ref; + + orig_ref = btrfs_lookup_leaf_ref(root, orig_start); + if (!orig_ref) + return -ENOENT; + + ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems); + if (!ref) { + btrfs_free_leaf_ref(root, orig_ref); + return -ENOMEM; + } + + ref->nritems = orig_ref->nritems; + memcpy(ref->extents, orig_ref->extents, + sizeof(ref->extents[0]) * ref->nritems); + + btrfs_free_leaf_ref(root, orig_ref); + + ref->root_gen = trans->transid; + ref->bytenr = buf->start; + ref->owner = btrfs_header_owner(buf); + ref->generation = btrfs_header_generation(buf); + + ret = btrfs_add_leaf_ref(root, ref, 0); + WARN_ON(ret); + btrfs_free_leaf_ref(root, ref); + } + return 0; +} + +static noinline int invalidate_extent_cache(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_block_group_cache *group, + struct btrfs_root *target_root) +{ + struct btrfs_key key; + struct inode *inode = NULL; + struct btrfs_file_extent_item *fi; + u64 num_bytes; + u64 skip_objectid = 0; + u32 nritems; + u32 i; + + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + if (key.objectid == skip_objectid || + key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) + continue; + if (!inode || inode->i_ino != key.objectid) { + iput(inode); + inode = btrfs_ilookup(target_root->fs_info->sb, + key.objectid, target_root, 1); + } + if (!inode) { + skip_objectid = key.objectid; + continue; + } + num_bytes = btrfs_file_extent_num_bytes(leaf, fi); + + lock_extent(&BTRFS_I(inode)->io_tree, key.offset, + key.offset + num_bytes - 1, GFP_NOFS); + btrfs_drop_extent_cache(inode, key.offset, + key.offset + num_bytes - 1, 1); + unlock_extent(&BTRFS_I(inode)->io_tree, key.offset, + key.offset + num_bytes - 1, GFP_NOFS); + cond_resched(); + } + iput(inode); + return 0; +} + +static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_block_group_cache *group, + struct inode *reloc_inode) +{ + struct btrfs_key key; + struct btrfs_key extent_key; + struct btrfs_file_extent_item *fi; + struct btrfs_leaf_ref *ref; + struct disk_extent *new_extent; + u64 bytenr; + u64 num_bytes; + u32 nritems; + u32 i; + int ext_index; + int nr_extent; + int ret; + + new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); + BUG_ON(!new_extent); + + ref = btrfs_lookup_leaf_ref(root, leaf->start); + BUG_ON(!ref); + + ext_index = -1; + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + if (bytenr == 0) + continue; + + ext_index++; + if (bytenr >= group->key.objectid + group->key.offset || + bytenr + num_bytes <= group->key.objectid) + continue; + + extent_key.objectid = bytenr; + extent_key.offset = num_bytes; + extent_key.type = BTRFS_EXTENT_ITEM_KEY; + nr_extent = 1; + ret = get_new_locations(reloc_inode, &extent_key, + group->key.objectid, 1, + &new_extent, &nr_extent); + if (ret > 0) + continue; + BUG_ON(ret < 0); + + BUG_ON(ref->extents[ext_index].bytenr != bytenr); + BUG_ON(ref->extents[ext_index].num_bytes != num_bytes); + ref->extents[ext_index].bytenr = new_extent->disk_bytenr; + ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes; + + btrfs_set_file_extent_disk_bytenr(leaf, fi, + new_extent->disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, + new_extent->disk_num_bytes); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_inc_extent_ref(trans, root, + new_extent->disk_bytenr, + new_extent->disk_num_bytes, + leaf->start, + root->root_key.objectid, + trans->transid, key.objectid); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, root, + bytenr, num_bytes, leaf->start, + btrfs_header_owner(leaf), + btrfs_header_generation(leaf), + key.objectid, 0); + BUG_ON(ret); + cond_resched(); + } + kfree(new_extent); + BUG_ON(ext_index + 1 != ref->nritems); + btrfs_free_leaf_ref(root, ref); + return 0; +} + +int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + int ret; + + if (root->reloc_root) { + reloc_root = root->reloc_root; + root->reloc_root = NULL; + list_add(&reloc_root->dead_list, + &root->fs_info->dead_reloc_roots); + + btrfs_set_root_bytenr(&reloc_root->root_item, + reloc_root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(reloc_root->node)); + memset(&reloc_root->root_item.drop_progress, 0, + sizeof(struct btrfs_disk_key)); + reloc_root->root_item.drop_level = 0; + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &reloc_root->root_key, + &reloc_root->root_item); + BUG_ON(ret); + } + return 0; +} + +int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *reloc_root; + struct btrfs_root *prev_root = NULL; + struct list_head dead_roots; + int ret; + unsigned long nr; + + INIT_LIST_HEAD(&dead_roots); + list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots); + + while (!list_empty(&dead_roots)) { + reloc_root = list_entry(dead_roots.prev, + struct btrfs_root, dead_list); + list_del_init(&reloc_root->dead_list); + + BUG_ON(reloc_root->commit_root != NULL); + while (1) { + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + + mutex_lock(&root->fs_info->drop_mutex); + ret = btrfs_drop_snapshot(trans, reloc_root); + if (ret != -EAGAIN) + break; + mutex_unlock(&root->fs_info->drop_mutex); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + btrfs_btree_balance_dirty(root, nr); + } + + free_extent_buffer(reloc_root->node); + + ret = btrfs_del_root(trans, root->fs_info->tree_root, + &reloc_root->root_key); + BUG_ON(ret); + mutex_unlock(&root->fs_info->drop_mutex); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + btrfs_btree_balance_dirty(root, nr); + + kfree(prev_root); + prev_root = reloc_root; + } + if (prev_root) { + btrfs_remove_leaf_refs(prev_root, (u64)-1, 0); + kfree(prev_root); + } + return 0; +} + +int btrfs_add_dead_reloc_root(struct btrfs_root *root) +{ + list_add(&root->dead_list, &root->fs_info->dead_reloc_roots); + return 0; +} + +int btrfs_cleanup_reloc_trees(struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct btrfs_trans_handle *trans; + struct btrfs_key location; + int found; + int ret; + + mutex_lock(&root->fs_info->tree_reloc_mutex); + ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL); + BUG_ON(ret); + found = !list_empty(&root->fs_info->dead_reloc_roots); + mutex_unlock(&root->fs_info->tree_reloc_mutex); + + if (found) { + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + } + + location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; + location.offset = (u64)-1; + location.type = BTRFS_ROOT_ITEM_KEY; + + reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + BUG_ON(!reloc_root); + btrfs_orphan_cleanup(reloc_root); + return 0; +} + +static noinline int init_reloc_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct extent_buffer *eb; + struct btrfs_root_item *root_item; + struct btrfs_key root_key; + int ret; + + BUG_ON(!root->ref_cows); + if (root->reloc_root) + return 0; + + root_item = kmalloc(sizeof(*root_item), GFP_NOFS); + BUG_ON(!root_item); + + ret = btrfs_copy_root(trans, root, root->commit_root, + &eb, BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + + root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; + root_key.offset = root->root_key.objectid; + root_key.type = BTRFS_ROOT_ITEM_KEY; + + memcpy(root_item, &root->root_item, sizeof(root_item)); + btrfs_set_root_refs(root_item, 0); + btrfs_set_root_bytenr(root_item, eb->start); + btrfs_set_root_level(root_item, btrfs_header_level(eb)); + btrfs_set_root_generation(root_item, trans->transid); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + ret = btrfs_insert_root(trans, root->fs_info->tree_root, + &root_key, root_item); + BUG_ON(ret); + kfree(root_item); + + reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &root_key); + BUG_ON(!reloc_root); + reloc_root->last_trans = trans->transid; + reloc_root->commit_root = NULL; + reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; + + root->reloc_root = reloc_root; + return 0; +} + +/* + * Core function of space balance. + * + * The idea is using reloc trees to relocate tree blocks in reference + * counted roots. There is one reloc tree for each subvol, and all + * reloc trees share same root key objectid. Reloc trees are snapshots + * of the latest committed roots of subvols (root->commit_root). + * + * To relocate a tree block referenced by a subvol, there are two steps. + * COW the block through subvol's reloc tree, then update block pointer + * in the subvol to point to the new block. Since all reloc trees share + * same root key objectid, doing special handing for tree blocks owned + * by them is easy. Once a tree block has been COWed in one reloc tree, + * we can use the resulting new block directly when the same block is + * required to COW again through other reloc trees. By this way, relocated + * tree blocks are shared between reloc trees, so they are also shared + * between subvols. + */ +static noinline int relocate_one_path(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *first_key, + struct btrfs_ref_path *ref_path, + struct btrfs_block_group_cache *group, + struct inode *reloc_inode) +{ + struct btrfs_root *reloc_root; + struct extent_buffer *eb = NULL; + struct btrfs_key *keys; + u64 *nodes; + int level; + int shared_level; + int lowest_level = 0; + int ret; + + if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) + lowest_level = ref_path->owner_objectid; + + if (!root->ref_cows) { + path->lowest_level = lowest_level; + ret = btrfs_search_slot(trans, root, first_key, path, 0, 1); + BUG_ON(ret < 0); + path->lowest_level = 0; + btrfs_release_path(root, path); + return 0; + } + + mutex_lock(&root->fs_info->tree_reloc_mutex); + ret = init_reloc_tree(trans, root); + BUG_ON(ret); + reloc_root = root->reloc_root; + + shared_level = ref_path->shared_level; + ref_path->shared_level = BTRFS_MAX_LEVEL - 1; + + keys = ref_path->node_keys; + nodes = ref_path->new_nodes; + memset(&keys[shared_level + 1], 0, + sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1)); + memset(&nodes[shared_level + 1], 0, + sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1)); + + if (nodes[lowest_level] == 0) { + path->lowest_level = lowest_level; + ret = btrfs_search_slot(trans, reloc_root, first_key, path, + 0, 1); + BUG_ON(ret); + for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) { + eb = path->nodes[level]; + if (!eb || eb == reloc_root->node) + break; + nodes[level] = eb->start; + if (level == 0) + btrfs_item_key_to_cpu(eb, &keys[level], 0); + else + btrfs_node_key_to_cpu(eb, &keys[level], 0); + } + if (nodes[0] && + ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + eb = path->nodes[0]; + ret = replace_extents_in_leaf(trans, reloc_root, eb, + group, reloc_inode); + BUG_ON(ret); + } + btrfs_release_path(reloc_root, path); + } else { + ret = btrfs_merge_path(trans, reloc_root, keys, nodes, + lowest_level); + BUG_ON(ret); + } + + /* + * replace tree blocks in the fs tree with tree blocks in + * the reloc tree. + */ + ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level); + BUG_ON(ret < 0); + + if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_search_slot(trans, reloc_root, first_key, path, + 0, 0); + BUG_ON(ret); + extent_buffer_get(path->nodes[0]); + eb = path->nodes[0]; + btrfs_release_path(reloc_root, path); + ret = invalidate_extent_cache(reloc_root, eb, group, root); + BUG_ON(ret); + free_extent_buffer(eb); + } + + mutex_unlock(&root->fs_info->tree_reloc_mutex); + path->lowest_level = 0; + return 0; +} + +static noinline int relocate_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *first_key, + struct btrfs_ref_path *ref_path) +{ + int ret; + + ret = relocate_one_path(trans, root, path, first_key, + ref_path, NULL, NULL); + BUG_ON(ret); + + return 0; +} + +static noinline int del_extent_zero(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_path *path, + struct btrfs_key *extent_key) +{ + int ret; + + ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1); + if (ret) + goto out; + ret = btrfs_del_item(trans, extent_root, path); +out: + btrfs_release_path(extent_root, path); + return ret; +} + +static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info, + struct btrfs_ref_path *ref_path) +{ + struct btrfs_key root_key; + + root_key.objectid = ref_path->root_objectid; + root_key.type = BTRFS_ROOT_ITEM_KEY; + if (is_cowonly_root(ref_path->root_objectid)) + root_key.offset = 0; + else + root_key.offset = (u64)-1; + + return btrfs_read_fs_root_no_name(fs_info, &root_key); +} + +static noinline int relocate_one_extent(struct btrfs_root *extent_root, + struct btrfs_path *path, + struct btrfs_key *extent_key, + struct btrfs_block_group_cache *group, + struct inode *reloc_inode, int pass) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *found_root; + struct btrfs_ref_path *ref_path = NULL; + struct disk_extent *new_extents = NULL; + int nr_extents = 0; + int loops; + int ret; + int level; + struct btrfs_key first_key; + u64 prev_block = 0; + + + trans = btrfs_start_transaction(extent_root, 1); + BUG_ON(!trans); + + if (extent_key->objectid == 0) { + ret = del_extent_zero(trans, extent_root, path, extent_key); + goto out; + } + + ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS); + if (!ref_path) { + ret = -ENOMEM; + goto out; + } + + for (loops = 0; ; loops++) { + if (loops == 0) { + ret = btrfs_first_ref_path(trans, extent_root, ref_path, + extent_key->objectid); + } else { + ret = btrfs_next_ref_path(trans, extent_root, ref_path); + } + if (ret < 0) + goto out; + if (ret > 0) + break; + + if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID || + ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID) + continue; + + found_root = read_ref_root(extent_root->fs_info, ref_path); + BUG_ON(!found_root); + /* + * for reference counted tree, only process reference paths + * rooted at the latest committed root. + */ + if (found_root->ref_cows && + ref_path->root_generation != found_root->root_key.offset) + continue; + + if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + if (pass == 0) { + /* + * copy data extents to new locations + */ + u64 group_start = group->key.objectid; + ret = relocate_data_extent(reloc_inode, + extent_key, + group_start); + if (ret < 0) + goto out; + break; + } + level = 0; + } else { + level = ref_path->owner_objectid; + } + + if (prev_block != ref_path->nodes[level]) { + struct extent_buffer *eb; + u64 block_start = ref_path->nodes[level]; + u64 block_size = btrfs_level_size(found_root, level); + + eb = read_tree_block(found_root, block_start, + block_size, 0); + btrfs_tree_lock(eb); + BUG_ON(level != btrfs_header_level(eb)); + + if (level == 0) + btrfs_item_key_to_cpu(eb, &first_key, 0); + else + btrfs_node_key_to_cpu(eb, &first_key, 0); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + prev_block = block_start; + } + + mutex_lock(&extent_root->fs_info->trans_mutex); + btrfs_record_root_in_trans(found_root); + mutex_unlock(&extent_root->fs_info->trans_mutex); + if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + /* + * try to update data extent references while + * keeping metadata shared between snapshots. + */ + if (pass == 1) { + ret = relocate_one_path(trans, found_root, + path, &first_key, ref_path, + group, reloc_inode); + if (ret < 0) + goto out; + continue; + } + /* + * use fallback method to process the remaining + * references. + */ + if (!new_extents) { + u64 group_start = group->key.objectid; + new_extents = kmalloc(sizeof(*new_extents), + GFP_NOFS); + nr_extents = 1; + ret = get_new_locations(reloc_inode, + extent_key, + group_start, 1, + &new_extents, + &nr_extents); + if (ret) + goto out; + } + ret = replace_one_extent(trans, found_root, + path, extent_key, + &first_key, ref_path, + new_extents, nr_extents); + } else { + ret = relocate_tree_block(trans, found_root, path, + &first_key, ref_path); + } + if (ret < 0) + goto out; + } + ret = 0; +out: + btrfs_end_transaction(trans, extent_root); + kfree(new_extents); + kfree(ref_path); + return ret; +} +#endif + +static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) +{ + u64 num_devices; + u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + + num_devices = root->fs_info->fs_devices->rw_devices; + if (num_devices == 1) { + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* turn raid0 into single device chunks */ + if (flags & BTRFS_BLOCK_GROUP_RAID0) + return stripped; + + /* turn mirroring into duplication */ + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + return stripped | BTRFS_BLOCK_GROUP_DUP; + return flags; + } else { + /* they already had raid on here, just return */ + if (flags & stripped) + return flags; + + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* switch duplicated blocks with raid1 */ + if (flags & BTRFS_BLOCK_GROUP_DUP) + return stripped | BTRFS_BLOCK_GROUP_RAID1; + + /* turn single device chunks into raid0 */ + return stripped | BTRFS_BLOCK_GROUP_RAID0; + } + return flags; +} + +static int __alloc_chunk_for_shrink(struct btrfs_root *root, + struct btrfs_block_group_cache *shrink_block_group, + int force) +{ + struct btrfs_trans_handle *trans; + u64 new_alloc_flags; + u64 calc; + + spin_lock(&shrink_block_group->lock); + if (btrfs_block_group_used(&shrink_block_group->item) + + shrink_block_group->reserved > 0) { + spin_unlock(&shrink_block_group->lock); + + trans = btrfs_start_transaction(root, 1); + spin_lock(&shrink_block_group->lock); + + new_alloc_flags = update_block_group_flags(root, + shrink_block_group->flags); + if (new_alloc_flags != shrink_block_group->flags) { + calc = + btrfs_block_group_used(&shrink_block_group->item); + } else { + calc = shrink_block_group->key.offset; + } + spin_unlock(&shrink_block_group->lock); + + do_chunk_alloc(trans, root->fs_info->extent_root, + calc + 2 * 1024 * 1024, new_alloc_flags, force); + + btrfs_end_transaction(trans, root); + } else + spin_unlock(&shrink_block_group->lock); + return 0; +} + + +int btrfs_prepare_block_group_relocation(struct btrfs_root *root, + struct btrfs_block_group_cache *group) + +{ + __alloc_chunk_for_shrink(root, group, 1); + set_block_group_readonly(group); + return 0; +} + +/* + * checks to see if its even possible to relocate this block group. + * + * @return - -1 if it's not a good idea to relocate this block group, 0 if its + * ok to go ahead and try. + */ +int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_device *device; + int full = 0; + int ret = 0; + + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + + /* odd, couldn't find the block group, leave it alone */ + if (!block_group) + return -1; + + /* no bytes used, we're good */ + if (!btrfs_block_group_used(&block_group->item)) + goto out; + + space_info = block_group->space_info; + spin_lock(&space_info->lock); + + full = space_info->full; + + /* + * if this is the last block group we have in this space, we can't + * relocate it unless we're able to allocate a new chunk below. + * + * Otherwise, we need to make sure we have room in the space to handle + * all of the extents from this block group. If we can, we're good + */ + if ((space_info->total_bytes != block_group->key.offset) && + (space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + btrfs_block_group_used(&block_group->item) < + space_info->total_bytes)) { + spin_unlock(&space_info->lock); + goto out; + } + spin_unlock(&space_info->lock); + + /* + * ok we don't have enough space, but maybe we have free space on our + * devices to allocate new chunks for relocation, so loop through our + * alloc devices and guess if we have enough space. However, if we + * were marked as full, then we know there aren't enough chunks, and we + * can just return. + */ + ret = -1; + if (full) + goto out; + + mutex_lock(&root->fs_info->chunk_mutex); + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + u64 min_free = btrfs_block_group_used(&block_group->item); + u64 dev_offset, max_avail; + + /* + * check to make sure we can actually find a chunk with enough + * space to fit our block group in. + */ + if (device->total_bytes > device->bytes_used + min_free) { + ret = find_free_dev_extent(NULL, device, min_free, + &dev_offset, &max_avail); + if (!ret) + break; + ret = -1; + } + } + mutex_unlock(&root->fs_info->chunk_mutex); +out: + btrfs_put_block_group(block_group); + return ret; +} + +static int find_first_block_group(struct btrfs_root *root, + struct btrfs_path *path, struct btrfs_key *key) +{ + int ret = 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + int slot; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + slot = path->slots[0]; + leaf = path->nodes[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid >= key->objectid && + found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + ret = 0; + goto out; + } + path->slots[0]++; + } + ret = -ENOENT; +out: + return ret; +} + +int btrfs_free_block_groups(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_caching_control *caching_ctl; + struct rb_node *n; + + down_write(&info->extent_commit_sem); + while (!list_empty(&info->caching_block_groups)) { + caching_ctl = list_entry(info->caching_block_groups.next, + struct btrfs_caching_control, list); + list_del(&caching_ctl->list); + put_caching_control(caching_ctl); + } + up_write(&info->extent_commit_sem); + + spin_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + block_group = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + rb_erase(&block_group->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + + down_write(&block_group->space_info->groups_sem); + list_del(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + if (block_group->cached == BTRFS_CACHE_STARTED) + wait_block_group_cache_done(block_group); + + btrfs_remove_free_space_cache(block_group); + btrfs_put_block_group(block_group); + + spin_lock(&info->block_group_cache_lock); + } + spin_unlock(&info->block_group_cache_lock); + + /* now that all the block groups are freed, go through and + * free all the space_info structs. This is only called during + * the final stages of unmount, and so we know nobody is + * using them. We call synchronize_rcu() once before we start, + * just to be on the safe side. + */ + synchronize_rcu(); + + while(!list_empty(&info->space_info)) { + space_info = list_entry(info->space_info.next, + struct btrfs_space_info, + list); + + list_del(&space_info->list); + kfree(space_info); + } + return 0; +} + +int btrfs_read_block_groups(struct btrfs_root *root) +{ + struct btrfs_path *path; + int ret; + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *space_info; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + root = info->extent_root; + key.objectid = 0; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + ret = find_first_block_group(root, path, &key); + if (ret > 0) { + ret = 0; + goto error; + } + if (ret != 0) + goto error; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) { + ret = -ENOMEM; + break; + } + + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + spin_lock_init(&cache->tree_lock); + cache->fs_info = info; + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + + /* + * we only want to have 32k of ram per block group for keeping + * track of free space, and if we pass 1/2 of that we want to + * start converting things over to using bitmaps + */ + cache->extents_thresh = ((1024 * 32) / 2) / + sizeof(struct btrfs_free_space); + + read_extent_buffer(leaf, &cache->item, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(cache->item)); + memcpy(&cache->key, &found_key, sizeof(found_key)); + + key.objectid = found_key.objectid + found_key.offset; + btrfs_release_path(root, path); + cache->flags = btrfs_block_group_flags(&cache->item); + cache->sectorsize = root->sectorsize; + + /* + * check for two cases, either we are full, and therefore + * don't need to bother with the caching work since we won't + * find any space, or we are empty, and we can just add all + * the space in and be done with it. This saves us _alot_ of + * time, particularly in the full case. + */ + if (found_key.offset == btrfs_block_group_used(&cache->item)) { + exclude_super_stripes(root, cache); + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + free_excluded_extents(root, cache); + } else if (btrfs_block_group_used(&cache->item) == 0) { + exclude_super_stripes(root, cache); + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + add_new_free_space(cache, root->fs_info, + found_key.objectid, + found_key.objectid + + found_key.offset); + free_excluded_extents(root, cache); + } + + ret = update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + &space_info); + BUG_ON(ret); + cache->space_info = space_info; + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_super += cache->bytes_super; + spin_unlock(&cache->space_info->lock); + + down_write(&space_info->groups_sem); + list_add_tail(&cache->list, &space_info->block_groups); + up_write(&space_info->groups_sem); + + ret = btrfs_add_block_group_cache(root->fs_info, cache); + BUG_ON(ret); + + set_avail_alloc_bits(root->fs_info, cache->flags); + if (btrfs_chunk_readonly(root, cache->key.objectid)) + set_block_group_readonly(cache); + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, + u64 size) +{ + int ret; + struct btrfs_root *extent_root; + struct btrfs_block_group_cache *cache; + + extent_root = root->fs_info->extent_root; + + root->fs_info->last_trans_log_full_commit = trans->transid; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return -ENOMEM; + + cache->key.objectid = chunk_offset; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->sectorsize = root->sectorsize; + + /* + * we only want to have 32k of ram per block group for keeping track + * of free space, and if we pass 1/2 of that we want to start + * converting things over to using bitmaps + */ + cache->extents_thresh = ((1024 * 32) / 2) / + sizeof(struct btrfs_free_space); + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + spin_lock_init(&cache->tree_lock); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + + btrfs_set_block_group_used(&cache->item, bytes_used); + btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); + cache->flags = type; + btrfs_set_block_group_flags(&cache->item, type); + + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + exclude_super_stripes(root, cache); + + add_new_free_space(cache, root->fs_info, chunk_offset, + chunk_offset + size); + + free_excluded_extents(root, cache); + + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, + &cache->space_info); + BUG_ON(ret); + + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_super += cache->bytes_super; + spin_unlock(&cache->space_info->lock); + + down_write(&cache->space_info->groups_sem); + list_add_tail(&cache->list, &cache->space_info->block_groups); + up_write(&cache->space_info->groups_sem); + + ret = btrfs_add_block_group_cache(root->fs_info, cache); + BUG_ON(ret); + + ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, + sizeof(cache->item)); + BUG_ON(ret); + + set_avail_alloc_bits(extent_root->fs_info, type); + + return 0; +} + +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 group_start) +{ + struct btrfs_path *path; + struct btrfs_block_group_cache *block_group; + struct btrfs_free_cluster *cluster; + struct btrfs_key key; + int ret; + + root = root->fs_info->extent_root; + + block_group = btrfs_lookup_block_group(root->fs_info, group_start); + BUG_ON(!block_group); + BUG_ON(!block_group->ro); + + memcpy(&key, &block_group->key, sizeof(key)); + + /* make sure this block group isn't part of an allocation cluster */ + cluster = &root->fs_info->data_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + /* + * make sure this block group isn't part of a metadata + * allocation cluster + */ + cluster = &root->fs_info->meta_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + spin_lock(&root->fs_info->block_group_cache_lock); + rb_erase(&block_group->cache_node, + &root->fs_info->block_group_cache_tree); + spin_unlock(&root->fs_info->block_group_cache_lock); + + down_write(&block_group->space_info->groups_sem); + /* + * we must use list_del_init so people can check to see if they + * are still on the list after taking the semaphore + */ + list_del_init(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + if (block_group->cached == BTRFS_CACHE_STARTED) + wait_block_group_cache_done(block_group); + + btrfs_remove_free_space_cache(block_group); + + spin_lock(&block_group->space_info->lock); + block_group->space_info->total_bytes -= block_group->key.offset; + block_group->space_info->bytes_readonly -= block_group->key.offset; + spin_unlock(&block_group->space_info->lock); + + btrfs_clear_space_info_full(root->fs_info); + + btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -EIO; + if (ret < 0) + goto out; + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c new file mode 100644 index 00000000000..b177ed31961 --- /dev/null +++ b/fs/btrfs/extent_io.c @@ -0,0 +1,3856 @@ +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/bio.h> +#include <linux/mm.h> +#include <linux/gfp.h> +#include <linux/pagemap.h> +#include <linux/page-flags.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/blkdev.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include "extent_io.h" +#include "extent_map.h" +#include "compat.h" +#include "ctree.h" +#include "btrfs_inode.h" + +static struct kmem_cache *extent_state_cache; +static struct kmem_cache *extent_buffer_cache; + +static LIST_HEAD(buffers); +static LIST_HEAD(states); + +#define LEAK_DEBUG 0 +#if LEAK_DEBUG +static DEFINE_SPINLOCK(leak_lock); +#endif + +#define BUFFER_LRU_MAX 64 + +struct tree_entry { + u64 start; + u64 end; + struct rb_node rb_node; +}; + +struct extent_page_data { + struct bio *bio; + struct extent_io_tree *tree; + get_extent_t *get_extent; + + /* tells writepage not to lock the state bits for this range + * it still does the unlocking + */ + unsigned int extent_locked:1; + + /* tells the submit_bio code to use a WRITE_SYNC */ + unsigned int sync_io:1; +}; + +int __init extent_io_init(void) +{ + extent_state_cache = kmem_cache_create("extent_state", + sizeof(struct extent_state), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!extent_state_cache) + return -ENOMEM; + + extent_buffer_cache = kmem_cache_create("extent_buffers", + sizeof(struct extent_buffer), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!extent_buffer_cache) + goto free_state_cache; + return 0; + +free_state_cache: + kmem_cache_destroy(extent_state_cache); + return -ENOMEM; +} + +void extent_io_exit(void) +{ + struct extent_state *state; + struct extent_buffer *eb; + + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, leak_list); + printk(KERN_ERR "btrfs state leak: start %llu end %llu " + "state %lu in tree %p refs %d\n", + (unsigned long long)state->start, + (unsigned long long)state->end, + state->state, state->tree, atomic_read(&state->refs)); + list_del(&state->leak_list); + kmem_cache_free(extent_state_cache, state); + + } + + while (!list_empty(&buffers)) { + eb = list_entry(buffers.next, struct extent_buffer, leak_list); + printk(KERN_ERR "btrfs buffer leak start %llu len %lu " + "refs %d\n", (unsigned long long)eb->start, + eb->len, atomic_read(&eb->refs)); + list_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); + } + if (extent_state_cache) + kmem_cache_destroy(extent_state_cache); + if (extent_buffer_cache) + kmem_cache_destroy(extent_buffer_cache); +} + +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping, gfp_t mask) +{ + tree->state.rb_node = NULL; + tree->buffer.rb_node = NULL; + tree->ops = NULL; + tree->dirty_bytes = 0; + spin_lock_init(&tree->lock); + spin_lock_init(&tree->buffer_lock); + tree->mapping = mapping; +} + +static struct extent_state *alloc_extent_state(gfp_t mask) +{ + struct extent_state *state; +#if LEAK_DEBUG + unsigned long flags; +#endif + + state = kmem_cache_alloc(extent_state_cache, mask); + if (!state) + return state; + state->state = 0; + state->private = 0; + state->tree = NULL; +#if LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_add(&state->leak_list, &states); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + atomic_set(&state->refs, 1); + init_waitqueue_head(&state->wq); + return state; +} + +static void free_extent_state(struct extent_state *state) +{ + if (!state) + return; + if (atomic_dec_and_test(&state->refs)) { +#if LEAK_DEBUG + unsigned long flags; +#endif + WARN_ON(state->tree); +#if LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_del(&state->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + kmem_cache_free(extent_state_cache, state); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct tree_entry *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset > entry->end) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct tree_entry, rb_node); + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) +{ + struct rb_root *root = &tree->state; + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct tree_entry *entry; + struct tree_entry *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct tree_entry, rb_node); + prev = n; + prev_entry = entry; + + if (offset < entry->start) + n = n->rb_left; + else if (offset > entry->end) + n = n->rb_right; + else + return n; + } + + if (prev_ret) { + orig_prev = prev; + while (prev && offset > prev_entry->end) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *next_ret = prev; + } + return NULL; +} + +static inline struct rb_node *tree_search(struct extent_io_tree *tree, + u64 offset) +{ + struct rb_node *prev = NULL; + struct rb_node *ret; + + ret = __etree_search(tree, offset, &prev, NULL); + if (!ret) + return prev; + return ret; +} + +static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree, + u64 offset, struct rb_node *node) +{ + struct rb_root *root = &tree->buffer; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct extent_buffer *eb; + + while (*p) { + parent = *p; + eb = rb_entry(parent, struct extent_buffer, rb_node); + + if (offset < eb->start) + p = &(*p)->rb_left; + else if (offset > eb->start) + p = &(*p)->rb_right; + else + return eb; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct extent_buffer *buffer_search(struct extent_io_tree *tree, + u64 offset) +{ + struct rb_root *root = &tree->buffer; + struct rb_node *n = root->rb_node; + struct extent_buffer *eb; + + while (n) { + eb = rb_entry(n, struct extent_buffer, rb_node); + if (offset < eb->start) + n = n->rb_left; + else if (offset > eb->start) + n = n->rb_right; + else + return eb; + } + return NULL; +} + +static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, + struct extent_state *other) +{ + if (tree->ops && tree->ops->merge_extent_hook) + tree->ops->merge_extent_hook(tree->mapping->host, new, + other); +} + +/* + * utility function to look for merge candidates inside a given range. + * Any extents with matching state are merged together into a single + * extent in the tree. Extents with EXTENT_IO in their state field + * are not merged because the end_io handlers need to be able to do + * operations on them without sleeping (or doing allocations/splits). + * + * This should be called with the tree lock held. + */ +static int merge_state(struct extent_io_tree *tree, + struct extent_state *state) +{ + struct extent_state *other; + struct rb_node *other_node; + + if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + return 0; + + other_node = rb_prev(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->end == state->start - 1 && + other->state == state->state) { + merge_cb(tree, state, other); + state->start = other->start; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); + free_extent_state(other); + } + } + other_node = rb_next(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->start == state->end + 1 && + other->state == state->state) { + merge_cb(tree, state, other); + other->start = state->start; + state->tree = NULL; + rb_erase(&state->rb_node, &tree->state); + free_extent_state(state); + state = NULL; + } + } + + return 0; +} + +static int set_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) +{ + if (tree->ops && tree->ops->set_bit_hook) { + return tree->ops->set_bit_hook(tree->mapping->host, + state->start, state->end, + state->state, bits); + } + + return 0; +} + +static void clear_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) +{ + if (tree->ops && tree->ops->clear_bit_hook) + tree->ops->clear_bit_hook(tree->mapping->host, state, bits); +} + +/* + * insert an extent_state struct into the tree. 'bits' are set on the + * struct before it is inserted. + * + * This may return -EEXIST if the extent is already there, in which case the + * state struct is freed. + * + * The tree lock is not taken internally. This is a utility function and + * probably isn't what you want to call (see set/clear_extent_bit). + */ +static int insert_state(struct extent_io_tree *tree, + struct extent_state *state, u64 start, u64 end, + int bits) +{ + struct rb_node *node; + int ret; + + if (end < start) { + printk(KERN_ERR "btrfs end < start %llu %llu\n", + (unsigned long long)end, + (unsigned long long)start); + WARN_ON(1); + } + state->start = start; + state->end = end; + ret = set_state_cb(tree, state, bits); + if (ret) + return ret; + + if (bits & EXTENT_DIRTY) + tree->dirty_bytes += end - start + 1; + state->state |= bits; + node = tree_insert(&tree->state, end, &state->rb_node); + if (node) { + struct extent_state *found; + found = rb_entry(node, struct extent_state, rb_node); + printk(KERN_ERR "btrfs found node %llu %llu on insert of " + "%llu %llu\n", (unsigned long long)found->start, + (unsigned long long)found->end, + (unsigned long long)start, (unsigned long long)end); + free_extent_state(state); + return -EEXIST; + } + state->tree = tree; + merge_state(tree, state); + return 0; +} + +static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, + u64 split) +{ + if (tree->ops && tree->ops->split_extent_hook) + return tree->ops->split_extent_hook(tree->mapping->host, + orig, split); + return 0; +} + +/* + * split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an + * offset inside 'orig' where it should be split. + * + * Before calling, + * the tree has 'orig' at [orig->start, orig->end]. After calling, there + * are two extent state structs in the tree: + * prealloc: [orig->start, split - 1] + * orig: [ split, orig->end ] + * + * The tree locks are not taken by this function. They need to be held + * by the caller. + */ +static int split_state(struct extent_io_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) +{ + struct rb_node *node; + + split_cb(tree, orig, split); + + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; + orig->start = split; + + node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); + if (node) { + free_extent_state(prealloc); + return -EEXIST; + } + prealloc->tree = tree; + return 0; +} + +/* + * utility function to clear some bits in an extent state struct. + * it will optionally wake up any one waiting on this state (wake == 1), or + * forcibly remove the state from the tree (delete == 1). + * + * If no bits are set on the state struct after clearing things, the + * struct is freed and removed from the tree + */ +static int clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, int bits, int wake, + int delete) +{ + int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; + int ret = state->state & bits_to_clear; + + if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + WARN_ON(range > tree->dirty_bytes); + tree->dirty_bytes -= range; + } + clear_state_cb(tree, state, bits); + state->state &= ~bits_to_clear; + if (wake) + wake_up(&state->wq); + if (delete || state->state == 0) { + if (state->tree) { + clear_state_cb(tree, state, state->state); + rb_erase(&state->rb_node, &tree->state); + state->tree = NULL; + free_extent_state(state); + } else { + WARN_ON(1); + } + } else { + merge_state(tree, state); + } + return ret; +} + +/* + * clear some bits on a range in the tree. This may require splitting + * or inserting elements in the tree, so the gfp mask is used to + * indicate which allocations or sleeping are allowed. + * + * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove + * the given range from the tree regardless of state (ie for truncate). + * + * the range [start, end] is inclusive. + * + * This takes the tree lock, and returns < 0 on error, > 0 if any of the + * bits were already set, or zero if none of the bits were already set. + */ +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int wake, int delete, + struct extent_state **cached_state, + gfp_t mask) +{ + struct extent_state *state; + struct extent_state *cached; + struct extent_state *prealloc = NULL; + struct rb_node *next_node; + struct rb_node *node; + u64 last_end; + int err; + int set = 0; + +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + if (cached_state) { + cached = *cached_state; + *cached_state = NULL; + cached_state = NULL; + if (cached && cached->tree && cached->start == start) { + atomic_dec(&cached->refs); + state = cached; + goto hit_next; + } + free_extent_state(cached); + } + /* + * this search will find the extents that end after + * our range starts + */ + node = tree_search(tree, start); + if (!node) + goto out; + state = rb_entry(node, struct extent_state, rb_node); +hit_next: + if (state->start > end) + goto out; + WARN_ON(state->end < start); + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip + * bits on second half. + * + * If the extent we found extends past our range, we + * just split and search again. It'll get split again + * the next time though. + * + * If the extent we found is inside our range, we clear + * the desired bit on it. + */ + + if (state->start < start) { + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set |= clear_state_bit(tree, state, bits, wake, + delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and clear the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + if (wake) + wake_up(&state->wq); + + set |= clear_state_bit(tree, prealloc, bits, wake, delete); + + prealloc = NULL; + goto out; + } + + if (state->end < end && prealloc && !need_resched()) + next_node = rb_next(&state->rb_node); + else + next_node = NULL; + + set |= clear_state_bit(tree, state, bits, wake, delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start <= end && next_node) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return set; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +static int wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) + __releases(tree->lock) + __acquires(tree->lock) +{ + DEFINE_WAIT(wait); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&tree->lock); + schedule(); + spin_lock(&tree->lock); + finish_wait(&state->wq, &wait); + return 0; +} + +/* + * waits for one or more bits to clear on a range in the state tree. + * The range [start, end] is inclusive. + * The tree lock is taken by this function + */ +int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) +{ + struct extent_state *state; + struct rb_node *node; + + spin_lock(&tree->lock); +again: + while (1) { + /* + * this search will find all the extents that end after + * our range starts + */ + node = tree_search(tree, start); + if (!node) + break; + + state = rb_entry(node, struct extent_state, rb_node); + + if (state->start > end) + goto out; + + if (state->state & bits) { + start = state->start; + atomic_inc(&state->refs); + wait_on_state(tree, state); + free_extent_state(state); + goto again; + } + start = state->end + 1; + + if (start > end) + break; + + if (need_resched()) { + spin_unlock(&tree->lock); + cond_resched(); + spin_lock(&tree->lock); + } + } +out: + spin_unlock(&tree->lock); + return 0; +} + +static int set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, + int bits) +{ + int ret; + + ret = set_state_cb(tree, state, bits); + if (ret) + return ret; + + if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + tree->dirty_bytes += range; + } + state->state |= bits; + + return 0; +} + +static void cache_state(struct extent_state *state, + struct extent_state **cached_ptr) +{ + if (cached_ptr && !(*cached_ptr)) { + if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { + *cached_ptr = state; + atomic_inc(&state->refs); + } + } +} + +/* + * set some bits on a range in the tree. This may require allocations or + * sleeping, so the gfp mask is used to indicate what is allowed. + * + * If any of the exclusive bits are set, this will fail with -EEXIST if some + * part of the range already has the desired bits set. The start of the + * existing range is returned in failed_start in this case. + * + * [start, end] is inclusive This takes the tree lock. + */ + +static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, + gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; + u64 last_start; + u64 last_end; + +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start == start && state->tree) { + node = &state->rb_node; + goto hit_next; + } + } + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + err = insert_state(tree, prealloc, start, end, bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + struct rb_node *next_node; + if (state->state & exclusive_bits) { + *failed_start = state->start; + err = -EEXIST; + goto out; + } + + err = set_state_bits(tree, state, bits); + if (err) + goto out; + + cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + + start = last_end + 1; + if (start < end && prealloc && !need_resched()) { + next_node = rb_next(node); + if (next_node) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } + } + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. + */ + if (state->start < start) { + if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; + } + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + err = set_state_bits(tree, state, bits); + if (err) + goto out; + cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + err = insert_state(tree, prealloc, start, this_end, + bits); + BUG_ON(err == -EEXIST); + if (err) { + prealloc = NULL; + goto out; + } + cache_state(prealloc, cached_state); + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; + } + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + err = set_state_bits(tree, prealloc, bits); + if (err) { + prealloc = NULL; + goto out; + } + cache_state(prealloc, cached_state); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +/* wrappers around set/clear extent bit */ +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, + NULL, mask); +} + +int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return set_extent_bit(tree, start, end, bits, 0, NULL, + NULL, mask); +} + +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); +} + +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, + 0, NULL, NULL, mask); +} + +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, + NULL, mask); +} + +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, + NULL, mask); +} + +static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, + NULL, mask); +} + +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, + NULL, mask); +} + +static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, + NULL, mask); +} + +int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) +{ + return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); +} + +/* + * either insert or lock state struct between start and end use mask to tell + * us if waiting is desired. + */ +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, struct extent_state **cached_state, gfp_t mask) +{ + int err; + u64 failed_start; + while (1) { + err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + EXTENT_LOCKED, &failed_start, + cached_state, mask); + if (err == -EEXIST && (mask & __GFP_WAIT)) { + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + start = failed_start; + } else { + break; + } + WARN_ON(start > end); + } + return err; +} + +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +{ + return lock_extent_bits(tree, start, end, 0, NULL, mask); +} + +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + int err; + u64 failed_start; + + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, + &failed_start, NULL, mask); + if (err == -EEXIST) { + if (failed_start > start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, 1, 0, NULL, mask); + return 0; + } + return 1; +} + +int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + mask); +} + +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, + mask); +} + +/* + * helper function to set pages and extents in the tree dirty + */ +int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); + __set_page_dirty_nobuffers(page); + page_cache_release(page); + index++; + } + return 0; +} + +/* + * helper function to set both pages and extents in the tree writeback + */ +static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); + set_page_writeback(page); + page_cache_release(page); + index++; + } + return 0; +} + +/* + * find the first offset in the io tree with 'bits' set. zero is + * returned if we find something, and *start_ret and *end_ret are + * set to reflect the state struct that was found. + * + * If nothing was found, 1 is returned, < 0 on error + */ +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && (state->state & bits)) { + *start_ret = state->start; + *end_ret = state->end; + ret = 0; + break; + } + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); + return ret; +} + +/* find the first state struct with 'bits' set after 'start', and + * return it. tree->lock must be held. NULL will returned if + * nothing was found after 'start' + */ +struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, int bits) +{ + struct rb_node *node; + struct extent_state *state; + + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && (state->state & bits)) + return state; + + node = rb_next(node); + if (!node) + break; + } +out: + return NULL; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_delalloc_range(struct extent_io_tree *tree, + u64 *start, u64 *end, u64 max_bytes) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 found = 0; + u64 total_bytes = 0; + + spin_lock(&tree->lock); + + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, cur_start); + if (!node) { + if (!found) + *end = (u64)-1; + goto out; + } + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (found && (state->start != cur_start || + (state->state & EXTENT_BOUNDARY))) { + goto out; + } + if (!(state->state & EXTENT_DELALLOC)) { + if (!found) + *end = state->end; + goto out; + } + if (!found) + *start = state->start; + found++; + *end = state->end; + cur_start = state->end + 1; + node = rb_next(node); + if (!node) + break; + total_bytes += state->end - state->start + 1; + if (total_bytes >= max_bytes) + break; + } +out: + spin_unlock(&tree->lock); + return found; +} + +static noinline int __unlock_for_delalloc(struct inode *inode, + struct page *locked_page, + u64 start, u64 end) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + + if (index == locked_page->index && end_index == index) + return 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, nr_pages, + ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (pages[i] != locked_page) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} + +static noinline int lock_delalloc_pages(struct inode *inode, + struct page *locked_page, + u64 delalloc_start, + u64 delalloc_end) +{ + unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; + unsigned long start_index = index; + unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; + unsigned long pages_locked = 0; + struct page *pages[16]; + unsigned long nrpages; + int ret; + int i; + + /* the caller is responsible for locking the start index */ + if (index == locked_page->index && index == end_index) + return 0; + + /* skip the page at the start index */ + nrpages = end_index - index + 1; + while (nrpages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nrpages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + ret = -EAGAIN; + goto done; + } + /* now we have an array of pages, lock them all */ + for (i = 0; i < ret; i++) { + /* + * the caller is taking responsibility for + * locked_page + */ + if (pages[i] != locked_page) { + lock_page(pages[i]); + if (!PageDirty(pages[i]) || + pages[i]->mapping != inode->i_mapping) { + ret = -EAGAIN; + unlock_page(pages[i]); + page_cache_release(pages[i]); + goto done; + } + } + page_cache_release(pages[i]); + pages_locked++; + } + nrpages -= ret; + index += ret; + cond_resched(); + } + ret = 0; +done: + if (ret && pages_locked) { + __unlock_for_delalloc(inode, locked_page, + delalloc_start, + ((u64)(start_index + pages_locked - 1)) << + PAGE_CACHE_SHIFT); + } + return ret; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, + u64 *start, u64 *end, + u64 max_bytes) +{ + u64 delalloc_start; + u64 delalloc_end; + u64 found; + struct extent_state *cached_state = NULL; + int ret; + int loops = 0; + +again: + /* step one, find a bunch of delalloc bytes starting at start */ + delalloc_start = *start; + delalloc_end = 0; + found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, + max_bytes); + if (!found || delalloc_end <= *start) { + *start = delalloc_start; + *end = delalloc_end; + return found; + } + + /* + * start comes from the offset of locked_page. We have to lock + * pages in order, so we can't process delalloc bytes before + * locked_page + */ + if (delalloc_start < *start) + delalloc_start = *start; + + /* + * make sure to limit the number of pages we try to lock down + * if we're looping. + */ + if (delalloc_end + 1 - delalloc_start > max_bytes && loops) + delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; + + /* step two, lock all the pages after the page that has start */ + ret = lock_delalloc_pages(inode, locked_page, + delalloc_start, delalloc_end); + if (ret == -EAGAIN) { + /* some of the pages are gone, lets avoid looping by + * shortening the size of the delalloc range we're searching + */ + free_extent_state(cached_state); + if (!loops) { + unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); + max_bytes = PAGE_CACHE_SIZE - offset; + loops = 1; + goto again; + } else { + found = 0; + goto out_failed; + } + } + BUG_ON(ret); + + /* step three, lock the state bits for the whole range */ + lock_extent_bits(tree, delalloc_start, delalloc_end, + 0, &cached_state, GFP_NOFS); + + /* then test to make sure it is all still delalloc */ + ret = test_range_bit(tree, delalloc_start, delalloc_end, + EXTENT_DELALLOC, 1, cached_state); + if (!ret) { + unlock_extent_cached(tree, delalloc_start, delalloc_end, + &cached_state, GFP_NOFS); + __unlock_for_delalloc(inode, locked_page, + delalloc_start, delalloc_end); + cond_resched(); + goto again; + } + free_extent_state(cached_state); + *start = delalloc_start; + *end = delalloc_end; +out_failed: + return found; +} + +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, + unsigned long op) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + int clear_bits = 0; + + if (op & EXTENT_CLEAR_UNLOCK) + clear_bits |= EXTENT_LOCKED; + if (op & EXTENT_CLEAR_DIRTY) + clear_bits |= EXTENT_DIRTY; + + if (op & EXTENT_CLEAR_DELALLOC) + clear_bits |= EXTENT_DELALLOC; + + if (op & EXTENT_CLEAR_ACCOUNTING) + clear_bits |= EXTENT_DO_ACCOUNTING; + + clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); + if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | + EXTENT_SET_PRIVATE2))) + return 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + + if (op & EXTENT_SET_PRIVATE2) + SetPagePrivate2(pages[i]); + + if (pages[i] == locked_page) { + page_cache_release(pages[i]); + continue; + } + if (op & EXTENT_CLEAR_DIRTY) + clear_page_dirty_for_io(pages[i]); + if (op & EXTENT_SET_WRITEBACK) + set_page_writeback(pages[i]); + if (op & EXTENT_END_WRITEBACK) + end_page_writeback(pages[i]); + if (op & EXTENT_CLEAR_UNLOCK_PAGE) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} + +/* + * count the number of bytes in the tree that have a given bit(s) + * set. This can be fairly slow, except for EXTENT_DIRTY which is + * cached. The total number found is returned. + */ +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, u64 max_bytes, + unsigned long bits) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 total_bytes = 0; + int found = 0; + + if (search_end <= cur_start) { + WARN_ON(1); + return 0; + } + + spin_lock(&tree->lock); + if (cur_start == 0 && bits == EXTENT_DIRTY) { + total_bytes = tree->dirty_bytes; + goto out; + } + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, cur_start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->start > search_end) + break; + if (state->end >= cur_start && (state->state & bits)) { + total_bytes += min(search_end, state->end) + 1 - + max(cur_start, state->start); + if (total_bytes >= max_bytes) + break; + if (!found) { + *start = state->start; + found = 1; + } + } + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); + return total_bytes; +} + +/* + * set the private field for a given byte offset in the tree. If there isn't + * an extent_state there already, this does nothing. + */ +int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + state->private = private; +out: + spin_unlock(&tree->lock); + return ret; +} + +int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + *private = state->private; +out: + spin_unlock(&tree->lock); + return ret; +} + +/* + * searches a range in the state tree for a given mask. + * If 'filled' == 1, this returns 1 only if every extent in the tree + * has the bits set. Otherwise, 1 is returned if any bit in the + * range is found set. + */ +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int filled, struct extent_state *cached) +{ + struct extent_state *state = NULL; + struct rb_node *node; + int bitset = 0; + + spin_lock(&tree->lock); + if (cached && cached->tree && cached->start == start) + node = &cached->rb_node; + else + node = tree_search(tree, start); + while (node && start <= end) { + state = rb_entry(node, struct extent_state, rb_node); + + if (filled && state->start > start) { + bitset = 0; + break; + } + + if (state->start > end) + break; + + if (state->state & bits) { + bitset = 1; + if (!filled) + break; + } else if (filled) { + bitset = 0; + break; + } + + if (state->end == (u64)-1) + break; + + start = state->end + 1; + if (start > end) + break; + node = rb_next(node); + if (!node) { + if (filled) + bitset = 0; + break; + } + } + spin_unlock(&tree->lock); + return bitset; +} + +/* + * helper function to set a given page up to date if all the + * extents in the tree for that page are up to date + */ +static int check_page_uptodate(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) + SetPageUptodate(page); + return 0; +} + +/* + * helper function to unlock a page if all the extents in the tree + * for that page are unlocked + */ +static int check_page_locked(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) + unlock_page(page); + return 0; +} + +/* + * helper function to end page writeback if all the extents + * in the tree for that page are done with writeback + */ +static int check_page_writeback(struct extent_io_tree *tree, + struct page *page) +{ + end_page_writeback(page); + return 0; +} + +/* lots and lots of room for performance fixes in the end_bio funcs */ + +/* + * after a writepage IO is done, we need to: + * clear the uptodate bits on error + * clear the writeback bits in the extent tree for this IO + * end_page_writeback if the page has no more pending IO + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +static void end_bio_extent_writepage(struct bio *bio, int err) +{ + int uptodate = err == 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree; + u64 start; + u64 end; + int whole_page; + int ret; + + do { + struct page *page = bvec->bv_page; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + if (tree->ops && tree->ops->writepage_end_io_hook) { + ret = tree->ops->writepage_end_io_hook(page, start, + end, NULL, uptodate); + if (ret) + uptodate = 0; + } + + if (!uptodate && tree->ops && + tree->ops->writepage_io_failed_hook) { + ret = tree->ops->writepage_io_failed_hook(bio, page, + start, end, NULL); + if (ret == 0) { + uptodate = (err == 0); + continue; + } + } + + if (!uptodate) { + clear_extent_uptodate(tree, start, end, GFP_NOFS); + ClearPageUptodate(page); + SetPageError(page); + } + + if (whole_page) + end_page_writeback(page); + else + check_page_writeback(tree, page); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +/* + * after a readpage IO is done, we need to: + * clear the uptodate bits on error + * set the uptodate bits if things worked + * set the page up to date if all extents in the tree are uptodate + * clear the lock bit in the extent tree + * unlock the page if there are no other extents locked for it + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +static void end_bio_extent_readpage(struct bio *bio, int err) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree; + u64 start; + u64 end; + int whole_page; + int ret; + + if (err) + uptodate = 0; + + do { + struct page *page = bvec->bv_page; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { + ret = tree->ops->readpage_end_io_hook(page, start, end, + NULL); + if (ret) + uptodate = 0; + } + if (!uptodate && tree->ops && + tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook(bio, page, + start, end, NULL); + if (ret == 0) { + uptodate = + test_bit(BIO_UPTODATE, &bio->bi_flags); + if (err) + uptodate = 0; + continue; + } + } + + if (uptodate) { + set_extent_uptodate(tree, start, end, + GFP_ATOMIC); + } + unlock_extent(tree, start, end, GFP_ATOMIC); + + if (whole_page) { + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { + if (uptodate) { + check_page_uptodate(tree, page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + check_page_locked(tree, page); + } + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +/* + * IO done from prepare_write is pretty simple, we just unlock + * the structs in the extent tree when done, and set the uptodate bits + * as appropriate. + */ +static void end_bio_extent_preparewrite(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree; + u64 start; + u64 end; + + do { + struct page *page = bvec->bv_page; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (uptodate) { + set_extent_uptodate(tree, start, end, GFP_ATOMIC); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + + unlock_extent(tree, start, end, GFP_ATOMIC); + + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +static struct bio * +extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags) +{ + struct bio *bio; + + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_size = 0; + bio->bi_bdev = bdev; + bio->bi_sector = first_sector; + } + return bio; +} + +static int submit_one_bio(int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + int ret = 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct page *page = bvec->bv_page; + struct extent_io_tree *tree = bio->bi_private; + u64 start; + u64 end; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + bio->bi_private = NULL; + + bio_get(bio); + + if (tree->ops && tree->ops->submit_bio_hook) + tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + mirror_num, bio_flags); + else + submit_bio(rw, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + bio_put(bio); + return ret; +} + +static int submit_extent_page(int rw, struct extent_io_tree *tree, + struct page *page, sector_t sector, + size_t size, unsigned long offset, + struct block_device *bdev, + struct bio **bio_ret, + unsigned long max_pages, + bio_end_io_t end_io_func, + int mirror_num, + unsigned long prev_bio_flags, + unsigned long bio_flags) +{ + int ret = 0; + struct bio *bio; + int nr; + int contig = 0; + int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; + int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; + size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); + + if (bio_ret && *bio_ret) { + bio = *bio_ret; + if (old_compressed) + contig = bio->bi_sector == sector; + else + contig = bio->bi_sector + (bio->bi_size >> 9) == + sector; + + if (prev_bio_flags != bio_flags || !contig || + (tree->ops && tree->ops->merge_bio_hook && + tree->ops->merge_bio_hook(page, offset, page_size, bio, + bio_flags)) || + bio_add_page(bio, page, page_size, offset) < page_size) { + ret = submit_one_bio(rw, bio, mirror_num, + prev_bio_flags); + bio = NULL; + } else { + return 0; + } + } + if (this_compressed) + nr = BIO_MAX_PAGES; + else + nr = bio_get_nr_vecs(bdev); + + bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + + bio_add_page(bio, page, page_size, offset); + bio->bi_end_io = end_io_func; + bio->bi_private = tree; + + if (bio_ret) + *bio_ret = bio; + else + ret = submit_one_bio(rw, bio, mirror_num, bio_flags); + + return ret; +} + +void set_page_extent_mapped(struct page *page) +{ + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, EXTENT_PAGE_PRIVATE); + } +} + +static void set_page_extent_head(struct page *page, unsigned long len) +{ + set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); +} + +/* + * basic readpage implementation. Locked extent state structs are inserted + * into the tree that are removed when the IO is done (by the end_io + * handlers) + */ +static int __extent_read_full_page(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct bio **bio, int mirror_num, + unsigned long *bio_flags) +{ + struct inode *inode = page->mapping->host; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + u64 cur_end; + sector_t sector; + struct extent_map *em; + struct block_device *bdev; + int ret; + int nr = 0; + size_t page_offset = 0; + size_t iosize; + size_t disk_io_size; + size_t blocksize = inode->i_sb->s_blocksize; + unsigned long this_bio_flag = 0; + + set_page_extent_mapped(page); + + end = page_end; + lock_extent(tree, start, end, GFP_NOFS); + + if (page->index == last_byte >> PAGE_CACHE_SHIFT) { + char *userpage; + size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + iosize = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + zero_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + } + } + while (cur <= end) { + if (cur >= last_byte) { + char *userpage; + iosize = PAGE_CACHE_SIZE - page_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + page_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + set_extent_uptodate(tree, cur, cur + iosize - 1, + GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + break; + } + em = get_extent(inode, page, page_offset, cur, + end - cur + 1, 0); + if (IS_ERR(em) || !em) { + SetPageError(page); + unlock_extent(tree, cur, end, GFP_NOFS); + break; + } + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + this_bio_flag = EXTENT_BIO_COMPRESSED; + + iosize = min(extent_map_end(em) - cur, end - cur + 1); + cur_end = min(extent_map_end(em) - 1, end); + iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + if (this_bio_flag & EXTENT_BIO_COMPRESSED) { + disk_io_size = em->block_len; + sector = em->block_start >> 9; + } else { + sector = (em->block_start + extent_offset) >> 9; + disk_io_size = iosize; + } + bdev = em->bdev; + block_start = em->block_start; + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + block_start = EXTENT_MAP_HOLE; + free_extent_map(em); + em = NULL; + + /* we've found a hole, just zero and go on */ + if (block_start == EXTENT_MAP_HOLE) { + char *userpage; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + page_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + + set_extent_uptodate(tree, cur, cur + iosize - 1, + GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + /* the get_extent function already copied into the page */ + if (test_range_bit(tree, cur, cur_end, + EXTENT_UPTODATE, 1, NULL)) { + check_page_uptodate(tree, page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + /* we have an inline extent but it didn't get marked up + * to date. Error out + */ + if (block_start == EXTENT_MAP_INLINE) { + SetPageError(page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + + ret = 0; + if (tree->ops && tree->ops->readpage_io_hook) { + ret = tree->ops->readpage_io_hook(page, cur, + cur + iosize - 1); + } + if (!ret) { + unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + pnr -= page->index; + ret = submit_extent_page(READ, tree, page, + sector, disk_io_size, page_offset, + bdev, bio, pnr, + end_bio_extent_readpage, mirror_num, + *bio_flags, + this_bio_flag); + nr++; + *bio_flags = this_bio_flag; + } + if (ret) + SetPageError(page); + cur = cur + iosize; + page_offset += iosize; + } + if (!nr) { + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + } + return 0; +} + +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent) +{ + struct bio *bio = NULL; + unsigned long bio_flags = 0; + int ret; + + ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, + &bio_flags); + if (bio) + submit_one_bio(READ, bio, 0, bio_flags); + return ret; +} + +static noinline void update_nr_written(struct page *page, + struct writeback_control *wbc, + unsigned long nr_written) +{ + wbc->nr_to_write -= nr_written; + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && + wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) + page->mapping->writeback_index = page->index + nr_written; +} + +/* + * the writepage semantics are similar to regular writepage. extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. Then the lock bits are removed + * and the end_io handler clears the writeback ranges + */ +static int __extent_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct inode *inode = page->mapping->host; + struct extent_page_data *epd = data; + struct extent_io_tree *tree = epd->tree; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 delalloc_start; + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + u64 iosize; + u64 unlock_start; + sector_t sector; + struct extent_state *cached_state = NULL; + struct extent_map *em; + struct block_device *bdev; + int ret; + int nr = 0; + size_t pg_offset = 0; + size_t blocksize; + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + u64 nr_delalloc; + u64 delalloc_end; + int page_started; + int compressed; + int write_flags; + unsigned long nr_written = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + write_flags = WRITE_SYNC_PLUG; + else + write_flags = WRITE; + + WARN_ON(!PageLocked(page)); + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); + if (page->index > end_index || + (page->index == end_index && !pg_offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + return 0; + } + + if (page->index == end_index) { + char *userpage; + + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + pg_offset, 0, + PAGE_CACHE_SIZE - pg_offset); + kunmap_atomic(userpage, KM_USER0); + flush_dcache_page(page); + } + pg_offset = 0; + + set_page_extent_mapped(page); + + delalloc_start = start; + delalloc_end = 0; + page_started = 0; + if (!epd->extent_locked) { + u64 delalloc_to_write = 0; + /* + * make sure the wbc mapping index is at least updated + * to this page. + */ + update_nr_written(page, wbc, 0); + + while (delalloc_end < page_end) { + nr_delalloc = find_lock_delalloc_range(inode, tree, + page, + &delalloc_start, + &delalloc_end, + 128 * 1024 * 1024); + if (nr_delalloc == 0) { + delalloc_start = delalloc_end + 1; + continue; + } + tree->ops->fill_delalloc(inode, page, delalloc_start, + delalloc_end, &page_started, + &nr_written); + /* + * delalloc_end is already one less than the total + * length, so we don't subtract one from + * PAGE_CACHE_SIZE + */ + delalloc_to_write += (delalloc_end - delalloc_start + + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + delalloc_start = delalloc_end + 1; + } + if (wbc->nr_to_write < delalloc_to_write) { + int thresh = 8192; + + if (delalloc_to_write < thresh * 2) + thresh = delalloc_to_write; + wbc->nr_to_write = min_t(u64, delalloc_to_write, + thresh); + } + + /* did the fill delalloc function already unlock and start + * the IO? + */ + if (page_started) { + ret = 0; + /* + * we've unlocked the page, so we can't update + * the mapping's writeback index, just update + * nr_to_write. + */ + wbc->nr_to_write -= nr_written; + goto done_unlocked; + } + } + if (tree->ops && tree->ops->writepage_start_hook) { + ret = tree->ops->writepage_start_hook(page, start, + page_end); + if (ret == -EAGAIN) { + redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); + unlock_page(page); + ret = 0; + goto done_unlocked; + } + } + + /* + * we don't want to touch the inode after unlocking the page, + * so we update the mapping writeback index now + */ + update_nr_written(page, wbc, nr_written + 1); + + end = page_end; + if (last_byte <= start) { + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + page_end, NULL, 1); + unlock_start = page_end + 1; + goto done; + } + + blocksize = inode->i_sb->s_blocksize; + + while (cur <= end) { + if (cur >= last_byte) { + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + page_end, NULL, 1); + unlock_start = page_end + 1; + break; + } + em = epd->get_extent(inode, page, pg_offset, cur, + end - cur + 1, 1); + if (IS_ERR(em) || !em) { + SetPageError(page); + break; + } + + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + iosize = min(extent_map_end(em) - cur, end - cur + 1); + iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + sector = (em->block_start + extent_offset) >> 9; + bdev = em->bdev; + block_start = em->block_start; + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + free_extent_map(em); + em = NULL; + + /* + * compressed and inline extents are written through other + * paths in the FS + */ + if (compressed || block_start == EXTENT_MAP_HOLE || + block_start == EXTENT_MAP_INLINE) { + /* + * end_io notification does not happen here for + * compressed extents + */ + if (!compressed && tree->ops && + tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + cur + iosize - 1, + NULL, 1); + else if (compressed) { + /* we don't want to end_page_writeback on + * a compressed extent. this happens + * elsewhere + */ + nr++; + } + + cur += iosize; + pg_offset += iosize; + unlock_start = cur; + continue; + } + /* leave this out until we have a page_mkwrite call */ + if (0 && !test_range_bit(tree, cur, cur + iosize - 1, + EXTENT_DIRTY, 0, NULL)) { + cur = cur + iosize; + pg_offset += iosize; + continue; + } + + if (tree->ops && tree->ops->writepage_io_hook) { + ret = tree->ops->writepage_io_hook(page, cur, + cur + iosize - 1); + } else { + ret = 0; + } + if (ret) { + SetPageError(page); + } else { + unsigned long max_nr = end_index + 1; + + set_range_writeback(tree, cur, cur + iosize - 1); + if (!PageWriteback(page)) { + printk(KERN_ERR "btrfs warning page %lu not " + "writeback, cur %llu end %llu\n", + page->index, (unsigned long long)cur, + (unsigned long long)end); + } + + ret = submit_extent_page(write_flags, tree, page, + sector, iosize, pg_offset, + bdev, &epd->bio, max_nr, + end_bio_extent_writepage, + 0, 0, 0); + if (ret) + SetPageError(page); + } + cur = cur + iosize; + pg_offset += iosize; + nr++; + } +done: + if (nr == 0) { + /* make sure the mapping tag for page dirty gets cleared */ + set_page_writeback(page); + end_page_writeback(page); + } + unlock_page(page); + +done_unlocked: + + /* drop our reference on any cached states */ + free_extent_state(cached_state); + return 0; +} + +/** + * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @writepage: function called for each page + * @data: data passed to writepage function + * + * If a page is already under I/O, write_cache_pages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. + */ +static int extent_write_cache_pages(struct extent_io_tree *tree, + struct address_space *mapping, + struct writeback_control *wbc, + writepage_t writepage, void *data, + void (*flush_fn)(void *)) +{ + int ret = 0; + int done = 0; + int nr_to_write_done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int range_whole = 0; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + scanned = 1; + } +retry: + while (!done && !nr_to_write_done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + if (tree->ops && tree->ops->write_cache_pages_lock_hook) + tree->ops->write_cache_pages_lock_hook(page); + else + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) { + if (PageWriteback(page)) + flush_fn(data); + wait_on_page_writeback(page); + } + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + ret = (*writepage)(page, wbc, data); + + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { + unlock_page(page); + ret = 0; + } + if (ret) + done = 1; + + /* + * the filesystem may choose to bump up nr_to_write. + * We have to make sure to honor the new nr_to_write + * at any time + */ + nr_to_write_done = wbc->nr_to_write <= 0; + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + return ret; +} + +static void flush_epd_write_bio(struct extent_page_data *epd) +{ + if (epd->bio) { + if (epd->sync_io) + submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); + else + submit_one_bio(WRITE, epd->bio, 0, 0); + epd->bio = NULL; + } +} + +static noinline void flush_write_bio(void *data) +{ + struct extent_page_data *epd = data; + flush_epd_write_bio(epd); +} + +int extent_write_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret; + struct address_space *mapping = page->mapping; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + }; + struct writeback_control wbc_writepages = { + .bdi = wbc->bdi, + .sync_mode = wbc->sync_mode, + .older_than_this = NULL, + .nr_to_write = 64, + .range_start = page_offset(page) + PAGE_CACHE_SIZE, + .range_end = (loff_t)-1, + }; + + ret = __extent_writepage(page, wbc, &epd); + + extent_write_cache_pages(tree, mapping, &wbc_writepages, + __extent_writepage, &epd, flush_write_bio); + flush_epd_write_bio(&epd); + return ret; +} + +int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, + u64 start, u64 end, get_extent_t *get_extent, + int mode) +{ + int ret = 0; + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 1, + .sync_io = mode == WB_SYNC_ALL, + }; + struct writeback_control wbc_writepages = { + .bdi = inode->i_mapping->backing_dev_info, + .sync_mode = mode, + .older_than_this = NULL, + .nr_to_write = nr_pages * 2, + .range_start = start, + .range_end = end + 1, + }; + + while (start <= end) { + page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + if (clear_page_dirty_for_io(page)) + ret = __extent_writepage(page, &wbc_writepages, &epd); + else { + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + start + PAGE_CACHE_SIZE - 1, + NULL, 1); + unlock_page(page); + } + page_cache_release(page); + start += PAGE_CACHE_SIZE; + } + + flush_epd_write_bio(&epd); + return ret; +} + +int extent_writepages(struct extent_io_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret = 0; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + }; + + ret = extent_write_cache_pages(tree, mapping, wbc, + __extent_writepage, &epd, + flush_write_bio); + flush_epd_write_bio(&epd); + return ret; +} + +int extent_readpages(struct extent_io_tree *tree, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages, + get_extent_t get_extent) +{ + struct bio *bio = NULL; + unsigned page_idx; + struct pagevec pvec; + unsigned long bio_flags = 0; + + pagevec_init(&pvec, 0); + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + /* + * what we want to do here is call add_to_page_cache_lru, + * but that isn't exported, so we reproduce it here + */ + if (!add_to_page_cache(page, mapping, + page->index, GFP_KERNEL)) { + + /* open coding of lru_cache_add, also not exported */ + page_cache_get(page); + if (!pagevec_add(&pvec, page)) + __pagevec_lru_add_file(&pvec); + __extent_read_full_page(tree, page, get_extent, + &bio, 0, &bio_flags); + } + page_cache_release(page); + } + if (pagevec_count(&pvec)) + __pagevec_lru_add_file(&pvec); + BUG_ON(!list_empty(pages)); + if (bio) + submit_one_bio(READ, bio, 0, bio_flags); + return 0; +} + +/* + * basic invalidatepage code, this waits on any locked or writeback + * ranges corresponding to the page, and then deletes any extent state + * records from the tree + */ +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset) +{ + u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); + u64 end = start + PAGE_CACHE_SIZE - 1; + size_t blocksize = page->mapping->host->i_sb->s_blocksize; + + start += (offset + blocksize - 1) & ~(blocksize - 1); + if (start > end) + return 0; + + lock_extent(tree, start, end, GFP_NOFS); + wait_on_page_writeback(page); + clear_extent_bit(tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, + 1, 1, NULL, GFP_NOFS); + return 0; +} + +/* + * simple commit_write call, set_range_dirty is used to mark both + * the pages and the extent records as dirty + */ +int extent_commit_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + + set_page_extent_mapped(page); + set_page_dirty(page); + + if (pos > inode->i_size) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + return 0; +} + +int extent_prepare_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to, get_extent_t *get_extent) +{ + u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + u64 block_start; + u64 orig_block_start; + u64 block_end; + u64 cur_end; + struct extent_map *em; + unsigned blocksize = 1 << inode->i_blkbits; + size_t page_offset = 0; + size_t block_off_start; + size_t block_off_end; + int err = 0; + int iocount = 0; + int ret = 0; + int isnew; + + set_page_extent_mapped(page); + + block_start = (page_start + from) & ~((u64)blocksize - 1); + block_end = (page_start + to - 1) | (blocksize - 1); + orig_block_start = block_start; + + lock_extent(tree, page_start, page_end, GFP_NOFS); + while (block_start <= block_end) { + em = get_extent(inode, page, page_offset, block_start, + block_end - block_start + 1, 1); + if (IS_ERR(em) || !em) + goto err; + + cur_end = min(block_end, extent_map_end(em) - 1); + block_off_start = block_start & (PAGE_CACHE_SIZE - 1); + block_off_end = block_off_start + blocksize; + isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); + + if (!PageUptodate(page) && isnew && + (block_off_end > to || block_off_start < from)) { + void *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + if (block_off_end > to) + memset(kaddr + to, 0, block_off_end - to); + if (block_off_start < from) + memset(kaddr + block_off_start, 0, + from - block_off_start); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + } + if ((em->block_start != EXTENT_MAP_HOLE && + em->block_start != EXTENT_MAP_INLINE) && + !isnew && !PageUptodate(page) && + (block_off_end > to || block_off_start < from) && + !test_range_bit(tree, block_start, cur_end, + EXTENT_UPTODATE, 1, NULL)) { + u64 sector; + u64 extent_offset = block_start - em->start; + size_t iosize; + sector = (em->block_start + extent_offset) >> 9; + iosize = (cur_end - block_start + blocksize) & + ~((u64)blocksize - 1); + /* + * we've already got the extent locked, but we + * need to split the state such that our end_bio + * handler can clear the lock. + */ + set_extent_bit(tree, block_start, + block_start + iosize - 1, + EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS); + ret = submit_extent_page(READ, tree, page, + sector, iosize, page_offset, em->bdev, + NULL, 1, + end_bio_extent_preparewrite, 0, + 0, 0); + iocount++; + block_start = block_start + iosize; + } else { + set_extent_uptodate(tree, block_start, cur_end, + GFP_NOFS); + unlock_extent(tree, block_start, cur_end, GFP_NOFS); + block_start = cur_end + 1; + } + page_offset = block_start & (PAGE_CACHE_SIZE - 1); + free_extent_map(em); + } + if (iocount) { + wait_extent_bit(tree, orig_block_start, + block_end, EXTENT_LOCKED); + } + check_page_uptodate(tree, page); +err: + /* FIXME, zero out newly allocated blocks on error */ + return err; +} + +/* + * a helper for releasepage, this tests for areas of the page that + * are locked or under IO and drops the related state bits if it is safe + * to drop the page. + */ +int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + int ret = 1; + + if (test_range_bit(tree, start, end, + EXTENT_IOBITS, 0, NULL)) + ret = 0; + else { + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; + /* + * at this point we can safely clear everything except the + * locked bit and the nodatasum bit + */ + clear_extent_bit(tree, start, end, + ~(EXTENT_LOCKED | EXTENT_NODATASUM), + 0, 0, NULL, mask); + } + return ret; +} + +/* + * a helper for releasepage. As long as there are no locked extents + * in the range corresponding to the page, both state records and extent + * map records are removed + */ +int try_release_extent_mapping(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask) +{ + struct extent_map *em; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + + if ((mask & __GFP_WAIT) && + page->mapping->host->i_size > 16 * 1024 * 1024) { + u64 len; + while (start <= end) { + len = end - start + 1; + write_lock(&map->lock); + em = lookup_extent_mapping(map, start, len); + if (!em || IS_ERR(em)) { + write_unlock(&map->lock); + break; + } + if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || + em->start != start) { + write_unlock(&map->lock); + free_extent_map(em); + break; + } + if (!test_range_bit(tree, em->start, + extent_map_end(em) - 1, + EXTENT_LOCKED | EXTENT_WRITEBACK, + 0, NULL)) { + remove_extent_mapping(map, em); + /* once for the rb tree */ + free_extent_map(em); + } + start = extent_map_end(em); + write_unlock(&map->lock); + + /* once for us */ + free_extent_map(em); + } + } + return try_release_extent_state(map, tree, page, mask); +} + +sector_t extent_bmap(struct address_space *mapping, sector_t iblock, + get_extent_t *get_extent) +{ + struct inode *inode = mapping->host; + u64 start = iblock << inode->i_blkbits; + sector_t sector = 0; + size_t blksize = (1 << inode->i_blkbits); + struct extent_map *em; + + lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, + GFP_NOFS); + em = get_extent(inode, NULL, 0, start, blksize, 0); + unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, + GFP_NOFS); + if (!em || IS_ERR(em)) + return 0; + + if (em->block_start > EXTENT_MAP_LAST_BYTE) + goto out; + + sector = (em->block_start + start - em->start) >> inode->i_blkbits; +out: + free_extent_map(em); + return sector; +} + +int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, get_extent_t *get_extent) +{ + int ret; + u64 off = start; + u64 max = start + len; + u32 flags = 0; + u64 disko = 0; + struct extent_map *em = NULL; + int end = 0; + u64 em_start = 0, em_len = 0; + unsigned long emflags; + ret = 0; + + if (len == 0) + return -EINVAL; + + lock_extent(&BTRFS_I(inode)->io_tree, start, start + len, + GFP_NOFS); + em = get_extent(inode, NULL, 0, off, max - off, 0); + if (!em) + goto out; + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + while (!end) { + off = em->start + em->len; + if (off >= max) + end = 1; + + em_start = em->start; + em_len = em->len; + + disko = 0; + flags = 0; + + if (em->block_start == EXTENT_MAP_LAST_BYTE) { + end = 1; + flags |= FIEMAP_EXTENT_LAST; + } else if (em->block_start == EXTENT_MAP_HOLE) { + flags |= FIEMAP_EXTENT_UNWRITTEN; + } else if (em->block_start == EXTENT_MAP_INLINE) { + flags |= (FIEMAP_EXTENT_DATA_INLINE | + FIEMAP_EXTENT_NOT_ALIGNED); + } else if (em->block_start == EXTENT_MAP_DELALLOC) { + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + } else { + disko = em->block_start; + } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + flags |= FIEMAP_EXTENT_ENCODED; + + emflags = em->flags; + free_extent_map(em); + em = NULL; + + if (!end) { + em = get_extent(inode, NULL, 0, off, max - off, 0); + if (!em) + goto out; + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + emflags = em->flags; + } + if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { + flags |= FIEMAP_EXTENT_LAST; + end = 1; + } + + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, + em_len, flags); + if (ret) + goto out_free; + } +out_free: + free_extent_map(em); +out: + unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len, + GFP_NOFS); + return ret; +} + +static inline struct page *extent_buffer_page(struct extent_buffer *eb, + unsigned long i) +{ + struct page *p; + struct address_space *mapping; + + if (i == 0) + return eb->first_page; + i += eb->start >> PAGE_CACHE_SHIFT; + mapping = eb->first_page->mapping; + if (!mapping) + return NULL; + + /* + * extent_buffer_page is only called after pinning the page + * by increasing the reference count. So we know the page must + * be in the radix tree. + */ + rcu_read_lock(); + p = radix_tree_lookup(&mapping->page_tree, i); + rcu_read_unlock(); + + return p; +} + +static inline unsigned long num_extent_pages(u64 start, u64 len) +{ + return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT); +} + +static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, + unsigned long len, + gfp_t mask) +{ + struct extent_buffer *eb = NULL; +#if LEAK_DEBUG + unsigned long flags; +#endif + + eb = kmem_cache_zalloc(extent_buffer_cache, mask); + eb->start = start; + eb->len = len; + spin_lock_init(&eb->lock); + init_waitqueue_head(&eb->lock_wq); + +#if LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_add(&eb->leak_list, &buffers); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + atomic_set(&eb->refs, 1); + + return eb; +} + +static void __free_extent_buffer(struct extent_buffer *eb) +{ +#if LEAK_DEBUG + unsigned long flags; + spin_lock_irqsave(&leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + kmem_cache_free(extent_buffer_cache, eb); +} + +struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + struct page *page0, + gfp_t mask) +{ + unsigned long num_pages = num_extent_pages(start, len); + unsigned long i; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct extent_buffer *eb; + struct extent_buffer *exists = NULL; + struct page *p; + struct address_space *mapping = tree->mapping; + int uptodate = 1; + + spin_lock(&tree->buffer_lock); + eb = buffer_search(tree, start); + if (eb) { + atomic_inc(&eb->refs); + spin_unlock(&tree->buffer_lock); + mark_page_accessed(eb->first_page); + return eb; + } + spin_unlock(&tree->buffer_lock); + + eb = __alloc_extent_buffer(tree, start, len, mask); + if (!eb) + return NULL; + + if (page0) { + eb->first_page = page0; + i = 1; + index++; + page_cache_get(page0); + mark_page_accessed(page0); + set_page_extent_mapped(page0); + set_page_extent_head(page0, len); + uptodate = PageUptodate(page0); + } else { + i = 0; + } + for (; i < num_pages; i++, index++) { + p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); + if (!p) { + WARN_ON(1); + goto free_eb; + } + set_page_extent_mapped(p); + mark_page_accessed(p); + if (i == 0) { + eb->first_page = p; + set_page_extent_head(p, len); + } else { + set_page_private(p, EXTENT_PAGE_PRIVATE); + } + if (!PageUptodate(p)) + uptodate = 0; + unlock_page(p); + } + if (uptodate) + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + + spin_lock(&tree->buffer_lock); + exists = buffer_tree_insert(tree, start, &eb->rb_node); + if (exists) { + /* add one reference for the caller */ + atomic_inc(&exists->refs); + spin_unlock(&tree->buffer_lock); + goto free_eb; + } + /* add one reference for the tree */ + atomic_inc(&eb->refs); + spin_unlock(&tree->buffer_lock); + return eb; + +free_eb: + if (!atomic_dec_and_test(&eb->refs)) + return exists; + for (index = 1; index < i; index++) + page_cache_release(extent_buffer_page(eb, index)); + page_cache_release(extent_buffer_page(eb, 0)); + __free_extent_buffer(eb); + return exists; +} + +struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + gfp_t mask) +{ + struct extent_buffer *eb; + + spin_lock(&tree->buffer_lock); + eb = buffer_search(tree, start); + if (eb) + atomic_inc(&eb->refs); + spin_unlock(&tree->buffer_lock); + + if (eb) + mark_page_accessed(eb->first_page); + + return eb; +} + +void free_extent_buffer(struct extent_buffer *eb) +{ + if (!eb) + return; + + if (!atomic_dec_and_test(&eb->refs)) + return; + + WARN_ON(1); +} + +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + unsigned long num_pages; + struct page *page; + + num_pages = num_extent_pages(eb->start, eb->len); + + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!PageDirty(page)) + continue; + + lock_page(page); + if (i == 0) + set_page_extent_head(page, eb->len); + else + set_page_private(page, EXTENT_PAGE_PRIVATE); + + clear_page_dirty_for_io(page); + spin_lock_irq(&page->mapping->tree_lock); + if (!PageDirty(page)) { + radix_tree_tag_clear(&page->mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&page->mapping->tree_lock); + unlock_page(page); + } + return 0; +} + +int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + return wait_on_extent_writeback(tree, eb->start, + eb->start + eb->len - 1); +} + +int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + unsigned long num_pages; + int was_dirty = 0; + + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) + __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); + return was_dirty; +} + +int clear_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + + clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (page) + ClearPageUptodate(page); + } + return 0; +} + +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + + set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { + check_page_uptodate(tree, page); + continue; + } + SetPageUptodate(page); + } + return 0; +} + +int extent_range_uptodate(struct extent_io_tree *tree, + u64 start, u64 end) +{ + struct page *page; + int ret; + int pg_uptodate = 1; + int uptodate; + unsigned long index; + + ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); + if (ret) + return 1; + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + page = find_get_page(tree->mapping, index); + uptodate = PageUptodate(page); + page_cache_release(page); + if (!uptodate) { + pg_uptodate = 0; + break; + } + start += PAGE_CACHE_SIZE; + } + return pg_uptodate; +} + +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + int ret = 0; + unsigned long num_pages; + unsigned long i; + struct page *page; + int pg_uptodate = 1; + + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + return 1; + + ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, NULL); + if (ret) + return ret; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!PageUptodate(page)) { + pg_uptodate = 0; + break; + } + } + return pg_uptodate; +} + +int read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, + u64 start, int wait, + get_extent_t *get_extent, int mirror_num) +{ + unsigned long i; + unsigned long start_i; + struct page *page; + int err; + int ret = 0; + int locked_pages = 0; + int all_uptodate = 1; + int inc_all_pages = 0; + unsigned long num_pages; + struct bio *bio = NULL; + unsigned long bio_flags = 0; + + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + return 0; + + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, NULL)) { + return 0; + } + + if (start) { + WARN_ON(start < eb->start); + start_i = (start >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT); + } else { + start_i = 0; + } + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!wait) { + if (!trylock_page(page)) + goto unlock_exit; + } else { + lock_page(page); + } + locked_pages++; + if (!PageUptodate(page)) + all_uptodate = 0; + } + if (all_uptodate) { + if (start_i == 0) + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + goto unlock_exit; + } + + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (inc_all_pages) + page_cache_get(page); + if (!PageUptodate(page)) { + if (start_i == 0) + inc_all_pages = 1; + ClearPageError(page); + err = __extent_read_full_page(tree, page, + get_extent, &bio, + mirror_num, &bio_flags); + if (err) + ret = err; + } else { + unlock_page(page); + } + } + + if (bio) + submit_one_bio(READ, bio, mirror_num, bio_flags); + + if (ret || !wait) + return ret; + + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + wait_on_page_locked(page); + if (!PageUptodate(page)) + ret = -EIO; + } + + if (!ret) + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + return ret; + +unlock_exit: + i = start_i; + while (locked_pages > 0) { + page = extent_buffer_page(eb, i); + i++; + unlock_page(page); + locked_pages--; + } + return ret; +} + +void read_extent_buffer(struct extent_buffer *eb, void *dstv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + kaddr = kmap_atomic(page, KM_USER1); + memcpy(dst, kaddr + offset, cur); + kunmap_atomic(kaddr, KM_USER1); + + dst += cur; + len -= cur; + offset = 0; + i++; + } +} + +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + size_t offset = start & (PAGE_CACHE_SIZE - 1); + char *kaddr; + struct page *p; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + unsigned long end_i = (start_offset + start + min_len - 1) >> + PAGE_CACHE_SHIFT; + + if (i != end_i) + return -EINVAL; + + if (i == 0) { + offset = start_offset; + *map_start = 0; + } else { + offset = 0; + *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; + } + + if (start + min_len > eb->len) { + printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " + "wanted %lu %lu\n", (unsigned long long)eb->start, + eb->len, start, min_len); + WARN_ON(1); + } + + p = extent_buffer_page(eb, i); + kaddr = kmap_atomic(p, km); + *token = kaddr; + *map = kaddr + offset; + *map_len = PAGE_CACHE_SIZE - offset; + return 0; +} + +int map_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, + char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + int err; + int save = 0; + if (eb->map_token) { + unmap_extent_buffer(eb, eb->map_token, km); + eb->map_token = NULL; + save = 1; + } + err = map_private_extent_buffer(eb, start, min_len, token, map, + map_start, map_len, km); + if (!err && save) { + eb->map_token = *token; + eb->kaddr = *map; + eb->map_start = *map_start; + eb->map_len = *map_len; + } + return err; +} + +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) +{ + kunmap_atomic(token, km); +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *ptr = (char *)ptrv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + + kaddr = kmap_atomic(page, KM_USER0); + ret = memcmp(ptr, kaddr + offset, cur); + kunmap_atomic(kaddr, KM_USER0); + if (ret) + break; + + ptr += cur; + len -= cur; + offset = 0; + i++; + } + return ret; +} + +void write_extent_buffer(struct extent_buffer *eb, const void *srcv, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *src = (char *)srcv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = kmap_atomic(page, KM_USER1); + memcpy(kaddr + offset, src, cur); + kunmap_atomic(kaddr, KM_USER1); + + src += cur; + len -= cur; + offset = 0; + i++; + } +} + +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, c, cur); + kunmap_atomic(kaddr, KM_USER0); + + len -= cur; + offset = 0; + i++; + } +} + +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) +{ + u64 dst_len = dst->len; + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + + WARN_ON(src->len != dst_len); + + offset = (start_offset + dst_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(dst, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); + + kaddr = kmap_atomic(page, KM_USER0); + read_extent_buffer(src, kaddr + offset, src_offset, cur); + kunmap_atomic(kaddr, KM_USER0); + + src_offset += cur; + len -= cur; + offset = 0; + i++; + } +} + +static void move_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + if (dst_page == src_page) { + memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); + } else { + char *src_kaddr = kmap_atomic(src_page, KM_USER1); + char *p = dst_kaddr + dst_off + len; + char *s = src_kaddr + src_off + len; + + while (len--) + *--p = *--s; + + kunmap_atomic(src_kaddr, KM_USER1); + } + kunmap_atomic(dst_kaddr, KM_USER0); +} + +static void copy_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *src_kaddr; + + if (dst_page != src_page) + src_kaddr = kmap_atomic(src_page, KM_USER1); + else + src_kaddr = dst_kaddr; + + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); + kunmap_atomic(dst_kaddr, KM_USER0); + if (dst_page != src_page) + kunmap_atomic(src_kaddr, KM_USER1); +} + +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + "len %lu dst len %lu\n", src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + "len %lu dst len %lu\n", dst_offset, len, dst->len); + BUG_ON(1); + } + + while (len > 0) { + dst_off_in_page = (start_offset + dst_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - + src_off_in_page)); + cur = min_t(unsigned long, cur, + (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); + + copy_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), + dst_off_in_page, src_off_in_page, cur); + + src_offset += cur; + dst_offset += cur; + len -= cur; + } +} + +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + unsigned long dst_end = dst_offset + len - 1; + unsigned long src_end = src_offset + len - 1; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + "len %lu len %lu\n", src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + "len %lu len %lu\n", dst_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset < src_offset) { + memcpy_extent_buffer(dst, dst_offset, src_offset, len); + return; + } + while (len > 0) { + dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; + + dst_off_in_page = (start_offset + dst_end) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_end) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + cur = min_t(unsigned long, len, src_off_in_page + 1); + cur = min(cur, dst_off_in_page + 1); + move_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), + dst_off_in_page - cur + 1, + src_off_in_page - cur + 1, cur); + + dst_end -= cur; + src_end -= cur; + len -= cur; + } +} + +int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) +{ + u64 start = page_offset(page); + struct extent_buffer *eb; + int ret = 1; + unsigned long i; + unsigned long num_pages; + + spin_lock(&tree->buffer_lock); + eb = buffer_search(tree, start); + if (!eb) + goto out; + + if (atomic_read(&eb->refs) > 1) { + ret = 0; + goto out; + } + if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + ret = 0; + goto out; + } + /* at this point we can safely release the extent buffer */ + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) + page_cache_release(extent_buffer_page(eb, i)); + rb_erase(&eb->rb_node, &tree->buffer); + __free_extent_buffer(eb); +out: + spin_unlock(&tree->buffer_lock); + return ret; +} diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h new file mode 100644 index 00000000000..36de250a7b2 --- /dev/null +++ b/fs/btrfs/extent_io.h @@ -0,0 +1,303 @@ +#ifndef __EXTENTIO__ +#define __EXTENTIO__ + +#include <linux/rbtree.h> + +/* bits for the extent state */ +#define EXTENT_DIRTY 1 +#define EXTENT_WRITEBACK (1 << 1) +#define EXTENT_UPTODATE (1 << 2) +#define EXTENT_LOCKED (1 << 3) +#define EXTENT_NEW (1 << 4) +#define EXTENT_DELALLOC (1 << 5) +#define EXTENT_DEFRAG (1 << 6) +#define EXTENT_DEFRAG_DONE (1 << 7) +#define EXTENT_BUFFER_FILLED (1 << 8) +#define EXTENT_BOUNDARY (1 << 9) +#define EXTENT_NODATASUM (1 << 10) +#define EXTENT_DO_ACCOUNTING (1 << 11) +#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) + +/* flags for bio submission */ +#define EXTENT_BIO_COMPRESSED 1 + +/* these are bit numbers for test/set bit */ +#define EXTENT_BUFFER_UPTODATE 0 +#define EXTENT_BUFFER_BLOCKING 1 +#define EXTENT_BUFFER_DIRTY 2 + +/* these are flags for extent_clear_unlock_delalloc */ +#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 +#define EXTENT_CLEAR_UNLOCK 0x2 +#define EXTENT_CLEAR_DELALLOC 0x4 +#define EXTENT_CLEAR_DIRTY 0x8 +#define EXTENT_SET_WRITEBACK 0x10 +#define EXTENT_END_WRITEBACK 0x20 +#define EXTENT_SET_PRIVATE2 0x40 +#define EXTENT_CLEAR_ACCOUNTING 0x80 + +/* + * page->private values. Every page that is controlled by the extent + * map has page->private set to one. + */ +#define EXTENT_PAGE_PRIVATE 1 +#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 + +struct extent_state; + +typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags); +struct extent_io_ops { + int (*fill_delalloc)(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written); + int (*writepage_start_hook)(struct page *page, u64 start, u64 end); + int (*writepage_io_hook)(struct page *page, u64 start, u64 end); + extent_submit_bio_hook_t *submit_bio_hook; + int (*merge_bio_hook)(struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags); + int (*readpage_io_hook)(struct page *page, u64 start, u64 end); + int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state); + int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state); + int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, + struct extent_state *state); + int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate); + int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits); + int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, + unsigned long bits); + int (*merge_extent_hook)(struct inode *inode, + struct extent_state *new, + struct extent_state *other); + int (*split_extent_hook)(struct inode *inode, + struct extent_state *orig, u64 split); + int (*write_cache_pages_lock_hook)(struct page *page); +}; + +struct extent_io_tree { + struct rb_root state; + struct rb_root buffer; + struct address_space *mapping; + u64 dirty_bytes; + spinlock_t lock; + spinlock_t buffer_lock; + struct extent_io_ops *ops; +}; + +struct extent_state { + u64 start; + u64 end; /* inclusive */ + struct rb_node rb_node; + + /* ADD NEW ELEMENTS AFTER THIS */ + struct extent_io_tree *tree; + wait_queue_head_t wq; + atomic_t refs; + unsigned long state; + u64 split_start; + u64 split_end; + + /* for use by the FS */ + u64 private; + + struct list_head leak_list; +}; + +struct extent_buffer { + u64 start; + unsigned long len; + char *map_token; + char *kaddr; + unsigned long map_start; + unsigned long map_len; + struct page *first_page; + unsigned long bflags; + atomic_t refs; + struct list_head leak_list; + struct rb_node rb_node; + + /* the spinlock is used to protect most operations */ + spinlock_t lock; + + /* + * when we keep the lock held while blocking, waiters go onto + * the wq + */ + wait_queue_head_t lock_wq; +}; + +struct extent_map_tree; + +static inline struct extent_state *extent_state_next(struct extent_state *state) +{ + struct rb_node *node; + node = rb_next(&state->rb_node); + if (!node) + return NULL; + return rb_entry(node, struct extent_state, rb_node); +} + +typedef struct extent_map *(get_extent_t)(struct inode *inode, + struct page *page, + size_t page_offset, + u64 start, u64 len, + int create); + +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping, gfp_t mask); +int try_release_extent_mapping(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask); +int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page); +int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask); +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, struct extent_state **cached, gfp_t mask); +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent); +int __init extent_io_init(void); +void extent_io_exit(void); + +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, + u64 max_bytes, unsigned long bits); + +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int filled, struct extent_state *cached_state); +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int wake, int delete, struct extent_state **cached, + gfp_t mask); +int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask); +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits); +struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, int bits); +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset); +int extent_write_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, + u64 start, u64 end, get_extent_t *get_extent, + int mode); +int extent_writepages(struct extent_io_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_readpages(struct extent_io_tree *tree, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages, + get_extent_t get_extent); +int extent_prepare_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to, get_extent_t *get_extent); +int extent_commit_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to); +sector_t extent_bmap(struct address_space *mapping, sector_t iblock, + get_extent_t *get_extent); +int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, get_extent_t *get_extent); +int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end); +int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); +int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); +void set_page_extent_mapped(struct page *page); + +struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + struct page *page0, + gfp_t mask); +struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + gfp_t mask); +void free_extent_buffer(struct extent_buffer *eb); +int read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, u64 start, int wait, + get_extent_t *get_extent, int mirror_num); + +static inline void extent_buffer_get(struct extent_buffer *eb) +{ + atomic_inc(&eb->refs); +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len); +void read_extent_buffer(struct extent_buffer *eb, void *dst, + unsigned long start, + unsigned long len); +void write_extent_buffer(struct extent_buffer *eb, const void *src, + unsigned long start, unsigned long len); +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len); +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len); +int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, + struct extent_buffer *eb); +int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end); +int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int test_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int clear_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km); +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km); +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); +int release_extent_buffer_tail_pages(struct extent_buffer *eb); +int extent_range_uptodate(struct extent_io_tree *tree, + u64 start, u64 end); +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, + unsigned long op); +#endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c new file mode 100644 index 00000000000..428fcac45f9 --- /dev/null +++ b/fs/btrfs/extent_map.c @@ -0,0 +1,419 @@ +#include <linux/err.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/hardirq.h> +#include "extent_map.h" + + +static struct kmem_cache *extent_map_cache; + +int __init extent_map_init(void) +{ + extent_map_cache = kmem_cache_create("extent_map", + sizeof(struct extent_map), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!extent_map_cache) + return -ENOMEM; + return 0; +} + +void extent_map_exit(void) +{ + if (extent_map_cache) + kmem_cache_destroy(extent_map_cache); +} + +/** + * extent_map_tree_init - initialize extent map tree + * @tree: tree to initialize + * @mask: flags for memory allocations during tree operations + * + * Initialize the extent tree @tree. Should be called for each new inode + * or other user of the extent_map interface. + */ +void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) +{ + tree->map.rb_node = NULL; + rwlock_init(&tree->lock); +} + +/** + * alloc_extent_map - allocate new extent map structure + * @mask: memory allocation flags + * + * Allocate a new extent_map structure. The new structure is + * returned with a reference count of one and needs to be + * freed using free_extent_map() + */ +struct extent_map *alloc_extent_map(gfp_t mask) +{ + struct extent_map *em; + em = kmem_cache_alloc(extent_map_cache, mask); + if (!em || IS_ERR(em)) + return em; + em->in_tree = 0; + em->flags = 0; + atomic_set(&em->refs, 1); + return em; +} + +/** + * free_extent_map - drop reference count of an extent_map + * @em: extent map beeing releasead + * + * Drops the reference out on @em by one and free the structure + * if the reference count hits zero. + */ +void free_extent_map(struct extent_map *em) +{ + if (!em) + return; + WARN_ON(atomic_read(&em->refs) == 0); + if (atomic_dec_and_test(&em->refs)) { + WARN_ON(em->in_tree); + kmem_cache_free(extent_map_cache, em); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct extent_map *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct extent_map, rb_node); + + WARN_ON(!entry->in_tree); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset >= extent_map_end(entry)) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct extent_map, rb_node); + entry->in_tree = 1; + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * search through the tree for an extent_map with a given offset. If + * it can't be found, try to find some neighboring extents + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct extent_map *entry; + struct extent_map *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct extent_map, rb_node); + prev = n; + prev_entry = entry; + + WARN_ON(!entry->in_tree); + + if (offset < entry->start) + n = n->rb_left; + else if (offset >= extent_map_end(entry)) + n = n->rb_right; + else + return n; + } + + if (prev_ret) { + orig_prev = prev; + while (prev && offset >= extent_map_end(prev_entry)) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { + prev_entry = rb_entry(prev, struct extent_map, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *next_ret = prev; + } + return NULL; +} + +/* check to see if two extent_map structs are adjacent and safe to merge */ +static int mergable_maps(struct extent_map *prev, struct extent_map *next) +{ + if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) + return 0; + + /* + * don't merge compressed extents, we need to know their + * actual size + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) + return 0; + + if (extent_map_end(prev) == next->start && + prev->flags == next->flags && + prev->bdev == next->bdev && + ((next->block_start == EXTENT_MAP_HOLE && + prev->block_start == EXTENT_MAP_HOLE) || + (next->block_start == EXTENT_MAP_INLINE && + prev->block_start == EXTENT_MAP_INLINE) || + (next->block_start == EXTENT_MAP_DELALLOC && + prev->block_start == EXTENT_MAP_DELALLOC) || + (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && + next->block_start == extent_map_block_end(prev)))) { + return 1; + } + return 0; +} + +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) +{ + int ret = 0; + struct extent_map *merge = NULL; + struct rb_node *rb; + struct extent_map *em; + + write_lock(&tree->lock); + em = lookup_extent_mapping(tree, start, len); + + WARN_ON(!em || em->start != start); + + if (!em) + goto out; + + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + + if (em->start != 0) { + rb = rb_prev(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->len += merge->len; + em->block_len += merge->block_len; + em->block_start = merge->block_start; + merge->in_tree = 0; + rb_erase(&merge->rb_node, &tree->map); + free_extent_map(merge); + } + } + + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + em->block_len += merge->len; + rb_erase(&merge->rb_node, &tree->map); + merge->in_tree = 0; + free_extent_map(merge); + } + + free_extent_map(em); +out: + write_unlock(&tree->lock); + return ret; + +} + +/** + * add_extent_mapping - add new extent map to the extent tree + * @tree: tree to insert new map in + * @em: map to insert + * + * Insert @em into @tree or perform a simple forward/backward merge with + * existing mappings. The extent_map struct passed in will be inserted + * into the tree directly, with an additional reference taken, or a + * reference dropped if the merge attempt was successfull. + */ +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em) +{ + int ret = 0; + struct extent_map *merge = NULL; + struct rb_node *rb; + struct extent_map *exist; + + exist = lookup_extent_mapping(tree, em->start, em->len); + if (exist) { + free_extent_map(exist); + ret = -EEXIST; + goto out; + } + rb = tree_insert(&tree->map, em->start, &em->rb_node); + if (rb) { + ret = -EEXIST; + goto out; + } + atomic_inc(&em->refs); + if (em->start != 0) { + rb = rb_prev(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->len += merge->len; + em->block_len += merge->block_len; + em->block_start = merge->block_start; + merge->in_tree = 0; + rb_erase(&merge->rb_node, &tree->map); + free_extent_map(merge); + } + } + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + em->block_len += merge->len; + rb_erase(&merge->rb_node, &tree->map); + merge->in_tree = 0; + free_extent_map(merge); + } +out: + return ret; +} + +/* simple helper to do math around the end of an extent, handling wrap */ +static u64 range_end(u64 start, u64 len) +{ + if (start + len < start) + return (u64)-1; + return start + len; +} + +/** + * lookup_extent_mapping - lookup extent_map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. There may be additional objects in the tree that + * intersect, so check the object returned carefully to make sure that no + * additional lookups are needed. + */ +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + struct extent_map *em; + struct rb_node *rb_node; + struct rb_node *prev = NULL; + struct rb_node *next = NULL; + u64 end = range_end(start, len); + + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + } + if (!rb_node && next) { + em = rb_entry(next, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + } + if (!rb_node) { + em = NULL; + goto out; + } + if (IS_ERR(rb_node)) { + em = ERR_PTR(PTR_ERR(rb_node)); + goto out; + } + em = rb_entry(rb_node, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + + em = NULL; + goto out; + +found: + atomic_inc(&em->refs); +out: + return em; +} + +/** + * search_extent_mapping - find a nearby extent map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. + * + * If one can't be found, any nearby extent may be returned + */ +struct extent_map *search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + struct extent_map *em; + struct rb_node *rb_node; + struct rb_node *prev = NULL; + struct rb_node *next = NULL; + + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); + goto found; + } + if (!rb_node && next) { + em = rb_entry(next, struct extent_map, rb_node); + goto found; + } + if (!rb_node) { + em = NULL; + goto out; + } + if (IS_ERR(rb_node)) { + em = ERR_PTR(PTR_ERR(rb_node)); + goto out; + } + em = rb_entry(rb_node, struct extent_map, rb_node); + goto found; + + em = NULL; + goto out; + +found: + atomic_inc(&em->refs); +out: + return em; +} + +/** + * remove_extent_mapping - removes an extent_map from the extent tree + * @tree: extent tree to remove from + * @em: extent map beeing removed + * + * Removes @em from @tree. No reference counts are dropped, and no checks + * are done to see if the range is in use + */ +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +{ + int ret = 0; + + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + rb_erase(&em->rb_node, &tree->map); + em->in_tree = 0; + return ret; +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h new file mode 100644 index 00000000000..ab6d74b6e64 --- /dev/null +++ b/fs/btrfs/extent_map.h @@ -0,0 +1,65 @@ +#ifndef __EXTENTMAP__ +#define __EXTENTMAP__ + +#include <linux/rbtree.h> + +#define EXTENT_MAP_LAST_BYTE (u64)-4 +#define EXTENT_MAP_HOLE (u64)-3 +#define EXTENT_MAP_INLINE (u64)-2 +#define EXTENT_MAP_DELALLOC (u64)-1 + +/* bits for the flags field */ +#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ +#define EXTENT_FLAG_COMPRESSED 1 +#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ +#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ + +struct extent_map { + struct rb_node rb_node; + + /* all of these are in bytes */ + u64 start; + u64 len; + u64 orig_start; + u64 block_start; + u64 block_len; + unsigned long flags; + struct block_device *bdev; + atomic_t refs; + int in_tree; +}; + +struct extent_map_tree { + struct rb_root map; + rwlock_t lock; +}; + +static inline u64 extent_map_end(struct extent_map *em) +{ + if (em->start + em->len < em->start) + return (u64)-1; + return em->start + em->len; +} + +static inline u64 extent_map_block_end(struct extent_map *em) +{ + if (em->block_start + em->block_len < em->block_start) + return (u64)-1; + return em->block_start + em->block_len; +} + +void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask); +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em); +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); + +struct extent_map *alloc_extent_map(gfp_t mask); +void free_extent_map(struct extent_map *em); +int __init extent_map_init(void); +void extent_map_exit(void); +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); +struct extent_map *search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +#endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c new file mode 100644 index 00000000000..9b99886562d --- /dev/null +++ b/fs/btrfs/file-item.c @@ -0,0 +1,834 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/bio.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" + +#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) * 2) / \ + size) - 1)) + +#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ + sizeof(struct btrfs_ordered_sum)) / \ + sizeof(struct btrfs_sector_sum) * \ + (r)->sectorsize - (r)->sectorsize) + +int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding) +{ + int ret = 0; + struct btrfs_file_extent_item *item; + struct btrfs_key file_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + + path = btrfs_alloc_path(); + BUG_ON(!path); + file_key.objectid = objectid; + file_key.offset = pos; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + sizeof(*item)); + if (ret < 0) + goto out; + BUG_ON(ret); + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset); + btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); + btrfs_set_file_extent_offset(leaf, item, offset); + btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes); + btrfs_set_file_extent_generation(leaf, item, trans->transid); + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, compression); + btrfs_set_file_extent_encryption(leaf, item, encryption); + btrfs_set_file_extent_other_encoding(leaf, item, other_encoding); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return ret; +} + +struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow) +{ + int ret; + struct btrfs_key file_key; + struct btrfs_key found_key; + struct btrfs_csum_item *item; + struct extent_buffer *leaf; + u64 csum_offset = 0; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int csums_in_item; + + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = bytenr; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); + if (ret < 0) + goto fail; + leaf = path->nodes[0]; + if (ret > 0) { + ret = 1; + if (path->slots[0] == 0) + goto fail; + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY) + goto fail; + + csum_offset = (bytenr - found_key.offset) >> + root->fs_info->sb->s_blocksize_bits; + csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); + csums_in_item /= csum_size; + + if (csum_offset >= csums_in_item) { + ret = -EFBIG; + goto fail; + } + } + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); + return item; +fail: + if (ret > 0) + ret = -ENOENT; + return ERR_PTR(ret); +} + + +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 offset, int mod) +{ + int ret; + struct btrfs_key file_key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + + file_key.objectid = objectid; + file_key.offset = offset; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); + return ret; +} + + +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst) +{ + u32 sum; + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + u64 offset; + u64 item_start_offset = 0; + u64 item_last_offset = 0; + u64 disk_bytenr; + u32 diff; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int ret; + struct btrfs_path *path; + struct btrfs_csum_item *item = NULL; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + + path = btrfs_alloc_path(); + if (bio->bi_size > PAGE_CACHE_SIZE * 8) + path->reada = 2; + + WARN_ON(bio->bi_vcnt <= 0); + + disk_bytenr = (u64)bio->bi_sector << 9; + while (bio_index < bio->bi_vcnt) { + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); + if (ret == 0) + goto found; + + if (!item || disk_bytenr < item_start_offset || + disk_bytenr >= item_last_offset) { + struct btrfs_key found_key; + u32 item_size; + + if (item) + btrfs_release_path(root, path); + item = btrfs_lookup_csum(NULL, root->fs_info->csum_root, + path, disk_bytenr, 0); + if (IS_ERR(item)) { + ret = PTR_ERR(item); + if (ret == -ENOENT || ret == -EFBIG) + ret = 0; + sum = 0; + if (BTRFS_I(inode)->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + set_extent_bits(io_tree, offset, + offset + bvec->bv_len - 1, + EXTENT_NODATASUM, GFP_NOFS); + } else { + printk(KERN_INFO "btrfs no csum found " + "for inode %lu start %llu\n", + inode->i_ino, + (unsigned long long)offset); + } + item = NULL; + btrfs_release_path(root, path); + goto found; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + item_start_offset = found_key.offset; + item_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + item_last_offset = item_start_offset + + (item_size / csum_size) * + root->sectorsize; + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + } + /* + * this byte range must be able to fit inside + * a single leaf so it will also fit inside a u32 + */ + diff = disk_bytenr - item_start_offset; + diff = diff / root->sectorsize; + diff = diff * csum_size; + + read_extent_buffer(path->nodes[0], &sum, + ((unsigned long)item) + diff, + csum_size); +found: + if (dst) + *dst++ = sum; + else + set_state_private(io_tree, offset, sum); + disk_bytenr += bvec->bv_len; + bio_index++; + bvec++; + } + btrfs_free_path(path); + return 0; +} + +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_csum_item *item; + unsigned long offset; + int ret; + size_t size; + u64 csum_end; + u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = start; + key.type = BTRFS_EXTENT_CSUM_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + if (ret > 0 && path->slots[0] > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY) { + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + if (offset * csum_size < + btrfs_item_size_nr(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } + + while (start <= end) { + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto fail; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY) + break; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.offset > end) + break; + + if (key.offset > start) + start = key.offset; + + size = btrfs_item_size_nr(leaf, path->slots[0]); + csum_end = key.offset + (size / csum_size) * root->sectorsize; + if (csum_end <= start) { + path->slots[0]++; + continue; + } + + csum_end = min(csum_end, end + 1); + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + while (start < csum_end) { + size = min_t(size_t, csum_end - start, + MAX_ORDERED_SUM_BYTES(root)); + sums = kzalloc(btrfs_ordered_sum_size(root, size), + GFP_NOFS); + BUG_ON(!sums); + + sector_sum = sums->sums; + sums->bytenr = start; + sums->len = size; + + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + offset *= csum_size; + + while (size > 0) { + read_extent_buffer(path->nodes[0], + §or_sum->sum, + ((unsigned long)item) + + offset, csum_size); + sector_sum->bytenr = start; + + size -= root->sectorsize; + start += root->sectorsize; + offset += csum_size; + sector_sum++; + } + list_add_tail(&sums->list, list); + } + path->slots[0]++; + } + ret = 0; +fail: + btrfs_free_path(path); + return ret; +} + +int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 file_start, int contig) +{ + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + char *data; + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + unsigned long total_bytes = 0; + unsigned long this_sum_bytes = 0; + u64 offset; + u64 disk_bytenr; + + WARN_ON(bio->bi_vcnt <= 0); + sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); + if (!sums) + return -ENOMEM; + + sector_sum = sums->sums; + disk_bytenr = (u64)bio->bi_sector << 9; + sums->len = bio->bi_size; + INIT_LIST_HEAD(&sums->list); + + if (contig) + offset = file_start; + else + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); + sums->bytenr = ordered->start; + + while (bio_index < bio->bi_vcnt) { + if (!contig) + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + + if (!contig && (offset >= ordered->file_offset + ordered->len || + offset < ordered->file_offset)) { + unsigned long bytes_left; + sums->len = this_sum_bytes; + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + + bytes_left = bio->bi_size - total_bytes; + + sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), + GFP_NOFS); + BUG_ON(!sums); + sector_sum = sums->sums; + sums->len = bytes_left; + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); + sums->bytenr = ordered->start; + } + + data = kmap_atomic(bvec->bv_page, KM_USER0); + sector_sum->sum = ~(u32)0; + sector_sum->sum = btrfs_csum_data(root, + data + bvec->bv_offset, + sector_sum->sum, + bvec->bv_len); + kunmap_atomic(data, KM_USER0); + btrfs_csum_final(sector_sum->sum, + (char *)§or_sum->sum); + sector_sum->bytenr = disk_bytenr; + + sector_sum++; + bio_index++; + total_bytes += bvec->bv_len; + this_sum_bytes += bvec->bv_len; + disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; + bvec++; + } + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + return 0; +} + +/* + * helper function for csum removal, this expects the + * key to describe the csum pointed to by the path, and it expects + * the csum to overlap the range [bytenr, len] + * + * The csum should not be entirely contained in the range and the + * range should not be entirely contained in the csum. + * + * This calls btrfs_truncate_item with the correct args based on the + * overlap, and fixes up the key as required. + */ +static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u64 bytenr, u64 len) +{ + struct extent_buffer *leaf; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + u64 csum_end; + u64 end_byte = bytenr + len; + u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; + int ret; + + leaf = path->nodes[0]; + csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end <<= root->fs_info->sb->s_blocksize_bits; + csum_end += key->offset; + + if (key->offset < bytenr && csum_end <= end_byte) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * A simple truncate off the end of the item + */ + u32 new_size = (bytenr - key->offset) >> blocksize_bits; + new_size *= csum_size; + ret = btrfs_truncate_item(trans, root, path, new_size, 1); + BUG_ON(ret); + } else if (key->offset >= bytenr && csum_end > end_byte && + end_byte > key->offset) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * we need to truncate from the beginning of the csum + */ + u32 new_size = (csum_end - end_byte) >> blocksize_bits; + new_size *= csum_size; + + ret = btrfs_truncate_item(trans, root, path, new_size, 0); + BUG_ON(ret); + + key->offset = end_byte; + ret = btrfs_set_item_key_safe(trans, root, path, key); + BUG_ON(ret); + } else { + BUG(); + } + return 0; +} + +/* + * deletes the csum items from the csum tree for a given + * range of bytes. + */ +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len) +{ + struct btrfs_path *path; + struct btrfs_key key; + u64 end_byte = bytenr + len; + u64 csum_end; + struct extent_buffer *leaf; + int ret; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int blocksize_bits = root->fs_info->sb->s_blocksize_bits; + + root = root->fs_info->csum_root; + + path = btrfs_alloc_path(); + + while (1) { + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = end_byte - 1; + key.type = BTRFS_EXTENT_CSUM_KEY; + + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY) { + break; + } + + if (key.offset >= end_byte) + break; + + csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end <<= blocksize_bits; + csum_end += key.offset; + + /* this csum ends before we start, we're done */ + if (csum_end <= bytenr) + break; + + /* delete the entire item, it is inside our range */ + if (key.offset >= bytenr && csum_end <= end_byte) { + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + if (key.offset == bytenr) + break; + } else if (key.offset < bytenr && csum_end > end_byte) { + unsigned long offset; + unsigned long shift_len; + unsigned long item_offset; + /* + * [ bytenr - len ] + * [csum ] + * + * Our bytes are in the middle of the csum, + * we need to split this item and insert a new one. + * + * But we can't drop the path because the + * csum could change, get removed, extended etc. + * + * The trick here is the max size of a csum item leaves + * enough room in the tree block for a single + * item header. So, we split the item in place, + * adding a new header pointing to the existing + * bytes. Then we loop around again and we have + * a nicely formed csum item that we can neatly + * truncate. + */ + offset = (bytenr - key.offset) >> blocksize_bits; + offset *= csum_size; + + shift_len = (len >> blocksize_bits) * csum_size; + + item_offset = btrfs_item_ptr_offset(leaf, + path->slots[0]); + + memset_extent_buffer(leaf, 0, item_offset + offset, + shift_len); + key.offset = bytenr; + + /* + * btrfs_split_item returns -EAGAIN when the + * item changed size or key + */ + ret = btrfs_split_item(trans, root, path, &key, offset); + BUG_ON(ret && ret != -EAGAIN); + + key.offset = end_byte - 1; + } else { + ret = truncate_one_csum(trans, root, path, + &key, bytenr, len); + BUG_ON(ret); + if (key.offset < bytenr) + break; + } + btrfs_release_path(root, path); + } +out: + btrfs_free_path(path); + return 0; +} + +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums) +{ + u64 bytenr; + int ret; + struct btrfs_key file_key; + struct btrfs_key found_key; + u64 next_offset; + u64 total_bytes = 0; + int found_next; + struct btrfs_path *path; + struct btrfs_csum_item *item; + struct btrfs_csum_item *item_end; + struct extent_buffer *leaf = NULL; + u64 csum_offset; + struct btrfs_sector_sum *sector_sum; + u32 nritems; + u32 ins_size; + char *eb_map; + char *eb_token; + unsigned long map_len; + unsigned long map_start; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + + path = btrfs_alloc_path(); + BUG_ON(!path); + sector_sum = sums->sums; +again: + next_offset = (u64)-1; + found_next = 0; + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = sector_sum->bytenr; + bytenr = sector_sum->bytenr; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + + item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); + if (!IS_ERR(item)) { + leaf = path->nodes[0]; + ret = 0; + goto found; + } + ret = PTR_ERR(item); + if (ret == -EFBIG) { + u32 item_size; + /* we found one, but it isn't big enough yet */ + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if ((item_size / csum_size) >= + MAX_CSUM_ITEMS(root, csum_size)) { + /* already at max size, make a new one */ + goto insert; + } + } else { + int slot = path->slots[0] + 1; + /* we didn't find a csum item, insert one */ + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems - 1) { + ret = btrfs_next_leaf(root, path); + if (ret == 1) + found_next = 1; + if (ret != 0) + goto insert; + slot = 0; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); + if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + found_key.type != BTRFS_EXTENT_CSUM_KEY) { + found_next = 1; + goto insert; + } + next_offset = found_key.offset; + found_next = 1; + goto insert; + } + + /* + * at this point, we know the tree has an item, but it isn't big + * enough yet to put our csum in. Grow it + */ + btrfs_release_path(root, path); + ret = btrfs_search_slot(trans, root, &file_key, path, + csum_size, 1); + if (ret < 0) + goto fail_unlock; + + if (ret > 0) { + if (path->slots[0] == 0) + goto insert; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + csum_offset = (bytenr - found_key.offset) >> + root->fs_info->sb->s_blocksize_bits; + + if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY || + found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { + goto insert; + } + + if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / + csum_size) { + u32 diff = (csum_offset + 1) * csum_size; + + /* + * is the item big enough already? we dropped our lock + * before and need to recheck + */ + if (diff < btrfs_item_size_nr(leaf, path->slots[0])) + goto csum; + + diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); + if (diff != csum_size) + goto insert; + + ret = btrfs_extend_item(trans, root, path, diff); + BUG_ON(ret); + goto csum; + } + +insert: + btrfs_release_path(root, path); + csum_offset = 0; + if (found_next) { + u64 tmp = total_bytes + root->sectorsize; + u64 next_sector = sector_sum->bytenr; + struct btrfs_sector_sum *next = sector_sum + 1; + + while (tmp < sums->len) { + if (next_sector + root->sectorsize != next->bytenr) + break; + tmp += root->sectorsize; + next_sector = next->bytenr; + next++; + } + tmp = min(tmp, next_offset - file_key.offset); + tmp >>= root->fs_info->sb->s_blocksize_bits; + tmp = max((u64)1, tmp); + tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); + ins_size = csum_size * tmp; + } else { + ins_size = csum_size; + } + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + ins_size); + path->leave_spinning = 0; + if (ret < 0) + goto fail_unlock; + if (ret != 0) { + WARN_ON(1); + goto fail_unlock; + } +csum: + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + ret = 0; + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); +found: + item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + + btrfs_item_size_nr(leaf, path->slots[0])); + eb_token = NULL; +next_sector: + + if (!eb_token || + (unsigned long)item + csum_size >= map_start + map_len) { + int err; + + if (eb_token) + unmap_extent_buffer(leaf, eb_token, KM_USER1); + eb_token = NULL; + err = map_private_extent_buffer(leaf, (unsigned long)item, + csum_size, + &eb_token, &eb_map, + &map_start, &map_len, KM_USER1); + if (err) + eb_token = NULL; + } + if (eb_token) { + memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)), + §or_sum->sum, csum_size); + } else { + write_extent_buffer(leaf, §or_sum->sum, + (unsigned long)item, csum_size); + } + + total_bytes += root->sectorsize; + sector_sum++; + if (total_bytes < sums->len) { + item = (struct btrfs_csum_item *)((char *)item + + csum_size); + if (item < item_end && bytenr + PAGE_CACHE_SIZE == + sector_sum->bytenr) { + bytenr = sector_sum->bytenr; + goto next_sector; + } + } + if (eb_token) { + unmap_extent_buffer(leaf, eb_token, KM_USER1); + eb_token = NULL; + } + btrfs_mark_buffer_dirty(path->nodes[0]); + if (total_bytes < sums->len) { + btrfs_release_path(root, path); + cond_resched(); + goto again; + } +out: + btrfs_free_path(path); + return ret; + +fail_unlock: + goto out; +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c new file mode 100644 index 00000000000..6ed434ac037 --- /dev/null +++ b/fs/btrfs/file.c @@ -0,0 +1,1167 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/backing-dev.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "tree-log.h" +#include "locking.h" +#include "compat.h" + + +/* simple helper to fault in pages and copy. This should go away + * and be replaced with calls into generic code. + */ +static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, + int write_bytes, + struct page **prepared_pages, + const char __user *buf) +{ + long page_fault = 0; + int i; + int offset = pos & (PAGE_CACHE_SIZE - 1); + + for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { + size_t count = min_t(size_t, + PAGE_CACHE_SIZE - offset, write_bytes); + struct page *page = prepared_pages[i]; + fault_in_pages_readable(buf, count); + + /* Copy data from userspace to the current page */ + kmap(page); + page_fault = __copy_from_user(page_address(page) + offset, + buf, count); + /* Flush processor's dcache for this page */ + flush_dcache_page(page); + kunmap(page); + buf += count; + write_bytes -= count; + + if (page_fault) + break; + } + return page_fault ? -EFAULT : 0; +} + +/* + * unlocks pages after btrfs_file_write is done with them + */ +static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) +{ + size_t i; + for (i = 0; i < num_pages; i++) { + if (!pages[i]) + break; + /* page checked is some magic around finding pages that + * have been modified without going through btrfs_set_page_dirty + * clear it here + */ + ClearPageChecked(pages[i]); + unlock_page(pages[i]); + mark_page_accessed(pages[i]); + page_cache_release(pages[i]); + } +} + +/* + * after copy_from_user, pages need to be dirtied and we need to make + * sure holes are created between the current EOF and the start of + * any next extents (if required). + * + * this also makes the decision about creating an inline extent vs + * doing real data extents, marking pages dirty and delalloc as required. + */ +static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct file *file, + struct page **pages, + size_t num_pages, + loff_t pos, + size_t write_bytes) +{ + int err = 0; + int i; + struct inode *inode = fdentry(file)->d_inode; + u64 num_bytes; + u64 start_pos; + u64 end_of_last_block; + u64 end_pos = pos + write_bytes; + loff_t isize = i_size_read(inode); + + start_pos = pos & ~((u64)root->sectorsize - 1); + num_bytes = (write_bytes + pos - start_pos + + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + end_of_last_block = start_pos + num_bytes - 1; + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + if (err) + return err; + + for (i = 0; i < num_pages; i++) { + struct page *p = pages[i]; + SetPageUptodate(p); + ClearPageChecked(p); + set_page_dirty(p); + } + if (end_pos > isize) { + i_size_write(inode, end_pos); + /* we've only changed i_size in ram, and we haven't updated + * the disk i_size. There is no need to log the inode + * at this time. + */ + } + return err; +} + +/* + * this drops all the extents in the cache that intersect the range + * [start, end]. Existing extents are split as required. + */ +int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned) +{ + struct extent_map *em; + struct extent_map *split = NULL; + struct extent_map *split2 = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 len = end - start + 1; + int ret; + int testend = 1; + unsigned long flags; + int compressed = 0; + + WARN_ON(end < start); + if (end == (u64)-1) { + len = (u64)-1; + testend = 0; + } + while (1) { + if (!split) + split = alloc_extent_map(GFP_NOFS); + if (!split2) + split2 = alloc_extent_map(GFP_NOFS); + + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { + write_unlock(&em_tree->lock); + break; + } + flags = em->flags; + if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { + if (testend && em->start + em->len >= start + len) { + free_extent_map(em); + write_unlock(&em_tree->lock); + break; + } + start = em->start + em->len; + if (testend) + len = start + len - (em->start + em->len); + free_extent_map(em); + write_unlock(&em_tree->lock); + continue; + } + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + remove_extent_mapping(em_tree, em); + + if (em->block_start < EXTENT_MAP_LAST_BYTE && + em->start < start) { + split->start = em->start; + split->len = start - em->start; + split->orig_start = em->orig_start; + split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + + split->bdev = em->bdev; + split->flags = flags; + ret = add_extent_mapping(em_tree, split); + BUG_ON(ret); + free_extent_map(split); + split = split2; + split2 = NULL; + } + if (em->block_start < EXTENT_MAP_LAST_BYTE && + testend && em->start + em->len > start + len) { + u64 diff = start + len - em->start; + + split->start = start + len; + split->len = em->start + em->len - (start + len); + split->bdev = em->bdev; + split->flags = flags; + + if (compressed) { + split->block_len = em->block_len; + split->block_start = em->block_start; + split->orig_start = em->orig_start; + } else { + split->block_len = split->len; + split->block_start = em->block_start + diff; + split->orig_start = split->start; + } + + ret = add_extent_mapping(em_tree, split); + BUG_ON(ret); + free_extent_map(split); + split = NULL; + } + write_unlock(&em_tree->lock); + + /* once for us */ + free_extent_map(em); + /* once for the tree*/ + free_extent_map(em); + } + if (split) + free_extent_map(split); + if (split2) + free_extent_map(split2); + return 0; +} + +/* + * this is very complex, but the basic idea is to drop all extents + * in the range start - end. hint_block is filled in with a block number + * that would be a good hint to the block allocator for this file. + * + * If an extent intersects the range but is not entirely inside the range + * it is either truncated or split. Anything entirely inside the range + * is deleted from the tree. + */ +int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, + u64 start, u64 end, u64 *hint_byte, int drop_cache) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key new_key; + u64 search_start = start; + u64 disk_bytenr = 0; + u64 num_bytes = 0; + u64 extent_offset = 0; + u64 extent_end = 0; + int del_nr = 0; + int del_slot = 0; + int extent_type; + int recow; + int ret; + + if (drop_cache) + btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + recow = 0; + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + search_start, -1); + if (ret < 0) + break; + if (ret > 0 && path->slots[0] > 0 && search_start == start) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + if (key.objectid == inode->i_ino && + key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + ret = 0; +next_slot: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + BUG_ON(del_nr > 0); + ret = btrfs_next_leaf(root, path); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; + } + leaf = path->nodes[0]; + recow = 1; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid > inode->i_ino || + key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); + extent_end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = key.offset + + btrfs_file_extent_inline_len(leaf, fi); + } else { + WARN_ON(1); + extent_end = search_start; + } + + if (extent_end <= search_start) { + path->slots[0]++; + goto next_slot; + } + + search_start = max(key.offset, start); + if (recow) { + btrfs_release_path(root, path); + continue; + } + + /* + * | - range to drop - | + * | -------- extent -------- | + */ + if (start > key.offset && end < extent_end) { + BUG_ON(del_nr > 0); + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = start; + ret = btrfs_duplicate_item(trans, root, path, + &new_key); + if (ret == -EAGAIN) { + btrfs_release_path(root, path); + continue; + } + if (ret < 0) + break; + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + start - key.offset); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + extent_offset += start - key.offset; + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - start); + btrfs_mark_buffer_dirty(leaf); + + if (disk_bytenr > 0) { + ret = btrfs_inc_extent_ref(trans, root, + disk_bytenr, num_bytes, 0, + root->root_key.objectid, + new_key.objectid, + start - extent_offset); + BUG_ON(ret); + *hint_byte = disk_bytenr; + } + key.offset = start; + } + /* + * | ---- range to drop ----- | + * | -------- extent -------- | + */ + if (start <= key.offset && end < extent_end) { + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = end; + btrfs_set_item_key_safe(trans, root, path, &new_key); + + extent_offset += end - key.offset; + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - end); + btrfs_mark_buffer_dirty(leaf); + if (disk_bytenr > 0) { + inode_sub_bytes(inode, end - key.offset); + *hint_byte = disk_bytenr; + } + break; + } + + search_start = extent_end; + /* + * | ---- range to drop ----- | + * | -------- extent -------- | + */ + if (start > key.offset && end >= extent_end) { + BUG_ON(del_nr > 0); + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + + btrfs_set_file_extent_num_bytes(leaf, fi, + start - key.offset); + btrfs_mark_buffer_dirty(leaf); + if (disk_bytenr > 0) { + inode_sub_bytes(inode, extent_end - start); + *hint_byte = disk_bytenr; + } + if (end == extent_end) + break; + + path->slots[0]++; + goto next_slot; + } + + /* + * | ---- range to drop ----- | + * | ------ extent ------ | + */ + if (start <= key.offset && end >= extent_end) { + if (del_nr == 0) { + del_slot = path->slots[0]; + del_nr = 1; + } else { + BUG_ON(del_slot + del_nr != path->slots[0]); + del_nr++; + } + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + inode_sub_bytes(inode, + extent_end - key.offset); + extent_end = ALIGN(extent_end, + root->sectorsize); + } else if (disk_bytenr > 0) { + ret = btrfs_free_extent(trans, root, + disk_bytenr, num_bytes, 0, + root->root_key.objectid, + key.objectid, key.offset - + extent_offset); + BUG_ON(ret); + inode_sub_bytes(inode, + extent_end - key.offset); + *hint_byte = disk_bytenr; + } + + if (end == extent_end) + break; + + if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { + path->slots[0]++; + goto next_slot; + } + + ret = btrfs_del_items(trans, root, path, del_slot, + del_nr); + BUG_ON(ret); + + del_nr = 0; + del_slot = 0; + + btrfs_release_path(root, path); + continue; + } + + BUG_ON(1); + } + + if (del_nr > 0) { + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + BUG_ON(ret); + } + + btrfs_free_path(path); + return ret; +} + +static int extent_mergeable(struct extent_buffer *leaf, int slot, + u64 objectid, u64 bytenr, u64 orig_offset, + u64 *start, u64 *end) +{ + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 extent_end; + + if (slot < 0 || slot >= btrfs_header_nritems(leaf)) + return 0; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || + btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || + btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset || + btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + return 0; + + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if ((*start && *start != key.offset) || (*end && *end != extent_end)) + return 0; + + *start = key.offset; + *end = extent_end; + return 1; +} + +/* + * Mark extent in the range start - end as written. + * + * This changes extent type from 'pre-allocated' to 'regular'. If only + * part of extent is marked as written, the extent will be split into + * two or three. + */ +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct inode *inode, u64 start, u64 end) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key new_key; + u64 bytenr; + u64 num_bytes; + u64 extent_end; + u64 orig_offset; + u64 other_start; + u64 other_end; + u64 split; + int del_nr = 0; + int del_slot = 0; + int recow; + int ret; + + btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = btrfs_alloc_path(); + BUG_ON(!path); +again: + recow = 0; + split = start; + key.objectid = inode->i_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = split; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + BUG_ON(key.objectid != inode->i_ino || + key.type != BTRFS_EXTENT_DATA_KEY); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + BUG_ON(btrfs_file_extent_type(leaf, fi) != + BTRFS_FILE_EXTENT_PREALLOC); + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + BUG_ON(key.offset > start || extent_end < end); + + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); + memcpy(&new_key, &key, sizeof(new_key)); + + if (start == key.offset && end < extent_end) { + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1, + inode->i_ino, bytenr, orig_offset, + &other_start, &other_end)) { + new_key.offset = end; + btrfs_set_item_key_safe(trans, root, path, &new_key); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - end); + btrfs_set_file_extent_offset(leaf, fi, + end - orig_offset); + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + end - other_start); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + } + + if (start > key.offset && end == extent_end) { + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, + inode->i_ino, bytenr, orig_offset, + &other_start, &other_end)) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + start - key.offset); + path->slots[0]++; + new_key.offset = start; + btrfs_set_item_key_safe(trans, root, path, &new_key); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + other_end - start); + btrfs_set_file_extent_offset(leaf, fi, + start - orig_offset); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + } + + while (start > key.offset || end < extent_end) { + if (key.offset == start) + split = end; + + new_key.offset = split; + ret = btrfs_duplicate_item(trans, root, path, &new_key); + if (ret == -EAGAIN) { + btrfs_release_path(root, path); + goto again; + } + BUG_ON(ret < 0); + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + split - key.offset); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - split); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, + root->root_key.objectid, + inode->i_ino, orig_offset); + BUG_ON(ret); + + if (split == start) { + key.offset = start; + } else { + BUG_ON(start != key.offset); + path->slots[0]--; + extent_end = end; + } + recow = 1; + } + + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, + inode->i_ino, bytenr, orig_offset, + &other_start, &other_end)) { + if (recow) { + btrfs_release_path(root, path); + goto again; + } + extent_end = other_end; + del_slot = path->slots[0] + 1; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + 0, root->root_key.objectid, + inode->i_ino, orig_offset); + BUG_ON(ret); + } + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1, + inode->i_ino, bytenr, orig_offset, + &other_start, &other_end)) { + if (recow) { + btrfs_release_path(root, path); + goto again; + } + key.offset = other_start; + del_slot = path->slots[0]; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + 0, root->root_key.objectid, + inode->i_ino, orig_offset); + BUG_ON(ret); + } + if (del_nr == 0) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_mark_buffer_dirty(leaf); + } else { + fi = btrfs_item_ptr(leaf, del_slot - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - key.offset); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + BUG_ON(ret); + } +out: + btrfs_free_path(path); + return 0; +} + +/* + * this gets pages into the page cache and locks them down, it also properly + * waits for data=ordered extents to finish before allowing the pages to be + * modified. + */ +static noinline int prepare_pages(struct btrfs_root *root, struct file *file, + struct page **pages, size_t num_pages, + loff_t pos, unsigned long first_index, + unsigned long last_index, size_t write_bytes) +{ + int i; + unsigned long index = pos >> PAGE_CACHE_SHIFT; + struct inode *inode = fdentry(file)->d_inode; + int err = 0; + u64 start_pos; + u64 last_pos; + + start_pos = pos & ~((u64)root->sectorsize - 1); + last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; + + if (start_pos > inode->i_size) { + err = btrfs_cont_expand(inode, start_pos); + if (err) + return err; + } + + memset(pages, 0, num_pages * sizeof(struct page *)); +again: + for (i = 0; i < num_pages; i++) { + pages[i] = grab_cache_page(inode->i_mapping, index + i); + if (!pages[i]) { + err = -ENOMEM; + BUG_ON(1); + } + wait_on_page_writeback(pages[i]); + } + if (start_pos < inode->i_size) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + last_pos - 1); + if (ordered && + ordered->file_offset + ordered->len > start_pos && + ordered->file_offset < last_pos) { + btrfs_put_ordered_extent(ordered); + unlock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); + for (i = 0; i < num_pages; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + btrfs_wait_ordered_range(inode, start_pos, + last_pos - start_pos); + goto again; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + + clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, + last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, + GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); + } + for (i = 0; i < num_pages; i++) { + clear_page_dirty_for_io(pages[i]); + set_page_extent_mapped(pages[i]); + WARN_ON(!PageLocked(pages[i])); + } + return 0; +} + +static ssize_t btrfs_file_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + loff_t pos; + loff_t start_pos; + ssize_t num_written = 0; + ssize_t err = 0; + int ret = 0; + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page **pages = NULL; + int nrptrs; + struct page *pinned[2]; + unsigned long first_index; + unsigned long last_index; + int will_write; + + will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || + (file->f_flags & O_DIRECT)); + + nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, + PAGE_CACHE_SIZE / (sizeof(struct page *))); + pinned[0] = NULL; + pinned[1] = NULL; + + pos = *ppos; + start_pos = pos; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + /* do the reserve before the mutex lock in case we have to do some + * flushing. We wouldn't deadlock, but this is more polite. + */ + err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (err) + goto out_nolock; + + mutex_lock(&inode->i_mutex); + + current->backing_dev_info = inode->i_mapping->backing_dev_info; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + + if (count == 0) + goto out; + + err = file_remove_suid(file); + if (err) + goto out; + + file_update_time(file); + + pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + + /* generic_write_checks can change our pos */ + start_pos = pos; + + BTRFS_I(inode)->sequence++; + first_index = pos >> PAGE_CACHE_SHIFT; + last_index = (pos + count) >> PAGE_CACHE_SHIFT; + + /* + * there are lots of better ways to do this, but this code + * makes sure the first and last page in the file range are + * up to date and ready for cow + */ + if ((pos & (PAGE_CACHE_SIZE - 1))) { + pinned[0] = grab_cache_page(inode->i_mapping, first_index); + if (!PageUptodate(pinned[0])) { + ret = btrfs_readpage(NULL, pinned[0]); + BUG_ON(ret); + wait_on_page_locked(pinned[0]); + } else { + unlock_page(pinned[0]); + } + } + if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { + pinned[1] = grab_cache_page(inode->i_mapping, last_index); + if (!PageUptodate(pinned[1])) { + ret = btrfs_readpage(NULL, pinned[1]); + BUG_ON(ret); + wait_on_page_locked(pinned[1]); + } else { + unlock_page(pinned[1]); + } + } + + while (count > 0) { + size_t offset = pos & (PAGE_CACHE_SIZE - 1); + size_t write_bytes = min(count, nrptrs * + (size_t)PAGE_CACHE_SIZE - + offset); + size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + WARN_ON(num_pages > nrptrs); + memset(pages, 0, sizeof(struct page *) * nrptrs); + + ret = btrfs_check_data_free_space(root, inode, write_bytes); + if (ret) + goto out; + + ret = prepare_pages(root, file, pages, num_pages, + pos, first_index, last_index, + write_bytes); + if (ret) { + btrfs_free_reserved_data_space(root, inode, + write_bytes); + goto out; + } + + ret = btrfs_copy_from_user(pos, num_pages, + write_bytes, pages, buf); + if (ret) { + btrfs_free_reserved_data_space(root, inode, + write_bytes); + btrfs_drop_pages(pages, num_pages); + goto out; + } + + ret = dirty_and_release_pages(NULL, root, file, pages, + num_pages, pos, write_bytes); + btrfs_drop_pages(pages, num_pages); + if (ret) { + btrfs_free_reserved_data_space(root, inode, + write_bytes); + goto out; + } + + if (will_write) { + filemap_fdatawrite_range(inode->i_mapping, pos, + pos + write_bytes - 1); + } else { + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + num_pages); + if (num_pages < + (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + btrfs_btree_balance_dirty(root, 1); + btrfs_throttle(root); + } + + buf += write_bytes; + count -= write_bytes; + pos += write_bytes; + num_written += write_bytes; + + cond_resched(); + } +out: + mutex_unlock(&inode->i_mutex); + if (ret) + err = ret; + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + +out_nolock: + kfree(pages); + if (pinned[0]) + page_cache_release(pinned[0]); + if (pinned[1]) + page_cache_release(pinned[1]); + *ppos = pos; + + /* + * we want to make sure fsync finds this change + * but we haven't joined a transaction running right now. + * + * Later on, someone is sure to update the inode and get the + * real transid recorded. + * + * We set last_trans now to the fs_info generation + 1, + * this will either be one more than the running transaction + * or the generation used for the next transaction if there isn't + * one running right now. + */ + BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + + if (num_written > 0 && will_write) { + struct btrfs_trans_handle *trans; + + err = btrfs_wait_ordered_range(inode, start_pos, num_written); + if (err) + num_written = err; + + if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { + trans = btrfs_start_transaction(root, 1); + ret = btrfs_log_dentry_safe(trans, root, + file->f_dentry); + if (ret == 0) { + ret = btrfs_sync_log(trans, root); + if (ret == 0) + btrfs_end_transaction(trans, root); + else + btrfs_commit_transaction(trans, root); + } else if (ret != BTRFS_NO_LOG_SYNC) { + btrfs_commit_transaction(trans, root); + } else { + btrfs_end_transaction(trans, root); + } + } + if (file->f_flags & O_DIRECT) { + invalidate_mapping_pages(inode->i_mapping, + start_pos >> PAGE_CACHE_SHIFT, + (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); + } + } + current->backing_dev_info = NULL; + return num_written ? num_written : err; +} + +int btrfs_release_file(struct inode *inode, struct file *filp) +{ + /* + * ordered_data_close is set by settattr when we are about to truncate + * a file from a non-zero size to a zero size. This tries to + * flush down new bytes that may have been written if the + * application were using truncate to replace a file in place. + */ + if (BTRFS_I(inode)->ordered_data_close) { + BTRFS_I(inode)->ordered_data_close = 0; + btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); + if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + filemap_flush(inode->i_mapping); + } + if (filp->private_data) + btrfs_ioctl_trans_end(filp); + return 0; +} + +/* + * fsync call for both files and directories. This logs the inode into + * the tree log instead of forcing full commits whenever possible. + * + * It needs to call filemap_fdatawait so that all ordered extent updates are + * in the metadata btree are up to date for copying to the log. + * + * It drops the inode mutex before doing the tree log commit. This is an + * important optimization for directories because holding the mutex prevents + * new operations on the dir while we write to disk. + */ +int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + struct btrfs_trans_handle *trans; + + + /* we wait first, since the writeback may change the inode */ + root->log_batch++; + /* the VFS called filemap_fdatawrite for us */ + btrfs_wait_ordered_range(inode, 0, (u64)-1); + root->log_batch++; + + /* + * check the transaction that last modified this inode + * and see if its already been committed + */ + if (!BTRFS_I(inode)->last_trans) + goto out; + + /* + * if the last transaction that changed this file was before + * the current transaction, we can bail out now without any + * syncing + */ + mutex_lock(&root->fs_info->trans_mutex); + if (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed) { + BTRFS_I(inode)->last_trans = 0; + mutex_unlock(&root->fs_info->trans_mutex); + goto out; + } + mutex_unlock(&root->fs_info->trans_mutex); + + /* + * ok we haven't committed the transaction yet, lets do a commit + */ + if (file && file->private_data) + btrfs_ioctl_trans_end(file); + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_log_dentry_safe(trans, root, dentry); + if (ret < 0) + goto out; + + /* we've logged all the items and now have a consistent + * version of the file in the log. It is possible that + * someone will come in and modify the file, but that's + * fine because the log is consistent on disk, and we + * have references to all of the file's extents + * + * It is possible that someone will come in and log the + * file again, but that will end up using the synchronization + * inside btrfs_sync_log to keep things safe. + */ + mutex_unlock(&dentry->d_inode->i_mutex); + + if (ret != BTRFS_NO_LOG_SYNC) { + if (ret > 0) { + ret = btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_sync_log(trans, root); + if (ret == 0) + ret = btrfs_end_transaction(trans, root); + else + ret = btrfs_commit_transaction(trans, root); + } + } else { + ret = btrfs_end_transaction(trans, root); + } + mutex_lock(&dentry->d_inode->i_mutex); +out: + return ret > 0 ? -EIO : ret; +} + +static const struct vm_operations_struct btrfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = btrfs_page_mkwrite, +}; + +static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) +{ + vma->vm_ops = &btrfs_file_vm_ops; + file_accessed(filp); + return 0; +} + +const struct file_operations btrfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .splice_read = generic_file_splice_read, + .write = btrfs_file_write, + .mmap = btrfs_file_mmap, + .open = generic_file_open, + .release = btrfs_release_file, + .fsync = btrfs_sync_file, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_ioctl, +#endif +}; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c new file mode 100644 index 00000000000..cb2849f0325 --- /dev/null +++ b/fs/btrfs/free-space-cache.c @@ -0,0 +1,1364 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/pagemap.h> +#include <linux/sched.h> +#include <linux/math64.h> +#include "ctree.h" +#include "free-space-cache.h" +#include "transaction.h" + +#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) +#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) + +static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize, + u64 offset) +{ + BUG_ON(offset < bitmap_start); + offset -= bitmap_start; + return (unsigned long)(div64_u64(offset, sectorsize)); +} + +static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize) +{ + return (unsigned long)(div64_u64(bytes, sectorsize)); +} + +static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group, + u64 offset) +{ + u64 bitmap_start; + u64 bytes_per_bitmap; + + bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize; + bitmap_start = offset - block_group->key.objectid; + bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); + bitmap_start *= bytes_per_bitmap; + bitmap_start += block_group->key.objectid; + + return bitmap_start; +} + +static int tree_insert_offset(struct rb_root *root, u64 offset, + struct rb_node *node, int bitmap) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_free_space *info; + + while (*p) { + parent = *p; + info = rb_entry(parent, struct btrfs_free_space, offset_index); + + if (offset < info->offset) { + p = &(*p)->rb_left; + } else if (offset > info->offset) { + p = &(*p)->rb_right; + } else { + /* + * we could have a bitmap entry and an extent entry + * share the same offset. If this is the case, we want + * the extent entry to always be found first if we do a + * linear search through the tree, since we want to have + * the quickest allocation time, and allocating from an + * extent is faster than allocating from a bitmap. So + * if we're inserting a bitmap and we find an entry at + * this offset, we want to go right, or after this entry + * logically. If we are inserting an extent and we've + * found a bitmap, we want to go left, or before + * logically. + */ + if (bitmap) { + WARN_ON(info->bitmap); + p = &(*p)->rb_right; + } else { + WARN_ON(!info->bitmap); + p = &(*p)->rb_left; + } + } + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + + return 0; +} + +/* + * searches the tree for the given offset. + * + * fuzzy - If this is set, then we are trying to make an allocation, and we just + * want a section that has at least bytes size and comes at or after the given + * offset. + */ +static struct btrfs_free_space * +tree_search_offset(struct btrfs_block_group_cache *block_group, + u64 offset, int bitmap_only, int fuzzy) +{ + struct rb_node *n = block_group->free_space_offset.rb_node; + struct btrfs_free_space *entry, *prev = NULL; + + /* find entry that is closest to the 'offset' */ + while (1) { + if (!n) { + entry = NULL; + break; + } + + entry = rb_entry(n, struct btrfs_free_space, offset_index); + prev = entry; + + if (offset < entry->offset) + n = n->rb_left; + else if (offset > entry->offset) + n = n->rb_right; + else + break; + } + + if (bitmap_only) { + if (!entry) + return NULL; + if (entry->bitmap) + return entry; + + /* + * bitmap entry and extent entry may share same offset, + * in that case, bitmap entry comes after extent entry. + */ + n = rb_next(n); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + if (entry->offset != offset) + return NULL; + + WARN_ON(!entry->bitmap); + return entry; + } else if (entry) { + if (entry->bitmap) { + /* + * if previous extent entry covers the offset, + * we should return it instead of the bitmap entry + */ + n = &entry->offset_index; + while (1) { + n = rb_prev(n); + if (!n) + break; + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap) { + if (prev->offset + prev->bytes > offset) + entry = prev; + break; + } + } + } + return entry; + } + + if (!prev) + return NULL; + + /* find last entry before the 'offset' */ + entry = prev; + if (entry->offset > offset) { + n = rb_prev(&entry->offset_index); + if (n) { + entry = rb_entry(n, struct btrfs_free_space, + offset_index); + BUG_ON(entry->offset > offset); + } else { + if (fuzzy) + return entry; + else + return NULL; + } + } + + if (entry->bitmap) { + n = &entry->offset_index; + while (1) { + n = rb_prev(n); + if (!n) + break; + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap) { + if (prev->offset + prev->bytes > offset) + return prev; + break; + } + } + if (entry->offset + BITS_PER_BITMAP * + block_group->sectorsize > offset) + return entry; + } else if (entry->offset + entry->bytes > offset) + return entry; + + if (!fuzzy) + return NULL; + + while (1) { + if (entry->bitmap) { + if (entry->offset + BITS_PER_BITMAP * + block_group->sectorsize > offset) + break; + } else { + if (entry->offset + entry->bytes > offset) + break; + } + + n = rb_next(&entry->offset_index); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + } + return entry; +} + +static void unlink_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + rb_erase(&info->offset_index, &block_group->free_space_offset); + block_group->free_extents--; + block_group->free_space -= info->bytes; +} + +static int link_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + int ret = 0; + + BUG_ON(!info->bitmap && !info->bytes); + ret = tree_insert_offset(&block_group->free_space_offset, info->offset, + &info->offset_index, (info->bitmap != NULL)); + if (ret) + return ret; + + block_group->free_space += info->bytes; + block_group->free_extents++; + return ret; +} + +static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) +{ + u64 max_bytes; + u64 bitmap_bytes; + u64 extent_bytes; + + /* + * The goal is to keep the total amount of memory used per 1gb of space + * at or below 32k, so we need to adjust how much memory we allow to be + * used by extent based free space tracking + */ + max_bytes = MAX_CACHE_BYTES_PER_GIG * + (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); + + /* + * we want to account for 1 more bitmap than what we have so we can make + * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as + * we add more bitmaps. + */ + bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE; + + if (bitmap_bytes >= max_bytes) { + block_group->extents_thresh = 0; + return; + } + + /* + * we want the extent entry threshold to always be at most 1/2 the maxw + * bytes we can have, or whatever is less than that. + */ + extent_bytes = max_bytes - bitmap_bytes; + extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); + + block_group->extents_thresh = + div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); +} + +static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + unsigned long start, end; + unsigned long i; + + start = offset_to_bit(info->offset, block_group->sectorsize, offset); + end = start + bytes_to_bits(bytes, block_group->sectorsize); + BUG_ON(end > BITS_PER_BITMAP); + + for (i = start; i < end; i++) + clear_bit(i, info->bitmap); + + info->bytes -= bytes; + block_group->free_space -= bytes; +} + +static void bitmap_set_bits(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + unsigned long start, end; + unsigned long i; + + start = offset_to_bit(info->offset, block_group->sectorsize, offset); + end = start + bytes_to_bits(bytes, block_group->sectorsize); + BUG_ON(end > BITS_PER_BITMAP); + + for (i = start; i < end; i++) + set_bit(i, info->bitmap); + + info->bytes += bytes; + block_group->free_space += bytes; +} + +static int search_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *bitmap_info, u64 *offset, + u64 *bytes) +{ + unsigned long found_bits = 0; + unsigned long bits, i; + unsigned long next_zero; + + i = offset_to_bit(bitmap_info->offset, block_group->sectorsize, + max_t(u64, *offset, bitmap_info->offset)); + bits = bytes_to_bits(*bytes, block_group->sectorsize); + + for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); + i < BITS_PER_BITMAP; + i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) { + next_zero = find_next_zero_bit(bitmap_info->bitmap, + BITS_PER_BITMAP, i); + if ((next_zero - i) >= bits) { + found_bits = next_zero - i; + break; + } + i = next_zero; + } + + if (found_bits) { + *offset = (u64)(i * block_group->sectorsize) + + bitmap_info->offset; + *bytes = (u64)(found_bits) * block_group->sectorsize; + return 0; + } + + return -1; +} + +static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache + *block_group, u64 *offset, + u64 *bytes, int debug) +{ + struct btrfs_free_space *entry; + struct rb_node *node; + int ret; + + if (!block_group->free_space_offset.rb_node) + return NULL; + + entry = tree_search_offset(block_group, + offset_to_bitmap(block_group, *offset), + 0, 1); + if (!entry) + return NULL; + + for (node = &entry->offset_index; node; node = rb_next(node)) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + if (entry->bytes < *bytes) + continue; + + if (entry->bitmap) { + ret = search_bitmap(block_group, entry, offset, bytes); + if (!ret) + return entry; + continue; + } + + *offset = entry->offset; + *bytes = entry->bytes; + return entry; + } + + return NULL; +} + +static void add_new_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, u64 offset) +{ + u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; + int max_bitmaps = (int)div64_u64(block_group->key.offset + + bytes_per_bg - 1, bytes_per_bg); + BUG_ON(block_group->total_bitmaps >= max_bitmaps); + + info->offset = offset_to_bitmap(block_group, offset); + info->bytes = 0; + link_free_space(block_group, info); + block_group->total_bitmaps++; + + recalculate_thresholds(block_group); +} + +static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *bitmap_info, + u64 *offset, u64 *bytes) +{ + u64 end; + u64 search_start, search_bytes; + int ret; + +again: + end = bitmap_info->offset + + (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1; + + /* + * XXX - this can go away after a few releases. + * + * since the only user of btrfs_remove_free_space is the tree logging + * stuff, and the only way to test that is under crash conditions, we + * want to have this debug stuff here just in case somethings not + * working. Search the bitmap for the space we are trying to use to + * make sure its actually there. If its not there then we need to stop + * because something has gone wrong. + */ + search_start = *offset; + search_bytes = *bytes; + ret = search_bitmap(block_group, bitmap_info, &search_start, + &search_bytes); + BUG_ON(ret < 0 || search_start != *offset); + + if (*offset > bitmap_info->offset && *offset + *bytes > end) { + bitmap_clear_bits(block_group, bitmap_info, *offset, + end - *offset + 1); + *bytes -= end - *offset + 1; + *offset = end + 1; + } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) { + bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes); + *bytes = 0; + } + + if (*bytes) { + struct rb_node *next = rb_next(&bitmap_info->offset_index); + if (!bitmap_info->bytes) { + unlink_free_space(block_group, bitmap_info); + kfree(bitmap_info->bitmap); + kfree(bitmap_info); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); + } + + /* + * no entry after this bitmap, but we still have bytes to + * remove, so something has gone wrong. + */ + if (!next) + return -EINVAL; + + bitmap_info = rb_entry(next, struct btrfs_free_space, + offset_index); + + /* + * if the next entry isn't a bitmap we need to return to let the + * extent stuff do its work. + */ + if (!bitmap_info->bitmap) + return -EAGAIN; + + /* + * Ok the next item is a bitmap, but it may not actually hold + * the information for the rest of this free space stuff, so + * look for it, and if we don't find it return so we can try + * everything over again. + */ + search_start = *offset; + search_bytes = *bytes; + ret = search_bitmap(block_group, bitmap_info, &search_start, + &search_bytes); + if (ret < 0 || search_start != *offset) + return -EAGAIN; + + goto again; + } else if (!bitmap_info->bytes) { + unlink_free_space(block_group, bitmap_info); + kfree(bitmap_info->bitmap); + kfree(bitmap_info); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); + } + + return 0; +} + +static int insert_into_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + struct btrfs_free_space *bitmap_info; + int added = 0; + u64 bytes, offset, end; + int ret; + + /* + * If we are below the extents threshold then we can add this as an + * extent, and don't have to deal with the bitmap + */ + if (block_group->free_extents < block_group->extents_thresh && + info->bytes > block_group->sectorsize * 4) + return 0; + + /* + * some block groups are so tiny they can't be enveloped by a bitmap, so + * don't even bother to create a bitmap for this + */ + if (BITS_PER_BITMAP * block_group->sectorsize > + block_group->key.offset) + return 0; + + bytes = info->bytes; + offset = info->offset; + +again: + bitmap_info = tree_search_offset(block_group, + offset_to_bitmap(block_group, offset), + 1, 0); + if (!bitmap_info) { + BUG_ON(added); + goto new_bitmap; + } + + end = bitmap_info->offset + + (u64)(BITS_PER_BITMAP * block_group->sectorsize); + + if (offset >= bitmap_info->offset && offset + bytes > end) { + bitmap_set_bits(block_group, bitmap_info, offset, + end - offset); + bytes -= end - offset; + offset = end; + added = 0; + } else if (offset >= bitmap_info->offset && offset + bytes <= end) { + bitmap_set_bits(block_group, bitmap_info, offset, bytes); + bytes = 0; + } else { + BUG(); + } + + if (!bytes) { + ret = 1; + goto out; + } else + goto again; + +new_bitmap: + if (info && info->bitmap) { + add_new_bitmap(block_group, info, offset); + added = 1; + info = NULL; + goto again; + } else { + spin_unlock(&block_group->tree_lock); + + /* no pre-allocated info, allocate a new one */ + if (!info) { + info = kzalloc(sizeof(struct btrfs_free_space), + GFP_NOFS); + if (!info) { + spin_lock(&block_group->tree_lock); + ret = -ENOMEM; + goto out; + } + } + + /* allocate the bitmap */ + info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + spin_lock(&block_group->tree_lock); + if (!info->bitmap) { + ret = -ENOMEM; + goto out; + } + goto again; + } + +out: + if (info) { + if (info->bitmap) + kfree(info->bitmap); + kfree(info); + } + + return ret; +} + +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *right_info = NULL; + struct btrfs_free_space *left_info = NULL; + struct btrfs_free_space *info = NULL; + int ret = 0; + + info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + if (!info) + return -ENOMEM; + + info->offset = offset; + info->bytes = bytes; + + spin_lock(&block_group->tree_lock); + + /* + * first we want to see if there is free space adjacent to the range we + * are adding, if there is remove that struct and add a new one to + * cover the entire range + */ + right_info = tree_search_offset(block_group, offset + bytes, 0, 0); + if (right_info && rb_prev(&right_info->offset_index)) + left_info = rb_entry(rb_prev(&right_info->offset_index), + struct btrfs_free_space, offset_index); + else + left_info = tree_search_offset(block_group, offset - 1, 0, 0); + + /* + * If there was no extent directly to the left or right of this new + * extent then we know we're going to have to allocate a new extent, so + * before we do that see if we need to drop this into a bitmap + */ + if ((!left_info || left_info->bitmap) && + (!right_info || right_info->bitmap)) { + ret = insert_into_bitmap(block_group, info); + + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + goto out; + } + } + + if (right_info && !right_info->bitmap) { + unlink_free_space(block_group, right_info); + info->bytes += right_info->bytes; + kfree(right_info); + } + + if (left_info && !left_info->bitmap && + left_info->offset + left_info->bytes == offset) { + unlink_free_space(block_group, left_info); + info->offset = left_info->offset; + info->bytes += left_info->bytes; + kfree(left_info); + } + + ret = link_free_space(block_group, info); + if (ret) + kfree(info); +out: + spin_unlock(&block_group->tree_lock); + + if (ret) { + printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); + BUG_ON(ret == -EEXIST); + } + + return ret; +} + +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *info; + struct btrfs_free_space *next_info = NULL; + int ret = 0; + + spin_lock(&block_group->tree_lock); + +again: + info = tree_search_offset(block_group, offset, 0, 0); + if (!info) { + /* + * oops didn't find an extent that matched the space we wanted + * to remove, look for a bitmap instead + */ + info = tree_search_offset(block_group, + offset_to_bitmap(block_group, offset), + 1, 0); + if (!info) { + WARN_ON(1); + goto out_lock; + } + } + + if (info->bytes < bytes && rb_next(&info->offset_index)) { + u64 end; + next_info = rb_entry(rb_next(&info->offset_index), + struct btrfs_free_space, + offset_index); + + if (next_info->bitmap) + end = next_info->offset + BITS_PER_BITMAP * + block_group->sectorsize - 1; + else + end = next_info->offset + next_info->bytes; + + if (next_info->bytes < bytes || + next_info->offset > offset || offset > end) { + printk(KERN_CRIT "Found free space at %llu, size %llu," + " trying to use %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)bytes); + WARN_ON(1); + ret = -EINVAL; + goto out_lock; + } + + info = next_info; + } + + if (info->bytes == bytes) { + unlink_free_space(block_group, info); + if (info->bitmap) { + kfree(info->bitmap); + block_group->total_bitmaps--; + } + kfree(info); + goto out_lock; + } + + if (!info->bitmap && info->offset == offset) { + unlink_free_space(block_group, info); + info->offset += bytes; + info->bytes -= bytes; + link_free_space(block_group, info); + goto out_lock; + } + + if (!info->bitmap && info->offset <= offset && + info->offset + info->bytes >= offset + bytes) { + u64 old_start = info->offset; + /* + * we're freeing space in the middle of the info, + * this can happen during tree log replay + * + * first unlink the old info and then + * insert it again after the hole we're creating + */ + unlink_free_space(block_group, info); + if (offset + bytes < info->offset + info->bytes) { + u64 old_end = info->offset + info->bytes; + + info->offset = offset + bytes; + info->bytes = old_end - info->offset; + ret = link_free_space(block_group, info); + WARN_ON(ret); + if (ret) + goto out_lock; + } else { + /* the hole we're creating ends at the end + * of the info struct, just free the info + */ + kfree(info); + } + spin_unlock(&block_group->tree_lock); + + /* step two, insert a new info struct to cover + * anything before the hole + */ + ret = btrfs_add_free_space(block_group, old_start, + offset - old_start); + WARN_ON(ret); + goto out; + } + + ret = remove_from_bitmap(block_group, info, &offset, &bytes); + if (ret == -EAGAIN) + goto again; + BUG_ON(ret); +out_lock: + spin_unlock(&block_group->tree_lock); +out: + return ret; +} + +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes) +{ + struct btrfs_free_space *info; + struct rb_node *n; + int count = 0; + + for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + if (info->bytes >= bytes) + count++; + printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (info->bitmap) ? "yes" : "no"); + } + printk(KERN_INFO "block group has cluster?: %s\n", + list_empty(&block_group->cluster_list) ? "no" : "yes"); + printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" + "\n", count); +} + +u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space *info; + struct rb_node *n; + u64 ret = 0; + + for (n = rb_first(&block_group->free_space_offset); n; + n = rb_next(n)) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + ret += info->bytes; + } + + return ret; +} + +/* + * for a given cluster, put all of its extents back into the free + * space cache. If the block group passed doesn't match the block group + * pointed to by the cluster, someone else raced in and freed the + * cluster already. In that case, we just return without changing anything + */ +static int +__btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) +{ + struct btrfs_free_space *entry; + struct rb_node *node; + bool bitmap; + + spin_lock(&cluster->lock); + if (cluster->block_group != block_group) + goto out; + + bitmap = cluster->points_to_bitmap; + cluster->block_group = NULL; + cluster->window_start = 0; + list_del_init(&cluster->block_group_list); + cluster->points_to_bitmap = false; + + if (bitmap) + goto out; + + node = rb_first(&cluster->root); + while (node) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + rb_erase(&entry->offset_index, &cluster->root); + BUG_ON(entry->bitmap); + tree_insert_offset(&block_group->free_space_offset, + entry->offset, &entry->offset_index, 0); + } + cluster->root.rb_node = NULL; + +out: + spin_unlock(&cluster->lock); + btrfs_put_block_group(block_group); + return 0; +} + +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space *info; + struct rb_node *node; + struct btrfs_free_cluster *cluster; + struct list_head *head; + + spin_lock(&block_group->tree_lock); + while ((head = block_group->cluster_list.next) != + &block_group->cluster_list) { + cluster = list_entry(head, struct btrfs_free_cluster, + block_group_list); + + WARN_ON(cluster->block_group != block_group); + __btrfs_return_cluster_to_free_space(block_group, cluster); + if (need_resched()) { + spin_unlock(&block_group->tree_lock); + cond_resched(); + spin_lock(&block_group->tree_lock); + } + } + + while ((node = rb_last(&block_group->free_space_offset)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, offset_index); + unlink_free_space(block_group, info); + if (info->bitmap) + kfree(info->bitmap); + kfree(info); + if (need_resched()) { + spin_unlock(&block_group->tree_lock); + cond_resched(); + spin_lock(&block_group->tree_lock); + } + } + + spin_unlock(&block_group->tree_lock); +} + +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size) +{ + struct btrfs_free_space *entry = NULL; + u64 bytes_search = bytes + empty_size; + u64 ret = 0; + + spin_lock(&block_group->tree_lock); + entry = find_free_space(block_group, &offset, &bytes_search, 0); + if (!entry) + goto out; + + ret = offset; + if (entry->bitmap) { + bitmap_clear_bits(block_group, entry, offset, bytes); + if (!entry->bytes) { + unlink_free_space(block_group, entry); + kfree(entry->bitmap); + kfree(entry); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); + } + } else { + unlink_free_space(block_group, entry); + entry->offset += bytes; + entry->bytes -= bytes; + if (!entry->bytes) + kfree(entry); + else + link_free_space(block_group, entry); + } + +out: + spin_unlock(&block_group->tree_lock); + + return ret; +} + +/* + * given a cluster, put all of its extents back into the free space + * cache. If a block group is passed, this function will only free + * a cluster that belongs to the passed block group. + * + * Otherwise, it'll get a reference on the block group pointed to by the + * cluster and remove the cluster from it. + */ +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) +{ + int ret; + + /* first, get a safe pointer to the block group */ + spin_lock(&cluster->lock); + if (!block_group) { + block_group = cluster->block_group; + if (!block_group) { + spin_unlock(&cluster->lock); + return 0; + } + } else if (cluster->block_group != block_group) { + /* someone else has already freed it don't redo their work */ + spin_unlock(&cluster->lock); + return 0; + } + atomic_inc(&block_group->count); + spin_unlock(&cluster->lock); + + /* now return any extents the cluster had on it */ + spin_lock(&block_group->tree_lock); + ret = __btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&block_group->tree_lock); + + /* finally drop our ref */ + btrfs_put_block_group(block_group); + return ret; +} + +static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 bytes, u64 min_start) +{ + struct btrfs_free_space *entry; + int err; + u64 search_start = cluster->window_start; + u64 search_bytes = bytes; + u64 ret = 0; + + spin_lock(&block_group->tree_lock); + spin_lock(&cluster->lock); + + if (!cluster->points_to_bitmap) + goto out; + + if (cluster->block_group != block_group) + goto out; + + /* + * search_start is the beginning of the bitmap, but at some point it may + * be a good idea to point to the actual start of the free area in the + * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only + * to 1 to make sure we get the bitmap entry + */ + entry = tree_search_offset(block_group, + offset_to_bitmap(block_group, search_start), + 1, 0); + if (!entry || !entry->bitmap) + goto out; + + search_start = min_start; + search_bytes = bytes; + + err = search_bitmap(block_group, entry, &search_start, + &search_bytes); + if (err) + goto out; + + ret = search_start; + bitmap_clear_bits(block_group, entry, ret, bytes); +out: + spin_unlock(&cluster->lock); + spin_unlock(&block_group->tree_lock); + + return ret; +} + +/* + * given a cluster, try to allocate 'bytes' from it, returns 0 + * if it couldn't find anything suitably large, or a logical disk offset + * if things worked out + */ +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start) +{ + struct btrfs_free_space *entry = NULL; + struct rb_node *node; + u64 ret = 0; + + if (cluster->points_to_bitmap) + return btrfs_alloc_from_bitmap(block_group, cluster, bytes, + min_start); + + spin_lock(&cluster->lock); + if (bytes > cluster->max_size) + goto out; + + if (cluster->block_group != block_group) + goto out; + + node = rb_first(&cluster->root); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + + while(1) { + if (entry->bytes < bytes || entry->offset < min_start) { + struct rb_node *node; + + node = rb_next(&entry->offset_index); + if (!node) + break; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } + ret = entry->offset; + + entry->offset += bytes; + entry->bytes -= bytes; + + if (entry->bytes == 0) { + rb_erase(&entry->offset_index, &cluster->root); + kfree(entry); + } + break; + } +out: + spin_unlock(&cluster->lock); + + return ret; +} + +static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *entry, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 min_bytes) +{ + unsigned long next_zero; + unsigned long i; + unsigned long search_bits; + unsigned long total_bits; + unsigned long found_bits; + unsigned long start = 0; + unsigned long total_found = 0; + bool found = false; + + i = offset_to_bit(entry->offset, block_group->sectorsize, + max_t(u64, offset, entry->offset)); + search_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + total_bits = bytes_to_bits(bytes, block_group->sectorsize); + +again: + found_bits = 0; + for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); + i < BITS_PER_BITMAP; + i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { + next_zero = find_next_zero_bit(entry->bitmap, + BITS_PER_BITMAP, i); + if (next_zero - i >= search_bits) { + found_bits = next_zero - i; + break; + } + i = next_zero; + } + + if (!found_bits) + return -1; + + if (!found) { + start = i; + found = true; + } + + total_found += found_bits; + + if (cluster->max_size < found_bits * block_group->sectorsize) + cluster->max_size = found_bits * block_group->sectorsize; + + if (total_found < total_bits) { + i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); + if (i - start > total_bits * 2) { + total_found = 0; + cluster->max_size = 0; + found = false; + } + goto again; + } + + cluster->window_start = start * block_group->sectorsize + + entry->offset; + cluster->points_to_bitmap = true; + + return 0; +} + +/* + * here we try to find a cluster of blocks in a block group. The goal + * is to find at least bytes free and up to empty_size + bytes free. + * We might not find them all in one contiguous area. + * + * returns zero and sets up cluster if things worked out, otherwise + * it returns -enospc + */ +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size) +{ + struct btrfs_free_space *entry = NULL; + struct rb_node *node; + struct btrfs_free_space *next; + struct btrfs_free_space *last = NULL; + u64 min_bytes; + u64 window_start; + u64 window_free; + u64 max_extent = 0; + bool found_bitmap = false; + int ret; + + /* for metadata, allow allocates with more holes */ + if (btrfs_test_opt(root, SSD_SPREAD)) { + min_bytes = bytes + empty_size; + } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { + /* + * we want to do larger allocations when we are + * flushing out the delayed refs, it helps prevent + * making more work as we go along. + */ + if (trans->transaction->delayed_refs.flushing) + min_bytes = max(bytes, (bytes + empty_size) >> 1); + else + min_bytes = max(bytes, (bytes + empty_size) >> 4); + } else + min_bytes = max(bytes, (bytes + empty_size) >> 2); + + spin_lock(&block_group->tree_lock); + spin_lock(&cluster->lock); + + /* someone already found a cluster, hooray */ + if (cluster->block_group) { + ret = 0; + goto out; + } +again: + entry = tree_search_offset(block_group, offset, found_bitmap, 1); + if (!entry) { + ret = -ENOSPC; + goto out; + } + + /* + * If found_bitmap is true, we exhausted our search for extent entries, + * and we just want to search all of the bitmaps that we can find, and + * ignore any extent entries we find. + */ + while (entry->bitmap || found_bitmap || + (!entry->bitmap && entry->bytes < min_bytes)) { + struct rb_node *node = rb_next(&entry->offset_index); + + if (entry->bitmap && entry->bytes > bytes + empty_size) { + ret = btrfs_bitmap_cluster(block_group, entry, cluster, + offset, bytes + empty_size, + min_bytes); + if (!ret) + goto got_it; + } + + if (!node) { + ret = -ENOSPC; + goto out; + } + entry = rb_entry(node, struct btrfs_free_space, offset_index); + } + + /* + * We already searched all the extent entries from the passed in offset + * to the end and didn't find enough space for the cluster, and we also + * didn't find any bitmaps that met our criteria, just go ahead and exit + */ + if (found_bitmap) { + ret = -ENOSPC; + goto out; + } + + cluster->points_to_bitmap = false; + window_start = entry->offset; + window_free = entry->bytes; + last = entry; + max_extent = entry->bytes; + + while (1) { + /* out window is just right, lets fill it */ + if (window_free >= bytes + empty_size) + break; + + node = rb_next(&last->offset_index); + if (!node) { + if (found_bitmap) + goto again; + ret = -ENOSPC; + goto out; + } + next = rb_entry(node, struct btrfs_free_space, offset_index); + + /* + * we found a bitmap, so if this search doesn't result in a + * cluster, we know to go and search again for the bitmaps and + * start looking for space there + */ + if (next->bitmap) { + if (!found_bitmap) + offset = next->offset; + found_bitmap = true; + last = next; + continue; + } + + /* + * we haven't filled the empty size and the window is + * very large. reset and try again + */ + if (next->offset - (last->offset + last->bytes) > 128 * 1024 || + next->offset - window_start > (bytes + empty_size) * 2) { + entry = next; + window_start = entry->offset; + window_free = entry->bytes; + last = entry; + max_extent = entry->bytes; + } else { + last = next; + window_free += next->bytes; + if (entry->bytes > max_extent) + max_extent = entry->bytes; + } + } + + cluster->window_start = entry->offset; + + /* + * now we've found our entries, pull them out of the free space + * cache and put them into the cluster rbtree + * + * The cluster includes an rbtree, but only uses the offset index + * of each free space cache entry. + */ + while (1) { + node = rb_next(&entry->offset_index); + if (entry->bitmap && node) { + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } else if (entry->bitmap && !node) { + break; + } + + rb_erase(&entry->offset_index, &block_group->free_space_offset); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index, 0); + BUG_ON(ret); + + if (!node || entry == last) + break; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + } + + cluster->max_size = max_extent; +got_it: + ret = 0; + atomic_inc(&block_group->count); + list_add_tail(&cluster->block_group_list, &block_group->cluster_list); + cluster->block_group = block_group; +out: + spin_unlock(&cluster->lock); + spin_unlock(&block_group->tree_lock); + + return ret; +} + +/* + * simple code to zero out a cluster + */ +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) +{ + spin_lock_init(&cluster->lock); + spin_lock_init(&cluster->refill_lock); + cluster->root.rb_node = NULL; + cluster->max_size = 0; + cluster->points_to_bitmap = false; + INIT_LIST_HEAD(&cluster->block_group_list); + cluster->block_group = NULL; +} + diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h new file mode 100644 index 00000000000..890a8e79011 --- /dev/null +++ b/fs/btrfs/free-space-cache.h @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_FREE_SPACE_CACHE +#define __BTRFS_FREE_SPACE_CACHE + +struct btrfs_free_space { + struct rb_node offset_index; + u64 offset; + u64 bytes; + unsigned long *bitmap; + struct list_head list; +}; + +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache + *block_group); +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size); +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes); +u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size); +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start); +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster); +#endif diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h new file mode 100644 index 00000000000..db2ff9773b9 --- /dev/null +++ b/fs/btrfs/hash.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __HASH__ +#define __HASH__ + +#include <linux/crc32c.h> +static inline u64 btrfs_name_hash(const char *name, int len) +{ + return crc32c((u32)~1, name, len); +} +#endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c new file mode 100644 index 00000000000..72ce3c173d6 --- /dev/null +++ b/fs/btrfs/inode-item.c @@ -0,0 +1,209 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +static int find_name_in_backref(struct btrfs_path *path, const char *name, + int name_len, struct btrfs_inode_ref **ref_ret) +{ + struct extent_buffer *leaf; + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long name_ptr; + u32 item_size; + u32 cur_offset = 0; + int len; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + while (cur_offset < item_size) { + ref = (struct btrfs_inode_ref *)(ptr + cur_offset); + len = btrfs_inode_ref_name_len(leaf, ref); + name_ptr = (unsigned long)(ref + 1); + cur_offset += len + sizeof(*ref); + if (len != name_len) + continue; + if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) { + *ref_ret = ref; + return 1; + } + } + return 0; +} + +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + unsigned long item_start; + u32 item_size; + u32 sub_item_len; + int ret; + int del_len = name_len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = -ENOENT; + goto out; + } else if (ret < 0) { + goto out; + } + if (!find_name_in_backref(path, name, name_len, &ref)) { + ret = -ENOENT; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + + if (index) + *index = btrfs_inode_ref_index(leaf, ref); + + if (del_len == item_size) { + ret = btrfs_del_item(trans, root, path); + goto out; + } + ptr = (unsigned long)ref; + sub_item_len = name_len + sizeof(*ref); + item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_size - (ptr + sub_item_len - item_start)); + ret = btrfs_truncate_item(trans, root, path, + item_size - sub_item_len, 1); + BUG_ON(ret); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + unsigned long ptr; + int ret; + int ins_len = name_len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &key, + ins_len); + if (ret == -EEXIST) { + u32 old_size; + + if (find_name_in_backref(path, name, name_len, &ref)) + goto out; + + old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + ret = btrfs_extend_item(trans, root, path, ins_len); + BUG_ON(ret); + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + ret = 0; + } else if (ret < 0) { + if (ret == -EOVERFLOW) + ret = -EMLINK; + goto out; + } else { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + } + write_extent_buffer(path->nodes[0], name, ptr, name_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid) +{ + struct btrfs_key key; + int ret; + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_inode_item)); + return ret; +} + +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod) +{ + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key found_key; + + ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); + if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY && + location->offset == (u64)-1 && path->slots[0] != 0) { + slot = path->slots[0] - 1; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid == location->objectid && + btrfs_key_type(&found_key) == btrfs_key_type(location)) { + path->slots[0]--; + return 0; + } + } + return ret; +} diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c new file mode 100644 index 00000000000..c56eb590917 --- /dev/null +++ b/fs/btrfs/inode-map.c @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) +{ + struct btrfs_path *path; + int ret; + struct extent_buffer *l; + struct btrfs_key search_key; + struct btrfs_key found_key; + int slot; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + search_key.objectid = BTRFS_LAST_FREE_OBJECTID; + search_key.type = -1; + search_key.offset = (u64)-1; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); + if (path->slots[0] > 0) { + slot = path->slots[0] - 1; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + *objectid = max_t(u64, found_key.objectid, + BTRFS_FIRST_FREE_OBJECTID - 1); + } else { + *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 dirid, u64 *objectid) +{ + int ret; + mutex_lock(&root->objectid_mutex); + + if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { + ret = btrfs_find_highest_inode(root, &root->highest_objectid); + if (ret) + goto out; + } + + if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { + ret = -ENOSPC; + goto out; + } + + *objectid = ++root->highest_objectid; + ret = 0; +out: + mutex_unlock(&root->objectid_mutex); + return ret; +} diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c new file mode 100644 index 00000000000..4deb280f896 --- /dev/null +++ b/fs/btrfs/inode.c @@ -0,0 +1,6056 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/backing-dev.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/bit_spinlock.h> +#include <linux/xattr.h> +#include <linux/posix_acl.h> +#include <linux/falloc.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "volumes.h" +#include "ordered-data.h" +#include "xattr.h" +#include "tree-log.h" +#include "compression.h" +#include "locking.h" + +struct btrfs_iget_args { + u64 ino; + struct btrfs_root *root; +}; + +static const struct inode_operations btrfs_dir_inode_operations; +static const struct inode_operations btrfs_symlink_inode_operations; +static const struct inode_operations btrfs_dir_ro_inode_operations; +static const struct inode_operations btrfs_special_inode_operations; +static const struct inode_operations btrfs_file_inode_operations; +static const struct address_space_operations btrfs_aops; +static const struct address_space_operations btrfs_symlink_aops; +static const struct file_operations btrfs_dir_file_operations; +static struct extent_io_ops btrfs_extent_io_ops; + +static struct kmem_cache *btrfs_inode_cachep; +struct kmem_cache *btrfs_trans_handle_cachep; +struct kmem_cache *btrfs_transaction_cachep; +struct kmem_cache *btrfs_path_cachep; + +#define S_SHIFT 12 +static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, +}; + +static void btrfs_truncate(struct inode *inode); +static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock); + +static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) +{ + int err; + + err = btrfs_init_acl(trans, inode, dir); + if (!err) + err = btrfs_xattr_security_init(trans, inode, dir); + return err; +} + +/* + * this does all the hard work for inserting an inline extent into + * the btree. The caller should have done a btrfs_drop_extents so that + * no overlapping inline items exist in the btree + */ +static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, size_t size, size_t compressed_size, + struct page **compressed_pages) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct page *page = NULL; + char *kaddr; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + int err = 0; + int ret; + size_t cur_size = size; + size_t datasize; + unsigned long offset; + int use_compress = 0; + + if (compressed_size && compressed_pages) { + use_compress = 1; + cur_size = compressed_size; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + btrfs_set_trans_block_group(trans, inode); + + key.objectid = inode->i_ino; + key.offset = start; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + datasize = btrfs_file_extent_calc_inline_size(cur_size); + + inode_add_bytes(inode, size); + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + BUG_ON(ret); + if (ret) { + err = ret; + goto fail; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, size); + ptr = btrfs_file_extent_inline_start(ei); + + if (use_compress) { + struct page *cpage; + int i = 0; + while (compressed_size > 0) { + cpage = compressed_pages[i]; + cur_size = min_t(unsigned long, compressed_size, + PAGE_CACHE_SIZE); + + kaddr = kmap_atomic(cpage, KM_USER0); + write_extent_buffer(leaf, kaddr, ptr, cur_size); + kunmap_atomic(kaddr, KM_USER0); + + i++; + ptr += cur_size; + compressed_size -= cur_size; + } + btrfs_set_file_extent_compression(leaf, ei, + BTRFS_COMPRESS_ZLIB); + } else { + page = find_get_page(inode->i_mapping, + start >> PAGE_CACHE_SHIFT); + btrfs_set_file_extent_compression(leaf, ei, 0); + kaddr = kmap_atomic(page, KM_USER0); + offset = start & (PAGE_CACHE_SIZE - 1); + write_extent_buffer(leaf, kaddr + offset, ptr, size); + kunmap_atomic(kaddr, KM_USER0); + page_cache_release(page); + } + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + /* + * we're an inline extent, so nobody can + * extend the file past i_size without locking + * a page we already have locked. + * + * We must do any isize and inode updates + * before we unlock the pages. Otherwise we + * could end up racing with unlink. + */ + BTRFS_I(inode)->disk_i_size = inode->i_size; + btrfs_update_inode(trans, root, inode); + + return 0; +fail: + btrfs_free_path(path); + return err; +} + + +/* + * conditionally insert an inline extent into the file. This + * does the checks required to make sure the data is small enough + * to fit as an inline extent. + */ +static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end, + size_t compressed_size, + struct page **compressed_pages) +{ + u64 isize = i_size_read(inode); + u64 actual_end = min(end + 1, isize); + u64 inline_len = actual_end - start; + u64 aligned_end = (end + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + u64 hint_byte; + u64 data_len = inline_len; + int ret; + + if (compressed_size) + data_len = compressed_size; + + if (start > 0 || + actual_end >= PAGE_CACHE_SIZE || + data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || + (!compressed_size && + (actual_end & (root->sectorsize - 1)) == 0) || + end + 1 < isize || + data_len > root->fs_info->max_inline) { + return 1; + } + + ret = btrfs_drop_extents(trans, inode, start, aligned_end, + &hint_byte, 1); + BUG_ON(ret); + + if (isize > actual_end) + inline_len = min_t(u64, isize, actual_end); + ret = insert_inline_extent(trans, root, inode, start, + inline_len, compressed_size, + compressed_pages); + BUG_ON(ret); + btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); + return 0; +} + +struct async_extent { + u64 start; + u64 ram_size; + u64 compressed_size; + struct page **pages; + unsigned long nr_pages; + struct list_head list; +}; + +struct async_cow { + struct inode *inode; + struct btrfs_root *root; + struct page *locked_page; + u64 start; + u64 end; + struct list_head extents; + struct btrfs_work work; +}; + +static noinline int add_async_extent(struct async_cow *cow, + u64 start, u64 ram_size, + u64 compressed_size, + struct page **pages, + unsigned long nr_pages) +{ + struct async_extent *async_extent; + + async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); + async_extent->start = start; + async_extent->ram_size = ram_size; + async_extent->compressed_size = compressed_size; + async_extent->pages = pages; + async_extent->nr_pages = nr_pages; + list_add_tail(&async_extent->list, &cow->extents); + return 0; +} + +/* + * we create compressed extents in two phases. The first + * phase compresses a range of pages that have already been + * locked (both pages and state bits are locked). + * + * This is done inside an ordered work queue, and the compression + * is spread across many cpus. The actual IO submission is step + * two, and the ordered work queue takes care of making sure that + * happens in the same order things were put onto the queue by + * writepages and friends. + * + * If this code finds it can't get good compression, it puts an + * entry onto the work queue to write the uncompressed bytes. This + * makes sure that both compressed inodes and uncompressed inodes + * are written in the same order that pdflush sent them down. + */ +static noinline int compress_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, + struct async_cow *async_cow, + int *num_added) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 num_bytes; + u64 orig_start; + u64 disk_num_bytes; + u64 blocksize = root->sectorsize; + u64 actual_end; + u64 isize = i_size_read(inode); + int ret = 0; + struct page **pages = NULL; + unsigned long nr_pages; + unsigned long nr_pages_ret = 0; + unsigned long total_compressed = 0; + unsigned long total_in = 0; + unsigned long max_compressed = 128 * 1024; + unsigned long max_uncompressed = 128 * 1024; + int i; + int will_compress; + + orig_start = start; + + actual_end = min_t(u64, isize, end + 1); +again: + will_compress = 0; + nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; + nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + + /* + * we don't want to send crud past the end of i_size through + * compression, that's just a waste of CPU time. So, if the + * end of the file is before the start of our current + * requested range of bytes, we bail out to the uncompressed + * cleanup code that can deal with all of this. + * + * It isn't really the fastest way to fix things, but this is a + * very uncommon corner. + */ + if (actual_end <= start) + goto cleanup_and_bail_uncompressed; + + total_compressed = actual_end - start; + + /* we want to make sure that amount of ram required to uncompress + * an extent is reasonable, so we limit the total size in ram + * of a compressed extent to 128k. This is a crucial number + * because it also controls how easily we can spread reads across + * cpus for decompression. + * + * We also want to make sure the amount of IO required to do + * a random read is reasonably small, so we limit the size of + * a compressed extent to 128k. + */ + total_compressed = min(total_compressed, max_uncompressed); + num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = max(blocksize, num_bytes); + disk_num_bytes = num_bytes; + total_in = 0; + ret = 0; + + /* + * we do compression for mount -o compress and when the + * inode has not been flagged as nocompress. This flag can + * change at any time if we discover bad compression ratios. + */ + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && + btrfs_test_opt(root, COMPRESS)) { + WARN_ON(pages); + pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + + ret = btrfs_zlib_compress_pages(inode->i_mapping, start, + total_compressed, pages, + nr_pages, &nr_pages_ret, + &total_in, + &total_compressed, + max_compressed); + + if (!ret) { + unsigned long offset = total_compressed & + (PAGE_CACHE_SIZE - 1); + struct page *page = pages[nr_pages_ret - 1]; + char *kaddr; + + /* zero the tail end of the last page, we might be + * sending it down to disk + */ + if (offset) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, + PAGE_CACHE_SIZE - offset); + kunmap_atomic(kaddr, KM_USER0); + } + will_compress = 1; + } + } + if (start == 0) { + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + btrfs_set_trans_block_group(trans, inode); + + /* lets try to make an inline extent */ + if (ret || total_in < (actual_end - start)) { + /* we didn't compress the entire range, try + * to make an uncompressed inline extent. + */ + ret = cow_file_range_inline(trans, root, inode, + start, end, 0, NULL); + } else { + /* try making a compressed inline extent */ + ret = cow_file_range_inline(trans, root, inode, + start, end, + total_compressed, pages); + } + if (ret == 0) { + /* + * inline extent creation worked, we don't need + * to create any more async work items. Unlock + * and free up our temp pages. + */ + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_ACCOUNTING | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); + + btrfs_end_transaction(trans, root); + goto free_pages_out; + } + btrfs_end_transaction(trans, root); + } + + if (will_compress) { + /* + * we aren't doing an inline extent round the compressed size + * up to a block size boundary so the allocator does sane + * things + */ + total_compressed = (total_compressed + blocksize - 1) & + ~(blocksize - 1); + + /* + * one last check to make sure the compression is really a + * win, compare the page count read with the blocks on disk + */ + total_in = (total_in + PAGE_CACHE_SIZE - 1) & + ~(PAGE_CACHE_SIZE - 1); + if (total_compressed >= total_in) { + will_compress = 0; + } else { + disk_num_bytes = total_compressed; + num_bytes = total_in; + } + } + if (!will_compress && pages) { + /* + * the compression code ran but failed to make things smaller, + * free any pages it allocated and our page pointer array + */ + for (i = 0; i < nr_pages_ret; i++) { + WARN_ON(pages[i]->mapping); + page_cache_release(pages[i]); + } + kfree(pages); + pages = NULL; + total_compressed = 0; + nr_pages_ret = 0; + + /* flag the file so we don't compress in the future */ + if (!btrfs_test_opt(root, FORCE_COMPRESS)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + } + if (will_compress) { + *num_added += 1; + + /* the async work queues will take care of doing actual + * allocation on disk for these compressed pages, + * and will submit them to the elevator. + */ + add_async_extent(async_cow, start, num_bytes, + total_compressed, pages, nr_pages_ret); + + if (start + num_bytes < end && start + num_bytes < actual_end) { + start += num_bytes; + pages = NULL; + cond_resched(); + goto again; + } + } else { +cleanup_and_bail_uncompressed: + /* + * No compression, but we still need to write the pages in + * the file we've been given so far. redirty the locked + * page if it corresponds to our extent and set things up + * for the async work queue to run cow_file_range to do + * the normal delalloc dance + */ + if (page_offset(locked_page) >= start && + page_offset(locked_page) <= end) { + __set_page_dirty_nobuffers(locked_page); + /* unlocked later on in the async handlers */ + } + add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); + *num_added += 1; + } + +out: + return 0; + +free_pages_out: + for (i = 0; i < nr_pages_ret; i++) { + WARN_ON(pages[i]->mapping); + page_cache_release(pages[i]); + } + kfree(pages); + + goto out; +} + +/* + * phase two of compressed writeback. This is the ordered portion + * of the code, which only gets called in the order the work was + * queued. We walk all the async extents created by compress_file_range + * and send them down to the disk. + */ +static noinline int submit_compressed_extents(struct inode *inode, + struct async_cow *async_cow) +{ + struct async_extent *async_extent; + u64 alloc_hint = 0; + struct btrfs_trans_handle *trans; + struct btrfs_key ins; + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree; + int ret = 0; + + if (list_empty(&async_cow->extents)) + return 0; + + + while (!list_empty(&async_cow->extents)) { + async_extent = list_entry(async_cow->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + + io_tree = &BTRFS_I(inode)->io_tree; + +retry: + /* did the compression code fall back to uncompressed IO? */ + if (!async_extent->pages) { + int page_started = 0; + unsigned long nr_written = 0; + + lock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, GFP_NOFS); + + /* allocate blocks */ + ret = cow_file_range(inode, async_cow->locked_page, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + &page_started, &nr_written, 0); + + /* + * if page_started, cow_file_range inserted an + * inline extent and took care of all the unlocking + * and IO for us. Otherwise, we need to submit + * all those pages down to the drive. + */ + if (!page_started && !ret) + extent_write_locked_range(io_tree, + inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + btrfs_get_extent, + WB_SYNC_ALL); + kfree(async_extent); + cond_resched(); + continue; + } + + lock_extent(io_tree, async_extent->start, + async_extent->start + async_extent->ram_size - 1, + GFP_NOFS); + + trans = btrfs_join_transaction(root, 1); + ret = btrfs_reserve_extent(trans, root, + async_extent->compressed_size, + async_extent->compressed_size, + 0, alloc_hint, + (u64)-1, &ins, 1); + btrfs_end_transaction(trans, root); + + if (ret) { + int i; + for (i = 0; i < async_extent->nr_pages; i++) { + WARN_ON(async_extent->pages[i]->mapping); + page_cache_release(async_extent->pages[i]); + } + kfree(async_extent->pages); + async_extent->nr_pages = 0; + async_extent->pages = NULL; + unlock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, GFP_NOFS); + goto retry; + } + + /* + * here we're doing allocation and writeback of the + * compressed pages + */ + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + + em = alloc_extent_map(GFP_NOFS); + em->start = async_extent->start; + em->len = async_extent->ram_size; + em->orig_start = em->start; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + } + + ret = btrfs_add_ordered_extent(inode, async_extent->start, + ins.objectid, + async_extent->ram_size, + ins.offset, + BTRFS_ORDERED_COMPRESSED); + BUG_ON(ret); + + /* + * clear dirty, set writeback and unlock the pages. + */ + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); + + ret = btrfs_submit_compressed_write(inode, + async_extent->start, + async_extent->ram_size, + ins.objectid, + ins.offset, async_extent->pages, + async_extent->nr_pages); + + BUG_ON(ret); + alloc_hint = ins.objectid + ins.offset; + kfree(async_extent); + cond_resched(); + } + + return 0; +} + +/* + * when extent_io.c finds a delayed allocation range in the file, + * the call backs end up in this code. The basic idea is to + * allocate extents on disk for the range, and create ordered data structs + * in ram to track those extents. + * + * locked_page is the page that writepage had locked already. We use + * it to make sure we don't do extra locks or unlocks. + * + * *page_started is set to one if we unlock locked_page and do everything + * required to start IO on it. It may be clean and already done with + * IO when we return. + */ +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, + int unlock) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 alloc_hint = 0; + u64 num_bytes; + unsigned long ram_size; + u64 disk_num_bytes; + u64 cur_alloc_size; + u64 blocksize = root->sectorsize; + u64 actual_end; + u64 isize = i_size_read(inode); + struct btrfs_key ins; + struct extent_map *em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + int ret = 0; + + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + btrfs_set_trans_block_group(trans, inode); + + actual_end = min_t(u64, isize, end + 1); + + num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = max(blocksize, num_bytes); + disk_num_bytes = num_bytes; + ret = 0; + + if (start == 0) { + /* lets try to make an inline extent */ + ret = cow_file_range_inline(trans, root, inode, + start, end, 0, NULL); + if (ret == 0) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_ACCOUNTING | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + + *nr_written = *nr_written + + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; + *page_started = 1; + ret = 0; + goto out; + } + } + + BUG_ON(disk_num_bytes > + btrfs_super_total_bytes(&root->fs_info->super_copy)); + + + read_lock(&BTRFS_I(inode)->extent_tree.lock); + em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, + start, num_bytes); + if (em) { + /* + * if block start isn't an actual block number then find the + * first block in this inode and use that as a hint. If that + * block is also bogus then just don't worry about it. + */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + em = search_extent_mapping(em_tree, 0, 0); + if (em && em->block_start < EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + if (em) + free_extent_map(em); + } else { + alloc_hint = em->block_start; + free_extent_map(em); + } + } + read_unlock(&BTRFS_I(inode)->extent_tree.lock); + btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); + + while (disk_num_bytes > 0) { + unsigned long op; + + cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); + ret = btrfs_reserve_extent(trans, root, cur_alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); + BUG_ON(ret); + + em = alloc_extent_map(GFP_NOFS); + em->start = start; + em->orig_start = em->start; + ram_size = ins.offset; + em->len = ins.offset; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, start, + start + ram_size - 1, 0); + } + + cur_alloc_size = ins.offset; + ret = btrfs_add_ordered_extent(inode, start, ins.objectid, + ram_size, cur_alloc_size, 0); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_reloc_clone_csums(inode, start, + cur_alloc_size); + BUG_ON(ret); + } + + if (disk_num_bytes < cur_alloc_size) + break; + + /* we're not doing compressed IO, don't unlock the first + * page (which the caller expects to stay locked), don't + * clear any dirty bits and don't set any writeback bits + * + * Do set the Private2 bit so we know this page was properly + * setup for writepage + */ + op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0; + op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_SET_PRIVATE2; + + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, start + ram_size - 1, + locked_page, op); + disk_num_bytes -= cur_alloc_size; + num_bytes -= cur_alloc_size; + alloc_hint = ins.objectid + ins.offset; + start += cur_alloc_size; + } +out: + ret = 0; + btrfs_end_transaction(trans, root); + + return ret; +} + +/* + * work queue call back to started compression on a file and pages + */ +static noinline void async_cow_start(struct btrfs_work *work) +{ + struct async_cow *async_cow; + int num_added = 0; + async_cow = container_of(work, struct async_cow, work); + + compress_file_range(async_cow->inode, async_cow->locked_page, + async_cow->start, async_cow->end, async_cow, + &num_added); + if (num_added == 0) + async_cow->inode = NULL; +} + +/* + * work queue call back to submit previously compressed pages + */ +static noinline void async_cow_submit(struct btrfs_work *work) +{ + struct async_cow *async_cow; + struct btrfs_root *root; + unsigned long nr_pages; + + async_cow = container_of(work, struct async_cow, work); + + root = async_cow->root; + nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + + atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); + + if (atomic_read(&root->fs_info->async_delalloc_pages) < + 5 * 1042 * 1024 && + waitqueue_active(&root->fs_info->async_submit_wait)) + wake_up(&root->fs_info->async_submit_wait); + + if (async_cow->inode) + submit_compressed_extents(async_cow->inode, async_cow); +} + +static noinline void async_cow_free(struct btrfs_work *work) +{ + struct async_cow *async_cow; + async_cow = container_of(work, struct async_cow, work); + kfree(async_cow); +} + +static int cow_file_range_async(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) +{ + struct async_cow *async_cow; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long nr_pages; + u64 cur_end; + int limit = 10 * 1024 * 1042; + + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, + 1, 0, NULL, GFP_NOFS); + while (start < end) { + async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); + async_cow->inode = inode; + async_cow->root = root; + async_cow->locked_page = locked_page; + async_cow->start = start; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) + cur_end = end; + else + cur_end = min(end, start + 512 * 1024 - 1); + + async_cow->end = cur_end; + INIT_LIST_HEAD(&async_cow->extents); + + async_cow->work.func = async_cow_start; + async_cow->work.ordered_func = async_cow_submit; + async_cow->work.ordered_free = async_cow_free; + async_cow->work.flags = 0; + + nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); + + btrfs_queue_worker(&root->fs_info->delalloc_workers, + &async_cow->work); + + if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->async_delalloc_pages) < + limit)); + } + + while (atomic_read(&root->fs_info->async_submit_draining) && + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->async_delalloc_pages) == + 0)); + } + + *nr_written += nr_pages; + start = cur_end + 1; + } + *page_started = 1; + return 0; +} + +static noinline int csum_exist_in_range(struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + int ret; + struct btrfs_ordered_sum *sums; + LIST_HEAD(list); + + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, + bytenr + num_bytes - 1, &list); + if (ret == 0 && list_empty(&list)) + return 0; + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del(&sums->list); + kfree(sums); + } + return 1; +} + +/* + * when nowcow writeback call back. This checks for snapshots or COW copies + * of the extents that exist in the file, and COWs the file as required. + * + * If no cow copies or snapshots exist, we write directly to the existing + * blocks on disk + */ +static noinline int run_delalloc_nocow(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, int force, + unsigned long *nr_written) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_key found_key; + u64 cow_start; + u64 cur_offset; + u64 extent_end; + u64 extent_offset; + u64 disk_bytenr; + u64 num_bytes; + int extent_type; + int ret; + int type; + int nocow; + int check_prev = 1; + + path = btrfs_alloc_path(); + BUG_ON(!path); + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + + cow_start = (u64)-1; + cur_offset = start; + while (1) { + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + cur_offset, 0); + BUG_ON(ret < 0); + if (ret > 0 && path->slots[0] > 0 && check_prev) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, + path->slots[0] - 1); + if (found_key.objectid == inode->i_ino && + found_key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + check_prev = 0; +next_slot: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + BUG_ON(1); + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + nocow = 0; + disk_bytenr = 0; + num_bytes = 0; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid > inode->i_ino || + found_key.type > BTRFS_EXTENT_DATA_KEY || + found_key.offset > end) + break; + + if (found_key.offset > cur_offset) { + extent_end = found_key.offset; + extent_type = 0; + goto out_check; + } + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); + extent_end = found_key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } + if (disk_bytenr == 0) + goto out_check; + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out_check; + if (extent_type == BTRFS_FILE_EXTENT_REG && !force) + goto out_check; + if (btrfs_extent_readonly(root, disk_bytenr)) + goto out_check; + if (btrfs_cross_ref_exist(trans, root, inode->i_ino, + found_key.offset - + extent_offset, disk_bytenr)) + goto out_check; + disk_bytenr += extent_offset; + disk_bytenr += cur_offset - found_key.offset; + num_bytes = min(end + 1, extent_end) - cur_offset; + /* + * force cow if csum exists in the range. + * this ensure that csum for a given extent are + * either valid or do not exist. + */ + if (csum_exist_in_range(root, disk_bytenr, num_bytes)) + goto out_check; + nocow = 1; + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = found_key.offset + + btrfs_file_extent_inline_len(leaf, fi); + extent_end = ALIGN(extent_end, root->sectorsize); + } else { + BUG_ON(1); + } +out_check: + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } + if (!nocow) { + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = extent_end; + if (cur_offset > end) + break; + path->slots[0]++; + goto next_slot; + } + + btrfs_release_path(root, path); + if (cow_start != (u64)-1) { + ret = cow_file_range(inode, locked_page, cow_start, + found_key.offset - 1, page_started, + nr_written, 1); + BUG_ON(ret); + cow_start = (u64)-1; + } + + if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + struct extent_map *em; + struct extent_map_tree *em_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + em = alloc_extent_map(GFP_NOFS); + em->start = cur_offset; + em->orig_start = em->start; + em->len = num_bytes; + em->block_len = num_bytes; + em->block_start = disk_bytenr; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); + } + type = BTRFS_ORDERED_PREALLOC; + } else { + type = BTRFS_ORDERED_NOCOW; + } + + ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, + num_bytes, num_bytes, type); + BUG_ON(ret); + + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + cur_offset, cur_offset + num_bytes - 1, + locked_page, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_SET_PRIVATE2); + cur_offset = extent_end; + if (cur_offset > end) + break; + } + btrfs_release_path(root, path); + + if (cur_offset <= end && cow_start == (u64)-1) + cow_start = cur_offset; + if (cow_start != (u64)-1) { + ret = cow_file_range(inode, locked_page, cow_start, end, + page_started, nr_written, 1); + BUG_ON(ret); + } + + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + btrfs_free_path(path); + return 0; +} + +/* + * extent_io.c call back to do delayed allocation processing + */ +static int run_delalloc_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started, 1, nr_written); + else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started, 0, nr_written); + else if (!btrfs_test_opt(root, COMPRESS)) + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); + else + ret = cow_file_range_async(inode, locked_page, start, end, + page_started, nr_written); + return ret; +} + +static int btrfs_split_extent_hook(struct inode *inode, + struct extent_state *orig, u64 split) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 size; + + if (!(orig->state & EXTENT_DELALLOC)) + return 0; + + size = orig->end - orig->start + 1; + if (size > root->fs_info->max_extent) { + u64 num_extents; + u64 new_size; + + new_size = orig->end - split + 1; + num_extents = div64_u64(size + root->fs_info->max_extent - 1, + root->fs_info->max_extent); + + /* + * if we break a large extent up then leave oustanding_extents + * be, since we've already accounted for the large extent. + */ + if (div64_u64(new_size + root->fs_info->max_extent - 1, + root->fs_info->max_extent) < num_extents) + return 0; + } + + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + return 0; +} + +/* + * extent_io.c merge_extent_hook, used to track merged delayed allocation + * extents so we can keep track of new extents that are just merged onto old + * extents, such as when we are doing sequential writes, so we can properly + * account for the metadata space we'll need. + */ +static int btrfs_merge_extent_hook(struct inode *inode, + struct extent_state *new, + struct extent_state *other) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 new_size, old_size; + u64 num_extents; + + /* not delalloc, ignore it */ + if (!(other->state & EXTENT_DELALLOC)) + return 0; + + old_size = other->end - other->start + 1; + if (new->start < other->start) + new_size = other->end - new->start + 1; + else + new_size = new->end - other->start + 1; + + /* we're not bigger than the max, unreserve the space and go */ + if (new_size <= root->fs_info->max_extent) { + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + return 0; + } + + /* + * If we grew by another max_extent, just return, we want to keep that + * reserved amount. + */ + num_extents = div64_u64(old_size + root->fs_info->max_extent - 1, + root->fs_info->max_extent); + if (div64_u64(new_size + root->fs_info->max_extent - 1, + root->fs_info->max_extent) > num_extents) + return 0; + + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + return 0; +} + +/* + * extent_io.c set_bit_hook, used to track delayed allocation + * bytes in this file, and to maintain the list of inodes that + * have pending delalloc work to be done. + */ +static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits) +{ + + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + btrfs_delalloc_reserve_space(root, inode, end - start + 1); + spin_lock(&root->fs_info->delalloc_lock); + BTRFS_I(inode)->delalloc_bytes += end - start + 1; + root->fs_info->delalloc_bytes += end - start + 1; + if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_add_tail(&BTRFS_I(inode)->delalloc_inodes, + &root->fs_info->delalloc_inodes); + } + spin_unlock(&root->fs_info->delalloc_lock); + } + return 0; +} + +/* + * extent_io.c clear_bit_hook, see set_bit_hook for why + */ +static int btrfs_clear_bit_hook(struct inode *inode, + struct extent_state *state, unsigned long bits) +{ + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (bits & EXTENT_DO_ACCOUNTING) { + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + } + + spin_lock(&root->fs_info->delalloc_lock); + if (state->end - state->start + 1 > + root->fs_info->delalloc_bytes) { + printk(KERN_INFO "btrfs warning: delalloc account " + "%llu %llu\n", + (unsigned long long) + state->end - state->start + 1, + (unsigned long long) + root->fs_info->delalloc_bytes); + btrfs_delalloc_free_space(root, inode, (u64)-1); + root->fs_info->delalloc_bytes = 0; + BTRFS_I(inode)->delalloc_bytes = 0; + } else { + btrfs_delalloc_free_space(root, inode, + state->end - + state->start + 1); + root->fs_info->delalloc_bytes -= state->end - + state->start + 1; + BTRFS_I(inode)->delalloc_bytes -= state->end - + state->start + 1; + } + if (BTRFS_I(inode)->delalloc_bytes == 0 && + !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_del_init(&BTRFS_I(inode)->delalloc_inodes); + } + spin_unlock(&root->fs_info->delalloc_lock); + } + return 0; +} + +/* + * extent_io.c merge_bio_hook, this must check the chunk tree to make sure + * we don't create bios that span stripes or chunks + */ +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct btrfs_mapping_tree *map_tree; + u64 logical = (u64)bio->bi_sector << 9; + u64 length = 0; + u64 map_length; + int ret; + + if (bio_flags & EXTENT_BIO_COMPRESSED) + return 0; + + length = bio->bi_size; + map_tree = &root->fs_info->mapping_tree; + map_length = length; + ret = btrfs_map_block(map_tree, READ, logical, + &map_length, NULL, 0); + + if (map_length < length + size) + return 1; + return 0; +} + +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. + * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ +static int __btrfs_submit_bio_start(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); + BUG_ON(ret); + return 0; +} + +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. + * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ +static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + return btrfs_map_bio(root, rw, bio, mirror_num, 1); +} + +/* + * extent_io.c submission hook. This does the right thing for csum calculation + * on write, or reading the csums from the tree before a read + */ +static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + int skip_sum; + + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + if (!(rw & (1 << BIO_RW))) { + if (bio_flags & EXTENT_BIO_COMPRESSED) { + return btrfs_submit_compressed_read(inode, bio, + mirror_num, bio_flags); + } else if (!skip_sum) + btrfs_lookup_bio_sums(root, inode, bio, NULL); + goto mapit; + } else if (!skip_sum) { + /* csum items have already been cloned */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + goto mapit; + /* we're doing a write, do the async checksumming */ + return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, + bio_flags, __btrfs_submit_bio_start, + __btrfs_submit_bio_done); + } + +mapit: + return btrfs_map_bio(root, rw, bio, mirror_num, 0); +} + +/* + * given a list of ordered sums record them in the inode. This happens + * at IO completion time based on sums calculated at bio submission time. + */ +static noinline int add_pending_csums(struct btrfs_trans_handle *trans, + struct inode *inode, u64 file_offset, + struct list_head *list) +{ + struct btrfs_ordered_sum *sum; + + btrfs_set_trans_block_group(trans, inode); + + list_for_each_entry(sum, list, list) { + btrfs_csum_file_blocks(trans, + BTRFS_I(inode)->root->fs_info->csum_root, sum); + } + return 0; +} + +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end) +{ + if ((end & (PAGE_CACHE_SIZE - 1)) == 0) + WARN_ON(1); + return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, + GFP_NOFS); +} + +/* see btrfs_writepage_start_hook for details on why this is required */ +struct btrfs_writepage_fixup { + struct page *page; + struct btrfs_work work; +}; + +static void btrfs_writepage_fixup_worker(struct btrfs_work *work) +{ + struct btrfs_writepage_fixup *fixup; + struct btrfs_ordered_extent *ordered; + struct page *page; + struct inode *inode; + u64 page_start; + u64 page_end; + + fixup = container_of(work, struct btrfs_writepage_fixup, work); + page = fixup->page; +again: + lock_page(page); + if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + ClearPageChecked(page); + goto out_page; + } + + inode = page->mapping->host; + page_start = page_offset(page); + page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; + + lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); + + /* already ordered? We're done */ + if (PagePrivate2(page)) + goto out; + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(&BTRFS_I(inode)->io_tree, page_start, + page_end, GFP_NOFS); + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + goto again; + } + + btrfs_set_extent_delalloc(inode, page_start, page_end); + ClearPageChecked(page); +out: + unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); +out_page: + unlock_page(page); + page_cache_release(page); +} + +/* + * There are a few paths in the higher layers of the kernel that directly + * set the page dirty bit without asking the filesystem if it is a + * good idea. This causes problems because we want to make sure COW + * properly happens and the data=ordered rules are followed. + * + * In our case any range that doesn't have the ORDERED bit set + * hasn't been properly setup for IO. We kick off an async process + * to fix it up. The async helper will wait for ordered extents, set + * the delalloc bit and make it safe to write the page. + */ +static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) +{ + struct inode *inode = page->mapping->host; + struct btrfs_writepage_fixup *fixup; + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* this page is properly in the ordered list */ + if (TestClearPagePrivate2(page)) + return 0; + + if (PageChecked(page)) + return -EAGAIN; + + fixup = kzalloc(sizeof(*fixup), GFP_NOFS); + if (!fixup) + return -EAGAIN; + + SetPageChecked(page); + page_cache_get(page); + fixup->work.func = btrfs_writepage_fixup_worker; + fixup->page = page; + btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); + return -EAGAIN; +} + +static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, + struct inode *inode, u64 file_pos, + u64 disk_bytenr, u64 disk_num_bytes, + u64 num_bytes, u64 ram_bytes, + u8 compression, u8 encryption, + u16 other_encoding, int extent_type) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *fi; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key ins; + u64 hint; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + path->leave_spinning = 1; + + /* + * we may be replacing one extent in the tree with another. + * The new extent is pinned in the extent map, and we don't want + * to drop it from the cache until it is completely in the btree. + * + * So, tell btrfs_drop_extents to leave this extent in the cache. + * the caller is expected to unpin it and allow it to be merged + * with the others. + */ + ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, + &hint, 0); + BUG_ON(ret); + + ins.objectid = inode->i_ino; + ins.offset = file_pos; + ins.type = BTRFS_EXTENT_DATA_KEY; + ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); + BUG_ON(ret); + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_type(leaf, fi, extent_type); + btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); + btrfs_set_file_extent_compression(leaf, fi, compression); + btrfs_set_file_extent_encryption(leaf, fi, encryption); + btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); + + btrfs_unlock_up_safe(path, 1); + btrfs_set_lock_blocking(leaf); + + btrfs_mark_buffer_dirty(leaf); + + inode_add_bytes(inode, num_bytes); + + ins.objectid = disk_bytenr; + ins.offset = disk_num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + ret = btrfs_alloc_reserved_file_extent(trans, root, + root->root_key.objectid, + inode->i_ino, file_pos, &ins); + BUG_ON(ret); + btrfs_free_path(path); + + return 0; +} + +/* + * helper function for btrfs_finish_ordered_io, this + * just reads in some of the csum leaves to prime them into ram + * before we start the transaction. It limits the amount of btree + * reads required while inside the transaction. + */ +/* as ordered data IO finishes, this gets called so we can finish + * an ordered extent if the range of bytes in the file it covers are + * fully written. + */ +static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_ordered_extent *ordered_extent = NULL; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int compressed = 0; + int ret; + + ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1); + if (!ret) + return 0; + + ordered_extent = btrfs_lookup_ordered_extent(inode, start); + BUG_ON(!ordered_extent); + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + BUG_ON(!list_empty(&ordered_extent->list)); + ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); + if (!ret) { + trans = btrfs_join_transaction(root, 1); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + btrfs_end_transaction(trans, root); + } + goto out; + } + + lock_extent(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + GFP_NOFS); + + trans = btrfs_join_transaction(root, 1); + + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) + compressed = 1; + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { + BUG_ON(compressed); + ret = btrfs_mark_extent_written(trans, inode, + ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len); + BUG_ON(ret); + } else { + ret = insert_reserved_file_extent(trans, inode, + ordered_extent->file_offset, + ordered_extent->start, + ordered_extent->disk_len, + ordered_extent->len, + ordered_extent->len, + compressed, 0, 0, + BTRFS_FILE_EXTENT_REG); + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered_extent->file_offset, + ordered_extent->len); + BUG_ON(ret); + } + unlock_extent(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + GFP_NOFS); + add_pending_csums(trans, inode, ordered_extent->file_offset, + &ordered_extent->list); + + /* this also removes the ordered extent from the tree */ + btrfs_ordered_update_i_size(inode, 0, ordered_extent); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + btrfs_end_transaction(trans, root); +out: + /* once for us */ + btrfs_put_ordered_extent(ordered_extent); + /* once for the tree */ + btrfs_put_ordered_extent(ordered_extent); + + return 0; +} + +static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate) +{ + ClearPagePrivate2(page); + return btrfs_finish_ordered_io(page->mapping->host, start, end); +} + +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. + */ +struct io_failure_record { + struct page *page; + u64 start; + u64 len; + u64 logical; + unsigned long bio_flags; + int last_mirror; +}; + +static int btrfs_io_failed_hook(struct bio *failed_bio, + struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct io_failure_record *failrec = NULL; + u64 private; + struct extent_map *em; + struct inode *inode = page->mapping->host; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct bio *bio; + int num_copies; + int ret; + int rw; + u64 logical; + + ret = get_state_private(failure_tree, start, &private); + if (ret) { + failrec = kmalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return -ENOMEM; + failrec->start = start; + failrec->len = end - start + 1; + failrec->last_mirror = 0; + failrec->bio_flags = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (em->start > start || em->start + em->len < start) { + free_extent_map(em); + em = NULL; + } + read_unlock(&em_tree->lock); + + if (!em || IS_ERR(em)) { + kfree(failrec); + return -EIO; + } + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + } + failrec->logical = logical; + free_extent_map(em); + set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | + EXTENT_DIRTY, GFP_NOFS); + set_state_private(failure_tree, start, + (u64)(unsigned long)failrec); + } else { + failrec = (struct io_failure_record *)(unsigned long)private; + } + num_copies = btrfs_num_copies( + &BTRFS_I(inode)->root->fs_info->mapping_tree, + failrec->logical, failrec->len); + failrec->last_mirror++; + if (!state) { + spin_lock(&BTRFS_I(inode)->io_tree.lock); + state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, + failrec->start, + EXTENT_LOCKED); + if (state && state->start != failrec->start) + state = NULL; + spin_unlock(&BTRFS_I(inode)->io_tree.lock); + } + if (!state || failrec->last_mirror > num_copies) { + set_state_private(failure_tree, failrec->start, 0); + clear_extent_bits(failure_tree, failrec->start, + failrec->start + failrec->len - 1, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + kfree(failrec); + return -EIO; + } + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_private = state; + bio->bi_end_io = failed_bio->bi_end_io; + bio->bi_sector = failrec->logical >> 9; + bio->bi_bdev = failed_bio->bi_bdev; + bio->bi_size = 0; + + bio_add_page(bio, page, failrec->len, start - page_offset(page)); + if (failed_bio->bi_rw & (1 << BIO_RW)) + rw = WRITE; + else + rw = READ; + + BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, + failrec->last_mirror, + failrec->bio_flags); + return 0; +} + +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ +static int btrfs_clean_io_failures(struct inode *inode, u64 start) +{ + u64 private; + u64 private_failure; + struct io_failure_record *failure; + int ret; + + private = 0; + if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, + (u64)-1, 1, EXTENT_DIRTY)) { + ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, + start, &private_failure); + if (ret == 0) { + failure = (struct io_failure_record *)(unsigned long) + private_failure; + set_state_private(&BTRFS_I(inode)->io_failure_tree, + failure->start, 0); + clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, + failure->start, + failure->start + failure->len - 1, + EXTENT_DIRTY | EXTENT_LOCKED, + GFP_NOFS); + kfree(failure); + } + } + return 0; +} + +/* + * when reads are done, we need to check csums to verify the data is correct + * if there's a match, we allow the bio to finish. If not, we go through + * the io_failure_record routines to find good copies + */ +static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); + struct inode *inode = page->mapping->host; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + char *kaddr; + u64 private = ~(u32)0; + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + u32 csum = ~(u32)0; + + if (PageChecked(page)) { + ClearPageChecked(page); + goto good; + } + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + return 0; + + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && + test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, + GFP_NOFS); + return 0; + } + + if (state && state->start == start) { + private = state->private; + ret = 0; + } else { + ret = get_state_private(io_tree, start, &private); + } + kaddr = kmap_atomic(page, KM_USER0); + if (ret) + goto zeroit; + + csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); + btrfs_csum_final(csum, (char *)&csum); + if (csum != private) + goto zeroit; + + kunmap_atomic(kaddr, KM_USER0); +good: + /* if the io failure tree for this inode is non-empty, + * check to see if we've recovered from a failed IO + */ + btrfs_clean_io_failures(inode, start); + return 0; + +zeroit: + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " + "private %llu\n", page->mapping->host->i_ino, + (unsigned long long)start, csum, + (unsigned long long)private); + } + memset(kaddr + offset, 1, end - start + 1); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + if (private == 0) + return 0; + return -EIO; +} + +struct delayed_iput { + struct list_head list; + struct inode *inode; +}; + +void btrfs_add_delayed_iput(struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct delayed_iput *delayed; + + if (atomic_add_unless(&inode->i_count, -1, 1)) + return; + + delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); + delayed->inode = inode; + + spin_lock(&fs_info->delayed_iput_lock); + list_add_tail(&delayed->list, &fs_info->delayed_iputs); + spin_unlock(&fs_info->delayed_iput_lock); +} + +void btrfs_run_delayed_iputs(struct btrfs_root *root) +{ + LIST_HEAD(list); + struct btrfs_fs_info *fs_info = root->fs_info; + struct delayed_iput *delayed; + int empty; + + spin_lock(&fs_info->delayed_iput_lock); + empty = list_empty(&fs_info->delayed_iputs); + spin_unlock(&fs_info->delayed_iput_lock); + if (empty) + return; + + down_read(&root->fs_info->cleanup_work_sem); + spin_lock(&fs_info->delayed_iput_lock); + list_splice_init(&fs_info->delayed_iputs, &list); + spin_unlock(&fs_info->delayed_iput_lock); + + while (!list_empty(&list)) { + delayed = list_entry(list.next, struct delayed_iput, list); + list_del(&delayed->list); + iput(delayed->inode); + kfree(delayed); + } + up_read(&root->fs_info->cleanup_work_sem); +} + +/* + * This creates an orphan entry for the given inode in case something goes + * wrong in the middle of an unlink/truncate. + */ +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + spin_lock(&root->list_lock); + + /* already on the orphan list, we're good */ + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + spin_unlock(&root->list_lock); + return 0; + } + + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + + spin_unlock(&root->list_lock); + + /* + * insert an orphan item to track this unlinked/truncated file + */ + ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); + + return ret; +} + +/* + * We have done the truncate/delete so we can go ahead and remove the orphan + * item for this particular inode. + */ +int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + spin_lock(&root->list_lock); + + if (list_empty(&BTRFS_I(inode)->i_orphan)) { + spin_unlock(&root->list_lock); + return 0; + } + + list_del_init(&BTRFS_I(inode)->i_orphan); + if (!trans) { + spin_unlock(&root->list_lock); + return 0; + } + + spin_unlock(&root->list_lock); + + ret = btrfs_del_orphan_item(trans, root, inode->i_ino); + + return ret; +} + +/* + * this cleans up any orphans that may be left on the list from the last use + * of this root. + */ +void btrfs_orphan_cleanup(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_item *item; + struct btrfs_key key, found_key; + struct btrfs_trans_handle *trans; + struct inode *inode; + int ret = 0, nr_unlink = 0, nr_truncate = 0; + + if (!xchg(&root->clean_orphans, 0)) + return; + + path = btrfs_alloc_path(); + BUG_ON(!path); + path->reada = -1; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + printk(KERN_ERR "Error searching slot for orphan: %d" + "\n", ret); + break; + } + + /* + * if ret == 0 means we found what we were searching for, which + * is weird, but possible, so only screw with path if we didnt + * find the key and see if we have stuff that matches + */ + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + /* pull out the item */ + leaf = path->nodes[0]; + item = btrfs_item_nr(leaf, path->slots[0]); + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + /* make sure the item matches what we want */ + if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) + break; + if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) + break; + + /* release the path since we're done with it */ + btrfs_release_path(root, path); + + /* + * this is where we are basically btrfs_lookup, without the + * crossing root thing. we store the inode number in the + * offset of the orphan item. + */ + found_key.objectid = found_key.offset; + found_key.type = BTRFS_INODE_ITEM_KEY; + found_key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &found_key, root); + if (IS_ERR(inode)) + break; + + /* + * add this inode to the orphan list so btrfs_orphan_del does + * the proper thing when we hit it + */ + spin_lock(&root->list_lock); + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + spin_unlock(&root->list_lock); + + /* + * if this is a bad inode, means we actually succeeded in + * removing the inode, but not the orphan record, which means + * we need to manually delete the orphan since iput will just + * do a destroy_inode + */ + if (is_bad_inode(inode)) { + trans = btrfs_start_transaction(root, 1); + btrfs_orphan_del(trans, inode); + btrfs_end_transaction(trans, root); + iput(inode); + continue; + } + + /* if we have links, this was a truncate, lets do that */ + if (inode->i_nlink) { + nr_truncate++; + btrfs_truncate(inode); + } else { + nr_unlink++; + } + + /* this will do delete_inode and everything for us */ + iput(inode); + } + + if (nr_unlink) + printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); + if (nr_truncate) + printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); + + btrfs_free_path(path); +} + +/* + * very simple check to peek ahead in the leaf looking for xattrs. If we + * don't find any xattrs, we know there can't be any acls. + * + * slot is the slot the inode is in, objectid is the objectid of the inode + */ +static noinline int acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid) +{ + u32 nritems = btrfs_header_nritems(leaf); + struct btrfs_key found_key; + int scanned = 0; + + slot++; + while (slot < nritems) { + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* we found a different objectid, there must not be acls */ + if (found_key.objectid != objectid) + return 0; + + /* we found an xattr, assume we've got an acl */ + if (found_key.type == BTRFS_XATTR_ITEM_KEY) + return 1; + + /* + * we found a key greater than an xattr key, there can't + * be any acls later on + */ + if (found_key.type > BTRFS_XATTR_ITEM_KEY) + return 0; + + slot++; + scanned++; + + /* + * it goes inode, inode backrefs, xattrs, extents, + * so if there are a ton of hard links to an inode there can + * be a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization + */ + if (scanned >= 8) + break; + } + /* we hit the end of the leaf before we found an xattr or + * something larger than an xattr. We have to assume the inode + * has acls + */ + return 1; +} + +/* + * read an inode from the btree into the in-memory inode + */ +static void btrfs_read_locked_inode(struct inode *inode) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_inode_item *inode_item; + struct btrfs_timespec *tspec; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key location; + int maybe_acls; + u64 alloc_group_block; + u32 rdev; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); + + ret = btrfs_lookup_inode(NULL, root, path, &location, 0); + if (ret) + goto make_bad; + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + + inode->i_mode = btrfs_inode_mode(leaf, inode_item); + inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); + inode->i_uid = btrfs_inode_uid(leaf, inode_item); + inode->i_gid = btrfs_inode_gid(leaf, inode_item); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + + tspec = btrfs_inode_atime(inode_item); + inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + tspec = btrfs_inode_mtime(inode_item); + inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + tspec = btrfs_inode_ctime(inode_item); + inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); + BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); + BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); + inode->i_generation = BTRFS_I(inode)->generation; + inode->i_rdev = 0; + rdev = btrfs_inode_rdev(leaf, inode_item); + + BTRFS_I(inode)->index_cnt = (u64)-1; + BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + + alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + + /* + * try to precache a NULL acl entry for files that don't have + * any xattrs or acls + */ + maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); + if (!maybe_acls) + cache_no_acl(inode); + + BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, + alloc_group_block, 0); + btrfs_free_path(path); + inode_item = NULL; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + break; + case S_IFDIR: + inode->i_fop = &btrfs_dir_file_operations; + if (root == root->fs_info->tree_root) + inode->i_op = &btrfs_dir_ro_inode_operations; + else + inode->i_op = &btrfs_dir_inode_operations; + break; + case S_IFLNK: + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + break; + default: + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + break; + } + + btrfs_update_iflags(inode); + return; + +make_bad: + btrfs_free_path(path); + make_bad_inode(inode); +} + +/* + * given a leaf and an inode, copy the inode fields into the leaf + */ +static void fill_inode_item(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf, + struct btrfs_inode_item *item, + struct inode *inode) +{ + btrfs_set_inode_uid(leaf, item, inode->i_uid); + btrfs_set_inode_gid(leaf, item, inode->i_gid); + btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + + btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec); + + btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); + btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); + btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); + btrfs_set_inode_transid(leaf, item, trans->transid); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); + btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); + btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); +} + +/* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + struct btrfs_inode_item *inode_item; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + path->leave_spinning = 1; + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(inode)->location, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto failed; + } + + btrfs_unlock_up_safe(path, 1); + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + + fill_inode_item(trans, leaf, inode_item, inode); + btrfs_mark_buffer_dirty(leaf); + btrfs_set_inode_last_trans(trans, inode); + ret = 0; +failed: + btrfs_free_path(path); + return ret; +} + + +/* + * unlink helper that gets used here in inode.c and in the tree logging + * recovery code. It remove a link in a directory with a given name, and + * also drops the back refs in the inode to the directory + */ +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) +{ + struct btrfs_path *path; + int ret = 0; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 index; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto err; + } + + path->leave_spinning = 1; + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto err; + } + if (!di) { + ret = -ENOENT; + goto err; + } + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto err; + btrfs_release_path(root, path); + + ret = btrfs_del_inode_ref(trans, root, name, name_len, + inode->i_ino, + dir->i_ino, &index); + if (ret) { + printk(KERN_INFO "btrfs failed to delete reference to %.*s, " + "inode %lu parent %lu\n", name_len, name, + inode->i_ino, dir->i_ino); + goto err; + } + + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, + index, name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto err; + } + if (!di) { + ret = -ENOENT; + goto err; + } + ret = btrfs_delete_one_dir_name(trans, root, path, di); + btrfs_release_path(root, path); + + ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, + inode, dir->i_ino); + BUG_ON(ret != 0 && ret != -ENOENT); + + ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, + dir, index); + BUG_ON(ret); +err: + btrfs_free_path(path); + if (ret) + goto out; + + btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; + btrfs_update_inode(trans, root, dir); + btrfs_drop_nlink(inode); + ret = btrfs_update_inode(trans, root, inode); +out: + return ret; +} + +static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct btrfs_root *root; + struct btrfs_trans_handle *trans; + struct inode *inode = dentry->d_inode; + int ret; + unsigned long nr = 0; + + root = BTRFS_I(dir)->root; + + /* + * 5 items for unlink inode + * 1 for orphan + */ + ret = btrfs_reserve_metadata_space(root, 6); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + btrfs_unreserve_metadata_space(root, 6); + return PTR_ERR(trans); + } + + btrfs_set_trans_block_group(trans, dir); + + btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); + + ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + + if (inode->i_nlink == 0) + ret = btrfs_orphan_add(trans, inode); + + nr = trans->blocks_used; + + btrfs_end_transaction_throttle(trans, root); + btrfs_unreserve_metadata_space(root, 6); + btrfs_btree_balance_dirty(root, nr); + return ret; +} + +int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, u64 objectid, + const char *name, int name_len) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 index; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + name, name_len, -1); + BUG_ON(!di || IS_ERR(di)); + + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + BUG_ON(ret); + btrfs_release_path(root, path); + + ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, + objectid, root->root_key.objectid, + dir->i_ino, &index, name, name_len); + if (ret < 0) { + BUG_ON(ret != -ENOENT); + di = btrfs_search_dir_index_item(root, path, dir->i_ino, + name, name_len); + BUG_ON(!di || IS_ERR(di)); + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(root, path); + index = key.offset; + } + + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, + index, name, name_len, -1); + BUG_ON(!di || IS_ERR(di)); + + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + BUG_ON(ret); + btrfs_release_path(root, path); + + btrfs_i_size_write(dir, dir->i_size - name_len * 2); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + dir->i_sb->s_dirt = 1; + + btrfs_free_path(path); + return 0; +} + +static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int err = 0; + int ret; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_trans_handle *trans; + unsigned long nr = 0; + + if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || + inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return -ENOTEMPTY; + + ret = btrfs_reserve_metadata_space(root, 5); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + btrfs_unreserve_metadata_space(root, 5); + return PTR_ERR(trans); + } + + btrfs_set_trans_block_group(trans, dir); + + if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + err = btrfs_unlink_subvol(trans, root, dir, + BTRFS_I(inode)->location.objectid, + dentry->d_name.name, + dentry->d_name.len); + goto out; + } + + err = btrfs_orphan_add(trans, inode); + if (err) + goto out; + + /* now the directory is empty */ + err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + if (!err) + btrfs_i_size_write(inode, 0); +out: + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + btrfs_unreserve_metadata_space(root, 5); + btrfs_btree_balance_dirty(root, nr); + + if (ret && !err) + err = ret; + return err; +} + +#if 0 +/* + * when truncating bytes in a file, it is possible to avoid reading + * the leaves that contain only checksum items. This can be the + * majority of the IO required to delete a large file, but it must + * be done carefully. + * + * The keys in the level just above the leaves are checked to make sure + * the lowest key in a given leaf is a csum key, and starts at an offset + * after the new size. + * + * Then the key for the next leaf is checked to make sure it also has + * a checksum item for the same file. If it does, we know our target leaf + * contains only checksum items, and it can be safely freed without reading + * it. + * + * This is just an optimization targeted at large files. It may do + * nothing. It will return 0 unless things went badly. + */ +static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct inode *inode, u64 new_size) +{ + struct btrfs_key key; + int ret; + int nritems; + struct btrfs_key found_key; + struct btrfs_key other_key; + struct btrfs_leaf_ref *ref; + u64 leaf_gen; + u64 leaf_start; + + path->lowest_level = 1; + key.objectid = inode->i_ino; + key.type = BTRFS_CSUM_ITEM_KEY; + key.offset = new_size; +again: + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (path->nodes[1] == NULL) { + ret = 0; + goto out; + } + ret = 0; + btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]); + nritems = btrfs_header_nritems(path->nodes[1]); + + if (!nritems) + goto out; + + if (path->slots[1] >= nritems) + goto next_node; + + /* did we find a key greater than anything we want to delete? */ + if (found_key.objectid > inode->i_ino || + (found_key.objectid == inode->i_ino && found_key.type > key.type)) + goto out; + + /* we check the next key in the node to make sure the leave contains + * only checksum items. This comparison doesn't work if our + * leaf is the last one in the node + */ + if (path->slots[1] + 1 >= nritems) { +next_node: + /* search forward from the last key in the node, this + * will bring us into the next node in the tree + */ + btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1); + + /* unlikely, but we inc below, so check to be safe */ + if (found_key.offset == (u64)-1) + goto out; + + /* search_forward needs a path with locks held, do the + * search again for the original key. It is possible + * this will race with a balance and return a path that + * we could modify, but this drop is just an optimization + * and is allowed to miss some leaves. + */ + btrfs_release_path(root, path); + found_key.offset++; + + /* setup a max key for search_forward */ + other_key.offset = (u64)-1; + other_key.type = key.type; + other_key.objectid = key.objectid; + + path->keep_locks = 1; + ret = btrfs_search_forward(root, &found_key, &other_key, + path, 0, 0); + path->keep_locks = 0; + if (ret || found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + goto out; + } + + key.offset = found_key.offset; + btrfs_release_path(root, path); + cond_resched(); + goto again; + } + + /* we know there's one more slot after us in the tree, + * read that key so we can verify it is also a checksum item + */ + btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1); + + if (found_key.objectid < inode->i_ino) + goto next_key; + + if (found_key.type != key.type || found_key.offset < new_size) + goto next_key; + + /* + * if the key for the next leaf isn't a csum key from this objectid, + * we can't be sure there aren't good items inside this leaf. + * Bail out + */ + if (other_key.objectid != inode->i_ino || other_key.type != key.type) + goto out; + + leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]); + leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]); + /* + * it is safe to delete this leaf, it contains only + * csum items from this inode at an offset >= new_size + */ + ret = btrfs_del_leaf(trans, root, path, leaf_start); + BUG_ON(ret); + + if (root->ref_cows && leaf_gen < trans->transid) { + ref = btrfs_alloc_leaf_ref(root, 0); + if (ref) { + ref->root_gen = root->root_key.offset; + ref->bytenr = leaf_start; + ref->owner = 0; + ref->generation = leaf_gen; + ref->nritems = 0; + + btrfs_sort_leaf_ref(ref); + + ret = btrfs_add_leaf_ref(root, ref, 0); + WARN_ON(ret); + btrfs_free_leaf_ref(root, ref); + } else { + WARN_ON(1); + } + } +next_key: + btrfs_release_path(root, path); + + if (other_key.objectid == inode->i_ino && + other_key.type == key.type && other_key.offset > key.offset) { + key.offset = other_key.offset; + cond_resched(); + goto again; + } + ret = 0; +out: + /* fixup any changes we've made to the path */ + path->lowest_level = 0; + path->keep_locks = 0; + btrfs_release_path(root, path); + return ret; +} + +#endif + +/* + * this can truncate away extent items, csum items and directory items. + * It starts at a high offset and removes keys until it can't find + * any higher than new_size + * + * csum items that cross the new i_size are truncated to the new size + * as well. + * + * min_type is the minimum key type to truncate down to. If set to 0, this + * will kill all the items on this inode, including the INODE_ITEM_KEY. + */ +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + u64 new_size, u32 min_type) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key found_key; + u64 extent_start = 0; + u64 extent_num_bytes = 0; + u64 extent_offset = 0; + u64 item_end = 0; + u64 mask = root->sectorsize - 1; + u32 found_type = (u8)-1; + int found_extent; + int del_item; + int pending_del_nr = 0; + int pending_del_slot = 0; + int extent_type = -1; + int encoding; + int ret; + int err = 0; + + BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); + + if (root->ref_cows) + btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); + + path = btrfs_alloc_path(); + BUG_ON(!path); + path->reada = -1; + + key.objectid = inode->i_ino; + key.offset = (u64)-1; + key.type = (u8)-1; + +search_again: + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto out; + } + + if (ret > 0) { + /* there are no items in the tree for us to truncate, we're + * done + */ + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + + while (1) { + fi = NULL; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + encoding = 0; + + if (found_key.objectid != inode->i_ino) + break; + + if (found_type < min_type) + break; + + item_end = found_key.offset; + if (found_type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + encoding = btrfs_file_extent_compression(leaf, fi); + encoding |= btrfs_file_extent_encryption(leaf, fi); + encoding |= btrfs_file_extent_other_encoding(leaf, fi); + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + item_end += + btrfs_file_extent_num_bytes(leaf, fi); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + item_end += btrfs_file_extent_inline_len(leaf, + fi); + } + item_end--; + } + if (found_type > min_type) { + del_item = 1; + } else { + if (item_end < new_size) + break; + if (found_key.offset >= new_size) + del_item = 1; + else + del_item = 0; + } + found_extent = 0; + /* FIXME, shrink the extent if the ref count is only 1 */ + if (found_type != BTRFS_EXTENT_DATA_KEY) + goto delete; + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + u64 num_dec; + extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); + if (!del_item && !encoding) { + u64 orig_num_bytes = + btrfs_file_extent_num_bytes(leaf, fi); + extent_num_bytes = new_size - + found_key.offset + root->sectorsize - 1; + extent_num_bytes = extent_num_bytes & + ~((u64)root->sectorsize - 1); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_num_bytes); + num_dec = (orig_num_bytes - + extent_num_bytes); + if (root->ref_cows && extent_start != 0) + inode_sub_bytes(inode, num_dec); + btrfs_mark_buffer_dirty(leaf); + } else { + extent_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, + fi); + extent_offset = found_key.offset - + btrfs_file_extent_offset(leaf, fi); + + /* FIXME blocksize != 4096 */ + num_dec = btrfs_file_extent_num_bytes(leaf, fi); + if (extent_start != 0) { + found_extent = 1; + if (root->ref_cows) + inode_sub_bytes(inode, num_dec); + } + } + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + /* + * we can't truncate inline items that have had + * special encodings + */ + if (!del_item && + btrfs_file_extent_compression(leaf, fi) == 0 && + btrfs_file_extent_encryption(leaf, fi) == 0 && + btrfs_file_extent_other_encoding(leaf, fi) == 0) { + u32 size = new_size - found_key.offset; + + if (root->ref_cows) { + inode_sub_bytes(inode, item_end + 1 - + new_size); + } + size = + btrfs_file_extent_calc_inline_size(size); + ret = btrfs_truncate_item(trans, root, path, + size, 1); + BUG_ON(ret); + } else if (root->ref_cows) { + inode_sub_bytes(inode, item_end + 1 - + found_key.offset); + } + } +delete: + if (del_item) { + if (!pending_del_nr) { + /* no pending yet, add ourselves */ + pending_del_slot = path->slots[0]; + pending_del_nr = 1; + } else if (pending_del_nr && + path->slots[0] + 1 == pending_del_slot) { + /* hop on the pending chunk */ + pending_del_nr++; + pending_del_slot = path->slots[0]; + } else { + BUG(); + } + } else { + break; + } + if (found_extent && root->ref_cows) { + btrfs_set_path_blocking(path); + ret = btrfs_free_extent(trans, root, extent_start, + extent_num_bytes, 0, + btrfs_header_owner(leaf), + inode->i_ino, extent_offset); + BUG_ON(ret); + } + + if (found_type == BTRFS_INODE_ITEM_KEY) + break; + + if (path->slots[0] == 0 || + path->slots[0] != pending_del_slot) { + if (root->ref_cows) { + err = -EAGAIN; + goto out; + } + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + BUG_ON(ret); + pending_del_nr = 0; + } + btrfs_release_path(root, path); + goto search_again; + } else { + path->slots[0]--; + } + } +out: + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, pending_del_slot, + pending_del_nr); + } + btrfs_free_path(path); + return err; +} + +/* + * taken from block_truncate_page, but does cow as it zeros out + * any bytes left in the last page in the file. + */ +static int btrfs_truncate_page(struct address_space *mapping, loff_t from) +{ + struct inode *inode = mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + char *kaddr; + u32 blocksize = root->sectorsize; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + struct page *page; + int ret = 0; + u64 page_start; + u64 page_end; + + if ((offset & (blocksize - 1)) == 0) + goto out; + ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); + if (ret) + goto out; + + ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (ret) + goto out; + + ret = -ENOMEM; +again: + page = grab_cache_page(mapping, index); + if (!page) { + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + goto out; + } + + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + if (!PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + lock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + if (!PageUptodate(page)) { + ret = -EIO; + goto out_unlock; + } + } + wait_on_page_writeback(page); + + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_extent_mapped(page); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + + clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + GFP_NOFS); + + ret = btrfs_set_extent_delalloc(inode, page_start, page_end); + if (ret) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + goto out_unlock; + } + + ret = 0; + if (offset != PAGE_CACHE_SIZE) { + kaddr = kmap(page); + memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + kunmap(page); + } + ClearPageChecked(page); + set_page_dirty(page); + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + +out_unlock: + if (ret) + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + unlock_page(page); + page_cache_release(page); +out: + return ret; +} + +int btrfs_cont_expand(struct inode *inode, loff_t size) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em; + u64 mask = root->sectorsize - 1; + u64 hole_start = (inode->i_size + mask) & ~mask; + u64 block_end = (size + mask) & ~mask; + u64 last_byte; + u64 cur_offset; + u64 hole_size; + int err = 0; + + if (size <= hole_start) + return 0; + + while (1) { + struct btrfs_ordered_extent *ordered; + btrfs_wait_ordered_range(inode, hole_start, + block_end - hole_start); + lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, hole_start); + if (!ordered) + break; + unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + btrfs_put_ordered_extent(ordered); + } + + cur_offset = hole_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + block_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), block_end); + last_byte = (last_byte + mask) & ~mask; + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + u64 hint_byte = 0; + hole_size = last_byte - cur_offset; + + err = btrfs_reserve_metadata_space(root, 2); + if (err) + break; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + err = btrfs_drop_extents(trans, inode, cur_offset, + cur_offset + hole_size, + &hint_byte, 1); + BUG_ON(err); + + err = btrfs_insert_file_extent(trans, root, + inode->i_ino, cur_offset, 0, + 0, hole_size, 0, hole_size, + 0, 0, 0); + BUG_ON(err); + + btrfs_drop_extent_cache(inode, hole_start, + last_byte - 1, 0); + + btrfs_end_transaction(trans, root); + btrfs_unreserve_metadata_space(root, 2); + } + free_extent_map(em); + cur_offset = last_byte; + if (cur_offset >= block_end) + break; + } + + unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + return err; +} + +static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + unsigned long nr; + int ret; + + if (attr->ia_size == inode->i_size) + return 0; + + if (attr->ia_size > inode->i_size) { + unsigned long limit; + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (attr->ia_size > inode->i_sb->s_maxbytes) + return -EFBIG; + if (limit != RLIM_INFINITY && attr->ia_size > limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + } + + ret = btrfs_reserve_metadata_space(root, 1); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + ret = btrfs_orphan_add(trans, inode); + BUG_ON(ret); + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_unreserve_metadata_space(root, 1); + btrfs_btree_balance_dirty(root, nr); + + if (attr->ia_size > inode->i_size) { + ret = btrfs_cont_expand(inode, attr->ia_size); + if (ret) { + btrfs_truncate(inode); + return ret; + } + + i_size_write(inode, attr->ia_size); + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + if (inode->i_nlink > 0) { + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + } + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + return 0; + } + + /* + * We're truncating a file that used to have good data down to + * zero. Make sure it gets into the ordered flush list so that + * any new writes get down to disk quickly. + */ + if (attr->ia_size == 0) + BTRFS_I(inode)->ordered_data_close = 1; + + /* we don't support swapfiles, so vmtruncate shouldn't fail */ + ret = vmtruncate(inode, attr->ia_size); + BUG_ON(ret); + + return 0; +} + +static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int err; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { + err = btrfs_setattr_size(inode, attr); + if (err) + return err; + } + attr->ia_valid &= ~ATTR_SIZE; + + if (attr->ia_valid) + err = inode_setattr(inode, attr); + + if (!err && ((attr->ia_valid & ATTR_MODE))) + err = btrfs_acl_chmod(inode); + return err; +} + +void btrfs_delete_inode(struct inode *inode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long nr; + int ret; + + truncate_inode_pages(&inode->i_data, 0); + if (is_bad_inode(inode)) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + btrfs_wait_ordered_range(inode, 0, (u64)-1); + + if (root->fs_info->log_root_recovering) { + BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); + goto no_delete; + } + + if (inode->i_nlink > 0) { + BUG_ON(btrfs_root_refs(&root->root_item) != 0); + goto no_delete; + } + + btrfs_i_size_write(inode, 0); + + while (1) { + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); + + if (ret != -EAGAIN) + break; + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + trans = NULL; + btrfs_btree_balance_dirty(root, nr); + } + + if (ret == 0) { + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + } + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); +no_delete: + clear_inode(inode); + return; +} + +/* + * this returns the key found in the dir entry in the location pointer. + * If no dir entries were found, location->objectid is 0. + */ +static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, + struct btrfs_key *location) +{ + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(dir)->root; + int ret = 0; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name, + namelen, 0); + if (IS_ERR(di)) + ret = PTR_ERR(di); + + if (!di || IS_ERR(di)) + goto out_err; + + btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); +out: + btrfs_free_path(path); + return ret; +out_err: + location->objectid = 0; + goto out; +} + +/* + * when we hit a tree root in a directory, the btrfs part of the inode + * needs to be changed to reflect the root directory of the tree root. This + * is kind of like crossing a mount point. + */ +static int fixup_tree_root_location(struct btrfs_root *root, + struct inode *dir, + struct dentry *dentry, + struct btrfs_key *location, + struct btrfs_root **sub_root) +{ + struct btrfs_path *path; + struct btrfs_root *new_root; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + + err = -ENOENT; + ret = btrfs_find_root_ref(root->fs_info->tree_root, path, + BTRFS_I(dir)->root->root_key.objectid, + location->objectid); + if (ret) { + if (ret < 0) + err = ret; + goto out; + } + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino || + btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) + goto out; + + ret = memcmp_extent_buffer(leaf, dentry->d_name.name, + (unsigned long)(ref + 1), + dentry->d_name.len); + if (ret) + goto out; + + btrfs_release_path(root->fs_info->tree_root, path); + + new_root = btrfs_read_fs_root_no_name(root->fs_info, location); + if (IS_ERR(new_root)) { + err = PTR_ERR(new_root); + goto out; + } + + if (btrfs_root_refs(&new_root->root_item) == 0) { + err = -ENOENT; + goto out; + } + + *sub_root = new_root; + location->objectid = btrfs_root_dirid(&new_root->root_item); + location->type = BTRFS_INODE_ITEM_KEY; + location->offset = 0; + err = 0; +out: + btrfs_free_path(path); + return err; +} + +static void inode_tree_add(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *entry; + struct rb_node **p; + struct rb_node *parent; +again: + p = &root->inode_tree.rb_node; + parent = NULL; + + if (hlist_unhashed(&inode->i_hash)) + return; + + spin_lock(&root->inode_lock); + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_inode, rb_node); + + if (inode->i_ino < entry->vfs_inode.i_ino) + p = &parent->rb_left; + else if (inode->i_ino > entry->vfs_inode.i_ino) + p = &parent->rb_right; + else { + WARN_ON(!(entry->vfs_inode.i_state & + (I_WILL_FREE | I_FREEING | I_CLEAR))); + rb_erase(parent, &root->inode_tree); + RB_CLEAR_NODE(parent); + spin_unlock(&root->inode_lock); + goto again; + } + } + rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); + rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); + spin_unlock(&root->inode_lock); +} + +static void inode_tree_del(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int empty = 0; + + spin_lock(&root->inode_lock); + if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { + rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + empty = RB_EMPTY_ROOT(&root->inode_tree); + } + spin_unlock(&root->inode_lock); + + if (empty && btrfs_root_refs(&root->root_item) == 0) { + synchronize_srcu(&root->fs_info->subvol_srcu); + spin_lock(&root->inode_lock); + empty = RB_EMPTY_ROOT(&root->inode_tree); + spin_unlock(&root->inode_lock); + if (empty) + btrfs_add_dead_root(root); + } +} + +int btrfs_invalidate_inodes(struct btrfs_root *root) +{ + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *entry; + struct inode *inode; + u64 objectid = 0; + + WARN_ON(btrfs_root_refs(&root->root_item) != 0); + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + entry = rb_entry(node, struct btrfs_inode, rb_node); + + if (objectid < entry->vfs_inode.i_ino) + node = node->rb_left; + else if (objectid > entry->vfs_inode.i_ino) + node = node->rb_right; + else + break; + } + if (!node) { + while (prev) { + entry = rb_entry(prev, struct btrfs_inode, rb_node); + if (objectid <= entry->vfs_inode.i_ino) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + while (node) { + entry = rb_entry(node, struct btrfs_inode, rb_node); + objectid = entry->vfs_inode.i_ino + 1; + inode = igrab(&entry->vfs_inode); + if (inode) { + spin_unlock(&root->inode_lock); + if (atomic_read(&inode->i_count) > 1) + d_prune_aliases(inode); + /* + * btrfs_drop_inode will remove it from + * the inode cache when its usage count + * hits zero. + */ + iput(inode); + cond_resched(); + spin_lock(&root->inode_lock); + goto again; + } + + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); + return 0; +} + +static noinline void init_btrfs_i(struct inode *inode) +{ + struct btrfs_inode *bi = BTRFS_I(inode); + + bi->generation = 0; + bi->sequence = 0; + bi->last_trans = 0; + bi->last_sub_trans = 0; + bi->logged_trans = 0; + bi->delalloc_bytes = 0; + bi->reserved_bytes = 0; + bi->disk_i_size = 0; + bi->flags = 0; + bi->index_cnt = (u64)-1; + bi->last_unlink_trans = 0; + bi->ordered_data_close = 0; + extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); + extent_io_tree_init(&BTRFS_I(inode)->io_tree, + inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, + inode->i_mapping, GFP_NOFS); + INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); + INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); + mutex_init(&BTRFS_I(inode)->log_mutex); +} + +static int btrfs_init_locked_inode(struct inode *inode, void *p) +{ + struct btrfs_iget_args *args = p; + inode->i_ino = args->ino; + init_btrfs_i(inode); + BTRFS_I(inode)->root = args->root; + btrfs_set_inode_space_info(args->root, inode); + return 0; +} + +static int btrfs_find_actor(struct inode *inode, void *opaque) +{ + struct btrfs_iget_args *args = opaque; + return args->ino == inode->i_ino && + args->root == BTRFS_I(inode)->root; +} + +static struct inode *btrfs_iget_locked(struct super_block *s, + u64 objectid, + struct btrfs_root *root) +{ + struct inode *inode; + struct btrfs_iget_args args; + args.ino = objectid; + args.root = root; + + inode = iget5_locked(s, objectid, btrfs_find_actor, + btrfs_init_locked_inode, + (void *)&args); + return inode; +} + +/* Get an inode object given its location and corresponding root. + * Returns in *is_new if the inode was read from disk + */ +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root) +{ + struct inode *inode; + + inode = btrfs_iget_locked(s, location->objectid, root); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); + btrfs_read_locked_inode(inode); + + inode_tree_add(inode); + unlock_new_inode(inode); + } + + return inode; +} + +static struct inode *new_simple_dir(struct super_block *s, + struct btrfs_key *key, + struct btrfs_root *root) +{ + struct inode *inode = new_inode(s); + + if (!inode) + return ERR_PTR(-ENOMEM); + + init_btrfs_i(inode); + + BTRFS_I(inode)->root = root; + memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); + BTRFS_I(inode)->dummy_inode = 1; + + inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + return inode; +} + +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *sub_root = root; + struct btrfs_key location; + int index; + int ret; + + dentry->d_op = &btrfs_dentry_operations; + + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ret = btrfs_inode_by_name(dir, dentry, &location); + + if (ret < 0) + return ERR_PTR(ret); + + if (location.objectid == 0) + return NULL; + + if (location.type == BTRFS_INODE_ITEM_KEY) { + inode = btrfs_iget(dir->i_sb, &location, root); + return inode; + } + + BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); + + index = srcu_read_lock(&root->fs_info->subvol_srcu); + ret = fixup_tree_root_location(root, dir, dentry, + &location, &sub_root); + if (ret < 0) { + if (ret != -ENOENT) + inode = ERR_PTR(ret); + else + inode = new_simple_dir(dir->i_sb, &location, sub_root); + } else { + inode = btrfs_iget(dir->i_sb, &location, sub_root); + } + srcu_read_unlock(&root->fs_info->subvol_srcu, index); + + if (root != sub_root) { + down_read(&root->fs_info->cleanup_work_sem); + if (!(inode->i_sb->s_flags & MS_RDONLY)) + btrfs_orphan_cleanup(sub_root); + up_read(&root->fs_info->cleanup_work_sem); + } + + return inode; +} + +static int btrfs_dentry_delete(struct dentry *dentry) +{ + struct btrfs_root *root; + + if (!dentry->d_inode && !IS_ROOT(dentry)) + dentry = dentry->d_parent; + + if (dentry->d_inode) { + root = BTRFS_I(dentry->d_inode)->root; + if (btrfs_root_refs(&root->root_item) == 0) + return 1; + } + return 0; +} + +static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_splice_alias(inode, dentry); +} + +static unsigned char btrfs_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int btrfs_real_readdir(struct file *filp, void *dirent, + filldir_t filldir) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_item *item; + struct btrfs_dir_item *di; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + int ret; + u32 nritems; + struct extent_buffer *leaf; + int slot; + int advance; + unsigned char d_type; + int over = 0; + u32 di_cur; + u32 di_total; + u32 di_len; + int key_type = BTRFS_DIR_INDEX_KEY; + char tmp_name[32]; + char *name_ptr; + int name_len; + + /* FIXME, use a real flag for deciding about the key type */ + if (root->fs_info->tree_root == root) + key_type = BTRFS_DIR_ITEM_KEY; + + /* special case for "." */ + if (filp->f_pos == 0) { + over = filldir(dirent, ".", 1, + 1, inode->i_ino, + DT_DIR); + if (over) + return 0; + filp->f_pos = 1; + } + /* special case for .., just use the back ref */ + if (filp->f_pos == 1) { + u64 pino = parent_ino(filp->f_path.dentry); + over = filldir(dirent, "..", 2, + 2, pino, DT_DIR); + if (over) + return 0; + filp->f_pos = 2; + } + path = btrfs_alloc_path(); + path->reada = 2; + + btrfs_set_key_type(&key, key_type); + key.offset = filp->f_pos; + key.objectid = inode->i_ino; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + advance = 0; + + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + if (advance || slot >= nritems) { + if (slot >= nritems - 1) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + } else { + slot++; + path->slots[0]++; + } + } + + advance = 1; + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid != key.objectid) + break; + if (btrfs_key_type(&found_key) != key_type) + break; + if (found_key.offset < filp->f_pos) + continue; + + filp->f_pos = found_key.offset; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + di_cur = 0; + di_total = btrfs_item_size(leaf, item); + + while (di_cur < di_total) { + struct btrfs_key location; + + name_len = btrfs_dir_name_len(leaf, di); + if (name_len <= sizeof(tmp_name)) { + name_ptr = tmp_name; + } else { + name_ptr = kmalloc(name_len, GFP_NOFS); + if (!name_ptr) { + ret = -ENOMEM; + goto err; + } + } + read_extent_buffer(leaf, name_ptr, + (unsigned long)(di + 1), name_len); + + d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; + btrfs_dir_item_key_to_cpu(leaf, di, &location); + + /* is this a reference to our own snapshot? If so + * skip it + */ + if (location.type == BTRFS_ROOT_ITEM_KEY && + location.objectid == root->root_key.objectid) { + over = 0; + goto skip; + } + over = filldir(dirent, name_ptr, name_len, + found_key.offset, location.objectid, + d_type); + +skip: + if (name_ptr != tmp_name) + kfree(name_ptr); + + if (over) + goto nopos; + di_len = btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di) + sizeof(*di); + di_cur += di_len; + di = (struct btrfs_dir_item *)((char *)di + di_len); + } + } + + /* Reached end of directory/root. Bump pos past the last item. */ + if (key_type == BTRFS_DIR_INDEX_KEY) + /* + * 32-bit glibc will use getdents64, but then strtol - + * so the last number we can serve is this. + */ + filp->f_pos = 0x7fffffff; + else + filp->f_pos++; +nopos: + ret = 0; +err: + btrfs_free_path(path); + return ret; +} + +int btrfs_write_inode(struct inode *inode, int wait) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (root->fs_info->btree_inode == inode) + return 0; + + if (wait) { + trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + ret = btrfs_commit_transaction(trans, root); + } + return ret; +} + +/* + * This is somewhat expensive, updating the tree every time the + * inode changes. But, it is most likely to find the inode in cache. + * FIXME, needs more benchmarking...there are no reasons other than performance + * to keep or drop this code. + */ +void btrfs_dirty_inode(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + + trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + btrfs_update_inode(trans, root, inode); + btrfs_end_transaction(trans, root); +} + +/* + * find the highest existing sequence number in a directory + * and then set the in-memory index_cnt variable to reflect + * free sequence numbers + */ +static int btrfs_set_inode_index_count(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key key, found_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + key.objectid = inode->i_ino; + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + /* FIXME: we should be able to handle this */ + if (ret == 0) + goto out; + ret = 0; + + /* + * MAGIC NUMBER EXPLANATION: + * since we search a directory based on f_pos we have to start at 2 + * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody + * else has to start at 2 + */ + if (path->slots[0] == 0) { + BTRFS_I(inode)->index_cnt = 2; + goto out; + } + + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != inode->i_ino || + btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { + BTRFS_I(inode)->index_cnt = 2; + goto out; + } + + BTRFS_I(inode)->index_cnt = found_key.offset + 1; +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to find a free sequence number in a given directory. This current + * code is very simple, later versions will do smarter things in the btree + */ +int btrfs_set_inode_index(struct inode *dir, u64 *index) +{ + int ret = 0; + + if (BTRFS_I(dir)->index_cnt == (u64)-1) { + ret = btrfs_set_inode_index_count(dir); + if (ret) + return ret; + } + + *index = BTRFS_I(dir)->index_cnt; + BTRFS_I(dir)->index_cnt++; + + return ret; +} + +static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, + const char *name, int name_len, + u64 ref_objectid, u64 objectid, + u64 alloc_hint, int mode, u64 *index) +{ + struct inode *inode; + struct btrfs_inode_item *inode_item; + struct btrfs_key *location; + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + struct btrfs_key key[2]; + u32 sizes[2]; + unsigned long ptr; + int ret; + int owner; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + inode = new_inode(root->fs_info->sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (dir) { + ret = btrfs_set_inode_index(dir, index); + if (ret) { + iput(inode); + return ERR_PTR(ret); + } + } + /* + * index_cnt is ignored for everything but a dir, + * btrfs_get_inode_index_count has an explanation for the magic + * number + */ + init_btrfs_i(inode); + BTRFS_I(inode)->index_cnt = 2; + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->generation = trans->transid; + btrfs_set_inode_space_info(root, inode); + + if (mode & S_IFDIR) + owner = 0; + else + owner = 1; + BTRFS_I(inode)->block_group = + btrfs_find_block_group(root, 0, alloc_hint, owner); + + key[0].objectid = objectid; + btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); + key[0].offset = 0; + + key[1].objectid = objectid; + btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); + key[1].offset = ref_objectid; + + sizes[0] = sizeof(struct btrfs_inode_item); + sizes[1] = name_len + sizeof(*ref); + + path->leave_spinning = 1; + ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); + if (ret != 0) + goto fail; + + inode->i_uid = current_fsuid(); + + if (dir && (dir->i_mode & S_ISGID)) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current_fsgid(); + + inode->i_mode = mode; + inode->i_ino = objectid; + inode_set_bytes(inode, 0); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + fill_inode_item(trans, path->nodes[0], inode_item, inode); + + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, *index); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(path->nodes[0], name, ptr, name_len); + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + + location = &BTRFS_I(inode)->location; + location->objectid = objectid; + location->offset = 0; + btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + + btrfs_inherit_iflags(inode, dir); + + if ((mode & S_IFREG)) { + if (btrfs_test_opt(root, NODATASUM)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + if (btrfs_test_opt(root, NODATACOW)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + } + + insert_inode_hash(inode); + inode_tree_add(inode); + return inode; +fail: + if (dir) + BTRFS_I(dir)->index_cnt--; + btrfs_free_path(path); + iput(inode); + return ERR_PTR(ret); +} + +static inline u8 btrfs_inode_type(struct inode *inode) +{ + return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; +} + +/* + * utility function to add 'inode' into 'parent_inode' with + * a give name and a given sequence number. + * if 'add_backref' is true, also insert a backref from the + * inode to the parent directory. + */ +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index) +{ + int ret = 0; + struct btrfs_key key; + struct btrfs_root *root = BTRFS_I(parent_inode)->root; + + if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { + memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); + } else { + key.objectid = inode->i_ino; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + } + + if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + key.objectid, root->root_key.objectid, + parent_inode->i_ino, + index, name, name_len); + } else if (add_backref) { + ret = btrfs_insert_inode_ref(trans, root, + name, name_len, inode->i_ino, + parent_inode->i_ino, index); + } + + if (ret == 0) { + ret = btrfs_insert_dir_item(trans, root, name, name_len, + parent_inode->i_ino, &key, + btrfs_inode_type(inode), index); + BUG_ON(ret); + + btrfs_i_size_write(parent_inode, parent_inode->i_size + + name_len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, parent_inode); + } + return ret; +} + +static int btrfs_add_nondir(struct btrfs_trans_handle *trans, + struct dentry *dentry, struct inode *inode, + int backref, u64 index) +{ + int err = btrfs_add_link(trans, dentry->d_parent->d_inode, + inode, dentry->d_name.name, + dentry->d_name.len, backref, index); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + if (err > 0) + err = -EEXIST; + return err; +} + +static int btrfs_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + u64 objectid; + unsigned long nr = 0; + u64 index = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + /* + * 2 for inode item and ref + * 2 for dir items + * 1 for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); + if (err) + return err; + + trans = btrfs_start_transaction(root, 1); + if (!trans) + goto fail; + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, objectid, + BTRFS_I(dir)->block_group, mode, &index); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_unlock; + + err = btrfs_init_inode_security(trans, inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + btrfs_set_trans_block_group(trans, inode); + err = btrfs_add_nondir(trans, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + btrfs_update_inode(trans, root, inode); + } + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +fail: + btrfs_unreserve_metadata_space(root, 5); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_create(struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + unsigned long nr = 0; + u64 objectid; + u64 index = 0; + + /* + * 2 for inode item and ref + * 2 for dir items + * 1 for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); + if (err) + return err; + + trans = btrfs_start_transaction(root, 1); + if (!trans) + goto fail; + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, + objectid, BTRFS_I(dir)->block_group, mode, + &index); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_unlock; + + err = btrfs_init_inode_security(trans, inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + btrfs_set_trans_block_group(trans, inode); + err = btrfs_add_nondir(trans, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + } + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +fail: + btrfs_unreserve_metadata_space(root, 5); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = old_dentry->d_inode; + u64 index; + unsigned long nr = 0; + int err; + int drop_inode = 0; + + if (inode->i_nlink == 0) + return -ENOENT; + + /* do not allow sys_link's with other subvols of the same device */ + if (root->objectid != BTRFS_I(inode)->root->objectid) + return -EPERM; + + /* + * 1 item for inode ref + * 2 items for dir items + */ + err = btrfs_reserve_metadata_space(root, 3); + if (err) + return err; + + btrfs_inc_nlink(inode); + + err = btrfs_set_inode_index(dir, &index); + if (err) + goto fail; + + trans = btrfs_start_transaction(root, 1); + + btrfs_set_trans_block_group(trans, dir); + atomic_inc(&inode->i_count); + + err = btrfs_add_nondir(trans, dentry, inode, 1, index); + + if (err) { + drop_inode = 1; + } else { + btrfs_update_inode_block_group(trans, dir); + err = btrfs_update_inode(trans, root, inode); + BUG_ON(err); + btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); + } + + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +fail: + btrfs_unreserve_metadata_space(root, 3); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + int err = 0; + int drop_on_err = 0; + u64 objectid = 0; + u64 index = 0; + unsigned long nr = 1; + + /* + * 2 items for inode and ref + * 2 items for dir items + * 1 for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); + if (err) + return err; + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + err = -ENOMEM; + goto out_unlock; + } + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, objectid, + BTRFS_I(dir)->block_group, S_IFDIR | mode, + &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_fail; + } + + drop_on_err = 1; + + err = btrfs_init_inode_security(trans, inode, dir); + if (err) + goto out_fail; + + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + btrfs_set_trans_block_group(trans, inode); + + btrfs_i_size_write(inode, 0); + err = btrfs_update_inode(trans, root, inode); + if (err) + goto out_fail; + + err = btrfs_add_link(trans, dentry->d_parent->d_inode, + inode, dentry->d_name.name, + dentry->d_name.len, 0, index); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); + drop_on_err = 0; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); + +out_fail: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + +out_unlock: + btrfs_unreserve_metadata_space(root, 5); + if (drop_on_err) + iput(inode); + btrfs_btree_balance_dirty(root, nr); + return err; +} + +/* helper for btfs_get_extent. Given an existing extent in the tree, + * and an extent that you want to insert, deal with overlap and insert + * the new extent into the tree. + */ +static int merge_extent_mapping(struct extent_map_tree *em_tree, + struct extent_map *existing, + struct extent_map *em, + u64 map_start, u64 map_len) +{ + u64 start_diff; + + BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); + start_diff = map_start - em->start; + em->start = map_start; + em->len = map_len; + if (em->block_start < EXTENT_MAP_LAST_BYTE && + !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + em->block_start += start_diff; + em->block_len -= start_diff; + } + return add_extent_mapping(em_tree, em); +} + +static noinline int uncompress_inline(struct btrfs_path *path, + struct inode *inode, struct page *page, + size_t pg_offset, u64 extent_offset, + struct btrfs_file_extent_item *item) +{ + int ret; + struct extent_buffer *leaf = path->nodes[0]; + char *tmp; + size_t max_size; + unsigned long inline_size; + unsigned long ptr; + + WARN_ON(pg_offset != 0); + max_size = btrfs_file_extent_ram_bytes(leaf, item); + inline_size = btrfs_file_extent_inline_item_len(leaf, + btrfs_item_nr(leaf, path->slots[0])); + tmp = kmalloc(inline_size, GFP_NOFS); + ptr = btrfs_file_extent_inline_start(item); + + read_extent_buffer(leaf, tmp, ptr, inline_size); + + max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); + ret = btrfs_zlib_decompress(tmp, page, extent_offset, + inline_size, max_size); + if (ret) { + char *kaddr = kmap_atomic(page, KM_USER0); + unsigned long copy_size = min_t(u64, + PAGE_CACHE_SIZE - pg_offset, + max_size - extent_offset); + memset(kaddr + pg_offset, 0, copy_size); + kunmap_atomic(kaddr, KM_USER0); + } + kfree(tmp); + return 0; +} + +/* + * a bit scary, this does extent mapping from logical file offset to the disk. + * the ugly parts come from merging extents from the disk with the in-ram + * representation. This gets more complex because of the data=ordered code, + * where the in-ram extents might be locked pending data=ordered completion. + * + * This also copies inline extents directly into the page. + */ + +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + int ret; + int err = 0; + u64 bytenr; + u64 extent_start = 0; + u64 extent_end = 0; + u64 objectid = inode->i_ino; + u32 found_type; + struct btrfs_path *path = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *item; + struct extent_buffer *leaf; + struct btrfs_key found_key; + struct extent_map *em = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_trans_handle *trans = NULL; + int compressed; + +again: + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) + em->bdev = root->fs_info->fs_devices->latest_bdev; + read_unlock(&em_tree->lock); + + if (em) { + if (em->start > start || em->start + em->len <= start) + free_extent_map(em); + else if (em->block_start == EXTENT_MAP_INLINE && page) + free_extent_map(em); + else + goto out; + } + em = alloc_extent_map(GFP_NOFS); + if (!em) { + err = -ENOMEM; + goto out; + } + em->bdev = root->fs_info->fs_devices->latest_bdev; + em->start = EXTENT_MAP_HOLE; + em->orig_start = EXTENT_MAP_HOLE; + em->len = (u64)-1; + em->block_len = (u64)-1; + + if (!path) { + path = btrfs_alloc_path(); + BUG_ON(!path); + } + + ret = btrfs_lookup_file_extent(trans, root, path, + objectid, start, trans != NULL); + if (ret < 0) { + err = ret; + goto out; + } + + if (ret != 0) { + if (path->slots[0] == 0) + goto not_found; + path->slots[0]--; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + /* are we inside the extent that was found? */ + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + if (found_key.objectid != objectid || + found_type != BTRFS_EXTENT_DATA_KEY) { + goto not_found; + } + + found_type = btrfs_file_extent_type(leaf, item); + extent_start = found_key.offset; + compressed = btrfs_file_extent_compression(leaf, item); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = extent_start + + btrfs_file_extent_num_bytes(leaf, item); + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size_t size; + size = btrfs_file_extent_inline_len(leaf, item); + extent_end = (extent_start + size + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + } + + if (start >= extent_end) { + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + goto not_found; + leaf = path->nodes[0]; + } + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != objectid || + found_key.type != BTRFS_EXTENT_DATA_KEY) + goto not_found; + if (start + len <= found_key.offset) + goto not_found; + em->start = start; + em->len = found_key.offset - start; + goto not_found_em; + } + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + em->start = extent_start; + em->len = extent_end - extent_start; + em->orig_start = extent_start - + btrfs_file_extent_offset(leaf, item); + bytenr = btrfs_file_extent_disk_bytenr(leaf, item); + if (bytenr == 0) { + em->block_start = EXTENT_MAP_HOLE; + goto insert; + } + if (compressed) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->block_start = bytenr; + em->block_len = btrfs_file_extent_disk_num_bytes(leaf, + item); + } else { + bytenr += btrfs_file_extent_offset(leaf, item); + em->block_start = bytenr; + em->block_len = em->len; + if (found_type == BTRFS_FILE_EXTENT_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } + goto insert; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + unsigned long ptr; + char *map; + size_t size; + size_t extent_offset; + size_t copy_size; + + em->block_start = EXTENT_MAP_INLINE; + if (!page || create) { + em->start = extent_start; + em->len = extent_end - extent_start; + goto out; + } + + size = btrfs_file_extent_inline_len(leaf, item); + extent_offset = page_offset(page) + pg_offset - extent_start; + copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, + size - extent_offset); + em->start = extent_start + extent_offset; + em->len = (copy_size + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + em->orig_start = EXTENT_MAP_INLINE; + if (compressed) + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + ptr = btrfs_file_extent_inline_start(item) + extent_offset; + if (create == 0 && !PageUptodate(page)) { + if (btrfs_file_extent_compression(leaf, item) == + BTRFS_COMPRESS_ZLIB) { + ret = uncompress_inline(path, inode, page, + pg_offset, + extent_offset, item); + BUG_ON(ret); + } else { + map = kmap(page); + read_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + if (pg_offset + copy_size < PAGE_CACHE_SIZE) { + memset(map + pg_offset + copy_size, 0, + PAGE_CACHE_SIZE - pg_offset - + copy_size); + } + kunmap(page); + } + flush_dcache_page(page); + } else if (create && PageUptodate(page)) { + if (!trans) { + kunmap(page); + free_extent_map(em); + em = NULL; + btrfs_release_path(root, path); + trans = btrfs_join_transaction(root, 1); + goto again; + } + map = kmap(page); + write_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + kunmap(page); + btrfs_mark_buffer_dirty(leaf); + } + set_extent_uptodate(io_tree, em->start, + extent_map_end(em) - 1, GFP_NOFS); + goto insert; + } else { + printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); + WARN_ON(1); + } +not_found: + em->start = start; + em->len = len; +not_found_em: + em->block_start = EXTENT_MAP_HOLE; + set_bit(EXTENT_FLAG_VACANCY, &em->flags); +insert: + btrfs_release_path(root, path); + if (em->start > start || extent_map_end(em) <= start) { + printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " + "[%llu %llu]\n", (unsigned long long)em->start, + (unsigned long long)em->len, + (unsigned long long)start, + (unsigned long long)len); + err = -EIO; + goto out; + } + + err = 0; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + /* it is possible that someone inserted the extent into the tree + * while we had the lock dropped. It is also possible that + * an overlapping map exists in the tree + */ + if (ret == -EEXIST) { + struct extent_map *existing; + + ret = 0; + + existing = lookup_extent_mapping(em_tree, start, len); + if (existing && (existing->start > start || + existing->start + existing->len <= start)) { + free_extent_map(existing); + existing = NULL; + } + if (!existing) { + existing = lookup_extent_mapping(em_tree, em->start, + em->len); + if (existing) { + err = merge_extent_mapping(em_tree, existing, + em, start, + root->sectorsize); + free_extent_map(existing); + if (err) { + free_extent_map(em); + em = NULL; + } + } else { + err = -EIO; + free_extent_map(em); + em = NULL; + } + } else { + free_extent_map(em); + em = existing; + err = 0; + } + } + write_unlock(&em_tree->lock); +out: + if (path) + btrfs_free_path(path); + if (trans) { + ret = btrfs_end_transaction(trans, root); + if (!err) + err = ret; + } + if (err) { + free_extent_map(em); + return ERR_PTR(err); + } + return em; +} + +static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + return -EINVAL; +} + +static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); +} + +int btrfs_readpage(struct file *file, struct page *page) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_read_full_page(tree, page, btrfs_get_extent); +} + +static int btrfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + + + if (current->flags & PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_write_full_page(tree, page, btrfs_get_extent, wbc); +} + +int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + + tree = &BTRFS_I(mapping->host)->io_tree; + return extent_writepages(tree, mapping, btrfs_get_extent, wbc); +} + +static int +btrfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; + return extent_readpages(tree, mapping, pages, nr_pages, + btrfs_get_extent); +} +static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *map; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + ret = try_release_extent_mapping(map, tree, page, gfp_flags); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } + return ret; +} + +static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) +{ + if (PageWriteback(page) || PageDirty(page)) + return 0; + return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); +} + +static void btrfs_invalidatepage(struct page *page, unsigned long offset) +{ + struct extent_io_tree *tree; + struct btrfs_ordered_extent *ordered; + u64 page_start = page_offset(page); + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + + + /* + * we have the page locked, so new writeback can't start, + * and the dirty bit won't be cleared while we are here. + * + * Wait for IO on this page so that we can safely clear + * the PagePrivate2 bit and do ordered accounting + */ + wait_on_page_writeback(page); + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (offset) { + btrfs_releasepage(page, GFP_NOFS); + return; + } + lock_extent(tree, page_start, page_end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(page->mapping->host, + page_offset(page)); + if (ordered) { + /* + * IO on this page will never be started, so we need + * to account for any ordered extents now + */ + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, + NULL, GFP_NOFS); + /* + * whoever cleared the private bit is responsible + * for the finish_ordered_io + */ + if (TestClearPagePrivate2(page)) { + btrfs_finish_ordered_io(page->mapping->host, + page_start, page_end); + } + btrfs_put_ordered_extent(ordered); + lock_extent(tree, page_start, page_end, GFP_NOFS); + } + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); + __btrfs_releasepage(page, GFP_NOFS); + + ClearPageChecked(page); + if (PagePrivate(page)) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } +} + +/* + * btrfs_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. Because + * vmtruncate() writes the inode size before removing pages, once we have the + * page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + */ +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = fdentry(vma->vm_file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + char *kaddr; + unsigned long zero_start; + loff_t size; + int ret; + u64 page_start; + u64 page_end; + + ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); + if (ret) { + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else /* -ENOSPC, -EIO, etc */ + ret = VM_FAULT_SIGBUS; + goto out; + } + + ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (ret) { + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + ret = VM_FAULT_SIGBUS; + goto out; + } + + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ +again: + lock_page(page); + size = i_size_read(inode); + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + if ((page->mapping != inode->i_mapping) || + (page_start >= size)) { + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + /* page got truncated out from underneath us */ + goto out_unlock; + } + wait_on_page_writeback(page); + + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_extent_mapped(page); + + /* + * we can't set the delalloc bits if there are pending ordered + * extents. Drop our locks and wait for them to finish + */ + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + + /* + * XXX - page_mkwrite gets called every time the page is dirtied, even + * if it was already dirty, so for space accounting reasons we need to + * clear any delalloc bits for the range we are fixing to save. There + * is probably a better way to do this, but for now keep consistent with + * prepare_pages in the normal write path. + */ + clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + GFP_NOFS); + + ret = btrfs_set_extent_delalloc(inode, page_start, page_end); + if (ret) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + ret = VM_FAULT_SIGBUS; + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + goto out_unlock; + } + ret = 0; + + /* page is wholly or partially inside EOF */ + if (page_start + PAGE_CACHE_SIZE > size) + zero_start = size & ~PAGE_CACHE_MASK; + else + zero_start = PAGE_CACHE_SIZE; + + if (zero_start != PAGE_CACHE_SIZE) { + kaddr = kmap(page); + memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); + flush_dcache_page(page); + kunmap(page); + } + ClearPageChecked(page); + set_page_dirty(page); + SetPageUptodate(page); + + BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + +out_unlock: + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + if (!ret) + return VM_FAULT_LOCKED; + unlock_page(page); +out: + return ret; +} + +static void btrfs_truncate(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + struct btrfs_trans_handle *trans; + unsigned long nr; + u64 mask = root->sectorsize - 1; + + if (!S_ISREG(inode->i_mode)) { + WARN_ON(1); + return; + } + + ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (ret) + return; + + btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + /* + * setattr is responsible for setting the ordered_data_close flag, + * but that is only tested during the last file release. That + * could happen well after the next commit, leaving a great big + * window where new writes may get lost if someone chooses to write + * to this file after truncating to zero + * + * The inode doesn't have any dirty data here, and so if we commit + * this is a noop. If someone immediately starts writing to the inode + * it is very likely we'll catch some of their writes in this + * transaction, and the commit will find this file on the ordered + * data list with good things to send down. + * + * This is a best effort solution, there is still a window where + * using truncate to replace the contents of the file will + * end up with a zero length file after a crash. + */ + if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) + btrfs_add_ordered_operation(trans, root, inode); + + while (1) { + ret = btrfs_truncate_inode_items(trans, root, inode, + inode->i_size, + BTRFS_EXTENT_DATA_KEY); + if (ret != -EAGAIN) + break; + + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + } + + if (ret == 0 && inode->i_nlink > 0) { + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + } + + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + BUG_ON(ret); + btrfs_btree_balance_dirty(root, nr); +} + +/* + * create a new subvolume directory/inode (helper for the ioctl). + */ +int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + struct btrfs_root *new_root, + u64 new_dirid, u64 alloc_hint) +{ + struct inode *inode; + int err; + u64 index = 0; + + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, + new_dirid, alloc_hint, S_IFDIR | 0700, &index); + if (IS_ERR(inode)) + return PTR_ERR(inode); + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + + inode->i_nlink = 1; + btrfs_i_size_write(inode, 0); + + err = btrfs_update_inode(trans, new_root, inode); + BUG_ON(err); + + iput(inode); + return 0; +} + +/* helper function for file defrag and space balancing. This + * forces readahead on a given range of bytes in an inode + */ +unsigned long btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, pgoff_t last_index) +{ + pgoff_t req_size = last_index - offset + 1; + + page_cache_sync_readahead(mapping, ra, file, offset, req_size); + return offset + req_size; +} + +struct inode *btrfs_alloc_inode(struct super_block *sb) +{ + struct btrfs_inode *ei; + + ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + ei->last_trans = 0; + ei->last_sub_trans = 0; + ei->logged_trans = 0; + ei->outstanding_extents = 0; + ei->reserved_extents = 0; + ei->root = NULL; + spin_lock_init(&ei->accounting_lock); + btrfs_ordered_inode_tree_init(&ei->ordered_tree); + INIT_LIST_HEAD(&ei->i_orphan); + INIT_LIST_HEAD(&ei->ordered_operations); + return &ei->vfs_inode; +} + +void btrfs_destroy_inode(struct inode *inode) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + + WARN_ON(!list_empty(&inode->i_dentry)); + WARN_ON(inode->i_data.nrpages); + + /* + * This can happen where we create an inode, but somebody else also + * created the same inode and we need to destroy the one we already + * created. + */ + if (!root) + goto free; + + /* + * Make sure we're properly removed from the ordered operation + * lists. + */ + smp_mb(); + if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { + spin_lock(&root->fs_info->ordered_extent_lock); + list_del_init(&BTRFS_I(inode)->ordered_operations); + spin_unlock(&root->fs_info->ordered_extent_lock); + } + + spin_lock(&root->list_lock); + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", + inode->i_ino); + list_del_init(&BTRFS_I(inode)->i_orphan); + } + spin_unlock(&root->list_lock); + + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); + if (!ordered) + break; + else { + printk(KERN_ERR "btrfs found ordered " + "extent %llu %llu on inode cleanup\n", + (unsigned long long)ordered->file_offset, + (unsigned long long)ordered->len); + btrfs_remove_ordered_extent(inode, ordered); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + } + inode_tree_del(inode); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); +free: + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} + +void btrfs_drop_inode(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0) + generic_delete_inode(inode); + else + generic_drop_inode(inode); +} + +static void init_once(void *foo) +{ + struct btrfs_inode *ei = (struct btrfs_inode *) foo; + + inode_init_once(&ei->vfs_inode); +} + +void btrfs_destroy_cachep(void) +{ + if (btrfs_inode_cachep) + kmem_cache_destroy(btrfs_inode_cachep); + if (btrfs_trans_handle_cachep) + kmem_cache_destroy(btrfs_trans_handle_cachep); + if (btrfs_transaction_cachep) + kmem_cache_destroy(btrfs_transaction_cachep); + if (btrfs_path_cachep) + kmem_cache_destroy(btrfs_path_cachep); +} + +int btrfs_init_cachep(void) +{ + btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", + sizeof(struct btrfs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); + if (!btrfs_inode_cachep) + goto fail; + + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", + sizeof(struct btrfs_trans_handle), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_trans_handle_cachep) + goto fail; + + btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", + sizeof(struct btrfs_transaction), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_transaction_cachep) + goto fail; + + btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", + sizeof(struct btrfs_path), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_path_cachep) + goto fail; + + return 0; +fail: + btrfs_destroy_cachep(); + return -ENOMEM; +} + +static int btrfs_getattr(struct vfsmount *mnt, + struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + generic_fillattr(inode, stat); + stat->dev = BTRFS_I(inode)->root->anon_super.s_dev; + stat->blksize = PAGE_CACHE_SIZE; + stat->blocks = (inode_get_bytes(inode) + + BTRFS_I(inode)->delalloc_bytes) >> 9; + return 0; +} + +static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; + struct btrfs_root *dest = BTRFS_I(new_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec ctime = CURRENT_TIME; + u64 index = 0; + u64 root_objectid; + int ret; + + if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return -EPERM; + + /* we only allow rename subvolume link between subvolumes */ + if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + return -EXDEV; + + if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || + (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) + return -ENOTEMPTY; + + if (S_ISDIR(old_inode->i_mode) && new_inode && + new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) + return -ENOTEMPTY; + + /* + * We want to reserve the absolute worst case amount of items. So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items + * should cover the worst case number of items we'll modify. + */ + ret = btrfs_reserve_metadata_space(root, 11); + if (ret) + return ret; + + /* + * we're using rename to replace one file with another. + * and the replacement file is large. Start IO on it now so + * we don't add too much work to the end of the transaction + */ + if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && + old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + filemap_flush(old_inode->i_mapping); + + /* close the racy window with snapshot create/destroy ioctl */ + if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&root->fs_info->subvol_sem); + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, new_dir); + + if (dest != root) + btrfs_record_root_in_trans(trans, dest); + + ret = btrfs_set_inode_index(new_dir, &index); + if (ret) + goto out_fail; + + if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { + /* force full log commit if subvolume involved. */ + root->fs_info->last_trans_log_full_commit = trans->transid; + } else { + ret = btrfs_insert_inode_ref(trans, dest, + new_dentry->d_name.name, + new_dentry->d_name.len, + old_inode->i_ino, + new_dir->i_ino, index); + if (ret) + goto out_fail; + /* + * this is an ugly little race, but the rename is required + * to make sure that if we crash, the inode is either at the + * old name or the new one. pinning the log transaction lets + * us make sure we don't allow a log commit to come in after + * we unlink the name but before we add the new name back in. + */ + btrfs_pin_log_trans(root); + } + /* + * make sure the inode gets flushed if it is replacing + * something. + */ + if (new_inode && new_inode->i_size && + old_inode && S_ISREG(old_inode->i_mode)) { + btrfs_add_ordered_operation(trans, root, old_inode); + } + + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; + + if (old_dentry->d_parent != new_dentry->d_parent) + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + + if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { + root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, + old_dentry->d_name.name, + old_dentry->d_name.len); + } else { + btrfs_inc_nlink(old_dentry->d_inode); + ret = btrfs_unlink_inode(trans, root, old_dir, + old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + } + BUG_ON(ret); + + if (new_inode) { + new_inode->i_ctime = CURRENT_TIME; + if (unlikely(new_inode->i_ino == + BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + root_objectid = BTRFS_I(new_inode)->location.objectid; + ret = btrfs_unlink_subvol(trans, dest, new_dir, + root_objectid, + new_dentry->d_name.name, + new_dentry->d_name.len); + BUG_ON(new_inode->i_nlink == 0); + } else { + ret = btrfs_unlink_inode(trans, dest, new_dir, + new_dentry->d_inode, + new_dentry->d_name.name, + new_dentry->d_name.len); + } + BUG_ON(ret); + if (new_inode->i_nlink == 0) { + ret = btrfs_orphan_add(trans, new_dentry->d_inode); + BUG_ON(ret); + } + } + + ret = btrfs_add_link(trans, new_dir, old_inode, + new_dentry->d_name.name, + new_dentry->d_name.len, 0, index); + BUG_ON(ret); + + if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { + btrfs_log_new_name(trans, old_inode, old_dir, + new_dentry->d_parent); + btrfs_end_log_trans(root); + } +out_fail: + btrfs_end_transaction_throttle(trans, root); + + if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&root->fs_info->subvol_sem); + + btrfs_unreserve_metadata_space(root, 11); + return ret; +} + +/* + * some fairly slow code that needs optimization. This walks the list + * of all the inodes with pending delalloc and forces them to disk. + */ +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +{ + struct list_head *head = &root->fs_info->delalloc_inodes; + struct btrfs_inode *binode; + struct inode *inode; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + spin_lock(&root->fs_info->delalloc_lock); + while (!list_empty(head)) { + binode = list_entry(head->next, struct btrfs_inode, + delalloc_inodes); + inode = igrab(&binode->vfs_inode); + if (!inode) + list_del_init(&binode->delalloc_inodes); + spin_unlock(&root->fs_info->delalloc_lock); + if (inode) { + filemap_flush(inode->i_mapping); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); + } + cond_resched(); + spin_lock(&root->fs_info->delalloc_lock); + } + spin_unlock(&root->fs_info->delalloc_lock); + + /* the filemap_flush will queue IO into the worker threads, but + * we have to make sure the IO is actually started and that + * ordered extents get created before we return + */ + atomic_inc(&root->fs_info->async_submit_draining); + while (atomic_read(&root->fs_info->nr_async_submits) || + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->nr_async_submits) == 0 && + atomic_read(&root->fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&root->fs_info->async_submit_draining); + return 0; +} + +static int btrfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct btrfs_key key; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + u64 objectid; + u64 index = 0 ; + int name_len; + int datasize; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + struct extent_buffer *leaf; + unsigned long nr = 0; + + name_len = strlen(symname) + 1; + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) + return -ENAMETOOLONG; + + /* + * 2 items for inode item and ref + * 2 items for dir items + * 1 item for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); + if (err) + return err; + + trans = btrfs_start_transaction(root, 1); + if (!trans) + goto out_fail; + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, objectid, + BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, + &index); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_unlock; + + err = btrfs_init_inode_security(trans, inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + btrfs_set_trans_block_group(trans, inode); + err = btrfs_add_nondir(trans, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + } + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); + if (drop_inode) + goto out_unlock; + + path = btrfs_alloc_path(); + BUG_ON(!path); + key.objectid = inode->i_ino; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + datasize = btrfs_file_extent_calc_inline_size(name_len); + err = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + if (err) { + drop_inode = 1; + goto out_unlock; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, + BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_compression(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); + + ptr = btrfs_file_extent_inline_start(ei); + write_extent_buffer(leaf, symname, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode_set_bytes(inode, name_len); + btrfs_i_size_write(inode, name_len - 1); + err = btrfs_update_inode(trans, root, inode); + if (err) + drop_inode = 1; + +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +out_fail: + btrfs_unreserve_metadata_space(root, 5); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int prealloc_file_range(struct inode *inode, u64 start, u64 end, + u64 alloc_hint, int mode, loff_t actual_len) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key ins; + u64 alloc_size; + u64 cur_offset = start; + u64 num_bytes = end - start; + int ret = 0; + u64 i_size; + + while (num_bytes > 0) { + alloc_size = min(num_bytes, root->fs_info->max_extent); + + trans = btrfs_start_transaction(root, 1); + + ret = btrfs_reserve_extent(trans, root, alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); + if (ret) { + WARN_ON(1); + goto stop_trans; + } + + ret = btrfs_reserve_metadata_space(root, 3); + if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, + ins.offset); + goto stop_trans; + } + + ret = insert_reserved_file_extent(trans, inode, + cur_offset, ins.objectid, + ins.offset, ins.offset, + ins.offset, 0, 0, 0, + BTRFS_FILE_EXTENT_PREALLOC); + BUG_ON(ret); + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + ins.offset -1, 0); + + num_bytes -= ins.offset; + cur_offset += ins.offset; + alloc_hint = ins.objectid + ins.offset; + + inode->i_ctime = CURRENT_TIME; + BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (actual_len > inode->i_size) && + (cur_offset > inode->i_size)) { + + if (cur_offset > actual_len) + i_size = actual_len; + else + i_size = cur_offset; + i_size_write(inode, i_size); + btrfs_ordered_update_i_size(inode, i_size, NULL); + } + + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + + btrfs_end_transaction(trans, root); + btrfs_unreserve_metadata_space(root, 3); + } + return ret; + +stop_trans: + btrfs_end_transaction(trans, root); + return ret; + +} + +static long btrfs_fallocate(struct inode *inode, int mode, + loff_t offset, loff_t len) +{ + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 locked_end; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + struct extent_map *em; + int ret; + + alloc_start = offset & ~mask; + alloc_end = (offset + len + mask) & ~mask; + + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + + mutex_lock(&inode->i_mutex); + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(inode, alloc_start); + if (ret) + goto out; + } + + ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, + alloc_end - alloc_start); + if (ret) + goto out; + + locked_end = alloc_end - 1; + while (1) { + struct btrfs_ordered_extent *ordered; + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + alloc_end - 1); + if (ordered && + ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset < alloc_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent(&BTRFS_I(inode)->io_tree, + alloc_start, locked_end, GFP_NOFS); + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ + btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + } else { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + } + + cur_offset = alloc_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + alloc_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), alloc_end); + last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE || + (cur_offset >= inode->i_size && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + ret = prealloc_file_range(inode, + cur_offset, last_byte, + alloc_hint, mode, offset+len); + if (ret < 0) { + free_extent_map(em); + break; + } + } + if (em->block_start <= EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + free_extent_map(em); + + cur_offset = last_byte; + if (cur_offset >= alloc_end) { + ret = 0; + break; + } + } + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + GFP_NOFS); + + btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, + alloc_end - alloc_start); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +static int btrfs_set_page_dirty(struct page *page) +{ + return __set_page_dirty_nobuffers(page); +} + +static int btrfs_permission(struct inode *inode, int mask) +{ + if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) + return -EACCES; + return generic_permission(inode, mask, btrfs_check_acl); +} + +static const struct inode_operations btrfs_dir_inode_operations = { + .getattr = btrfs_getattr, + .lookup = btrfs_lookup, + .create = btrfs_create, + .unlink = btrfs_unlink, + .link = btrfs_link, + .mkdir = btrfs_mkdir, + .rmdir = btrfs_rmdir, + .rename = btrfs_rename, + .symlink = btrfs_symlink, + .setattr = btrfs_setattr, + .mknod = btrfs_mknod, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, +}; +static const struct inode_operations btrfs_dir_ro_inode_operations = { + .lookup = btrfs_lookup, + .permission = btrfs_permission, +}; + +static const struct file_operations btrfs_dir_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = btrfs_real_readdir, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_ioctl, +#endif + .release = btrfs_release_file, + .fsync = btrfs_sync_file, +}; + +static struct extent_io_ops btrfs_extent_io_ops = { + .fill_delalloc = run_delalloc_range, + .submit_bio_hook = btrfs_submit_bio_hook, + .merge_bio_hook = btrfs_merge_bio_hook, + .readpage_end_io_hook = btrfs_readpage_end_io_hook, + .writepage_end_io_hook = btrfs_writepage_end_io_hook, + .writepage_start_hook = btrfs_writepage_start_hook, + .readpage_io_failed_hook = btrfs_io_failed_hook, + .set_bit_hook = btrfs_set_bit_hook, + .clear_bit_hook = btrfs_clear_bit_hook, + .merge_extent_hook = btrfs_merge_extent_hook, + .split_extent_hook = btrfs_split_extent_hook, +}; + +/* + * btrfs doesn't support the bmap operation because swapfiles + * use bmap to make a mapping of extents in the file. They assume + * these extents won't change over the life of the file and they + * use the bmap result to do IO directly to the drive. + * + * the btrfs bmap call would return logical addresses that aren't + * suitable for IO and they also will change frequently as COW + * operations happen. So, swapfile + btrfs == corruption. + * + * For now we're avoiding this by dropping bmap. + */ +static const struct address_space_operations btrfs_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, + .writepages = btrfs_writepages, + .readpages = btrfs_readpages, + .sync_page = block_sync_page, + .direct_IO = btrfs_direct_IO, + .invalidatepage = btrfs_invalidatepage, + .releasepage = btrfs_releasepage, + .set_page_dirty = btrfs_set_page_dirty, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations btrfs_symlink_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, + .invalidatepage = btrfs_invalidatepage, + .releasepage = btrfs_releasepage, +}; + +static const struct inode_operations btrfs_file_inode_operations = { + .truncate = btrfs_truncate, + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, + .fallocate = btrfs_fallocate, + .fiemap = btrfs_fiemap, +}; +static const struct inode_operations btrfs_special_inode_operations = { + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .permission = btrfs_permission, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, +}; +static const struct inode_operations btrfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .permission = btrfs_permission, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, +}; + +const struct dentry_operations btrfs_dentry_operations = { + .d_delete = btrfs_dentry_delete, +}; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c new file mode 100644 index 00000000000..2c6ee6aacb2 --- /dev/null +++ b/fs/btrfs/ioctl.c @@ -0,0 +1,1352 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/fsnotify.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/backing-dev.h> +#include <linux/mount.h> +#include <linux/mpage.h> +#include <linux/namei.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/bit_spinlock.h> +#include <linux/security.h> +#include <linux/xattr.h> +#include <linux/vmalloc.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "volumes.h" +#include "locking.h" + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & ~FS_DIRSYNC_FL; + else + return flags & (FS_NODUMP_FL | FS_NOATIME_FL); +} + +/* + * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl. + */ +static unsigned int btrfs_flags_to_ioctl(unsigned int flags) +{ + unsigned int iflags = 0; + + if (flags & BTRFS_INODE_SYNC) + iflags |= FS_SYNC_FL; + if (flags & BTRFS_INODE_IMMUTABLE) + iflags |= FS_IMMUTABLE_FL; + if (flags & BTRFS_INODE_APPEND) + iflags |= FS_APPEND_FL; + if (flags & BTRFS_INODE_NODUMP) + iflags |= FS_NODUMP_FL; + if (flags & BTRFS_INODE_NOATIME) + iflags |= FS_NOATIME_FL; + if (flags & BTRFS_INODE_DIRSYNC) + iflags |= FS_DIRSYNC_FL; + + return iflags; +} + +/* + * Update inode->i_flags based on the btrfs internal flags. + */ +void btrfs_update_iflags(struct inode *inode) +{ + struct btrfs_inode *ip = BTRFS_I(inode); + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + + if (ip->flags & BTRFS_INODE_SYNC) + inode->i_flags |= S_SYNC; + if (ip->flags & BTRFS_INODE_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + if (ip->flags & BTRFS_INODE_APPEND) + inode->i_flags |= S_APPEND; + if (ip->flags & BTRFS_INODE_NOATIME) + inode->i_flags |= S_NOATIME; + if (ip->flags & BTRFS_INODE_DIRSYNC) + inode->i_flags |= S_DIRSYNC; +} + +/* + * Inherit flags from the parent inode. + * + * Unlike extN we don't have any flags we don't want to inherit currently. + */ +void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) +{ + unsigned int flags; + + if (!dir) + return; + + flags = BTRFS_I(dir)->flags; + + if (S_ISREG(inode->i_mode)) + flags &= ~BTRFS_INODE_DIRSYNC; + else if (!S_ISDIR(inode->i_mode)) + flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); + + BTRFS_I(inode)->flags = flags; + btrfs_update_iflags(inode); +} + +static int btrfs_ioctl_getflags(struct file *file, void __user *arg) +{ + struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); + unsigned int flags = btrfs_flags_to_ioctl(ip->flags); + + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +} + +static int btrfs_ioctl_setflags(struct file *file, void __user *arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct btrfs_inode *ip = BTRFS_I(inode); + struct btrfs_root *root = ip->root; + struct btrfs_trans_handle *trans; + unsigned int flags, oldflags; + int ret; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL | FS_DIRSYNC_FL)) + return -EOPNOTSUPP; + + if (!is_owner_or_cap(inode)) + return -EACCES; + + mutex_lock(&inode->i_mutex); + + flags = btrfs_mask_flags(inode->i_mode, flags); + oldflags = btrfs_flags_to_ioctl(ip->flags); + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + ret = -EPERM; + goto out_unlock; + } + } + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + goto out_unlock; + + if (flags & FS_SYNC_FL) + ip->flags |= BTRFS_INODE_SYNC; + else + ip->flags &= ~BTRFS_INODE_SYNC; + if (flags & FS_IMMUTABLE_FL) + ip->flags |= BTRFS_INODE_IMMUTABLE; + else + ip->flags &= ~BTRFS_INODE_IMMUTABLE; + if (flags & FS_APPEND_FL) + ip->flags |= BTRFS_INODE_APPEND; + else + ip->flags &= ~BTRFS_INODE_APPEND; + if (flags & FS_NODUMP_FL) + ip->flags |= BTRFS_INODE_NODUMP; + else + ip->flags &= ~BTRFS_INODE_NODUMP; + if (flags & FS_NOATIME_FL) + ip->flags |= BTRFS_INODE_NOATIME; + else + ip->flags &= ~BTRFS_INODE_NOATIME; + if (flags & FS_DIRSYNC_FL) + ip->flags |= BTRFS_INODE_DIRSYNC; + else + ip->flags &= ~BTRFS_INODE_DIRSYNC; + + + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + + btrfs_update_iflags(inode); + inode->i_ctime = CURRENT_TIME; + btrfs_end_transaction(trans, root); + + mnt_drop_write(file->f_path.mnt); + out_unlock: + mutex_unlock(&inode->i_mutex); + return 0; +} + +static int btrfs_ioctl_getversion(struct file *file, int __user *arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + + return put_user(inode->i_generation, arg); +} + +static noinline int create_subvol(struct btrfs_root *root, + struct dentry *dentry, + char *name, int namelen) +{ + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + struct btrfs_root *new_root; + struct inode *dir = dentry->d_parent->d_inode; + int ret; + int err; + u64 objectid; + u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; + u64 index = 0; + + /* + * 1 - inode item + * 2 - refs + * 1 - root item + * 2 - dir items + */ + ret = btrfs_reserve_metadata_space(root, 6); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root, + 0, &objectid); + if (ret) + goto fail; + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + 0, objectid, NULL, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + goto fail; + } + + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(leaf, objectid); + + write_extent_buffer(leaf, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(leaf), + BTRFS_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + inode_item = &root_item.inode; + memset(inode_item, 0, sizeof(*inode_item)); + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nbytes = cpu_to_le64(root->leafsize); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + btrfs_set_root_bytenr(&root_item, leaf->start); + btrfs_set_root_generation(&root_item, trans->transid); + btrfs_set_root_level(&root_item, 0); + btrfs_set_root_refs(&root_item, 1); + btrfs_set_root_used(&root_item, leaf->len); + btrfs_set_root_last_snapshot(&root_item, 0); + + memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); + root_item.drop_level = 0; + + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + leaf = NULL; + + btrfs_set_root_dirid(&root_item, new_dirid); + + key.objectid = objectid; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, + &root_item); + if (ret) + goto fail; + + key.offset = (u64)-1; + new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); + BUG_ON(IS_ERR(new_root)); + + btrfs_record_root_in_trans(trans, new_root); + + ret = btrfs_create_subvol_root(trans, new_root, new_dirid, + BTRFS_I(dir)->block_group); + /* + * insert the directory item + */ + ret = btrfs_set_inode_index(dir, &index); + BUG_ON(ret); + + ret = btrfs_insert_dir_item(trans, root, + name, namelen, dir->i_ino, &key, + BTRFS_FT_DIR, index); + if (ret) + goto fail; + + btrfs_i_size_write(dir, dir->i_size + namelen * 2); + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + objectid, root->root_key.objectid, + dir->i_ino, index, name, namelen); + + BUG_ON(ret); + + d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); +fail: + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + + btrfs_unreserve_metadata_space(root, 6); + return ret; +} + +static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, + char *name, int namelen) +{ + struct inode *inode; + struct btrfs_pending_snapshot *pending_snapshot; + struct btrfs_trans_handle *trans; + int ret; + + if (!root->ref_cows) + return -EINVAL; + + /* + * 1 - inode item + * 2 - refs + * 1 - root item + * 2 - dir items + */ + ret = btrfs_reserve_metadata_space(root, 6); + if (ret) + goto fail; + + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) { + ret = -ENOMEM; + btrfs_unreserve_metadata_space(root, 6); + goto fail; + } + pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); + if (!pending_snapshot->name) { + ret = -ENOMEM; + kfree(pending_snapshot); + btrfs_unreserve_metadata_space(root, 6); + goto fail; + } + memcpy(pending_snapshot->name, name, namelen); + pending_snapshot->name[namelen] = '\0'; + pending_snapshot->dentry = dentry; + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + pending_snapshot->root = root; + list_add(&pending_snapshot->list, + &trans->transaction->pending_snapshots); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + btrfs_unreserve_metadata_space(root, 6); + + inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto fail; + } + BUG_ON(!inode); + d_instantiate(dentry, inode); + ret = 0; +fail: + return ret; +} + +/* copy of may_create in fs/namei.c() */ +static inline int btrfs_may_create(struct inode *dir, struct dentry *child) +{ + if (child->d_inode) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + return inode_permission(dir, MAY_WRITE | MAY_EXEC); +} + +/* + * Create a new subvolume below @parent. This is largely modeled after + * sys_mkdirat and vfs_mkdir, but we only do a single component lookup + * inside this filesystem so it's quite a bit simpler. + */ +static noinline int btrfs_mksubvol(struct path *parent, + char *name, int namelen, + struct btrfs_root *snap_src) +{ + struct inode *dir = parent->dentry->d_inode; + struct dentry *dentry; + int error; + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + + dentry = lookup_one_len(name, parent->dentry, namelen); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_unlock; + + error = -EEXIST; + if (dentry->d_inode) + goto out_dput; + + error = mnt_want_write(parent->mnt); + if (error) + goto out_dput; + + error = btrfs_may_create(dir, dentry); + if (error) + goto out_drop_write; + + down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); + + if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) + goto out_up_read; + + if (snap_src) { + error = create_snapshot(snap_src, dentry, + name, namelen); + } else { + error = create_subvol(BTRFS_I(dir)->root, dentry, + name, namelen); + } + if (!error) + fsnotify_mkdir(dir, dentry); +out_up_read: + up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); +out_drop_write: + mnt_drop_write(parent->mnt); +out_dput: + dput(dentry); +out_unlock: + mutex_unlock(&dir->i_mutex); + return error; +} + +static int btrfs_defrag_file(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + struct page *page; + unsigned long last_index; + unsigned long ra_pages = root->fs_info->bdi.ra_pages; + unsigned long total_read = 0; + u64 page_start; + u64 page_end; + unsigned long i; + int ret; + + ret = btrfs_check_data_free_space(root, inode, inode->i_size); + if (ret) + return -ENOSPC; + + mutex_lock(&inode->i_mutex); + last_index = inode->i_size >> PAGE_CACHE_SHIFT; + for (i = 0; i <= last_index; i++) { + if (total_read % ra_pages == 0) { + btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, + min(last_index, i + ra_pages - 1)); + } + total_read++; +again: + page = grab_cache_page(inode->i_mapping, i); + if (!page) + goto out_unlock; + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + goto out_unlock; + } + } + + wait_on_page_writeback(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + set_page_extent_mapped(page); + + /* + * this makes sure page_mkwrite is called on the + * page if it is dirtied again later + */ + clear_page_dirty_for_io(page); + + btrfs_set_extent_delalloc(inode, page_start, page_end); + set_page_dirty(page); + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); + } + +out_unlock: + mutex_unlock(&inode->i_mutex); + return 0; +} + +static noinline int btrfs_ioctl_resize(struct btrfs_root *root, + void __user *arg) +{ + u64 new_size; + u64 old_size; + u64 devid = 1; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_trans_handle *trans; + struct btrfs_device *device = NULL; + char *sizestr; + char *devstr = NULL; + int ret = 0; + int namelen; + int mod = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + namelen = strlen(vol_args->name); + + mutex_lock(&root->fs_info->volume_mutex); + sizestr = vol_args->name; + devstr = strchr(sizestr, ':'); + if (devstr) { + char *end; + sizestr = devstr + 1; + *devstr = '\0'; + devstr = vol_args->name; + devid = simple_strtoull(devstr, &end, 10); + printk(KERN_INFO "resizing devid %llu\n", + (unsigned long long)devid); + } + device = btrfs_find_device(root, devid, NULL, NULL); + if (!device) { + printk(KERN_INFO "resizer unable to find device %llu\n", + (unsigned long long)devid); + ret = -EINVAL; + goto out_unlock; + } + if (!strcmp(sizestr, "max")) + new_size = device->bdev->bd_inode->i_size; + else { + if (sizestr[0] == '-') { + mod = -1; + sizestr++; + } else if (sizestr[0] == '+') { + mod = 1; + sizestr++; + } + new_size = btrfs_parse_size(sizestr); + if (new_size == 0) { + ret = -EINVAL; + goto out_unlock; + } + } + + old_size = device->total_bytes; + + if (mod < 0) { + if (new_size > old_size) { + ret = -EINVAL; + goto out_unlock; + } + new_size = old_size - new_size; + } else if (mod > 0) { + new_size = old_size + new_size; + } + + if (new_size < 256 * 1024 * 1024) { + ret = -EINVAL; + goto out_unlock; + } + if (new_size > device->bdev->bd_inode->i_size) { + ret = -EFBIG; + goto out_unlock; + } + + do_div(new_size, root->sectorsize); + new_size *= root->sectorsize; + + printk(KERN_INFO "new size for %s is %llu\n", + device->name, (unsigned long long)new_size); + + if (new_size > old_size) { + trans = btrfs_start_transaction(root, 1); + ret = btrfs_grow_device(trans, device, new_size); + btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_shrink_device(device, new_size); + } + +out_unlock: + mutex_unlock(&root->fs_info->volume_mutex); + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_snap_create(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_ioctl_vol_args *vol_args; + struct file *src_file; + int namelen; + int ret = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + namelen = strlen(vol_args->name); + if (strchr(vol_args->name, '/')) { + ret = -EINVAL; + goto out; + } + + if (subvol) { + ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, + NULL); + } else { + struct inode *src_inode; + src_file = fget(vol_args->fd); + if (!src_file) { + ret = -EINVAL; + goto out; + } + + src_inode = src_file->f_path.dentry->d_inode; + if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { + printk(KERN_INFO "btrfs: Snapshot src from " + "another FS\n"); + ret = -EINVAL; + fput(src_file); + goto out; + } + ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, + BTRFS_I(src_inode)->root); + fput(src_file); + } +out: + kfree(vol_args); + return ret; +} + +/* + * helper to check if the subvolume references other subvolumes + */ +static noinline int may_destroy_subvol(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = root->root_key.objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, + &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + ret = 0; + if (path->slots[0] > 0) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == root->root_key.objectid && + key.type == BTRFS_ROOT_REF_KEY) + ret = -ENOTEMPTY; + } +out: + btrfs_free_path(path); + return ret; +} + +static noinline int btrfs_ioctl_snap_destroy(struct file *file, + void __user *arg) +{ + struct dentry *parent = fdentry(file); + struct dentry *dentry; + struct inode *dir = parent->d_inode; + struct inode *inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *dest = NULL; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_trans_handle *trans; + int namelen; + int ret; + int err = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + namelen = strlen(vol_args->name); + if (strchr(vol_args->name, '/') || + strncmp(vol_args->name, "..", namelen) == 0) { + err = -EINVAL; + goto out; + } + + err = mnt_want_write(file->f_path.mnt); + if (err) + goto out; + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + dentry = lookup_one_len(vol_args->name, parent, namelen); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_unlock_dir; + } + + if (!dentry->d_inode) { + err = -ENOENT; + goto out_dput; + } + + inode = dentry->d_inode; + if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { + err = -EINVAL; + goto out_dput; + } + + dest = BTRFS_I(inode)->root; + + mutex_lock(&inode->i_mutex); + err = d_invalidate(dentry); + if (err) + goto out_unlock; + + down_write(&root->fs_info->subvol_sem); + + err = may_destroy_subvol(dest); + if (err) + goto out_up_write; + + trans = btrfs_start_transaction(root, 1); + ret = btrfs_unlink_subvol(trans, root, dir, + dest->root_key.objectid, + dentry->d_name.name, + dentry->d_name.len); + BUG_ON(ret); + + btrfs_record_root_in_trans(trans, dest); + + memset(&dest->root_item.drop_progress, 0, + sizeof(dest->root_item.drop_progress)); + dest->root_item.drop_level = 0; + btrfs_set_root_refs(&dest->root_item, 0); + + ret = btrfs_insert_orphan_item(trans, + root->fs_info->tree_root, + dest->root_key.objectid); + BUG_ON(ret); + + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + inode->i_flags |= S_DEAD; +out_up_write: + up_write(&root->fs_info->subvol_sem); +out_unlock: + mutex_unlock(&inode->i_mutex); + if (!err) { + shrink_dcache_sb(root->fs_info->sb); + btrfs_invalidate_inodes(dest); + d_delete(dentry); + } +out_dput: + dput(dentry); +out_unlock_dir: + mutex_unlock(&dir->i_mutex); + mnt_drop_write(file->f_path.mnt); +out: + kfree(vol_args); + return err; +} + +static int btrfs_ioctl_defrag(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + btrfs_defrag_root(root, 0); + btrfs_defrag_root(root->fs_info->extent_root, 0); + break; + case S_IFREG: + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EINVAL; + goto out; + } + btrfs_defrag_file(file); + break; + } +out: + mnt_drop_write(file->f_path.mnt); + return ret; +} + +static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_init_new_device(root, vol_args->name); + + kfree(vol_args); + return ret; +} + +static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_rm_device(root, vol_args->name); + + kfree(vol_args); + return ret; +} + +static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct file *src_file; + struct inode *src; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct extent_buffer *leaf; + char *buf; + struct btrfs_key key; + u32 nritems; + int slot; + int ret; + u64 len = olen; + u64 bs = root->fs_info->sb->s_blocksize; + u64 hint_byte; + + /* + * TODO: + * - split compressed inline extents. annoying: we need to + * decompress into destination's address_space (the file offset + * may change, so source mapping won't do), then recompress (or + * otherwise reinsert) a subrange. + * - allow ranges within the same file to be cloned (provided + * they don't overlap)? + */ + + /* the destination must be opened for writing */ + if (!(file->f_mode & FMODE_WRITE)) + return -EINVAL; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + + src_file = fget(srcfd); + if (!src_file) { + ret = -EBADF; + goto out_drop_write; + } + + src = src_file->f_dentry->d_inode; + + ret = -EINVAL; + if (src == inode) + goto out_fput; + + /* the src must be open for reading */ + if (!(src_file->f_mode & FMODE_READ)) + goto out_fput; + + ret = -EISDIR; + if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) + goto out_fput; + + ret = -EXDEV; + if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root) + goto out_fput; + + ret = -ENOMEM; + buf = vmalloc(btrfs_level_size(root, 0)); + if (!buf) + goto out_fput; + + path = btrfs_alloc_path(); + if (!path) { + vfree(buf); + goto out_fput; + } + path->reada = 2; + + if (inode < src) { + mutex_lock(&inode->i_mutex); + mutex_lock(&src->i_mutex); + } else { + mutex_lock(&src->i_mutex); + mutex_lock(&inode->i_mutex); + } + + /* determine range to clone */ + ret = -EINVAL; + if (off >= src->i_size || off + len > src->i_size) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - off; + /* if we extend to eof, continue to block boundary */ + if (off + len == src->i_size) + len = ((src->i_size + bs-1) & ~(bs-1)) + - off; + + /* verify the end result is block aligned */ + if ((off & (bs-1)) || + ((off + len) & (bs-1))) + goto out_unlock; + + /* do any pending delalloc/csum calc on src, one way or + another, and lock file content */ + while (1) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, off+len); + if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) + break; + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + if (ordered) + btrfs_put_ordered_extent(ordered); + btrfs_wait_ordered_range(src, off, off+len); + } + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + /* punch hole in destination first */ + btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1); + + /* clone data */ + key.objectid = src->i_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + + while (1) { + /* + * note the key will change type as we walk through the + * tree. + */ + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + nritems = btrfs_header_nritems(path->nodes[0]); + } + leaf = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || + key.objectid != src->i_ino) + break; + + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + struct btrfs_file_extent_item *extent; + int type; + u32 size; + struct btrfs_key new_key; + u64 disko = 0, diskl = 0; + u64 datao = 0, datal = 0; + u8 comp; + + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + comp = btrfs_file_extent_compression(leaf, extent); + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + disko = btrfs_file_extent_disk_bytenr(leaf, + extent); + diskl = btrfs_file_extent_disk_num_bytes(leaf, + extent); + datao = btrfs_file_extent_offset(leaf, extent); + datal = btrfs_file_extent_num_bytes(leaf, + extent); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* take upper bound, may be compressed */ + datal = btrfs_file_extent_ram_bytes(leaf, + extent); + } + btrfs_release_path(root, path); + + if (key.offset + datal < off || + key.offset >= off+len) + goto next; + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.objectid = inode->i_ino; + new_key.offset = key.offset + destoff - off; + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + if (ret) + goto out; + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; + } + + if (key.offset + datal > off + len) + datal = off + len - key.offset; + + /* disko == 0 means it's a hole */ + if (!disko) + datao = 0; + + btrfs_set_file_extent_offset(leaf, extent, + datao); + btrfs_set_file_extent_num_bytes(leaf, extent, + datal); + if (disko) { + inode_add_bytes(inode, datal); + ret = btrfs_inc_extent_ref(trans, root, + disko, diskl, 0, + root->root_key.objectid, + inode->i_ino, + new_key.offset - datao); + BUG_ON(ret); + } + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 skip = 0; + u64 trim = 0; + if (off > key.offset) { + skip = off - key.offset; + new_key.offset += skip; + } + + if (key.offset + datal > off+len) + trim = key.offset + datal - (off+len); + + if (comp && (skip || trim)) { + ret = -EINVAL; + goto out; + } + size -= skip + trim; + datal -= skip + trim; + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + if (ret) + goto out; + + if (skip) { + u32 start = + btrfs_file_extent_calc_inline_size(0); + memmove(buf+start, buf+start+skip, + datal); + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + inode_add_bytes(inode, datal); + } + + btrfs_mark_buffer_dirty(leaf); + } + +next: + btrfs_release_path(root, path); + key.offset++; + } + ret = 0; +out: + btrfs_release_path(root, path); + if (ret == 0) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (destoff + olen > inode->i_size) + btrfs_i_size_write(inode, destoff + olen); + BTRFS_I(inode)->flags = BTRFS_I(src)->flags; + ret = btrfs_update_inode(trans, root, inode); + } + btrfs_end_transaction(trans, root); + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + if (ret) + vmtruncate(inode, 0); +out_unlock: + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); + vfree(buf); + btrfs_free_path(path); +out_fput: + fput(src_file); +out_drop_write: + mnt_drop_write(file->f_path.mnt); + return ret; +} + +static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) +{ + struct btrfs_ioctl_clone_range_args args; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, + args.src_length, args.dest_offset); +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. + */ +static long btrfs_ioctl_trans_start(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret; + + ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -EINPROGRESS; + if (file->private_data) + goto out; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + goto out; + + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans++; + mutex_unlock(&root->fs_info->trans_mutex); + + ret = -ENOMEM; + trans = btrfs_start_ioctl_transaction(root, 0); + if (!trans) + goto out_drop; + + file->private_data = trans; + return 0; + +out_drop: + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans--; + mutex_unlock(&root->fs_info->trans_mutex); + mnt_drop_write(file->f_path.mnt); +out: + return ret; +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. + */ +long btrfs_ioctl_trans_end(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + + trans = file->private_data; + if (!trans) + return -EINVAL; + file->private_data = NULL; + + btrfs_end_transaction(trans, root); + + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans--; + mutex_unlock(&root->fs_info->trans_mutex); + + mnt_drop_write(file->f_path.mnt); + return 0; +} + +long btrfs_ioctl(struct file *file, unsigned int + cmd, unsigned long arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case FS_IOC_GETFLAGS: + return btrfs_ioctl_getflags(file, argp); + case FS_IOC_SETFLAGS: + return btrfs_ioctl_setflags(file, argp); + case FS_IOC_GETVERSION: + return btrfs_ioctl_getversion(file, argp); + case BTRFS_IOC_SNAP_CREATE: + return btrfs_ioctl_snap_create(file, argp, 0); + case BTRFS_IOC_SUBVOL_CREATE: + return btrfs_ioctl_snap_create(file, argp, 1); + case BTRFS_IOC_SNAP_DESTROY: + return btrfs_ioctl_snap_destroy(file, argp); + case BTRFS_IOC_DEFRAG: + return btrfs_ioctl_defrag(file); + case BTRFS_IOC_RESIZE: + return btrfs_ioctl_resize(root, argp); + case BTRFS_IOC_ADD_DEV: + return btrfs_ioctl_add_dev(root, argp); + case BTRFS_IOC_RM_DEV: + return btrfs_ioctl_rm_dev(root, argp); + case BTRFS_IOC_BALANCE: + return btrfs_balance(root->fs_info->dev_root); + case BTRFS_IOC_CLONE: + return btrfs_ioctl_clone(file, arg, 0, 0, 0); + case BTRFS_IOC_CLONE_RANGE: + return btrfs_ioctl_clone_range(file, argp); + case BTRFS_IOC_TRANS_START: + return btrfs_ioctl_trans_start(file); + case BTRFS_IOC_TRANS_END: + return btrfs_ioctl_trans_end(file); + case BTRFS_IOC_SYNC: + btrfs_sync_fs(file->f_dentry->d_sb, 1); + return 0; + } + + return -ENOTTY; +} diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h new file mode 100644 index 00000000000..bc49914475e --- /dev/null +++ b/fs/btrfs/ioctl.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __IOCTL_ +#define __IOCTL_ +#include <linux/ioctl.h> + +#define BTRFS_IOCTL_MAGIC 0x94 +#define BTRFS_VOL_NAME_MAX 255 +#define BTRFS_PATH_NAME_MAX 4087 + +/* this should be 4k */ +struct btrfs_ioctl_vol_args { + __s64 fd; + char name[BTRFS_PATH_NAME_MAX + 1]; +}; + +struct btrfs_ioctl_clone_range_args { + __s64 src_fd; + __u64 src_offset, src_length; + __u64 dest_offset; +}; + +#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ + struct btrfs_ioctl_vol_args) +/* trans start and trans end are dangerous, and only for + * use by applications that know how to avoid the + * resulting deadlocks + */ +#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) +#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) +#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) + +#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) +#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ + struct btrfs_ioctl_vol_args) + +#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ + struct btrfs_ioctl_clone_range_args) + +#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ + struct btrfs_ioctl_vol_args) +#endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c new file mode 100644 index 00000000000..1c36e5cd8f5 --- /dev/null +++ b/fs/btrfs/locking.c @@ -0,0 +1,234 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/gfp.h> +#include <linux/pagemap.h> +#include <linux/spinlock.h> +#include <linux/page-flags.h> +#include <asm/bug.h> +#include "ctree.h" +#include "extent_io.h" +#include "locking.h" + +static inline void spin_nested(struct extent_buffer *eb) +{ + spin_lock(&eb->lock); +} + +/* + * Setting a lock to blocking will drop the spinlock and set the + * flag that forces other procs who want the lock to wait. After + * this you can safely schedule with the lock held. + */ +void btrfs_set_lock_blocking(struct extent_buffer *eb) +{ + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { + set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); + spin_unlock(&eb->lock); + } + /* exit with the spin lock released and the bit set */ +} + +/* + * clearing the blocking flag will take the spinlock again. + * After this you can't safely schedule + */ +void btrfs_clear_lock_blocking(struct extent_buffer *eb) +{ + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { + spin_nested(eb); + clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); + smp_mb__after_clear_bit(); + } + /* exit with the spin lock held */ +} + +/* + * unfortunately, many of the places that currently set a lock to blocking + * don't end up blocking for very long, and often they don't block + * at all. For a dbench 50 run, if we don't spin on the blocking bit + * at all, the context switch rate can jump up to 400,000/sec or more. + * + * So, we're still stuck with this crummy spin on the blocking bit, + * at least until the most common causes of the short blocks + * can be dealt with. + */ +static int btrfs_spin_on_block(struct extent_buffer *eb) +{ + int i; + + for (i = 0; i < 512; i++) { + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + if (need_resched()) + break; + cpu_relax(); + } + return 0; +} + +/* + * This is somewhat different from trylock. It will take the + * spinlock but if it finds the lock is set to blocking, it will + * return without the lock held. + * + * returns 1 if it was able to take the lock and zero otherwise + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() + */ +int btrfs_try_spin_lock(struct extent_buffer *eb) +{ + int i; + + if (btrfs_spin_on_block(eb)) { + spin_nested(eb); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + spin_unlock(&eb->lock); + } + /* spin for a bit on the BLOCKING flag */ + for (i = 0; i < 2; i++) { + cpu_relax(); + if (!btrfs_spin_on_block(eb)) + break; + + spin_nested(eb); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + spin_unlock(&eb->lock); + } + return 0; +} + +/* + * the autoremove wake function will return 0 if it tried to wake up + * a process that was already awake, which means that process won't + * count as an exclusive wakeup. The waitq code will continue waking + * procs until it finds one that was actually sleeping. + * + * For btrfs, this isn't quite what we want. We want a single proc + * to be notified that the lock is ready for taking. If that proc + * already happen to be awake, great, it will loop around and try for + * the lock. + * + * So, btrfs_wake_function always returns 1, even when the proc that we + * tried to wake up was already awake. + */ +static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + autoremove_wake_function(wait, mode, sync, key); + return 1; +} + +/* + * returns with the extent buffer spinlocked. + * + * This will spin and/or wait as required to take the lock, and then + * return with the spinlock held. + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() + */ +int btrfs_tree_lock(struct extent_buffer *eb) +{ + DEFINE_WAIT(wait); + wait.func = btrfs_wake_function; + + if (!btrfs_spin_on_block(eb)) + goto sleep; + + while(1) { + spin_nested(eb); + + /* nobody is blocking, exit with the spinlock held */ + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 0; + + /* + * we have the spinlock, but the real owner is blocking. + * wait for them + */ + spin_unlock(&eb->lock); + + /* + * spin for a bit, and if the blocking flag goes away, + * loop around + */ + cpu_relax(); + if (btrfs_spin_on_block(eb)) + continue; +sleep: + prepare_to_wait_exclusive(&eb->lock_wq, &wait, + TASK_UNINTERRUPTIBLE); + + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + schedule(); + + finish_wait(&eb->lock_wq, &wait); + } + return 0; +} + +/* + * Very quick trylock, this does not spin or schedule. It returns + * 1 with the spinlock held if it was able to take the lock, or it + * returns zero if it was unable to take the lock. + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() + */ +int btrfs_try_tree_lock(struct extent_buffer *eb) +{ + if (spin_trylock(&eb->lock)) { + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { + /* + * we've got the spinlock, but the real owner is + * blocking. Drop the spinlock and return failure + */ + spin_unlock(&eb->lock); + return 0; + } + return 1; + } + /* someone else has the spinlock giveup */ + return 0; +} + +int btrfs_tree_unlock(struct extent_buffer *eb) +{ + /* + * if we were a blocking owner, we don't have the spinlock held + * just clear the bit and look for waiters + */ + if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + smp_mb__after_clear_bit(); + else + spin_unlock(&eb->lock); + + if (waitqueue_active(&eb->lock_wq)) + wake_up(&eb->lock_wq); + return 0; +} + +void btrfs_assert_tree_locked(struct extent_buffer *eb) +{ + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + assert_spin_locked(&eb->lock); +} diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h new file mode 100644 index 00000000000..6c4ce457168 --- /dev/null +++ b/fs/btrfs/locking.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_LOCKING_ +#define __BTRFS_LOCKING_ + +int btrfs_tree_lock(struct extent_buffer *eb); +int btrfs_tree_unlock(struct extent_buffer *eb); + +int btrfs_try_tree_lock(struct extent_buffer *eb); +int btrfs_try_spin_lock(struct extent_buffer *eb); + +void btrfs_set_lock_blocking(struct extent_buffer *eb); +void btrfs_clear_lock_blocking(struct extent_buffer *eb); +void btrfs_assert_tree_locked(struct extent_buffer *eb); +#endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c new file mode 100644 index 00000000000..5c2a9e78a94 --- /dev/null +++ b/fs/btrfs/ordered-data.c @@ -0,0 +1,830 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include "ctree.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "extent_io.h" + +static u64 entry_end(struct btrfs_ordered_extent *entry) +{ + if (entry->file_offset + entry->len < entry->file_offset) + return (u64)-1; + return entry->file_offset + entry->len; +} + +/* returns NULL if the insertion worked, or it returns the node it did find + * in the tree + */ +static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_ordered_extent *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node); + + if (file_offset < entry->file_offset) + p = &(*p)->rb_left; + else if (file_offset >= entry_end(entry)) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * look for a given offset in the tree, and if it can't be found return the + * first lesser offset + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, + struct rb_node **prev_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *test; + struct btrfs_ordered_extent *entry; + struct btrfs_ordered_extent *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_ordered_extent, rb_node); + prev = n; + prev_entry = entry; + + if (file_offset < entry->file_offset) + n = n->rb_left; + else if (file_offset >= entry_end(entry)) + n = n->rb_right; + else + return n; + } + if (!prev_ret) + return NULL; + + while (prev && file_offset >= entry_end(prev_entry)) { + test = rb_next(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + if (file_offset < entry_end(prev_entry)) + break; + + prev = test; + } + if (prev) + prev_entry = rb_entry(prev, struct btrfs_ordered_extent, + rb_node); + while (prev && file_offset < entry_end(prev_entry)) { + test = rb_prev(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + prev = test; + } + *prev_ret = prev; + return NULL; +} + +/* + * helper to check if a given offset is inside a given entry + */ +static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) +{ + if (file_offset < entry->file_offset || + entry->file_offset + entry->len <= file_offset) + return 0; + return 1; +} + +/* + * look find the first ordered struct that has this offset, otherwise + * the first one less than this offset + */ +static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, + u64 file_offset) +{ + struct rb_root *root = &tree->tree; + struct rb_node *prev; + struct rb_node *ret; + struct btrfs_ordered_extent *entry; + + if (tree->last) { + entry = rb_entry(tree->last, struct btrfs_ordered_extent, + rb_node); + if (offset_in_entry(entry, file_offset)) + return tree->last; + } + ret = __tree_search(root, file_offset, &prev); + if (!ret) + ret = prev; + if (ret) + tree->last = ret; + return ret; +} + +/* allocate and add a new ordered_extent into the per-inode tree. + * file_offset is the logical offset in the file + * + * start is the disk block number of an extent already reserved in the + * extent allocation tree + * + * len is the length of the extent + * + * The tree is given a single reference on the ordered extent that was + * inserted. + */ +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry; + + tree = &BTRFS_I(inode)->ordered_tree; + entry = kzalloc(sizeof(*entry), GFP_NOFS); + if (!entry) + return -ENOMEM; + + mutex_lock(&tree->mutex); + entry->file_offset = file_offset; + entry->start = start; + entry->len = len; + entry->disk_len = disk_len; + entry->bytes_left = len; + entry->inode = inode; + if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) + set_bit(type, &entry->flags); + + /* one ref for the tree */ + atomic_set(&entry->refs, 1); + init_waitqueue_head(&entry->wait); + INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->root_extent_list); + + node = tree_insert(&tree->tree, file_offset, + &entry->rb_node); + BUG_ON(node); + + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_add_tail(&entry->root_extent_list, + &BTRFS_I(inode)->root->fs_info->ordered_extents); + spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + + mutex_unlock(&tree->mutex); + BUG_ON(node); + return 0; +} + +/* + * Add a struct btrfs_ordered_sum into the list of checksums to be inserted + * when an ordered extent is finished. If the list covers more than one + * ordered extent, it is split across multiples. + */ +int btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum) +{ + struct btrfs_ordered_inode_tree *tree; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + list_add_tail(&sum->list, &entry->list); + mutex_unlock(&tree->mutex); + return 0; +} + +/* + * this is used to account for finished IO across a given range + * of the file. The IO should not span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + */ +int btrfs_dec_test_ordered_pending(struct inode *inode, + u64 file_offset, u64 io_size) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + node = tree_search(tree, file_offset); + if (!node) { + ret = 1; + goto out; + } + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, file_offset)) { + ret = 1; + goto out; + } + + if (io_size > entry->bytes_left) { + printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", + (unsigned long long)entry->bytes_left, + (unsigned long long)io_size); + } + entry->bytes_left -= io_size; + if (entry->bytes_left == 0) + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + else + ret = 1; +out: + mutex_unlock(&tree->mutex); + return ret == 0; +} + +/* + * used to drop a reference on an ordered extent. This will free + * the extent if the last reference is dropped + */ +int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) +{ + struct list_head *cur; + struct btrfs_ordered_sum *sum; + + if (atomic_dec_and_test(&entry->refs)) { + while (!list_empty(&entry->list)) { + cur = entry->list.next; + sum = list_entry(cur, struct btrfs_ordered_sum, list); + list_del(&sum->list); + kfree(sum); + } + kfree(entry); + } + return 0; +} + +/* + * remove an ordered extent from the tree. No references are dropped + * and you must wake_up entry->wait. You must hold the tree mutex + * while you call this function. + */ +static int __btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + + tree = &BTRFS_I(inode)->ordered_tree; + node = &entry->rb_node; + rb_erase(node, &tree->tree); + tree->last = NULL; + set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, + inode, 1); + + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_del_init(&entry->root_extent_list); + + /* + * we have no more ordered extents for this inode and + * no dirty pages. We can safely remove it from the + * list of ordered extents + */ + if (RB_EMPTY_ROOT(&tree->tree) && + !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { + list_del_init(&BTRFS_I(inode)->ordered_operations); + } + spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + + return 0; +} + +/* + * remove an ordered extent from the tree. No references are dropped + * but any waiters are woken. + */ +int btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) +{ + struct btrfs_ordered_inode_tree *tree; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + ret = __btrfs_remove_ordered_extent(inode, entry); + mutex_unlock(&tree->mutex); + wake_up(&entry->wait); + + return ret; +} + +/* + * wait for all the ordered extents in a root. This is done when balancing + * space between drives. + */ +int btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput) +{ + struct list_head splice; + struct list_head *cur; + struct btrfs_ordered_extent *ordered; + struct inode *inode; + + INIT_LIST_HEAD(&splice); + + spin_lock(&root->fs_info->ordered_extent_lock); + list_splice_init(&root->fs_info->ordered_extents, &splice); + while (!list_empty(&splice)) { + cur = splice.next; + ordered = list_entry(cur, struct btrfs_ordered_extent, + root_extent_list); + if (nocow_only && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { + list_move(&ordered->root_extent_list, + &root->fs_info->ordered_extents); + cond_resched_lock(&root->fs_info->ordered_extent_lock); + continue; + } + + list_del_init(&ordered->root_extent_list); + atomic_inc(&ordered->refs); + + /* + * the inode may be getting freed (in sys_unlink path). + */ + inode = igrab(ordered->inode); + + spin_unlock(&root->fs_info->ordered_extent_lock); + + if (inode) { + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); + } else { + btrfs_put_ordered_extent(ordered); + } + + spin_lock(&root->fs_info->ordered_extent_lock); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + return 0; +} + +/* + * this is used during transaction commit to write all the inodes + * added to the ordered operation list. These files must be fully on + * disk before the transaction commits. + * + * we have two modes here, one is to just start the IO via filemap_flush + * and the other is to wait for all the io. When we wait, we have an + * extra check to make sure the ordered operation list really is empty + * before we return + */ +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +{ + struct btrfs_inode *btrfs_inode; + struct inode *inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + mutex_lock(&root->fs_info->ordered_operations_mutex); + spin_lock(&root->fs_info->ordered_extent_lock); +again: + list_splice_init(&root->fs_info->ordered_operations, &splice); + + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + ordered_operations); + + inode = &btrfs_inode->vfs_inode; + + list_del_init(&btrfs_inode->ordered_operations); + + /* + * the inode may be getting freed (in sys_unlink path). + */ + inode = igrab(inode); + + if (!wait && inode) { + list_add_tail(&BTRFS_I(inode)->ordered_operations, + &root->fs_info->ordered_operations); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + + if (inode) { + if (wait) + btrfs_wait_ordered_range(inode, 0, (u64)-1); + else + filemap_flush(inode->i_mapping); + btrfs_add_delayed_iput(inode); + } + + cond_resched(); + spin_lock(&root->fs_info->ordered_extent_lock); + } + if (wait && !list_empty(&root->fs_info->ordered_operations)) + goto again; + + spin_unlock(&root->fs_info->ordered_extent_lock); + mutex_unlock(&root->fs_info->ordered_operations_mutex); + + return 0; +} + +/* + * Used to start IO or wait for a given ordered extent to finish. + * + * If wait is one, this effectively waits on page writeback for all the pages + * in the extent, and it waits on the io completion code to insert + * metadata into the btree corresponding to the extent + */ +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, + int wait) +{ + u64 start = entry->file_offset; + u64 end = start + entry->len - 1; + + /* + * pages in the range can be dirty, clean or writeback. We + * start IO on any dirty ones so the wait doesn't stall waiting + * for pdflush to find them + */ + filemap_fdatawrite_range(inode->i_mapping, start, end); + if (wait) { + wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, + &entry->flags)); + } +} + +/* + * Used to wait on ordered extents across a large range of bytes. + */ +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +{ + u64 end; + u64 orig_end; + u64 wait_end; + struct btrfs_ordered_extent *ordered; + int found; + + if (start + len < start) { + orig_end = INT_LIMIT(loff_t); + } else { + orig_end = start + len - 1; + if (orig_end > INT_LIMIT(loff_t)) + orig_end = INT_LIMIT(loff_t); + } + wait_end = orig_end; +again: + /* start IO across the range first to instantiate any delalloc + * extents + */ + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + + /* The compression code will leave pages locked but return from + * writepage without setting the page writeback. Starting again + * with WB_SYNC_ALL will end up waiting for the IO to actually start. + */ + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + + filemap_fdatawait_range(inode->i_mapping, start, orig_end); + + end = orig_end; + found = 0; + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, end); + if (!ordered) + break; + if (ordered->file_offset > orig_end) { + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered->file_offset + ordered->len < start) { + btrfs_put_ordered_extent(ordered); + break; + } + found++; + btrfs_start_ordered_extent(inode, ordered, 1); + end = ordered->file_offset; + btrfs_put_ordered_extent(ordered); + if (end == 0 || end == start) + break; + end--; + } + if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, + EXTENT_DELALLOC, 0, NULL)) { + schedule_timeout(1); + goto again; + } + return 0; +} + +/* + * find an ordered extent corresponding to file_offset. return NULL if + * nothing is found, otherwise take a reference on the extent and return it + */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, + u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, file_offset)) + entry = NULL; + if (entry) + atomic_inc(&entry->refs); +out: + mutex_unlock(&tree->mutex); + return entry; +} + +/* + * lookup and return any extent before 'file_offset'. NULL is returned + * if none is found + */ +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + atomic_inc(&entry->refs); +out: + mutex_unlock(&tree->mutex); + return entry; +} + +/* + * After an extent is done, call this to conditionally update the on disk + * i_size. i_size is updated to cover any fully written part of the file. + */ +int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, + struct btrfs_ordered_extent *ordered) +{ + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + u64 disk_i_size; + u64 new_i_size; + u64 i_size_test; + u64 i_size = i_size_read(inode); + struct rb_node *node; + struct rb_node *prev = NULL; + struct btrfs_ordered_extent *test; + int ret = 1; + + if (ordered) + offset = entry_end(ordered); + else + offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); + + mutex_lock(&tree->mutex); + disk_i_size = BTRFS_I(inode)->disk_i_size; + + /* truncate file */ + if (disk_i_size > i_size) { + BTRFS_I(inode)->disk_i_size = i_size; + ret = 0; + goto out; + } + + /* + * if the disk i_size is already at the inode->i_size, or + * this ordered extent is inside the disk i_size, we're done + */ + if (disk_i_size == i_size || offset <= disk_i_size) { + goto out; + } + + /* + * we can't update the disk_isize if there are delalloc bytes + * between disk_i_size and this ordered extent + */ + if (test_range_bit(io_tree, disk_i_size, offset - 1, + EXTENT_DELALLOC, 0, NULL)) { + goto out; + } + /* + * walk backward from this ordered extent to disk_i_size. + * if we find an ordered extent then we can't update disk i_size + * yet + */ + if (ordered) { + node = rb_prev(&ordered->rb_node); + } else { + prev = tree_search(tree, offset); + /* + * we insert file extents without involving ordered struct, + * so there should be no ordered struct cover this offset + */ + if (prev) { + test = rb_entry(prev, struct btrfs_ordered_extent, + rb_node); + BUG_ON(offset_in_entry(test, offset)); + } + node = prev; + } + while (node) { + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (test->file_offset + test->len <= disk_i_size) + break; + if (test->file_offset >= i_size) + break; + if (test->file_offset >= disk_i_size) + goto out; + node = rb_prev(node); + } + new_i_size = min_t(u64, offset, i_size); + + /* + * at this point, we know we can safely update i_size to at least + * the offset from this ordered extent. But, we need to + * walk forward and see if ios from higher up in the file have + * finished. + */ + if (ordered) { + node = rb_next(&ordered->rb_node); + } else { + if (prev) + node = rb_next(prev); + else + node = rb_first(&tree->tree); + } + i_size_test = 0; + if (node) { + /* + * do we have an area where IO might have finished + * between our ordered extent and the next one. + */ + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (test->file_offset > offset) + i_size_test = test->file_offset; + } else { + i_size_test = i_size; + } + + /* + * i_size_test is the end of a region after this ordered + * extent where there are no ordered extents. As long as there + * are no delalloc bytes in this area, it is safe to update + * disk_i_size to the end of the region. + */ + if (i_size_test > offset && + !test_range_bit(io_tree, offset, i_size_test - 1, + EXTENT_DELALLOC, 0, NULL)) { + new_i_size = min_t(u64, i_size_test, i_size); + } + BTRFS_I(inode)->disk_i_size = new_i_size; + ret = 0; +out: + /* + * we need to remove the ordered extent with the tree lock held + * so that other people calling this function don't find our fully + * processed ordered entry and skip updating the i_size + */ + if (ordered) + __btrfs_remove_ordered_extent(inode, ordered); + mutex_unlock(&tree->mutex); + if (ordered) + wake_up(&ordered->wait); + return ret; +} + +/* + * search the ordered extents for one corresponding to 'offset' and + * try to find a checksum. This is used because we allow pages to + * be reclaimed before their checksum is actually put into the btree + */ +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, + u32 *sum) +{ + struct btrfs_ordered_sum *ordered_sum; + struct btrfs_sector_sum *sector_sums; + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + unsigned long num_sectors; + unsigned long i; + u32 sectorsize = BTRFS_I(inode)->root->sectorsize; + int ret = 1; + + ordered = btrfs_lookup_ordered_extent(inode, offset); + if (!ordered) + return 1; + + mutex_lock(&tree->mutex); + list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { + if (disk_bytenr >= ordered_sum->bytenr) { + num_sectors = ordered_sum->len / sectorsize; + sector_sums = ordered_sum->sums; + for (i = 0; i < num_sectors; i++) { + if (sector_sums[i].bytenr == disk_bytenr) { + *sum = sector_sums[i].sum; + ret = 0; + goto out; + } + } + } + } +out: + mutex_unlock(&tree->mutex); + btrfs_put_ordered_extent(ordered); + return ret; +} + + +/* + * add a given inode to the list of inodes that must be fully on + * disk before a transaction commit finishes. + * + * This basically gives us the ext3 style data=ordered mode, and it is mostly + * used to make sure renamed files are fully on disk. + * + * It is a noop if the inode is already fully on disk. + * + * If trans is not null, we'll do a friendly check for a transaction that + * is already flushing things and force the IO down ourselves. + */ +int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + u64 last_mod; + + last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); + + /* + * if this file hasn't been changed since the last transaction + * commit, we can safely return without doing anything + */ + if (last_mod < root->fs_info->last_trans_committed) + return 0; + + /* + * the transaction is already committing. Just start the IO and + * don't bother with all of this list nonsense + */ + if (trans && root->fs_info->running_transaction->blocked) { + btrfs_wait_ordered_range(inode, 0, (u64)-1); + return 0; + } + + spin_lock(&root->fs_info->ordered_extent_lock); + if (list_empty(&BTRFS_I(inode)->ordered_operations)) { + list_add_tail(&BTRFS_I(inode)->ordered_operations, + &root->fs_info->ordered_operations); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + + return 0; +} diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h new file mode 100644 index 00000000000..1fe1282ef47 --- /dev/null +++ b/fs/btrfs/ordered-data.h @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ORDERED_DATA__ +#define __BTRFS_ORDERED_DATA__ + +/* one of these per inode */ +struct btrfs_ordered_inode_tree { + struct mutex mutex; + struct rb_root tree; + struct rb_node *last; +}; + +/* + * these are used to collect checksums done just before bios submission. + * They are attached via a list into the ordered extent, and + * checksum items are inserted into the tree after all the blocks in + * the ordered extent are on disk + */ +struct btrfs_sector_sum { + /* bytenr on disk */ + u64 bytenr; + u32 sum; +}; + +struct btrfs_ordered_sum { + /* bytenr is the start of this extent on disk */ + u64 bytenr; + + /* + * this is the length in bytes covered by the sums array below. + */ + unsigned long len; + struct list_head list; + /* last field is a variable length array of btrfs_sector_sums */ + struct btrfs_sector_sum sums[]; +}; + +/* + * bits for the flags field: + * + * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written. + * It is used to make sure metadata is inserted into the tree only once + * per extent. + * + * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the + * rbtree, just before waking any waiters. It is used to indicate the + * IO is done and any metadata is inserted into the tree. + */ +#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */ + +#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */ + +#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ + +#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ + +#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ + +struct btrfs_ordered_extent { + /* logical offset in the file */ + u64 file_offset; + + /* disk byte number */ + u64 start; + + /* ram length of the extent in bytes */ + u64 len; + + /* extent length on disk */ + u64 disk_len; + + /* number of bytes that still need writing */ + u64 bytes_left; + + /* flags (described above) */ + unsigned long flags; + + /* reference count */ + atomic_t refs; + + /* the inode we belong to */ + struct inode *inode; + + /* list of checksums for insertion when the extent io is done */ + struct list_head list; + + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ + wait_queue_head_t wait; + + /* our friendly rbtree entry */ + struct rb_node rb_node; + + /* a per root list of all the pending ordered extents */ + struct list_head root_extent_list; +}; + + +/* + * calculates the total size you need to allocate for an ordered sum + * structure spanning 'bytes' in the file + */ +static inline int btrfs_ordered_sum_size(struct btrfs_root *root, + unsigned long bytes) +{ + unsigned long num_sectors = (bytes + root->sectorsize - 1) / + root->sectorsize; + num_sectors++; + return sizeof(struct btrfs_ordered_sum) + + num_sectors * sizeof(struct btrfs_sector_sum); +} + +static inline void +btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) +{ + mutex_init(&t->mutex); + t->tree.rb_node = NULL; + t->last = NULL; +} + +int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); +int btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry); +int btrfs_dec_test_ordered_pending(struct inode *inode, + u64 file_offset, u64 io_size); +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int tyep); +int btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum); +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, + u64 file_offset); +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, int wait); +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, + struct btrfs_ordered_extent *ordered); +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); +int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); +int btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput); +#endif diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c new file mode 100644 index 00000000000..79cba5fbc28 --- /dev/null +++ b/fs/btrfs/orphan.c @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" + +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_free_path(path); + return ret; +} + +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + ret = btrfs_del_item(trans, root, path); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c new file mode 100644 index 00000000000..0d126be22b6 --- /dev/null +++ b/fs/btrfs/print-tree.c @@ -0,0 +1,337 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" + +static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) +{ + int num_stripes = btrfs_chunk_num_stripes(eb, chunk); + int i; + printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu " + "num_stripes %d\n", + (unsigned long long)btrfs_chunk_length(eb, chunk), + (unsigned long long)btrfs_chunk_owner(eb, chunk), + (unsigned long long)btrfs_chunk_type(eb, chunk), + num_stripes); + for (i = 0 ; i < num_stripes ; i++) { + printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i, + (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i), + (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i)); + } +} +static void print_dev_item(struct extent_buffer *eb, + struct btrfs_dev_item *dev_item) +{ + printk(KERN_INFO "\t\tdev item devid %llu " + "total_bytes %llu bytes used %llu\n", + (unsigned long long)btrfs_device_id(eb, dev_item), + (unsigned long long)btrfs_device_total_bytes(eb, dev_item), + (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); +} +static void print_extent_data_ref(struct extent_buffer *eb, + struct btrfs_extent_data_ref *ref) +{ + printk(KERN_INFO "\t\textent data backref root %llu " + "objectid %llu offset %llu count %u\n", + (unsigned long long)btrfs_extent_data_ref_root(eb, ref), + (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref), + (unsigned long long)btrfs_extent_data_ref_offset(eb, ref), + btrfs_extent_data_ref_count(eb, ref)); +} + +static void print_extent_item(struct extent_buffer *eb, int slot) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct btrfs_disk_key key; + unsigned long end; + unsigned long ptr; + int type; + u32 item_size = btrfs_item_size_nr(eb, slot); + u64 flags; + u64 offset; + + if (item_size < sizeof(*ei)) { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0); + printk(KERN_INFO "\t\textent refs %u\n", + btrfs_extent_refs_v0(eb, ei0)); + return; +#else + BUG(); +#endif + } + + ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(eb, ei); + + printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n", + (unsigned long long)btrfs_extent_refs(eb, ei), + (unsigned long long)btrfs_extent_generation(eb, ei), + (unsigned long long)flags); + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + info = (struct btrfs_tree_block_info *)(ei + 1); + btrfs_tree_block_key(eb, info, &key); + printk(KERN_INFO "\t\ttree block key (%llu %x %llu) " + "level %d\n", + (unsigned long long)btrfs_disk_key_objectid(&key), + key.type, + (unsigned long long)btrfs_disk_key_offset(&key), + btrfs_tree_block_level(eb, info)); + iref = (struct btrfs_extent_inline_ref *)(info + 1); + } else { + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(eb, iref); + offset = btrfs_extent_inline_ref_offset(eb, iref); + switch (type) { + case BTRFS_TREE_BLOCK_REF_KEY: + printk(KERN_INFO "\t\ttree block backref " + "root %llu\n", (unsigned long long)offset); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + printk(KERN_INFO "\t\tshared block backref " + "parent %llu\n", (unsigned long long)offset); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + print_extent_data_ref(eb, dref); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = (struct btrfs_shared_data_ref *)(iref + 1); + printk(KERN_INFO "\t\tshared data backref " + "parent %llu count %u\n", + (unsigned long long)offset, + btrfs_shared_data_ref_count(eb, sref)); + break; + default: + BUG(); + } + ptr += btrfs_extent_inline_ref_size(type); + } + WARN_ON(ptr > end); +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static void print_extent_ref_v0(struct extent_buffer *eb, int slot) +{ + struct btrfs_extent_ref_v0 *ref0; + + ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0); + printk("\t\textent back ref root %llu gen %llu " + "owner %llu num_refs %lu\n", + (unsigned long long)btrfs_ref_root_v0(eb, ref0), + (unsigned long long)btrfs_ref_generation_v0(eb, ref0), + (unsigned long long)btrfs_ref_objectid_v0(eb, ref0), + (unsigned long)btrfs_ref_count_v0(eb, ref0)); +} +#endif + +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) +{ + int i; + u32 type; + u32 nr = btrfs_header_nritems(l); + struct btrfs_item *item; + struct btrfs_root_item *ri; + struct btrfs_dir_item *di; + struct btrfs_inode_item *ii; + struct btrfs_block_group_item *bi; + struct btrfs_file_extent_item *fi; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct btrfs_dev_extent *dev_extent; + struct btrfs_key key; + struct btrfs_key found_key; + + printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", + (unsigned long long)btrfs_header_bytenr(l), nr, + btrfs_leaf_free_space(root, l)); + for (i = 0 ; i < nr ; i++) { + item = btrfs_item_nr(l, i); + btrfs_item_key_to_cpu(l, &key, i); + type = btrfs_key_type(&key); + printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d " + "itemsize %d\n", + i, + (unsigned long long)key.objectid, type, + (unsigned long long)key.offset, + btrfs_item_offset(l, item), btrfs_item_size(l, item)); + switch (type) { + case BTRFS_INODE_ITEM_KEY: + ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); + printk(KERN_INFO "\t\tinode generation %llu size %llu " + "mode %o\n", + (unsigned long long) + btrfs_inode_generation(l, ii), + (unsigned long long)btrfs_inode_size(l, ii), + btrfs_inode_mode(l, ii)); + break; + case BTRFS_DIR_ITEM_KEY: + di = btrfs_item_ptr(l, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(l, di, &found_key); + printk(KERN_INFO "\t\tdir oid %llu type %u\n", + (unsigned long long)found_key.objectid, + btrfs_dir_type(l, di)); + break; + case BTRFS_ROOT_ITEM_KEY: + ri = btrfs_item_ptr(l, i, struct btrfs_root_item); + printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n", + (unsigned long long) + btrfs_disk_root_bytenr(l, ri), + btrfs_disk_root_refs(l, ri)); + break; + case BTRFS_EXTENT_ITEM_KEY: + print_extent_item(l, i); + break; + case BTRFS_TREE_BLOCK_REF_KEY: + printk(KERN_INFO "\t\ttree block backref\n"); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + printk(KERN_INFO "\t\tshared block backref\n"); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = btrfs_item_ptr(l, i, + struct btrfs_extent_data_ref); + print_extent_data_ref(l, dref); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = btrfs_item_ptr(l, i, + struct btrfs_shared_data_ref); + printk(KERN_INFO "\t\tshared data backref count %u\n", + btrfs_shared_data_ref_count(l, sref)); + break; + case BTRFS_EXTENT_DATA_KEY: + fi = btrfs_item_ptr(l, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(l, fi) == + BTRFS_FILE_EXTENT_INLINE) { + printk(KERN_INFO "\t\tinline extent data " + "size %u\n", + btrfs_file_extent_inline_len(l, fi)); + break; + } + printk(KERN_INFO "\t\textent data disk bytenr %llu " + "nr %llu\n", + (unsigned long long) + btrfs_file_extent_disk_bytenr(l, fi), + (unsigned long long) + btrfs_file_extent_disk_num_bytes(l, fi)); + printk(KERN_INFO "\t\textent data offset %llu " + "nr %llu ram %llu\n", + (unsigned long long) + btrfs_file_extent_offset(l, fi), + (unsigned long long) + btrfs_file_extent_num_bytes(l, fi), + (unsigned long long) + btrfs_file_extent_ram_bytes(l, fi)); + break; + case BTRFS_EXTENT_REF_V0_KEY: +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + print_extent_ref_v0(l, i); +#else + BUG(); +#endif + case BTRFS_BLOCK_GROUP_ITEM_KEY: + bi = btrfs_item_ptr(l, i, + struct btrfs_block_group_item); + printk(KERN_INFO "\t\tblock group used %llu\n", + (unsigned long long) + btrfs_disk_block_group_used(l, bi)); + break; + case BTRFS_CHUNK_ITEM_KEY: + print_chunk(l, btrfs_item_ptr(l, i, + struct btrfs_chunk)); + break; + case BTRFS_DEV_ITEM_KEY: + print_dev_item(l, btrfs_item_ptr(l, i, + struct btrfs_dev_item)); + break; + case BTRFS_DEV_EXTENT_KEY: + dev_extent = btrfs_item_ptr(l, i, + struct btrfs_dev_extent); + printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n" + "\t\tchunk objectid %llu chunk offset %llu " + "length %llu\n", + (unsigned long long) + btrfs_dev_extent_chunk_tree(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_chunk_objectid(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_chunk_offset(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_length(l, dev_extent)); + }; + } +} + +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) +{ + int i; u32 nr; + struct btrfs_key key; + int level; + + if (!c) + return; + nr = btrfs_header_nritems(c); + level = btrfs_header_level(c); + if (level == 0) { + btrfs_print_leaf(root, c); + return; + } + printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", + (unsigned long long)btrfs_header_bytenr(c), + level, nr, + (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); + for (i = 0; i < nr; i++) { + btrfs_node_key_to_cpu(c, &key, i); + printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n", + i, + (unsigned long long)key.objectid, + key.type, + (unsigned long long)key.offset, + (unsigned long long)btrfs_node_blockptr(c, i)); + } + for (i = 0; i < nr; i++) { + struct extent_buffer *next = read_tree_block(root, + btrfs_node_blockptr(c, i), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(c, i)); + if (btrfs_is_leaf(next) && + level != 1) + BUG(); + if (btrfs_header_level(next) != + level - 1) + BUG(); + btrfs_print_tree(root, next); + free_extent_buffer(next); + } +} diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h new file mode 100644 index 00000000000..da75efe534d --- /dev/null +++ b/fs/btrfs/print-tree.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __PRINT_TREE_ +#define __PRINT_TREE_ +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l); +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t); +#endif diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c new file mode 100644 index 00000000000..d0cc62bccb9 --- /dev/null +++ b/fs/btrfs/ref-cache.c @@ -0,0 +1,231 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/sort.h> +#include "ctree.h" +#include "ref-cache.h" +#include "transaction.h" + +/* + * leaf refs are used to cache the information about which extents + * a given leaf has references on. This allows us to process that leaf + * in btrfs_drop_snapshot without needing to read it back from disk. + */ + +/* + * kmalloc a leaf reference struct and update the counters for the + * total ref cache size + */ +struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, + int nr_extents) +{ + struct btrfs_leaf_ref *ref; + size_t size = btrfs_leaf_ref_size(nr_extents); + + ref = kmalloc(size, GFP_NOFS); + if (ref) { + spin_lock(&root->fs_info->ref_cache_lock); + root->fs_info->total_ref_cache_size += size; + spin_unlock(&root->fs_info->ref_cache_lock); + + memset(ref, 0, sizeof(*ref)); + atomic_set(&ref->usage, 1); + INIT_LIST_HEAD(&ref->list); + } + return ref; +} + +/* + * free a leaf reference struct and update the counters for the + * total ref cache size + */ +void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) +{ + if (!ref) + return; + WARN_ON(atomic_read(&ref->usage) == 0); + if (atomic_dec_and_test(&ref->usage)) { + size_t size = btrfs_leaf_ref_size(ref->nritems); + + BUG_ON(ref->in_tree); + kfree(ref); + + spin_lock(&root->fs_info->ref_cache_lock); + root->fs_info->total_ref_cache_size -= size; + spin_unlock(&root->fs_info->ref_cache_lock); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_leaf_ref *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct btrfs_leaf_ref, rb_node); + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n = root->rb_node; + struct btrfs_leaf_ref *entry; + + while (n) { + entry = rb_entry(n, struct btrfs_leaf_ref, rb_node); + WARN_ON(!entry->in_tree); + + if (bytenr < entry->bytenr) + n = n->rb_left; + else if (bytenr > entry->bytenr) + n = n->rb_right; + else + return n; + } + return NULL; +} + +int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, + int shared) +{ + struct btrfs_leaf_ref *ref = NULL; + struct btrfs_leaf_ref_tree *tree = root->ref_tree; + + if (shared) + tree = &root->fs_info->shared_ref_tree; + if (!tree) + return 0; + + spin_lock(&tree->lock); + while (!list_empty(&tree->list)) { + ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list); + BUG_ON(ref->tree != tree); + if (ref->root_gen > max_root_gen) + break; + if (!xchg(&ref->in_tree, 0)) { + cond_resched_lock(&tree->lock); + continue; + } + + rb_erase(&ref->rb_node, &tree->root); + list_del_init(&ref->list); + + spin_unlock(&tree->lock); + btrfs_free_leaf_ref(root, ref); + cond_resched(); + spin_lock(&tree->lock); + } + spin_unlock(&tree->lock); + return 0; +} + +/* + * find the leaf ref for a given extent. This returns the ref struct with + * a usage reference incremented + */ +struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, + u64 bytenr) +{ + struct rb_node *rb; + struct btrfs_leaf_ref *ref = NULL; + struct btrfs_leaf_ref_tree *tree = root->ref_tree; +again: + if (tree) { + spin_lock(&tree->lock); + rb = tree_search(&tree->root, bytenr); + if (rb) + ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node); + if (ref) + atomic_inc(&ref->usage); + spin_unlock(&tree->lock); + if (ref) + return ref; + } + if (tree != &root->fs_info->shared_ref_tree) { + tree = &root->fs_info->shared_ref_tree; + goto again; + } + return NULL; +} + +/* + * add a fully filled in leaf ref struct + * remove all the refs older than a given root generation + */ +int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, + int shared) +{ + int ret = 0; + struct rb_node *rb; + struct btrfs_leaf_ref_tree *tree = root->ref_tree; + + if (shared) + tree = &root->fs_info->shared_ref_tree; + + spin_lock(&tree->lock); + rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node); + if (rb) { + ret = -EEXIST; + } else { + atomic_inc(&ref->usage); + ref->tree = tree; + ref->in_tree = 1; + list_add_tail(&ref->list, &tree->list); + } + spin_unlock(&tree->lock); + return ret; +} + +/* + * remove a single leaf ref from the tree. This drops the ref held by the tree + * only + */ +int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) +{ + struct btrfs_leaf_ref_tree *tree; + + if (!xchg(&ref->in_tree, 0)) + return 0; + + tree = ref->tree; + spin_lock(&tree->lock); + + rb_erase(&ref->rb_node, &tree->root); + list_del_init(&ref->list); + + spin_unlock(&tree->lock); + + btrfs_free_leaf_ref(root, ref); + return 0; +} diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h new file mode 100644 index 00000000000..bc283ad2db7 --- /dev/null +++ b/fs/btrfs/ref-cache.h @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#ifndef __REFCACHE__ +#define __REFCACHE__ + +struct btrfs_extent_info { + /* bytenr and num_bytes find the extent in the extent allocation tree */ + u64 bytenr; + u64 num_bytes; + + /* objectid and offset find the back reference for the file */ + u64 objectid; + u64 offset; +}; + +struct btrfs_leaf_ref { + struct rb_node rb_node; + struct btrfs_leaf_ref_tree *tree; + int in_tree; + atomic_t usage; + + u64 root_gen; + u64 bytenr; + u64 owner; + u64 generation; + int nritems; + + struct list_head list; + struct btrfs_extent_info extents[]; +}; + +static inline size_t btrfs_leaf_ref_size(int nr_extents) +{ + return sizeof(struct btrfs_leaf_ref) + + sizeof(struct btrfs_extent_info) * nr_extents; +} + +static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree) +{ + tree->root.rb_node = NULL; + INIT_LIST_HEAD(&tree->list); + spin_lock_init(&tree->lock); +} + +static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree) +{ + return RB_EMPTY_ROOT(&tree->root); +} + +void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree); +struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, + int nr_extents); +void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); +struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, + u64 bytenr); +int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, + int shared); +int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, + int shared); +int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); +#endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c new file mode 100644 index 00000000000..ab7ab531874 --- /dev/null +++ b/fs/btrfs/relocation.c @@ -0,0 +1,3815 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/rbtree.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "volumes.h" +#include "locking.h" +#include "btrfs_inode.h" +#include "async-thread.h" + +/* + * backref_node, mapping_node and tree_block start with this + */ +struct tree_entry { + struct rb_node rb_node; + u64 bytenr; +}; + +/* + * present a tree block in the backref cache + */ +struct backref_node { + struct rb_node rb_node; + u64 bytenr; + /* objectid tree block owner */ + u64 owner; + /* list of upper level blocks reference this block */ + struct list_head upper; + /* list of child blocks in the cache */ + struct list_head lower; + /* NULL if this node is not tree root */ + struct btrfs_root *root; + /* extent buffer got by COW the block */ + struct extent_buffer *eb; + /* level of tree block */ + unsigned int level:8; + /* 1 if the block is root of old snapshot */ + unsigned int old_root:1; + /* 1 if no child blocks in the cache */ + unsigned int lowest:1; + /* is the extent buffer locked */ + unsigned int locked:1; + /* has the block been processed */ + unsigned int processed:1; + /* have backrefs of this block been checked */ + unsigned int checked:1; +}; + +/* + * present a block pointer in the backref cache + */ +struct backref_edge { + struct list_head list[2]; + struct backref_node *node[2]; + u64 blockptr; +}; + +#define LOWER 0 +#define UPPER 1 + +struct backref_cache { + /* red black tree of all backref nodes in the cache */ + struct rb_root rb_root; + /* list of backref nodes with no child block in the cache */ + struct list_head pending[BTRFS_MAX_LEVEL]; + spinlock_t lock; +}; + +/* + * map address of tree root to tree + */ +struct mapping_node { + struct rb_node rb_node; + u64 bytenr; + void *data; +}; + +struct mapping_tree { + struct rb_root rb_root; + spinlock_t lock; +}; + +/* + * present a tree block to process + */ +struct tree_block { + struct rb_node rb_node; + u64 bytenr; + struct btrfs_key key; + unsigned int level:8; + unsigned int key_ready:1; +}; + +/* inode vector */ +#define INODEVEC_SIZE 16 + +struct inodevec { + struct list_head list; + struct inode *inode[INODEVEC_SIZE]; + int nr; +}; + +#define MAX_EXTENTS 128 + +struct file_extent_cluster { + u64 start; + u64 end; + u64 boundary[MAX_EXTENTS]; + unsigned int nr; +}; + +struct reloc_control { + /* block group to relocate */ + struct btrfs_block_group_cache *block_group; + /* extent tree */ + struct btrfs_root *extent_root; + /* inode for moving data */ + struct inode *data_inode; + struct btrfs_workers workers; + /* tree blocks have been processed */ + struct extent_io_tree processed_blocks; + /* map start of tree root to corresponding reloc tree */ + struct mapping_tree reloc_root_tree; + /* list of reloc trees */ + struct list_head reloc_roots; + u64 search_start; + u64 extents_found; + u64 extents_skipped; + int stage; + int create_reloc_root; + unsigned int found_file_extent:1; + unsigned int found_old_snapshot:1; +}; + +/* stages of data relocation */ +#define MOVE_DATA_EXTENTS 0 +#define UPDATE_DATA_PTRS 1 + +/* + * merge reloc tree to corresponding fs tree in worker threads + */ +struct async_merge { + struct btrfs_work work; + struct reloc_control *rc; + struct btrfs_root *root; + struct completion *done; + atomic_t *num_pending; +}; + +static void mapping_tree_init(struct mapping_tree *tree) +{ + tree->rb_root.rb_node = NULL; + spin_lock_init(&tree->lock); +} + +static void backref_cache_init(struct backref_cache *cache) +{ + int i; + cache->rb_root.rb_node = NULL; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + INIT_LIST_HEAD(&cache->pending[i]); + spin_lock_init(&cache->lock); +} + +static void backref_node_init(struct backref_node *node) +{ + memset(node, 0, sizeof(*node)); + INIT_LIST_HEAD(&node->upper); + INIT_LIST_HEAD(&node->lower); + RB_CLEAR_NODE(&node->rb_node); +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct tree_entry *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n = root->rb_node; + struct tree_entry *entry; + + while (n) { + entry = rb_entry(n, struct tree_entry, rb_node); + + if (bytenr < entry->bytenr) + n = n->rb_left; + else if (bytenr > entry->bytenr) + n = n->rb_right; + else + return n; + } + return NULL; +} + +/* + * walk up backref nodes until reach node presents tree root + */ +static struct backref_node *walk_up_backref(struct backref_node *node, + struct backref_edge *edges[], + int *index) +{ + struct backref_edge *edge; + int idx = *index; + + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, + struct backref_edge, list[LOWER]); + edges[idx++] = edge; + node = edge->node[UPPER]; + } + *index = idx; + return node; +} + +/* + * walk down backref nodes to find start of next reference path + */ +static struct backref_node *walk_down_backref(struct backref_edge *edges[], + int *index) +{ + struct backref_edge *edge; + struct backref_node *lower; + int idx = *index; + + while (idx > 0) { + edge = edges[idx - 1]; + lower = edge->node[LOWER]; + if (list_is_last(&edge->list[LOWER], &lower->upper)) { + idx--; + continue; + } + edge = list_entry(edge->list[LOWER].next, + struct backref_edge, list[LOWER]); + edges[idx - 1] = edge; + *index = idx; + return edge->node[UPPER]; + } + *index = 0; + return NULL; +} + +static void drop_node_buffer(struct backref_node *node) +{ + if (node->eb) { + if (node->locked) { + btrfs_tree_unlock(node->eb); + node->locked = 0; + } + free_extent_buffer(node->eb); + node->eb = NULL; + } +} + +static void drop_backref_node(struct backref_cache *tree, + struct backref_node *node) +{ + BUG_ON(!node->lowest); + BUG_ON(!list_empty(&node->upper)); + + drop_node_buffer(node); + list_del(&node->lower); + + rb_erase(&node->rb_node, &tree->rb_root); + kfree(node); +} + +/* + * remove a backref node from the backref cache + */ +static void remove_backref_node(struct backref_cache *cache, + struct backref_node *node) +{ + struct backref_node *upper; + struct backref_edge *edge; + + if (!node) + return; + + BUG_ON(!node->lowest); + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, struct backref_edge, + list[LOWER]); + upper = edge->node[UPPER]; + list_del(&edge->list[LOWER]); + list_del(&edge->list[UPPER]); + kfree(edge); + /* + * add the node to pending list if no other + * child block cached. + */ + if (list_empty(&upper->lower)) { + list_add_tail(&upper->lower, + &cache->pending[upper->level]); + upper->lowest = 1; + } + } + drop_backref_node(cache, node); +} + +/* + * find reloc tree by address of tree root + */ +static struct btrfs_root *find_reloc_root(struct reloc_control *rc, + u64 bytenr) +{ + struct rb_node *rb_node; + struct mapping_node *node; + struct btrfs_root *root = NULL; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + root = (struct btrfs_root *)node->data; + } + spin_unlock(&rc->reloc_root_tree.lock); + return root; +} + +static int is_cowonly_root(u64 root_objectid) +{ + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || + root_objectid == BTRFS_EXTENT_TREE_OBJECTID || + root_objectid == BTRFS_CHUNK_TREE_OBJECTID || + root_objectid == BTRFS_DEV_TREE_OBJECTID || + root_objectid == BTRFS_TREE_LOG_OBJECTID || + root_objectid == BTRFS_CSUM_TREE_OBJECTID) + return 1; + return 0; +} + +static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid) +{ + struct btrfs_key key; + + key.objectid = root_objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + if (is_cowonly_root(root_objectid)) + key.offset = 0; + else + key.offset = (u64)-1; + + return btrfs_read_fs_root_no_name(fs_info, &key); +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static noinline_for_stack +struct btrfs_root *find_tree_root(struct reloc_control *rc, + struct extent_buffer *leaf, + struct btrfs_extent_ref_v0 *ref0) +{ + struct btrfs_root *root; + u64 root_objectid = btrfs_ref_root_v0(leaf, ref0); + u64 generation = btrfs_ref_generation_v0(leaf, ref0); + + BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID); + + root = read_fs_root(rc->extent_root->fs_info, root_objectid); + BUG_ON(IS_ERR(root)); + + if (root->ref_cows && + generation != btrfs_root_generation(&root->root_item)) + return NULL; + + return root; +} +#endif + +static noinline_for_stack +int find_inline_backref(struct extent_buffer *leaf, int slot, + unsigned long *ptr, unsigned long *end) +{ + struct btrfs_extent_item *ei; + struct btrfs_tree_block_info *bi; + u32 item_size; + + item_size = btrfs_item_size_nr(leaf, slot); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + return 1; + } +#endif + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + WARN_ON(!(btrfs_extent_flags(leaf, ei) & + BTRFS_EXTENT_FLAG_TREE_BLOCK)); + + if (item_size <= sizeof(*ei) + sizeof(*bi)) { + WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); + return 1; + } + + bi = (struct btrfs_tree_block_info *)(ei + 1); + *ptr = (unsigned long)(bi + 1); + *end = (unsigned long)ei + item_size; + return 0; +} + +/* + * build backref tree for a given tree block. root of the backref tree + * corresponds the tree block, leaves of the backref tree correspond + * roots of b-trees that reference the tree block. + * + * the basic idea of this function is check backrefs of a given block + * to find upper level blocks that refernece the block, and then check + * bakcrefs of these upper level blocks recursively. the recursion stop + * when tree root is reached or backrefs for the block is cached. + * + * NOTE: if we find backrefs for a block are cached, we know backrefs + * for all upper level blocks that directly/indirectly reference the + * block are also cached. + */ +static struct backref_node *build_backref_tree(struct reloc_control *rc, + struct backref_cache *cache, + struct btrfs_key *node_key, + int level, u64 bytenr) +{ + struct btrfs_path *path1; + struct btrfs_path *path2; + struct extent_buffer *eb; + struct btrfs_root *root; + struct backref_node *cur; + struct backref_node *upper; + struct backref_node *lower; + struct backref_node *node = NULL; + struct backref_node *exist = NULL; + struct backref_edge *edge; + struct rb_node *rb_node; + struct btrfs_key key; + unsigned long end; + unsigned long ptr; + LIST_HEAD(list); + int ret; + int err = 0; + + path1 = btrfs_alloc_path(); + path2 = btrfs_alloc_path(); + if (!path1 || !path2) { + err = -ENOMEM; + goto out; + } + + node = kmalloc(sizeof(*node), GFP_NOFS); + if (!node) { + err = -ENOMEM; + goto out; + } + + backref_node_init(node); + node->bytenr = bytenr; + node->owner = 0; + node->level = level; + node->lowest = 1; + cur = node; +again: + end = 0; + ptr = 0; + key.objectid = cur->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; + + path1->search_commit_root = 1; + path1->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1, + 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + BUG_ON(!ret || !path1->slots[0]); + + path1->slots[0]--; + + WARN_ON(cur->checked); + if (!list_empty(&cur->upper)) { + /* + * the backref was added previously when processsing + * backref of type BTRFS_TREE_BLOCK_REF_KEY + */ + BUG_ON(!list_is_singular(&cur->upper)); + edge = list_entry(cur->upper.next, struct backref_edge, + list[LOWER]); + BUG_ON(!list_empty(&edge->list[UPPER])); + exist = edge->node[UPPER]; + /* + * add the upper level block to pending list if we need + * check its backrefs + */ + if (!exist->checked) + list_add_tail(&edge->list[UPPER], &list); + } else { + exist = NULL; + } + + while (1) { + cond_resched(); + eb = path1->nodes[0]; + + if (ptr >= end) { + if (path1->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(rc->extent_root, path1); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + break; + eb = path1->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path1->slots[0]); + if (key.objectid != cur->bytenr) { + WARN_ON(exist); + break; + } + + if (key.type == BTRFS_EXTENT_ITEM_KEY) { + ret = find_inline_backref(eb, path1->slots[0], + &ptr, &end); + if (ret) + goto next; + } + } + + if (ptr < end) { + /* update key for inline back ref */ + struct btrfs_extent_inline_ref *iref; + iref = (struct btrfs_extent_inline_ref *)ptr; + key.type = btrfs_extent_inline_ref_type(eb, iref); + key.offset = btrfs_extent_inline_ref_offset(eb, iref); + WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY && + key.type != BTRFS_SHARED_BLOCK_REF_KEY); + } + + if (exist && + ((key.type == BTRFS_TREE_BLOCK_REF_KEY && + exist->owner == key.offset) || + (key.type == BTRFS_SHARED_BLOCK_REF_KEY && + exist->bytenr == key.offset))) { + exist = NULL; + goto next; + } + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || + key.type == BTRFS_EXTENT_REF_V0_KEY) { + if (key.objectid == key.offset && + key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(eb, path1->slots[0], + struct btrfs_extent_ref_v0); + root = find_tree_root(rc, eb, ref0); + if (root) + cur->root = root; + else + cur->old_root = 1; + break; + } +#else + BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); + if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { +#endif + if (key.objectid == key.offset) { + /* + * only root blocks of reloc trees use + * backref of this type. + */ + root = find_reloc_root(rc, cur->bytenr); + BUG_ON(!root); + cur->root = root; + break; + } + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (!edge) { + err = -ENOMEM; + goto out; + } + rb_node = tree_search(&cache->rb_root, key.offset); + if (!rb_node) { + upper = kmalloc(sizeof(*upper), GFP_NOFS); + if (!upper) { + kfree(edge); + err = -ENOMEM; + goto out; + } + backref_node_init(upper); + upper->bytenr = key.offset; + upper->owner = 0; + upper->level = cur->level + 1; + /* + * backrefs for the upper level block isn't + * cached, add the block to pending list + */ + list_add_tail(&edge->list[UPPER], &list); + } else { + upper = rb_entry(rb_node, struct backref_node, + rb_node); + INIT_LIST_HEAD(&edge->list[UPPER]); + } + list_add(&edge->list[LOWER], &cur->upper); + edge->node[UPPER] = upper; + edge->node[LOWER] = cur; + + goto next; + } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { + goto next; + } + + /* key.type == BTRFS_TREE_BLOCK_REF_KEY */ + root = read_fs_root(rc->extent_root->fs_info, key.offset); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + + if (btrfs_root_level(&root->root_item) == cur->level) { + /* tree root */ + BUG_ON(btrfs_root_bytenr(&root->root_item) != + cur->bytenr); + cur->root = root; + break; + } + + level = cur->level + 1; + + /* + * searching the tree to find upper level blocks + * reference the block. + */ + path2->search_commit_root = 1; + path2->skip_locking = 1; + path2->lowest_level = level; + ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); + path2->lowest_level = 0; + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0 && path2->slots[level] > 0) + path2->slots[level]--; + + eb = path2->nodes[level]; + WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != + cur->bytenr); + + lower = cur; + for (; level < BTRFS_MAX_LEVEL; level++) { + if (!path2->nodes[level]) { + BUG_ON(btrfs_root_bytenr(&root->root_item) != + lower->bytenr); + lower->root = root; + break; + } + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (!edge) { + err = -ENOMEM; + goto out; + } + + eb = path2->nodes[level]; + rb_node = tree_search(&cache->rb_root, eb->start); + if (!rb_node) { + upper = kmalloc(sizeof(*upper), GFP_NOFS); + if (!upper) { + kfree(edge); + err = -ENOMEM; + goto out; + } + backref_node_init(upper); + upper->bytenr = eb->start; + upper->owner = btrfs_header_owner(eb); + upper->level = lower->level + 1; + + /* + * if we know the block isn't shared + * we can void checking its backrefs. + */ + if (btrfs_block_can_be_shared(root, eb)) + upper->checked = 0; + else + upper->checked = 1; + + /* + * add the block to pending list if we + * need check its backrefs. only block + * at 'cur->level + 1' is added to the + * tail of pending list. this guarantees + * we check backrefs from lower level + * blocks to upper level blocks. + */ + if (!upper->checked && + level == cur->level + 1) { + list_add_tail(&edge->list[UPPER], + &list); + } else + INIT_LIST_HEAD(&edge->list[UPPER]); + } else { + upper = rb_entry(rb_node, struct backref_node, + rb_node); + BUG_ON(!upper->checked); + INIT_LIST_HEAD(&edge->list[UPPER]); + } + list_add_tail(&edge->list[LOWER], &lower->upper); + edge->node[UPPER] = upper; + edge->node[LOWER] = lower; + + if (rb_node) + break; + lower = upper; + upper = NULL; + } + btrfs_release_path(root, path2); +next: + if (ptr < end) { + ptr += btrfs_extent_inline_ref_size(key.type); + if (ptr >= end) { + WARN_ON(ptr > end); + ptr = 0; + end = 0; + } + } + if (ptr >= end) + path1->slots[0]++; + } + btrfs_release_path(rc->extent_root, path1); + + cur->checked = 1; + WARN_ON(exist); + + /* the pending list isn't empty, take the first block to process */ + if (!list_empty(&list)) { + edge = list_entry(list.next, struct backref_edge, list[UPPER]); + list_del_init(&edge->list[UPPER]); + cur = edge->node[UPPER]; + goto again; + } + + /* + * everything goes well, connect backref nodes and insert backref nodes + * into the cache. + */ + BUG_ON(!node->checked); + rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); + BUG_ON(rb_node); + + list_for_each_entry(edge, &node->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &list); + + while (!list_empty(&list)) { + edge = list_entry(list.next, struct backref_edge, list[UPPER]); + list_del_init(&edge->list[UPPER]); + upper = edge->node[UPPER]; + + if (!RB_EMPTY_NODE(&upper->rb_node)) { + if (upper->lowest) { + list_del_init(&upper->lower); + upper->lowest = 0; + } + + list_add_tail(&edge->list[UPPER], &upper->lower); + continue; + } + + BUG_ON(!upper->checked); + rb_node = tree_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + BUG_ON(rb_node); + + list_add_tail(&edge->list[UPPER], &upper->lower); + + list_for_each_entry(edge, &upper->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &list); + } +out: + btrfs_free_path(path1); + btrfs_free_path(path2); + if (err) { + INIT_LIST_HEAD(&list); + upper = node; + while (upper) { + if (RB_EMPTY_NODE(&upper->rb_node)) { + list_splice_tail(&upper->upper, &list); + kfree(upper); + } + + if (list_empty(&list)) + break; + + edge = list_entry(list.next, struct backref_edge, + list[LOWER]); + upper = edge->node[UPPER]; + kfree(edge); + } + return ERR_PTR(err); + } + return node; +} + +/* + * helper to add 'address of tree root -> reloc tree' mapping + */ +static int __add_reloc_root(struct btrfs_root *root) +{ + struct rb_node *rb_node; + struct mapping_node *node; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + node = kmalloc(sizeof(*node), GFP_NOFS); + BUG_ON(!node); + + node->bytenr = root->node->start; + node->data = root; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + BUG_ON(rb_node); + + list_add_tail(&root->root_list, &rc->reloc_roots); + return 0; +} + +/* + * helper to update/delete the 'address of tree root -> reloc tree' + * mapping + */ +static int __update_reloc_root(struct btrfs_root *root, int del) +{ + struct rb_node *rb_node; + struct mapping_node *node = NULL; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, + root->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + } + spin_unlock(&rc->reloc_root_tree.lock); + + BUG_ON((struct btrfs_root *)node->data != root); + + if (!del) { + spin_lock(&rc->reloc_root_tree.lock); + node->bytenr = root->node->start; + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + BUG_ON(rb_node); + } else { + list_del_init(&root->root_list); + kfree(node); + } + return 0; +} + +/* + * create reloc tree for a given fs tree. reloc tree is just a + * snapshot of the fs tree with special root objectid. + */ +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct extent_buffer *eb; + struct btrfs_root_item *root_item; + struct btrfs_key root_key; + int ret; + + if (root->reloc_root) { + reloc_root = root->reloc_root; + reloc_root->last_trans = trans->transid; + return 0; + } + + if (!root->fs_info->reloc_ctl || + !root->fs_info->reloc_ctl->create_reloc_root || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + return 0; + + root_item = kmalloc(sizeof(*root_item), GFP_NOFS); + BUG_ON(!root_item); + + root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = root->root_key.objectid; + + ret = btrfs_copy_root(trans, root, root->commit_root, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + + btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1); + memcpy(root_item, &root->root_item, sizeof(*root_item)); + btrfs_set_root_refs(root_item, 1); + btrfs_set_root_bytenr(root_item, eb->start); + btrfs_set_root_level(root_item, btrfs_header_level(eb)); + btrfs_set_root_generation(root_item, trans->transid); + memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); + root_item->drop_level = 0; + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + ret = btrfs_insert_root(trans, root->fs_info->tree_root, + &root_key, root_item); + BUG_ON(ret); + kfree(root_item); + + reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &root_key); + BUG_ON(IS_ERR(reloc_root)); + reloc_root->last_trans = trans->transid; + + __add_reloc_root(reloc_root); + root->reloc_root = reloc_root; + return 0; +} + +/* + * update root item of reloc tree + */ +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct btrfs_root_item *root_item; + int del = 0; + int ret; + + if (!root->reloc_root) + return 0; + + reloc_root = root->reloc_root; + root_item = &reloc_root->root_item; + + if (btrfs_root_refs(root_item) == 0) { + root->reloc_root = NULL; + del = 1; + } + + __update_reloc_root(reloc_root, del); + + if (reloc_root->commit_root != reloc_root->node) { + btrfs_set_root_node(root_item, reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + reloc_root->commit_root = btrfs_root_node(reloc_root); + } + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &reloc_root->root_key, root_item); + BUG_ON(ret); + return 0; +} + +/* + * helper to find first cached inode with inode number >= objectid + * in a subvolume + */ +static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) +{ + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *entry; + struct inode *inode; + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + entry = rb_entry(node, struct btrfs_inode, rb_node); + + if (objectid < entry->vfs_inode.i_ino) + node = node->rb_left; + else if (objectid > entry->vfs_inode.i_ino) + node = node->rb_right; + else + break; + } + if (!node) { + while (prev) { + entry = rb_entry(prev, struct btrfs_inode, rb_node); + if (objectid <= entry->vfs_inode.i_ino) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + while (node) { + entry = rb_entry(node, struct btrfs_inode, rb_node); + inode = igrab(&entry->vfs_inode); + if (inode) { + spin_unlock(&root->inode_lock); + return inode; + } + + objectid = entry->vfs_inode.i_ino + 1; + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); + return NULL; +} + +static int in_block_group(u64 bytenr, + struct btrfs_block_group_cache *block_group) +{ + if (bytenr >= block_group->key.objectid && + bytenr < block_group->key.objectid + block_group->key.offset) + return 1; + return 0; +} + +/* + * get new location of data + */ +static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + bytenr -= BTRFS_I(reloc_inode)->index_cnt; + ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, + bytenr, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + BUG_ON(btrfs_file_extent_offset(leaf, fi) || + btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)); + + if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { + ret = 1; + goto out; + } + + if (new_bytenr) + *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +/* + * update file extent items in the tree leaf to point to + * the new locations. + */ +static int replace_file_extents(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root, + struct extent_buffer *leaf, + struct list_head *inode_list) +{ + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + struct inode *inode = NULL; + struct inodevec *ivec = NULL; + u64 parent; + u64 bytenr; + u64 new_bytenr; + u64 num_bytes; + u64 end; + u32 nritems; + u32 i; + int ret; + int first = 1; + int dirty = 0; + + if (rc->stage != UPDATE_DATA_PTRS) + return 0; + + /* reloc trees always use full backref */ + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + parent = leaf->start; + else + parent = 0; + + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + cond_resched(); + btrfs_item_key_to_cpu(leaf, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + if (bytenr == 0) + continue; + if (!in_block_group(bytenr, rc->block_group)) + continue; + + /* + * if we are modifying block in fs tree, wait for readpage + * to complete and drop the extent cache + */ + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (!ivec || ivec->nr == INODEVEC_SIZE) { + ivec = kmalloc(sizeof(*ivec), GFP_NOFS); + BUG_ON(!ivec); + ivec->nr = 0; + list_add_tail(&ivec->list, inode_list); + } + if (first) { + inode = find_next_inode(root, key.objectid); + if (inode) + ivec->inode[ivec->nr++] = inode; + first = 0; + } else if (inode && inode->i_ino < key.objectid) { + inode = find_next_inode(root, key.objectid); + if (inode) + ivec->inode[ivec->nr++] = inode; + } + if (inode && inode->i_ino == key.objectid) { + end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + WARN_ON(!IS_ALIGNED(key.offset, + root->sectorsize)); + WARN_ON(!IS_ALIGNED(end, root->sectorsize)); + end--; + ret = try_lock_extent(&BTRFS_I(inode)->io_tree, + key.offset, end, + GFP_NOFS); + if (!ret) + continue; + + btrfs_drop_extent_cache(inode, key.offset, end, + 1); + unlock_extent(&BTRFS_I(inode)->io_tree, + key.offset, end, GFP_NOFS); + } + } + + ret = get_new_location(rc->data_inode, &new_bytenr, + bytenr, num_bytes); + if (ret > 0) + continue; + BUG_ON(ret < 0); + + btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); + dirty = 1; + + key.offset -= btrfs_file_extent_offset(leaf, fi); + ret = btrfs_inc_extent_ref(trans, root, new_bytenr, + num_bytes, parent, + btrfs_header_owner(leaf), + key.objectid, key.offset); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + parent, btrfs_header_owner(leaf), + key.objectid, key.offset); + BUG_ON(ret); + } + if (dirty) + btrfs_mark_buffer_dirty(leaf); + return 0; +} + +static noinline_for_stack +int memcmp_node_keys(struct extent_buffer *eb, int slot, + struct btrfs_path *path, int level) +{ + struct btrfs_disk_key key1; + struct btrfs_disk_key key2; + btrfs_node_key(eb, &key1, slot); + btrfs_node_key(path->nodes[level], &key2, path->slots[level]); + return memcmp(&key1, &key2, sizeof(key1)); +} + +/* + * try to replace tree blocks in fs tree with the new blocks + * in reloc tree. tree blocks haven't been modified since the + * reloc tree was create can be replaced. + * + * if a block was replaced, level of the block + 1 is returned. + * if no block got replaced, 0 is returned. if there are other + * errors, a negative error number is returned. + */ +static int replace_path(struct btrfs_trans_handle *trans, + struct btrfs_root *dest, struct btrfs_root *src, + struct btrfs_path *path, struct btrfs_key *next_key, + struct extent_buffer **leaf, + int lowest_level, int max_level) +{ + struct extent_buffer *eb; + struct extent_buffer *parent; + struct btrfs_key key; + u64 old_bytenr; + u64 new_bytenr; + u64 old_ptr_gen; + u64 new_ptr_gen; + u64 last_snapshot; + u32 blocksize; + int level; + int ret; + int slot; + + BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(lowest_level > 1 && leaf); + + last_snapshot = btrfs_root_last_snapshot(&src->root_item); + + slot = path->slots[lowest_level]; + btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); + + eb = btrfs_lock_root_node(dest); + btrfs_set_lock_blocking(eb); + level = btrfs_header_level(eb); + + if (level < lowest_level) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + return 0; + } + + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); + BUG_ON(ret); + btrfs_set_lock_blocking(eb); + + if (next_key) { + next_key->objectid = (u64)-1; + next_key->type = (u8)-1; + next_key->offset = (u64)-1; + } + + parent = eb; + while (1) { + level = btrfs_header_level(parent); + BUG_ON(level < lowest_level); + + ret = btrfs_bin_search(parent, &key, level, &slot); + if (ret && slot > 0) + slot--; + + if (next_key && slot + 1 < btrfs_header_nritems(parent)) + btrfs_node_key_to_cpu(parent, next_key, slot + 1); + + old_bytenr = btrfs_node_blockptr(parent, slot); + blocksize = btrfs_level_size(dest, level - 1); + old_ptr_gen = btrfs_node_ptr_generation(parent, slot); + + if (level <= max_level) { + eb = path->nodes[level]; + new_bytenr = btrfs_node_blockptr(eb, + path->slots[level]); + new_ptr_gen = btrfs_node_ptr_generation(eb, + path->slots[level]); + } else { + new_bytenr = 0; + new_ptr_gen = 0; + } + + if (new_bytenr > 0 && new_bytenr == old_bytenr) { + WARN_ON(1); + ret = level; + break; + } + + if (new_bytenr == 0 || old_ptr_gen > last_snapshot || + memcmp_node_keys(parent, slot, path, level)) { + if (level <= lowest_level && !leaf) { + ret = 0; + break; + } + + eb = read_tree_block(dest, old_bytenr, blocksize, + old_ptr_gen); + btrfs_tree_lock(eb); + ret = btrfs_cow_block(trans, dest, eb, parent, + slot, &eb); + BUG_ON(ret); + btrfs_set_lock_blocking(eb); + + if (level <= lowest_level) { + *leaf = eb; + ret = 0; + break; + } + + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + + parent = eb; + continue; + } + + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + btrfs_release_path(src, path); + + path->lowest_level = level; + ret = btrfs_search_slot(trans, src, &key, path, 0, 1); + path->lowest_level = 0; + BUG_ON(ret); + + /* + * swap blocks in fs tree and reloc tree. + */ + btrfs_set_node_blockptr(parent, slot, new_bytenr); + btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); + btrfs_mark_buffer_dirty(parent); + + btrfs_set_node_blockptr(path->nodes[level], + path->slots[level], old_bytenr); + btrfs_set_node_ptr_generation(path->nodes[level], + path->slots[level], old_ptr_gen); + btrfs_mark_buffer_dirty(path->nodes[level]); + + ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, + path->nodes[level]->start, + src->root_key.objectid, level - 1, 0); + BUG_ON(ret); + ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, + 0, dest->root_key.objectid, level - 1, + 0); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, + path->nodes[level]->start, + src->root_key.objectid, level - 1, 0); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, + 0, dest->root_key.objectid, level - 1, + 0); + BUG_ON(ret); + + btrfs_unlock_up_safe(path, 0); + + ret = level; + break; + } + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + return ret; +} + +/* + * helper to find next relocated block in reloc tree + */ +static noinline_for_stack +int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + struct extent_buffer *eb; + int i; + u64 last_snapshot; + u32 nritems; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + + for (i = 0; i < *level; i++) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + + for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { + eb = path->nodes[i]; + nritems = btrfs_header_nritems(eb); + while (path->slots[i] + 1 < nritems) { + path->slots[i]++; + if (btrfs_node_ptr_generation(eb, path->slots[i]) <= + last_snapshot) + continue; + + *level = i; + return 0; + } + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + return 1; +} + +/* + * walk down reloc tree to find relocated block of lowest level + */ +static noinline_for_stack +int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + struct extent_buffer *eb = NULL; + int i; + u64 bytenr; + u64 ptr_gen = 0; + u64 last_snapshot; + u32 blocksize; + u32 nritems; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + + for (i = *level; i > 0; i--) { + eb = path->nodes[i]; + nritems = btrfs_header_nritems(eb); + while (path->slots[i] < nritems) { + ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); + if (ptr_gen > last_snapshot) + break; + path->slots[i]++; + } + if (path->slots[i] >= nritems) { + if (i == *level) + break; + *level = i + 1; + return 0; + } + if (i == 1) { + *level = i; + return 0; + } + + bytenr = btrfs_node_blockptr(eb, path->slots[i]); + blocksize = btrfs_level_size(root, i - 1); + eb = read_tree_block(root, bytenr, blocksize, ptr_gen); + BUG_ON(btrfs_header_level(eb) != i - 1); + path->nodes[i - 1] = eb; + path->slots[i - 1] = 0; + } + return 1; +} + +/* + * invalidate extent cache for file extents whose key in range of + * [min_key, max_key) + */ +static int invalidate_extent_cache(struct btrfs_root *root, + struct btrfs_key *min_key, + struct btrfs_key *max_key) +{ + struct inode *inode = NULL; + u64 objectid; + u64 start, end; + + objectid = min_key->objectid; + while (1) { + cond_resched(); + iput(inode); + + if (objectid > max_key->objectid) + break; + + inode = find_next_inode(root, objectid); + if (!inode) + break; + + if (inode->i_ino > max_key->objectid) { + iput(inode); + break; + } + + objectid = inode->i_ino + 1; + if (!S_ISREG(inode->i_mode)) + continue; + + if (unlikely(min_key->objectid == inode->i_ino)) { + if (min_key->type > BTRFS_EXTENT_DATA_KEY) + continue; + if (min_key->type < BTRFS_EXTENT_DATA_KEY) + start = 0; + else { + start = min_key->offset; + WARN_ON(!IS_ALIGNED(start, root->sectorsize)); + } + } else { + start = 0; + } + + if (unlikely(max_key->objectid == inode->i_ino)) { + if (max_key->type < BTRFS_EXTENT_DATA_KEY) + continue; + if (max_key->type > BTRFS_EXTENT_DATA_KEY) { + end = (u64)-1; + } else { + if (max_key->offset == 0) + continue; + end = max_key->offset; + WARN_ON(!IS_ALIGNED(end, root->sectorsize)); + end--; + } + } else { + end = (u64)-1; + } + + /* the lock_extent waits for readpage to complete */ + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + btrfs_drop_extent_cache(inode, start, end, 1); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + } + return 0; +} + +static void put_inodes(struct list_head *list) +{ + struct inodevec *ivec; + while (!list_empty(list)) { + ivec = list_entry(list->next, struct inodevec, list); + list_del(&ivec->list); + while (ivec->nr > 0) { + ivec->nr--; + iput(ivec->inode[ivec->nr]); + } + kfree(ivec); + } +} + +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) + +{ + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + break; + if (path->slots[level] + 1 < + btrfs_header_nritems(path->nodes[level])) { + btrfs_node_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + return 0; + } + level++; + } + return 1; +} + +/* + * merge the relocated tree blocks in reloc tree with corresponding + * fs tree. + */ +static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, + struct btrfs_root *root) +{ + LIST_HEAD(inode_list); + struct btrfs_key key; + struct btrfs_key next_key; + struct btrfs_trans_handle *trans; + struct btrfs_root *reloc_root; + struct btrfs_root_item *root_item; + struct btrfs_path *path; + struct extent_buffer *leaf = NULL; + unsigned long nr; + int level; + int max_level; + int replaced = 0; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + reloc_root = root->reloc_root; + root_item = &reloc_root->root_item; + + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + level = btrfs_root_level(root_item); + extent_buffer_get(reloc_root->node); + path->nodes[level] = reloc_root->node; + path->slots[level] = 0; + } else { + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + + level = root_item->drop_level; + BUG_ON(level == 0); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); + path->lowest_level = 0; + if (ret < 0) { + btrfs_free_path(path); + return ret; + } + + btrfs_node_key_to_cpu(path->nodes[level], &next_key, + path->slots[level]); + WARN_ON(memcmp(&key, &next_key, sizeof(key))); + + btrfs_unlock_up_safe(path, 0); + } + + if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { + trans = btrfs_start_transaction(root, 1); + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, 0); + btrfs_release_path(reloc_root, path); + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) { + err = ret; + goto out; + } + + leaf = path->nodes[0]; + btrfs_unlock_up_safe(path, 1); + ret = replace_file_extents(trans, rc, root, leaf, + &inode_list); + if (ret < 0) + err = ret; + goto out; + } + + memset(&next_key, 0, sizeof(next_key)); + + while (1) { + leaf = NULL; + replaced = 0; + trans = btrfs_start_transaction(root, 1); + max_level = level; + + ret = walk_down_reloc_tree(reloc_root, path, &level); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + break; + + if (!find_next_key(path, level, &key) && + btrfs_comp_cpu_keys(&next_key, &key) >= 0) { + ret = 0; + } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) { + ret = replace_path(trans, root, reloc_root, + path, &next_key, &leaf, + level, max_level); + } else { + ret = replace_path(trans, root, reloc_root, + path, &next_key, NULL, + level, max_level); + } + if (ret < 0) { + err = ret; + goto out; + } + + if (ret > 0) { + level = ret; + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + replaced = 1; + } else if (leaf) { + /* + * no block got replaced, try replacing file extents + */ + btrfs_item_key_to_cpu(leaf, &key, 0); + ret = replace_file_extents(trans, rc, root, leaf, + &inode_list); + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + BUG_ON(ret < 0); + } + + ret = walk_up_reloc_tree(reloc_root, path, &level); + if (ret > 0) + break; + + BUG_ON(level == 0); + /* + * save the merging progress in the drop_progress. + * this is OK since root refs == 1 in this case. + */ + btrfs_node_key(path->nodes[level], &root_item->drop_progress, + path->slots[level]); + root_item->drop_level = level; + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + + btrfs_btree_balance_dirty(root, nr); + + /* + * put inodes outside transaction, otherwise we may deadlock. + */ + put_inodes(&inode_list); + + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); + } + + /* + * handle the case only one block in the fs tree need to be + * relocated and the block is tree root. + */ + leaf = btrfs_lock_root_node(root); + ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + if (ret < 0) + err = ret; +out: + btrfs_free_path(path); + + if (err == 0) { + memset(&root_item->drop_progress, 0, + sizeof(root_item->drop_progress)); + root_item->drop_level = 0; + btrfs_set_root_refs(root_item, 0); + } + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + + btrfs_btree_balance_dirty(root, nr); + + put_inodes(&inode_list); + + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); + + return err; +} + +/* + * callback for the work threads. + * this function merges reloc tree with corresponding fs tree, + * and then drops the reloc tree. + */ +static void merge_func(struct btrfs_work *work) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + struct btrfs_root *reloc_root; + struct async_merge *async; + + async = container_of(work, struct async_merge, work); + reloc_root = async->root; + + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + root = read_fs_root(reloc_root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + merge_reloc_root(async->rc, root); + + trans = btrfs_start_transaction(root, 1); + btrfs_update_reloc_root(trans, root); + btrfs_end_transaction(trans, root); + } + + btrfs_drop_snapshot(reloc_root, 0); + + if (atomic_dec_and_test(async->num_pending)) + complete(async->done); + + kfree(async); +} + +static int merge_reloc_roots(struct reloc_control *rc) +{ + struct async_merge *async; + struct btrfs_root *root; + struct completion done; + atomic_t num_pending; + + init_completion(&done); + atomic_set(&num_pending, 1); + + while (!list_empty(&rc->reloc_roots)) { + root = list_entry(rc->reloc_roots.next, + struct btrfs_root, root_list); + list_del_init(&root->root_list); + + async = kmalloc(sizeof(*async), GFP_NOFS); + BUG_ON(!async); + async->work.func = merge_func; + async->work.flags = 0; + async->rc = rc; + async->root = root; + async->done = &done; + async->num_pending = &num_pending; + atomic_inc(&num_pending); + btrfs_queue_worker(&rc->workers, &async->work); + } + + if (!atomic_dec_and_test(&num_pending)) + wait_for_completion(&done); + + BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); + return 0; +} + +static void free_block_list(struct rb_root *blocks) +{ + struct tree_block *block; + struct rb_node *rb_node; + while ((rb_node = rb_first(blocks))) { + block = rb_entry(rb_node, struct tree_block, rb_node); + rb_erase(rb_node, blocks); + kfree(block); + } +} + +static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *reloc_root) +{ + struct btrfs_root *root; + + if (reloc_root->last_trans == trans->transid) + return 0; + + root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + return btrfs_record_root_in_trans(trans, root); +} + +/* + * select one tree from trees that references the block. + * for blocks in refernce counted trees, we preper reloc tree. + * if no reloc tree found and reloc_only is true, NULL is returned. + */ +static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans, + struct backref_node *node, + struct backref_edge *edges[], + int *nr, int reloc_only) +{ + struct backref_node *next; + struct btrfs_root *root; + int index; + int loop = 0; +again: + index = 0; + next = node; + while (1) { + cond_resched(); + next = walk_up_backref(next, edges, &index); + root = next->root; + if (!root) { + BUG_ON(!node->old_root); + goto skip; + } + + /* no other choice for non-refernce counted tree */ + if (!root->ref_cows) { + BUG_ON(reloc_only); + break; + } + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + record_reloc_root_in_trans(trans, root); + break; + } + + if (loop) { + btrfs_record_root_in_trans(trans, root); + break; + } + + if (reloc_only || next != node) { + if (!root->reloc_root) + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + /* + * if the reloc tree was created in current + * transation, there is no node in backref tree + * corresponds to the root of the reloc tree. + */ + if (btrfs_root_last_snapshot(&root->root_item) == + trans->transid - 1) + break; + } +skip: + root = NULL; + next = walk_down_backref(edges, &index); + if (!next || next->level <= node->level) + break; + } + + if (!root && !loop && !reloc_only) { + loop = 1; + goto again; + } + + if (root) + *nr = index; + else + *nr = 0; + + return root; +} + +static noinline_for_stack +struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, + struct backref_node *node) +{ + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + int nr; + return __select_one_root(trans, node, edges, &nr, 0); +} + +static noinline_for_stack +struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, + struct backref_node *node, + struct backref_edge *edges[], int *nr) +{ + return __select_one_root(trans, node, edges, nr, 1); +} + +static void grab_path_buffers(struct btrfs_path *path, + struct backref_node *node, + struct backref_edge *edges[], int nr) +{ + int i = 0; + while (1) { + drop_node_buffer(node); + node->eb = path->nodes[node->level]; + BUG_ON(!node->eb); + if (path->locks[node->level]) + node->locked = 1; + path->nodes[node->level] = NULL; + path->locks[node->level] = 0; + + if (i >= nr) + break; + + edges[i]->blockptr = node->eb->start; + node = edges[i]->node[UPPER]; + i++; + } +} + +/* + * relocate a block tree, and then update pointers in upper level + * blocks that reference the block to point to the new location. + * + * if called by link_to_upper, the block has already been relocated. + * in that case this function just updates pointers. + */ +static int do_relocation(struct btrfs_trans_handle *trans, + struct backref_node *node, + struct btrfs_key *key, + struct btrfs_path *path, int lowest) +{ + struct backref_node *upper; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + struct btrfs_root *root; + struct extent_buffer *eb; + u32 blocksize; + u64 bytenr; + u64 generation; + int nr; + int slot; + int ret; + int err = 0; + + BUG_ON(lowest && node->eb); + + path->lowest_level = node->level + 1; + list_for_each_entry(edge, &node->upper, list[LOWER]) { + cond_resched(); + if (node->eb && node->eb->start == edge->blockptr) + continue; + + upper = edge->node[UPPER]; + root = select_reloc_root(trans, upper, edges, &nr); + if (!root) + continue; + + if (upper->eb && !upper->locked) + drop_node_buffer(upper); + + if (!upper->eb) { + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret < 0) { + err = ret; + break; + } + BUG_ON(ret > 0); + + slot = path->slots[upper->level]; + + btrfs_unlock_up_safe(path, upper->level + 1); + grab_path_buffers(path, upper, edges, nr); + + btrfs_release_path(NULL, path); + } else { + ret = btrfs_bin_search(upper->eb, key, upper->level, + &slot); + BUG_ON(ret); + } + + bytenr = btrfs_node_blockptr(upper->eb, slot); + if (!lowest) { + if (node->eb->start == bytenr) { + btrfs_tree_unlock(upper->eb); + upper->locked = 0; + continue; + } + } else { + BUG_ON(node->bytenr != bytenr); + } + + blocksize = btrfs_level_size(root, node->level); + generation = btrfs_node_ptr_generation(upper->eb, slot); + eb = read_tree_block(root, bytenr, blocksize, generation); + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + + if (!node->eb) { + ret = btrfs_cow_block(trans, root, eb, upper->eb, + slot, &eb); + if (ret < 0) { + err = ret; + break; + } + btrfs_set_lock_blocking(eb); + node->eb = eb; + node->locked = 1; + } else { + btrfs_set_node_blockptr(upper->eb, slot, + node->eb->start); + btrfs_set_node_ptr_generation(upper->eb, slot, + trans->transid); + btrfs_mark_buffer_dirty(upper->eb); + + ret = btrfs_inc_extent_ref(trans, root, + node->eb->start, blocksize, + upper->eb->start, + btrfs_header_owner(upper->eb), + node->level, 0); + BUG_ON(ret); + + ret = btrfs_drop_subtree(trans, root, eb, upper->eb); + BUG_ON(ret); + } + if (!lowest) { + btrfs_tree_unlock(upper->eb); + upper->locked = 0; + } + } + path->lowest_level = 0; + return err; +} + +static int link_to_upper(struct btrfs_trans_handle *trans, + struct backref_node *node, + struct btrfs_path *path) +{ + struct btrfs_key key; + if (!node->eb || list_empty(&node->upper)) + return 0; + + btrfs_node_key_to_cpu(node->eb, &key, 0); + return do_relocation(trans, node, &key, path, 0); +} + +static int finish_pending_nodes(struct btrfs_trans_handle *trans, + struct backref_cache *cache, + struct btrfs_path *path) +{ + struct backref_node *node; + int level; + int ret; + int err = 0; + + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + while (!list_empty(&cache->pending[level])) { + node = list_entry(cache->pending[level].next, + struct backref_node, lower); + BUG_ON(node->level != level); + + ret = link_to_upper(trans, node, path); + if (ret < 0) + err = ret; + /* + * this remove the node from the pending list and + * may add some other nodes to the level + 1 + * pending list + */ + remove_backref_node(cache, node); + } + } + BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); + return err; +} + +static void mark_block_processed(struct reloc_control *rc, + struct backref_node *node) +{ + u32 blocksize; + if (node->level == 0 || + in_block_group(node->bytenr, rc->block_group)) { + blocksize = btrfs_level_size(rc->extent_root, node->level); + set_extent_bits(&rc->processed_blocks, node->bytenr, + node->bytenr + blocksize - 1, EXTENT_DIRTY, + GFP_NOFS); + } + node->processed = 1; +} + +/* + * mark a block and all blocks directly/indirectly reference the block + * as processed. + */ +static void update_processed_blocks(struct reloc_control *rc, + struct backref_node *node) +{ + struct backref_node *next = node; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + int index = 0; + + while (next) { + cond_resched(); + while (1) { + if (next->processed) + break; + + mark_block_processed(rc, next); + + if (list_empty(&next->upper)) + break; + + edge = list_entry(next->upper.next, + struct backref_edge, list[LOWER]); + edges[index++] = edge; + next = edge->node[UPPER]; + } + next = walk_down_backref(edges, &index); + } +} + +static int tree_block_processed(u64 bytenr, u32 blocksize, + struct reloc_control *rc) +{ + if (test_range_bit(&rc->processed_blocks, bytenr, + bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) + return 1; + return 0; +} + +/* + * check if there are any file extent pointers in the leaf point to + * data require processing + */ +static int check_file_extents(struct reloc_control *rc, + u64 bytenr, u32 blocksize, u64 ptr_gen) +{ + struct btrfs_key found_key; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + u32 nritems; + int i; + int ret = 0; + + leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen); + + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + cond_resched(); + btrfs_item_key_to_cpu(leaf, &found_key, i); + if (found_key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (bytenr == 0) + continue; + if (in_block_group(bytenr, rc->block_group)) { + ret = 1; + break; + } + } + free_extent_buffer(leaf); + return ret; +} + +/* + * scan child blocks of a given block to find blocks require processing + */ +static int add_child_blocks(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct rb_root *blocks) +{ + struct tree_block *block; + struct rb_node *rb_node; + u64 bytenr; + u64 ptr_gen; + u32 blocksize; + u32 nritems; + int i; + int err = 0; + + nritems = btrfs_header_nritems(node->eb); + blocksize = btrfs_level_size(rc->extent_root, node->level - 1); + for (i = 0; i < nritems; i++) { + cond_resched(); + bytenr = btrfs_node_blockptr(node->eb, i); + ptr_gen = btrfs_node_ptr_generation(node->eb, i); + if (ptr_gen == trans->transid) + continue; + if (!in_block_group(bytenr, rc->block_group) && + (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) + continue; + if (tree_block_processed(bytenr, blocksize, rc)) + continue; + + readahead_tree_block(rc->extent_root, + bytenr, blocksize, ptr_gen); + } + + for (i = 0; i < nritems; i++) { + cond_resched(); + bytenr = btrfs_node_blockptr(node->eb, i); + ptr_gen = btrfs_node_ptr_generation(node->eb, i); + if (ptr_gen == trans->transid) + continue; + if (!in_block_group(bytenr, rc->block_group) && + (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) + continue; + if (tree_block_processed(bytenr, blocksize, rc)) + continue; + if (!in_block_group(bytenr, rc->block_group) && + !check_file_extents(rc, bytenr, blocksize, ptr_gen)) + continue; + + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) { + err = -ENOMEM; + break; + } + block->bytenr = bytenr; + btrfs_node_key_to_cpu(node->eb, &block->key, i); + block->level = node->level - 1; + block->key_ready = 1; + rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); + BUG_ON(rb_node); + } + if (err) + free_block_list(blocks); + return err; +} + +/* + * find adjacent blocks require processing + */ +static noinline_for_stack +int add_adjacent_blocks(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_cache *cache, + struct rb_root *blocks, int level, + struct backref_node **upper) +{ + struct backref_node *node; + int ret = 0; + + WARN_ON(!list_empty(&cache->pending[level])); + + if (list_empty(&cache->pending[level + 1])) + return 1; + + node = list_entry(cache->pending[level + 1].next, + struct backref_node, lower); + if (node->eb) + ret = add_child_blocks(trans, rc, node, blocks); + + *upper = node; + return ret; +} + +static int get_tree_block_key(struct reloc_control *rc, + struct tree_block *block) +{ + struct extent_buffer *eb; + + BUG_ON(block->key_ready); + eb = read_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, block->key.offset); + WARN_ON(btrfs_header_level(eb) != block->level); + if (block->level == 0) + btrfs_item_key_to_cpu(eb, &block->key, 0); + else + btrfs_node_key_to_cpu(eb, &block->key, 0); + free_extent_buffer(eb); + block->key_ready = 1; + return 0; +} + +static int reada_tree_block(struct reloc_control *rc, + struct tree_block *block) +{ + BUG_ON(block->key_ready); + readahead_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, block->key.offset); + return 0; +} + +/* + * helper function to relocate a tree block + */ +static int relocate_tree_block(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct btrfs_key *key, + struct btrfs_path *path) +{ + struct btrfs_root *root; + int ret; + + root = select_one_root(trans, node); + if (unlikely(!root)) { + rc->found_old_snapshot = 1; + update_processed_blocks(rc, node); + return 0; + } + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + ret = do_relocation(trans, node, key, path, 1); + if (ret < 0) + goto out; + if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) { + ret = replace_file_extents(trans, rc, root, + node->eb, NULL); + if (ret < 0) + goto out; + } + drop_node_buffer(node); + } else if (!root->ref_cows) { + path->lowest_level = node->level; + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + btrfs_release_path(root, path); + if (ret < 0) + goto out; + } else if (root != node->root) { + WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS); + } + + update_processed_blocks(rc, node); + ret = 0; +out: + drop_node_buffer(node); + return ret; +} + +/* + * relocate a list of blocks + */ +static noinline_for_stack +int relocate_tree_blocks(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct rb_root *blocks) +{ + struct backref_cache *cache; + struct backref_node *node; + struct btrfs_path *path; + struct tree_block *block; + struct rb_node *rb_node; + int level = -1; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + cache = kmalloc(sizeof(*cache), GFP_NOFS); + if (!cache) { + btrfs_free_path(path); + return -ENOMEM; + } + + backref_cache_init(cache); + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (level == -1) + level = block->level; + else + BUG_ON(level != block->level); + if (!block->key_ready) + reada_tree_block(rc, block); + rb_node = rb_next(rb_node); + } + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (!block->key_ready) + get_tree_block_key(rc, block); + rb_node = rb_next(rb_node); + } + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + + node = build_backref_tree(rc, cache, &block->key, + block->level, block->bytenr); + if (IS_ERR(node)) { + err = PTR_ERR(node); + goto out; + } + + ret = relocate_tree_block(trans, rc, node, &block->key, + path); + if (ret < 0) { + err = ret; + goto out; + } + remove_backref_node(cache, node); + rb_node = rb_next(rb_node); + } + + if (level > 0) + goto out; + + free_block_list(blocks); + + /* + * now backrefs of some upper level tree blocks have been cached, + * try relocating blocks referenced by these upper level blocks. + */ + while (1) { + struct backref_node *upper = NULL; + if (trans->transaction->in_commit || + trans->transaction->delayed_refs.flushing) + break; + + ret = add_adjacent_blocks(trans, rc, cache, blocks, level, + &upper); + if (ret < 0) + err = ret; + if (ret != 0) + break; + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (trans->transaction->in_commit || + trans->transaction->delayed_refs.flushing) + goto out; + BUG_ON(!block->key_ready); + node = build_backref_tree(rc, cache, &block->key, + level, block->bytenr); + if (IS_ERR(node)) { + err = PTR_ERR(node); + goto out; + } + + ret = relocate_tree_block(trans, rc, node, + &block->key, path); + if (ret < 0) { + err = ret; + goto out; + } + remove_backref_node(cache, node); + rb_node = rb_next(rb_node); + } + free_block_list(blocks); + + if (upper) { + ret = link_to_upper(trans, upper, path); + if (ret < 0) { + err = ret; + break; + } + remove_backref_node(cache, upper); + } + } +out: + free_block_list(blocks); + + ret = finish_pending_nodes(trans, cache, path); + if (ret < 0) + err = ret; + + kfree(cache); + btrfs_free_path(path); + return err; +} + +static noinline_for_stack +int setup_extent_mapping(struct inode *inode, u64 start, u64 end, + u64 block_start) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret = 0; + + em = alloc_extent_map(GFP_NOFS); + if (!em) + return -ENOMEM; + + em->start = start; + em->len = end + 1 - start; + em->block_len = em->len; + em->block_start = block_start; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, start, end, 0); + } + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + return ret; +} + +static int relocate_file_extent_cluster(struct inode *inode, + struct file_extent_cluster *cluster) +{ + u64 page_start; + u64 page_end; + u64 offset = BTRFS_I(inode)->index_cnt; + unsigned long index; + unsigned long last_index; + unsigned int dirty_page = 0; + struct page *page; + struct file_ra_state *ra; + int nr = 0; + int ret = 0; + + if (!cluster->nr) + return 0; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + + index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; + last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; + + mutex_lock(&inode->i_mutex); + + i_size_write(inode, cluster->end + 1 - offset); + ret = setup_extent_mapping(inode, cluster->start - offset, + cluster->end - offset, cluster->start); + if (ret) + goto out_unlock; + + file_ra_state_init(ra, inode->i_mapping); + + WARN_ON(cluster->start != cluster->boundary[0]); + while (index <= last_index) { + page = find_lock_page(inode->i_mapping, index); + if (!page) { + page_cache_sync_readahead(inode->i_mapping, + ra, NULL, index, + last_index + 1 - index); + page = grab_cache_page(inode->i_mapping, index); + if (!page) { + ret = -ENOMEM; + goto out_unlock; + } + } + + if (PageReadahead(page)) { + page_cache_async_readahead(inode->i_mapping, + ra, NULL, page, index, + last_index + 1 - index); + } + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + goto out_unlock; + } + } + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + + lock_extent(&BTRFS_I(inode)->io_tree, + page_start, page_end, GFP_NOFS); + + set_page_extent_mapped(page); + + if (nr < cluster->nr && + page_start + offset == cluster->boundary[nr]) { + set_extent_bits(&BTRFS_I(inode)->io_tree, + page_start, page_end, + EXTENT_BOUNDARY, GFP_NOFS); + nr++; + } + btrfs_set_extent_delalloc(inode, page_start, page_end); + + set_page_dirty(page); + dirty_page++; + + unlock_extent(&BTRFS_I(inode)->io_tree, + page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + + index++; + if (nr < cluster->nr && + page_end + 1 + offset == cluster->boundary[nr]) { + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + dirty_page); + dirty_page = 0; + } + } + if (dirty_page) { + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + dirty_page); + } + WARN_ON(nr != cluster->nr); +out_unlock: + mutex_unlock(&inode->i_mutex); + kfree(ra); + return ret; +} + +static noinline_for_stack +int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, + struct file_extent_cluster *cluster) +{ + int ret; + + if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; + } + + if (!cluster->nr) + cluster->start = extent_key->objectid; + else + BUG_ON(cluster->nr >= MAX_EXTENTS); + cluster->end = extent_key->objectid + extent_key->offset - 1; + cluster->boundary[cluster->nr] = extent_key->objectid; + cluster->nr++; + + if (cluster->nr >= MAX_EXTENTS) { + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; + } + return 0; +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static int get_ref_objectid_v0(struct reloc_control *rc, + struct btrfs_path *path, + struct btrfs_key *extent_key, + u64 *ref_objectid, int *path_change) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_ref_v0 *ref0; + int ret; + int slot; + + leaf = path->nodes[0]; + slot = path->slots[0]; + while (1) { + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret < 0) + return ret; + BUG_ON(ret > 0); + leaf = path->nodes[0]; + slot = path->slots[0]; + if (path_change) + *path_change = 1; + } + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != extent_key->objectid) + return -ENOENT; + + if (key.type != BTRFS_EXTENT_REF_V0_KEY) { + slot++; + continue; + } + ref0 = btrfs_item_ptr(leaf, slot, + struct btrfs_extent_ref_v0); + *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0); + break; + } + return 0; +} +#endif + +/* + * helper to add a tree block to the list. + * the major work is getting the generation and level of the block + */ +static int add_tree_block(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) +{ + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_tree_block_info *bi; + struct tree_block *block; + struct rb_node *rb_node; + u32 item_size; + int level = -1; + int generation; + + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + if (item_size >= sizeof(*ei) + sizeof(*bi)) { + ei = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_item); + bi = (struct btrfs_tree_block_info *)(ei + 1); + generation = btrfs_extent_generation(eb, ei); + level = btrfs_tree_block_level(eb, bi); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + u64 ref_owner; + int ret; + + BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + ret = get_ref_objectid_v0(rc, path, extent_key, + &ref_owner, NULL); + BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); + level = (int)ref_owner; + /* FIXME: get real generation */ + generation = 0; +#else + BUG(); +#endif + } + + btrfs_release_path(rc->extent_root, path); + + BUG_ON(level == -1); + + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) + return -ENOMEM; + + block->bytenr = extent_key->objectid; + block->key.objectid = extent_key->offset; + block->key.offset = generation; + block->level = level; + block->key_ready = 0; + + rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); + BUG_ON(rb_node); + + return 0; +} + +/* + * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY + */ +static int __add_tree_block(struct reloc_control *rc, + u64 bytenr, u32 blocksize, + struct rb_root *blocks) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + if (tree_block_processed(bytenr, blocksize, rc)) + return 0; + + if (tree_search(blocks, bytenr)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = blocksize; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret); + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ret = add_tree_block(rc, &key, path, blocks); +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to check if the block use full backrefs for pointers in it + */ +static int block_use_full_backref(struct reloc_control *rc, + struct extent_buffer *eb) +{ + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct btrfs_key key; + u64 flags; + int ret; + + if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || + btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) + return 1; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = eb->start; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = eb->len; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, + &key, path, 0, 0); + BUG_ON(ret); + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + flags = btrfs_extent_flags(path->nodes[0], ei); + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); + if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) + ret = 1; + else + ret = 0; + btrfs_free_path(path); + return ret; +} + +/* + * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY + * this function scans fs tree to find blocks reference the data extent + */ +static int find_data_references(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + struct rb_root *blocks) +{ + struct btrfs_path *path; + struct tree_block *block; + struct btrfs_root *root; + struct btrfs_file_extent_item *fi; + struct rb_node *rb_node; + struct btrfs_key key; + u64 ref_root; + u64 ref_objectid; + u64 ref_offset; + u32 ref_count; + u32 nritems; + int err = 0; + int added = 0; + int counted; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ref_root = btrfs_extent_data_ref_root(leaf, ref); + ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); + ref_offset = btrfs_extent_data_ref_offset(leaf, ref); + ref_count = btrfs_extent_data_ref_count(leaf, ref); + + root = read_fs_root(rc->extent_root->fs_info, ref_root); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + + key.objectid = ref_objectid; + key.offset = ref_offset; + key.type = BTRFS_EXTENT_DATA_KEY; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + /* + * the references in tree blocks that use full backrefs + * are not counted in + */ + if (block_use_full_backref(rc, leaf)) + counted = 0; + else + counted = 1; + rb_node = tree_search(blocks, leaf->start); + if (rb_node) { + if (counted) + added = 1; + else + path->slots[0] = nritems; + } + + while (ref_count > 0) { + while (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + WARN_ON(1); + goto out; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + added = 0; + + if (block_use_full_backref(rc, leaf)) + counted = 0; + else + counted = 1; + rb_node = tree_search(blocks, leaf->start); + if (rb_node) { + if (counted) + added = 1; + else + path->slots[0] = nritems; + } + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ref_objectid || + key.type != BTRFS_EXTENT_DATA_KEY) { + WARN_ON(1); + break; + } + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + goto next; + + if (btrfs_file_extent_disk_bytenr(leaf, fi) != + extent_key->objectid) + goto next; + + key.offset -= btrfs_file_extent_offset(leaf, fi); + if (key.offset != ref_offset) + goto next; + + if (counted) + ref_count--; + if (added) + goto next; + + if (!tree_block_processed(leaf->start, leaf->len, rc)) { + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) { + err = -ENOMEM; + break; + } + block->bytenr = leaf->start; + btrfs_item_key_to_cpu(leaf, &block->key, 0); + block->level = 0; + block->key_ready = 1; + rb_node = tree_insert(blocks, block->bytenr, + &block->rb_node); + BUG_ON(rb_node); + } + if (counted) + added = 1; + else + path->slots[0] = nritems; +next: + path->slots[0]++; + + } +out: + btrfs_free_path(path); + return err; +} + +/* + * hepler to find all tree blocks that reference a given data extent + */ +static noinline_for_stack +int add_data_references(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) +{ + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_extent_data_ref *dref; + struct btrfs_extent_inline_ref *iref; + unsigned long ptr; + unsigned long end; + u32 blocksize; + int ret; + int err = 0; + + ret = get_new_location(rc->data_inode, NULL, extent_key->objectid, + extent_key->offset); + BUG_ON(ret < 0); + if (ret > 0) { + /* the relocated data is fragmented */ + rc->extents_skipped++; + btrfs_release_path(rc->extent_root, path); + return 0; + } + + blocksize = btrfs_level_size(rc->extent_root, 0); + + eb = path->nodes[0]; + ptr = btrfs_item_ptr_offset(eb, path->slots[0]); + end = ptr + btrfs_item_size_nr(eb, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (ptr + sizeof(struct btrfs_extent_item_v0) == end) + ptr = end; + else +#endif + ptr += sizeof(struct btrfs_extent_item); + + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + key.type = btrfs_extent_inline_ref_type(eb, iref); + if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + key.offset = btrfs_extent_inline_ref_offset(eb, iref); + ret = __add_tree_block(rc, key.offset, blocksize, + blocks); + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + ret = find_data_references(rc, extent_key, + eb, dref, blocks); + } else { + BUG(); + } + ptr += btrfs_extent_inline_ref_size(key.type); + } + WARN_ON(ptr > end); + + while (1) { + cond_resched(); + eb = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret < 0) { + err = ret; + break; + } + if (ret > 0) + break; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + if (key.objectid != extent_key->objectid) + break; + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (key.type == BTRFS_SHARED_DATA_REF_KEY || + key.type == BTRFS_EXTENT_REF_V0_KEY) { +#else + BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); + if (key.type == BTRFS_SHARED_DATA_REF_KEY) { +#endif + ret = __add_tree_block(rc, key.offset, blocksize, + blocks); + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_data_ref); + ret = find_data_references(rc, extent_key, + eb, dref, blocks); + } else { + ret = 0; + } + if (ret) { + err = ret; + break; + } + path->slots[0]++; + } + btrfs_release_path(rc->extent_root, path); + if (err) + free_block_list(blocks); + return err; +} + +/* + * hepler to find next unprocessed extent + */ +static noinline_for_stack +int find_next_extent(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct btrfs_path *path) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + u64 start, end, last; + int ret; + + last = rc->block_group->key.objectid + rc->block_group->key.offset; + while (1) { + cond_resched(); + if (rc->search_start >= last) { + ret = 1; + break; + } + + key.objectid = rc->search_start; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, + 0, 0); + if (ret < 0) + break; +next: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret != 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid >= last) { + ret = 1; + break; + } + + if (key.type != BTRFS_EXTENT_ITEM_KEY || + key.objectid + key.offset <= rc->search_start) { + path->slots[0]++; + goto next; + } + + ret = find_first_extent_bit(&rc->processed_blocks, + key.objectid, &start, &end, + EXTENT_DIRTY); + + if (ret == 0 && start <= key.objectid) { + btrfs_release_path(rc->extent_root, path); + rc->search_start = end + 1; + } else { + rc->search_start = key.objectid + key.offset; + return 0; + } + } + btrfs_release_path(rc->extent_root, path); + return ret; +} + +static void set_reloc_control(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + mutex_lock(&fs_info->trans_mutex); + fs_info->reloc_ctl = rc; + mutex_unlock(&fs_info->trans_mutex); +} + +static void unset_reloc_control(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + mutex_lock(&fs_info->trans_mutex); + fs_info->reloc_ctl = NULL; + mutex_unlock(&fs_info->trans_mutex); +} + +static int check_extent_flags(u64 flags) +{ + if ((flags & BTRFS_EXTENT_FLAG_DATA) && + (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return 1; + if (!(flags & BTRFS_EXTENT_FLAG_DATA) && + !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return 1; + if ((flags & BTRFS_EXTENT_FLAG_DATA) && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + return 1; + return 0; +} + + +static noinline_for_stack int relocate_block_group(struct reloc_control *rc) +{ + struct rb_root blocks = RB_ROOT; + struct btrfs_key key; + struct file_extent_cluster *cluster; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + unsigned long nr; + u64 flags; + u32 item_size; + int ret; + int err = 0; + + cluster = kzalloc(sizeof(*cluster), GFP_NOFS); + if (!cluster) + return -ENOMEM; + + path = btrfs_alloc_path(); + if (!path) { + kfree(cluster); + return -ENOMEM; + } + + rc->extents_found = 0; + rc->extents_skipped = 0; + + rc->search_start = rc->block_group->key.objectid; + clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, + GFP_NOFS); + + rc->create_reloc_root = 1; + set_reloc_control(rc); + + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + + while (1) { + trans = btrfs_start_transaction(rc->extent_root, 1); + + ret = find_next_extent(trans, rc, path); + if (ret < 0) + err = ret; + if (ret != 0) + break; + + rc->extents_found++; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + item_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (item_size >= sizeof(*ei)) { + flags = btrfs_extent_flags(path->nodes[0], ei); + ret = check_extent_flags(flags); + BUG_ON(ret); + + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + u64 ref_owner; + int path_change = 0; + + BUG_ON(item_size != + sizeof(struct btrfs_extent_item_v0)); + ret = get_ref_objectid_v0(rc, path, &key, &ref_owner, + &path_change); + if (ref_owner < BTRFS_FIRST_FREE_OBJECTID) + flags = BTRFS_EXTENT_FLAG_TREE_BLOCK; + else + flags = BTRFS_EXTENT_FLAG_DATA; + + if (path_change) { + btrfs_release_path(rc->extent_root, path); + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, + &key, path, 0, 0); + if (ret < 0) { + err = ret; + break; + } + BUG_ON(ret > 0); + } +#else + BUG(); +#endif + } + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ret = add_tree_block(rc, &key, path, &blocks); + } else if (rc->stage == UPDATE_DATA_PTRS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + ret = add_data_references(rc, &key, path, &blocks); + } else { + btrfs_release_path(rc->extent_root, path); + ret = 0; + } + if (ret < 0) { + err = 0; + break; + } + + if (!RB_EMPTY_ROOT(&blocks)) { + ret = relocate_tree_blocks(trans, rc, &blocks); + if (ret < 0) { + err = ret; + break; + } + } + + nr = trans->blocks_used; + btrfs_end_transaction(trans, rc->extent_root); + trans = NULL; + btrfs_btree_balance_dirty(rc->extent_root, nr); + + if (rc->stage == MOVE_DATA_EXTENTS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + rc->found_file_extent = 1; + ret = relocate_data_extent(rc->data_inode, + &key, cluster); + if (ret < 0) { + err = ret; + break; + } + } + } + btrfs_free_path(path); + + if (trans) { + nr = trans->blocks_used; + btrfs_end_transaction(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root, nr); + } + + if (!err) { + ret = relocate_file_extent_cluster(rc->data_inode, cluster); + if (ret < 0) + err = ret; + } + + kfree(cluster); + + rc->create_reloc_root = 0; + smp_mb(); + + if (rc->extents_found > 0) { + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + } + + merge_reloc_roots(rc); + + unset_reloc_control(rc); + + /* get rid of pinned extents */ + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + + return err; +} + +static int __insert_orphan_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) +{ + struct btrfs_path *path; + struct btrfs_inode_item *item; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_inode(trans, root, path, objectid); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + btrfs_set_inode_generation(leaf, item, 1); + btrfs_set_inode_size(leaf, item, 0); + btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to create inode for data relocation. + * the inode is in data relocation tree and its link count is 0 + */ +static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *group) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + struct btrfs_key key; + unsigned long nr; + u64 objectid = BTRFS_FIRST_FREE_OBJECTID; + int err = 0; + + root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); + if (IS_ERR(root)) + return ERR_CAST(root); + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + err = btrfs_find_free_objectid(trans, root, objectid, &objectid); + if (err) + goto out; + + err = __insert_orphan_inode(trans, root, objectid); + BUG_ON(err); + + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &key, root); + BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); + BTRFS_I(inode)->index_cnt = group->key.objectid; + + err = btrfs_orphan_add(trans, inode); +out: + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + + btrfs_btree_balance_dirty(root, nr); + if (err) { + if (inode) + iput(inode); + inode = ERR_PTR(err); + } + return inode; +} + +/* + * function to relocate all extents in a block group. + */ +int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) +{ + struct btrfs_fs_info *fs_info = extent_root->fs_info; + struct reloc_control *rc; + int ret; + int err = 0; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return -ENOMEM; + + mapping_tree_init(&rc->reloc_root_tree); + extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS); + INIT_LIST_HEAD(&rc->reloc_roots); + + rc->block_group = btrfs_lookup_block_group(fs_info, group_start); + BUG_ON(!rc->block_group); + + btrfs_init_workers(&rc->workers, "relocate", + fs_info->thread_pool_size, NULL); + + rc->extent_root = extent_root; + btrfs_prepare_block_group_relocation(extent_root, rc->block_group); + + rc->data_inode = create_reloc_inode(fs_info, rc->block_group); + if (IS_ERR(rc->data_inode)) { + err = PTR_ERR(rc->data_inode); + rc->data_inode = NULL; + goto out; + } + + printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", + (unsigned long long)rc->block_group->key.objectid, + (unsigned long long)rc->block_group->flags); + + btrfs_start_delalloc_inodes(fs_info->tree_root, 0); + btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); + + while (1) { + rc->extents_found = 0; + rc->extents_skipped = 0; + + mutex_lock(&fs_info->cleaner_mutex); + + btrfs_clean_old_snapshots(fs_info->tree_root); + ret = relocate_block_group(rc); + + mutex_unlock(&fs_info->cleaner_mutex); + if (ret < 0) { + err = ret; + break; + } + + if (rc->extents_found == 0) + break; + + printk(KERN_INFO "btrfs: found %llu extents\n", + (unsigned long long)rc->extents_found); + + if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { + btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); + invalidate_mapping_pages(rc->data_inode->i_mapping, + 0, -1); + rc->stage = UPDATE_DATA_PTRS; + } else if (rc->stage == UPDATE_DATA_PTRS && + rc->extents_skipped >= rc->extents_found) { + iput(rc->data_inode); + rc->data_inode = create_reloc_inode(fs_info, + rc->block_group); + if (IS_ERR(rc->data_inode)) { + err = PTR_ERR(rc->data_inode); + rc->data_inode = NULL; + break; + } + rc->stage = MOVE_DATA_EXTENTS; + rc->found_file_extent = 0; + } + } + + filemap_write_and_wait_range(fs_info->btree_inode->i_mapping, + rc->block_group->key.objectid, + rc->block_group->key.objectid + + rc->block_group->key.offset - 1); + + WARN_ON(rc->block_group->pinned > 0); + WARN_ON(rc->block_group->reserved > 0); + WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); +out: + iput(rc->data_inode); + btrfs_stop_workers(&rc->workers); + btrfs_put_block_group(rc->block_group); + kfree(rc); + return err; +} + +static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + trans = btrfs_start_transaction(root->fs_info->tree_root, 1); + + memset(&root->root_item.drop_progress, 0, + sizeof(root->root_item.drop_progress)); + root->root_item.drop_level = 0; + btrfs_set_root_refs(&root->root_item, 0); + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + BUG_ON(ret); + + ret = btrfs_end_transaction(trans, root->fs_info->tree_root); + BUG_ON(ret); + return 0; +} + +/* + * recover relocation interrupted by system crash. + * + * this function resumes merging reloc trees with corresponding fs trees. + * this is important for keeping the sharing of tree blocks + */ +int btrfs_recover_relocation(struct btrfs_root *root) +{ + LIST_HEAD(reloc_roots); + struct btrfs_key key; + struct btrfs_root *fs_root; + struct btrfs_root *reloc_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct reloc_control *rc = NULL; + struct btrfs_trans_handle *trans; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_TREE_RELOC_OBJECTID; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, + path, 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(root->fs_info->tree_root, path); + + if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || + key.type != BTRFS_ROOT_ITEM_KEY) + break; + + reloc_root = btrfs_read_fs_root_no_radix(root, &key); + if (IS_ERR(reloc_root)) { + err = PTR_ERR(reloc_root); + goto out; + } + + list_add(&reloc_root->root_list, &reloc_roots); + + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + fs_root = read_fs_root(root->fs_info, + reloc_root->root_key.offset); + if (IS_ERR(fs_root)) { + ret = PTR_ERR(fs_root); + if (ret != -ENOENT) { + err = ret; + goto out; + } + mark_garbage_root(reloc_root); + } + } + + if (key.offset == 0) + break; + + key.offset--; + } + btrfs_release_path(root->fs_info->tree_root, path); + + if (list_empty(&reloc_roots)) + goto out; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) { + err = -ENOMEM; + goto out; + } + + mapping_tree_init(&rc->reloc_root_tree); + INIT_LIST_HEAD(&rc->reloc_roots); + btrfs_init_workers(&rc->workers, "relocate", + root->fs_info->thread_pool_size, NULL); + rc->extent_root = root->fs_info->extent_root; + + set_reloc_control(rc); + + while (!list_empty(&reloc_roots)) { + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + list_del(&reloc_root->root_list); + + if (btrfs_root_refs(&reloc_root->root_item) == 0) { + list_add_tail(&reloc_root->root_list, + &rc->reloc_roots); + continue; + } + + fs_root = read_fs_root(root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(fs_root)); + + __add_reloc_root(reloc_root); + fs_root->reloc_root = reloc_root; + } + + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + + merge_reloc_roots(rc); + + unset_reloc_control(rc); + + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); +out: + if (rc) { + btrfs_stop_workers(&rc->workers); + kfree(rc); + } + while (!list_empty(&reloc_roots)) { + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + list_del(&reloc_root->root_list); + free_extent_buffer(reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + kfree(reloc_root); + } + btrfs_free_path(path); + + if (err == 0) { + /* cleanup orphan inode in data relocation tree */ + fs_root = read_fs_root(root->fs_info, + BTRFS_DATA_RELOC_TREE_OBJECTID); + if (IS_ERR(fs_root)) + err = PTR_ERR(fs_root); + else + btrfs_orphan_cleanup(fs_root); + } + return err; +} + +/* + * helper to add ordered checksum for data relocation. + * + * cloning checksum properly handles the nodatasum extents. + * it also saves CPU time to re-calculate the checksum. + */ +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +{ + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + size_t offset; + int ret; + u64 disk_bytenr; + LIST_HEAD(list); + + ordered = btrfs_lookup_ordered_extent(inode, file_pos); + BUG_ON(ordered->file_offset != file_pos || ordered->len != len); + + disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, + disk_bytenr + len - 1, &list); + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del_init(&sums->list); + + sector_sum = sums->sums; + sums->bytenr = ordered->start; + + offset = 0; + while (offset < sums->len) { + sector_sum->bytenr += ordered->start - disk_bytenr; + sector_sum++; + offset += root->sectorsize; + } + + btrfs_add_ordered_sum(inode, ordered, sums); + } + btrfs_put_ordered_extent(ordered); + return 0; +} diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c new file mode 100644 index 00000000000..67fa2d29d66 --- /dev/null +++ b/fs/btrfs/root-tree.c @@ -0,0 +1,461 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "print-tree.h" + +/* + * search forward for a root, starting with objectid 'search_start' + * if a root key is found, the objectid we find is filled into 'found_objectid' + * and 0 is returned. < 0 is returned on error, 1 if there is nothing + * left in the tree. + */ +int btrfs_search_root(struct btrfs_root *root, u64 search_start, + u64 *found_objectid) +{ + struct btrfs_path *path; + struct btrfs_key search_key; + int ret; + + root = root->fs_info->tree_root; + search_key.objectid = search_start; + search_key.type = (u8)-1; + search_key.offset = (u64)-1; + + path = btrfs_alloc_path(); + BUG_ON(!path); +again: + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; + if (ret == 0) { + ret = 1; + goto out; + } + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + } + btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]); + if (search_key.type != BTRFS_ROOT_ITEM_KEY) { + search_key.offset++; + btrfs_release_path(root, path); + goto again; + } + ret = 0; + *found_objectid = search_key.objectid; + +out: + btrfs_free_path(path); + return ret; +} + +/* + * lookup the root with the highest offset for a given objectid. The key we do + * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 + * on error. + */ +int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, + struct btrfs_root_item *item, struct btrfs_key *key) +{ + struct btrfs_path *path; + struct btrfs_key search_key; + struct btrfs_key found_key; + struct extent_buffer *l; + int ret; + int slot; + + search_key.objectid = objectid; + search_key.type = BTRFS_ROOT_ITEM_KEY; + search_key.offset = (u64)-1; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; + + BUG_ON(ret == 0); + if (path->slots[0] == 0) { + ret = 1; + goto out; + } + l = path->nodes[0]; + slot = path->slots[0] - 1; + btrfs_item_key_to_cpu(l, &found_key, slot); + if (found_key.objectid != objectid || + found_key.type != BTRFS_ROOT_ITEM_KEY) { + ret = 1; + goto out; + } + if (item) + read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), + sizeof(*item)); + if (key) + memcpy(key, &found_key, sizeof(found_key)); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node) +{ + btrfs_set_root_bytenr(item, node->start); + btrfs_set_root_level(item, btrfs_header_level(node)); + btrfs_set_root_generation(item, btrfs_header_generation(node)); + return 0; +} + +/* + * copy the data in 'item' into the btree + */ +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) +{ + struct btrfs_path *path; + struct extent_buffer *l; + int ret; + int slot; + unsigned long ptr; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret != 0) { + btrfs_print_leaf(root, path->nodes[0]); + printk(KERN_CRIT "unable to update root key %llu %u %llu\n", + (unsigned long long)key->objectid, key->type, + (unsigned long long)key->offset); + BUG_ON(1); + } + + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr_offset(l, slot); + write_extent_buffer(l, item, ptr, sizeof(*item)); + btrfs_mark_buffer_dirty(path->nodes[0]); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) +{ + int ret; + ret = btrfs_insert_item(trans, root, key, item, sizeof(*item)); + return ret; +} + +/* + * at mount time we want to find all the old transaction snapshots that were in + * the process of being deleted if we crashed. This is any root item with an + * offset lower than the latest root. They need to be queued for deletion to + * finish what was happening when we crashed. + */ +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) +{ + struct btrfs_root *dead_root; + struct btrfs_item *item; + struct btrfs_root_item *ri; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + int ret; + u32 nritems; + struct extent_buffer *leaf; + int slot; + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = 0; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +again: + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + if (slot >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + } + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) + goto next; + + if (key.objectid < objectid) + goto next; + + if (key.objectid > objectid) + break; + + ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item); + if (btrfs_disk_root_refs(leaf, ri) != 0) + goto next; + + memcpy(&found_key, &key, sizeof(key)); + key.offset++; + btrfs_release_path(root, path); + dead_root = + btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &found_key); + if (IS_ERR(dead_root)) { + ret = PTR_ERR(dead_root); + goto err; + } + + ret = btrfs_add_dead_root(dead_root); + if (ret) + goto err; + goto again; +next: + slot++; + path->slots[0]++; + } + ret = 0; +err: + btrfs_free_path(path); + return ret; +} + +int btrfs_find_orphan_roots(struct btrfs_root *tree_root) +{ + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_key key; + int err = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = 0; + + while (1) { + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + if (ret < 0) { + err = ret; + break; + } + + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(tree_root, path); + if (ret < 0) + err = ret; + if (ret != 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(tree_root, path); + + if (key.objectid != BTRFS_ORPHAN_OBJECTID || + key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + ret = btrfs_find_dead_roots(tree_root, key.offset); + if (ret) { + err = ret; + break; + } + + key.offset++; + } + + btrfs_free_path(path); + return err; +} + +/* drop the root item for 'key' from 'root' */ +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key) +{ + struct btrfs_path *path; + int ret; + u32 refs; + struct btrfs_root_item *ri; + struct extent_buffer *leaf; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, key, path, -1, 1); + if (ret < 0) + goto out; + + BUG_ON(ret != 0); + leaf = path->nodes[0]; + ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); + + refs = btrfs_disk_root_refs(leaf, ri); + BUG_ON(refs != 0); + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, + const char *name, int name_len) + +{ + struct btrfs_path *path; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + int err = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = root_id; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +again: + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + BUG_ON(ret < 0); + if (ret == 0) { + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + + WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); + WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); + ptr = (unsigned long)(ref + 1); + WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); + *sequence = btrfs_root_ref_sequence(leaf, ref); + + ret = btrfs_del_item(trans, tree_root, path); + BUG_ON(ret); + } else + err = -ENOENT; + + if (key.type == BTRFS_ROOT_BACKREF_KEY) { + btrfs_release_path(tree_root, path); + key.objectid = ref_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = root_id; + goto again; + } + + btrfs_free_path(path); + return err; +} + +int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, + u64 root_id, u64 ref_id) +{ + struct btrfs_key key; + int ret; + + key.objectid = root_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = ref_id; + + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + return ret; +} + +/* + * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY + * or BTRFS_ROOT_BACKREF_KEY. + * + * The dirid, sequence, name and name_len refer to the directory entry + * that is referencing the root. + * + * For a forward ref, the root_id is the id of the tree referencing + * the root and ref_id is the id of the subvol or snapshot. + * + * For a back ref the root_id is the id of the subvol or snapshot and + * ref_id is the id of the tree referencing it. + */ +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 sequence, + const char *name, int name_len) +{ + struct btrfs_key key; + int ret; + struct btrfs_path *path; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = root_id; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +again: + ret = btrfs_insert_empty_item(trans, tree_root, path, &key, + sizeof(*ref) + name_len); + BUG_ON(ret); + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + btrfs_set_root_ref_dirid(leaf, ref, dirid); + btrfs_set_root_ref_sequence(leaf, ref, sequence); + btrfs_set_root_ref_name_len(leaf, ref, name_len); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(leaf, name, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + + if (key.type == BTRFS_ROOT_BACKREF_KEY) { + btrfs_release_path(tree_root, path); + key.objectid = ref_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = root_id; + goto again; + } + + btrfs_free_path(path); + return 0; +} diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c new file mode 100644 index 00000000000..c0f7ecaf1e7 --- /dev/null +++ b/fs/btrfs/struct-funcs.c @@ -0,0 +1,139 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/highmem.h> + +/* this is some deeply nasty code. ctree.h has a different + * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef + * + * The end result is that anyone who #includes ctree.h gets a + * declaration for the btrfs_set_foo functions and btrfs_foo functions + * + * This file declares the macros and then #includes ctree.h, which results + * in cpp creating the function here based on the template below. + * + * These setget functions do all the extent_buffer related mapping + * required to efficiently read and write specific fields in the extent + * buffers. Every pointer to metadata items in btrfs is really just + * an unsigned long offset into the extent buffer which has been + * cast to a specific type. This gives us all the gcc type checking. + * + * The extent buffer api is used to do all the kmapping and page + * spanning work required to get extent buffers in highmem and have + * a metadata blocksize different from the page size. + * + * The macro starts with a simple function prototype declaration so that + * sparse won't complain about it being static. + */ + +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ +u##bits btrfs_##name(struct extent_buffer *eb, \ + type *s) \ +{ \ + unsigned long part_offset = (unsigned long)s; \ + unsigned long offset = part_offset + offsetof(type, member); \ + type *p; \ + /* ugly, but we want the fast path here */ \ + if (eb->map_token && offset >= eb->map_start && \ + offset + sizeof(((type *)0)->member) <= eb->map_start + \ + eb->map_len) { \ + p = (type *)(eb->kaddr + part_offset - eb->map_start); \ + return le##bits##_to_cpu(p->member); \ + } \ + { \ + int err; \ + char *map_token; \ + char *kaddr; \ + int unmap_on_exit = (eb->map_token == NULL); \ + unsigned long map_start; \ + unsigned long map_len; \ + u##bits res; \ + err = map_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &map_token, &kaddr, \ + &map_start, &map_len, KM_USER1); \ + if (err) { \ + __le##bits leres; \ + read_eb_member(eb, s, type, member, &leres); \ + return le##bits##_to_cpu(leres); \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + res = le##bits##_to_cpu(p->member); \ + if (unmap_on_exit) \ + unmap_extent_buffer(eb, map_token, KM_USER1); \ + return res; \ + } \ +} \ +void btrfs_set_##name(struct extent_buffer *eb, \ + type *s, u##bits val) \ +{ \ + unsigned long part_offset = (unsigned long)s; \ + unsigned long offset = part_offset + offsetof(type, member); \ + type *p; \ + /* ugly, but we want the fast path here */ \ + if (eb->map_token && offset >= eb->map_start && \ + offset + sizeof(((type *)0)->member) <= eb->map_start + \ + eb->map_len) { \ + p = (type *)(eb->kaddr + part_offset - eb->map_start); \ + p->member = cpu_to_le##bits(val); \ + return; \ + } \ + { \ + int err; \ + char *map_token; \ + char *kaddr; \ + int unmap_on_exit = (eb->map_token == NULL); \ + unsigned long map_start; \ + unsigned long map_len; \ + err = map_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &map_token, &kaddr, \ + &map_start, &map_len, KM_USER1); \ + if (err) { \ + __le##bits val2; \ + val2 = cpu_to_le##bits(val); \ + write_eb_member(eb, s, type, member, &val2); \ + return; \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + p->member = cpu_to_le##bits(val); \ + if (unmap_on_exit) \ + unmap_extent_buffer(eb, map_token, KM_USER1); \ + } \ +} + +#include "ctree.h" + +void btrfs_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr = btrfs_node_key_ptr_offset(nr); + if (eb->map_token && ptr >= eb->map_start && + ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) { + memcpy(disk_key, eb->kaddr + ptr - eb->map_start, + sizeof(*disk_key)); + return; + } else if (eb->map_token) { + unmap_extent_buffer(eb, eb->map_token, KM_USER1); + eb->map_token = NULL; + } + read_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c new file mode 100644 index 00000000000..8a1ea6e6457 --- /dev/null +++ b/fs/btrfs/super.c @@ -0,0 +1,799 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/seq_file.h> +#include <linux/string.h> +#include <linux/backing-dev.h> +#include <linux/mount.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/parser.h> +#include <linux/ctype.h> +#include <linux/namei.h> +#include <linux/miscdevice.h> +#include <linux/magic.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "xattr.h" +#include "volumes.h" +#include "version.h" +#include "export.h" +#include "compression.h" + +static const struct super_operations btrfs_super_ops; + +static void btrfs_put_super(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + ret = close_ctree(root); + sb->s_fs_info = NULL; +} + +enum { + Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, + Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, + Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, + Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, + Opt_flushoncommit, + Opt_discard, Opt_err, +}; + +static match_table_t tokens = { + {Opt_degraded, "degraded"}, + {Opt_subvol, "subvol=%s"}, + {Opt_device, "device=%s"}, + {Opt_nodatasum, "nodatasum"}, + {Opt_nodatacow, "nodatacow"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_max_extent, "max_extent=%s"}, + {Opt_max_inline, "max_inline=%s"}, + {Opt_alloc_start, "alloc_start=%s"}, + {Opt_thread_pool, "thread_pool=%d"}, + {Opt_compress, "compress"}, + {Opt_compress_force, "compress-force"}, + {Opt_ssd, "ssd"}, + {Opt_ssd_spread, "ssd_spread"}, + {Opt_nossd, "nossd"}, + {Opt_noacl, "noacl"}, + {Opt_notreelog, "notreelog"}, + {Opt_flushoncommit, "flushoncommit"}, + {Opt_ratio, "metadata_ratio=%d"}, + {Opt_discard, "discard"}, + {Opt_err, NULL}, +}; + +u64 btrfs_parse_size(char *str) +{ + u64 res; + int mult = 1; + char *end; + char last; + + res = simple_strtoul(str, &end, 10); + + last = end[0]; + if (isalpha(last)) { + last = tolower(last); + switch (last) { + case 'g': + mult *= 1024; + case 'm': + mult *= 1024; + case 'k': + mult *= 1024; + } + res = res * mult; + } + return res; +} + +/* + * Regular mount options parser. Everything that is needed only when + * reading in a new superblock is parsed here. + */ +int btrfs_parse_options(struct btrfs_root *root, char *options) +{ + struct btrfs_fs_info *info = root->fs_info; + substring_t args[MAX_OPT_ARGS]; + char *p, *num; + int intarg; + int ret = 0; + + if (!options) + return 0; + + /* + * strsep changes the string, duplicate it because parse_options + * gets called twice + */ + options = kstrdup(options, GFP_NOFS); + if (!options) + return -ENOMEM; + + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_degraded: + printk(KERN_INFO "btrfs: allowing degraded mounts\n"); + btrfs_set_opt(info->mount_opt, DEGRADED); + break; + case Opt_subvol: + case Opt_device: + /* + * These are parsed by btrfs_parse_early_options + * and can be happily ignored here. + */ + break; + case Opt_nodatasum: + printk(KERN_INFO "btrfs: setting nodatasum\n"); + btrfs_set_opt(info->mount_opt, NODATASUM); + break; + case Opt_nodatacow: + printk(KERN_INFO "btrfs: setting nodatacow\n"); + btrfs_set_opt(info->mount_opt, NODATACOW); + btrfs_set_opt(info->mount_opt, NODATASUM); + break; + case Opt_compress: + printk(KERN_INFO "btrfs: use compression\n"); + btrfs_set_opt(info->mount_opt, COMPRESS); + break; + case Opt_compress_force: + printk(KERN_INFO "btrfs: forcing compression\n"); + btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); + btrfs_set_opt(info->mount_opt, COMPRESS); + break; + case Opt_ssd: + printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); + btrfs_set_opt(info->mount_opt, SSD); + break; + case Opt_ssd_spread: + printk(KERN_INFO "btrfs: use spread ssd " + "allocation scheme\n"); + btrfs_set_opt(info->mount_opt, SSD); + btrfs_set_opt(info->mount_opt, SSD_SPREAD); + break; + case Opt_nossd: + printk(KERN_INFO "btrfs: not using ssd allocation " + "scheme\n"); + btrfs_set_opt(info->mount_opt, NOSSD); + btrfs_clear_opt(info->mount_opt, SSD); + btrfs_clear_opt(info->mount_opt, SSD_SPREAD); + break; + case Opt_nobarrier: + printk(KERN_INFO "btrfs: turning off barriers\n"); + btrfs_set_opt(info->mount_opt, NOBARRIER); + break; + case Opt_thread_pool: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->thread_pool_size = intarg; + printk(KERN_INFO "btrfs: thread pool %d\n", + info->thread_pool_size); + } + break; + case Opt_max_extent: + num = match_strdup(&args[0]); + if (num) { + info->max_extent = btrfs_parse_size(num); + kfree(num); + + info->max_extent = max_t(u64, + info->max_extent, root->sectorsize); + printk(KERN_INFO "btrfs: max_extent at %llu\n", + (unsigned long long)info->max_extent); + } + break; + case Opt_max_inline: + num = match_strdup(&args[0]); + if (num) { + info->max_inline = btrfs_parse_size(num); + kfree(num); + + if (info->max_inline) { + info->max_inline = max_t(u64, + info->max_inline, + root->sectorsize); + } + printk(KERN_INFO "btrfs: max_inline at %llu\n", + (unsigned long long)info->max_inline); + } + break; + case Opt_alloc_start: + num = match_strdup(&args[0]); + if (num) { + info->alloc_start = btrfs_parse_size(num); + kfree(num); + printk(KERN_INFO + "btrfs: allocations start at %llu\n", + (unsigned long long)info->alloc_start); + } + break; + case Opt_noacl: + root->fs_info->sb->s_flags &= ~MS_POSIXACL; + break; + case Opt_notreelog: + printk(KERN_INFO "btrfs: disabling tree log\n"); + btrfs_set_opt(info->mount_opt, NOTREELOG); + break; + case Opt_flushoncommit: + printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); + btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); + break; + case Opt_ratio: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->metadata_ratio = intarg; + printk(KERN_INFO "btrfs: metadata ratio %d\n", + info->metadata_ratio); + } + break; + case Opt_discard: + btrfs_set_opt(info->mount_opt, DISCARD); + break; + case Opt_err: + printk(KERN_INFO "btrfs: unrecognized mount option " + "'%s'\n", p); + ret = -EINVAL; + goto out; + default: + break; + } + } +out: + kfree(options); + return ret; +} + +/* + * Parse mount options that are required early in the mount process. + * + * All other options will be parsed on much later in the mount process and + * only when we need to allocate a new super block. + */ +static int btrfs_parse_early_options(const char *options, fmode_t flags, + void *holder, char **subvol_name, + struct btrfs_fs_devices **fs_devices) +{ + substring_t args[MAX_OPT_ARGS]; + char *opts, *p; + int error = 0; + + if (!options) + goto out; + + /* + * strsep changes the string, duplicate it because parse_options + * gets called twice + */ + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + return -ENOMEM; + + while ((p = strsep(&opts, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_subvol: + *subvol_name = match_strdup(&args[0]); + break; + case Opt_device: + error = btrfs_scan_one_device(match_strdup(&args[0]), + flags, holder, fs_devices); + if (error) + goto out_free_opts; + break; + default: + break; + } + } + + out_free_opts: + kfree(opts); + out: + /* + * If no subvolume name is specified we use the default one. Allocate + * a copy of the string "." here so that code later in the + * mount path doesn't care if it's the default volume or another one. + */ + if (!*subvol_name) { + *subvol_name = kstrdup(".", GFP_KERNEL); + if (!*subvol_name) + return -ENOMEM; + } + return error; +} + +static int btrfs_fill_super(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + void *data, int silent) +{ + struct inode *inode; + struct dentry *root_dentry; + struct btrfs_super_block *disk_super; + struct btrfs_root *tree_root; + struct btrfs_key key; + int err; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_magic = BTRFS_SUPER_MAGIC; + sb->s_op = &btrfs_super_ops; + sb->s_export_op = &btrfs_export_ops; + sb->s_xattr = btrfs_xattr_handlers; + sb->s_time_gran = 1; +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + sb->s_flags |= MS_POSIXACL; +#endif + + tree_root = open_ctree(sb, fs_devices, (char *)data); + + if (IS_ERR(tree_root)) { + printk("btrfs: open_ctree failed\n"); + return PTR_ERR(tree_root); + } + sb->s_fs_info = tree_root; + disk_super = &tree_root->fs_info->super_copy; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto fail_close; + } + + root_dentry = d_alloc_root(inode); + if (!root_dentry) { + iput(inode); + err = -ENOMEM; + goto fail_close; + } +#if 0 + /* this does the super kobj at the same time */ + err = btrfs_sysfs_add_super(tree_root->fs_info); + if (err) + goto fail_close; +#endif + + sb->s_root = root_dentry; + + save_mount_options(sb, data); + return 0; + +fail_close: + close_ctree(tree_root); + return err; +} + +int btrfs_sync_fs(struct super_block *sb, int wait) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + if (!wait) { + filemap_flush(root->fs_info->btree_inode->i_mapping); + return 0; + } + + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0, 0); + + trans = btrfs_start_transaction(root, 1); + ret = btrfs_commit_transaction(trans, root); + return ret; +} + +static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); + struct btrfs_fs_info *info = root->fs_info; + + if (btrfs_test_opt(root, DEGRADED)) + seq_puts(seq, ",degraded"); + if (btrfs_test_opt(root, NODATASUM)) + seq_puts(seq, ",nodatasum"); + if (btrfs_test_opt(root, NODATACOW)) + seq_puts(seq, ",nodatacow"); + if (btrfs_test_opt(root, NOBARRIER)) + seq_puts(seq, ",nobarrier"); + if (info->max_extent != (u64)-1) + seq_printf(seq, ",max_extent=%llu", + (unsigned long long)info->max_extent); + if (info->max_inline != 8192 * 1024) + seq_printf(seq, ",max_inline=%llu", + (unsigned long long)info->max_inline); + if (info->alloc_start != 0) + seq_printf(seq, ",alloc_start=%llu", + (unsigned long long)info->alloc_start); + if (info->thread_pool_size != min_t(unsigned long, + num_online_cpus() + 2, 8)) + seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); + if (btrfs_test_opt(root, COMPRESS)) + seq_puts(seq, ",compress"); + if (btrfs_test_opt(root, NOSSD)) + seq_puts(seq, ",nossd"); + if (btrfs_test_opt(root, SSD_SPREAD)) + seq_puts(seq, ",ssd_spread"); + else if (btrfs_test_opt(root, SSD)) + seq_puts(seq, ",ssd"); + if (btrfs_test_opt(root, NOTREELOG)) + seq_puts(seq, ",notreelog"); + if (btrfs_test_opt(root, FLUSHONCOMMIT)) + seq_puts(seq, ",flushoncommit"); + if (btrfs_test_opt(root, DISCARD)) + seq_puts(seq, ",discard"); + if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) + seq_puts(seq, ",noacl"); + return 0; +} + +static int btrfs_test_super(struct super_block *s, void *data) +{ + struct btrfs_fs_devices *test_fs_devices = data; + struct btrfs_root *root = btrfs_sb(s); + + return root->fs_info->fs_devices == test_fs_devices; +} + +/* + * Find a superblock for the given device / mount point. + * + * Note: This is based on get_sb_bdev from fs/super.c with a few additions + * for multiple device setup. Make sure to keep it in sync. + */ +static int btrfs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + char *subvol_name = NULL; + struct block_device *bdev = NULL; + struct super_block *s; + struct dentry *root; + struct btrfs_fs_devices *fs_devices = NULL; + fmode_t mode = FMODE_READ; + int error = 0; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + error = btrfs_parse_early_options(data, mode, fs_type, + &subvol_name, &fs_devices); + if (error) + return error; + + error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices); + if (error) + goto error_free_subvol_name; + + error = btrfs_open_devices(fs_devices, mode, fs_type); + if (error) + goto error_free_subvol_name; + + if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + error = -EACCES; + goto error_close_devices; + } + + bdev = fs_devices->latest_bdev; + s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); + if (IS_ERR(s)) + goto error_s; + + if (s->s_root) { + if ((flags ^ s->s_flags) & MS_RDONLY) { + deactivate_locked_super(s); + error = -EBUSY; + goto error_close_devices; + } + + btrfs_close_devices(fs_devices); + } else { + char b[BDEVNAME_SIZE]; + + s->s_flags = flags; + strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + error = btrfs_fill_super(s, fs_devices, data, + flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(s); + goto error_free_subvol_name; + } + + btrfs_sb(s)->fs_info->bdev_holder = fs_type; + s->s_flags |= MS_ACTIVE; + } + + if (!strcmp(subvol_name, ".")) + root = dget(s->s_root); + else { + mutex_lock(&s->s_root->d_inode->i_mutex); + root = lookup_one_len(subvol_name, s->s_root, + strlen(subvol_name)); + mutex_unlock(&s->s_root->d_inode->i_mutex); + + if (IS_ERR(root)) { + deactivate_locked_super(s); + error = PTR_ERR(root); + goto error_free_subvol_name; + } + if (!root->d_inode) { + dput(root); + deactivate_locked_super(s); + error = -ENXIO; + goto error_free_subvol_name; + } + } + + mnt->mnt_sb = s; + mnt->mnt_root = root; + + kfree(subvol_name); + return 0; + +error_s: + error = PTR_ERR(s); +error_close_devices: + btrfs_close_devices(fs_devices); +error_free_subvol_name: + kfree(subvol_name); + return error; +} + +static int btrfs_remount(struct super_block *sb, int *flags, char *data) +{ + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + ret = btrfs_parse_options(root, data); + if (ret) + return -EINVAL; + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + + if (*flags & MS_RDONLY) { + sb->s_flags |= MS_RDONLY; + + ret = btrfs_commit_super(root); + WARN_ON(ret); + } else { + if (root->fs_info->fs_devices->rw_devices == 0) + return -EACCES; + + if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) + return -EINVAL; + + /* recover relocation */ + ret = btrfs_recover_relocation(root); + WARN_ON(ret); + + ret = btrfs_cleanup_fs_roots(root->fs_info); + WARN_ON(ret); + + sb->s_flags &= ~MS_RDONLY; + } + + return 0; +} + +static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct btrfs_root *root = btrfs_sb(dentry->d_sb); + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + int bits = dentry->d_sb->s_blocksize_bits; + __be32 *fsid = (__be32 *)root->fs_info->fsid; + + buf->f_namelen = BTRFS_NAME_LEN; + buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; + buf->f_bfree = buf->f_blocks - + (btrfs_super_bytes_used(disk_super) >> bits); + buf->f_bavail = buf->f_bfree; + buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_type = BTRFS_SUPER_MAGIC; + + /* We treat it as constant endianness (it doesn't matter _which_) + because we want the fsid to come out the same whether mounted + on a big-endian or little-endian host */ + buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); + buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); + /* Mask in the root object ID too, to disambiguate subvols */ + buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32; + buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid; + + return 0; +} + +static struct file_system_type btrfs_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .get_sb = btrfs_get_sb, + .kill_sb = kill_anon_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +/* + * used by btrfsctl to scan devices when no FS is mounted + */ +static long btrfs_control_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct btrfs_ioctl_vol_args *vol; + struct btrfs_fs_devices *fs_devices; + int ret = -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol = memdup_user((void __user *)arg, sizeof(*vol)); + if (IS_ERR(vol)) + return PTR_ERR(vol); + + switch (cmd) { + case BTRFS_IOC_SCAN_DEV: + ret = btrfs_scan_one_device(vol->name, FMODE_READ, + &btrfs_fs_type, &fs_devices); + break; + } + + kfree(vol); + return ret; +} + +static int btrfs_freeze(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + mutex_lock(&root->fs_info->cleaner_mutex); + return 0; +} + +static int btrfs_unfreeze(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + mutex_unlock(&root->fs_info->cleaner_mutex); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + return 0; +} + +static const struct super_operations btrfs_super_ops = { + .drop_inode = btrfs_drop_inode, + .delete_inode = btrfs_delete_inode, + .put_super = btrfs_put_super, + .sync_fs = btrfs_sync_fs, + .show_options = btrfs_show_options, + .write_inode = btrfs_write_inode, + .dirty_inode = btrfs_dirty_inode, + .alloc_inode = btrfs_alloc_inode, + .destroy_inode = btrfs_destroy_inode, + .statfs = btrfs_statfs, + .remount_fs = btrfs_remount, + .freeze_fs = btrfs_freeze, + .unfreeze_fs = btrfs_unfreeze, +}; + +static const struct file_operations btrfs_ctl_fops = { + .unlocked_ioctl = btrfs_control_ioctl, + .compat_ioctl = btrfs_control_ioctl, + .owner = THIS_MODULE, +}; + +static struct miscdevice btrfs_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "btrfs-control", + .fops = &btrfs_ctl_fops +}; + +static int btrfs_interface_init(void) +{ + return misc_register(&btrfs_misc); +} + +static void btrfs_interface_exit(void) +{ + if (misc_deregister(&btrfs_misc) < 0) + printk(KERN_INFO "misc_deregister failed for control device"); +} + +static int __init init_btrfs_fs(void) +{ + int err; + + err = btrfs_init_sysfs(); + if (err) + return err; + + err = btrfs_init_cachep(); + if (err) + goto free_sysfs; + + err = extent_io_init(); + if (err) + goto free_cachep; + + err = extent_map_init(); + if (err) + goto free_extent_io; + + err = btrfs_interface_init(); + if (err) + goto free_extent_map; + + err = register_filesystem(&btrfs_fs_type); + if (err) + goto unregister_ioctl; + + printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION); + return 0; + +unregister_ioctl: + btrfs_interface_exit(); +free_extent_map: + extent_map_exit(); +free_extent_io: + extent_io_exit(); +free_cachep: + btrfs_destroy_cachep(); +free_sysfs: + btrfs_exit_sysfs(); + return err; +} + +static void __exit exit_btrfs_fs(void) +{ + btrfs_destroy_cachep(); + extent_map_exit(); + extent_io_exit(); + btrfs_interface_exit(); + unregister_filesystem(&btrfs_fs_type); + btrfs_exit_sysfs(); + btrfs_cleanup_fs_uuids(); + btrfs_zlib_exit(); +} + +module_init(init_btrfs_fs) +module_exit(exit_btrfs_fs) + +MODULE_LICENSE("GPL"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c new file mode 100644 index 00000000000..a240b6fa81d --- /dev/null +++ b/fs/btrfs/sysfs.c @@ -0,0 +1,269 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/kobject.h> + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_root_used(&root->root_item)); +} + +static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_root_limit(&root->root_item)); +} + +static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf) +{ + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_bytes_used(&fs->super_copy)); +} + +static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_total_bytes(&fs->super_copy)); +} + +static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_sectorsize(&fs->super_copy)); +} + +/* this is for root attrs (subvols/snapshots) */ +struct btrfs_root_attr { + struct attribute attr; + ssize_t (*show)(struct btrfs_root *, char *); + ssize_t (*store)(struct btrfs_root *, const char *, size_t); +}; + +#define ROOT_ATTR(name, mode, show, store) \ +static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \ + show, store) + +ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL); +ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL); + +static struct attribute *btrfs_root_attrs[] = { + &btrfs_root_attr_blocks_used.attr, + &btrfs_root_attr_block_limit.attr, + NULL, +}; + +/* this is for super attrs (actual full fs) */ +struct btrfs_super_attr { + struct attribute attr; + ssize_t (*show)(struct btrfs_fs_info *, char *); + ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t); +}; + +#define SUPER_ATTR(name, mode, show, store) \ +static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \ + show, store) + +SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL); +SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL); +SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL); + +static struct attribute *btrfs_super_attrs[] = { + &btrfs_super_attr_blocks_used.attr, + &btrfs_super_attr_total_blocks.attr, + &btrfs_super_attr_blocksize.attr, + NULL, +}; + +static ssize_t btrfs_super_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + struct btrfs_super_attr *a = container_of(attr, + struct btrfs_super_attr, + attr); + + return a->show ? a->show(fs, buf) : 0; +} + +static ssize_t btrfs_super_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + struct btrfs_super_attr *a = container_of(attr, + struct btrfs_super_attr, + attr); + + return a->store ? a->store(fs, buf, len) : 0; +} + +static ssize_t btrfs_root_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + struct btrfs_root_attr *a = container_of(attr, + struct btrfs_root_attr, + attr); + + return a->show ? a->show(root, buf) : 0; +} + +static ssize_t btrfs_root_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + struct btrfs_root_attr *a = container_of(attr, + struct btrfs_root_attr, + attr); + return a->store ? a->store(root, buf, len) : 0; +} + +static void btrfs_super_release(struct kobject *kobj) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + complete(&fs->kobj_unregister); +} + +static void btrfs_root_release(struct kobject *kobj) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + complete(&root->kobj_unregister); +} + +static struct sysfs_ops btrfs_super_attr_ops = { + .show = btrfs_super_attr_show, + .store = btrfs_super_attr_store, +}; + +static struct sysfs_ops btrfs_root_attr_ops = { + .show = btrfs_root_attr_show, + .store = btrfs_root_attr_store, +}; + +static struct kobj_type btrfs_root_ktype = { + .default_attrs = btrfs_root_attrs, + .sysfs_ops = &btrfs_root_attr_ops, + .release = btrfs_root_release, +}; + +static struct kobj_type btrfs_super_ktype = { + .default_attrs = btrfs_super_attrs, + .sysfs_ops = &btrfs_super_attr_ops, + .release = btrfs_super_release, +}; + +/* /sys/fs/btrfs/ entry */ +static struct kset *btrfs_kset; + +int btrfs_sysfs_add_super(struct btrfs_fs_info *fs) +{ + int error; + char *name; + char c; + int len = strlen(fs->sb->s_id) + 1; + int i; + + name = kmalloc(len, GFP_NOFS); + if (!name) { + error = -ENOMEM; + goto fail; + } + + for (i = 0; i < len; i++) { + c = fs->sb->s_id[i]; + if (c == '/' || c == '\\') + c = '!'; + name[i] = c; + } + name[len] = '\0'; + + fs->super_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype, + NULL, "%s", name); + kfree(name); + if (error) + goto fail; + + return 0; + +fail: + printk(KERN_ERR "btrfs: sysfs creation for super failed\n"); + return error; +} + +int btrfs_sysfs_add_root(struct btrfs_root *root) +{ + int error; + + error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype, + &root->fs_info->super_kobj, + "%s", root->name); + if (error) + goto fail; + + return 0; + +fail: + printk(KERN_ERR "btrfs: sysfs creation for root failed\n"); + return error; +} + +void btrfs_sysfs_del_root(struct btrfs_root *root) +{ + kobject_put(&root->root_kobj); + wait_for_completion(&root->kobj_unregister); +} + +void btrfs_sysfs_del_super(struct btrfs_fs_info *fs) +{ + kobject_put(&fs->super_kobj); + wait_for_completion(&fs->kobj_unregister); +} + +int btrfs_init_sysfs(void) +{ + btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); + if (!btrfs_kset) + return -ENOMEM; + return 0; +} + +void btrfs_exit_sysfs(void) +{ + kset_unregister(btrfs_kset); +} + diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c new file mode 100644 index 00000000000..b2acc79f1b3 --- /dev/null +++ b/fs/btrfs/transaction.c @@ -0,0 +1,1153 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/sched.h> +#include <linux/writeback.h> +#include <linux/pagemap.h> +#include <linux/blkdev.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "locking.h" +#include "tree-log.h" + +#define BTRFS_ROOT_TRANS_TAG 0 + +static noinline void put_transaction(struct btrfs_transaction *transaction) +{ + WARN_ON(transaction->use_count == 0); + transaction->use_count--; + if (transaction->use_count == 0) { + list_del_init(&transaction->list); + memset(transaction, 0, sizeof(*transaction)); + kmem_cache_free(btrfs_transaction_cachep, transaction); + } +} + +static noinline void switch_commit_root(struct btrfs_root *root) +{ + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); +} + +/* + * either allocate a new transaction or hop into the existing one + */ +static noinline int join_transaction(struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans; + cur_trans = root->fs_info->running_transaction; + if (!cur_trans) { + cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, + GFP_NOFS); + BUG_ON(!cur_trans); + root->fs_info->generation++; + cur_trans->num_writers = 1; + cur_trans->num_joined = 0; + cur_trans->transid = root->fs_info->generation; + init_waitqueue_head(&cur_trans->writer_wait); + init_waitqueue_head(&cur_trans->commit_wait); + cur_trans->in_commit = 0; + cur_trans->blocked = 0; + cur_trans->use_count = 1; + cur_trans->commit_done = 0; + cur_trans->start_time = get_seconds(); + + cur_trans->delayed_refs.root.rb_node = NULL; + cur_trans->delayed_refs.num_entries = 0; + cur_trans->delayed_refs.num_heads_ready = 0; + cur_trans->delayed_refs.num_heads = 0; + cur_trans->delayed_refs.flushing = 0; + cur_trans->delayed_refs.run_delayed_start = 0; + spin_lock_init(&cur_trans->delayed_refs.lock); + + INIT_LIST_HEAD(&cur_trans->pending_snapshots); + list_add_tail(&cur_trans->list, &root->fs_info->trans_list); + extent_io_tree_init(&cur_trans->dirty_pages, + root->fs_info->btree_inode->i_mapping, + GFP_NOFS); + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = cur_trans; + spin_unlock(&root->fs_info->new_trans_lock); + } else { + cur_trans->num_writers++; + cur_trans->num_joined++; + } + + return 0; +} + +/* + * this does all the record keeping required to make sure that a reference + * counted root is properly recorded in a given transaction. This is required + * to make sure the old root from before we joined the transaction is deleted + * when the transaction commits + */ +static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (root->ref_cows && root->last_trans < trans->transid) { + WARN_ON(root == root->fs_info->extent_root); + WARN_ON(root->commit_root != root->node); + + radix_tree_tag_set(&root->fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + root->last_trans = trans->transid; + btrfs_init_reloc_root(trans, root); + } + return 0; +} + +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!root->ref_cows) + return 0; + + mutex_lock(&root->fs_info->trans_mutex); + if (root->last_trans == trans->transid) { + mutex_unlock(&root->fs_info->trans_mutex); + return 0; + } + + record_root_in_trans(trans, root); + mutex_unlock(&root->fs_info->trans_mutex); + return 0; +} + +/* wait for commit against the current transaction to become unblocked + * when this is done, it is safe to start a new transaction, but the current + * transaction might not be fully on disk. + */ +static void wait_current_trans(struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans; + + cur_trans = root->fs_info->running_transaction; + if (cur_trans && cur_trans->blocked) { + DEFINE_WAIT(wait); + cur_trans->use_count++; + while (1) { + prepare_to_wait(&root->fs_info->transaction_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (cur_trans->blocked) { + mutex_unlock(&root->fs_info->trans_mutex); + schedule(); + mutex_lock(&root->fs_info->trans_mutex); + finish_wait(&root->fs_info->transaction_wait, + &wait); + } else { + finish_wait(&root->fs_info->transaction_wait, + &wait); + break; + } + } + put_transaction(cur_trans); + } +} + +enum btrfs_trans_type { + TRANS_START, + TRANS_JOIN, + TRANS_USERSPACE, +}; + +static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, + int num_blocks, int type) +{ + struct btrfs_trans_handle *h = + kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + int ret; + + mutex_lock(&root->fs_info->trans_mutex); + if (!root->fs_info->log_root_recovering && + ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || + type == TRANS_USERSPACE)) + wait_current_trans(root); + ret = join_transaction(root); + BUG_ON(ret); + + h->transid = root->fs_info->running_transaction->transid; + h->transaction = root->fs_info->running_transaction; + h->blocks_reserved = num_blocks; + h->blocks_used = 0; + h->block_group = 0; + h->alloc_exclude_nr = 0; + h->alloc_exclude_start = 0; + h->delayed_ref_updates = 0; + + if (!current->journal_info && type != TRANS_USERSPACE) + current->journal_info = h; + + root->fs_info->running_transaction->use_count++; + record_root_in_trans(h, root); + mutex_unlock(&root->fs_info->trans_mutex); + return h; +} + +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_blocks) +{ + return start_transaction(root, num_blocks, TRANS_START); +} +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, + int num_blocks) +{ + return start_transaction(root, num_blocks, TRANS_JOIN); +} + +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, + int num_blocks) +{ + return start_transaction(r, num_blocks, TRANS_USERSPACE); +} + +/* wait for a transaction commit to be fully complete */ +static noinline int wait_for_commit(struct btrfs_root *root, + struct btrfs_transaction *commit) +{ + DEFINE_WAIT(wait); + mutex_lock(&root->fs_info->trans_mutex); + while (!commit->commit_done) { + prepare_to_wait(&commit->commit_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (commit->commit_done) + break; + mutex_unlock(&root->fs_info->trans_mutex); + schedule(); + mutex_lock(&root->fs_info->trans_mutex); + } + mutex_unlock(&root->fs_info->trans_mutex); + finish_wait(&commit->commit_wait, &wait); + return 0; +} + +#if 0 +/* + * rate limit against the drop_snapshot code. This helps to slow down new + * operations if the drop_snapshot code isn't able to keep up. + */ +static void throttle_on_drops(struct btrfs_root *root) +{ + struct btrfs_fs_info *info = root->fs_info; + int harder_count = 0; + +harder: + if (atomic_read(&info->throttles)) { + DEFINE_WAIT(wait); + int thr; + thr = atomic_read(&info->throttle_gen); + + do { + prepare_to_wait(&info->transaction_throttle, + &wait, TASK_UNINTERRUPTIBLE); + if (!atomic_read(&info->throttles)) { + finish_wait(&info->transaction_throttle, &wait); + break; + } + schedule(); + finish_wait(&info->transaction_throttle, &wait); + } while (thr == atomic_read(&info->throttle_gen)); + harder_count++; + + if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 && + harder_count < 2) + goto harder; + + if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 && + harder_count < 10) + goto harder; + + if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 && + harder_count < 20) + goto harder; + } +} +#endif + +void btrfs_throttle(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->trans_mutex); + if (!root->fs_info->open_ioctl_trans) + wait_current_trans(root); + mutex_unlock(&root->fs_info->trans_mutex); +} + +static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int throttle) +{ + struct btrfs_transaction *cur_trans; + struct btrfs_fs_info *info = root->fs_info; + int count = 0; + + while (count < 4) { + unsigned long cur = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (cur && + trans->transaction->delayed_refs.num_heads_ready > 64) { + trans->delayed_ref_updates = 0; + + /* + * do a full flush if the transaction is trying + * to close + */ + if (trans->transaction->delayed_refs.flushing) + cur = 0; + btrfs_run_delayed_refs(trans, root, cur); + } else { + break; + } + count++; + } + + mutex_lock(&info->trans_mutex); + cur_trans = info->running_transaction; + WARN_ON(cur_trans != trans->transaction); + WARN_ON(cur_trans->num_writers < 1); + cur_trans->num_writers--; + + if (waitqueue_active(&cur_trans->writer_wait)) + wake_up(&cur_trans->writer_wait); + put_transaction(cur_trans); + mutex_unlock(&info->trans_mutex); + + if (current->journal_info == trans) + current->journal_info = NULL; + memset(trans, 0, sizeof(*trans)); + kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (throttle) + btrfs_run_delayed_iputs(root); + + return 0; +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_end_transaction(trans, root, 0); +} + +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_end_transaction(trans, root, 1); +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are sent to disk but does not wait on them + */ +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark) +{ + int ret; + int err = 0; + int werr = 0; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + u64 start = 0; + u64 end; + unsigned long index; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark); + if (ret) + break; + while (start <= end) { + cond_resched(); + + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + + btree_lock_page_hook(page); + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + continue; + } + + if (PageWriteback(page)) { + if (PageDirty(page)) + wait_on_page_writeback(page); + else { + unlock_page(page); + page_cache_release(page); + continue; + } + } + err = write_one_page(page, 0); + if (err) + werr = err; + page_cache_release(page); + } + } + if (err) + werr = err; + return werr; +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit. We wait + * on all the pages and clear them from the dirty pages state tree + */ +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark) +{ + int ret; + int err = 0; + int werr = 0; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + u64 start = 0; + u64 end; + unsigned long index; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark); + if (ret) + break; + + clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + if (PageDirty(page)) { + btree_lock_page_hook(page); + wait_on_page_writeback(page); + err = write_one_page(page, 0); + if (err) + werr = err; + } + wait_on_page_writeback(page); + page_cache_release(page); + cond_resched(); + } + } + if (err) + werr = err; + return werr; +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit + */ +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark) +{ + int ret; + int ret2; + + ret = btrfs_write_marked_extents(root, dirty_pages, mark); + ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); + return ret || ret2; +} + +int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!trans || !trans->transaction) { + struct inode *btree_inode; + btree_inode = root->fs_info->btree_inode; + return filemap_write_and_wait(btree_inode->i_mapping); + } + return btrfs_write_and_wait_marked_extents(root, + &trans->transaction->dirty_pages, + EXTENT_DIRTY); +} + +/* + * this is used to update the root pointer in the tree of tree roots. + * + * But, in the case of the extent allocation tree, updating the root + * pointer may allocate blocks which may change the root of the extent + * allocation tree. + * + * So, this loops and repeats and makes sure the cowonly root didn't + * change while the root pointer was being updated in the metadata. + */ +static int update_cowonly_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + u64 old_root_bytenr; + u64 old_root_used; + struct btrfs_root *tree_root = root->fs_info->tree_root; + + old_root_used = btrfs_root_used(&root->root_item); + btrfs_write_dirty_block_groups(trans, root); + + while (1) { + old_root_bytenr = btrfs_root_bytenr(&root->root_item); + if (old_root_bytenr == root->node->start && + old_root_used == btrfs_root_used(&root->root_item)) + break; + + btrfs_set_root_node(&root->root_item, root->node); + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + &root->root_item); + BUG_ON(ret); + + old_root_used = btrfs_root_used(&root->root_item); + ret = btrfs_write_dirty_block_groups(trans, root); + BUG_ON(ret); + } + + if (root != root->fs_info->extent_root) + switch_commit_root(root); + + return 0; +} + +/* + * update all the cowonly tree roots on disk + */ +static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *next; + struct extent_buffer *eb; + int ret; + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); + + eb = btrfs_lock_root_node(fs_info->tree_root); + btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); + + while (!list_empty(&fs_info->dirty_cowonly_roots)) { + next = fs_info->dirty_cowonly_roots.next; + list_del_init(next); + root = list_entry(next, struct btrfs_root, dirty_list); + + update_cowonly_root(trans, root); + } + + down_write(&fs_info->extent_commit_sem); + switch_commit_root(fs_info->extent_root); + up_write(&fs_info->extent_commit_sem); + + return 0; +} + +/* + * dead roots are old snapshots that need to be deleted. This allocates + * a dirty root struct and adds it into the list of dead roots that need to + * be deleted + */ +int btrfs_add_dead_root(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->trans_mutex); + list_add(&root->root_list, &root->fs_info->dead_roots); + mutex_unlock(&root->fs_info->trans_mutex); + return 0; +} + +/* + * update all the cowonly tree roots on disk + */ +static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *gang[8]; + struct btrfs_fs_info *fs_info = root->fs_info; + int i; + int ret; + int err = 0; + + while (1) { + ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + root = gang[i]; + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + + btrfs_free_log(trans, root); + btrfs_update_reloc_root(trans, root); + + if (root->commit_root != root->node) { + switch_commit_root(root); + btrfs_set_root_node(&root->root_item, + root->node); + } + + err = btrfs_update_root(trans, fs_info->tree_root, + &root->root_key, + &root->root_item); + if (err) + break; + } + } + return err; +} + +/* + * defrag a given btree. If cacheonly == 1, this won't read from the disk, + * otherwise every leaf in the btree is read and defragged. + */ +int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) +{ + struct btrfs_fs_info *info = root->fs_info; + int ret; + struct btrfs_trans_handle *trans; + unsigned long nr; + + smp_mb(); + if (root->defrag_running) + return 0; + trans = btrfs_start_transaction(root, 1); + while (1) { + root->defrag_running = 1; + ret = btrfs_defrag_leaves(trans, root, cacheonly); + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(info->tree_root, nr); + cond_resched(); + + trans = btrfs_start_transaction(root, 1); + if (root->fs_info->closing || ret != -EAGAIN) + break; + } + root->defrag_running = 0; + smp_mb(); + btrfs_end_transaction(trans, root); + return 0; +} + +#if 0 +/* + * when dropping snapshots, we generate a ton of delayed refs, and it makes + * sense not to join the transaction while it is trying to flush the current + * queue of delayed refs out. + * + * This is used by the drop snapshot code only + */ +static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) +{ + DEFINE_WAIT(wait); + + mutex_lock(&info->trans_mutex); + while (info->running_transaction && + info->running_transaction->delayed_refs.flushing) { + prepare_to_wait(&info->transaction_wait, &wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&info->trans_mutex); + + schedule(); + + mutex_lock(&info->trans_mutex); + finish_wait(&info->transaction_wait, &wait); + } + mutex_unlock(&info->trans_mutex); + return 0; +} + +/* + * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on + * all of them + */ +int btrfs_drop_dead_root(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = root->fs_info->tree_root; + unsigned long nr; + int ret; + + while (1) { + /* + * we don't want to jump in and create a bunch of + * delayed refs if the transaction is starting to close + */ + wait_transaction_pre_flush(tree_root->fs_info); + trans = btrfs_start_transaction(tree_root, 1); + + /* + * we've joined a transaction, make sure it isn't + * closing right now + */ + if (trans->transaction->delayed_refs.flushing) { + btrfs_end_transaction(trans, tree_root); + continue; + } + + ret = btrfs_drop_snapshot(trans, root); + if (ret != -EAGAIN) + break; + + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + &root->root_item); + if (ret) + break; + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, tree_root); + BUG_ON(ret); + + btrfs_btree_balance_dirty(tree_root, nr); + cond_resched(); + } + BUG_ON(ret); + + ret = btrfs_del_root(trans, tree_root, &root->root_key); + BUG_ON(ret); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, tree_root); + BUG_ON(ret); + + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root); + + btrfs_btree_balance_dirty(tree_root, nr); + return ret; +} +#endif + +/* + * new snapshots need to be created at a very specific time in the + * transaction commit. This does the actual creation + */ +static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_key key; + struct btrfs_root_item *new_root_item; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *root = pending->root; + struct extent_buffer *tmp; + struct extent_buffer *old; + int ret; + u64 objectid; + + new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); + if (!new_root_item) { + ret = -ENOMEM; + goto fail; + } + ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); + if (ret) + goto fail; + + record_root_in_trans(trans, root); + btrfs_set_root_last_snapshot(&root->root_item, trans->transid); + memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + + key.objectid = objectid; + /* record when the snapshot was created in key.offset */ + key.offset = trans->transid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + + old = btrfs_lock_root_node(root); + btrfs_cow_block(trans, root, old, NULL, 0, &old); + btrfs_set_lock_blocking(old); + + btrfs_copy_root(trans, root, old, &tmp, objectid); + btrfs_tree_unlock(old); + free_extent_buffer(old); + + btrfs_set_root_node(new_root_item, tmp); + ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, + new_root_item); + btrfs_tree_unlock(tmp); + free_extent_buffer(tmp); + if (ret) + goto fail; + + key.offset = (u64)-1; + memcpy(&pending->root_key, &key, sizeof(key)); +fail: + kfree(new_root_item); + return ret; +} + +static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, + struct btrfs_pending_snapshot *pending) +{ + int ret; + int namelen; + u64 index = 0; + struct btrfs_trans_handle *trans; + struct inode *parent_inode; + struct btrfs_root *parent_root; + + parent_inode = pending->dentry->d_parent->d_inode; + parent_root = BTRFS_I(parent_inode)->root; + trans = btrfs_join_transaction(parent_root, 1); + + /* + * insert the directory item + */ + namelen = strlen(pending->name); + ret = btrfs_set_inode_index(parent_inode, &index); + ret = btrfs_insert_dir_item(trans, parent_root, + pending->name, namelen, + parent_inode->i_ino, + &pending->root_key, BTRFS_FT_DIR, index); + + if (ret) + goto fail; + + btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); + ret = btrfs_update_inode(trans, parent_root, parent_inode); + BUG_ON(ret); + + ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, + pending->root_key.objectid, + parent_root->root_key.objectid, + parent_inode->i_ino, index, pending->name, + namelen); + + BUG_ON(ret); + +fail: + btrfs_end_transaction(trans, fs_info->fs_root); + return ret; +} + +/* + * create all the snapshots we've scheduled for creation + */ +static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_pending_snapshot *pending; + struct list_head *head = &trans->transaction->pending_snapshots; + int ret; + + list_for_each_entry(pending, head, list) { + ret = create_pending_snapshot(trans, fs_info, pending); + BUG_ON(ret); + } + return 0; +} + +static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_pending_snapshot *pending; + struct list_head *head = &trans->transaction->pending_snapshots; + int ret; + + while (!list_empty(head)) { + pending = list_entry(head->next, + struct btrfs_pending_snapshot, list); + ret = finish_pending_snapshot(fs_info, pending); + BUG_ON(ret); + list_del(&pending->list); + kfree(pending->name); + kfree(pending); + } + return 0; +} + +static void update_super_roots(struct btrfs_root *root) +{ + struct btrfs_root_item *root_item; + struct btrfs_super_block *super; + + super = &root->fs_info->super_copy; + + root_item = &root->fs_info->chunk_root->root_item; + super->chunk_root = root_item->bytenr; + super->chunk_root_generation = root_item->generation; + super->chunk_root_level = root_item->level; + + root_item = &root->fs_info->tree_root->root_item; + super->root = root_item->bytenr; + super->generation = root_item->generation; + super->root_level = root_item->level; +} + +int btrfs_transaction_in_commit(struct btrfs_fs_info *info) +{ + int ret = 0; + spin_lock(&info->new_trans_lock); + if (info->running_transaction) + ret = info->running_transaction->in_commit; + spin_unlock(&info->new_trans_lock); + return ret; +} + +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + unsigned long joined = 0; + unsigned long timeout = 1; + struct btrfs_transaction *cur_trans; + struct btrfs_transaction *prev_trans = NULL; + DEFINE_WAIT(wait); + int ret; + int should_grow = 0; + unsigned long now = get_seconds(); + int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); + + btrfs_run_ordered_operations(root, 0); + + /* make a pass through all the delayed refs we have so far + * any runnings procs may add more while we are here + */ + ret = btrfs_run_delayed_refs(trans, root, 0); + BUG_ON(ret); + + cur_trans = trans->transaction; + /* + * set the flushing flag so procs in this transaction have to + * start sending their work down. + */ + cur_trans->delayed_refs.flushing = 1; + + ret = btrfs_run_delayed_refs(trans, root, 0); + BUG_ON(ret); + + mutex_lock(&root->fs_info->trans_mutex); + if (cur_trans->in_commit) { + cur_trans->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + btrfs_end_transaction(trans, root); + + ret = wait_for_commit(root, cur_trans); + BUG_ON(ret); + + mutex_lock(&root->fs_info->trans_mutex); + put_transaction(cur_trans); + mutex_unlock(&root->fs_info->trans_mutex); + + return 0; + } + + trans->transaction->in_commit = 1; + trans->transaction->blocked = 1; + if (cur_trans->list.prev != &root->fs_info->trans_list) { + prev_trans = list_entry(cur_trans->list.prev, + struct btrfs_transaction, list); + if (!prev_trans->commit_done) { + prev_trans->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + + wait_for_commit(root, prev_trans); + + mutex_lock(&root->fs_info->trans_mutex); + put_transaction(prev_trans); + } + } + + if (now < cur_trans->start_time || now - cur_trans->start_time < 1) + should_grow = 1; + + do { + int snap_pending = 0; + joined = cur_trans->num_joined; + if (!list_empty(&trans->transaction->pending_snapshots)) + snap_pending = 1; + + WARN_ON(cur_trans != trans->transaction); + prepare_to_wait(&cur_trans->writer_wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (cur_trans->num_writers > 1) + timeout = MAX_SCHEDULE_TIMEOUT; + else if (should_grow) + timeout = 1; + + mutex_unlock(&root->fs_info->trans_mutex); + + if (flush_on_commit) { + btrfs_start_delalloc_inodes(root, 1); + ret = btrfs_wait_ordered_extents(root, 0, 1); + BUG_ON(ret); + } else if (snap_pending) { + ret = btrfs_wait_ordered_extents(root, 0, 1); + BUG_ON(ret); + } + + /* + * rename don't use btrfs_join_transaction, so, once we + * set the transaction to blocked above, we aren't going + * to get any new ordered operations. We can safely run + * it here and no for sure that nothing new will be added + * to the list + */ + btrfs_run_ordered_operations(root, 1); + + smp_mb(); + if (cur_trans->num_writers > 1 || should_grow) + schedule_timeout(timeout); + + mutex_lock(&root->fs_info->trans_mutex); + finish_wait(&cur_trans->writer_wait, &wait); + } while (cur_trans->num_writers > 1 || + (should_grow && cur_trans->num_joined != joined)); + + ret = create_pending_snapshots(trans, root->fs_info); + BUG_ON(ret); + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); + + WARN_ON(cur_trans != trans->transaction); + + /* btrfs_commit_tree_roots is responsible for getting the + * various roots consistent with each other. Every pointer + * in the tree of tree roots has to point to the most up to date + * root for every subvolume and other tree. So, we have to keep + * the tree logging code from jumping in and changing any + * of the trees. + * + * At this point in the commit, there can't be any tree-log + * writers, but a little lower down we drop the trans mutex + * and let new people in. By holding the tree_log_mutex + * from now until after the super is written, we avoid races + * with the tree-log code. + */ + mutex_lock(&root->fs_info->tree_log_mutex); + + ret = commit_fs_roots(trans, root); + BUG_ON(ret); + + /* commit_fs_roots gets rid of all the tree log roots, it is now + * safe to free the root of tree log roots + */ + btrfs_free_log_root_tree(trans, root->fs_info); + + ret = commit_cowonly_roots(trans, root); + BUG_ON(ret); + + btrfs_prepare_extent_commit(trans, root); + + cur_trans = root->fs_info->running_transaction; + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->new_trans_lock); + + btrfs_set_root_node(&root->fs_info->tree_root->root_item, + root->fs_info->tree_root->node); + switch_commit_root(root->fs_info->tree_root); + + btrfs_set_root_node(&root->fs_info->chunk_root->root_item, + root->fs_info->chunk_root->node); + switch_commit_root(root->fs_info->chunk_root); + + update_super_roots(root); + + if (!root->fs_info->log_root_recovering) { + btrfs_set_super_log_root(&root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); + } + + memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, + sizeof(root->fs_info->super_copy)); + + trans->transaction->blocked = 0; + + wake_up(&root->fs_info->transaction_wait); + + mutex_unlock(&root->fs_info->trans_mutex); + ret = btrfs_write_and_wait_transaction(trans, root); + BUG_ON(ret); + write_ctree_super(trans, root, 0); + + /* + * the super is written, we can safely allow the tree-loggers + * to go about their business + */ + mutex_unlock(&root->fs_info->tree_log_mutex); + + btrfs_finish_extent_commit(trans, root); + + /* do the directory inserts of any pending snapshot creations */ + finish_pending_snapshots(trans, root->fs_info); + + mutex_lock(&root->fs_info->trans_mutex); + + cur_trans->commit_done = 1; + + root->fs_info->last_trans_committed = cur_trans->transid; + + wake_up(&cur_trans->commit_wait); + + put_transaction(cur_trans); + put_transaction(cur_trans); + + mutex_unlock(&root->fs_info->trans_mutex); + + if (current->journal_info == trans) + current->journal_info = NULL; + + kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (current != root->fs_info->transaction_kthread) + btrfs_run_delayed_iputs(root); + + return ret; +} + +/* + * interface function to delete all the snapshots we have scheduled for deletion + */ +int btrfs_clean_old_snapshots(struct btrfs_root *root) +{ + LIST_HEAD(list); + struct btrfs_fs_info *fs_info = root->fs_info; + + mutex_lock(&fs_info->trans_mutex); + list_splice_init(&fs_info->dead_roots, &list); + mutex_unlock(&fs_info->trans_mutex); + + while (!list_empty(&list)) { + root = list_entry(list.next, struct btrfs_root, root_list); + list_del(&root->root_list); + + if (btrfs_header_backref_rev(root->node) < + BTRFS_MIXED_BACKREF_REV) + btrfs_drop_snapshot(root, 0); + else + btrfs_drop_snapshot(root, 1); + } + return 0; +} diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h new file mode 100644 index 00000000000..93c7ccb3311 --- /dev/null +++ b/fs/btrfs/transaction.h @@ -0,0 +1,116 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_TRANSACTION__ +#define __BTRFS_TRANSACTION__ +#include "btrfs_inode.h" +#include "delayed-ref.h" + +struct btrfs_transaction { + u64 transid; + /* + * total writers in this transaction, it must be zero before the + * transaction can end + */ + unsigned long num_writers; + + unsigned long num_joined; + int in_commit; + int use_count; + int commit_done; + int blocked; + struct list_head list; + struct extent_io_tree dirty_pages; + unsigned long start_time; + wait_queue_head_t writer_wait; + wait_queue_head_t commit_wait; + struct list_head pending_snapshots; + struct btrfs_delayed_ref_root delayed_refs; +}; + +struct btrfs_trans_handle { + u64 transid; + unsigned long blocks_reserved; + unsigned long blocks_used; + struct btrfs_transaction *transaction; + u64 block_group; + u64 alloc_exclude_start; + u64 alloc_exclude_nr; + unsigned long delayed_ref_updates; +}; + +struct btrfs_pending_snapshot { + struct dentry *dentry; + struct btrfs_root *root; + char *name; + struct btrfs_key root_key; + struct list_head list; +}; + +static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + trans->block_group = BTRFS_I(inode)->block_group; +} + +static inline void btrfs_update_inode_block_group( + struct btrfs_trans_handle *trans, + struct inode *inode) +{ + BTRFS_I(inode)->block_group = trans->block_group; +} + +static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + BTRFS_I(inode)->last_trans = trans->transaction->transid; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_blocks); +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, + int num_blocks); +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, + int num_blocks); +int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + +int btrfs_add_dead_root(struct btrfs_root *root); +int btrfs_drop_dead_root(struct btrfs_root *root); +int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); +int btrfs_clean_old_snapshots(struct btrfs_root *root); +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +void btrfs_throttle(struct btrfs_root *root); +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark); +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark); +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark); +int btrfs_transaction_in_commit(struct btrfs_fs_info *info); +#endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c new file mode 100644 index 00000000000..b10eacdb162 --- /dev/null +++ b/fs/btrfs/tree-defrag.c @@ -0,0 +1,146 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "locking.h" + +/* defrag all the leaves in a given btree. If cache_only == 1, don't read + * things from disk, otherwise read all the leaves and try to get key order to + * better reflect disk order + */ + +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int cache_only) +{ + struct btrfs_path *path = NULL; + struct btrfs_key key; + int ret = 0; + int wret; + int level; + int orig_level; + int is_extent = 0; + int next_key_ret = 0; + u64 last_ret = 0; + u64 min_trans = 0; + + if (cache_only) + goto out; + + if (root->fs_info->extent_root == root) { + /* + * there's recursion here right now in the tree locking, + * we can't defrag the extent root without deadlock + */ + goto out; + } + + if (root->ref_cows == 0 && !is_extent) + goto out; + + if (btrfs_test_opt(root, SSD)) + goto out; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + level = btrfs_header_level(root->node); + orig_level = level; + + if (level == 0) + goto out; + + if (root->defrag_progress.objectid == 0) { + struct extent_buffer *root_node; + u32 nritems; + + root_node = btrfs_lock_root_node(root); + btrfs_set_lock_blocking(root_node); + nritems = btrfs_header_nritems(root_node); + root->defrag_max.objectid = 0; + /* from above we know this is not a leaf */ + btrfs_node_key_to_cpu(root_node, &root->defrag_max, + nritems - 1); + btrfs_tree_unlock(root_node); + free_extent_buffer(root_node); + memset(&key, 0, sizeof(key)); + } else { + memcpy(&key, &root->defrag_progress, sizeof(key)); + } + + path->keep_locks = 1; + if (cache_only) + min_trans = root->defrag_trans_start; + + ret = btrfs_search_forward(root, &key, NULL, path, + cache_only, min_trans); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + goto out; + } + btrfs_release_path(root, path); + wret = btrfs_search_slot(trans, root, &key, path, 0, 1); + + if (wret < 0) { + ret = wret; + goto out; + } + if (!path->nodes[1]) { + ret = 0; + goto out; + } + path->slots[1] = btrfs_header_nritems(path->nodes[1]); + next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, + min_trans); + ret = btrfs_realloc_node(trans, root, + path->nodes[1], 0, + cache_only, &last_ret, + &root->defrag_progress); + WARN_ON(ret && ret != -EAGAIN); + if (next_key_ret == 0) { + memcpy(&root->defrag_progress, &key, sizeof(key)); + ret = -EAGAIN; + } + + btrfs_release_path(root, path); +out: + if (path) + btrfs_free_path(path); + if (ret == -EAGAIN) { + if (root->defrag_max.objectid > root->defrag_progress.objectid) + goto done; + if (root->defrag_max.type > root->defrag_progress.type) + goto done; + if (root->defrag_max.offset > root->defrag_progress.offset) + goto done; + ret = 0; + } +done: + if (ret != -EAGAIN) { + memset(&root->defrag_progress, 0, + sizeof(root->defrag_progress)); + root->defrag_trans_start = trans->transid; + } + return ret; +} diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c new file mode 100644 index 00000000000..4a9434b622e --- /dev/null +++ b/fs/btrfs/tree-log.c @@ -0,0 +1,3198 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" +#include "print-tree.h" +#include "compat.h" +#include "tree-log.h" + +/* magic values for the inode_only field in btrfs_log_inode: + * + * LOG_INODE_ALL means to log everything + * LOG_INODE_EXISTS means to log just enough to recreate the inode + * during log replay + */ +#define LOG_INODE_ALL 0 +#define LOG_INODE_EXISTS 1 + +/* + * directory trouble cases + * + * 1) on rename or unlink, if the inode being unlinked isn't in the fsync + * log, we must force a full commit before doing an fsync of the directory + * where the unlink was done. + * ---> record transid of last unlink/rename per directory + * + * mkdir foo/some_dir + * normal commit + * rename foo/some_dir foo2/some_dir + * mkdir foo/some_dir + * fsync foo/some_dir/some_file + * + * The fsync above will unlink the original some_dir without recording + * it in its new location (foo2). After a crash, some_dir will be gone + * unless the fsync of some_file forces a full commit + * + * 2) we must log any new names for any file or dir that is in the fsync + * log. ---> check inode while renaming/linking. + * + * 2a) we must log any new names for any file or dir during rename + * when the directory they are being removed from was logged. + * ---> check inode and old parent dir during rename + * + * 2a is actually the more important variant. With the extra logging + * a crash might unlink the old name without recreating the new one + * + * 3) after a crash, we must go through any directories with a link count + * of zero and redo the rm -rf + * + * mkdir f1/foo + * normal commit + * rm -rf f1/foo + * fsync(f1) + * + * The directory f1 was fully removed from the FS, but fsync was never + * called on f1, only its parent dir. After a crash the rm -rf must + * be replayed. This must be able to recurse down the entire + * directory tree. The inode link count fixup code takes care of the + * ugly details. + */ + +/* + * stages for the tree walking. The first + * stage (0) is to only pin down the blocks we find + * the second stage (1) is to make sure that all the inodes + * we find in the log are created in the subvolume. + * + * The last stage is to deal with directories and links and extents + * and all the other fun semantics + */ +#define LOG_WALK_PIN_ONLY 0 +#define LOG_WALK_REPLAY_INODES 1 +#define LOG_WALK_REPLAY_ALL 2 + +static int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only); +static int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid, int del_all); + +/* + * tree logging is a special write ahead log used to make sure that + * fsyncs and O_SYNCs can happen without doing full tree commits. + * + * Full tree commits are expensive because they require commonly + * modified blocks to be recowed, creating many dirty pages in the + * extent tree an 4x-6x higher write load than ext3. + * + * Instead of doing a tree commit on every fsync, we use the + * key ranges and transaction ids to find items for a given file or directory + * that have changed in this transaction. Those items are copied into + * a special tree (one per subvolume root), that tree is written to disk + * and then the fsync is considered complete. + * + * After a crash, items are copied out of the log-tree back into the + * subvolume tree. Any file data extents found are recorded in the extent + * allocation tree, and the log-tree freed. + * + * The log tree is read three times, once to pin down all the extents it is + * using in ram and once, once to create all the inodes logged in the tree + * and once to do all the other items. + */ + +/* + * start a sub transaction and setup the log tree + * this increments the log tree writer count to make the people + * syncing the tree wait for us to finish + */ +static int start_log_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + + mutex_lock(&root->log_mutex); + if (root->log_root) { + if (!root->log_start_pid) { + root->log_start_pid = current->pid; + root->log_multiple_pids = false; + } else if (root->log_start_pid != current->pid) { + root->log_multiple_pids = true; + } + + root->log_batch++; + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return 0; + } + root->log_multiple_pids = false; + root->log_start_pid = current->pid; + mutex_lock(&root->fs_info->tree_log_mutex); + if (!root->fs_info->log_root_tree) { + ret = btrfs_init_log_root_tree(trans, root->fs_info); + BUG_ON(ret); + } + if (!root->log_root) { + ret = btrfs_add_log_tree(trans, root); + BUG_ON(ret); + } + mutex_unlock(&root->fs_info->tree_log_mutex); + root->log_batch++; + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return 0; +} + +/* + * returns 0 if there was a log transaction running and we were able + * to join, or returns -ENOENT if there were not transactions + * in progress + */ +static int join_running_log_trans(struct btrfs_root *root) +{ + int ret = -ENOENT; + + smp_mb(); + if (!root->log_root) + return -ENOENT; + + mutex_lock(&root->log_mutex); + if (root->log_root) { + ret = 0; + atomic_inc(&root->log_writers); + } + mutex_unlock(&root->log_mutex); + return ret; +} + +/* + * This either makes the current running log transaction wait + * until you call btrfs_end_log_trans() or it makes any future + * log transactions wait until you call btrfs_end_log_trans() + */ +int btrfs_pin_log_trans(struct btrfs_root *root) +{ + int ret = -ENOENT; + + mutex_lock(&root->log_mutex); + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return ret; +} + +/* + * indicate we're done making changes to the log tree + * and wake up anyone waiting to do a sync + */ +int btrfs_end_log_trans(struct btrfs_root *root) +{ + if (atomic_dec_and_test(&root->log_writers)) { + smp_mb(); + if (waitqueue_active(&root->log_writer_wait)) + wake_up(&root->log_writer_wait); + } + return 0; +} + + +/* + * the walk control struct is used to pass state down the chain when + * processing the log tree. The stage field tells us which part + * of the log tree processing we are currently doing. The others + * are state fields used for that specific part + */ +struct walk_control { + /* should we free the extent on disk when done? This is used + * at transaction commit time while freeing a log tree + */ + int free; + + /* should we write out the extent buffer? This is used + * while flushing the log tree to disk during a sync + */ + int write; + + /* should we wait for the extent buffer io to finish? Also used + * while flushing the log tree to disk for a sync + */ + int wait; + + /* pin only walk, we record which extents on disk belong to the + * log trees + */ + int pin; + + /* what stage of the replay code we're currently in */ + int stage; + + /* the root we are currently replaying */ + struct btrfs_root *replay_dest; + + /* the trans handle for the current replay */ + struct btrfs_trans_handle *trans; + + /* the function that gets used to process blocks we find in the + * tree. Note the extent_buffer might not be up to date when it is + * passed in, and it must be checked or read if you need the data + * inside it + */ + int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen); +}; + +/* + * process_func used to pin down extents, write them or wait on them + */ +static int process_one_buffer(struct btrfs_root *log, + struct extent_buffer *eb, + struct walk_control *wc, u64 gen) +{ + if (wc->pin) + btrfs_pin_extent(log->fs_info->extent_root, + eb->start, eb->len, 0); + + if (btrfs_buffer_uptodate(eb, gen)) { + if (wc->write) + btrfs_write_tree_block(eb); + if (wc->wait) + btrfs_wait_tree_block_writeback(eb); + } + return 0; +} + +/* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. + */ +static noinline int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size; + u64 saved_i_size = 0; + int save_old_i_size = 0; + unsigned long src_ptr; + unsigned long dst_ptr; + int overwrite_root = 0; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + overwrite_root = 1; + + item_size = btrfs_item_size_nr(eb, slot); + src_ptr = btrfs_item_ptr_offset(eb, slot); + + /* look for the key in the destination tree */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret == 0) { + char *src_copy; + char *dst_copy; + u32 dst_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (dst_size != item_size) + goto insert; + + if (item_size == 0) { + btrfs_release_path(root, path); + return 0; + } + dst_copy = kmalloc(item_size, GFP_NOFS); + src_copy = kmalloc(item_size, GFP_NOFS); + + read_extent_buffer(eb, src_copy, src_ptr, item_size); + + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, + item_size); + ret = memcmp(dst_copy, src_copy, item_size); + + kfree(dst_copy); + kfree(src_copy); + /* + * they have the same contents, just return, this saves + * us from cowing blocks in the destination tree and doing + * extra writes that may not have been done by a previous + * sync + */ + if (ret == 0) { + btrfs_release_path(root, path); + return 0; + } + + } +insert: + btrfs_release_path(root, path); + /* try to insert the key into the destination tree */ + ret = btrfs_insert_empty_item(trans, root, path, + key, item_size); + + /* make sure any existing item is the correct size */ + if (ret == -EEXIST) { + u32 found_size; + found_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (found_size > item_size) { + btrfs_truncate_item(trans, root, path, item_size, 1); + } else if (found_size < item_size) { + ret = btrfs_extend_item(trans, root, path, + item_size - found_size); + BUG_ON(ret); + } + } else if (ret) { + BUG(); + } + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + + /* don't overwrite an existing inode if the generation number + * was logged as zero. This is done when the tree logging code + * is just logging an inode to make sure it exists after recovery. + * + * Also, don't overwrite i_size on directories during replay. + * log replay inserts and removes directory items based on the + * state of the tree found in the subvolume, and i_size is modified + * as it goes + */ + if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { + struct btrfs_inode_item *src_item; + struct btrfs_inode_item *dst_item; + + src_item = (struct btrfs_inode_item *)src_ptr; + dst_item = (struct btrfs_inode_item *)dst_ptr; + + if (btrfs_inode_generation(eb, src_item) == 0) + goto no_copy; + + if (overwrite_root && + S_ISDIR(btrfs_inode_mode(eb, src_item)) && + S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { + save_old_i_size = 1; + saved_i_size = btrfs_inode_size(path->nodes[0], + dst_item); + } + } + + copy_extent_buffer(path->nodes[0], eb, dst_ptr, + src_ptr, item_size); + + if (save_old_i_size) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); + } + + /* make sure the generation is filled in */ + if (key->type == BTRFS_INODE_ITEM_KEY) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { + btrfs_set_inode_generation(path->nodes[0], dst_item, + trans->transid); + } + } +no_copy: + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(root, path); + return 0; +} + +/* + * simple helper to read an inode off the disk from a given root + * This can only be called for subvolume roots and not for the log + */ +static noinline struct inode *read_one_inode(struct btrfs_root *root, + u64 objectid) +{ + struct btrfs_key key; + struct inode *inode; + + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &key, root); + if (IS_ERR(inode)) { + inode = NULL; + } else if (is_bad_inode(inode)) { + iput(inode); + inode = NULL; + } + return inode; +} + +/* replays a single extent in 'eb' at 'slot' with 'key' into the + * subvolume 'root'. path is released on entry and should be released + * on exit. + * + * extents in the log tree have not been allocated out of the extent + * tree yet. So, this completes the allocation, taking a reference + * as required if the extent already exists or creating a new extent + * if it isn't in the extent allocation tree yet. + * + * The extent is inserted into the file, dropping any existing extents + * from the file that overlap the new one. + */ +static noinline int replay_one_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int found_type; + u64 mask = root->sectorsize - 1; + u64 extent_end; + u64 alloc_hint; + u64 start = key->offset; + u64 saved_nbytes; + struct btrfs_file_extent_item *item; + struct inode *inode = NULL; + unsigned long size; + int ret = 0; + + item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) + extent_end = start + btrfs_file_extent_num_bytes(eb, item); + else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size = btrfs_file_extent_inline_len(eb, item); + extent_end = (start + size + mask) & ~mask; + } else { + ret = 0; + goto out; + } + + inode = read_one_inode(root, key->objectid); + if (!inode) { + ret = -EIO; + goto out; + } + + /* + * first check to see if we already have this extent in the + * file. This must be done before the btrfs_drop_extents run + * so we don't try to drop this extent. + */ + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + start, 0); + + if (ret == 0 && + (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC)) { + struct btrfs_file_extent_item cmp1; + struct btrfs_file_extent_item cmp2; + struct btrfs_file_extent_item *existing; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + existing = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + read_extent_buffer(eb, &cmp1, (unsigned long)item, + sizeof(cmp1)); + read_extent_buffer(leaf, &cmp2, (unsigned long)existing, + sizeof(cmp2)); + + /* + * we already have a pointer to this exact extent, + * we don't have to do anything + */ + if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { + btrfs_release_path(root, path); + goto out; + } + } + btrfs_release_path(root, path); + + saved_nbytes = inode_get_bytes(inode); + /* drop any overlapping extents */ + ret = btrfs_drop_extents(trans, inode, start, extent_end, + &alloc_hint, 1); + BUG_ON(ret); + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 offset; + unsigned long dest_offset; + struct btrfs_key ins; + + ret = btrfs_insert_empty_item(trans, root, path, key, + sizeof(*item)); + BUG_ON(ret); + dest_offset = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + copy_extent_buffer(path->nodes[0], eb, dest_offset, + (unsigned long)item, sizeof(*item)); + + ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); + ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); + ins.type = BTRFS_EXTENT_ITEM_KEY; + offset = key->offset - btrfs_file_extent_offset(eb, item); + + if (ins.objectid > 0) { + u64 csum_start; + u64 csum_end; + LIST_HEAD(ordered_sums); + /* + * is this extent already allocated in the extent + * allocation tree? If so, just add a reference + */ + ret = btrfs_lookup_extent(root, ins.objectid, + ins.offset); + if (ret == 0) { + ret = btrfs_inc_extent_ref(trans, root, + ins.objectid, ins.offset, + 0, root->root_key.objectid, + key->objectid, offset); + } else { + /* + * insert the extent pointer in the extent + * allocation tree + */ + ret = btrfs_alloc_logged_file_extent(trans, + root, root->root_key.objectid, + key->objectid, offset, &ins); + BUG_ON(ret); + } + btrfs_release_path(root, path); + + if (btrfs_file_extent_compression(eb, item)) { + csum_start = ins.objectid; + csum_end = csum_start + ins.offset; + } else { + csum_start = ins.objectid + + btrfs_file_extent_offset(eb, item); + csum_end = csum_start + + btrfs_file_extent_num_bytes(eb, item); + } + + ret = btrfs_lookup_csums_range(root->log_root, + csum_start, csum_end - 1, + &ordered_sums); + BUG_ON(ret); + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums; + sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + ret = btrfs_csum_file_blocks(trans, + root->fs_info->csum_root, + sums); + BUG_ON(ret); + list_del(&sums->list); + kfree(sums); + } + } else { + btrfs_release_path(root, path); + } + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + /* inline extents are easy, we just overwrite them */ + ret = overwrite_item(trans, root, path, eb, slot, key); + BUG_ON(ret); + } + + inode_set_bytes(inode, saved_nbytes); + btrfs_update_inode(trans, root, inode); +out: + if (inode) + iput(inode); + return ret; +} + +/* + * when cleaning up conflicts between the directory names in the + * subvolume, directory names in the log and directory names in the + * inode back references, we may have to unlink inodes from directories. + * + * This is a helper function to do the unlink of a specific directory + * item + */ +static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct inode *dir, + struct btrfs_dir_item *di) +{ + struct inode *inode; + char *name; + int name_len; + struct extent_buffer *leaf; + struct btrfs_key location; + int ret; + + leaf = path->nodes[0]; + + btrfs_dir_item_key_to_cpu(leaf, di, &location); + name_len = btrfs_dir_name_len(leaf, di); + name = kmalloc(name_len, GFP_NOFS); + read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); + btrfs_release_path(root, path); + + inode = read_one_inode(root, location.objectid); + BUG_ON(!inode); + + ret = link_to_fixup_dir(trans, root, path, location.objectid); + BUG_ON(ret); + + ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + BUG_ON(ret); + kfree(name); + + iput(inode); + return ret; +} + +/* + * helper function to see if a given name and sequence number found + * in an inode back reference are already in a directory and correctly + * point to this inode + */ +static noinline int inode_in_dir(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 objectid, u64 index, + const char *name, int name_len) +{ + struct btrfs_dir_item *di; + struct btrfs_key location; + int match = 0; + + di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, + index, name, name_len, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + btrfs_release_path(root, path); + + di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + match = 1; +out: + btrfs_release_path(root, path); + return match; +} + +/* + * helper function to check a log tree for a named back reference in + * an inode. This is used to decide if a back reference that is + * found in the subvolume conflicts with what we find in the log. + * + * inode backreferences may have multiple refs in a single item, + * during replay we process one reference at a time, and we don't + * want to delete valid links to a file from the subvolume if that + * link is also in the log. + */ +static noinline int backref_in_log(struct btrfs_root *log, + struct btrfs_key *key, + char *name, int namelen) +{ + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long ptr_end; + unsigned long name_ptr; + int found_name_len; + int item_size; + int ret; + int match = 0; + + path = btrfs_alloc_path(); + ret = btrfs_search_slot(NULL, log, key, path, 0, 0); + if (ret != 0) + goto out; + + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + ref = (struct btrfs_inode_ref *)ptr; + found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); + if (found_name_len == namelen) { + name_ptr = (unsigned long)(ref + 1); + ret = memcmp_extent_buffer(path->nodes[0], name, + name_ptr, namelen); + if (ret == 0) { + match = 1; + goto out; + } + } + ptr = (unsigned long)(ref + 1) + found_name_len; + } +out: + btrfs_free_path(path); + return match; +} + + +/* + * replay one inode back reference item found in the log tree. + * eb, slot and key refer to the buffer and key found in the log tree. + * root is the destination we are replaying into, and path is for temp + * use by this function. (it should be released on return). + */ +static noinline int add_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + struct inode *dir; + int ret; + struct btrfs_key location; + struct btrfs_inode_ref *ref; + struct btrfs_dir_item *di; + struct inode *inode; + char *name; + int namelen; + unsigned long ref_ptr; + unsigned long ref_end; + + location.objectid = key->objectid; + location.type = BTRFS_INODE_ITEM_KEY; + location.offset = 0; + + /* + * it is possible that we didn't log all the parent directories + * for a given inode. If we don't find the dir, just don't + * copy the back ref in. The link count fixup code will take + * care of the rest + */ + dir = read_one_inode(root, key->offset); + if (!dir) + return -ENOENT; + + inode = read_one_inode(root, key->objectid); + BUG_ON(!inode); + + ref_ptr = btrfs_item_ptr_offset(eb, slot); + ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + +again: + ref = (struct btrfs_inode_ref *)ref_ptr; + + namelen = btrfs_inode_ref_name_len(eb, ref); + name = kmalloc(namelen, GFP_NOFS); + BUG_ON(!name); + + read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); + + /* if we already have a perfect match, we're done */ + if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, + btrfs_inode_ref_index(eb, ref), + name, namelen)) { + goto out; + } + + /* + * look for a conflicting back reference in the metadata. + * if we find one we have to unlink that name of the file + * before we add our new link. Later on, we overwrite any + * existing back reference, and we don't want to create + * dangling pointers in the directory. + */ +conflict_again: + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret == 0) { + char *victim_name; + int victim_name_len; + struct btrfs_inode_ref *victim_ref; + unsigned long ptr; + unsigned long ptr_end; + struct extent_buffer *leaf = path->nodes[0]; + + /* are we trying to overwrite a back ref for the root directory + * if so, just jump out, we're done + */ + if (key->objectid == key->offset) + goto out_nowrite; + + /* check all the names in this back reference to see + * if they are in the log. if so, we allow them to stay + * otherwise they must be unlinked as a conflict + */ + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); + while (ptr < ptr_end) { + victim_ref = (struct btrfs_inode_ref *)ptr; + victim_name_len = btrfs_inode_ref_name_len(leaf, + victim_ref); + victim_name = kmalloc(victim_name_len, GFP_NOFS); + BUG_ON(!victim_name); + + read_extent_buffer(leaf, victim_name, + (unsigned long)(victim_ref + 1), + victim_name_len); + + if (!backref_in_log(log, key, victim_name, + victim_name_len)) { + btrfs_inc_nlink(inode); + btrfs_release_path(root, path); + + ret = btrfs_unlink_inode(trans, root, dir, + inode, victim_name, + victim_name_len); + kfree(victim_name); + btrfs_release_path(root, path); + goto conflict_again; + } + kfree(victim_name); + ptr = (unsigned long)(victim_ref + 1) + victim_name_len; + } + BUG_ON(ret); + } + btrfs_release_path(root, path); + + /* look for a conflicting sequence number */ + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, + btrfs_inode_ref_index(eb, ref), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(root, path); + + + /* look for a conflicting name */ + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(root, path); + + /* insert our name */ + ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, + btrfs_inode_ref_index(eb, ref)); + BUG_ON(ret); + + btrfs_update_inode(trans, root, inode); + +out: + ref_ptr = (unsigned long)(ref + 1) + namelen; + kfree(name); + if (ref_ptr < ref_end) + goto again; + + /* finally write the back reference in the inode */ + ret = overwrite_item(trans, root, path, eb, slot, key); + BUG_ON(ret); + +out_nowrite: + btrfs_release_path(root, path); + iput(dir); + iput(inode); + return 0; +} + +static int insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + int ret; + ret = btrfs_find_orphan_item(root, offset); + if (ret > 0) + ret = btrfs_insert_orphan_item(trans, root, offset); + return ret; +} + + +/* + * There are a few corners where the link count of the file can't + * be properly maintained during replay. So, instead of adding + * lots of complexity to the log code, we just scan the backrefs + * for any file that has been through replay. + * + * The scan will update the link count on the inode to reflect the + * number of back refs found. If it goes down to zero, the iput + * will free the inode. + */ +static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + u64 nlink = 0; + unsigned long ptr; + unsigned long ptr_end; + int name_len; + + key.objectid = inode->i_ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid != inode->i_ino || + key.type != BTRFS_INODE_REF_KEY) + break; + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + while (ptr < ptr_end) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + name_len = btrfs_inode_ref_name_len(path->nodes[0], + ref); + ptr = (unsigned long)(ref + 1) + name_len; + nlink++; + } + + if (key.offset == 0) + break; + key.offset--; + btrfs_release_path(root, path); + } + btrfs_release_path(root, path); + if (nlink != inode->i_nlink) { + inode->i_nlink = nlink; + btrfs_update_inode(trans, root, inode); + } + BTRFS_I(inode)->index_cnt = (u64)-1; + + if (inode->i_nlink == 0) { + if (S_ISDIR(inode->i_mode)) { + ret = replay_dir_deletes(trans, root, NULL, path, + inode->i_ino, 1); + BUG_ON(ret); + } + ret = insert_orphan_item(trans, root, inode->i_ino); + BUG_ON(ret); + } + btrfs_free_path(path); + + return 0; +} + +static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + struct inode *inode; + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = (u64)-1; + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + break; + + if (ret == 1) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || + key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + + btrfs_release_path(root, path); + inode = read_one_inode(root, key.offset); + BUG_ON(!inode); + + ret = fixup_inode_link_count(trans, root, inode); + BUG_ON(ret); + + iput(inode); + + /* + * fixup on a directory may create new entries, + * make sure we always look for the highset possible + * offset + */ + key.offset = (u64)-1; + } + btrfs_release_path(root, path); + return 0; +} + + +/* + * record a given inode in the fixup dir so we can check its link + * count when replay is done. The link count is incremented here + * so the inode won't go away until we check it + */ +static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid) +{ + struct btrfs_key key; + int ret = 0; + struct inode *inode; + + inode = read_one_inode(root, objectid); + BUG_ON(!inode); + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = objectid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_release_path(root, path); + if (ret == 0) { + btrfs_inc_nlink(inode); + btrfs_update_inode(trans, root, inode); + } else if (ret == -EEXIST) { + ret = 0; + } else { + BUG(); + } + iput(inode); + + return ret; +} + +/* + * when replaying the log for a directory, we only insert names + * for inodes that actually exist. This means an fsync on a directory + * does not implicitly fsync all the new files in it + */ +static noinline int insert_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 index, + char *name, int name_len, u8 type, + struct btrfs_key *location) +{ + struct inode *inode; + struct inode *dir; + int ret; + + inode = read_one_inode(root, location->objectid); + if (!inode) + return -ENOENT; + + dir = read_one_inode(root, dirid); + if (!dir) { + iput(inode); + return -EIO; + } + ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); + + /* FIXME, put inode into FIXUP list */ + + iput(inode); + iput(dir); + return ret; +} + +/* + * take a single entry in a log directory item and replay it into + * the subvolume. + * + * if a conflicting item exists in the subdirectory already, + * the inode it points to is unlinked and put into the link count + * fix up tree. + * + * If a name from the log points to a file or directory that does + * not exist in the FS, it is skipped. fsyncs on directories + * do not force down inodes inside that directory, just changes to the + * names or unlinks in a directory. + */ +static noinline int replay_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, + struct btrfs_dir_item *di, + struct btrfs_key *key) +{ + char *name; + int name_len; + struct btrfs_dir_item *dst_di; + struct btrfs_key found_key; + struct btrfs_key log_key; + struct inode *dir; + u8 log_type; + int exists; + int ret; + + dir = read_one_inode(root, key->objectid); + BUG_ON(!dir); + + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + log_type = btrfs_dir_type(eb, di); + read_extent_buffer(eb, name, (unsigned long)(di + 1), + name_len); + + btrfs_dir_item_key_to_cpu(eb, di, &log_key); + exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); + if (exists == 0) + exists = 1; + else + exists = 0; + btrfs_release_path(root, path); + + if (key->type == BTRFS_DIR_ITEM_KEY) { + dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, + name, name_len, 1); + } else if (key->type == BTRFS_DIR_INDEX_KEY) { + dst_di = btrfs_lookup_dir_index_item(trans, root, path, + key->objectid, + key->offset, name, + name_len, 1); + } else { + BUG(); + } + if (!dst_di || IS_ERR(dst_di)) { + /* we need a sequence number to insert, so we only + * do inserts for the BTRFS_DIR_INDEX_KEY types + */ + if (key->type != BTRFS_DIR_INDEX_KEY) + goto out; + goto insert; + } + + btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); + /* the existing item matches the logged item */ + if (found_key.objectid == log_key.objectid && + found_key.type == log_key.type && + found_key.offset == log_key.offset && + btrfs_dir_type(path->nodes[0], dst_di) == log_type) { + goto out; + } + + /* + * don't drop the conflicting directory entry if the inode + * for the new entry doesn't exist + */ + if (!exists) + goto out; + + ret = drop_one_dir_item(trans, root, path, dir, dst_di); + BUG_ON(ret); + + if (key->type == BTRFS_DIR_INDEX_KEY) + goto insert; +out: + btrfs_release_path(root, path); + kfree(name); + iput(dir); + return 0; + +insert: + btrfs_release_path(root, path); + ret = insert_one_name(trans, root, path, key->objectid, key->offset, + name, name_len, log_type, &log_key); + + BUG_ON(ret && ret != -ENOENT); + goto out; +} + +/* + * find all the names in a directory item and reconcile them into + * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than + * one name in a directory item, but the same code gets used for + * both directory index types + */ +static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size = btrfs_item_size_nr(eb, slot); + struct btrfs_dir_item *di; + int name_len; + unsigned long ptr; + unsigned long ptr_end; + + ptr = btrfs_item_ptr_offset(eb, slot); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + di = (struct btrfs_dir_item *)ptr; + name_len = btrfs_dir_name_len(eb, di); + ret = replay_one_name(trans, root, path, eb, di, key); + BUG_ON(ret); + ptr = (unsigned long)(di + 1); + ptr += name_len; + } + return 0; +} + +/* + * directory replay has two parts. There are the standard directory + * items in the log copied from the subvolume, and range items + * created in the log while the subvolume was logged. + * + * The range items tell us which parts of the key space the log + * is authoritative for. During replay, if a key in the subvolume + * directory is in a logged range item, but not actually in the log + * that means it was deleted from the directory before the fsync + * and should be removed. + */ +static noinline int find_dir_range(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, int key_type, + u64 *start_ret, u64 *end_ret) +{ + struct btrfs_key key; + u64 found_end; + struct btrfs_dir_log_item *item; + int ret; + int nritems; + + if (*start_ret == (u64)-1) + return 1; + + key.objectid = dirid; + key.type = key_type; + key.offset = *start_ret; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + if (ret != 0) + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != key_type || key.objectid != dirid) { + ret = 1; + goto next; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + + if (*start_ret >= key.offset && *start_ret <= found_end) { + ret = 0; + *start_ret = key.offset; + *end_ret = found_end; + goto out; + } + ret = 1; +next: + /* check the next slot in the tree to see if it is a valid item */ + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + } else { + path->slots[0]++; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != key_type || key.objectid != dirid) { + ret = 1; + goto out; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + *start_ret = key.offset; + *end_ret = found_end; + ret = 0; +out: + btrfs_release_path(root, path); + return ret; +} + +/* + * this looks for a given directory item in the log. If the directory + * item is not in the log, the item is removed and the inode it points + * to is unlinked + */ +static noinline int check_item_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct btrfs_path *log_path, + struct inode *dir, + struct btrfs_key *dir_key) +{ + int ret; + struct extent_buffer *eb; + int slot; + u32 item_size; + struct btrfs_dir_item *di; + struct btrfs_dir_item *log_di; + int name_len; + unsigned long ptr; + unsigned long ptr_end; + char *name; + struct inode *inode; + struct btrfs_key location; + +again: + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size_nr(eb, slot); + ptr = btrfs_item_ptr_offset(eb, slot); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + di = (struct btrfs_dir_item *)ptr; + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(eb, name, (unsigned long)(di + 1), + name_len); + log_di = NULL; + if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { + log_di = btrfs_lookup_dir_item(trans, log, log_path, + dir_key->objectid, + name, name_len, 0); + } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { + log_di = btrfs_lookup_dir_index_item(trans, log, + log_path, + dir_key->objectid, + dir_key->offset, + name, name_len, 0); + } + if (!log_di || IS_ERR(log_di)) { + btrfs_dir_item_key_to_cpu(eb, di, &location); + btrfs_release_path(root, path); + btrfs_release_path(log, log_path); + inode = read_one_inode(root, location.objectid); + BUG_ON(!inode); + + ret = link_to_fixup_dir(trans, root, + path, location.objectid); + BUG_ON(ret); + btrfs_inc_nlink(inode); + ret = btrfs_unlink_inode(trans, root, dir, inode, + name, name_len); + BUG_ON(ret); + kfree(name); + iput(inode); + + /* there might still be more names under this key + * check and repeat if required + */ + ret = btrfs_search_slot(NULL, root, dir_key, path, + 0, 0); + if (ret == 0) + goto again; + ret = 0; + goto out; + } + btrfs_release_path(log, log_path); + kfree(name); + + ptr = (unsigned long)(di + 1); + ptr += name_len; + } + ret = 0; +out: + btrfs_release_path(root, path); + btrfs_release_path(log, log_path); + return ret; +} + +/* + * deletion replay happens before we copy any new directory items + * out of the log or out of backreferences from inodes. It + * scans the log to find ranges of keys that log is authoritative for, + * and then scans the directory to find items in those ranges that are + * not present in the log. + * + * Anything we don't find in the log is unlinked and removed from the + * directory. + */ +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid, int del_all) +{ + u64 range_start; + u64 range_end; + int key_type = BTRFS_DIR_LOG_ITEM_KEY; + int ret = 0; + struct btrfs_key dir_key; + struct btrfs_key found_key; + struct btrfs_path *log_path; + struct inode *dir; + + dir_key.objectid = dirid; + dir_key.type = BTRFS_DIR_ITEM_KEY; + log_path = btrfs_alloc_path(); + if (!log_path) + return -ENOMEM; + + dir = read_one_inode(root, dirid); + /* it isn't an error if the inode isn't there, that can happen + * because we replay the deletes before we copy in the inode item + * from the log + */ + if (!dir) { + btrfs_free_path(log_path); + return 0; + } +again: + range_start = 0; + range_end = 0; + while (1) { + if (del_all) + range_end = (u64)-1; + else { + ret = find_dir_range(log, path, dirid, key_type, + &range_start, &range_end); + if (ret != 0) + break; + } + + dir_key.offset = range_start; + while (1) { + int nritems; + ret = btrfs_search_slot(NULL, root, &dir_key, path, + 0, 0); + if (ret < 0) + goto out; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != dirid || + found_key.type != dir_key.type) + goto next_type; + + if (found_key.offset > range_end) + break; + + ret = check_item_in_log(trans, root, log, path, + log_path, dir, + &found_key); + BUG_ON(ret); + if (found_key.offset == (u64)-1) + break; + dir_key.offset = found_key.offset + 1; + } + btrfs_release_path(root, path); + if (range_end == (u64)-1) + break; + range_start = range_end + 1; + } + +next_type: + ret = 0; + if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { + key_type = BTRFS_DIR_LOG_INDEX_KEY; + dir_key.type = BTRFS_DIR_INDEX_KEY; + btrfs_release_path(root, path); + goto again; + } +out: + btrfs_release_path(root, path); + btrfs_free_path(log_path); + iput(dir); + return ret; +} + +/* + * the process_func used to replay items from the log tree. This + * gets called in two different stages. The first stage just looks + * for inodes and makes sure they are all copied into the subvolume. + * + * The second stage copies all the other item types from the log into + * the subvolume. The two stage approach is slower, but gets rid of + * lots of complexity around inodes referencing other inodes that exist + * only in the log (references come from either directory items or inode + * back refs). + */ +static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen) +{ + int nritems; + struct btrfs_path *path; + struct btrfs_root *root = wc->replay_dest; + struct btrfs_key key; + u32 item_size; + int level; + int i; + int ret; + + btrfs_read_buffer(eb, gen); + + level = btrfs_header_level(eb); + + if (level != 0) + return 0; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + nritems = btrfs_header_nritems(eb); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(eb, &key, i); + item_size = btrfs_item_size_nr(eb, i); + + /* inode keys are done during the first stage */ + if (key.type == BTRFS_INODE_ITEM_KEY && + wc->stage == LOG_WALK_REPLAY_INODES) { + struct btrfs_inode_item *inode_item; + u32 mode; + + inode_item = btrfs_item_ptr(eb, i, + struct btrfs_inode_item); + mode = btrfs_inode_mode(eb, inode_item); + if (S_ISDIR(mode)) { + ret = replay_dir_deletes(wc->trans, + root, log, path, key.objectid, 0); + BUG_ON(ret); + } + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + + /* for regular files, make sure corresponding + * orhpan item exist. extents past the new EOF + * will be truncated later by orphan cleanup. + */ + if (S_ISREG(mode)) { + ret = insert_orphan_item(wc->trans, root, + key.objectid); + BUG_ON(ret); + } + + ret = link_to_fixup_dir(wc->trans, root, + path, key.objectid); + BUG_ON(ret); + } + if (wc->stage < LOG_WALK_REPLAY_ALL) + continue; + + /* these keys are simply copied */ + if (key.type == BTRFS_XATTR_ITEM_KEY) { + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } else if (key.type == BTRFS_INODE_REF_KEY) { + ret = add_inode_ref(wc->trans, root, log, path, + eb, i, &key); + BUG_ON(ret && ret != -ENOENT); + } else if (key.type == BTRFS_EXTENT_DATA_KEY) { + ret = replay_one_extent(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } else if (key.type == BTRFS_DIR_ITEM_KEY || + key.type == BTRFS_DIR_INDEX_KEY) { + ret = replay_one_dir_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } + } + btrfs_free_path(path); + return 0; +} + +static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + u64 root_owner; + u64 root_gen; + u64 bytenr; + u64 ptr_gen; + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + u32 blocksize; + int ret = 0; + + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + while (*level > 0) { + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + cur = path->nodes[*level]; + + if (btrfs_header_level(cur) != *level) + WARN_ON(1); + + if (path->slots[*level] >= + btrfs_header_nritems(cur)) + break; + + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + blocksize = btrfs_level_size(root, *level - 1); + + parent = path->nodes[*level]; + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + next = btrfs_find_create_tree_block(root, bytenr, blocksize); + + wc->process_func(root, next, wc, ptr_gen); + + if (*level == 1) { + path->slots[*level]++; + if (wc->free) { + btrfs_read_buffer(next, ptr_gen); + + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + WARN_ON(root_owner != + BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, + bytenr, blocksize); + BUG_ON(ret); + } + free_extent_buffer(next); + continue; + } + btrfs_read_buffer(next, ptr_gen); + + WARN_ON(*level <= 0); + if (path->nodes[*level-1]) + free_extent_buffer(path->nodes[*level-1]); + path->nodes[*level-1] = next; + *level = btrfs_header_level(next); + path->slots[*level] = 0; + cond_resched(); + } + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + bytenr = path->nodes[*level]->start; + + blocksize = btrfs_level_size(root, *level); + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + wc->process_func(root, path->nodes[*level], wc, + btrfs_header_generation(path->nodes[*level])); + + if (wc->free) { + next = path->nodes[*level]; + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, bytenr, blocksize); + BUG_ON(ret); + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level += 1; + + cond_resched(); + return 0; +} + +static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + u64 root_owner; + u64 root_gen; + int i; + int slot; + int ret; + + for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { + slot = path->slots[i]; + if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { + struct extent_buffer *node; + node = path->nodes[i]; + path->slots[i]++; + *level = i; + WARN_ON(*level == 0); + return 0; + } else { + struct extent_buffer *parent; + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + wc->process_func(root, path->nodes[*level], wc, + btrfs_header_generation(path->nodes[*level])); + if (wc->free) { + struct extent_buffer *next; + + next = path->nodes[*level]; + + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, + path->nodes[*level]->start, + path->nodes[*level]->len); + BUG_ON(ret); + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level = i + 1; + } + } + return 1; +} + +/* + * drop the reference count on the tree rooted at 'snap'. This traverses + * the tree freeing any blocks that have a ref count of zero after being + * decremented. + */ +static int walk_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct walk_control *wc) +{ + int ret = 0; + int wret; + int level; + struct btrfs_path *path; + int i; + int orig_level; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + level = btrfs_header_level(log->node); + orig_level = level; + path->nodes[level] = log->node; + extent_buffer_get(log->node); + path->slots[level] = 0; + + while (1) { + wret = walk_down_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + + wret = walk_up_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + } + + /* was the root node processed? if not, catch it here */ + if (path->nodes[orig_level]) { + wc->process_func(log, path->nodes[orig_level], wc, + btrfs_header_generation(path->nodes[orig_level])); + if (wc->free) { + struct extent_buffer *next; + + next = path->nodes[orig_level]; + + btrfs_tree_lock(next); + clean_tree_block(trans, log, next); + btrfs_set_lock_blocking(next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + WARN_ON(log->root_key.objectid != + BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(log, next->start, + next->len); + BUG_ON(ret); + } + } + + for (i = 0; i <= orig_level; i++) { + if (path->nodes[i]) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + } + btrfs_free_path(path); + return ret; +} + +/* + * helper function to update the item for a given subvolumes log root + * in the tree of log roots + */ +static int update_log_root(struct btrfs_trans_handle *trans, + struct btrfs_root *log) +{ + int ret; + + if (log->log_transid == 1) { + /* insert root item on the first sync */ + ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, + &log->root_key, &log->root_item); + } else { + ret = btrfs_update_root(trans, log->fs_info->log_root_tree, + &log->root_key, &log->root_item); + } + return ret; +} + +static int wait_log_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long transid) +{ + DEFINE_WAIT(wait); + int index = transid % 2; + + /* + * we only allow two pending log transactions at a time, + * so we know that if ours is more than 2 older than the + * current transaction, we're done + */ + do { + prepare_to_wait(&root->log_commit_wait[index], + &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&root->log_mutex); + + if (root->fs_info->last_trans_log_full_commit != + trans->transid && root->log_transid < transid + 2 && + atomic_read(&root->log_commit[index])) + schedule(); + + finish_wait(&root->log_commit_wait[index], &wait); + mutex_lock(&root->log_mutex); + } while (root->log_transid < transid + 2 && + atomic_read(&root->log_commit[index])); + return 0; +} + +static int wait_for_writer(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + DEFINE_WAIT(wait); + while (atomic_read(&root->log_writers)) { + prepare_to_wait(&root->log_writer_wait, + &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&root->log_mutex); + if (root->fs_info->last_trans_log_full_commit != + trans->transid && atomic_read(&root->log_writers)) + schedule(); + mutex_lock(&root->log_mutex); + finish_wait(&root->log_writer_wait, &wait); + } + return 0; +} + +/* + * btrfs_sync_log does sends a given tree log down to the disk and + * updates the super blocks to record it. When this call is done, + * you know that any inodes previously logged are safely on disk only + * if it returns 0. + * + * Any other return value means you need to call btrfs_commit_transaction. + * Some of the edge cases for fsyncing directories that have had unlinks + * or renames done in the past mean that sometimes the only safe + * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, + * that has happened. + */ +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int index1; + int index2; + int mark; + int ret; + struct btrfs_root *log = root->log_root; + struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; + unsigned long log_transid = 0; + + mutex_lock(&root->log_mutex); + index1 = root->log_transid % 2; + if (atomic_read(&root->log_commit[index1])) { + wait_log_commit(trans, root, root->log_transid); + mutex_unlock(&root->log_mutex); + return 0; + } + atomic_set(&root->log_commit[index1], 1); + + /* wait for previous tree log sync to complete */ + if (atomic_read(&root->log_commit[(index1 + 1) % 2])) + wait_log_commit(trans, root, root->log_transid - 1); + + while (1) { + unsigned long batch = root->log_batch; + if (root->log_multiple_pids) { + mutex_unlock(&root->log_mutex); + schedule_timeout_uninterruptible(1); + mutex_lock(&root->log_mutex); + } + wait_for_writer(trans, root); + if (batch == root->log_batch) + break; + } + + /* bail out if we need to do a full commit */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { + ret = -EAGAIN; + mutex_unlock(&root->log_mutex); + goto out; + } + + log_transid = root->log_transid; + if (log_transid % 2 == 0) + mark = EXTENT_DIRTY; + else + mark = EXTENT_NEW; + + /* we start IO on all the marked extents here, but we don't actually + * wait for them until later. + */ + ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); + BUG_ON(ret); + + btrfs_set_root_node(&log->root_item, log->node); + + root->log_batch = 0; + root->log_transid++; + log->log_transid = root->log_transid; + root->log_start_pid = 0; + smp_mb(); + /* + * IO has been started, blocks of the log tree have WRITTEN flag set + * in their headers. new modifications of the log will be written to + * new positions. so it's safe to allow log writers to go in. + */ + mutex_unlock(&root->log_mutex); + + mutex_lock(&log_root_tree->log_mutex); + log_root_tree->log_batch++; + atomic_inc(&log_root_tree->log_writers); + mutex_unlock(&log_root_tree->log_mutex); + + ret = update_log_root(trans, log); + BUG_ON(ret); + + mutex_lock(&log_root_tree->log_mutex); + if (atomic_dec_and_test(&log_root_tree->log_writers)) { + smp_mb(); + if (waitqueue_active(&log_root_tree->log_writer_wait)) + wake_up(&log_root_tree->log_writer_wait); + } + + index2 = log_root_tree->log_transid % 2; + if (atomic_read(&log_root_tree->log_commit[index2])) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid); + mutex_unlock(&log_root_tree->log_mutex); + goto out; + } + atomic_set(&log_root_tree->log_commit[index2], 1); + + if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid - 1); + } + + wait_for_writer(trans, log_root_tree); + + /* + * now that we've moved on to the tree of log tree roots, + * check the full commit flag again + */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out_wake_log_root; + } + + ret = btrfs_write_and_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); + BUG_ON(ret); + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + + btrfs_set_super_log_root(&root->fs_info->super_for_commit, + log_root_tree->node->start); + btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, + btrfs_header_level(log_root_tree->node)); + + log_root_tree->log_batch = 0; + log_root_tree->log_transid++; + smp_mb(); + + mutex_unlock(&log_root_tree->log_mutex); + + /* + * nobody else is going to jump in and write the the ctree + * super here because the log_commit atomic below is protecting + * us. We must be called with a transaction handle pinning + * the running transaction open, so a full commit can't hop + * in and cause problems either. + */ + write_ctree_super(trans, root->fs_info->tree_root, 1); + ret = 0; + + mutex_lock(&root->log_mutex); + if (root->last_log_commit < log_transid) + root->last_log_commit = log_transid; + mutex_unlock(&root->log_mutex); + +out_wake_log_root: + atomic_set(&log_root_tree->log_commit[index2], 0); + smp_mb(); + if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) + wake_up(&log_root_tree->log_commit_wait[index2]); +out: + atomic_set(&root->log_commit[index1], 0); + smp_mb(); + if (waitqueue_active(&root->log_commit_wait[index1])) + wake_up(&root->log_commit_wait[index1]); + return 0; +} + +/* + * free all the extents used by the tree log. This should be called + * at commit time of the full transaction + */ +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +{ + int ret; + struct btrfs_root *log; + struct key; + u64 start; + u64 end; + struct walk_control wc = { + .free = 1, + .process_func = process_one_buffer + }; + + if (!root->log_root || root->fs_info->log_root_recovering) + return 0; + + log = root->log_root; + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + + while (1) { + ret = find_first_extent_bit(&log->dirty_log_pages, + 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); + if (ret) + break; + + clear_extent_bits(&log->dirty_log_pages, start, end, + EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); + } + + if (log->log_transid > 0) { + ret = btrfs_del_root(trans, root->fs_info->log_root_tree, + &log->root_key); + BUG_ON(ret); + } + root->log_root = NULL; + free_extent_buffer(log->node); + kfree(log); + return 0; +} + +/* + * If both a file and directory are logged, and unlinks or renames are + * mixed in, we have a few interesting corners: + * + * create file X in dir Y + * link file X to X.link in dir Y + * fsync file X + * unlink file X but leave X.link + * fsync dir Y + * + * After a crash we would expect only X.link to exist. But file X + * didn't get fsync'd again so the log has back refs for X and X.link. + * + * We solve this by removing directory entries and inode backrefs from the + * log when a file that was logged in the current transaction is + * unlinked. Any later fsync will include the updated log entries, and + * we'll be able to reconstruct the proper directory items from backrefs. + * + * This optimizations allows us to avoid relogging the entire inode + * or the entire directory. + */ +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index) +{ + struct btrfs_root *log; + struct btrfs_dir_item *di; + struct btrfs_path *path; + int ret; + int bytes_del = 0; + + if (BTRFS_I(dir)->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + if (ret) + return 0; + + mutex_lock(&BTRFS_I(dir)->log_mutex); + + log = root->log_root; + path = btrfs_alloc_path(); + di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, + name, name_len, -1); + if (di && !IS_ERR(di)) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + BUG_ON(ret); + } + btrfs_release_path(log, path); + di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, + index, name, name_len, -1); + if (di && !IS_ERR(di)) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + BUG_ON(ret); + } + + /* update the directory size in the log to reflect the names + * we have removed + */ + if (bytes_del) { + struct btrfs_key key; + + key.objectid = dir->i_ino; + key.offset = 0; + key.type = BTRFS_INODE_ITEM_KEY; + btrfs_release_path(log, path); + + ret = btrfs_search_slot(trans, log, &key, path, 0, 1); + if (ret == 0) { + struct btrfs_inode_item *item; + u64 i_size; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + i_size = btrfs_inode_size(path->nodes[0], item); + if (i_size > bytes_del) + i_size -= bytes_del; + else + i_size = 0; + btrfs_set_inode_size(path->nodes[0], item, i_size); + btrfs_mark_buffer_dirty(path->nodes[0]); + } else + ret = 0; + btrfs_release_path(log, path); + } + + btrfs_free_path(path); + mutex_unlock(&BTRFS_I(dir)->log_mutex); + btrfs_end_log_trans(root); + + return 0; +} + +/* see comments for btrfs_del_dir_entries_in_log */ +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *inode, u64 dirid) +{ + struct btrfs_root *log; + u64 index; + int ret; + + if (BTRFS_I(inode)->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + if (ret) + return 0; + log = root->log_root; + mutex_lock(&BTRFS_I(inode)->log_mutex); + + ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, + dirid, &index); + mutex_unlock(&BTRFS_I(inode)->log_mutex); + btrfs_end_log_trans(root); + + return ret; +} + +/* + * creates a range item in the log for 'dirid'. first_offset and + * last_offset tell us which parts of the key space the log should + * be considered authoritative for. + */ +static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + int key_type, u64 dirid, + u64 first_offset, u64 last_offset) +{ + int ret; + struct btrfs_key key; + struct btrfs_dir_log_item *item; + + key.objectid = dirid; + key.offset = first_offset; + if (key_type == BTRFS_DIR_ITEM_KEY) + key.type = BTRFS_DIR_LOG_ITEM_KEY; + else + key.type = BTRFS_DIR_LOG_INDEX_KEY; + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); + BUG_ON(ret); + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + btrfs_set_dir_log_end(path->nodes[0], item, last_offset); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(log, path); + return 0; +} + +/* + * log all the items included in the current transaction for a given + * directory. This also creates the range items in the log tree required + * to replay anything deleted before the fsync + */ +static noinline int log_dir_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, int key_type, + u64 min_offset, u64 *last_offset_ret) +{ + struct btrfs_key min_key; + struct btrfs_key max_key; + struct btrfs_root *log = root->log_root; + struct extent_buffer *src; + int ret; + int i; + int nritems; + u64 first_offset = min_offset; + u64 last_offset = (u64)-1; + + log = root->log_root; + max_key.objectid = inode->i_ino; + max_key.offset = (u64)-1; + max_key.type = key_type; + + min_key.objectid = inode->i_ino; + min_key.type = key_type; + min_key.offset = min_offset; + + path->keep_locks = 1; + + ret = btrfs_search_forward(root, &min_key, &max_key, + path, 0, trans->transid); + + /* + * we didn't find anything from this transaction, see if there + * is anything at all + */ + if (ret != 0 || min_key.objectid != inode->i_ino || + min_key.type != key_type) { + min_key.objectid = inode->i_ino; + min_key.type = key_type; + min_key.offset = (u64)-1; + btrfs_release_path(root, path); + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret < 0) { + btrfs_release_path(root, path); + return ret; + } + ret = btrfs_previous_item(root, path, inode->i_ino, key_type); + + /* if ret == 0 there are items for this type, + * create a range to tell us the last key of this type. + * otherwise, there are no items in this directory after + * *min_offset, and we create a range to indicate that. + */ + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, + path->slots[0]); + if (key_type == tmp.type) + first_offset = max(min_offset, tmp.offset) + 1; + } + goto done; + } + + /* go backward to find any previous key */ + ret = btrfs_previous_item(root, path, inode->i_ino, key_type); + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); + if (key_type == tmp.type) { + first_offset = tmp.offset; + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], + &tmp); + } + } + btrfs_release_path(root, path); + + /* find the first key from this transaction again */ + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret != 0) { + WARN_ON(1); + goto done; + } + + /* + * we have a block from this transaction, log every item in it + * from our directory + */ + while (1) { + struct btrfs_key tmp; + src = path->nodes[0]; + nritems = btrfs_header_nritems(src); + for (i = path->slots[0]; i < nritems; i++) { + btrfs_item_key_to_cpu(src, &min_key, i); + + if (min_key.objectid != inode->i_ino || + min_key.type != key_type) + goto done; + ret = overwrite_item(trans, log, dst_path, src, i, + &min_key); + BUG_ON(ret); + } + path->slots[0] = nritems; + + /* + * look ahead to the next item and see if it is also + * from this directory and from this transaction + */ + ret = btrfs_next_leaf(root, path); + if (ret == 1) { + last_offset = (u64)-1; + goto done; + } + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); + if (tmp.objectid != inode->i_ino || tmp.type != key_type) { + last_offset = (u64)-1; + goto done; + } + if (btrfs_header_generation(path->nodes[0]) != trans->transid) { + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], + &tmp); + + BUG_ON(ret); + last_offset = tmp.offset; + goto done; + } + } +done: + *last_offset_ret = last_offset; + btrfs_release_path(root, path); + btrfs_release_path(log, dst_path); + + /* insert the log range keys to indicate where the log is valid */ + ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, + first_offset, last_offset); + BUG_ON(ret); + return 0; +} + +/* + * logging directories is very similar to logging inodes, We find all the items + * from the current transaction and write them to the log. + * + * The recovery code scans the directory in the subvolume, and if it finds a + * key in the range logged that is not present in the log tree, then it means + * that dir entry was unlinked during the transaction. + * + * In order for that scan to work, we must include one key smaller than + * the smallest logged by this transaction and one key larger than the largest + * key logged by this transaction. + */ +static noinline int log_directory_changes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + u64 min_key; + u64 max_key; + int ret; + int key_type = BTRFS_DIR_ITEM_KEY; + +again: + min_key = 0; + max_key = 0; + while (1) { + ret = log_dir_items(trans, root, inode, path, + dst_path, key_type, min_key, + &max_key); + BUG_ON(ret); + if (max_key == (u64)-1) + break; + min_key = max_key + 1; + } + + if (key_type == BTRFS_DIR_ITEM_KEY) { + key_type = BTRFS_DIR_INDEX_KEY; + goto again; + } + return 0; +} + +/* + * a helper function to drop items from the log before we relog an + * inode. max_key_type indicates the highest item type to remove. + * This cannot be run for file data extents because it does not + * free the extents they point to. + */ +static int drop_objectid_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + u64 objectid, int max_key_type) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + + key.objectid = objectid; + key.type = max_key_type; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(trans, log, &key, path, -1, 1); + + if (ret != 1) + break; + + if (path->slots[0] == 0) + break; + + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + if (found_key.objectid != objectid) + break; + + ret = btrfs_del_item(trans, log, path); + BUG_ON(ret); + btrfs_release_path(log, path); + } + btrfs_release_path(log, path); + return 0; +} + +static noinline int copy_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *dst_path, + struct extent_buffer *src, + int start_slot, int nr, int inode_only) +{ + unsigned long src_offset; + unsigned long dst_offset; + struct btrfs_file_extent_item *extent; + struct btrfs_inode_item *inode_item; + int ret; + struct btrfs_key *ins_keys; + u32 *ins_sizes; + char *ins_data; + int i; + struct list_head ordered_sums; + + INIT_LIST_HEAD(&ordered_sums); + + ins_data = kmalloc(nr * sizeof(struct btrfs_key) + + nr * sizeof(u32), GFP_NOFS); + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); + + for (i = 0; i < nr; i++) { + ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); + btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); + } + ret = btrfs_insert_empty_items(trans, log, dst_path, + ins_keys, ins_sizes, nr); + BUG_ON(ret); + + for (i = 0; i < nr; i++, dst_path->slots[0]++) { + dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], + dst_path->slots[0]); + + src_offset = btrfs_item_ptr_offset(src, start_slot + i); + + copy_extent_buffer(dst_path->nodes[0], src, dst_offset, + src_offset, ins_sizes[i]); + + if (inode_only == LOG_INODE_EXISTS && + ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { + inode_item = btrfs_item_ptr(dst_path->nodes[0], + dst_path->slots[0], + struct btrfs_inode_item); + btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); + + /* set the generation to zero so the recover code + * can tell the difference between an logging + * just to say 'this inode exists' and a logging + * to say 'update this inode with these values' + */ + btrfs_set_inode_generation(dst_path->nodes[0], + inode_item, 0); + } + /* take a reference on file data extents so that truncates + * or deletes of this inode don't have to relog the inode + * again + */ + if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { + int found_type; + extent = btrfs_item_ptr(src, start_slot + i, + struct btrfs_file_extent_item); + + found_type = btrfs_file_extent_type(src, extent); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 ds, dl, cs, cl; + ds = btrfs_file_extent_disk_bytenr(src, + extent); + /* ds == 0 is a hole */ + if (ds == 0) + continue; + + dl = btrfs_file_extent_disk_num_bytes(src, + extent); + cs = btrfs_file_extent_offset(src, extent); + cl = btrfs_file_extent_num_bytes(src, + extent); + if (btrfs_file_extent_compression(src, + extent)) { + cs = 0; + cl = dl; + } + + ret = btrfs_lookup_csums_range( + log->fs_info->csum_root, + ds + cs, ds + cs + cl - 1, + &ordered_sums); + BUG_ON(ret); + } + } + } + + btrfs_mark_buffer_dirty(dst_path->nodes[0]); + btrfs_release_path(log, dst_path); + kfree(ins_data); + + /* + * we have to do this after the loop above to avoid changing the + * log tree while trying to change the log tree. + */ + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + ret = btrfs_csum_file_blocks(trans, log, sums); + BUG_ON(ret); + list_del(&sums->list); + kfree(sums); + } + return 0; +} + +/* log a single inode in the tree log. + * At least one parent directory for this inode must exist in the tree + * or be logged already. + * + * Any items from this inode changed by the current transaction are copied + * to the log tree. An extra reference is taken on any extents in this + * file, allowing us to avoid a whole pile of corner cases around logging + * blocks that have been removed from the tree. + * + * See LOG_INODE_ALL and related defines for a description of what inode_only + * does. + * + * This handles both files and directories. + */ +static int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only) +{ + struct btrfs_path *path; + struct btrfs_path *dst_path; + struct btrfs_key min_key; + struct btrfs_key max_key; + struct btrfs_root *log = root->log_root; + struct extent_buffer *src = NULL; + u32 size; + int ret; + int nritems; + int ins_start_slot = 0; + int ins_nr; + + log = root->log_root; + + path = btrfs_alloc_path(); + dst_path = btrfs_alloc_path(); + + min_key.objectid = inode->i_ino; + min_key.type = BTRFS_INODE_ITEM_KEY; + min_key.offset = 0; + + max_key.objectid = inode->i_ino; + + /* today the code can only do partial logging of directories */ + if (!S_ISDIR(inode->i_mode)) + inode_only = LOG_INODE_ALL; + + if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) + max_key.type = BTRFS_XATTR_ITEM_KEY; + else + max_key.type = (u8)-1; + max_key.offset = (u64)-1; + + mutex_lock(&BTRFS_I(inode)->log_mutex); + + /* + * a brute force approach to making sure we get the most uptodate + * copies of everything. + */ + if (S_ISDIR(inode->i_mode)) { + int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; + + if (inode_only == LOG_INODE_EXISTS) + max_key_type = BTRFS_XATTR_ITEM_KEY; + ret = drop_objectid_items(trans, log, path, + inode->i_ino, max_key_type); + } else { + ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); + } + BUG_ON(ret); + path->keep_locks = 1; + + while (1) { + ins_nr = 0; + ret = btrfs_search_forward(root, &min_key, &max_key, + path, 0, trans->transid); + if (ret != 0) + break; +again: + /* note, ins_nr might be > 0 here, cleanup outside the loop */ + if (min_key.objectid != inode->i_ino) + break; + if (min_key.type > max_key.type) + break; + + src = path->nodes[0]; + size = btrfs_item_size_nr(src, path->slots[0]); + if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { + ins_nr++; + goto next_slot; + } else if (!ins_nr) { + ins_start_slot = path->slots[0]; + ins_nr = 1; + goto next_slot; + } + + ret = copy_items(trans, log, dst_path, src, ins_start_slot, + ins_nr, inode_only); + BUG_ON(ret); + ins_nr = 1; + ins_start_slot = path->slots[0]; +next_slot: + + nritems = btrfs_header_nritems(path->nodes[0]); + path->slots[0]++; + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(path->nodes[0], &min_key, + path->slots[0]); + goto again; + } + if (ins_nr) { + ret = copy_items(trans, log, dst_path, src, + ins_start_slot, + ins_nr, inode_only); + BUG_ON(ret); + ins_nr = 0; + } + btrfs_release_path(root, path); + + if (min_key.offset < (u64)-1) + min_key.offset++; + else if (min_key.type < (u8)-1) + min_key.type++; + else if (min_key.objectid < (u64)-1) + min_key.objectid++; + else + break; + } + if (ins_nr) { + ret = copy_items(trans, log, dst_path, src, + ins_start_slot, + ins_nr, inode_only); + BUG_ON(ret); + ins_nr = 0; + } + WARN_ON(ins_nr); + if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { + btrfs_release_path(root, path); + btrfs_release_path(log, dst_path); + ret = log_directory_changes(trans, root, inode, path, dst_path); + BUG_ON(ret); + } + BTRFS_I(inode)->logged_trans = trans->transid; + mutex_unlock(&BTRFS_I(inode)->log_mutex); + + btrfs_free_path(path); + btrfs_free_path(dst_path); + return 0; +} + +/* + * follow the dentry parent pointers up the chain and see if any + * of the directories in it require a full commit before they can + * be logged. Returns zero if nothing special needs to be done or 1 if + * a full commit is required. + */ +static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, + struct inode *inode, + struct dentry *parent, + struct super_block *sb, + u64 last_committed) +{ + int ret = 0; + struct btrfs_root *root; + + /* + * for regular files, if its inode is already on disk, we don't + * have to worry about the parents at all. This is because + * we can use the last_unlink_trans field to record renames + * and other fun in this file. + */ + if (S_ISREG(inode->i_mode) && + BTRFS_I(inode)->generation <= last_committed && + BTRFS_I(inode)->last_unlink_trans <= last_committed) + goto out; + + if (!S_ISDIR(inode->i_mode)) { + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + goto out; + inode = parent->d_inode; + } + + while (1) { + BTRFS_I(inode)->logged_trans = trans->transid; + smp_mb(); + + if (BTRFS_I(inode)->last_unlink_trans > last_committed) { + root = BTRFS_I(inode)->root; + + /* + * make sure any commits to the log are forced + * to be full commits + */ + root->fs_info->last_trans_log_full_commit = + trans->transid; + ret = 1; + break; + } + + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + break; + + if (IS_ROOT(parent)) + break; + + parent = parent->d_parent; + inode = parent->d_inode; + + } +out: + return ret; +} + +static int inode_in_log(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + mutex_lock(&root->log_mutex); + if (BTRFS_I(inode)->logged_trans == trans->transid && + BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) + ret = 1; + mutex_unlock(&root->log_mutex); + return ret; +} + + +/* + * helper function around btrfs_log_inode to make sure newly created + * parent directories also end up in the log. A minimal inode and backref + * only logging is done of any parent directories that are older than + * the last committed transaction + */ +int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct dentry *parent, int exists_only) +{ + int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; + struct super_block *sb; + int ret = 0; + u64 last_committed = root->fs_info->last_trans_committed; + + sb = inode->i_sb; + + if (btrfs_test_opt(root, NOTREELOG)) { + ret = 1; + goto end_no_trans; + } + + if (root->fs_info->last_trans_log_full_commit > + root->fs_info->last_trans_committed) { + ret = 1; + goto end_no_trans; + } + + if (root != BTRFS_I(inode)->root || + btrfs_root_refs(&root->root_item) == 0) { + ret = 1; + goto end_no_trans; + } + + ret = check_parent_dirs_for_sync(trans, inode, parent, + sb, last_committed); + if (ret) + goto end_no_trans; + + if (inode_in_log(trans, inode)) { + ret = BTRFS_NO_LOG_SYNC; + goto end_no_trans; + } + + start_log_trans(trans, root); + + ret = btrfs_log_inode(trans, root, inode, inode_only); + BUG_ON(ret); + + /* + * for regular files, if its inode is already on disk, we don't + * have to worry about the parents at all. This is because + * we can use the last_unlink_trans field to record renames + * and other fun in this file. + */ + if (S_ISREG(inode->i_mode) && + BTRFS_I(inode)->generation <= last_committed && + BTRFS_I(inode)->last_unlink_trans <= last_committed) + goto no_parent; + + inode_only = LOG_INODE_EXISTS; + while (1) { + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + break; + + inode = parent->d_inode; + if (root != BTRFS_I(inode)->root) + break; + + if (BTRFS_I(inode)->generation > + root->fs_info->last_trans_committed) { + ret = btrfs_log_inode(trans, root, inode, inode_only); + BUG_ON(ret); + } + if (IS_ROOT(parent)) + break; + + parent = parent->d_parent; + } +no_parent: + ret = 0; + btrfs_end_log_trans(root); +end_no_trans: + return ret; +} + +/* + * it is not safe to log dentry if the chunk root has added new + * chunks. This returns 0 if the dentry was logged, and 1 otherwise. + * If this returns 1, you must commit the transaction to safely get your + * data on disk. + */ +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry) +{ + return btrfs_log_inode_parent(trans, root, dentry->d_inode, + dentry->d_parent, 0); +} + +/* + * should be called during mount to recover any replay any log trees + * from the FS + */ +int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) +{ + int ret; + struct btrfs_path *path; + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key tmp_key; + struct btrfs_root *log; + struct btrfs_fs_info *fs_info = log_root_tree->fs_info; + struct walk_control wc = { + .process_func = process_one_buffer, + .stage = 0, + }; + + fs_info->log_root_recovering = 1; + path = btrfs_alloc_path(); + BUG_ON(!path); + + trans = btrfs_start_transaction(fs_info->tree_root, 1); + + wc.trans = trans; + wc.pin = 1; + + walk_log_tree(trans, log_root_tree, &wc); + +again: + key.objectid = BTRFS_TREE_LOG_OBJECTID; + key.offset = (u64)-1; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + + while (1) { + ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + btrfs_release_path(log_root_tree, path); + if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) + break; + + log = btrfs_read_fs_root_no_radix(log_root_tree, + &found_key); + BUG_ON(!log); + + + tmp_key.objectid = found_key.offset; + tmp_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_key.offset = (u64)-1; + + wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); + BUG_ON(!wc.replay_dest); + + wc.replay_dest->log_root = log; + btrfs_record_root_in_trans(trans, wc.replay_dest); + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + + if (wc.stage == LOG_WALK_REPLAY_ALL) { + ret = fixup_inode_link_counts(trans, wc.replay_dest, + path); + BUG_ON(ret); + } + + key.offset = found_key.offset - 1; + wc.replay_dest->log_root = NULL; + free_extent_buffer(log->node); + free_extent_buffer(log->commit_root); + kfree(log); + + if (found_key.offset == 0) + break; + } + btrfs_release_path(log_root_tree, path); + + /* step one is to pin it all, step two is to replay just inodes */ + if (wc.pin) { + wc.pin = 0; + wc.process_func = replay_one_buffer; + wc.stage = LOG_WALK_REPLAY_INODES; + goto again; + } + /* step three is to replay everything */ + if (wc.stage < LOG_WALK_REPLAY_ALL) { + wc.stage++; + goto again; + } + + btrfs_free_path(path); + + free_extent_buffer(log_root_tree->node); + log_root_tree->log_root = NULL; + fs_info->log_root_recovering = 0; + + /* step 4: commit the transaction, which also unpins the blocks */ + btrfs_commit_transaction(trans, fs_info->tree_root); + + kfree(log_root_tree); + return 0; +} + +/* + * there are some corner cases where we want to force a full + * commit instead of allowing a directory to be logged. + * + * They revolve around files there were unlinked from the directory, and + * this function updates the parent directory so that a full commit is + * properly done if it is fsync'd later after the unlinks are done. + */ +void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + struct inode *dir, struct inode *inode, + int for_rename) +{ + /* + * when we're logging a file, if it hasn't been renamed + * or unlinked, and its inode is fully committed on disk, + * we don't have to worry about walking up the directory chain + * to log its parents. + * + * So, we use the last_unlink_trans field to put this transid + * into the file. When the file is logged we check it and + * don't log the parents if the file is fully on disk. + */ + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->last_unlink_trans = trans->transid; + + /* + * if this directory was already logged any new + * names for this file/dir will get recorded + */ + smp_mb(); + if (BTRFS_I(dir)->logged_trans == trans->transid) + return; + + /* + * if the inode we're about to unlink was logged, + * the log will be properly updated for any new names + */ + if (BTRFS_I(inode)->logged_trans == trans->transid) + return; + + /* + * when renaming files across directories, if the directory + * there we're unlinking from gets fsync'd later on, there's + * no way to find the destination directory later and fsync it + * properly. So, we have to be conservative and force commits + * so the new name gets discovered. + */ + if (for_rename) + goto record; + + /* we can safely do the unlink without any special recording */ + return; + +record: + BTRFS_I(dir)->last_unlink_trans = trans->transid; +} + +/* + * Call this after adding a new name for a file and it will properly + * update the log to reflect the new name. + * + * It will return zero if all goes well, and it will return 1 if a + * full transaction commit is required. + */ +int btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *old_dir, + struct dentry *parent) +{ + struct btrfs_root * root = BTRFS_I(inode)->root; + + /* + * this will force the logging code to walk the dentry chain + * up for the file + */ + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->last_unlink_trans = trans->transid; + + /* + * if this inode hasn't been logged and directory we're renaming it + * from hasn't been logged, we don't need to log it + */ + if (BTRFS_I(inode)->logged_trans <= + root->fs_info->last_trans_committed && + (!old_dir || BTRFS_I(old_dir)->logged_trans <= + root->fs_info->last_trans_committed)) + return 0; + + return btrfs_log_inode_parent(trans, root, inode, parent, 1); +} + diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h new file mode 100644 index 00000000000..0776eacb508 --- /dev/null +++ b/fs/btrfs/tree-log.h @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __TREE_LOG_ +#define __TREE_LOG_ + +/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ +#define BTRFS_NO_LOG_SYNC 256 + +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_recover_log_trees(struct btrfs_root *tree_root); +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry); +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index); +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *inode, u64 dirid); +int btrfs_join_running_log_trans(struct btrfs_root *root); +int btrfs_end_log_trans(struct btrfs_root *root); +int btrfs_pin_log_trans(struct btrfs_root *root); +int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct dentry *parent, int exists_only); +void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + struct inode *dir, struct inode *inode, + int for_rename); +int btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *old_dir, + struct dentry *parent); +#endif diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h new file mode 100644 index 00000000000..9bf3946d5ef --- /dev/null +++ b/fs/btrfs/version.h @@ -0,0 +1,4 @@ +#ifndef __BTRFS_VERSION_H +#define __BTRFS_VERSION_H +#define BTRFS_BUILD_VERSION "Btrfs" +#endif diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh new file mode 100644 index 00000000000..1ca1952fd91 --- /dev/null +++ b/fs/btrfs/version.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# +# determine-version -- report a useful version for releases +# +# Copyright 2008, Aron Griffis <agriffis@n01se.net> +# Copyright 2008, Oracle +# Released under the GNU GPLv2 + +v="v0.16" + +which git &> /dev/null +if [ $? == 0 ]; then + git branch >& /dev/null + if [ $? == 0 ]; then + if head=`git rev-parse --verify HEAD 2>/dev/null`; then + if tag=`git describe --tags 2>/dev/null`; then + v="$tag" + fi + + # Are there uncommitted changes? + git update-index --refresh --unmerged > /dev/null + if git diff-index --name-only HEAD | \ + grep -v "^scripts/package" \ + | read dummy; then + v="$v"-dirty + fi + fi + fi +fi + +echo "#ifndef __BUILD_VERSION" > .build-version.h +echo "#define __BUILD_VERSION" >> .build-version.h +echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h +echo "#endif" >> .build-version.h + +diff -q version.h .build-version.h >& /dev/null + +if [ $? == 0 ]; then + rm .build-version.h + exit 0 +fi + +mv .build-version.h version.h diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c new file mode 100644 index 00000000000..41ecbb2347f --- /dev/null +++ b/fs/btrfs/volumes.c @@ -0,0 +1,3426 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/random.h> +#include <linux/iocontext.h> +#include <asm/div64.h> +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "async-thread.h" + +struct map_lookup { + u64 type; + int io_align; + int io_width; + int stripe_len; + int sector_size; + int num_stripes; + int sub_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +static int init_first_rw_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device); +static int btrfs_relocate_sys_chunks(struct btrfs_root *root); + +#define map_lookup_size(n) (sizeof(struct map_lookup) + \ + (sizeof(struct btrfs_bio_stripe) * (n))) + +static DEFINE_MUTEX(uuid_mutex); +static LIST_HEAD(fs_uuids); + +void btrfs_lock_volumes(void) +{ + mutex_lock(&uuid_mutex); +} + +void btrfs_unlock_volumes(void) +{ + mutex_unlock(&uuid_mutex); +} + +static void lock_chunks(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->chunk_mutex); +} + +static void unlock_chunks(struct btrfs_root *root) +{ + mutex_unlock(&root->fs_info->chunk_mutex); +} + +static void free_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + WARN_ON(fs_devices->opened); + while (!list_empty(&fs_devices->devices)) { + device = list_entry(fs_devices->devices.next, + struct btrfs_device, dev_list); + list_del(&device->dev_list); + kfree(device->name); + kfree(device); + } + kfree(fs_devices); +} + +int btrfs_cleanup_fs_uuids(void) +{ + struct btrfs_fs_devices *fs_devices; + + while (!list_empty(&fs_uuids)) { + fs_devices = list_entry(fs_uuids.next, + struct btrfs_fs_devices, list); + list_del(&fs_devices->list); + free_fs_devices(fs_devices); + } + return 0; +} + +static noinline struct btrfs_device *__find_device(struct list_head *head, + u64 devid, u8 *uuid) +{ + struct btrfs_device *dev; + + list_for_each_entry(dev, head, dev_list) { + if (dev->devid == devid && + (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { + return dev; + } + } + return NULL; +} + +static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) +{ + struct btrfs_fs_devices *fs_devices; + + list_for_each_entry(fs_devices, &fs_uuids, list) { + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) + return fs_devices; + } + return NULL; +} + +static void requeue_list(struct btrfs_pending_bios *pending_bios, + struct bio *head, struct bio *tail) +{ + + struct bio *old_head; + + old_head = pending_bios->head; + pending_bios->head = head; + if (pending_bios->tail) + tail->bi_next = old_head; + else + pending_bios->tail = tail; +} + +/* + * we try to collect pending bios for a device so we don't get a large + * number of procs sending bios down to the same device. This greatly + * improves the schedulers ability to collect and merge the bios. + * + * But, it also turns into a long list of bios to process and that is sure + * to eventually make the worker thread block. The solution here is to + * make some progress and then put this work struct back at the end of + * the list if the block device is congested. This way, multiple devices + * can make progress from a single worker thread. + */ +static noinline int run_scheduled_bios(struct btrfs_device *device) +{ + struct bio *pending; + struct backing_dev_info *bdi; + struct btrfs_fs_info *fs_info; + struct btrfs_pending_bios *pending_bios; + struct bio *tail; + struct bio *cur; + int again = 0; + unsigned long num_run; + unsigned long num_sync_run; + unsigned long batch_run = 0; + unsigned long limit; + unsigned long last_waited = 0; + int force_reg = 0; + + bdi = blk_get_backing_dev_info(device->bdev); + fs_info = device->dev_root->fs_info; + limit = btrfs_async_submit_limit(fs_info); + limit = limit * 2 / 3; + + /* we want to make sure that every time we switch from the sync + * list to the normal list, we unplug + */ + num_sync_run = 0; + +loop: + spin_lock(&device->io_lock); + +loop_lock: + num_run = 0; + + /* take all the bios off the list at once and process them + * later on (without the lock held). But, remember the + * tail and other pointers so the bios can be properly reinserted + * into the list if we hit congestion + */ + if (!force_reg && device->pending_sync_bios.head) { + pending_bios = &device->pending_sync_bios; + force_reg = 1; + } else { + pending_bios = &device->pending_bios; + force_reg = 0; + } + + pending = pending_bios->head; + tail = pending_bios->tail; + WARN_ON(pending && !tail); + + /* + * if pending was null this time around, no bios need processing + * at all and we can stop. Otherwise it'll loop back up again + * and do an additional check so no bios are missed. + * + * device->running_pending is used to synchronize with the + * schedule_bio code. + */ + if (device->pending_sync_bios.head == NULL && + device->pending_bios.head == NULL) { + again = 0; + device->running_pending = 0; + } else { + again = 1; + device->running_pending = 1; + } + + pending_bios->head = NULL; + pending_bios->tail = NULL; + + spin_unlock(&device->io_lock); + + /* + * if we're doing the regular priority list, make sure we unplug + * for any high prio bios we've sent down + */ + if (pending_bios == &device->pending_bios && num_sync_run > 0) { + num_sync_run = 0; + blk_run_backing_dev(bdi, NULL); + } + + while (pending) { + + rmb(); + /* we want to work on both lists, but do more bios on the + * sync list than the regular list + */ + if ((num_run > 32 && + pending_bios != &device->pending_sync_bios && + device->pending_sync_bios.head) || + (num_run > 64 && pending_bios == &device->pending_sync_bios && + device->pending_bios.head)) { + spin_lock(&device->io_lock); + requeue_list(pending_bios, pending, tail); + goto loop_lock; + } + + cur = pending; + pending = pending->bi_next; + cur->bi_next = NULL; + atomic_dec(&fs_info->nr_async_bios); + + if (atomic_read(&fs_info->nr_async_bios) < limit && + waitqueue_active(&fs_info->async_submit_wait)) + wake_up(&fs_info->async_submit_wait); + + BUG_ON(atomic_read(&cur->bi_cnt) == 0); + submit_bio(cur->bi_rw, cur); + num_run++; + batch_run++; + + if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) + num_sync_run++; + + if (need_resched()) { + if (num_sync_run) { + blk_run_backing_dev(bdi, NULL); + num_sync_run = 0; + } + cond_resched(); + } + + /* + * we made progress, there is more work to do and the bdi + * is now congested. Back off and let other work structs + * run instead + */ + if (pending && bdi_write_congested(bdi) && batch_run > 8 && + fs_info->fs_devices->open_devices > 1) { + struct io_context *ioc; + + ioc = current->io_context; + + /* + * the main goal here is that we don't want to + * block if we're going to be able to submit + * more requests without blocking. + * + * This code does two great things, it pokes into + * the elevator code from a filesystem _and_ + * it makes assumptions about how batching works. + */ + if (ioc && ioc->nr_batch_requests > 0 && + time_before(jiffies, ioc->last_waited + HZ/50UL) && + (last_waited == 0 || + ioc->last_waited == last_waited)) { + /* + * we want to go through our batch of + * requests and stop. So, we copy out + * the ioc->last_waited time and test + * against it before looping + */ + last_waited = ioc->last_waited; + if (need_resched()) { + if (num_sync_run) { + blk_run_backing_dev(bdi, NULL); + num_sync_run = 0; + } + cond_resched(); + } + continue; + } + spin_lock(&device->io_lock); + requeue_list(pending_bios, pending, tail); + device->running_pending = 1; + + spin_unlock(&device->io_lock); + btrfs_requeue_work(&device->work); + goto done; + } + } + + if (num_sync_run) { + num_sync_run = 0; + blk_run_backing_dev(bdi, NULL); + } + + cond_resched(); + if (again) + goto loop; + + spin_lock(&device->io_lock); + if (device->pending_bios.head || device->pending_sync_bios.head) + goto loop_lock; + spin_unlock(&device->io_lock); + + /* + * IO has already been through a long path to get here. Checksumming, + * async helper threads, perhaps compression. We've done a pretty + * good job of collecting a batch of IO and should just unplug + * the device right away. + * + * This will help anyone who is waiting on the IO, they might have + * already unplugged, but managed to do so before the bio they + * cared about found its way down here. + */ + blk_run_backing_dev(bdi, NULL); +done: + return 0; +} + +static void pending_bios_fn(struct btrfs_work *work) +{ + struct btrfs_device *device; + + device = container_of(work, struct btrfs_device, work); + run_scheduled_bios(device); +} + +static noinline int device_list_add(const char *path, + struct btrfs_super_block *disk_super, + u64 devid, struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices; + u64 found_transid = btrfs_super_generation(disk_super); + + fs_devices = find_fsid(disk_super->fsid); + if (!fs_devices) { + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!fs_devices) + return -ENOMEM; + INIT_LIST_HEAD(&fs_devices->devices); + INIT_LIST_HEAD(&fs_devices->alloc_list); + list_add(&fs_devices->list, &fs_uuids); + memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); + fs_devices->latest_devid = devid; + fs_devices->latest_trans = found_transid; + mutex_init(&fs_devices->device_list_mutex); + device = NULL; + } else { + device = __find_device(&fs_devices->devices, devid, + disk_super->dev_item.uuid); + } + if (!device) { + if (fs_devices->opened) + return -EBUSY; + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + /* we can safely leave the fs_devices entry around */ + return -ENOMEM; + } + device->devid = devid; + device->work.func = pending_bios_fn; + memcpy(device->uuid, disk_super->dev_item.uuid, + BTRFS_UUID_SIZE); + device->barriers = 1; + spin_lock_init(&device->io_lock); + device->name = kstrdup(path, GFP_NOFS); + if (!device->name) { + kfree(device); + return -ENOMEM; + } + INIT_LIST_HEAD(&device->dev_alloc_list); + + mutex_lock(&fs_devices->device_list_mutex); + list_add(&device->dev_list, &fs_devices->devices); + mutex_unlock(&fs_devices->device_list_mutex); + + device->fs_devices = fs_devices; + fs_devices->num_devices++; + } + + if (found_transid > fs_devices->latest_trans) { + fs_devices->latest_devid = devid; + fs_devices->latest_trans = found_transid; + } + *fs_devices_ret = fs_devices; + return 0; +} + +static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) +{ + struct btrfs_fs_devices *fs_devices; + struct btrfs_device *device; + struct btrfs_device *orig_dev; + + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!fs_devices) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&fs_devices->devices); + INIT_LIST_HEAD(&fs_devices->alloc_list); + INIT_LIST_HEAD(&fs_devices->list); + mutex_init(&fs_devices->device_list_mutex); + fs_devices->latest_devid = orig->latest_devid; + fs_devices->latest_trans = orig->latest_trans; + memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); + + mutex_lock(&orig->device_list_mutex); + list_for_each_entry(orig_dev, &orig->devices, dev_list) { + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) + goto error; + + device->name = kstrdup(orig_dev->name, GFP_NOFS); + if (!device->name) { + kfree(device); + goto error; + } + + device->devid = orig_dev->devid; + device->work.func = pending_bios_fn; + memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); + device->barriers = 1; + spin_lock_init(&device->io_lock); + INIT_LIST_HEAD(&device->dev_list); + INIT_LIST_HEAD(&device->dev_alloc_list); + + list_add(&device->dev_list, &fs_devices->devices); + device->fs_devices = fs_devices; + fs_devices->num_devices++; + } + mutex_unlock(&orig->device_list_mutex); + return fs_devices; +error: + mutex_unlock(&orig->device_list_mutex); + free_fs_devices(fs_devices); + return ERR_PTR(-ENOMEM); +} + +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device, *next; + + mutex_lock(&uuid_mutex); +again: + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { + if (device->in_fs_metadata) + continue; + + if (device->bdev) { + close_bdev_exclusive(device->bdev, device->mode); + device->bdev = NULL; + fs_devices->open_devices--; + } + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + device->writeable = 0; + fs_devices->rw_devices--; + } + list_del_init(&device->dev_list); + fs_devices->num_devices--; + kfree(device->name); + kfree(device); + } + mutex_unlock(&fs_devices->device_list_mutex); + + if (fs_devices->seed) { + fs_devices = fs_devices->seed; + goto again; + } + + mutex_unlock(&uuid_mutex); + return 0; +} + +static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + + if (--fs_devices->opened > 0) + return 0; + + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->bdev) { + close_bdev_exclusive(device->bdev, device->mode); + fs_devices->open_devices--; + } + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + device->bdev = NULL; + device->writeable = 0; + device->in_fs_metadata = 0; + } + WARN_ON(fs_devices->open_devices); + WARN_ON(fs_devices->rw_devices); + fs_devices->opened = 0; + fs_devices->seeding = 0; + + return 0; +} + +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_fs_devices *seed_devices = NULL; + int ret; + + mutex_lock(&uuid_mutex); + ret = __btrfs_close_devices(fs_devices); + if (!fs_devices->opened) { + seed_devices = fs_devices->seed; + fs_devices->seed = NULL; + } + mutex_unlock(&uuid_mutex); + + while (seed_devices) { + fs_devices = seed_devices; + seed_devices = fs_devices->seed; + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + } + return ret; +} + +static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder) +{ + struct block_device *bdev; + struct list_head *head = &fs_devices->devices; + struct btrfs_device *device; + struct block_device *latest_bdev = NULL; + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + u64 latest_devid = 0; + u64 latest_transid = 0; + u64 devid; + int seeding = 1; + int ret = 0; + + list_for_each_entry(device, head, dev_list) { + if (device->bdev) + continue; + if (!device->name) + continue; + + bdev = open_bdev_exclusive(device->name, flags, holder); + if (IS_ERR(bdev)) { + printk(KERN_INFO "open %s failed\n", device->name); + goto error; + } + set_blocksize(bdev, 4096); + + bh = btrfs_read_dev_super(bdev); + if (!bh) + goto error_close; + + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = le64_to_cpu(disk_super->dev_item.devid); + if (devid != device->devid) + goto error_brelse; + + if (memcmp(device->uuid, disk_super->dev_item.uuid, + BTRFS_UUID_SIZE)) + goto error_brelse; + + device->generation = btrfs_super_generation(disk_super); + if (!latest_transid || device->generation > latest_transid) { + latest_devid = devid; + latest_transid = device->generation; + latest_bdev = bdev; + } + + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { + device->writeable = 0; + } else { + device->writeable = !bdev_read_only(bdev); + seeding = 0; + } + + device->bdev = bdev; + device->in_fs_metadata = 0; + device->mode = flags; + + if (!blk_queue_nonrot(bdev_get_queue(bdev))) + fs_devices->rotating = 1; + + fs_devices->open_devices++; + if (device->writeable) { + fs_devices->rw_devices++; + list_add(&device->dev_alloc_list, + &fs_devices->alloc_list); + } + continue; + +error_brelse: + brelse(bh); +error_close: + close_bdev_exclusive(bdev, FMODE_READ); +error: + continue; + } + if (fs_devices->open_devices == 0) { + ret = -EIO; + goto out; + } + fs_devices->seeding = seeding; + fs_devices->opened = 1; + fs_devices->latest_bdev = latest_bdev; + fs_devices->latest_devid = latest_devid; + fs_devices->latest_trans = latest_transid; + fs_devices->total_rw_bytes = 0; +out: + return ret; +} + +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder) +{ + int ret; + + mutex_lock(&uuid_mutex); + if (fs_devices->opened) { + fs_devices->opened++; + ret = 0; + } else { + ret = __btrfs_open_devices(fs_devices, flags, holder); + } + mutex_unlock(&uuid_mutex); + return ret; +} + +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, + struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_super_block *disk_super; + struct block_device *bdev; + struct buffer_head *bh; + int ret; + u64 devid; + u64 transid; + + mutex_lock(&uuid_mutex); + + bdev = open_bdev_exclusive(path, flags, holder); + + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto error; + } + + ret = set_blocksize(bdev, 4096); + if (ret) + goto error_close; + bh = btrfs_read_dev_super(bdev); + if (!bh) { + ret = -EIO; + goto error_close; + } + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = le64_to_cpu(disk_super->dev_item.devid); + transid = btrfs_super_generation(disk_super); + if (disk_super->label[0]) + printk(KERN_INFO "device label %s ", disk_super->label); + else { + /* FIXME, make a readl uuid parser */ + printk(KERN_INFO "device fsid %llx-%llx ", + *(unsigned long long *)disk_super->fsid, + *(unsigned long long *)(disk_super->fsid + 8)); + } + printk(KERN_CONT "devid %llu transid %llu %s\n", + (unsigned long long)devid, (unsigned long long)transid, path); + ret = device_list_add(path, disk_super, devid, fs_devices_ret); + + brelse(bh); +error_close: + close_bdev_exclusive(bdev, flags); +error: + mutex_unlock(&uuid_mutex); + return ret; +} + +/* + * this uses a pretty simple search, the expectation is that it is + * called very infrequently and that a given device has a small number + * of extents + */ +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *max_avail) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + u64 hole_size = 0; + u64 last_byte = 0; + u64 search_start = 0; + u64 search_end = device->total_bytes; + int ret; + int slot = 0; + int start_found; + struct extent_buffer *l; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + start_found = 0; + + /* FIXME use last free of some kind */ + + /* we don't want to overwrite the superblock on the drive, + * so we make sure to start at an offset of at least 1MB + */ + search_start = max((u64)1024 * 1024, search_start); + + if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) + search_start = max(root->fs_info->alloc_start, search_start); + + key.objectid = device->devid; + key.offset = search_start; + key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + if (ret < 0) + goto error; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto error; + if (ret > 0) + start_found = 1; + } + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; +no_more_items: + if (!start_found) { + if (search_start >= search_end) { + ret = -ENOSPC; + goto error; + } + *start = search_start; + start_found = 1; + goto check_pending; + } + *start = last_byte > search_start ? + last_byte : search_start; + if (search_end <= *start) { + ret = -ENOSPC; + goto error; + } + goto check_pending; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + goto no_more_items; + + if (key.offset >= search_start && key.offset > last_byte && + start_found) { + if (last_byte < search_start) + last_byte = search_start; + hole_size = key.offset - last_byte; + + if (hole_size > *max_avail) + *max_avail = hole_size; + + if (key.offset > last_byte && + hole_size >= num_bytes) { + *start = last_byte; + goto check_pending; + } + } + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; + + start_found = 1; + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); +next: + path->slots[0]++; + cond_resched(); + } +check_pending: + /* we have to make sure we didn't find an extent that has already + * been allocated by the map tree or the original allocation + */ + BUG_ON(*start < search_start); + + if (*start + num_bytes > search_end) { + ret = -ENOSPC; + goto error; + } + /* check for pending inserts here */ + ret = 0; + +error: + btrfs_free_path(path); + return ret; +} + +static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 start) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf = NULL; + struct btrfs_dev_extent *extent = NULL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, + BTRFS_DEV_EXTENT_KEY); + BUG_ON(ret); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + BUG_ON(found_key.offset > start || found_key.offset + + btrfs_dev_extent_length(leaf, extent) < start); + ret = 0; + } else if (ret == 0) { + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + } + BUG_ON(ret); + + if (device->bytes_used > 0) + device->bytes_used -= btrfs_dev_extent_length(leaf, extent); + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + + btrfs_free_path(path); + return ret; +} + +int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *extent; + struct extent_buffer *leaf; + struct btrfs_key key; + + WARN_ON(!device->in_fs_metadata); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*extent)); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); + btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); + btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); + + write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), + BTRFS_UUID_SIZE); + + btrfs_set_dev_extent_length(leaf, extent, num_bytes); + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + return ret; +} + +static noinline int find_next_chunk(struct btrfs_root *root, + u64 objectid, u64 *offset) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct btrfs_chunk *chunk; + struct btrfs_key found_key; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = objectid; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + + BUG_ON(ret == 0); + + ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); + if (ret) { + *offset = 0; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != objectid) + *offset = 0; + else { + chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_chunk); + *offset = found_key.offset + + btrfs_chunk_length(path->nodes[0], chunk); + } + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + + BUG_ON(ret == 0); + + ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, + BTRFS_DEV_ITEM_KEY); + if (ret) { + *objectid = 1; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + *objectid = found_key.offset + 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +/* + * the device information is stored in the chunk root + * the btrfs_device struct should be fully filled in + */ +int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*dev_item)); + if (ret) + goto out; + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_generation(leaf, dev_item, 0); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_set_device_group(leaf, dev_item, 0); + btrfs_set_device_seek_speed(leaf, dev_item, 0); + btrfs_set_device_bandwidth(leaf, dev_item, 0); + btrfs_set_device_start_offset(leaf, dev_item, 0); + + ptr = (unsigned long)btrfs_device_uuid(dev_item); + write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); + ptr = (unsigned long)btrfs_device_fsid(dev_item); + write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_rm_dev_item(struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_trans_handle *trans; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 1); + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + lock_chunks(root); + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; +out: + btrfs_free_path(path); + unlock_chunks(root); + btrfs_commit_transaction(trans, root); + return ret; +} + +int btrfs_rm_device(struct btrfs_root *root, char *device_path) +{ + struct btrfs_device *device; + struct btrfs_device *next_device; + struct block_device *bdev; + struct buffer_head *bh = NULL; + struct btrfs_super_block *disk_super; + u64 all_avail; + u64 devid; + u64 num_devices; + u8 *dev_uuid; + int ret = 0; + + mutex_lock(&uuid_mutex); + mutex_lock(&root->fs_info->volume_mutex); + + all_avail = root->fs_info->avail_data_alloc_bits | + root->fs_info->avail_system_alloc_bits | + root->fs_info->avail_metadata_alloc_bits; + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && + root->fs_info->fs_devices->num_devices <= 4) { + printk(KERN_ERR "btrfs: unable to go below four devices " + "on raid10\n"); + ret = -EINVAL; + goto out; + } + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && + root->fs_info->fs_devices->num_devices <= 2) { + printk(KERN_ERR "btrfs: unable to go below two " + "devices on raid1\n"); + ret = -EINVAL; + goto out; + } + + if (strcmp(device_path, "missing") == 0) { + struct list_head *devices; + struct btrfs_device *tmp; + + device = NULL; + devices = &root->fs_info->fs_devices->devices; + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + list_for_each_entry(tmp, devices, dev_list) { + if (tmp->in_fs_metadata && !tmp->bdev) { + device = tmp; + break; + } + } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + bdev = NULL; + bh = NULL; + disk_super = NULL; + if (!device) { + printk(KERN_ERR "btrfs: no missing devices found to " + "remove\n"); + goto out; + } + } else { + bdev = open_bdev_exclusive(device_path, FMODE_READ, + root->fs_info->bdev_holder); + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto out; + } + + set_blocksize(bdev, 4096); + bh = btrfs_read_dev_super(bdev); + if (!bh) { + ret = -EIO; + goto error_close; + } + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = le64_to_cpu(disk_super->dev_item.devid); + dev_uuid = disk_super->dev_item.uuid; + device = btrfs_find_device(root, devid, dev_uuid, + disk_super->fsid); + if (!device) { + ret = -ENOENT; + goto error_brelse; + } + } + + if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { + printk(KERN_ERR "btrfs: unable to remove the only writeable " + "device\n"); + ret = -EINVAL; + goto error_brelse; + } + + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + root->fs_info->fs_devices->rw_devices--; + } + + ret = btrfs_shrink_device(device, 0); + if (ret) + goto error_brelse; + + ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); + if (ret) + goto error_brelse; + + device->in_fs_metadata = 0; + + /* + * the device list mutex makes sure that we don't change + * the device list while someone else is writing out all + * the device supers. + */ + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + list_del_init(&device->dev_list); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + device->fs_devices->num_devices--; + + next_device = list_entry(root->fs_info->fs_devices->devices.next, + struct btrfs_device, dev_list); + if (device->bdev == root->fs_info->sb->s_bdev) + root->fs_info->sb->s_bdev = next_device->bdev; + if (device->bdev == root->fs_info->fs_devices->latest_bdev) + root->fs_info->fs_devices->latest_bdev = next_device->bdev; + + if (device->bdev) { + close_bdev_exclusive(device->bdev, device->mode); + device->bdev = NULL; + device->fs_devices->open_devices--; + } + + num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); + + if (device->fs_devices->open_devices == 0) { + struct btrfs_fs_devices *fs_devices; + fs_devices = root->fs_info->fs_devices; + while (fs_devices) { + if (fs_devices->seed == device->fs_devices) + break; + fs_devices = fs_devices->seed; + } + fs_devices->seed = device->fs_devices->seed; + device->fs_devices->seed = NULL; + __btrfs_close_devices(device->fs_devices); + free_fs_devices(device->fs_devices); + } + + /* + * at this point, the device is zero sized. We want to + * remove it from the devices list and zero out the old super + */ + if (device->writeable) { + /* make sure this device isn't detected as part of + * the FS anymore + */ + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + } + + kfree(device->name); + kfree(device); + ret = 0; + +error_brelse: + brelse(bh); +error_close: + if (bdev) + close_bdev_exclusive(bdev, FMODE_READ); +out: + mutex_unlock(&root->fs_info->volume_mutex); + mutex_unlock(&uuid_mutex); + return ret; +} + +/* + * does all the dirty work required for changing file system's UUID. + */ +static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_fs_devices *old_devices; + struct btrfs_fs_devices *seed_devices; + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_device *device; + u64 super_flags; + + BUG_ON(!mutex_is_locked(&uuid_mutex)); + if (!fs_devices->seeding) + return -EINVAL; + + seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!seed_devices) + return -ENOMEM; + + old_devices = clone_fs_devices(fs_devices); + if (IS_ERR(old_devices)) { + kfree(seed_devices); + return PTR_ERR(old_devices); + } + + list_add(&old_devices->list, &fs_uuids); + + memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); + seed_devices->opened = 1; + INIT_LIST_HEAD(&seed_devices->devices); + INIT_LIST_HEAD(&seed_devices->alloc_list); + mutex_init(&seed_devices->device_list_mutex); + list_splice_init(&fs_devices->devices, &seed_devices->devices); + list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); + list_for_each_entry(device, &seed_devices->devices, dev_list) { + device->fs_devices = seed_devices; + } + + fs_devices->seeding = 0; + fs_devices->num_devices = 0; + fs_devices->open_devices = 0; + fs_devices->seed = seed_devices; + + generate_random_uuid(fs_devices->fsid); + memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + super_flags = btrfs_super_flags(disk_super) & + ~BTRFS_SUPER_FLAG_SEEDING; + btrfs_set_super_flags(disk_super, super_flags); + + return 0; +} + +/* + * strore the expected generation for seed devices in device items. + */ +static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dev_item *dev_item; + struct btrfs_device *device; + struct btrfs_key key; + u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + u64 devid; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + root = root->fs_info->chunk_root; + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = BTRFS_DEV_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto error; + + leaf = path->nodes[0]; +next_slot: + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret > 0) + break; + if (ret < 0) + goto error; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(root, path); + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || + key.type != BTRFS_DEV_ITEM_KEY) + break; + + dev_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_item); + devid = btrfs_device_id(leaf, dev_item); + read_extent_buffer(leaf, dev_uuid, + (unsigned long)btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, + (unsigned long)btrfs_device_fsid(dev_item), + BTRFS_UUID_SIZE); + device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + BUG_ON(!device); + + if (device->fs_devices->seeding) { + btrfs_set_device_generation(leaf, dev_item, + device->generation); + btrfs_mark_buffer_dirty(leaf); + } + + path->slots[0]++; + goto next_slot; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_init_new_device(struct btrfs_root *root, char *device_path) +{ + struct btrfs_trans_handle *trans; + struct btrfs_device *device; + struct block_device *bdev; + struct list_head *devices; + struct super_block *sb = root->fs_info->sb; + u64 total_bytes; + int seeding_dev = 0; + int ret = 0; + + if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) + return -EINVAL; + + bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + if (root->fs_info->fs_devices->seeding) { + seeding_dev = 1; + down_write(&sb->s_umount); + mutex_lock(&uuid_mutex); + } + + filemap_write_and_wait(bdev->bd_inode->i_mapping); + mutex_lock(&root->fs_info->volume_mutex); + + devices = &root->fs_info->fs_devices->devices; + /* + * we have the volume lock, so we don't need the extra + * device list mutex while reading the list here. + */ + list_for_each_entry(device, devices, dev_list) { + if (device->bdev == bdev) { + ret = -EEXIST; + goto error; + } + } + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + /* we can safely leave the fs_devices entry around */ + ret = -ENOMEM; + goto error; + } + + device->name = kstrdup(device_path, GFP_NOFS); + if (!device->name) { + kfree(device); + ret = -ENOMEM; + goto error; + } + + ret = find_next_devid(root, &device->devid); + if (ret) { + kfree(device); + goto error; + } + + trans = btrfs_start_transaction(root, 1); + lock_chunks(root); + + device->barriers = 1; + device->writeable = 1; + device->work.func = pending_bios_fn; + generate_random_uuid(device->uuid); + spin_lock_init(&device->io_lock); + device->generation = trans->transid; + device->io_width = root->sectorsize; + device->io_align = root->sectorsize; + device->sector_size = root->sectorsize; + device->total_bytes = i_size_read(bdev->bd_inode); + device->disk_total_bytes = device->total_bytes; + device->dev_root = root->fs_info->dev_root; + device->bdev = bdev; + device->in_fs_metadata = 1; + device->mode = 0; + set_blocksize(device->bdev, 4096); + + if (seeding_dev) { + sb->s_flags &= ~MS_RDONLY; + ret = btrfs_prepare_sprout(trans, root); + BUG_ON(ret); + } + + device->fs_devices = root->fs_info->fs_devices; + + /* + * we don't want write_supers to jump in here with our device + * half setup + */ + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + list_add(&device->dev_list, &root->fs_info->fs_devices->devices); + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + root->fs_info->fs_devices->num_devices++; + root->fs_info->fs_devices->open_devices++; + root->fs_info->fs_devices->rw_devices++; + root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + + if (!blk_queue_nonrot(bdev_get_queue(bdev))) + root->fs_info->fs_devices->rotating = 1; + + total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); + btrfs_set_super_total_bytes(&root->fs_info->super_copy, + total_bytes + device->total_bytes); + + total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); + btrfs_set_super_num_devices(&root->fs_info->super_copy, + total_bytes + 1); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + if (seeding_dev) { + ret = init_first_rw_device(trans, root, device); + BUG_ON(ret); + ret = btrfs_finish_sprout(trans, root); + BUG_ON(ret); + } else { + ret = btrfs_add_device(trans, root, device); + } + + /* + * we've got more storage, clear any full flags on the space + * infos + */ + btrfs_clear_space_info_full(root->fs_info); + + unlock_chunks(root); + btrfs_commit_transaction(trans, root); + + if (seeding_dev) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + + ret = btrfs_relocate_sys_chunks(root); + BUG_ON(ret); + } +out: + mutex_unlock(&root->fs_info->volume_mutex); + return ret; +error: + close_bdev_exclusive(bdev, 0); + if (seeding_dev) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + } + goto out; +} + +static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; + struct btrfs_key key; + + root = device->dev_root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_mark_buffer_dirty(leaf); + +out: + btrfs_free_path(path); + return ret; +} + +static int __btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size) +{ + struct btrfs_super_block *super_copy = + &device->dev_root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 diff = new_size - device->total_bytes; + + if (!device->writeable) + return -EACCES; + if (new_size <= device->total_bytes) + return -EINVAL; + + btrfs_set_super_total_bytes(super_copy, old_total + diff); + device->fs_devices->total_rw_bytes += diff; + + device->total_bytes = new_size; + device->disk_total_bytes = new_size; + btrfs_clear_space_info_full(device->dev_root->fs_info); + + return btrfs_update_device(trans, device); +} + +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size) +{ + int ret; + lock_chunks(device->dev_root); + ret = __btrfs_grow_device(trans, device, new_size); + unlock_chunks(device->dev_root); + return ret; +} + +static int btrfs_free_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + root = root->fs_info->chunk_root; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = chunk_objectid; + key.offset = chunk_offset; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + BUG_ON(ret); + + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + + btrfs_free_path(path); + return 0; +} + +static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 + chunk_offset) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *ptr; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur; + struct btrfs_key key; + + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)(ptr + len); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + len += btrfs_chunk_item_size(num_stripes); + } else { + ret = -EIO; + break; + } + if (key.objectid == chunk_objectid && + key.offset == chunk_offset) { + memmove(ptr, ptr + len, array_size - (cur + len)); + array_size -= len; + btrfs_set_super_sys_array_size(super_copy, array_size); + } else { + ptr += len; + cur += len; + } + } + return ret; +} + +static int btrfs_relocate_chunk(struct btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + struct extent_map_tree *em_tree; + struct btrfs_root *extent_root; + struct btrfs_trans_handle *trans; + struct extent_map *em; + struct map_lookup *map; + int ret; + int i; + + root = root->fs_info->chunk_root; + extent_root = root->fs_info->extent_root; + em_tree = &root->fs_info->mapping_tree.map_tree; + + ret = btrfs_can_relocate(extent_root, chunk_offset); + if (ret) + return -ENOSPC; + + /* step one, relocate all the extents inside this chunk */ + ret = btrfs_relocate_block_group(extent_root, chunk_offset); + BUG_ON(ret); + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + lock_chunks(root); + + /* + * step two, delete the device extents and the + * chunk tree entries + */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); + read_unlock(&em_tree->lock); + + BUG_ON(em->start > chunk_offset || + em->start + em->len < chunk_offset); + map = (struct map_lookup *)em->bdev; + + for (i = 0; i < map->num_stripes; i++) { + ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, + map->stripes[i].physical); + BUG_ON(ret); + + if (map->stripes[i].dev) { + ret = btrfs_update_device(trans, map->stripes[i].dev); + BUG_ON(ret); + } + } + ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, + chunk_offset); + + BUG_ON(ret); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); + BUG_ON(ret); + } + + ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); + BUG_ON(ret); + + write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + + kfree(map); + em->bdev = NULL; + + /* once for the tree */ + free_extent_map(em); + /* once for us */ + free_extent_map(em); + + unlock_chunks(root); + btrfs_end_transaction(trans, root); + return 0; +} + +static int btrfs_relocate_sys_chunks(struct btrfs_root *root) +{ + struct btrfs_root *chunk_root = root->fs_info->chunk_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_chunk *chunk; + struct btrfs_key key; + struct btrfs_key found_key; + u64 chunk_tree = chunk_root->root_key.objectid; + u64 chunk_type; + bool retried = false; + int failed = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +again: + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); + + ret = btrfs_previous_item(chunk_root, path, key.objectid, + key.type); + if (ret < 0) + goto error; + if (ret > 0) + break; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + chunk = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_chunk); + chunk_type = btrfs_chunk_type(leaf, chunk); + btrfs_release_path(chunk_root, path); + + if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_relocate_chunk(chunk_root, chunk_tree, + found_key.objectid, + found_key.offset); + if (ret == -ENOSPC) + failed++; + else if (ret) + BUG(); + } + + if (found_key.offset == 0) + break; + key.offset = found_key.offset - 1; + } + ret = 0; + if (failed && !retried) { + failed = 0; + retried = true; + goto again; + } else if (failed && retried) { + WARN_ON(1); + ret = -ENOSPC; + } +error: + btrfs_free_path(path); + return ret; +} + +static u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +int btrfs_balance(struct btrfs_root *dev_root) +{ + int ret; + struct list_head *devices = &dev_root->fs_info->fs_devices->devices; + struct btrfs_device *device; + u64 old_size; + u64 size_to_free; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_chunk *chunk; + struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; + struct btrfs_trans_handle *trans; + struct btrfs_key found_key; + + if (dev_root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + mutex_lock(&dev_root->fs_info->volume_mutex); + dev_root = dev_root->fs_info->dev_root; + + /* step one make some room on all the devices */ + list_for_each_entry(device, devices, dev_list) { + old_size = device->total_bytes; + size_to_free = div_factor(old_size, 1); + size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); + if (!device->writeable || + device->total_bytes - device->bytes_used > size_to_free) + continue; + + ret = btrfs_shrink_device(device, old_size - size_to_free); + if (ret == -ENOSPC) + break; + BUG_ON(ret); + + trans = btrfs_start_transaction(dev_root, 1); + BUG_ON(!trans); + + ret = btrfs_grow_device(trans, device, old_size); + BUG_ON(ret); + + btrfs_end_transaction(trans, dev_root); + } + + /* step two, relocate all the chunks */ + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + + /* + * this shouldn't happen, it means the last relocate + * failed + */ + if (ret == 0) + break; + + ret = btrfs_previous_item(chunk_root, path, 0, + BTRFS_CHUNK_ITEM_KEY); + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != key.objectid) + break; + + chunk = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_chunk); + /* chunk zero is special */ + if (found_key.offset == 0) + break; + + btrfs_release_path(chunk_root, path); + ret = btrfs_relocate_chunk(chunk_root, + chunk_root->root_key.objectid, + found_key.objectid, + found_key.offset); + BUG_ON(ret && ret != -ENOSPC); + key.offset = found_key.offset - 1; + } + ret = 0; +error: + btrfs_free_path(path); + mutex_unlock(&dev_root->fs_info->volume_mutex); + return ret; +} + +/* + * shrinking a device means finding all of the device extents past + * the new size, and then following the back refs to the chunks. + * The chunk relocation code actually frees the device extent + */ +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + u64 length; + u64 chunk_tree; + u64 chunk_objectid; + u64 chunk_offset; + int ret; + int slot; + int failed = 0; + bool retried = false; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 old_size = device->total_bytes; + u64 diff = device->total_bytes - new_size; + + if (new_size >= device->total_bytes) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 2; + + lock_chunks(root); + + device->total_bytes = new_size; + if (device->writeable) + device->fs_devices->total_rw_bytes -= diff; + unlock_chunks(root); + +again: + key.objectid = device->devid; + key.offset = (u64)-1; + key.type = BTRFS_DEV_EXTENT_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto done; + + ret = btrfs_previous_item(root, path, 0, key.type); + if (ret < 0) + goto done; + if (ret) { + ret = 0; + btrfs_release_path(root, path); + break; + } + + l = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + + if (key.objectid != device->devid) { + btrfs_release_path(root, path); + break; + } + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + + if (key.offset + length <= new_size) { + btrfs_release_path(root, path); + break; + } + + chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); + chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); + chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); + btrfs_release_path(root, path); + + ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, + chunk_offset); + if (ret && ret != -ENOSPC) + goto done; + if (ret == -ENOSPC) + failed++; + key.offset -= 1; + } + + if (failed && !retried) { + failed = 0; + retried = true; + goto again; + } else if (failed && retried) { + ret = -ENOSPC; + lock_chunks(root); + + device->total_bytes = old_size; + if (device->writeable) + device->fs_devices->total_rw_bytes += diff; + unlock_chunks(root); + goto done; + } + + /* Shrinking succeeded, else we would be at "done". */ + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto done; + } + lock_chunks(root); + + device->disk_total_bytes = new_size; + /* Now btrfs_update_device() will change the on-disk size. */ + ret = btrfs_update_device(trans, device); + if (ret) { + unlock_chunks(root); + btrfs_end_transaction(trans, root); + goto done; + } + WARN_ON(diff > old_total); + btrfs_set_super_total_bytes(super_copy, old_total - diff); + unlock_chunks(root); + btrfs_end_transaction(trans, root); +done: + btrfs_free_path(path); + return ret; +} + +static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_chunk *chunk, int item_size) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_disk_key disk_key; + u32 array_size; + u8 *ptr; + + array_size = btrfs_super_sys_array_size(super_copy); + if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + return -EFBIG; + + ptr = super_copy->sys_chunk_array + array_size; + btrfs_cpu_key_to_disk(&disk_key, key); + memcpy(ptr, &disk_key, sizeof(disk_key)); + ptr += sizeof(disk_key); + memcpy(ptr, chunk, item_size); + item_size += sizeof(disk_key); + btrfs_set_super_sys_array_size(super_copy, array_size + item_size); + return 0; +} + +static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, + int num_stripes, int sub_stripes) +{ + if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) + return calc_size; + else if (type & BTRFS_BLOCK_GROUP_RAID10) + return calc_size * (num_stripes / sub_stripes); + else + return calc_size * num_stripes; +} + +static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup **map_ret, + u64 *num_bytes, u64 *stripe_size, + u64 start, u64 type) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_device *device = NULL; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct list_head *cur; + struct map_lookup *map = NULL; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct list_head private_devs; + int min_stripe_size = 1 * 1024 * 1024; + u64 calc_size = 1024 * 1024 * 1024; + u64 max_chunk_size = calc_size; + u64 min_free; + u64 avail; + u64 max_avail = 0; + u64 dev_offset; + int num_stripes = 1; + int min_stripes = 1; + int sub_stripes = 0; + int looped = 0; + int ret; + int index; + int stripe_len = 64 * 1024; + + if ((type & BTRFS_BLOCK_GROUP_RAID1) && + (type & BTRFS_BLOCK_GROUP_DUP)) { + WARN_ON(1); + type &= ~BTRFS_BLOCK_GROUP_DUP; + } + if (list_empty(&fs_devices->alloc_list)) + return -ENOSPC; + + if (type & (BTRFS_BLOCK_GROUP_RAID0)) { + num_stripes = fs_devices->rw_devices; + min_stripes = 2; + } + if (type & (BTRFS_BLOCK_GROUP_DUP)) { + num_stripes = 2; + min_stripes = 2; + } + if (type & (BTRFS_BLOCK_GROUP_RAID1)) { + num_stripes = min_t(u64, 2, fs_devices->rw_devices); + if (num_stripes < 2) + return -ENOSPC; + min_stripes = 2; + } + if (type & (BTRFS_BLOCK_GROUP_RAID10)) { + num_stripes = fs_devices->rw_devices; + if (num_stripes < 4) + return -ENOSPC; + num_stripes &= ~(u32)1; + sub_stripes = 2; + min_stripes = 4; + } + + if (type & BTRFS_BLOCK_GROUP_DATA) { + max_chunk_size = 10 * calc_size; + min_stripe_size = 64 * 1024 * 1024; + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { + max_chunk_size = 256 * 1024 * 1024; + min_stripe_size = 32 * 1024 * 1024; + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + calc_size = 8 * 1024 * 1024; + max_chunk_size = calc_size * 2; + min_stripe_size = 1 * 1024 * 1024; + } + + /* we don't want a chunk larger than 10% of writeable space */ + max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), + max_chunk_size); + +again: + max_avail = 0; + if (!map || map->num_stripes != num_stripes) { + kfree(map); + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) + return -ENOMEM; + map->num_stripes = num_stripes; + } + + if (calc_size * num_stripes > max_chunk_size) { + calc_size = max_chunk_size; + do_div(calc_size, num_stripes); + do_div(calc_size, stripe_len); + calc_size *= stripe_len; + } + /* we don't want tiny stripes */ + calc_size = max_t(u64, min_stripe_size, calc_size); + + do_div(calc_size, stripe_len); + calc_size *= stripe_len; + + cur = fs_devices->alloc_list.next; + index = 0; + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_free = calc_size * 2; + else + min_free = calc_size; + + /* + * we add 1MB because we never use the first 1MB of the device, unless + * we've looped, then we are likely allocating the maximum amount of + * space left already + */ + if (!looped) + min_free += 1024 * 1024; + + INIT_LIST_HEAD(&private_devs); + while (index < num_stripes) { + device = list_entry(cur, struct btrfs_device, dev_alloc_list); + BUG_ON(!device->writeable); + if (device->total_bytes > device->bytes_used) + avail = device->total_bytes - device->bytes_used; + else + avail = 0; + cur = cur->next; + + if (device->in_fs_metadata && avail >= min_free) { + ret = find_free_dev_extent(trans, device, + min_free, &dev_offset, + &max_avail); + if (ret == 0) { + list_move_tail(&device->dev_alloc_list, + &private_devs); + map->stripes[index].dev = device; + map->stripes[index].physical = dev_offset; + index++; + if (type & BTRFS_BLOCK_GROUP_DUP) { + map->stripes[index].dev = device; + map->stripes[index].physical = + dev_offset + calc_size; + index++; + } + } + } else if (device->in_fs_metadata && avail > max_avail) + max_avail = avail; + if (cur == &fs_devices->alloc_list) + break; + } + list_splice(&private_devs, &fs_devices->alloc_list); + if (index < num_stripes) { + if (index >= min_stripes) { + num_stripes = index; + if (type & (BTRFS_BLOCK_GROUP_RAID10)) { + num_stripes /= sub_stripes; + num_stripes *= sub_stripes; + } + looped = 1; + goto again; + } + if (!looped && max_avail > 0) { + looped = 1; + calc_size = max_avail; + goto again; + } + kfree(map); + return -ENOSPC; + } + map->sector_size = extent_root->sectorsize; + map->stripe_len = stripe_len; + map->io_align = stripe_len; + map->io_width = stripe_len; + map->type = type; + map->num_stripes = num_stripes; + map->sub_stripes = sub_stripes; + + *map_ret = map; + *stripe_size = calc_size; + *num_bytes = chunk_bytes_by_type(type, calc_size, + num_stripes, sub_stripes); + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + kfree(map); + return -ENOMEM; + } + em->bdev = (struct block_device *)map; + em->start = start; + em->len = *num_bytes; + em->block_start = 0; + em->block_len = em->len; + + em_tree = &extent_root->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + BUG_ON(ret); + free_extent_map(em); + + ret = btrfs_make_block_group(trans, extent_root, 0, type, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, *num_bytes); + BUG_ON(ret); + + index = 0; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + dev_offset = map->stripes[index].physical; + + ret = btrfs_alloc_dev_extent(trans, device, + info->chunk_root->root_key.objectid, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, dev_offset, calc_size); + BUG_ON(ret); + index++; + } + + return 0; +} + +static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup *map, u64 chunk_offset, + u64 chunk_size, u64 stripe_size) +{ + u64 dev_offset; + struct btrfs_key key; + struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; + struct btrfs_device *device; + struct btrfs_chunk *chunk; + struct btrfs_stripe *stripe; + size_t item_size = btrfs_chunk_item_size(map->num_stripes); + int index = 0; + int ret; + + chunk = kzalloc(item_size, GFP_NOFS); + if (!chunk) + return -ENOMEM; + + index = 0; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + device->bytes_used += stripe_size; + ret = btrfs_update_device(trans, device); + BUG_ON(ret); + index++; + } + + index = 0; + stripe = &chunk->stripe; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + dev_offset = map->stripes[index].physical; + + btrfs_set_stack_stripe_devid(stripe, device->devid); + btrfs_set_stack_stripe_offset(stripe, dev_offset); + memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); + stripe++; + index++; + } + + btrfs_set_stack_chunk_length(chunk, chunk_size); + btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); + btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); + btrfs_set_stack_chunk_type(chunk, map->type); + btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); + btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); + btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); + btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); + btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = chunk_offset; + + ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); + BUG_ON(ret); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, + item_size); + BUG_ON(ret); + } + kfree(chunk); + return 0; +} + +/* + * Chunk allocation falls into two parts. The first part does works + * that make the new allocated chunk useable, but not do any operation + * that modifies the chunk tree. The second part does the works that + * require modifying the chunk tree. This division is important for the + * bootstrap process of adding storage to a seed btrfs. + */ +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 type) +{ + u64 chunk_offset; + u64 chunk_size; + u64 stripe_size; + struct map_lookup *map; + struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; + int ret; + + ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, + &chunk_offset); + if (ret) + return ret; + + ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, + &stripe_size, chunk_offset, type); + if (ret) + return ret; + + ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, + chunk_size, stripe_size); + BUG_ON(ret); + return 0; +} + +static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) +{ + u64 chunk_offset; + u64 sys_chunk_offset; + u64 chunk_size; + u64 sys_chunk_size; + u64 stripe_size; + u64 sys_stripe_size; + u64 alloc_profile; + struct map_lookup *map; + struct map_lookup *sys_map; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *extent_root = fs_info->extent_root; + int ret; + + ret = find_next_chunk(fs_info->chunk_root, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); + BUG_ON(ret); + + alloc_profile = BTRFS_BLOCK_GROUP_METADATA | + (fs_info->metadata_alloc_profile & + fs_info->avail_metadata_alloc_bits); + alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); + + ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, + &stripe_size, chunk_offset, alloc_profile); + BUG_ON(ret); + + sys_chunk_offset = chunk_offset + chunk_size; + + alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | + (fs_info->system_alloc_profile & + fs_info->avail_system_alloc_bits); + alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); + + ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, + &sys_chunk_size, &sys_stripe_size, + sys_chunk_offset, alloc_profile); + BUG_ON(ret); + + ret = btrfs_add_device(trans, fs_info->chunk_root, device); + BUG_ON(ret); + + /* + * Modifying chunk tree needs allocating new blocks from both + * system block group and metadata block group. So we only can + * do operations require modifying the chunk tree after both + * block groups were created. + */ + ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, + chunk_size, stripe_size); + BUG_ON(ret); + + ret = __finish_chunk_alloc(trans, extent_root, sys_map, + sys_chunk_offset, sys_chunk_size, + sys_stripe_size); + BUG_ON(ret); + return 0; +} + +int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + int readonly = 0; + int i; + + read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); + read_unlock(&map_tree->map_tree.lock); + if (!em) + return 1; + + if (btrfs_test_opt(root, DEGRADED)) { + free_extent_map(em); + return 0; + } + + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) { + if (!map->stripes[i].dev->writeable) { + readonly = 1; + break; + } + } + free_extent_map(em); + return readonly; +} + +void btrfs_mapping_init(struct btrfs_mapping_tree *tree) +{ + extent_map_tree_init(&tree->map_tree, GFP_NOFS); +} + +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) +{ + struct extent_map *em; + + while (1) { + write_lock(&tree->map_tree.lock); + em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + if (em) + remove_extent_mapping(&tree->map_tree, em); + write_unlock(&tree->map_tree.lock); + if (!em) + break; + kfree(em->bdev); + /* once for us */ + free_extent_map(em); + /* once for the tree */ + free_extent_map(em); + } +} + +int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + read_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) + ret = map->num_stripes; + else if (map->type & BTRFS_BLOCK_GROUP_RAID10) + ret = map->sub_stripes; + else + ret = 1; + free_extent_map(em); + return ret; +} + +static int find_live_mirror(struct map_lookup *map, int first, int num, + int optimal) +{ + int i; + if (map->stripes[optimal].dev->bdev) + return optimal; + for (i = first; i < first + num; i++) { + if (map->stripes[i].dev->bdev) + return i; + } + /* we couldn't find one that doesn't fail. Just return something + * and the io error handling code will clean up eventually + */ + return optimal; +} + +static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, + int mirror_num, struct page *unplug_page) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + u64 offset; + u64 stripe_offset; + u64 stripe_nr; + int stripes_allocated = 8; + int stripes_required = 1; + int stripe_index; + int i; + int num_stripes; + int max_errors = 0; + struct btrfs_multi_bio *multi = NULL; + + if (multi_ret && !(rw & (1 << BIO_RW))) + stripes_allocated = 1; +again: + if (multi_ret) { + multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), + GFP_NOFS); + if (!multi) + return -ENOMEM; + + atomic_set(&multi->error, 0); + } + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, *length); + read_unlock(&em_tree->lock); + + if (!em && unplug_page) { + kfree(multi); + return 0; + } + + if (!em) { + printk(KERN_CRIT "unable to find logical %llu len %llu\n", + (unsigned long long)logical, + (unsigned long long)*length); + BUG(); + } + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + offset = logical - em->start; + + if (mirror_num > map->num_stripes) + mirror_num = 0; + + /* if our multi bio struct is too small, back off and try again */ + if (rw & (1 << BIO_RW)) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP)) { + stripes_required = map->num_stripes; + max_errors = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripes_required = map->sub_stripes; + max_errors = 1; + } + } + if (multi_ret && (rw & (1 << BIO_RW)) && + stripes_allocated < stripes_required) { + stripes_allocated = map->num_stripes; + free_extent_map(em); + kfree(multi); + goto again; + } + stripe_nr = offset; + /* + * stripe_nr counts the total number of stripes we have to stride + * to get to this block + */ + do_div(stripe_nr, map->stripe_len); + + stripe_offset = stripe_nr * map->stripe_len; + BUG_ON(offset < stripe_offset); + + /* stripe_offset is the offset of this block in its stripe*/ + stripe_offset = offset - stripe_offset; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) { + /* we limit the length of each bio to what fits in a stripe */ + *length = min_t(u64, em->len - offset, + map->stripe_len - stripe_offset); + } else { + *length = em->len - offset; + } + + if (!multi_ret && !unplug_page) + goto out; + + num_stripes = 1; + stripe_index = 0; + if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + if (unplug_page || (rw & (1 << BIO_RW))) + num_stripes = map->num_stripes; + else if (mirror_num) + stripe_index = mirror_num - 1; + else { + stripe_index = find_live_mirror(map, 0, + map->num_stripes, + current->pid % map->num_stripes); + } + + } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { + if (rw & (1 << BIO_RW)) + num_stripes = map->num_stripes; + else if (mirror_num) + stripe_index = mirror_num - 1; + + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + int factor = map->num_stripes / map->sub_stripes; + + stripe_index = do_div(stripe_nr, factor); + stripe_index *= map->sub_stripes; + + if (unplug_page || (rw & (1 << BIO_RW))) + num_stripes = map->sub_stripes; + else if (mirror_num) + stripe_index += mirror_num - 1; + else { + stripe_index = find_live_mirror(map, stripe_index, + map->sub_stripes, stripe_index + + current->pid % map->sub_stripes); + } + } else { + /* + * after this do_div call, stripe_nr is the number of stripes + * on this device we have to walk to find the data, and + * stripe_index is the number of our device in the stripe array + */ + stripe_index = do_div(stripe_nr, map->num_stripes); + } + BUG_ON(stripe_index >= map->num_stripes); + + for (i = 0; i < num_stripes; i++) { + if (unplug_page) { + struct btrfs_device *device; + struct backing_dev_info *bdi; + + device = map->stripes[stripe_index].dev; + if (device->bdev) { + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi->unplug_io_fn) + bdi->unplug_io_fn(bdi, unplug_page); + } + } else { + multi->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + multi->stripes[i].dev = map->stripes[stripe_index].dev; + } + stripe_index++; + } + if (multi_ret) { + *multi_ret = multi; + multi->num_stripes = num_stripes; + multi->max_errors = max_errors; + } +out: + free_extent_map(em); + return 0; +} + +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, int mirror_num) +{ + return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, + mirror_num, NULL); +} + +int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 chunk_start, u64 physical, u64 devid, + u64 **logical, int *naddrs, int *stripe_len) +{ + struct extent_map_tree *em_tree = &map_tree->map_tree; + struct extent_map *em; + struct map_lookup *map; + u64 *buf; + u64 bytenr; + u64 length; + u64 stripe_nr; + int i, j, nr = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_start, 1); + read_unlock(&em_tree->lock); + + BUG_ON(!em || em->start != chunk_start); + map = (struct map_lookup *)em->bdev; + + length = em->len; + if (map->type & BTRFS_BLOCK_GROUP_RAID10) + do_div(length, map->num_stripes / map->sub_stripes); + else if (map->type & BTRFS_BLOCK_GROUP_RAID0) + do_div(length, map->num_stripes); + + buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); + BUG_ON(!buf); + + for (i = 0; i < map->num_stripes; i++) { + if (devid && map->stripes[i].dev->devid != devid) + continue; + if (map->stripes[i].physical > physical || + map->stripes[i].physical + length <= physical) + continue; + + stripe_nr = physical - map->stripes[i].physical; + do_div(stripe_nr, map->stripe_len); + + if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripe_nr = stripe_nr * map->num_stripes + i; + do_div(stripe_nr, map->sub_stripes); + } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + stripe_nr = stripe_nr * map->num_stripes + i; + } + bytenr = chunk_start + stripe_nr * map->stripe_len; + WARN_ON(nr >= map->num_stripes); + for (j = 0; j < nr; j++) { + if (buf[j] == bytenr) + break; + } + if (j == nr) { + WARN_ON(nr >= map->num_stripes); + buf[nr++] = bytenr; + } + } + + *logical = buf; + *naddrs = nr; + *stripe_len = map->stripe_len; + + free_extent_map(em); + return 0; +} + +int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, + u64 logical, struct page *page) +{ + u64 length = PAGE_CACHE_SIZE; + return __btrfs_map_block(map_tree, READ, logical, &length, + NULL, 0, page); +} + +static void end_bio_multi_stripe(struct bio *bio, int err) +{ + struct btrfs_multi_bio *multi = bio->bi_private; + int is_orig_bio = 0; + + if (err) + atomic_inc(&multi->error); + + if (bio == multi->orig_bio) + is_orig_bio = 1; + + if (atomic_dec_and_test(&multi->stripes_pending)) { + if (!is_orig_bio) { + bio_put(bio); + bio = multi->orig_bio; + } + bio->bi_private = multi->private; + bio->bi_end_io = multi->end_io; + /* only send an error to the higher layers if it is + * beyond the tolerance of the multi-bio + */ + if (atomic_read(&multi->error) > multi->max_errors) { + err = -EIO; + } else if (err) { + /* + * this bio is actually up to date, we didn't + * go over the max number of errors + */ + set_bit(BIO_UPTODATE, &bio->bi_flags); + err = 0; + } + kfree(multi); + + bio_endio(bio, err); + } else if (!is_orig_bio) { + bio_put(bio); + } +} + +struct async_sched { + struct bio *bio; + int rw; + struct btrfs_fs_info *info; + struct btrfs_work work; +}; + +/* + * see run_scheduled_bios for a description of why bios are collected for + * async submit. + * + * This will add one bio to the pending list for a device and make sure + * the work struct is scheduled. + */ +static noinline int schedule_bio(struct btrfs_root *root, + struct btrfs_device *device, + int rw, struct bio *bio) +{ + int should_queue = 1; + struct btrfs_pending_bios *pending_bios; + + /* don't bother with additional async steps for reads, right now */ + if (!(rw & (1 << BIO_RW))) { + bio_get(bio); + submit_bio(rw, bio); + bio_put(bio); + return 0; + } + + /* + * nr_async_bios allows us to reliably return congestion to the + * higher layers. Otherwise, the async bio makes it appear we have + * made progress against dirty pages when we've really just put it + * on a queue for later + */ + atomic_inc(&root->fs_info->nr_async_bios); + WARN_ON(bio->bi_next); + bio->bi_next = NULL; + bio->bi_rw |= rw; + + spin_lock(&device->io_lock); + if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) + pending_bios = &device->pending_sync_bios; + else + pending_bios = &device->pending_bios; + + if (pending_bios->tail) + pending_bios->tail->bi_next = bio; + + pending_bios->tail = bio; + if (!pending_bios->head) + pending_bios->head = bio; + if (device->running_pending) + should_queue = 0; + + spin_unlock(&device->io_lock); + + if (should_queue) + btrfs_queue_worker(&root->fs_info->submit_workers, + &device->work); + return 0; +} + +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num, int async_submit) +{ + struct btrfs_mapping_tree *map_tree; + struct btrfs_device *dev; + struct bio *first_bio = bio; + u64 logical = (u64)bio->bi_sector << 9; + u64 length = 0; + u64 map_length; + struct btrfs_multi_bio *multi = NULL; + int ret; + int dev_nr = 0; + int total_devs = 1; + + length = bio->bi_size; + map_tree = &root->fs_info->mapping_tree; + map_length = length; + + ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, + mirror_num); + BUG_ON(ret); + + total_devs = multi->num_stripes; + if (map_length < length) { + printk(KERN_CRIT "mapping failed logical %llu bio len %llu " + "len %llu\n", (unsigned long long)logical, + (unsigned long long)length, + (unsigned long long)map_length); + BUG(); + } + multi->end_io = first_bio->bi_end_io; + multi->private = first_bio->bi_private; + multi->orig_bio = first_bio; + atomic_set(&multi->stripes_pending, multi->num_stripes); + + while (dev_nr < total_devs) { + if (total_devs > 1) { + if (dev_nr < total_devs - 1) { + bio = bio_clone(first_bio, GFP_NOFS); + BUG_ON(!bio); + } else { + bio = first_bio; + } + bio->bi_private = multi; + bio->bi_end_io = end_bio_multi_stripe; + } + bio->bi_sector = multi->stripes[dev_nr].physical >> 9; + dev = multi->stripes[dev_nr].dev; + BUG_ON(rw == WRITE && !dev->writeable); + if (dev && dev->bdev) { + bio->bi_bdev = dev->bdev; + if (async_submit) + schedule_bio(root, dev, rw, bio); + else + submit_bio(rw, bio); + } else { + bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; + bio->bi_sector = logical >> 9; + bio_endio(bio, -EIO); + } + dev_nr++; + } + if (total_devs == 1) + kfree(multi); + return 0; +} + +struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, + u8 *uuid, u8 *fsid) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *cur_devices; + + cur_devices = root->fs_info->fs_devices; + while (cur_devices) { + if (!fsid || + !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { + device = __find_device(&cur_devices->devices, + devid, uuid); + if (device) + return device; + } + cur_devices = cur_devices->seed; + } + return NULL; +} + +static struct btrfs_device *add_missing_dev(struct btrfs_root *root, + u64 devid, u8 *dev_uuid) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) + return NULL; + list_add(&device->dev_list, + &fs_devices->devices); + device->barriers = 1; + device->dev_root = root->fs_info->dev_root; + device->devid = devid; + device->work.func = pending_bios_fn; + device->fs_devices = fs_devices; + fs_devices->num_devices++; + spin_lock_init(&device->io_lock); + INIT_LIST_HEAD(&device->dev_alloc_list); + memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); + return device; +} + +static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + u64 logical; + u64 length; + u64 devid; + u8 uuid[BTRFS_UUID_SIZE]; + int num_stripes; + int ret; + int i; + + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + + read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); + read_unlock(&map_tree->map_tree.lock); + + /* already mapped? */ + if (em && em->start <= logical && em->start + em->len > logical) { + free_extent_map(em); + return 0; + } else if (em) { + free_extent_map(em); + } + + em = alloc_extent_map(GFP_NOFS); + if (!em) + return -ENOMEM; + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + free_extent_map(em); + return -ENOMEM; + } + + em->bdev = (struct block_device *)map; + em->start = logical; + em->len = length; + em->block_start = 0; + em->block_len = em->len; + + map->num_stripes = num_stripes; + map->io_width = btrfs_chunk_io_width(leaf, chunk); + map->io_align = btrfs_chunk_io_align(leaf, chunk); + map->sector_size = btrfs_chunk_sector_size(leaf, chunk); + map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + map->type = btrfs_chunk_type(leaf, chunk); + map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + for (i = 0; i < num_stripes; i++) { + map->stripes[i].physical = + btrfs_stripe_offset_nr(leaf, chunk, i); + devid = btrfs_stripe_devid_nr(leaf, chunk, i); + read_extent_buffer(leaf, uuid, (unsigned long) + btrfs_stripe_dev_uuid_nr(chunk, i), + BTRFS_UUID_SIZE); + map->stripes[i].dev = btrfs_find_device(root, devid, uuid, + NULL); + if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { + kfree(map); + free_extent_map(em); + return -EIO; + } + if (!map->stripes[i].dev) { + map->stripes[i].dev = + add_missing_dev(root, devid, uuid); + if (!map->stripes[i].dev) { + kfree(map); + free_extent_map(em); + return -EIO; + } + } + map->stripes[i].dev->in_fs_metadata = 1; + } + + write_lock(&map_tree->map_tree.lock); + ret = add_extent_mapping(&map_tree->map_tree, em); + write_unlock(&map_tree->map_tree.lock); + BUG_ON(ret); + free_extent_map(em); + + return 0; +} + +static int fill_device_from_item(struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item, + struct btrfs_device *device) +{ + unsigned long ptr; + + device->devid = btrfs_device_id(leaf, dev_item); + device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->total_bytes = device->disk_total_bytes; + device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); + device->type = btrfs_device_type(leaf, dev_item); + device->io_align = btrfs_device_io_align(leaf, dev_item); + device->io_width = btrfs_device_io_width(leaf, dev_item); + device->sector_size = btrfs_device_sector_size(leaf, dev_item); + + ptr = (unsigned long)btrfs_device_uuid(dev_item); + read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); + + return 0; +} + +static int open_seed_devices(struct btrfs_root *root, u8 *fsid) +{ + struct btrfs_fs_devices *fs_devices; + int ret; + + mutex_lock(&uuid_mutex); + + fs_devices = root->fs_info->fs_devices->seed; + while (fs_devices) { + if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { + ret = 0; + goto out; + } + fs_devices = fs_devices->seed; + } + + fs_devices = find_fsid(fsid); + if (!fs_devices) { + ret = -ENOENT; + goto out; + } + + fs_devices = clone_fs_devices(fs_devices); + if (IS_ERR(fs_devices)) { + ret = PTR_ERR(fs_devices); + goto out; + } + + ret = __btrfs_open_devices(fs_devices, FMODE_READ, + root->fs_info->bdev_holder); + if (ret) + goto out; + + if (!fs_devices->seeding) { + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + ret = -EINVAL; + goto out; + } + + fs_devices->seed = root->fs_info->fs_devices->seed; + root->fs_info->fs_devices->seed = fs_devices; +out: + mutex_unlock(&uuid_mutex); + return ret; +} + +static int read_one_dev(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item) +{ + struct btrfs_device *device; + u64 devid; + int ret; + u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + + devid = btrfs_device_id(leaf, dev_item); + read_extent_buffer(leaf, dev_uuid, + (unsigned long)btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, + (unsigned long)btrfs_device_fsid(dev_item), + BTRFS_UUID_SIZE); + + if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { + ret = open_seed_devices(root, fs_uuid); + if (ret && !btrfs_test_opt(root, DEGRADED)) + return ret; + } + + device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + if (!device || !device->bdev) { + if (!btrfs_test_opt(root, DEGRADED)) + return -EIO; + + if (!device) { + printk(KERN_WARNING "warning devid %llu missing\n", + (unsigned long long)devid); + device = add_missing_dev(root, devid, dev_uuid); + if (!device) + return -ENOMEM; + } + } + + if (device->fs_devices != root->fs_info->fs_devices) { + BUG_ON(device->writeable); + if (device->generation != + btrfs_device_generation(leaf, dev_item)) + return -EINVAL; + } + + fill_device_from_item(leaf, dev_item, device); + device->dev_root = root->fs_info->dev_root; + device->in_fs_metadata = 1; + if (device->writeable) + device->fs_devices->total_rw_bytes += device->total_bytes; + ret = 0; + return ret; +} + +int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf) +{ + struct btrfs_dev_item *dev_item; + + dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, + dev_item); + return read_one_dev(root, buf, dev_item); +} + +int btrfs_read_sys_array(struct btrfs_root *root) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct extent_buffer *sb; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *ptr; + unsigned long sb_ptr; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur; + struct btrfs_key key; + + sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, + BTRFS_SUPER_INFO_SIZE); + if (!sb) + return -ENOMEM; + btrfs_set_buffer_uptodate(sb); + btrfs_set_buffer_lockdep_class(sb, 0); + + write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; + sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); ptr += len; + sb_ptr += len; + cur += len; + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)sb_ptr; + ret = read_one_chunk(root, &key, sb, chunk); + if (ret) + break; + num_stripes = btrfs_chunk_num_stripes(sb, chunk); + len = btrfs_chunk_item_size(num_stripes); + } else { + ret = -EIO; + break; + } + ptr += len; + sb_ptr += len; + cur += len; + } + free_extent_buffer(sb); + return ret; +} + +int btrfs_read_chunk_tree(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + int slot; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* first we search for all of the device items, and then we + * read in all of the chunk items. This way we can create chunk + * mappings that reference all of the devices that are afound + */ + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = 0; +again: + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { + if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) + break; + if (found_key.type == BTRFS_DEV_ITEM_KEY) { + struct btrfs_dev_item *dev_item; + dev_item = btrfs_item_ptr(leaf, slot, + struct btrfs_dev_item); + ret = read_one_dev(root, leaf, dev_item); + if (ret) + goto error; + } + } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { + struct btrfs_chunk *chunk; + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + ret = read_one_chunk(root, &found_key, leaf, chunk); + if (ret) + goto error; + } + path->slots[0]++; + } + if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { + key.objectid = 0; + btrfs_release_path(root, path); + goto again; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h new file mode 100644 index 00000000000..31b0fabdd2e --- /dev/null +++ b/fs/btrfs/volumes.h @@ -0,0 +1,187 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_VOLUMES_ +#define __BTRFS_VOLUMES_ + +#include <linux/bio.h> +#include "async-thread.h" + +struct buffer_head; +struct btrfs_pending_bios { + struct bio *head; + struct bio *tail; +}; + +struct btrfs_device { + struct list_head dev_list; + struct list_head dev_alloc_list; + struct btrfs_fs_devices *fs_devices; + struct btrfs_root *dev_root; + + /* regular prio bios */ + struct btrfs_pending_bios pending_bios; + /* WRITE_SYNC bios */ + struct btrfs_pending_bios pending_sync_bios; + + int running_pending; + u64 generation; + + int barriers; + int writeable; + int in_fs_metadata; + + spinlock_t io_lock; + + struct block_device *bdev; + + /* the mode sent to open_bdev_exclusive */ + fmode_t mode; + + char *name; + + /* the internal btrfs device id */ + u64 devid; + + /* size of the device */ + u64 total_bytes; + + /* size of the disk */ + u64 disk_total_bytes; + + /* bytes used */ + u64 bytes_used; + + /* optimal io alignment for this device */ + u32 io_align; + + /* optimal io width for this device */ + u32 io_width; + + /* minimal io size for this device */ + u32 sector_size; + + /* type and info about this device */ + u64 type; + + /* physical drive uuid (or lvm uuid) */ + u8 uuid[BTRFS_UUID_SIZE]; + + struct btrfs_work work; +}; + +struct btrfs_fs_devices { + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + + /* the device with this id has the most recent copy of the super */ + u64 latest_devid; + u64 latest_trans; + u64 num_devices; + u64 open_devices; + u64 rw_devices; + u64 total_rw_bytes; + struct block_device *latest_bdev; + + /* all of the devices in the FS, protected by a mutex + * so we can safely walk it to write out the supers without + * worrying about add/remove by the multi-device code + */ + struct mutex device_list_mutex; + struct list_head devices; + + /* devices not currently being allocated */ + struct list_head alloc_list; + struct list_head list; + + struct btrfs_fs_devices *seed; + int seeding; + + int opened; + + /* set when we find or add a device that doesn't have the + * nonrot flag set + */ + int rotating; +}; + +struct btrfs_bio_stripe { + struct btrfs_device *dev; + u64 physical; +}; + +struct btrfs_multi_bio { + atomic_t stripes_pending; + bio_end_io_t *end_io; + struct bio *orig_bio; + void *private; + atomic_t error; + int max_errors; + int num_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ + (sizeof(struct btrfs_bio_stripe) * (n))) + +int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes); +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, int mirror_num); +int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 chunk_start, u64 physical, u64 devid, + u64 **logical, int *naddrs, int *stripe_len); +int btrfs_read_sys_array(struct btrfs_root *root); +int btrfs_read_chunk_tree(struct btrfs_root *root); +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 type); +void btrfs_mapping_init(struct btrfs_mapping_tree *tree); +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num, int async_submit); +int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf); +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder); +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, + struct btrfs_fs_devices **fs_devices_ret); +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device); +int btrfs_rm_device(struct btrfs_root *root, char *device_path); +int btrfs_cleanup_fs_uuids(void); +int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); +int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, + u64 logical, struct page *page); +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size); +struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, + u8 *uuid, u8 *fsid); +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); +int btrfs_init_new_device(struct btrfs_root *root, char *path); +int btrfs_balance(struct btrfs_root *dev_root); +void btrfs_unlock_volumes(void); +void btrfs_lock_volumes(void); +int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *max_avail); +#endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c new file mode 100644 index 00000000000..193b58f7d3f --- /dev/null +++ b/fs/btrfs/xattr.c @@ -0,0 +1,392 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/rwsem.h> +#include <linux/xattr.h> +#include <linux/security.h> +#include "ctree.h" +#include "btrfs_inode.h" +#include "transaction.h" +#include "xattr.h" +#include "disk-io.h" + + +ssize_t __btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct btrfs_dir_item *di; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret = 0; + unsigned long data_ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* lookup the xattr by name */ + di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, + strlen(name), 0); + if (!di) { + ret = -ENODATA; + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + leaf = path->nodes[0]; + /* if size is 0, that means we want the size of the attr */ + if (!size) { + ret = btrfs_dir_data_len(leaf, di); + goto out; + } + + /* now get the data out of our dir_item */ + if (btrfs_dir_data_len(leaf, di) > size) { + ret = -ERANGE; + goto out; + } + + /* + * The way things are packed into the leaf is like this + * |struct btrfs_dir_item|name|data| + * where name is the xattr name, so security.foo, and data is the + * content of the xattr. data_ptr points to the location in memory + * where the data starts in the in memory leaf + */ + data_ptr = (unsigned long)((char *)(di + 1) + + btrfs_dir_name_len(leaf, di)); + read_extent_buffer(leaf, buffer, data_ptr, + btrfs_dir_data_len(leaf, di)); + ret = btrfs_dir_data_len(leaf, di); + +out: + btrfs_free_path(path); + return ret; +} + +static int do_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct btrfs_dir_item *di; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + size_t name_len = strlen(name); + int ret = 0; + + if (name_len + size > BTRFS_MAX_XATTR_SIZE(root)) + return -ENOSPC; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* first lets see if we already have this xattr */ + di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name, + strlen(name), -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + /* ok we already have this xattr, lets remove it */ + if (di) { + /* if we want create only exit */ + if (flags & XATTR_CREATE) { + ret = -EEXIST; + goto out; + } + + ret = btrfs_delete_one_dir_name(trans, root, path, di); + BUG_ON(ret); + btrfs_release_path(root, path); + + /* if we don't have a value then we are removing the xattr */ + if (!value) + goto out; + } else { + btrfs_release_path(root, path); + + if (flags & XATTR_REPLACE) { + /* we couldn't find the attr to replace */ + ret = -ENODATA; + goto out; + } + } + + /* ok we have to create a completely new xattr */ + ret = btrfs_insert_xattr_item(trans, root, path, inode->i_ino, + name, name_len, value, size); + BUG_ON(ret); +out: + btrfs_free_path(path); + return ret; +} + +int __btrfs_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (trans) + return do_setxattr(trans, inode, name, value, size, flags); + + ret = btrfs_reserve_metadata_space(root, 2); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto out; + } + btrfs_set_trans_block_group(trans, inode); + + ret = do_setxattr(trans, inode, name, value, size, flags); + if (ret) + goto out; + + inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); +out: + btrfs_end_transaction_throttle(trans, root); + btrfs_unreserve_metadata_space(root, 2); + return ret; +} + +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct btrfs_key key, found_key; + struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct btrfs_item *item; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + int ret = 0, slot, advance; + size_t total_size = 0, size_left = size; + unsigned long name_ptr; + size_t name_len; + u32 nritems; + + /* + * ok we want all objects associated with this id. + * NOTE: we set key.offset = 0; because we want to start with the + * first xattr that we find and walk forward + */ + key.objectid = inode->i_ino; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + /* search for our xattrs */ + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + advance = 0; + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + + /* this is where we start walking through the path */ + if (advance || slot >= nritems) { + /* + * if we've reached the last slot in this leaf we need + * to go to the next leaf and reset everything + */ + if (slot >= nritems-1) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + } else { + /* + * just walking through the slots on this leaf + */ + slot++; + path->slots[0]++; + } + } + advance = 1; + + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* check to make sure this item is what we want */ + if (found_key.objectid != key.objectid) + break; + if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY) + break; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + + name_len = btrfs_dir_name_len(leaf, di); + total_size += name_len + 1; + + /* we are just looking for how big our buffer needs to be */ + if (!size) + continue; + + if (!buffer || (name_len + 1) > size_left) { + ret = -ERANGE; + goto err; + } + + name_ptr = (unsigned long)(di + 1); + read_extent_buffer(leaf, buffer, name_ptr, name_len); + buffer[name_len] = '\0'; + + size_left -= name_len + 1; + buffer += name_len + 1; + } + ret = total_size; + +err: + btrfs_free_path(path); + + return ret; +} + +/* + * List of handlers for synthetic system.* attributes. All real ondisk + * attributes are handled directly. + */ +struct xattr_handler *btrfs_xattr_handlers[] = { +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + &btrfs_xattr_acl_access_handler, + &btrfs_xattr_acl_default_handler, +#endif + NULL, +}; + +/* + * Check if the attribute is in a supported namespace. + * + * This applied after the check for the synthetic attributes in the system + * namespace. + */ +static bool btrfs_is_valid_xattr(const char *name) +{ + return !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || + !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || + !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); +} + +ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, buffer, size); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + return __btrfs_getxattr(dentry->d_inode, name, buffer, size); +} + +int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + + if (size == 0) + value = ""; /* empty EA, do not remove */ + + return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size, + flags); +} + +int btrfs_removexattr(struct dentry *dentry, const char *name) +{ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + + return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, + XATTR_REPLACE); +} + +int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) +{ + int err; + size_t len; + void *value; + char *suffix; + char *name; + + err = security_inode_init_security(inode, dir, &suffix, &value, &len); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1, + GFP_NOFS); + if (!name) { + err = -ENOMEM; + } else { + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); + err = __btrfs_setxattr(trans, inode, name, value, len, 0); + kfree(name); + } + + kfree(suffix); + kfree(value); + return err; +} diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h new file mode 100644 index 00000000000..721efa0346e --- /dev/null +++ b/fs/btrfs/xattr.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __XATTR__ +#define __XATTR__ + +#include <linux/xattr.h> + +extern struct xattr_handler btrfs_xattr_acl_access_handler; +extern struct xattr_handler btrfs_xattr_acl_default_handler; +extern struct xattr_handler *btrfs_xattr_handlers[]; + +extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size); +extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags); +extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size); +extern int btrfs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +extern int btrfs_removexattr(struct dentry *dentry, const char *name); + +extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir); + +#endif /* __XATTR__ */ diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c new file mode 100644 index 00000000000..3e2b90eaa23 --- /dev/null +++ b/fs/btrfs/zlib.c @@ -0,0 +1,632 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on jffs2 zlib code: + * Copyright © 2001-2007 Red Hat, Inc. + * Created by David Woodhouse <dwmw2@infradead.org> + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/zlib.h> +#include <linux/zutil.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/bio.h> +#include "compression.h" + +/* Plan: call deflate() with avail_in == *sourcelen, + avail_out = *dstlen - 12 and flush == Z_FINISH. + If it doesn't manage to finish, call it again with + avail_in == 0 and avail_out set to the remaining 12 + bytes for it to clean up. + Q: Is 12 bytes sufficient? +*/ +#define STREAM_END_SPACE 12 + +struct workspace { + z_stream inf_strm; + z_stream def_strm; + char *buf; + struct list_head list; +}; + +static LIST_HEAD(idle_workspace); +static DEFINE_SPINLOCK(workspace_lock); +static unsigned long num_workspace; +static atomic_t alloc_workspace = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(workspace_wait); + +/* + * this finds an available zlib workspace or allocates a new one + * NULL or an ERR_PTR is returned if things go bad. + */ +static struct workspace *find_zlib_workspace(void) +{ + struct workspace *workspace; + int ret; + int cpus = num_online_cpus(); + +again: + spin_lock(&workspace_lock); + if (!list_empty(&idle_workspace)) { + workspace = list_entry(idle_workspace.next, struct workspace, + list); + list_del(&workspace->list); + num_workspace--; + spin_unlock(&workspace_lock); + return workspace; + + } + spin_unlock(&workspace_lock); + if (atomic_read(&alloc_workspace) > cpus) { + DEFINE_WAIT(wait); + prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(&alloc_workspace) > cpus) + schedule(); + finish_wait(&workspace_wait, &wait); + goto again; + } + atomic_inc(&alloc_workspace); + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) { + ret = -ENOMEM; + goto fail; + } + + workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); + if (!workspace->def_strm.workspace) { + ret = -ENOMEM; + goto fail; + } + workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); + if (!workspace->inf_strm.workspace) { + ret = -ENOMEM; + goto fail_inflate; + } + workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!workspace->buf) { + ret = -ENOMEM; + goto fail_kmalloc; + } + return workspace; + +fail_kmalloc: + vfree(workspace->inf_strm.workspace); +fail_inflate: + vfree(workspace->def_strm.workspace); +fail: + kfree(workspace); + atomic_dec(&alloc_workspace); + wake_up(&workspace_wait); + return ERR_PTR(ret); +} + +/* + * put a workspace struct back on the list or free it if we have enough + * idle ones sitting around + */ +static int free_workspace(struct workspace *workspace) +{ + spin_lock(&workspace_lock); + if (num_workspace < num_online_cpus()) { + list_add_tail(&workspace->list, &idle_workspace); + num_workspace++; + spin_unlock(&workspace_lock); + if (waitqueue_active(&workspace_wait)) + wake_up(&workspace_wait); + return 0; + } + spin_unlock(&workspace_lock); + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); + + atomic_dec(&alloc_workspace); + if (waitqueue_active(&workspace_wait)) + wake_up(&workspace_wait); + return 0; +} + +/* + * cleanup function for module exit + */ +static void free_workspaces(void) +{ + struct workspace *workspace; + while (!list_empty(&idle_workspace)) { + workspace = list_entry(idle_workspace.next, struct workspace, + list); + list_del(&workspace->list); + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); + atomic_dec(&alloc_workspace); + } +} + +/* + * given an address space and start/len, compress the bytes. + * + * pages are allocated to hold the compressed result and stored + * in 'pages' + * + * out_pages is used to return the number of pages allocated. There + * may be pages allocated even if we return an error + * + * total_in is used to return the number of bytes actually read. It + * may be smaller then len if we had to exit early because we + * ran out of room in the pages array or because we cross the + * max_out threshold. + * + * total_out is used to return the total number of compressed bytes + * + * max_out tells us the max number of bytes that we're allowed to + * stuff into pages + */ +int btrfs_zlib_compress_pages(struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + int ret; + struct workspace *workspace; + char *data_in; + char *cpage_out; + int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + int out_written = 0; + int in_read = 0; + unsigned long bytes_left; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + workspace = find_zlib_workspace(); + if (IS_ERR(workspace)) + return -1; + + if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { + printk(KERN_WARNING "deflateInit failed\n"); + ret = -1; + goto out; + } + + workspace->def_strm.total_in = 0; + workspace->def_strm.total_out = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + cpage_out = kmap(out_page); + pages[0] = out_page; + nr_pages = 1; + + workspace->def_strm.next_in = data_in; + workspace->def_strm.next_out = cpage_out; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); + + out_written = 0; + in_read = 0; + + while (workspace->def_strm.total_in < len) { + ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); + if (ret != Z_OK) { + printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + ret); + zlib_deflateEnd(&workspace->def_strm); + ret = -1; + goto out; + } + + /* we're making it bigger, give up */ + if (workspace->def_strm.total_in > 8192 && + workspace->def_strm.total_in < + workspace->def_strm.total_out) { + ret = -1; + goto out; + } + /* we need another page for writing out. Test this + * before the total_in so we will pull in a new page for + * the stream end if required + */ + if (workspace->def_strm.avail_out == 0) { + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -1; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + cpage_out = kmap(out_page); + pages[nr_pages] = out_page; + nr_pages++; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.next_out = cpage_out; + } + /* we're all done */ + if (workspace->def_strm.total_in >= len) + break; + + /* we've read in a full page, get a new one */ + if (workspace->def_strm.avail_in == 0) { + if (workspace->def_strm.total_out > max_out) + break; + + bytes_left = len - workspace->def_strm.total_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, + start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + workspace->def_strm.avail_in = min(bytes_left, + PAGE_CACHE_SIZE); + workspace->def_strm.next_in = data_in; + } + } + workspace->def_strm.avail_in = 0; + ret = zlib_deflate(&workspace->def_strm, Z_FINISH); + zlib_deflateEnd(&workspace->def_strm); + + if (ret != Z_STREAM_END) { + ret = -1; + goto out; + } + + if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { + ret = -1; + goto out; + } + + ret = 0; + *total_out = workspace->def_strm.total_out; + *total_in = workspace->def_strm.total_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + free_workspace(workspace); + return ret; +} + +/* + * pages_in is an array of pages with compressed data. + * + * disk_start is the starting logical offset of this array in the file + * + * bvec is a bio_vec of pages from the file that we want to decompress into + * + * vcnt is the count of pages in the biovec + * + * srclen is the number of bytes in pages_in + * + * The basic idea is that we have a bio that was created by readpages. + * The pages in the bio are for the uncompressed data, and they may not + * be contiguous. They all correspond to the range of bytes covered by + * the compressed extent. + */ +int btrfs_zlib_decompress_biovec(struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + int ret = 0; + int wbits = MAX_WBITS; + struct workspace *workspace; + char *data_in; + size_t total_out = 0; + unsigned long page_bytes_left; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + struct page *page_out; + unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + unsigned long working_bytes; + unsigned long pg_offset; + unsigned long start_byte; + unsigned long current_buf_start; + char *kaddr; + + workspace = find_zlib_workspace(); + if (IS_ERR(workspace)) + return -ENOMEM; + + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.total_out = 0; + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + page_out = bvec[page_out_index].bv_page; + page_bytes_left = PAGE_CACHE_SIZE; + pg_offset = 0; + + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. */ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + ret = -1; + goto out; + } + while (workspace->inf_strm.total_in < srclen) { + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + /* + * buf start is the byte offset we're of the start of + * our workspace buffer + */ + buf_start = total_out; + + /* total_out is the last byte of the workspace buffer */ + total_out = workspace->inf_strm.total_out; + + working_bytes = total_out - buf_start; + + /* + * start byte is the first byte of the page we're currently + * copying into relative to the start of the compressed data. + */ + start_byte = page_offset(page_out) - disk_start; + + if (working_bytes == 0) { + /* we didn't make progress in this inflate + * call, we're done + */ + if (ret != Z_STREAM_END) + ret = -1; + break; + } + + /* we haven't yet hit data corresponding to this page */ + if (total_out <= start_byte) + goto next; + + /* + * the start of the data we care about is offset into + * the middle of our working buffer + */ + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes -= buf_offset; + } else { + buf_offset = 0; + } + current_buf_start = buf_start; + + /* copy bytes from the working buffer into the pages */ + while (working_bytes > 0) { + bytes = min(PAGE_CACHE_SIZE - pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, working_bytes); + kaddr = kmap_atomic(page_out, KM_USER0); + memcpy(kaddr + pg_offset, workspace->buf + buf_offset, + bytes); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page_out); + + pg_offset += bytes; + page_bytes_left -= bytes; + buf_offset += bytes; + working_bytes -= bytes; + current_buf_start += bytes; + + /* check if we need to pick another page */ + if (page_bytes_left == 0) { + page_out_index++; + if (page_out_index >= vcnt) { + ret = 0; + goto done; + } + + page_out = bvec[page_out_index].bv_page; + pg_offset = 0; + page_bytes_left = PAGE_CACHE_SIZE; + start_byte = page_offset(page_out) - disk_start; + + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + goto next; + + /* the next page in the biovec might not + * be adjacent to the last page, but it + * might still be found inside this working + * buffer. bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + + buf_offset; + } + } + } +next: + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + + if (workspace->inf_strm.avail_in == 0) { + unsigned long tmp; + kunmap(pages_in[page_in_index]); + page_in_index++; + if (page_in_index >= total_pages_in) { + data_in = NULL; + break; + } + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + tmp = srclen - workspace->inf_strm.total_in; + workspace->inf_strm.avail_in = min(tmp, + PAGE_CACHE_SIZE); + } + } + if (ret != Z_STREAM_END) + ret = -1; + else + ret = 0; +done: + zlib_inflateEnd(&workspace->inf_strm); + if (data_in) + kunmap(pages_in[page_in_index]); +out: + free_workspace(workspace); + return ret; +} + +/* + * a less complex decompression routine. Our compressed data fits in a + * single page, and we want to read a single page out of it. + * start_byte tells us the offset into the compressed data we're interested in + */ +int btrfs_zlib_decompress(unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + int ret = 0; + int wbits = MAX_WBITS; + struct workspace *workspace; + unsigned long bytes_left = destlen; + unsigned long total_out = 0; + char *kaddr; + + if (destlen > PAGE_CACHE_SIZE) + return -ENOMEM; + + workspace = find_zlib_workspace(); + if (IS_ERR(workspace)) + return -ENOMEM; + + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = srclen; + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + workspace->inf_strm.total_out = 0; + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. */ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + ret = -1; + goto out; + } + + while (bytes_left > 0) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + unsigned long pg_offset = 0; + + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + + buf_start = total_out; + total_out = workspace->inf_strm.total_out; + + if (total_out == buf_start) { + ret = -1; + break; + } + + if (total_out <= start_byte) + goto next; + + if (total_out > start_byte && buf_start < start_byte) + buf_offset = start_byte - buf_start; + else + buf_offset = 0; + + bytes = min(PAGE_CACHE_SIZE - pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, bytes_left); + + kaddr = kmap_atomic(dest_page, KM_USER0); + memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes); + kunmap_atomic(kaddr, KM_USER0); + + pg_offset += bytes; + bytes_left -= bytes; +next: + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + } + + if (ret != Z_STREAM_END && bytes_left != 0) + ret = -1; + else + ret = 0; + + zlib_inflateEnd(&workspace->inf_strm); +out: + free_workspace(workspace); + return ret; +} + +void btrfs_zlib_exit(void) +{ + free_workspaces(); +} |