summaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Kconfig31
-rw-r--r--fs/btrfs/Makefile10
-rw-r--r--fs/btrfs/acl.c314
-rw-r--r--fs/btrfs/async-thread.c716
-rw-r--r--fs/btrfs/async-thread.h119
-rw-r--r--fs/btrfs/btrfs_inode.h170
-rw-r--r--fs/btrfs/compat.h7
-rw-r--r--fs/btrfs/compression.c707
-rw-r--r--fs/btrfs/compression.h47
-rw-r--r--fs/btrfs/ctree.c4388
-rw-r--r--fs/btrfs/ctree.h2411
-rw-r--r--fs/btrfs/delayed-ref.c919
-rw-r--r--fs/btrfs/delayed-ref.h214
-rw-r--r--fs/btrfs/dir-item.c431
-rw-r--r--fs/btrfs/disk-io.c2614
-rw-r--r--fs/btrfs/disk-io.h115
-rw-r--r--fs/btrfs/export.c240
-rw-r--r--fs/btrfs/export.h19
-rw-r--r--fs/btrfs/extent-tree.c7687
-rw-r--r--fs/btrfs/extent_io.c3856
-rw-r--r--fs/btrfs/extent_io.h303
-rw-r--r--fs/btrfs/extent_map.c419
-rw-r--r--fs/btrfs/extent_map.h65
-rw-r--r--fs/btrfs/file-item.c834
-rw-r--r--fs/btrfs/file.c1167
-rw-r--r--fs/btrfs/free-space-cache.c1364
-rw-r--r--fs/btrfs/free-space-cache.h53
-rw-r--r--fs/btrfs/hash.h27
-rw-r--r--fs/btrfs/inode-item.c209
-rw-r--r--fs/btrfs/inode-map.c80
-rw-r--r--fs/btrfs/inode.c6056
-rw-r--r--fs/btrfs/ioctl.c1352
-rw-r--r--fs/btrfs/ioctl.h70
-rw-r--r--fs/btrfs/locking.c234
-rw-r--r--fs/btrfs/locking.h31
-rw-r--r--fs/btrfs/ordered-data.c830
-rw-r--r--fs/btrfs/ordered-data.h162
-rw-r--r--fs/btrfs/orphan.c87
-rw-r--r--fs/btrfs/print-tree.c337
-rw-r--r--fs/btrfs/print-tree.h23
-rw-r--r--fs/btrfs/ref-cache.c231
-rw-r--r--fs/btrfs/ref-cache.h76
-rw-r--r--fs/btrfs/relocation.c3815
-rw-r--r--fs/btrfs/root-tree.c461
-rw-r--r--fs/btrfs/struct-funcs.c139
-rw-r--r--fs/btrfs/super.c799
-rw-r--r--fs/btrfs/sysfs.c269
-rw-r--r--fs/btrfs/transaction.c1153
-rw-r--r--fs/btrfs/transaction.h116
-rw-r--r--fs/btrfs/tree-defrag.c146
-rw-r--r--fs/btrfs/tree-log.c3198
-rw-r--r--fs/btrfs/tree-log.h51
-rw-r--r--fs/btrfs/version.h4
-rw-r--r--fs/btrfs/version.sh43
-rw-r--r--fs/btrfs/volumes.c3426
-rw-r--r--fs/btrfs/volumes.h187
-rw-r--r--fs/btrfs/xattr.c392
-rw-r--r--fs/btrfs/xattr.h42
-rw-r--r--fs/btrfs/zlib.c632
59 files changed, 53898 insertions, 0 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
new file mode 100644
index 00000000000..7bb3c020e57
--- /dev/null
+++ b/fs/btrfs/Kconfig
@@ -0,0 +1,31 @@
+config BTRFS_FS
+ tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
+ depends on EXPERIMENTAL
+ select LIBCRC32C
+ select ZLIB_INFLATE
+ select ZLIB_DEFLATE
+ help
+ Btrfs is a new filesystem with extents, writable snapshotting,
+ support for multiple devices and many more features.
+
+ Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
+ FINALIZED. You should say N here unless you are interested in
+ testing Btrfs with non-critical data.
+
+ To compile this file system support as a module, choose M here. The
+ module will be called btrfs.
+
+ If unsure, say N.
+
+config BTRFS_FS_POSIX_ACL
+ bool "Btrfs POSIX Access Control Lists"
+ depends on BTRFS_FS
+ select FS_POSIX_ACL
+ help
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the POSIX ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
new file mode 100644
index 00000000000..a35eb36b32f
--- /dev/null
+++ b/fs/btrfs/Makefile
@@ -0,0 +1,10 @@
+
+obj-$(CONFIG_BTRFS_FS) := btrfs.o
+
+btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
+ file-item.o inode-item.o inode-map.o disk-io.o \
+ transaction.o inode.o file.o tree-defrag.o \
+ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
+ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
+ export.o tree-log.o acl.o free-space-cache.o zlib.o \
+ compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
new file mode 100644
index 00000000000..9be949a231e
--- /dev/null
+++ b/fs/btrfs/acl.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (C) 2007 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/sched.h>
+
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "xattr.h"
+
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+
+static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
+{
+ int size;
+ const char *name;
+ char *value = NULL;
+ struct posix_acl *acl;
+
+ acl = get_cached_acl(inode, type);
+ if (acl != ACL_NOT_CACHED)
+ return acl;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = POSIX_ACL_XATTR_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name = POSIX_ACL_XATTR_DEFAULT;
+ break;
+ default:
+ BUG();
+ }
+
+ size = __btrfs_getxattr(inode, name, "", 0);
+ if (size > 0) {
+ value = kzalloc(size, GFP_NOFS);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ size = __btrfs_getxattr(inode, name, value, size);
+ if (size > 0) {
+ acl = posix_acl_from_xattr(value, size);
+ set_cached_acl(inode, type, acl);
+ }
+ kfree(value);
+ } else if (size == -ENOENT || size == -ENODATA || size == 0) {
+ /* FIXME, who returns -ENOENT? I think nobody */
+ acl = NULL;
+ set_cached_acl(inode, type, acl);
+ } else {
+ acl = ERR_PTR(-EIO);
+ }
+
+ return acl;
+}
+
+static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
+ void *value, size_t size, int type)
+{
+ struct posix_acl *acl;
+ int ret = 0;
+
+ acl = btrfs_get_acl(dentry->d_inode, type);
+
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl == NULL)
+ return -ENODATA;
+ ret = posix_acl_to_xattr(acl, value, size);
+ posix_acl_release(acl);
+
+ return ret;
+}
+
+/*
+ * Needs to be called with fs_mutex held
+ */
+static int btrfs_set_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct posix_acl *acl, int type)
+{
+ int ret, size = 0;
+ const char *name;
+ char *value = NULL;
+ mode_t mode;
+
+ if (acl) {
+ ret = posix_acl_valid(acl);
+ if (ret < 0)
+ return ret;
+ ret = 0;
+ }
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ mode = inode->i_mode;
+ name = POSIX_ACL_XATTR_ACCESS;
+ if (acl) {
+ ret = posix_acl_equiv_mode(acl, &mode);
+ if (ret < 0)
+ return ret;
+ inode->i_mode = mode;
+ }
+ ret = 0;
+ break;
+ case ACL_TYPE_DEFAULT:
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EINVAL : 0;
+ name = POSIX_ACL_XATTR_DEFAULT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (acl) {
+ size = posix_acl_xattr_size(acl->a_count);
+ value = kmalloc(size, GFP_NOFS);
+ if (!value) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = posix_acl_to_xattr(acl, value, size);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = __btrfs_setxattr(trans, inode, name, value, size, 0);
+out:
+ kfree(value);
+
+ if (!ret)
+ set_cached_acl(inode, type, acl);
+
+ return ret;
+}
+
+static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
+{
+ int ret;
+ struct posix_acl *acl = NULL;
+
+ if (!is_owner_or_cap(dentry->d_inode))
+ return -EPERM;
+
+ if (value) {
+ acl = posix_acl_from_xattr(value, size);
+ if (acl == NULL) {
+ value = NULL;
+ size = 0;
+ } else if (IS_ERR(acl)) {
+ return PTR_ERR(acl);
+ }
+ }
+
+ ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
+
+ posix_acl_release(acl);
+
+ return ret;
+}
+
+int btrfs_check_acl(struct inode *inode, int mask)
+{
+ struct posix_acl *acl;
+ int error = -EAGAIN;
+
+ acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl) {
+ error = posix_acl_permission(inode, acl, mask);
+ posix_acl_release(acl);
+ }
+
+ return error;
+}
+
+/*
+ * btrfs_init_acl is already generally called under fs_mutex, so the locking
+ * stuff has been fixed to work with that. If the locking stuff changes, we
+ * need to re-evaluate the acl locking stuff.
+ */
+int btrfs_init_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir)
+{
+ struct posix_acl *acl = NULL;
+ int ret = 0;
+
+ /* this happens with subvols */
+ if (!dir)
+ return 0;
+
+ if (!S_ISLNK(inode->i_mode)) {
+ if (IS_POSIXACL(dir)) {
+ acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+
+ if (!acl)
+ inode->i_mode &= ~current_umask();
+ }
+
+ if (IS_POSIXACL(dir) && acl) {
+ struct posix_acl *clone;
+ mode_t mode;
+
+ if (S_ISDIR(inode->i_mode)) {
+ ret = btrfs_set_acl(trans, inode, acl,
+ ACL_TYPE_DEFAULT);
+ if (ret)
+ goto failed;
+ }
+ clone = posix_acl_clone(acl, GFP_NOFS);
+ ret = -ENOMEM;
+ if (!clone)
+ goto failed;
+
+ mode = inode->i_mode;
+ ret = posix_acl_create_masq(clone, &mode);
+ if (ret >= 0) {
+ inode->i_mode = mode;
+ if (ret > 0) {
+ /* we need an acl */
+ ret = btrfs_set_acl(trans, inode, clone,
+ ACL_TYPE_ACCESS);
+ }
+ }
+ posix_acl_release(clone);
+ }
+failed:
+ posix_acl_release(acl);
+
+ return ret;
+}
+
+int btrfs_acl_chmod(struct inode *inode)
+{
+ struct posix_acl *acl, *clone;
+ int ret = 0;
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if (!IS_POSIXACL(inode))
+ return 0;
+
+ acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl) || !acl)
+ return PTR_ERR(acl);
+
+ clone = posix_acl_clone(acl, GFP_KERNEL);
+ posix_acl_release(acl);
+ if (!clone)
+ return -ENOMEM;
+
+ ret = posix_acl_chmod_masq(clone, inode->i_mode);
+ if (!ret)
+ ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS);
+
+ posix_acl_release(clone);
+
+ return ret;
+}
+
+struct xattr_handler btrfs_xattr_acl_default_handler = {
+ .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .flags = ACL_TYPE_DEFAULT,
+ .get = btrfs_xattr_acl_get,
+ .set = btrfs_xattr_acl_set,
+};
+
+struct xattr_handler btrfs_xattr_acl_access_handler = {
+ .prefix = POSIX_ACL_XATTR_ACCESS,
+ .flags = ACL_TYPE_ACCESS,
+ .get = btrfs_xattr_acl_get,
+ .set = btrfs_xattr_acl_set,
+};
+
+#else /* CONFIG_BTRFS_FS_POSIX_ACL */
+
+int btrfs_acl_chmod(struct inode *inode)
+{
+ return 0;
+}
+
+int btrfs_init_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir)
+{
+ return 0;
+}
+
+#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 00000000000..c0861e781cd
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,716 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/freezer.h>
+#include "async-thread.h"
+
+#define WORK_QUEUED_BIT 0
+#define WORK_DONE_BIT 1
+#define WORK_ORDER_DONE_BIT 2
+#define WORK_HIGH_PRIO_BIT 3
+
+/*
+ * container for the kthread task pointer and the list of pending work
+ * One of these is allocated per thread.
+ */
+struct btrfs_worker_thread {
+ /* pool we belong to */
+ struct btrfs_workers *workers;
+
+ /* list of struct btrfs_work that are waiting for service */
+ struct list_head pending;
+ struct list_head prio_pending;
+
+ /* list of worker threads from struct btrfs_workers */
+ struct list_head worker_list;
+
+ /* kthread */
+ struct task_struct *task;
+
+ /* number of things on the pending list */
+ atomic_t num_pending;
+
+ /* reference counter for this struct */
+ atomic_t refs;
+
+ unsigned long sequence;
+
+ /* protects the pending list. */
+ spinlock_t lock;
+
+ /* set to non-zero when this thread is already awake and kicking */
+ int working;
+
+ /* are we currently idle */
+ int idle;
+};
+
+/*
+ * btrfs_start_workers uses kthread_run, which can block waiting for memory
+ * for a very long time. It will actually throttle on page writeback,
+ * and so it may not make progress until after our btrfs worker threads
+ * process all of the pending work structs in their queue
+ *
+ * This means we can't use btrfs_start_workers from inside a btrfs worker
+ * thread that is used as part of cleaning dirty memory, which pretty much
+ * involves all of the worker threads.
+ *
+ * Instead we have a helper queue who never has more than one thread
+ * where we scheduler thread start operations. This worker_start struct
+ * is used to contain the work and hold a pointer to the queue that needs
+ * another worker.
+ */
+struct worker_start {
+ struct btrfs_work work;
+ struct btrfs_workers *queue;
+};
+
+static void start_new_worker_func(struct btrfs_work *work)
+{
+ struct worker_start *start;
+ start = container_of(work, struct worker_start, work);
+ btrfs_start_workers(start->queue, 1);
+ kfree(start);
+}
+
+static int start_new_worker(struct btrfs_workers *queue)
+{
+ struct worker_start *start;
+ int ret;
+
+ start = kzalloc(sizeof(*start), GFP_NOFS);
+ if (!start)
+ return -ENOMEM;
+
+ start->work.func = start_new_worker_func;
+ start->queue = queue;
+ ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
+ if (ret)
+ kfree(start);
+ return ret;
+}
+
+/*
+ * helper function to move a thread onto the idle list after it
+ * has finished some requests.
+ */
+static void check_idle_worker(struct btrfs_worker_thread *worker)
+{
+ if (!worker->idle && atomic_read(&worker->num_pending) <
+ worker->workers->idle_thresh / 2) {
+ unsigned long flags;
+ spin_lock_irqsave(&worker->workers->lock, flags);
+ worker->idle = 1;
+
+ /* the list may be empty if the worker is just starting */
+ if (!list_empty(&worker->worker_list)) {
+ list_move(&worker->worker_list,
+ &worker->workers->idle_list);
+ }
+ spin_unlock_irqrestore(&worker->workers->lock, flags);
+ }
+}
+
+/*
+ * helper function to move a thread off the idle list after new
+ * pending work is added.
+ */
+static void check_busy_worker(struct btrfs_worker_thread *worker)
+{
+ if (worker->idle && atomic_read(&worker->num_pending) >=
+ worker->workers->idle_thresh) {
+ unsigned long flags;
+ spin_lock_irqsave(&worker->workers->lock, flags);
+ worker->idle = 0;
+
+ if (!list_empty(&worker->worker_list)) {
+ list_move_tail(&worker->worker_list,
+ &worker->workers->worker_list);
+ }
+ spin_unlock_irqrestore(&worker->workers->lock, flags);
+ }
+}
+
+static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
+{
+ struct btrfs_workers *workers = worker->workers;
+ unsigned long flags;
+
+ rmb();
+ if (!workers->atomic_start_pending)
+ return;
+
+ spin_lock_irqsave(&workers->lock, flags);
+ if (!workers->atomic_start_pending)
+ goto out;
+
+ workers->atomic_start_pending = 0;
+ if (workers->num_workers + workers->num_workers_starting >=
+ workers->max_workers)
+ goto out;
+
+ workers->num_workers_starting += 1;
+ spin_unlock_irqrestore(&workers->lock, flags);
+ start_new_worker(workers);
+ return;
+
+out:
+ spin_unlock_irqrestore(&workers->lock, flags);
+}
+
+static noinline int run_ordered_completions(struct btrfs_workers *workers,
+ struct btrfs_work *work)
+{
+ if (!workers->ordered)
+ return 0;
+
+ set_bit(WORK_DONE_BIT, &work->flags);
+
+ spin_lock(&workers->order_lock);
+
+ while (1) {
+ if (!list_empty(&workers->prio_order_list)) {
+ work = list_entry(workers->prio_order_list.next,
+ struct btrfs_work, order_list);
+ } else if (!list_empty(&workers->order_list)) {
+ work = list_entry(workers->order_list.next,
+ struct btrfs_work, order_list);
+ } else {
+ break;
+ }
+ if (!test_bit(WORK_DONE_BIT, &work->flags))
+ break;
+
+ /* we are going to call the ordered done function, but
+ * we leave the work item on the list as a barrier so
+ * that later work items that are done don't have their
+ * functions called before this one returns
+ */
+ if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
+ break;
+
+ spin_unlock(&workers->order_lock);
+
+ work->ordered_func(work);
+
+ /* now take the lock again and call the freeing code */
+ spin_lock(&workers->order_lock);
+ list_del(&work->order_list);
+ work->ordered_free(work);
+ }
+
+ spin_unlock(&workers->order_lock);
+ return 0;
+}
+
+static void put_worker(struct btrfs_worker_thread *worker)
+{
+ if (atomic_dec_and_test(&worker->refs))
+ kfree(worker);
+}
+
+static int try_worker_shutdown(struct btrfs_worker_thread *worker)
+{
+ int freeit = 0;
+
+ spin_lock_irq(&worker->lock);
+ spin_lock(&worker->workers->lock);
+ if (worker->workers->num_workers > 1 &&
+ worker->idle &&
+ !worker->working &&
+ !list_empty(&worker->worker_list) &&
+ list_empty(&worker->prio_pending) &&
+ list_empty(&worker->pending) &&
+ atomic_read(&worker->num_pending) == 0) {
+ freeit = 1;
+ list_del_init(&worker->worker_list);
+ worker->workers->num_workers--;
+ }
+ spin_unlock(&worker->workers->lock);
+ spin_unlock_irq(&worker->lock);
+
+ if (freeit)
+ put_worker(worker);
+ return freeit;
+}
+
+static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
+ struct list_head *prio_head,
+ struct list_head *head)
+{
+ struct btrfs_work *work = NULL;
+ struct list_head *cur = NULL;
+
+ if(!list_empty(prio_head))
+ cur = prio_head->next;
+
+ smp_mb();
+ if (!list_empty(&worker->prio_pending))
+ goto refill;
+
+ if (!list_empty(head))
+ cur = head->next;
+
+ if (cur)
+ goto out;
+
+refill:
+ spin_lock_irq(&worker->lock);
+ list_splice_tail_init(&worker->prio_pending, prio_head);
+ list_splice_tail_init(&worker->pending, head);
+
+ if (!list_empty(prio_head))
+ cur = prio_head->next;
+ else if (!list_empty(head))
+ cur = head->next;
+ spin_unlock_irq(&worker->lock);
+
+ if (!cur)
+ goto out_fail;
+
+out:
+ work = list_entry(cur, struct btrfs_work, list);
+
+out_fail:
+ return work;
+}
+
+/*
+ * main loop for servicing work items
+ */
+static int worker_loop(void *arg)
+{
+ struct btrfs_worker_thread *worker = arg;
+ struct list_head head;
+ struct list_head prio_head;
+ struct btrfs_work *work;
+
+ INIT_LIST_HEAD(&head);
+ INIT_LIST_HEAD(&prio_head);
+
+ do {
+again:
+ while (1) {
+
+
+ work = get_next_work(worker, &prio_head, &head);
+ if (!work)
+ break;
+
+ list_del(&work->list);
+ clear_bit(WORK_QUEUED_BIT, &work->flags);
+
+ work->worker = worker;
+
+ work->func(work);
+
+ atomic_dec(&worker->num_pending);
+ /*
+ * unless this is an ordered work queue,
+ * 'work' was probably freed by func above.
+ */
+ run_ordered_completions(worker->workers, work);
+
+ check_pending_worker_creates(worker);
+
+ }
+
+ spin_lock_irq(&worker->lock);
+ check_idle_worker(worker);
+
+ if (freezing(current)) {
+ worker->working = 0;
+ spin_unlock_irq(&worker->lock);
+ refrigerator();
+ } else {
+ spin_unlock_irq(&worker->lock);
+ if (!kthread_should_stop()) {
+ cpu_relax();
+ /*
+ * we've dropped the lock, did someone else
+ * jump_in?
+ */
+ smp_mb();
+ if (!list_empty(&worker->pending) ||
+ !list_empty(&worker->prio_pending))
+ continue;
+
+ /*
+ * this short schedule allows more work to
+ * come in without the queue functions
+ * needing to go through wake_up_process()
+ *
+ * worker->working is still 1, so nobody
+ * is going to try and wake us up
+ */
+ schedule_timeout(1);
+ smp_mb();
+ if (!list_empty(&worker->pending) ||
+ !list_empty(&worker->prio_pending))
+ continue;
+
+ if (kthread_should_stop())
+ break;
+
+ /* still no more work?, sleep for real */
+ spin_lock_irq(&worker->lock);
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!list_empty(&worker->pending) ||
+ !list_empty(&worker->prio_pending)) {
+ spin_unlock_irq(&worker->lock);
+ goto again;
+ }
+
+ /*
+ * this makes sure we get a wakeup when someone
+ * adds something new to the queue
+ */
+ worker->working = 0;
+ spin_unlock_irq(&worker->lock);
+
+ if (!kthread_should_stop()) {
+ schedule_timeout(HZ * 120);
+ if (!worker->working &&
+ try_worker_shutdown(worker)) {
+ return 0;
+ }
+ }
+ }
+ __set_current_state(TASK_RUNNING);
+ }
+ } while (!kthread_should_stop());
+ return 0;
+}
+
+/*
+ * this will wait for all the worker threads to shutdown
+ */
+int btrfs_stop_workers(struct btrfs_workers *workers)
+{
+ struct list_head *cur;
+ struct btrfs_worker_thread *worker;
+ int can_stop;
+
+ spin_lock_irq(&workers->lock);
+ list_splice_init(&workers->idle_list, &workers->worker_list);
+ while (!list_empty(&workers->worker_list)) {
+ cur = workers->worker_list.next;
+ worker = list_entry(cur, struct btrfs_worker_thread,
+ worker_list);
+
+ atomic_inc(&worker->refs);
+ workers->num_workers -= 1;
+ if (!list_empty(&worker->worker_list)) {
+ list_del_init(&worker->worker_list);
+ put_worker(worker);
+ can_stop = 1;
+ } else
+ can_stop = 0;
+ spin_unlock_irq(&workers->lock);
+ if (can_stop)
+ kthread_stop(worker->task);
+ spin_lock_irq(&workers->lock);
+ put_worker(worker);
+ }
+ spin_unlock_irq(&workers->lock);
+ return 0;
+}
+
+/*
+ * simple init on struct btrfs_workers
+ */
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+ struct btrfs_workers *async_helper)
+{
+ workers->num_workers = 0;
+ workers->num_workers_starting = 0;
+ INIT_LIST_HEAD(&workers->worker_list);
+ INIT_LIST_HEAD(&workers->idle_list);
+ INIT_LIST_HEAD(&workers->order_list);
+ INIT_LIST_HEAD(&workers->prio_order_list);
+ spin_lock_init(&workers->lock);
+ spin_lock_init(&workers->order_lock);
+ workers->max_workers = max;
+ workers->idle_thresh = 32;
+ workers->name = name;
+ workers->ordered = 0;
+ workers->atomic_start_pending = 0;
+ workers->atomic_worker_start = async_helper;
+}
+
+/*
+ * starts new worker threads. This does not enforce the max worker
+ * count in case you need to temporarily go past it.
+ */
+static int __btrfs_start_workers(struct btrfs_workers *workers,
+ int num_workers)
+{
+ struct btrfs_worker_thread *worker;
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < num_workers; i++) {
+ worker = kzalloc(sizeof(*worker), GFP_NOFS);
+ if (!worker) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ INIT_LIST_HEAD(&worker->pending);
+ INIT_LIST_HEAD(&worker->prio_pending);
+ INIT_LIST_HEAD(&worker->worker_list);
+ spin_lock_init(&worker->lock);
+
+ atomic_set(&worker->num_pending, 0);
+ atomic_set(&worker->refs, 1);
+ worker->workers = workers;
+ worker->task = kthread_run(worker_loop, worker,
+ "btrfs-%s-%d", workers->name,
+ workers->num_workers + i);
+ if (IS_ERR(worker->task)) {
+ ret = PTR_ERR(worker->task);
+ kfree(worker);
+ goto fail;
+ }
+ spin_lock_irq(&workers->lock);
+ list_add_tail(&worker->worker_list, &workers->idle_list);
+ worker->idle = 1;
+ workers->num_workers++;
+ workers->num_workers_starting--;
+ WARN_ON(workers->num_workers_starting < 0);
+ spin_unlock_irq(&workers->lock);
+ }
+ return 0;
+fail:
+ btrfs_stop_workers(workers);
+ return ret;
+}
+
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+{
+ spin_lock_irq(&workers->lock);
+ workers->num_workers_starting += num_workers;
+ spin_unlock_irq(&workers->lock);
+ return __btrfs_start_workers(workers, num_workers);
+}
+
+/*
+ * run through the list and find a worker thread that doesn't have a lot
+ * to do right now. This can return null if we aren't yet at the thread
+ * count limit and all of the threads are busy.
+ */
+static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
+{
+ struct btrfs_worker_thread *worker;
+ struct list_head *next;
+ int enforce_min;
+
+ enforce_min = (workers->num_workers + workers->num_workers_starting) <
+ workers->max_workers;
+
+ /*
+ * if we find an idle thread, don't move it to the end of the
+ * idle list. This improves the chance that the next submission
+ * will reuse the same thread, and maybe catch it while it is still
+ * working
+ */
+ if (!list_empty(&workers->idle_list)) {
+ next = workers->idle_list.next;
+ worker = list_entry(next, struct btrfs_worker_thread,
+ worker_list);
+ return worker;
+ }
+ if (enforce_min || list_empty(&workers->worker_list))
+ return NULL;
+
+ /*
+ * if we pick a busy task, move the task to the end of the list.
+ * hopefully this will keep things somewhat evenly balanced.
+ * Do the move in batches based on the sequence number. This groups
+ * requests submitted at roughly the same time onto the same worker.
+ */
+ next = workers->worker_list.next;
+ worker = list_entry(next, struct btrfs_worker_thread, worker_list);
+ worker->sequence++;
+
+ if (worker->sequence % workers->idle_thresh == 0)
+ list_move_tail(next, &workers->worker_list);
+ return worker;
+}
+
+/*
+ * selects a worker thread to take the next job. This will either find
+ * an idle worker, start a new worker up to the max count, or just return
+ * one of the existing busy workers.
+ */
+static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
+{
+ struct btrfs_worker_thread *worker;
+ unsigned long flags;
+ struct list_head *fallback;
+
+again:
+ spin_lock_irqsave(&workers->lock, flags);
+ worker = next_worker(workers);
+
+ if (!worker) {
+ if (workers->num_workers + workers->num_workers_starting >=
+ workers->max_workers) {
+ goto fallback;
+ } else if (workers->atomic_worker_start) {
+ workers->atomic_start_pending = 1;
+ goto fallback;
+ } else {
+ workers->num_workers_starting++;
+ spin_unlock_irqrestore(&workers->lock, flags);
+ /* we're below the limit, start another worker */
+ __btrfs_start_workers(workers, 1);
+ goto again;
+ }
+ }
+ goto found;
+
+fallback:
+ fallback = NULL;
+ /*
+ * we have failed to find any workers, just
+ * return the first one we can find.
+ */
+ if (!list_empty(&workers->worker_list))
+ fallback = workers->worker_list.next;
+ if (!list_empty(&workers->idle_list))
+ fallback = workers->idle_list.next;
+ BUG_ON(!fallback);
+ worker = list_entry(fallback,
+ struct btrfs_worker_thread, worker_list);
+found:
+ /*
+ * this makes sure the worker doesn't exit before it is placed
+ * onto a busy/idle list
+ */
+ atomic_inc(&worker->num_pending);
+ spin_unlock_irqrestore(&workers->lock, flags);
+ return worker;
+}
+
+/*
+ * btrfs_requeue_work just puts the work item back on the tail of the list
+ * it was taken from. It is intended for use with long running work functions
+ * that make some progress and want to give the cpu up for others.
+ */
+int btrfs_requeue_work(struct btrfs_work *work)
+{
+ struct btrfs_worker_thread *worker = work->worker;
+ unsigned long flags;
+ int wake = 0;
+
+ if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
+ goto out;
+
+ spin_lock_irqsave(&worker->lock, flags);
+ if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+ list_add_tail(&work->list, &worker->prio_pending);
+ else
+ list_add_tail(&work->list, &worker->pending);
+ atomic_inc(&worker->num_pending);
+
+ /* by definition we're busy, take ourselves off the idle
+ * list
+ */
+ if (worker->idle) {
+ spin_lock(&worker->workers->lock);
+ worker->idle = 0;
+ list_move_tail(&worker->worker_list,
+ &worker->workers->worker_list);
+ spin_unlock(&worker->workers->lock);
+ }
+ if (!worker->working) {
+ wake = 1;
+ worker->working = 1;
+ }
+
+ if (wake)
+ wake_up_process(worker->task);
+ spin_unlock_irqrestore(&worker->lock, flags);
+out:
+
+ return 0;
+}
+
+void btrfs_set_work_high_prio(struct btrfs_work *work)
+{
+ set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+}
+
+/*
+ * places a struct btrfs_work into the pending queue of one of the kthreads
+ */
+int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+{
+ struct btrfs_worker_thread *worker;
+ unsigned long flags;
+ int wake = 0;
+
+ /* don't requeue something already on a list */
+ if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
+ goto out;
+
+ worker = find_worker(workers);
+ if (workers->ordered) {
+ /*
+ * you're not allowed to do ordered queues from an
+ * interrupt handler
+ */
+ spin_lock(&workers->order_lock);
+ if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
+ list_add_tail(&work->order_list,
+ &workers->prio_order_list);
+ } else {
+ list_add_tail(&work->order_list, &workers->order_list);
+ }
+ spin_unlock(&workers->order_lock);
+ } else {
+ INIT_LIST_HEAD(&work->order_list);
+ }
+
+ spin_lock_irqsave(&worker->lock, flags);
+
+ if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+ list_add_tail(&work->list, &worker->prio_pending);
+ else
+ list_add_tail(&work->list, &worker->pending);
+ check_busy_worker(worker);
+
+ /*
+ * avoid calling into wake_up_process if this thread has already
+ * been kicked
+ */
+ if (!worker->working)
+ wake = 1;
+ worker->working = 1;
+
+ if (wake)
+ wake_up_process(worker->task);
+ spin_unlock_irqrestore(&worker->lock, flags);
+
+out:
+ return 0;
+}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
new file mode 100644
index 00000000000..5077746cf85
--- /dev/null
+++ b/fs/btrfs/async-thread.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_ASYNC_THREAD_
+#define __BTRFS_ASYNC_THREAD_
+
+struct btrfs_worker_thread;
+
+/*
+ * This is similar to a workqueue, but it is meant to spread the operations
+ * across all available cpus instead of just the CPU that was used to
+ * queue the work. There is also some batching introduced to try and
+ * cut down on context switches.
+ *
+ * By default threads are added on demand up to 2 * the number of cpus.
+ * Changing struct btrfs_workers->max_workers is one way to prevent
+ * demand creation of kthreads.
+ *
+ * the basic model of these worker threads is to embed a btrfs_work
+ * structure in your own data struct, and use container_of in a
+ * work function to get back to your data struct.
+ */
+struct btrfs_work {
+ /*
+ * func should be set to the function you want called
+ * your work struct is passed as the only arg
+ *
+ * ordered_func must be set for work sent to an ordered work queue,
+ * and it is called to complete a given work item in the same
+ * order they were sent to the queue.
+ */
+ void (*func)(struct btrfs_work *work);
+ void (*ordered_func)(struct btrfs_work *work);
+ void (*ordered_free)(struct btrfs_work *work);
+
+ /*
+ * flags should be set to zero. It is used to make sure the
+ * struct is only inserted once into the list.
+ */
+ unsigned long flags;
+
+ /* don't touch these */
+ struct btrfs_worker_thread *worker;
+ struct list_head list;
+ struct list_head order_list;
+};
+
+struct btrfs_workers {
+ /* current number of running workers */
+ int num_workers;
+
+ int num_workers_starting;
+
+ /* max number of workers allowed. changed by btrfs_start_workers */
+ int max_workers;
+
+ /* once a worker has this many requests or fewer, it is idle */
+ int idle_thresh;
+
+ /* force completions in the order they were queued */
+ int ordered;
+
+ /* more workers required, but in an interrupt handler */
+ int atomic_start_pending;
+
+ /*
+ * are we allowed to sleep while starting workers or are we required
+ * to start them at a later time? If we can't sleep, this indicates
+ * which queue we need to use to schedule thread creation.
+ */
+ struct btrfs_workers *atomic_worker_start;
+
+ /* list with all the work threads. The workers on the idle thread
+ * may be actively servicing jobs, but they haven't yet hit the
+ * idle thresh limit above.
+ */
+ struct list_head worker_list;
+ struct list_head idle_list;
+
+ /*
+ * when operating in ordered mode, this maintains the list
+ * of work items waiting for completion
+ */
+ struct list_head order_list;
+ struct list_head prio_order_list;
+
+ /* lock for finding the next worker thread to queue on */
+ spinlock_t lock;
+
+ /* lock for the ordered lists */
+ spinlock_t order_lock;
+
+ /* extra name for this worker, used for current->name */
+ char *name;
+};
+
+int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
+int btrfs_stop_workers(struct btrfs_workers *workers);
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+ struct btrfs_workers *async_starter);
+int btrfs_requeue_work(struct btrfs_work *work);
+void btrfs_set_work_high_prio(struct btrfs_work *work);
+#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
new file mode 100644
index 00000000000..3f1f50d9d91
--- /dev/null
+++ b/fs/btrfs/btrfs_inode.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_I__
+#define __BTRFS_I__
+
+#include "extent_map.h"
+#include "extent_io.h"
+#include "ordered-data.h"
+
+/* in memory btrfs inode */
+struct btrfs_inode {
+ /* which subvolume this inode belongs to */
+ struct btrfs_root *root;
+
+ /* key used to find this inode on disk. This is used by the code
+ * to read in roots of subvolumes
+ */
+ struct btrfs_key location;
+
+ /* the extent_tree has caches of all the extent mappings to disk */
+ struct extent_map_tree extent_tree;
+
+ /* the io_tree does range state (DIRTY, LOCKED etc) */
+ struct extent_io_tree io_tree;
+
+ /* special utility tree used to record which mirrors have already been
+ * tried when checksums fail for a given block
+ */
+ struct extent_io_tree io_failure_tree;
+
+ /* held while logging the inode in tree-log.c */
+ struct mutex log_mutex;
+
+ /* used to order data wrt metadata */
+ struct btrfs_ordered_inode_tree ordered_tree;
+
+ /* for keeping track of orphaned inodes */
+ struct list_head i_orphan;
+
+ /* list of all the delalloc inodes in the FS. There are times we need
+ * to write all the delalloc pages to disk, and this list is used
+ * to walk them all.
+ */
+ struct list_head delalloc_inodes;
+
+ /*
+ * list for tracking inodes that must be sent to disk before a
+ * rename or truncate commit
+ */
+ struct list_head ordered_operations;
+
+ /* node for the red-black tree that links inodes in subvolume root */
+ struct rb_node rb_node;
+
+ /* the space_info for where this inode's data allocations are done */
+ struct btrfs_space_info *space_info;
+
+ /* full 64 bit generation number, struct vfs_inode doesn't have a big
+ * enough field for this.
+ */
+ u64 generation;
+
+ /* sequence number for NFS changes */
+ u64 sequence;
+
+ /*
+ * transid of the trans_handle that last modified this inode
+ */
+ u64 last_trans;
+
+ /*
+ * log transid when this inode was last modified
+ */
+ u64 last_sub_trans;
+
+ /*
+ * transid that last logged this inode
+ */
+ u64 logged_trans;
+
+ /* total number of bytes pending delalloc, used by stat to calc the
+ * real block usage of the file
+ */
+ u64 delalloc_bytes;
+
+ /* total number of bytes that may be used for this inode for
+ * delalloc
+ */
+ u64 reserved_bytes;
+
+ /*
+ * the size of the file stored in the metadata on disk. data=ordered
+ * means the in-memory i_size might be larger than the size on disk
+ * because not all the blocks are written yet.
+ */
+ u64 disk_i_size;
+
+ /* flags field from the on disk inode */
+ u32 flags;
+
+ /*
+ * if this is a directory then index_cnt is the counter for the index
+ * number for new files that are created
+ */
+ u64 index_cnt;
+
+ /* the start of block group preferred for allocations. */
+ u64 block_group;
+
+ /* the fsync log has some corner cases that mean we have to check
+ * directories to see if any unlinks have been done before
+ * the directory was logged. See tree-log.c for all the
+ * details
+ */
+ u64 last_unlink_trans;
+
+ /*
+ * Counters to keep track of the number of extent item's we may use due
+ * to delalloc and such. outstanding_extents is the number of extent
+ * items we think we'll end up using, and reserved_extents is the number
+ * of extent items we've reserved metadata for.
+ */
+ spinlock_t accounting_lock;
+ int reserved_extents;
+ int outstanding_extents;
+
+ /*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero. When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ *
+ * yes, its silly to have a single bitflag, but we might grow more
+ * of these.
+ */
+ unsigned ordered_data_close:1;
+ unsigned dummy_inode:1;
+
+ struct inode vfs_inode;
+};
+
+static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
+{
+ return container_of(inode, struct btrfs_inode, vfs_inode);
+}
+
+static inline void btrfs_i_size_write(struct inode *inode, u64 size)
+{
+ i_size_write(inode, size);
+ BTRFS_I(inode)->disk_i_size = size;
+}
+
+#endif
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
new file mode 100644
index 00000000000..7c4503ef6ef
--- /dev/null
+++ b/fs/btrfs/compat.h
@@ -0,0 +1,7 @@
+#ifndef _COMPAT_H_
+#define _COMPAT_H_
+
+#define btrfs_drop_nlink(inode) drop_nlink(inode)
+#define btrfs_inc_nlink(inode) inc_nlink(inode)
+
+#endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 00000000000..a11a32058b5
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,707 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+#include <linux/pagevec.h>
+#include "compat.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "compression.h"
+#include "extent_io.h"
+#include "extent_map.h"
+
+struct compressed_bio {
+ /* number of bios pending for this compressed extent */
+ atomic_t pending_bios;
+
+ /* the pages with the compressed data on them */
+ struct page **compressed_pages;
+
+ /* inode that owns this data */
+ struct inode *inode;
+
+ /* starting offset in the inode for our pages */
+ u64 start;
+
+ /* number of bytes in the inode we're working on */
+ unsigned long len;
+
+ /* number of bytes on disk */
+ unsigned long compressed_len;
+
+ /* number of compressed pages in the array */
+ unsigned long nr_pages;
+
+ /* IO errors */
+ int errors;
+ int mirror_num;
+
+ /* for reads, this is the bio we are copying the data into */
+ struct bio *orig_bio;
+
+ /*
+ * the start of a variable length array of checksums only
+ * used by reads
+ */
+ u32 sums;
+};
+
+static inline int compressed_bio_size(struct btrfs_root *root,
+ unsigned long disk_size)
+{
+ u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+ return sizeof(struct compressed_bio) +
+ ((disk_size + root->sectorsize - 1) / root->sectorsize) *
+ csum_size;
+}
+
+static struct bio *compressed_bio_alloc(struct block_device *bdev,
+ u64 first_byte, gfp_t gfp_flags)
+{
+ struct bio *bio;
+ int nr_vecs;
+
+ nr_vecs = bio_get_nr_vecs(bdev);
+ bio = bio_alloc(gfp_flags, nr_vecs);
+
+ if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+ while (!bio && (nr_vecs /= 2))
+ bio = bio_alloc(gfp_flags, nr_vecs);
+ }
+
+ if (bio) {
+ bio->bi_size = 0;
+ bio->bi_bdev = bdev;
+ bio->bi_sector = first_byte >> 9;
+ }
+ return bio;
+}
+
+static int check_compressed_csum(struct inode *inode,
+ struct compressed_bio *cb,
+ u64 disk_start)
+{
+ int ret;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct page *page;
+ unsigned long i;
+ char *kaddr;
+ u32 csum;
+ u32 *cb_sum = &cb->sums;
+
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ return 0;
+
+ for (i = 0; i < cb->nr_pages; i++) {
+ page = cb->compressed_pages[i];
+ csum = ~(u32)0;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
+ btrfs_csum_final(csum, (char *)&csum);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ if (csum != *cb_sum) {
+ printk(KERN_INFO "btrfs csum failed ino %lu "
+ "extent %llu csum %u "
+ "wanted %u mirror %d\n", inode->i_ino,
+ (unsigned long long)disk_start,
+ csum, *cb_sum, cb->mirror_num);
+ ret = -EIO;
+ goto fail;
+ }
+ cb_sum++;
+
+ }
+ ret = 0;
+fail:
+ return ret;
+}
+
+/* when we finish reading compressed pages from the disk, we
+ * decompress them and then run the bio end_io routines on the
+ * decompressed pages (in the inode address space).
+ *
+ * This allows the checksumming and other IO error handling routines
+ * to work normally
+ *
+ * The compressed pages are freed here, and it must be run
+ * in process context
+ */
+static void end_compressed_bio_read(struct bio *bio, int err)
+{
+ struct extent_io_tree *tree;
+ struct compressed_bio *cb = bio->bi_private;
+ struct inode *inode;
+ struct page *page;
+ unsigned long index;
+ int ret;
+
+ if (err)
+ cb->errors = 1;
+
+ /* if there are more bios still pending for this compressed
+ * extent, just exit
+ */
+ if (!atomic_dec_and_test(&cb->pending_bios))
+ goto out;
+
+ inode = cb->inode;
+ ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9);
+ if (ret)
+ goto csum_failed;
+
+ /* ok, we're the last bio for this extent, lets start
+ * the decompression.
+ */
+ tree = &BTRFS_I(inode)->io_tree;
+ ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
+ cb->start,
+ cb->orig_bio->bi_io_vec,
+ cb->orig_bio->bi_vcnt,
+ cb->compressed_len);
+csum_failed:
+ if (ret)
+ cb->errors = 1;
+
+ /* release the compressed pages */
+ index = 0;
+ for (index = 0; index < cb->nr_pages; index++) {
+ page = cb->compressed_pages[index];
+ page->mapping = NULL;
+ page_cache_release(page);
+ }
+
+ /* do io completion on the original bio */
+ if (cb->errors) {
+ bio_io_error(cb->orig_bio);
+ } else {
+ int bio_index = 0;
+ struct bio_vec *bvec = cb->orig_bio->bi_io_vec;
+
+ /*
+ * we have verified the checksum already, set page
+ * checked so the end_io handlers know about it
+ */
+ while (bio_index < cb->orig_bio->bi_vcnt) {
+ SetPageChecked(bvec->bv_page);
+ bvec++;
+ bio_index++;
+ }
+ bio_endio(cb->orig_bio, 0);
+ }
+
+ /* finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+out:
+ bio_put(bio);
+}
+
+/*
+ * Clear the writeback bits on all of the file
+ * pages for a compressed write
+ */
+static noinline int end_compressed_writeback(struct inode *inode, u64 start,
+ unsigned long ram_size)
+{
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+ struct page *pages[16];
+ unsigned long nr_pages = end_index - index + 1;
+ int i;
+ int ret;
+
+ while (nr_pages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min_t(unsigned long,
+ nr_pages, ARRAY_SIZE(pages)), pages);
+ if (ret == 0) {
+ nr_pages -= 1;
+ index += 1;
+ continue;
+ }
+ for (i = 0; i < ret; i++) {
+ end_page_writeback(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ nr_pages -= ret;
+ index += ret;
+ }
+ /* the inode may be gone now */
+ return 0;
+}
+
+/*
+ * do the cleanup once all the compressed pages hit the disk.
+ * This will clear writeback on the file pages and free the compressed
+ * pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that
+ * metadata and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio, int err)
+{
+ struct extent_io_tree *tree;
+ struct compressed_bio *cb = bio->bi_private;
+ struct inode *inode;
+ struct page *page;
+ unsigned long index;
+
+ if (err)
+ cb->errors = 1;
+
+ /* if there are more bios still pending for this compressed
+ * extent, just exit
+ */
+ if (!atomic_dec_and_test(&cb->pending_bios))
+ goto out;
+
+ /* ok, we're the last bio for this extent, step one is to
+ * call back into the FS and do all the end_io operations
+ */
+ inode = cb->inode;
+ tree = &BTRFS_I(inode)->io_tree;
+ cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
+ tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
+ cb->start,
+ cb->start + cb->len - 1,
+ NULL, 1);
+ cb->compressed_pages[0]->mapping = NULL;
+
+ end_compressed_writeback(inode, cb->start, cb->len);
+ /* note, our inode could be gone now */
+
+ /*
+ * release the compressed pages, these came from alloc_page and
+ * are not attached to the inode at all
+ */
+ index = 0;
+ for (index = 0; index < cb->nr_pages; index++) {
+ page = cb->compressed_pages[index];
+ page->mapping = NULL;
+ page_cache_release(page);
+ }
+
+ /* finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+out:
+ bio_put(bio);
+}
+
+/*
+ * worker function to build and submit bios for previously compressed pages.
+ * The corresponding pages in the inode should be marked for writeback
+ * and the compressed pages should have a reference on them for dropping
+ * when the IO is complete.
+ *
+ * This also checksums the file bytes and gets things ready for
+ * the end io hooks.
+ */
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+ unsigned long len, u64 disk_start,
+ unsigned long compressed_len,
+ struct page **compressed_pages,
+ unsigned long nr_pages)
+{
+ struct bio *bio = NULL;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct compressed_bio *cb;
+ unsigned long bytes_left;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ int page_index = 0;
+ struct page *page;
+ u64 first_byte = disk_start;
+ struct block_device *bdev;
+ int ret;
+
+ WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+ cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+ atomic_set(&cb->pending_bios, 0);
+ cb->errors = 0;
+ cb->inode = inode;
+ cb->start = start;
+ cb->len = len;
+ cb->mirror_num = 0;
+ cb->compressed_pages = compressed_pages;
+ cb->compressed_len = compressed_len;
+ cb->orig_bio = NULL;
+ cb->nr_pages = nr_pages;
+
+ bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+ bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+ bio->bi_private = cb;
+ bio->bi_end_io = end_compressed_bio_write;
+ atomic_inc(&cb->pending_bios);
+
+ /* create and submit bios for the compressed pages */
+ bytes_left = compressed_len;
+ for (page_index = 0; page_index < cb->nr_pages; page_index++) {
+ page = compressed_pages[page_index];
+ page->mapping = inode->i_mapping;
+ if (bio->bi_size)
+ ret = io_tree->ops->merge_bio_hook(page, 0,
+ PAGE_CACHE_SIZE,
+ bio, 0);
+ else
+ ret = 0;
+
+ page->mapping = NULL;
+ if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
+ PAGE_CACHE_SIZE) {
+ bio_get(bio);
+
+ /*
+ * inc the count before we submit the bio so
+ * we know the end IO handler won't happen before
+ * we inc the count. Otherwise, the cb might get
+ * freed before we're done setting it up
+ */
+ atomic_inc(&cb->pending_bios);
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ BUG_ON(ret);
+
+ ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+ BUG_ON(ret);
+
+ ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+ BUG_ON(ret);
+
+ bio_put(bio);
+
+ bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+ bio->bi_private = cb;
+ bio->bi_end_io = end_compressed_bio_write;
+ bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ }
+ if (bytes_left < PAGE_CACHE_SIZE) {
+ printk("bytes left %lu compress len %lu nr %lu\n",
+ bytes_left, cb->compressed_len, cb->nr_pages);
+ }
+ bytes_left -= PAGE_CACHE_SIZE;
+ first_byte += PAGE_CACHE_SIZE;
+ cond_resched();
+ }
+ bio_get(bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ BUG_ON(ret);
+
+ ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+ BUG_ON(ret);
+
+ ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+ BUG_ON(ret);
+
+ bio_put(bio);
+ return 0;
+}
+
+static noinline int add_ra_bio_pages(struct inode *inode,
+ u64 compressed_end,
+ struct compressed_bio *cb)
+{
+ unsigned long end_index;
+ unsigned long page_index;
+ u64 last_offset;
+ u64 isize = i_size_read(inode);
+ int ret;
+ struct page *page;
+ unsigned long nr_pages = 0;
+ struct extent_map *em;
+ struct address_space *mapping = inode->i_mapping;
+ struct pagevec pvec;
+ struct extent_map_tree *em_tree;
+ struct extent_io_tree *tree;
+ u64 end;
+ int misses = 0;
+
+ page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
+ last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
+ em_tree = &BTRFS_I(inode)->extent_tree;
+ tree = &BTRFS_I(inode)->io_tree;
+
+ if (isize == 0)
+ return 0;
+
+ end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+
+ pagevec_init(&pvec, 0);
+ while (last_offset < compressed_end) {
+ page_index = last_offset >> PAGE_CACHE_SHIFT;
+
+ if (page_index > end_index)
+ break;
+
+ rcu_read_lock();
+ page = radix_tree_lookup(&mapping->page_tree, page_index);
+ rcu_read_unlock();
+ if (page) {
+ misses++;
+ if (misses > 4)
+ break;
+ goto next;
+ }
+
+ page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
+ if (!page)
+ break;
+
+ page->index = page_index;
+ /*
+ * what we want to do here is call add_to_page_cache_lru,
+ * but that isn't exported, so we reproduce it here
+ */
+ if (add_to_page_cache(page, mapping,
+ page->index, GFP_NOFS)) {
+ page_cache_release(page);
+ goto next;
+ }
+
+ /* open coding of lru_cache_add, also not exported */
+ page_cache_get(page);
+ if (!pagevec_add(&pvec, page))
+ __pagevec_lru_add_file(&pvec);
+
+ end = last_offset + PAGE_CACHE_SIZE - 1;
+ /*
+ * at this point, we have a locked page in the page cache
+ * for these bytes in the file. But, we have to make
+ * sure they map to this compressed extent on disk.
+ */
+ set_page_extent_mapped(page);
+ lock_extent(tree, last_offset, end, GFP_NOFS);
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, last_offset,
+ PAGE_CACHE_SIZE);
+ read_unlock(&em_tree->lock);
+
+ if (!em || last_offset < em->start ||
+ (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
+ (em->block_start >> 9) != cb->orig_bio->bi_sector) {
+ free_extent_map(em);
+ unlock_extent(tree, last_offset, end, GFP_NOFS);
+ unlock_page(page);
+ page_cache_release(page);
+ break;
+ }
+ free_extent_map(em);
+
+ if (page->index == end_index) {
+ char *userpage;
+ size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
+
+ if (zero_offset) {
+ int zeros;
+ zeros = PAGE_CACHE_SIZE - zero_offset;
+ userpage = kmap_atomic(page, KM_USER0);
+ memset(userpage + zero_offset, 0, zeros);
+ flush_dcache_page(page);
+ kunmap_atomic(userpage, KM_USER0);
+ }
+ }
+
+ ret = bio_add_page(cb->orig_bio, page,
+ PAGE_CACHE_SIZE, 0);
+
+ if (ret == PAGE_CACHE_SIZE) {
+ nr_pages++;
+ page_cache_release(page);
+ } else {
+ unlock_extent(tree, last_offset, end, GFP_NOFS);
+ unlock_page(page);
+ page_cache_release(page);
+ break;
+ }
+next:
+ last_offset += PAGE_CACHE_SIZE;
+ }
+ if (pagevec_count(&pvec))
+ __pagevec_lru_add_file(&pvec);
+ return 0;
+}
+
+/*
+ * for a compressed read, the bio we get passed has all the inode pages
+ * in it. We don't actually do IO on those pages but allocate new ones
+ * to hold the compressed pages on disk.
+ *
+ * bio->bi_sector points to the compressed extent on disk
+ * bio->bi_io_vec points to all of the inode pages
+ * bio->bi_vcnt is a count of pages
+ *
+ * After the compressed pages are read, we copy the bytes into the
+ * bio we were passed and then call the bio end_io calls
+ */
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
+{
+ struct extent_io_tree *tree;
+ struct extent_map_tree *em_tree;
+ struct compressed_bio *cb;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+ unsigned long compressed_len;
+ unsigned long nr_pages;
+ unsigned long page_index;
+ struct page *page;
+ struct block_device *bdev;
+ struct bio *comp_bio;
+ u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+ u64 em_len;
+ u64 em_start;
+ struct extent_map *em;
+ int ret;
+ u32 *sums;
+
+ tree = &BTRFS_I(inode)->io_tree;
+ em_tree = &BTRFS_I(inode)->extent_tree;
+
+ /* we need the actual starting offset of this extent in the file */
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree,
+ page_offset(bio->bi_io_vec->bv_page),
+ PAGE_CACHE_SIZE);
+ read_unlock(&em_tree->lock);
+
+ compressed_len = em->block_len;
+ cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+ atomic_set(&cb->pending_bios, 0);
+ cb->errors = 0;
+ cb->inode = inode;
+ cb->mirror_num = mirror_num;
+ sums = &cb->sums;
+
+ cb->start = em->orig_start;
+ em_len = em->len;
+ em_start = em->start;
+
+ free_extent_map(em);
+ em = NULL;
+
+ cb->len = uncompressed_len;
+ cb->compressed_len = compressed_len;
+ cb->orig_bio = bio;
+
+ nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
+ PAGE_CACHE_SIZE;
+ cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+ GFP_NOFS);
+ bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+ for (page_index = 0; page_index < nr_pages; page_index++) {
+ cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+ __GFP_HIGHMEM);
+ }
+ cb->nr_pages = nr_pages;
+
+ add_ra_bio_pages(inode, em_start + em_len, cb);
+
+ /* include any pages we added in add_ra-bio_pages */
+ uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+ cb->len = uncompressed_len;
+
+ comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+ comp_bio->bi_private = cb;
+ comp_bio->bi_end_io = end_compressed_bio_read;
+ atomic_inc(&cb->pending_bios);
+
+ for (page_index = 0; page_index < nr_pages; page_index++) {
+ page = cb->compressed_pages[page_index];
+ page->mapping = inode->i_mapping;
+ page->index = em_start >> PAGE_CACHE_SHIFT;
+
+ if (comp_bio->bi_size)
+ ret = tree->ops->merge_bio_hook(page, 0,
+ PAGE_CACHE_SIZE,
+ comp_bio, 0);
+ else
+ ret = 0;
+
+ page->mapping = NULL;
+ if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
+ PAGE_CACHE_SIZE) {
+ bio_get(comp_bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+ BUG_ON(ret);
+
+ /*
+ * inc the count before we submit the bio so
+ * we know the end IO handler won't happen before
+ * we inc the count. Otherwise, the cb might get
+ * freed before we're done setting it up
+ */
+ atomic_inc(&cb->pending_bios);
+
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ btrfs_lookup_bio_sums(root, inode, comp_bio,
+ sums);
+ }
+ sums += (comp_bio->bi_size + root->sectorsize - 1) /
+ root->sectorsize;
+
+ ret = btrfs_map_bio(root, READ, comp_bio,
+ mirror_num, 0);
+ BUG_ON(ret);
+
+ bio_put(comp_bio);
+
+ comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
+ GFP_NOFS);
+ comp_bio->bi_private = cb;
+ comp_bio->bi_end_io = end_compressed_bio_read;
+
+ bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
+ }
+ cur_disk_byte += PAGE_CACHE_SIZE;
+ }
+ bio_get(comp_bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+ BUG_ON(ret);
+
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
+ btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+
+ ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
+ BUG_ON(ret);
+
+ bio_put(comp_bio);
+ return 0;
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 00000000000..421f5b4aa71
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_COMPRESSION_
+#define __BTRFS_COMPRESSION_
+
+int btrfs_zlib_decompress(unsigned char *data_in,
+ struct page *dest_page,
+ unsigned long start_byte,
+ size_t srclen, size_t destlen);
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+ u64 start, unsigned long len,
+ struct page **pages,
+ unsigned long nr_dest_pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out,
+ unsigned long max_out);
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+ u64 disk_start,
+ struct bio_vec *bvec,
+ int vcnt,
+ size_t srclen);
+void btrfs_zlib_exit(void);
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+ unsigned long len, u64 disk_start,
+ unsigned long compressed_len,
+ struct page **compressed_pages,
+ unsigned long nr_pages);
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
+#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
new file mode 100644
index 00000000000..c4bc570a396
--- /dev/null
+++ b/fs/btrfs/ctree.c
@@ -0,0 +1,4388 @@
+/*
+ * Copyright (C) 2007,2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "locking.h"
+
+static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, int level);
+static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *ins_key,
+ struct btrfs_path *path, int data_size, int extend);
+static int push_node_left(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *dst,
+ struct extent_buffer *src, int empty);
+static int balance_node_right(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *dst_buf,
+ struct extent_buffer *src_buf);
+static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int level, int slot);
+static int setup_items_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr);
+
+
+struct btrfs_path *btrfs_alloc_path(void)
+{
+ struct btrfs_path *path;
+ path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
+ if (path)
+ path->reada = 1;
+ return path;
+}
+
+/*
+ * set all locked nodes in the path to blocking locks. This should
+ * be done before scheduling
+ */
+noinline void btrfs_set_path_blocking(struct btrfs_path *p)
+{
+ int i;
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ if (p->nodes[i] && p->locks[i])
+ btrfs_set_lock_blocking(p->nodes[i]);
+ }
+}
+
+/*
+ * reset all the locked nodes in the patch to spinning locks.
+ *
+ * held is used to keep lockdep happy, when lockdep is enabled
+ * we set held to a blocking lock before we go around and
+ * retake all the spinlocks in the path. You can safely use NULL
+ * for held
+ */
+noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
+ struct extent_buffer *held)
+{
+ int i;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /* lockdep really cares that we take all of these spinlocks
+ * in the right order. If any of the locks in the path are not
+ * currently blocking, it is going to complain. So, make really
+ * really sure by forcing the path to blocking before we clear
+ * the path blocking.
+ */
+ if (held)
+ btrfs_set_lock_blocking(held);
+ btrfs_set_path_blocking(p);
+#endif
+
+ for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
+ if (p->nodes[i] && p->locks[i])
+ btrfs_clear_lock_blocking(p->nodes[i]);
+ }
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ if (held)
+ btrfs_clear_lock_blocking(held);
+#endif
+}
+
+/* this also releases the path */
+void btrfs_free_path(struct btrfs_path *p)
+{
+ btrfs_release_path(NULL, p);
+ kmem_cache_free(btrfs_path_cachep, p);
+}
+
+/*
+ * path release drops references on the extent buffers in the path
+ * and it drops any locks held by this path
+ *
+ * It is safe to call this on paths that no locks or extent buffers held.
+ */
+noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ p->slots[i] = 0;
+ if (!p->nodes[i])
+ continue;
+ if (p->locks[i]) {
+ btrfs_tree_unlock(p->nodes[i]);
+ p->locks[i] = 0;
+ }
+ free_extent_buffer(p->nodes[i]);
+ p->nodes[i] = NULL;
+ }
+}
+
+/*
+ * safely gets a reference on the root node of a tree. A lock
+ * is not taken, so a concurrent writer may put a different node
+ * at the root of the tree. See btrfs_lock_root_node for the
+ * looping required.
+ *
+ * The extent buffer returned by this has a reference taken, so
+ * it won't disappear. It may stop being the root of the tree
+ * at any time because there are no locks held.
+ */
+struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+ spin_lock(&root->node_lock);
+ eb = root->node;
+ extent_buffer_get(eb);
+ spin_unlock(&root->node_lock);
+ return eb;
+}
+
+/* loop around taking references on and locking the root node of the
+ * tree until you end up with a lock on the root. A locked buffer
+ * is returned, with a reference held.
+ */
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+ btrfs_tree_lock(eb);
+
+ spin_lock(&root->node_lock);
+ if (eb == root->node) {
+ spin_unlock(&root->node_lock);
+ break;
+ }
+ spin_unlock(&root->node_lock);
+
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
+/* cowonly root (everything not a reference counted cow subvolume), just get
+ * put onto a simple dirty list. transaction.c walks this to make sure they
+ * get properly updated on disk.
+ */
+static void add_root_to_dirty_list(struct btrfs_root *root)
+{
+ if (root->track_dirty && list_empty(&root->dirty_list)) {
+ list_add(&root->dirty_list,
+ &root->fs_info->dirty_cowonly_roots);
+ }
+}
+
+/*
+ * used by snapshot creation to make a copy of a root for a tree with
+ * a given objectid. The buffer with the new root node is returned in
+ * cow_ret, and this func returns zero on success or a negative error code.
+ */
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer **cow_ret, u64 new_root_objectid)
+{
+ struct extent_buffer *cow;
+ u32 nritems;
+ int ret = 0;
+ int level;
+ struct btrfs_disk_key disk_key;
+
+ WARN_ON(root->ref_cows && trans->transid !=
+ root->fs_info->running_transaction->transid);
+ WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+
+ level = btrfs_header_level(buf);
+ nritems = btrfs_header_nritems(buf);
+ if (level == 0)
+ btrfs_item_key(buf, &disk_key, 0);
+ else
+ btrfs_node_key(buf, &disk_key, 0);
+
+ cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
+ new_root_objectid, &disk_key, level,
+ buf->start, 0);
+ if (IS_ERR(cow))
+ return PTR_ERR(cow);
+
+ copy_extent_buffer(cow, buf, 0, 0, cow->len);
+ btrfs_set_header_bytenr(cow, cow->start);
+ btrfs_set_header_generation(cow, trans->transid);
+ btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
+ btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
+ BTRFS_HEADER_FLAG_RELOC);
+ if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
+ else
+ btrfs_set_header_owner(cow, new_root_objectid);
+
+ write_extent_buffer(cow, root->fs_info->fsid,
+ (unsigned long)btrfs_header_fsid(cow),
+ BTRFS_FSID_SIZE);
+
+ WARN_ON(btrfs_header_generation(buf) > trans->transid);
+ if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ ret = btrfs_inc_ref(trans, root, cow, 1);
+ else
+ ret = btrfs_inc_ref(trans, root, cow, 0);
+
+ if (ret)
+ return ret;
+
+ btrfs_mark_buffer_dirty(cow);
+ *cow_ret = cow;
+ return 0;
+}
+
+/*
+ * check if the tree block can be shared by multiple trees
+ */
+int btrfs_block_can_be_shared(struct btrfs_root *root,
+ struct extent_buffer *buf)
+{
+ /*
+ * Tree blocks not in refernece counted trees and tree roots
+ * are never shared. If a block was allocated after the last
+ * snapshot and the block was not allocated by tree relocation,
+ * we know the block is not shared.
+ */
+ if (root->ref_cows &&
+ buf != root->node && buf != root->commit_root &&
+ (btrfs_header_generation(buf) <=
+ btrfs_root_last_snapshot(&root->root_item) ||
+ btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+ return 1;
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (root->ref_cows &&
+ btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+ return 1;
+#endif
+ return 0;
+}
+
+static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer *cow)
+{
+ u64 refs;
+ u64 owner;
+ u64 flags;
+ u64 new_flags = 0;
+ int ret;
+
+ /*
+ * Backrefs update rules:
+ *
+ * Always use full backrefs for extent pointers in tree block
+ * allocated by tree relocation.
+ *
+ * If a shared tree block is no longer referenced by its owner
+ * tree (btrfs_header_owner(buf) == root->root_key.objectid),
+ * use full backrefs for extent pointers in tree block.
+ *
+ * If a tree block is been relocating
+ * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
+ * use full backrefs for extent pointers in tree block.
+ * The reason for this is some operations (such as drop tree)
+ * are only allowed for blocks use full backrefs.
+ */
+
+ if (btrfs_block_can_be_shared(root, buf)) {
+ ret = btrfs_lookup_extent_info(trans, root, buf->start,
+ buf->len, &refs, &flags);
+ BUG_ON(ret);
+ BUG_ON(refs == 0);
+ } else {
+ refs = 1;
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+ flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ else
+ flags = 0;
+ }
+
+ owner = btrfs_header_owner(buf);
+ BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
+ !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+
+ if (refs > 1) {
+ if ((owner == root->root_key.objectid ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
+ !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
+ ret = btrfs_inc_ref(trans, root, buf, 1);
+ BUG_ON(ret);
+
+ if (root->root_key.objectid ==
+ BTRFS_TREE_RELOC_OBJECTID) {
+ ret = btrfs_dec_ref(trans, root, buf, 0);
+ BUG_ON(ret);
+ ret = btrfs_inc_ref(trans, root, cow, 1);
+ BUG_ON(ret);
+ }
+ new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ } else {
+
+ if (root->root_key.objectid ==
+ BTRFS_TREE_RELOC_OBJECTID)
+ ret = btrfs_inc_ref(trans, root, cow, 1);
+ else
+ ret = btrfs_inc_ref(trans, root, cow, 0);
+ BUG_ON(ret);
+ }
+ if (new_flags != 0) {
+ ret = btrfs_set_disk_extent_flags(trans, root,
+ buf->start,
+ buf->len,
+ new_flags, 0);
+ BUG_ON(ret);
+ }
+ } else {
+ if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+ if (root->root_key.objectid ==
+ BTRFS_TREE_RELOC_OBJECTID)
+ ret = btrfs_inc_ref(trans, root, cow, 1);
+ else
+ ret = btrfs_inc_ref(trans, root, cow, 0);
+ BUG_ON(ret);
+ ret = btrfs_dec_ref(trans, root, buf, 1);
+ BUG_ON(ret);
+ }
+ clean_tree_block(trans, root, buf);
+ }
+ return 0;
+}
+
+/*
+ * does the dirty work in cow of a single block. The parent block (if
+ * supplied) is updated to point to the new cow copy. The new buffer is marked
+ * dirty and returned locked. If you modify the block it needs to be marked
+ * dirty again.
+ *
+ * search_start -- an allocation hint for the new block
+ *
+ * empty_size -- a hint that you plan on doing more cow. This is the size in
+ * bytes the allocator should try to find free next to the block it returns.
+ * This is just a hint and may be ignored by the allocator.
+ */
+static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret,
+ u64 search_start, u64 empty_size)
+{
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *cow;
+ int level;
+ int unlock_orig = 0;
+ u64 parent_start;
+
+ if (*cow_ret == buf)
+ unlock_orig = 1;
+
+ btrfs_assert_tree_locked(buf);
+
+ WARN_ON(root->ref_cows && trans->transid !=
+ root->fs_info->running_transaction->transid);
+ WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+
+ level = btrfs_header_level(buf);
+
+ if (level == 0)
+ btrfs_item_key(buf, &disk_key, 0);
+ else
+ btrfs_node_key(buf, &disk_key, 0);
+
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (parent)
+ parent_start = parent->start;
+ else
+ parent_start = 0;
+ } else
+ parent_start = 0;
+
+ cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
+ root->root_key.objectid, &disk_key,
+ level, search_start, empty_size);
+ if (IS_ERR(cow))
+ return PTR_ERR(cow);
+
+ /* cow is set to blocking by btrfs_init_new_buffer */
+
+ copy_extent_buffer(cow, buf, 0, 0, cow->len);
+ btrfs_set_header_bytenr(cow, cow->start);
+ btrfs_set_header_generation(cow, trans->transid);
+ btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
+ btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
+ BTRFS_HEADER_FLAG_RELOC);
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
+ else
+ btrfs_set_header_owner(cow, root->root_key.objectid);
+
+ write_extent_buffer(cow, root->fs_info->fsid,
+ (unsigned long)btrfs_header_fsid(cow),
+ BTRFS_FSID_SIZE);
+
+ update_ref_for_cow(trans, root, buf, cow);
+
+ if (buf == root->node) {
+ WARN_ON(parent && parent != buf);
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+ parent_start = buf->start;
+ else
+ parent_start = 0;
+
+ spin_lock(&root->node_lock);
+ root->node = cow;
+ extent_buffer_get(cow);
+ spin_unlock(&root->node_lock);
+
+ btrfs_free_tree_block(trans, root, buf->start, buf->len,
+ parent_start, root->root_key.objectid, level);
+ free_extent_buffer(buf);
+ add_root_to_dirty_list(root);
+ } else {
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ parent_start = parent->start;
+ else
+ parent_start = 0;
+
+ WARN_ON(trans->transid != btrfs_header_generation(parent));
+ btrfs_set_node_blockptr(parent, parent_slot,
+ cow->start);
+ btrfs_set_node_ptr_generation(parent, parent_slot,
+ trans->transid);
+ btrfs_mark_buffer_dirty(parent);
+ btrfs_free_tree_block(trans, root, buf->start, buf->len,
+ parent_start, root->root_key.objectid, level);
+ }
+ if (unlock_orig)
+ btrfs_tree_unlock(buf);
+ free_extent_buffer(buf);
+ btrfs_mark_buffer_dirty(cow);
+ *cow_ret = cow;
+ return 0;
+}
+
+static inline int should_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf)
+{
+ if (btrfs_header_generation(buf) == trans->transid &&
+ !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
+ !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+ return 0;
+ return 1;
+}
+
+/*
+ * cows a single block, see __btrfs_cow_block for the real work.
+ * This version of it has extra checks so that a block isn't cow'd more than
+ * once per transaction, as long as it hasn't been written yet
+ */
+noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret)
+{
+ u64 search_start;
+ int ret;
+
+ if (trans->transaction != root->fs_info->running_transaction) {
+ printk(KERN_CRIT "trans %llu running %llu\n",
+ (unsigned long long)trans->transid,
+ (unsigned long long)
+ root->fs_info->running_transaction->transid);
+ WARN_ON(1);
+ }
+ if (trans->transid != root->fs_info->generation) {
+ printk(KERN_CRIT "trans %llu running %llu\n",
+ (unsigned long long)trans->transid,
+ (unsigned long long)root->fs_info->generation);
+ WARN_ON(1);
+ }
+
+ if (!should_cow_block(trans, root, buf)) {
+ *cow_ret = buf;
+ return 0;
+ }
+
+ search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
+
+ if (parent)
+ btrfs_set_lock_blocking(parent);
+ btrfs_set_lock_blocking(buf);
+
+ ret = __btrfs_cow_block(trans, root, buf, parent,
+ parent_slot, cow_ret, search_start, 0);
+ return ret;
+}
+
+/*
+ * helper function for defrag to decide if two blocks pointed to by a
+ * node are actually close by
+ */
+static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
+{
+ if (blocknr < other && other - (blocknr + blocksize) < 32768)
+ return 1;
+ if (blocknr > other && blocknr - (other + blocksize) < 32768)
+ return 1;
+ return 0;
+}
+
+/*
+ * compare two keys in a memcmp fashion
+ */
+static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
+{
+ struct btrfs_key k1;
+
+ btrfs_disk_key_to_cpu(&k1, disk);
+
+ return btrfs_comp_cpu_keys(&k1, k2);
+}
+
+/*
+ * same as comp_keys only with two btrfs_key's
+ */
+int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
+{
+ if (k1->objectid > k2->objectid)
+ return 1;
+ if (k1->objectid < k2->objectid)
+ return -1;
+ if (k1->type > k2->type)
+ return 1;
+ if (k1->type < k2->type)
+ return -1;
+ if (k1->offset > k2->offset)
+ return 1;
+ if (k1->offset < k2->offset)
+ return -1;
+ return 0;
+}
+
+/*
+ * this is used by the defrag code to go through all the
+ * leaves pointed to by a node and reallocate them so that
+ * disk order is close to key order
+ */
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *parent,
+ int start_slot, int cache_only, u64 *last_ret,
+ struct btrfs_key *progress)
+{
+ struct extent_buffer *cur;
+ u64 blocknr;
+ u64 gen;
+ u64 search_start = *last_ret;
+ u64 last_block = 0;
+ u64 other;
+ u32 parent_nritems;
+ int end_slot;
+ int i;
+ int err = 0;
+ int parent_level;
+ int uptodate;
+ u32 blocksize;
+ int progress_passed = 0;
+ struct btrfs_disk_key disk_key;
+
+ parent_level = btrfs_header_level(parent);
+ if (cache_only && parent_level != 1)
+ return 0;
+
+ if (trans->transaction != root->fs_info->running_transaction)
+ WARN_ON(1);
+ if (trans->transid != root->fs_info->generation)
+ WARN_ON(1);
+
+ parent_nritems = btrfs_header_nritems(parent);
+ blocksize = btrfs_level_size(root, parent_level - 1);
+ end_slot = parent_nritems;
+
+ if (parent_nritems == 1)
+ return 0;
+
+ btrfs_set_lock_blocking(parent);
+
+ for (i = start_slot; i < end_slot; i++) {
+ int close = 1;
+
+ if (!parent->map_token) {
+ map_extent_buffer(parent,
+ btrfs_node_key_ptr_offset(i),
+ sizeof(struct btrfs_key_ptr),
+ &parent->map_token, &parent->kaddr,
+ &parent->map_start, &parent->map_len,
+ KM_USER1);
+ }
+ btrfs_node_key(parent, &disk_key, i);
+ if (!progress_passed && comp_keys(&disk_key, progress) < 0)
+ continue;
+
+ progress_passed = 1;
+ blocknr = btrfs_node_blockptr(parent, i);
+ gen = btrfs_node_ptr_generation(parent, i);
+ if (last_block == 0)
+ last_block = blocknr;
+
+ if (i > 0) {
+ other = btrfs_node_blockptr(parent, i - 1);
+ close = close_blocks(blocknr, other, blocksize);
+ }
+ if (!close && i < end_slot - 2) {
+ other = btrfs_node_blockptr(parent, i + 1);
+ close = close_blocks(blocknr, other, blocksize);
+ }
+ if (close) {
+ last_block = blocknr;
+ continue;
+ }
+ if (parent->map_token) {
+ unmap_extent_buffer(parent, parent->map_token,
+ KM_USER1);
+ parent->map_token = NULL;
+ }
+
+ cur = btrfs_find_tree_block(root, blocknr, blocksize);
+ if (cur)
+ uptodate = btrfs_buffer_uptodate(cur, gen);
+ else
+ uptodate = 0;
+ if (!cur || !uptodate) {
+ if (cache_only) {
+ free_extent_buffer(cur);
+ continue;
+ }
+ if (!cur) {
+ cur = read_tree_block(root, blocknr,
+ blocksize, gen);
+ } else if (!uptodate) {
+ btrfs_read_buffer(cur, gen);
+ }
+ }
+ if (search_start == 0)
+ search_start = last_block;
+
+ btrfs_tree_lock(cur);
+ btrfs_set_lock_blocking(cur);
+ err = __btrfs_cow_block(trans, root, cur, parent, i,
+ &cur, search_start,
+ min(16 * blocksize,
+ (end_slot - i) * blocksize));
+ if (err) {
+ btrfs_tree_unlock(cur);
+ free_extent_buffer(cur);
+ break;
+ }
+ search_start = cur->start;
+ last_block = cur->start;
+ *last_ret = search_start;
+ btrfs_tree_unlock(cur);
+ free_extent_buffer(cur);
+ }
+ if (parent->map_token) {
+ unmap_extent_buffer(parent, parent->map_token,
+ KM_USER1);
+ parent->map_token = NULL;
+ }
+ return err;
+}
+
+/*
+ * The leaf data grows from end-to-front in the node.
+ * this returns the address of the start of the last item,
+ * which is the stop of the leaf data stack
+ */
+static inline unsigned int leaf_data_end(struct btrfs_root *root,
+ struct extent_buffer *leaf)
+{
+ u32 nr = btrfs_header_nritems(leaf);
+ if (nr == 0)
+ return BTRFS_LEAF_DATA_SIZE(root);
+ return btrfs_item_offset_nr(leaf, nr - 1);
+}
+
+/*
+ * extra debugging checks to make sure all the items in a key are
+ * well formed and in the proper order
+ */
+static int check_node(struct btrfs_root *root, struct btrfs_path *path,
+ int level)
+{
+ struct extent_buffer *parent = NULL;
+ struct extent_buffer *node = path->nodes[level];
+ struct btrfs_disk_key parent_key;
+ struct btrfs_disk_key node_key;
+ int parent_slot;
+ int slot;
+ struct btrfs_key cpukey;
+ u32 nritems = btrfs_header_nritems(node);
+
+ if (path->nodes[level + 1])
+ parent = path->nodes[level + 1];
+
+ slot = path->slots[level];
+ BUG_ON(nritems == 0);
+ if (parent) {
+ parent_slot = path->slots[level + 1];
+ btrfs_node_key(parent, &parent_key, parent_slot);
+ btrfs_node_key(node, &node_key, 0);
+ BUG_ON(memcmp(&parent_key, &node_key,
+ sizeof(struct btrfs_disk_key)));
+ BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
+ btrfs_header_bytenr(node));
+ }
+ BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
+ if (slot != 0) {
+ btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
+ btrfs_node_key(node, &node_key, slot);
+ BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
+ }
+ if (slot < nritems - 1) {
+ btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
+ btrfs_node_key(node, &node_key, slot);
+ BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
+ }
+ return 0;
+}
+
+/*
+ * extra checking to make sure all the items in a leaf are
+ * well formed and in the proper order
+ */
+static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
+ int level)
+{
+ struct extent_buffer *leaf = path->nodes[level];
+ struct extent_buffer *parent = NULL;
+ int parent_slot;
+ struct btrfs_key cpukey;
+ struct btrfs_disk_key parent_key;
+ struct btrfs_disk_key leaf_key;
+ int slot = path->slots[0];
+
+ u32 nritems = btrfs_header_nritems(leaf);
+
+ if (path->nodes[level + 1])
+ parent = path->nodes[level + 1];
+
+ if (nritems == 0)
+ return 0;
+
+ if (parent) {
+ parent_slot = path->slots[level + 1];
+ btrfs_node_key(parent, &parent_key, parent_slot);
+ btrfs_item_key(leaf, &leaf_key, 0);
+
+ BUG_ON(memcmp(&parent_key, &leaf_key,
+ sizeof(struct btrfs_disk_key)));
+ BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
+ btrfs_header_bytenr(leaf));
+ }
+ if (slot != 0 && slot < nritems - 1) {
+ btrfs_item_key(leaf, &leaf_key, slot);
+ btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
+ if (comp_keys(&leaf_key, &cpukey) <= 0) {
+ btrfs_print_leaf(root, leaf);
+ printk(KERN_CRIT "slot %d offset bad key\n", slot);
+ BUG_ON(1);
+ }
+ if (btrfs_item_offset_nr(leaf, slot - 1) !=
+ btrfs_item_end_nr(leaf, slot)) {
+ btrfs_print_leaf(root, leaf);
+ printk(KERN_CRIT "slot %d offset bad\n", slot);
+ BUG_ON(1);
+ }
+ }
+ if (slot < nritems - 1) {
+ btrfs_item_key(leaf, &leaf_key, slot);
+ btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
+ BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
+ if (btrfs_item_offset_nr(leaf, slot) !=
+ btrfs_item_end_nr(leaf, slot + 1)) {
+ btrfs_print_leaf(root, leaf);
+ printk(KERN_CRIT "slot %d offset bad\n", slot);
+ BUG_ON(1);
+ }
+ }
+ BUG_ON(btrfs_item_offset_nr(leaf, 0) +
+ btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
+ return 0;
+}
+
+static noinline int check_block(struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ return 0;
+ if (level == 0)
+ return check_leaf(root, path, level);
+ return check_node(root, path, level);
+}
+
+/*
+ * search for key in the extent_buffer. The items start at offset p,
+ * and they are item_size apart. There are 'max' items in p.
+ *
+ * the slot in the array is returned via slot, and it points to
+ * the place where you would insert key if it is not found in
+ * the array.
+ *
+ * slot may point to max if the key is bigger than all of the keys
+ */
+static noinline int generic_bin_search(struct extent_buffer *eb,
+ unsigned long p,
+ int item_size, struct btrfs_key *key,
+ int max, int *slot)
+{
+ int low = 0;
+ int high = max;
+ int mid;
+ int ret;
+ struct btrfs_disk_key *tmp = NULL;
+ struct btrfs_disk_key unaligned;
+ unsigned long offset;
+ char *map_token = NULL;
+ char *kaddr = NULL;
+ unsigned long map_start = 0;
+ unsigned long map_len = 0;
+ int err;
+
+ while (low < high) {
+ mid = (low + high) / 2;
+ offset = p + mid * item_size;
+
+ if (!map_token || offset < map_start ||
+ (offset + sizeof(struct btrfs_disk_key)) >
+ map_start + map_len) {
+ if (map_token) {
+ unmap_extent_buffer(eb, map_token, KM_USER0);
+ map_token = NULL;
+ }
+
+ err = map_private_extent_buffer(eb, offset,
+ sizeof(struct btrfs_disk_key),
+ &map_token, &kaddr,
+ &map_start, &map_len, KM_USER0);
+
+ if (!err) {
+ tmp = (struct btrfs_disk_key *)(kaddr + offset -
+ map_start);
+ } else {
+ read_extent_buffer(eb, &unaligned,
+ offset, sizeof(unaligned));
+ tmp = &unaligned;
+ }
+
+ } else {
+ tmp = (struct btrfs_disk_key *)(kaddr + offset -
+ map_start);
+ }
+ ret = comp_keys(tmp, key);
+
+ if (ret < 0)
+ low = mid + 1;
+ else if (ret > 0)
+ high = mid;
+ else {
+ *slot = mid;
+ if (map_token)
+ unmap_extent_buffer(eb, map_token, KM_USER0);
+ return 0;
+ }
+ }
+ *slot = low;
+ if (map_token)
+ unmap_extent_buffer(eb, map_token, KM_USER0);
+ return 1;
+}
+
+/*
+ * simple bin_search frontend that does the right thing for
+ * leaves vs nodes
+ */
+static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
+ int level, int *slot)
+{
+ if (level == 0) {
+ return generic_bin_search(eb,
+ offsetof(struct btrfs_leaf, items),
+ sizeof(struct btrfs_item),
+ key, btrfs_header_nritems(eb),
+ slot);
+ } else {
+ return generic_bin_search(eb,
+ offsetof(struct btrfs_node, ptrs),
+ sizeof(struct btrfs_key_ptr),
+ key, btrfs_header_nritems(eb),
+ slot);
+ }
+ return -1;
+}
+
+int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
+ int level, int *slot)
+{
+ return bin_search(eb, key, level, slot);
+}
+
+/* given a node and slot number, this reads the blocks it points to. The
+ * extent buffer is returned with a reference taken (but unlocked).
+ * NULL is returned on error.
+ */
+static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
+ struct extent_buffer *parent, int slot)
+{
+ int level = btrfs_header_level(parent);
+ if (slot < 0)
+ return NULL;
+ if (slot >= btrfs_header_nritems(parent))
+ return NULL;
+
+ BUG_ON(level == 0);
+
+ return read_tree_block(root, btrfs_node_blockptr(parent, slot),
+ btrfs_level_size(root, level - 1),
+ btrfs_node_ptr_generation(parent, slot));
+}
+
+/*
+ * node level balancing, used to make sure nodes are in proper order for
+ * item deletion. We balance from the top down, so we have to make sure
+ * that a deletion won't leave an node completely empty later on.
+ */
+static noinline int balance_level(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ struct extent_buffer *right = NULL;
+ struct extent_buffer *mid;
+ struct extent_buffer *left = NULL;
+ struct extent_buffer *parent = NULL;
+ int ret = 0;
+ int wret;
+ int pslot;
+ int orig_slot = path->slots[level];
+ int err_on_enospc = 0;
+ u64 orig_ptr;
+
+ if (level == 0)
+ return 0;
+
+ mid = path->nodes[level];
+
+ WARN_ON(!path->locks[level]);
+ WARN_ON(btrfs_header_generation(mid) != trans->transid);
+
+ orig_ptr = btrfs_node_blockptr(mid, orig_slot);
+
+ if (level < BTRFS_MAX_LEVEL - 1)
+ parent = path->nodes[level + 1];
+ pslot = path->slots[level + 1];
+
+ /*
+ * deal with the case where there is only one pointer in the root
+ * by promoting the node below to a root
+ */
+ if (!parent) {
+ struct extent_buffer *child;
+
+ if (btrfs_header_nritems(mid) != 1)
+ return 0;
+
+ /* promote the child to a root */
+ child = read_node_slot(root, mid, 0);
+ BUG_ON(!child);
+ btrfs_tree_lock(child);
+ btrfs_set_lock_blocking(child);
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
+ BUG_ON(ret);
+
+ spin_lock(&root->node_lock);
+ root->node = child;
+ spin_unlock(&root->node_lock);
+
+ add_root_to_dirty_list(root);
+ btrfs_tree_unlock(child);
+
+ path->locks[level] = 0;
+ path->nodes[level] = NULL;
+ clean_tree_block(trans, root, mid);
+ btrfs_tree_unlock(mid);
+ /* once for the path */
+ free_extent_buffer(mid);
+ ret = btrfs_free_tree_block(trans, root, mid->start, mid->len,
+ 0, root->root_key.objectid, level);
+ /* once for the root ptr */
+ free_extent_buffer(mid);
+ return ret;
+ }
+ if (btrfs_header_nritems(mid) >
+ BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
+ return 0;
+
+ if (btrfs_header_nritems(mid) < 2)
+ err_on_enospc = 1;
+
+ left = read_node_slot(root, parent, pslot - 1);
+ if (left) {
+ btrfs_tree_lock(left);
+ btrfs_set_lock_blocking(left);
+ wret = btrfs_cow_block(trans, root, left,
+ parent, pslot - 1, &left);
+ if (wret) {
+ ret = wret;
+ goto enospc;
+ }
+ }
+ right = read_node_slot(root, parent, pslot + 1);
+ if (right) {
+ btrfs_tree_lock(right);
+ btrfs_set_lock_blocking(right);
+ wret = btrfs_cow_block(trans, root, right,
+ parent, pslot + 1, &right);
+ if (wret) {
+ ret = wret;
+ goto enospc;
+ }
+ }
+
+ /* first, try to make some room in the middle buffer */
+ if (left) {
+ orig_slot += btrfs_header_nritems(left);
+ wret = push_node_left(trans, root, left, mid, 1);
+ if (wret < 0)
+ ret = wret;
+ if (btrfs_header_nritems(mid) < 2)
+ err_on_enospc = 1;
+ }
+
+ /*
+ * then try to empty the right most buffer into the middle
+ */
+ if (right) {
+ wret = push_node_left(trans, root, mid, right, 1);
+ if (wret < 0 && wret != -ENOSPC)
+ ret = wret;
+ if (btrfs_header_nritems(right) == 0) {
+ u64 bytenr = right->start;
+ u32 blocksize = right->len;
+
+ clean_tree_block(trans, root, right);
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ right = NULL;
+ wret = del_ptr(trans, root, path, level + 1, pslot +
+ 1);
+ if (wret)
+ ret = wret;
+ wret = btrfs_free_tree_block(trans, root,
+ bytenr, blocksize, 0,
+ root->root_key.objectid,
+ level);
+ if (wret)
+ ret = wret;
+ } else {
+ struct btrfs_disk_key right_key;
+ btrfs_node_key(right, &right_key, 0);
+ btrfs_set_node_key(parent, &right_key, pslot + 1);
+ btrfs_mark_buffer_dirty(parent);
+ }
+ }
+ if (btrfs_header_nritems(mid) == 1) {
+ /*
+ * we're not allowed to leave a node with one item in the
+ * tree during a delete. A deletion from lower in the tree
+ * could try to delete the only pointer in this node.
+ * So, pull some keys from the left.
+ * There has to be a left pointer at this point because
+ * otherwise we would have pulled some pointers from the
+ * right
+ */
+ BUG_ON(!left);
+ wret = balance_node_right(trans, root, mid, left);
+ if (wret < 0) {
+ ret = wret;
+ goto enospc;
+ }
+ if (wret == 1) {
+ wret = push_node_left(trans, root, left, mid, 1);
+ if (wret < 0)
+ ret = wret;
+ }
+ BUG_ON(wret == 1);
+ }
+ if (btrfs_header_nritems(mid) == 0) {
+ /* we've managed to empty the middle node, drop it */
+ u64 bytenr = mid->start;
+ u32 blocksize = mid->len;
+
+ clean_tree_block(trans, root, mid);
+ btrfs_tree_unlock(mid);
+ free_extent_buffer(mid);
+ mid = NULL;
+ wret = del_ptr(trans, root, path, level + 1, pslot);
+ if (wret)
+ ret = wret;
+ wret = btrfs_free_tree_block(trans, root, bytenr, blocksize,
+ 0, root->root_key.objectid, level);
+ if (wret)
+ ret = wret;
+ } else {
+ /* update the parent key to reflect our changes */
+ struct btrfs_disk_key mid_key;
+ btrfs_node_key(mid, &mid_key, 0);
+ btrfs_set_node_key(parent, &mid_key, pslot);
+ btrfs_mark_buffer_dirty(parent);
+ }
+
+ /* update the path */
+ if (left) {
+ if (btrfs_header_nritems(left) > orig_slot) {
+ extent_buffer_get(left);
+ /* left was locked after cow */
+ path->nodes[level] = left;
+ path->slots[level + 1] -= 1;
+ path->slots[level] = orig_slot;
+ if (mid) {
+ btrfs_tree_unlock(mid);
+ free_extent_buffer(mid);
+ }
+ } else {
+ orig_slot -= btrfs_header_nritems(left);
+ path->slots[level] = orig_slot;
+ }
+ }
+ /* double check we haven't messed things up */
+ check_block(root, path, level);
+ if (orig_ptr !=
+ btrfs_node_blockptr(path->nodes[level], path->slots[level]))
+ BUG();
+enospc:
+ if (right) {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+ if (left) {
+ if (path->nodes[level] != left)
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ }
+ return ret;
+}
+
+/* Node balancing for insertion. Here we only split or push nodes around
+ * when they are completely full. This is also done top down, so we
+ * have to be pessimistic.
+ */
+static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ struct extent_buffer *right = NULL;
+ struct extent_buffer *mid;
+ struct extent_buffer *left = NULL;
+ struct extent_buffer *parent = NULL;
+ int ret = 0;
+ int wret;
+ int pslot;
+ int orig_slot = path->slots[level];
+ u64 orig_ptr;
+
+ if (level == 0)
+ return 1;
+
+ mid = path->nodes[level];
+ WARN_ON(btrfs_header_generation(mid) != trans->transid);
+ orig_ptr = btrfs_node_blockptr(mid, orig_slot);
+
+ if (level < BTRFS_MAX_LEVEL - 1)
+ parent = path->nodes[level + 1];
+ pslot = path->slots[level + 1];
+
+ if (!parent)
+ return 1;
+
+ left = read_node_slot(root, parent, pslot - 1);
+
+ /* first, try to make some room in the middle buffer */
+ if (left) {
+ u32 left_nr;
+
+ btrfs_tree_lock(left);
+ btrfs_set_lock_blocking(left);
+
+ left_nr = btrfs_header_nritems(left);
+ if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
+ wret = 1;
+ } else {
+ ret = btrfs_cow_block(trans, root, left, parent,
+ pslot - 1, &left);
+ if (ret)
+ wret = 1;
+ else {
+ wret = push_node_left(trans, root,
+ left, mid, 0);
+ }
+ }
+ if (wret < 0)
+ ret = wret;
+ if (wret == 0) {
+ struct btrfs_disk_key disk_key;
+ orig_slot += left_nr;
+ btrfs_node_key(mid, &disk_key, 0);
+ btrfs_set_node_key(parent, &disk_key, pslot);
+ btrfs_mark_buffer_dirty(parent);
+ if (btrfs_header_nritems(left) > orig_slot) {
+ path->nodes[level] = left;
+ path->slots[level + 1] -= 1;
+ path->slots[level] = orig_slot;
+ btrfs_tree_unlock(mid);
+ free_extent_buffer(mid);
+ } else {
+ orig_slot -=
+ btrfs_header_nritems(left);
+ path->slots[level] = orig_slot;
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ }
+ return 0;
+ }
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ }
+ right = read_node_slot(root, parent, pslot + 1);
+
+ /*
+ * then try to empty the right most buffer into the middle
+ */
+ if (right) {
+ u32 right_nr;
+
+ btrfs_tree_lock(right);
+ btrfs_set_lock_blocking(right);
+
+ right_nr = btrfs_header_nritems(right);
+ if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
+ wret = 1;
+ } else {
+ ret = btrfs_cow_block(trans, root, right,
+ parent, pslot + 1,
+ &right);
+ if (ret)
+ wret = 1;
+ else {
+ wret = balance_node_right(trans, root,
+ right, mid);
+ }
+ }
+ if (wret < 0)
+ ret = wret;
+ if (wret == 0) {
+ struct btrfs_disk_key disk_key;
+
+ btrfs_node_key(right, &disk_key, 0);
+ btrfs_set_node_key(parent, &disk_key, pslot + 1);
+ btrfs_mark_buffer_dirty(parent);
+
+ if (btrfs_header_nritems(mid) <= orig_slot) {
+ path->nodes[level] = right;
+ path->slots[level + 1] += 1;
+ path->slots[level] = orig_slot -
+ btrfs_header_nritems(mid);
+ btrfs_tree_unlock(mid);
+ free_extent_buffer(mid);
+ } else {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+ return 0;
+ }
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+ return 1;
+}
+
+/*
+ * readahead one full node of leaves, finding things that are close
+ * to the block in 'slot', and triggering ra on them.
+ */
+static void reada_for_search(struct btrfs_root *root,
+ struct btrfs_path *path,
+ int level, int slot, u64 objectid)
+{
+ struct extent_buffer *node;
+ struct btrfs_disk_key disk_key;
+ u32 nritems;
+ u64 search;
+ u64 target;
+ u64 nread = 0;
+ int direction = path->reada;
+ struct extent_buffer *eb;
+ u32 nr;
+ u32 blocksize;
+ u32 nscan = 0;
+
+ if (level != 1)
+ return;
+
+ if (!path->nodes[level])
+ return;
+
+ node = path->nodes[level];
+
+ search = btrfs_node_blockptr(node, slot);
+ blocksize = btrfs_level_size(root, level - 1);
+ eb = btrfs_find_tree_block(root, search, blocksize);
+ if (eb) {
+ free_extent_buffer(eb);
+ return;
+ }
+
+ target = search;
+
+ nritems = btrfs_header_nritems(node);
+ nr = slot;
+ while (1) {
+ if (direction < 0) {
+ if (nr == 0)
+ break;
+ nr--;
+ } else if (direction > 0) {
+ nr++;
+ if (nr >= nritems)
+ break;
+ }
+ if (path->reada < 0 && objectid) {
+ btrfs_node_key(node, &disk_key, nr);
+ if (btrfs_disk_key_objectid(&disk_key) != objectid)
+ break;
+ }
+ search = btrfs_node_blockptr(node, nr);
+ if ((search <= target && target - search <= 65536) ||
+ (search > target && search - target <= 65536)) {
+ readahead_tree_block(root, search, blocksize,
+ btrfs_node_ptr_generation(node, nr));
+ nread += blocksize;
+ }
+ nscan++;
+ if ((nread > 65536 || nscan > 32))
+ break;
+ }
+}
+
+/*
+ * returns -EAGAIN if it had to drop the path, or zero if everything was in
+ * cache
+ */
+static noinline int reada_for_balance(struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ int slot;
+ int nritems;
+ struct extent_buffer *parent;
+ struct extent_buffer *eb;
+ u64 gen;
+ u64 block1 = 0;
+ u64 block2 = 0;
+ int ret = 0;
+ int blocksize;
+
+ parent = path->nodes[level + 1];
+ if (!parent)
+ return 0;
+
+ nritems = btrfs_header_nritems(parent);
+ slot = path->slots[level + 1];
+ blocksize = btrfs_level_size(root, level);
+
+ if (slot > 0) {
+ block1 = btrfs_node_blockptr(parent, slot - 1);
+ gen = btrfs_node_ptr_generation(parent, slot - 1);
+ eb = btrfs_find_tree_block(root, block1, blocksize);
+ if (eb && btrfs_buffer_uptodate(eb, gen))
+ block1 = 0;
+ free_extent_buffer(eb);
+ }
+ if (slot + 1 < nritems) {
+ block2 = btrfs_node_blockptr(parent, slot + 1);
+ gen = btrfs_node_ptr_generation(parent, slot + 1);
+ eb = btrfs_find_tree_block(root, block2, blocksize);
+ if (eb && btrfs_buffer_uptodate(eb, gen))
+ block2 = 0;
+ free_extent_buffer(eb);
+ }
+ if (block1 || block2) {
+ ret = -EAGAIN;
+
+ /* release the whole path */
+ btrfs_release_path(root, path);
+
+ /* read the blocks */
+ if (block1)
+ readahead_tree_block(root, block1, blocksize, 0);
+ if (block2)
+ readahead_tree_block(root, block2, blocksize, 0);
+
+ if (block1) {
+ eb = read_tree_block(root, block1, blocksize, 0);
+ free_extent_buffer(eb);
+ }
+ if (block2) {
+ eb = read_tree_block(root, block2, blocksize, 0);
+ free_extent_buffer(eb);
+ }
+ }
+ return ret;
+}
+
+
+/*
+ * when we walk down the tree, it is usually safe to unlock the higher layers
+ * in the tree. The exceptions are when our path goes through slot 0, because
+ * operations on the tree might require changing key pointers higher up in the
+ * tree.
+ *
+ * callers might also have set path->keep_locks, which tells this code to keep
+ * the lock if the path points to the last slot in the block. This is part of
+ * walking through the tree, and selecting the next slot in the higher block.
+ *
+ * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
+ * if lowest_unlock is 1, level 0 won't be unlocked
+ */
+static noinline void unlock_up(struct btrfs_path *path, int level,
+ int lowest_unlock)
+{
+ int i;
+ int skip_level = level;
+ int no_skips = 0;
+ struct extent_buffer *t;
+
+ for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+ if (!path->nodes[i])
+ break;
+ if (!path->locks[i])
+ break;
+ if (!no_skips && path->slots[i] == 0) {
+ skip_level = i + 1;
+ continue;
+ }
+ if (!no_skips && path->keep_locks) {
+ u32 nritems;
+ t = path->nodes[i];
+ nritems = btrfs_header_nritems(t);
+ if (nritems < 1 || path->slots[i] >= nritems - 1) {
+ skip_level = i + 1;
+ continue;
+ }
+ }
+ if (skip_level < i && i >= lowest_unlock)
+ no_skips = 1;
+
+ t = path->nodes[i];
+ if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
+ btrfs_tree_unlock(t);
+ path->locks[i] = 0;
+ }
+ }
+}
+
+/*
+ * This releases any locks held in the path starting at level and
+ * going all the way up to the root.
+ *
+ * btrfs_search_slot will keep the lock held on higher nodes in a few
+ * corner cases, such as COW of the block at slot zero in the node. This
+ * ignores those rules, and it should only be called when there are no
+ * more updates to be done higher up in the tree.
+ */
+noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
+{
+ int i;
+
+ if (path->keep_locks)
+ return;
+
+ for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+ if (!path->nodes[i])
+ continue;
+ if (!path->locks[i])
+ continue;
+ btrfs_tree_unlock(path->nodes[i]);
+ path->locks[i] = 0;
+ }
+}
+
+/*
+ * helper function for btrfs_search_slot. The goal is to find a block
+ * in cache without setting the path to blocking. If we find the block
+ * we return zero and the path is unchanged.
+ *
+ * If we can't find the block, we set the path blocking and do some
+ * reada. -EAGAIN is returned and the search must be repeated.
+ */
+static int
+read_block_for_search(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *p,
+ struct extent_buffer **eb_ret, int level, int slot,
+ struct btrfs_key *key)
+{
+ u64 blocknr;
+ u64 gen;
+ u32 blocksize;
+ struct extent_buffer *b = *eb_ret;
+ struct extent_buffer *tmp;
+ int ret;
+
+ blocknr = btrfs_node_blockptr(b, slot);
+ gen = btrfs_node_ptr_generation(b, slot);
+ blocksize = btrfs_level_size(root, level - 1);
+
+ tmp = btrfs_find_tree_block(root, blocknr, blocksize);
+ if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+ /*
+ * we found an up to date block without sleeping, return
+ * right away
+ */
+ *eb_ret = tmp;
+ return 0;
+ }
+
+ /*
+ * reduce lock contention at high levels
+ * of the btree by dropping locks before
+ * we read. Don't release the lock on the current
+ * level because we need to walk this node to figure
+ * out which blocks to read.
+ */
+ btrfs_unlock_up_safe(p, level + 1);
+ btrfs_set_path_blocking(p);
+
+ if (tmp)
+ free_extent_buffer(tmp);
+ if (p->reada)
+ reada_for_search(root, p, level, slot, key->objectid);
+
+ btrfs_release_path(NULL, p);
+
+ ret = -EAGAIN;
+ tmp = read_tree_block(root, blocknr, blocksize, gen);
+ if (tmp) {
+ /*
+ * If the read above didn't mark this buffer up to date,
+ * it will never end up being up to date. Set ret to EIO now
+ * and give up so that our caller doesn't loop forever
+ * on our EAGAINs.
+ */
+ if (!btrfs_buffer_uptodate(tmp, 0))
+ ret = -EIO;
+ free_extent_buffer(tmp);
+ }
+ return ret;
+}
+
+/*
+ * helper function for btrfs_search_slot. This does all of the checks
+ * for node-level blocks and does any balancing required based on
+ * the ins_len.
+ *
+ * If no extra work was required, zero is returned. If we had to
+ * drop the path, -EAGAIN is returned and btrfs_search_slot must
+ * start over
+ */
+static int
+setup_nodes_for_search(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *p,
+ struct extent_buffer *b, int level, int ins_len)
+{
+ int ret;
+ if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
+ BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
+ int sret;
+
+ sret = reada_for_balance(root, p, level);
+ if (sret)
+ goto again;
+
+ btrfs_set_path_blocking(p);
+ sret = split_node(trans, root, p, level);
+ btrfs_clear_path_blocking(p, NULL);
+
+ BUG_ON(sret > 0);
+ if (sret) {
+ ret = sret;
+ goto done;
+ }
+ b = p->nodes[level];
+ } else if (ins_len < 0 && btrfs_header_nritems(b) <
+ BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
+ int sret;
+
+ sret = reada_for_balance(root, p, level);
+ if (sret)
+ goto again;
+
+ btrfs_set_path_blocking(p);
+ sret = balance_level(trans, root, p, level);
+ btrfs_clear_path_blocking(p, NULL);
+
+ if (sret) {
+ ret = sret;
+ goto done;
+ }
+ b = p->nodes[level];
+ if (!b) {
+ btrfs_release_path(NULL, p);
+ goto again;
+ }
+ BUG_ON(btrfs_header_nritems(b) == 1);
+ }
+ return 0;
+
+again:
+ ret = -EAGAIN;
+done:
+ return ret;
+}
+
+/*
+ * look for key in the tree. path is filled in with nodes along the way
+ * if key is found, we return zero and you can find the item in the leaf
+ * level of the path (level 0)
+ *
+ * If the key isn't found, the path points to the slot where it should
+ * be inserted, and 1 is returned. If there are other errors during the
+ * search a negative error number is returned.
+ *
+ * if ins_len > 0, nodes and leaves will be split as we walk down the
+ * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
+ * possible)
+ */
+int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, struct btrfs_path *p, int
+ ins_len, int cow)
+{
+ struct extent_buffer *b;
+ int slot;
+ int ret;
+ int err;
+ int level;
+ int lowest_unlock = 1;
+ u8 lowest_level = 0;
+
+ lowest_level = p->lowest_level;
+ WARN_ON(lowest_level && ins_len > 0);
+ WARN_ON(p->nodes[0] != NULL);
+
+ if (ins_len < 0)
+ lowest_unlock = 2;
+
+again:
+ if (p->search_commit_root) {
+ b = root->commit_root;
+ extent_buffer_get(b);
+ if (!p->skip_locking)
+ btrfs_tree_lock(b);
+ } else {
+ if (p->skip_locking)
+ b = btrfs_root_node(root);
+ else
+ b = btrfs_lock_root_node(root);
+ }
+
+ while (b) {
+ level = btrfs_header_level(b);
+
+ /*
+ * setup the path here so we can release it under lock
+ * contention with the cow code
+ */
+ p->nodes[level] = b;
+ if (!p->skip_locking)
+ p->locks[level] = 1;
+
+ if (cow) {
+ /*
+ * if we don't really need to cow this block
+ * then we don't want to set the path blocking,
+ * so we test it here
+ */
+ if (!should_cow_block(trans, root, b))
+ goto cow_done;
+
+ btrfs_set_path_blocking(p);
+
+ err = btrfs_cow_block(trans, root, b,
+ p->nodes[level + 1],
+ p->slots[level + 1], &b);
+ if (err) {
+ free_extent_buffer(b);
+ ret = err;
+ goto done;
+ }
+ }
+cow_done:
+ BUG_ON(!cow && ins_len);
+ if (level != btrfs_header_level(b))
+ WARN_ON(1);
+ level = btrfs_header_level(b);
+
+ p->nodes[level] = b;
+ if (!p->skip_locking)
+ p->locks[level] = 1;
+
+ btrfs_clear_path_blocking(p, NULL);
+
+ /*
+ * we have a lock on b and as long as we aren't changing
+ * the tree, there is no way to for the items in b to change.
+ * It is safe to drop the lock on our parent before we
+ * go through the expensive btree search on b.
+ *
+ * If cow is true, then we might be changing slot zero,
+ * which may require changing the parent. So, we can't
+ * drop the lock until after we know which slot we're
+ * operating on.
+ */
+ if (!cow)
+ btrfs_unlock_up_safe(p, level + 1);
+
+ ret = check_block(root, p, level);
+ if (ret) {
+ ret = -1;
+ goto done;
+ }
+
+ ret = bin_search(b, key, level, &slot);
+
+ if (level != 0) {
+ int dec = 0;
+ if (ret && slot > 0) {
+ dec = 1;
+ slot -= 1;
+ }
+ p->slots[level] = slot;
+ err = setup_nodes_for_search(trans, root, p, b, level,
+ ins_len);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+ b = p->nodes[level];
+ slot = p->slots[level];
+
+ unlock_up(p, level, lowest_unlock);
+
+ if (level == lowest_level) {
+ if (dec)
+ p->slots[level]++;
+ goto done;
+ }
+
+ err = read_block_for_search(trans, root, p,
+ &b, level, slot, key);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+
+ if (!p->skip_locking) {
+ btrfs_clear_path_blocking(p, NULL);
+ err = btrfs_try_spin_lock(b);
+
+ if (!err) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_lock(b);
+ btrfs_clear_path_blocking(p, b);
+ }
+ }
+ } else {
+ p->slots[level] = slot;
+ if (ins_len > 0 &&
+ btrfs_leaf_free_space(root, b) < ins_len) {
+ btrfs_set_path_blocking(p);
+ err = split_leaf(trans, root, key,
+ p, ins_len, ret == 0);
+ btrfs_clear_path_blocking(p, NULL);
+
+ BUG_ON(err > 0);
+ if (err) {
+ ret = err;
+ goto done;
+ }
+ }
+ if (!p->search_for_split)
+ unlock_up(p, level, lowest_unlock);
+ goto done;
+ }
+ }
+ ret = 1;
+done:
+ /*
+ * we don't really know what they plan on doing with the path
+ * from here on, so for now just mark it as blocking
+ */
+ if (!p->leave_spinning)
+ btrfs_set_path_blocking(p);
+ if (ret < 0)
+ btrfs_release_path(root, p);
+ return ret;
+}
+
+/*
+ * adjust the pointers going up the tree, starting at level
+ * making sure the right key of each node is points to 'key'.
+ * This is used after shifting pointers to the left, so it stops
+ * fixing up pointers when a given leaf/node is not in slot 0 of the
+ * higher levels
+ *
+ * If this fails to write a tree block, it returns -1, but continues
+ * fixing up the blocks in ram so the tree is consistent.
+ */
+static int fixup_low_keys(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_disk_key *key, int level)
+{
+ int i;
+ int ret = 0;
+ struct extent_buffer *t;
+
+ for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+ int tslot = path->slots[i];
+ if (!path->nodes[i])
+ break;
+ t = path->nodes[i];
+ btrfs_set_node_key(t, key, tslot);
+ btrfs_mark_buffer_dirty(path->nodes[i]);
+ if (tslot != 0)
+ break;
+ }
+ return ret;
+}
+
+/*
+ * update item key.
+ *
+ * This function isn't completely safe. It's the caller's responsibility
+ * that the new key won't break the order
+ */
+int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *new_key)
+{
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *eb;
+ int slot;
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ if (slot > 0) {
+ btrfs_item_key(eb, &disk_key, slot - 1);
+ if (comp_keys(&disk_key, new_key) >= 0)
+ return -1;
+ }
+ if (slot < btrfs_header_nritems(eb) - 1) {
+ btrfs_item_key(eb, &disk_key, slot + 1);
+ if (comp_keys(&disk_key, new_key) <= 0)
+ return -1;
+ }
+
+ btrfs_cpu_key_to_disk(&disk_key, new_key);
+ btrfs_set_item_key(eb, &disk_key, slot);
+ btrfs_mark_buffer_dirty(eb);
+ if (slot == 0)
+ fixup_low_keys(trans, root, path, &disk_key, 1);
+ return 0;
+}
+
+/*
+ * try to push data from one node into the next node left in the
+ * tree.
+ *
+ * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
+ * error, and > 0 if there was no room in the left hand block.
+ */
+static int push_node_left(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *dst,
+ struct extent_buffer *src, int empty)
+{
+ int push_items = 0;
+ int src_nritems;
+ int dst_nritems;
+ int ret = 0;
+
+ src_nritems = btrfs_header_nritems(src);
+ dst_nritems = btrfs_header_nritems(dst);
+ push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
+ WARN_ON(btrfs_header_generation(src) != trans->transid);
+ WARN_ON(btrfs_header_generation(dst) != trans->transid);
+
+ if (!empty && src_nritems <= 8)
+ return 1;
+
+ if (push_items <= 0)
+ return 1;
+
+ if (empty) {
+ push_items = min(src_nritems, push_items);
+ if (push_items < src_nritems) {
+ /* leave at least 8 pointers in the node if
+ * we aren't going to empty it
+ */
+ if (src_nritems - push_items < 8) {
+ if (push_items <= 8)
+ return 1;
+ push_items -= 8;
+ }
+ }
+ } else
+ push_items = min(src_nritems - 8, push_items);
+
+ copy_extent_buffer(dst, src,
+ btrfs_node_key_ptr_offset(dst_nritems),
+ btrfs_node_key_ptr_offset(0),
+ push_items * sizeof(struct btrfs_key_ptr));
+
+ if (push_items < src_nritems) {
+ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
+ btrfs_node_key_ptr_offset(push_items),
+ (src_nritems - push_items) *
+ sizeof(struct btrfs_key_ptr));
+ }
+ btrfs_set_header_nritems(src, src_nritems - push_items);
+ btrfs_set_header_nritems(dst, dst_nritems + push_items);
+ btrfs_mark_buffer_dirty(src);
+ btrfs_mark_buffer_dirty(dst);
+
+ return ret;
+}
+
+/*
+ * try to push data from one node into the next node right in the
+ * tree.
+ *
+ * returns 0 if some ptrs were pushed, < 0 if there was some horrible
+ * error, and > 0 if there was no room in the right hand block.
+ *
+ * this will only push up to 1/2 the contents of the left node over
+ */
+static int balance_node_right(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *dst,
+ struct extent_buffer *src)
+{
+ int push_items = 0;
+ int max_push;
+ int src_nritems;
+ int dst_nritems;
+ int ret = 0;
+
+ WARN_ON(btrfs_header_generation(src) != trans->transid);
+ WARN_ON(btrfs_header_generation(dst) != trans->transid);
+
+ src_nritems = btrfs_header_nritems(src);
+ dst_nritems = btrfs_header_nritems(dst);
+ push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
+ if (push_items <= 0)
+ return 1;
+
+ if (src_nritems < 4)
+ return 1;
+
+ max_push = src_nritems / 2 + 1;
+ /* don't try to empty the node */
+ if (max_push >= src_nritems)
+ return 1;
+
+ if (max_push < push_items)
+ push_items = max_push;
+
+ memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
+ btrfs_node_key_ptr_offset(0),
+ (dst_nritems) *
+ sizeof(struct btrfs_key_ptr));
+
+ copy_extent_buffer(dst, src,
+ btrfs_node_key_ptr_offset(0),
+ btrfs_node_key_ptr_offset(src_nritems - push_items),
+ push_items * sizeof(struct btrfs_key_ptr));
+
+ btrfs_set_header_nritems(src, src_nritems - push_items);
+ btrfs_set_header_nritems(dst, dst_nritems + push_items);
+
+ btrfs_mark_buffer_dirty(src);
+ btrfs_mark_buffer_dirty(dst);
+
+ return ret;
+}
+
+/*
+ * helper function to insert a new root level in the tree.
+ * A new node is allocated, and a single item is inserted to
+ * point to the existing root
+ *
+ * returns zero on success or < 0 on failure.
+ */
+static noinline int insert_new_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ u64 lower_gen;
+ struct extent_buffer *lower;
+ struct extent_buffer *c;
+ struct extent_buffer *old;
+ struct btrfs_disk_key lower_key;
+
+ BUG_ON(path->nodes[level]);
+ BUG_ON(path->nodes[level-1] != root->node);
+
+ lower = path->nodes[level-1];
+ if (level == 1)
+ btrfs_item_key(lower, &lower_key, 0);
+ else
+ btrfs_node_key(lower, &lower_key, 0);
+
+ c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
+ root->root_key.objectid, &lower_key,
+ level, root->node->start, 0);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+
+ memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
+ btrfs_set_header_nritems(c, 1);
+ btrfs_set_header_level(c, level);
+ btrfs_set_header_bytenr(c, c->start);
+ btrfs_set_header_generation(c, trans->transid);
+ btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(c, root->root_key.objectid);
+
+ write_extent_buffer(c, root->fs_info->fsid,
+ (unsigned long)btrfs_header_fsid(c),
+ BTRFS_FSID_SIZE);
+
+ write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
+ (unsigned long)btrfs_header_chunk_tree_uuid(c),
+ BTRFS_UUID_SIZE);
+
+ btrfs_set_node_key(c, &lower_key, 0);
+ btrfs_set_node_blockptr(c, 0, lower->start);
+ lower_gen = btrfs_header_generation(lower);
+ WARN_ON(lower_gen != trans->transid);
+
+ btrfs_set_node_ptr_generation(c, 0, lower_gen);
+
+ btrfs_mark_buffer_dirty(c);
+
+ spin_lock(&root->node_lock);
+ old = root->node;
+ root->node = c;
+ spin_unlock(&root->node_lock);
+
+ /* the super has an extra ref to root->node */
+ free_extent_buffer(old);
+
+ add_root_to_dirty_list(root);
+ extent_buffer_get(c);
+ path->nodes[level] = c;
+ path->locks[level] = 1;
+ path->slots[level] = 0;
+ return 0;
+}
+
+/*
+ * worker function to insert a single pointer in a node.
+ * the node should have enough room for the pointer already
+ *
+ * slot and level indicate where you want the key to go, and
+ * blocknr is the block the key points to.
+ *
+ * returns zero on success and < 0 on any error
+ */
+static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, struct btrfs_disk_key
+ *key, u64 bytenr, int slot, int level)
+{
+ struct extent_buffer *lower;
+ int nritems;
+
+ BUG_ON(!path->nodes[level]);
+ lower = path->nodes[level];
+ nritems = btrfs_header_nritems(lower);
+ BUG_ON(slot > nritems);
+ if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
+ BUG();
+ if (slot != nritems) {
+ memmove_extent_buffer(lower,
+ btrfs_node_key_ptr_offset(slot + 1),
+ btrfs_node_key_ptr_offset(slot),
+ (nritems - slot) * sizeof(struct btrfs_key_ptr));
+ }
+ btrfs_set_node_key(lower, key, slot);
+ btrfs_set_node_blockptr(lower, slot, bytenr);
+ WARN_ON(trans->transid == 0);
+ btrfs_set_node_ptr_generation(lower, slot, trans->transid);
+ btrfs_set_header_nritems(lower, nritems + 1);
+ btrfs_mark_buffer_dirty(lower);
+ return 0;
+}
+
+/*
+ * split the node at the specified level in path in two.
+ * The path is corrected to point to the appropriate node after the split
+ *
+ * Before splitting this tries to make some room in the node by pushing
+ * left and right, if either one works, it returns right away.
+ *
+ * returns 0 on success and < 0 on failure
+ */
+static noinline int split_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ struct extent_buffer *c;
+ struct extent_buffer *split;
+ struct btrfs_disk_key disk_key;
+ int mid;
+ int ret;
+ int wret;
+ u32 c_nritems;
+
+ c = path->nodes[level];
+ WARN_ON(btrfs_header_generation(c) != trans->transid);
+ if (c == root->node) {
+ /* trying to split the root, lets make a new one */
+ ret = insert_new_root(trans, root, path, level + 1);
+ if (ret)
+ return ret;
+ } else {
+ ret = push_nodes_for_insert(trans, root, path, level);
+ c = path->nodes[level];
+ if (!ret && btrfs_header_nritems(c) <
+ BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
+ return 0;
+ if (ret < 0)
+ return ret;
+ }
+
+ c_nritems = btrfs_header_nritems(c);
+ mid = (c_nritems + 1) / 2;
+ btrfs_node_key(c, &disk_key, mid);
+
+ split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
+ root->root_key.objectid,
+ &disk_key, level, c->start, 0);
+ if (IS_ERR(split))
+ return PTR_ERR(split);
+
+ memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
+ btrfs_set_header_level(split, btrfs_header_level(c));
+ btrfs_set_header_bytenr(split, split->start);
+ btrfs_set_header_generation(split, trans->transid);
+ btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(split, root->root_key.objectid);
+ write_extent_buffer(split, root->fs_info->fsid,
+ (unsigned long)btrfs_header_fsid(split),
+ BTRFS_FSID_SIZE);
+ write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
+ (unsigned long)btrfs_header_chunk_tree_uuid(split),
+ BTRFS_UUID_SIZE);
+
+
+ copy_extent_buffer(split, c,
+ btrfs_node_key_ptr_offset(0),
+ btrfs_node_key_ptr_offset(mid),
+ (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
+ btrfs_set_header_nritems(split, c_nritems - mid);
+ btrfs_set_header_nritems(c, mid);
+ ret = 0;
+
+ btrfs_mark_buffer_dirty(c);
+ btrfs_mark_buffer_dirty(split);
+
+ wret = insert_ptr(trans, root, path, &disk_key, split->start,
+ path->slots[level + 1] + 1,
+ level + 1);
+ if (wret)
+ ret = wret;
+
+ if (path->slots[level] >= mid) {
+ path->slots[level] -= mid;
+ btrfs_tree_unlock(c);
+ free_extent_buffer(c);
+ path->nodes[level] = split;
+ path->slots[level + 1] += 1;
+ } else {
+ btrfs_tree_unlock(split);
+ free_extent_buffer(split);
+ }
+ return ret;
+}
+
+/*
+ * how many bytes are required to store the items in a leaf. start
+ * and nr indicate which items in the leaf to check. This totals up the
+ * space used both by the item structs and the item data
+ */
+static int leaf_space_used(struct extent_buffer *l, int start, int nr)
+{
+ int data_len;
+ int nritems = btrfs_header_nritems(l);
+ int end = min(nritems, start + nr) - 1;
+
+ if (!nr)
+ return 0;
+ data_len = btrfs_item_end_nr(l, start);
+ data_len = data_len - btrfs_item_offset_nr(l, end);
+ data_len += sizeof(struct btrfs_item) * nr;
+ WARN_ON(data_len < 0);
+ return data_len;
+}
+
+/*
+ * The space between the end of the leaf items and
+ * the start of the leaf data. IOW, how much room
+ * the leaf has left for both items and data
+ */
+noinline int btrfs_leaf_free_space(struct btrfs_root *root,
+ struct extent_buffer *leaf)
+{
+ int nritems = btrfs_header_nritems(leaf);
+ int ret;
+ ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
+ if (ret < 0) {
+ printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
+ "used %d nritems %d\n",
+ ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
+ leaf_space_used(leaf, 0, nritems), nritems);
+ }
+ return ret;
+}
+
+static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ int data_size, int empty,
+ struct extent_buffer *right,
+ int free_space, u32 left_nritems)
+{
+ struct extent_buffer *left = path->nodes[0];
+ struct extent_buffer *upper = path->nodes[1];
+ struct btrfs_disk_key disk_key;
+ int slot;
+ u32 i;
+ int push_space = 0;
+ int push_items = 0;
+ struct btrfs_item *item;
+ u32 nr;
+ u32 right_nritems;
+ u32 data_end;
+ u32 this_item_size;
+
+ if (empty)
+ nr = 0;
+ else
+ nr = 1;
+
+ if (path->slots[0] >= left_nritems)
+ push_space += data_size;
+
+ slot = path->slots[1];
+ i = left_nritems - 1;
+ while (i >= nr) {
+ item = btrfs_item_nr(left, i);
+
+ if (!empty && push_items > 0) {
+ if (path->slots[0] > i)
+ break;
+ if (path->slots[0] == i) {
+ int space = btrfs_leaf_free_space(root, left);
+ if (space + push_space * 2 > free_space)
+ break;
+ }
+ }
+
+ if (path->slots[0] == i)
+ push_space += data_size;
+
+ if (!left->map_token) {
+ map_extent_buffer(left, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &left->map_token, &left->kaddr,
+ &left->map_start, &left->map_len,
+ KM_USER1);
+ }
+
+ this_item_size = btrfs_item_size(left, item);
+ if (this_item_size + sizeof(*item) + push_space > free_space)
+ break;
+
+ push_items++;
+ push_space += this_item_size + sizeof(*item);
+ if (i == 0)
+ break;
+ i--;
+ }
+ if (left->map_token) {
+ unmap_extent_buffer(left, left->map_token, KM_USER1);
+ left->map_token = NULL;
+ }
+
+ if (push_items == 0)
+ goto out_unlock;
+
+ if (!empty && push_items == left_nritems)
+ WARN_ON(1);
+
+ /* push left to right */
+ right_nritems = btrfs_header_nritems(right);
+
+ push_space = btrfs_item_end_nr(left, left_nritems - push_items);
+ push_space -= leaf_data_end(root, left);
+
+ /* make room in the right data area */
+ data_end = leaf_data_end(root, right);
+ memmove_extent_buffer(right,
+ btrfs_leaf_data(right) + data_end - push_space,
+ btrfs_leaf_data(right) + data_end,
+ BTRFS_LEAF_DATA_SIZE(root) - data_end);
+
+ /* copy from the left data area */
+ copy_extent_buffer(right, left, btrfs_leaf_data(right) +
+ BTRFS_LEAF_DATA_SIZE(root) - push_space,
+ btrfs_leaf_data(left) + leaf_data_end(root, left),
+ push_space);
+
+ memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
+ btrfs_item_nr_offset(0),
+ right_nritems * sizeof(struct btrfs_item));
+
+ /* copy the items from left to right */
+ copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
+ btrfs_item_nr_offset(left_nritems - push_items),
+ push_items * sizeof(struct btrfs_item));
+
+ /* update the item pointers */
+ right_nritems += push_items;
+ btrfs_set_header_nritems(right, right_nritems);
+ push_space = BTRFS_LEAF_DATA_SIZE(root);
+ for (i = 0; i < right_nritems; i++) {
+ item = btrfs_item_nr(right, i);
+ if (!right->map_token) {
+ map_extent_buffer(right, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &right->map_token, &right->kaddr,
+ &right->map_start, &right->map_len,
+ KM_USER1);
+ }
+ push_space -= btrfs_item_size(right, item);
+ btrfs_set_item_offset(right, item, push_space);
+ }
+
+ if (right->map_token) {
+ unmap_extent_buffer(right, right->map_token, KM_USER1);
+ right->map_token = NULL;
+ }
+ left_nritems -= push_items;
+ btrfs_set_header_nritems(left, left_nritems);
+
+ if (left_nritems)
+ btrfs_mark_buffer_dirty(left);
+ btrfs_mark_buffer_dirty(right);
+
+ btrfs_item_key(right, &disk_key, 0);
+ btrfs_set_node_key(upper, &disk_key, slot + 1);
+ btrfs_mark_buffer_dirty(upper);
+
+ /* then fixup the leaf pointer in the path */
+ if (path->slots[0] >= left_nritems) {
+ path->slots[0] -= left_nritems;
+ if (btrfs_header_nritems(path->nodes[0]) == 0)
+ clean_tree_block(trans, root, path->nodes[0]);
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[1] += 1;
+ } else {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+ return 0;
+
+out_unlock:
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return 1;
+}
+
+/*
+ * push some data in the path leaf to the right, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ *
+ * returns 1 if the push failed because the other node didn't have enough
+ * room, 0 if everything worked out and < 0 if there were major errors.
+ */
+static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, int data_size,
+ int empty)
+{
+ struct extent_buffer *left = path->nodes[0];
+ struct extent_buffer *right;
+ struct extent_buffer *upper;
+ int slot;
+ int free_space;
+ u32 left_nritems;
+ int ret;
+
+ if (!path->nodes[1])
+ return 1;
+
+ slot = path->slots[1];
+ upper = path->nodes[1];
+ if (slot >= btrfs_header_nritems(upper) - 1)
+ return 1;
+
+ btrfs_assert_tree_locked(path->nodes[1]);
+
+ right = read_node_slot(root, upper, slot + 1);
+ btrfs_tree_lock(right);
+ btrfs_set_lock_blocking(right);
+
+ free_space = btrfs_leaf_free_space(root, right);
+ if (free_space < data_size)
+ goto out_unlock;
+
+ /* cow and double check */
+ ret = btrfs_cow_block(trans, root, right, upper,
+ slot + 1, &right);
+ if (ret)
+ goto out_unlock;
+
+ free_space = btrfs_leaf_free_space(root, right);
+ if (free_space < data_size)
+ goto out_unlock;
+
+ left_nritems = btrfs_header_nritems(left);
+ if (left_nritems == 0)
+ goto out_unlock;
+
+ return __push_leaf_right(trans, root, path, data_size, empty,
+ right, free_space, left_nritems);
+out_unlock:
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return 1;
+}
+
+/*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ */
+static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int data_size,
+ int empty, struct extent_buffer *left,
+ int free_space, int right_nritems)
+{
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *right = path->nodes[0];
+ int slot;
+ int i;
+ int push_space = 0;
+ int push_items = 0;
+ struct btrfs_item *item;
+ u32 old_left_nritems;
+ u32 nr;
+ int ret = 0;
+ int wret;
+ u32 this_item_size;
+ u32 old_left_item_size;
+
+ slot = path->slots[1];
+
+ if (empty)
+ nr = right_nritems;
+ else
+ nr = right_nritems - 1;
+
+ for (i = 0; i < nr; i++) {
+ item = btrfs_item_nr(right, i);
+ if (!right->map_token) {
+ map_extent_buffer(right, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &right->map_token, &right->kaddr,
+ &right->map_start, &right->map_len,
+ KM_USER1);
+ }
+
+ if (!empty && push_items > 0) {
+ if (path->slots[0] < i)
+ break;
+ if (path->slots[0] == i) {
+ int space = btrfs_leaf_free_space(root, right);
+ if (space + push_space * 2 > free_space)
+ break;
+ }
+ }
+
+ if (path->slots[0] == i)
+ push_space += data_size;
+
+ this_item_size = btrfs_item_size(right, item);
+ if (this_item_size + sizeof(*item) + push_space > free_space)
+ break;
+
+ push_items++;
+ push_space += this_item_size + sizeof(*item);
+ }
+
+ if (right->map_token) {
+ unmap_extent_buffer(right, right->map_token, KM_USER1);
+ right->map_token = NULL;
+ }
+
+ if (push_items == 0) {
+ ret = 1;
+ goto out;
+ }
+ if (!empty && push_items == btrfs_header_nritems(right))
+ WARN_ON(1);
+
+ /* push data from right to left */
+ copy_extent_buffer(left, right,
+ btrfs_item_nr_offset(btrfs_header_nritems(left)),
+ btrfs_item_nr_offset(0),
+ push_items * sizeof(struct btrfs_item));
+
+ push_space = BTRFS_LEAF_DATA_SIZE(root) -
+ btrfs_item_offset_nr(right, push_items - 1);
+
+ copy_extent_buffer(left, right, btrfs_leaf_data(left) +
+ leaf_data_end(root, left) - push_space,
+ btrfs_leaf_data(right) +
+ btrfs_item_offset_nr(right, push_items - 1),
+ push_space);
+ old_left_nritems = btrfs_header_nritems(left);
+ BUG_ON(old_left_nritems <= 0);
+
+ old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
+ for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
+ u32 ioff;
+
+ item = btrfs_item_nr(left, i);
+ if (!left->map_token) {
+ map_extent_buffer(left, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &left->map_token, &left->kaddr,
+ &left->map_start, &left->map_len,
+ KM_USER1);
+ }
+
+ ioff = btrfs_item_offset(left, item);
+ btrfs_set_item_offset(left, item,
+ ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
+ }
+ btrfs_set_header_nritems(left, old_left_nritems + push_items);
+ if (left->map_token) {
+ unmap_extent_buffer(left, left->map_token, KM_USER1);
+ left->map_token = NULL;
+ }
+
+ /* fixup right node */
+ if (push_items > right_nritems) {
+ printk(KERN_CRIT "push items %d nr %u\n", push_items,
+ right_nritems);
+ WARN_ON(1);
+ }
+
+ if (push_items < right_nritems) {
+ push_space = btrfs_item_offset_nr(right, push_items - 1) -
+ leaf_data_end(root, right);
+ memmove_extent_buffer(right, btrfs_leaf_data(right) +
+ BTRFS_LEAF_DATA_SIZE(root) - push_space,
+ btrfs_leaf_data(right) +
+ leaf_data_end(root, right), push_space);
+
+ memmove_extent_buffer(right, btrfs_item_nr_offset(0),
+ btrfs_item_nr_offset(push_items),
+ (btrfs_header_nritems(right) - push_items) *
+ sizeof(struct btrfs_item));
+ }
+ right_nritems -= push_items;
+ btrfs_set_header_nritems(right, right_nritems);
+ push_space = BTRFS_LEAF_DATA_SIZE(root);
+ for (i = 0; i < right_nritems; i++) {
+ item = btrfs_item_nr(right, i);
+
+ if (!right->map_token) {
+ map_extent_buffer(right, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &right->map_token, &right->kaddr,
+ &right->map_start, &right->map_len,
+ KM_USER1);
+ }
+
+ push_space = push_space - btrfs_item_size(right, item);
+ btrfs_set_item_offset(right, item, push_space);
+ }
+ if (right->map_token) {
+ unmap_extent_buffer(right, right->map_token, KM_USER1);
+ right->map_token = NULL;
+ }
+
+ btrfs_mark_buffer_dirty(left);
+ if (right_nritems)
+ btrfs_mark_buffer_dirty(right);
+
+ btrfs_item_key(right, &disk_key, 0);
+ wret = fixup_low_keys(trans, root, path, &disk_key, 1);
+ if (wret)
+ ret = wret;
+
+ /* then fixup the leaf pointer in the path */
+ if (path->slots[0] < push_items) {
+ path->slots[0] += old_left_nritems;
+ if (btrfs_header_nritems(path->nodes[0]) == 0)
+ clean_tree_block(trans, root, path->nodes[0]);
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = left;
+ path->slots[1] -= 1;
+ } else {
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ path->slots[0] -= push_items;
+ }
+ BUG_ON(path->slots[0] < 0);
+ return ret;
+out:
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ return ret;
+}
+
+/*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ */
+static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, int data_size,
+ int empty)
+{
+ struct extent_buffer *right = path->nodes[0];
+ struct extent_buffer *left;
+ int slot;
+ int free_space;
+ u32 right_nritems;
+ int ret = 0;
+
+ slot = path->slots[1];
+ if (slot == 0)
+ return 1;
+ if (!path->nodes[1])
+ return 1;
+
+ right_nritems = btrfs_header_nritems(right);
+ if (right_nritems == 0)
+ return 1;
+
+ btrfs_assert_tree_locked(path->nodes[1]);
+
+ left = read_node_slot(root, path->nodes[1], slot - 1);
+ btrfs_tree_lock(left);
+ btrfs_set_lock_blocking(left);
+
+ free_space = btrfs_leaf_free_space(root, left);
+ if (free_space < data_size) {
+ ret = 1;
+ goto out;
+ }
+
+ /* cow and double check */
+ ret = btrfs_cow_block(trans, root, left,
+ path->nodes[1], slot - 1, &left);
+ if (ret) {
+ /* we hit -ENOSPC, but it isn't fatal here */
+ ret = 1;
+ goto out;
+ }
+
+ free_space = btrfs_leaf_free_space(root, left);
+ if (free_space < data_size) {
+ ret = 1;
+ goto out;
+ }
+
+ return __push_leaf_left(trans, root, path, data_size,
+ empty, left, free_space, right_nritems);
+out:
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ return ret;
+}
+
+/*
+ * split the path's leaf in two, making sure there is at least data_size
+ * available for the resulting leaf level of the path.
+ *
+ * returns 0 if all went well and < 0 on failure.
+ */
+static noinline int copy_for_split(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *l,
+ struct extent_buffer *right,
+ int slot, int mid, int nritems)
+{
+ int data_copy_size;
+ int rt_data_off;
+ int i;
+ int ret = 0;
+ int wret;
+ struct btrfs_disk_key disk_key;
+
+ nritems = nritems - mid;
+ btrfs_set_header_nritems(right, nritems);
+ data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
+
+ copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
+ btrfs_item_nr_offset(mid),
+ nritems * sizeof(struct btrfs_item));
+
+ copy_extent_buffer(right, l,
+ btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
+ data_copy_size, btrfs_leaf_data(l) +
+ leaf_data_end(root, l), data_copy_size);
+
+ rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
+ btrfs_item_end_nr(l, mid);
+
+ for (i = 0; i < nritems; i++) {
+ struct btrfs_item *item = btrfs_item_nr(right, i);
+ u32 ioff;
+
+ if (!right->map_token) {
+ map_extent_buffer(right, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &right->map_token, &right->kaddr,
+ &right->map_start, &right->map_len,
+ KM_USER1);
+ }
+
+ ioff = btrfs_item_offset(right, item);
+ btrfs_set_item_offset(right, item, ioff + rt_data_off);
+ }
+
+ if (right->map_token) {
+ unmap_extent_buffer(right, right->map_token, KM_USER1);
+ right->map_token = NULL;
+ }
+
+ btrfs_set_header_nritems(l, mid);
+ ret = 0;
+ btrfs_item_key(right, &disk_key, 0);
+ wret = insert_ptr(trans, root, path, &disk_key, right->start,
+ path->slots[1] + 1, 1);
+ if (wret)
+ ret = wret;
+
+ btrfs_mark_buffer_dirty(right);
+ btrfs_mark_buffer_dirty(l);
+ BUG_ON(path->slots[0] != slot);
+
+ if (mid <= slot) {
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[0] -= mid;
+ path->slots[1] += 1;
+ } else {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+
+ BUG_ON(path->slots[0] < 0);
+
+ return ret;
+}
+
+/*
+ * split the path's leaf in two, making sure there is at least data_size
+ * available for the resulting leaf level of the path.
+ *
+ * returns 0 if all went well and < 0 on failure.
+ */
+static noinline int split_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *ins_key,
+ struct btrfs_path *path, int data_size,
+ int extend)
+{
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *l;
+ u32 nritems;
+ int mid;
+ int slot;
+ struct extent_buffer *right;
+ int ret = 0;
+ int wret;
+ int split;
+ int num_doubles = 0;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (extend && data_size + btrfs_item_size_nr(l, slot) +
+ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
+ return -EOVERFLOW;
+
+ /* first try to make some room by pushing left and right */
+ if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
+ wret = push_leaf_right(trans, root, path, data_size, 0);
+ if (wret < 0)
+ return wret;
+ if (wret) {
+ wret = push_leaf_left(trans, root, path, data_size, 0);
+ if (wret < 0)
+ return wret;
+ }
+ l = path->nodes[0];
+
+ /* did the pushes work? */
+ if (btrfs_leaf_free_space(root, l) >= data_size)
+ return 0;
+ }
+
+ if (!path->nodes[1]) {
+ ret = insert_new_root(trans, root, path, 1);
+ if (ret)
+ return ret;
+ }
+again:
+ split = 1;
+ l = path->nodes[0];
+ slot = path->slots[0];
+ nritems = btrfs_header_nritems(l);
+ mid = (nritems + 1) / 2;
+
+ if (mid <= slot) {
+ if (nritems == 1 ||
+ leaf_space_used(l, mid, nritems - mid) + data_size >
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ if (slot >= nritems) {
+ split = 0;
+ } else {
+ mid = slot;
+ if (mid != nritems &&
+ leaf_space_used(l, mid, nritems - mid) +
+ data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+ split = 2;
+ }
+ }
+ }
+ } else {
+ if (leaf_space_used(l, 0, mid) + data_size >
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ if (!extend && data_size && slot == 0) {
+ split = 0;
+ } else if ((extend || !data_size) && slot == 0) {
+ mid = 1;
+ } else {
+ mid = slot;
+ if (mid != nritems &&
+ leaf_space_used(l, mid, nritems - mid) +
+ data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+ split = 2 ;
+ }
+ }
+ }
+ }
+
+ if (split == 0)
+ btrfs_cpu_key_to_disk(&disk_key, ins_key);
+ else
+ btrfs_item_key(l, &disk_key, mid);
+
+ right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
+ root->root_key.objectid,
+ &disk_key, 0, l->start, 0);
+ if (IS_ERR(right)) {
+ BUG_ON(1);
+ return PTR_ERR(right);
+ }
+
+ memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
+ btrfs_set_header_bytenr(right, right->start);
+ btrfs_set_header_generation(right, trans->transid);
+ btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(right, root->root_key.objectid);
+ btrfs_set_header_level(right, 0);
+ write_extent_buffer(right, root->fs_info->fsid,
+ (unsigned long)btrfs_header_fsid(right),
+ BTRFS_FSID_SIZE);
+
+ write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
+ (unsigned long)btrfs_header_chunk_tree_uuid(right),
+ BTRFS_UUID_SIZE);
+
+ if (split == 0) {
+ if (mid <= slot) {
+ btrfs_set_header_nritems(right, 0);
+ wret = insert_ptr(trans, root, path,
+ &disk_key, right->start,
+ path->slots[1] + 1, 1);
+ if (wret)
+ ret = wret;
+
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[0] = 0;
+ path->slots[1] += 1;
+ } else {
+ btrfs_set_header_nritems(right, 0);
+ wret = insert_ptr(trans, root, path,
+ &disk_key,
+ right->start,
+ path->slots[1], 1);
+ if (wret)
+ ret = wret;
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[0] = 0;
+ if (path->slots[1] == 0) {
+ wret = fixup_low_keys(trans, root,
+ path, &disk_key, 1);
+ if (wret)
+ ret = wret;
+ }
+ }
+ btrfs_mark_buffer_dirty(right);
+ return ret;
+ }
+
+ ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
+ BUG_ON(ret);
+
+ if (split == 2) {
+ BUG_ON(num_doubles != 0);
+ num_doubles++;
+ goto again;
+ }
+
+ return ret;
+}
+
+static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int ins_len)
+{
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ u64 extent_len = 0;
+ u32 item_size;
+ int ret;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
+ key.type != BTRFS_EXTENT_CSUM_KEY);
+
+ if (btrfs_leaf_free_space(root, leaf) >= ins_len)
+ return 0;
+
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ if (key.type == BTRFS_EXTENT_DATA_KEY) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_len = btrfs_file_extent_num_bytes(leaf, fi);
+ }
+ btrfs_release_path(root, path);
+
+ path->keep_locks = 1;
+ path->search_for_split = 1;
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ path->search_for_split = 0;
+ if (ret < 0)
+ goto err;
+
+ ret = -EAGAIN;
+ leaf = path->nodes[0];
+ /* if our item isn't there or got smaller, return now */
+ if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
+ goto err;
+
+ if (key.type == BTRFS_EXTENT_DATA_KEY) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
+ goto err;
+ }
+
+ btrfs_set_path_blocking(path);
+ ret = split_leaf(trans, root, &key, path, ins_len, 1);
+ BUG_ON(ret);
+
+ path->keep_locks = 0;
+ btrfs_unlock_up_safe(path, 1);
+ return 0;
+err:
+ path->keep_locks = 0;
+ return ret;
+}
+
+static noinline int split_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key,
+ unsigned long split_offset)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ struct btrfs_item *new_item;
+ int slot;
+ char *buf;
+ u32 nritems;
+ u32 item_size;
+ u32 orig_offset;
+ struct btrfs_disk_key disk_key;
+
+ leaf = path->nodes[0];
+ BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
+
+ btrfs_set_path_blocking(path);
+
+ item = btrfs_item_nr(leaf, path->slots[0]);
+ orig_offset = btrfs_item_offset(leaf, item);
+ item_size = btrfs_item_size(leaf, item);
+
+ buf = kmalloc(item_size, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
+
+ read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
+ path->slots[0]), item_size);
+
+ slot = path->slots[0] + 1;
+ nritems = btrfs_header_nritems(leaf);
+ if (slot != nritems) {
+ /* shift the items */
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
+ btrfs_item_nr_offset(slot),
+ (nritems - slot) * sizeof(struct btrfs_item));
+ }
+
+ btrfs_cpu_key_to_disk(&disk_key, new_key);
+ btrfs_set_item_key(leaf, &disk_key, slot);
+
+ new_item = btrfs_item_nr(leaf, slot);
+
+ btrfs_set_item_offset(leaf, new_item, orig_offset);
+ btrfs_set_item_size(leaf, new_item, item_size - split_offset);
+
+ btrfs_set_item_offset(leaf, item,
+ orig_offset + item_size - split_offset);
+ btrfs_set_item_size(leaf, item, split_offset);
+
+ btrfs_set_header_nritems(leaf, nritems + 1);
+
+ /* write the data for the start of the original item */
+ write_extent_buffer(leaf, buf,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ split_offset);
+
+ /* write the data for the new item */
+ write_extent_buffer(leaf, buf + split_offset,
+ btrfs_item_ptr_offset(leaf, slot),
+ item_size - split_offset);
+ btrfs_mark_buffer_dirty(leaf);
+
+ BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
+ kfree(buf);
+ return 0;
+}
+
+/*
+ * This function splits a single item into two items,
+ * giving 'new_key' to the new item and splitting the
+ * old one at split_offset (from the start of the item).
+ *
+ * The path may be released by this operation. After
+ * the split, the path is pointing to the old item. The
+ * new item is going to be in the same node as the old one.
+ *
+ * Note, the item being split must be smaller enough to live alone on
+ * a tree block with room for one extra struct btrfs_item
+ *
+ * This allows us to split the item in place, keeping a lock on the
+ * leaf the entire time.
+ */
+int btrfs_split_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key,
+ unsigned long split_offset)
+{
+ int ret;
+ ret = setup_leaf_for_split(trans, root, path,
+ sizeof(struct btrfs_item));
+ if (ret)
+ return ret;
+
+ ret = split_item(trans, root, path, new_key, split_offset);
+ return ret;
+}
+
+/*
+ * This function duplicate a item, giving 'new_key' to the new item.
+ * It guarantees both items live in the same tree leaf and the new item
+ * is contiguous with the original item.
+ *
+ * This allows us to split file extent in place, keeping a lock on the
+ * leaf the entire time.
+ */
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key)
+{
+ struct extent_buffer *leaf;
+ int ret;
+ u32 item_size;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ret = setup_leaf_for_split(trans, root, path,
+ item_size + sizeof(struct btrfs_item));
+ if (ret)
+ return ret;
+
+ path->slots[0]++;
+ ret = setup_items_for_insert(trans, root, path, new_key, &item_size,
+ item_size, item_size +
+ sizeof(struct btrfs_item), 1);
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+ memcpy_extent_buffer(leaf,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
+ item_size);
+ return 0;
+}
+
+/*
+ * make the item pointed to by the path smaller. new_size indicates
+ * how small to make it, and from_end tells us if we just chop bytes
+ * off the end of the item or if we shift the item to chop bytes off
+ * the front.
+ */
+int btrfs_truncate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u32 new_size, int from_end)
+{
+ int ret = 0;
+ int slot;
+ int slot_orig;
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ u32 nritems;
+ unsigned int data_end;
+ unsigned int old_data_start;
+ unsigned int old_size;
+ unsigned int size_diff;
+ int i;
+
+ slot_orig = path->slots[0];
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ old_size = btrfs_item_size_nr(leaf, slot);
+ if (old_size == new_size)
+ return 0;
+
+ nritems = btrfs_header_nritems(leaf);
+ data_end = leaf_data_end(root, leaf);
+
+ old_data_start = btrfs_item_offset_nr(leaf, slot);
+
+ size_diff = old_size - new_size;
+
+ BUG_ON(slot < 0);
+ BUG_ON(slot >= nritems);
+
+ /*
+ * item0..itemN ... dataN.offset..dataN.size .. data0.size
+ */
+ /* first correct the data pointers */
+ for (i = slot; i < nritems; i++) {
+ u32 ioff;
+ item = btrfs_item_nr(leaf, i);
+
+ if (!leaf->map_token) {
+ map_extent_buffer(leaf, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &leaf->map_token, &leaf->kaddr,
+ &leaf->map_start, &leaf->map_len,
+ KM_USER1);
+ }
+
+ ioff = btrfs_item_offset(leaf, item);
+ btrfs_set_item_offset(leaf, item, ioff + size_diff);
+ }
+
+ if (leaf->map_token) {
+ unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+ leaf->map_token = NULL;
+ }
+
+ /* shift the data */
+ if (from_end) {
+ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ data_end + size_diff, btrfs_leaf_data(leaf) +
+ data_end, old_data_start + new_size - data_end);
+ } else {
+ struct btrfs_disk_key disk_key;
+ u64 offset;
+
+ btrfs_item_key(leaf, &disk_key, slot);
+
+ if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
+ unsigned long ptr;
+ struct btrfs_file_extent_item *fi;
+
+ fi = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+ fi = (struct btrfs_file_extent_item *)(
+ (unsigned long)fi - size_diff);
+
+ if (btrfs_file_extent_type(leaf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ memmove_extent_buffer(leaf, ptr,
+ (unsigned long)fi,
+ offsetof(struct btrfs_file_extent_item,
+ disk_bytenr));
+ }
+ }
+
+ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ data_end + size_diff, btrfs_leaf_data(leaf) +
+ data_end, old_data_start - data_end);
+
+ offset = btrfs_disk_key_offset(&disk_key);
+ btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
+ btrfs_set_item_key(leaf, &disk_key, slot);
+ if (slot == 0)
+ fixup_low_keys(trans, root, path, &disk_key, 1);
+ }
+
+ item = btrfs_item_nr(leaf, slot);
+ btrfs_set_item_size(leaf, item, new_size);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+ if (btrfs_leaf_free_space(root, leaf) < 0) {
+ btrfs_print_leaf(root, leaf);
+ BUG();
+ }
+ return ret;
+}
+
+/*
+ * make the item pointed to by the path bigger, data_size is the new size.
+ */
+int btrfs_extend_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ u32 data_size)
+{
+ int ret = 0;
+ int slot;
+ int slot_orig;
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ u32 nritems;
+ unsigned int data_end;
+ unsigned int old_data;
+ unsigned int old_size;
+ int i;
+
+ slot_orig = path->slots[0];
+ leaf = path->nodes[0];
+
+ nritems = btrfs_header_nritems(leaf);
+ data_end = leaf_data_end(root, leaf);
+
+ if (btrfs_leaf_free_space(root, leaf) < data_size) {
+ btrfs_print_leaf(root, leaf);
+ BUG();
+ }
+ slot = path->slots[0];
+ old_data = btrfs_item_end_nr(leaf, slot);
+
+ BUG_ON(slot < 0);
+ if (slot >= nritems) {
+ btrfs_print_leaf(root, leaf);
+ printk(KERN_CRIT "slot %d too large, nritems %d\n",
+ slot, nritems);
+ BUG_ON(1);
+ }
+
+ /*
+ * item0..itemN ... dataN.offset..dataN.size .. data0.size
+ */
+ /* first correct the data pointers */
+ for (i = slot; i < nritems; i++) {
+ u32 ioff;
+ item = btrfs_item_nr(leaf, i);
+
+ if (!leaf->map_token) {
+ map_extent_buffer(leaf, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &leaf->map_token, &leaf->kaddr,
+ &leaf->map_start, &leaf->map_len,
+ KM_USER1);
+ }
+ ioff = btrfs_item_offset(leaf, item);
+ btrfs_set_item_offset(leaf, item, ioff - data_size);
+ }
+
+ if (leaf->map_token) {
+ unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+ leaf->map_token = NULL;
+ }
+
+ /* shift the data */
+ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ data_end - data_size, btrfs_leaf_data(leaf) +
+ data_end, old_data - data_end);
+
+ data_end = old_data;
+ old_size = btrfs_item_size_nr(leaf, slot);
+ item = btrfs_item_nr(leaf, slot);
+ btrfs_set_item_size(leaf, item, old_size + data_size);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+ if (btrfs_leaf_free_space(root, leaf) < 0) {
+ btrfs_print_leaf(root, leaf);
+ BUG();
+ }
+ return ret;
+}
+
+/*
+ * Given a key and some data, insert items into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ * Returns the number of keys that were inserted.
+ */
+int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ int nr)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ int ret = 0;
+ int slot;
+ int i;
+ u32 nritems;
+ u32 total_data = 0;
+ u32 total_size = 0;
+ unsigned int data_end;
+ struct btrfs_disk_key disk_key;
+ struct btrfs_key found_key;
+
+ for (i = 0; i < nr; i++) {
+ if (total_size + data_size[i] + sizeof(struct btrfs_item) >
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ break;
+ nr = i;
+ }
+ total_data += data_size[i];
+ total_size += data_size[i] + sizeof(struct btrfs_item);
+ }
+ BUG_ON(nr == 0);
+
+ ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+ if (ret == 0)
+ return -EEXIST;
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+
+ nritems = btrfs_header_nritems(leaf);
+ data_end = leaf_data_end(root, leaf);
+
+ if (btrfs_leaf_free_space(root, leaf) < total_size) {
+ for (i = nr; i >= 0; i--) {
+ total_data -= data_size[i];
+ total_size -= data_size[i] + sizeof(struct btrfs_item);
+ if (total_size < btrfs_leaf_free_space(root, leaf))
+ break;
+ }
+ nr = i;
+ }
+
+ slot = path->slots[0];
+ BUG_ON(slot < 0);
+
+ if (slot != nritems) {
+ unsigned int old_data = btrfs_item_end_nr(leaf, slot);
+
+ item = btrfs_item_nr(leaf, slot);
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ /* figure out how many keys we can insert in here */
+ total_data = data_size[0];
+ for (i = 1; i < nr; i++) {
+ if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
+ break;
+ total_data += data_size[i];
+ }
+ nr = i;
+
+ if (old_data < data_end) {
+ btrfs_print_leaf(root, leaf);
+ printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
+ slot, old_data, data_end);
+ BUG_ON(1);
+ }
+ /*
+ * item0..itemN ... dataN.offset..dataN.size .. data0.size
+ */
+ /* first correct the data pointers */
+ WARN_ON(leaf->map_token);
+ for (i = slot; i < nritems; i++) {
+ u32 ioff;
+
+ item = btrfs_item_nr(leaf, i);
+ if (!leaf->map_token) {
+ map_extent_buffer(leaf, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &leaf->map_token, &leaf->kaddr,
+ &leaf->map_start, &leaf->map_len,
+ KM_USER1);
+ }
+
+ ioff = btrfs_item_offset(leaf, item);
+ btrfs_set_item_offset(leaf, item, ioff - total_data);
+ }
+ if (leaf->map_token) {
+ unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+ leaf->map_token = NULL;
+ }
+
+ /* shift the items */
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+ btrfs_item_nr_offset(slot),
+ (nritems - slot) * sizeof(struct btrfs_item));
+
+ /* shift the data */
+ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ data_end - total_data, btrfs_leaf_data(leaf) +
+ data_end, old_data - data_end);
+ data_end = old_data;
+ } else {
+ /*
+ * this sucks but it has to be done, if we are inserting at
+ * the end of the leaf only insert 1 of the items, since we
+ * have no way of knowing whats on the next leaf and we'd have
+ * to drop our current locks to figure it out
+ */
+ nr = 1;
+ }
+
+ /* setup the item for the new data */
+ for (i = 0; i < nr; i++) {
+ btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+ btrfs_set_item_key(leaf, &disk_key, slot + i);
+ item = btrfs_item_nr(leaf, slot + i);
+ btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
+ data_end -= data_size[i];
+ btrfs_set_item_size(leaf, item, data_size[i]);
+ }
+ btrfs_set_header_nritems(leaf, nritems + nr);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+ if (slot == 0) {
+ btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+ ret = fixup_low_keys(trans, root, path, &disk_key, 1);
+ }
+
+ if (btrfs_leaf_free_space(root, leaf) < 0) {
+ btrfs_print_leaf(root, leaf);
+ BUG();
+ }
+out:
+ if (!ret)
+ ret = nr;
+ return ret;
+}
+
+/*
+ * this is a helper for btrfs_insert_empty_items, the main goal here is
+ * to save stack depth by doing the bulk of the work in a function
+ * that doesn't call btrfs_search_slot
+ */
+static noinline_for_stack int
+setup_items_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr)
+{
+ struct btrfs_item *item;
+ int i;
+ u32 nritems;
+ unsigned int data_end;
+ struct btrfs_disk_key disk_key;
+ int ret;
+ struct extent_buffer *leaf;
+ int slot;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ nritems = btrfs_header_nritems(leaf);
+ data_end = leaf_data_end(root, leaf);
+
+ if (btrfs_leaf_free_space(root, leaf) < total_size) {
+ btrfs_print_leaf(root, leaf);
+ printk(KERN_CRIT "not enough freespace need %u have %d\n",
+ total_size, btrfs_leaf_free_space(root, leaf));
+ BUG();
+ }
+
+ if (slot != nritems) {
+ unsigned int old_data = btrfs_item_end_nr(leaf, slot);
+
+ if (old_data < data_end) {
+ btrfs_print_leaf(root, leaf);
+ printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
+ slot, old_data, data_end);
+ BUG_ON(1);
+ }
+ /*
+ * item0..itemN ... dataN.offset..dataN.size .. data0.size
+ */
+ /* first correct the data pointers */
+ WARN_ON(leaf->map_token);
+ for (i = slot; i < nritems; i++) {
+ u32 ioff;
+
+ item = btrfs_item_nr(leaf, i);
+ if (!leaf->map_token) {
+ map_extent_buffer(leaf, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &leaf->map_token, &leaf->kaddr,
+ &leaf->map_start, &leaf->map_len,
+ KM_USER1);
+ }
+
+ ioff = btrfs_item_offset(leaf, item);
+ btrfs_set_item_offset(leaf, item, ioff - total_data);
+ }
+ if (leaf->map_token) {
+ unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+ leaf->map_token = NULL;
+ }
+
+ /* shift the items */
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+ btrfs_item_nr_offset(slot),
+ (nritems - slot) * sizeof(struct btrfs_item));
+
+ /* shift the data */
+ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ data_end - total_data, btrfs_leaf_data(leaf) +
+ data_end, old_data - data_end);
+ data_end = old_data;
+ }
+
+ /* setup the item for the new data */
+ for (i = 0; i < nr; i++) {
+ btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+ btrfs_set_item_key(leaf, &disk_key, slot + i);
+ item = btrfs_item_nr(leaf, slot + i);
+ btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
+ data_end -= data_size[i];
+ btrfs_set_item_size(leaf, item, data_size[i]);
+ }
+
+ btrfs_set_header_nritems(leaf, nritems + nr);
+
+ ret = 0;
+ if (slot == 0) {
+ struct btrfs_disk_key disk_key;
+ btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+ ret = fixup_low_keys(trans, root, path, &disk_key, 1);
+ }
+ btrfs_unlock_up_safe(path, 1);
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (btrfs_leaf_free_space(root, leaf) < 0) {
+ btrfs_print_leaf(root, leaf);
+ BUG();
+ }
+ return ret;
+}
+
+/*
+ * Given a key and some data, insert items into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ int nr)
+{
+ struct extent_buffer *leaf;
+ int ret = 0;
+ int slot;
+ int i;
+ u32 total_size = 0;
+ u32 total_data = 0;
+
+ for (i = 0; i < nr; i++)
+ total_data += data_size[i];
+
+ total_size = total_data + (nr * sizeof(struct btrfs_item));
+ ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+ if (ret == 0)
+ return -EEXIST;
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ BUG_ON(slot < 0);
+
+ ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
+ total_data, total_size, nr);
+
+out:
+ return ret;
+}
+
+/*
+ * Given a key and some data, insert an item into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *cpu_key, void *data, u32
+ data_size)
+{
+ int ret = 0;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ unsigned long ptr;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
+ if (!ret) {
+ leaf = path->nodes[0];
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ write_extent_buffer(leaf, data, ptr, data_size);
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * delete the pointer from a given node.
+ *
+ * the tree should have been previously balanced so the deletion does not
+ * empty a node.
+ */
+static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int level, int slot)
+{
+ struct extent_buffer *parent = path->nodes[level];
+ u32 nritems;
+ int ret = 0;
+ int wret;
+
+ nritems = btrfs_header_nritems(parent);
+ if (slot != nritems - 1) {
+ memmove_extent_buffer(parent,
+ btrfs_node_key_ptr_offset(slot),
+ btrfs_node_key_ptr_offset(slot + 1),
+ sizeof(struct btrfs_key_ptr) *
+ (nritems - slot - 1));
+ }
+ nritems--;
+ btrfs_set_header_nritems(parent, nritems);
+ if (nritems == 0 && parent == root->node) {
+ BUG_ON(btrfs_header_level(root->node) != 1);
+ /* just turn the root into a leaf and break */
+ btrfs_set_header_level(root->node, 0);
+ } else if (slot == 0) {
+ struct btrfs_disk_key disk_key;
+
+ btrfs_node_key(parent, &disk_key, 0);
+ wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
+ if (wret)
+ ret = wret;
+ }
+ btrfs_mark_buffer_dirty(parent);
+ return ret;
+}
+
+/*
+ * a helper function to delete the leaf pointed to by path->slots[1] and
+ * path->nodes[1].
+ *
+ * This deletes the pointer in path->nodes[1] and frees the leaf
+ * block extent. zero is returned if it all worked out, < 0 otherwise.
+ *
+ * The path must have already been setup for deleting the leaf, including
+ * all the proper balancing. path->nodes[1] must be locked.
+ */
+static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *leaf)
+{
+ int ret;
+
+ WARN_ON(btrfs_header_generation(leaf) != trans->transid);
+ ret = del_ptr(trans, root, path, 1, path->slots[1]);
+ if (ret)
+ return ret;
+
+ /*
+ * btrfs_free_extent is expensive, we want to make sure we
+ * aren't holding any locks when we call it
+ */
+ btrfs_unlock_up_safe(path, 0);
+
+ ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len,
+ 0, root->root_key.objectid, 0);
+ return ret;
+}
+/*
+ * delete the item at the leaf level in path. If that empties
+ * the leaf, remove it from the tree
+ */
+int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int slot, int nr)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ int last_off;
+ int dsize = 0;
+ int ret = 0;
+ int wret;
+ int i;
+ u32 nritems;
+
+ leaf = path->nodes[0];
+ last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
+
+ for (i = 0; i < nr; i++)
+ dsize += btrfs_item_size_nr(leaf, slot + i);
+
+ nritems = btrfs_header_nritems(leaf);
+
+ if (slot + nr != nritems) {
+ int data_end = leaf_data_end(root, leaf);
+
+ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ data_end + dsize,
+ btrfs_leaf_data(leaf) + data_end,
+ last_off - data_end);
+
+ for (i = slot + nr; i < nritems; i++) {
+ u32 ioff;
+
+ item = btrfs_item_nr(leaf, i);
+ if (!leaf->map_token) {
+ map_extent_buffer(leaf, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &leaf->map_token, &leaf->kaddr,
+ &leaf->map_start, &leaf->map_len,
+ KM_USER1);
+ }
+ ioff = btrfs_item_offset(leaf, item);
+ btrfs_set_item_offset(leaf, item, ioff + dsize);
+ }
+
+ if (leaf->map_token) {
+ unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+ leaf->map_token = NULL;
+ }
+
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
+ btrfs_item_nr_offset(slot + nr),
+ sizeof(struct btrfs_item) *
+ (nritems - slot - nr));
+ }
+ btrfs_set_header_nritems(leaf, nritems - nr);
+ nritems -= nr;
+
+ /* delete the leaf if we've emptied it */
+ if (nritems == 0) {
+ if (leaf == root->node) {
+ btrfs_set_header_level(leaf, 0);
+ } else {
+ ret = btrfs_del_leaf(trans, root, path, leaf);
+ BUG_ON(ret);
+ }
+ } else {
+ int used = leaf_space_used(leaf, 0, nritems);
+ if (slot == 0) {
+ struct btrfs_disk_key disk_key;
+
+ btrfs_item_key(leaf, &disk_key, 0);
+ wret = fixup_low_keys(trans, root, path,
+ &disk_key, 1);
+ if (wret)
+ ret = wret;
+ }
+
+ /* delete the leaf if it is mostly empty */
+ if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
+ /* push_leaf_left fixes the path.
+ * make sure the path still points to our leaf
+ * for possible call to del_ptr below
+ */
+ slot = path->slots[1];
+ extent_buffer_get(leaf);
+
+ btrfs_set_path_blocking(path);
+ wret = push_leaf_left(trans, root, path, 1, 1);
+ if (wret < 0 && wret != -ENOSPC)
+ ret = wret;
+
+ if (path->nodes[0] == leaf &&
+ btrfs_header_nritems(leaf)) {
+ wret = push_leaf_right(trans, root, path, 1, 1);
+ if (wret < 0 && wret != -ENOSPC)
+ ret = wret;
+ }
+
+ if (btrfs_header_nritems(leaf) == 0) {
+ path->slots[1] = slot;
+ ret = btrfs_del_leaf(trans, root, path, leaf);
+ BUG_ON(ret);
+ free_extent_buffer(leaf);
+ } else {
+ /* if we're still in the path, make sure
+ * we're dirty. Otherwise, one of the
+ * push_leaf functions must have already
+ * dirtied this buffer
+ */
+ if (path->nodes[0] == leaf)
+ btrfs_mark_buffer_dirty(leaf);
+ free_extent_buffer(leaf);
+ }
+ } else {
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ }
+ return ret;
+}
+
+/*
+ * search the tree again to find a leaf with lesser keys
+ * returns 0 if it found something or 1 if there are no lesser leaves.
+ * returns < 0 on io errors.
+ *
+ * This may release the path, and so you may lose any locks held at the
+ * time you call it.
+ */
+int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+ struct btrfs_key key;
+ struct btrfs_disk_key found_key;
+ int ret;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
+
+ if (key.offset > 0)
+ key.offset--;
+ else if (key.type > 0)
+ key.type--;
+ else if (key.objectid > 0)
+ key.objectid--;
+ else
+ return 1;
+
+ btrfs_release_path(root, path);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+ btrfs_item_key(path->nodes[0], &found_key, 0);
+ ret = comp_keys(&found_key, &key);
+ if (ret < 0)
+ return 0;
+ return 1;
+}
+
+/*
+ * A helper function to walk down the tree starting at min_key, and looking
+ * for nodes or leaves that are either in cache or have a minimum
+ * transaction id. This is used by the btree defrag code, and tree logging
+ *
+ * This does not cow, but it does stuff the starting key it finds back
+ * into min_key, so you can call btrfs_search_slot with cow=1 on the
+ * key and get a writable path.
+ *
+ * This does lock as it descends, and path->keep_locks should be set
+ * to 1 by the caller.
+ *
+ * This honors path->lowest_level to prevent descent past a given level
+ * of the tree.
+ *
+ * min_trans indicates the oldest transaction that you are interested
+ * in walking through. Any nodes or leaves older than min_trans are
+ * skipped over (without reading them).
+ *
+ * returns zero if something useful was found, < 0 on error and 1 if there
+ * was nothing in the tree that matched the search criteria.
+ */
+int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+ struct btrfs_key *max_key,
+ struct btrfs_path *path, int cache_only,
+ u64 min_trans)
+{
+ struct extent_buffer *cur;
+ struct btrfs_key found_key;
+ int slot;
+ int sret;
+ u32 nritems;
+ int level;
+ int ret = 1;
+
+ WARN_ON(!path->keep_locks);
+again:
+ cur = btrfs_lock_root_node(root);
+ level = btrfs_header_level(cur);
+ WARN_ON(path->nodes[level]);
+ path->nodes[level] = cur;
+ path->locks[level] = 1;
+
+ if (btrfs_header_generation(cur) < min_trans) {
+ ret = 1;
+ goto out;
+ }
+ while (1) {
+ nritems = btrfs_header_nritems(cur);
+ level = btrfs_header_level(cur);
+ sret = bin_search(cur, min_key, level, &slot);
+
+ /* at the lowest level, we're done, setup the path and exit */
+ if (level == path->lowest_level) {
+ if (slot >= nritems)
+ goto find_next_key;
+ ret = 0;
+ path->slots[level] = slot;
+ btrfs_item_key_to_cpu(cur, &found_key, slot);
+ goto out;
+ }
+ if (sret && slot > 0)
+ slot--;
+ /*
+ * check this node pointer against the cache_only and
+ * min_trans parameters. If it isn't in cache or is too
+ * old, skip to the next one.
+ */
+ while (slot < nritems) {
+ u64 blockptr;
+ u64 gen;
+ struct extent_buffer *tmp;
+ struct btrfs_disk_key disk_key;
+
+ blockptr = btrfs_node_blockptr(cur, slot);
+ gen = btrfs_node_ptr_generation(cur, slot);
+ if (gen < min_trans) {
+ slot++;
+ continue;
+ }
+ if (!cache_only)
+ break;
+
+ if (max_key) {
+ btrfs_node_key(cur, &disk_key, slot);
+ if (comp_keys(&disk_key, max_key) >= 0) {
+ ret = 1;
+ goto out;
+ }
+ }
+
+ tmp = btrfs_find_tree_block(root, blockptr,
+ btrfs_level_size(root, level - 1));
+
+ if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+ free_extent_buffer(tmp);
+ break;
+ }
+ if (tmp)
+ free_extent_buffer(tmp);
+ slot++;
+ }
+find_next_key:
+ /*
+ * we didn't find a candidate key in this node, walk forward
+ * and find another one
+ */
+ if (slot >= nritems) {
+ path->slots[level] = slot;
+ btrfs_set_path_blocking(path);
+ sret = btrfs_find_next_key(root, path, min_key, level,
+ cache_only, min_trans);
+ if (sret == 0) {
+ btrfs_release_path(root, path);
+ goto again;
+ } else {
+ goto out;
+ }
+ }
+ /* save our key for returning back */
+ btrfs_node_key_to_cpu(cur, &found_key, slot);
+ path->slots[level] = slot;
+ if (level == path->lowest_level) {
+ ret = 0;
+ unlock_up(path, level, 1);
+ goto out;
+ }
+ btrfs_set_path_blocking(path);
+ cur = read_node_slot(root, cur, slot);
+
+ btrfs_tree_lock(cur);
+
+ path->locks[level - 1] = 1;
+ path->nodes[level - 1] = cur;
+ unlock_up(path, level, 1);
+ btrfs_clear_path_blocking(path, NULL);
+ }
+out:
+ if (ret == 0)
+ memcpy(min_key, &found_key, sizeof(found_key));
+ btrfs_set_path_blocking(path);
+ return ret;
+}
+
+/*
+ * this is similar to btrfs_next_leaf, but does not try to preserve
+ * and fixup the path. It looks for and returns the next key in the
+ * tree based on the current path and the cache_only and min_trans
+ * parameters.
+ *
+ * 0 is returned if another key is found, < 0 if there are any errors
+ * and 1 is returned if there are no higher keys in the tree
+ *
+ * path->keep_locks should be set to 1 on the search made before
+ * calling this function.
+ */
+int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *key, int level,
+ int cache_only, u64 min_trans)
+{
+ int slot;
+ struct extent_buffer *c;
+
+ WARN_ON(!path->keep_locks);
+ while (level < BTRFS_MAX_LEVEL) {
+ if (!path->nodes[level])
+ return 1;
+
+ slot = path->slots[level] + 1;
+ c = path->nodes[level];
+next:
+ if (slot >= btrfs_header_nritems(c)) {
+ int ret;
+ int orig_lowest;
+ struct btrfs_key cur_key;
+ if (level + 1 >= BTRFS_MAX_LEVEL ||
+ !path->nodes[level + 1])
+ return 1;
+
+ if (path->locks[level + 1]) {
+ level++;
+ continue;
+ }
+
+ slot = btrfs_header_nritems(c) - 1;
+ if (level == 0)
+ btrfs_item_key_to_cpu(c, &cur_key, slot);
+ else
+ btrfs_node_key_to_cpu(c, &cur_key, slot);
+
+ orig_lowest = path->lowest_level;
+ btrfs_release_path(root, path);
+ path->lowest_level = level;
+ ret = btrfs_search_slot(NULL, root, &cur_key, path,
+ 0, 0);
+ path->lowest_level = orig_lowest;
+ if (ret < 0)
+ return ret;
+
+ c = path->nodes[level];
+ slot = path->slots[level];
+ if (ret == 0)
+ slot++;
+ goto next;
+ }
+
+ if (level == 0)
+ btrfs_item_key_to_cpu(c, key, slot);
+ else {
+ u64 blockptr = btrfs_node_blockptr(c, slot);
+ u64 gen = btrfs_node_ptr_generation(c, slot);
+
+ if (cache_only) {
+ struct extent_buffer *cur;
+ cur = btrfs_find_tree_block(root, blockptr,
+ btrfs_level_size(root, level - 1));
+ if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
+ slot++;
+ if (cur)
+ free_extent_buffer(cur);
+ goto next;
+ }
+ free_extent_buffer(cur);
+ }
+ if (gen < min_trans) {
+ slot++;
+ goto next;
+ }
+ btrfs_node_key_to_cpu(c, key, slot);
+ }
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * search the tree again to find a leaf with greater keys
+ * returns 0 if it found something or 1 if there are no greater leaves.
+ * returns < 0 on io errors.
+ */
+int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+ int slot;
+ int level;
+ struct extent_buffer *c;
+ struct extent_buffer *next;
+ struct btrfs_key key;
+ u32 nritems;
+ int ret;
+ int old_spinning = path->leave_spinning;
+ int force_blocking = 0;
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (nritems == 0)
+ return 1;
+
+ /*
+ * we take the blocks in an order that upsets lockdep. Using
+ * blocking mode is the only way around it.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ force_blocking = 1;
+#endif
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+again:
+ level = 1;
+ next = NULL;
+ btrfs_release_path(root, path);
+
+ path->keep_locks = 1;
+
+ if (!force_blocking)
+ path->leave_spinning = 1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ path->keep_locks = 0;
+
+ if (ret < 0)
+ return ret;
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ /*
+ * by releasing the path above we dropped all our locks. A balance
+ * could have added more items next to the key that used to be
+ * at the very end of the block. So, check again here and
+ * advance the path if there are now more items available.
+ */
+ if (nritems > 0 && path->slots[0] < nritems - 1) {
+ if (ret == 0)
+ path->slots[0]++;
+ ret = 0;
+ goto done;
+ }
+
+ while (level < BTRFS_MAX_LEVEL) {
+ if (!path->nodes[level]) {
+ ret = 1;
+ goto done;
+ }
+
+ slot = path->slots[level] + 1;
+ c = path->nodes[level];
+ if (slot >= btrfs_header_nritems(c)) {
+ level++;
+ if (level == BTRFS_MAX_LEVEL) {
+ ret = 1;
+ goto done;
+ }
+ continue;
+ }
+
+ if (next) {
+ btrfs_tree_unlock(next);
+ free_extent_buffer(next);
+ }
+
+ next = c;
+ ret = read_block_for_search(NULL, root, path, &next, level,
+ slot, &key);
+ if (ret == -EAGAIN)
+ goto again;
+
+ if (ret < 0) {
+ btrfs_release_path(root, path);
+ goto done;
+ }
+
+ if (!path->skip_locking) {
+ ret = btrfs_try_spin_lock(next);
+ if (!ret) {
+ btrfs_set_path_blocking(path);
+ btrfs_tree_lock(next);
+ if (!force_blocking)
+ btrfs_clear_path_blocking(path, next);
+ }
+ if (force_blocking)
+ btrfs_set_lock_blocking(next);
+ }
+ break;
+ }
+ path->slots[level] = slot;
+ while (1) {
+ level--;
+ c = path->nodes[level];
+ if (path->locks[level])
+ btrfs_tree_unlock(c);
+
+ free_extent_buffer(c);
+ path->nodes[level] = next;
+ path->slots[level] = 0;
+ if (!path->skip_locking)
+ path->locks[level] = 1;
+
+ if (!level)
+ break;
+
+ ret = read_block_for_search(NULL, root, path, &next, level,
+ 0, &key);
+ if (ret == -EAGAIN)
+ goto again;
+
+ if (ret < 0) {
+ btrfs_release_path(root, path);
+ goto done;
+ }
+
+ if (!path->skip_locking) {
+ btrfs_assert_tree_locked(path->nodes[level]);
+ ret = btrfs_try_spin_lock(next);
+ if (!ret) {
+ btrfs_set_path_blocking(path);
+ btrfs_tree_lock(next);
+ if (!force_blocking)
+ btrfs_clear_path_blocking(path, next);
+ }
+ if (force_blocking)
+ btrfs_set_lock_blocking(next);
+ }
+ }
+ ret = 0;
+done:
+ unlock_up(path, 0, 1);
+ path->leave_spinning = old_spinning;
+ if (!old_spinning)
+ btrfs_set_path_blocking(path);
+
+ return ret;
+}
+
+/*
+ * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
+ * searching until it gets past min_objectid or finds an item of 'type'
+ *
+ * returns 0 if something is found, 1 if nothing was found and < 0 on error
+ */
+int btrfs_previous_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 min_objectid,
+ int type)
+{
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ int ret;
+
+ while (1) {
+ if (path->slots[0] == 0) {
+ btrfs_set_path_blocking(path);
+ ret = btrfs_prev_leaf(root, path);
+ if (ret != 0)
+ return ret;
+ } else {
+ path->slots[0]--;
+ }
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (nritems == 0)
+ return 1;
+ if (path->slots[0] == nritems)
+ path->slots[0]--;
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid < min_objectid)
+ break;
+ if (found_key.type == type)
+ return 0;
+ if (found_key.objectid == min_objectid &&
+ found_key.type < type)
+ break;
+ }
+ return 1;
+}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
new file mode 100644
index 00000000000..2aa8ec6a098
--- /dev/null
+++ b/fs/btrfs/ctree.h
@@ -0,0 +1,2411 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_CTREE__
+#define __BTRFS_CTREE__
+
+#include <linux/version.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/fs.h>
+#include <linux/completion.h>
+#include <linux/backing-dev.h>
+#include <linux/wait.h>
+#include <asm/kmap_types.h>
+#include "extent_io.h"
+#include "extent_map.h"
+#include "async-thread.h"
+
+struct btrfs_trans_handle;
+struct btrfs_transaction;
+extern struct kmem_cache *btrfs_trans_handle_cachep;
+extern struct kmem_cache *btrfs_transaction_cachep;
+extern struct kmem_cache *btrfs_bit_radix_cachep;
+extern struct kmem_cache *btrfs_path_cachep;
+struct btrfs_ordered_sum;
+
+#define BTRFS_MAGIC "_BHRfS_M"
+
+#define BTRFS_MAX_LEVEL 8
+
+#define BTRFS_COMPAT_EXTENT_TREE_V0
+
+/*
+ * files bigger than this get some pre-flushing when they are added
+ * to the ordered operations list. That way we limit the total
+ * work done by the commit
+ */
+#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
+
+/* holds pointers to all of the tree roots */
+#define BTRFS_ROOT_TREE_OBJECTID 1ULL
+
+/* stores information about which extents are in use, and reference counts */
+#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
+
+/*
+ * chunk tree stores translations from logical -> physical block numbering
+ * the super block points to the chunk tree
+ */
+#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
+
+/*
+ * stores information about which areas of a given device are in use.
+ * one per device. The tree of tree roots points to the device tree
+ */
+#define BTRFS_DEV_TREE_OBJECTID 4ULL
+
+/* one per subvolume, storing files and directories */
+#define BTRFS_FS_TREE_OBJECTID 5ULL
+
+/* directory objectid inside the root tree */
+#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
+
+/* holds checksums of all the data extents */
+#define BTRFS_CSUM_TREE_OBJECTID 7ULL
+
+/* orhpan objectid for tracking unlinked/truncated files */
+#define BTRFS_ORPHAN_OBJECTID -5ULL
+
+/* does write ahead logging to speed up fsyncs */
+#define BTRFS_TREE_LOG_OBJECTID -6ULL
+#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
+
+/* for space balancing */
+#define BTRFS_TREE_RELOC_OBJECTID -8ULL
+#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
+
+/*
+ * extent checksums all have this objectid
+ * this allows them to share the logging tree
+ * for fsyncs
+ */
+#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
+
+/* dummy objectid represents multiple objectids */
+#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
+
+/*
+ * All files have objectids in this range.
+ */
+#define BTRFS_FIRST_FREE_OBJECTID 256ULL
+#define BTRFS_LAST_FREE_OBJECTID -256ULL
+#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
+
+
+/*
+ * the device items go into the chunk tree. The key is in the form
+ * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
+ */
+#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
+
+#define BTRFS_BTREE_INODE_OBJECTID 1
+
+#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
+
+/*
+ * we can actually store much bigger names, but lets not confuse the rest
+ * of linux
+ */
+#define BTRFS_NAME_LEN 255
+
+/* 32 bytes in various csum fields */
+#define BTRFS_CSUM_SIZE 32
+
+/* csum types */
+#define BTRFS_CSUM_TYPE_CRC32 0
+
+static int btrfs_csum_sizes[] = { 4, 0 };
+
+/* four bytes for CRC32 */
+#define BTRFS_EMPTY_DIR_SIZE 0
+
+#define BTRFS_FT_UNKNOWN 0
+#define BTRFS_FT_REG_FILE 1
+#define BTRFS_FT_DIR 2
+#define BTRFS_FT_CHRDEV 3
+#define BTRFS_FT_BLKDEV 4
+#define BTRFS_FT_FIFO 5
+#define BTRFS_FT_SOCK 6
+#define BTRFS_FT_SYMLINK 7
+#define BTRFS_FT_XATTR 8
+#define BTRFS_FT_MAX 9
+
+/*
+ * The key defines the order in the tree, and so it also defines (optimal)
+ * block layout.
+ *
+ * objectid corresponds to the inode number.
+ *
+ * type tells us things about the object, and is a kind of stream selector.
+ * so for a given inode, keys with type of 1 might refer to the inode data,
+ * type of 2 may point to file data in the btree and type == 3 may point to
+ * extents.
+ *
+ * offset is the starting byte offset for this key in the stream.
+ *
+ * btrfs_disk_key is in disk byte order. struct btrfs_key is always
+ * in cpu native order. Otherwise they are identical and their sizes
+ * should be the same (ie both packed)
+ */
+struct btrfs_disk_key {
+ __le64 objectid;
+ u8 type;
+ __le64 offset;
+} __attribute__ ((__packed__));
+
+struct btrfs_key {
+ u64 objectid;
+ u8 type;
+ u64 offset;
+} __attribute__ ((__packed__));
+
+struct btrfs_mapping_tree {
+ struct extent_map_tree map_tree;
+};
+
+#define BTRFS_UUID_SIZE 16
+struct btrfs_dev_item {
+ /* the internal btrfs device id */
+ __le64 devid;
+
+ /* size of the device */
+ __le64 total_bytes;
+
+ /* bytes used */
+ __le64 bytes_used;
+
+ /* optimal io alignment for this device */
+ __le32 io_align;
+
+ /* optimal io width for this device */
+ __le32 io_width;
+
+ /* minimal io size for this device */
+ __le32 sector_size;
+
+ /* type and info about this device */
+ __le64 type;
+
+ /* expected generation for this device */
+ __le64 generation;
+
+ /*
+ * starting byte of this partition on the device,
+ * to allow for stripe alignment in the future
+ */
+ __le64 start_offset;
+
+ /* grouping information for allocation decisions */
+ __le32 dev_group;
+
+ /* seek speed 0-100 where 100 is fastest */
+ u8 seek_speed;
+
+ /* bandwidth 0-100 where 100 is fastest */
+ u8 bandwidth;
+
+ /* btrfs generated uuid for this device */
+ u8 uuid[BTRFS_UUID_SIZE];
+
+ /* uuid of FS who owns this device */
+ u8 fsid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+
+struct btrfs_stripe {
+ __le64 devid;
+ __le64 offset;
+ u8 dev_uuid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+
+struct btrfs_chunk {
+ /* size of this chunk in bytes */
+ __le64 length;
+
+ /* objectid of the root referencing this chunk */
+ __le64 owner;
+
+ __le64 stripe_len;
+ __le64 type;
+
+ /* optimal io alignment for this chunk */
+ __le32 io_align;
+
+ /* optimal io width for this chunk */
+ __le32 io_width;
+
+ /* minimal io size for this chunk */
+ __le32 sector_size;
+
+ /* 2^16 stripes is quite a lot, a second limit is the size of a single
+ * item in the btree
+ */
+ __le16 num_stripes;
+
+ /* sub stripes only matter for raid10 */
+ __le16 sub_stripes;
+ struct btrfs_stripe stripe;
+ /* additional stripes go here */
+} __attribute__ ((__packed__));
+
+static inline unsigned long btrfs_chunk_item_size(int num_stripes)
+{
+ BUG_ON(num_stripes == 0);
+ return sizeof(struct btrfs_chunk) +
+ sizeof(struct btrfs_stripe) * (num_stripes - 1);
+}
+
+#define BTRFS_FSID_SIZE 16
+#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
+#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
+#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
+#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
+
+#define BTRFS_BACKREF_REV_MAX 256
+#define BTRFS_BACKREF_REV_SHIFT 56
+#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \
+ BTRFS_BACKREF_REV_SHIFT)
+
+#define BTRFS_OLD_BACKREF_REV 0
+#define BTRFS_MIXED_BACKREF_REV 1
+
+/*
+ * every tree block (leaf or node) starts with this header.
+ */
+struct btrfs_header {
+ /* these first four must match the super block */
+ u8 csum[BTRFS_CSUM_SIZE];
+ u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+ __le64 bytenr; /* which block this node is supposed to live in */
+ __le64 flags;
+
+ /* allowed to be different from the super from here on down */
+ u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+ __le64 generation;
+ __le64 owner;
+ __le32 nritems;
+ u8 level;
+} __attribute__ ((__packed__));
+
+#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
+ sizeof(struct btrfs_header)) / \
+ sizeof(struct btrfs_key_ptr))
+#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
+#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
+#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
+ sizeof(struct btrfs_item) - \
+ sizeof(struct btrfs_file_extent_item))
+#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
+ sizeof(struct btrfs_item) -\
+ sizeof(struct btrfs_dir_item))
+
+
+/*
+ * this is a very generous portion of the super block, giving us
+ * room to translate 14 chunks with 3 stripes each.
+ */
+#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
+#define BTRFS_LABEL_SIZE 256
+
+/*
+ * the super block basically lists the main trees of the FS
+ * it currently lacks any block count etc etc
+ */
+struct btrfs_super_block {
+ u8 csum[BTRFS_CSUM_SIZE];
+ /* the first 4 fields must match struct btrfs_header */
+ u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+ __le64 bytenr; /* this block number */
+ __le64 flags;
+
+ /* allowed to be different from the btrfs_header from here own down */
+ __le64 magic;
+ __le64 generation;
+ __le64 root;
+ __le64 chunk_root;
+ __le64 log_root;
+
+ /* this will help find the new super based on the log root */
+ __le64 log_root_transid;
+ __le64 total_bytes;
+ __le64 bytes_used;
+ __le64 root_dir_objectid;
+ __le64 num_devices;
+ __le32 sectorsize;
+ __le32 nodesize;
+ __le32 leafsize;
+ __le32 stripesize;
+ __le32 sys_chunk_array_size;
+ __le64 chunk_root_generation;
+ __le64 compat_flags;
+ __le64 compat_ro_flags;
+ __le64 incompat_flags;
+ __le16 csum_type;
+ u8 root_level;
+ u8 chunk_root_level;
+ u8 log_root_level;
+ struct btrfs_dev_item dev_item;
+
+ char label[BTRFS_LABEL_SIZE];
+
+ /* future expansion */
+ __le64 reserved[32];
+ u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+} __attribute__ ((__packed__));
+
+/*
+ * Compat flags that we support. If any incompat flags are set other than the
+ * ones specified below then we will fail to mount
+ */
+#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
+
+#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
+#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
+
+/*
+ * A leaf is full of items. offset and size tell us where to find
+ * the item in the leaf (relative to the start of the data area)
+ */
+struct btrfs_item {
+ struct btrfs_disk_key key;
+ __le32 offset;
+ __le32 size;
+} __attribute__ ((__packed__));
+
+/*
+ * leaves have an item area and a data area:
+ * [item0, item1....itemN] [free space] [dataN...data1, data0]
+ *
+ * The data is separate from the items to get the keys closer together
+ * during searches.
+ */
+struct btrfs_leaf {
+ struct btrfs_header header;
+ struct btrfs_item items[];
+} __attribute__ ((__packed__));
+
+/*
+ * all non-leaf blocks are nodes, they hold only keys and pointers to
+ * other blocks
+ */
+struct btrfs_key_ptr {
+ struct btrfs_disk_key key;
+ __le64 blockptr;
+ __le64 generation;
+} __attribute__ ((__packed__));
+
+struct btrfs_node {
+ struct btrfs_header header;
+ struct btrfs_key_ptr ptrs[];
+} __attribute__ ((__packed__));
+
+/*
+ * btrfs_paths remember the path taken from the root down to the leaf.
+ * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
+ * to any other levels that are present.
+ *
+ * The slots array records the index of the item or block pointer
+ * used while walking the tree.
+ */
+struct btrfs_path {
+ struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
+ int slots[BTRFS_MAX_LEVEL];
+ /* if there is real range locking, this locks field will change */
+ int locks[BTRFS_MAX_LEVEL];
+ int reada;
+ /* keep some upper locks as we walk down */
+ int lowest_level;
+
+ /*
+ * set by btrfs_split_item, tells search_slot to keep all locks
+ * and to force calls to keep space in the nodes
+ */
+ unsigned int search_for_split:1;
+ unsigned int keep_locks:1;
+ unsigned int skip_locking:1;
+ unsigned int leave_spinning:1;
+ unsigned int search_commit_root:1;
+};
+
+/*
+ * items in the extent btree are used to record the objectid of the
+ * owner of the block and the number of references
+ */
+
+struct btrfs_extent_item {
+ __le64 refs;
+ __le64 generation;
+ __le64 flags;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_item_v0 {
+ __le32 refs;
+} __attribute__ ((__packed__));
+
+#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
+ sizeof(struct btrfs_item))
+
+#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0)
+#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1)
+
+/* following flags only apply to tree blocks */
+
+/* use full backrefs for extent pointers in the block */
+#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8)
+
+struct btrfs_tree_block_info {
+ struct btrfs_disk_key key;
+ u8 level;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_data_ref {
+ __le64 root;
+ __le64 objectid;
+ __le64 offset;
+ __le32 count;
+} __attribute__ ((__packed__));
+
+struct btrfs_shared_data_ref {
+ __le32 count;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_inline_ref {
+ u8 type;
+ __le64 offset;
+} __attribute__ ((__packed__));
+
+/* old style backrefs item */
+struct btrfs_extent_ref_v0 {
+ __le64 root;
+ __le64 generation;
+ __le64 objectid;
+ __le32 count;
+} __attribute__ ((__packed__));
+
+
+/* dev extents record free space on individual devices. The owner
+ * field points back to the chunk allocation mapping tree that allocated
+ * the extent. The chunk tree uuid field is a way to double check the owner
+ */
+struct btrfs_dev_extent {
+ __le64 chunk_tree;
+ __le64 chunk_objectid;
+ __le64 chunk_offset;
+ __le64 length;
+ u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+
+struct btrfs_inode_ref {
+ __le64 index;
+ __le16 name_len;
+ /* name goes here */
+} __attribute__ ((__packed__));
+
+struct btrfs_timespec {
+ __le64 sec;
+ __le32 nsec;
+} __attribute__ ((__packed__));
+
+enum btrfs_compression_type {
+ BTRFS_COMPRESS_NONE = 0,
+ BTRFS_COMPRESS_ZLIB = 1,
+ BTRFS_COMPRESS_LAST = 2,
+};
+
+struct btrfs_inode_item {
+ /* nfs style generation number */
+ __le64 generation;
+ /* transid that last touched this inode */
+ __le64 transid;
+ __le64 size;
+ __le64 nbytes;
+ __le64 block_group;
+ __le32 nlink;
+ __le32 uid;
+ __le32 gid;
+ __le32 mode;
+ __le64 rdev;
+ __le64 flags;
+
+ /* modification sequence number for NFS */
+ __le64 sequence;
+
+ /*
+ * a little future expansion, for more than this we can
+ * just grow the inode item and version it
+ */
+ __le64 reserved[4];
+ struct btrfs_timespec atime;
+ struct btrfs_timespec ctime;
+ struct btrfs_timespec mtime;
+ struct btrfs_timespec otime;
+} __attribute__ ((__packed__));
+
+struct btrfs_dir_log_item {
+ __le64 end;
+} __attribute__ ((__packed__));
+
+struct btrfs_dir_item {
+ struct btrfs_disk_key location;
+ __le64 transid;
+ __le16 data_len;
+ __le16 name_len;
+ u8 type;
+} __attribute__ ((__packed__));
+
+struct btrfs_root_item {
+ struct btrfs_inode_item inode;
+ __le64 generation;
+ __le64 root_dirid;
+ __le64 bytenr;
+ __le64 byte_limit;
+ __le64 bytes_used;
+ __le64 last_snapshot;
+ __le64 flags;
+ __le32 refs;
+ struct btrfs_disk_key drop_progress;
+ u8 drop_level;
+ u8 level;
+} __attribute__ ((__packed__));
+
+/*
+ * this is used for both forward and backward root refs
+ */
+struct btrfs_root_ref {
+ __le64 dirid;
+ __le64 sequence;
+ __le16 name_len;
+} __attribute__ ((__packed__));
+
+#define BTRFS_FILE_EXTENT_INLINE 0
+#define BTRFS_FILE_EXTENT_REG 1
+#define BTRFS_FILE_EXTENT_PREALLOC 2
+
+struct btrfs_file_extent_item {
+ /*
+ * transaction id that created this extent
+ */
+ __le64 generation;
+ /*
+ * max number of bytes to hold this extent in ram
+ * when we split a compressed extent we can't know how big
+ * each of the resulting pieces will be. So, this is
+ * an upper limit on the size of the extent in ram instead of
+ * an exact limit.
+ */
+ __le64 ram_bytes;
+
+ /*
+ * 32 bits for the various ways we might encode the data,
+ * including compression and encryption. If any of these
+ * are set to something a given disk format doesn't understand
+ * it is treated like an incompat flag for reading and writing,
+ * but not for stat.
+ */
+ u8 compression;
+ u8 encryption;
+ __le16 other_encoding; /* spare for later use */
+
+ /* are we inline data or a real extent? */
+ u8 type;
+
+ /*
+ * disk space consumed by the extent, checksum blocks are included
+ * in these numbers
+ */
+ __le64 disk_bytenr;
+ __le64 disk_num_bytes;
+ /*
+ * the logical offset in file blocks (no csums)
+ * this extent record is for. This allows a file extent to point
+ * into the middle of an existing extent on disk, sharing it
+ * between two snapshots (useful if some bytes in the middle of the
+ * extent have changed
+ */
+ __le64 offset;
+ /*
+ * the logical number of file blocks (no csums included). This
+ * always reflects the size uncompressed and without encoding.
+ */
+ __le64 num_bytes;
+
+} __attribute__ ((__packed__));
+
+struct btrfs_csum_item {
+ u8 csum;
+} __attribute__ ((__packed__));
+
+/* different types of block groups (and chunks) */
+#define BTRFS_BLOCK_GROUP_DATA (1 << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1)
+#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
+#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3)
+#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
+#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
+#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
+
+struct btrfs_block_group_item {
+ __le64 used;
+ __le64 chunk_objectid;
+ __le64 flags;
+} __attribute__ ((__packed__));
+
+struct btrfs_space_info {
+ u64 flags;
+
+ u64 total_bytes; /* total bytes in the space */
+ u64 bytes_used; /* total bytes used on disk */
+ u64 bytes_pinned; /* total bytes pinned, will be freed when the
+ transaction finishes */
+ u64 bytes_reserved; /* total bytes the allocator has reserved for
+ current allocations */
+ u64 bytes_readonly; /* total bytes that are read only */
+ u64 bytes_super; /* total bytes reserved for the super blocks */
+ u64 bytes_root; /* the number of bytes needed to commit a
+ transaction */
+ u64 bytes_may_use; /* number of bytes that may be used for
+ delalloc/allocations */
+ u64 bytes_delalloc; /* number of bytes currently reserved for
+ delayed allocation */
+
+ int full; /* indicates that we cannot allocate any more
+ chunks for this space */
+ int force_alloc; /* set if we need to force a chunk alloc for
+ this space */
+ int force_delalloc; /* make people start doing filemap_flush until
+ we're under a threshold */
+
+ struct list_head list;
+
+ /* for controlling how we free up space for allocations */
+ wait_queue_head_t allocate_wait;
+ wait_queue_head_t flush_wait;
+ int allocating_chunk;
+ int flushing;
+
+ /* for block groups in our same type */
+ struct list_head block_groups;
+ spinlock_t lock;
+ struct rw_semaphore groups_sem;
+ atomic_t caching_threads;
+};
+
+/*
+ * free clusters are used to claim free space in relatively large chunks,
+ * allowing us to do less seeky writes. They are used for all metadata
+ * allocations and data allocations in ssd mode.
+ */
+struct btrfs_free_cluster {
+ spinlock_t lock;
+ spinlock_t refill_lock;
+ struct rb_root root;
+
+ /* largest extent in this cluster */
+ u64 max_size;
+
+ /* first extent starting offset */
+ u64 window_start;
+
+ /* if this cluster simply points at a bitmap in the block group */
+ bool points_to_bitmap;
+
+ struct btrfs_block_group_cache *block_group;
+ /*
+ * when a cluster is allocated from a block group, we put the
+ * cluster onto a list in the block group so that it can
+ * be freed before the block group is freed.
+ */
+ struct list_head block_group_list;
+};
+
+enum btrfs_caching_type {
+ BTRFS_CACHE_NO = 0,
+ BTRFS_CACHE_STARTED = 1,
+ BTRFS_CACHE_FINISHED = 2,
+};
+
+struct btrfs_caching_control {
+ struct list_head list;
+ struct mutex mutex;
+ wait_queue_head_t wait;
+ struct btrfs_block_group_cache *block_group;
+ u64 progress;
+ atomic_t count;
+};
+
+struct btrfs_block_group_cache {
+ struct btrfs_key key;
+ struct btrfs_block_group_item item;
+ struct btrfs_fs_info *fs_info;
+ spinlock_t lock;
+ u64 pinned;
+ u64 reserved;
+ u64 bytes_super;
+ u64 flags;
+ u64 sectorsize;
+ int extents_thresh;
+ int free_extents;
+ int total_bitmaps;
+ int ro;
+ int dirty;
+
+ /* cache tracking stuff */
+ int cached;
+ struct btrfs_caching_control *caching_ctl;
+ u64 last_byte_to_unpin;
+
+ struct btrfs_space_info *space_info;
+
+ /* free space cache stuff */
+ spinlock_t tree_lock;
+ struct rb_root free_space_offset;
+ u64 free_space;
+
+ /* block group cache stuff */
+ struct rb_node cache_node;
+
+ /* for block groups in the same raid type */
+ struct list_head list;
+
+ /* usage count */
+ atomic_t count;
+
+ /* List of struct btrfs_free_clusters for this block group.
+ * Today it will only have one thing on it, but that may change
+ */
+ struct list_head cluster_list;
+};
+
+struct reloc_control;
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_fs_info {
+ u8 fsid[BTRFS_FSID_SIZE];
+ u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+ struct btrfs_root *extent_root;
+ struct btrfs_root *tree_root;
+ struct btrfs_root *chunk_root;
+ struct btrfs_root *dev_root;
+ struct btrfs_root *fs_root;
+ struct btrfs_root *csum_root;
+
+ /* the log root tree is a directory of all the other log roots */
+ struct btrfs_root *log_root_tree;
+
+ spinlock_t fs_roots_radix_lock;
+ struct radix_tree_root fs_roots_radix;
+
+ /* block group cache stuff */
+ spinlock_t block_group_cache_lock;
+ struct rb_root block_group_cache_tree;
+
+ struct extent_io_tree freed_extents[2];
+ struct extent_io_tree *pinned_extents;
+
+ /* logical->physical extent mapping */
+ struct btrfs_mapping_tree mapping_tree;
+
+ u64 generation;
+ u64 last_trans_committed;
+
+ /*
+ * this is updated to the current trans every time a full commit
+ * is required instead of the faster short fsync log commits
+ */
+ u64 last_trans_log_full_commit;
+ u64 open_ioctl_trans;
+ unsigned long mount_opt;
+ u64 max_extent;
+ u64 max_inline;
+ u64 alloc_start;
+ struct btrfs_transaction *running_transaction;
+ wait_queue_head_t transaction_throttle;
+ wait_queue_head_t transaction_wait;
+ wait_queue_head_t async_submit_wait;
+
+ struct btrfs_super_block super_copy;
+ struct btrfs_super_block super_for_commit;
+ struct block_device *__bdev;
+ struct super_block *sb;
+ struct inode *btree_inode;
+ struct backing_dev_info bdi;
+ struct mutex trans_mutex;
+ struct mutex tree_log_mutex;
+ struct mutex transaction_kthread_mutex;
+ struct mutex cleaner_mutex;
+ struct mutex chunk_mutex;
+ struct mutex volume_mutex;
+ /*
+ * this protects the ordered operations list only while we are
+ * processing all of the entries on it. This way we make
+ * sure the commit code doesn't find the list temporarily empty
+ * because another function happens to be doing non-waiting preflush
+ * before jumping into the main commit.
+ */
+ struct mutex ordered_operations_mutex;
+ struct rw_semaphore extent_commit_sem;
+
+ struct rw_semaphore cleanup_work_sem;
+
+ struct rw_semaphore subvol_sem;
+ struct srcu_struct subvol_srcu;
+
+ struct list_head trans_list;
+ struct list_head hashers;
+ struct list_head dead_roots;
+ struct list_head caching_block_groups;
+
+ spinlock_t delayed_iput_lock;
+ struct list_head delayed_iputs;
+
+ atomic_t nr_async_submits;
+ atomic_t async_submit_draining;
+ atomic_t nr_async_bios;
+ atomic_t async_delalloc_pages;
+
+ /*
+ * this is used by the balancing code to wait for all the pending
+ * ordered extents
+ */
+ spinlock_t ordered_extent_lock;
+
+ /*
+ * all of the data=ordered extents pending writeback
+ * these can span multiple transactions and basically include
+ * every dirty data page that isn't from nodatacow
+ */
+ struct list_head ordered_extents;
+
+ /*
+ * all of the inodes that have delalloc bytes. It is possible for
+ * this list to be empty even when there is still dirty data=ordered
+ * extents waiting to finish IO.
+ */
+ struct list_head delalloc_inodes;
+
+ /*
+ * special rename and truncate targets that must be on disk before
+ * we're allowed to commit. This is basically the ext3 style
+ * data=ordered list.
+ */
+ struct list_head ordered_operations;
+
+ /*
+ * there is a pool of worker threads for checksumming during writes
+ * and a pool for checksumming after reads. This is because readers
+ * can run with FS locks held, and the writers may be waiting for
+ * those locks. We don't want ordering in the pending list to cause
+ * deadlocks, and so the two are serviced separately.
+ *
+ * A third pool does submit_bio to avoid deadlocking with the other
+ * two
+ */
+ struct btrfs_workers generic_worker;
+ struct btrfs_workers workers;
+ struct btrfs_workers delalloc_workers;
+ struct btrfs_workers endio_workers;
+ struct btrfs_workers endio_meta_workers;
+ struct btrfs_workers endio_meta_write_workers;
+ struct btrfs_workers endio_write_workers;
+ struct btrfs_workers submit_workers;
+ struct btrfs_workers enospc_workers;
+ /*
+ * fixup workers take dirty pages that didn't properly go through
+ * the cow mechanism and make them safe to write. It happens
+ * for the sys_munmap function call path
+ */
+ struct btrfs_workers fixup_workers;
+ struct task_struct *transaction_kthread;
+ struct task_struct *cleaner_kthread;
+ int thread_pool_size;
+
+ struct kobject super_kobj;
+ struct completion kobj_unregister;
+ int do_barriers;
+ int closing;
+ int log_root_recovering;
+
+ u64 total_pinned;
+
+ /* protected by the delalloc lock, used to keep from writing
+ * metadata until there is a nice batch
+ */
+ u64 dirty_metadata_bytes;
+ struct list_head dirty_cowonly_roots;
+
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * the space_info list is almost entirely read only. It only changes
+ * when we add a new raid type to the FS, and that happens
+ * very rarely. RCU is used to protect it.
+ */
+ struct list_head space_info;
+
+ struct reloc_control *reloc_ctl;
+
+ spinlock_t delalloc_lock;
+ spinlock_t new_trans_lock;
+ u64 delalloc_bytes;
+
+ /* data_alloc_cluster is only used in ssd mode */
+ struct btrfs_free_cluster data_alloc_cluster;
+
+ /* all metadata allocations go through this cluster */
+ struct btrfs_free_cluster meta_alloc_cluster;
+
+ spinlock_t ref_cache_lock;
+ u64 total_ref_cache_size;
+
+ u64 avail_data_alloc_bits;
+ u64 avail_metadata_alloc_bits;
+ u64 avail_system_alloc_bits;
+ u64 data_alloc_profile;
+ u64 metadata_alloc_profile;
+ u64 system_alloc_profile;
+
+ unsigned data_chunk_allocations;
+ unsigned metadata_ratio;
+
+ void *bdev_holder;
+};
+
+/*
+ * in ram representation of the tree. extent_root is used for all allocations
+ * and for the extent tree extent_root root.
+ */
+struct btrfs_root {
+ struct extent_buffer *node;
+
+ /* the node lock is held while changing the node pointer */
+ spinlock_t node_lock;
+
+ struct extent_buffer *commit_root;
+ struct btrfs_root *log_root;
+ struct btrfs_root *reloc_root;
+
+ struct btrfs_root_item root_item;
+ struct btrfs_key root_key;
+ struct btrfs_fs_info *fs_info;
+ struct extent_io_tree dirty_log_pages;
+
+ struct kobject root_kobj;
+ struct completion kobj_unregister;
+ struct mutex objectid_mutex;
+
+ struct mutex log_mutex;
+ wait_queue_head_t log_writer_wait;
+ wait_queue_head_t log_commit_wait[2];
+ atomic_t log_writers;
+ atomic_t log_commit[2];
+ unsigned long log_transid;
+ unsigned long last_log_commit;
+ unsigned long log_batch;
+ pid_t log_start_pid;
+ bool log_multiple_pids;
+
+ u64 objectid;
+ u64 last_trans;
+
+ /* data allocations are done in sectorsize units */
+ u32 sectorsize;
+
+ /* node allocations are done in nodesize units */
+ u32 nodesize;
+
+ /* leaf allocations are done in leafsize units */
+ u32 leafsize;
+
+ u32 stripesize;
+
+ u32 type;
+
+ u64 highest_objectid;
+ int ref_cows;
+ int track_dirty;
+ int in_radix;
+ int clean_orphans;
+
+ u64 defrag_trans_start;
+ struct btrfs_key defrag_progress;
+ struct btrfs_key defrag_max;
+ int defrag_running;
+ char *name;
+ int in_sysfs;
+
+ /* the dirty list is only used by non-reference counted roots */
+ struct list_head dirty_list;
+
+ struct list_head root_list;
+
+ spinlock_t list_lock;
+ struct list_head orphan_list;
+
+ spinlock_t inode_lock;
+ /* red-black tree that keeps track of in-memory inodes */
+ struct rb_root inode_tree;
+
+ /*
+ * right now this just gets used so that a root has its own devid
+ * for stat. It may be used for more later
+ */
+ struct super_block anon_super;
+};
+
+/*
+ * inode items have the data typically returned from stat and store other
+ * info about object characteristics. There is one for every file and dir in
+ * the FS
+ */
+#define BTRFS_INODE_ITEM_KEY 1
+#define BTRFS_INODE_REF_KEY 12
+#define BTRFS_XATTR_ITEM_KEY 24
+#define BTRFS_ORPHAN_ITEM_KEY 48
+/* reserve 2-15 close to the inode for later flexibility */
+
+/*
+ * dir items are the name -> inode pointers in a directory. There is one
+ * for every name in a directory.
+ */
+#define BTRFS_DIR_LOG_ITEM_KEY 60
+#define BTRFS_DIR_LOG_INDEX_KEY 72
+#define BTRFS_DIR_ITEM_KEY 84
+#define BTRFS_DIR_INDEX_KEY 96
+/*
+ * extent data is for file data
+ */
+#define BTRFS_EXTENT_DATA_KEY 108
+
+/*
+ * extent csums are stored in a separate tree and hold csums for
+ * an entire extent on disk.
+ */
+#define BTRFS_EXTENT_CSUM_KEY 128
+
+/*
+ * root items point to tree roots. They are typically in the root
+ * tree used by the super block to find all the other trees
+ */
+#define BTRFS_ROOT_ITEM_KEY 132
+
+/*
+ * root backrefs tie subvols and snapshots to the directory entries that
+ * reference them
+ */
+#define BTRFS_ROOT_BACKREF_KEY 144
+
+/*
+ * root refs make a fast index for listing all of the snapshots and
+ * subvolumes referenced by a given root. They point directly to the
+ * directory item in the root that references the subvol
+ */
+#define BTRFS_ROOT_REF_KEY 156
+
+/*
+ * extent items are in the extent map tree. These record which blocks
+ * are used, and how many references there are to each block
+ */
+#define BTRFS_EXTENT_ITEM_KEY 168
+
+#define BTRFS_TREE_BLOCK_REF_KEY 176
+
+#define BTRFS_EXTENT_DATA_REF_KEY 178
+
+#define BTRFS_EXTENT_REF_V0_KEY 180
+
+#define BTRFS_SHARED_BLOCK_REF_KEY 182
+
+#define BTRFS_SHARED_DATA_REF_KEY 184
+
+/*
+ * block groups give us hints into the extent allocation trees. Which
+ * blocks are free etc etc
+ */
+#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
+
+#define BTRFS_DEV_EXTENT_KEY 204
+#define BTRFS_DEV_ITEM_KEY 216
+#define BTRFS_CHUNK_ITEM_KEY 228
+
+/*
+ * string items are for debugging. They just store a short string of
+ * data in the FS
+ */
+#define BTRFS_STRING_ITEM_KEY 253
+
+#define BTRFS_MOUNT_NODATASUM (1 << 0)
+#define BTRFS_MOUNT_NODATACOW (1 << 1)
+#define BTRFS_MOUNT_NOBARRIER (1 << 2)
+#define BTRFS_MOUNT_SSD (1 << 3)
+#define BTRFS_MOUNT_DEGRADED (1 << 4)
+#define BTRFS_MOUNT_COMPRESS (1 << 5)
+#define BTRFS_MOUNT_NOTREELOG (1 << 6)
+#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
+#define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
+#define BTRFS_MOUNT_NOSSD (1 << 9)
+#define BTRFS_MOUNT_DISCARD (1 << 10)
+#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11)
+
+#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
+#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
+ BTRFS_MOUNT_##opt)
+/*
+ * Inode flags
+ */
+#define BTRFS_INODE_NODATASUM (1 << 0)
+#define BTRFS_INODE_NODATACOW (1 << 1)
+#define BTRFS_INODE_READONLY (1 << 2)
+#define BTRFS_INODE_NOCOMPRESS (1 << 3)
+#define BTRFS_INODE_PREALLOC (1 << 4)
+#define BTRFS_INODE_SYNC (1 << 5)
+#define BTRFS_INODE_IMMUTABLE (1 << 6)
+#define BTRFS_INODE_APPEND (1 << 7)
+#define BTRFS_INODE_NODUMP (1 << 8)
+#define BTRFS_INODE_NOATIME (1 << 9)
+#define BTRFS_INODE_DIRSYNC (1 << 10)
+
+
+/* some macros to generate set/get funcs for the struct fields. This
+ * assumes there is a lefoo_to_cpu for every type, so lets make a simple
+ * one for u8:
+ */
+#define le8_to_cpu(v) (v)
+#define cpu_to_le8(v) (v)
+#define __le8 u8
+
+#define read_eb_member(eb, ptr, type, member, result) ( \
+ read_extent_buffer(eb, (char *)(result), \
+ ((unsigned long)(ptr)) + \
+ offsetof(type, member), \
+ sizeof(((type *)0)->member)))
+
+#define write_eb_member(eb, ptr, type, member, result) ( \
+ write_extent_buffer(eb, (char *)(result), \
+ ((unsigned long)(ptr)) + \
+ offsetof(type, member), \
+ sizeof(((type *)0)->member)))
+
+#ifndef BTRFS_SETGET_FUNCS
+#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
+u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
+void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
+#endif
+
+#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
+static inline u##bits btrfs_##name(struct extent_buffer *eb) \
+{ \
+ type *p = kmap_atomic(eb->first_page, KM_USER0); \
+ u##bits res = le##bits##_to_cpu(p->member); \
+ kunmap_atomic(p, KM_USER0); \
+ return res; \
+} \
+static inline void btrfs_set_##name(struct extent_buffer *eb, \
+ u##bits val) \
+{ \
+ type *p = kmap_atomic(eb->first_page, KM_USER0); \
+ p->member = cpu_to_le##bits(val); \
+ kunmap_atomic(p, KM_USER0); \
+}
+
+#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
+static inline u##bits btrfs_##name(type *s) \
+{ \
+ return le##bits##_to_cpu(s->member); \
+} \
+static inline void btrfs_set_##name(type *s, u##bits val) \
+{ \
+ s->member = cpu_to_le##bits(val); \
+}
+
+BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
+BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
+BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
+BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
+BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item,
+ start_offset, 64);
+BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
+BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
+BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
+BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
+BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
+ io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
+ io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
+ sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
+ dev_group, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
+ seek_speed, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
+ bandwidth, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
+ generation, 64);
+
+static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
+{
+ return (char *)d + offsetof(struct btrfs_dev_item, uuid);
+}
+
+static inline char *btrfs_device_fsid(struct btrfs_dev_item *d)
+{
+ return (char *)d + offsetof(struct btrfs_dev_item, fsid);
+}
+
+BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
+BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
+BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
+BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
+BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
+BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
+BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
+BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
+
+static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
+{
+ return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
+}
+
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
+ stripe_len, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
+ io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
+ io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
+ sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
+ num_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
+ sub_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
+
+static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
+ int nr)
+{
+ unsigned long offset = (unsigned long)c;
+ offset += offsetof(struct btrfs_chunk, stripe);
+ offset += nr * sizeof(struct btrfs_stripe);
+ return (struct btrfs_stripe *)offset;
+}
+
+static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
+}
+
+static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
+}
+
+static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr,
+ u64 val)
+{
+ btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
+}
+
+static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
+}
+
+static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr,
+ u64 val)
+{
+ btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
+}
+
+/* struct btrfs_block_group_item */
+BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
+ used, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
+ used, 64);
+BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
+ struct btrfs_block_group_item, chunk_objectid, 64);
+
+BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
+ struct btrfs_block_group_item, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_flags,
+ struct btrfs_block_group_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(block_group_flags,
+ struct btrfs_block_group_item, flags, 64);
+
+/* struct btrfs_inode_ref */
+BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
+BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
+
+/* struct btrfs_inode_item */
+BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
+BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
+BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
+BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
+BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
+BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
+
+static inline struct btrfs_timespec *
+btrfs_inode_atime(struct btrfs_inode_item *inode_item)
+{
+ unsigned long ptr = (unsigned long)inode_item;
+ ptr += offsetof(struct btrfs_inode_item, atime);
+ return (struct btrfs_timespec *)ptr;
+}
+
+static inline struct btrfs_timespec *
+btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
+{
+ unsigned long ptr = (unsigned long)inode_item;
+ ptr += offsetof(struct btrfs_inode_item, mtime);
+ return (struct btrfs_timespec *)ptr;
+}
+
+static inline struct btrfs_timespec *
+btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
+{
+ unsigned long ptr = (unsigned long)inode_item;
+ ptr += offsetof(struct btrfs_inode_item, ctime);
+ return (struct btrfs_timespec *)ptr;
+}
+
+static inline struct btrfs_timespec *
+btrfs_inode_otime(struct btrfs_inode_item *inode_item)
+{
+ unsigned long ptr = (unsigned long)inode_item;
+ ptr += offsetof(struct btrfs_inode_item, otime);
+ return (struct btrfs_timespec *)ptr;
+}
+
+BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
+
+/* struct btrfs_dev_extent */
+BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
+ chunk_tree, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
+ chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
+ chunk_offset, 64);
+BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
+
+static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
+{
+ unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
+ return (u8 *)((unsigned long)dev + ptr);
+}
+
+BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
+BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
+
+BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32);
+
+
+BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);
+
+static inline void btrfs_tree_block_key(struct extent_buffer *eb,
+ struct btrfs_tree_block_info *item,
+ struct btrfs_disk_key *key)
+{
+ read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
+}
+
+static inline void btrfs_set_tree_block_key(struct extent_buffer *eb,
+ struct btrfs_tree_block_info *item,
+ struct btrfs_disk_key *key)
+{
+ write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
+}
+
+BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref,
+ root, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref,
+ objectid, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref,
+ offset, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref,
+ count, 32);
+
+BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref,
+ count, 32);
+
+BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
+ type, 8);
+BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
+ offset, 64);
+
+static inline u32 btrfs_extent_inline_ref_size(int type)
+{
+ if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+ type == BTRFS_SHARED_BLOCK_REF_KEY)
+ return sizeof(struct btrfs_extent_inline_ref);
+ if (type == BTRFS_SHARED_DATA_REF_KEY)
+ return sizeof(struct btrfs_shared_data_ref) +
+ sizeof(struct btrfs_extent_inline_ref);
+ if (type == BTRFS_EXTENT_DATA_REF_KEY)
+ return sizeof(struct btrfs_extent_data_ref) +
+ offsetof(struct btrfs_extent_inline_ref, offset);
+ BUG();
+ return 0;
+}
+
+BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64);
+BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0,
+ generation, 64);
+BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64);
+BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32);
+
+/* struct btrfs_node */
+BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
+BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
+
+static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
+{
+ unsigned long ptr;
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
+}
+
+static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
+ int nr, u64 val)
+{
+ unsigned long ptr;
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
+}
+
+static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
+{
+ unsigned long ptr;
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
+}
+
+static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb,
+ int nr, u64 val)
+{
+ unsigned long ptr;
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
+}
+
+static inline unsigned long btrfs_node_key_ptr_offset(int nr)
+{
+ return offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+}
+
+void btrfs_node_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr);
+
+static inline void btrfs_set_node_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ unsigned long ptr;
+ ptr = btrfs_node_key_ptr_offset(nr);
+ write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
+ struct btrfs_key_ptr, key, disk_key);
+}
+
+/* struct btrfs_item */
+BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
+
+static inline unsigned long btrfs_item_nr_offset(int nr)
+{
+ return offsetof(struct btrfs_leaf, items) +
+ sizeof(struct btrfs_item) * nr;
+}
+
+static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
+ int nr)
+{
+ return (struct btrfs_item *)btrfs_item_nr_offset(nr);
+}
+
+static inline u32 btrfs_item_end(struct extent_buffer *eb,
+ struct btrfs_item *item)
+{
+ return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
+}
+
+static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
+{
+ return btrfs_item_end(eb, btrfs_item_nr(eb, nr));
+}
+
+static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
+{
+ return btrfs_item_offset(eb, btrfs_item_nr(eb, nr));
+}
+
+static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
+{
+ return btrfs_item_size(eb, btrfs_item_nr(eb, nr));
+}
+
+static inline void btrfs_item_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ struct btrfs_item *item = btrfs_item_nr(eb, nr);
+ read_eb_member(eb, item, struct btrfs_item, key, disk_key);
+}
+
+static inline void btrfs_set_item_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ struct btrfs_item *item = btrfs_item_nr(eb, nr);
+ write_eb_member(eb, item, struct btrfs_item, key, disk_key);
+}
+
+BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
+
+/*
+ * struct btrfs_root_ref
+ */
+BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
+BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
+BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
+
+/* struct btrfs_dir_item */
+BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
+BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
+BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
+BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
+
+static inline void btrfs_dir_item_key(struct extent_buffer *eb,
+ struct btrfs_dir_item *item,
+ struct btrfs_disk_key *key)
+{
+ read_eb_member(eb, item, struct btrfs_dir_item, location, key);
+}
+
+static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
+ struct btrfs_dir_item *item,
+ struct btrfs_disk_key *key)
+{
+ write_eb_member(eb, item, struct btrfs_dir_item, location, key);
+}
+
+/* struct btrfs_disk_key */
+BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
+ objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
+
+static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
+ struct btrfs_disk_key *disk)
+{
+ cpu->offset = le64_to_cpu(disk->offset);
+ cpu->type = disk->type;
+ cpu->objectid = le64_to_cpu(disk->objectid);
+}
+
+static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
+ struct btrfs_key *cpu)
+{
+ disk->offset = cpu_to_le64(cpu->offset);
+ disk->type = cpu->type;
+ disk->objectid = cpu_to_le64(cpu->objectid);
+}
+
+static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
+ struct btrfs_key *key, int nr)
+{
+ struct btrfs_disk_key disk_key;
+ btrfs_node_key(eb, &disk_key, nr);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
+ struct btrfs_key *key, int nr)
+{
+ struct btrfs_disk_key disk_key;
+ btrfs_item_key(eb, &disk_key, nr);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
+ struct btrfs_dir_item *item,
+ struct btrfs_key *key)
+{
+ struct btrfs_disk_key disk_key;
+ btrfs_dir_item_key(eb, item, &disk_key);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+
+static inline u8 btrfs_key_type(struct btrfs_key *key)
+{
+ return key->type;
+}
+
+static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
+{
+ key->type = val;
+}
+
+/* struct btrfs_header */
+BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
+ generation, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
+BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
+
+static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
+{
+ return (btrfs_header_flags(eb) & flag) == flag;
+}
+
+static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
+{
+ u64 flags = btrfs_header_flags(eb);
+ btrfs_set_header_flags(eb, flags | flag);
+ return (flags & flag) == flag;
+}
+
+static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
+{
+ u64 flags = btrfs_header_flags(eb);
+ btrfs_set_header_flags(eb, flags & ~flag);
+ return (flags & flag) == flag;
+}
+
+static inline int btrfs_header_backref_rev(struct extent_buffer *eb)
+{
+ u64 flags = btrfs_header_flags(eb);
+ return flags >> BTRFS_BACKREF_REV_SHIFT;
+}
+
+static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
+ int rev)
+{
+ u64 flags = btrfs_header_flags(eb);
+ flags &= ~BTRFS_BACKREF_REV_MASK;
+ flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT;
+ btrfs_set_header_flags(eb, flags);
+}
+
+static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
+{
+ unsigned long ptr = offsetof(struct btrfs_header, fsid);
+ return (u8 *)ptr;
+}
+
+static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
+{
+ unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid);
+ return (u8 *)ptr;
+}
+
+static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
+{
+ unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
+ return (u8 *)ptr;
+}
+
+static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
+{
+ unsigned long ptr = offsetof(struct btrfs_header, csum);
+ return (u8 *)ptr;
+}
+
+static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb)
+{
+ return NULL;
+}
+
+static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb)
+{
+ return NULL;
+}
+
+static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
+{
+ return NULL;
+}
+
+static inline int btrfs_is_leaf(struct extent_buffer *eb)
+{
+ return btrfs_header_level(eb) == 0;
+}
+
+/* struct btrfs_root_item */
+BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
+BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
+BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
+ last_snapshot, 64);
+
+/* struct btrfs_super_block */
+
+BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
+ struct btrfs_super_block, sys_chunk_array_size, 32);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
+ struct btrfs_super_block, chunk_root_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
+ root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
+ chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
+ chunk_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
+ log_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
+ log_root_transid, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
+ log_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
+ sectorsize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
+ nodesize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
+ leafsize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
+ stripesize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
+ root_dir_objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
+ num_devices, 64);
+BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
+ compat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
+ compat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
+ incompat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
+ csum_type, 16);
+
+static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
+{
+ int t = btrfs_super_csum_type(s);
+ BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes));
+ return btrfs_csum_sizes[t];
+}
+
+static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
+{
+ return offsetof(struct btrfs_leaf, items);
+}
+
+/* struct btrfs_file_extent_item */
+BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
+
+static inline unsigned long
+btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
+{
+ unsigned long offset = (unsigned long)e;
+ offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
+ return offset;
+}
+
+static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
+{
+ return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
+}
+
+BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
+ disk_bytenr, 64);
+BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
+ disk_num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
+ offset, 64);
+BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
+ num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+ ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+ compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+ encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+ other_encoding, 16);
+
+/* this returns the number of file bytes represented by the inline item.
+ * If an item is compressed, this is the uncompressed size
+ */
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+ struct btrfs_file_extent_item *e)
+{
+ return btrfs_file_extent_ram_bytes(eb, e);
+}
+
+/*
+ * this returns the number of bytes used by the item on disk, minus the
+ * size of any extent headers. If a file is compressed on disk, this is
+ * the compressed size
+ */
+static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
+ struct btrfs_item *e)
+{
+ unsigned long offset;
+ offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
+ return btrfs_item_size(eb, e) - offset;
+}
+
+static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+static inline int btrfs_set_root_name(struct btrfs_root *root,
+ const char *name, int len)
+{
+ /* if we already have a name just free it */
+ kfree(root->name);
+
+ root->name = kmalloc(len+1, GFP_KERNEL);
+ if (!root->name)
+ return -ENOMEM;
+
+ memcpy(root->name, name, len);
+ root->name[len] = '\0';
+
+ return 0;
+}
+
+static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
+{
+ if (level == 0)
+ return root->leafsize;
+ return root->nodesize;
+}
+
+/* helper function to cast into the data area of the leaf. */
+#define btrfs_item_ptr(leaf, slot, type) \
+ ((type *)(btrfs_leaf_data(leaf) + \
+ btrfs_item_offset_nr(leaf, slot)))
+
+#define btrfs_item_ptr_offset(leaf, slot) \
+ ((unsigned long)(btrfs_leaf_data(leaf) + \
+ btrfs_item_offset_nr(leaf, slot)))
+
+static inline struct dentry *fdentry(struct file *file)
+{
+ return file->f_path.dentry;
+}
+
+/* extent-tree.c */
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, unsigned long count);
+int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_pin_extent(struct btrfs_root *root,
+ u64 bytenr, u64 num, int reserved);
+int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *leaf);
+int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 objectid, u64 offset, u64 bytenr);
+int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+ struct btrfs_fs_info *info,
+ u64 bytenr);
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
+u64 btrfs_find_block_group(struct btrfs_root *root,
+ u64 search_start, u64 search_hint, int owner);
+struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u32 blocksize,
+ u64 parent, u64 root_objectid,
+ struct btrfs_disk_key *key, int level,
+ u64 hint, u64 empty_size);
+int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u32 blocksize,
+ u64 parent, u64 root_objectid, int level);
+struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u32 blocksize,
+ int level);
+int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 root_objectid, u64 owner,
+ u64 offset, struct btrfs_key *ins);
+int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 root_objectid, u64 owner, u64 offset,
+ struct btrfs_key *ins);
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 num_bytes, u64 min_alloc_size,
+ u64 empty_size, u64 hint_byte,
+ u64 search_end, struct btrfs_key *ins,
+ u64 data);
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, int full_backref);
+int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, int full_backref);
+int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 flags,
+ int is_data);
+int btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 owner, u64 offset);
+
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 owner, u64 offset);
+
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
+int btrfs_free_block_groups(struct btrfs_fs_info *info);
+int btrfs_read_block_groups(struct btrfs_root *root);
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr);
+int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytes_used,
+ u64 type, u64 chunk_objectid, u64 chunk_offset,
+ u64 size);
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 group_start);
+int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
+ struct btrfs_block_group_cache *group);
+
+u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
+int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items);
+int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items);
+int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+ u64 bytes);
+void btrfs_free_reserved_data_space(struct btrfs_root *root,
+ struct inode *inode, u64 bytes);
+void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
+ u64 bytes);
+void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
+ u64 bytes);
+/* ctree.c */
+int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
+ int level, int *slot);
+int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
+int btrfs_previous_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 min_objectid,
+ int type);
+int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *new_key);
+struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
+int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *key, int lowest_level,
+ int cache_only, u64 min_trans);
+int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+ struct btrfs_key *max_key,
+ struct btrfs_path *path, int cache_only,
+ u64 min_trans);
+int btrfs_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret);
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer **cow_ret, u64 new_root_objectid);
+int btrfs_block_can_be_shared(struct btrfs_root *root,
+ struct extent_buffer *buf);
+int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, u32 data_size);
+int btrfs_truncate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u32 new_size, int from_end);
+int btrfs_split_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key,
+ unsigned long split_offset);
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key);
+int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, struct btrfs_path *p, int
+ ins_len, int cow);
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *parent,
+ int start_slot, int cache_only, u64 *last_ret,
+ struct btrfs_key *progress);
+void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
+struct btrfs_path *btrfs_alloc_path(void);
+void btrfs_free_path(struct btrfs_path *p);
+void btrfs_set_path_blocking(struct btrfs_path *p);
+void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
+
+int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int slot, int nr);
+static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path)
+{
+ return btrfs_del_items(trans, root, path, path->slots[0], 1);
+}
+
+int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, void *data, u32 data_size);
+int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ int nr);
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size, int nr);
+
+static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ u32 data_size)
+{
+ return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
+}
+
+int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref);
+int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *node,
+ struct extent_buffer *parent);
+/* root-item.c */
+int btrfs_find_root_ref(struct btrfs_root *tree_root,
+ struct btrfs_path *path,
+ u64 root_id, u64 ref_id);
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *tree_root,
+ u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
+ const char *name, int name_len);
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *tree_root,
+ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
+ const char *name, int name_len);
+int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_key *key);
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, struct btrfs_root_item
+ *item);
+int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, struct btrfs_root_item
+ *item);
+int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
+ btrfs_root_item *item, struct btrfs_key *key);
+int btrfs_search_root(struct btrfs_root *root, u64 search_start,
+ u64 *found_objectid);
+int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
+int btrfs_set_root_node(struct btrfs_root_item *item,
+ struct extent_buffer *node);
+/* dir-item.c */
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *name,
+ int name_len, u64 dir,
+ struct btrfs_key *location, u8 type, u64 index);
+struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const char *name, int name_len,
+ int mod);
+struct btrfs_dir_item *
+btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ u64 objectid, const char *name, int name_len,
+ int mod);
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 dirid,
+ const char *name, int name_len);
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len);
+int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_dir_item *di);
+int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ const char *name, u16 name_len,
+ const void *data, u16 data_len);
+struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const char *name, u16 name_len,
+ int mod);
+
+/* orphan.c */
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset);
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset);
+int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
+
+/* inode-map.c */
+int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
+ struct btrfs_root *fs_root,
+ u64 dirid, u64 *objectid);
+int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
+
+/* inode-item.c */
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 index);
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 *index);
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid);
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path,
+ struct btrfs_key *location, int mod);
+
+/* file-item.c */
+int btrfs_del_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr, u64 len);
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u32 *dst);
+int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 objectid, u64 pos,
+ u64 disk_offset, u64 disk_num_bytes,
+ u64 num_bytes, u64 offset, u64 ram_bytes,
+ u8 compression, u8 encryption, u16 other_encoding);
+int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ u64 bytenr, int mod);
+int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_ordered_sum *sums);
+int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u64 file_start, int contig);
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+ u64 start, unsigned long len);
+struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, int cow);
+int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ u64 isize);
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
+ u64 end, struct list_head *list);
+/* inode.c */
+
+/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
+#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
+#define ClearPageChecked ClearPageFsMisc
+#define SetPageChecked SetPageFsMisc
+#define PageChecked PageFsMisc
+#endif
+
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
+int btrfs_set_inode_index(struct inode *dir, u64 *index);
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, struct inode *inode,
+ const char *name, int name_len);
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+ struct inode *parent_inode, struct inode *inode,
+ const char *name, int name_len, int add_backref, u64 index);
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, u64 objectid,
+ const char *name, int name_len);
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode, u64 new_size,
+ u32 min_type);
+
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
+int btrfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc);
+int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *new_root,
+ u64 new_dirid, u64 alloc_hint);
+int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+ size_t size, struct bio *bio, unsigned long bio_flags);
+
+unsigned long btrfs_force_ra(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *file,
+ pgoff_t offset, pgoff_t last_index);
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+int btrfs_readpage(struct file *file, struct page *page);
+void btrfs_delete_inode(struct inode *inode);
+void btrfs_put_inode(struct inode *inode);
+int btrfs_write_inode(struct inode *inode, int wait);
+void btrfs_dirty_inode(struct inode *inode);
+struct inode *btrfs_alloc_inode(struct super_block *sb);
+void btrfs_destroy_inode(struct inode *inode);
+void btrfs_drop_inode(struct inode *inode);
+int btrfs_init_cachep(void);
+void btrfs_destroy_cachep(void);
+long btrfs_ioctl_trans_end(struct file *file);
+struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+ struct btrfs_root *root);
+int btrfs_commit_write(struct file *file, struct page *page,
+ unsigned from, unsigned to);
+struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
+ size_t page_offset, u64 start, u64 end,
+ int create);
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode);
+int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
+int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
+void btrfs_orphan_cleanup(struct btrfs_root *root);
+int btrfs_cont_expand(struct inode *inode, loff_t size);
+int btrfs_invalidate_inodes(struct btrfs_root *root);
+void btrfs_add_delayed_iput(struct inode *inode);
+void btrfs_run_delayed_iputs(struct btrfs_root *root);
+extern const struct dentry_operations btrfs_dentry_operations;
+
+/* ioctl.c */
+long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+void btrfs_update_iflags(struct inode *inode);
+void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
+
+/* file.c */
+int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
+int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ int skip_pinned);
+int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
+extern const struct file_operations btrfs_file_operations;
+int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
+ u64 start, u64 end, u64 *hint_byte, int drop_cache);
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 start, u64 end);
+int btrfs_release_file(struct inode *inode, struct file *file);
+
+/* tree-defrag.c */
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int cache_only);
+
+/* sysfs.c */
+int btrfs_init_sysfs(void);
+void btrfs_exit_sysfs(void);
+int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
+int btrfs_sysfs_add_root(struct btrfs_root *root);
+void btrfs_sysfs_del_root(struct btrfs_root *root);
+void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
+
+/* xattr.c */
+ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+
+/* super.c */
+u64 btrfs_parse_size(char *str);
+int btrfs_parse_options(struct btrfs_root *root, char *options);
+int btrfs_sync_fs(struct super_block *sb, int wait);
+
+/* acl.c */
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+int btrfs_check_acl(struct inode *inode, int mask);
+#else
+#define btrfs_check_acl NULL
+#endif
+int btrfs_init_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir);
+int btrfs_acl_chmod(struct inode *inode);
+
+/* relocation.c */
+int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_recover_relocation(struct btrfs_root *root);
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644
index 00000000000..84e6781413b
--- /dev/null
+++ b/fs/btrfs/delayed-ref.c
@@ -0,0 +1,919 @@
+/*
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/sort.h>
+#include "ctree.h"
+#include "delayed-ref.h"
+#include "transaction.h"
+
+/*
+ * delayed back reference update tracking. For subvolume trees
+ * we queue up extent allocations and backref maintenance for
+ * delayed processing. This avoids deep call chains where we
+ * add extents in the middle of btrfs_search_slot, and it allows
+ * us to buffer up frequently modified backrefs in an rb tree instead
+ * of hammering updates on the extent allocation tree.
+ */
+
+/*
+ * compare two delayed tree backrefs with same bytenr and type
+ */
+static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
+ struct btrfs_delayed_tree_ref *ref1)
+{
+ if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
+ if (ref1->root < ref2->root)
+ return -1;
+ if (ref1->root > ref2->root)
+ return 1;
+ } else {
+ if (ref1->parent < ref2->parent)
+ return -1;
+ if (ref1->parent > ref2->parent)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * compare two delayed data backrefs with same bytenr and type
+ */
+static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
+ struct btrfs_delayed_data_ref *ref1)
+{
+ if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
+ if (ref1->root < ref2->root)
+ return -1;
+ if (ref1->root > ref2->root)
+ return 1;
+ if (ref1->objectid < ref2->objectid)
+ return -1;
+ if (ref1->objectid > ref2->objectid)
+ return 1;
+ if (ref1->offset < ref2->offset)
+ return -1;
+ if (ref1->offset > ref2->offset)
+ return 1;
+ } else {
+ if (ref1->parent < ref2->parent)
+ return -1;
+ if (ref1->parent > ref2->parent)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * entries in the rb tree are ordered by the byte number of the extent,
+ * type of the delayed backrefs and content of delayed backrefs.
+ */
+static int comp_entry(struct btrfs_delayed_ref_node *ref2,
+ struct btrfs_delayed_ref_node *ref1)
+{
+ if (ref1->bytenr < ref2->bytenr)
+ return -1;
+ if (ref1->bytenr > ref2->bytenr)
+ return 1;
+ if (ref1->is_head && ref2->is_head)
+ return 0;
+ if (ref2->is_head)
+ return -1;
+ if (ref1->is_head)
+ return 1;
+ if (ref1->type < ref2->type)
+ return -1;
+ if (ref1->type > ref2->type)
+ return 1;
+ if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
+ return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
+ btrfs_delayed_node_to_tree_ref(ref1));
+ } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
+ return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
+ btrfs_delayed_node_to_data_ref(ref1));
+ }
+ BUG();
+ return 0;
+}
+
+/*
+ * insert a new ref into the rbtree. This returns any existing refs
+ * for the same (bytenr,parent) tuple, or NULL if the new node was properly
+ * inserted.
+ */
+static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent_node = NULL;
+ struct btrfs_delayed_ref_node *entry;
+ struct btrfs_delayed_ref_node *ins;
+ int cmp;
+
+ ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ while (*p) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
+ rb_node);
+
+ cmp = comp_entry(entry, ins);
+ if (cmp < 0)
+ p = &(*p)->rb_left;
+ else if (cmp > 0)
+ p = &(*p)->rb_right;
+ else
+ return entry;
+ }
+
+ rb_link_node(node, parent_node, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+/*
+ * find an head entry based on bytenr. This returns the delayed ref
+ * head if it was able to find one, or NULL if nothing was in that spot
+ */
+static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
+ u64 bytenr,
+ struct btrfs_delayed_ref_node **last)
+{
+ struct rb_node *n = root->rb_node;
+ struct btrfs_delayed_ref_node *entry;
+ int cmp;
+
+ while (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+ WARN_ON(!entry->in_tree);
+ if (last)
+ *last = entry;
+
+ if (bytenr < entry->bytenr)
+ cmp = -1;
+ else if (bytenr > entry->bytenr)
+ cmp = 1;
+ else if (!btrfs_delayed_ref_is_head(entry))
+ cmp = 1;
+ else
+ cmp = 0;
+
+ if (cmp < 0)
+ n = n->rb_left;
+ else if (cmp > 0)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ assert_spin_locked(&delayed_refs->lock);
+ if (mutex_trylock(&head->mutex))
+ return 0;
+
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ mutex_lock(&head->mutex);
+ spin_lock(&delayed_refs->lock);
+ if (!head->node.in_tree) {
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ return -EAGAIN;
+ }
+ btrfs_put_delayed_ref(&head->node);
+ return 0;
+}
+
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+ struct list_head *cluster, u64 start)
+{
+ int count = 0;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct rb_node *node;
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_head *head;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ if (start == 0) {
+ node = rb_first(&delayed_refs->root);
+ } else {
+ ref = NULL;
+ find_ref_head(&delayed_refs->root, start, &ref);
+ if (ref) {
+ struct btrfs_delayed_ref_node *tmp;
+
+ node = rb_prev(&ref->rb_node);
+ while (node) {
+ tmp = rb_entry(node,
+ struct btrfs_delayed_ref_node,
+ rb_node);
+ if (tmp->bytenr < start)
+ break;
+ ref = tmp;
+ node = rb_prev(&ref->rb_node);
+ }
+ node = &ref->rb_node;
+ } else
+ node = rb_first(&delayed_refs->root);
+ }
+again:
+ while (node && count < 32) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ if (btrfs_delayed_ref_is_head(ref)) {
+ head = btrfs_delayed_node_to_head(ref);
+ if (list_empty(&head->cluster)) {
+ list_add_tail(&head->cluster, cluster);
+ delayed_refs->run_delayed_start =
+ head->node.bytenr;
+ count++;
+
+ WARN_ON(delayed_refs->num_heads_ready == 0);
+ delayed_refs->num_heads_ready--;
+ } else if (count) {
+ /* the goal of the clustering is to find extents
+ * that are likely to end up in the same extent
+ * leaf on disk. So, we don't want them spread
+ * all over the tree. Stop now if we've hit
+ * a head that was already in use
+ */
+ break;
+ }
+ }
+ node = rb_next(node);
+ }
+ if (count) {
+ return 0;
+ } else if (start) {
+ /*
+ * we've gone to the end of the rbtree without finding any
+ * clusters. start from the beginning and try again
+ */
+ start = 0;
+ node = rb_first(&delayed_refs->root);
+ goto again;
+ }
+ return 1;
+}
+
+/*
+ * This checks to see if there are any delayed refs in the
+ * btree for a given bytenr. It returns one if it finds any
+ * and zero otherwise.
+ *
+ * If it only finds a head node, it returns 0.
+ *
+ * The idea is to use this when deciding if you can safely delete an
+ * extent from the extent allocation tree. There may be a pending
+ * ref in the rbtree that adds or removes references, so as long as this
+ * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
+ * allocation tree.
+ */
+int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct rb_node *prev_node;
+ int ret = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
+ if (ref) {
+ prev_node = rb_prev(&ref->rb_node);
+ if (!prev_node)
+ goto out;
+ ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (ref->bytenr == bytenr)
+ ret = 1;
+ }
+out:
+ spin_unlock(&delayed_refs->lock);
+ return ret;
+}
+
+/*
+ * helper function to lookup reference count and flags of extent.
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree. the head
+ * node may also store the extent flags to set. This way you can check
+ * to see what the reference count and extent flags would be if all of
+ * the delayed refs are not processed.
+ */
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 *refs, u64 *flags)
+{
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_path *path;
+ struct btrfs_extent_item *ei;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u32 item_size;
+ u64 num_refs;
+ u64 extent_flags;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+ delayed_refs = &trans->transaction->delayed_refs;
+again:
+ ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+ &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ if (ret == 0) {
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ if (item_size >= sizeof(*ei)) {
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ num_refs = btrfs_extent_refs(leaf, ei);
+ extent_flags = btrfs_extent_flags(leaf, ei);
+ } else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ struct btrfs_extent_item_v0 *ei0;
+ BUG_ON(item_size != sizeof(*ei0));
+ ei0 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item_v0);
+ num_refs = btrfs_extent_refs_v0(leaf, ei0);
+ /* FIXME: this isn't correct for data */
+ extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+#else
+ BUG();
+#endif
+ }
+ BUG_ON(num_refs == 0);
+ } else {
+ num_refs = 0;
+ extent_flags = 0;
+ ret = 0;
+ }
+
+ spin_lock(&delayed_refs->lock);
+ ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
+ if (ref) {
+ head = btrfs_delayed_node_to_head(ref);
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&ref->refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(root->fs_info->extent_root, path);
+
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(ref);
+ goto again;
+ }
+ if (head->extent_op && head->extent_op->update_flags)
+ extent_flags |= head->extent_op->flags_to_set;
+ else
+ BUG_ON(num_refs == 0);
+
+ num_refs += ref->ref_mod;
+ mutex_unlock(&head->mutex);
+ }
+ WARN_ON(num_refs == 0);
+ if (refs)
+ *refs = num_refs;
+ if (flags)
+ *flags = extent_flags;
+out:
+ spin_unlock(&delayed_refs->lock);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper function to update an extent delayed ref in the
+ * rbtree. existing and update must both have the same
+ * bytenr and parent
+ *
+ * This may free existing if the update cancels out whatever
+ * operation it was doing.
+ */
+static noinline void
+update_existing_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_node *existing,
+ struct btrfs_delayed_ref_node *update)
+{
+ if (update->action != existing->action) {
+ /*
+ * this is effectively undoing either an add or a
+ * drop. We decrement the ref_mod, and if it goes
+ * down to zero we just delete the entry without
+ * every changing the extent allocation tree.
+ */
+ existing->ref_mod--;
+ if (existing->ref_mod == 0) {
+ rb_erase(&existing->rb_node,
+ &delayed_refs->root);
+ existing->in_tree = 0;
+ btrfs_put_delayed_ref(existing);
+ delayed_refs->num_entries--;
+ if (trans->delayed_ref_updates)
+ trans->delayed_ref_updates--;
+ } else {
+ WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
+ }
+ } else {
+ WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
+ /*
+ * the action on the existing ref matches
+ * the action on the ref we're trying to add.
+ * Bump the ref_mod by one so the backref that
+ * is eventually added/removed has the correct
+ * reference count
+ */
+ existing->ref_mod += update->ref_mod;
+ }
+}
+
+/*
+ * helper function to update the accounting in the head ref
+ * existing and update must have the same bytenr
+ */
+static noinline void
+update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
+ struct btrfs_delayed_ref_node *update)
+{
+ struct btrfs_delayed_ref_head *existing_ref;
+ struct btrfs_delayed_ref_head *ref;
+
+ existing_ref = btrfs_delayed_node_to_head(existing);
+ ref = btrfs_delayed_node_to_head(update);
+ BUG_ON(existing_ref->is_data != ref->is_data);
+
+ if (ref->must_insert_reserved) {
+ /* if the extent was freed and then
+ * reallocated before the delayed ref
+ * entries were processed, we can end up
+ * with an existing head ref without
+ * the must_insert_reserved flag set.
+ * Set it again here
+ */
+ existing_ref->must_insert_reserved = ref->must_insert_reserved;
+
+ /*
+ * update the num_bytes so we make sure the accounting
+ * is done correctly
+ */
+ existing->num_bytes = update->num_bytes;
+
+ }
+
+ if (ref->extent_op) {
+ if (!existing_ref->extent_op) {
+ existing_ref->extent_op = ref->extent_op;
+ } else {
+ if (ref->extent_op->update_key) {
+ memcpy(&existing_ref->extent_op->key,
+ &ref->extent_op->key,
+ sizeof(ref->extent_op->key));
+ existing_ref->extent_op->update_key = 1;
+ }
+ if (ref->extent_op->update_flags) {
+ existing_ref->extent_op->flags_to_set |=
+ ref->extent_op->flags_to_set;
+ existing_ref->extent_op->update_flags = 1;
+ }
+ kfree(ref->extent_op);
+ }
+ }
+ /*
+ * update the reference mod on the head to reflect this new operation
+ */
+ existing->ref_mod += update->ref_mod;
+}
+
+/*
+ * helper function to actually insert a head node into the rbtree.
+ * this does all the dirty work in terms of maintaining the correct
+ * overall modification count.
+ */
+static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *ref,
+ u64 bytenr, u64 num_bytes,
+ int action, int is_data)
+{
+ struct btrfs_delayed_ref_node *existing;
+ struct btrfs_delayed_ref_head *head_ref = NULL;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int count_mod = 1;
+ int must_insert_reserved = 0;
+
+ /*
+ * the head node stores the sum of all the mods, so dropping a ref
+ * should drop the sum in the head node by one.
+ */
+ if (action == BTRFS_UPDATE_DELAYED_HEAD)
+ count_mod = 0;
+ else if (action == BTRFS_DROP_DELAYED_REF)
+ count_mod = -1;
+
+ /*
+ * BTRFS_ADD_DELAYED_EXTENT means that we need to update
+ * the reserved accounting when the extent is finally added, or
+ * if a later modification deletes the delayed ref without ever
+ * inserting the extent into the extent allocation tree.
+ * ref->must_insert_reserved is the flag used to record
+ * that accounting mods are required.
+ *
+ * Once we record must_insert_reserved, switch the action to
+ * BTRFS_ADD_DELAYED_REF because other special casing is not required.
+ */
+ if (action == BTRFS_ADD_DELAYED_EXTENT)
+ must_insert_reserved = 1;
+ else
+ must_insert_reserved = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ /* first set the basic ref node struct up */
+ atomic_set(&ref->refs, 1);
+ ref->bytenr = bytenr;
+ ref->num_bytes = num_bytes;
+ ref->ref_mod = count_mod;
+ ref->type = 0;
+ ref->action = 0;
+ ref->is_head = 1;
+ ref->in_tree = 1;
+
+ head_ref = btrfs_delayed_node_to_head(ref);
+ head_ref->must_insert_reserved = must_insert_reserved;
+ head_ref->is_data = is_data;
+
+ INIT_LIST_HEAD(&head_ref->cluster);
+ mutex_init(&head_ref->mutex);
+
+ existing = tree_insert(&delayed_refs->root, &ref->rb_node);
+
+ if (existing) {
+ update_existing_head_ref(existing, ref);
+ /*
+ * we've updated the existing ref, free the newly
+ * allocated ref
+ */
+ kfree(ref);
+ } else {
+ delayed_refs->num_heads++;
+ delayed_refs->num_heads_ready++;
+ delayed_refs->num_entries++;
+ trans->delayed_ref_updates++;
+ }
+ return 0;
+}
+
+/*
+ * helper to insert a delayed tree ref into the rbtree.
+ */
+static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *ref,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 ref_root, int level, int action)
+{
+ struct btrfs_delayed_ref_node *existing;
+ struct btrfs_delayed_tree_ref *full_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ if (action == BTRFS_ADD_DELAYED_EXTENT)
+ action = BTRFS_ADD_DELAYED_REF;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ /* first set the basic ref node struct up */
+ atomic_set(&ref->refs, 1);
+ ref->bytenr = bytenr;
+ ref->num_bytes = num_bytes;
+ ref->ref_mod = 1;
+ ref->action = action;
+ ref->is_head = 0;
+ ref->in_tree = 1;
+
+ full_ref = btrfs_delayed_node_to_tree_ref(ref);
+ if (parent) {
+ full_ref->parent = parent;
+ ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
+ } else {
+ full_ref->root = ref_root;
+ ref->type = BTRFS_TREE_BLOCK_REF_KEY;
+ }
+ full_ref->level = level;
+
+ existing = tree_insert(&delayed_refs->root, &ref->rb_node);
+
+ if (existing) {
+ update_existing_ref(trans, delayed_refs, existing, ref);
+ /*
+ * we've updated the existing ref, free the newly
+ * allocated ref
+ */
+ kfree(ref);
+ } else {
+ delayed_refs->num_entries++;
+ trans->delayed_ref_updates++;
+ }
+ return 0;
+}
+
+/*
+ * helper to insert a delayed data ref into the rbtree.
+ */
+static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *ref,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 ref_root, u64 owner, u64 offset,
+ int action)
+{
+ struct btrfs_delayed_ref_node *existing;
+ struct btrfs_delayed_data_ref *full_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ if (action == BTRFS_ADD_DELAYED_EXTENT)
+ action = BTRFS_ADD_DELAYED_REF;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ /* first set the basic ref node struct up */
+ atomic_set(&ref->refs, 1);
+ ref->bytenr = bytenr;
+ ref->num_bytes = num_bytes;
+ ref->ref_mod = 1;
+ ref->action = action;
+ ref->is_head = 0;
+ ref->in_tree = 1;
+
+ full_ref = btrfs_delayed_node_to_data_ref(ref);
+ if (parent) {
+ full_ref->parent = parent;
+ ref->type = BTRFS_SHARED_DATA_REF_KEY;
+ } else {
+ full_ref->root = ref_root;
+ ref->type = BTRFS_EXTENT_DATA_REF_KEY;
+ }
+ full_ref->objectid = owner;
+ full_ref->offset = offset;
+
+ existing = tree_insert(&delayed_refs->root, &ref->rb_node);
+
+ if (existing) {
+ update_existing_ref(trans, delayed_refs, existing, ref);
+ /*
+ * we've updated the existing ref, free the newly
+ * allocated ref
+ */
+ kfree(ref);
+ } else {
+ delayed_refs->num_entries++;
+ trans->delayed_ref_updates++;
+ }
+ return 0;
+}
+
+/*
+ * add a delayed tree ref. This does all of the accounting required
+ * to make sure the delayed ref is eventually processed before this
+ * transaction commits.
+ */
+int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 ref_root, int level, int action,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret;
+
+ BUG_ON(extent_op && extent_op->is_data);
+ ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+
+ head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ if (!head_ref) {
+ kfree(ref);
+ return -ENOMEM;
+ }
+
+ head_ref->extent_op = extent_op;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ /*
+ * insert both the head node and the new ref without dropping
+ * the spin lock
+ */
+ ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
+ action, 0);
+ BUG_ON(ret);
+
+ ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
+ parent, ref_root, level, action);
+ BUG_ON(ret);
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+}
+
+/*
+ * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
+ */
+int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes,
+ u64 parent, u64 ref_root,
+ u64 owner, u64 offset, int action,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_delayed_data_ref *ref;
+ struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret;
+
+ BUG_ON(extent_op && !extent_op->is_data);
+ ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+
+ head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ if (!head_ref) {
+ kfree(ref);
+ return -ENOMEM;
+ }
+
+ head_ref->extent_op = extent_op;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ /*
+ * insert both the head node and the new ref without dropping
+ * the spin lock
+ */
+ ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
+ action, 1);
+ BUG_ON(ret);
+
+ ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
+ parent, ref_root, owner, offset, action);
+ BUG_ON(ret);
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+}
+
+int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret;
+
+ head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ if (!head_ref)
+ return -ENOMEM;
+
+ head_ref->extent_op = extent_op;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
+ num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+ extent_op->is_data);
+ BUG_ON(ret);
+
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+}
+
+/*
+ * this does a simple search for the head node for a given extent.
+ * It must be called with the delayed ref spinlock held, and it returns
+ * the head node if any where found, or NULL if not.
+ */
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
+ if (ref)
+ return btrfs_delayed_node_to_head(ref);
+ return NULL;
+}
+
+/*
+ * add a delayed ref to the tree. This does all of the accounting required
+ * to make sure the delayed ref is eventually processed before this
+ * transaction commits.
+ *
+ * The main point of this call is to add and remove a backreference in a single
+ * shot, taking the lock only once, and only searching for the head node once.
+ *
+ * It is the same as doing a ref add and delete in two separate calls.
+ */
+#if 0
+int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 orig_parent,
+ u64 parent, u64 orig_ref_root, u64 ref_root,
+ u64 orig_ref_generation, u64 ref_generation,
+ u64 owner_objectid, int pin)
+{
+ struct btrfs_delayed_ref *ref;
+ struct btrfs_delayed_ref *old_ref;
+ struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret;
+
+ ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+
+ old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
+ if (!old_ref) {
+ kfree(ref);
+ return -ENOMEM;
+ }
+
+ /*
+ * the parent = 0 case comes from cases where we don't actually
+ * know the parent yet. It will get updated later via a add/drop
+ * pair.
+ */
+ if (parent == 0)
+ parent = bytenr;
+ if (orig_parent == 0)
+ orig_parent = bytenr;
+
+ head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ if (!head_ref) {
+ kfree(ref);
+ kfree(old_ref);
+ return -ENOMEM;
+ }
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ /*
+ * insert both the head node and the new ref without dropping
+ * the spin lock
+ */
+ ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
+ (u64)-1, 0, 0, 0,
+ BTRFS_UPDATE_DELAYED_HEAD, 0);
+ BUG_ON(ret);
+
+ ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
+ parent, ref_root, ref_generation,
+ owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
+ BUG_ON(ret);
+
+ ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
+ orig_parent, orig_ref_root,
+ orig_ref_generation, owner_objectid,
+ BTRFS_DROP_DELAYED_REF, pin);
+ BUG_ON(ret);
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+}
+#endif
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
new file mode 100644
index 00000000000..f6fc67ddad3
--- /dev/null
+++ b/fs/btrfs/delayed-ref.h
@@ -0,0 +1,214 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __DELAYED_REF__
+#define __DELAYED_REF__
+
+/* these are the possible values of struct btrfs_delayed_ref->action */
+#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
+#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
+#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
+#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+
+struct btrfs_delayed_ref_node {
+ struct rb_node rb_node;
+
+ /* the starting bytenr of the extent */
+ u64 bytenr;
+
+ /* the size of the extent */
+ u64 num_bytes;
+
+ /* ref count on this data structure */
+ atomic_t refs;
+
+ /*
+ * how many refs is this entry adding or deleting. For
+ * head refs, this may be a negative number because it is keeping
+ * track of the total mods done to the reference count.
+ * For individual refs, this will always be a positive number
+ *
+ * It may be more than one, since it is possible for a single
+ * parent to have more than one ref on an extent
+ */
+ int ref_mod;
+
+ unsigned int action:8;
+ unsigned int type:8;
+ /* is this node still in the rbtree? */
+ unsigned int is_head:1;
+ unsigned int in_tree:1;
+};
+
+struct btrfs_delayed_extent_op {
+ struct btrfs_disk_key key;
+ u64 flags_to_set;
+ unsigned int update_key:1;
+ unsigned int update_flags:1;
+ unsigned int is_data:1;
+};
+
+/*
+ * the head refs are used to hold a lock on a given extent, which allows us
+ * to make sure that only one process is running the delayed refs
+ * at a time for a single extent. They also store the sum of all the
+ * reference count modifications we've queued up.
+ */
+struct btrfs_delayed_ref_head {
+ struct btrfs_delayed_ref_node node;
+
+ /*
+ * the mutex is held while running the refs, and it is also
+ * held when checking the sum of reference modifications.
+ */
+ struct mutex mutex;
+
+ struct list_head cluster;
+
+ struct btrfs_delayed_extent_op *extent_op;
+ /*
+ * when a new extent is allocated, it is just reserved in memory
+ * The actual extent isn't inserted into the extent allocation tree
+ * until the delayed ref is processed. must_insert_reserved is
+ * used to flag a delayed ref so the accounting can be updated
+ * when a full insert is done.
+ *
+ * It is possible the extent will be freed before it is ever
+ * inserted into the extent allocation tree. In this case
+ * we need to update the in ram accounting to properly reflect
+ * the free has happened.
+ */
+ unsigned int must_insert_reserved:1;
+ unsigned int is_data:1;
+};
+
+struct btrfs_delayed_tree_ref {
+ struct btrfs_delayed_ref_node node;
+ union {
+ u64 root;
+ u64 parent;
+ };
+ int level;
+};
+
+struct btrfs_delayed_data_ref {
+ struct btrfs_delayed_ref_node node;
+ union {
+ u64 root;
+ u64 parent;
+ };
+ u64 objectid;
+ u64 offset;
+};
+
+struct btrfs_delayed_ref_root {
+ struct rb_root root;
+
+ /* this spin lock protects the rbtree and the entries inside */
+ spinlock_t lock;
+
+ /* how many delayed ref updates we've queued, used by the
+ * throttling code
+ */
+ unsigned long num_entries;
+
+ /* total number of head nodes in tree */
+ unsigned long num_heads;
+
+ /* total number of head nodes ready for processing */
+ unsigned long num_heads_ready;
+
+ /*
+ * set when the tree is flushing before a transaction commit,
+ * used by the throttling code to decide if new updates need
+ * to be run right away
+ */
+ int flushing;
+
+ u64 run_delayed_start;
+};
+
+static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
+{
+ WARN_ON(atomic_read(&ref->refs) == 0);
+ if (atomic_dec_and_test(&ref->refs)) {
+ WARN_ON(ref->in_tree);
+ kfree(ref);
+ }
+}
+
+int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 ref_root, int level, int action,
+ struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes,
+ u64 parent, u64 ref_root,
+ u64 owner, u64 offset, int action,
+ struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes,
+ struct btrfs_delayed_extent_op *extent_op);
+
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
+int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 *refs, u64 *flags);
+int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 orig_parent,
+ u64 parent, u64 orig_ref_root, u64 ref_root,
+ u64 orig_ref_generation, u64 ref_generation,
+ u64 owner_objectid, int pin);
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head);
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+ struct list_head *cluster, u64 search_start);
+/*
+ * a node might live in a head or a regular ref, this lets you
+ * test for the proper type to use.
+ */
+static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
+{
+ return node->is_head;
+}
+
+/*
+ * helper functions to cast a node into its container
+ */
+static inline struct btrfs_delayed_tree_ref *
+btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node)
+{
+ WARN_ON(btrfs_delayed_ref_is_head(node));
+ return container_of(node, struct btrfs_delayed_tree_ref, node);
+}
+
+static inline struct btrfs_delayed_data_ref *
+btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
+{
+ WARN_ON(btrfs_delayed_ref_is_head(node));
+ return container_of(node, struct btrfs_delayed_data_ref, node);
+}
+
+static inline struct btrfs_delayed_ref_head *
+btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
+{
+ WARN_ON(!btrfs_delayed_ref_is_head(node));
+ return container_of(node, struct btrfs_delayed_ref_head, node);
+}
+#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
new file mode 100644
index 00000000000..e9103b3baa4
--- /dev/null
+++ b/fs/btrfs/dir-item.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "hash.h"
+#include "transaction.h"
+
+/*
+ * insert a name into a directory, doing overflow properly if there is a hash
+ * collision. data_size indicates how big the item inserted should be. On
+ * success a struct btrfs_dir_item pointer is returned, otherwise it is
+ * an ERR_PTR.
+ *
+ * The name is not copied into the dir item, you have to do that yourself.
+ */
+static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
+ *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *cpu_key,
+ u32 data_size,
+ const char *name,
+ int name_len)
+{
+ int ret;
+ char *ptr;
+ struct btrfs_item *item;
+ struct extent_buffer *leaf;
+
+ ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
+ if (ret == -EEXIST) {
+ struct btrfs_dir_item *di;
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ if (di)
+ return ERR_PTR(-EEXIST);
+ ret = btrfs_extend_item(trans, root, path, data_size);
+ WARN_ON(ret > 0);
+ }
+ if (ret < 0)
+ return ERR_PTR(ret);
+ WARN_ON(ret > 0);
+ leaf = path->nodes[0];
+ item = btrfs_item_nr(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr(leaf, path->slots[0], char);
+ BUG_ON(data_size > btrfs_item_size(leaf, item));
+ ptr += btrfs_item_size(leaf, item) - data_size;
+ return (struct btrfs_dir_item *)ptr;
+}
+
+/*
+ * xattrs work a lot like directories, this inserts an xattr item
+ * into the tree
+ */
+int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ const char *name, u16 name_len,
+ const void *data, u16 data_len)
+{
+ int ret = 0;
+ struct btrfs_dir_item *dir_item;
+ unsigned long name_ptr, data_ptr;
+ struct btrfs_key key, location;
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *leaf;
+ u32 data_size;
+
+ BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root));
+
+ key.objectid = objectid;
+ btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+ key.offset = btrfs_name_hash(name, name_len);
+
+ data_size = sizeof(*dir_item) + name_len + data_len;
+ dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+ name, name_len);
+ /*
+ * FIXME: at some point we should handle xattr's that are larger than
+ * what we can fit in our leaf. We set location to NULL b/c we arent
+ * pointing at anything else, that will change if we store the xattr
+ * data in a separate inode.
+ */
+ BUG_ON(IS_ERR(dir_item));
+ memset(&location, 0, sizeof(location));
+
+ leaf = path->nodes[0];
+ btrfs_cpu_key_to_disk(&disk_key, &location);
+ btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+ btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
+ btrfs_set_dir_name_len(leaf, dir_item, name_len);
+ btrfs_set_dir_transid(leaf, dir_item, trans->transid);
+ btrfs_set_dir_data_len(leaf, dir_item, data_len);
+ name_ptr = (unsigned long)(dir_item + 1);
+ data_ptr = (unsigned long)((char *)name_ptr + name_len);
+
+ write_extent_buffer(leaf, name, name_ptr, name_len);
+ write_extent_buffer(leaf, data, data_ptr, data_len);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+ return ret;
+}
+
+/*
+ * insert a directory item in the tree, doing all the magic for
+ * both indexes. 'dir' indicates which objectid to insert it into,
+ * 'location' is the key to stuff into the directory item, 'type' is the
+ * type of the inode we're pointing to, and 'index' is the sequence number
+ * to use for the second index (if one is created).
+ */
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, const char *name, int name_len, u64 dir,
+ struct btrfs_key *location, u8 type, u64 index)
+{
+ int ret = 0;
+ int ret2 = 0;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *dir_item;
+ struct extent_buffer *leaf;
+ unsigned long name_ptr;
+ struct btrfs_key key;
+ struct btrfs_disk_key disk_key;
+ u32 data_size;
+
+ key.objectid = dir;
+ btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+ key.offset = btrfs_name_hash(name, name_len);
+
+ path = btrfs_alloc_path();
+ path->leave_spinning = 1;
+
+ data_size = sizeof(*dir_item) + name_len;
+ dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+ name, name_len);
+ if (IS_ERR(dir_item)) {
+ ret = PTR_ERR(dir_item);
+ if (ret == -EEXIST)
+ goto second_insert;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_cpu_key_to_disk(&disk_key, location);
+ btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+ btrfs_set_dir_type(leaf, dir_item, type);
+ btrfs_set_dir_data_len(leaf, dir_item, 0);
+ btrfs_set_dir_name_len(leaf, dir_item, name_len);
+ btrfs_set_dir_transid(leaf, dir_item, trans->transid);
+ name_ptr = (unsigned long)(dir_item + 1);
+
+ write_extent_buffer(leaf, name, name_ptr, name_len);
+ btrfs_mark_buffer_dirty(leaf);
+
+second_insert:
+ /* FIXME, use some real flag for selecting the extra index */
+ if (root == root->fs_info->tree_root) {
+ ret = 0;
+ goto out;
+ }
+ btrfs_release_path(root, path);
+
+ btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+ key.offset = index;
+ dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+ name, name_len);
+ if (IS_ERR(dir_item)) {
+ ret2 = PTR_ERR(dir_item);
+ goto out;
+ }
+ leaf = path->nodes[0];
+ btrfs_cpu_key_to_disk(&disk_key, location);
+ btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+ btrfs_set_dir_type(leaf, dir_item, type);
+ btrfs_set_dir_data_len(leaf, dir_item, 0);
+ btrfs_set_dir_name_len(leaf, dir_item, name_len);
+ btrfs_set_dir_transid(leaf, dir_item, trans->transid);
+ name_ptr = (unsigned long)(dir_item + 1);
+ write_extent_buffer(leaf, name, name_ptr, name_len);
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ if (ret)
+ return ret;
+ if (ret2)
+ return ret2;
+ return 0;
+}
+
+/*
+ * lookup a directory item based on name. 'dir' is the objectid
+ * we're searching in, and 'mod' tells us if you plan on deleting the
+ * item (use mod < 0) or changing the options (use mod > 0)
+ */
+struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const char *name, int name_len,
+ int mod)
+{
+ int ret;
+ struct btrfs_key key;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+
+ key.objectid = dir;
+ btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+
+ key.offset = btrfs_name_hash(name, name_len);
+
+ ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ return NULL;
+ path->slots[0]--;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid != dir ||
+ btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
+ found_key.offset != key.offset)
+ return NULL;
+
+ return btrfs_match_dir_item_name(root, path, name, name_len);
+}
+
+/*
+ * lookup a directory item based on index. 'dir' is the objectid
+ * we're searching in, and 'mod' tells us if you plan on deleting the
+ * item (use mod < 0) or changing the options (use mod > 0)
+ *
+ * The name is used to make sure the index really points to the name you were
+ * looking for.
+ */
+struct btrfs_dir_item *
+btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ u64 objectid, const char *name, int name_len,
+ int mod)
+{
+ int ret;
+ struct btrfs_key key;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+
+ key.objectid = dir;
+ btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+ key.offset = objectid;
+
+ ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return ERR_PTR(-ENOENT);
+ return btrfs_match_dir_item_name(root, path, name, name_len);
+}
+
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 dirid,
+ const char *name, int name_len)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ u32 nritems;
+ int ret;
+
+ key.objectid = dirid;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+
+ while (1) {
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
+ break;
+
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ if (di)
+ return di;
+
+ path->slots[0]++;
+ }
+ return NULL;
+}
+
+struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const char *name, u16 name_len,
+ int mod)
+{
+ int ret;
+ struct btrfs_key key;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+
+ key.objectid = dir;
+ btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+ key.offset = btrfs_name_hash(name, name_len);
+ ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ return NULL;
+ path->slots[0]--;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid != dir ||
+ btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
+ found_key.offset != key.offset)
+ return NULL;
+
+ return btrfs_match_dir_item_name(root, path, name, name_len);
+}
+
+/*
+ * helper function to look at the directory item pointed to by 'path'
+ * this walks through all the entries in a dir item and finds one
+ * for a specific name.
+ */
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len)
+{
+ struct btrfs_dir_item *dir_item;
+ unsigned long name_ptr;
+ u32 total_len;
+ u32 cur = 0;
+ u32 this_len;
+ struct extent_buffer *leaf;
+
+ leaf = path->nodes[0];
+ dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
+ total_len = btrfs_item_size_nr(leaf, path->slots[0]);
+ while (cur < total_len) {
+ this_len = sizeof(*dir_item) +
+ btrfs_dir_name_len(leaf, dir_item) +
+ btrfs_dir_data_len(leaf, dir_item);
+ name_ptr = (unsigned long)(dir_item + 1);
+
+ if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
+ memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
+ return dir_item;
+
+ cur += this_len;
+ dir_item = (struct btrfs_dir_item *)((char *)dir_item +
+ this_len);
+ }
+ return NULL;
+}
+
+/*
+ * given a pointer into a directory item, delete it. This
+ * handles items that have more than one entry in them.
+ */
+int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_dir_item *di)
+{
+
+ struct extent_buffer *leaf;
+ u32 sub_item_len;
+ u32 item_len;
+ int ret = 0;
+
+ leaf = path->nodes[0];
+ sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
+ btrfs_dir_data_len(leaf, di);
+ item_len = btrfs_item_size_nr(leaf, path->slots[0]);
+ if (sub_item_len == item_len) {
+ ret = btrfs_del_item(trans, root, path);
+ } else {
+ /* MARKER */
+ unsigned long ptr = (unsigned long)di;
+ unsigned long start;
+
+ start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
+ item_len - (ptr + sub_item_len - start));
+ ret = btrfs_truncate_item(trans, root, path,
+ item_len - sub_item_len, 1);
+ }
+ return 0;
+}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644
index 00000000000..2b59201b955
--- /dev/null
+++ b/fs/btrfs/disk-io.c
@@ -0,0 +1,2614 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/scatterlist.h>
+#include <linux/swap.h>
+#include <linux/radix-tree.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/crc32c.h>
+#include "compat.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "async-thread.h"
+#include "locking.h"
+#include "tree-log.h"
+#include "free-space-cache.h"
+
+static struct extent_io_ops btree_extent_io_ops;
+static void end_workqueue_fn(struct btrfs_work *work);
+static void free_fs_root(struct btrfs_root *root);
+
+static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
+
+/*
+ * end_io_wq structs are used to do processing in task context when an IO is
+ * complete. This is used during reads to verify checksums, and it is used
+ * by writes to insert metadata for new file extents after IO is complete.
+ */
+struct end_io_wq {
+ struct bio *bio;
+ bio_end_io_t *end_io;
+ void *private;
+ struct btrfs_fs_info *info;
+ int error;
+ int metadata;
+ struct list_head list;
+ struct btrfs_work work;
+};
+
+/*
+ * async submit bios are used to offload expensive checksumming
+ * onto the worker threads. They checksum file and metadata bios
+ * just before they are sent down the IO stack.
+ */
+struct async_submit_bio {
+ struct inode *inode;
+ struct bio *bio;
+ struct list_head list;
+ extent_submit_bio_hook_t *submit_bio_start;
+ extent_submit_bio_hook_t *submit_bio_done;
+ int rw;
+ int mirror_num;
+ unsigned long bio_flags;
+ struct btrfs_work work;
+};
+
+/* These are used to set the lockdep class on the extent buffer locks.
+ * The class is set by the readpage_end_io_hook after the buffer has
+ * passed csum validation but before the pages are unlocked.
+ *
+ * The lockdep class is also set by btrfs_init_new_buffer on freshly
+ * allocated blocks.
+ *
+ * The class is based on the level in the tree block, which allows lockdep
+ * to know that lower nodes nest inside the locks of higher nodes.
+ *
+ * We also add a check to make sure the highest level of the tree is
+ * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this
+ * code needs update as well.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# if BTRFS_MAX_LEVEL != 8
+# error
+# endif
+static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
+static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
+ /* leaf */
+ "btrfs-extent-00",
+ "btrfs-extent-01",
+ "btrfs-extent-02",
+ "btrfs-extent-03",
+ "btrfs-extent-04",
+ "btrfs-extent-05",
+ "btrfs-extent-06",
+ "btrfs-extent-07",
+ /* highest possible level */
+ "btrfs-extent-08",
+};
+#endif
+
+/*
+ * extents on the btree inode are pretty simple, there's one extent
+ * that covers the entire device
+ */
+static struct extent_map *btree_get_extent(struct inode *inode,
+ struct page *page, size_t page_offset, u64 start, u64 len,
+ int create)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ int ret;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (em) {
+ em->bdev =
+ BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+ read_unlock(&em_tree->lock);
+ goto out;
+ }
+ read_unlock(&em_tree->lock);
+
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em) {
+ em = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ em->start = 0;
+ em->len = (u64)-1;
+ em->block_len = (u64)-1;
+ em->block_start = 0;
+ em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ if (ret == -EEXIST) {
+ u64 failed_start = em->start;
+ u64 failed_len = em->len;
+
+ free_extent_map(em);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (em) {
+ ret = 0;
+ } else {
+ em = lookup_extent_mapping(em_tree, failed_start,
+ failed_len);
+ ret = -EIO;
+ }
+ } else if (ret) {
+ free_extent_map(em);
+ em = NULL;
+ }
+ write_unlock(&em_tree->lock);
+
+ if (ret)
+ em = ERR_PTR(ret);
+out:
+ return em;
+}
+
+u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
+{
+ return crc32c(seed, data, len);
+}
+
+void btrfs_csum_final(u32 crc, char *result)
+{
+ *(__le32 *)result = ~cpu_to_le32(crc);
+}
+
+/*
+ * compute the csum for a btree block, and either verify it or write it
+ * into the csum field of the block.
+ */
+static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
+ int verify)
+{
+ u16 csum_size =
+ btrfs_super_csum_size(&root->fs_info->super_copy);
+ char *result = NULL;
+ unsigned long len;
+ unsigned long cur_len;
+ unsigned long offset = BTRFS_CSUM_SIZE;
+ char *map_token = NULL;
+ char *kaddr;
+ unsigned long map_start;
+ unsigned long map_len;
+ int err;
+ u32 crc = ~(u32)0;
+ unsigned long inline_result;
+
+ len = buf->len - offset;
+ while (len > 0) {
+ err = map_private_extent_buffer(buf, offset, 32,
+ &map_token, &kaddr,
+ &map_start, &map_len, KM_USER0);
+ if (err)
+ return 1;
+ cur_len = min(len, map_len - (offset - map_start));
+ crc = btrfs_csum_data(root, kaddr + offset - map_start,
+ crc, cur_len);
+ len -= cur_len;
+ offset += cur_len;
+ unmap_extent_buffer(buf, map_token, KM_USER0);
+ }
+ if (csum_size > sizeof(inline_result)) {
+ result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
+ if (!result)
+ return 1;
+ } else {
+ result = (char *)&inline_result;
+ }
+
+ btrfs_csum_final(crc, result);
+
+ if (verify) {
+ if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
+ u32 val;
+ u32 found = 0;
+ memcpy(&found, result, csum_size);
+
+ read_extent_buffer(buf, &val, 0, csum_size);
+ if (printk_ratelimit()) {
+ printk(KERN_INFO "btrfs: %s checksum verify "
+ "failed on %llu wanted %X found %X "
+ "level %d\n",
+ root->fs_info->sb->s_id,
+ (unsigned long long)buf->start, val, found,
+ btrfs_header_level(buf));
+ }
+ if (result != (char *)&inline_result)
+ kfree(result);
+ return 1;
+ }
+ } else {
+ write_extent_buffer(buf, result, 0, csum_size);
+ }
+ if (result != (char *)&inline_result)
+ kfree(result);
+ return 0;
+}
+
+/*
+ * we can't consider a given block up to date unless the transid of the
+ * block matches the transid in the parent node's pointer. This is how we
+ * detect blocks that either didn't get written at all or got written
+ * in the wrong place.
+ */
+static int verify_parent_transid(struct extent_io_tree *io_tree,
+ struct extent_buffer *eb, u64 parent_transid)
+{
+ int ret;
+
+ if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
+ return 0;
+
+ lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
+ if (extent_buffer_uptodate(io_tree, eb) &&
+ btrfs_header_generation(eb) == parent_transid) {
+ ret = 0;
+ goto out;
+ }
+ if (printk_ratelimit()) {
+ printk("parent transid verify failed on %llu wanted %llu "
+ "found %llu\n",
+ (unsigned long long)eb->start,
+ (unsigned long long)parent_transid,
+ (unsigned long long)btrfs_header_generation(eb));
+ }
+ ret = 1;
+ clear_extent_buffer_uptodate(io_tree, eb);
+out:
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+ GFP_NOFS);
+ return ret;
+}
+
+/*
+ * helper to read a given tree block, doing retries as required when
+ * the checksums don't match and we have alternate mirrors to try.
+ */
+static int btree_read_extent_buffer_pages(struct btrfs_root *root,
+ struct extent_buffer *eb,
+ u64 start, u64 parent_transid)
+{
+ struct extent_io_tree *io_tree;
+ int ret;
+ int num_copies = 0;
+ int mirror_num = 0;
+
+ io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
+ while (1) {
+ ret = read_extent_buffer_pages(io_tree, eb, start, 1,
+ btree_get_extent, mirror_num);
+ if (!ret &&
+ !verify_parent_transid(io_tree, eb, parent_transid))
+ return ret;
+
+ num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
+ eb->start, eb->len);
+ if (num_copies == 1)
+ return ret;
+
+ mirror_num++;
+ if (mirror_num > num_copies)
+ return ret;
+ }
+ return -EIO;
+}
+
+/*
+ * checksum a dirty tree block before IO. This has extra checks to make sure
+ * we only fill in the checksum field in the first page of a multi-page block
+ */
+
+static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
+{
+ struct extent_io_tree *tree;
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 found_start;
+ int found_level;
+ unsigned long len;
+ struct extent_buffer *eb;
+ int ret;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+ if (page->private == EXTENT_PAGE_PRIVATE)
+ goto out;
+ if (!page->private)
+ goto out;
+ len = page->private >> 2;
+ WARN_ON(len == 0);
+
+ eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+ ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
+ btrfs_header_generation(eb));
+ BUG_ON(ret);
+ found_start = btrfs_header_bytenr(eb);
+ if (found_start != start) {
+ WARN_ON(1);
+ goto err;
+ }
+ if (eb->first_page != page) {
+ WARN_ON(1);
+ goto err;
+ }
+ if (!PageUptodate(page)) {
+ WARN_ON(1);
+ goto err;
+ }
+ found_level = btrfs_header_level(eb);
+
+ csum_tree_block(root, eb, 0);
+err:
+ free_extent_buffer(eb);
+out:
+ return 0;
+}
+
+static int check_tree_block_fsid(struct btrfs_root *root,
+ struct extent_buffer *eb)
+{
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ u8 fsid[BTRFS_UUID_SIZE];
+ int ret = 1;
+
+ read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
+ BTRFS_FSID_SIZE);
+ while (fs_devices) {
+ if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
+ ret = 0;
+ break;
+ }
+ fs_devices = fs_devices->seed;
+ }
+ return ret;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
+{
+ lockdep_set_class_and_name(&eb->lock,
+ &btrfs_eb_class[level],
+ btrfs_eb_name[level]);
+}
+#endif
+
+static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+ struct extent_state *state)
+{
+ struct extent_io_tree *tree;
+ u64 found_start;
+ int found_level;
+ unsigned long len;
+ struct extent_buffer *eb;
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+ int ret = 0;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ if (page->private == EXTENT_PAGE_PRIVATE)
+ goto out;
+ if (!page->private)
+ goto out;
+
+ len = page->private >> 2;
+ WARN_ON(len == 0);
+
+ eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+
+ found_start = btrfs_header_bytenr(eb);
+ if (found_start != start) {
+ if (printk_ratelimit()) {
+ printk(KERN_INFO "btrfs bad tree block start "
+ "%llu %llu\n",
+ (unsigned long long)found_start,
+ (unsigned long long)eb->start);
+ }
+ ret = -EIO;
+ goto err;
+ }
+ if (eb->first_page != page) {
+ printk(KERN_INFO "btrfs bad first page %lu %lu\n",
+ eb->first_page->index, page->index);
+ WARN_ON(1);
+ ret = -EIO;
+ goto err;
+ }
+ if (check_tree_block_fsid(root, eb)) {
+ if (printk_ratelimit()) {
+ printk(KERN_INFO "btrfs bad fsid on block %llu\n",
+ (unsigned long long)eb->start);
+ }
+ ret = -EIO;
+ goto err;
+ }
+ found_level = btrfs_header_level(eb);
+
+ btrfs_set_buffer_lockdep_class(eb, found_level);
+
+ ret = csum_tree_block(root, eb, 1);
+ if (ret)
+ ret = -EIO;
+
+ end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
+ end = eb->start + end - 1;
+err:
+ free_extent_buffer(eb);
+out:
+ return ret;
+}
+
+static void end_workqueue_bio(struct bio *bio, int err)
+{
+ struct end_io_wq *end_io_wq = bio->bi_private;
+ struct btrfs_fs_info *fs_info;
+
+ fs_info = end_io_wq->info;
+ end_io_wq->error = err;
+ end_io_wq->work.func = end_workqueue_fn;
+ end_io_wq->work.flags = 0;
+
+ if (bio->bi_rw & (1 << BIO_RW)) {
+ if (end_io_wq->metadata)
+ btrfs_queue_worker(&fs_info->endio_meta_write_workers,
+ &end_io_wq->work);
+ else
+ btrfs_queue_worker(&fs_info->endio_write_workers,
+ &end_io_wq->work);
+ } else {
+ if (end_io_wq->metadata)
+ btrfs_queue_worker(&fs_info->endio_meta_workers,
+ &end_io_wq->work);
+ else
+ btrfs_queue_worker(&fs_info->endio_workers,
+ &end_io_wq->work);
+ }
+}
+
+int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+ int metadata)
+{
+ struct end_io_wq *end_io_wq;
+ end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
+ if (!end_io_wq)
+ return -ENOMEM;
+
+ end_io_wq->private = bio->bi_private;
+ end_io_wq->end_io = bio->bi_end_io;
+ end_io_wq->info = info;
+ end_io_wq->error = 0;
+ end_io_wq->bio = bio;
+ end_io_wq->metadata = metadata;
+
+ bio->bi_private = end_io_wq;
+ bio->bi_end_io = end_workqueue_bio;
+ return 0;
+}
+
+unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
+{
+ unsigned long limit = min_t(unsigned long,
+ info->workers.max_workers,
+ info->fs_devices->open_devices);
+ return 256 * limit;
+}
+
+int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
+{
+ return atomic_read(&info->nr_async_bios) >
+ btrfs_async_submit_limit(info);
+}
+
+static void run_one_async_start(struct btrfs_work *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct async_submit_bio *async;
+
+ async = container_of(work, struct async_submit_bio, work);
+ fs_info = BTRFS_I(async->inode)->root->fs_info;
+ async->submit_bio_start(async->inode, async->rw, async->bio,
+ async->mirror_num, async->bio_flags);
+}
+
+static void run_one_async_done(struct btrfs_work *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct async_submit_bio *async;
+ int limit;
+
+ async = container_of(work, struct async_submit_bio, work);
+ fs_info = BTRFS_I(async->inode)->root->fs_info;
+
+ limit = btrfs_async_submit_limit(fs_info);
+ limit = limit * 2 / 3;
+
+ atomic_dec(&fs_info->nr_async_submits);
+
+ if (atomic_read(&fs_info->nr_async_submits) < limit &&
+ waitqueue_active(&fs_info->async_submit_wait))
+ wake_up(&fs_info->async_submit_wait);
+
+ async->submit_bio_done(async->inode, async->rw, async->bio,
+ async->mirror_num, async->bio_flags);
+}
+
+static void run_one_async_free(struct btrfs_work *work)
+{
+ struct async_submit_bio *async;
+
+ async = container_of(work, struct async_submit_bio, work);
+ kfree(async);
+}
+
+int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
+ int rw, struct bio *bio, int mirror_num,
+ unsigned long bio_flags,
+ extent_submit_bio_hook_t *submit_bio_start,
+ extent_submit_bio_hook_t *submit_bio_done)
+{
+ struct async_submit_bio *async;
+
+ async = kmalloc(sizeof(*async), GFP_NOFS);
+ if (!async)
+ return -ENOMEM;
+
+ async->inode = inode;
+ async->rw = rw;
+ async->bio = bio;
+ async->mirror_num = mirror_num;
+ async->submit_bio_start = submit_bio_start;
+ async->submit_bio_done = submit_bio_done;
+
+ async->work.func = run_one_async_start;
+ async->work.ordered_func = run_one_async_done;
+ async->work.ordered_free = run_one_async_free;
+
+ async->work.flags = 0;
+ async->bio_flags = bio_flags;
+
+ atomic_inc(&fs_info->nr_async_submits);
+
+ if (rw & (1 << BIO_RW_SYNCIO))
+ btrfs_set_work_high_prio(&async->work);
+
+ btrfs_queue_worker(&fs_info->workers, &async->work);
+
+ while (atomic_read(&fs_info->async_submit_draining) &&
+ atomic_read(&fs_info->nr_async_submits)) {
+ wait_event(fs_info->async_submit_wait,
+ (atomic_read(&fs_info->nr_async_submits) == 0));
+ }
+
+ return 0;
+}
+
+static int btree_csum_one_bio(struct bio *bio)
+{
+ struct bio_vec *bvec = bio->bi_io_vec;
+ int bio_index = 0;
+ struct btrfs_root *root;
+
+ WARN_ON(bio->bi_vcnt <= 0);
+ while (bio_index < bio->bi_vcnt) {
+ root = BTRFS_I(bvec->bv_page->mapping->host)->root;
+ csum_dirty_buffer(root, bvec->bv_page);
+ bio_index++;
+ bvec++;
+ }
+ return 0;
+}
+
+static int __btree_submit_bio_start(struct inode *inode, int rw,
+ struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
+{
+ /*
+ * when we're called for a write, we're already in the async
+ * submission context. Just jump into btrfs_map_bio
+ */
+ btree_csum_one_bio(bio);
+ return 0;
+}
+
+static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
+{
+ /*
+ * when we're called for a write, we're already in the async
+ * submission context. Just jump into btrfs_map_bio
+ */
+ return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+}
+
+static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
+{
+ int ret;
+
+ ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
+ bio, 1);
+ BUG_ON(ret);
+
+ if (!(rw & (1 << BIO_RW))) {
+ /*
+ * called for a read, do the setup so that checksum validation
+ * can happen in the async kernel threads
+ */
+ return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+ mirror_num, 0);
+ }
+
+ /*
+ * kthread helpers are used to submit writes so that checksumming
+ * can happen in parallel across all CPUs
+ */
+ return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ inode, rw, bio, mirror_num, 0,
+ __btree_submit_bio_start,
+ __btree_submit_bio_done);
+}
+
+static int btree_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct extent_io_tree *tree;
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+ struct extent_buffer *eb;
+ int was_dirty;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ if (!(current->flags & PF_MEMALLOC)) {
+ return extent_write_full_page(tree, page,
+ btree_get_extent, wbc);
+ }
+
+ redirty_page_for_writepage(wbc, page);
+ eb = btrfs_find_tree_block(root, page_offset(page),
+ PAGE_CACHE_SIZE);
+ WARN_ON(!eb);
+
+ was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+ if (!was_dirty) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
+ free_extent_buffer(eb);
+
+ unlock_page(page);
+ return 0;
+}
+
+static int btree_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(mapping->host)->io_tree;
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ struct btrfs_root *root = BTRFS_I(mapping->host)->root;
+ u64 num_dirty;
+ unsigned long thresh = 32 * 1024 * 1024;
+
+ if (wbc->for_kupdate)
+ return 0;
+
+ /* this is a bit racy, but that's ok */
+ num_dirty = root->fs_info->dirty_metadata_bytes;
+ if (num_dirty < thresh)
+ return 0;
+ }
+ return extent_writepages(tree, mapping, btree_get_extent, wbc);
+}
+
+static int btree_readpage(struct file *file, struct page *page)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ return extent_read_full_page(tree, page, btree_get_extent);
+}
+
+static int btree_releasepage(struct page *page, gfp_t gfp_flags)
+{
+ struct extent_io_tree *tree;
+ struct extent_map_tree *map;
+ int ret;
+
+ if (PageWriteback(page) || PageDirty(page))
+ return 0;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ map = &BTRFS_I(page->mapping->host)->extent_tree;
+
+ ret = try_release_extent_state(map, tree, page, gfp_flags);
+ if (!ret)
+ return 0;
+
+ ret = try_release_extent_buffer(tree, page);
+ if (ret == 1) {
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_cache_release(page);
+ }
+
+ return ret;
+}
+
+static void btree_invalidatepage(struct page *page, unsigned long offset)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ extent_invalidatepage(tree, page, offset);
+ btree_releasepage(page, GFP_NOFS);
+ if (PagePrivate(page)) {
+ printk(KERN_WARNING "btrfs warning page private not zero "
+ "on page %llu\n", (unsigned long long)page_offset(page));
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_cache_release(page);
+ }
+}
+
+static const struct address_space_operations btree_aops = {
+ .readpage = btree_readpage,
+ .writepage = btree_writepage,
+ .writepages = btree_writepages,
+ .releasepage = btree_releasepage,
+ .invalidatepage = btree_invalidatepage,
+ .sync_page = block_sync_page,
+};
+
+int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+ u64 parent_transid)
+{
+ struct extent_buffer *buf = NULL;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ int ret = 0;
+
+ buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ if (!buf)
+ return 0;
+ read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
+ buf, 0, 0, btree_get_extent, 0);
+ free_extent_buffer(buf);
+ return ret;
+}
+
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+ u64 bytenr, u32 blocksize)
+{
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_buffer *eb;
+ eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
+ bytenr, blocksize, GFP_NOFS);
+ return eb;
+}
+
+struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
+ u64 bytenr, u32 blocksize)
+{
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_buffer *eb;
+
+ eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
+ bytenr, blocksize, NULL, GFP_NOFS);
+ return eb;
+}
+
+
+int btrfs_write_tree_block(struct extent_buffer *buf)
+{
+ return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
+ buf->start + buf->len - 1);
+}
+
+int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
+{
+ return filemap_fdatawait_range(buf->first_page->mapping,
+ buf->start, buf->start + buf->len - 1);
+}
+
+struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
+ u32 blocksize, u64 parent_transid)
+{
+ struct extent_buffer *buf = NULL;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_io_tree *io_tree;
+ int ret;
+
+ io_tree = &BTRFS_I(btree_inode)->io_tree;
+
+ buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ if (!buf)
+ return NULL;
+
+ ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
+
+ if (ret == 0)
+ set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
+ return buf;
+
+}
+
+int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf)
+{
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ if (btrfs_header_generation(buf) ==
+ root->fs_info->running_transaction->transid) {
+ btrfs_assert_tree_locked(buf);
+
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ if (root->fs_info->dirty_metadata_bytes >= buf->len)
+ root->fs_info->dirty_metadata_bytes -= buf->len;
+ else
+ WARN_ON(1);
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
+
+ /* ugh, clear_extent_buffer_dirty needs to lock the page */
+ btrfs_set_lock_blocking(buf);
+ clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
+ buf);
+ }
+ return 0;
+}
+
+static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
+ u32 stripesize, struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid)
+{
+ root->node = NULL;
+ root->commit_root = NULL;
+ root->sectorsize = sectorsize;
+ root->nodesize = nodesize;
+ root->leafsize = leafsize;
+ root->stripesize = stripesize;
+ root->ref_cows = 0;
+ root->track_dirty = 0;
+ root->in_radix = 0;
+ root->clean_orphans = 0;
+
+ root->fs_info = fs_info;
+ root->objectid = objectid;
+ root->last_trans = 0;
+ root->highest_objectid = 0;
+ root->name = NULL;
+ root->in_sysfs = 0;
+ root->inode_tree.rb_node = NULL;
+
+ INIT_LIST_HEAD(&root->dirty_list);
+ INIT_LIST_HEAD(&root->orphan_list);
+ INIT_LIST_HEAD(&root->root_list);
+ spin_lock_init(&root->node_lock);
+ spin_lock_init(&root->list_lock);
+ spin_lock_init(&root->inode_lock);
+ mutex_init(&root->objectid_mutex);
+ mutex_init(&root->log_mutex);
+ init_waitqueue_head(&root->log_writer_wait);
+ init_waitqueue_head(&root->log_commit_wait[0]);
+ init_waitqueue_head(&root->log_commit_wait[1]);
+ atomic_set(&root->log_commit[0], 0);
+ atomic_set(&root->log_commit[1], 0);
+ atomic_set(&root->log_writers, 0);
+ root->log_batch = 0;
+ root->log_transid = 0;
+ root->last_log_commit = 0;
+ extent_io_tree_init(&root->dirty_log_pages,
+ fs_info->btree_inode->i_mapping, GFP_NOFS);
+
+ memset(&root->root_key, 0, sizeof(root->root_key));
+ memset(&root->root_item, 0, sizeof(root->root_item));
+ memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
+ memset(&root->root_kobj, 0, sizeof(root->root_kobj));
+ root->defrag_trans_start = fs_info->generation;
+ init_completion(&root->kobj_unregister);
+ root->defrag_running = 0;
+ root->root_key.objectid = objectid;
+ root->anon_super.s_root = NULL;
+ root->anon_super.s_dev = 0;
+ INIT_LIST_HEAD(&root->anon_super.s_list);
+ INIT_LIST_HEAD(&root->anon_super.s_instances);
+ init_rwsem(&root->anon_super.s_umount);
+
+ return 0;
+}
+
+static int find_and_setup_root(struct btrfs_root *tree_root,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid,
+ struct btrfs_root *root)
+{
+ int ret;
+ u32 blocksize;
+ u64 generation;
+
+ __setup_root(tree_root->nodesize, tree_root->leafsize,
+ tree_root->sectorsize, tree_root->stripesize,
+ root, fs_info, objectid);
+ ret = btrfs_find_last_root(tree_root, objectid,
+ &root->root_item, &root->root_key);
+ if (ret > 0)
+ return -ENOENT;
+ BUG_ON(ret);
+
+ generation = btrfs_root_generation(&root->root_item);
+ blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+ root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
+ blocksize, generation);
+ BUG_ON(!root->node);
+ root->commit_root = btrfs_root_node(root);
+ return 0;
+}
+
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct extent_buffer *eb;
+ struct btrfs_root *log_root_tree = fs_info->log_root_tree;
+ u64 start = 0;
+ u64 end = 0;
+ int ret;
+
+ if (!log_root_tree)
+ return 0;
+
+ while (1) {
+ ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
+ 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
+ if (ret)
+ break;
+
+ clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
+ EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
+ }
+ eb = fs_info->log_root_tree->node;
+
+ WARN_ON(btrfs_header_level(eb) != 0);
+ WARN_ON(btrfs_header_nritems(eb) != 0);
+
+ ret = btrfs_free_reserved_extent(fs_info->tree_root,
+ eb->start, eb->len);
+ BUG_ON(ret);
+
+ free_extent_buffer(eb);
+ kfree(fs_info->log_root_tree);
+ fs_info->log_root_tree = NULL;
+ return 0;
+}
+
+static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct extent_buffer *leaf;
+
+ root = kzalloc(sizeof(*root), GFP_NOFS);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
+
+ __setup_root(tree_root->nodesize, tree_root->leafsize,
+ tree_root->sectorsize, tree_root->stripesize,
+ root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+
+ root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+ /*
+ * log trees do not get reference counted because they go away
+ * before a real commit is actually done. They do store pointers
+ * to file data extents, and those reference counts still get
+ * updated (along with back refs to the log tree).
+ */
+ root->ref_cows = 0;
+
+ leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
+ BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
+ if (IS_ERR(leaf)) {
+ kfree(root);
+ return ERR_CAST(leaf);
+ }
+
+ memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+ btrfs_set_header_bytenr(leaf, leaf->start);
+ btrfs_set_header_generation(leaf, trans->transid);
+ btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
+ root->node = leaf;
+
+ write_extent_buffer(root->node, root->fs_info->fsid,
+ (unsigned long)btrfs_header_fsid(root->node),
+ BTRFS_FSID_SIZE);
+ btrfs_mark_buffer_dirty(root->node);
+ btrfs_tree_unlock(root->node);
+ return root;
+}
+
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *log_root;
+
+ log_root = alloc_log_tree(trans, fs_info);
+ if (IS_ERR(log_root))
+ return PTR_ERR(log_root);
+ WARN_ON(fs_info->log_root_tree);
+ fs_info->log_root_tree = log_root;
+ return 0;
+}
+
+int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *log_root;
+ struct btrfs_inode_item *inode_item;
+
+ log_root = alloc_log_tree(trans, root->fs_info);
+ if (IS_ERR(log_root))
+ return PTR_ERR(log_root);
+
+ log_root->last_trans = trans->transid;
+ log_root->root_key.offset = root->root_key.objectid;
+
+ inode_item = &log_root->root_item.inode;
+ inode_item->generation = cpu_to_le64(1);
+ inode_item->size = cpu_to_le64(3);
+ inode_item->nlink = cpu_to_le32(1);
+ inode_item->nbytes = cpu_to_le64(root->leafsize);
+ inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+
+ btrfs_set_root_node(&log_root->root_item, log_root->node);
+
+ WARN_ON(root->log_root);
+ root->log_root = log_root;
+ root->log_transid = 0;
+ root->last_log_commit = 0;
+ return 0;
+}
+
+struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
+ struct btrfs_key *location)
+{
+ struct btrfs_root *root;
+ struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_path *path;
+ struct extent_buffer *l;
+ u64 generation;
+ u32 blocksize;
+ int ret = 0;
+
+ root = kzalloc(sizeof(*root), GFP_NOFS);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
+ if (location->offset == (u64)-1) {
+ ret = find_and_setup_root(tree_root, fs_info,
+ location->objectid, root);
+ if (ret) {
+ kfree(root);
+ return ERR_PTR(ret);
+ }
+ goto out;
+ }
+
+ __setup_root(tree_root->nodesize, tree_root->leafsize,
+ tree_root->sectorsize, tree_root->stripesize,
+ root, fs_info, location->objectid);
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
+ if (ret == 0) {
+ l = path->nodes[0];
+ read_extent_buffer(l, &root->root_item,
+ btrfs_item_ptr_offset(l, path->slots[0]),
+ sizeof(root->root_item));
+ memcpy(&root->root_key, location, sizeof(*location));
+ }
+ btrfs_free_path(path);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ return ERR_PTR(ret);
+ }
+
+ generation = btrfs_root_generation(&root->root_item);
+ blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+ root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
+ blocksize, generation);
+ root->commit_root = btrfs_root_node(root);
+ BUG_ON(!root->node);
+out:
+ if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
+ root->ref_cows = 1;
+
+ return root;
+}
+
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_objectid)
+{
+ struct btrfs_root *root;
+
+ if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
+ return fs_info->tree_root;
+ if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ return fs_info->extent_root;
+
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)root_objectid);
+ return root;
+}
+
+struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *location)
+{
+ struct btrfs_root *root;
+ int ret;
+
+ if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
+ return fs_info->tree_root;
+ if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ return fs_info->extent_root;
+ if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ return fs_info->chunk_root;
+ if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
+ return fs_info->dev_root;
+ if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
+ return fs_info->csum_root;
+again:
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)location->objectid);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ if (root)
+ return root;
+
+ ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+ if (ret == 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
+ if (IS_ERR(root))
+ return root;
+
+ WARN_ON(btrfs_root_refs(&root->root_item) == 0);
+ set_anon_super(&root->anon_super, NULL);
+
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret)
+ goto fail;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ ret = radix_tree_insert(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ root);
+ if (ret == 0) {
+ root->in_radix = 1;
+ root->clean_orphans = 1;
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ radix_tree_preload_end();
+ if (ret) {
+ if (ret == -EEXIST) {
+ free_fs_root(root);
+ goto again;
+ }
+ goto fail;
+ }
+
+ ret = btrfs_find_dead_roots(fs_info->tree_root,
+ root->root_key.objectid);
+ WARN_ON(ret);
+ return root;
+fail:
+ free_fs_root(root);
+ return ERR_PTR(ret);
+}
+
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *location,
+ const char *name, int namelen)
+{
+ return btrfs_read_fs_root_no_name(fs_info, location);
+#if 0
+ struct btrfs_root *root;
+ int ret;
+
+ root = btrfs_read_fs_root_no_name(fs_info, location);
+ if (!root)
+ return NULL;
+
+ if (root->in_sysfs)
+ return root;
+
+ ret = btrfs_set_root_name(root, name, namelen);
+ if (ret) {
+ free_extent_buffer(root->node);
+ kfree(root);
+ return ERR_PTR(ret);
+ }
+
+ ret = btrfs_sysfs_add_root(root);
+ if (ret) {
+ free_extent_buffer(root->node);
+ kfree(root->name);
+ kfree(root);
+ return ERR_PTR(ret);
+ }
+ root->in_sysfs = 1;
+ return root;
+#endif
+}
+
+static int btrfs_congested_fn(void *congested_data, int bdi_bits)
+{
+ struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
+ int ret = 0;
+ struct btrfs_device *device;
+ struct backing_dev_info *bdi;
+
+ list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
+ if (!device->bdev)
+ continue;
+ bdi = blk_get_backing_dev_info(device->bdev);
+ if (bdi && bdi_congested(bdi, bdi_bits)) {
+ ret = 1;
+ break;
+ }
+ }
+ return ret;
+}
+
+/*
+ * this unplugs every device on the box, and it is only used when page
+ * is null
+ */
+static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_info *info;
+
+ info = (struct btrfs_fs_info *)bdi->unplug_io_data;
+ list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
+ if (!device->bdev)
+ continue;
+
+ bdi = blk_get_backing_dev_info(device->bdev);
+ if (bdi->unplug_io_fn)
+ bdi->unplug_io_fn(bdi, page);
+ }
+}
+
+static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+{
+ struct inode *inode;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct address_space *mapping;
+ u64 offset;
+
+ /* the generic O_DIRECT read code does this */
+ if (1 || !page) {
+ __unplug_io_fn(bdi, page);
+ return;
+ }
+
+ /*
+ * page->mapping may change at any time. Get a consistent copy
+ * and use that for everything below
+ */
+ smp_mb();
+ mapping = page->mapping;
+ if (!mapping)
+ return;
+
+ inode = mapping->host;
+
+ /*
+ * don't do the expensive searching for a small number of
+ * devices
+ */
+ if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
+ __unplug_io_fn(bdi, page);
+ return;
+ }
+
+ offset = page_offset(page);
+
+ em_tree = &BTRFS_I(inode)->extent_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ __unplug_io_fn(bdi, page);
+ return;
+ }
+
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ free_extent_map(em);
+ __unplug_io_fn(bdi, page);
+ return;
+ }
+ offset = offset - em->start;
+ btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
+ em->block_start + offset, page);
+ free_extent_map(em);
+}
+
+/*
+ * If this fails, caller must call bdi_destroy() to get rid of the
+ * bdi again.
+ */
+static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
+{
+ int err;
+
+ bdi->name = "btrfs";
+ bdi->capabilities = BDI_CAP_MAP_COPY;
+ err = bdi_init(bdi);
+ if (err)
+ return err;
+
+ err = bdi_register(bdi, NULL, "btrfs-%d",
+ atomic_inc_return(&btrfs_bdi_num));
+ if (err) {
+ bdi_destroy(bdi);
+ return err;
+ }
+
+ bdi->ra_pages = default_backing_dev_info.ra_pages;
+ bdi->unplug_io_fn = btrfs_unplug_io_fn;
+ bdi->unplug_io_data = info;
+ bdi->congested_fn = btrfs_congested_fn;
+ bdi->congested_data = info;
+ return 0;
+}
+
+static int bio_ready_for_csum(struct bio *bio)
+{
+ u64 length = 0;
+ u64 buf_len = 0;
+ u64 start = 0;
+ struct page *page;
+ struct extent_io_tree *io_tree = NULL;
+ struct btrfs_fs_info *info = NULL;
+ struct bio_vec *bvec;
+ int i;
+ int ret;
+
+ bio_for_each_segment(bvec, bio, i) {
+ page = bvec->bv_page;
+ if (page->private == EXTENT_PAGE_PRIVATE) {
+ length += bvec->bv_len;
+ continue;
+ }
+ if (!page->private) {
+ length += bvec->bv_len;
+ continue;
+ }
+ length = bvec->bv_len;
+ buf_len = page->private >> 2;
+ start = page_offset(page) + bvec->bv_offset;
+ io_tree = &BTRFS_I(page->mapping->host)->io_tree;
+ info = BTRFS_I(page->mapping->host)->root->fs_info;
+ }
+ /* are we fully contained in this bio? */
+ if (buf_len <= length)
+ return 1;
+
+ ret = extent_range_uptodate(io_tree, start + length,
+ start + buf_len - 1);
+ return ret;
+}
+
+/*
+ * called by the kthread helper functions to finally call the bio end_io
+ * functions. This is where read checksum verification actually happens
+ */
+static void end_workqueue_fn(struct btrfs_work *work)
+{
+ struct bio *bio;
+ struct end_io_wq *end_io_wq;
+ struct btrfs_fs_info *fs_info;
+ int error;
+
+ end_io_wq = container_of(work, struct end_io_wq, work);
+ bio = end_io_wq->bio;
+ fs_info = end_io_wq->info;
+
+ /* metadata bio reads are special because the whole tree block must
+ * be checksummed at once. This makes sure the entire block is in
+ * ram and up to date before trying to verify things. For
+ * blocksize <= pagesize, it is basically a noop
+ */
+ if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata &&
+ !bio_ready_for_csum(bio)) {
+ btrfs_queue_worker(&fs_info->endio_meta_workers,
+ &end_io_wq->work);
+ return;
+ }
+ error = end_io_wq->error;
+ bio->bi_private = end_io_wq->private;
+ bio->bi_end_io = end_io_wq->end_io;
+ kfree(end_io_wq);
+ bio_endio(bio, error);
+}
+
+static int cleaner_kthread(void *arg)
+{
+ struct btrfs_root *root = arg;
+
+ do {
+ smp_mb();
+ if (root->fs_info->closing)
+ break;
+
+ vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
+
+ if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
+ mutex_trylock(&root->fs_info->cleaner_mutex)) {
+ btrfs_run_delayed_iputs(root);
+ btrfs_clean_old_snapshots(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+ }
+
+ if (freezing(current)) {
+ refrigerator();
+ } else {
+ smp_mb();
+ if (root->fs_info->closing)
+ break;
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ }
+ } while (!kthread_should_stop());
+ return 0;
+}
+
+static int transaction_kthread(void *arg)
+{
+ struct btrfs_root *root = arg;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_transaction *cur;
+ unsigned long now;
+ unsigned long delay;
+ int ret;
+
+ do {
+ smp_mb();
+ if (root->fs_info->closing)
+ break;
+
+ delay = HZ * 30;
+ vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
+ mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ cur = root->fs_info->running_transaction;
+ if (!cur) {
+ mutex_unlock(&root->fs_info->trans_mutex);
+ goto sleep;
+ }
+
+ now = get_seconds();
+ if (now < cur->start_time || now - cur->start_time < 30) {
+ mutex_unlock(&root->fs_info->trans_mutex);
+ delay = HZ * 5;
+ goto sleep;
+ }
+ mutex_unlock(&root->fs_info->trans_mutex);
+ trans = btrfs_start_transaction(root, 1);
+ ret = btrfs_commit_transaction(trans, root);
+
+sleep:
+ wake_up_process(root->fs_info->cleaner_kthread);
+ mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+
+ if (freezing(current)) {
+ refrigerator();
+ } else {
+ if (root->fs_info->closing)
+ break;
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(delay);
+ __set_current_state(TASK_RUNNING);
+ }
+ } while (!kthread_should_stop());
+ return 0;
+}
+
+struct btrfs_root *open_ctree(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ char *options)
+{
+ u32 sectorsize;
+ u32 nodesize;
+ u32 leafsize;
+ u32 blocksize;
+ u32 stripesize;
+ u64 generation;
+ u64 features;
+ struct btrfs_key location;
+ struct buffer_head *bh;
+ struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
+ GFP_NOFS);
+ struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
+ GFP_NOFS);
+ struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
+ GFP_NOFS);
+ struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
+ GFP_NOFS);
+ struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
+ GFP_NOFS);
+ struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
+ GFP_NOFS);
+ struct btrfs_root *log_tree_root;
+
+ int ret;
+ int err = -EINVAL;
+
+ struct btrfs_super_block *disk_super;
+
+ if (!extent_root || !tree_root || !fs_info ||
+ !chunk_root || !dev_root || !csum_root) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ ret = init_srcu_struct(&fs_info->subvol_srcu);
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
+
+ ret = setup_bdi(fs_info, &fs_info->bdi);
+ if (ret) {
+ err = ret;
+ goto fail_srcu;
+ }
+
+ fs_info->btree_inode = new_inode(sb);
+ if (!fs_info->btree_inode) {
+ err = -ENOMEM;
+ goto fail_bdi;
+ }
+
+ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ INIT_LIST_HEAD(&fs_info->trans_list);
+ INIT_LIST_HEAD(&fs_info->dead_roots);
+ INIT_LIST_HEAD(&fs_info->delayed_iputs);
+ INIT_LIST_HEAD(&fs_info->hashers);
+ INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+ INIT_LIST_HEAD(&fs_info->ordered_operations);
+ INIT_LIST_HEAD(&fs_info->caching_block_groups);
+ spin_lock_init(&fs_info->delalloc_lock);
+ spin_lock_init(&fs_info->new_trans_lock);
+ spin_lock_init(&fs_info->ref_cache_lock);
+ spin_lock_init(&fs_info->fs_roots_radix_lock);
+ spin_lock_init(&fs_info->delayed_iput_lock);
+
+ init_completion(&fs_info->kobj_unregister);
+ fs_info->tree_root = tree_root;
+ fs_info->extent_root = extent_root;
+ fs_info->csum_root = csum_root;
+ fs_info->chunk_root = chunk_root;
+ fs_info->dev_root = dev_root;
+ fs_info->fs_devices = fs_devices;
+ INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
+ INIT_LIST_HEAD(&fs_info->space_info);
+ btrfs_mapping_init(&fs_info->mapping_tree);
+ atomic_set(&fs_info->nr_async_submits, 0);
+ atomic_set(&fs_info->async_delalloc_pages, 0);
+ atomic_set(&fs_info->async_submit_draining, 0);
+ atomic_set(&fs_info->nr_async_bios, 0);
+ fs_info->sb = sb;
+ fs_info->max_extent = (u64)-1;
+ fs_info->max_inline = 8192 * 1024;
+ fs_info->metadata_ratio = 0;
+
+ fs_info->thread_pool_size = min_t(unsigned long,
+ num_online_cpus() + 2, 8);
+
+ INIT_LIST_HEAD(&fs_info->ordered_extents);
+ spin_lock_init(&fs_info->ordered_extent_lock);
+
+ sb->s_blocksize = 4096;
+ sb->s_blocksize_bits = blksize_bits(4096);
+ sb->s_bdi = &fs_info->bdi;
+
+ fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+ fs_info->btree_inode->i_nlink = 1;
+ /*
+ * we set the i_size on the btree inode to the max possible int.
+ * the real end of the address space is determined by all of
+ * the devices in the system
+ */
+ fs_info->btree_inode->i_size = OFFSET_MAX;
+ fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
+ fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
+
+ RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
+ extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
+ fs_info->btree_inode->i_mapping,
+ GFP_NOFS);
+ extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
+ GFP_NOFS);
+
+ BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
+
+ BTRFS_I(fs_info->btree_inode)->root = tree_root;
+ memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
+ sizeof(struct btrfs_key));
+ BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
+ insert_inode_hash(fs_info->btree_inode);
+
+ spin_lock_init(&fs_info->block_group_cache_lock);
+ fs_info->block_group_cache_tree.rb_node = NULL;
+
+ extent_io_tree_init(&fs_info->freed_extents[0],
+ fs_info->btree_inode->i_mapping, GFP_NOFS);
+ extent_io_tree_init(&fs_info->freed_extents[1],
+ fs_info->btree_inode->i_mapping, GFP_NOFS);
+ fs_info->pinned_extents = &fs_info->freed_extents[0];
+ fs_info->do_barriers = 1;
+
+
+ mutex_init(&fs_info->trans_mutex);
+ mutex_init(&fs_info->ordered_operations_mutex);
+ mutex_init(&fs_info->tree_log_mutex);
+ mutex_init(&fs_info->chunk_mutex);
+ mutex_init(&fs_info->transaction_kthread_mutex);
+ mutex_init(&fs_info->cleaner_mutex);
+ mutex_init(&fs_info->volume_mutex);
+ init_rwsem(&fs_info->extent_commit_sem);
+ init_rwsem(&fs_info->cleanup_work_sem);
+ init_rwsem(&fs_info->subvol_sem);
+
+ btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
+ btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
+
+ init_waitqueue_head(&fs_info->transaction_throttle);
+ init_waitqueue_head(&fs_info->transaction_wait);
+ init_waitqueue_head(&fs_info->async_submit_wait);
+
+ __setup_root(4096, 4096, 4096, 4096, tree_root,
+ fs_info, BTRFS_ROOT_TREE_OBJECTID);
+
+
+ bh = btrfs_read_dev_super(fs_devices->latest_bdev);
+ if (!bh)
+ goto fail_iput;
+
+ memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
+ memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
+ sizeof(fs_info->super_for_commit));
+ brelse(bh);
+
+ memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
+
+ disk_super = &fs_info->super_copy;
+ if (!btrfs_super_root(disk_super))
+ goto fail_iput;
+
+ ret = btrfs_parse_options(tree_root, options);
+ if (ret) {
+ err = ret;
+ goto fail_iput;
+ }
+
+ features = btrfs_super_incompat_flags(disk_super) &
+ ~BTRFS_FEATURE_INCOMPAT_SUPP;
+ if (features) {
+ printk(KERN_ERR "BTRFS: couldn't mount because of "
+ "unsupported optional features (%Lx).\n",
+ (unsigned long long)features);
+ err = -EINVAL;
+ goto fail_iput;
+ }
+
+ features = btrfs_super_incompat_flags(disk_super);
+ if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
+ features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ }
+
+ features = btrfs_super_compat_ro_flags(disk_super) &
+ ~BTRFS_FEATURE_COMPAT_RO_SUPP;
+ if (!(sb->s_flags & MS_RDONLY) && features) {
+ printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
+ "unsupported option features (%Lx).\n",
+ (unsigned long long)features);
+ err = -EINVAL;
+ goto fail_iput;
+ }
+
+ btrfs_init_workers(&fs_info->generic_worker,
+ "genwork", 1, NULL);
+
+ btrfs_init_workers(&fs_info->workers, "worker",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
+
+ btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
+
+ btrfs_init_workers(&fs_info->submit_workers, "submit",
+ min_t(u64, fs_devices->num_devices,
+ fs_info->thread_pool_size),
+ &fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->enospc_workers, "enospc",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
+
+ /* a higher idle thresh on the submit workers makes it much more
+ * likely that bios will be send down in a sane order to the
+ * devices
+ */
+ fs_info->submit_workers.idle_thresh = 64;
+
+ fs_info->workers.idle_thresh = 16;
+ fs_info->workers.ordered = 1;
+
+ fs_info->delalloc_workers.idle_thresh = 2;
+ fs_info->delalloc_workers.ordered = 1;
+
+ btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
+ &fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->endio_workers, "endio",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->endio_meta_write_workers,
+ "endio-meta-write", fs_info->thread_pool_size,
+ &fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
+
+ /*
+ * endios are largely parallel and should have a very
+ * low idle thresh
+ */
+ fs_info->endio_workers.idle_thresh = 4;
+ fs_info->endio_meta_workers.idle_thresh = 4;
+
+ fs_info->endio_write_workers.idle_thresh = 2;
+ fs_info->endio_meta_write_workers.idle_thresh = 2;
+
+ btrfs_start_workers(&fs_info->workers, 1);
+ btrfs_start_workers(&fs_info->generic_worker, 1);
+ btrfs_start_workers(&fs_info->submit_workers, 1);
+ btrfs_start_workers(&fs_info->delalloc_workers, 1);
+ btrfs_start_workers(&fs_info->fixup_workers, 1);
+ btrfs_start_workers(&fs_info->endio_workers, 1);
+ btrfs_start_workers(&fs_info->endio_meta_workers, 1);
+ btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
+ btrfs_start_workers(&fs_info->endio_write_workers, 1);
+ btrfs_start_workers(&fs_info->enospc_workers, 1);
+
+ fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
+ fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
+ 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
+
+ nodesize = btrfs_super_nodesize(disk_super);
+ leafsize = btrfs_super_leafsize(disk_super);
+ sectorsize = btrfs_super_sectorsize(disk_super);
+ stripesize = btrfs_super_stripesize(disk_super);
+ tree_root->nodesize = nodesize;
+ tree_root->leafsize = leafsize;
+ tree_root->sectorsize = sectorsize;
+ tree_root->stripesize = stripesize;
+
+ sb->s_blocksize = sectorsize;
+ sb->s_blocksize_bits = blksize_bits(sectorsize);
+
+ if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
+ sizeof(disk_super->magic))) {
+ printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
+ goto fail_sb_buffer;
+ }
+
+ mutex_lock(&fs_info->chunk_mutex);
+ ret = btrfs_read_sys_array(tree_root);
+ mutex_unlock(&fs_info->chunk_mutex);
+ if (ret) {
+ printk(KERN_WARNING "btrfs: failed to read the system "
+ "array on %s\n", sb->s_id);
+ goto fail_sb_buffer;
+ }
+
+ blocksize = btrfs_level_size(tree_root,
+ btrfs_super_chunk_root_level(disk_super));
+ generation = btrfs_super_chunk_root_generation(disk_super);
+
+ __setup_root(nodesize, leafsize, sectorsize, stripesize,
+ chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+
+ chunk_root->node = read_tree_block(chunk_root,
+ btrfs_super_chunk_root(disk_super),
+ blocksize, generation);
+ BUG_ON(!chunk_root->node);
+ if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+ printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
+ sb->s_id);
+ goto fail_chunk_root;
+ }
+ btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
+ chunk_root->commit_root = btrfs_root_node(chunk_root);
+
+ read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
+ (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
+ BTRFS_UUID_SIZE);
+
+ mutex_lock(&fs_info->chunk_mutex);
+ ret = btrfs_read_chunk_tree(chunk_root);
+ mutex_unlock(&fs_info->chunk_mutex);
+ if (ret) {
+ printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
+ sb->s_id);
+ goto fail_chunk_root;
+ }
+
+ btrfs_close_extra_devices(fs_devices);
+
+ blocksize = btrfs_level_size(tree_root,
+ btrfs_super_root_level(disk_super));
+ generation = btrfs_super_generation(disk_super);
+
+ tree_root->node = read_tree_block(tree_root,
+ btrfs_super_root(disk_super),
+ blocksize, generation);
+ if (!tree_root->node)
+ goto fail_chunk_root;
+ if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+ printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
+ sb->s_id);
+ goto fail_tree_root;
+ }
+ btrfs_set_root_node(&tree_root->root_item, tree_root->node);
+ tree_root->commit_root = btrfs_root_node(tree_root);
+
+ ret = find_and_setup_root(tree_root, fs_info,
+ BTRFS_EXTENT_TREE_OBJECTID, extent_root);
+ if (ret)
+ goto fail_tree_root;
+ extent_root->track_dirty = 1;
+
+ ret = find_and_setup_root(tree_root, fs_info,
+ BTRFS_DEV_TREE_OBJECTID, dev_root);
+ if (ret)
+ goto fail_extent_root;
+ dev_root->track_dirty = 1;
+
+ ret = find_and_setup_root(tree_root, fs_info,
+ BTRFS_CSUM_TREE_OBJECTID, csum_root);
+ if (ret)
+ goto fail_dev_root;
+
+ csum_root->track_dirty = 1;
+
+ btrfs_read_block_groups(extent_root);
+
+ fs_info->generation = generation;
+ fs_info->last_trans_committed = generation;
+ fs_info->data_alloc_profile = (u64)-1;
+ fs_info->metadata_alloc_profile = (u64)-1;
+ fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+ fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
+ "btrfs-cleaner");
+ if (IS_ERR(fs_info->cleaner_kthread))
+ goto fail_csum_root;
+
+ fs_info->transaction_kthread = kthread_run(transaction_kthread,
+ tree_root,
+ "btrfs-transaction");
+ if (IS_ERR(fs_info->transaction_kthread))
+ goto fail_cleaner;
+
+ if (!btrfs_test_opt(tree_root, SSD) &&
+ !btrfs_test_opt(tree_root, NOSSD) &&
+ !fs_info->fs_devices->rotating) {
+ printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD "
+ "mode\n");
+ btrfs_set_opt(fs_info->mount_opt, SSD);
+ }
+
+ if (btrfs_super_log_root(disk_super) != 0) {
+ u64 bytenr = btrfs_super_log_root(disk_super);
+
+ if (fs_devices->rw_devices == 0) {
+ printk(KERN_WARNING "Btrfs log replay required "
+ "on RO media\n");
+ err = -EIO;
+ goto fail_trans_kthread;
+ }
+ blocksize =
+ btrfs_level_size(tree_root,
+ btrfs_super_log_root_level(disk_super));
+
+ log_tree_root = kzalloc(sizeof(struct btrfs_root),
+ GFP_NOFS);
+
+ __setup_root(nodesize, leafsize, sectorsize, stripesize,
+ log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+
+ log_tree_root->node = read_tree_block(tree_root, bytenr,
+ blocksize,
+ generation + 1);
+ ret = btrfs_recover_log_trees(log_tree_root);
+ BUG_ON(ret);
+
+ if (sb->s_flags & MS_RDONLY) {
+ ret = btrfs_commit_super(tree_root);
+ BUG_ON(ret);
+ }
+ }
+
+ ret = btrfs_find_orphan_roots(tree_root);
+ BUG_ON(ret);
+
+ if (!(sb->s_flags & MS_RDONLY)) {
+ ret = btrfs_recover_relocation(tree_root);
+ if (ret < 0) {
+ printk(KERN_WARNING
+ "btrfs: failed to recover relocation\n");
+ err = -EINVAL;
+ goto fail_trans_kthread;
+ }
+ }
+
+ location.objectid = BTRFS_FS_TREE_OBJECTID;
+ location.type = BTRFS_ROOT_ITEM_KEY;
+ location.offset = (u64)-1;
+
+ fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
+ if (!fs_info->fs_root)
+ goto fail_trans_kthread;
+
+ if (!(sb->s_flags & MS_RDONLY)) {
+ down_read(&fs_info->cleanup_work_sem);
+ btrfs_orphan_cleanup(fs_info->fs_root);
+ up_read(&fs_info->cleanup_work_sem);
+ }
+
+ return tree_root;
+
+fail_trans_kthread:
+ kthread_stop(fs_info->transaction_kthread);
+fail_cleaner:
+ kthread_stop(fs_info->cleaner_kthread);
+
+ /*
+ * make sure we're done with the btree inode before we stop our
+ * kthreads
+ */
+ filemap_write_and_wait(fs_info->btree_inode->i_mapping);
+ invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+
+fail_csum_root:
+ free_extent_buffer(csum_root->node);
+ free_extent_buffer(csum_root->commit_root);
+fail_dev_root:
+ free_extent_buffer(dev_root->node);
+ free_extent_buffer(dev_root->commit_root);
+fail_extent_root:
+ free_extent_buffer(extent_root->node);
+ free_extent_buffer(extent_root->commit_root);
+fail_tree_root:
+ free_extent_buffer(tree_root->node);
+ free_extent_buffer(tree_root->commit_root);
+fail_chunk_root:
+ free_extent_buffer(chunk_root->node);
+ free_extent_buffer(chunk_root->commit_root);
+fail_sb_buffer:
+ btrfs_stop_workers(&fs_info->generic_worker);
+ btrfs_stop_workers(&fs_info->fixup_workers);
+ btrfs_stop_workers(&fs_info->delalloc_workers);
+ btrfs_stop_workers(&fs_info->workers);
+ btrfs_stop_workers(&fs_info->endio_workers);
+ btrfs_stop_workers(&fs_info->endio_meta_workers);
+ btrfs_stop_workers(&fs_info->endio_meta_write_workers);
+ btrfs_stop_workers(&fs_info->endio_write_workers);
+ btrfs_stop_workers(&fs_info->submit_workers);
+ btrfs_stop_workers(&fs_info->enospc_workers);
+fail_iput:
+ invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+ iput(fs_info->btree_inode);
+
+ btrfs_close_devices(fs_info->fs_devices);
+ btrfs_mapping_tree_free(&fs_info->mapping_tree);
+fail_bdi:
+ bdi_destroy(&fs_info->bdi);
+fail_srcu:
+ cleanup_srcu_struct(&fs_info->subvol_srcu);
+fail:
+ kfree(extent_root);
+ kfree(tree_root);
+ kfree(fs_info);
+ kfree(chunk_root);
+ kfree(dev_root);
+ kfree(csum_root);
+ return ERR_PTR(err);
+}
+
+static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
+{
+ char b[BDEVNAME_SIZE];
+
+ if (uptodate) {
+ set_buffer_uptodate(bh);
+ } else {
+ if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
+ printk(KERN_WARNING "lost page write due to "
+ "I/O error on %s\n",
+ bdevname(bh->b_bdev, b));
+ }
+ /* note, we dont' set_buffer_write_io_error because we have
+ * our own ways of dealing with the IO errors
+ */
+ clear_buffer_uptodate(bh);
+ }
+ unlock_buffer(bh);
+ put_bh(bh);
+}
+
+struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
+{
+ struct buffer_head *bh;
+ struct buffer_head *latest = NULL;
+ struct btrfs_super_block *super;
+ int i;
+ u64 transid = 0;
+ u64 bytenr;
+
+ /* we would like to check all the supers, but that would make
+ * a btrfs mount succeed after a mkfs from a different FS.
+ * So, we need to add a special mount option to scan for
+ * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+ */
+ for (i = 0; i < 1; i++) {
+ bytenr = btrfs_sb_offset(i);
+ if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
+ break;
+ bh = __bread(bdev, bytenr / 4096, 4096);
+ if (!bh)
+ continue;
+
+ super = (struct btrfs_super_block *)bh->b_data;
+ if (btrfs_super_bytenr(super) != bytenr ||
+ strncmp((char *)(&super->magic), BTRFS_MAGIC,
+ sizeof(super->magic))) {
+ brelse(bh);
+ continue;
+ }
+
+ if (!latest || btrfs_super_generation(super) > transid) {
+ brelse(latest);
+ latest = bh;
+ transid = btrfs_super_generation(super);
+ } else {
+ brelse(bh);
+ }
+ }
+ return latest;
+}
+
+/*
+ * this should be called twice, once with wait == 0 and
+ * once with wait == 1. When wait == 0 is done, all the buffer heads
+ * we write are pinned.
+ *
+ * They are released when wait == 1 is done.
+ * max_mirrors must be the same for both runs, and it indicates how
+ * many supers on this one device should be written.
+ *
+ * max_mirrors == 0 means to write them all.
+ */
+static int write_dev_supers(struct btrfs_device *device,
+ struct btrfs_super_block *sb,
+ int do_barriers, int wait, int max_mirrors)
+{
+ struct buffer_head *bh;
+ int i;
+ int ret;
+ int errors = 0;
+ u32 crc;
+ u64 bytenr;
+ int last_barrier = 0;
+
+ if (max_mirrors == 0)
+ max_mirrors = BTRFS_SUPER_MIRROR_MAX;
+
+ /* make sure only the last submit_bh does a barrier */
+ if (do_barriers) {
+ for (i = 0; i < max_mirrors; i++) {
+ bytenr = btrfs_sb_offset(i);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ device->total_bytes)
+ break;
+ last_barrier = i;
+ }
+ }
+
+ for (i = 0; i < max_mirrors; i++) {
+ bytenr = btrfs_sb_offset(i);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+ break;
+
+ if (wait) {
+ bh = __find_get_block(device->bdev, bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
+ BUG_ON(!bh);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ errors++;
+
+ /* drop our reference */
+ brelse(bh);
+
+ /* drop the reference from the wait == 0 run */
+ brelse(bh);
+ continue;
+ } else {
+ btrfs_set_super_bytenr(sb, bytenr);
+
+ crc = ~(u32)0;
+ crc = btrfs_csum_data(NULL, (char *)sb +
+ BTRFS_CSUM_SIZE, crc,
+ BTRFS_SUPER_INFO_SIZE -
+ BTRFS_CSUM_SIZE);
+ btrfs_csum_final(crc, sb->csum);
+
+ /*
+ * one reference for us, and we leave it for the
+ * caller
+ */
+ bh = __getblk(device->bdev, bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
+ memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
+
+ /* one reference for submit_bh */
+ get_bh(bh);
+
+ set_buffer_uptodate(bh);
+ lock_buffer(bh);
+ bh->b_end_io = btrfs_end_buffer_write_sync;
+ }
+
+ if (i == last_barrier && do_barriers && device->barriers) {
+ ret = submit_bh(WRITE_BARRIER, bh);
+ if (ret == -EOPNOTSUPP) {
+ printk("btrfs: disabling barriers on dev %s\n",
+ device->name);
+ set_buffer_uptodate(bh);
+ device->barriers = 0;
+ /* one reference for submit_bh */
+ get_bh(bh);
+ lock_buffer(bh);
+ ret = submit_bh(WRITE_SYNC, bh);
+ }
+ } else {
+ ret = submit_bh(WRITE_SYNC, bh);
+ }
+
+ if (ret)
+ errors++;
+ }
+ return errors < i ? 0 : -1;
+}
+
+int write_all_supers(struct btrfs_root *root, int max_mirrors)
+{
+ struct list_head *head;
+ struct btrfs_device *dev;
+ struct btrfs_super_block *sb;
+ struct btrfs_dev_item *dev_item;
+ int ret;
+ int do_barriers;
+ int max_errors;
+ int total_errors = 0;
+ u64 flags;
+
+ max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
+ do_barriers = !btrfs_test_opt(root, NOBARRIER);
+
+ sb = &root->fs_info->super_for_commit;
+ dev_item = &sb->dev_item;
+
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ head = &root->fs_info->fs_devices->devices;
+ list_for_each_entry(dev, head, dev_list) {
+ if (!dev->bdev) {
+ total_errors++;
+ continue;
+ }
+ if (!dev->in_fs_metadata || !dev->writeable)
+ continue;
+
+ btrfs_set_stack_device_generation(dev_item, 0);
+ btrfs_set_stack_device_type(dev_item, dev->type);
+ btrfs_set_stack_device_id(dev_item, dev->devid);
+ btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
+ btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
+ btrfs_set_stack_device_io_align(dev_item, dev->io_align);
+ btrfs_set_stack_device_io_width(dev_item, dev->io_width);
+ btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
+ memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
+ memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
+
+ flags = btrfs_super_flags(sb);
+ btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
+
+ ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
+ if (ret)
+ total_errors++;
+ }
+ if (total_errors > max_errors) {
+ printk(KERN_ERR "btrfs: %d errors while writing supers\n",
+ total_errors);
+ BUG();
+ }
+
+ total_errors = 0;
+ list_for_each_entry(dev, head, dev_list) {
+ if (!dev->bdev)
+ continue;
+ if (!dev->in_fs_metadata || !dev->writeable)
+ continue;
+
+ ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
+ if (ret)
+ total_errors++;
+ }
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ if (total_errors > max_errors) {
+ printk(KERN_ERR "btrfs: %d errors while writing supers\n",
+ total_errors);
+ BUG();
+ }
+ return 0;
+}
+
+int write_ctree_super(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int max_mirrors)
+{
+ int ret;
+
+ ret = write_all_supers(root, max_mirrors);
+ return ret;
+}
+
+int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+{
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ radix_tree_delete(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ if (btrfs_root_refs(&root->root_item) == 0)
+ synchronize_srcu(&fs_info->subvol_srcu);
+
+ free_fs_root(root);
+ return 0;
+}
+
+static void free_fs_root(struct btrfs_root *root)
+{
+ WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+ if (root->anon_super.s_dev) {
+ down_write(&root->anon_super.s_umount);
+ kill_anon_super(&root->anon_super);
+ }
+ free_extent_buffer(root->node);
+ free_extent_buffer(root->commit_root);
+ kfree(root->name);
+ kfree(root);
+}
+
+static int del_fs_roots(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+ struct btrfs_root *gang[8];
+ int i;
+
+ while (!list_empty(&fs_info->dead_roots)) {
+ gang[0] = list_entry(fs_info->dead_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&gang[0]->root_list);
+
+ if (gang[0]->in_radix) {
+ btrfs_free_fs_root(fs_info, gang[0]);
+ } else {
+ free_extent_buffer(gang[0]->node);
+ free_extent_buffer(gang[0]->commit_root);
+ kfree(gang[0]);
+ }
+ }
+
+ while (1) {
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang));
+ if (!ret)
+ break;
+ for (i = 0; i < ret; i++)
+ btrfs_free_fs_root(fs_info, gang[i]);
+ }
+ return 0;
+}
+
+int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
+{
+ u64 root_objectid = 0;
+ struct btrfs_root *gang[8];
+ int i;
+ int ret;
+
+ while (1) {
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, root_objectid,
+ ARRAY_SIZE(gang));
+ if (!ret)
+ break;
+
+ root_objectid = gang[ret - 1]->root_key.objectid + 1;
+ for (i = 0; i < ret; i++) {
+ root_objectid = gang[i]->root_key.objectid;
+ btrfs_orphan_cleanup(gang[i]);
+ }
+ root_objectid++;
+ }
+ return 0;
+}
+
+int btrfs_commit_super(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ mutex_lock(&root->fs_info->cleaner_mutex);
+ btrfs_run_delayed_iputs(root);
+ btrfs_clean_old_snapshots(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+
+ /* wait until ongoing cleanup work done */
+ down_write(&root->fs_info->cleanup_work_sem);
+ up_write(&root->fs_info->cleanup_work_sem);
+
+ trans = btrfs_start_transaction(root, 1);
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ /* run commit again to drop the original snapshot */
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_commit_transaction(trans, root);
+ ret = btrfs_write_and_wait_transaction(NULL, root);
+ BUG_ON(ret);
+
+ ret = write_ctree_super(NULL, root, 0);
+ return ret;
+}
+
+int close_ctree(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ fs_info->closing = 1;
+ smp_mb();
+
+ kthread_stop(root->fs_info->transaction_kthread);
+ kthread_stop(root->fs_info->cleaner_kthread);
+
+ if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+ ret = btrfs_commit_super(root);
+ if (ret)
+ printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+ }
+
+ fs_info->closing = 2;
+ smp_mb();
+
+ if (fs_info->delalloc_bytes) {
+ printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
+ (unsigned long long)fs_info->delalloc_bytes);
+ }
+ if (fs_info->total_ref_cache_size) {
+ printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
+ (unsigned long long)fs_info->total_ref_cache_size);
+ }
+
+ free_extent_buffer(fs_info->extent_root->node);
+ free_extent_buffer(fs_info->extent_root->commit_root);
+ free_extent_buffer(fs_info->tree_root->node);
+ free_extent_buffer(fs_info->tree_root->commit_root);
+ free_extent_buffer(root->fs_info->chunk_root->node);
+ free_extent_buffer(root->fs_info->chunk_root->commit_root);
+ free_extent_buffer(root->fs_info->dev_root->node);
+ free_extent_buffer(root->fs_info->dev_root->commit_root);
+ free_extent_buffer(root->fs_info->csum_root->node);
+ free_extent_buffer(root->fs_info->csum_root->commit_root);
+
+ btrfs_free_block_groups(root->fs_info);
+
+ del_fs_roots(fs_info);
+
+ iput(fs_info->btree_inode);
+
+ btrfs_stop_workers(&fs_info->generic_worker);
+ btrfs_stop_workers(&fs_info->fixup_workers);
+ btrfs_stop_workers(&fs_info->delalloc_workers);
+ btrfs_stop_workers(&fs_info->workers);
+ btrfs_stop_workers(&fs_info->endio_workers);
+ btrfs_stop_workers(&fs_info->endio_meta_workers);
+ btrfs_stop_workers(&fs_info->endio_meta_write_workers);
+ btrfs_stop_workers(&fs_info->endio_write_workers);
+ btrfs_stop_workers(&fs_info->submit_workers);
+ btrfs_stop_workers(&fs_info->enospc_workers);
+
+ btrfs_close_devices(fs_info->fs_devices);
+ btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
+ bdi_destroy(&fs_info->bdi);
+ cleanup_srcu_struct(&fs_info->subvol_srcu);
+
+ kfree(fs_info->extent_root);
+ kfree(fs_info->tree_root);
+ kfree(fs_info->chunk_root);
+ kfree(fs_info->dev_root);
+ kfree(fs_info->csum_root);
+ return 0;
+}
+
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
+{
+ int ret;
+ struct inode *btree_inode = buf->first_page->mapping->host;
+
+ ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
+ if (!ret)
+ return ret;
+
+ ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
+ parent_transid);
+ return !ret;
+}
+
+int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
+{
+ struct inode *btree_inode = buf->first_page->mapping->host;
+ return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
+ buf);
+}
+
+void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
+{
+ struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
+ u64 transid = btrfs_header_generation(buf);
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ int was_dirty;
+
+ btrfs_assert_tree_locked(buf);
+ if (transid != root->fs_info->generation) {
+ printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
+ "found %llu running %llu\n",
+ (unsigned long long)buf->start,
+ (unsigned long long)transid,
+ (unsigned long long)root->fs_info->generation);
+ WARN_ON(1);
+ }
+ was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
+ buf);
+ if (!was_dirty) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ root->fs_info->dirty_metadata_bytes += buf->len;
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
+}
+
+void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+{
+ /*
+ * looks as though older kernels can get into trouble with
+ * this code, they end up stuck in balance_dirty_pages forever
+ */
+ u64 num_dirty;
+ unsigned long thresh = 32 * 1024 * 1024;
+
+ if (current->flags & PF_MEMALLOC)
+ return;
+
+ num_dirty = root->fs_info->dirty_metadata_bytes;
+
+ if (num_dirty > thresh) {
+ balance_dirty_pages_ratelimited_nr(
+ root->fs_info->btree_inode->i_mapping, 1);
+ }
+ return;
+}
+
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
+{
+ struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
+ int ret;
+ ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
+ if (ret == 0)
+ set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
+ return ret;
+}
+
+int btree_lock_page_hook(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_buffer *eb;
+ unsigned long len;
+ u64 bytenr = page_offset(page);
+
+ if (page->private == EXTENT_PAGE_PRIVATE)
+ goto out;
+
+ len = page->private >> 2;
+ eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
+ if (!eb)
+ goto out;
+
+ btrfs_tree_lock(eb);
+ btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ if (root->fs_info->dirty_metadata_bytes >= eb->len)
+ root->fs_info->dirty_metadata_bytes -= eb->len;
+ else
+ WARN_ON(1);
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
+
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+out:
+ lock_page(page);
+ return 0;
+}
+
+static struct extent_io_ops btree_extent_io_ops = {
+ .write_cache_pages_lock_hook = btree_lock_page_hook,
+ .readpage_end_io_hook = btree_readpage_end_io_hook,
+ .submit_bio_hook = btree_submit_bio_hook,
+ /* note we're sharing with inode.c for the merge bio hook */
+ .merge_bio_hook = btrfs_merge_bio_hook,
+};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
new file mode 100644
index 00000000000..c958ecbc191
--- /dev/null
+++ b/fs/btrfs/disk-io.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __DISKIO__
+#define __DISKIO__
+
+#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
+#define BTRFS_SUPER_INFO_SIZE 4096
+
+#define BTRFS_SUPER_MIRROR_MAX 3
+#define BTRFS_SUPER_MIRROR_SHIFT 12
+
+static inline u64 btrfs_sb_offset(int mirror)
+{
+ u64 start = 16 * 1024;
+ if (mirror)
+ return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
+ return BTRFS_SUPER_INFO_OFFSET;
+}
+
+struct btrfs_device;
+struct btrfs_fs_devices;
+
+struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
+ u32 blocksize, u64 parent_transid);
+int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+ u64 parent_transid);
+struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
+ u64 bytenr, u32 blocksize);
+int clean_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf);
+struct btrfs_root *open_ctree(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ char *options);
+int close_ctree(struct btrfs_root *root);
+int write_ctree_super(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int max_mirrors);
+struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
+int btrfs_commit_super(struct btrfs_root *root);
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+ u64 bytenr, u32 blocksize);
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_objectid);
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *location,
+ const char *name, int namelen);
+struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
+ struct btrfs_key *location);
+struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *location);
+int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
+int btrfs_insert_dev_radix(struct btrfs_root *root,
+ struct block_device *bdev,
+ u64 device_id,
+ u64 block_start,
+ u64 num_blocks);
+void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
+int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
+void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
+int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
+int wait_on_tree_block_writeback(struct btrfs_root *root,
+ struct extent_buffer *buf);
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
+u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
+void btrfs_csum_final(u32 crc, char *result);
+int btrfs_open_device(struct btrfs_device *dev);
+int btrfs_verify_block_csum(struct btrfs_root *root,
+ struct extent_buffer *buf);
+int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+ int metadata);
+int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
+ int rw, struct bio *bio, int mirror_num,
+ unsigned long bio_flags,
+ extent_submit_bio_hook_t *submit_bio_start,
+ extent_submit_bio_hook_t *submit_bio_done);
+
+int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
+unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
+int btrfs_write_tree_block(struct extent_buffer *buf);
+int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btree_lock_page_hook(struct page *page);
+
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level);
+#else
+static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb,
+ int level)
+{
+}
+#endif
+#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
new file mode 100644
index 00000000000..ba5c3fd5ab8
--- /dev/null
+++ b/fs/btrfs/export.c
@@ -0,0 +1,240 @@
+#include <linux/fs.h>
+#include <linux/types.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "export.h"
+#include "compat.h"
+
+#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
+ parent_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \
+ parent_root_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
+
+static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
+ int connectable)
+{
+ struct btrfs_fid *fid = (struct btrfs_fid *)fh;
+ struct inode *inode = dentry->d_inode;
+ int len = *max_len;
+ int type;
+
+ if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
+ (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
+ return 255;
+
+ len = BTRFS_FID_SIZE_NON_CONNECTABLE;
+ type = FILEID_BTRFS_WITHOUT_PARENT;
+
+ fid->objectid = inode->i_ino;
+ fid->root_objectid = BTRFS_I(inode)->root->objectid;
+ fid->gen = inode->i_generation;
+
+ if (connectable && !S_ISDIR(inode->i_mode)) {
+ struct inode *parent;
+ u64 parent_root_id;
+
+ spin_lock(&dentry->d_lock);
+
+ parent = dentry->d_parent->d_inode;
+ fid->parent_objectid = BTRFS_I(parent)->location.objectid;
+ fid->parent_gen = parent->i_generation;
+ parent_root_id = BTRFS_I(parent)->root->objectid;
+
+ spin_unlock(&dentry->d_lock);
+
+ if (parent_root_id != fid->root_objectid) {
+ fid->parent_root_objectid = parent_root_id;
+ len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
+ type = FILEID_BTRFS_WITH_PARENT_ROOT;
+ } else {
+ len = BTRFS_FID_SIZE_CONNECTABLE;
+ type = FILEID_BTRFS_WITH_PARENT;
+ }
+ }
+
+ *max_len = len;
+ return type;
+}
+
+static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+ u64 root_objectid, u32 generation,
+ int check_generation)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
+ struct btrfs_root *root;
+ struct dentry *dentry;
+ struct inode *inode;
+ struct btrfs_key key;
+ int index;
+ int err = 0;
+
+ if (objectid < BTRFS_FIRST_FREE_OBJECTID)
+ return ERR_PTR(-ESTALE);
+
+ key.objectid = root_objectid;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.offset = (u64)-1;
+
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto fail;
+ }
+
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ err = -ENOENT;
+ goto fail;
+ }
+
+ key.objectid = objectid;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+
+ inode = btrfs_iget(sb, &key, root);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto fail;
+ }
+
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+ if (check_generation && generation != inode->i_generation) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+
+ dentry = d_obtain_alias(inode);
+ if (!IS_ERR(dentry))
+ dentry->d_op = &btrfs_dentry_operations;
+ return dentry;
+fail:
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ return ERR_PTR(err);
+}
+
+static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
+ int fh_len, int fh_type)
+{
+ struct btrfs_fid *fid = (struct btrfs_fid *) fh;
+ u64 objectid, root_objectid;
+ u32 generation;
+
+ if (fh_type == FILEID_BTRFS_WITH_PARENT) {
+ if (fh_len != BTRFS_FID_SIZE_CONNECTABLE)
+ return NULL;
+ root_objectid = fid->root_objectid;
+ } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
+ if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
+ return NULL;
+ root_objectid = fid->parent_root_objectid;
+ } else
+ return NULL;
+
+ objectid = fid->parent_objectid;
+ generation = fid->parent_gen;
+
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
+}
+
+static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+ int fh_len, int fh_type)
+{
+ struct btrfs_fid *fid = (struct btrfs_fid *) fh;
+ u64 objectid, root_objectid;
+ u32 generation;
+
+ if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
+ fh_len != BTRFS_FID_SIZE_CONNECTABLE) &&
+ (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
+ fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
+ (fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
+ fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE))
+ return NULL;
+
+ objectid = fid->objectid;
+ root_objectid = fid->root_objectid;
+ generation = fid->gen;
+
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
+}
+
+static struct dentry *btrfs_get_parent(struct dentry *child)
+{
+ struct inode *dir = child->d_inode;
+ static struct dentry *dentry;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_root_ref *ref;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ int ret;
+
+ path = btrfs_alloc_path();
+
+ if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ key.objectid = root->root_key.objectid;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = (u64)-1;
+ root = root->fs_info->tree_root;
+ } else {
+ key.objectid = dir->i_ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+ }
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto fail;
+
+ BUG_ON(ret == 0);
+ if (path->slots[0] == 0) {
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ path->slots[0]--;
+ leaf = path->nodes[0];
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != key.objectid || found_key.type != key.type) {
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_root_ref);
+ key.objectid = btrfs_root_ref_dirid(leaf, ref);
+ } else {
+ key.objectid = found_key.offset;
+ }
+ btrfs_free_path(path);
+
+ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+ return btrfs_get_dentry(root->fs_info->sb, key.objectid,
+ found_key.offset, 0, 0);
+ }
+
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+ if (!IS_ERR(dentry))
+ dentry->d_op = &btrfs_dentry_operations;
+ return dentry;
+fail:
+ btrfs_free_path(path);
+ return ERR_PTR(ret);
+}
+
+const struct export_operations btrfs_export_ops = {
+ .encode_fh = btrfs_encode_fh,
+ .fh_to_dentry = btrfs_fh_to_dentry,
+ .fh_to_parent = btrfs_fh_to_parent,
+ .get_parent = btrfs_get_parent,
+};
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
new file mode 100644
index 00000000000..074348a9584
--- /dev/null
+++ b/fs/btrfs/export.h
@@ -0,0 +1,19 @@
+#ifndef BTRFS_EXPORT_H
+#define BTRFS_EXPORT_H
+
+#include <linux/exportfs.h>
+
+extern const struct export_operations btrfs_export_ops;
+
+struct btrfs_fid {
+ u64 objectid;
+ u64 root_objectid;
+ u32 gen;
+
+ u64 parent_objectid;
+ u32 parent_gen;
+
+ u64 parent_root_objectid;
+} __attribute__ ((packed));
+
+#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
new file mode 100644
index 00000000000..559f72489b3
--- /dev/null
+++ b/fs/btrfs/extent-tree.c
@@ -0,0 +1,7687 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/sort.h>
+#include <linux/rcupdate.h>
+#include <linux/kthread.h>
+#include "compat.h"
+#include "hash.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "transaction.h"
+#include "volumes.h"
+#include "locking.h"
+#include "free-space-cache.h"
+
+static int update_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int alloc,
+ int mark_free);
+static int update_reserved_extents(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve);
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 owner_objectid,
+ u64 owner_offset, int refs_to_drop,
+ struct btrfs_delayed_extent_op *extra_op);
+static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
+ struct extent_buffer *leaf,
+ struct btrfs_extent_item *ei);
+static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 parent, u64 root_objectid,
+ u64 flags, u64 owner, u64 offset,
+ struct btrfs_key *ins, int ref_mod);
+static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 parent, u64 root_objectid,
+ u64 flags, struct btrfs_disk_key *key,
+ int level, struct btrfs_key *ins);
+static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, u64 alloc_bytes,
+ u64 flags, int force);
+static int pin_down_bytes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 num_bytes,
+ int is_data, int reserved,
+ struct extent_buffer **must_clean);
+static int find_next_key(struct btrfs_path *path, int level,
+ struct btrfs_key *key);
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+ int dump_block_groups);
+
+static noinline int
+block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+ smp_mb();
+ return cache->cached == BTRFS_CACHE_FINISHED;
+}
+
+static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
+{
+ return (cache->flags & bits) == bits;
+}
+
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+{
+ atomic_inc(&cache->count);
+}
+
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
+{
+ if (atomic_dec_and_test(&cache->count))
+ kfree(cache);
+}
+
+/*
+ * this adds the block group to the fs_info rb tree for the block group
+ * cache
+ */
+static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct btrfs_block_group_cache *cache;
+
+ spin_lock(&info->block_group_cache_lock);
+ p = &info->block_group_cache_tree.rb_node;
+
+ while (*p) {
+ parent = *p;
+ cache = rb_entry(parent, struct btrfs_block_group_cache,
+ cache_node);
+ if (block_group->key.objectid < cache->key.objectid) {
+ p = &(*p)->rb_left;
+ } else if (block_group->key.objectid > cache->key.objectid) {
+ p = &(*p)->rb_right;
+ } else {
+ spin_unlock(&info->block_group_cache_lock);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&block_group->cache_node, parent, p);
+ rb_insert_color(&block_group->cache_node,
+ &info->block_group_cache_tree);
+ spin_unlock(&info->block_group_cache_lock);
+
+ return 0;
+}
+
+/*
+ * This will return the block group at or after bytenr if contains is 0, else
+ * it will return the block group that contains the bytenr
+ */
+static struct btrfs_block_group_cache *
+block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
+ int contains)
+{
+ struct btrfs_block_group_cache *cache, *ret = NULL;
+ struct rb_node *n;
+ u64 end, start;
+
+ spin_lock(&info->block_group_cache_lock);
+ n = info->block_group_cache_tree.rb_node;
+
+ while (n) {
+ cache = rb_entry(n, struct btrfs_block_group_cache,
+ cache_node);
+ end = cache->key.objectid + cache->key.offset - 1;
+ start = cache->key.objectid;
+
+ if (bytenr < start) {
+ if (!contains && (!ret || start < ret->key.objectid))
+ ret = cache;
+ n = n->rb_left;
+ } else if (bytenr > start) {
+ if (contains && bytenr <= end) {
+ ret = cache;
+ break;
+ }
+ n = n->rb_right;
+ } else {
+ ret = cache;
+ break;
+ }
+ }
+ if (ret)
+ btrfs_get_block_group(ret);
+ spin_unlock(&info->block_group_cache_lock);
+
+ return ret;
+}
+
+static int add_excluded_extent(struct btrfs_root *root,
+ u64 start, u64 num_bytes)
+{
+ u64 end = start + num_bytes - 1;
+ set_extent_bits(&root->fs_info->freed_extents[0],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+ set_extent_bits(&root->fs_info->freed_extents[1],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+ return 0;
+}
+
+static void free_excluded_extents(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
+{
+ u64 start, end;
+
+ start = cache->key.objectid;
+ end = start + cache->key.offset - 1;
+
+ clear_extent_bits(&root->fs_info->freed_extents[0],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+ clear_extent_bits(&root->fs_info->freed_extents[1],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+}
+
+static int exclude_super_stripes(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
+{
+ u64 bytenr;
+ u64 *logical;
+ int stripe_len;
+ int i, nr, ret;
+
+ if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
+ stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
+ cache->bytes_super += stripe_len;
+ ret = add_excluded_extent(root, cache->key.objectid,
+ stripe_len);
+ BUG_ON(ret);
+ }
+
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ bytenr = btrfs_sb_offset(i);
+ ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
+ cache->key.objectid, bytenr,
+ 0, &logical, &nr, &stripe_len);
+ BUG_ON(ret);
+
+ while (nr--) {
+ cache->bytes_super += stripe_len;
+ ret = add_excluded_extent(root, logical[nr],
+ stripe_len);
+ BUG_ON(ret);
+ }
+
+ kfree(logical);
+ }
+ return 0;
+}
+
+static struct btrfs_caching_control *
+get_caching_control(struct btrfs_block_group_cache *cache)
+{
+ struct btrfs_caching_control *ctl;
+
+ spin_lock(&cache->lock);
+ if (cache->cached != BTRFS_CACHE_STARTED) {
+ spin_unlock(&cache->lock);
+ return NULL;
+ }
+
+ ctl = cache->caching_ctl;
+ atomic_inc(&ctl->count);
+ spin_unlock(&cache->lock);
+ return ctl;
+}
+
+static void put_caching_control(struct btrfs_caching_control *ctl)
+{
+ if (atomic_dec_and_test(&ctl->count))
+ kfree(ctl);
+}
+
+/*
+ * this is only called by cache_block_group, since we could have freed extents
+ * we need to check the pinned_extents for any extents that can't be used yet
+ * since their free space will be released as soon as the transaction commits.
+ */
+static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_fs_info *info, u64 start, u64 end)
+{
+ u64 extent_start, extent_end, size, total_added = 0;
+ int ret;
+
+ while (start < end) {
+ ret = find_first_extent_bit(info->pinned_extents, start,
+ &extent_start, &extent_end,
+ EXTENT_DIRTY | EXTENT_UPTODATE);
+ if (ret)
+ break;
+
+ if (extent_start <= start) {
+ start = extent_end + 1;
+ } else if (extent_start > start && extent_start < end) {
+ size = extent_start - start;
+ total_added += size;
+ ret = btrfs_add_free_space(block_group, start,
+ size);
+ BUG_ON(ret);
+ start = extent_end + 1;
+ } else {
+ break;
+ }
+ }
+
+ if (start < end) {
+ size = end - start;
+ total_added += size;
+ ret = btrfs_add_free_space(block_group, start, size);
+ BUG_ON(ret);
+ }
+
+ return total_added;
+}
+
+static int caching_kthread(void *data)
+{
+ struct btrfs_block_group_cache *block_group = data;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u64 total_found = 0;
+ u64 last = 0;
+ u32 nritems;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ exclude_super_stripes(extent_root, block_group);
+ spin_lock(&block_group->space_info->lock);
+ block_group->space_info->bytes_super += block_group->bytes_super;
+ spin_unlock(&block_group->space_info->lock);
+
+ last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+
+ /*
+ * We don't want to deadlock with somebody trying to allocate a new
+ * extent for the extent root while also trying to search the extent
+ * root to add free space. So we skip locking and search the commit
+ * root, since its read-only
+ */
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ path->reada = 2;
+
+ key.objectid = last;
+ key.offset = 0;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+again:
+ mutex_lock(&caching_ctl->mutex);
+ /* need to make sure the commit_root doesn't disappear */
+ down_read(&fs_info->extent_commit_sem);
+
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto err;
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+
+ while (1) {
+ smp_mb();
+ if (fs_info->closing > 1) {
+ last = (u64)-1;
+ break;
+ }
+
+ if (path->slots[0] < nritems) {
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ } else {
+ ret = find_next_key(path, 0, &key);
+ if (ret)
+ break;
+
+ caching_ctl->progress = last;
+ btrfs_release_path(extent_root, path);
+ up_read(&fs_info->extent_commit_sem);
+ mutex_unlock(&caching_ctl->mutex);
+ if (btrfs_transaction_in_commit(fs_info))
+ schedule_timeout(1);
+ else
+ cond_resched();
+ goto again;
+ }
+
+ if (key.objectid < block_group->key.objectid) {
+ path->slots[0]++;
+ continue;
+ }
+
+ if (key.objectid >= block_group->key.objectid +
+ block_group->key.offset)
+ break;
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+ total_found += add_new_free_space(block_group,
+ fs_info, last,
+ key.objectid);
+ last = key.objectid + key.offset;
+
+ if (total_found > (1024 * 1024 * 2)) {
+ total_found = 0;
+ wake_up(&caching_ctl->wait);
+ }
+ }
+ path->slots[0]++;
+ }
+ ret = 0;
+
+ total_found += add_new_free_space(block_group, fs_info, last,
+ block_group->key.objectid +
+ block_group->key.offset);
+ caching_ctl->progress = (u64)-1;
+
+ spin_lock(&block_group->lock);
+ block_group->caching_ctl = NULL;
+ block_group->cached = BTRFS_CACHE_FINISHED;
+ spin_unlock(&block_group->lock);
+
+err:
+ btrfs_free_path(path);
+ up_read(&fs_info->extent_commit_sem);
+
+ free_excluded_extents(extent_root, block_group);
+
+ mutex_unlock(&caching_ctl->mutex);
+ wake_up(&caching_ctl->wait);
+
+ put_caching_control(caching_ctl);
+ atomic_dec(&block_group->space_info->caching_threads);
+ btrfs_put_block_group(block_group);
+
+ return 0;
+}
+
+static int cache_block_group(struct btrfs_block_group_cache *cache)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_caching_control *caching_ctl;
+ struct task_struct *tsk;
+ int ret = 0;
+
+ smp_mb();
+ if (cache->cached != BTRFS_CACHE_NO)
+ return 0;
+
+ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
+ BUG_ON(!caching_ctl);
+
+ INIT_LIST_HEAD(&caching_ctl->list);
+ mutex_init(&caching_ctl->mutex);
+ init_waitqueue_head(&caching_ctl->wait);
+ caching_ctl->block_group = cache;
+ caching_ctl->progress = cache->key.objectid;
+ /* one for caching kthread, one for caching block group list */
+ atomic_set(&caching_ctl->count, 2);
+
+ spin_lock(&cache->lock);
+ if (cache->cached != BTRFS_CACHE_NO) {
+ spin_unlock(&cache->lock);
+ kfree(caching_ctl);
+ return 0;
+ }
+ cache->caching_ctl = caching_ctl;
+ cache->cached = BTRFS_CACHE_STARTED;
+ spin_unlock(&cache->lock);
+
+ down_write(&fs_info->extent_commit_sem);
+ list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
+ up_write(&fs_info->extent_commit_sem);
+
+ atomic_inc(&cache->space_info->caching_threads);
+ btrfs_get_block_group(cache);
+
+ tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
+ cache->key.objectid);
+ if (IS_ERR(tsk)) {
+ ret = PTR_ERR(tsk);
+ printk(KERN_ERR "error running thread %d\n", ret);
+ BUG();
+ }
+
+ return ret;
+}
+
+/*
+ * return the block group that starts at or after bytenr
+ */
+static struct btrfs_block_group_cache *
+btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = block_group_cache_tree_search(info, bytenr, 0);
+
+ return cache;
+}
+
+/*
+ * return the block group that contains the given bytenr
+ */
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+ struct btrfs_fs_info *info,
+ u64 bytenr)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = block_group_cache_tree_search(info, bytenr, 1);
+
+ return cache;
+}
+
+static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
+ u64 flags)
+{
+ struct list_head *head = &info->space_info;
+ struct btrfs_space_info *found;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(found, head, list) {
+ if (found->flags == flags) {
+ rcu_read_unlock();
+ return found;
+ }
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+
+/*
+ * after adding space to the filesystem, we need to clear the full flags
+ * on all the space infos.
+ */
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
+{
+ struct list_head *head = &info->space_info;
+ struct btrfs_space_info *found;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(found, head, list)
+ found->full = 0;
+ rcu_read_unlock();
+}
+
+static u64 div_factor(u64 num, int factor)
+{
+ if (factor == 10)
+ return num;
+ num *= factor;
+ do_div(num, 10);
+ return num;
+}
+
+u64 btrfs_find_block_group(struct btrfs_root *root,
+ u64 search_start, u64 search_hint, int owner)
+{
+ struct btrfs_block_group_cache *cache;
+ u64 used;
+ u64 last = max(search_hint, search_start);
+ u64 group_start = 0;
+ int full_search = 0;
+ int factor = 9;
+ int wrapped = 0;
+again:
+ while (1) {
+ cache = btrfs_lookup_first_block_group(root->fs_info, last);
+ if (!cache)
+ break;
+
+ spin_lock(&cache->lock);
+ last = cache->key.objectid + cache->key.offset;
+ used = btrfs_block_group_used(&cache->item);
+
+ if ((full_search || !cache->ro) &&
+ block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
+ if (used + cache->pinned + cache->reserved <
+ div_factor(cache->key.offset, factor)) {
+ group_start = cache->key.objectid;
+ spin_unlock(&cache->lock);
+ btrfs_put_block_group(cache);
+ goto found;
+ }
+ }
+ spin_unlock(&cache->lock);
+ btrfs_put_block_group(cache);
+ cond_resched();
+ }
+ if (!wrapped) {
+ last = search_start;
+ wrapped = 1;
+ goto again;
+ }
+ if (!full_search && factor < 10) {
+ last = search_start;
+ full_search = 1;
+ factor = 10;
+ goto again;
+ }
+found:
+ return group_start;
+}
+
+/* simple helper to search for an existing extent at a given offset */
+int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ key.objectid = start;
+ key.offset = len;
+ btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+ ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
+ 0, 0);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Back reference rules. Back refs have three main goals:
+ *
+ * 1) differentiate between all holders of references to an extent so that
+ * when a reference is dropped we can make sure it was a valid reference
+ * before freeing the extent.
+ *
+ * 2) Provide enough information to quickly find the holders of an extent
+ * if we notice a given block is corrupted or bad.
+ *
+ * 3) Make it easy to migrate blocks for FS shrinking or storage pool
+ * maintenance. This is actually the same as #2, but with a slightly
+ * different use case.
+ *
+ * There are two kinds of back refs. The implicit back refs is optimized
+ * for pointers in non-shared tree blocks. For a given pointer in a block,
+ * back refs of this kind provide information about the block's owner tree
+ * and the pointer's key. These information allow us to find the block by
+ * b-tree searching. The full back refs is for pointers in tree blocks not
+ * referenced by their owner trees. The location of tree block is recorded
+ * in the back refs. Actually the full back refs is generic, and can be
+ * used in all cases the implicit back refs is used. The major shortcoming
+ * of the full back refs is its overhead. Every time a tree block gets
+ * COWed, we have to update back refs entry for all pointers in it.
+ *
+ * For a newly allocated tree block, we use implicit back refs for
+ * pointers in it. This means most tree related operations only involve
+ * implicit back refs. For a tree block created in old transaction, the
+ * only way to drop a reference to it is COW it. So we can detect the
+ * event that tree block loses its owner tree's reference and do the
+ * back refs conversion.
+ *
+ * When a tree block is COW'd through a tree, there are four cases:
+ *
+ * The reference count of the block is one and the tree is the block's
+ * owner tree. Nothing to do in this case.
+ *
+ * The reference count of the block is one and the tree is not the
+ * block's owner tree. In this case, full back refs is used for pointers
+ * in the block. Remove these full back refs, add implicit back refs for
+ * every pointers in the new block.
+ *
+ * The reference count of the block is greater than one and the tree is
+ * the block's owner tree. In this case, implicit back refs is used for
+ * pointers in the block. Add full back refs for every pointers in the
+ * block, increase lower level extents' reference counts. The original
+ * implicit back refs are entailed to the new block.
+ *
+ * The reference count of the block is greater than one and the tree is
+ * not the block's owner tree. Add implicit back refs for every pointer in
+ * the new block, increase lower level extents' reference count.
+ *
+ * Back Reference Key composing:
+ *
+ * The key objectid corresponds to the first byte in the extent,
+ * The key type is used to differentiate between types of back refs.
+ * There are different meanings of the key offset for different types
+ * of back refs.
+ *
+ * File extents can be referenced by:
+ *
+ * - multiple snapshots, subvolumes, or different generations in one subvol
+ * - different files inside a single subvolume
+ * - different offsets inside a file (bookend extents in file.c)
+ *
+ * The extent ref structure for the implicit back refs has fields for:
+ *
+ * - Objectid of the subvolume root
+ * - objectid of the file holding the reference
+ * - original offset in the file
+ * - how many bookend extents
+ *
+ * The key offset for the implicit back refs is hash of the first
+ * three fields.
+ *
+ * The extent ref structure for the full back refs has field for:
+ *
+ * - number of pointers in the tree leaf
+ *
+ * The key offset for the implicit back refs is the first byte of
+ * the tree leaf
+ *
+ * When a file extent is allocated, The implicit back refs is used.
+ * the fields are filled in:
+ *
+ * (root_key.objectid, inode objectid, offset in file, 1)
+ *
+ * When a file extent is removed file truncation, we find the
+ * corresponding implicit back refs and check the following fields:
+ *
+ * (btrfs_header_owner(leaf), inode objectid, offset in file)
+ *
+ * Btree extents can be referenced by:
+ *
+ * - Different subvolumes
+ *
+ * Both the implicit back refs and the full back refs for tree blocks
+ * only consist of key. The key offset for the implicit back refs is
+ * objectid of block's owner tree. The key offset for the full back refs
+ * is the first byte of parent block.
+ *
+ * When implicit back refs is used, information about the lowest key and
+ * level of the tree block are required. These information are stored in
+ * tree block info structure.
+ */
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 owner, u32 extra_size)
+{
+ struct btrfs_extent_item *item;
+ struct btrfs_extent_item_v0 *ei0;
+ struct btrfs_extent_ref_v0 *ref0;
+ struct btrfs_tree_block_info *bi;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u32 new_size = sizeof(*item);
+ u64 refs;
+ int ret;
+
+ leaf = path->nodes[0];
+ BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ei0 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item_v0);
+ refs = btrfs_extent_refs_v0(leaf, ei0);
+
+ if (owner == (u64)-1) {
+ while (1) {
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ BUG_ON(ret > 0);
+ leaf = path->nodes[0];
+ }
+ btrfs_item_key_to_cpu(leaf, &found_key,
+ path->slots[0]);
+ BUG_ON(key.objectid != found_key.objectid);
+ if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
+ path->slots[0]++;
+ continue;
+ }
+ ref0 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_ref_v0);
+ owner = btrfs_ref_objectid_v0(leaf, ref0);
+ break;
+ }
+ }
+ btrfs_release_path(root, path);
+
+ if (owner < BTRFS_FIRST_FREE_OBJECTID)
+ new_size += sizeof(*bi);
+
+ new_size -= sizeof(*ei0);
+ ret = btrfs_search_slot(trans, root, &key, path,
+ new_size + extra_size, 1);
+ if (ret < 0)
+ return ret;
+ BUG_ON(ret);
+
+ ret = btrfs_extend_item(trans, root, path, new_size);
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ btrfs_set_extent_refs(leaf, item, refs);
+ /* FIXME: get real generation */
+ btrfs_set_extent_generation(leaf, item, 0);
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_set_extent_flags(leaf, item,
+ BTRFS_EXTENT_FLAG_TREE_BLOCK |
+ BTRFS_BLOCK_FLAG_FULL_BACKREF);
+ bi = (struct btrfs_tree_block_info *)(item + 1);
+ /* FIXME: get first key of the block */
+ memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
+ btrfs_set_tree_block_level(leaf, bi, (int)owner);
+ } else {
+ btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
+ }
+ btrfs_mark_buffer_dirty(leaf);
+ return 0;
+}
+#endif
+
+static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
+{
+ u32 high_crc = ~(u32)0;
+ u32 low_crc = ~(u32)0;
+ __le64 lenum;
+
+ lenum = cpu_to_le64(root_objectid);
+ high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
+ lenum = cpu_to_le64(owner);
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+ lenum = cpu_to_le64(offset);
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+
+ return ((u64)high_crc << 31) ^ (u64)low_crc;
+}
+
+static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
+ struct btrfs_extent_data_ref *ref)
+{
+ return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
+ btrfs_extent_data_ref_objectid(leaf, ref),
+ btrfs_extent_data_ref_offset(leaf, ref));
+}
+
+static int match_extent_data_ref(struct extent_buffer *leaf,
+ struct btrfs_extent_data_ref *ref,
+ u64 root_objectid, u64 owner, u64 offset)
+{
+ if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
+ btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
+ btrfs_extent_data_ref_offset(leaf, ref) != offset)
+ return 0;
+ return 1;
+}
+
+static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 parent,
+ u64 root_objectid,
+ u64 owner, u64 offset)
+{
+ struct btrfs_key key;
+ struct btrfs_extent_data_ref *ref;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ int ret;
+ int recow;
+ int err = -ENOENT;
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_DATA_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_EXTENT_DATA_REF_KEY;
+ key.offset = hash_extent_data_ref(root_objectid,
+ owner, offset);
+ }
+again:
+ recow = 0;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0) {
+ err = ret;
+ goto fail;
+ }
+
+ if (parent) {
+ if (!ret)
+ return 0;
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ key.type = BTRFS_EXTENT_REF_V0_KEY;
+ btrfs_release_path(root, path);
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0) {
+ err = ret;
+ goto fail;
+ }
+ if (!ret)
+ return 0;
+#endif
+ goto fail;
+ }
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ while (1) {
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ err = ret;
+ if (ret)
+ goto fail;
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ recow = 1;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != bytenr ||
+ key.type != BTRFS_EXTENT_DATA_REF_KEY)
+ goto fail;
+
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_data_ref);
+
+ if (match_extent_data_ref(leaf, ref, root_objectid,
+ owner, offset)) {
+ if (recow) {
+ btrfs_release_path(root, path);
+ goto again;
+ }
+ err = 0;
+ break;
+ }
+ path->slots[0]++;
+ }
+fail:
+ return err;
+}
+
+static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 parent,
+ u64 root_objectid, u64 owner,
+ u64 offset, int refs_to_add)
+{
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ u32 size;
+ u32 num_refs;
+ int ret;
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_DATA_REF_KEY;
+ key.offset = parent;
+ size = sizeof(struct btrfs_shared_data_ref);
+ } else {
+ key.type = BTRFS_EXTENT_DATA_REF_KEY;
+ key.offset = hash_extent_data_ref(root_objectid,
+ owner, offset);
+ size = sizeof(struct btrfs_extent_data_ref);
+ }
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, size);
+ if (ret && ret != -EEXIST)
+ goto fail;
+
+ leaf = path->nodes[0];
+ if (parent) {
+ struct btrfs_shared_data_ref *ref;
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_shared_data_ref);
+ if (ret == 0) {
+ btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
+ } else {
+ num_refs = btrfs_shared_data_ref_count(leaf, ref);
+ num_refs += refs_to_add;
+ btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
+ }
+ } else {
+ struct btrfs_extent_data_ref *ref;
+ while (ret == -EEXIST) {
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_data_ref);
+ if (match_extent_data_ref(leaf, ref, root_objectid,
+ owner, offset))
+ break;
+ btrfs_release_path(root, path);
+ key.offset++;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ size);
+ if (ret && ret != -EEXIST)
+ goto fail;
+
+ leaf = path->nodes[0];
+ }
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_data_ref);
+ if (ret == 0) {
+ btrfs_set_extent_data_ref_root(leaf, ref,
+ root_objectid);
+ btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
+ btrfs_set_extent_data_ref_offset(leaf, ref, offset);
+ btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
+ } else {
+ num_refs = btrfs_extent_data_ref_count(leaf, ref);
+ num_refs += refs_to_add;
+ btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
+ }
+ }
+ btrfs_mark_buffer_dirty(leaf);
+ ret = 0;
+fail:
+ btrfs_release_path(root, path);
+ return ret;
+}
+
+static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ int refs_to_drop)
+{
+ struct btrfs_key key;
+ struct btrfs_extent_data_ref *ref1 = NULL;
+ struct btrfs_shared_data_ref *ref2 = NULL;
+ struct extent_buffer *leaf;
+ u32 num_refs = 0;
+ int ret = 0;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+ ref1 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_data_ref);
+ num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+ } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+ ref2 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_shared_data_ref);
+ num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
+ struct btrfs_extent_ref_v0 *ref0;
+ ref0 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_ref_v0);
+ num_refs = btrfs_ref_count_v0(leaf, ref0);
+#endif
+ } else {
+ BUG();
+ }
+
+ BUG_ON(num_refs < refs_to_drop);
+ num_refs -= refs_to_drop;
+
+ if (num_refs == 0) {
+ ret = btrfs_del_item(trans, root, path);
+ } else {
+ if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
+ btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
+ else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
+ btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ else {
+ struct btrfs_extent_ref_v0 *ref0;
+ ref0 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_ref_v0);
+ btrfs_set_ref_count_v0(leaf, ref0, num_refs);
+ }
+#endif
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ return ret;
+}
+
+static noinline u32 extent_data_ref_count(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref)
+{
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_data_ref *ref1;
+ struct btrfs_shared_data_ref *ref2;
+ u32 num_refs = 0;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (iref) {
+ if (btrfs_extent_inline_ref_type(leaf, iref) ==
+ BTRFS_EXTENT_DATA_REF_KEY) {
+ ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
+ num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+ } else {
+ ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
+ num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+ }
+ } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+ ref1 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_data_ref);
+ num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+ } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+ ref2 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_shared_data_ref);
+ num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
+ struct btrfs_extent_ref_v0 *ref0;
+ ref0 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_ref_v0);
+ num_refs = btrfs_ref_count_v0(leaf, ref0);
+#endif
+ } else {
+ WARN_ON(1);
+ }
+ return num_refs;
+}
+
+static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 parent,
+ u64 root_objectid)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (ret == -ENOENT && parent) {
+ btrfs_release_path(root, path);
+ key.type = BTRFS_EXTENT_REF_V0_KEY;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+ }
+#endif
+ return ret;
+}
+
+static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 parent,
+ u64 root_objectid)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ btrfs_release_path(root, path);
+ return ret;
+}
+
+static inline int extent_ref_type(u64 parent, u64 owner)
+{
+ int type;
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ if (parent > 0)
+ type = BTRFS_SHARED_BLOCK_REF_KEY;
+ else
+ type = BTRFS_TREE_BLOCK_REF_KEY;
+ } else {
+ if (parent > 0)
+ type = BTRFS_SHARED_DATA_REF_KEY;
+ else
+ type = BTRFS_EXTENT_DATA_REF_KEY;
+ }
+ return type;
+}
+
+static int find_next_key(struct btrfs_path *path, int level,
+ struct btrfs_key *key)
+
+{
+ for (; level < BTRFS_MAX_LEVEL; level++) {
+ if (!path->nodes[level])
+ break;
+ if (path->slots[level] + 1 >=
+ btrfs_header_nritems(path->nodes[level]))
+ continue;
+ if (level == 0)
+ btrfs_item_key_to_cpu(path->nodes[level], key,
+ path->slots[level] + 1);
+ else
+ btrfs_node_key_to_cpu(path->nodes[level], key,
+ path->slots[level] + 1);
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * look for inline back ref. if back ref is found, *ref_ret is set
+ * to the address of inline back ref, and 0 is returned.
+ *
+ * if back ref isn't found, *ref_ret is set to the address where it
+ * should be inserted, and -ENOENT is returned.
+ *
+ * if insert is true and there are too many inline back refs, the path
+ * points to the extent item, and -EAGAIN is returned.
+ *
+ * NOTE: inline back refs are ordered in the same way that back ref
+ * items in the tree are ordered.
+ */
+static noinline_for_stack
+int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref **ref_ret,
+ u64 bytenr, u64 num_bytes,
+ u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int insert)
+{
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_inline_ref *iref;
+ u64 flags;
+ u64 item_size;
+ unsigned long ptr;
+ unsigned long end;
+ int extra_size;
+ int type;
+ int want;
+ int ret;
+ int err = 0;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ want = extent_ref_type(parent, owner);
+ if (insert) {
+ extra_size = btrfs_extent_inline_ref_size(want);
+ path->keep_locks = 1;
+ } else
+ extra_size = -1;
+ ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (item_size < sizeof(*ei)) {
+ if (!insert) {
+ err = -ENOENT;
+ goto out;
+ }
+ ret = convert_extent_item_v0(trans, root, path, owner,
+ extra_size);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ }
+#endif
+ BUG_ON(item_size < sizeof(*ei));
+
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ flags = btrfs_extent_flags(leaf, ei);
+
+ ptr = (unsigned long)(ei + 1);
+ end = (unsigned long)ei + item_size;
+
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ ptr += sizeof(struct btrfs_tree_block_info);
+ BUG_ON(ptr > end);
+ } else {
+ BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
+ }
+
+ err = -ENOENT;
+ while (1) {
+ if (ptr >= end) {
+ WARN_ON(ptr > end);
+ break;
+ }
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_extent_inline_ref_type(leaf, iref);
+ if (want < type)
+ break;
+ if (want > type) {
+ ptr += btrfs_extent_inline_ref_size(type);
+ continue;
+ }
+
+ if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+ struct btrfs_extent_data_ref *dref;
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ if (match_extent_data_ref(leaf, dref, root_objectid,
+ owner, offset)) {
+ err = 0;
+ break;
+ }
+ if (hash_extent_data_ref_item(leaf, dref) <
+ hash_extent_data_ref(root_objectid, owner, offset))
+ break;
+ } else {
+ u64 ref_offset;
+ ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
+ if (parent > 0) {
+ if (parent == ref_offset) {
+ err = 0;
+ break;
+ }
+ if (ref_offset < parent)
+ break;
+ } else {
+ if (root_objectid == ref_offset) {
+ err = 0;
+ break;
+ }
+ if (ref_offset < root_objectid)
+ break;
+ }
+ }
+ ptr += btrfs_extent_inline_ref_size(type);
+ }
+ if (err == -ENOENT && insert) {
+ if (item_size + extra_size >=
+ BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
+ err = -EAGAIN;
+ goto out;
+ }
+ /*
+ * To add new inline back ref, we have to make sure
+ * there is no corresponding back ref item.
+ * For simplicity, we just do not add new inline back
+ * ref if there is any kind of item for this block
+ */
+ if (find_next_key(path, 0, &key) == 0 &&
+ key.objectid == bytenr &&
+ key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
+ err = -EAGAIN;
+ goto out;
+ }
+ }
+ *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
+out:
+ if (insert) {
+ path->keep_locks = 0;
+ btrfs_unlock_up_safe(path, 1);
+ }
+ return err;
+}
+
+/*
+ * helper to add new inline back ref
+ */
+static noinline_for_stack
+int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref,
+ u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int refs_to_add,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_extent_item *ei;
+ unsigned long ptr;
+ unsigned long end;
+ unsigned long item_offset;
+ u64 refs;
+ int size;
+ int type;
+ int ret;
+
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ item_offset = (unsigned long)iref - (unsigned long)ei;
+
+ type = extent_ref_type(parent, owner);
+ size = btrfs_extent_inline_ref_size(type);
+
+ ret = btrfs_extend_item(trans, root, path, size);
+ BUG_ON(ret);
+
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ refs = btrfs_extent_refs(leaf, ei);
+ refs += refs_to_add;
+ btrfs_set_extent_refs(leaf, ei, refs);
+ if (extent_op)
+ __run_delayed_extent_op(extent_op, leaf, ei);
+
+ ptr = (unsigned long)ei + item_offset;
+ end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
+ if (ptr < end - size)
+ memmove_extent_buffer(leaf, ptr + size, ptr,
+ end - size - ptr);
+
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ btrfs_set_extent_inline_ref_type(leaf, iref, type);
+ if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+ struct btrfs_extent_data_ref *dref;
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
+ btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
+ btrfs_set_extent_data_ref_offset(leaf, dref, offset);
+ btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
+ } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+ struct btrfs_shared_data_ref *sref;
+ sref = (struct btrfs_shared_data_ref *)(iref + 1);
+ btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
+ btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ } else {
+ btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+ }
+ btrfs_mark_buffer_dirty(leaf);
+ return 0;
+}
+
+static int lookup_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref **ref_ret,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 owner, u64 offset)
+{
+ int ret;
+
+ ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
+ bytenr, num_bytes, parent,
+ root_objectid, owner, offset, 0);
+ if (ret != -ENOENT)
+ return ret;
+
+ btrfs_release_path(root, path);
+ *ref_ret = NULL;
+
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
+ root_objectid);
+ } else {
+ ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
+ root_objectid, owner, offset);
+ }
+ return ret;
+}
+
+/*
+ * helper to update/remove inline back ref
+ */
+static noinline_for_stack
+int update_inline_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref,
+ int refs_to_mod,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_data_ref *dref = NULL;
+ struct btrfs_shared_data_ref *sref = NULL;
+ unsigned long ptr;
+ unsigned long end;
+ u32 item_size;
+ int size;
+ int type;
+ int ret;
+ u64 refs;
+
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ refs = btrfs_extent_refs(leaf, ei);
+ WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
+ refs += refs_to_mod;
+ btrfs_set_extent_refs(leaf, ei, refs);
+ if (extent_op)
+ __run_delayed_extent_op(extent_op, leaf, ei);
+
+ type = btrfs_extent_inline_ref_type(leaf, iref);
+
+ if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ refs = btrfs_extent_data_ref_count(leaf, dref);
+ } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+ sref = (struct btrfs_shared_data_ref *)(iref + 1);
+ refs = btrfs_shared_data_ref_count(leaf, sref);
+ } else {
+ refs = 1;
+ BUG_ON(refs_to_mod != -1);
+ }
+
+ BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
+ refs += refs_to_mod;
+
+ if (refs > 0) {
+ if (type == BTRFS_EXTENT_DATA_REF_KEY)
+ btrfs_set_extent_data_ref_count(leaf, dref, refs);
+ else
+ btrfs_set_shared_data_ref_count(leaf, sref, refs);
+ } else {
+ size = btrfs_extent_inline_ref_size(type);
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr = (unsigned long)iref;
+ end = (unsigned long)ei + item_size;
+ if (ptr + size < end)
+ memmove_extent_buffer(leaf, ptr, ptr + size,
+ end - ptr - size);
+ item_size -= size;
+ ret = btrfs_truncate_item(trans, root, path, item_size, 1);
+ BUG_ON(ret);
+ }
+ btrfs_mark_buffer_dirty(leaf);
+ return 0;
+}
+
+static noinline_for_stack
+int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 owner,
+ u64 offset, int refs_to_add,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_extent_inline_ref *iref;
+ int ret;
+
+ ret = lookup_inline_extent_backref(trans, root, path, &iref,
+ bytenr, num_bytes, parent,
+ root_objectid, owner, offset, 1);
+ if (ret == 0) {
+ BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
+ ret = update_inline_extent_backref(trans, root, path, iref,
+ refs_to_add, extent_op);
+ } else if (ret == -ENOENT) {
+ ret = setup_inline_extent_backref(trans, root, path, iref,
+ parent, root_objectid,
+ owner, offset, refs_to_add,
+ extent_op);
+ }
+ return ret;
+}
+
+static int insert_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int refs_to_add)
+{
+ int ret;
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ BUG_ON(refs_to_add != 1);
+ ret = insert_tree_block_ref(trans, root, path, bytenr,
+ parent, root_objectid);
+ } else {
+ ret = insert_extent_data_ref(trans, root, path, bytenr,
+ parent, root_objectid,
+ owner, offset, refs_to_add);
+ }
+ return ret;
+}
+
+static int remove_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref,
+ int refs_to_drop, int is_data)
+{
+ int ret;
+
+ BUG_ON(!is_data && refs_to_drop != 1);
+ if (iref) {
+ ret = update_inline_extent_backref(trans, root, path, iref,
+ -refs_to_drop, NULL);
+ } else if (is_data) {
+ ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
+ } else {
+ ret = btrfs_del_item(trans, root, path);
+ }
+ return ret;
+}
+
+static void btrfs_issue_discard(struct block_device *bdev,
+ u64 start, u64 len)
+{
+ blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
+ DISCARD_FL_BARRIER);
+}
+
+static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes)
+{
+ int ret;
+ u64 map_length = num_bytes;
+ struct btrfs_multi_bio *multi = NULL;
+
+ if (!btrfs_test_opt(root, DISCARD))
+ return 0;
+
+ /* Tell the block device(s) that the sectors can be discarded */
+ ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
+ bytenr, &map_length, &multi, 0);
+ if (!ret) {
+ struct btrfs_bio_stripe *stripe = multi->stripes;
+ int i;
+
+ if (map_length > num_bytes)
+ map_length = num_bytes;
+
+ for (i = 0; i < multi->num_stripes; i++, stripe++) {
+ btrfs_issue_discard(stripe->dev->bdev,
+ stripe->physical,
+ map_length);
+ }
+ kfree(multi);
+ }
+
+ return ret;
+}
+
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 owner, u64 offset)
+{
+ int ret;
+ BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
+ root_objectid == BTRFS_TREE_LOG_OBJECTID);
+
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+ parent, root_objectid, (int)owner,
+ BTRFS_ADD_DELAYED_REF, NULL);
+ } else {
+ ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
+ parent, root_objectid, owner, offset,
+ BTRFS_ADD_DELAYED_REF, NULL);
+ }
+ return ret;
+}
+
+static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes,
+ u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int refs_to_add,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_item *item;
+ u64 refs;
+ int ret;
+ int err = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->reada = 1;
+ path->leave_spinning = 1;
+ /* this will setup the path even if it fails to insert the back ref */
+ ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
+ path, bytenr, num_bytes, parent,
+ root_objectid, owner, offset,
+ refs_to_add, extent_op);
+ if (ret == 0)
+ goto out;
+
+ if (ret != -EAGAIN) {
+ err = ret;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ refs = btrfs_extent_refs(leaf, item);
+ btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
+ if (extent_op)
+ __run_delayed_extent_op(extent_op, leaf, item);
+
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(root->fs_info->extent_root, path);
+
+ path->reada = 1;
+ path->leave_spinning = 1;
+
+ /* now insert the actual backref */
+ ret = insert_extent_backref(trans, root->fs_info->extent_root,
+ path, bytenr, parent, root_objectid,
+ owner, offset, refs_to_add);
+ BUG_ON(ret);
+out:
+ btrfs_free_path(path);
+ return err;
+}
+
+static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op,
+ int insert_reserved)
+{
+ int ret = 0;
+ struct btrfs_delayed_data_ref *ref;
+ struct btrfs_key ins;
+ u64 parent = 0;
+ u64 ref_root = 0;
+ u64 flags = 0;
+
+ ins.objectid = node->bytenr;
+ ins.offset = node->num_bytes;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+ ref = btrfs_delayed_node_to_data_ref(node);
+ if (node->type == BTRFS_SHARED_DATA_REF_KEY)
+ parent = ref->parent;
+ else
+ ref_root = ref->root;
+
+ if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+ if (extent_op) {
+ BUG_ON(extent_op->update_key);
+ flags |= extent_op->flags_to_set;
+ }
+ ret = alloc_reserved_file_extent(trans, root,
+ parent, ref_root, flags,
+ ref->objectid, ref->offset,
+ &ins, node->ref_mod);
+ } else if (node->action == BTRFS_ADD_DELAYED_REF) {
+ ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
+ node->num_bytes, parent,
+ ref_root, ref->objectid,
+ ref->offset, node->ref_mod,
+ extent_op);
+ } else if (node->action == BTRFS_DROP_DELAYED_REF) {
+ ret = __btrfs_free_extent(trans, root, node->bytenr,
+ node->num_bytes, parent,
+ ref_root, ref->objectid,
+ ref->offset, node->ref_mod,
+ extent_op);
+ } else {
+ BUG();
+ }
+ return ret;
+}
+
+static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
+ struct extent_buffer *leaf,
+ struct btrfs_extent_item *ei)
+{
+ u64 flags = btrfs_extent_flags(leaf, ei);
+ if (extent_op->update_flags) {
+ flags |= extent_op->flags_to_set;
+ btrfs_set_extent_flags(leaf, ei, flags);
+ }
+
+ if (extent_op->update_key) {
+ struct btrfs_tree_block_info *bi;
+ BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
+ bi = (struct btrfs_tree_block_info *)(ei + 1);
+ btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
+ }
+}
+
+static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct btrfs_extent_item *ei;
+ struct extent_buffer *leaf;
+ u32 item_size;
+ int ret;
+ int err = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = node->bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = node->num_bytes;
+
+ path->reada = 1;
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
+ path, 0, 1);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret > 0) {
+ err = -EIO;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (item_size < sizeof(*ei)) {
+ ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
+ path, (u64)-1, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ }
+#endif
+ BUG_ON(item_size < sizeof(*ei));
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ __run_delayed_extent_op(extent_op, leaf, ei);
+
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ return err;
+}
+
+static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op,
+ int insert_reserved)
+{
+ int ret = 0;
+ struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_key ins;
+ u64 parent = 0;
+ u64 ref_root = 0;
+
+ ins.objectid = node->bytenr;
+ ins.offset = node->num_bytes;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+ ref = btrfs_delayed_node_to_tree_ref(node);
+ if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
+ parent = ref->parent;
+ else
+ ref_root = ref->root;
+
+ BUG_ON(node->ref_mod != 1);
+ if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+ BUG_ON(!extent_op || !extent_op->update_flags ||
+ !extent_op->update_key);
+ ret = alloc_reserved_tree_block(trans, root,
+ parent, ref_root,
+ extent_op->flags_to_set,
+ &extent_op->key,
+ ref->level, &ins);
+ } else if (node->action == BTRFS_ADD_DELAYED_REF) {
+ ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
+ node->num_bytes, parent, ref_root,
+ ref->level, 0, 1, extent_op);
+ } else if (node->action == BTRFS_DROP_DELAYED_REF) {
+ ret = __btrfs_free_extent(trans, root, node->bytenr,
+ node->num_bytes, parent, ref_root,
+ ref->level, 0, 1, extent_op);
+ } else {
+ BUG();
+ }
+ return ret;
+}
+
+
+/* helper function to actually process a single delayed ref entry */
+static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op,
+ int insert_reserved)
+{
+ int ret;
+ if (btrfs_delayed_ref_is_head(node)) {
+ struct btrfs_delayed_ref_head *head;
+ /*
+ * we've hit the end of the chain and we were supposed
+ * to insert this extent into the tree. But, it got
+ * deleted before we ever needed to insert it, so all
+ * we have to do is clean up the accounting
+ */
+ BUG_ON(extent_op);
+ head = btrfs_delayed_node_to_head(node);
+ if (insert_reserved) {
+ int mark_free = 0;
+ struct extent_buffer *must_clean = NULL;
+
+ ret = pin_down_bytes(trans, root, NULL,
+ node->bytenr, node->num_bytes,
+ head->is_data, 1, &must_clean);
+ if (ret > 0)
+ mark_free = 1;
+
+ if (must_clean) {
+ clean_tree_block(NULL, root, must_clean);
+ btrfs_tree_unlock(must_clean);
+ free_extent_buffer(must_clean);
+ }
+ if (head->is_data) {
+ ret = btrfs_del_csums(trans, root,
+ node->bytenr,
+ node->num_bytes);
+ BUG_ON(ret);
+ }
+ if (mark_free) {
+ ret = btrfs_free_reserved_extent(root,
+ node->bytenr,
+ node->num_bytes);
+ BUG_ON(ret);
+ }
+ }
+ mutex_unlock(&head->mutex);
+ return 0;
+ }
+
+ if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ node->type == BTRFS_SHARED_BLOCK_REF_KEY)
+ ret = run_delayed_tree_ref(trans, root, node, extent_op,
+ insert_reserved);
+ else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ node->type == BTRFS_SHARED_DATA_REF_KEY)
+ ret = run_delayed_data_ref(trans, root, node, extent_op,
+ insert_reserved);
+ else
+ BUG();
+ return ret;
+}
+
+static noinline struct btrfs_delayed_ref_node *
+select_delayed_ref(struct btrfs_delayed_ref_head *head)
+{
+ struct rb_node *node;
+ struct btrfs_delayed_ref_node *ref;
+ int action = BTRFS_ADD_DELAYED_REF;
+again:
+ /*
+ * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
+ * this prevents ref count from going down to zero when
+ * there still are pending delayed ref.
+ */
+ node = rb_prev(&head->node.rb_node);
+ while (1) {
+ if (!node)
+ break;
+ ref = rb_entry(node, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (ref->bytenr != head->node.bytenr)
+ break;
+ if (ref->action == action)
+ return ref;
+ node = rb_prev(node);
+ }
+ if (action == BTRFS_ADD_DELAYED_REF) {
+ action = BTRFS_DROP_DELAYED_REF;
+ goto again;
+ }
+ return NULL;
+}
+
+static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct list_head *cluster)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_head *locked_ref = NULL;
+ struct btrfs_delayed_extent_op *extent_op;
+ int ret;
+ int count = 0;
+ int must_insert_reserved = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ while (1) {
+ if (!locked_ref) {
+ /* pick a new head ref from the cluster list */
+ if (list_empty(cluster))
+ break;
+
+ locked_ref = list_entry(cluster->next,
+ struct btrfs_delayed_ref_head, cluster);
+
+ /* grab the lock that says we are going to process
+ * all the refs for this head */
+ ret = btrfs_delayed_ref_lock(trans, locked_ref);
+
+ /*
+ * we may have dropped the spin lock to get the head
+ * mutex lock, and that might have given someone else
+ * time to free the head. If that's true, it has been
+ * removed from our list and we can move on.
+ */
+ if (ret == -EAGAIN) {
+ locked_ref = NULL;
+ count++;
+ continue;
+ }
+ }
+
+ /*
+ * record the must insert reserved flag before we
+ * drop the spin lock.
+ */
+ must_insert_reserved = locked_ref->must_insert_reserved;
+ locked_ref->must_insert_reserved = 0;
+
+ extent_op = locked_ref->extent_op;
+ locked_ref->extent_op = NULL;
+
+ /*
+ * locked_ref is the head node, so we have to go one
+ * node back for any delayed ref updates
+ */
+ ref = select_delayed_ref(locked_ref);
+ if (!ref) {
+ /* All delayed refs have been processed, Go ahead
+ * and send the head node to run_one_delayed_ref,
+ * so that any accounting fixes can happen
+ */
+ ref = &locked_ref->node;
+
+ if (extent_op && must_insert_reserved) {
+ kfree(extent_op);
+ extent_op = NULL;
+ }
+
+ if (extent_op) {
+ spin_unlock(&delayed_refs->lock);
+
+ ret = run_delayed_extent_op(trans, root,
+ ref, extent_op);
+ BUG_ON(ret);
+ kfree(extent_op);
+
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ continue;
+ }
+
+ list_del_init(&locked_ref->cluster);
+ locked_ref = NULL;
+ }
+
+ ref->in_tree = 0;
+ rb_erase(&ref->rb_node, &delayed_refs->root);
+ delayed_refs->num_entries--;
+
+ spin_unlock(&delayed_refs->lock);
+
+ ret = run_one_delayed_ref(trans, root, ref, extent_op,
+ must_insert_reserved);
+ BUG_ON(ret);
+
+ btrfs_put_delayed_ref(ref);
+ kfree(extent_op);
+ count++;
+
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ }
+ return count;
+}
+
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far. count can be
+ * 0, which means to process everything in the tree at the start
+ * of the run (but not newly added entries), or it can be some target
+ * number you'd like to process.
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, unsigned long count)
+{
+ struct rb_node *node;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ struct list_head cluster;
+ int ret;
+ int run_all = count == (unsigned long)-1;
+ int run_most = 0;
+
+ if (root == root->fs_info->extent_root)
+ root = root->fs_info->tree_root;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ INIT_LIST_HEAD(&cluster);
+again:
+ spin_lock(&delayed_refs->lock);
+ if (count == 0) {
+ count = delayed_refs->num_entries * 2;
+ run_most = 1;
+ }
+ while (1) {
+ if (!(run_all || run_most) &&
+ delayed_refs->num_heads_ready < 64)
+ break;
+
+ /*
+ * go find something we can process in the rbtree. We start at
+ * the beginning of the tree, and then build a cluster
+ * of refs to process starting at the first one we are able to
+ * lock
+ */
+ ret = btrfs_find_ref_cluster(trans, &cluster,
+ delayed_refs->run_delayed_start);
+ if (ret)
+ break;
+
+ ret = run_clustered_refs(trans, root, &cluster);
+ BUG_ON(ret < 0);
+
+ count -= min_t(unsigned long, ret, count);
+
+ if (count == 0)
+ break;
+ }
+
+ if (run_all) {
+ node = rb_first(&delayed_refs->root);
+ if (!node)
+ goto out;
+ count = (unsigned long)-1;
+
+ while (node) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (btrfs_delayed_ref_is_head(ref)) {
+ struct btrfs_delayed_ref_head *head;
+
+ head = btrfs_delayed_node_to_head(ref);
+ atomic_inc(&ref->refs);
+
+ spin_unlock(&delayed_refs->lock);
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+
+ btrfs_put_delayed_ref(ref);
+ cond_resched();
+ goto again;
+ }
+ node = rb_next(node);
+ }
+ spin_unlock(&delayed_refs->lock);
+ schedule_timeout(1);
+ goto again;
+ }
+out:
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+}
+
+int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 flags,
+ int is_data)
+{
+ struct btrfs_delayed_extent_op *extent_op;
+ int ret;
+
+ extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+ if (!extent_op)
+ return -ENOMEM;
+
+ extent_op->flags_to_set = flags;
+ extent_op->update_flags = 1;
+ extent_op->update_key = 0;
+ extent_op->is_data = is_data ? 1 : 0;
+
+ ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+ if (ret)
+ kfree(extent_op);
+ return ret;
+}
+
+static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 objectid, u64 offset, u64 bytenr)
+{
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_data_ref *data_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct rb_node *node;
+ int ret = 0;
+
+ ret = -ENOENT;
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(trans, bytenr);
+ if (!head)
+ goto out;
+
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(root->fs_info->extent_root, path);
+
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ return -EAGAIN;
+ }
+
+ node = rb_prev(&head->node.rb_node);
+ if (!node)
+ goto out_unlock;
+
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+
+ if (ref->bytenr != bytenr)
+ goto out_unlock;
+
+ ret = 1;
+ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
+ goto out_unlock;
+
+ data_ref = btrfs_delayed_node_to_data_ref(ref);
+
+ node = rb_prev(node);
+ if (node) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ if (ref->bytenr == bytenr)
+ goto out_unlock;
+ }
+
+ if (data_ref->root != root->root_key.objectid ||
+ data_ref->objectid != objectid || data_ref->offset != offset)
+ goto out_unlock;
+
+ ret = 0;
+out_unlock:
+ mutex_unlock(&head->mutex);
+out:
+ spin_unlock(&delayed_refs->lock);
+ return ret;
+}
+
+static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 objectid, u64 offset, u64 bytenr)
+{
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_data_ref *ref;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_extent_item *ei;
+ struct btrfs_key key;
+ u32 item_size;
+ int ret;
+
+ key.objectid = bytenr;
+ key.offset = (u64)-1;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+ ret = -ENOENT;
+ if (path->slots[0] == 0)
+ goto out;
+
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
+ goto out;
+
+ ret = 1;
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (item_size < sizeof(*ei)) {
+ WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
+ goto out;
+ }
+#endif
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+
+ if (item_size != sizeof(*ei) +
+ btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
+ goto out;
+
+ if (btrfs_extent_generation(leaf, ei) <=
+ btrfs_root_last_snapshot(&root->root_item))
+ goto out;
+
+ iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+ if (btrfs_extent_inline_ref_type(leaf, iref) !=
+ BTRFS_EXTENT_DATA_REF_KEY)
+ goto out;
+
+ ref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ if (btrfs_extent_refs(leaf, ei) !=
+ btrfs_extent_data_ref_count(leaf, ref) ||
+ btrfs_extent_data_ref_root(leaf, ref) !=
+ root->root_key.objectid ||
+ btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
+ btrfs_extent_data_ref_offset(leaf, ref) != offset)
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 objectid, u64 offset, u64 bytenr)
+{
+ struct btrfs_path *path;
+ int ret;
+ int ret2;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOENT;
+
+ do {
+ ret = check_committed_ref(trans, root, path, objectid,
+ offset, bytenr);
+ if (ret && ret != -ENOENT)
+ goto out;
+
+ ret2 = check_delayed_ref(trans, root, path, objectid,
+ offset, bytenr);
+ } while (ret2 == -EAGAIN);
+
+ if (ret2 && ret2 != -ENOENT) {
+ ret = ret2;
+ goto out;
+ }
+
+ if (ret != -ENOENT || ret2 != -ENOENT)
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+#if 0
+int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, u32 nr_extents)
+{
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ u64 root_gen;
+ u32 nritems;
+ int i;
+ int level;
+ int ret = 0;
+ int shared = 0;
+
+ if (!root->ref_cows)
+ return 0;
+
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ shared = 0;
+ root_gen = root->root_key.offset;
+ } else {
+ shared = 1;
+ root_gen = trans->transid - 1;
+ }
+
+ level = btrfs_header_level(buf);
+ nritems = btrfs_header_nritems(buf);
+
+ if (level == 0) {
+ struct btrfs_leaf_ref *ref;
+ struct btrfs_extent_info *info;
+
+ ref = btrfs_alloc_leaf_ref(root, nr_extents);
+ if (!ref) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ref->root_gen = root_gen;
+ ref->bytenr = buf->start;
+ ref->owner = btrfs_header_owner(buf);
+ ref->generation = btrfs_header_generation(buf);
+ ref->nritems = nr_extents;
+ info = ref->extents;
+
+ for (i = 0; nr_extents > 0 && i < nritems; i++) {
+ u64 disk_bytenr;
+ btrfs_item_key_to_cpu(buf, &key, i);
+ if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(buf, i,
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(buf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+ if (disk_bytenr == 0)
+ continue;
+
+ info->bytenr = disk_bytenr;
+ info->num_bytes =
+ btrfs_file_extent_disk_num_bytes(buf, fi);
+ info->objectid = key.objectid;
+ info->offset = key.offset;
+ info++;
+ }
+
+ ret = btrfs_add_leaf_ref(root, ref, shared);
+ if (ret == -EEXIST && shared) {
+ struct btrfs_leaf_ref *old;
+ old = btrfs_lookup_leaf_ref(root, ref->bytenr);
+ BUG_ON(!old);
+ btrfs_remove_leaf_ref(root, old);
+ btrfs_free_leaf_ref(root, old);
+ ret = btrfs_add_leaf_ref(root, ref, shared);
+ }
+ WARN_ON(ret);
+ btrfs_free_leaf_ref(root, ref);
+ }
+out:
+ return ret;
+}
+
+/* when a block goes through cow, we update the reference counts of
+ * everything that block points to. The internal pointers of the block
+ * can be in just about any order, and it is likely to have clusters of
+ * things that are close together and clusters of things that are not.
+ *
+ * To help reduce the seeks that come with updating all of these reference
+ * counts, sort them by byte number before actual updates are done.
+ *
+ * struct refsort is used to match byte number to slot in the btree block.
+ * we sort based on the byte number and then use the slot to actually
+ * find the item.
+ *
+ * struct refsort is smaller than strcut btrfs_item and smaller than
+ * struct btrfs_key_ptr. Since we're currently limited to the page size
+ * for a btree block, there's no way for a kmalloc of refsorts for a
+ * single node to be bigger than a page.
+ */
+struct refsort {
+ u64 bytenr;
+ u32 slot;
+};
+
+/*
+ * for passing into sort()
+ */
+static int refsort_cmp(const void *a_void, const void *b_void)
+{
+ const struct refsort *a = a_void;
+ const struct refsort *b = b_void;
+
+ if (a->bytenr < b->bytenr)
+ return -1;
+ if (a->bytenr > b->bytenr)
+ return 1;
+ return 0;
+}
+#endif
+
+static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ int full_backref, int inc)
+{
+ u64 bytenr;
+ u64 num_bytes;
+ u64 parent;
+ u64 ref_root;
+ u32 nritems;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ int i;
+ int level;
+ int ret = 0;
+ int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
+ u64, u64, u64, u64, u64, u64);
+
+ ref_root = btrfs_header_owner(buf);
+ nritems = btrfs_header_nritems(buf);
+ level = btrfs_header_level(buf);
+
+ if (!root->ref_cows && level == 0)
+ return 0;
+
+ if (inc)
+ process_func = btrfs_inc_extent_ref;
+ else
+ process_func = btrfs_free_extent;
+
+ if (full_backref)
+ parent = buf->start;
+ else
+ parent = 0;
+
+ for (i = 0; i < nritems; i++) {
+ if (level == 0) {
+ btrfs_item_key_to_cpu(buf, &key, i);
+ if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(buf, i,
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(buf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+ if (bytenr == 0)
+ continue;
+
+ num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
+ key.offset -= btrfs_file_extent_offset(buf, fi);
+ ret = process_func(trans, root, bytenr, num_bytes,
+ parent, ref_root, key.objectid,
+ key.offset);
+ if (ret)
+ goto fail;
+ } else {
+ bytenr = btrfs_node_blockptr(buf, i);
+ num_bytes = btrfs_level_size(root, level - 1);
+ ret = process_func(trans, root, bytenr, num_bytes,
+ parent, ref_root, level - 1, 0);
+ if (ret)
+ goto fail;
+ }
+ }
+ return 0;
+fail:
+ BUG();
+ return ret;
+}
+
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, int full_backref)
+{
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+}
+
+int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, int full_backref)
+{
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+}
+
+static int write_one_cache_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_block_group_cache *cache)
+{
+ int ret;
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
+ unsigned long bi;
+ struct extent_buffer *leaf;
+
+ ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
+ if (ret < 0)
+ goto fail;
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+ bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(extent_root, path);
+fail:
+ if (ret)
+ return ret;
+ return 0;
+
+}
+
+static struct btrfs_block_group_cache *
+next_block_group(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
+{
+ struct rb_node *node;
+ spin_lock(&root->fs_info->block_group_cache_lock);
+ node = rb_next(&cache->cache_node);
+ btrfs_put_block_group(cache);
+ if (node) {
+ cache = rb_entry(node, struct btrfs_block_group_cache,
+ cache_node);
+ btrfs_get_block_group(cache);
+ } else
+ cache = NULL;
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+ return cache;
+}
+
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_group_cache *cache;
+ int err = 0;
+ struct btrfs_path *path;
+ u64 last = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ if (last == 0) {
+ err = btrfs_run_delayed_refs(trans, root,
+ (unsigned long)-1);
+ BUG_ON(err);
+ }
+
+ cache = btrfs_lookup_first_block_group(root->fs_info, last);
+ while (cache) {
+ if (cache->dirty)
+ break;
+ cache = next_block_group(root, cache);
+ }
+ if (!cache) {
+ if (last == 0)
+ break;
+ last = 0;
+ continue;
+ }
+
+ cache->dirty = 0;
+ last = cache->key.objectid + cache->key.offset;
+
+ err = write_one_cache_group(trans, root, path, cache);
+ BUG_ON(err);
+ btrfs_put_block_group(cache);
+ }
+
+ btrfs_free_path(path);
+ return 0;
+}
+
+int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
+{
+ struct btrfs_block_group_cache *block_group;
+ int readonly = 0;
+
+ block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
+ if (!block_group || block_group->ro)
+ readonly = 1;
+ if (block_group)
+ btrfs_put_block_group(block_group);
+ return readonly;
+}
+
+static int update_space_info(struct btrfs_fs_info *info, u64 flags,
+ u64 total_bytes, u64 bytes_used,
+ struct btrfs_space_info **space_info)
+{
+ struct btrfs_space_info *found;
+
+ found = __find_space_info(info, flags);
+ if (found) {
+ spin_lock(&found->lock);
+ found->total_bytes += total_bytes;
+ found->bytes_used += bytes_used;
+ found->full = 0;
+ spin_unlock(&found->lock);
+ *space_info = found;
+ return 0;
+ }
+ found = kzalloc(sizeof(*found), GFP_NOFS);
+ if (!found)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&found->block_groups);
+ init_rwsem(&found->groups_sem);
+ spin_lock_init(&found->lock);
+ found->flags = flags;
+ found->total_bytes = total_bytes;
+ found->bytes_used = bytes_used;
+ found->bytes_pinned = 0;
+ found->bytes_reserved = 0;
+ found->bytes_readonly = 0;
+ found->bytes_delalloc = 0;
+ found->full = 0;
+ found->force_alloc = 0;
+ *space_info = found;
+ list_add_rcu(&found->list, &info->space_info);
+ atomic_set(&found->caching_threads, 0);
+ return 0;
+}
+
+static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_DUP);
+ if (extra_flags) {
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits |= extra_flags;
+ }
+}
+
+static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
+{
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ if (!cache->ro) {
+ cache->space_info->bytes_readonly += cache->key.offset -
+ btrfs_block_group_used(&cache->item);
+ cache->ro = 1;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+}
+
+u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
+{
+ u64 num_devices = root->fs_info->fs_devices->rw_devices;
+
+ if (num_devices == 1)
+ flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+ if (num_devices < 4)
+ flags &= ~BTRFS_BLOCK_GROUP_RAID10;
+
+ if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
+ (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))) {
+ flags &= ~BTRFS_BLOCK_GROUP_DUP;
+ }
+
+ if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
+ (flags & BTRFS_BLOCK_GROUP_RAID10)) {
+ flags &= ~BTRFS_BLOCK_GROUP_RAID1;
+ }
+
+ if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
+ ((flags & BTRFS_BLOCK_GROUP_RAID1) |
+ (flags & BTRFS_BLOCK_GROUP_RAID10) |
+ (flags & BTRFS_BLOCK_GROUP_DUP)))
+ flags &= ~BTRFS_BLOCK_GROUP_RAID0;
+ return flags;
+}
+
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ u64 alloc_profile;
+
+ if (data) {
+ alloc_profile = info->avail_data_alloc_bits &
+ info->data_alloc_profile;
+ data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
+ } else if (root == root->fs_info->chunk_root) {
+ alloc_profile = info->avail_system_alloc_bits &
+ info->system_alloc_profile;
+ data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
+ } else {
+ alloc_profile = info->avail_metadata_alloc_bits &
+ info->metadata_alloc_profile;
+ data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
+ }
+
+ return btrfs_reduce_alloc_profile(root, data);
+}
+
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
+{
+ u64 alloc_target;
+
+ alloc_target = btrfs_get_alloc_profile(root, 1);
+ BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
+ alloc_target);
+}
+
+static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
+{
+ u64 num_bytes;
+ int level;
+
+ level = BTRFS_MAX_LEVEL - 2;
+ /*
+ * NOTE: these calculations are absolutely the worst possible case.
+ * This assumes that _every_ item we insert will require a new leaf, and
+ * that the tree has grown to its maximum level size.
+ */
+
+ /*
+ * for every item we insert we could insert both an extent item and a
+ * extent ref item. Then for ever item we insert, we will need to cow
+ * both the original leaf, plus the leaf to the left and right of it.
+ *
+ * Unless we are talking about the extent root, then we just want the
+ * number of items * 2, since we just need the extent item plus its ref.
+ */
+ if (root == root->fs_info->extent_root)
+ num_bytes = num_items * 2;
+ else
+ num_bytes = (num_items + (2 * num_items)) * 3;
+
+ /*
+ * num_bytes is total number of leaves we could need times the leaf
+ * size, and then for every leaf we could end up cow'ing 2 nodes per
+ * level, down to the leaf level.
+ */
+ num_bytes = (num_bytes * root->leafsize) +
+ (num_bytes * (level * 2)) * root->nodesize;
+
+ return num_bytes;
+}
+
+/*
+ * Unreserve metadata space for delalloc. If we have less reserved credits than
+ * we have extents, this function does nothing.
+ */
+int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+ u64 num_bytes;
+ u64 alloc_target;
+ bool bug = false;
+
+ /* get the space info for where the metadata will live */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+
+ num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
+ num_items);
+
+ spin_lock(&meta_sinfo->lock);
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ if (BTRFS_I(inode)->reserved_extents <=
+ BTRFS_I(inode)->outstanding_extents) {
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ spin_unlock(&meta_sinfo->lock);
+ return 0;
+ }
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+ BTRFS_I(inode)->reserved_extents--;
+ BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
+
+ if (meta_sinfo->bytes_delalloc < num_bytes) {
+ bug = true;
+ meta_sinfo->bytes_delalloc = 0;
+ } else {
+ meta_sinfo->bytes_delalloc -= num_bytes;
+ }
+ spin_unlock(&meta_sinfo->lock);
+
+ BUG_ON(bug);
+
+ return 0;
+}
+
+static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
+{
+ u64 thresh;
+
+ thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+ meta_sinfo->bytes_may_use;
+
+ thresh = meta_sinfo->total_bytes - thresh;
+ thresh *= 80;
+ do_div(thresh, 100);
+ if (thresh <= meta_sinfo->bytes_delalloc)
+ meta_sinfo->force_delalloc = 1;
+ else
+ meta_sinfo->force_delalloc = 0;
+}
+
+struct async_flush {
+ struct btrfs_root *root;
+ struct btrfs_space_info *info;
+ struct btrfs_work work;
+};
+
+static noinline void flush_delalloc_async(struct btrfs_work *work)
+{
+ struct async_flush *async;
+ struct btrfs_root *root;
+ struct btrfs_space_info *info;
+
+ async = container_of(work, struct async_flush, work);
+ root = async->root;
+ info = async->info;
+
+ btrfs_start_delalloc_inodes(root, 0);
+ wake_up(&info->flush_wait);
+ btrfs_wait_ordered_extents(root, 0, 0);
+
+ spin_lock(&info->lock);
+ info->flushing = 0;
+ spin_unlock(&info->lock);
+ wake_up(&info->flush_wait);
+
+ kfree(async);
+}
+
+static void wait_on_flush(struct btrfs_space_info *info)
+{
+ DEFINE_WAIT(wait);
+ u64 used;
+
+ while (1) {
+ prepare_to_wait(&info->flush_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_lock(&info->lock);
+ if (!info->flushing) {
+ spin_unlock(&info->lock);
+ break;
+ }
+
+ used = info->bytes_used + info->bytes_reserved +
+ info->bytes_pinned + info->bytes_readonly +
+ info->bytes_super + info->bytes_root +
+ info->bytes_may_use + info->bytes_delalloc;
+ if (used < info->total_bytes) {
+ spin_unlock(&info->lock);
+ break;
+ }
+ spin_unlock(&info->lock);
+ schedule();
+ }
+ finish_wait(&info->flush_wait, &wait);
+}
+
+static void flush_delalloc(struct btrfs_root *root,
+ struct btrfs_space_info *info)
+{
+ struct async_flush *async;
+ bool wait = false;
+
+ spin_lock(&info->lock);
+
+ if (!info->flushing) {
+ info->flushing = 1;
+ init_waitqueue_head(&info->flush_wait);
+ } else {
+ wait = true;
+ }
+
+ spin_unlock(&info->lock);
+
+ if (wait) {
+ wait_on_flush(info);
+ return;
+ }
+
+ async = kzalloc(sizeof(*async), GFP_NOFS);
+ if (!async)
+ goto flush;
+
+ async->root = root;
+ async->info = info;
+ async->work.func = flush_delalloc_async;
+
+ btrfs_queue_worker(&root->fs_info->enospc_workers,
+ &async->work);
+ wait_on_flush(info);
+ return;
+
+flush:
+ btrfs_start_delalloc_inodes(root, 0);
+ btrfs_wait_ordered_extents(root, 0, 0);
+
+ spin_lock(&info->lock);
+ info->flushing = 0;
+ spin_unlock(&info->lock);
+ wake_up(&info->flush_wait);
+}
+
+static int maybe_allocate_chunk(struct btrfs_root *root,
+ struct btrfs_space_info *info)
+{
+ struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+ struct btrfs_trans_handle *trans;
+ bool wait = false;
+ int ret = 0;
+ u64 min_metadata;
+ u64 free_space;
+
+ free_space = btrfs_super_total_bytes(disk_super);
+ /*
+ * we allow the metadata to grow to a max of either 10gb or 5% of the
+ * space in the volume.
+ */
+ min_metadata = min((u64)10 * 1024 * 1024 * 1024,
+ div64_u64(free_space * 5, 100));
+ if (info->total_bytes >= min_metadata) {
+ spin_unlock(&info->lock);
+ return 0;
+ }
+
+ if (info->full) {
+ spin_unlock(&info->lock);
+ return 0;
+ }
+
+ if (!info->allocating_chunk) {
+ info->force_alloc = 1;
+ info->allocating_chunk = 1;
+ init_waitqueue_head(&info->allocate_wait);
+ } else {
+ wait = true;
+ }
+
+ spin_unlock(&info->lock);
+
+ if (wait) {
+ wait_event(info->allocate_wait,
+ !info->allocating_chunk);
+ return 1;
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ 4096 + 2 * 1024 * 1024,
+ info->flags, 0);
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ goto out;
+out:
+ spin_lock(&info->lock);
+ info->allocating_chunk = 0;
+ spin_unlock(&info->lock);
+ wake_up(&info->allocate_wait);
+
+ if (ret)
+ return 0;
+ return 1;
+}
+
+/*
+ * Reserve metadata space for delalloc.
+ */
+int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+ u64 num_bytes;
+ u64 used;
+ u64 alloc_target;
+ int flushed = 0;
+ int force_delalloc;
+
+ /* get the space info for where the metadata will live */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+
+ num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
+ num_items);
+again:
+ spin_lock(&meta_sinfo->lock);
+
+ force_delalloc = meta_sinfo->force_delalloc;
+
+ if (unlikely(!meta_sinfo->bytes_root))
+ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+
+ if (!flushed)
+ meta_sinfo->bytes_delalloc += num_bytes;
+
+ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+
+ if (used > meta_sinfo->total_bytes) {
+ flushed++;
+
+ if (flushed == 1) {
+ if (maybe_allocate_chunk(root, meta_sinfo))
+ goto again;
+ flushed++;
+ } else {
+ spin_unlock(&meta_sinfo->lock);
+ }
+
+ if (flushed == 2) {
+ filemap_flush(inode->i_mapping);
+ goto again;
+ } else if (flushed == 3) {
+ flush_delalloc(root, meta_sinfo);
+ goto again;
+ }
+ spin_lock(&meta_sinfo->lock);
+ meta_sinfo->bytes_delalloc -= num_bytes;
+ spin_unlock(&meta_sinfo->lock);
+ printk(KERN_ERR "enospc, has %d, reserved %d\n",
+ BTRFS_I(inode)->outstanding_extents,
+ BTRFS_I(inode)->reserved_extents);
+ dump_space_info(meta_sinfo, 0, 0);
+ return -ENOSPC;
+ }
+
+ BTRFS_I(inode)->reserved_extents++;
+ check_force_delalloc(meta_sinfo);
+ spin_unlock(&meta_sinfo->lock);
+
+ if (!flushed && force_delalloc)
+ filemap_flush(inode->i_mapping);
+
+ return 0;
+}
+
+/*
+ * unreserve num_items number of items worth of metadata space. This needs to
+ * be paired with btrfs_reserve_metadata_space.
+ *
+ * NOTE: if you have the option, run this _AFTER_ you do a
+ * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
+ * oprations which will result in more used metadata, so we want to make sure we
+ * can do that without issue.
+ */
+int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+ u64 num_bytes;
+ u64 alloc_target;
+ bool bug = false;
+
+ /* get the space info for where the metadata will live */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+
+ num_bytes = calculate_bytes_needed(root, num_items);
+
+ spin_lock(&meta_sinfo->lock);
+ if (meta_sinfo->bytes_may_use < num_bytes) {
+ bug = true;
+ meta_sinfo->bytes_may_use = 0;
+ } else {
+ meta_sinfo->bytes_may_use -= num_bytes;
+ }
+ spin_unlock(&meta_sinfo->lock);
+
+ BUG_ON(bug);
+
+ return 0;
+}
+
+/*
+ * Reserve some metadata space for use. We'll calculate the worste case number
+ * of bytes that would be needed to modify num_items number of items. If we
+ * have space, fantastic, if not, you get -ENOSPC. Please call
+ * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
+ * items you reserved, since whatever metadata you needed should have already
+ * been allocated.
+ *
+ * This will commit the transaction to make more space if we don't have enough
+ * metadata space. THe only time we don't do this is if we're reserving space
+ * inside of a transaction, then we will just return -ENOSPC and it is the
+ * callers responsibility to handle it properly.
+ */
+int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+ u64 num_bytes;
+ u64 used;
+ u64 alloc_target;
+ int retries = 0;
+
+ /* get the space info for where the metadata will live */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+
+ num_bytes = calculate_bytes_needed(root, num_items);
+again:
+ spin_lock(&meta_sinfo->lock);
+
+ if (unlikely(!meta_sinfo->bytes_root))
+ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+
+ if (!retries)
+ meta_sinfo->bytes_may_use += num_bytes;
+
+ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+
+ if (used > meta_sinfo->total_bytes) {
+ retries++;
+ if (retries == 1) {
+ if (maybe_allocate_chunk(root, meta_sinfo))
+ goto again;
+ retries++;
+ } else {
+ spin_unlock(&meta_sinfo->lock);
+ }
+
+ if (retries == 2) {
+ flush_delalloc(root, meta_sinfo);
+ goto again;
+ }
+ spin_lock(&meta_sinfo->lock);
+ meta_sinfo->bytes_may_use -= num_bytes;
+ spin_unlock(&meta_sinfo->lock);
+
+ dump_space_info(meta_sinfo, 0, 0);
+ return -ENOSPC;
+ }
+
+ check_force_delalloc(meta_sinfo);
+ spin_unlock(&meta_sinfo->lock);
+
+ return 0;
+}
+
+/*
+ * This will check the space that the inode allocates from to make sure we have
+ * enough space for bytes.
+ */
+int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+ u64 bytes)
+{
+ struct btrfs_space_info *data_sinfo;
+ int ret = 0, committed = 0;
+
+ /* make sure bytes are sectorsize aligned */
+ bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+ data_sinfo = BTRFS_I(inode)->space_info;
+ if (!data_sinfo)
+ goto alloc;
+
+again:
+ /* make sure we have enough space to handle the data first */
+ spin_lock(&data_sinfo->lock);
+ if (data_sinfo->total_bytes - data_sinfo->bytes_used -
+ data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
+ data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
+ data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
+ struct btrfs_trans_handle *trans;
+
+ /*
+ * if we don't have enough free bytes in this space then we need
+ * to alloc a new chunk.
+ */
+ if (!data_sinfo->full) {
+ u64 alloc_target;
+
+ data_sinfo->force_alloc = 1;
+ spin_unlock(&data_sinfo->lock);
+alloc:
+ alloc_target = btrfs_get_alloc_profile(root, 1);
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans)
+ return -ENOMEM;
+
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ bytes + 2 * 1024 * 1024,
+ alloc_target, 0);
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ return ret;
+
+ if (!data_sinfo) {
+ btrfs_set_inode_space_info(root, inode);
+ data_sinfo = BTRFS_I(inode)->space_info;
+ }
+ goto again;
+ }
+ spin_unlock(&data_sinfo->lock);
+
+ /* commit the current transaction and try again */
+ if (!committed && !root->fs_info->open_ioctl_trans) {
+ committed = 1;
+ trans = btrfs_join_transaction(root, 1);
+ if (!trans)
+ return -ENOMEM;
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ return ret;
+ goto again;
+ }
+
+ printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
+ ", %llu bytes_used, %llu bytes_reserved, "
+ "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
+ "%llu total\n", (unsigned long long)bytes,
+ (unsigned long long)data_sinfo->bytes_delalloc,
+ (unsigned long long)data_sinfo->bytes_used,
+ (unsigned long long)data_sinfo->bytes_reserved,
+ (unsigned long long)data_sinfo->bytes_pinned,
+ (unsigned long long)data_sinfo->bytes_readonly,
+ (unsigned long long)data_sinfo->bytes_may_use,
+ (unsigned long long)data_sinfo->total_bytes);
+ return -ENOSPC;
+ }
+ data_sinfo->bytes_may_use += bytes;
+ BTRFS_I(inode)->reserved_bytes += bytes;
+ spin_unlock(&data_sinfo->lock);
+
+ return 0;
+}
+
+/*
+ * if there was an error for whatever reason after calling
+ * btrfs_check_data_free_space, call this so we can cleanup the counters.
+ */
+void btrfs_free_reserved_data_space(struct btrfs_root *root,
+ struct inode *inode, u64 bytes)
+{
+ struct btrfs_space_info *data_sinfo;
+
+ /* make sure bytes are sectorsize aligned */
+ bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+ data_sinfo = BTRFS_I(inode)->space_info;
+ spin_lock(&data_sinfo->lock);
+ data_sinfo->bytes_may_use -= bytes;
+ BTRFS_I(inode)->reserved_bytes -= bytes;
+ spin_unlock(&data_sinfo->lock);
+}
+
+/* called when we are adding a delalloc extent to the inode's io_tree */
+void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
+ u64 bytes)
+{
+ struct btrfs_space_info *data_sinfo;
+
+ /* get the space info for where this inode will be storing its data */
+ data_sinfo = BTRFS_I(inode)->space_info;
+
+ /* make sure we have enough space to handle the data first */
+ spin_lock(&data_sinfo->lock);
+ data_sinfo->bytes_delalloc += bytes;
+
+ /*
+ * we are adding a delalloc extent without calling
+ * btrfs_check_data_free_space first. This happens on a weird
+ * writepage condition, but shouldn't hurt our accounting
+ */
+ if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
+ data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
+ BTRFS_I(inode)->reserved_bytes = 0;
+ } else {
+ data_sinfo->bytes_may_use -= bytes;
+ BTRFS_I(inode)->reserved_bytes -= bytes;
+ }
+
+ spin_unlock(&data_sinfo->lock);
+}
+
+/* called when we are clearing an delalloc extent from the inode's io_tree */
+void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
+ u64 bytes)
+{
+ struct btrfs_space_info *info;
+
+ info = BTRFS_I(inode)->space_info;
+
+ spin_lock(&info->lock);
+ info->bytes_delalloc -= bytes;
+ spin_unlock(&info->lock);
+}
+
+static void force_metadata_allocation(struct btrfs_fs_info *info)
+{
+ struct list_head *head = &info->space_info;
+ struct btrfs_space_info *found;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(found, head, list) {
+ if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+ found->force_alloc = 1;
+ }
+ rcu_read_unlock();
+}
+
+static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, u64 alloc_bytes,
+ u64 flags, int force)
+{
+ struct btrfs_space_info *space_info;
+ struct btrfs_fs_info *fs_info = extent_root->fs_info;
+ u64 thresh;
+ int ret = 0;
+
+ mutex_lock(&fs_info->chunk_mutex);
+
+ flags = btrfs_reduce_alloc_profile(extent_root, flags);
+
+ space_info = __find_space_info(extent_root->fs_info, flags);
+ if (!space_info) {
+ ret = update_space_info(extent_root->fs_info, flags,
+ 0, 0, &space_info);
+ BUG_ON(ret);
+ }
+ BUG_ON(!space_info);
+
+ spin_lock(&space_info->lock);
+ if (space_info->force_alloc)
+ force = 1;
+ if (space_info->full) {
+ spin_unlock(&space_info->lock);
+ goto out;
+ }
+
+ thresh = space_info->total_bytes - space_info->bytes_readonly;
+ thresh = div_factor(thresh, 8);
+ if (!force &&
+ (space_info->bytes_used + space_info->bytes_pinned +
+ space_info->bytes_reserved + alloc_bytes) < thresh) {
+ spin_unlock(&space_info->lock);
+ goto out;
+ }
+ spin_unlock(&space_info->lock);
+
+ /*
+ * if we're doing a data chunk, go ahead and make sure that
+ * we keep a reasonable number of metadata chunks allocated in the
+ * FS as well.
+ */
+ if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
+ fs_info->data_chunk_allocations++;
+ if (!(fs_info->data_chunk_allocations %
+ fs_info->metadata_ratio))
+ force_metadata_allocation(fs_info);
+ }
+
+ ret = btrfs_alloc_chunk(trans, extent_root, flags);
+ spin_lock(&space_info->lock);
+ if (ret)
+ space_info->full = 1;
+ space_info->force_alloc = 0;
+ spin_unlock(&space_info->lock);
+out:
+ mutex_unlock(&extent_root->fs_info->chunk_mutex);
+ return ret;
+}
+
+static int update_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int alloc,
+ int mark_free)
+{
+ struct btrfs_block_group_cache *cache;
+ struct btrfs_fs_info *info = root->fs_info;
+ u64 total = num_bytes;
+ u64 old_val;
+ u64 byte_in_group;
+
+ /* block accounting for super block */
+ spin_lock(&info->delalloc_lock);
+ old_val = btrfs_super_bytes_used(&info->super_copy);
+ if (alloc)
+ old_val += num_bytes;
+ else
+ old_val -= num_bytes;
+ btrfs_set_super_bytes_used(&info->super_copy, old_val);
+ spin_unlock(&info->delalloc_lock);
+
+ while (total) {
+ cache = btrfs_lookup_block_group(info, bytenr);
+ if (!cache)
+ return -1;
+ byte_in_group = bytenr - cache->key.objectid;
+ WARN_ON(byte_in_group > cache->key.offset);
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->dirty = 1;
+ old_val = btrfs_block_group_used(&cache->item);
+ num_bytes = min(total, cache->key.offset - byte_in_group);
+ if (alloc) {
+ old_val += num_bytes;
+ btrfs_set_block_group_used(&cache->item, old_val);
+ cache->reserved -= num_bytes;
+ cache->space_info->bytes_used += num_bytes;
+ cache->space_info->bytes_reserved -= num_bytes;
+ if (cache->ro)
+ cache->space_info->bytes_readonly -= num_bytes;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ } else {
+ old_val -= num_bytes;
+ cache->space_info->bytes_used -= num_bytes;
+ if (cache->ro)
+ cache->space_info->bytes_readonly += num_bytes;
+ btrfs_set_block_group_used(&cache->item, old_val);
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ if (mark_free) {
+ int ret;
+
+ ret = btrfs_discard_extent(root, bytenr,
+ num_bytes);
+ WARN_ON(ret);
+
+ ret = btrfs_add_free_space(cache, bytenr,
+ num_bytes);
+ WARN_ON(ret);
+ }
+ }
+ btrfs_put_block_group(cache);
+ total -= num_bytes;
+ bytenr += num_bytes;
+ }
+ return 0;
+}
+
+static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
+{
+ struct btrfs_block_group_cache *cache;
+ u64 bytenr;
+
+ cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
+ if (!cache)
+ return 0;
+
+ bytenr = cache->key.objectid;
+ btrfs_put_block_group(cache);
+
+ return bytenr;
+}
+
+/*
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int reserved)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(fs_info, bytenr);
+ BUG_ON(!cache);
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned += num_bytes;
+ cache->space_info->bytes_pinned += num_bytes;
+ if (reserved) {
+ cache->reserved -= num_bytes;
+ cache->space_info->bytes_reserved -= num_bytes;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+
+ btrfs_put_block_group(cache);
+
+ set_extent_dirty(fs_info->pinned_extents,
+ bytenr, bytenr + num_bytes - 1, GFP_NOFS);
+ return 0;
+}
+
+static int update_reserved_extents(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve)
+{
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ if (reserve) {
+ cache->reserved += num_bytes;
+ cache->space_info->bytes_reserved += num_bytes;
+ } else {
+ cache->reserved -= num_bytes;
+ cache->space_info->bytes_reserved -= num_bytes;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ return 0;
+}
+
+int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_caching_control *next;
+ struct btrfs_caching_control *caching_ctl;
+ struct btrfs_block_group_cache *cache;
+
+ down_write(&fs_info->extent_commit_sem);
+
+ list_for_each_entry_safe(caching_ctl, next,
+ &fs_info->caching_block_groups, list) {
+ cache = caching_ctl->block_group;
+ if (block_group_cache_done(cache)) {
+ cache->last_byte_to_unpin = (u64)-1;
+ list_del_init(&caching_ctl->list);
+ put_caching_control(caching_ctl);
+ } else {
+ cache->last_byte_to_unpin = caching_ctl->progress;
+ }
+ }
+
+ if (fs_info->pinned_extents == &fs_info->freed_extents[0])
+ fs_info->pinned_extents = &fs_info->freed_extents[1];
+ else
+ fs_info->pinned_extents = &fs_info->freed_extents[0];
+
+ up_write(&fs_info->extent_commit_sem);
+ return 0;
+}
+
+static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_group_cache *cache = NULL;
+ u64 len;
+
+ while (start <= end) {
+ if (!cache ||
+ start >= cache->key.objectid + cache->key.offset) {
+ if (cache)
+ btrfs_put_block_group(cache);
+ cache = btrfs_lookup_block_group(fs_info, start);
+ BUG_ON(!cache);
+ }
+
+ len = cache->key.objectid + cache->key.offset - start;
+ len = min(len, end + 1 - start);
+
+ if (start < cache->last_byte_to_unpin) {
+ len = min(len, cache->last_byte_to_unpin - start);
+ btrfs_add_free_space(cache, start, len);
+ }
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned -= len;
+ cache->space_info->bytes_pinned -= len;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+
+ start += len;
+ }
+
+ if (cache)
+ btrfs_put_block_group(cache);
+ return 0;
+}
+
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *unpin;
+ u64 start;
+ u64 end;
+ int ret;
+
+ if (fs_info->pinned_extents == &fs_info->freed_extents[0])
+ unpin = &fs_info->freed_extents[1];
+ else
+ unpin = &fs_info->freed_extents[0];
+
+ while (1) {
+ ret = find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY);
+ if (ret)
+ break;
+
+ ret = btrfs_discard_extent(root, start, end + 1 - start);
+
+ clear_extent_dirty(unpin, start, end, GFP_NOFS);
+ unpin_extent_range(root, start, end);
+ cond_resched();
+ }
+
+ return ret;
+}
+
+static int pin_down_bytes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 num_bytes,
+ int is_data, int reserved,
+ struct extent_buffer **must_clean)
+{
+ int err = 0;
+ struct extent_buffer *buf;
+
+ if (is_data)
+ goto pinit;
+
+ /*
+ * discard is sloooow, and so triggering discards on
+ * individual btree blocks isn't a good plan. Just
+ * pin everything in discard mode.
+ */
+ if (btrfs_test_opt(root, DISCARD))
+ goto pinit;
+
+ buf = btrfs_find_tree_block(root, bytenr, num_bytes);
+ if (!buf)
+ goto pinit;
+
+ /* we can reuse a block if it hasn't been written
+ * and it is from this transaction. We can't
+ * reuse anything from the tree log root because
+ * it has tiny sub-transactions.
+ */
+ if (btrfs_buffer_uptodate(buf, 0) &&
+ btrfs_try_tree_lock(buf)) {
+ u64 header_owner = btrfs_header_owner(buf);
+ u64 header_transid = btrfs_header_generation(buf);
+ if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
+ header_transid == trans->transid &&
+ !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+ *must_clean = buf;
+ return 1;
+ }
+ btrfs_tree_unlock(buf);
+ }
+ free_extent_buffer(buf);
+pinit:
+ if (path)
+ btrfs_set_path_blocking(path);
+ /* unlocks the pinned mutex */
+ btrfs_pin_extent(root, bytenr, num_bytes, reserved);
+
+ BUG_ON(err < 0);
+ return 0;
+}
+
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 owner_objectid,
+ u64 owner_offset, int refs_to_drop,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_root *extent_root = info->extent_root;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_inline_ref *iref;
+ int ret;
+ int is_data;
+ int extent_slot = 0;
+ int found_extent = 0;
+ int num_to_del = 1;
+ u32 item_size;
+ u64 refs;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->reada = 1;
+ path->leave_spinning = 1;
+
+ is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
+ BUG_ON(!is_data && refs_to_drop != 1);
+
+ ret = lookup_extent_backref(trans, extent_root, path, &iref,
+ bytenr, num_bytes, parent,
+ root_objectid, owner_objectid,
+ owner_offset);
+ if (ret == 0) {
+ extent_slot = path->slots[0];
+ while (extent_slot >= 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ extent_slot);
+ if (key.objectid != bytenr)
+ break;
+ if (key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == num_bytes) {
+ found_extent = 1;
+ break;
+ }
+ if (path->slots[0] - extent_slot > 5)
+ break;
+ extent_slot--;
+ }
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
+ if (found_extent && item_size < sizeof(*ei))
+ found_extent = 0;
+#endif
+ if (!found_extent) {
+ BUG_ON(iref);
+ ret = remove_extent_backref(trans, extent_root, path,
+ NULL, refs_to_drop,
+ is_data);
+ BUG_ON(ret);
+ btrfs_release_path(extent_root, path);
+ path->leave_spinning = 1;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ ret = btrfs_search_slot(trans, extent_root,
+ &key, path, -1, 1);
+ if (ret) {
+ printk(KERN_ERR "umm, got %d back from search"
+ ", was looking for %llu\n", ret,
+ (unsigned long long)bytenr);
+ btrfs_print_leaf(extent_root, path->nodes[0]);
+ }
+ BUG_ON(ret);
+ extent_slot = path->slots[0];
+ }
+ } else {
+ btrfs_print_leaf(extent_root, path->nodes[0]);
+ WARN_ON(1);
+ printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
+ "parent %llu root %llu owner %llu offset %llu\n",
+ (unsigned long long)bytenr,
+ (unsigned long long)parent,
+ (unsigned long long)root_objectid,
+ (unsigned long long)owner_objectid,
+ (unsigned long long)owner_offset);
+ }
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, extent_slot);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (item_size < sizeof(*ei)) {
+ BUG_ON(found_extent || extent_slot != path->slots[0]);
+ ret = convert_extent_item_v0(trans, extent_root, path,
+ owner_objectid, 0);
+ BUG_ON(ret < 0);
+
+ btrfs_release_path(extent_root, path);
+ path->leave_spinning = 1;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ ret = btrfs_search_slot(trans, extent_root, &key, path,
+ -1, 1);
+ if (ret) {
+ printk(KERN_ERR "umm, got %d back from search"
+ ", was looking for %llu\n", ret,
+ (unsigned long long)bytenr);
+ btrfs_print_leaf(extent_root, path->nodes[0]);
+ }
+ BUG_ON(ret);
+ extent_slot = path->slots[0];
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, extent_slot);
+ }
+#endif
+ BUG_ON(item_size < sizeof(*ei));
+ ei = btrfs_item_ptr(leaf, extent_slot,
+ struct btrfs_extent_item);
+ if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+ struct btrfs_tree_block_info *bi;
+ BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
+ bi = (struct btrfs_tree_block_info *)(ei + 1);
+ WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
+ }
+
+ refs = btrfs_extent_refs(leaf, ei);
+ BUG_ON(refs < refs_to_drop);
+ refs -= refs_to_drop;
+
+ if (refs > 0) {
+ if (extent_op)
+ __run_delayed_extent_op(extent_op, leaf, ei);
+ /*
+ * In the case of inline back ref, reference count will
+ * be updated by remove_extent_backref
+ */
+ if (iref) {
+ BUG_ON(!found_extent);
+ } else {
+ btrfs_set_extent_refs(leaf, ei, refs);
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ if (found_extent) {
+ ret = remove_extent_backref(trans, extent_root, path,
+ iref, refs_to_drop,
+ is_data);
+ BUG_ON(ret);
+ }
+ } else {
+ int mark_free = 0;
+ struct extent_buffer *must_clean = NULL;
+
+ if (found_extent) {
+ BUG_ON(is_data && refs_to_drop !=
+ extent_data_ref_count(root, path, iref));
+ if (iref) {
+ BUG_ON(path->slots[0] != extent_slot);
+ } else {
+ BUG_ON(path->slots[0] != extent_slot + 1);
+ path->slots[0] = extent_slot;
+ num_to_del = 2;
+ }
+ }
+
+ ret = pin_down_bytes(trans, root, path, bytenr,
+ num_bytes, is_data, 0, &must_clean);
+ if (ret > 0)
+ mark_free = 1;
+ BUG_ON(ret < 0);
+ /*
+ * it is going to be very rare for someone to be waiting
+ * on the block we're freeing. del_items might need to
+ * schedule, so rather than get fancy, just force it
+ * to blocking here
+ */
+ if (must_clean)
+ btrfs_set_lock_blocking(must_clean);
+
+ ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
+ num_to_del);
+ BUG_ON(ret);
+ btrfs_release_path(extent_root, path);
+
+ if (must_clean) {
+ clean_tree_block(NULL, root, must_clean);
+ btrfs_tree_unlock(must_clean);
+ free_extent_buffer(must_clean);
+ }
+
+ if (is_data) {
+ ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
+ BUG_ON(ret);
+ } else {
+ invalidate_mapping_pages(info->btree_inode->i_mapping,
+ bytenr >> PAGE_CACHE_SHIFT,
+ (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
+ }
+
+ ret = update_block_group(trans, root, bytenr, num_bytes, 0,
+ mark_free);
+ BUG_ON(ret);
+ }
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * when we free an extent, it is possible (and likely) that we free the last
+ * delayed ref for that extent as well. This searches the delayed ref tree for
+ * a given extent, and if there are no other delayed refs to be processed, it
+ * removes it from the tree.
+ */
+static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr)
+{
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ struct rb_node *node;
+ int ret;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(trans, bytenr);
+ if (!head)
+ goto out;
+
+ node = rb_prev(&head->node.rb_node);
+ if (!node)
+ goto out;
+
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+
+ /* there are still entries for this ref, we can't drop it */
+ if (ref->bytenr == bytenr)
+ goto out;
+
+ if (head->extent_op) {
+ if (!head->must_insert_reserved)
+ goto out;
+ kfree(head->extent_op);
+ head->extent_op = NULL;
+ }
+
+ /*
+ * waiting for the lock here would deadlock. If someone else has it
+ * locked they are already in the process of dropping it anyway
+ */
+ if (!mutex_trylock(&head->mutex))
+ goto out;
+
+ /*
+ * at this point we have a head with no other entries. Go
+ * ahead and process it.
+ */
+ head->node.in_tree = 0;
+ rb_erase(&head->node.rb_node, &delayed_refs->root);
+
+ delayed_refs->num_entries--;
+
+ /*
+ * we don't take a ref on the node because we're removing it from the
+ * tree, so we just steal the ref the tree was holding.
+ */
+ delayed_refs->num_heads--;
+ if (list_empty(&head->cluster))
+ delayed_refs->num_heads_ready--;
+
+ list_del_init(&head->cluster);
+ spin_unlock(&delayed_refs->lock);
+
+ ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
+ &head->node, head->extent_op,
+ head->must_insert_reserved);
+ BUG_ON(ret);
+ btrfs_put_delayed_ref(&head->node);
+ return 0;
+out:
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+}
+
+int btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 owner, u64 offset)
+{
+ int ret;
+
+ /*
+ * tree log blocks never actually go into the extent allocation
+ * tree, just update pinning info and exit early.
+ */
+ if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
+ WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
+ /* unlocks the pinned mutex */
+ btrfs_pin_extent(root, bytenr, num_bytes, 1);
+ ret = 0;
+ } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+ parent, root_objectid, (int)owner,
+ BTRFS_DROP_DELAYED_REF, NULL);
+ BUG_ON(ret);
+ ret = check_ref_cleanup(trans, root, bytenr);
+ BUG_ON(ret);
+ } else {
+ ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
+ parent, root_objectid, owner,
+ offset, BTRFS_DROP_DELAYED_REF, NULL);
+ BUG_ON(ret);
+ }
+ return ret;
+}
+
+int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u32 blocksize,
+ u64 parent, u64 root_objectid, int level)
+{
+ u64 used;
+ spin_lock(&root->node_lock);
+ used = btrfs_root_used(&root->root_item) - blocksize;
+ btrfs_set_root_used(&root->root_item, used);
+ spin_unlock(&root->node_lock);
+
+ return btrfs_free_extent(trans, root, bytenr, blocksize,
+ parent, root_objectid, level, 0);
+}
+
+static u64 stripe_align(struct btrfs_root *root, u64 val)
+{
+ u64 mask = ((u64)root->stripesize - 1);
+ u64 ret = (val + mask) & ~mask;
+ return ret;
+}
+
+/*
+ * when we wait for progress in the block group caching, its because
+ * our allocation attempt failed at least once. So, we must sleep
+ * and let some progress happen before we try again.
+ *
+ * This function will sleep at least once waiting for new free space to
+ * show up, and then it will check the block group free space numbers
+ * for our min num_bytes. Another option is to have it go ahead
+ * and look in the rbtree for a free extent of a given size, but this
+ * is a good start.
+ */
+static noinline int
+wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+ u64 num_bytes)
+{
+ struct btrfs_caching_control *caching_ctl;
+ DEFINE_WAIT(wait);
+
+ caching_ctl = get_caching_control(cache);
+ if (!caching_ctl)
+ return 0;
+
+ wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
+ (cache->free_space >= num_bytes));
+
+ put_caching_control(caching_ctl);
+ return 0;
+}
+
+static noinline int
+wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+ struct btrfs_caching_control *caching_ctl;
+ DEFINE_WAIT(wait);
+
+ caching_ctl = get_caching_control(cache);
+ if (!caching_ctl)
+ return 0;
+
+ wait_event(caching_ctl->wait, block_group_cache_done(cache));
+
+ put_caching_control(caching_ctl);
+ return 0;
+}
+
+enum btrfs_loop_type {
+ LOOP_FIND_IDEAL = 0,
+ LOOP_CACHING_NOWAIT = 1,
+ LOOP_CACHING_WAIT = 2,
+ LOOP_ALLOC_CHUNK = 3,
+ LOOP_NO_EMPTY_SIZE = 4,
+};
+
+/*
+ * walks the btree of allocated extents and find a hole of a given size.
+ * The key ins is changed to record the hole:
+ * ins->objectid == block start
+ * ins->flags = BTRFS_EXTENT_ITEM_KEY
+ * ins->offset == number of blocks
+ * Any available blocks before search_start are skipped.
+ */
+static noinline int find_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *orig_root,
+ u64 num_bytes, u64 empty_size,
+ u64 search_start, u64 search_end,
+ u64 hint_byte, struct btrfs_key *ins,
+ u64 exclude_start, u64 exclude_nr,
+ int data)
+{
+ int ret = 0;
+ struct btrfs_root *root = orig_root->fs_info->extent_root;
+ struct btrfs_free_cluster *last_ptr = NULL;
+ struct btrfs_block_group_cache *block_group = NULL;
+ int empty_cluster = 2 * 1024 * 1024;
+ int allowed_chunk_alloc = 0;
+ int done_chunk_alloc = 0;
+ struct btrfs_space_info *space_info;
+ int last_ptr_loop = 0;
+ int loop = 0;
+ bool found_uncached_bg = false;
+ bool failed_cluster_refill = false;
+ bool failed_alloc = false;
+ u64 ideal_cache_percent = 0;
+ u64 ideal_cache_offset = 0;
+
+ WARN_ON(num_bytes < root->sectorsize);
+ btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
+ ins->objectid = 0;
+ ins->offset = 0;
+
+ space_info = __find_space_info(root->fs_info, data);
+
+ if (orig_root->ref_cows || empty_size)
+ allowed_chunk_alloc = 1;
+
+ if (data & BTRFS_BLOCK_GROUP_METADATA) {
+ last_ptr = &root->fs_info->meta_alloc_cluster;
+ if (!btrfs_test_opt(root, SSD))
+ empty_cluster = 64 * 1024;
+ }
+
+ if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+ last_ptr = &root->fs_info->data_alloc_cluster;
+ }
+
+ if (last_ptr) {
+ spin_lock(&last_ptr->lock);
+ if (last_ptr->block_group)
+ hint_byte = last_ptr->window_start;
+ spin_unlock(&last_ptr->lock);
+ }
+
+ search_start = max(search_start, first_logical_byte(root, 0));
+ search_start = max(search_start, hint_byte);
+
+ if (!last_ptr)
+ empty_cluster = 0;
+
+ if (search_start == hint_byte) {
+ideal_cache:
+ block_group = btrfs_lookup_block_group(root->fs_info,
+ search_start);
+ /*
+ * we don't want to use the block group if it doesn't match our
+ * allocation bits, or if its not cached.
+ *
+ * However if we are re-searching with an ideal block group
+ * picked out then we don't care that the block group is cached.
+ */
+ if (block_group && block_group_bits(block_group, data) &&
+ (block_group->cached != BTRFS_CACHE_NO ||
+ search_start == ideal_cache_offset)) {
+ down_read(&space_info->groups_sem);
+ if (list_empty(&block_group->list) ||
+ block_group->ro) {
+ /*
+ * someone is removing this block group,
+ * we can't jump into the have_block_group
+ * target because our list pointers are not
+ * valid
+ */
+ btrfs_put_block_group(block_group);
+ up_read(&space_info->groups_sem);
+ } else {
+ goto have_block_group;
+ }
+ } else if (block_group) {
+ btrfs_put_block_group(block_group);
+ }
+ }
+search:
+ down_read(&space_info->groups_sem);
+ list_for_each_entry(block_group, &space_info->block_groups, list) {
+ u64 offset;
+ int cached;
+
+ btrfs_get_block_group(block_group);
+ search_start = block_group->key.objectid;
+
+have_block_group:
+ if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+ u64 free_percent;
+
+ free_percent = btrfs_block_group_used(&block_group->item);
+ free_percent *= 100;
+ free_percent = div64_u64(free_percent,
+ block_group->key.offset);
+ free_percent = 100 - free_percent;
+ if (free_percent > ideal_cache_percent &&
+ likely(!block_group->ro)) {
+ ideal_cache_offset = block_group->key.objectid;
+ ideal_cache_percent = free_percent;
+ }
+
+ /*
+ * We only want to start kthread caching if we are at
+ * the point where we will wait for caching to make
+ * progress, or if our ideal search is over and we've
+ * found somebody to start caching.
+ */
+ if (loop > LOOP_CACHING_NOWAIT ||
+ (loop > LOOP_FIND_IDEAL &&
+ atomic_read(&space_info->caching_threads) < 2)) {
+ ret = cache_block_group(block_group);
+ BUG_ON(ret);
+ }
+ found_uncached_bg = true;
+
+ /*
+ * If loop is set for cached only, try the next block
+ * group.
+ */
+ if (loop == LOOP_FIND_IDEAL)
+ goto loop;
+ }
+
+ cached = block_group_cache_done(block_group);
+ if (unlikely(!cached))
+ found_uncached_bg = true;
+
+ if (unlikely(block_group->ro))
+ goto loop;
+
+ /*
+ * Ok we want to try and use the cluster allocator, so lets look
+ * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
+ * have tried the cluster allocator plenty of times at this
+ * point and not have found anything, so we are likely way too
+ * fragmented for the clustering stuff to find anything, so lets
+ * just skip it and let the allocator find whatever block it can
+ * find
+ */
+ if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
+ /*
+ * the refill lock keeps out other
+ * people trying to start a new cluster
+ */
+ spin_lock(&last_ptr->refill_lock);
+ if (last_ptr->block_group &&
+ (last_ptr->block_group->ro ||
+ !block_group_bits(last_ptr->block_group, data))) {
+ offset = 0;
+ goto refill_cluster;
+ }
+
+ offset = btrfs_alloc_from_cluster(block_group, last_ptr,
+ num_bytes, search_start);
+ if (offset) {
+ /* we have a block, we're done */
+ spin_unlock(&last_ptr->refill_lock);
+ goto checks;
+ }
+
+ spin_lock(&last_ptr->lock);
+ /*
+ * whoops, this cluster doesn't actually point to
+ * this block group. Get a ref on the block
+ * group is does point to and try again
+ */
+ if (!last_ptr_loop && last_ptr->block_group &&
+ last_ptr->block_group != block_group) {
+
+ btrfs_put_block_group(block_group);
+ block_group = last_ptr->block_group;
+ btrfs_get_block_group(block_group);
+ spin_unlock(&last_ptr->lock);
+ spin_unlock(&last_ptr->refill_lock);
+
+ last_ptr_loop = 1;
+ search_start = block_group->key.objectid;
+ /*
+ * we know this block group is properly
+ * in the list because
+ * btrfs_remove_block_group, drops the
+ * cluster before it removes the block
+ * group from the list
+ */
+ goto have_block_group;
+ }
+ spin_unlock(&last_ptr->lock);
+refill_cluster:
+ /*
+ * this cluster didn't work out, free it and
+ * start over
+ */
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
+
+ last_ptr_loop = 0;
+
+ /* allocate a cluster in this block group */
+ ret = btrfs_find_space_cluster(trans, root,
+ block_group, last_ptr,
+ offset, num_bytes,
+ empty_cluster + empty_size);
+ if (ret == 0) {
+ /*
+ * now pull our allocation out of this
+ * cluster
+ */
+ offset = btrfs_alloc_from_cluster(block_group,
+ last_ptr, num_bytes,
+ search_start);
+ if (offset) {
+ /* we found one, proceed */
+ spin_unlock(&last_ptr->refill_lock);
+ goto checks;
+ }
+ } else if (!cached && loop > LOOP_CACHING_NOWAIT
+ && !failed_cluster_refill) {
+ spin_unlock(&last_ptr->refill_lock);
+
+ failed_cluster_refill = true;
+ wait_block_group_cache_progress(block_group,
+ num_bytes + empty_cluster + empty_size);
+ goto have_block_group;
+ }
+
+ /*
+ * at this point we either didn't find a cluster
+ * or we weren't able to allocate a block from our
+ * cluster. Free the cluster we've been trying
+ * to use, and go to the next block group
+ */
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
+ spin_unlock(&last_ptr->refill_lock);
+ goto loop;
+ }
+
+ offset = btrfs_find_space_for_alloc(block_group, search_start,
+ num_bytes, empty_size);
+ /*
+ * If we didn't find a chunk, and we haven't failed on this
+ * block group before, and this block group is in the middle of
+ * caching and we are ok with waiting, then go ahead and wait
+ * for progress to be made, and set failed_alloc to true.
+ *
+ * If failed_alloc is true then we've already waited on this
+ * block group once and should move on to the next block group.
+ */
+ if (!offset && !failed_alloc && !cached &&
+ loop > LOOP_CACHING_NOWAIT) {
+ wait_block_group_cache_progress(block_group,
+ num_bytes + empty_size);
+ failed_alloc = true;
+ goto have_block_group;
+ } else if (!offset) {
+ goto loop;
+ }
+checks:
+ search_start = stripe_align(root, offset);
+ /* move on to the next group */
+ if (search_start + num_bytes >= search_end) {
+ btrfs_add_free_space(block_group, offset, num_bytes);
+ goto loop;
+ }
+
+ /* move on to the next group */
+ if (search_start + num_bytes >
+ block_group->key.objectid + block_group->key.offset) {
+ btrfs_add_free_space(block_group, offset, num_bytes);
+ goto loop;
+ }
+
+ if (exclude_nr > 0 &&
+ (search_start + num_bytes > exclude_start &&
+ search_start < exclude_start + exclude_nr)) {
+ search_start = exclude_start + exclude_nr;
+
+ btrfs_add_free_space(block_group, offset, num_bytes);
+ /*
+ * if search_start is still in this block group
+ * then we just re-search this block group
+ */
+ if (search_start >= block_group->key.objectid &&
+ search_start < (block_group->key.objectid +
+ block_group->key.offset))
+ goto have_block_group;
+ goto loop;
+ }
+
+ ins->objectid = search_start;
+ ins->offset = num_bytes;
+
+ if (offset < search_start)
+ btrfs_add_free_space(block_group, offset,
+ search_start - offset);
+ BUG_ON(offset > search_start);
+
+ update_reserved_extents(block_group, num_bytes, 1);
+
+ /* we are all good, lets return */
+ break;
+loop:
+ failed_cluster_refill = false;
+ failed_alloc = false;
+ btrfs_put_block_group(block_group);
+ }
+ up_read(&space_info->groups_sem);
+
+ /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
+ * for them to make caching progress. Also
+ * determine the best possible bg to cache
+ * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
+ * caching kthreads as we move along
+ * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+ * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
+ * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
+ * again
+ */
+ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
+ (found_uncached_bg || empty_size || empty_cluster ||
+ allowed_chunk_alloc)) {
+ if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
+ found_uncached_bg = false;
+ loop++;
+ if (!ideal_cache_percent &&
+ atomic_read(&space_info->caching_threads))
+ goto search;
+
+ /*
+ * 1 of the following 2 things have happened so far
+ *
+ * 1) We found an ideal block group for caching that
+ * is mostly full and will cache quickly, so we might
+ * as well wait for it.
+ *
+ * 2) We searched for cached only and we didn't find
+ * anything, and we didn't start any caching kthreads
+ * either, so chances are we will loop through and
+ * start a couple caching kthreads, and then come back
+ * around and just wait for them. This will be slower
+ * because we will have 2 caching kthreads reading at
+ * the same time when we could have just started one
+ * and waited for it to get far enough to give us an
+ * allocation, so go ahead and go to the wait caching
+ * loop.
+ */
+ loop = LOOP_CACHING_WAIT;
+ search_start = ideal_cache_offset;
+ ideal_cache_percent = 0;
+ goto ideal_cache;
+ } else if (loop == LOOP_FIND_IDEAL) {
+ /*
+ * Didn't find a uncached bg, wait on anything we find
+ * next.
+ */
+ loop = LOOP_CACHING_WAIT;
+ goto search;
+ }
+
+ if (loop < LOOP_CACHING_WAIT) {
+ loop++;
+ goto search;
+ }
+
+ if (loop == LOOP_ALLOC_CHUNK) {
+ empty_size = 0;
+ empty_cluster = 0;
+ }
+
+ if (allowed_chunk_alloc) {
+ ret = do_chunk_alloc(trans, root, num_bytes +
+ 2 * 1024 * 1024, data, 1);
+ allowed_chunk_alloc = 0;
+ done_chunk_alloc = 1;
+ } else if (!done_chunk_alloc) {
+ space_info->force_alloc = 1;
+ }
+
+ if (loop < LOOP_NO_EMPTY_SIZE) {
+ loop++;
+ goto search;
+ }
+ ret = -ENOSPC;
+ } else if (!ins->objectid) {
+ ret = -ENOSPC;
+ }
+
+ /* we found what we needed */
+ if (ins->objectid) {
+ if (!(data & BTRFS_BLOCK_GROUP_DATA))
+ trans->block_group = block_group->key.objectid;
+
+ btrfs_put_block_group(block_group);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+ int dump_block_groups)
+{
+ struct btrfs_block_group_cache *cache;
+
+ spin_lock(&info->lock);
+ printk(KERN_INFO "space_info has %llu free, is %sfull\n",
+ (unsigned long long)(info->total_bytes - info->bytes_used -
+ info->bytes_pinned - info->bytes_reserved -
+ info->bytes_super),
+ (info->full) ? "" : "not ");
+ printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
+ " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
+ "\n",
+ (unsigned long long)info->total_bytes,
+ (unsigned long long)info->bytes_pinned,
+ (unsigned long long)info->bytes_delalloc,
+ (unsigned long long)info->bytes_may_use,
+ (unsigned long long)info->bytes_used,
+ (unsigned long long)info->bytes_root,
+ (unsigned long long)info->bytes_super,
+ (unsigned long long)info->bytes_reserved);
+ spin_unlock(&info->lock);
+
+ if (!dump_block_groups)
+ return;
+
+ down_read(&info->groups_sem);
+ list_for_each_entry(cache, &info->block_groups, list) {
+ spin_lock(&cache->lock);
+ printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
+ "%llu pinned %llu reserved\n",
+ (unsigned long long)cache->key.objectid,
+ (unsigned long long)cache->key.offset,
+ (unsigned long long)btrfs_block_group_used(&cache->item),
+ (unsigned long long)cache->pinned,
+ (unsigned long long)cache->reserved);
+ btrfs_dump_free_space(cache, bytes);
+ spin_unlock(&cache->lock);
+ }
+ up_read(&info->groups_sem);
+}
+
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 num_bytes, u64 min_alloc_size,
+ u64 empty_size, u64 hint_byte,
+ u64 search_end, struct btrfs_key *ins,
+ u64 data)
+{
+ int ret;
+ u64 search_start = 0;
+
+ data = btrfs_get_alloc_profile(root, data);
+again:
+ /*
+ * the only place that sets empty_size is btrfs_realloc_node, which
+ * is not called recursively on allocations
+ */
+ if (empty_size || root->ref_cows)
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ num_bytes + 2 * 1024 * 1024, data, 0);
+
+ WARN_ON(num_bytes < root->sectorsize);
+ ret = find_free_extent(trans, root, num_bytes, empty_size,
+ search_start, search_end, hint_byte, ins,
+ trans->alloc_exclude_start,
+ trans->alloc_exclude_nr, data);
+
+ if (ret == -ENOSPC && num_bytes > min_alloc_size) {
+ num_bytes = num_bytes >> 1;
+ num_bytes = num_bytes & ~(root->sectorsize - 1);
+ num_bytes = max(num_bytes, min_alloc_size);
+ do_chunk_alloc(trans, root->fs_info->extent_root,
+ num_bytes, data, 1);
+ goto again;
+ }
+ if (ret == -ENOSPC) {
+ struct btrfs_space_info *sinfo;
+
+ sinfo = __find_space_info(root->fs_info, data);
+ printk(KERN_ERR "btrfs allocation failed flags %llu, "
+ "wanted %llu\n", (unsigned long long)data,
+ (unsigned long long)num_bytes);
+ dump_space_info(sinfo, num_bytes, 1);
+ }
+
+ return ret;
+}
+
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
+{
+ struct btrfs_block_group_cache *cache;
+ int ret = 0;
+
+ cache = btrfs_lookup_block_group(root->fs_info, start);
+ if (!cache) {
+ printk(KERN_ERR "Unable to find block group for %llu\n",
+ (unsigned long long)start);
+ return -ENOSPC;
+ }
+
+ ret = btrfs_discard_extent(root, start, len);
+
+ btrfs_add_free_space(cache, start, len);
+ update_reserved_extents(cache, len, 0);
+ btrfs_put_block_group(cache);
+
+ return ret;
+}
+
+static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 parent, u64 root_objectid,
+ u64 flags, u64 owner, u64 offset,
+ struct btrfs_key *ins, int ref_mod)
+{
+ int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_extent_item *extent_item;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int type;
+ u32 size;
+
+ if (parent > 0)
+ type = BTRFS_SHARED_DATA_REF_KEY;
+ else
+ type = BTRFS_EXTENT_DATA_REF_KEY;
+
+ size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
+ ins, size);
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+ extent_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ btrfs_set_extent_refs(leaf, extent_item, ref_mod);
+ btrfs_set_extent_generation(leaf, extent_item, trans->transid);
+ btrfs_set_extent_flags(leaf, extent_item,
+ flags | BTRFS_EXTENT_FLAG_DATA);
+
+ iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+ btrfs_set_extent_inline_ref_type(leaf, iref, type);
+ if (parent > 0) {
+ struct btrfs_shared_data_ref *ref;
+ ref = (struct btrfs_shared_data_ref *)(iref + 1);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
+ } else {
+ struct btrfs_extent_data_ref *ref;
+ ref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
+ btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
+ btrfs_set_extent_data_ref_offset(leaf, ref, offset);
+ btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
+ }
+
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_free_path(path);
+
+ ret = update_block_group(trans, root, ins->objectid, ins->offset,
+ 1, 0);
+ if (ret) {
+ printk(KERN_ERR "btrfs update block group failed for %llu "
+ "%llu\n", (unsigned long long)ins->objectid,
+ (unsigned long long)ins->offset);
+ BUG();
+ }
+ return ret;
+}
+
+static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 parent, u64 root_objectid,
+ u64 flags, struct btrfs_disk_key *key,
+ int level, struct btrfs_key *ins)
+{
+ int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_extent_item *extent_item;
+ struct btrfs_tree_block_info *block_info;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
+ ins, size);
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+ extent_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ btrfs_set_extent_refs(leaf, extent_item, 1);
+ btrfs_set_extent_generation(leaf, extent_item, trans->transid);
+ btrfs_set_extent_flags(leaf, extent_item,
+ flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
+ block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
+
+ btrfs_set_tree_block_key(leaf, block_info, key);
+ btrfs_set_tree_block_level(leaf, block_info, level);
+
+ iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
+ if (parent > 0) {
+ BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+ btrfs_set_extent_inline_ref_type(leaf, iref,
+ BTRFS_SHARED_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ } else {
+ btrfs_set_extent_inline_ref_type(leaf, iref,
+ BTRFS_TREE_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+ }
+
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_free_path(path);
+
+ ret = update_block_group(trans, root, ins->objectid, ins->offset,
+ 1, 0);
+ if (ret) {
+ printk(KERN_ERR "btrfs update block group failed for %llu "
+ "%llu\n", (unsigned long long)ins->objectid,
+ (unsigned long long)ins->offset);
+ BUG();
+ }
+ return ret;
+}
+
+int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 root_objectid, u64 owner,
+ u64 offset, struct btrfs_key *ins)
+{
+ int ret;
+
+ BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
+
+ ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
+ 0, root_objectid, owner, offset,
+ BTRFS_ADD_DELAYED_EXTENT, NULL);
+ return ret;
+}
+
+/*
+ * this is used by the tree logging recovery code. It records that
+ * an extent has been allocated and makes sure to clear the free
+ * space cache bits as well
+ */
+int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 root_objectid, u64 owner, u64 offset,
+ struct btrfs_key *ins)
+{
+ int ret;
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_caching_control *caching_ctl;
+ u64 start = ins->objectid;
+ u64 num_bytes = ins->offset;
+
+ block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+ cache_block_group(block_group);
+ caching_ctl = get_caching_control(block_group);
+
+ if (!caching_ctl) {
+ BUG_ON(!block_group_cache_done(block_group));
+ ret = btrfs_remove_free_space(block_group, start, num_bytes);
+ BUG_ON(ret);
+ } else {
+ mutex_lock(&caching_ctl->mutex);
+
+ if (start >= caching_ctl->progress) {
+ ret = add_excluded_extent(root, start, num_bytes);
+ BUG_ON(ret);
+ } else if (start + num_bytes <= caching_ctl->progress) {
+ ret = btrfs_remove_free_space(block_group,
+ start, num_bytes);
+ BUG_ON(ret);
+ } else {
+ num_bytes = caching_ctl->progress - start;
+ ret = btrfs_remove_free_space(block_group,
+ start, num_bytes);
+ BUG_ON(ret);
+
+ start = caching_ctl->progress;
+ num_bytes = ins->objectid + ins->offset -
+ caching_ctl->progress;
+ ret = add_excluded_extent(root, start, num_bytes);
+ BUG_ON(ret);
+ }
+
+ mutex_unlock(&caching_ctl->mutex);
+ put_caching_control(caching_ctl);
+ }
+
+ update_reserved_extents(block_group, ins->offset, 1);
+ btrfs_put_block_group(block_group);
+ ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
+ 0, owner, offset, ins, 1);
+ return ret;
+}
+
+/*
+ * finds a free extent and does all the dirty work required for allocation
+ * returns the key for the extent through ins, and a tree buffer for
+ * the first block of the extent through buf.
+ *
+ * returns 0 if everything worked, non-zero otherwise.
+ */
+static int alloc_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 num_bytes, u64 parent, u64 root_objectid,
+ struct btrfs_disk_key *key, int level,
+ u64 empty_size, u64 hint_byte, u64 search_end,
+ struct btrfs_key *ins)
+{
+ int ret;
+ u64 flags = 0;
+
+ ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
+ empty_size, hint_byte, search_end,
+ ins, 0);
+ if (ret)
+ return ret;
+
+ if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (parent == 0)
+ parent = ins->objectid;
+ flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ } else
+ BUG_ON(parent > 0);
+
+ if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+ struct btrfs_delayed_extent_op *extent_op;
+ extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+ BUG_ON(!extent_op);
+ if (key)
+ memcpy(&extent_op->key, key, sizeof(extent_op->key));
+ else
+ memset(&extent_op->key, 0, sizeof(extent_op->key));
+ extent_op->flags_to_set = flags;
+ extent_op->update_key = 1;
+ extent_op->update_flags = 1;
+ extent_op->is_data = 0;
+
+ ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
+ ins->offset, parent, root_objectid,
+ level, BTRFS_ADD_DELAYED_EXTENT,
+ extent_op);
+ BUG_ON(ret);
+ }
+
+ if (root_objectid == root->root_key.objectid) {
+ u64 used;
+ spin_lock(&root->node_lock);
+ used = btrfs_root_used(&root->root_item) + num_bytes;
+ btrfs_set_root_used(&root->root_item, used);
+ spin_unlock(&root->node_lock);
+ }
+ return ret;
+}
+
+struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u32 blocksize,
+ int level)
+{
+ struct extent_buffer *buf;
+
+ buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+ btrfs_set_header_generation(buf, trans->transid);
+ btrfs_set_buffer_lockdep_class(buf, level);
+ btrfs_tree_lock(buf);
+ clean_tree_block(trans, root, buf);
+
+ btrfs_set_lock_blocking(buf);
+ btrfs_set_buffer_uptodate(buf);
+
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ /*
+ * we allow two log transactions at a time, use different
+ * EXENT bit to differentiate dirty pages.
+ */
+ if (root->log_transid % 2 == 0)
+ set_extent_dirty(&root->dirty_log_pages, buf->start,
+ buf->start + buf->len - 1, GFP_NOFS);
+ else
+ set_extent_new(&root->dirty_log_pages, buf->start,
+ buf->start + buf->len - 1, GFP_NOFS);
+ } else {
+ set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
+ buf->start + buf->len - 1, GFP_NOFS);
+ }
+ trans->blocks_used++;
+ /* this returns a buffer locked for blocking */
+ return buf;
+}
+
+/*
+ * helper function to allocate a block for a given tree
+ * returns the tree buffer or NULL.
+ */
+struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u32 blocksize,
+ u64 parent, u64 root_objectid,
+ struct btrfs_disk_key *key, int level,
+ u64 hint, u64 empty_size)
+{
+ struct btrfs_key ins;
+ int ret;
+ struct extent_buffer *buf;
+
+ ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
+ key, level, empty_size, hint, (u64)-1, &ins);
+ if (ret) {
+ BUG_ON(ret > 0);
+ return ERR_PTR(ret);
+ }
+
+ buf = btrfs_init_new_buffer(trans, root, ins.objectid,
+ blocksize, level);
+ return buf;
+}
+
+struct walk_control {
+ u64 refs[BTRFS_MAX_LEVEL];
+ u64 flags[BTRFS_MAX_LEVEL];
+ struct btrfs_key update_progress;
+ int stage;
+ int level;
+ int shared_level;
+ int update_ref;
+ int keep_locks;
+ int reada_slot;
+ int reada_count;
+};
+
+#define DROP_REFERENCE 1
+#define UPDATE_BACKREF 2
+
+static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct walk_control *wc,
+ struct btrfs_path *path)
+{
+ u64 bytenr;
+ u64 generation;
+ u64 refs;
+ u64 flags;
+ u64 last = 0;
+ u32 nritems;
+ u32 blocksize;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ int ret;
+ int slot;
+ int nread = 0;
+
+ if (path->slots[wc->level] < wc->reada_slot) {
+ wc->reada_count = wc->reada_count * 2 / 3;
+ wc->reada_count = max(wc->reada_count, 2);
+ } else {
+ wc->reada_count = wc->reada_count * 3 / 2;
+ wc->reada_count = min_t(int, wc->reada_count,
+ BTRFS_NODEPTRS_PER_BLOCK(root));
+ }
+
+ eb = path->nodes[wc->level];
+ nritems = btrfs_header_nritems(eb);
+ blocksize = btrfs_level_size(root, wc->level - 1);
+
+ for (slot = path->slots[wc->level]; slot < nritems; slot++) {
+ if (nread >= wc->reada_count)
+ break;
+
+ cond_resched();
+ bytenr = btrfs_node_blockptr(eb, slot);
+ generation = btrfs_node_ptr_generation(eb, slot);
+
+ if (slot == path->slots[wc->level])
+ goto reada;
+
+ if (wc->stage == UPDATE_BACKREF &&
+ generation <= root->root_key.offset)
+ continue;
+
+ /* We don't lock the tree block, it's OK to be racy here */
+ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+ &refs, &flags);
+ BUG_ON(ret);
+ BUG_ON(refs == 0);
+
+ if (wc->stage == DROP_REFERENCE) {
+ if (refs == 1)
+ goto reada;
+
+ if (wc->level == 1 &&
+ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ continue;
+ if (!wc->update_ref ||
+ generation <= root->root_key.offset)
+ continue;
+ btrfs_node_key_to_cpu(eb, &key, slot);
+ ret = btrfs_comp_cpu_keys(&key,
+ &wc->update_progress);
+ if (ret < 0)
+ continue;
+ } else {
+ if (wc->level == 1 &&
+ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ continue;
+ }
+reada:
+ ret = readahead_tree_block(root, bytenr, blocksize,
+ generation);
+ if (ret)
+ break;
+ last = bytenr + blocksize;
+ nread++;
+ }
+ wc->reada_slot = slot;
+}
+
+/*
+ * hepler to process tree block while walking down the tree.
+ *
+ * when wc->stage == UPDATE_BACKREF, this function updates
+ * back refs for pointers in the block.
+ *
+ * NOTE: return value 1 means we should stop walking down.
+ */
+static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc, int lookup_info)
+{
+ int level = wc->level;
+ struct extent_buffer *eb = path->nodes[level];
+ u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ int ret;
+
+ if (wc->stage == UPDATE_BACKREF &&
+ btrfs_header_owner(eb) != root->root_key.objectid)
+ return 1;
+
+ /*
+ * when reference count of tree block is 1, it won't increase
+ * again. once full backref flag is set, we never clear it.
+ */
+ if (lookup_info &&
+ ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
+ (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
+ BUG_ON(!path->locks[level]);
+ ret = btrfs_lookup_extent_info(trans, root,
+ eb->start, eb->len,
+ &wc->refs[level],
+ &wc->flags[level]);
+ BUG_ON(ret);
+ BUG_ON(wc->refs[level] == 0);
+ }
+
+ if (wc->stage == DROP_REFERENCE) {
+ if (wc->refs[level] > 1)
+ return 1;
+
+ if (path->locks[level] && !wc->keep_locks) {
+ btrfs_tree_unlock(eb);
+ path->locks[level] = 0;
+ }
+ return 0;
+ }
+
+ /* wc->stage == UPDATE_BACKREF */
+ if (!(wc->flags[level] & flag)) {
+ BUG_ON(!path->locks[level]);
+ ret = btrfs_inc_ref(trans, root, eb, 1);
+ BUG_ON(ret);
+ ret = btrfs_dec_ref(trans, root, eb, 0);
+ BUG_ON(ret);
+ ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
+ eb->len, flag, 0);
+ BUG_ON(ret);
+ wc->flags[level] |= flag;
+ }
+
+ /*
+ * the block is shared by multiple trees, so it's not good to
+ * keep the tree lock
+ */
+ if (path->locks[level] && level > 0) {
+ btrfs_tree_unlock(eb);
+ path->locks[level] = 0;
+ }
+ return 0;
+}
+
+/*
+ * hepler to process tree block pointer.
+ *
+ * when wc->stage == DROP_REFERENCE, this function checks
+ * reference count of the block pointed to. if the block
+ * is shared and we need update back refs for the subtree
+ * rooted at the block, this function changes wc->stage to
+ * UPDATE_BACKREF. if the block is shared and there is no
+ * need to update back, this function drops the reference
+ * to the block.
+ *
+ * NOTE: return value 1 means we should stop walking down.
+ */
+static noinline int do_walk_down(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc, int *lookup_info)
+{
+ u64 bytenr;
+ u64 generation;
+ u64 parent;
+ u32 blocksize;
+ struct btrfs_key key;
+ struct extent_buffer *next;
+ int level = wc->level;
+ int reada = 0;
+ int ret = 0;
+
+ generation = btrfs_node_ptr_generation(path->nodes[level],
+ path->slots[level]);
+ /*
+ * if the lower level block was created before the snapshot
+ * was created, we know there is no need to update back refs
+ * for the subtree
+ */
+ if (wc->stage == UPDATE_BACKREF &&
+ generation <= root->root_key.offset) {
+ *lookup_info = 1;
+ return 1;
+ }
+
+ bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
+ blocksize = btrfs_level_size(root, level - 1);
+
+ next = btrfs_find_tree_block(root, bytenr, blocksize);
+ if (!next) {
+ next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ reada = 1;
+ }
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
+
+ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+ &wc->refs[level - 1],
+ &wc->flags[level - 1]);
+ BUG_ON(ret);
+ BUG_ON(wc->refs[level - 1] == 0);
+ *lookup_info = 0;
+
+ if (wc->stage == DROP_REFERENCE) {
+ if (wc->refs[level - 1] > 1) {
+ if (level == 1 &&
+ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ goto skip;
+
+ if (!wc->update_ref ||
+ generation <= root->root_key.offset)
+ goto skip;
+
+ btrfs_node_key_to_cpu(path->nodes[level], &key,
+ path->slots[level]);
+ ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
+ if (ret < 0)
+ goto skip;
+
+ wc->stage = UPDATE_BACKREF;
+ wc->shared_level = level - 1;
+ }
+ } else {
+ if (level == 1 &&
+ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ goto skip;
+ }
+
+ if (!btrfs_buffer_uptodate(next, generation)) {
+ btrfs_tree_unlock(next);
+ free_extent_buffer(next);
+ next = NULL;
+ *lookup_info = 1;
+ }
+
+ if (!next) {
+ if (reada && level == 1)
+ reada_walk_down(trans, root, wc, path);
+ next = read_tree_block(root, bytenr, blocksize, generation);
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
+ }
+
+ level--;
+ BUG_ON(level != btrfs_header_level(next));
+ path->nodes[level] = next;
+ path->slots[level] = 0;
+ path->locks[level] = 1;
+ wc->level = level;
+ if (wc->level == 1)
+ wc->reada_slot = 0;
+ return 0;
+skip:
+ wc->refs[level - 1] = 0;
+ wc->flags[level - 1] = 0;
+ if (wc->stage == DROP_REFERENCE) {
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+ parent = path->nodes[level]->start;
+ } else {
+ BUG_ON(root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level]));
+ parent = 0;
+ }
+
+ ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
+ root->root_key.objectid, level - 1, 0);
+ BUG_ON(ret);
+ }
+ btrfs_tree_unlock(next);
+ free_extent_buffer(next);
+ *lookup_info = 1;
+ return 1;
+}
+
+/*
+ * hepler to process tree block while walking up the tree.
+ *
+ * when wc->stage == DROP_REFERENCE, this function drops
+ * reference count on the block.
+ *
+ * when wc->stage == UPDATE_BACKREF, this function changes
+ * wc->stage back to DROP_REFERENCE if we changed wc->stage
+ * to UPDATE_BACKREF previously while processing the block.
+ *
+ * NOTE: return value 1 means we should stop walking up.
+ */
+static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc)
+{
+ int ret = 0;
+ int level = wc->level;
+ struct extent_buffer *eb = path->nodes[level];
+ u64 parent = 0;
+
+ if (wc->stage == UPDATE_BACKREF) {
+ BUG_ON(wc->shared_level < level);
+ if (level < wc->shared_level)
+ goto out;
+
+ ret = find_next_key(path, level + 1, &wc->update_progress);
+ if (ret > 0)
+ wc->update_ref = 0;
+
+ wc->stage = DROP_REFERENCE;
+ wc->shared_level = -1;
+ path->slots[level] = 0;
+
+ /*
+ * check reference count again if the block isn't locked.
+ * we should start walking down the tree again if reference
+ * count is one.
+ */
+ if (!path->locks[level]) {
+ BUG_ON(level == 0);
+ btrfs_tree_lock(eb);
+ btrfs_set_lock_blocking(eb);
+ path->locks[level] = 1;
+
+ ret = btrfs_lookup_extent_info(trans, root,
+ eb->start, eb->len,
+ &wc->refs[level],
+ &wc->flags[level]);
+ BUG_ON(ret);
+ BUG_ON(wc->refs[level] == 0);
+ if (wc->refs[level] == 1) {
+ btrfs_tree_unlock(eb);
+ path->locks[level] = 0;
+ return 1;
+ }
+ }
+ }
+
+ /* wc->stage == DROP_REFERENCE */
+ BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
+
+ if (wc->refs[level] == 1) {
+ if (level == 0) {
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ ret = btrfs_dec_ref(trans, root, eb, 1);
+ else
+ ret = btrfs_dec_ref(trans, root, eb, 0);
+ BUG_ON(ret);
+ }
+ /* make block locked assertion in clean_tree_block happy */
+ if (!path->locks[level] &&
+ btrfs_header_generation(eb) == trans->transid) {
+ btrfs_tree_lock(eb);
+ btrfs_set_lock_blocking(eb);
+ path->locks[level] = 1;
+ }
+ clean_tree_block(trans, root, eb);
+ }
+
+ if (eb == root->node) {
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ parent = eb->start;
+ else
+ BUG_ON(root->root_key.objectid !=
+ btrfs_header_owner(eb));
+ } else {
+ if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ parent = path->nodes[level + 1]->start;
+ else
+ BUG_ON(root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level + 1]));
+ }
+
+ ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
+ root->root_key.objectid, level, 0);
+ BUG_ON(ret);
+out:
+ wc->refs[level] = 0;
+ wc->flags[level] = 0;
+ return ret;
+}
+
+static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc)
+{
+ int level = wc->level;
+ int lookup_info = 1;
+ int ret;
+
+ while (level >= 0) {
+ ret = walk_down_proc(trans, root, path, wc, lookup_info);
+ if (ret > 0)
+ break;
+
+ if (level == 0)
+ break;
+
+ if (path->slots[level] >=
+ btrfs_header_nritems(path->nodes[level]))
+ break;
+
+ ret = do_walk_down(trans, root, path, wc, &lookup_info);
+ if (ret > 0) {
+ path->slots[level]++;
+ continue;
+ }
+ level = wc->level;
+ }
+ return 0;
+}
+
+static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc, int max_level)
+{
+ int level = wc->level;
+ int ret;
+
+ path->slots[level] = btrfs_header_nritems(path->nodes[level]);
+ while (level < max_level && path->nodes[level]) {
+ wc->level = level;
+ if (path->slots[level] + 1 <
+ btrfs_header_nritems(path->nodes[level])) {
+ path->slots[level]++;
+ return 0;
+ } else {
+ ret = walk_up_proc(trans, root, path, wc);
+ if (ret > 0)
+ return 0;
+
+ if (path->locks[level]) {
+ btrfs_tree_unlock(path->nodes[level]);
+ path->locks[level] = 0;
+ }
+ free_extent_buffer(path->nodes[level]);
+ path->nodes[level] = NULL;
+ level++;
+ }
+ }
+ return 1;
+}
+
+/*
+ * drop a subvolume tree.
+ *
+ * this function traverses the tree freeing any blocks that only
+ * referenced by the tree.
+ *
+ * when a shared tree block is found. this function decreases its
+ * reference count by one. if update_ref is true, this function
+ * also make sure backrefs for the shared block and all lower level
+ * blocks are properly updated.
+ */
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
+{
+ struct btrfs_path *path;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *tree_root = root->fs_info->tree_root;
+ struct btrfs_root_item *root_item = &root->root_item;
+ struct walk_control *wc;
+ struct btrfs_key key;
+ int err = 0;
+ int ret;
+ int level;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ wc = kzalloc(sizeof(*wc), GFP_NOFS);
+ BUG_ON(!wc);
+
+ trans = btrfs_start_transaction(tree_root, 1);
+
+ if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+ level = btrfs_header_level(root->node);
+ path->nodes[level] = btrfs_lock_root_node(root);
+ btrfs_set_lock_blocking(path->nodes[level]);
+ path->slots[level] = 0;
+ path->locks[level] = 1;
+ memset(&wc->update_progress, 0,
+ sizeof(wc->update_progress));
+ } else {
+ btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+ memcpy(&wc->update_progress, &key,
+ sizeof(wc->update_progress));
+
+ level = root_item->drop_level;
+ BUG_ON(level == 0);
+ path->lowest_level = level;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ path->lowest_level = 0;
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ WARN_ON(ret > 0);
+
+ /*
+ * unlock our path, this is safe because only this
+ * function is allowed to delete this snapshot
+ */
+ btrfs_unlock_up_safe(path, 0);
+
+ level = btrfs_header_level(root->node);
+ while (1) {
+ btrfs_tree_lock(path->nodes[level]);
+ btrfs_set_lock_blocking(path->nodes[level]);
+
+ ret = btrfs_lookup_extent_info(trans, root,
+ path->nodes[level]->start,
+ path->nodes[level]->len,
+ &wc->refs[level],
+ &wc->flags[level]);
+ BUG_ON(ret);
+ BUG_ON(wc->refs[level] == 0);
+
+ if (level == root_item->drop_level)
+ break;
+
+ btrfs_tree_unlock(path->nodes[level]);
+ WARN_ON(wc->refs[level] != 1);
+ level--;
+ }
+ }
+
+ wc->level = level;
+ wc->shared_level = -1;
+ wc->stage = DROP_REFERENCE;
+ wc->update_ref = update_ref;
+ wc->keep_locks = 0;
+ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
+
+ while (1) {
+ ret = walk_down_tree(trans, root, path, wc);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+
+ ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+
+ if (ret > 0) {
+ BUG_ON(wc->stage != DROP_REFERENCE);
+ break;
+ }
+
+ if (wc->stage == DROP_REFERENCE) {
+ level = wc->level;
+ btrfs_node_key(path->nodes[level],
+ &root_item->drop_progress,
+ path->slots[level]);
+ root_item->drop_level = level;
+ }
+
+ BUG_ON(wc->level == 0);
+ if (trans->transaction->in_commit ||
+ trans->transaction->delayed_refs.flushing) {
+ ret = btrfs_update_root(trans, tree_root,
+ &root->root_key,
+ root_item);
+ BUG_ON(ret);
+
+ btrfs_end_transaction(trans, tree_root);
+ trans = btrfs_start_transaction(tree_root, 1);
+ } else {
+ unsigned long update;
+ update = trans->delayed_ref_updates;
+ trans->delayed_ref_updates = 0;
+ if (update)
+ btrfs_run_delayed_refs(trans, tree_root,
+ update);
+ }
+ }
+ btrfs_release_path(root, path);
+ BUG_ON(err);
+
+ ret = btrfs_del_root(trans, tree_root, &root->root_key);
+ BUG_ON(ret);
+
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
+ NULL, NULL);
+ BUG_ON(ret < 0);
+ if (ret > 0) {
+ ret = btrfs_del_orphan_item(trans, tree_root,
+ root->root_key.objectid);
+ BUG_ON(ret);
+ }
+ }
+
+ if (root->in_radix) {
+ btrfs_free_fs_root(tree_root->fs_info, root);
+ } else {
+ free_extent_buffer(root->node);
+ free_extent_buffer(root->commit_root);
+ kfree(root);
+ }
+out:
+ btrfs_end_transaction(trans, tree_root);
+ kfree(wc);
+ btrfs_free_path(path);
+ return err;
+}
+
+/*
+ * drop subtree rooted at tree block 'node'.
+ *
+ * NOTE: this function will unlock and release tree block 'node'
+ */
+int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *node,
+ struct extent_buffer *parent)
+{
+ struct btrfs_path *path;
+ struct walk_control *wc;
+ int level;
+ int parent_level;
+ int ret = 0;
+ int wret;
+
+ BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ wc = kzalloc(sizeof(*wc), GFP_NOFS);
+ BUG_ON(!wc);
+
+ btrfs_assert_tree_locked(parent);
+ parent_level = btrfs_header_level(parent);
+ extent_buffer_get(parent);
+ path->nodes[parent_level] = parent;
+ path->slots[parent_level] = btrfs_header_nritems(parent);
+
+ btrfs_assert_tree_locked(node);
+ level = btrfs_header_level(node);
+ path->nodes[level] = node;
+ path->slots[level] = 0;
+ path->locks[level] = 1;
+
+ wc->refs[parent_level] = 1;
+ wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ wc->level = level;
+ wc->shared_level = -1;
+ wc->stage = DROP_REFERENCE;
+ wc->update_ref = 0;
+ wc->keep_locks = 1;
+ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
+
+ while (1) {
+ wret = walk_down_tree(trans, root, path, wc);
+ if (wret < 0) {
+ ret = wret;
+ break;
+ }
+
+ wret = walk_up_tree(trans, root, path, wc, parent_level);
+ if (wret < 0)
+ ret = wret;
+ if (wret != 0)
+ break;
+ }
+
+ kfree(wc);
+ btrfs_free_path(path);
+ return ret;
+}
+
+#if 0
+static unsigned long calc_ra(unsigned long start, unsigned long last,
+ unsigned long nr)
+{
+ return min(last, start + nr - 1);
+}
+
+static noinline int relocate_inode_pages(struct inode *inode, u64 start,
+ u64 len)
+{
+ u64 page_start;
+ u64 page_end;
+ unsigned long first_index;
+ unsigned long last_index;
+ unsigned long i;
+ struct page *page;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct file_ra_state *ra;
+ struct btrfs_ordered_extent *ordered;
+ unsigned int total_read = 0;
+ unsigned int total_dirty = 0;
+ int ret = 0;
+
+ ra = kzalloc(sizeof(*ra), GFP_NOFS);
+
+ mutex_lock(&inode->i_mutex);
+ first_index = start >> PAGE_CACHE_SHIFT;
+ last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
+
+ /* make sure the dirty trick played by the caller work */
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ first_index, last_index);
+ if (ret)
+ goto out_unlock;
+
+ file_ra_state_init(ra, inode->i_mapping);
+
+ for (i = first_index ; i <= last_index; i++) {
+ if (total_read % ra->ra_pages == 0) {
+ btrfs_force_ra(inode->i_mapping, ra, NULL, i,
+ calc_ra(i, last_index, ra->ra_pages));
+ }
+ total_read++;
+again:
+ if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
+ BUG_ON(1);
+ page = grab_cache_page(inode->i_mapping, i);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ ret = -EIO;
+ goto out_unlock;
+ }
+ }
+ wait_on_page_writeback(page);
+
+ page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+ lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ if (ordered) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_page(page);
+ page_cache_release(page);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+ set_page_extent_mapped(page);
+
+ if (i == first_index)
+ set_extent_bits(io_tree, page_start, page_end,
+ EXTENT_BOUNDARY, GFP_NOFS);
+ btrfs_set_extent_delalloc(inode, page_start, page_end);
+
+ set_page_dirty(page);
+ total_dirty++;
+
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+
+out_unlock:
+ kfree(ra);
+ mutex_unlock(&inode->i_mutex);
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
+ return ret;
+}
+
+static noinline int relocate_data_extent(struct inode *reloc_inode,
+ struct btrfs_key *extent_key,
+ u64 offset)
+{
+ struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+ struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
+ struct extent_map *em;
+ u64 start = extent_key->objectid - offset;
+ u64 end = start + extent_key->offset - 1;
+
+ em = alloc_extent_map(GFP_NOFS);
+ BUG_ON(!em || IS_ERR(em));
+
+ em->start = start;
+ em->len = extent_key->offset;
+ em->block_len = extent_key->offset;
+ em->block_start = extent_key->objectid;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ /* setup extent map to cheat btrfs_readpage */
+ lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
+ while (1) {
+ int ret;
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(reloc_inode, start, end, 0);
+ }
+ unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
+
+ return relocate_inode_pages(reloc_inode, start, extent_key->offset);
+}
+
+struct btrfs_ref_path {
+ u64 extent_start;
+ u64 nodes[BTRFS_MAX_LEVEL];
+ u64 root_objectid;
+ u64 root_generation;
+ u64 owner_objectid;
+ u32 num_refs;
+ int lowest_level;
+ int current_level;
+ int shared_level;
+
+ struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
+ u64 new_nodes[BTRFS_MAX_LEVEL];
+};
+
+struct disk_extent {
+ u64 ram_bytes;
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 offset;
+ u64 num_bytes;
+ u8 compression;
+ u8 encryption;
+ u16 other_encoding;
+};
+
+static int is_cowonly_root(u64 root_objectid)
+{
+ if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
+ root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+ root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
+ root_objectid == BTRFS_DEV_TREE_OBJECTID ||
+ root_objectid == BTRFS_TREE_LOG_OBJECTID ||
+ root_objectid == BTRFS_CSUM_TREE_OBJECTID)
+ return 1;
+ return 0;
+}
+
+static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct btrfs_ref_path *ref_path,
+ int first_time)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_path *path;
+ struct btrfs_extent_ref *ref;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u64 bytenr;
+ u32 nritems;
+ int level;
+ int ret = 1;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (first_time) {
+ ref_path->lowest_level = -1;
+ ref_path->current_level = -1;
+ ref_path->shared_level = -1;
+ goto walk_up;
+ }
+walk_down:
+ level = ref_path->current_level - 1;
+ while (level >= -1) {
+ u64 parent;
+ if (level < ref_path->lowest_level)
+ break;
+
+ if (level >= 0)
+ bytenr = ref_path->nodes[level];
+ else
+ bytenr = ref_path->extent_start;
+ BUG_ON(bytenr == 0);
+
+ parent = ref_path->nodes[level + 1];
+ ref_path->nodes[level + 1] = 0;
+ ref_path->current_level = level;
+ BUG_ON(parent == 0);
+
+ key.objectid = bytenr;
+ key.offset = parent + 1;
+ key.type = BTRFS_EXTENT_REF_KEY;
+
+ ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(extent_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ goto next;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid == bytenr &&
+ found_key.type == BTRFS_EXTENT_REF_KEY) {
+ if (level < ref_path->shared_level)
+ ref_path->shared_level = level;
+ goto found;
+ }
+next:
+ level--;
+ btrfs_release_path(extent_root, path);
+ cond_resched();
+ }
+ /* reached lowest level */
+ ret = 1;
+ goto out;
+walk_up:
+ level = ref_path->current_level;
+ while (level < BTRFS_MAX_LEVEL - 1) {
+ u64 ref_objectid;
+
+ if (level >= 0)
+ bytenr = ref_path->nodes[level];
+ else
+ bytenr = ref_path->extent_start;
+
+ BUG_ON(bytenr == 0);
+
+ key.objectid = bytenr;
+ key.offset = 0;
+ key.type = BTRFS_EXTENT_REF_KEY;
+
+ ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(extent_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ /* the extent was freed by someone */
+ if (ref_path->lowest_level == level)
+ goto out;
+ btrfs_release_path(extent_root, path);
+ goto walk_down;
+ }
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != bytenr ||
+ found_key.type != BTRFS_EXTENT_REF_KEY) {
+ /* the extent was freed by someone */
+ if (ref_path->lowest_level == level) {
+ ret = 1;
+ goto out;
+ }
+ btrfs_release_path(extent_root, path);
+ goto walk_down;
+ }
+found:
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_ref);
+ ref_objectid = btrfs_ref_objectid(leaf, ref);
+ if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+ if (first_time) {
+ level = (int)ref_objectid;
+ BUG_ON(level >= BTRFS_MAX_LEVEL);
+ ref_path->lowest_level = level;
+ ref_path->current_level = level;
+ ref_path->nodes[level] = bytenr;
+ } else {
+ WARN_ON(ref_objectid != level);
+ }
+ } else {
+ WARN_ON(level != -1);
+ }
+ first_time = 0;
+
+ if (ref_path->lowest_level == level) {
+ ref_path->owner_objectid = ref_objectid;
+ ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
+ }
+
+ /*
+ * the block is tree root or the block isn't in reference
+ * counted tree.
+ */
+ if (found_key.objectid == found_key.offset ||
+ is_cowonly_root(btrfs_ref_root(leaf, ref))) {
+ ref_path->root_objectid = btrfs_ref_root(leaf, ref);
+ ref_path->root_generation =
+ btrfs_ref_generation(leaf, ref);
+ if (level < 0) {
+ /* special reference from the tree log */
+ ref_path->nodes[0] = found_key.offset;
+ ref_path->current_level = 0;
+ }
+ ret = 0;
+ goto out;
+ }
+
+ level++;
+ BUG_ON(ref_path->nodes[level] != 0);
+ ref_path->nodes[level] = found_key.offset;
+ ref_path->current_level = level;
+
+ /*
+ * the reference was created in the running transaction,
+ * no need to continue walking up.
+ */
+ if (btrfs_ref_generation(leaf, ref) == trans->transid) {
+ ref_path->root_objectid = btrfs_ref_root(leaf, ref);
+ ref_path->root_generation =
+ btrfs_ref_generation(leaf, ref);
+ ret = 0;
+ goto out;
+ }
+
+ btrfs_release_path(extent_root, path);
+ cond_resched();
+ }
+ /* reached max tree level, but no tree root found. */
+ BUG();
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct btrfs_ref_path *ref_path,
+ u64 extent_start)
+{
+ memset(ref_path, 0, sizeof(*ref_path));
+ ref_path->extent_start = extent_start;
+
+ return __next_ref_path(trans, extent_root, ref_path, 1);
+}
+
+static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct btrfs_ref_path *ref_path)
+{
+ return __next_ref_path(trans, extent_root, ref_path, 0);
+}
+
+static noinline int get_new_locations(struct inode *reloc_inode,
+ struct btrfs_key *extent_key,
+ u64 offset, int no_fragment,
+ struct disk_extent **extents,
+ int *nr_extents)
+{
+ struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+ struct btrfs_path *path;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf;
+ struct disk_extent *exts = *extents;
+ struct btrfs_key found_key;
+ u64 cur_pos;
+ u64 last_byte;
+ u32 nritems;
+ int nr = 0;
+ int max = *nr_extents;
+ int ret;
+
+ WARN_ON(!no_fragment && *extents);
+ if (!exts) {
+ max = 1;
+ exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
+ if (!exts)
+ return -ENOMEM;
+ }
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ cur_pos = extent_key->objectid - offset;
+ last_byte = extent_key->objectid + extent_key->offset;
+ ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
+ cur_pos, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ while (1) {
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.offset != cur_pos ||
+ found_key.type != BTRFS_EXTENT_DATA_KEY ||
+ found_key.objectid != reloc_inode->i_ino)
+ break;
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) !=
+ BTRFS_FILE_EXTENT_REG ||
+ btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
+ break;
+
+ if (nr == max) {
+ struct disk_extent *old = exts;
+ max *= 2;
+ exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
+ memcpy(exts, old, sizeof(*exts) * nr);
+ if (old != *extents)
+ kfree(old);
+ }
+
+ exts[nr].disk_bytenr =
+ btrfs_file_extent_disk_bytenr(leaf, fi);
+ exts[nr].disk_num_bytes =
+ btrfs_file_extent_disk_num_bytes(leaf, fi);
+ exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
+ exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+ exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
+ exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
+ exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
+ fi);
+ BUG_ON(exts[nr].offset > 0);
+ BUG_ON(exts[nr].compression || exts[nr].encryption);
+ BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
+
+ cur_pos += exts[nr].num_bytes;
+ nr++;
+
+ if (cur_pos + offset >= last_byte)
+ break;
+
+ if (no_fragment) {
+ ret = 1;
+ goto out;
+ }
+ path->slots[0]++;
+ }
+
+ BUG_ON(cur_pos + offset > last_byte);
+ if (cur_pos + offset < last_byte) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ if (ret) {
+ if (exts != *extents)
+ kfree(exts);
+ } else {
+ *extents = exts;
+ *nr_extents = nr;
+ }
+ return ret;
+}
+
+static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *extent_key,
+ struct btrfs_key *leaf_key,
+ struct btrfs_ref_path *ref_path,
+ struct disk_extent *new_extents,
+ int nr_extents)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct inode *inode = NULL;
+ struct btrfs_key key;
+ u64 lock_start = 0;
+ u64 lock_end = 0;
+ u64 num_bytes;
+ u64 ext_offset;
+ u64 search_end = (u64)-1;
+ u32 nritems;
+ int nr_scaned = 0;
+ int extent_locked = 0;
+ int extent_type;
+ int ret;
+
+ memcpy(&key, leaf_key, sizeof(key));
+ if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
+ if (key.objectid < ref_path->owner_objectid ||
+ (key.objectid == ref_path->owner_objectid &&
+ key.type < BTRFS_EXTENT_DATA_KEY)) {
+ key.objectid = ref_path->owner_objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+ }
+ }
+
+ while (1) {
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+next:
+ if (extent_locked && ret > 0) {
+ /*
+ * the file extent item was modified by someone
+ * before the extent got locked.
+ */
+ unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+ lock_end, GFP_NOFS);
+ extent_locked = 0;
+ }
+
+ if (path->slots[0] >= nritems) {
+ if (++nr_scaned > 2)
+ break;
+
+ BUG_ON(extent_locked);
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
+ if ((key.objectid > ref_path->owner_objectid) ||
+ (key.objectid == ref_path->owner_objectid &&
+ key.type > BTRFS_EXTENT_DATA_KEY) ||
+ key.offset >= search_end)
+ break;
+ }
+
+ if (inode && key.objectid != inode->i_ino) {
+ BUG_ON(extent_locked);
+ btrfs_release_path(root, path);
+ mutex_unlock(&inode->i_mutex);
+ iput(inode);
+ inode = NULL;
+ continue;
+ }
+
+ if (key.type != BTRFS_EXTENT_DATA_KEY) {
+ path->slots[0]++;
+ ret = 1;
+ goto next;
+ }
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+ if ((extent_type != BTRFS_FILE_EXTENT_REG &&
+ extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
+ (btrfs_file_extent_disk_bytenr(leaf, fi) !=
+ extent_key->objectid)) {
+ path->slots[0]++;
+ ret = 1;
+ goto next;
+ }
+
+ num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+ ext_offset = btrfs_file_extent_offset(leaf, fi);
+
+ if (search_end == (u64)-1) {
+ search_end = key.offset - ext_offset +
+ btrfs_file_extent_ram_bytes(leaf, fi);
+ }
+
+ if (!extent_locked) {
+ lock_start = key.offset;
+ lock_end = lock_start + num_bytes - 1;
+ } else {
+ if (lock_start > key.offset ||
+ lock_end + 1 < key.offset + num_bytes) {
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ lock_start, lock_end, GFP_NOFS);
+ extent_locked = 0;
+ }
+ }
+
+ if (!inode) {
+ btrfs_release_path(root, path);
+
+ inode = btrfs_iget_locked(root->fs_info->sb,
+ key.objectid, root);
+ if (inode->i_state & I_NEW) {
+ BTRFS_I(inode)->root = root;
+ BTRFS_I(inode)->location.objectid =
+ key.objectid;
+ BTRFS_I(inode)->location.type =
+ BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.offset = 0;
+ btrfs_read_locked_inode(inode);
+ unlock_new_inode(inode);
+ }
+ /*
+ * some code call btrfs_commit_transaction while
+ * holding the i_mutex, so we can't use mutex_lock
+ * here.
+ */
+ if (is_bad_inode(inode) ||
+ !mutex_trylock(&inode->i_mutex)) {
+ iput(inode);
+ inode = NULL;
+ key.offset = (u64)-1;
+ goto skip;
+ }
+ }
+
+ if (!extent_locked) {
+ struct btrfs_ordered_extent *ordered;
+
+ btrfs_release_path(root, path);
+
+ lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+ lock_end, GFP_NOFS);
+ ordered = btrfs_lookup_first_ordered_extent(inode,
+ lock_end);
+ if (ordered &&
+ ordered->file_offset <= lock_end &&
+ ordered->file_offset + ordered->len > lock_start) {
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ lock_start, lock_end, GFP_NOFS);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ key.offset += num_bytes;
+ goto skip;
+ }
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+
+ extent_locked = 1;
+ continue;
+ }
+
+ if (nr_extents == 1) {
+ /* update extent pointer in place */
+ btrfs_set_file_extent_disk_bytenr(leaf, fi,
+ new_extents[0].disk_bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, fi,
+ new_extents[0].disk_num_bytes);
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_drop_extent_cache(inode, key.offset,
+ key.offset + num_bytes - 1, 0);
+
+ ret = btrfs_inc_extent_ref(trans, root,
+ new_extents[0].disk_bytenr,
+ new_extents[0].disk_num_bytes,
+ leaf->start,
+ root->root_key.objectid,
+ trans->transid,
+ key.objectid);
+ BUG_ON(ret);
+
+ ret = btrfs_free_extent(trans, root,
+ extent_key->objectid,
+ extent_key->offset,
+ leaf->start,
+ btrfs_header_owner(leaf),
+ btrfs_header_generation(leaf),
+ key.objectid, 0);
+ BUG_ON(ret);
+
+ btrfs_release_path(root, path);
+ key.offset += num_bytes;
+ } else {
+ BUG_ON(1);
+#if 0
+ u64 alloc_hint;
+ u64 extent_len;
+ int i;
+ /*
+ * drop old extent pointer at first, then insert the
+ * new pointers one bye one
+ */
+ btrfs_release_path(root, path);
+ ret = btrfs_drop_extents(trans, root, inode, key.offset,
+ key.offset + num_bytes,
+ key.offset, &alloc_hint);
+ BUG_ON(ret);
+
+ for (i = 0; i < nr_extents; i++) {
+ if (ext_offset >= new_extents[i].num_bytes) {
+ ext_offset -= new_extents[i].num_bytes;
+ continue;
+ }
+ extent_len = min(new_extents[i].num_bytes -
+ ext_offset, num_bytes);
+
+ ret = btrfs_insert_empty_item(trans, root,
+ path, &key,
+ sizeof(*fi));
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
+ btrfs_set_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_disk_bytenr(leaf, fi,
+ new_extents[i].disk_bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, fi,
+ new_extents[i].disk_num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi,
+ new_extents[i].ram_bytes);
+
+ btrfs_set_file_extent_compression(leaf, fi,
+ new_extents[i].compression);
+ btrfs_set_file_extent_encryption(leaf, fi,
+ new_extents[i].encryption);
+ btrfs_set_file_extent_other_encoding(leaf, fi,
+ new_extents[i].other_encoding);
+
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_len);
+ ext_offset += new_extents[i].offset;
+ btrfs_set_file_extent_offset(leaf, fi,
+ ext_offset);
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_drop_extent_cache(inode, key.offset,
+ key.offset + extent_len - 1, 0);
+
+ ret = btrfs_inc_extent_ref(trans, root,
+ new_extents[i].disk_bytenr,
+ new_extents[i].disk_num_bytes,
+ leaf->start,
+ root->root_key.objectid,
+ trans->transid, key.objectid);
+ BUG_ON(ret);
+ btrfs_release_path(root, path);
+
+ inode_add_bytes(inode, extent_len);
+
+ ext_offset = 0;
+ num_bytes -= extent_len;
+ key.offset += extent_len;
+
+ if (num_bytes == 0)
+ break;
+ }
+ BUG_ON(i >= nr_extents);
+#endif
+ }
+
+ if (extent_locked) {
+ unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+ lock_end, GFP_NOFS);
+ extent_locked = 0;
+ }
+skip:
+ if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
+ key.offset >= search_end)
+ break;
+
+ cond_resched();
+ }
+ ret = 0;
+out:
+ btrfs_release_path(root, path);
+ if (inode) {
+ mutex_unlock(&inode->i_mutex);
+ if (extent_locked) {
+ unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+ lock_end, GFP_NOFS);
+ }
+ iput(inode);
+ }
+ return ret;
+}
+
+int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf, u64 orig_start)
+{
+ int level;
+ int ret;
+
+ BUG_ON(btrfs_header_generation(buf) != trans->transid);
+ BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+
+ level = btrfs_header_level(buf);
+ if (level == 0) {
+ struct btrfs_leaf_ref *ref;
+ struct btrfs_leaf_ref *orig_ref;
+
+ orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
+ if (!orig_ref)
+ return -ENOENT;
+
+ ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
+ if (!ref) {
+ btrfs_free_leaf_ref(root, orig_ref);
+ return -ENOMEM;
+ }
+
+ ref->nritems = orig_ref->nritems;
+ memcpy(ref->extents, orig_ref->extents,
+ sizeof(ref->extents[0]) * ref->nritems);
+
+ btrfs_free_leaf_ref(root, orig_ref);
+
+ ref->root_gen = trans->transid;
+ ref->bytenr = buf->start;
+ ref->owner = btrfs_header_owner(buf);
+ ref->generation = btrfs_header_generation(buf);
+
+ ret = btrfs_add_leaf_ref(root, ref, 0);
+ WARN_ON(ret);
+ btrfs_free_leaf_ref(root, ref);
+ }
+ return 0;
+}
+
+static noinline int invalidate_extent_cache(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_block_group_cache *group,
+ struct btrfs_root *target_root)
+{
+ struct btrfs_key key;
+ struct inode *inode = NULL;
+ struct btrfs_file_extent_item *fi;
+ u64 num_bytes;
+ u64 skip_objectid = 0;
+ u32 nritems;
+ u32 i;
+
+ nritems = btrfs_header_nritems(leaf);
+ for (i = 0; i < nritems; i++) {
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ if (key.objectid == skip_objectid ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
+ continue;
+ if (!inode || inode->i_ino != key.objectid) {
+ iput(inode);
+ inode = btrfs_ilookup(target_root->fs_info->sb,
+ key.objectid, target_root, 1);
+ }
+ if (!inode) {
+ skip_objectid = key.objectid;
+ continue;
+ }
+ num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+
+ lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
+ key.offset + num_bytes - 1, GFP_NOFS);
+ btrfs_drop_extent_cache(inode, key.offset,
+ key.offset + num_bytes - 1, 1);
+ unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
+ key.offset + num_bytes - 1, GFP_NOFS);
+ cond_resched();
+ }
+ iput(inode);
+ return 0;
+}
+
+static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_block_group_cache *group,
+ struct inode *reloc_inode)
+{
+ struct btrfs_key key;
+ struct btrfs_key extent_key;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_leaf_ref *ref;
+ struct disk_extent *new_extent;
+ u64 bytenr;
+ u64 num_bytes;
+ u32 nritems;
+ u32 i;
+ int ext_index;
+ int nr_extent;
+ int ret;
+
+ new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
+ BUG_ON(!new_extent);
+
+ ref = btrfs_lookup_leaf_ref(root, leaf->start);
+ BUG_ON(!ref);
+
+ ext_index = -1;
+ nritems = btrfs_header_nritems(leaf);
+ for (i = 0; i < nritems; i++) {
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ if (bytenr == 0)
+ continue;
+
+ ext_index++;
+ if (bytenr >= group->key.objectid + group->key.offset ||
+ bytenr + num_bytes <= group->key.objectid)
+ continue;
+
+ extent_key.objectid = bytenr;
+ extent_key.offset = num_bytes;
+ extent_key.type = BTRFS_EXTENT_ITEM_KEY;
+ nr_extent = 1;
+ ret = get_new_locations(reloc_inode, &extent_key,
+ group->key.objectid, 1,
+ &new_extent, &nr_extent);
+ if (ret > 0)
+ continue;
+ BUG_ON(ret < 0);
+
+ BUG_ON(ref->extents[ext_index].bytenr != bytenr);
+ BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
+ ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
+ ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
+
+ btrfs_set_file_extent_disk_bytenr(leaf, fi,
+ new_extent->disk_bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, fi,
+ new_extent->disk_num_bytes);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = btrfs_inc_extent_ref(trans, root,
+ new_extent->disk_bytenr,
+ new_extent->disk_num_bytes,
+ leaf->start,
+ root->root_key.objectid,
+ trans->transid, key.objectid);
+ BUG_ON(ret);
+
+ ret = btrfs_free_extent(trans, root,
+ bytenr, num_bytes, leaf->start,
+ btrfs_header_owner(leaf),
+ btrfs_header_generation(leaf),
+ key.objectid, 0);
+ BUG_ON(ret);
+ cond_resched();
+ }
+ kfree(new_extent);
+ BUG_ON(ext_index + 1 != ref->nritems);
+ btrfs_free_leaf_ref(root, ref);
+ return 0;
+}
+
+int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+ int ret;
+
+ if (root->reloc_root) {
+ reloc_root = root->reloc_root;
+ root->reloc_root = NULL;
+ list_add(&reloc_root->dead_list,
+ &root->fs_info->dead_reloc_roots);
+
+ btrfs_set_root_bytenr(&reloc_root->root_item,
+ reloc_root->node->start);
+ btrfs_set_root_level(&root->root_item,
+ btrfs_header_level(reloc_root->node));
+ memset(&reloc_root->root_item.drop_progress, 0,
+ sizeof(struct btrfs_disk_key));
+ reloc_root->root_item.drop_level = 0;
+
+ ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ &reloc_root->root_key,
+ &reloc_root->root_item);
+ BUG_ON(ret);
+ }
+ return 0;
+}
+
+int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *reloc_root;
+ struct btrfs_root *prev_root = NULL;
+ struct list_head dead_roots;
+ int ret;
+ unsigned long nr;
+
+ INIT_LIST_HEAD(&dead_roots);
+ list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
+
+ while (!list_empty(&dead_roots)) {
+ reloc_root = list_entry(dead_roots.prev,
+ struct btrfs_root, dead_list);
+ list_del_init(&reloc_root->dead_list);
+
+ BUG_ON(reloc_root->commit_root != NULL);
+ while (1) {
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(!trans);
+
+ mutex_lock(&root->fs_info->drop_mutex);
+ ret = btrfs_drop_snapshot(trans, reloc_root);
+ if (ret != -EAGAIN)
+ break;
+ mutex_unlock(&root->fs_info->drop_mutex);
+
+ nr = trans->blocks_used;
+ ret = btrfs_end_transaction(trans, root);
+ BUG_ON(ret);
+ btrfs_btree_balance_dirty(root, nr);
+ }
+
+ free_extent_buffer(reloc_root->node);
+
+ ret = btrfs_del_root(trans, root->fs_info->tree_root,
+ &reloc_root->root_key);
+ BUG_ON(ret);
+ mutex_unlock(&root->fs_info->drop_mutex);
+
+ nr = trans->blocks_used;
+ ret = btrfs_end_transaction(trans, root);
+ BUG_ON(ret);
+ btrfs_btree_balance_dirty(root, nr);
+
+ kfree(prev_root);
+ prev_root = reloc_root;
+ }
+ if (prev_root) {
+ btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
+ kfree(prev_root);
+ }
+ return 0;
+}
+
+int btrfs_add_dead_reloc_root(struct btrfs_root *root)
+{
+ list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
+ return 0;
+}
+
+int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_key location;
+ int found;
+ int ret;
+
+ mutex_lock(&root->fs_info->tree_reloc_mutex);
+ ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
+ BUG_ON(ret);
+ found = !list_empty(&root->fs_info->dead_reloc_roots);
+ mutex_unlock(&root->fs_info->tree_reloc_mutex);
+
+ if (found) {
+ trans = btrfs_start_transaction(root, 1);
+ BUG_ON(!trans);
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ }
+
+ location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
+ location.offset = (u64)-1;
+ location.type = BTRFS_ROOT_ITEM_KEY;
+
+ reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+ BUG_ON(!reloc_root);
+ btrfs_orphan_cleanup(reloc_root);
+ return 0;
+}
+
+static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+ struct extent_buffer *eb;
+ struct btrfs_root_item *root_item;
+ struct btrfs_key root_key;
+ int ret;
+
+ BUG_ON(!root->ref_cows);
+ if (root->reloc_root)
+ return 0;
+
+ root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
+ BUG_ON(!root_item);
+
+ ret = btrfs_copy_root(trans, root, root->commit_root,
+ &eb, BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(ret);
+
+ root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+ root_key.offset = root->root_key.objectid;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+
+ memcpy(root_item, &root->root_item, sizeof(root_item));
+ btrfs_set_root_refs(root_item, 0);
+ btrfs_set_root_bytenr(root_item, eb->start);
+ btrfs_set_root_level(root_item, btrfs_header_level(eb));
+ btrfs_set_root_generation(root_item, trans->transid);
+
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+
+ ret = btrfs_insert_root(trans, root->fs_info->tree_root,
+ &root_key, root_item);
+ BUG_ON(ret);
+ kfree(root_item);
+
+ reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
+ &root_key);
+ BUG_ON(!reloc_root);
+ reloc_root->last_trans = trans->transid;
+ reloc_root->commit_root = NULL;
+ reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
+
+ root->reloc_root = reloc_root;
+ return 0;
+}
+
+/*
+ * Core function of space balance.
+ *
+ * The idea is using reloc trees to relocate tree blocks in reference
+ * counted roots. There is one reloc tree for each subvol, and all
+ * reloc trees share same root key objectid. Reloc trees are snapshots
+ * of the latest committed roots of subvols (root->commit_root).
+ *
+ * To relocate a tree block referenced by a subvol, there are two steps.
+ * COW the block through subvol's reloc tree, then update block pointer
+ * in the subvol to point to the new block. Since all reloc trees share
+ * same root key objectid, doing special handing for tree blocks owned
+ * by them is easy. Once a tree block has been COWed in one reloc tree,
+ * we can use the resulting new block directly when the same block is
+ * required to COW again through other reloc trees. By this way, relocated
+ * tree blocks are shared between reloc trees, so they are also shared
+ * between subvols.
+ */
+static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *first_key,
+ struct btrfs_ref_path *ref_path,
+ struct btrfs_block_group_cache *group,
+ struct inode *reloc_inode)
+{
+ struct btrfs_root *reloc_root;
+ struct extent_buffer *eb = NULL;
+ struct btrfs_key *keys;
+ u64 *nodes;
+ int level;
+ int shared_level;
+ int lowest_level = 0;
+ int ret;
+
+ if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+ lowest_level = ref_path->owner_objectid;
+
+ if (!root->ref_cows) {
+ path->lowest_level = lowest_level;
+ ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
+ BUG_ON(ret < 0);
+ path->lowest_level = 0;
+ btrfs_release_path(root, path);
+ return 0;
+ }
+
+ mutex_lock(&root->fs_info->tree_reloc_mutex);
+ ret = init_reloc_tree(trans, root);
+ BUG_ON(ret);
+ reloc_root = root->reloc_root;
+
+ shared_level = ref_path->shared_level;
+ ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
+
+ keys = ref_path->node_keys;
+ nodes = ref_path->new_nodes;
+ memset(&keys[shared_level + 1], 0,
+ sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
+ memset(&nodes[shared_level + 1], 0,
+ sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
+
+ if (nodes[lowest_level] == 0) {
+ path->lowest_level = lowest_level;
+ ret = btrfs_search_slot(trans, reloc_root, first_key, path,
+ 0, 1);
+ BUG_ON(ret);
+ for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
+ eb = path->nodes[level];
+ if (!eb || eb == reloc_root->node)
+ break;
+ nodes[level] = eb->start;
+ if (level == 0)
+ btrfs_item_key_to_cpu(eb, &keys[level], 0);
+ else
+ btrfs_node_key_to_cpu(eb, &keys[level], 0);
+ }
+ if (nodes[0] &&
+ ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+ eb = path->nodes[0];
+ ret = replace_extents_in_leaf(trans, reloc_root, eb,
+ group, reloc_inode);
+ BUG_ON(ret);
+ }
+ btrfs_release_path(reloc_root, path);
+ } else {
+ ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
+ lowest_level);
+ BUG_ON(ret);
+ }
+
+ /*
+ * replace tree blocks in the fs tree with tree blocks in
+ * the reloc tree.
+ */
+ ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
+ BUG_ON(ret < 0);
+
+ if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+ ret = btrfs_search_slot(trans, reloc_root, first_key, path,
+ 0, 0);
+ BUG_ON(ret);
+ extent_buffer_get(path->nodes[0]);
+ eb = path->nodes[0];
+ btrfs_release_path(reloc_root, path);
+ ret = invalidate_extent_cache(reloc_root, eb, group, root);
+ BUG_ON(ret);
+ free_extent_buffer(eb);
+ }
+
+ mutex_unlock(&root->fs_info->tree_reloc_mutex);
+ path->lowest_level = 0;
+ return 0;
+}
+
+static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *first_key,
+ struct btrfs_ref_path *ref_path)
+{
+ int ret;
+
+ ret = relocate_one_path(trans, root, path, first_key,
+ ref_path, NULL, NULL);
+ BUG_ON(ret);
+
+ return 0;
+}
+
+static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct btrfs_path *path,
+ struct btrfs_key *extent_key)
+{
+ int ret;
+
+ ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
+ if (ret)
+ goto out;
+ ret = btrfs_del_item(trans, extent_root, path);
+out:
+ btrfs_release_path(extent_root, path);
+ return ret;
+}
+
+static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_ref_path *ref_path)
+{
+ struct btrfs_key root_key;
+
+ root_key.objectid = ref_path->root_objectid;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ if (is_cowonly_root(ref_path->root_objectid))
+ root_key.offset = 0;
+ else
+ root_key.offset = (u64)-1;
+
+ return btrfs_read_fs_root_no_name(fs_info, &root_key);
+}
+
+static noinline int relocate_one_extent(struct btrfs_root *extent_root,
+ struct btrfs_path *path,
+ struct btrfs_key *extent_key,
+ struct btrfs_block_group_cache *group,
+ struct inode *reloc_inode, int pass)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *found_root;
+ struct btrfs_ref_path *ref_path = NULL;
+ struct disk_extent *new_extents = NULL;
+ int nr_extents = 0;
+ int loops;
+ int ret;
+ int level;
+ struct btrfs_key first_key;
+ u64 prev_block = 0;
+
+
+ trans = btrfs_start_transaction(extent_root, 1);
+ BUG_ON(!trans);
+
+ if (extent_key->objectid == 0) {
+ ret = del_extent_zero(trans, extent_root, path, extent_key);
+ goto out;
+ }
+
+ ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
+ if (!ref_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (loops = 0; ; loops++) {
+ if (loops == 0) {
+ ret = btrfs_first_ref_path(trans, extent_root, ref_path,
+ extent_key->objectid);
+ } else {
+ ret = btrfs_next_ref_path(trans, extent_root, ref_path);
+ }
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ break;
+
+ if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
+ ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ continue;
+
+ found_root = read_ref_root(extent_root->fs_info, ref_path);
+ BUG_ON(!found_root);
+ /*
+ * for reference counted tree, only process reference paths
+ * rooted at the latest committed root.
+ */
+ if (found_root->ref_cows &&
+ ref_path->root_generation != found_root->root_key.offset)
+ continue;
+
+ if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+ if (pass == 0) {
+ /*
+ * copy data extents to new locations
+ */
+ u64 group_start = group->key.objectid;
+ ret = relocate_data_extent(reloc_inode,
+ extent_key,
+ group_start);
+ if (ret < 0)
+ goto out;
+ break;
+ }
+ level = 0;
+ } else {
+ level = ref_path->owner_objectid;
+ }
+
+ if (prev_block != ref_path->nodes[level]) {
+ struct extent_buffer *eb;
+ u64 block_start = ref_path->nodes[level];
+ u64 block_size = btrfs_level_size(found_root, level);
+
+ eb = read_tree_block(found_root, block_start,
+ block_size, 0);
+ btrfs_tree_lock(eb);
+ BUG_ON(level != btrfs_header_level(eb));
+
+ if (level == 0)
+ btrfs_item_key_to_cpu(eb, &first_key, 0);
+ else
+ btrfs_node_key_to_cpu(eb, &first_key, 0);
+
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ prev_block = block_start;
+ }
+
+ mutex_lock(&extent_root->fs_info->trans_mutex);
+ btrfs_record_root_in_trans(found_root);
+ mutex_unlock(&extent_root->fs_info->trans_mutex);
+ if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * try to update data extent references while
+ * keeping metadata shared between snapshots.
+ */
+ if (pass == 1) {
+ ret = relocate_one_path(trans, found_root,
+ path, &first_key, ref_path,
+ group, reloc_inode);
+ if (ret < 0)
+ goto out;
+ continue;
+ }
+ /*
+ * use fallback method to process the remaining
+ * references.
+ */
+ if (!new_extents) {
+ u64 group_start = group->key.objectid;
+ new_extents = kmalloc(sizeof(*new_extents),
+ GFP_NOFS);
+ nr_extents = 1;
+ ret = get_new_locations(reloc_inode,
+ extent_key,
+ group_start, 1,
+ &new_extents,
+ &nr_extents);
+ if (ret)
+ goto out;
+ }
+ ret = replace_one_extent(trans, found_root,
+ path, extent_key,
+ &first_key, ref_path,
+ new_extents, nr_extents);
+ } else {
+ ret = relocate_tree_block(trans, found_root, path,
+ &first_key, ref_path);
+ }
+ if (ret < 0)
+ goto out;
+ }
+ ret = 0;
+out:
+ btrfs_end_transaction(trans, extent_root);
+ kfree(new_extents);
+ kfree(ref_path);
+ return ret;
+}
+#endif
+
+static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
+{
+ u64 num_devices;
+ u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+
+ num_devices = root->fs_info->fs_devices->rw_devices;
+ if (num_devices == 1) {
+ stripped |= BTRFS_BLOCK_GROUP_DUP;
+ stripped = flags & ~stripped;
+
+ /* turn raid0 into single device chunks */
+ if (flags & BTRFS_BLOCK_GROUP_RAID0)
+ return stripped;
+
+ /* turn mirroring into duplication */
+ if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ return stripped | BTRFS_BLOCK_GROUP_DUP;
+ return flags;
+ } else {
+ /* they already had raid on here, just return */
+ if (flags & stripped)
+ return flags;
+
+ stripped |= BTRFS_BLOCK_GROUP_DUP;
+ stripped = flags & ~stripped;
+
+ /* switch duplicated blocks with raid1 */
+ if (flags & BTRFS_BLOCK_GROUP_DUP)
+ return stripped | BTRFS_BLOCK_GROUP_RAID1;
+
+ /* turn single device chunks into raid0 */
+ return stripped | BTRFS_BLOCK_GROUP_RAID0;
+ }
+ return flags;
+}
+
+static int __alloc_chunk_for_shrink(struct btrfs_root *root,
+ struct btrfs_block_group_cache *shrink_block_group,
+ int force)
+{
+ struct btrfs_trans_handle *trans;
+ u64 new_alloc_flags;
+ u64 calc;
+
+ spin_lock(&shrink_block_group->lock);
+ if (btrfs_block_group_used(&shrink_block_group->item) +
+ shrink_block_group->reserved > 0) {
+ spin_unlock(&shrink_block_group->lock);
+
+ trans = btrfs_start_transaction(root, 1);
+ spin_lock(&shrink_block_group->lock);
+
+ new_alloc_flags = update_block_group_flags(root,
+ shrink_block_group->flags);
+ if (new_alloc_flags != shrink_block_group->flags) {
+ calc =
+ btrfs_block_group_used(&shrink_block_group->item);
+ } else {
+ calc = shrink_block_group->key.offset;
+ }
+ spin_unlock(&shrink_block_group->lock);
+
+ do_chunk_alloc(trans, root->fs_info->extent_root,
+ calc + 2 * 1024 * 1024, new_alloc_flags, force);
+
+ btrfs_end_transaction(trans, root);
+ } else
+ spin_unlock(&shrink_block_group->lock);
+ return 0;
+}
+
+
+int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
+ struct btrfs_block_group_cache *group)
+
+{
+ __alloc_chunk_for_shrink(root, group, 1);
+ set_block_group_readonly(group);
+ return 0;
+}
+
+/*
+ * checks to see if its even possible to relocate this block group.
+ *
+ * @return - -1 if it's not a good idea to relocate this block group, 0 if its
+ * ok to go ahead and try.
+ */
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_device *device;
+ int full = 0;
+ int ret = 0;
+
+ block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
+
+ /* odd, couldn't find the block group, leave it alone */
+ if (!block_group)
+ return -1;
+
+ /* no bytes used, we're good */
+ if (!btrfs_block_group_used(&block_group->item))
+ goto out;
+
+ space_info = block_group->space_info;
+ spin_lock(&space_info->lock);
+
+ full = space_info->full;
+
+ /*
+ * if this is the last block group we have in this space, we can't
+ * relocate it unless we're able to allocate a new chunk below.
+ *
+ * Otherwise, we need to make sure we have room in the space to handle
+ * all of the extents from this block group. If we can, we're good
+ */
+ if ((space_info->total_bytes != block_group->key.offset) &&
+ (space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ btrfs_block_group_used(&block_group->item) <
+ space_info->total_bytes)) {
+ spin_unlock(&space_info->lock);
+ goto out;
+ }
+ spin_unlock(&space_info->lock);
+
+ /*
+ * ok we don't have enough space, but maybe we have free space on our
+ * devices to allocate new chunks for relocation, so loop through our
+ * alloc devices and guess if we have enough space. However, if we
+ * were marked as full, then we know there aren't enough chunks, and we
+ * can just return.
+ */
+ ret = -1;
+ if (full)
+ goto out;
+
+ mutex_lock(&root->fs_info->chunk_mutex);
+ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+ u64 min_free = btrfs_block_group_used(&block_group->item);
+ u64 dev_offset, max_avail;
+
+ /*
+ * check to make sure we can actually find a chunk with enough
+ * space to fit our block group in.
+ */
+ if (device->total_bytes > device->bytes_used + min_free) {
+ ret = find_free_dev_extent(NULL, device, min_free,
+ &dev_offset, &max_avail);
+ if (!ret)
+ break;
+ ret = -1;
+ }
+ }
+ mutex_unlock(&root->fs_info->chunk_mutex);
+out:
+ btrfs_put_block_group(block_group);
+ return ret;
+}
+
+static int find_first_block_group(struct btrfs_root *root,
+ struct btrfs_path *path, struct btrfs_key *key)
+{
+ int ret = 0;
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+ int slot;
+
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (1) {
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+ break;
+ }
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ if (found_key.objectid >= key->objectid &&
+ found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
+ ret = 0;
+ goto out;
+ }
+ path->slots[0]++;
+ }
+ ret = -ENOENT;
+out:
+ return ret;
+}
+
+int btrfs_free_block_groups(struct btrfs_fs_info *info)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_caching_control *caching_ctl;
+ struct rb_node *n;
+
+ down_write(&info->extent_commit_sem);
+ while (!list_empty(&info->caching_block_groups)) {
+ caching_ctl = list_entry(info->caching_block_groups.next,
+ struct btrfs_caching_control, list);
+ list_del(&caching_ctl->list);
+ put_caching_control(caching_ctl);
+ }
+ up_write(&info->extent_commit_sem);
+
+ spin_lock(&info->block_group_cache_lock);
+ while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
+ block_group = rb_entry(n, struct btrfs_block_group_cache,
+ cache_node);
+ rb_erase(&block_group->cache_node,
+ &info->block_group_cache_tree);
+ spin_unlock(&info->block_group_cache_lock);
+
+ down_write(&block_group->space_info->groups_sem);
+ list_del(&block_group->list);
+ up_write(&block_group->space_info->groups_sem);
+
+ if (block_group->cached == BTRFS_CACHE_STARTED)
+ wait_block_group_cache_done(block_group);
+
+ btrfs_remove_free_space_cache(block_group);
+ btrfs_put_block_group(block_group);
+
+ spin_lock(&info->block_group_cache_lock);
+ }
+ spin_unlock(&info->block_group_cache_lock);
+
+ /* now that all the block groups are freed, go through and
+ * free all the space_info structs. This is only called during
+ * the final stages of unmount, and so we know nobody is
+ * using them. We call synchronize_rcu() once before we start,
+ * just to be on the safe side.
+ */
+ synchronize_rcu();
+
+ while(!list_empty(&info->space_info)) {
+ space_info = list_entry(info->space_info.next,
+ struct btrfs_space_info,
+ list);
+
+ list_del(&space_info->list);
+ kfree(space_info);
+ }
+ return 0;
+}
+
+int btrfs_read_block_groups(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct btrfs_block_group_cache *cache;
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *space_info;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+
+ root = info->extent_root;
+ key.objectid = 0;
+ key.offset = 0;
+ btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ ret = find_first_block_group(root, path, &key);
+ if (ret > 0) {
+ ret = 0;
+ goto error;
+ }
+ if (ret != 0)
+ goto error;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ if (!cache) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ atomic_set(&cache->count, 1);
+ spin_lock_init(&cache->lock);
+ spin_lock_init(&cache->tree_lock);
+ cache->fs_info = info;
+ INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
+
+ /*
+ * we only want to have 32k of ram per block group for keeping
+ * track of free space, and if we pass 1/2 of that we want to
+ * start converting things over to using bitmaps
+ */
+ cache->extents_thresh = ((1024 * 32) / 2) /
+ sizeof(struct btrfs_free_space);
+
+ read_extent_buffer(leaf, &cache->item,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(cache->item));
+ memcpy(&cache->key, &found_key, sizeof(found_key));
+
+ key.objectid = found_key.objectid + found_key.offset;
+ btrfs_release_path(root, path);
+ cache->flags = btrfs_block_group_flags(&cache->item);
+ cache->sectorsize = root->sectorsize;
+
+ /*
+ * check for two cases, either we are full, and therefore
+ * don't need to bother with the caching work since we won't
+ * find any space, or we are empty, and we can just add all
+ * the space in and be done with it. This saves us _alot_ of
+ * time, particularly in the full case.
+ */
+ if (found_key.offset == btrfs_block_group_used(&cache->item)) {
+ exclude_super_stripes(root, cache);
+ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ free_excluded_extents(root, cache);
+ } else if (btrfs_block_group_used(&cache->item) == 0) {
+ exclude_super_stripes(root, cache);
+ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ add_new_free_space(cache, root->fs_info,
+ found_key.objectid,
+ found_key.objectid +
+ found_key.offset);
+ free_excluded_extents(root, cache);
+ }
+
+ ret = update_space_info(info, cache->flags, found_key.offset,
+ btrfs_block_group_used(&cache->item),
+ &space_info);
+ BUG_ON(ret);
+ cache->space_info = space_info;
+ spin_lock(&cache->space_info->lock);
+ cache->space_info->bytes_super += cache->bytes_super;
+ spin_unlock(&cache->space_info->lock);
+
+ down_write(&space_info->groups_sem);
+ list_add_tail(&cache->list, &space_info->block_groups);
+ up_write(&space_info->groups_sem);
+
+ ret = btrfs_add_block_group_cache(root->fs_info, cache);
+ BUG_ON(ret);
+
+ set_avail_alloc_bits(root->fs_info, cache->flags);
+ if (btrfs_chunk_readonly(root, cache->key.objectid))
+ set_block_group_readonly(cache);
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytes_used,
+ u64 type, u64 chunk_objectid, u64 chunk_offset,
+ u64 size)
+{
+ int ret;
+ struct btrfs_root *extent_root;
+ struct btrfs_block_group_cache *cache;
+
+ extent_root = root->fs_info->extent_root;
+
+ root->fs_info->last_trans_log_full_commit = trans->transid;
+
+ cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ if (!cache)
+ return -ENOMEM;
+
+ cache->key.objectid = chunk_offset;
+ cache->key.offset = size;
+ cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ cache->sectorsize = root->sectorsize;
+
+ /*
+ * we only want to have 32k of ram per block group for keeping track
+ * of free space, and if we pass 1/2 of that we want to start
+ * converting things over to using bitmaps
+ */
+ cache->extents_thresh = ((1024 * 32) / 2) /
+ sizeof(struct btrfs_free_space);
+ atomic_set(&cache->count, 1);
+ spin_lock_init(&cache->lock);
+ spin_lock_init(&cache->tree_lock);
+ INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
+
+ btrfs_set_block_group_used(&cache->item, bytes_used);
+ btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
+ cache->flags = type;
+ btrfs_set_block_group_flags(&cache->item, type);
+
+ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ exclude_super_stripes(root, cache);
+
+ add_new_free_space(cache, root->fs_info, chunk_offset,
+ chunk_offset + size);
+
+ free_excluded_extents(root, cache);
+
+ ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
+ &cache->space_info);
+ BUG_ON(ret);
+
+ spin_lock(&cache->space_info->lock);
+ cache->space_info->bytes_super += cache->bytes_super;
+ spin_unlock(&cache->space_info->lock);
+
+ down_write(&cache->space_info->groups_sem);
+ list_add_tail(&cache->list, &cache->space_info->block_groups);
+ up_write(&cache->space_info->groups_sem);
+
+ ret = btrfs_add_block_group_cache(root->fs_info, cache);
+ BUG_ON(ret);
+
+ ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
+ sizeof(cache->item));
+ BUG_ON(ret);
+
+ set_avail_alloc_bits(extent_root->fs_info, type);
+
+ return 0;
+}
+
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 group_start)
+{
+ struct btrfs_path *path;
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_free_cluster *cluster;
+ struct btrfs_key key;
+ int ret;
+
+ root = root->fs_info->extent_root;
+
+ block_group = btrfs_lookup_block_group(root->fs_info, group_start);
+ BUG_ON(!block_group);
+ BUG_ON(!block_group->ro);
+
+ memcpy(&key, &block_group->key, sizeof(key));
+
+ /* make sure this block group isn't part of an allocation cluster */
+ cluster = &root->fs_info->data_alloc_cluster;
+ spin_lock(&cluster->refill_lock);
+ btrfs_return_cluster_to_free_space(block_group, cluster);
+ spin_unlock(&cluster->refill_lock);
+
+ /*
+ * make sure this block group isn't part of a metadata
+ * allocation cluster
+ */
+ cluster = &root->fs_info->meta_alloc_cluster;
+ spin_lock(&cluster->refill_lock);
+ btrfs_return_cluster_to_free_space(block_group, cluster);
+ spin_unlock(&cluster->refill_lock);
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ spin_lock(&root->fs_info->block_group_cache_lock);
+ rb_erase(&block_group->cache_node,
+ &root->fs_info->block_group_cache_tree);
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+
+ down_write(&block_group->space_info->groups_sem);
+ /*
+ * we must use list_del_init so people can check to see if they
+ * are still on the list after taking the semaphore
+ */
+ list_del_init(&block_group->list);
+ up_write(&block_group->space_info->groups_sem);
+
+ if (block_group->cached == BTRFS_CACHE_STARTED)
+ wait_block_group_cache_done(block_group);
+
+ btrfs_remove_free_space_cache(block_group);
+
+ spin_lock(&block_group->space_info->lock);
+ block_group->space_info->total_bytes -= block_group->key.offset;
+ block_group->space_info->bytes_readonly -= block_group->key.offset;
+ spin_unlock(&block_group->space_info->lock);
+
+ btrfs_clear_space_info_full(root->fs_info);
+
+ btrfs_put_block_group(block_group);
+ btrfs_put_block_group(block_group);
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -EIO;
+ if (ret < 0)
+ goto out;
+
+ ret = btrfs_del_item(trans, root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
new file mode 100644
index 00000000000..b177ed31961
--- /dev/null
+++ b/fs/btrfs/extent_io.c
@@ -0,0 +1,3856 @@
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/page-flags.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "extent_io.h"
+#include "extent_map.h"
+#include "compat.h"
+#include "ctree.h"
+#include "btrfs_inode.h"
+
+static struct kmem_cache *extent_state_cache;
+static struct kmem_cache *extent_buffer_cache;
+
+static LIST_HEAD(buffers);
+static LIST_HEAD(states);
+
+#define LEAK_DEBUG 0
+#if LEAK_DEBUG
+static DEFINE_SPINLOCK(leak_lock);
+#endif
+
+#define BUFFER_LRU_MAX 64
+
+struct tree_entry {
+ u64 start;
+ u64 end;
+ struct rb_node rb_node;
+};
+
+struct extent_page_data {
+ struct bio *bio;
+ struct extent_io_tree *tree;
+ get_extent_t *get_extent;
+
+ /* tells writepage not to lock the state bits for this range
+ * it still does the unlocking
+ */
+ unsigned int extent_locked:1;
+
+ /* tells the submit_bio code to use a WRITE_SYNC */
+ unsigned int sync_io:1;
+};
+
+int __init extent_io_init(void)
+{
+ extent_state_cache = kmem_cache_create("extent_state",
+ sizeof(struct extent_state), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!extent_state_cache)
+ return -ENOMEM;
+
+ extent_buffer_cache = kmem_cache_create("extent_buffers",
+ sizeof(struct extent_buffer), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!extent_buffer_cache)
+ goto free_state_cache;
+ return 0;
+
+free_state_cache:
+ kmem_cache_destroy(extent_state_cache);
+ return -ENOMEM;
+}
+
+void extent_io_exit(void)
+{
+ struct extent_state *state;
+ struct extent_buffer *eb;
+
+ while (!list_empty(&states)) {
+ state = list_entry(states.next, struct extent_state, leak_list);
+ printk(KERN_ERR "btrfs state leak: start %llu end %llu "
+ "state %lu in tree %p refs %d\n",
+ (unsigned long long)state->start,
+ (unsigned long long)state->end,
+ state->state, state->tree, atomic_read(&state->refs));
+ list_del(&state->leak_list);
+ kmem_cache_free(extent_state_cache, state);
+
+ }
+
+ while (!list_empty(&buffers)) {
+ eb = list_entry(buffers.next, struct extent_buffer, leak_list);
+ printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
+ "refs %d\n", (unsigned long long)eb->start,
+ eb->len, atomic_read(&eb->refs));
+ list_del(&eb->leak_list);
+ kmem_cache_free(extent_buffer_cache, eb);
+ }
+ if (extent_state_cache)
+ kmem_cache_destroy(extent_state_cache);
+ if (extent_buffer_cache)
+ kmem_cache_destroy(extent_buffer_cache);
+}
+
+void extent_io_tree_init(struct extent_io_tree *tree,
+ struct address_space *mapping, gfp_t mask)
+{
+ tree->state.rb_node = NULL;
+ tree->buffer.rb_node = NULL;
+ tree->ops = NULL;
+ tree->dirty_bytes = 0;
+ spin_lock_init(&tree->lock);
+ spin_lock_init(&tree->buffer_lock);
+ tree->mapping = mapping;
+}
+
+static struct extent_state *alloc_extent_state(gfp_t mask)
+{
+ struct extent_state *state;
+#if LEAK_DEBUG
+ unsigned long flags;
+#endif
+
+ state = kmem_cache_alloc(extent_state_cache, mask);
+ if (!state)
+ return state;
+ state->state = 0;
+ state->private = 0;
+ state->tree = NULL;
+#if LEAK_DEBUG
+ spin_lock_irqsave(&leak_lock, flags);
+ list_add(&state->leak_list, &states);
+ spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+ atomic_set(&state->refs, 1);
+ init_waitqueue_head(&state->wq);
+ return state;
+}
+
+static void free_extent_state(struct extent_state *state)
+{
+ if (!state)
+ return;
+ if (atomic_dec_and_test(&state->refs)) {
+#if LEAK_DEBUG
+ unsigned long flags;
+#endif
+ WARN_ON(state->tree);
+#if LEAK_DEBUG
+ spin_lock_irqsave(&leak_lock, flags);
+ list_del(&state->leak_list);
+ spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+ kmem_cache_free(extent_state_cache, state);
+ }
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct tree_entry *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct tree_entry, rb_node);
+
+ if (offset < entry->start)
+ p = &(*p)->rb_left;
+ else if (offset > entry->end)
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ entry = rb_entry(node, struct tree_entry, rb_node);
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
+ struct rb_node **prev_ret,
+ struct rb_node **next_ret)
+{
+ struct rb_root *root = &tree->state;
+ struct rb_node *n = root->rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *orig_prev = NULL;
+ struct tree_entry *entry;
+ struct tree_entry *prev_entry = NULL;
+
+ while (n) {
+ entry = rb_entry(n, struct tree_entry, rb_node);
+ prev = n;
+ prev_entry = entry;
+
+ if (offset < entry->start)
+ n = n->rb_left;
+ else if (offset > entry->end)
+ n = n->rb_right;
+ else
+ return n;
+ }
+
+ if (prev_ret) {
+ orig_prev = prev;
+ while (prev && offset > prev_entry->end) {
+ prev = rb_next(prev);
+ prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+ }
+ *prev_ret = prev;
+ prev = orig_prev;
+ }
+
+ if (next_ret) {
+ prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+ while (prev && offset < prev_entry->start) {
+ prev = rb_prev(prev);
+ prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+ }
+ *next_ret = prev;
+ }
+ return NULL;
+}
+
+static inline struct rb_node *tree_search(struct extent_io_tree *tree,
+ u64 offset)
+{
+ struct rb_node *prev = NULL;
+ struct rb_node *ret;
+
+ ret = __etree_search(tree, offset, &prev, NULL);
+ if (!ret)
+ return prev;
+ return ret;
+}
+
+static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
+ u64 offset, struct rb_node *node)
+{
+ struct rb_root *root = &tree->buffer;
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct extent_buffer *eb;
+
+ while (*p) {
+ parent = *p;
+ eb = rb_entry(parent, struct extent_buffer, rb_node);
+
+ if (offset < eb->start)
+ p = &(*p)->rb_left;
+ else if (offset > eb->start)
+ p = &(*p)->rb_right;
+ else
+ return eb;
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
+ u64 offset)
+{
+ struct rb_root *root = &tree->buffer;
+ struct rb_node *n = root->rb_node;
+ struct extent_buffer *eb;
+
+ while (n) {
+ eb = rb_entry(n, struct extent_buffer, rb_node);
+ if (offset < eb->start)
+ n = n->rb_left;
+ else if (offset > eb->start)
+ n = n->rb_right;
+ else
+ return eb;
+ }
+ return NULL;
+}
+
+static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
+ struct extent_state *other)
+{
+ if (tree->ops && tree->ops->merge_extent_hook)
+ tree->ops->merge_extent_hook(tree->mapping->host, new,
+ other);
+}
+
+/*
+ * utility function to look for merge candidates inside a given range.
+ * Any extents with matching state are merged together into a single
+ * extent in the tree. Extents with EXTENT_IO in their state field
+ * are not merged because the end_io handlers need to be able to do
+ * operations on them without sleeping (or doing allocations/splits).
+ *
+ * This should be called with the tree lock held.
+ */
+static int merge_state(struct extent_io_tree *tree,
+ struct extent_state *state)
+{
+ struct extent_state *other;
+ struct rb_node *other_node;
+
+ if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+ return 0;
+
+ other_node = rb_prev(&state->rb_node);
+ if (other_node) {
+ other = rb_entry(other_node, struct extent_state, rb_node);
+ if (other->end == state->start - 1 &&
+ other->state == state->state) {
+ merge_cb(tree, state, other);
+ state->start = other->start;
+ other->tree = NULL;
+ rb_erase(&other->rb_node, &tree->state);
+ free_extent_state(other);
+ }
+ }
+ other_node = rb_next(&state->rb_node);
+ if (other_node) {
+ other = rb_entry(other_node, struct extent_state, rb_node);
+ if (other->start == state->end + 1 &&
+ other->state == state->state) {
+ merge_cb(tree, state, other);
+ other->start = state->start;
+ state->tree = NULL;
+ rb_erase(&state->rb_node, &tree->state);
+ free_extent_state(state);
+ state = NULL;
+ }
+ }
+
+ return 0;
+}
+
+static int set_state_cb(struct extent_io_tree *tree,
+ struct extent_state *state,
+ unsigned long bits)
+{
+ if (tree->ops && tree->ops->set_bit_hook) {
+ return tree->ops->set_bit_hook(tree->mapping->host,
+ state->start, state->end,
+ state->state, bits);
+ }
+
+ return 0;
+}
+
+static void clear_state_cb(struct extent_io_tree *tree,
+ struct extent_state *state,
+ unsigned long bits)
+{
+ if (tree->ops && tree->ops->clear_bit_hook)
+ tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
+}
+
+/*
+ * insert an extent_state struct into the tree. 'bits' are set on the
+ * struct before it is inserted.
+ *
+ * This may return -EEXIST if the extent is already there, in which case the
+ * state struct is freed.
+ *
+ * The tree lock is not taken internally. This is a utility function and
+ * probably isn't what you want to call (see set/clear_extent_bit).
+ */
+static int insert_state(struct extent_io_tree *tree,
+ struct extent_state *state, u64 start, u64 end,
+ int bits)
+{
+ struct rb_node *node;
+ int ret;
+
+ if (end < start) {
+ printk(KERN_ERR "btrfs end < start %llu %llu\n",
+ (unsigned long long)end,
+ (unsigned long long)start);
+ WARN_ON(1);
+ }
+ state->start = start;
+ state->end = end;
+ ret = set_state_cb(tree, state, bits);
+ if (ret)
+ return ret;
+
+ if (bits & EXTENT_DIRTY)
+ tree->dirty_bytes += end - start + 1;
+ state->state |= bits;
+ node = tree_insert(&tree->state, end, &state->rb_node);
+ if (node) {
+ struct extent_state *found;
+ found = rb_entry(node, struct extent_state, rb_node);
+ printk(KERN_ERR "btrfs found node %llu %llu on insert of "
+ "%llu %llu\n", (unsigned long long)found->start,
+ (unsigned long long)found->end,
+ (unsigned long long)start, (unsigned long long)end);
+ free_extent_state(state);
+ return -EEXIST;
+ }
+ state->tree = tree;
+ merge_state(tree, state);
+ return 0;
+}
+
+static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
+ u64 split)
+{
+ if (tree->ops && tree->ops->split_extent_hook)
+ return tree->ops->split_extent_hook(tree->mapping->host,
+ orig, split);
+ return 0;
+}
+
+/*
+ * split a given extent state struct in two, inserting the preallocated
+ * struct 'prealloc' as the newly created second half. 'split' indicates an
+ * offset inside 'orig' where it should be split.
+ *
+ * Before calling,
+ * the tree has 'orig' at [orig->start, orig->end]. After calling, there
+ * are two extent state structs in the tree:
+ * prealloc: [orig->start, split - 1]
+ * orig: [ split, orig->end ]
+ *
+ * The tree locks are not taken by this function. They need to be held
+ * by the caller.
+ */
+static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
+ struct extent_state *prealloc, u64 split)
+{
+ struct rb_node *node;
+
+ split_cb(tree, orig, split);
+
+ prealloc->start = orig->start;
+ prealloc->end = split - 1;
+ prealloc->state = orig->state;
+ orig->start = split;
+
+ node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
+ if (node) {
+ free_extent_state(prealloc);
+ return -EEXIST;
+ }
+ prealloc->tree = tree;
+ return 0;
+}
+
+/*
+ * utility function to clear some bits in an extent state struct.
+ * it will optionally wake up any one waiting on this state (wake == 1), or
+ * forcibly remove the state from the tree (delete == 1).
+ *
+ * If no bits are set on the state struct after clearing things, the
+ * struct is freed and removed from the tree
+ */
+static int clear_state_bit(struct extent_io_tree *tree,
+ struct extent_state *state, int bits, int wake,
+ int delete)
+{
+ int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
+ int ret = state->state & bits_to_clear;
+
+ if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
+ u64 range = state->end - state->start + 1;
+ WARN_ON(range > tree->dirty_bytes);
+ tree->dirty_bytes -= range;
+ }
+ clear_state_cb(tree, state, bits);
+ state->state &= ~bits_to_clear;
+ if (wake)
+ wake_up(&state->wq);
+ if (delete || state->state == 0) {
+ if (state->tree) {
+ clear_state_cb(tree, state, state->state);
+ rb_erase(&state->rb_node, &tree->state);
+ state->tree = NULL;
+ free_extent_state(state);
+ } else {
+ WARN_ON(1);
+ }
+ } else {
+ merge_state(tree, state);
+ }
+ return ret;
+}
+
+/*
+ * clear some bits on a range in the tree. This may require splitting
+ * or inserting elements in the tree, so the gfp mask is used to
+ * indicate which allocations or sleeping are allowed.
+ *
+ * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
+ * the given range from the tree regardless of state (ie for truncate).
+ *
+ * the range [start, end] is inclusive.
+ *
+ * This takes the tree lock, and returns < 0 on error, > 0 if any of the
+ * bits were already set, or zero if none of the bits were already set.
+ */
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int wake, int delete,
+ struct extent_state **cached_state,
+ gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *cached;
+ struct extent_state *prealloc = NULL;
+ struct rb_node *next_node;
+ struct rb_node *node;
+ u64 last_end;
+ int err;
+ int set = 0;
+
+again:
+ if (!prealloc && (mask & __GFP_WAIT)) {
+ prealloc = alloc_extent_state(mask);
+ if (!prealloc)
+ return -ENOMEM;
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state) {
+ cached = *cached_state;
+ *cached_state = NULL;
+ cached_state = NULL;
+ if (cached && cached->tree && cached->start == start) {
+ atomic_dec(&cached->refs);
+ state = cached;
+ goto hit_next;
+ }
+ free_extent_state(cached);
+ }
+ /*
+ * this search will find the extents that end after
+ * our range starts
+ */
+ node = tree_search(tree, start);
+ if (!node)
+ goto out;
+ state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+ if (state->start > end)
+ goto out;
+ WARN_ON(state->end < start);
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state | or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip
+ * bits on second half.
+ *
+ * If the extent we found extends past our range, we
+ * just split and search again. It'll get split again
+ * the next time though.
+ *
+ * If the extent we found is inside our range, we clear
+ * the desired bit on it.
+ */
+
+ if (state->start < start) {
+ if (!prealloc)
+ prealloc = alloc_extent_state(GFP_ATOMIC);
+ err = split_state(tree, state, prealloc, start);
+ BUG_ON(err == -EEXIST);
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set |= clear_state_bit(tree, state, bits, wake,
+ delete);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and clear the bit
+ * on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ if (!prealloc)
+ prealloc = alloc_extent_state(GFP_ATOMIC);
+ err = split_state(tree, state, prealloc, end + 1);
+ BUG_ON(err == -EEXIST);
+ if (wake)
+ wake_up(&state->wq);
+
+ set |= clear_state_bit(tree, prealloc, bits, wake, delete);
+
+ prealloc = NULL;
+ goto out;
+ }
+
+ if (state->end < end && prealloc && !need_resched())
+ next_node = rb_next(&state->rb_node);
+ else
+ next_node = NULL;
+
+ set |= clear_state_bit(tree, state, bits, wake, delete);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start <= end && next_node) {
+ state = rb_entry(next_node, struct extent_state,
+ rb_node);
+ if (state->start == start)
+ goto hit_next;
+ }
+ goto search_again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return set;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (mask & __GFP_WAIT)
+ cond_resched();
+ goto again;
+}
+
+static int wait_on_state(struct extent_io_tree *tree,
+ struct extent_state *state)
+ __releases(tree->lock)
+ __acquires(tree->lock)
+{
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&tree->lock);
+ schedule();
+ spin_lock(&tree->lock);
+ finish_wait(&state->wq, &wait);
+ return 0;
+}
+
+/*
+ * waits for one or more bits to clear on a range in the state tree.
+ * The range [start, end] is inclusive.
+ * The tree lock is taken by this function
+ */
+int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
+{
+ struct extent_state *state;
+ struct rb_node *node;
+
+ spin_lock(&tree->lock);
+again:
+ while (1) {
+ /*
+ * this search will find all the extents that end after
+ * our range starts
+ */
+ node = tree_search(tree, start);
+ if (!node)
+ break;
+
+ state = rb_entry(node, struct extent_state, rb_node);
+
+ if (state->start > end)
+ goto out;
+
+ if (state->state & bits) {
+ start = state->start;
+ atomic_inc(&state->refs);
+ wait_on_state(tree, state);
+ free_extent_state(state);
+ goto again;
+ }
+ start = state->end + 1;
+
+ if (start > end)
+ break;
+
+ if (need_resched()) {
+ spin_unlock(&tree->lock);
+ cond_resched();
+ spin_lock(&tree->lock);
+ }
+ }
+out:
+ spin_unlock(&tree->lock);
+ return 0;
+}
+
+static int set_state_bits(struct extent_io_tree *tree,
+ struct extent_state *state,
+ int bits)
+{
+ int ret;
+
+ ret = set_state_cb(tree, state, bits);
+ if (ret)
+ return ret;
+
+ if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
+ u64 range = state->end - state->start + 1;
+ tree->dirty_bytes += range;
+ }
+ state->state |= bits;
+
+ return 0;
+}
+
+static void cache_state(struct extent_state *state,
+ struct extent_state **cached_ptr)
+{
+ if (cached_ptr && !(*cached_ptr)) {
+ if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
+ *cached_ptr = state;
+ atomic_inc(&state->refs);
+ }
+ }
+}
+
+/*
+ * set some bits on a range in the tree. This may require allocations or
+ * sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If any of the exclusive bits are set, this will fail with -EEXIST if some
+ * part of the range already has the desired bits set. The start of the
+ * existing range is returned in failed_start in this case.
+ *
+ * [start, end] is inclusive This takes the tree lock.
+ */
+
+static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state,
+ gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node *node;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+
+again:
+ if (!prealloc && (mask & __GFP_WAIT)) {
+ prealloc = alloc_extent_state(mask);
+ if (!prealloc)
+ return -ENOMEM;
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start == start && state->tree) {
+ node = &state->rb_node;
+ goto hit_next;
+ }
+ }
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ if (!node) {
+ err = insert_state(tree, prealloc, start, end, bits);
+ prealloc = NULL;
+ BUG_ON(err == -EEXIST);
+ goto out;
+ }
+ state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going
+ */
+ if (state->start == start && state->end <= end) {
+ struct rb_node *next_node;
+ if (state->state & exclusive_bits) {
+ *failed_start = state->start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ err = set_state_bits(tree, state, bits);
+ if (err)
+ goto out;
+
+ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+
+ start = last_end + 1;
+ if (start < end && prealloc && !need_resched()) {
+ next_node = rb_next(node);
+ if (next_node) {
+ state = rb_entry(next_node, struct extent_state,
+ rb_node);
+ if (state->start == start)
+ goto hit_next;
+ }
+ }
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on
+ * second half.
+ *
+ * If the extent we found extends past our
+ * range, we just split and search again. It'll get split
+ * again the next time though.
+ *
+ * If the extent we found is inside our range, we set the
+ * desired bit on it.
+ */
+ if (state->start < start) {
+ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+ }
+ err = split_state(tree, state, prealloc, start);
+ BUG_ON(err == -EEXIST);
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ err = set_state_bits(tree, state, bits);
+ if (err)
+ goto out;
+ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and
+ * ignore the extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+ err = insert_state(tree, prealloc, start, this_end,
+ bits);
+ BUG_ON(err == -EEXIST);
+ if (err) {
+ prealloc = NULL;
+ goto out;
+ }
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and set the bit
+ * on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+ }
+ err = split_state(tree, state, prealloc, end + 1);
+ BUG_ON(err == -EEXIST);
+
+ err = set_state_bits(tree, prealloc, bits);
+ if (err) {
+ prealloc = NULL;
+ goto out;
+ }
+ cache_state(prealloc, cached_state);
+ merge_state(tree, prealloc);
+ prealloc = NULL;
+ goto out;
+ }
+
+ goto search_again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (mask & __GFP_WAIT)
+ cond_resched();
+ goto again;
+}
+
+/* wrappers around set/clear extent bit */
+int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
+ NULL, mask);
+}
+
+int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, bits, 0, NULL,
+ NULL, mask);
+}
+
+int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
+}
+
+int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
+ 0, NULL, NULL, mask);
+}
+
+int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 0, 0,
+ NULL, mask);
+}
+
+int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
+ NULL, mask);
+}
+
+static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
+ NULL, mask);
+}
+
+int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
+ NULL, mask);
+}
+
+static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+ NULL, mask);
+}
+
+int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
+}
+
+/*
+ * either insert or lock state struct between start and end use mask to tell
+ * us if waiting is desired.
+ */
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, struct extent_state **cached_state, gfp_t mask)
+{
+ int err;
+ u64 failed_start;
+ while (1) {
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+ EXTENT_LOCKED, &failed_start,
+ cached_state, mask);
+ if (err == -EEXIST && (mask & __GFP_WAIT)) {
+ wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+ start = failed_start;
+ } else {
+ break;
+ }
+ WARN_ON(start > end);
+ }
+ return err;
+}
+
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+{
+ return lock_extent_bits(tree, start, end, 0, NULL, mask);
+}
+
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ int err;
+ u64 failed_start;
+
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+ &failed_start, NULL, mask);
+ if (err == -EEXIST) {
+ if (failed_start > start)
+ clear_extent_bit(tree, start, failed_start - 1,
+ EXTENT_LOCKED, 1, 0, NULL, mask);
+ return 0;
+ }
+ return 1;
+}
+
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ mask);
+}
+
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
+ mask);
+}
+
+/*
+ * helper function to set pages and extents in the tree dirty
+ */
+int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ struct page *page;
+
+ while (index <= end_index) {
+ page = find_get_page(tree->mapping, index);
+ BUG_ON(!page);
+ __set_page_dirty_nobuffers(page);
+ page_cache_release(page);
+ index++;
+ }
+ return 0;
+}
+
+/*
+ * helper function to set both pages and extents in the tree writeback
+ */
+static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ struct page *page;
+
+ while (index <= end_index) {
+ page = find_get_page(tree->mapping, index);
+ BUG_ON(!page);
+ set_page_writeback(page);
+ page_cache_release(page);
+ index++;
+ }
+ return 0;
+}
+
+/*
+ * find the first offset in the io tree with 'bits' set. zero is
+ * returned if we find something, and *start_ret and *end_ret are
+ * set to reflect the state struct that was found.
+ *
+ * If nothing was found, 1 is returned, < 0 on error
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, int bits)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ if (!node)
+ goto out;
+
+ while (1) {
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (state->end >= start && (state->state & bits)) {
+ *start_ret = state->start;
+ *end_ret = state->end;
+ ret = 0;
+ break;
+ }
+ node = rb_next(node);
+ if (!node)
+ break;
+ }
+out:
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/* find the first state struct with 'bits' set after 'start', and
+ * return it. tree->lock must be held. NULL will returned if
+ * nothing was found after 'start'
+ */
+struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
+ u64 start, int bits)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ if (!node)
+ goto out;
+
+ while (1) {
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (state->end >= start && (state->state & bits))
+ return state;
+
+ node = rb_next(node);
+ if (!node)
+ break;
+ }
+out:
+ return NULL;
+}
+
+/*
+ * find a contiguous range of bytes in the file marked as delalloc, not
+ * more than 'max_bytes'. start and end are used to return the range,
+ *
+ * 1 is returned if we find something, 0 if nothing was in the tree
+ */
+static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+ u64 *start, u64 *end, u64 max_bytes)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+ u64 cur_start = *start;
+ u64 found = 0;
+ u64 total_bytes = 0;
+
+ spin_lock(&tree->lock);
+
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, cur_start);
+ if (!node) {
+ if (!found)
+ *end = (u64)-1;
+ goto out;
+ }
+
+ while (1) {
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (found && (state->start != cur_start ||
+ (state->state & EXTENT_BOUNDARY))) {
+ goto out;
+ }
+ if (!(state->state & EXTENT_DELALLOC)) {
+ if (!found)
+ *end = state->end;
+ goto out;
+ }
+ if (!found)
+ *start = state->start;
+ found++;
+ *end = state->end;
+ cur_start = state->end + 1;
+ node = rb_next(node);
+ if (!node)
+ break;
+ total_bytes += state->end - state->start + 1;
+ if (total_bytes >= max_bytes)
+ break;
+ }
+out:
+ spin_unlock(&tree->lock);
+ return found;
+}
+
+static noinline int __unlock_for_delalloc(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end)
+{
+ int ret;
+ struct page *pages[16];
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long nr_pages = end_index - index + 1;
+ int i;
+
+ if (index == locked_page->index && end_index == index)
+ return 0;
+
+ while (nr_pages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min_t(unsigned long, nr_pages,
+ ARRAY_SIZE(pages)), pages);
+ for (i = 0; i < ret; i++) {
+ if (pages[i] != locked_page)
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ nr_pages -= ret;
+ index += ret;
+ cond_resched();
+ }
+ return 0;
+}
+
+static noinline int lock_delalloc_pages(struct inode *inode,
+ struct page *locked_page,
+ u64 delalloc_start,
+ u64 delalloc_end)
+{
+ unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
+ unsigned long start_index = index;
+ unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
+ unsigned long pages_locked = 0;
+ struct page *pages[16];
+ unsigned long nrpages;
+ int ret;
+ int i;
+
+ /* the caller is responsible for locking the start index */
+ if (index == locked_page->index && index == end_index)
+ return 0;
+
+ /* skip the page at the start index */
+ nrpages = end_index - index + 1;
+ while (nrpages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min_t(unsigned long,
+ nrpages, ARRAY_SIZE(pages)), pages);
+ if (ret == 0) {
+ ret = -EAGAIN;
+ goto done;
+ }
+ /* now we have an array of pages, lock them all */
+ for (i = 0; i < ret; i++) {
+ /*
+ * the caller is taking responsibility for
+ * locked_page
+ */
+ if (pages[i] != locked_page) {
+ lock_page(pages[i]);
+ if (!PageDirty(pages[i]) ||
+ pages[i]->mapping != inode->i_mapping) {
+ ret = -EAGAIN;
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ goto done;
+ }
+ }
+ page_cache_release(pages[i]);
+ pages_locked++;
+ }
+ nrpages -= ret;
+ index += ret;
+ cond_resched();
+ }
+ ret = 0;
+done:
+ if (ret && pages_locked) {
+ __unlock_for_delalloc(inode, locked_page,
+ delalloc_start,
+ ((u64)(start_index + pages_locked - 1)) <<
+ PAGE_CACHE_SHIFT);
+ }
+ return ret;
+}
+
+/*
+ * find a contiguous range of bytes in the file marked as delalloc, not
+ * more than 'max_bytes'. start and end are used to return the range,
+ *
+ * 1 is returned if we find something, 0 if nothing was in the tree
+ */
+static noinline u64 find_lock_delalloc_range(struct inode *inode,
+ struct extent_io_tree *tree,
+ struct page *locked_page,
+ u64 *start, u64 *end,
+ u64 max_bytes)
+{
+ u64 delalloc_start;
+ u64 delalloc_end;
+ u64 found;
+ struct extent_state *cached_state = NULL;
+ int ret;
+ int loops = 0;
+
+again:
+ /* step one, find a bunch of delalloc bytes starting at start */
+ delalloc_start = *start;
+ delalloc_end = 0;
+ found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+ max_bytes);
+ if (!found || delalloc_end <= *start) {
+ *start = delalloc_start;
+ *end = delalloc_end;
+ return found;
+ }
+
+ /*
+ * start comes from the offset of locked_page. We have to lock
+ * pages in order, so we can't process delalloc bytes before
+ * locked_page
+ */
+ if (delalloc_start < *start)
+ delalloc_start = *start;
+
+ /*
+ * make sure to limit the number of pages we try to lock down
+ * if we're looping.
+ */
+ if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
+ delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+
+ /* step two, lock all the pages after the page that has start */
+ ret = lock_delalloc_pages(inode, locked_page,
+ delalloc_start, delalloc_end);
+ if (ret == -EAGAIN) {
+ /* some of the pages are gone, lets avoid looping by
+ * shortening the size of the delalloc range we're searching
+ */
+ free_extent_state(cached_state);
+ if (!loops) {
+ unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
+ max_bytes = PAGE_CACHE_SIZE - offset;
+ loops = 1;
+ goto again;
+ } else {
+ found = 0;
+ goto out_failed;
+ }
+ }
+ BUG_ON(ret);
+
+ /* step three, lock the state bits for the whole range */
+ lock_extent_bits(tree, delalloc_start, delalloc_end,
+ 0, &cached_state, GFP_NOFS);
+
+ /* then test to make sure it is all still delalloc */
+ ret = test_range_bit(tree, delalloc_start, delalloc_end,
+ EXTENT_DELALLOC, 1, cached_state);
+ if (!ret) {
+ unlock_extent_cached(tree, delalloc_start, delalloc_end,
+ &cached_state, GFP_NOFS);
+ __unlock_for_delalloc(inode, locked_page,
+ delalloc_start, delalloc_end);
+ cond_resched();
+ goto again;
+ }
+ free_extent_state(cached_state);
+ *start = delalloc_start;
+ *end = delalloc_end;
+out_failed:
+ return found;
+}
+
+int extent_clear_unlock_delalloc(struct inode *inode,
+ struct extent_io_tree *tree,
+ u64 start, u64 end, struct page *locked_page,
+ unsigned long op)
+{
+ int ret;
+ struct page *pages[16];
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long nr_pages = end_index - index + 1;
+ int i;
+ int clear_bits = 0;
+
+ if (op & EXTENT_CLEAR_UNLOCK)
+ clear_bits |= EXTENT_LOCKED;
+ if (op & EXTENT_CLEAR_DIRTY)
+ clear_bits |= EXTENT_DIRTY;
+
+ if (op & EXTENT_CLEAR_DELALLOC)
+ clear_bits |= EXTENT_DELALLOC;
+
+ if (op & EXTENT_CLEAR_ACCOUNTING)
+ clear_bits |= EXTENT_DO_ACCOUNTING;
+
+ clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
+ if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
+ EXTENT_SET_PRIVATE2)))
+ return 0;
+
+ while (nr_pages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min_t(unsigned long,
+ nr_pages, ARRAY_SIZE(pages)), pages);
+ for (i = 0; i < ret; i++) {
+
+ if (op & EXTENT_SET_PRIVATE2)
+ SetPagePrivate2(pages[i]);
+
+ if (pages[i] == locked_page) {
+ page_cache_release(pages[i]);
+ continue;
+ }
+ if (op & EXTENT_CLEAR_DIRTY)
+ clear_page_dirty_for_io(pages[i]);
+ if (op & EXTENT_SET_WRITEBACK)
+ set_page_writeback(pages[i]);
+ if (op & EXTENT_END_WRITEBACK)
+ end_page_writeback(pages[i]);
+ if (op & EXTENT_CLEAR_UNLOCK_PAGE)
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ nr_pages -= ret;
+ index += ret;
+ cond_resched();
+ }
+ return 0;
+}
+
+/*
+ * count the number of bytes in the tree that have a given bit(s)
+ * set. This can be fairly slow, except for EXTENT_DIRTY which is
+ * cached. The total number found is returned.
+ */
+u64 count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end, u64 max_bytes,
+ unsigned long bits)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+ u64 cur_start = *start;
+ u64 total_bytes = 0;
+ int found = 0;
+
+ if (search_end <= cur_start) {
+ WARN_ON(1);
+ return 0;
+ }
+
+ spin_lock(&tree->lock);
+ if (cur_start == 0 && bits == EXTENT_DIRTY) {
+ total_bytes = tree->dirty_bytes;
+ goto out;
+ }
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, cur_start);
+ if (!node)
+ goto out;
+
+ while (1) {
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (state->start > search_end)
+ break;
+ if (state->end >= cur_start && (state->state & bits)) {
+ total_bytes += min(search_end, state->end) + 1 -
+ max(cur_start, state->start);
+ if (total_bytes >= max_bytes)
+ break;
+ if (!found) {
+ *start = state->start;
+ found = 1;
+ }
+ }
+ node = rb_next(node);
+ if (!node)
+ break;
+ }
+out:
+ spin_unlock(&tree->lock);
+ return total_bytes;
+}
+
+/*
+ * set the private field for a given byte offset in the tree. If there isn't
+ * an extent_state there already, this does nothing.
+ */
+int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+ int ret = 0;
+
+ spin_lock(&tree->lock);
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ if (!node) {
+ ret = -ENOENT;
+ goto out;
+ }
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (state->start != start) {
+ ret = -ENOENT;
+ goto out;
+ }
+ state->private = private;
+out:
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+ int ret = 0;
+
+ spin_lock(&tree->lock);
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ if (!node) {
+ ret = -ENOENT;
+ goto out;
+ }
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (state->start != start) {
+ ret = -ENOENT;
+ goto out;
+ }
+ *private = state->private;
+out:
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/*
+ * searches a range in the state tree for a given mask.
+ * If 'filled' == 1, this returns 1 only if every extent in the tree
+ * has the bits set. Otherwise, 1 is returned if any bit in the
+ * range is found set.
+ */
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int filled, struct extent_state *cached)
+{
+ struct extent_state *state = NULL;
+ struct rb_node *node;
+ int bitset = 0;
+
+ spin_lock(&tree->lock);
+ if (cached && cached->tree && cached->start == start)
+ node = &cached->rb_node;
+ else
+ node = tree_search(tree, start);
+ while (node && start <= end) {
+ state = rb_entry(node, struct extent_state, rb_node);
+
+ if (filled && state->start > start) {
+ bitset = 0;
+ break;
+ }
+
+ if (state->start > end)
+ break;
+
+ if (state->state & bits) {
+ bitset = 1;
+ if (!filled)
+ break;
+ } else if (filled) {
+ bitset = 0;
+ break;
+ }
+
+ if (state->end == (u64)-1)
+ break;
+
+ start = state->end + 1;
+ if (start > end)
+ break;
+ node = rb_next(node);
+ if (!node) {
+ if (filled)
+ bitset = 0;
+ break;
+ }
+ }
+ spin_unlock(&tree->lock);
+ return bitset;
+}
+
+/*
+ * helper function to set a given page up to date if all the
+ * extents in the tree for that page are up to date
+ */
+static int check_page_uptodate(struct extent_io_tree *tree,
+ struct page *page)
+{
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+ if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
+ SetPageUptodate(page);
+ return 0;
+}
+
+/*
+ * helper function to unlock a page if all the extents in the tree
+ * for that page are unlocked
+ */
+static int check_page_locked(struct extent_io_tree *tree,
+ struct page *page)
+{
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+ if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
+ unlock_page(page);
+ return 0;
+}
+
+/*
+ * helper function to end page writeback if all the extents
+ * in the tree for that page are done with writeback
+ */
+static int check_page_writeback(struct extent_io_tree *tree,
+ struct page *page)
+{
+ end_page_writeback(page);
+ return 0;
+}
+
+/* lots and lots of room for performance fixes in the end_bio funcs */
+
+/*
+ * after a writepage IO is done, we need to:
+ * clear the uptodate bits on error
+ * clear the writeback bits in the extent tree for this IO
+ * end_page_writeback if the page has no more pending IO
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ */
+static void end_bio_extent_writepage(struct bio *bio, int err)
+{
+ int uptodate = err == 0;
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct extent_io_tree *tree;
+ u64 start;
+ u64 end;
+ int whole_page;
+ int ret;
+
+ do {
+ struct page *page = bvec->bv_page;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+ start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+ bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
+
+ if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+ whole_page = 1;
+ else
+ whole_page = 0;
+
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+ if (tree->ops && tree->ops->writepage_end_io_hook) {
+ ret = tree->ops->writepage_end_io_hook(page, start,
+ end, NULL, uptodate);
+ if (ret)
+ uptodate = 0;
+ }
+
+ if (!uptodate && tree->ops &&
+ tree->ops->writepage_io_failed_hook) {
+ ret = tree->ops->writepage_io_failed_hook(bio, page,
+ start, end, NULL);
+ if (ret == 0) {
+ uptodate = (err == 0);
+ continue;
+ }
+ }
+
+ if (!uptodate) {
+ clear_extent_uptodate(tree, start, end, GFP_NOFS);
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+
+ if (whole_page)
+ end_page_writeback(page);
+ else
+ check_page_writeback(tree, page);
+ } while (bvec >= bio->bi_io_vec);
+
+ bio_put(bio);
+}
+
+/*
+ * after a readpage IO is done, we need to:
+ * clear the uptodate bits on error
+ * set the uptodate bits if things worked
+ * set the page up to date if all extents in the tree are uptodate
+ * clear the lock bit in the extent tree
+ * unlock the page if there are no other extents locked for it
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ */
+static void end_bio_extent_readpage(struct bio *bio, int err)
+{
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct extent_io_tree *tree;
+ u64 start;
+ u64 end;
+ int whole_page;
+ int ret;
+
+ if (err)
+ uptodate = 0;
+
+ do {
+ struct page *page = bvec->bv_page;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+ start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+ bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
+
+ if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+ whole_page = 1;
+ else
+ whole_page = 0;
+
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+
+ if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
+ ret = tree->ops->readpage_end_io_hook(page, start, end,
+ NULL);
+ if (ret)
+ uptodate = 0;
+ }
+ if (!uptodate && tree->ops &&
+ tree->ops->readpage_io_failed_hook) {
+ ret = tree->ops->readpage_io_failed_hook(bio, page,
+ start, end, NULL);
+ if (ret == 0) {
+ uptodate =
+ test_bit(BIO_UPTODATE, &bio->bi_flags);
+ if (err)
+ uptodate = 0;
+ continue;
+ }
+ }
+
+ if (uptodate) {
+ set_extent_uptodate(tree, start, end,
+ GFP_ATOMIC);
+ }
+ unlock_extent(tree, start, end, GFP_ATOMIC);
+
+ if (whole_page) {
+ if (uptodate) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ } else {
+ if (uptodate) {
+ check_page_uptodate(tree, page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ check_page_locked(tree, page);
+ }
+ } while (bvec >= bio->bi_io_vec);
+
+ bio_put(bio);
+}
+
+/*
+ * IO done from prepare_write is pretty simple, we just unlock
+ * the structs in the extent tree when done, and set the uptodate bits
+ * as appropriate.
+ */
+static void end_bio_extent_preparewrite(struct bio *bio, int err)
+{
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct extent_io_tree *tree;
+ u64 start;
+ u64 end;
+
+ do {
+ struct page *page = bvec->bv_page;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+ start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+ bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
+
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+
+ if (uptodate) {
+ set_extent_uptodate(tree, start, end, GFP_ATOMIC);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+
+ unlock_extent(tree, start, end, GFP_ATOMIC);
+
+ } while (bvec >= bio->bi_io_vec);
+
+ bio_put(bio);
+}
+
+static struct bio *
+extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+ gfp_t gfp_flags)
+{
+ struct bio *bio;
+
+ bio = bio_alloc(gfp_flags, nr_vecs);
+
+ if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+ while (!bio && (nr_vecs /= 2))
+ bio = bio_alloc(gfp_flags, nr_vecs);
+ }
+
+ if (bio) {
+ bio->bi_size = 0;
+ bio->bi_bdev = bdev;
+ bio->bi_sector = first_sector;
+ }
+ return bio;
+}
+
+static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
+{
+ int ret = 0;
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct page *page = bvec->bv_page;
+ struct extent_io_tree *tree = bio->bi_private;
+ u64 start;
+ u64 end;
+
+ start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
+
+ bio->bi_private = NULL;
+
+ bio_get(bio);
+
+ if (tree->ops && tree->ops->submit_bio_hook)
+ tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
+ mirror_num, bio_flags);
+ else
+ submit_bio(rw, bio);
+ if (bio_flagged(bio, BIO_EOPNOTSUPP))
+ ret = -EOPNOTSUPP;
+ bio_put(bio);
+ return ret;
+}
+
+static int submit_extent_page(int rw, struct extent_io_tree *tree,
+ struct page *page, sector_t sector,
+ size_t size, unsigned long offset,
+ struct block_device *bdev,
+ struct bio **bio_ret,
+ unsigned long max_pages,
+ bio_end_io_t end_io_func,
+ int mirror_num,
+ unsigned long prev_bio_flags,
+ unsigned long bio_flags)
+{
+ int ret = 0;
+ struct bio *bio;
+ int nr;
+ int contig = 0;
+ int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
+ int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
+ size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
+
+ if (bio_ret && *bio_ret) {
+ bio = *bio_ret;
+ if (old_compressed)
+ contig = bio->bi_sector == sector;
+ else
+ contig = bio->bi_sector + (bio->bi_size >> 9) ==
+ sector;
+
+ if (prev_bio_flags != bio_flags || !contig ||
+ (tree->ops && tree->ops->merge_bio_hook &&
+ tree->ops->merge_bio_hook(page, offset, page_size, bio,
+ bio_flags)) ||
+ bio_add_page(bio, page, page_size, offset) < page_size) {
+ ret = submit_one_bio(rw, bio, mirror_num,
+ prev_bio_flags);
+ bio = NULL;
+ } else {
+ return 0;
+ }
+ }
+ if (this_compressed)
+ nr = BIO_MAX_PAGES;
+ else
+ nr = bio_get_nr_vecs(bdev);
+
+ bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+
+ bio_add_page(bio, page, page_size, offset);
+ bio->bi_end_io = end_io_func;
+ bio->bi_private = tree;
+
+ if (bio_ret)
+ *bio_ret = bio;
+ else
+ ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
+
+ return ret;
+}
+
+void set_page_extent_mapped(struct page *page)
+{
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ set_page_private(page, EXTENT_PAGE_PRIVATE);
+ }
+}
+
+static void set_page_extent_head(struct page *page, unsigned long len)
+{
+ set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
+}
+
+/*
+ * basic readpage implementation. Locked extent state structs are inserted
+ * into the tree that are removed when the IO is done (by the end_io
+ * handlers)
+ */
+static int __extent_read_full_page(struct extent_io_tree *tree,
+ struct page *page,
+ get_extent_t *get_extent,
+ struct bio **bio, int mirror_num,
+ unsigned long *bio_flags)
+{
+ struct inode *inode = page->mapping->host;
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 page_end = start + PAGE_CACHE_SIZE - 1;
+ u64 end;
+ u64 cur = start;
+ u64 extent_offset;
+ u64 last_byte = i_size_read(inode);
+ u64 block_start;
+ u64 cur_end;
+ sector_t sector;
+ struct extent_map *em;
+ struct block_device *bdev;
+ int ret;
+ int nr = 0;
+ size_t page_offset = 0;
+ size_t iosize;
+ size_t disk_io_size;
+ size_t blocksize = inode->i_sb->s_blocksize;
+ unsigned long this_bio_flag = 0;
+
+ set_page_extent_mapped(page);
+
+ end = page_end;
+ lock_extent(tree, start, end, GFP_NOFS);
+
+ if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
+ char *userpage;
+ size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
+
+ if (zero_offset) {
+ iosize = PAGE_CACHE_SIZE - zero_offset;
+ userpage = kmap_atomic(page, KM_USER0);
+ memset(userpage + zero_offset, 0, iosize);
+ flush_dcache_page(page);
+ kunmap_atomic(userpage, KM_USER0);
+ }
+ }
+ while (cur <= end) {
+ if (cur >= last_byte) {
+ char *userpage;
+ iosize = PAGE_CACHE_SIZE - page_offset;
+ userpage = kmap_atomic(page, KM_USER0);
+ memset(userpage + page_offset, 0, iosize);
+ flush_dcache_page(page);
+ kunmap_atomic(userpage, KM_USER0);
+ set_extent_uptodate(tree, cur, cur + iosize - 1,
+ GFP_NOFS);
+ unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ break;
+ }
+ em = get_extent(inode, page, page_offset, cur,
+ end - cur + 1, 0);
+ if (IS_ERR(em) || !em) {
+ SetPageError(page);
+ unlock_extent(tree, cur, end, GFP_NOFS);
+ break;
+ }
+ extent_offset = cur - em->start;
+ BUG_ON(extent_map_end(em) <= cur);
+ BUG_ON(end < cur);
+
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ this_bio_flag = EXTENT_BIO_COMPRESSED;
+
+ iosize = min(extent_map_end(em) - cur, end - cur + 1);
+ cur_end = min(extent_map_end(em) - 1, end);
+ iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+ if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
+ disk_io_size = em->block_len;
+ sector = em->block_start >> 9;
+ } else {
+ sector = (em->block_start + extent_offset) >> 9;
+ disk_io_size = iosize;
+ }
+ bdev = em->bdev;
+ block_start = em->block_start;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ block_start = EXTENT_MAP_HOLE;
+ free_extent_map(em);
+ em = NULL;
+
+ /* we've found a hole, just zero and go on */
+ if (block_start == EXTENT_MAP_HOLE) {
+ char *userpage;
+ userpage = kmap_atomic(page, KM_USER0);
+ memset(userpage + page_offset, 0, iosize);
+ flush_dcache_page(page);
+ kunmap_atomic(userpage, KM_USER0);
+
+ set_extent_uptodate(tree, cur, cur + iosize - 1,
+ GFP_NOFS);
+ unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ cur = cur + iosize;
+ page_offset += iosize;
+ continue;
+ }
+ /* the get_extent function already copied into the page */
+ if (test_range_bit(tree, cur, cur_end,
+ EXTENT_UPTODATE, 1, NULL)) {
+ check_page_uptodate(tree, page);
+ unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ cur = cur + iosize;
+ page_offset += iosize;
+ continue;
+ }
+ /* we have an inline extent but it didn't get marked up
+ * to date. Error out
+ */
+ if (block_start == EXTENT_MAP_INLINE) {
+ SetPageError(page);
+ unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ cur = cur + iosize;
+ page_offset += iosize;
+ continue;
+ }
+
+ ret = 0;
+ if (tree->ops && tree->ops->readpage_io_hook) {
+ ret = tree->ops->readpage_io_hook(page, cur,
+ cur + iosize - 1);
+ }
+ if (!ret) {
+ unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+ pnr -= page->index;
+ ret = submit_extent_page(READ, tree, page,
+ sector, disk_io_size, page_offset,
+ bdev, bio, pnr,
+ end_bio_extent_readpage, mirror_num,
+ *bio_flags,
+ this_bio_flag);
+ nr++;
+ *bio_flags = this_bio_flag;
+ }
+ if (ret)
+ SetPageError(page);
+ cur = cur + iosize;
+ page_offset += iosize;
+ }
+ if (!nr) {
+ if (!PageError(page))
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+ return 0;
+}
+
+int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
+ get_extent_t *get_extent)
+{
+ struct bio *bio = NULL;
+ unsigned long bio_flags = 0;
+ int ret;
+
+ ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
+ &bio_flags);
+ if (bio)
+ submit_one_bio(READ, bio, 0, bio_flags);
+ return ret;
+}
+
+static noinline void update_nr_written(struct page *page,
+ struct writeback_control *wbc,
+ unsigned long nr_written)
+{
+ wbc->nr_to_write -= nr_written;
+ if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
+ wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
+ page->mapping->writeback_index = page->index + nr_written;
+}
+
+/*
+ * the writepage semantics are similar to regular writepage. extent
+ * records are inserted to lock ranges in the tree, and as dirty areas
+ * are found, they are marked writeback. Then the lock bits are removed
+ * and the end_io handler clears the writeback ranges
+ */
+static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct inode *inode = page->mapping->host;
+ struct extent_page_data *epd = data;
+ struct extent_io_tree *tree = epd->tree;
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 delalloc_start;
+ u64 page_end = start + PAGE_CACHE_SIZE - 1;
+ u64 end;
+ u64 cur = start;
+ u64 extent_offset;
+ u64 last_byte = i_size_read(inode);
+ u64 block_start;
+ u64 iosize;
+ u64 unlock_start;
+ sector_t sector;
+ struct extent_state *cached_state = NULL;
+ struct extent_map *em;
+ struct block_device *bdev;
+ int ret;
+ int nr = 0;
+ size_t pg_offset = 0;
+ size_t blocksize;
+ loff_t i_size = i_size_read(inode);
+ unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+ u64 nr_delalloc;
+ u64 delalloc_end;
+ int page_started;
+ int compressed;
+ int write_flags;
+ unsigned long nr_written = 0;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ write_flags = WRITE_SYNC_PLUG;
+ else
+ write_flags = WRITE;
+
+ WARN_ON(!PageLocked(page));
+ pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
+ if (page->index > end_index ||
+ (page->index == end_index && !pg_offset)) {
+ page->mapping->a_ops->invalidatepage(page, 0);
+ unlock_page(page);
+ return 0;
+ }
+
+ if (page->index == end_index) {
+ char *userpage;
+
+ userpage = kmap_atomic(page, KM_USER0);
+ memset(userpage + pg_offset, 0,
+ PAGE_CACHE_SIZE - pg_offset);
+ kunmap_atomic(userpage, KM_USER0);
+ flush_dcache_page(page);
+ }
+ pg_offset = 0;
+
+ set_page_extent_mapped(page);
+
+ delalloc_start = start;
+ delalloc_end = 0;
+ page_started = 0;
+ if (!epd->extent_locked) {
+ u64 delalloc_to_write = 0;
+ /*
+ * make sure the wbc mapping index is at least updated
+ * to this page.
+ */
+ update_nr_written(page, wbc, 0);
+
+ while (delalloc_end < page_end) {
+ nr_delalloc = find_lock_delalloc_range(inode, tree,
+ page,
+ &delalloc_start,
+ &delalloc_end,
+ 128 * 1024 * 1024);
+ if (nr_delalloc == 0) {
+ delalloc_start = delalloc_end + 1;
+ continue;
+ }
+ tree->ops->fill_delalloc(inode, page, delalloc_start,
+ delalloc_end, &page_started,
+ &nr_written);
+ /*
+ * delalloc_end is already one less than the total
+ * length, so we don't subtract one from
+ * PAGE_CACHE_SIZE
+ */
+ delalloc_to_write += (delalloc_end - delalloc_start +
+ PAGE_CACHE_SIZE) >>
+ PAGE_CACHE_SHIFT;
+ delalloc_start = delalloc_end + 1;
+ }
+ if (wbc->nr_to_write < delalloc_to_write) {
+ int thresh = 8192;
+
+ if (delalloc_to_write < thresh * 2)
+ thresh = delalloc_to_write;
+ wbc->nr_to_write = min_t(u64, delalloc_to_write,
+ thresh);
+ }
+
+ /* did the fill delalloc function already unlock and start
+ * the IO?
+ */
+ if (page_started) {
+ ret = 0;
+ /*
+ * we've unlocked the page, so we can't update
+ * the mapping's writeback index, just update
+ * nr_to_write.
+ */
+ wbc->nr_to_write -= nr_written;
+ goto done_unlocked;
+ }
+ }
+ if (tree->ops && tree->ops->writepage_start_hook) {
+ ret = tree->ops->writepage_start_hook(page, start,
+ page_end);
+ if (ret == -EAGAIN) {
+ redirty_page_for_writepage(wbc, page);
+ update_nr_written(page, wbc, nr_written);
+ unlock_page(page);
+ ret = 0;
+ goto done_unlocked;
+ }
+ }
+
+ /*
+ * we don't want to touch the inode after unlocking the page,
+ * so we update the mapping writeback index now
+ */
+ update_nr_written(page, wbc, nr_written + 1);
+
+ end = page_end;
+ if (last_byte <= start) {
+ if (tree->ops && tree->ops->writepage_end_io_hook)
+ tree->ops->writepage_end_io_hook(page, start,
+ page_end, NULL, 1);
+ unlock_start = page_end + 1;
+ goto done;
+ }
+
+ blocksize = inode->i_sb->s_blocksize;
+
+ while (cur <= end) {
+ if (cur >= last_byte) {
+ if (tree->ops && tree->ops->writepage_end_io_hook)
+ tree->ops->writepage_end_io_hook(page, cur,
+ page_end, NULL, 1);
+ unlock_start = page_end + 1;
+ break;
+ }
+ em = epd->get_extent(inode, page, pg_offset, cur,
+ end - cur + 1, 1);
+ if (IS_ERR(em) || !em) {
+ SetPageError(page);
+ break;
+ }
+
+ extent_offset = cur - em->start;
+ BUG_ON(extent_map_end(em) <= cur);
+ BUG_ON(end < cur);
+ iosize = min(extent_map_end(em) - cur, end - cur + 1);
+ iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+ sector = (em->block_start + extent_offset) >> 9;
+ bdev = em->bdev;
+ block_start = em->block_start;
+ compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ free_extent_map(em);
+ em = NULL;
+
+ /*
+ * compressed and inline extents are written through other
+ * paths in the FS
+ */
+ if (compressed || block_start == EXTENT_MAP_HOLE ||
+ block_start == EXTENT_MAP_INLINE) {
+ /*
+ * end_io notification does not happen here for
+ * compressed extents
+ */
+ if (!compressed && tree->ops &&
+ tree->ops->writepage_end_io_hook)
+ tree->ops->writepage_end_io_hook(page, cur,
+ cur + iosize - 1,
+ NULL, 1);
+ else if (compressed) {
+ /* we don't want to end_page_writeback on
+ * a compressed extent. this happens
+ * elsewhere
+ */
+ nr++;
+ }
+
+ cur += iosize;
+ pg_offset += iosize;
+ unlock_start = cur;
+ continue;
+ }
+ /* leave this out until we have a page_mkwrite call */
+ if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
+ EXTENT_DIRTY, 0, NULL)) {
+ cur = cur + iosize;
+ pg_offset += iosize;
+ continue;
+ }
+
+ if (tree->ops && tree->ops->writepage_io_hook) {
+ ret = tree->ops->writepage_io_hook(page, cur,
+ cur + iosize - 1);
+ } else {
+ ret = 0;
+ }
+ if (ret) {
+ SetPageError(page);
+ } else {
+ unsigned long max_nr = end_index + 1;
+
+ set_range_writeback(tree, cur, cur + iosize - 1);
+ if (!PageWriteback(page)) {
+ printk(KERN_ERR "btrfs warning page %lu not "
+ "writeback, cur %llu end %llu\n",
+ page->index, (unsigned long long)cur,
+ (unsigned long long)end);
+ }
+
+ ret = submit_extent_page(write_flags, tree, page,
+ sector, iosize, pg_offset,
+ bdev, &epd->bio, max_nr,
+ end_bio_extent_writepage,
+ 0, 0, 0);
+ if (ret)
+ SetPageError(page);
+ }
+ cur = cur + iosize;
+ pg_offset += iosize;
+ nr++;
+ }
+done:
+ if (nr == 0) {
+ /* make sure the mapping tag for page dirty gets cleared */
+ set_page_writeback(page);
+ end_page_writeback(page);
+ }
+ unlock_page(page);
+
+done_unlocked:
+
+ /* drop our reference on any cached states */
+ free_extent_state(cached_state);
+ return 0;
+}
+
+/**
+ * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @writepage: function called for each page
+ * @data: data passed to writepage function
+ *
+ * If a page is already under I/O, write_cache_pages() skips it, even
+ * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them. If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ */
+static int extent_write_cache_pages(struct extent_io_tree *tree,
+ struct address_space *mapping,
+ struct writeback_control *wbc,
+ writepage_t writepage, void *data,
+ void (*flush_fn)(void *))
+{
+ int ret = 0;
+ int done = 0;
+ int nr_to_write_done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ int scanned = 0;
+ int range_whole = 0;
+
+ pagevec_init(&pvec, 0);
+ if (wbc->range_cyclic) {
+ index = mapping->writeback_index; /* Start from prev offset */
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+ scanned = 1;
+ }
+retry:
+ while (!done && !nr_to_write_done && (index <= end) &&
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY, min(end - index,
+ (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+ unsigned i;
+
+ scanned = 1;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * At this point we hold neither mapping->tree_lock nor
+ * lock on the page itself: the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or even
+ * swizzled back from swapper_space to tmpfs file
+ * mapping
+ */
+ if (tree->ops && tree->ops->write_cache_pages_lock_hook)
+ tree->ops->write_cache_pages_lock_hook(page);
+ else
+ lock_page(page);
+
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (!wbc->range_cyclic && page->index > end) {
+ done = 1;
+ unlock_page(page);
+ continue;
+ }
+
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ if (PageWriteback(page))
+ flush_fn(data);
+ wait_on_page_writeback(page);
+ }
+
+ if (PageWriteback(page) ||
+ !clear_page_dirty_for_io(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ ret = (*writepage)(page, wbc, data);
+
+ if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+ unlock_page(page);
+ ret = 0;
+ }
+ if (ret)
+ done = 1;
+
+ /*
+ * the filesystem may choose to bump up nr_to_write.
+ * We have to make sure to honor the new nr_to_write
+ * at any time
+ */
+ nr_to_write_done = wbc->nr_to_write <= 0;
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ if (!scanned && !done) {
+ /*
+ * We hit the last page and there is more work to be done: wrap
+ * back to the start of the file
+ */
+ scanned = 1;
+ index = 0;
+ goto retry;
+ }
+ return ret;
+}
+
+static void flush_epd_write_bio(struct extent_page_data *epd)
+{
+ if (epd->bio) {
+ if (epd->sync_io)
+ submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
+ else
+ submit_one_bio(WRITE, epd->bio, 0, 0);
+ epd->bio = NULL;
+ }
+}
+
+static noinline void flush_write_bio(void *data)
+{
+ struct extent_page_data *epd = data;
+ flush_epd_write_bio(epd);
+}
+
+int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
+ get_extent_t *get_extent,
+ struct writeback_control *wbc)
+{
+ int ret;
+ struct address_space *mapping = page->mapping;
+ struct extent_page_data epd = {
+ .bio = NULL,
+ .tree = tree,
+ .get_extent = get_extent,
+ .extent_locked = 0,
+ .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ };
+ struct writeback_control wbc_writepages = {
+ .bdi = wbc->bdi,
+ .sync_mode = wbc->sync_mode,
+ .older_than_this = NULL,
+ .nr_to_write = 64,
+ .range_start = page_offset(page) + PAGE_CACHE_SIZE,
+ .range_end = (loff_t)-1,
+ };
+
+ ret = __extent_writepage(page, wbc, &epd);
+
+ extent_write_cache_pages(tree, mapping, &wbc_writepages,
+ __extent_writepage, &epd, flush_write_bio);
+ flush_epd_write_bio(&epd);
+ return ret;
+}
+
+int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
+ u64 start, u64 end, get_extent_t *get_extent,
+ int mode)
+{
+ int ret = 0;
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+ unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
+ PAGE_CACHE_SHIFT;
+
+ struct extent_page_data epd = {
+ .bio = NULL,
+ .tree = tree,
+ .get_extent = get_extent,
+ .extent_locked = 1,
+ .sync_io = mode == WB_SYNC_ALL,
+ };
+ struct writeback_control wbc_writepages = {
+ .bdi = inode->i_mapping->backing_dev_info,
+ .sync_mode = mode,
+ .older_than_this = NULL,
+ .nr_to_write = nr_pages * 2,
+ .range_start = start,
+ .range_end = end + 1,
+ };
+
+ while (start <= end) {
+ page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ if (clear_page_dirty_for_io(page))
+ ret = __extent_writepage(page, &wbc_writepages, &epd);
+ else {
+ if (tree->ops && tree->ops->writepage_end_io_hook)
+ tree->ops->writepage_end_io_hook(page, start,
+ start + PAGE_CACHE_SIZE - 1,
+ NULL, 1);
+ unlock_page(page);
+ }
+ page_cache_release(page);
+ start += PAGE_CACHE_SIZE;
+ }
+
+ flush_epd_write_bio(&epd);
+ return ret;
+}
+
+int extent_writepages(struct extent_io_tree *tree,
+ struct address_space *mapping,
+ get_extent_t *get_extent,
+ struct writeback_control *wbc)
+{
+ int ret = 0;
+ struct extent_page_data epd = {
+ .bio = NULL,
+ .tree = tree,
+ .get_extent = get_extent,
+ .extent_locked = 0,
+ .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ };
+
+ ret = extent_write_cache_pages(tree, mapping, wbc,
+ __extent_writepage, &epd,
+ flush_write_bio);
+ flush_epd_write_bio(&epd);
+ return ret;
+}
+
+int extent_readpages(struct extent_io_tree *tree,
+ struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages,
+ get_extent_t get_extent)
+{
+ struct bio *bio = NULL;
+ unsigned page_idx;
+ struct pagevec pvec;
+ unsigned long bio_flags = 0;
+
+ pagevec_init(&pvec, 0);
+ for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+ struct page *page = list_entry(pages->prev, struct page, lru);
+
+ prefetchw(&page->flags);
+ list_del(&page->lru);
+ /*
+ * what we want to do here is call add_to_page_cache_lru,
+ * but that isn't exported, so we reproduce it here
+ */
+ if (!add_to_page_cache(page, mapping,
+ page->index, GFP_KERNEL)) {
+
+ /* open coding of lru_cache_add, also not exported */
+ page_cache_get(page);
+ if (!pagevec_add(&pvec, page))
+ __pagevec_lru_add_file(&pvec);
+ __extent_read_full_page(tree, page, get_extent,
+ &bio, 0, &bio_flags);
+ }
+ page_cache_release(page);
+ }
+ if (pagevec_count(&pvec))
+ __pagevec_lru_add_file(&pvec);
+ BUG_ON(!list_empty(pages));
+ if (bio)
+ submit_one_bio(READ, bio, 0, bio_flags);
+ return 0;
+}
+
+/*
+ * basic invalidatepage code, this waits on any locked or writeback
+ * ranges corresponding to the page, and then deletes any extent state
+ * records from the tree
+ */
+int extent_invalidatepage(struct extent_io_tree *tree,
+ struct page *page, unsigned long offset)
+{
+ u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+ size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+
+ start += (offset + blocksize - 1) & ~(blocksize - 1);
+ if (start > end)
+ return 0;
+
+ lock_extent(tree, start, end, GFP_NOFS);
+ wait_on_page_writeback(page);
+ clear_extent_bit(tree, start, end,
+ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING,
+ 1, 1, NULL, GFP_NOFS);
+ return 0;
+}
+
+/*
+ * simple commit_write call, set_range_dirty is used to mark both
+ * the pages and the extent records as dirty
+ */
+int extent_commit_write(struct extent_io_tree *tree,
+ struct inode *inode, struct page *page,
+ unsigned from, unsigned to)
+{
+ loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+ set_page_extent_mapped(page);
+ set_page_dirty(page);
+
+ if (pos > inode->i_size) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+ return 0;
+}
+
+int extent_prepare_write(struct extent_io_tree *tree,
+ struct inode *inode, struct page *page,
+ unsigned from, unsigned to, get_extent_t *get_extent)
+{
+ u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+ u64 block_start;
+ u64 orig_block_start;
+ u64 block_end;
+ u64 cur_end;
+ struct extent_map *em;
+ unsigned blocksize = 1 << inode->i_blkbits;
+ size_t page_offset = 0;
+ size_t block_off_start;
+ size_t block_off_end;
+ int err = 0;
+ int iocount = 0;
+ int ret = 0;
+ int isnew;
+
+ set_page_extent_mapped(page);
+
+ block_start = (page_start + from) & ~((u64)blocksize - 1);
+ block_end = (page_start + to - 1) | (blocksize - 1);
+ orig_block_start = block_start;
+
+ lock_extent(tree, page_start, page_end, GFP_NOFS);
+ while (block_start <= block_end) {
+ em = get_extent(inode, page, page_offset, block_start,
+ block_end - block_start + 1, 1);
+ if (IS_ERR(em) || !em)
+ goto err;
+
+ cur_end = min(block_end, extent_map_end(em) - 1);
+ block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
+ block_off_end = block_off_start + blocksize;
+ isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
+
+ if (!PageUptodate(page) && isnew &&
+ (block_off_end > to || block_off_start < from)) {
+ void *kaddr;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ if (block_off_end > to)
+ memset(kaddr + to, 0, block_off_end - to);
+ if (block_off_start < from)
+ memset(kaddr + block_off_start, 0,
+ from - block_off_start);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+ if ((em->block_start != EXTENT_MAP_HOLE &&
+ em->block_start != EXTENT_MAP_INLINE) &&
+ !isnew && !PageUptodate(page) &&
+ (block_off_end > to || block_off_start < from) &&
+ !test_range_bit(tree, block_start, cur_end,
+ EXTENT_UPTODATE, 1, NULL)) {
+ u64 sector;
+ u64 extent_offset = block_start - em->start;
+ size_t iosize;
+ sector = (em->block_start + extent_offset) >> 9;
+ iosize = (cur_end - block_start + blocksize) &
+ ~((u64)blocksize - 1);
+ /*
+ * we've already got the extent locked, but we
+ * need to split the state such that our end_bio
+ * handler can clear the lock.
+ */
+ set_extent_bit(tree, block_start,
+ block_start + iosize - 1,
+ EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
+ ret = submit_extent_page(READ, tree, page,
+ sector, iosize, page_offset, em->bdev,
+ NULL, 1,
+ end_bio_extent_preparewrite, 0,
+ 0, 0);
+ iocount++;
+ block_start = block_start + iosize;
+ } else {
+ set_extent_uptodate(tree, block_start, cur_end,
+ GFP_NOFS);
+ unlock_extent(tree, block_start, cur_end, GFP_NOFS);
+ block_start = cur_end + 1;
+ }
+ page_offset = block_start & (PAGE_CACHE_SIZE - 1);
+ free_extent_map(em);
+ }
+ if (iocount) {
+ wait_extent_bit(tree, orig_block_start,
+ block_end, EXTENT_LOCKED);
+ }
+ check_page_uptodate(tree, page);
+err:
+ /* FIXME, zero out newly allocated blocks on error */
+ return err;
+}
+
+/*
+ * a helper for releasepage, this tests for areas of the page that
+ * are locked or under IO and drops the related state bits if it is safe
+ * to drop the page.
+ */
+int try_release_extent_state(struct extent_map_tree *map,
+ struct extent_io_tree *tree, struct page *page,
+ gfp_t mask)
+{
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+ int ret = 1;
+
+ if (test_range_bit(tree, start, end,
+ EXTENT_IOBITS, 0, NULL))
+ ret = 0;
+ else {
+ if ((mask & GFP_NOFS) == GFP_NOFS)
+ mask = GFP_NOFS;
+ /*
+ * at this point we can safely clear everything except the
+ * locked bit and the nodatasum bit
+ */
+ clear_extent_bit(tree, start, end,
+ ~(EXTENT_LOCKED | EXTENT_NODATASUM),
+ 0, 0, NULL, mask);
+ }
+ return ret;
+}
+
+/*
+ * a helper for releasepage. As long as there are no locked extents
+ * in the range corresponding to the page, both state records and extent
+ * map records are removed
+ */
+int try_release_extent_mapping(struct extent_map_tree *map,
+ struct extent_io_tree *tree, struct page *page,
+ gfp_t mask)
+{
+ struct extent_map *em;
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+
+ if ((mask & __GFP_WAIT) &&
+ page->mapping->host->i_size > 16 * 1024 * 1024) {
+ u64 len;
+ while (start <= end) {
+ len = end - start + 1;
+ write_lock(&map->lock);
+ em = lookup_extent_mapping(map, start, len);
+ if (!em || IS_ERR(em)) {
+ write_unlock(&map->lock);
+ break;
+ }
+ if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
+ em->start != start) {
+ write_unlock(&map->lock);
+ free_extent_map(em);
+ break;
+ }
+ if (!test_range_bit(tree, em->start,
+ extent_map_end(em) - 1,
+ EXTENT_LOCKED | EXTENT_WRITEBACK,
+ 0, NULL)) {
+ remove_extent_mapping(map, em);
+ /* once for the rb tree */
+ free_extent_map(em);
+ }
+ start = extent_map_end(em);
+ write_unlock(&map->lock);
+
+ /* once for us */
+ free_extent_map(em);
+ }
+ }
+ return try_release_extent_state(map, tree, page, mask);
+}
+
+sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
+ get_extent_t *get_extent)
+{
+ struct inode *inode = mapping->host;
+ u64 start = iblock << inode->i_blkbits;
+ sector_t sector = 0;
+ size_t blksize = (1 << inode->i_blkbits);
+ struct extent_map *em;
+
+ lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+ GFP_NOFS);
+ em = get_extent(inode, NULL, 0, start, blksize, 0);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+ GFP_NOFS);
+ if (!em || IS_ERR(em))
+ return 0;
+
+ if (em->block_start > EXTENT_MAP_LAST_BYTE)
+ goto out;
+
+ sector = (em->block_start + start - em->start) >> inode->i_blkbits;
+out:
+ free_extent_map(em);
+ return sector;
+}
+
+int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len, get_extent_t *get_extent)
+{
+ int ret;
+ u64 off = start;
+ u64 max = start + len;
+ u32 flags = 0;
+ u64 disko = 0;
+ struct extent_map *em = NULL;
+ int end = 0;
+ u64 em_start = 0, em_len = 0;
+ unsigned long emflags;
+ ret = 0;
+
+ if (len == 0)
+ return -EINVAL;
+
+ lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
+ GFP_NOFS);
+ em = get_extent(inode, NULL, 0, off, max - off, 0);
+ if (!em)
+ goto out;
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+ while (!end) {
+ off = em->start + em->len;
+ if (off >= max)
+ end = 1;
+
+ em_start = em->start;
+ em_len = em->len;
+
+ disko = 0;
+ flags = 0;
+
+ if (em->block_start == EXTENT_MAP_LAST_BYTE) {
+ end = 1;
+ flags |= FIEMAP_EXTENT_LAST;
+ } else if (em->block_start == EXTENT_MAP_HOLE) {
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
+ } else if (em->block_start == EXTENT_MAP_INLINE) {
+ flags |= (FIEMAP_EXTENT_DATA_INLINE |
+ FIEMAP_EXTENT_NOT_ALIGNED);
+ } else if (em->block_start == EXTENT_MAP_DELALLOC) {
+ flags |= (FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
+ } else {
+ disko = em->block_start;
+ }
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ flags |= FIEMAP_EXTENT_ENCODED;
+
+ emflags = em->flags;
+ free_extent_map(em);
+ em = NULL;
+
+ if (!end) {
+ em = get_extent(inode, NULL, 0, off, max - off, 0);
+ if (!em)
+ goto out;
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+ emflags = em->flags;
+ }
+ if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
+ flags |= FIEMAP_EXTENT_LAST;
+ end = 1;
+ }
+
+ ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
+ em_len, flags);
+ if (ret)
+ goto out_free;
+ }
+out_free:
+ free_extent_map(em);
+out:
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
+ GFP_NOFS);
+ return ret;
+}
+
+static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+ unsigned long i)
+{
+ struct page *p;
+ struct address_space *mapping;
+
+ if (i == 0)
+ return eb->first_page;
+ i += eb->start >> PAGE_CACHE_SHIFT;
+ mapping = eb->first_page->mapping;
+ if (!mapping)
+ return NULL;
+
+ /*
+ * extent_buffer_page is only called after pinning the page
+ * by increasing the reference count. So we know the page must
+ * be in the radix tree.
+ */
+ rcu_read_lock();
+ p = radix_tree_lookup(&mapping->page_tree, i);
+ rcu_read_unlock();
+
+ return p;
+}
+
+static inline unsigned long num_extent_pages(u64 start, u64 len)
+{
+ return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+ (start >> PAGE_CACHE_SHIFT);
+}
+
+static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
+ u64 start,
+ unsigned long len,
+ gfp_t mask)
+{
+ struct extent_buffer *eb = NULL;
+#if LEAK_DEBUG
+ unsigned long flags;
+#endif
+
+ eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+ eb->start = start;
+ eb->len = len;
+ spin_lock_init(&eb->lock);
+ init_waitqueue_head(&eb->lock_wq);
+
+#if LEAK_DEBUG
+ spin_lock_irqsave(&leak_lock, flags);
+ list_add(&eb->leak_list, &buffers);
+ spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+ atomic_set(&eb->refs, 1);
+
+ return eb;
+}
+
+static void __free_extent_buffer(struct extent_buffer *eb)
+{
+#if LEAK_DEBUG
+ unsigned long flags;
+ spin_lock_irqsave(&leak_lock, flags);
+ list_del(&eb->leak_list);
+ spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+ kmem_cache_free(extent_buffer_cache, eb);
+}
+
+struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+ u64 start, unsigned long len,
+ struct page *page0,
+ gfp_t mask)
+{
+ unsigned long num_pages = num_extent_pages(start, len);
+ unsigned long i;
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ struct extent_buffer *eb;
+ struct extent_buffer *exists = NULL;
+ struct page *p;
+ struct address_space *mapping = tree->mapping;
+ int uptodate = 1;
+
+ spin_lock(&tree->buffer_lock);
+ eb = buffer_search(tree, start);
+ if (eb) {
+ atomic_inc(&eb->refs);
+ spin_unlock(&tree->buffer_lock);
+ mark_page_accessed(eb->first_page);
+ return eb;
+ }
+ spin_unlock(&tree->buffer_lock);
+
+ eb = __alloc_extent_buffer(tree, start, len, mask);
+ if (!eb)
+ return NULL;
+
+ if (page0) {
+ eb->first_page = page0;
+ i = 1;
+ index++;
+ page_cache_get(page0);
+ mark_page_accessed(page0);
+ set_page_extent_mapped(page0);
+ set_page_extent_head(page0, len);
+ uptodate = PageUptodate(page0);
+ } else {
+ i = 0;
+ }
+ for (; i < num_pages; i++, index++) {
+ p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
+ if (!p) {
+ WARN_ON(1);
+ goto free_eb;
+ }
+ set_page_extent_mapped(p);
+ mark_page_accessed(p);
+ if (i == 0) {
+ eb->first_page = p;
+ set_page_extent_head(p, len);
+ } else {
+ set_page_private(p, EXTENT_PAGE_PRIVATE);
+ }
+ if (!PageUptodate(p))
+ uptodate = 0;
+ unlock_page(p);
+ }
+ if (uptodate)
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+
+ spin_lock(&tree->buffer_lock);
+ exists = buffer_tree_insert(tree, start, &eb->rb_node);
+ if (exists) {
+ /* add one reference for the caller */
+ atomic_inc(&exists->refs);
+ spin_unlock(&tree->buffer_lock);
+ goto free_eb;
+ }
+ /* add one reference for the tree */
+ atomic_inc(&eb->refs);
+ spin_unlock(&tree->buffer_lock);
+ return eb;
+
+free_eb:
+ if (!atomic_dec_and_test(&eb->refs))
+ return exists;
+ for (index = 1; index < i; index++)
+ page_cache_release(extent_buffer_page(eb, index));
+ page_cache_release(extent_buffer_page(eb, 0));
+ __free_extent_buffer(eb);
+ return exists;
+}
+
+struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
+ u64 start, unsigned long len,
+ gfp_t mask)
+{
+ struct extent_buffer *eb;
+
+ spin_lock(&tree->buffer_lock);
+ eb = buffer_search(tree, start);
+ if (eb)
+ atomic_inc(&eb->refs);
+ spin_unlock(&tree->buffer_lock);
+
+ if (eb)
+ mark_page_accessed(eb->first_page);
+
+ return eb;
+}
+
+void free_extent_buffer(struct extent_buffer *eb)
+{
+ if (!eb)
+ return;
+
+ if (!atomic_dec_and_test(&eb->refs))
+ return;
+
+ WARN_ON(1);
+}
+
+int clear_extent_buffer_dirty(struct extent_io_tree *tree,
+ struct extent_buffer *eb)
+{
+ unsigned long i;
+ unsigned long num_pages;
+ struct page *page;
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+
+ for (i = 0; i < num_pages; i++) {
+ page = extent_buffer_page(eb, i);
+ if (!PageDirty(page))
+ continue;
+
+ lock_page(page);
+ if (i == 0)
+ set_page_extent_head(page, eb->len);
+ else
+ set_page_private(page, EXTENT_PAGE_PRIVATE);
+
+ clear_page_dirty_for_io(page);
+ spin_lock_irq(&page->mapping->tree_lock);
+ if (!PageDirty(page)) {
+ radix_tree_tag_clear(&page->mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_DIRTY);
+ }
+ spin_unlock_irq(&page->mapping->tree_lock);
+ unlock_page(page);
+ }
+ return 0;
+}
+
+int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
+ struct extent_buffer *eb)
+{
+ return wait_on_extent_writeback(tree, eb->start,
+ eb->start + eb->len - 1);
+}
+
+int set_extent_buffer_dirty(struct extent_io_tree *tree,
+ struct extent_buffer *eb)
+{
+ unsigned long i;
+ unsigned long num_pages;
+ int was_dirty = 0;
+
+ was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++)
+ __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
+ return was_dirty;
+}
+
+int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
+ struct extent_buffer *eb)
+{
+ unsigned long i;
+ struct page *page;
+ unsigned long num_pages;
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+
+ clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+ GFP_NOFS);
+ for (i = 0; i < num_pages; i++) {
+ page = extent_buffer_page(eb, i);
+ if (page)
+ ClearPageUptodate(page);
+ }
+ return 0;
+}
+
+int set_extent_buffer_uptodate(struct extent_io_tree *tree,
+ struct extent_buffer *eb)
+{
+ unsigned long i;
+ struct page *page;
+ unsigned long num_pages;
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+
+ set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+ GFP_NOFS);
+ for (i = 0; i < num_pages; i++) {
+ page = extent_buffer_page(eb, i);
+ if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
+ ((i == num_pages - 1) &&
+ ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
+ check_page_uptodate(tree, page);
+ continue;
+ }
+ SetPageUptodate(page);
+ }
+ return 0;
+}
+
+int extent_range_uptodate(struct extent_io_tree *tree,
+ u64 start, u64 end)
+{
+ struct page *page;
+ int ret;
+ int pg_uptodate = 1;
+ int uptodate;
+ unsigned long index;
+
+ ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
+ if (ret)
+ return 1;
+ while (start <= end) {
+ index = start >> PAGE_CACHE_SHIFT;
+ page = find_get_page(tree->mapping, index);
+ uptodate = PageUptodate(page);
+ page_cache_release(page);
+ if (!uptodate) {
+ pg_uptodate = 0;
+ break;
+ }
+ start += PAGE_CACHE_SIZE;
+ }
+ return pg_uptodate;
+}
+
+int extent_buffer_uptodate(struct extent_io_tree *tree,
+ struct extent_buffer *eb)
+{
+ int ret = 0;
+ unsigned long num_pages;
+ unsigned long i;
+ struct page *page;
+ int pg_uptodate = 1;
+
+ if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+ return 1;
+
+ ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+ EXTENT_UPTODATE, 1, NULL);
+ if (ret)
+ return ret;
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++) {
+ page = extent_buffer_page(eb, i);
+ if (!PageUptodate(page)) {
+ pg_uptodate = 0;
+ break;
+ }
+ }
+ return pg_uptodate;
+}
+
+int read_extent_buffer_pages(struct extent_io_tree *tree,
+ struct extent_buffer *eb,
+ u64 start, int wait,
+ get_extent_t *get_extent, int mirror_num)
+{
+ unsigned long i;
+ unsigned long start_i;
+ struct page *page;
+ int err;
+ int ret = 0;
+ int locked_pages = 0;
+ int all_uptodate = 1;
+ int inc_all_pages = 0;
+ unsigned long num_pages;
+ struct bio *bio = NULL;
+ unsigned long bio_flags = 0;
+
+ if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+ return 0;
+
+ if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+ EXTENT_UPTODATE, 1, NULL)) {
+ return 0;
+ }
+
+ if (start) {
+ WARN_ON(start < eb->start);
+ start_i = (start >> PAGE_CACHE_SHIFT) -
+ (eb->start >> PAGE_CACHE_SHIFT);
+ } else {
+ start_i = 0;
+ }
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = start_i; i < num_pages; i++) {
+ page = extent_buffer_page(eb, i);
+ if (!wait) {
+ if (!trylock_page(page))
+ goto unlock_exit;
+ } else {
+ lock_page(page);
+ }
+ locked_pages++;
+ if (!PageUptodate(page))
+ all_uptodate = 0;
+ }
+ if (all_uptodate) {
+ if (start_i == 0)
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ goto unlock_exit;
+ }
+
+ for (i = start_i; i < num_pages; i++) {
+ page = extent_buffer_page(eb, i);
+ if (inc_all_pages)
+ page_cache_get(page);
+ if (!PageUptodate(page)) {
+ if (start_i == 0)
+ inc_all_pages = 1;
+ ClearPageError(page);
+ err = __extent_read_full_page(tree, page,
+ get_extent, &bio,
+ mirror_num, &bio_flags);
+ if (err)
+ ret = err;
+ } else {
+ unlock_page(page);
+ }
+ }
+
+ if (bio)
+ submit_one_bio(READ, bio, mirror_num, bio_flags);
+
+ if (ret || !wait)
+ return ret;
+
+ for (i = start_i; i < num_pages; i++) {
+ page = extent_buffer_page(eb, i);
+ wait_on_page_locked(page);
+ if (!PageUptodate(page))
+ ret = -EIO;
+ }
+
+ if (!ret)
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ return ret;
+
+unlock_exit:
+ i = start_i;
+ while (locked_pages > 0) {
+ page = extent_buffer_page(eb, i);
+ i++;
+ unlock_page(page);
+ locked_pages--;
+ }
+ return ret;
+}
+
+void read_extent_buffer(struct extent_buffer *eb, void *dstv,
+ unsigned long start,
+ unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char *dst = (char *)dstv;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = extent_buffer_page(eb, i);
+
+ cur = min(len, (PAGE_CACHE_SIZE - offset));
+ kaddr = kmap_atomic(page, KM_USER1);
+ memcpy(dst, kaddr + offset, cur);
+ kunmap_atomic(kaddr, KM_USER1);
+
+ dst += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+}
+
+int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
+ unsigned long min_len, char **token, char **map,
+ unsigned long *map_start,
+ unsigned long *map_len, int km)
+{
+ size_t offset = start & (PAGE_CACHE_SIZE - 1);
+ char *kaddr;
+ struct page *p;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ unsigned long end_i = (start_offset + start + min_len - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ if (i != end_i)
+ return -EINVAL;
+
+ if (i == 0) {
+ offset = start_offset;
+ *map_start = 0;
+ } else {
+ offset = 0;
+ *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
+ }
+
+ if (start + min_len > eb->len) {
+ printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
+ "wanted %lu %lu\n", (unsigned long long)eb->start,
+ eb->len, start, min_len);
+ WARN_ON(1);
+ }
+
+ p = extent_buffer_page(eb, i);
+ kaddr = kmap_atomic(p, km);
+ *token = kaddr;
+ *map = kaddr + offset;
+ *map_len = PAGE_CACHE_SIZE - offset;
+ return 0;
+}
+
+int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
+ unsigned long min_len,
+ char **token, char **map,
+ unsigned long *map_start,
+ unsigned long *map_len, int km)
+{
+ int err;
+ int save = 0;
+ if (eb->map_token) {
+ unmap_extent_buffer(eb, eb->map_token, km);
+ eb->map_token = NULL;
+ save = 1;
+ }
+ err = map_private_extent_buffer(eb, start, min_len, token, map,
+ map_start, map_len, km);
+ if (!err && save) {
+ eb->map_token = *token;
+ eb->kaddr = *map;
+ eb->map_start = *map_start;
+ eb->map_len = *map_len;
+ }
+ return err;
+}
+
+void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
+{
+ kunmap_atomic(token, km);
+}
+
+int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+ unsigned long start,
+ unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char *ptr = (char *)ptrv;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ int ret = 0;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = extent_buffer_page(eb, i);
+
+ cur = min(len, (PAGE_CACHE_SIZE - offset));
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ ret = memcmp(ptr, kaddr + offset, cur);
+ kunmap_atomic(kaddr, KM_USER0);
+ if (ret)
+ break;
+
+ ptr += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+ return ret;
+}
+
+void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
+ unsigned long start, unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char *src = (char *)srcv;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = extent_buffer_page(eb, i);
+ WARN_ON(!PageUptodate(page));
+
+ cur = min(len, PAGE_CACHE_SIZE - offset);
+ kaddr = kmap_atomic(page, KM_USER1);
+ memcpy(kaddr + offset, src, cur);
+ kunmap_atomic(kaddr, KM_USER1);
+
+ src += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+}
+
+void memset_extent_buffer(struct extent_buffer *eb, char c,
+ unsigned long start, unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = extent_buffer_page(eb, i);
+ WARN_ON(!PageUptodate(page));
+
+ cur = min(len, PAGE_CACHE_SIZE - offset);
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + offset, c, cur);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+}
+
+void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len)
+{
+ u64 dst_len = dst->len;
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
+
+ WARN_ON(src->len != dst_len);
+
+ offset = (start_offset + dst_offset) &
+ ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = extent_buffer_page(dst, i);
+ WARN_ON(!PageUptodate(page));
+
+ cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ read_extent_buffer(src, kaddr + offset, src_offset, cur);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ src_offset += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+}
+
+static void move_pages(struct page *dst_page, struct page *src_page,
+ unsigned long dst_off, unsigned long src_off,
+ unsigned long len)
+{
+ char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+ if (dst_page == src_page) {
+ memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
+ } else {
+ char *src_kaddr = kmap_atomic(src_page, KM_USER1);
+ char *p = dst_kaddr + dst_off + len;
+ char *s = src_kaddr + src_off + len;
+
+ while (len--)
+ *--p = *--s;
+
+ kunmap_atomic(src_kaddr, KM_USER1);
+ }
+ kunmap_atomic(dst_kaddr, KM_USER0);
+}
+
+static void copy_pages(struct page *dst_page, struct page *src_page,
+ unsigned long dst_off, unsigned long src_off,
+ unsigned long len)
+{
+ char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+ char *src_kaddr;
+
+ if (dst_page != src_page)
+ src_kaddr = kmap_atomic(src_page, KM_USER1);
+ else
+ src_kaddr = dst_kaddr;
+
+ memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
+ kunmap_atomic(dst_kaddr, KM_USER0);
+ if (dst_page != src_page)
+ kunmap_atomic(src_kaddr, KM_USER1);
+}
+
+void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+ unsigned long src_offset, unsigned long len)
+{
+ size_t cur;
+ size_t dst_off_in_page;
+ size_t src_off_in_page;
+ size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long dst_i;
+ unsigned long src_i;
+
+ if (src_offset + len > dst->len) {
+ printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
+ "len %lu dst len %lu\n", src_offset, len, dst->len);
+ BUG_ON(1);
+ }
+ if (dst_offset + len > dst->len) {
+ printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
+ "len %lu dst len %lu\n", dst_offset, len, dst->len);
+ BUG_ON(1);
+ }
+
+ while (len > 0) {
+ dst_off_in_page = (start_offset + dst_offset) &
+ ((unsigned long)PAGE_CACHE_SIZE - 1);
+ src_off_in_page = (start_offset + src_offset) &
+ ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+ dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
+ src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
+
+ cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
+ src_off_in_page));
+ cur = min_t(unsigned long, cur,
+ (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
+
+ copy_pages(extent_buffer_page(dst, dst_i),
+ extent_buffer_page(dst, src_i),
+ dst_off_in_page, src_off_in_page, cur);
+
+ src_offset += cur;
+ dst_offset += cur;
+ len -= cur;
+ }
+}
+
+void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+ unsigned long src_offset, unsigned long len)
+{
+ size_t cur;
+ size_t dst_off_in_page;
+ size_t src_off_in_page;
+ unsigned long dst_end = dst_offset + len - 1;
+ unsigned long src_end = src_offset + len - 1;
+ size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long dst_i;
+ unsigned long src_i;
+
+ if (src_offset + len > dst->len) {
+ printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
+ "len %lu len %lu\n", src_offset, len, dst->len);
+ BUG_ON(1);
+ }
+ if (dst_offset + len > dst->len) {
+ printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
+ "len %lu len %lu\n", dst_offset, len, dst->len);
+ BUG_ON(1);
+ }
+ if (dst_offset < src_offset) {
+ memcpy_extent_buffer(dst, dst_offset, src_offset, len);
+ return;
+ }
+ while (len > 0) {
+ dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
+ src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
+
+ dst_off_in_page = (start_offset + dst_end) &
+ ((unsigned long)PAGE_CACHE_SIZE - 1);
+ src_off_in_page = (start_offset + src_end) &
+ ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+ cur = min_t(unsigned long, len, src_off_in_page + 1);
+ cur = min(cur, dst_off_in_page + 1);
+ move_pages(extent_buffer_page(dst, dst_i),
+ extent_buffer_page(dst, src_i),
+ dst_off_in_page - cur + 1,
+ src_off_in_page - cur + 1, cur);
+
+ dst_end -= cur;
+ src_end -= cur;
+ len -= cur;
+ }
+}
+
+int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
+{
+ u64 start = page_offset(page);
+ struct extent_buffer *eb;
+ int ret = 1;
+ unsigned long i;
+ unsigned long num_pages;
+
+ spin_lock(&tree->buffer_lock);
+ eb = buffer_search(tree, start);
+ if (!eb)
+ goto out;
+
+ if (atomic_read(&eb->refs) > 1) {
+ ret = 0;
+ goto out;
+ }
+ if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ ret = 0;
+ goto out;
+ }
+ /* at this point we can safely release the extent buffer */
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++)
+ page_cache_release(extent_buffer_page(eb, i));
+ rb_erase(&eb->rb_node, &tree->buffer);
+ __free_extent_buffer(eb);
+out:
+ spin_unlock(&tree->buffer_lock);
+ return ret;
+}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
new file mode 100644
index 00000000000..36de250a7b2
--- /dev/null
+++ b/fs/btrfs/extent_io.h
@@ -0,0 +1,303 @@
+#ifndef __EXTENTIO__
+#define __EXTENTIO__
+
+#include <linux/rbtree.h>
+
+/* bits for the extent state */
+#define EXTENT_DIRTY 1
+#define EXTENT_WRITEBACK (1 << 1)
+#define EXTENT_UPTODATE (1 << 2)
+#define EXTENT_LOCKED (1 << 3)
+#define EXTENT_NEW (1 << 4)
+#define EXTENT_DELALLOC (1 << 5)
+#define EXTENT_DEFRAG (1 << 6)
+#define EXTENT_DEFRAG_DONE (1 << 7)
+#define EXTENT_BUFFER_FILLED (1 << 8)
+#define EXTENT_BOUNDARY (1 << 9)
+#define EXTENT_NODATASUM (1 << 10)
+#define EXTENT_DO_ACCOUNTING (1 << 11)
+#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+
+/* flags for bio submission */
+#define EXTENT_BIO_COMPRESSED 1
+
+/* these are bit numbers for test/set bit */
+#define EXTENT_BUFFER_UPTODATE 0
+#define EXTENT_BUFFER_BLOCKING 1
+#define EXTENT_BUFFER_DIRTY 2
+
+/* these are flags for extent_clear_unlock_delalloc */
+#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
+#define EXTENT_CLEAR_UNLOCK 0x2
+#define EXTENT_CLEAR_DELALLOC 0x4
+#define EXTENT_CLEAR_DIRTY 0x8
+#define EXTENT_SET_WRITEBACK 0x10
+#define EXTENT_END_WRITEBACK 0x20
+#define EXTENT_SET_PRIVATE2 0x40
+#define EXTENT_CLEAR_ACCOUNTING 0x80
+
+/*
+ * page->private values. Every page that is controlled by the extent
+ * map has page->private set to one.
+ */
+#define EXTENT_PAGE_PRIVATE 1
+#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
+
+struct extent_state;
+
+typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
+ struct bio *bio, int mirror_num,
+ unsigned long bio_flags);
+struct extent_io_ops {
+ int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written);
+ int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
+ int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
+ extent_submit_bio_hook_t *submit_bio_hook;
+ int (*merge_bio_hook)(struct page *page, unsigned long offset,
+ size_t size, struct bio *bio,
+ unsigned long bio_flags);
+ int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
+ int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
+ u64 start, u64 end,
+ struct extent_state *state);
+ int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
+ u64 start, u64 end,
+ struct extent_state *state);
+ int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
+ struct extent_state *state);
+ int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
+ struct extent_state *state, int uptodate);
+ int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
+ unsigned long old, unsigned long bits);
+ int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
+ unsigned long bits);
+ int (*merge_extent_hook)(struct inode *inode,
+ struct extent_state *new,
+ struct extent_state *other);
+ int (*split_extent_hook)(struct inode *inode,
+ struct extent_state *orig, u64 split);
+ int (*write_cache_pages_lock_hook)(struct page *page);
+};
+
+struct extent_io_tree {
+ struct rb_root state;
+ struct rb_root buffer;
+ struct address_space *mapping;
+ u64 dirty_bytes;
+ spinlock_t lock;
+ spinlock_t buffer_lock;
+ struct extent_io_ops *ops;
+};
+
+struct extent_state {
+ u64 start;
+ u64 end; /* inclusive */
+ struct rb_node rb_node;
+
+ /* ADD NEW ELEMENTS AFTER THIS */
+ struct extent_io_tree *tree;
+ wait_queue_head_t wq;
+ atomic_t refs;
+ unsigned long state;
+ u64 split_start;
+ u64 split_end;
+
+ /* for use by the FS */
+ u64 private;
+
+ struct list_head leak_list;
+};
+
+struct extent_buffer {
+ u64 start;
+ unsigned long len;
+ char *map_token;
+ char *kaddr;
+ unsigned long map_start;
+ unsigned long map_len;
+ struct page *first_page;
+ unsigned long bflags;
+ atomic_t refs;
+ struct list_head leak_list;
+ struct rb_node rb_node;
+
+ /* the spinlock is used to protect most operations */
+ spinlock_t lock;
+
+ /*
+ * when we keep the lock held while blocking, waiters go onto
+ * the wq
+ */
+ wait_queue_head_t lock_wq;
+};
+
+struct extent_map_tree;
+
+static inline struct extent_state *extent_state_next(struct extent_state *state)
+{
+ struct rb_node *node;
+ node = rb_next(&state->rb_node);
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct extent_state, rb_node);
+}
+
+typedef struct extent_map *(get_extent_t)(struct inode *inode,
+ struct page *page,
+ size_t page_offset,
+ u64 start, u64 len,
+ int create);
+
+void extent_io_tree_init(struct extent_io_tree *tree,
+ struct address_space *mapping, gfp_t mask);
+int try_release_extent_mapping(struct extent_map_tree *map,
+ struct extent_io_tree *tree, struct page *page,
+ gfp_t mask);
+int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
+int try_release_extent_state(struct extent_map_tree *map,
+ struct extent_io_tree *tree, struct page *page,
+ gfp_t mask);
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, struct extent_state **cached, gfp_t mask);
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
+ get_extent_t *get_extent);
+int __init extent_io_init(void);
+void extent_io_exit(void);
+
+u64 count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end,
+ u64 max_bytes, unsigned long bits);
+
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int filled, struct extent_state *cached_state);
+int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, gfp_t mask);
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int wake, int delete, struct extent_state **cached,
+ gfp_t mask);
+int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, gfp_t mask);
+int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask);
+int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, int bits);
+struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
+ u64 start, int bits);
+int extent_invalidatepage(struct extent_io_tree *tree,
+ struct page *page, unsigned long offset);
+int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
+ get_extent_t *get_extent,
+ struct writeback_control *wbc);
+int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
+ u64 start, u64 end, get_extent_t *get_extent,
+ int mode);
+int extent_writepages(struct extent_io_tree *tree,
+ struct address_space *mapping,
+ get_extent_t *get_extent,
+ struct writeback_control *wbc);
+int extent_readpages(struct extent_io_tree *tree,
+ struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages,
+ get_extent_t get_extent);
+int extent_prepare_write(struct extent_io_tree *tree,
+ struct inode *inode, struct page *page,
+ unsigned from, unsigned to, get_extent_t *get_extent);
+int extent_commit_write(struct extent_io_tree *tree,
+ struct inode *inode, struct page *page,
+ unsigned from, unsigned to);
+sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
+ get_extent_t *get_extent);
+int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len, get_extent_t *get_extent);
+int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
+int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
+int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
+void set_page_extent_mapped(struct page *page);
+
+struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+ u64 start, unsigned long len,
+ struct page *page0,
+ gfp_t mask);
+struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
+ u64 start, unsigned long len,
+ gfp_t mask);
+void free_extent_buffer(struct extent_buffer *eb);
+int read_extent_buffer_pages(struct extent_io_tree *tree,
+ struct extent_buffer *eb, u64 start, int wait,
+ get_extent_t *get_extent, int mirror_num);
+
+static inline void extent_buffer_get(struct extent_buffer *eb)
+{
+ atomic_inc(&eb->refs);
+}
+
+int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+ unsigned long start,
+ unsigned long len);
+void read_extent_buffer(struct extent_buffer *eb, void *dst,
+ unsigned long start,
+ unsigned long len);
+void write_extent_buffer(struct extent_buffer *eb, const void *src,
+ unsigned long start, unsigned long len);
+void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len);
+void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+ unsigned long src_offset, unsigned long len);
+void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+ unsigned long src_offset, unsigned long len);
+void memset_extent_buffer(struct extent_buffer *eb, char c,
+ unsigned long start, unsigned long len);
+int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
+ struct extent_buffer *eb);
+int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
+int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
+int clear_extent_buffer_dirty(struct extent_io_tree *tree,
+ struct extent_buffer *eb);
+int set_extent_buffer_dirty(struct extent_io_tree *tree,
+ struct extent_buffer *eb);
+int test_extent_buffer_dirty(struct extent_io_tree *tree,
+ struct extent_buffer *eb);
+int set_extent_buffer_uptodate(struct extent_io_tree *tree,
+ struct extent_buffer *eb);
+int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
+ struct extent_buffer *eb);
+int extent_buffer_uptodate(struct extent_io_tree *tree,
+ struct extent_buffer *eb);
+int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
+ unsigned long min_len, char **token, char **map,
+ unsigned long *map_start,
+ unsigned long *map_len, int km);
+int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
+ unsigned long min_len, char **token, char **map,
+ unsigned long *map_start,
+ unsigned long *map_len, int km);
+void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
+int release_extent_buffer_tail_pages(struct extent_buffer *eb);
+int extent_range_uptodate(struct extent_io_tree *tree,
+ u64 start, u64 end);
+int extent_clear_unlock_delalloc(struct inode *inode,
+ struct extent_io_tree *tree,
+ u64 start, u64 end, struct page *locked_page,
+ unsigned long op);
+#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
new file mode 100644
index 00000000000..428fcac45f9
--- /dev/null
+++ b/fs/btrfs/extent_map.c
@@ -0,0 +1,419 @@
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include "extent_map.h"
+
+
+static struct kmem_cache *extent_map_cache;
+
+int __init extent_map_init(void)
+{
+ extent_map_cache = kmem_cache_create("extent_map",
+ sizeof(struct extent_map), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!extent_map_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void extent_map_exit(void)
+{
+ if (extent_map_cache)
+ kmem_cache_destroy(extent_map_cache);
+}
+
+/**
+ * extent_map_tree_init - initialize extent map tree
+ * @tree: tree to initialize
+ * @mask: flags for memory allocations during tree operations
+ *
+ * Initialize the extent tree @tree. Should be called for each new inode
+ * or other user of the extent_map interface.
+ */
+void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
+{
+ tree->map.rb_node = NULL;
+ rwlock_init(&tree->lock);
+}
+
+/**
+ * alloc_extent_map - allocate new extent map structure
+ * @mask: memory allocation flags
+ *
+ * Allocate a new extent_map structure. The new structure is
+ * returned with a reference count of one and needs to be
+ * freed using free_extent_map()
+ */
+struct extent_map *alloc_extent_map(gfp_t mask)
+{
+ struct extent_map *em;
+ em = kmem_cache_alloc(extent_map_cache, mask);
+ if (!em || IS_ERR(em))
+ return em;
+ em->in_tree = 0;
+ em->flags = 0;
+ atomic_set(&em->refs, 1);
+ return em;
+}
+
+/**
+ * free_extent_map - drop reference count of an extent_map
+ * @em: extent map beeing releasead
+ *
+ * Drops the reference out on @em by one and free the structure
+ * if the reference count hits zero.
+ */
+void free_extent_map(struct extent_map *em)
+{
+ if (!em)
+ return;
+ WARN_ON(atomic_read(&em->refs) == 0);
+ if (atomic_dec_and_test(&em->refs)) {
+ WARN_ON(em->in_tree);
+ kmem_cache_free(extent_map_cache, em);
+ }
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct extent_map *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct extent_map, rb_node);
+
+ WARN_ON(!entry->in_tree);
+
+ if (offset < entry->start)
+ p = &(*p)->rb_left;
+ else if (offset >= extent_map_end(entry))
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ entry = rb_entry(node, struct extent_map, rb_node);
+ entry->in_tree = 1;
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+/*
+ * search through the tree for an extent_map with a given offset. If
+ * it can't be found, try to find some neighboring extents
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
+ struct rb_node **prev_ret,
+ struct rb_node **next_ret)
+{
+ struct rb_node *n = root->rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *orig_prev = NULL;
+ struct extent_map *entry;
+ struct extent_map *prev_entry = NULL;
+
+ while (n) {
+ entry = rb_entry(n, struct extent_map, rb_node);
+ prev = n;
+ prev_entry = entry;
+
+ WARN_ON(!entry->in_tree);
+
+ if (offset < entry->start)
+ n = n->rb_left;
+ else if (offset >= extent_map_end(entry))
+ n = n->rb_right;
+ else
+ return n;
+ }
+
+ if (prev_ret) {
+ orig_prev = prev;
+ while (prev && offset >= extent_map_end(prev_entry)) {
+ prev = rb_next(prev);
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ }
+ *prev_ret = prev;
+ prev = orig_prev;
+ }
+
+ if (next_ret) {
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ while (prev && offset < prev_entry->start) {
+ prev = rb_prev(prev);
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ }
+ *next_ret = prev;
+ }
+ return NULL;
+}
+
+/* check to see if two extent_map structs are adjacent and safe to merge */
+static int mergable_maps(struct extent_map *prev, struct extent_map *next)
+{
+ if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
+ return 0;
+
+ /*
+ * don't merge compressed extents, we need to know their
+ * actual size
+ */
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
+ return 0;
+
+ if (extent_map_end(prev) == next->start &&
+ prev->flags == next->flags &&
+ prev->bdev == next->bdev &&
+ ((next->block_start == EXTENT_MAP_HOLE &&
+ prev->block_start == EXTENT_MAP_HOLE) ||
+ (next->block_start == EXTENT_MAP_INLINE &&
+ prev->block_start == EXTENT_MAP_INLINE) ||
+ (next->block_start == EXTENT_MAP_DELALLOC &&
+ prev->block_start == EXTENT_MAP_DELALLOC) ||
+ (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
+ next->block_start == extent_map_block_end(prev)))) {
+ return 1;
+ }
+ return 0;
+}
+
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+{
+ int ret = 0;
+ struct extent_map *merge = NULL;
+ struct rb_node *rb;
+ struct extent_map *em;
+
+ write_lock(&tree->lock);
+ em = lookup_extent_mapping(tree, start, len);
+
+ WARN_ON(!em || em->start != start);
+
+ if (!em)
+ goto out;
+
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ if (em->start != 0) {
+ rb = rb_prev(&em->rb_node);
+ if (rb)
+ merge = rb_entry(rb, struct extent_map, rb_node);
+ if (rb && mergable_maps(merge, em)) {
+ em->start = merge->start;
+ em->len += merge->len;
+ em->block_len += merge->block_len;
+ em->block_start = merge->block_start;
+ merge->in_tree = 0;
+ rb_erase(&merge->rb_node, &tree->map);
+ free_extent_map(merge);
+ }
+ }
+
+ rb = rb_next(&em->rb_node);
+ if (rb)
+ merge = rb_entry(rb, struct extent_map, rb_node);
+ if (rb && mergable_maps(em, merge)) {
+ em->len += merge->len;
+ em->block_len += merge->len;
+ rb_erase(&merge->rb_node, &tree->map);
+ merge->in_tree = 0;
+ free_extent_map(merge);
+ }
+
+ free_extent_map(em);
+out:
+ write_unlock(&tree->lock);
+ return ret;
+
+}
+
+/**
+ * add_extent_mapping - add new extent map to the extent tree
+ * @tree: tree to insert new map in
+ * @em: map to insert
+ *
+ * Insert @em into @tree or perform a simple forward/backward merge with
+ * existing mappings. The extent_map struct passed in will be inserted
+ * into the tree directly, with an additional reference taken, or a
+ * reference dropped if the merge attempt was successfull.
+ */
+int add_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em)
+{
+ int ret = 0;
+ struct extent_map *merge = NULL;
+ struct rb_node *rb;
+ struct extent_map *exist;
+
+ exist = lookup_extent_mapping(tree, em->start, em->len);
+ if (exist) {
+ free_extent_map(exist);
+ ret = -EEXIST;
+ goto out;
+ }
+ rb = tree_insert(&tree->map, em->start, &em->rb_node);
+ if (rb) {
+ ret = -EEXIST;
+ goto out;
+ }
+ atomic_inc(&em->refs);
+ if (em->start != 0) {
+ rb = rb_prev(&em->rb_node);
+ if (rb)
+ merge = rb_entry(rb, struct extent_map, rb_node);
+ if (rb && mergable_maps(merge, em)) {
+ em->start = merge->start;
+ em->len += merge->len;
+ em->block_len += merge->block_len;
+ em->block_start = merge->block_start;
+ merge->in_tree = 0;
+ rb_erase(&merge->rb_node, &tree->map);
+ free_extent_map(merge);
+ }
+ }
+ rb = rb_next(&em->rb_node);
+ if (rb)
+ merge = rb_entry(rb, struct extent_map, rb_node);
+ if (rb && mergable_maps(em, merge)) {
+ em->len += merge->len;
+ em->block_len += merge->len;
+ rb_erase(&merge->rb_node, &tree->map);
+ merge->in_tree = 0;
+ free_extent_map(merge);
+ }
+out:
+ return ret;
+}
+
+/* simple helper to do math around the end of an extent, handling wrap */
+static u64 range_end(u64 start, u64 len)
+{
+ if (start + len < start)
+ return (u64)-1;
+ return start + len;
+}
+
+/**
+ * lookup_extent_mapping - lookup extent_map
+ * @tree: tree to lookup in
+ * @start: byte offset to start the search
+ * @len: length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range. There may be additional objects in the tree that
+ * intersect, so check the object returned carefully to make sure that no
+ * additional lookups are needed.
+ */
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len)
+{
+ struct extent_map *em;
+ struct rb_node *rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *next = NULL;
+ u64 end = range_end(start, len);
+
+ rb_node = __tree_search(&tree->map, start, &prev, &next);
+ if (!rb_node && prev) {
+ em = rb_entry(prev, struct extent_map, rb_node);
+ if (end > em->start && start < extent_map_end(em))
+ goto found;
+ }
+ if (!rb_node && next) {
+ em = rb_entry(next, struct extent_map, rb_node);
+ if (end > em->start && start < extent_map_end(em))
+ goto found;
+ }
+ if (!rb_node) {
+ em = NULL;
+ goto out;
+ }
+ if (IS_ERR(rb_node)) {
+ em = ERR_PTR(PTR_ERR(rb_node));
+ goto out;
+ }
+ em = rb_entry(rb_node, struct extent_map, rb_node);
+ if (end > em->start && start < extent_map_end(em))
+ goto found;
+
+ em = NULL;
+ goto out;
+
+found:
+ atomic_inc(&em->refs);
+out:
+ return em;
+}
+
+/**
+ * search_extent_mapping - find a nearby extent map
+ * @tree: tree to lookup in
+ * @start: byte offset to start the search
+ * @len: length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range.
+ *
+ * If one can't be found, any nearby extent may be returned
+ */
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len)
+{
+ struct extent_map *em;
+ struct rb_node *rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *next = NULL;
+
+ rb_node = __tree_search(&tree->map, start, &prev, &next);
+ if (!rb_node && prev) {
+ em = rb_entry(prev, struct extent_map, rb_node);
+ goto found;
+ }
+ if (!rb_node && next) {
+ em = rb_entry(next, struct extent_map, rb_node);
+ goto found;
+ }
+ if (!rb_node) {
+ em = NULL;
+ goto out;
+ }
+ if (IS_ERR(rb_node)) {
+ em = ERR_PTR(PTR_ERR(rb_node));
+ goto out;
+ }
+ em = rb_entry(rb_node, struct extent_map, rb_node);
+ goto found;
+
+ em = NULL;
+ goto out;
+
+found:
+ atomic_inc(&em->refs);
+out:
+ return em;
+}
+
+/**
+ * remove_extent_mapping - removes an extent_map from the extent tree
+ * @tree: extent tree to remove from
+ * @em: extent map beeing removed
+ *
+ * Removes @em from @tree. No reference counts are dropped, and no checks
+ * are done to see if the range is in use
+ */
+int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+{
+ int ret = 0;
+
+ WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+ rb_erase(&em->rb_node, &tree->map);
+ em->in_tree = 0;
+ return ret;
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
new file mode 100644
index 00000000000..ab6d74b6e64
--- /dev/null
+++ b/fs/btrfs/extent_map.h
@@ -0,0 +1,65 @@
+#ifndef __EXTENTMAP__
+#define __EXTENTMAP__
+
+#include <linux/rbtree.h>
+
+#define EXTENT_MAP_LAST_BYTE (u64)-4
+#define EXTENT_MAP_HOLE (u64)-3
+#define EXTENT_MAP_INLINE (u64)-2
+#define EXTENT_MAP_DELALLOC (u64)-1
+
+/* bits for the flags field */
+#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
+#define EXTENT_FLAG_COMPRESSED 1
+#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
+#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
+
+struct extent_map {
+ struct rb_node rb_node;
+
+ /* all of these are in bytes */
+ u64 start;
+ u64 len;
+ u64 orig_start;
+ u64 block_start;
+ u64 block_len;
+ unsigned long flags;
+ struct block_device *bdev;
+ atomic_t refs;
+ int in_tree;
+};
+
+struct extent_map_tree {
+ struct rb_root map;
+ rwlock_t lock;
+};
+
+static inline u64 extent_map_end(struct extent_map *em)
+{
+ if (em->start + em->len < em->start)
+ return (u64)-1;
+ return em->start + em->len;
+}
+
+static inline u64 extent_map_block_end(struct extent_map *em)
+{
+ if (em->block_start + em->block_len < em->block_start)
+ return (u64)-1;
+ return em->block_start + em->block_len;
+}
+
+void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len);
+int add_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em);
+int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+
+struct extent_map *alloc_extent_map(gfp_t mask);
+void free_extent_map(struct extent_map *em);
+int __init extent_map_init(void);
+void extent_map_exit(void);
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len);
+#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
new file mode 100644
index 00000000000..9b99886562d
--- /dev/null
+++ b/fs/btrfs/file-item.c
@@ -0,0 +1,834 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+
+#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+ sizeof(struct btrfs_item) * 2) / \
+ size) - 1))
+
+#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
+ sizeof(struct btrfs_ordered_sum)) / \
+ sizeof(struct btrfs_sector_sum) * \
+ (r)->sectorsize - (r)->sectorsize)
+
+int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 objectid, u64 pos,
+ u64 disk_offset, u64 disk_num_bytes,
+ u64 num_bytes, u64 offset, u64 ram_bytes,
+ u8 compression, u8 encryption, u16 other_encoding)
+{
+ int ret = 0;
+ struct btrfs_file_extent_item *item;
+ struct btrfs_key file_key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ file_key.objectid = objectid;
+ file_key.offset = pos;
+ btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, root, path, &file_key,
+ sizeof(*item));
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret);
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
+ btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
+ btrfs_set_file_extent_offset(leaf, item, offset);
+ btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
+ btrfs_set_file_extent_generation(leaf, item, trans->transid);
+ btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_compression(leaf, item, compression);
+ btrfs_set_file_extent_encryption(leaf, item, encryption);
+ btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
+
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, int cow)
+{
+ int ret;
+ struct btrfs_key file_key;
+ struct btrfs_key found_key;
+ struct btrfs_csum_item *item;
+ struct extent_buffer *leaf;
+ u64 csum_offset = 0;
+ u16 csum_size =
+ btrfs_super_csum_size(&root->fs_info->super_copy);
+ int csums_in_item;
+
+ file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ file_key.offset = bytenr;
+ btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
+ ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
+ if (ret < 0)
+ goto fail;
+ leaf = path->nodes[0];
+ if (ret > 0) {
+ ret = 1;
+ if (path->slots[0] == 0)
+ goto fail;
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY)
+ goto fail;
+
+ csum_offset = (bytenr - found_key.offset) >>
+ root->fs_info->sb->s_blocksize_bits;
+ csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
+ csums_in_item /= csum_size;
+
+ if (csum_offset >= csums_in_item) {
+ ret = -EFBIG;
+ goto fail;
+ }
+ }
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+ item = (struct btrfs_csum_item *)((unsigned char *)item +
+ csum_offset * csum_size);
+ return item;
+fail:
+ if (ret > 0)
+ ret = -ENOENT;
+ return ERR_PTR(ret);
+}
+
+
+int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ u64 offset, int mod)
+{
+ int ret;
+ struct btrfs_key file_key;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+
+ file_key.objectid = objectid;
+ file_key.offset = offset;
+ btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+ ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
+ return ret;
+}
+
+
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u32 *dst)
+{
+ u32 sum;
+ struct bio_vec *bvec = bio->bi_io_vec;
+ int bio_index = 0;
+ u64 offset;
+ u64 item_start_offset = 0;
+ u64 item_last_offset = 0;
+ u64 disk_bytenr;
+ u32 diff;
+ u16 csum_size =
+ btrfs_super_csum_size(&root->fs_info->super_copy);
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_csum_item *item = NULL;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+
+ path = btrfs_alloc_path();
+ if (bio->bi_size > PAGE_CACHE_SIZE * 8)
+ path->reada = 2;
+
+ WARN_ON(bio->bi_vcnt <= 0);
+
+ disk_bytenr = (u64)bio->bi_sector << 9;
+ while (bio_index < bio->bi_vcnt) {
+ offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+ ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
+ if (ret == 0)
+ goto found;
+
+ if (!item || disk_bytenr < item_start_offset ||
+ disk_bytenr >= item_last_offset) {
+ struct btrfs_key found_key;
+ u32 item_size;
+
+ if (item)
+ btrfs_release_path(root, path);
+ item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
+ path, disk_bytenr, 0);
+ if (IS_ERR(item)) {
+ ret = PTR_ERR(item);
+ if (ret == -ENOENT || ret == -EFBIG)
+ ret = 0;
+ sum = 0;
+ if (BTRFS_I(inode)->root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ set_extent_bits(io_tree, offset,
+ offset + bvec->bv_len - 1,
+ EXTENT_NODATASUM, GFP_NOFS);
+ } else {
+ printk(KERN_INFO "btrfs no csum found "
+ "for inode %lu start %llu\n",
+ inode->i_ino,
+ (unsigned long long)offset);
+ }
+ item = NULL;
+ btrfs_release_path(root, path);
+ goto found;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+
+ item_start_offset = found_key.offset;
+ item_size = btrfs_item_size_nr(path->nodes[0],
+ path->slots[0]);
+ item_last_offset = item_start_offset +
+ (item_size / csum_size) *
+ root->sectorsize;
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_csum_item);
+ }
+ /*
+ * this byte range must be able to fit inside
+ * a single leaf so it will also fit inside a u32
+ */
+ diff = disk_bytenr - item_start_offset;
+ diff = diff / root->sectorsize;
+ diff = diff * csum_size;
+
+ read_extent_buffer(path->nodes[0], &sum,
+ ((unsigned long)item) + diff,
+ csum_size);
+found:
+ if (dst)
+ *dst++ = sum;
+ else
+ set_state_private(io_tree, offset, sum);
+ disk_bytenr += bvec->bv_len;
+ bio_index++;
+ bvec++;
+ }
+ btrfs_free_path(path);
+ return 0;
+}
+
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+ struct list_head *list)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_sector_sum *sector_sum;
+ struct btrfs_csum_item *item;
+ unsigned long offset;
+ int ret;
+ size_t size;
+ u64 csum_end;
+ u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key.offset = start;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto fail;
+ if (ret > 0 && path->slots[0] > 0) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+ if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
+ key.type == BTRFS_EXTENT_CSUM_KEY) {
+ offset = (start - key.offset) >>
+ root->fs_info->sb->s_blocksize_bits;
+ if (offset * csum_size <
+ btrfs_item_size_nr(leaf, path->slots[0] - 1))
+ path->slots[0]--;
+ }
+ }
+
+ while (start <= end) {
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto fail;
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ key.type != BTRFS_EXTENT_CSUM_KEY)
+ break;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.offset > end)
+ break;
+
+ if (key.offset > start)
+ start = key.offset;
+
+ size = btrfs_item_size_nr(leaf, path->slots[0]);
+ csum_end = key.offset + (size / csum_size) * root->sectorsize;
+ if (csum_end <= start) {
+ path->slots[0]++;
+ continue;
+ }
+
+ csum_end = min(csum_end, end + 1);
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_csum_item);
+ while (start < csum_end) {
+ size = min_t(size_t, csum_end - start,
+ MAX_ORDERED_SUM_BYTES(root));
+ sums = kzalloc(btrfs_ordered_sum_size(root, size),
+ GFP_NOFS);
+ BUG_ON(!sums);
+
+ sector_sum = sums->sums;
+ sums->bytenr = start;
+ sums->len = size;
+
+ offset = (start - key.offset) >>
+ root->fs_info->sb->s_blocksize_bits;
+ offset *= csum_size;
+
+ while (size > 0) {
+ read_extent_buffer(path->nodes[0],
+ &sector_sum->sum,
+ ((unsigned long)item) +
+ offset, csum_size);
+ sector_sum->bytenr = start;
+
+ size -= root->sectorsize;
+ start += root->sectorsize;
+ offset += csum_size;
+ sector_sum++;
+ }
+ list_add_tail(&sums->list, list);
+ }
+ path->slots[0]++;
+ }
+ ret = 0;
+fail:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u64 file_start, int contig)
+{
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_sector_sum *sector_sum;
+ struct btrfs_ordered_extent *ordered;
+ char *data;
+ struct bio_vec *bvec = bio->bi_io_vec;
+ int bio_index = 0;
+ unsigned long total_bytes = 0;
+ unsigned long this_sum_bytes = 0;
+ u64 offset;
+ u64 disk_bytenr;
+
+ WARN_ON(bio->bi_vcnt <= 0);
+ sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
+ if (!sums)
+ return -ENOMEM;
+
+ sector_sum = sums->sums;
+ disk_bytenr = (u64)bio->bi_sector << 9;
+ sums->len = bio->bi_size;
+ INIT_LIST_HEAD(&sums->list);
+
+ if (contig)
+ offset = file_start;
+ else
+ offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+
+ ordered = btrfs_lookup_ordered_extent(inode, offset);
+ BUG_ON(!ordered);
+ sums->bytenr = ordered->start;
+
+ while (bio_index < bio->bi_vcnt) {
+ if (!contig)
+ offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+
+ if (!contig && (offset >= ordered->file_offset + ordered->len ||
+ offset < ordered->file_offset)) {
+ unsigned long bytes_left;
+ sums->len = this_sum_bytes;
+ this_sum_bytes = 0;
+ btrfs_add_ordered_sum(inode, ordered, sums);
+ btrfs_put_ordered_extent(ordered);
+
+ bytes_left = bio->bi_size - total_bytes;
+
+ sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
+ GFP_NOFS);
+ BUG_ON(!sums);
+ sector_sum = sums->sums;
+ sums->len = bytes_left;
+ ordered = btrfs_lookup_ordered_extent(inode, offset);
+ BUG_ON(!ordered);
+ sums->bytenr = ordered->start;
+ }
+
+ data = kmap_atomic(bvec->bv_page, KM_USER0);
+ sector_sum->sum = ~(u32)0;
+ sector_sum->sum = btrfs_csum_data(root,
+ data + bvec->bv_offset,
+ sector_sum->sum,
+ bvec->bv_len);
+ kunmap_atomic(data, KM_USER0);
+ btrfs_csum_final(sector_sum->sum,
+ (char *)&sector_sum->sum);
+ sector_sum->bytenr = disk_bytenr;
+
+ sector_sum++;
+ bio_index++;
+ total_bytes += bvec->bv_len;
+ this_sum_bytes += bvec->bv_len;
+ disk_bytenr += bvec->bv_len;
+ offset += bvec->bv_len;
+ bvec++;
+ }
+ this_sum_bytes = 0;
+ btrfs_add_ordered_sum(inode, ordered, sums);
+ btrfs_put_ordered_extent(ordered);
+ return 0;
+}
+
+/*
+ * helper function for csum removal, this expects the
+ * key to describe the csum pointed to by the path, and it expects
+ * the csum to overlap the range [bytenr, len]
+ *
+ * The csum should not be entirely contained in the range and the
+ * range should not be entirely contained in the csum.
+ *
+ * This calls btrfs_truncate_item with the correct args based on the
+ * overlap, and fixes up the key as required.
+ */
+static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ u64 bytenr, u64 len)
+{
+ struct extent_buffer *leaf;
+ u16 csum_size =
+ btrfs_super_csum_size(&root->fs_info->super_copy);
+ u64 csum_end;
+ u64 end_byte = bytenr + len;
+ u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
+ int ret;
+
+ leaf = path->nodes[0];
+ csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+ csum_end <<= root->fs_info->sb->s_blocksize_bits;
+ csum_end += key->offset;
+
+ if (key->offset < bytenr && csum_end <= end_byte) {
+ /*
+ * [ bytenr - len ]
+ * [ ]
+ * [csum ]
+ * A simple truncate off the end of the item
+ */
+ u32 new_size = (bytenr - key->offset) >> blocksize_bits;
+ new_size *= csum_size;
+ ret = btrfs_truncate_item(trans, root, path, new_size, 1);
+ BUG_ON(ret);
+ } else if (key->offset >= bytenr && csum_end > end_byte &&
+ end_byte > key->offset) {
+ /*
+ * [ bytenr - len ]
+ * [ ]
+ * [csum ]
+ * we need to truncate from the beginning of the csum
+ */
+ u32 new_size = (csum_end - end_byte) >> blocksize_bits;
+ new_size *= csum_size;
+
+ ret = btrfs_truncate_item(trans, root, path, new_size, 0);
+ BUG_ON(ret);
+
+ key->offset = end_byte;
+ ret = btrfs_set_item_key_safe(trans, root, path, key);
+ BUG_ON(ret);
+ } else {
+ BUG();
+ }
+ return 0;
+}
+
+/*
+ * deletes the csum items from the csum tree for a given
+ * range of bytes.
+ */
+int btrfs_del_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr, u64 len)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 end_byte = bytenr + len;
+ u64 csum_end;
+ struct extent_buffer *leaf;
+ int ret;
+ u16 csum_size =
+ btrfs_super_csum_size(&root->fs_info->super_copy);
+ int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
+
+ root = root->fs_info->csum_root;
+
+ path = btrfs_alloc_path();
+
+ while (1) {
+ key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key.offset = end_byte - 1;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ }
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ key.type != BTRFS_EXTENT_CSUM_KEY) {
+ break;
+ }
+
+ if (key.offset >= end_byte)
+ break;
+
+ csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+ csum_end <<= blocksize_bits;
+ csum_end += key.offset;
+
+ /* this csum ends before we start, we're done */
+ if (csum_end <= bytenr)
+ break;
+
+ /* delete the entire item, it is inside our range */
+ if (key.offset >= bytenr && csum_end <= end_byte) {
+ ret = btrfs_del_item(trans, root, path);
+ BUG_ON(ret);
+ if (key.offset == bytenr)
+ break;
+ } else if (key.offset < bytenr && csum_end > end_byte) {
+ unsigned long offset;
+ unsigned long shift_len;
+ unsigned long item_offset;
+ /*
+ * [ bytenr - len ]
+ * [csum ]
+ *
+ * Our bytes are in the middle of the csum,
+ * we need to split this item and insert a new one.
+ *
+ * But we can't drop the path because the
+ * csum could change, get removed, extended etc.
+ *
+ * The trick here is the max size of a csum item leaves
+ * enough room in the tree block for a single
+ * item header. So, we split the item in place,
+ * adding a new header pointing to the existing
+ * bytes. Then we loop around again and we have
+ * a nicely formed csum item that we can neatly
+ * truncate.
+ */
+ offset = (bytenr - key.offset) >> blocksize_bits;
+ offset *= csum_size;
+
+ shift_len = (len >> blocksize_bits) * csum_size;
+
+ item_offset = btrfs_item_ptr_offset(leaf,
+ path->slots[0]);
+
+ memset_extent_buffer(leaf, 0, item_offset + offset,
+ shift_len);
+ key.offset = bytenr;
+
+ /*
+ * btrfs_split_item returns -EAGAIN when the
+ * item changed size or key
+ */
+ ret = btrfs_split_item(trans, root, path, &key, offset);
+ BUG_ON(ret && ret != -EAGAIN);
+
+ key.offset = end_byte - 1;
+ } else {
+ ret = truncate_one_csum(trans, root, path,
+ &key, bytenr, len);
+ BUG_ON(ret);
+ if (key.offset < bytenr)
+ break;
+ }
+ btrfs_release_path(root, path);
+ }
+out:
+ btrfs_free_path(path);
+ return 0;
+}
+
+int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_ordered_sum *sums)
+{
+ u64 bytenr;
+ int ret;
+ struct btrfs_key file_key;
+ struct btrfs_key found_key;
+ u64 next_offset;
+ u64 total_bytes = 0;
+ int found_next;
+ struct btrfs_path *path;
+ struct btrfs_csum_item *item;
+ struct btrfs_csum_item *item_end;
+ struct extent_buffer *leaf = NULL;
+ u64 csum_offset;
+ struct btrfs_sector_sum *sector_sum;
+ u32 nritems;
+ u32 ins_size;
+ char *eb_map;
+ char *eb_token;
+ unsigned long map_len;
+ unsigned long map_start;
+ u16 csum_size =
+ btrfs_super_csum_size(&root->fs_info->super_copy);
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ sector_sum = sums->sums;
+again:
+ next_offset = (u64)-1;
+ found_next = 0;
+ file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ file_key.offset = sector_sum->bytenr;
+ bytenr = sector_sum->bytenr;
+ btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
+
+ item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
+ if (!IS_ERR(item)) {
+ leaf = path->nodes[0];
+ ret = 0;
+ goto found;
+ }
+ ret = PTR_ERR(item);
+ if (ret == -EFBIG) {
+ u32 item_size;
+ /* we found one, but it isn't big enough yet */
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ if ((item_size / csum_size) >=
+ MAX_CSUM_ITEMS(root, csum_size)) {
+ /* already at max size, make a new one */
+ goto insert;
+ }
+ } else {
+ int slot = path->slots[0] + 1;
+ /* we didn't find a csum item, insert one */
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (path->slots[0] >= nritems - 1) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 1)
+ found_next = 1;
+ if (ret != 0)
+ goto insert;
+ slot = 0;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
+ if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ found_key.type != BTRFS_EXTENT_CSUM_KEY) {
+ found_next = 1;
+ goto insert;
+ }
+ next_offset = found_key.offset;
+ found_next = 1;
+ goto insert;
+ }
+
+ /*
+ * at this point, we know the tree has an item, but it isn't big
+ * enough yet to put our csum in. Grow it
+ */
+ btrfs_release_path(root, path);
+ ret = btrfs_search_slot(trans, root, &file_key, path,
+ csum_size, 1);
+ if (ret < 0)
+ goto fail_unlock;
+
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ goto insert;
+ path->slots[0]--;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ csum_offset = (bytenr - found_key.offset) >>
+ root->fs_info->sb->s_blocksize_bits;
+
+ if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY ||
+ found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
+ goto insert;
+ }
+
+ if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
+ csum_size) {
+ u32 diff = (csum_offset + 1) * csum_size;
+
+ /*
+ * is the item big enough already? we dropped our lock
+ * before and need to recheck
+ */
+ if (diff < btrfs_item_size_nr(leaf, path->slots[0]))
+ goto csum;
+
+ diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
+ if (diff != csum_size)
+ goto insert;
+
+ ret = btrfs_extend_item(trans, root, path, diff);
+ BUG_ON(ret);
+ goto csum;
+ }
+
+insert:
+ btrfs_release_path(root, path);
+ csum_offset = 0;
+ if (found_next) {
+ u64 tmp = total_bytes + root->sectorsize;
+ u64 next_sector = sector_sum->bytenr;
+ struct btrfs_sector_sum *next = sector_sum + 1;
+
+ while (tmp < sums->len) {
+ if (next_sector + root->sectorsize != next->bytenr)
+ break;
+ tmp += root->sectorsize;
+ next_sector = next->bytenr;
+ next++;
+ }
+ tmp = min(tmp, next_offset - file_key.offset);
+ tmp >>= root->fs_info->sb->s_blocksize_bits;
+ tmp = max((u64)1, tmp);
+ tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
+ ins_size = csum_size * tmp;
+ } else {
+ ins_size = csum_size;
+ }
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, root, path, &file_key,
+ ins_size);
+ path->leave_spinning = 0;
+ if (ret < 0)
+ goto fail_unlock;
+ if (ret != 0) {
+ WARN_ON(1);
+ goto fail_unlock;
+ }
+csum:
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+ ret = 0;
+ item = (struct btrfs_csum_item *)((unsigned char *)item +
+ csum_offset * csum_size);
+found:
+ item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+ item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
+ btrfs_item_size_nr(leaf, path->slots[0]));
+ eb_token = NULL;
+next_sector:
+
+ if (!eb_token ||
+ (unsigned long)item + csum_size >= map_start + map_len) {
+ int err;
+
+ if (eb_token)
+ unmap_extent_buffer(leaf, eb_token, KM_USER1);
+ eb_token = NULL;
+ err = map_private_extent_buffer(leaf, (unsigned long)item,
+ csum_size,
+ &eb_token, &eb_map,
+ &map_start, &map_len, KM_USER1);
+ if (err)
+ eb_token = NULL;
+ }
+ if (eb_token) {
+ memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
+ &sector_sum->sum, csum_size);
+ } else {
+ write_extent_buffer(leaf, &sector_sum->sum,
+ (unsigned long)item, csum_size);
+ }
+
+ total_bytes += root->sectorsize;
+ sector_sum++;
+ if (total_bytes < sums->len) {
+ item = (struct btrfs_csum_item *)((char *)item +
+ csum_size);
+ if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
+ sector_sum->bytenr) {
+ bytenr = sector_sum->bytenr;
+ goto next_sector;
+ }
+ }
+ if (eb_token) {
+ unmap_extent_buffer(leaf, eb_token, KM_USER1);
+ eb_token = NULL;
+ }
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ if (total_bytes < sums->len) {
+ btrfs_release_path(root, path);
+ cond_resched();
+ goto again;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+
+fail_unlock:
+ goto out;
+}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
new file mode 100644
index 00000000000..6ed434ac037
--- /dev/null
+++ b/fs/btrfs/file.c
@@ -0,0 +1,1167 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "ioctl.h"
+#include "print-tree.h"
+#include "tree-log.h"
+#include "locking.h"
+#include "compat.h"
+
+
+/* simple helper to fault in pages and copy. This should go away
+ * and be replaced with calls into generic code.
+ */
+static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
+ int write_bytes,
+ struct page **prepared_pages,
+ const char __user *buf)
+{
+ long page_fault = 0;
+ int i;
+ int offset = pos & (PAGE_CACHE_SIZE - 1);
+
+ for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
+ size_t count = min_t(size_t,
+ PAGE_CACHE_SIZE - offset, write_bytes);
+ struct page *page = prepared_pages[i];
+ fault_in_pages_readable(buf, count);
+
+ /* Copy data from userspace to the current page */
+ kmap(page);
+ page_fault = __copy_from_user(page_address(page) + offset,
+ buf, count);
+ /* Flush processor's dcache for this page */
+ flush_dcache_page(page);
+ kunmap(page);
+ buf += count;
+ write_bytes -= count;
+
+ if (page_fault)
+ break;
+ }
+ return page_fault ? -EFAULT : 0;
+}
+
+/*
+ * unlocks pages after btrfs_file_write is done with them
+ */
+static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
+{
+ size_t i;
+ for (i = 0; i < num_pages; i++) {
+ if (!pages[i])
+ break;
+ /* page checked is some magic around finding pages that
+ * have been modified without going through btrfs_set_page_dirty
+ * clear it here
+ */
+ ClearPageChecked(pages[i]);
+ unlock_page(pages[i]);
+ mark_page_accessed(pages[i]);
+ page_cache_release(pages[i]);
+ }
+}
+
+/*
+ * after copy_from_user, pages need to be dirtied and we need to make
+ * sure holes are created between the current EOF and the start of
+ * any next extents (if required).
+ *
+ * this also makes the decision about creating an inline extent vs
+ * doing real data extents, marking pages dirty and delalloc as required.
+ */
+static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct file *file,
+ struct page **pages,
+ size_t num_pages,
+ loff_t pos,
+ size_t write_bytes)
+{
+ int err = 0;
+ int i;
+ struct inode *inode = fdentry(file)->d_inode;
+ u64 num_bytes;
+ u64 start_pos;
+ u64 end_of_last_block;
+ u64 end_pos = pos + write_bytes;
+ loff_t isize = i_size_read(inode);
+
+ start_pos = pos & ~((u64)root->sectorsize - 1);
+ num_bytes = (write_bytes + pos - start_pos +
+ root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+ end_of_last_block = start_pos + num_bytes - 1;
+ err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+ if (err)
+ return err;
+
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = pages[i];
+ SetPageUptodate(p);
+ ClearPageChecked(p);
+ set_page_dirty(p);
+ }
+ if (end_pos > isize) {
+ i_size_write(inode, end_pos);
+ /* we've only changed i_size in ram, and we haven't updated
+ * the disk i_size. There is no need to log the inode
+ * at this time.
+ */
+ }
+ return err;
+}
+
+/*
+ * this drops all the extents in the cache that intersect the range
+ * [start, end]. Existing extents are split as required.
+ */
+int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ int skip_pinned)
+{
+ struct extent_map *em;
+ struct extent_map *split = NULL;
+ struct extent_map *split2 = NULL;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ u64 len = end - start + 1;
+ int ret;
+ int testend = 1;
+ unsigned long flags;
+ int compressed = 0;
+
+ WARN_ON(end < start);
+ if (end == (u64)-1) {
+ len = (u64)-1;
+ testend = 0;
+ }
+ while (1) {
+ if (!split)
+ split = alloc_extent_map(GFP_NOFS);
+ if (!split2)
+ split2 = alloc_extent_map(GFP_NOFS);
+
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (!em) {
+ write_unlock(&em_tree->lock);
+ break;
+ }
+ flags = em->flags;
+ if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+ if (testend && em->start + em->len >= start + len) {
+ free_extent_map(em);
+ write_unlock(&em_tree->lock);
+ break;
+ }
+ start = em->start + em->len;
+ if (testend)
+ len = start + len - (em->start + em->len);
+ free_extent_map(em);
+ write_unlock(&em_tree->lock);
+ continue;
+ }
+ compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ remove_extent_mapping(em_tree, em);
+
+ if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+ em->start < start) {
+ split->start = em->start;
+ split->len = start - em->start;
+ split->orig_start = em->orig_start;
+ split->block_start = em->block_start;
+
+ if (compressed)
+ split->block_len = em->block_len;
+ else
+ split->block_len = split->len;
+
+ split->bdev = em->bdev;
+ split->flags = flags;
+ ret = add_extent_mapping(em_tree, split);
+ BUG_ON(ret);
+ free_extent_map(split);
+ split = split2;
+ split2 = NULL;
+ }
+ if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+ testend && em->start + em->len > start + len) {
+ u64 diff = start + len - em->start;
+
+ split->start = start + len;
+ split->len = em->start + em->len - (start + len);
+ split->bdev = em->bdev;
+ split->flags = flags;
+
+ if (compressed) {
+ split->block_len = em->block_len;
+ split->block_start = em->block_start;
+ split->orig_start = em->orig_start;
+ } else {
+ split->block_len = split->len;
+ split->block_start = em->block_start + diff;
+ split->orig_start = split->start;
+ }
+
+ ret = add_extent_mapping(em_tree, split);
+ BUG_ON(ret);
+ free_extent_map(split);
+ split = NULL;
+ }
+ write_unlock(&em_tree->lock);
+
+ /* once for us */
+ free_extent_map(em);
+ /* once for the tree*/
+ free_extent_map(em);
+ }
+ if (split)
+ free_extent_map(split);
+ if (split2)
+ free_extent_map(split2);
+ return 0;
+}
+
+/*
+ * this is very complex, but the basic idea is to drop all extents
+ * in the range start - end. hint_block is filled in with a block number
+ * that would be a good hint to the block allocator for this file.
+ *
+ * If an extent intersects the range but is not entirely inside the range
+ * it is either truncated or split. Anything entirely inside the range
+ * is deleted from the tree.
+ */
+int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
+ u64 start, u64 end, u64 *hint_byte, int drop_cache)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key new_key;
+ u64 search_start = start;
+ u64 disk_bytenr = 0;
+ u64 num_bytes = 0;
+ u64 extent_offset = 0;
+ u64 extent_end = 0;
+ int del_nr = 0;
+ int del_slot = 0;
+ int extent_type;
+ int recow;
+ int ret;
+
+ if (drop_cache)
+ btrfs_drop_extent_cache(inode, start, end - 1, 0);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ recow = 0;
+ ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ search_start, -1);
+ if (ret < 0)
+ break;
+ if (ret > 0 && path->slots[0] > 0 && search_start == start) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+ if (key.objectid == inode->i_ino &&
+ key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+ ret = 0;
+next_slot:
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ BUG_ON(del_nr > 0);
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ leaf = path->nodes[0];
+ recow = 1;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid > inode->i_ino ||
+ key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+ break;
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ extent_offset = btrfs_file_extent_offset(leaf, fi);
+ extent_end = key.offset +
+ btrfs_file_extent_num_bytes(leaf, fi);
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ extent_end = key.offset +
+ btrfs_file_extent_inline_len(leaf, fi);
+ } else {
+ WARN_ON(1);
+ extent_end = search_start;
+ }
+
+ if (extent_end <= search_start) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+
+ search_start = max(key.offset, start);
+ if (recow) {
+ btrfs_release_path(root, path);
+ continue;
+ }
+
+ /*
+ * | - range to drop - |
+ * | -------- extent -------- |
+ */
+ if (start > key.offset && end < extent_end) {
+ BUG_ON(del_nr > 0);
+ BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.offset = start;
+ ret = btrfs_duplicate_item(trans, root, path,
+ &new_key);
+ if (ret == -EAGAIN) {
+ btrfs_release_path(root, path);
+ continue;
+ }
+ if (ret < 0)
+ break;
+
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ start - key.offset);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ extent_offset += start - key.offset;
+ btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - start);
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (disk_bytenr > 0) {
+ ret = btrfs_inc_extent_ref(trans, root,
+ disk_bytenr, num_bytes, 0,
+ root->root_key.objectid,
+ new_key.objectid,
+ start - extent_offset);
+ BUG_ON(ret);
+ *hint_byte = disk_bytenr;
+ }
+ key.offset = start;
+ }
+ /*
+ * | ---- range to drop ----- |
+ * | -------- extent -------- |
+ */
+ if (start <= key.offset && end < extent_end) {
+ BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.offset = end;
+ btrfs_set_item_key_safe(trans, root, path, &new_key);
+
+ extent_offset += end - key.offset;
+ btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - end);
+ btrfs_mark_buffer_dirty(leaf);
+ if (disk_bytenr > 0) {
+ inode_sub_bytes(inode, end - key.offset);
+ *hint_byte = disk_bytenr;
+ }
+ break;
+ }
+
+ search_start = extent_end;
+ /*
+ * | ---- range to drop ----- |
+ * | -------- extent -------- |
+ */
+ if (start > key.offset && end >= extent_end) {
+ BUG_ON(del_nr > 0);
+ BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ start - key.offset);
+ btrfs_mark_buffer_dirty(leaf);
+ if (disk_bytenr > 0) {
+ inode_sub_bytes(inode, extent_end - start);
+ *hint_byte = disk_bytenr;
+ }
+ if (end == extent_end)
+ break;
+
+ path->slots[0]++;
+ goto next_slot;
+ }
+
+ /*
+ * | ---- range to drop ----- |
+ * | ------ extent ------ |
+ */
+ if (start <= key.offset && end >= extent_end) {
+ if (del_nr == 0) {
+ del_slot = path->slots[0];
+ del_nr = 1;
+ } else {
+ BUG_ON(del_slot + del_nr != path->slots[0]);
+ del_nr++;
+ }
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ inode_sub_bytes(inode,
+ extent_end - key.offset);
+ extent_end = ALIGN(extent_end,
+ root->sectorsize);
+ } else if (disk_bytenr > 0) {
+ ret = btrfs_free_extent(trans, root,
+ disk_bytenr, num_bytes, 0,
+ root->root_key.objectid,
+ key.objectid, key.offset -
+ extent_offset);
+ BUG_ON(ret);
+ inode_sub_bytes(inode,
+ extent_end - key.offset);
+ *hint_byte = disk_bytenr;
+ }
+
+ if (end == extent_end)
+ break;
+
+ if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+
+ ret = btrfs_del_items(trans, root, path, del_slot,
+ del_nr);
+ BUG_ON(ret);
+
+ del_nr = 0;
+ del_slot = 0;
+
+ btrfs_release_path(root, path);
+ continue;
+ }
+
+ BUG_ON(1);
+ }
+
+ if (del_nr > 0) {
+ ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+ BUG_ON(ret);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int extent_mergeable(struct extent_buffer *leaf, int slot,
+ u64 objectid, u64 bytenr, u64 orig_offset,
+ u64 *start, u64 *end)
+{
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 extent_end;
+
+ if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+ return 0;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 0;
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
+ btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
+ btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
+ btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ return 0;
+
+ extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ if ((*start && *start != key.offset) || (*end && *end != extent_end))
+ return 0;
+
+ *start = key.offset;
+ *end = extent_end;
+ return 1;
+}
+
+/*
+ * Mark extent in the range start - end as written.
+ *
+ * This changes extent type from 'pre-allocated' to 'regular'. If only
+ * part of extent is marked as written, the extent will be split into
+ * two or three.
+ */
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 start, u64 end)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_buffer *leaf;
+ struct btrfs_path *path;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ struct btrfs_key new_key;
+ u64 bytenr;
+ u64 num_bytes;
+ u64 extent_end;
+ u64 orig_offset;
+ u64 other_start;
+ u64 other_end;
+ u64 split;
+ int del_nr = 0;
+ int del_slot = 0;
+ int recow;
+ int ret;
+
+ btrfs_drop_extent_cache(inode, start, end - 1, 0);
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+again:
+ recow = 0;
+ split = start;
+ key.objectid = inode->i_ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = split;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0 && path->slots[0] > 0)
+ path->slots[0]--;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ BUG_ON(key.objectid != inode->i_ino ||
+ key.type != BTRFS_EXTENT_DATA_KEY);
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ BUG_ON(btrfs_file_extent_type(leaf, fi) !=
+ BTRFS_FILE_EXTENT_PREALLOC);
+ extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ BUG_ON(key.offset > start || extent_end < end);
+
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
+ memcpy(&new_key, &key, sizeof(new_key));
+
+ if (start == key.offset && end < extent_end) {
+ other_start = 0;
+ other_end = start;
+ if (extent_mergeable(leaf, path->slots[0] - 1,
+ inode->i_ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ new_key.offset = end;
+ btrfs_set_item_key_safe(trans, root, path, &new_key);
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - end);
+ btrfs_set_file_extent_offset(leaf, fi,
+ end - orig_offset);
+ fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ end - other_start);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+ }
+
+ if (start > key.offset && end == extent_end) {
+ other_start = end;
+ other_end = 0;
+ if (extent_mergeable(leaf, path->slots[0] + 1,
+ inode->i_ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ start - key.offset);
+ path->slots[0]++;
+ new_key.offset = start;
+ btrfs_set_item_key_safe(trans, root, path, &new_key);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ other_end - start);
+ btrfs_set_file_extent_offset(leaf, fi,
+ start - orig_offset);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+ }
+
+ while (start > key.offset || end < extent_end) {
+ if (key.offset == start)
+ split = end;
+
+ new_key.offset = split;
+ ret = btrfs_duplicate_item(trans, root, path, &new_key);
+ if (ret == -EAGAIN) {
+ btrfs_release_path(root, path);
+ goto again;
+ }
+ BUG_ON(ret < 0);
+
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ split - key.offset);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - split);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
+ root->root_key.objectid,
+ inode->i_ino, orig_offset);
+ BUG_ON(ret);
+
+ if (split == start) {
+ key.offset = start;
+ } else {
+ BUG_ON(start != key.offset);
+ path->slots[0]--;
+ extent_end = end;
+ }
+ recow = 1;
+ }
+
+ other_start = end;
+ other_end = 0;
+ if (extent_mergeable(leaf, path->slots[0] + 1,
+ inode->i_ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ if (recow) {
+ btrfs_release_path(root, path);
+ goto again;
+ }
+ extent_end = other_end;
+ del_slot = path->slots[0] + 1;
+ del_nr++;
+ ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+ 0, root->root_key.objectid,
+ inode->i_ino, orig_offset);
+ BUG_ON(ret);
+ }
+ other_start = 0;
+ other_end = start;
+ if (extent_mergeable(leaf, path->slots[0] - 1,
+ inode->i_ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ if (recow) {
+ btrfs_release_path(root, path);
+ goto again;
+ }
+ key.offset = other_start;
+ del_slot = path->slots[0];
+ del_nr++;
+ ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+ 0, root->root_key.objectid,
+ inode->i_ino, orig_offset);
+ BUG_ON(ret);
+ }
+ if (del_nr == 0) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG);
+ btrfs_mark_buffer_dirty(leaf);
+ } else {
+ fi = btrfs_item_ptr(leaf, del_slot - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - key.offset);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+ BUG_ON(ret);
+ }
+out:
+ btrfs_free_path(path);
+ return 0;
+}
+
+/*
+ * this gets pages into the page cache and locks them down, it also properly
+ * waits for data=ordered extents to finish before allowing the pages to be
+ * modified.
+ */
+static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
+ struct page **pages, size_t num_pages,
+ loff_t pos, unsigned long first_index,
+ unsigned long last_index, size_t write_bytes)
+{
+ int i;
+ unsigned long index = pos >> PAGE_CACHE_SHIFT;
+ struct inode *inode = fdentry(file)->d_inode;
+ int err = 0;
+ u64 start_pos;
+ u64 last_pos;
+
+ start_pos = pos & ~((u64)root->sectorsize - 1);
+ last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
+
+ if (start_pos > inode->i_size) {
+ err = btrfs_cont_expand(inode, start_pos);
+ if (err)
+ return err;
+ }
+
+ memset(pages, 0, num_pages * sizeof(struct page *));
+again:
+ for (i = 0; i < num_pages; i++) {
+ pages[i] = grab_cache_page(inode->i_mapping, index + i);
+ if (!pages[i]) {
+ err = -ENOMEM;
+ BUG_ON(1);
+ }
+ wait_on_page_writeback(pages[i]);
+ }
+ if (start_pos < inode->i_size) {
+ struct btrfs_ordered_extent *ordered;
+ lock_extent(&BTRFS_I(inode)->io_tree,
+ start_pos, last_pos - 1, GFP_NOFS);
+ ordered = btrfs_lookup_first_ordered_extent(inode,
+ last_pos - 1);
+ if (ordered &&
+ ordered->file_offset + ordered->len > start_pos &&
+ ordered->file_offset < last_pos) {
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ start_pos, last_pos - 1, GFP_NOFS);
+ for (i = 0; i < num_pages; i++) {
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ btrfs_wait_ordered_range(inode, start_pos,
+ last_pos - start_pos);
+ goto again;
+ }
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+
+ clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
+ last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING,
+ GFP_NOFS);
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ start_pos, last_pos - 1, GFP_NOFS);
+ }
+ for (i = 0; i < num_pages; i++) {
+ clear_page_dirty_for_io(pages[i]);
+ set_page_extent_mapped(pages[i]);
+ WARN_ON(!PageLocked(pages[i]));
+ }
+ return 0;
+}
+
+static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ loff_t pos;
+ loff_t start_pos;
+ ssize_t num_written = 0;
+ ssize_t err = 0;
+ int ret = 0;
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct page **pages = NULL;
+ int nrptrs;
+ struct page *pinned[2];
+ unsigned long first_index;
+ unsigned long last_index;
+ int will_write;
+
+ will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
+ (file->f_flags & O_DIRECT));
+
+ nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
+ PAGE_CACHE_SIZE / (sizeof(struct page *)));
+ pinned[0] = NULL;
+ pinned[1] = NULL;
+
+ pos = *ppos;
+ start_pos = pos;
+
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+ /* do the reserve before the mutex lock in case we have to do some
+ * flushing. We wouldn't deadlock, but this is more polite.
+ */
+ err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ if (err)
+ goto out_nolock;
+
+ mutex_lock(&inode->i_mutex);
+
+ current->backing_dev_info = inode->i_mapping->backing_dev_info;
+ err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+ if (err)
+ goto out;
+
+ if (count == 0)
+ goto out;
+
+ err = file_remove_suid(file);
+ if (err)
+ goto out;
+
+ file_update_time(file);
+
+ pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
+
+ /* generic_write_checks can change our pos */
+ start_pos = pos;
+
+ BTRFS_I(inode)->sequence++;
+ first_index = pos >> PAGE_CACHE_SHIFT;
+ last_index = (pos + count) >> PAGE_CACHE_SHIFT;
+
+ /*
+ * there are lots of better ways to do this, but this code
+ * makes sure the first and last page in the file range are
+ * up to date and ready for cow
+ */
+ if ((pos & (PAGE_CACHE_SIZE - 1))) {
+ pinned[0] = grab_cache_page(inode->i_mapping, first_index);
+ if (!PageUptodate(pinned[0])) {
+ ret = btrfs_readpage(NULL, pinned[0]);
+ BUG_ON(ret);
+ wait_on_page_locked(pinned[0]);
+ } else {
+ unlock_page(pinned[0]);
+ }
+ }
+ if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
+ pinned[1] = grab_cache_page(inode->i_mapping, last_index);
+ if (!PageUptodate(pinned[1])) {
+ ret = btrfs_readpage(NULL, pinned[1]);
+ BUG_ON(ret);
+ wait_on_page_locked(pinned[1]);
+ } else {
+ unlock_page(pinned[1]);
+ }
+ }
+
+ while (count > 0) {
+ size_t offset = pos & (PAGE_CACHE_SIZE - 1);
+ size_t write_bytes = min(count, nrptrs *
+ (size_t)PAGE_CACHE_SIZE -
+ offset);
+ size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ WARN_ON(num_pages > nrptrs);
+ memset(pages, 0, sizeof(struct page *) * nrptrs);
+
+ ret = btrfs_check_data_free_space(root, inode, write_bytes);
+ if (ret)
+ goto out;
+
+ ret = prepare_pages(root, file, pages, num_pages,
+ pos, first_index, last_index,
+ write_bytes);
+ if (ret) {
+ btrfs_free_reserved_data_space(root, inode,
+ write_bytes);
+ goto out;
+ }
+
+ ret = btrfs_copy_from_user(pos, num_pages,
+ write_bytes, pages, buf);
+ if (ret) {
+ btrfs_free_reserved_data_space(root, inode,
+ write_bytes);
+ btrfs_drop_pages(pages, num_pages);
+ goto out;
+ }
+
+ ret = dirty_and_release_pages(NULL, root, file, pages,
+ num_pages, pos, write_bytes);
+ btrfs_drop_pages(pages, num_pages);
+ if (ret) {
+ btrfs_free_reserved_data_space(root, inode,
+ write_bytes);
+ goto out;
+ }
+
+ if (will_write) {
+ filemap_fdatawrite_range(inode->i_mapping, pos,
+ pos + write_bytes - 1);
+ } else {
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+ num_pages);
+ if (num_pages <
+ (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+ btrfs_btree_balance_dirty(root, 1);
+ btrfs_throttle(root);
+ }
+
+ buf += write_bytes;
+ count -= write_bytes;
+ pos += write_bytes;
+ num_written += write_bytes;
+
+ cond_resched();
+ }
+out:
+ mutex_unlock(&inode->i_mutex);
+ if (ret)
+ err = ret;
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+
+out_nolock:
+ kfree(pages);
+ if (pinned[0])
+ page_cache_release(pinned[0]);
+ if (pinned[1])
+ page_cache_release(pinned[1]);
+ *ppos = pos;
+
+ /*
+ * we want to make sure fsync finds this change
+ * but we haven't joined a transaction running right now.
+ *
+ * Later on, someone is sure to update the inode and get the
+ * real transid recorded.
+ *
+ * We set last_trans now to the fs_info generation + 1,
+ * this will either be one more than the running transaction
+ * or the generation used for the next transaction if there isn't
+ * one running right now.
+ */
+ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+
+ if (num_written > 0 && will_write) {
+ struct btrfs_trans_handle *trans;
+
+ err = btrfs_wait_ordered_range(inode, start_pos, num_written);
+ if (err)
+ num_written = err;
+
+ if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
+ trans = btrfs_start_transaction(root, 1);
+ ret = btrfs_log_dentry_safe(trans, root,
+ file->f_dentry);
+ if (ret == 0) {
+ ret = btrfs_sync_log(trans, root);
+ if (ret == 0)
+ btrfs_end_transaction(trans, root);
+ else
+ btrfs_commit_transaction(trans, root);
+ } else if (ret != BTRFS_NO_LOG_SYNC) {
+ btrfs_commit_transaction(trans, root);
+ } else {
+ btrfs_end_transaction(trans, root);
+ }
+ }
+ if (file->f_flags & O_DIRECT) {
+ invalidate_mapping_pages(inode->i_mapping,
+ start_pos >> PAGE_CACHE_SHIFT,
+ (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
+ }
+ }
+ current->backing_dev_info = NULL;
+ return num_written ? num_written : err;
+}
+
+int btrfs_release_file(struct inode *inode, struct file *filp)
+{
+ /*
+ * ordered_data_close is set by settattr when we are about to truncate
+ * a file from a non-zero size to a zero size. This tries to
+ * flush down new bytes that may have been written if the
+ * application were using truncate to replace a file in place.
+ */
+ if (BTRFS_I(inode)->ordered_data_close) {
+ BTRFS_I(inode)->ordered_data_close = 0;
+ btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+ if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+ filemap_flush(inode->i_mapping);
+ }
+ if (filp->private_data)
+ btrfs_ioctl_trans_end(filp);
+ return 0;
+}
+
+/*
+ * fsync call for both files and directories. This logs the inode into
+ * the tree log instead of forcing full commits whenever possible.
+ *
+ * It needs to call filemap_fdatawait so that all ordered extent updates are
+ * in the metadata btree are up to date for copying to the log.
+ *
+ * It drops the inode mutex before doing the tree log commit. This is an
+ * important optimization for directories because holding the mutex prevents
+ * new operations on the dir while we write to disk.
+ */
+int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+{
+ struct inode *inode = dentry->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+ struct btrfs_trans_handle *trans;
+
+
+ /* we wait first, since the writeback may change the inode */
+ root->log_batch++;
+ /* the VFS called filemap_fdatawrite for us */
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ root->log_batch++;
+
+ /*
+ * check the transaction that last modified this inode
+ * and see if its already been committed
+ */
+ if (!BTRFS_I(inode)->last_trans)
+ goto out;
+
+ /*
+ * if the last transaction that changed this file was before
+ * the current transaction, we can bail out now without any
+ * syncing
+ */
+ mutex_lock(&root->fs_info->trans_mutex);
+ if (BTRFS_I(inode)->last_trans <=
+ root->fs_info->last_trans_committed) {
+ BTRFS_I(inode)->last_trans = 0;
+ mutex_unlock(&root->fs_info->trans_mutex);
+ goto out;
+ }
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ /*
+ * ok we haven't committed the transaction yet, lets do a commit
+ */
+ if (file && file->private_data)
+ btrfs_ioctl_trans_end(file);
+
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = btrfs_log_dentry_safe(trans, root, dentry);
+ if (ret < 0)
+ goto out;
+
+ /* we've logged all the items and now have a consistent
+ * version of the file in the log. It is possible that
+ * someone will come in and modify the file, but that's
+ * fine because the log is consistent on disk, and we
+ * have references to all of the file's extents
+ *
+ * It is possible that someone will come in and log the
+ * file again, but that will end up using the synchronization
+ * inside btrfs_sync_log to keep things safe.
+ */
+ mutex_unlock(&dentry->d_inode->i_mutex);
+
+ if (ret != BTRFS_NO_LOG_SYNC) {
+ if (ret > 0) {
+ ret = btrfs_commit_transaction(trans, root);
+ } else {
+ ret = btrfs_sync_log(trans, root);
+ if (ret == 0)
+ ret = btrfs_end_transaction(trans, root);
+ else
+ ret = btrfs_commit_transaction(trans, root);
+ }
+ } else {
+ ret = btrfs_end_transaction(trans, root);
+ }
+ mutex_lock(&dentry->d_inode->i_mutex);
+out:
+ return ret > 0 ? -EIO : ret;
+}
+
+static const struct vm_operations_struct btrfs_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = btrfs_page_mkwrite,
+};
+
+static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &btrfs_file_vm_ops;
+ file_accessed(filp);
+ return 0;
+}
+
+const struct file_operations btrfs_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = do_sync_read,
+ .aio_read = generic_file_aio_read,
+ .splice_read = generic_file_splice_read,
+ .write = btrfs_file_write,
+ .mmap = btrfs_file_mmap,
+ .open = generic_file_open,
+ .release = btrfs_release_file,
+ .fsync = btrfs_sync_file,
+ .unlocked_ioctl = btrfs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = btrfs_ioctl,
+#endif
+};
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
new file mode 100644
index 00000000000..cb2849f0325
--- /dev/null
+++ b/fs/btrfs/free-space-cache.c
@@ -0,0 +1,1364 @@
+/*
+ * Copyright (C) 2008 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include <linux/math64.h>
+#include "ctree.h"
+#include "free-space-cache.h"
+#include "transaction.h"
+
+#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
+#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
+
+static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
+ u64 offset)
+{
+ BUG_ON(offset < bitmap_start);
+ offset -= bitmap_start;
+ return (unsigned long)(div64_u64(offset, sectorsize));
+}
+
+static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
+{
+ return (unsigned long)(div64_u64(bytes, sectorsize));
+}
+
+static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
+ u64 offset)
+{
+ u64 bitmap_start;
+ u64 bytes_per_bitmap;
+
+ bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
+ bitmap_start = offset - block_group->key.objectid;
+ bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
+ bitmap_start *= bytes_per_bitmap;
+ bitmap_start += block_group->key.objectid;
+
+ return bitmap_start;
+}
+
+static int tree_insert_offset(struct rb_root *root, u64 offset,
+ struct rb_node *node, int bitmap)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct btrfs_free_space *info;
+
+ while (*p) {
+ parent = *p;
+ info = rb_entry(parent, struct btrfs_free_space, offset_index);
+
+ if (offset < info->offset) {
+ p = &(*p)->rb_left;
+ } else if (offset > info->offset) {
+ p = &(*p)->rb_right;
+ } else {
+ /*
+ * we could have a bitmap entry and an extent entry
+ * share the same offset. If this is the case, we want
+ * the extent entry to always be found first if we do a
+ * linear search through the tree, since we want to have
+ * the quickest allocation time, and allocating from an
+ * extent is faster than allocating from a bitmap. So
+ * if we're inserting a bitmap and we find an entry at
+ * this offset, we want to go right, or after this entry
+ * logically. If we are inserting an extent and we've
+ * found a bitmap, we want to go left, or before
+ * logically.
+ */
+ if (bitmap) {
+ WARN_ON(info->bitmap);
+ p = &(*p)->rb_right;
+ } else {
+ WARN_ON(!info->bitmap);
+ p = &(*p)->rb_left;
+ }
+ }
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+
+ return 0;
+}
+
+/*
+ * searches the tree for the given offset.
+ *
+ * fuzzy - If this is set, then we are trying to make an allocation, and we just
+ * want a section that has at least bytes size and comes at or after the given
+ * offset.
+ */
+static struct btrfs_free_space *
+tree_search_offset(struct btrfs_block_group_cache *block_group,
+ u64 offset, int bitmap_only, int fuzzy)
+{
+ struct rb_node *n = block_group->free_space_offset.rb_node;
+ struct btrfs_free_space *entry, *prev = NULL;
+
+ /* find entry that is closest to the 'offset' */
+ while (1) {
+ if (!n) {
+ entry = NULL;
+ break;
+ }
+
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ prev = entry;
+
+ if (offset < entry->offset)
+ n = n->rb_left;
+ else if (offset > entry->offset)
+ n = n->rb_right;
+ else
+ break;
+ }
+
+ if (bitmap_only) {
+ if (!entry)
+ return NULL;
+ if (entry->bitmap)
+ return entry;
+
+ /*
+ * bitmap entry and extent entry may share same offset,
+ * in that case, bitmap entry comes after extent entry.
+ */
+ n = rb_next(n);
+ if (!n)
+ return NULL;
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (entry->offset != offset)
+ return NULL;
+
+ WARN_ON(!entry->bitmap);
+ return entry;
+ } else if (entry) {
+ if (entry->bitmap) {
+ /*
+ * if previous extent entry covers the offset,
+ * we should return it instead of the bitmap entry
+ */
+ n = &entry->offset_index;
+ while (1) {
+ n = rb_prev(n);
+ if (!n)
+ break;
+ prev = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ if (!prev->bitmap) {
+ if (prev->offset + prev->bytes > offset)
+ entry = prev;
+ break;
+ }
+ }
+ }
+ return entry;
+ }
+
+ if (!prev)
+ return NULL;
+
+ /* find last entry before the 'offset' */
+ entry = prev;
+ if (entry->offset > offset) {
+ n = rb_prev(&entry->offset_index);
+ if (n) {
+ entry = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ BUG_ON(entry->offset > offset);
+ } else {
+ if (fuzzy)
+ return entry;
+ else
+ return NULL;
+ }
+ }
+
+ if (entry->bitmap) {
+ n = &entry->offset_index;
+ while (1) {
+ n = rb_prev(n);
+ if (!n)
+ break;
+ prev = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ if (!prev->bitmap) {
+ if (prev->offset + prev->bytes > offset)
+ return prev;
+ break;
+ }
+ }
+ if (entry->offset + BITS_PER_BITMAP *
+ block_group->sectorsize > offset)
+ return entry;
+ } else if (entry->offset + entry->bytes > offset)
+ return entry;
+
+ if (!fuzzy)
+ return NULL;
+
+ while (1) {
+ if (entry->bitmap) {
+ if (entry->offset + BITS_PER_BITMAP *
+ block_group->sectorsize > offset)
+ break;
+ } else {
+ if (entry->offset + entry->bytes > offset)
+ break;
+ }
+
+ n = rb_next(&entry->offset_index);
+ if (!n)
+ return NULL;
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ }
+ return entry;
+}
+
+static void unlink_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *info)
+{
+ rb_erase(&info->offset_index, &block_group->free_space_offset);
+ block_group->free_extents--;
+ block_group->free_space -= info->bytes;
+}
+
+static int link_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *info)
+{
+ int ret = 0;
+
+ BUG_ON(!info->bitmap && !info->bytes);
+ ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
+ &info->offset_index, (info->bitmap != NULL));
+ if (ret)
+ return ret;
+
+ block_group->free_space += info->bytes;
+ block_group->free_extents++;
+ return ret;
+}
+
+static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
+{
+ u64 max_bytes;
+ u64 bitmap_bytes;
+ u64 extent_bytes;
+
+ /*
+ * The goal is to keep the total amount of memory used per 1gb of space
+ * at or below 32k, so we need to adjust how much memory we allow to be
+ * used by extent based free space tracking
+ */
+ max_bytes = MAX_CACHE_BYTES_PER_GIG *
+ (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
+
+ /*
+ * we want to account for 1 more bitmap than what we have so we can make
+ * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
+ * we add more bitmaps.
+ */
+ bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
+
+ if (bitmap_bytes >= max_bytes) {
+ block_group->extents_thresh = 0;
+ return;
+ }
+
+ /*
+ * we want the extent entry threshold to always be at most 1/2 the maxw
+ * bytes we can have, or whatever is less than that.
+ */
+ extent_bytes = max_bytes - bitmap_bytes;
+ extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
+
+ block_group->extents_thresh =
+ div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
+}
+
+static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes)
+{
+ unsigned long start, end;
+ unsigned long i;
+
+ start = offset_to_bit(info->offset, block_group->sectorsize, offset);
+ end = start + bytes_to_bits(bytes, block_group->sectorsize);
+ BUG_ON(end > BITS_PER_BITMAP);
+
+ for (i = start; i < end; i++)
+ clear_bit(i, info->bitmap);
+
+ info->bytes -= bytes;
+ block_group->free_space -= bytes;
+}
+
+static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes)
+{
+ unsigned long start, end;
+ unsigned long i;
+
+ start = offset_to_bit(info->offset, block_group->sectorsize, offset);
+ end = start + bytes_to_bits(bytes, block_group->sectorsize);
+ BUG_ON(end > BITS_PER_BITMAP);
+
+ for (i = start; i < end; i++)
+ set_bit(i, info->bitmap);
+
+ info->bytes += bytes;
+ block_group->free_space += bytes;
+}
+
+static int search_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *bitmap_info, u64 *offset,
+ u64 *bytes)
+{
+ unsigned long found_bits = 0;
+ unsigned long bits, i;
+ unsigned long next_zero;
+
+ i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
+ max_t(u64, *offset, bitmap_info->offset));
+ bits = bytes_to_bits(*bytes, block_group->sectorsize);
+
+ for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
+ i < BITS_PER_BITMAP;
+ i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
+ next_zero = find_next_zero_bit(bitmap_info->bitmap,
+ BITS_PER_BITMAP, i);
+ if ((next_zero - i) >= bits) {
+ found_bits = next_zero - i;
+ break;
+ }
+ i = next_zero;
+ }
+
+ if (found_bits) {
+ *offset = (u64)(i * block_group->sectorsize) +
+ bitmap_info->offset;
+ *bytes = (u64)(found_bits) * block_group->sectorsize;
+ return 0;
+ }
+
+ return -1;
+}
+
+static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
+ *block_group, u64 *offset,
+ u64 *bytes, int debug)
+{
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+ int ret;
+
+ if (!block_group->free_space_offset.rb_node)
+ return NULL;
+
+ entry = tree_search_offset(block_group,
+ offset_to_bitmap(block_group, *offset),
+ 0, 1);
+ if (!entry)
+ return NULL;
+
+ for (node = &entry->offset_index; node; node = rb_next(node)) {
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ if (entry->bytes < *bytes)
+ continue;
+
+ if (entry->bitmap) {
+ ret = search_bitmap(block_group, entry, offset, bytes);
+ if (!ret)
+ return entry;
+ continue;
+ }
+
+ *offset = entry->offset;
+ *bytes = entry->bytes;
+ return entry;
+ }
+
+ return NULL;
+}
+
+static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *info, u64 offset)
+{
+ u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
+ int max_bitmaps = (int)div64_u64(block_group->key.offset +
+ bytes_per_bg - 1, bytes_per_bg);
+ BUG_ON(block_group->total_bitmaps >= max_bitmaps);
+
+ info->offset = offset_to_bitmap(block_group, offset);
+ info->bytes = 0;
+ link_free_space(block_group, info);
+ block_group->total_bitmaps++;
+
+ recalculate_thresholds(block_group);
+}
+
+static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *bitmap_info,
+ u64 *offset, u64 *bytes)
+{
+ u64 end;
+ u64 search_start, search_bytes;
+ int ret;
+
+again:
+ end = bitmap_info->offset +
+ (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
+
+ /*
+ * XXX - this can go away after a few releases.
+ *
+ * since the only user of btrfs_remove_free_space is the tree logging
+ * stuff, and the only way to test that is under crash conditions, we
+ * want to have this debug stuff here just in case somethings not
+ * working. Search the bitmap for the space we are trying to use to
+ * make sure its actually there. If its not there then we need to stop
+ * because something has gone wrong.
+ */
+ search_start = *offset;
+ search_bytes = *bytes;
+ ret = search_bitmap(block_group, bitmap_info, &search_start,
+ &search_bytes);
+ BUG_ON(ret < 0 || search_start != *offset);
+
+ if (*offset > bitmap_info->offset && *offset + *bytes > end) {
+ bitmap_clear_bits(block_group, bitmap_info, *offset,
+ end - *offset + 1);
+ *bytes -= end - *offset + 1;
+ *offset = end + 1;
+ } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
+ bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
+ *bytes = 0;
+ }
+
+ if (*bytes) {
+ struct rb_node *next = rb_next(&bitmap_info->offset_index);
+ if (!bitmap_info->bytes) {
+ unlink_free_space(block_group, bitmap_info);
+ kfree(bitmap_info->bitmap);
+ kfree(bitmap_info);
+ block_group->total_bitmaps--;
+ recalculate_thresholds(block_group);
+ }
+
+ /*
+ * no entry after this bitmap, but we still have bytes to
+ * remove, so something has gone wrong.
+ */
+ if (!next)
+ return -EINVAL;
+
+ bitmap_info = rb_entry(next, struct btrfs_free_space,
+ offset_index);
+
+ /*
+ * if the next entry isn't a bitmap we need to return to let the
+ * extent stuff do its work.
+ */
+ if (!bitmap_info->bitmap)
+ return -EAGAIN;
+
+ /*
+ * Ok the next item is a bitmap, but it may not actually hold
+ * the information for the rest of this free space stuff, so
+ * look for it, and if we don't find it return so we can try
+ * everything over again.
+ */
+ search_start = *offset;
+ search_bytes = *bytes;
+ ret = search_bitmap(block_group, bitmap_info, &search_start,
+ &search_bytes);
+ if (ret < 0 || search_start != *offset)
+ return -EAGAIN;
+
+ goto again;
+ } else if (!bitmap_info->bytes) {
+ unlink_free_space(block_group, bitmap_info);
+ kfree(bitmap_info->bitmap);
+ kfree(bitmap_info);
+ block_group->total_bitmaps--;
+ recalculate_thresholds(block_group);
+ }
+
+ return 0;
+}
+
+static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *info)
+{
+ struct btrfs_free_space *bitmap_info;
+ int added = 0;
+ u64 bytes, offset, end;
+ int ret;
+
+ /*
+ * If we are below the extents threshold then we can add this as an
+ * extent, and don't have to deal with the bitmap
+ */
+ if (block_group->free_extents < block_group->extents_thresh &&
+ info->bytes > block_group->sectorsize * 4)
+ return 0;
+
+ /*
+ * some block groups are so tiny they can't be enveloped by a bitmap, so
+ * don't even bother to create a bitmap for this
+ */
+ if (BITS_PER_BITMAP * block_group->sectorsize >
+ block_group->key.offset)
+ return 0;
+
+ bytes = info->bytes;
+ offset = info->offset;
+
+again:
+ bitmap_info = tree_search_offset(block_group,
+ offset_to_bitmap(block_group, offset),
+ 1, 0);
+ if (!bitmap_info) {
+ BUG_ON(added);
+ goto new_bitmap;
+ }
+
+ end = bitmap_info->offset +
+ (u64)(BITS_PER_BITMAP * block_group->sectorsize);
+
+ if (offset >= bitmap_info->offset && offset + bytes > end) {
+ bitmap_set_bits(block_group, bitmap_info, offset,
+ end - offset);
+ bytes -= end - offset;
+ offset = end;
+ added = 0;
+ } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
+ bitmap_set_bits(block_group, bitmap_info, offset, bytes);
+ bytes = 0;
+ } else {
+ BUG();
+ }
+
+ if (!bytes) {
+ ret = 1;
+ goto out;
+ } else
+ goto again;
+
+new_bitmap:
+ if (info && info->bitmap) {
+ add_new_bitmap(block_group, info, offset);
+ added = 1;
+ info = NULL;
+ goto again;
+ } else {
+ spin_unlock(&block_group->tree_lock);
+
+ /* no pre-allocated info, allocate a new one */
+ if (!info) {
+ info = kzalloc(sizeof(struct btrfs_free_space),
+ GFP_NOFS);
+ if (!info) {
+ spin_lock(&block_group->tree_lock);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* allocate the bitmap */
+ info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ spin_lock(&block_group->tree_lock);
+ if (!info->bitmap) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ goto again;
+ }
+
+out:
+ if (info) {
+ if (info->bitmap)
+ kfree(info->bitmap);
+ kfree(info);
+ }
+
+ return ret;
+}
+
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes)
+{
+ struct btrfs_free_space *right_info = NULL;
+ struct btrfs_free_space *left_info = NULL;
+ struct btrfs_free_space *info = NULL;
+ int ret = 0;
+
+ info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
+ if (!info)
+ return -ENOMEM;
+
+ info->offset = offset;
+ info->bytes = bytes;
+
+ spin_lock(&block_group->tree_lock);
+
+ /*
+ * first we want to see if there is free space adjacent to the range we
+ * are adding, if there is remove that struct and add a new one to
+ * cover the entire range
+ */
+ right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
+ if (right_info && rb_prev(&right_info->offset_index))
+ left_info = rb_entry(rb_prev(&right_info->offset_index),
+ struct btrfs_free_space, offset_index);
+ else
+ left_info = tree_search_offset(block_group, offset - 1, 0, 0);
+
+ /*
+ * If there was no extent directly to the left or right of this new
+ * extent then we know we're going to have to allocate a new extent, so
+ * before we do that see if we need to drop this into a bitmap
+ */
+ if ((!left_info || left_info->bitmap) &&
+ (!right_info || right_info->bitmap)) {
+ ret = insert_into_bitmap(block_group, info);
+
+ if (ret < 0) {
+ goto out;
+ } else if (ret) {
+ ret = 0;
+ goto out;
+ }
+ }
+
+ if (right_info && !right_info->bitmap) {
+ unlink_free_space(block_group, right_info);
+ info->bytes += right_info->bytes;
+ kfree(right_info);
+ }
+
+ if (left_info && !left_info->bitmap &&
+ left_info->offset + left_info->bytes == offset) {
+ unlink_free_space(block_group, left_info);
+ info->offset = left_info->offset;
+ info->bytes += left_info->bytes;
+ kfree(left_info);
+ }
+
+ ret = link_free_space(block_group, info);
+ if (ret)
+ kfree(info);
+out:
+ spin_unlock(&block_group->tree_lock);
+
+ if (ret) {
+ printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
+ BUG_ON(ret == -EEXIST);
+ }
+
+ return ret;
+}
+
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes)
+{
+ struct btrfs_free_space *info;
+ struct btrfs_free_space *next_info = NULL;
+ int ret = 0;
+
+ spin_lock(&block_group->tree_lock);
+
+again:
+ info = tree_search_offset(block_group, offset, 0, 0);
+ if (!info) {
+ /*
+ * oops didn't find an extent that matched the space we wanted
+ * to remove, look for a bitmap instead
+ */
+ info = tree_search_offset(block_group,
+ offset_to_bitmap(block_group, offset),
+ 1, 0);
+ if (!info) {
+ WARN_ON(1);
+ goto out_lock;
+ }
+ }
+
+ if (info->bytes < bytes && rb_next(&info->offset_index)) {
+ u64 end;
+ next_info = rb_entry(rb_next(&info->offset_index),
+ struct btrfs_free_space,
+ offset_index);
+
+ if (next_info->bitmap)
+ end = next_info->offset + BITS_PER_BITMAP *
+ block_group->sectorsize - 1;
+ else
+ end = next_info->offset + next_info->bytes;
+
+ if (next_info->bytes < bytes ||
+ next_info->offset > offset || offset > end) {
+ printk(KERN_CRIT "Found free space at %llu, size %llu,"
+ " trying to use %llu\n",
+ (unsigned long long)info->offset,
+ (unsigned long long)info->bytes,
+ (unsigned long long)bytes);
+ WARN_ON(1);
+ ret = -EINVAL;
+ goto out_lock;
+ }
+
+ info = next_info;
+ }
+
+ if (info->bytes == bytes) {
+ unlink_free_space(block_group, info);
+ if (info->bitmap) {
+ kfree(info->bitmap);
+ block_group->total_bitmaps--;
+ }
+ kfree(info);
+ goto out_lock;
+ }
+
+ if (!info->bitmap && info->offset == offset) {
+ unlink_free_space(block_group, info);
+ info->offset += bytes;
+ info->bytes -= bytes;
+ link_free_space(block_group, info);
+ goto out_lock;
+ }
+
+ if (!info->bitmap && info->offset <= offset &&
+ info->offset + info->bytes >= offset + bytes) {
+ u64 old_start = info->offset;
+ /*
+ * we're freeing space in the middle of the info,
+ * this can happen during tree log replay
+ *
+ * first unlink the old info and then
+ * insert it again after the hole we're creating
+ */
+ unlink_free_space(block_group, info);
+ if (offset + bytes < info->offset + info->bytes) {
+ u64 old_end = info->offset + info->bytes;
+
+ info->offset = offset + bytes;
+ info->bytes = old_end - info->offset;
+ ret = link_free_space(block_group, info);
+ WARN_ON(ret);
+ if (ret)
+ goto out_lock;
+ } else {
+ /* the hole we're creating ends at the end
+ * of the info struct, just free the info
+ */
+ kfree(info);
+ }
+ spin_unlock(&block_group->tree_lock);
+
+ /* step two, insert a new info struct to cover
+ * anything before the hole
+ */
+ ret = btrfs_add_free_space(block_group, old_start,
+ offset - old_start);
+ WARN_ON(ret);
+ goto out;
+ }
+
+ ret = remove_from_bitmap(block_group, info, &offset, &bytes);
+ if (ret == -EAGAIN)
+ goto again;
+ BUG_ON(ret);
+out_lock:
+ spin_unlock(&block_group->tree_lock);
+out:
+ return ret;
+}
+
+void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytes)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *n;
+ int count = 0;
+
+ for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
+ info = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (info->bytes >= bytes)
+ count++;
+ printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
+ (unsigned long long)info->offset,
+ (unsigned long long)info->bytes,
+ (info->bitmap) ? "yes" : "no");
+ }
+ printk(KERN_INFO "block group has cluster?: %s\n",
+ list_empty(&block_group->cluster_list) ? "no" : "yes");
+ printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
+ "\n", count);
+}
+
+u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *n;
+ u64 ret = 0;
+
+ for (n = rb_first(&block_group->free_space_offset); n;
+ n = rb_next(n)) {
+ info = rb_entry(n, struct btrfs_free_space, offset_index);
+ ret += info->bytes;
+ }
+
+ return ret;
+}
+
+/*
+ * for a given cluster, put all of its extents back into the free
+ * space cache. If the block group passed doesn't match the block group
+ * pointed to by the cluster, someone else raced in and freed the
+ * cluster already. In that case, we just return without changing anything
+ */
+static int
+__btrfs_return_cluster_to_free_space(
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster)
+{
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+ bool bitmap;
+
+ spin_lock(&cluster->lock);
+ if (cluster->block_group != block_group)
+ goto out;
+
+ bitmap = cluster->points_to_bitmap;
+ cluster->block_group = NULL;
+ cluster->window_start = 0;
+ list_del_init(&cluster->block_group_list);
+ cluster->points_to_bitmap = false;
+
+ if (bitmap)
+ goto out;
+
+ node = rb_first(&cluster->root);
+ while (node) {
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ node = rb_next(&entry->offset_index);
+ rb_erase(&entry->offset_index, &cluster->root);
+ BUG_ON(entry->bitmap);
+ tree_insert_offset(&block_group->free_space_offset,
+ entry->offset, &entry->offset_index, 0);
+ }
+ cluster->root.rb_node = NULL;
+
+out:
+ spin_unlock(&cluster->lock);
+ btrfs_put_block_group(block_group);
+ return 0;
+}
+
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *node;
+ struct btrfs_free_cluster *cluster;
+ struct list_head *head;
+
+ spin_lock(&block_group->tree_lock);
+ while ((head = block_group->cluster_list.next) !=
+ &block_group->cluster_list) {
+ cluster = list_entry(head, struct btrfs_free_cluster,
+ block_group_list);
+
+ WARN_ON(cluster->block_group != block_group);
+ __btrfs_return_cluster_to_free_space(block_group, cluster);
+ if (need_resched()) {
+ spin_unlock(&block_group->tree_lock);
+ cond_resched();
+ spin_lock(&block_group->tree_lock);
+ }
+ }
+
+ while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
+ info = rb_entry(node, struct btrfs_free_space, offset_index);
+ unlink_free_space(block_group, info);
+ if (info->bitmap)
+ kfree(info->bitmap);
+ kfree(info);
+ if (need_resched()) {
+ spin_unlock(&block_group->tree_lock);
+ cond_resched();
+ spin_lock(&block_group->tree_lock);
+ }
+ }
+
+ spin_unlock(&block_group->tree_lock);
+}
+
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes, u64 empty_size)
+{
+ struct btrfs_free_space *entry = NULL;
+ u64 bytes_search = bytes + empty_size;
+ u64 ret = 0;
+
+ spin_lock(&block_group->tree_lock);
+ entry = find_free_space(block_group, &offset, &bytes_search, 0);
+ if (!entry)
+ goto out;
+
+ ret = offset;
+ if (entry->bitmap) {
+ bitmap_clear_bits(block_group, entry, offset, bytes);
+ if (!entry->bytes) {
+ unlink_free_space(block_group, entry);
+ kfree(entry->bitmap);
+ kfree(entry);
+ block_group->total_bitmaps--;
+ recalculate_thresholds(block_group);
+ }
+ } else {
+ unlink_free_space(block_group, entry);
+ entry->offset += bytes;
+ entry->bytes -= bytes;
+ if (!entry->bytes)
+ kfree(entry);
+ else
+ link_free_space(block_group, entry);
+ }
+
+out:
+ spin_unlock(&block_group->tree_lock);
+
+ return ret;
+}
+
+/*
+ * given a cluster, put all of its extents back into the free space
+ * cache. If a block group is passed, this function will only free
+ * a cluster that belongs to the passed block group.
+ *
+ * Otherwise, it'll get a reference on the block group pointed to by the
+ * cluster and remove the cluster from it.
+ */
+int btrfs_return_cluster_to_free_space(
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster)
+{
+ int ret;
+
+ /* first, get a safe pointer to the block group */
+ spin_lock(&cluster->lock);
+ if (!block_group) {
+ block_group = cluster->block_group;
+ if (!block_group) {
+ spin_unlock(&cluster->lock);
+ return 0;
+ }
+ } else if (cluster->block_group != block_group) {
+ /* someone else has already freed it don't redo their work */
+ spin_unlock(&cluster->lock);
+ return 0;
+ }
+ atomic_inc(&block_group->count);
+ spin_unlock(&cluster->lock);
+
+ /* now return any extents the cluster had on it */
+ spin_lock(&block_group->tree_lock);
+ ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
+ spin_unlock(&block_group->tree_lock);
+
+ /* finally drop our ref */
+ btrfs_put_block_group(block_group);
+ return ret;
+}
+
+static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ u64 bytes, u64 min_start)
+{
+ struct btrfs_free_space *entry;
+ int err;
+ u64 search_start = cluster->window_start;
+ u64 search_bytes = bytes;
+ u64 ret = 0;
+
+ spin_lock(&block_group->tree_lock);
+ spin_lock(&cluster->lock);
+
+ if (!cluster->points_to_bitmap)
+ goto out;
+
+ if (cluster->block_group != block_group)
+ goto out;
+
+ /*
+ * search_start is the beginning of the bitmap, but at some point it may
+ * be a good idea to point to the actual start of the free area in the
+ * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
+ * to 1 to make sure we get the bitmap entry
+ */
+ entry = tree_search_offset(block_group,
+ offset_to_bitmap(block_group, search_start),
+ 1, 0);
+ if (!entry || !entry->bitmap)
+ goto out;
+
+ search_start = min_start;
+ search_bytes = bytes;
+
+ err = search_bitmap(block_group, entry, &search_start,
+ &search_bytes);
+ if (err)
+ goto out;
+
+ ret = search_start;
+ bitmap_clear_bits(block_group, entry, ret, bytes);
+out:
+ spin_unlock(&cluster->lock);
+ spin_unlock(&block_group->tree_lock);
+
+ return ret;
+}
+
+/*
+ * given a cluster, try to allocate 'bytes' from it, returns 0
+ * if it couldn't find anything suitably large, or a logical disk offset
+ * if things worked out
+ */
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster, u64 bytes,
+ u64 min_start)
+{
+ struct btrfs_free_space *entry = NULL;
+ struct rb_node *node;
+ u64 ret = 0;
+
+ if (cluster->points_to_bitmap)
+ return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
+ min_start);
+
+ spin_lock(&cluster->lock);
+ if (bytes > cluster->max_size)
+ goto out;
+
+ if (cluster->block_group != block_group)
+ goto out;
+
+ node = rb_first(&cluster->root);
+ if (!node)
+ goto out;
+
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+
+ while(1) {
+ if (entry->bytes < bytes || entry->offset < min_start) {
+ struct rb_node *node;
+
+ node = rb_next(&entry->offset_index);
+ if (!node)
+ break;
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
+ continue;
+ }
+ ret = entry->offset;
+
+ entry->offset += bytes;
+ entry->bytes -= bytes;
+
+ if (entry->bytes == 0) {
+ rb_erase(&entry->offset_index, &cluster->root);
+ kfree(entry);
+ }
+ break;
+ }
+out:
+ spin_unlock(&cluster->lock);
+
+ return ret;
+}
+
+static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *entry,
+ struct btrfs_free_cluster *cluster,
+ u64 offset, u64 bytes, u64 min_bytes)
+{
+ unsigned long next_zero;
+ unsigned long i;
+ unsigned long search_bits;
+ unsigned long total_bits;
+ unsigned long found_bits;
+ unsigned long start = 0;
+ unsigned long total_found = 0;
+ bool found = false;
+
+ i = offset_to_bit(entry->offset, block_group->sectorsize,
+ max_t(u64, offset, entry->offset));
+ search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+ total_bits = bytes_to_bits(bytes, block_group->sectorsize);
+
+again:
+ found_bits = 0;
+ for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
+ i < BITS_PER_BITMAP;
+ i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
+ next_zero = find_next_zero_bit(entry->bitmap,
+ BITS_PER_BITMAP, i);
+ if (next_zero - i >= search_bits) {
+ found_bits = next_zero - i;
+ break;
+ }
+ i = next_zero;
+ }
+
+ if (!found_bits)
+ return -1;
+
+ if (!found) {
+ start = i;
+ found = true;
+ }
+
+ total_found += found_bits;
+
+ if (cluster->max_size < found_bits * block_group->sectorsize)
+ cluster->max_size = found_bits * block_group->sectorsize;
+
+ if (total_found < total_bits) {
+ i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
+ if (i - start > total_bits * 2) {
+ total_found = 0;
+ cluster->max_size = 0;
+ found = false;
+ }
+ goto again;
+ }
+
+ cluster->window_start = start * block_group->sectorsize +
+ entry->offset;
+ cluster->points_to_bitmap = true;
+
+ return 0;
+}
+
+/*
+ * here we try to find a cluster of blocks in a block group. The goal
+ * is to find at least bytes free and up to empty_size + bytes free.
+ * We might not find them all in one contiguous area.
+ *
+ * returns zero and sets up cluster if things worked out, otherwise
+ * it returns -enospc
+ */
+int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ u64 offset, u64 bytes, u64 empty_size)
+{
+ struct btrfs_free_space *entry = NULL;
+ struct rb_node *node;
+ struct btrfs_free_space *next;
+ struct btrfs_free_space *last = NULL;
+ u64 min_bytes;
+ u64 window_start;
+ u64 window_free;
+ u64 max_extent = 0;
+ bool found_bitmap = false;
+ int ret;
+
+ /* for metadata, allow allocates with more holes */
+ if (btrfs_test_opt(root, SSD_SPREAD)) {
+ min_bytes = bytes + empty_size;
+ } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ /*
+ * we want to do larger allocations when we are
+ * flushing out the delayed refs, it helps prevent
+ * making more work as we go along.
+ */
+ if (trans->transaction->delayed_refs.flushing)
+ min_bytes = max(bytes, (bytes + empty_size) >> 1);
+ else
+ min_bytes = max(bytes, (bytes + empty_size) >> 4);
+ } else
+ min_bytes = max(bytes, (bytes + empty_size) >> 2);
+
+ spin_lock(&block_group->tree_lock);
+ spin_lock(&cluster->lock);
+
+ /* someone already found a cluster, hooray */
+ if (cluster->block_group) {
+ ret = 0;
+ goto out;
+ }
+again:
+ entry = tree_search_offset(block_group, offset, found_bitmap, 1);
+ if (!entry) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ /*
+ * If found_bitmap is true, we exhausted our search for extent entries,
+ * and we just want to search all of the bitmaps that we can find, and
+ * ignore any extent entries we find.
+ */
+ while (entry->bitmap || found_bitmap ||
+ (!entry->bitmap && entry->bytes < min_bytes)) {
+ struct rb_node *node = rb_next(&entry->offset_index);
+
+ if (entry->bitmap && entry->bytes > bytes + empty_size) {
+ ret = btrfs_bitmap_cluster(block_group, entry, cluster,
+ offset, bytes + empty_size,
+ min_bytes);
+ if (!ret)
+ goto got_it;
+ }
+
+ if (!node) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ }
+
+ /*
+ * We already searched all the extent entries from the passed in offset
+ * to the end and didn't find enough space for the cluster, and we also
+ * didn't find any bitmaps that met our criteria, just go ahead and exit
+ */
+ if (found_bitmap) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ cluster->points_to_bitmap = false;
+ window_start = entry->offset;
+ window_free = entry->bytes;
+ last = entry;
+ max_extent = entry->bytes;
+
+ while (1) {
+ /* out window is just right, lets fill it */
+ if (window_free >= bytes + empty_size)
+ break;
+
+ node = rb_next(&last->offset_index);
+ if (!node) {
+ if (found_bitmap)
+ goto again;
+ ret = -ENOSPC;
+ goto out;
+ }
+ next = rb_entry(node, struct btrfs_free_space, offset_index);
+
+ /*
+ * we found a bitmap, so if this search doesn't result in a
+ * cluster, we know to go and search again for the bitmaps and
+ * start looking for space there
+ */
+ if (next->bitmap) {
+ if (!found_bitmap)
+ offset = next->offset;
+ found_bitmap = true;
+ last = next;
+ continue;
+ }
+
+ /*
+ * we haven't filled the empty size and the window is
+ * very large. reset and try again
+ */
+ if (next->offset - (last->offset + last->bytes) > 128 * 1024 ||
+ next->offset - window_start > (bytes + empty_size) * 2) {
+ entry = next;
+ window_start = entry->offset;
+ window_free = entry->bytes;
+ last = entry;
+ max_extent = entry->bytes;
+ } else {
+ last = next;
+ window_free += next->bytes;
+ if (entry->bytes > max_extent)
+ max_extent = entry->bytes;
+ }
+ }
+
+ cluster->window_start = entry->offset;
+
+ /*
+ * now we've found our entries, pull them out of the free space
+ * cache and put them into the cluster rbtree
+ *
+ * The cluster includes an rbtree, but only uses the offset index
+ * of each free space cache entry.
+ */
+ while (1) {
+ node = rb_next(&entry->offset_index);
+ if (entry->bitmap && node) {
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
+ continue;
+ } else if (entry->bitmap && !node) {
+ break;
+ }
+
+ rb_erase(&entry->offset_index, &block_group->free_space_offset);
+ ret = tree_insert_offset(&cluster->root, entry->offset,
+ &entry->offset_index, 0);
+ BUG_ON(ret);
+
+ if (!node || entry == last)
+ break;
+
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ }
+
+ cluster->max_size = max_extent;
+got_it:
+ ret = 0;
+ atomic_inc(&block_group->count);
+ list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
+ cluster->block_group = block_group;
+out:
+ spin_unlock(&cluster->lock);
+ spin_unlock(&block_group->tree_lock);
+
+ return ret;
+}
+
+/*
+ * simple code to zero out a cluster
+ */
+void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
+{
+ spin_lock_init(&cluster->lock);
+ spin_lock_init(&cluster->refill_lock);
+ cluster->root.rb_node = NULL;
+ cluster->max_size = 0;
+ cluster->points_to_bitmap = false;
+ INIT_LIST_HEAD(&cluster->block_group_list);
+ cluster->block_group = NULL;
+}
+
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
new file mode 100644
index 00000000000..890a8e79011
--- /dev/null
+++ b/fs/btrfs/free-space-cache.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_FREE_SPACE_CACHE
+#define __BTRFS_FREE_SPACE_CACHE
+
+struct btrfs_free_space {
+ struct rb_node offset_index;
+ u64 offset;
+ u64 bytes;
+ unsigned long *bitmap;
+ struct list_head list;
+};
+
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytenr, u64 size);
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytenr, u64 size);
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
+ *block_group);
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes, u64 empty_size);
+void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytes);
+u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
+int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ u64 offset, u64 bytes, u64 empty_size);
+void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster, u64 bytes,
+ u64 min_start);
+int btrfs_return_cluster_to_free_space(
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster);
+#endif
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
new file mode 100644
index 00000000000..db2ff9773b9
--- /dev/null
+++ b/fs/btrfs/hash.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __HASH__
+#define __HASH__
+
+#include <linux/crc32c.h>
+static inline u64 btrfs_name_hash(const char *name, int len)
+{
+ return crc32c((u32)~1, name, len);
+}
+#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
new file mode 100644
index 00000000000..72ce3c173d6
--- /dev/null
+++ b/fs/btrfs/inode-item.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+
+static int find_name_in_backref(struct btrfs_path *path, const char *name,
+ int name_len, struct btrfs_inode_ref **ref_ret)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_inode_ref *ref;
+ unsigned long ptr;
+ unsigned long name_ptr;
+ u32 item_size;
+ u32 cur_offset = 0;
+ int len;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ while (cur_offset < item_size) {
+ ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
+ len = btrfs_inode_ref_name_len(leaf, ref);
+ name_ptr = (unsigned long)(ref + 1);
+ cur_offset += len + sizeof(*ref);
+ if (len != name_len)
+ continue;
+ if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
+ *ref_ret = ref;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 *index)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_inode_ref *ref;
+ struct extent_buffer *leaf;
+ unsigned long ptr;
+ unsigned long item_start;
+ u32 item_size;
+ u32 sub_item_len;
+ int ret;
+ int del_len = name_len + sizeof(*ref);
+
+ key.objectid = inode_objectid;
+ key.offset = ref_objectid;
+ btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ } else if (ret < 0) {
+ goto out;
+ }
+ if (!find_name_in_backref(path, name, name_len, &ref)) {
+ ret = -ENOENT;
+ goto out;
+ }
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+
+ if (index)
+ *index = btrfs_inode_ref_index(leaf, ref);
+
+ if (del_len == item_size) {
+ ret = btrfs_del_item(trans, root, path);
+ goto out;
+ }
+ ptr = (unsigned long)ref;
+ sub_item_len = name_len + sizeof(*ref);
+ item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
+ item_size - (ptr + sub_item_len - item_start));
+ ret = btrfs_truncate_item(trans, root, path,
+ item_size - sub_item_len, 1);
+ BUG_ON(ret);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 index)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_inode_ref *ref;
+ unsigned long ptr;
+ int ret;
+ int ins_len = name_len + sizeof(*ref);
+
+ key.objectid = inode_objectid;
+ key.offset = ref_objectid;
+ btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ ins_len);
+ if (ret == -EEXIST) {
+ u32 old_size;
+
+ if (find_name_in_backref(path, name, name_len, &ref))
+ goto out;
+
+ old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+ ret = btrfs_extend_item(trans, root, path, ins_len);
+ BUG_ON(ret);
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_ref);
+ ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, index);
+ ptr = (unsigned long)(ref + 1);
+ ret = 0;
+ } else if (ret < 0) {
+ if (ret == -EOVERFLOW)
+ ret = -EMLINK;
+ goto out;
+ } else {
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_ref);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, index);
+ ptr = (unsigned long)(ref + 1);
+ }
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid)
+{
+ struct btrfs_key key;
+ int ret;
+ key.objectid = objectid;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(struct btrfs_inode_item));
+ return ret;
+}
+
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path,
+ struct btrfs_key *location, int mod)
+{
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+ int ret;
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_key found_key;
+
+ ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
+ if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
+ location->offset == (u64)-1 && path->slots[0] != 0) {
+ slot = path->slots[0] - 1;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (found_key.objectid == location->objectid &&
+ btrfs_key_type(&found_key) == btrfs_key_type(location)) {
+ path->slots[0]--;
+ return 0;
+ }
+ }
+ return ret;
+}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
new file mode 100644
index 00000000000..c56eb590917
--- /dev/null
+++ b/fs/btrfs/inode-map.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+
+int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct extent_buffer *l;
+ struct btrfs_key search_key;
+ struct btrfs_key found_key;
+ int slot;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
+ search_key.type = -1;
+ search_key.offset = (u64)-1;
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+ BUG_ON(ret == 0);
+ if (path->slots[0] > 0) {
+ slot = path->slots[0] - 1;
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+ *objectid = max_t(u64, found_key.objectid,
+ BTRFS_FIRST_FREE_OBJECTID - 1);
+ } else {
+ *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 dirid, u64 *objectid)
+{
+ int ret;
+ mutex_lock(&root->objectid_mutex);
+
+ if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
+ ret = btrfs_find_highest_inode(root, &root->highest_objectid);
+ if (ret)
+ goto out;
+ }
+
+ if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ *objectid = ++root->highest_objectid;
+ ret = 0;
+out:
+ mutex_unlock(&root->objectid_mutex);
+ return ret;
+}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
new file mode 100644
index 00000000000..4deb280f896
--- /dev/null
+++ b/fs/btrfs/inode.c
@@ -0,0 +1,6056 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/bit_spinlock.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/falloc.h>
+#include "compat.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "ioctl.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "xattr.h"
+#include "tree-log.h"
+#include "compression.h"
+#include "locking.h"
+
+struct btrfs_iget_args {
+ u64 ino;
+ struct btrfs_root *root;
+};
+
+static const struct inode_operations btrfs_dir_inode_operations;
+static const struct inode_operations btrfs_symlink_inode_operations;
+static const struct inode_operations btrfs_dir_ro_inode_operations;
+static const struct inode_operations btrfs_special_inode_operations;
+static const struct inode_operations btrfs_file_inode_operations;
+static const struct address_space_operations btrfs_aops;
+static const struct address_space_operations btrfs_symlink_aops;
+static const struct file_operations btrfs_dir_file_operations;
+static struct extent_io_ops btrfs_extent_io_ops;
+
+static struct kmem_cache *btrfs_inode_cachep;
+struct kmem_cache *btrfs_trans_handle_cachep;
+struct kmem_cache *btrfs_transaction_cachep;
+struct kmem_cache *btrfs_path_cachep;
+
+#define S_SHIFT 12
+static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+ [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
+ [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
+ [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
+ [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
+ [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
+ [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
+ [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
+};
+
+static void btrfs_truncate(struct inode *inode);
+static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
+static noinline int cow_file_range(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written, int unlock);
+
+static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir)
+{
+ int err;
+
+ err = btrfs_init_acl(trans, inode, dir);
+ if (!err)
+ err = btrfs_xattr_security_init(trans, inode, dir);
+ return err;
+}
+
+/*
+ * this does all the hard work for inserting an inline extent into
+ * the btree. The caller should have done a btrfs_drop_extents so that
+ * no overlapping inline items exist in the btree
+ */
+static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ u64 start, size_t size, size_t compressed_size,
+ struct page **compressed_pages)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct page *page = NULL;
+ char *kaddr;
+ unsigned long ptr;
+ struct btrfs_file_extent_item *ei;
+ int err = 0;
+ int ret;
+ size_t cur_size = size;
+ size_t datasize;
+ unsigned long offset;
+ int use_compress = 0;
+
+ if (compressed_size && compressed_pages) {
+ use_compress = 1;
+ cur_size = compressed_size;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+ btrfs_set_trans_block_group(trans, inode);
+
+ key.objectid = inode->i_ino;
+ key.offset = start;
+ btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+ datasize = btrfs_file_extent_calc_inline_size(cur_size);
+
+ inode_add_bytes(inode, size);
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ datasize);
+ BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+ btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_encryption(leaf, ei, 0);
+ btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+ btrfs_set_file_extent_ram_bytes(leaf, ei, size);
+ ptr = btrfs_file_extent_inline_start(ei);
+
+ if (use_compress) {
+ struct page *cpage;
+ int i = 0;
+ while (compressed_size > 0) {
+ cpage = compressed_pages[i];
+ cur_size = min_t(unsigned long, compressed_size,
+ PAGE_CACHE_SIZE);
+
+ kaddr = kmap_atomic(cpage, KM_USER0);
+ write_extent_buffer(leaf, kaddr, ptr, cur_size);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ i++;
+ ptr += cur_size;
+ compressed_size -= cur_size;
+ }
+ btrfs_set_file_extent_compression(leaf, ei,
+ BTRFS_COMPRESS_ZLIB);
+ } else {
+ page = find_get_page(inode->i_mapping,
+ start >> PAGE_CACHE_SHIFT);
+ btrfs_set_file_extent_compression(leaf, ei, 0);
+ kaddr = kmap_atomic(page, KM_USER0);
+ offset = start & (PAGE_CACHE_SIZE - 1);
+ write_extent_buffer(leaf, kaddr + offset, ptr, size);
+ kunmap_atomic(kaddr, KM_USER0);
+ page_cache_release(page);
+ }
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_free_path(path);
+
+ /*
+ * we're an inline extent, so nobody can
+ * extend the file past i_size without locking
+ * a page we already have locked.
+ *
+ * We must do any isize and inode updates
+ * before we unlock the pages. Otherwise we
+ * could end up racing with unlink.
+ */
+ BTRFS_I(inode)->disk_i_size = inode->i_size;
+ btrfs_update_inode(trans, root, inode);
+
+ return 0;
+fail:
+ btrfs_free_path(path);
+ return err;
+}
+
+
+/*
+ * conditionally insert an inline extent into the file. This
+ * does the checks required to make sure the data is small enough
+ * to fit as an inline extent.
+ */
+static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode, u64 start, u64 end,
+ size_t compressed_size,
+ struct page **compressed_pages)
+{
+ u64 isize = i_size_read(inode);
+ u64 actual_end = min(end + 1, isize);
+ u64 inline_len = actual_end - start;
+ u64 aligned_end = (end + root->sectorsize - 1) &
+ ~((u64)root->sectorsize - 1);
+ u64 hint_byte;
+ u64 data_len = inline_len;
+ int ret;
+
+ if (compressed_size)
+ data_len = compressed_size;
+
+ if (start > 0 ||
+ actual_end >= PAGE_CACHE_SIZE ||
+ data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+ (!compressed_size &&
+ (actual_end & (root->sectorsize - 1)) == 0) ||
+ end + 1 < isize ||
+ data_len > root->fs_info->max_inline) {
+ return 1;
+ }
+
+ ret = btrfs_drop_extents(trans, inode, start, aligned_end,
+ &hint_byte, 1);
+ BUG_ON(ret);
+
+ if (isize > actual_end)
+ inline_len = min_t(u64, isize, actual_end);
+ ret = insert_inline_extent(trans, root, inode, start,
+ inline_len, compressed_size,
+ compressed_pages);
+ BUG_ON(ret);
+ btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
+ return 0;
+}
+
+struct async_extent {
+ u64 start;
+ u64 ram_size;
+ u64 compressed_size;
+ struct page **pages;
+ unsigned long nr_pages;
+ struct list_head list;
+};
+
+struct async_cow {
+ struct inode *inode;
+ struct btrfs_root *root;
+ struct page *locked_page;
+ u64 start;
+ u64 end;
+ struct list_head extents;
+ struct btrfs_work work;
+};
+
+static noinline int add_async_extent(struct async_cow *cow,
+ u64 start, u64 ram_size,
+ u64 compressed_size,
+ struct page **pages,
+ unsigned long nr_pages)
+{
+ struct async_extent *async_extent;
+
+ async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
+ async_extent->start = start;
+ async_extent->ram_size = ram_size;
+ async_extent->compressed_size = compressed_size;
+ async_extent->pages = pages;
+ async_extent->nr_pages = nr_pages;
+ list_add_tail(&async_extent->list, &cow->extents);
+ return 0;
+}
+
+/*
+ * we create compressed extents in two phases. The first
+ * phase compresses a range of pages that have already been
+ * locked (both pages and state bits are locked).
+ *
+ * This is done inside an ordered work queue, and the compression
+ * is spread across many cpus. The actual IO submission is step
+ * two, and the ordered work queue takes care of making sure that
+ * happens in the same order things were put onto the queue by
+ * writepages and friends.
+ *
+ * If this code finds it can't get good compression, it puts an
+ * entry onto the work queue to write the uncompressed bytes. This
+ * makes sure that both compressed inodes and uncompressed inodes
+ * are written in the same order that pdflush sent them down.
+ */
+static noinline int compress_file_range(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end,
+ struct async_cow *async_cow,
+ int *num_added)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ u64 num_bytes;
+ u64 orig_start;
+ u64 disk_num_bytes;
+ u64 blocksize = root->sectorsize;
+ u64 actual_end;
+ u64 isize = i_size_read(inode);
+ int ret = 0;
+ struct page **pages = NULL;
+ unsigned long nr_pages;
+ unsigned long nr_pages_ret = 0;
+ unsigned long total_compressed = 0;
+ unsigned long total_in = 0;
+ unsigned long max_compressed = 128 * 1024;
+ unsigned long max_uncompressed = 128 * 1024;
+ int i;
+ int will_compress;
+
+ orig_start = start;
+
+ actual_end = min_t(u64, isize, end + 1);
+again:
+ will_compress = 0;
+ nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
+ nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
+
+ /*
+ * we don't want to send crud past the end of i_size through
+ * compression, that's just a waste of CPU time. So, if the
+ * end of the file is before the start of our current
+ * requested range of bytes, we bail out to the uncompressed
+ * cleanup code that can deal with all of this.
+ *
+ * It isn't really the fastest way to fix things, but this is a
+ * very uncommon corner.
+ */
+ if (actual_end <= start)
+ goto cleanup_and_bail_uncompressed;
+
+ total_compressed = actual_end - start;
+
+ /* we want to make sure that amount of ram required to uncompress
+ * an extent is reasonable, so we limit the total size in ram
+ * of a compressed extent to 128k. This is a crucial number
+ * because it also controls how easily we can spread reads across
+ * cpus for decompression.
+ *
+ * We also want to make sure the amount of IO required to do
+ * a random read is reasonably small, so we limit the size of
+ * a compressed extent to 128k.
+ */
+ total_compressed = min(total_compressed, max_uncompressed);
+ num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+ num_bytes = max(blocksize, num_bytes);
+ disk_num_bytes = num_bytes;
+ total_in = 0;
+ ret = 0;
+
+ /*
+ * we do compression for mount -o compress and when the
+ * inode has not been flagged as nocompress. This flag can
+ * change at any time if we discover bad compression ratios.
+ */
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
+ btrfs_test_opt(root, COMPRESS)) {
+ WARN_ON(pages);
+ pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+
+ ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
+ total_compressed, pages,
+ nr_pages, &nr_pages_ret,
+ &total_in,
+ &total_compressed,
+ max_compressed);
+
+ if (!ret) {
+ unsigned long offset = total_compressed &
+ (PAGE_CACHE_SIZE - 1);
+ struct page *page = pages[nr_pages_ret - 1];
+ char *kaddr;
+
+ /* zero the tail end of the last page, we might be
+ * sending it down to disk
+ */
+ if (offset) {
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + offset, 0,
+ PAGE_CACHE_SIZE - offset);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+ will_compress = 1;
+ }
+ }
+ if (start == 0) {
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(!trans);
+ btrfs_set_trans_block_group(trans, inode);
+
+ /* lets try to make an inline extent */
+ if (ret || total_in < (actual_end - start)) {
+ /* we didn't compress the entire range, try
+ * to make an uncompressed inline extent.
+ */
+ ret = cow_file_range_inline(trans, root, inode,
+ start, end, 0, NULL);
+ } else {
+ /* try making a compressed inline extent */
+ ret = cow_file_range_inline(trans, root, inode,
+ start, end,
+ total_compressed, pages);
+ }
+ if (ret == 0) {
+ /*
+ * inline extent creation worked, we don't need
+ * to create any more async work items. Unlock
+ * and free up our temp pages.
+ */
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ start, end, NULL,
+ EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_ACCOUNTING |
+ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
+
+ btrfs_end_transaction(trans, root);
+ goto free_pages_out;
+ }
+ btrfs_end_transaction(trans, root);
+ }
+
+ if (will_compress) {
+ /*
+ * we aren't doing an inline extent round the compressed size
+ * up to a block size boundary so the allocator does sane
+ * things
+ */
+ total_compressed = (total_compressed + blocksize - 1) &
+ ~(blocksize - 1);
+
+ /*
+ * one last check to make sure the compression is really a
+ * win, compare the page count read with the blocks on disk
+ */
+ total_in = (total_in + PAGE_CACHE_SIZE - 1) &
+ ~(PAGE_CACHE_SIZE - 1);
+ if (total_compressed >= total_in) {
+ will_compress = 0;
+ } else {
+ disk_num_bytes = total_compressed;
+ num_bytes = total_in;
+ }
+ }
+ if (!will_compress && pages) {
+ /*
+ * the compression code ran but failed to make things smaller,
+ * free any pages it allocated and our page pointer array
+ */
+ for (i = 0; i < nr_pages_ret; i++) {
+ WARN_ON(pages[i]->mapping);
+ page_cache_release(pages[i]);
+ }
+ kfree(pages);
+ pages = NULL;
+ total_compressed = 0;
+ nr_pages_ret = 0;
+
+ /* flag the file so we don't compress in the future */
+ if (!btrfs_test_opt(root, FORCE_COMPRESS))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ }
+ if (will_compress) {
+ *num_added += 1;
+
+ /* the async work queues will take care of doing actual
+ * allocation on disk for these compressed pages,
+ * and will submit them to the elevator.
+ */
+ add_async_extent(async_cow, start, num_bytes,
+ total_compressed, pages, nr_pages_ret);
+
+ if (start + num_bytes < end && start + num_bytes < actual_end) {
+ start += num_bytes;
+ pages = NULL;
+ cond_resched();
+ goto again;
+ }
+ } else {
+cleanup_and_bail_uncompressed:
+ /*
+ * No compression, but we still need to write the pages in
+ * the file we've been given so far. redirty the locked
+ * page if it corresponds to our extent and set things up
+ * for the async work queue to run cow_file_range to do
+ * the normal delalloc dance
+ */
+ if (page_offset(locked_page) >= start &&
+ page_offset(locked_page) <= end) {
+ __set_page_dirty_nobuffers(locked_page);
+ /* unlocked later on in the async handlers */
+ }
+ add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
+ *num_added += 1;
+ }
+
+out:
+ return 0;
+
+free_pages_out:
+ for (i = 0; i < nr_pages_ret; i++) {
+ WARN_ON(pages[i]->mapping);
+ page_cache_release(pages[i]);
+ }
+ kfree(pages);
+
+ goto out;
+}
+
+/*
+ * phase two of compressed writeback. This is the ordered portion
+ * of the code, which only gets called in the order the work was
+ * queued. We walk all the async extents created by compress_file_range
+ * and send them down to the disk.
+ */
+static noinline int submit_compressed_extents(struct inode *inode,
+ struct async_cow *async_cow)
+{
+ struct async_extent *async_extent;
+ u64 alloc_hint = 0;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_key ins;
+ struct extent_map *em;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_io_tree *io_tree;
+ int ret = 0;
+
+ if (list_empty(&async_cow->extents))
+ return 0;
+
+
+ while (!list_empty(&async_cow->extents)) {
+ async_extent = list_entry(async_cow->extents.next,
+ struct async_extent, list);
+ list_del(&async_extent->list);
+
+ io_tree = &BTRFS_I(inode)->io_tree;
+
+retry:
+ /* did the compression code fall back to uncompressed IO? */
+ if (!async_extent->pages) {
+ int page_started = 0;
+ unsigned long nr_written = 0;
+
+ lock_extent(io_tree, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1, GFP_NOFS);
+
+ /* allocate blocks */
+ ret = cow_file_range(inode, async_cow->locked_page,
+ async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ &page_started, &nr_written, 0);
+
+ /*
+ * if page_started, cow_file_range inserted an
+ * inline extent and took care of all the unlocking
+ * and IO for us. Otherwise, we need to submit
+ * all those pages down to the drive.
+ */
+ if (!page_started && !ret)
+ extent_write_locked_range(io_tree,
+ inode, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ btrfs_get_extent,
+ WB_SYNC_ALL);
+ kfree(async_extent);
+ cond_resched();
+ continue;
+ }
+
+ lock_extent(io_tree, async_extent->start,
+ async_extent->start + async_extent->ram_size - 1,
+ GFP_NOFS);
+
+ trans = btrfs_join_transaction(root, 1);
+ ret = btrfs_reserve_extent(trans, root,
+ async_extent->compressed_size,
+ async_extent->compressed_size,
+ 0, alloc_hint,
+ (u64)-1, &ins, 1);
+ btrfs_end_transaction(trans, root);
+
+ if (ret) {
+ int i;
+ for (i = 0; i < async_extent->nr_pages; i++) {
+ WARN_ON(async_extent->pages[i]->mapping);
+ page_cache_release(async_extent->pages[i]);
+ }
+ kfree(async_extent->pages);
+ async_extent->nr_pages = 0;
+ async_extent->pages = NULL;
+ unlock_extent(io_tree, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1, GFP_NOFS);
+ goto retry;
+ }
+
+ /*
+ * here we're doing allocation and writeback of the
+ * compressed pages
+ */
+ btrfs_drop_extent_cache(inode, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1, 0);
+
+ em = alloc_extent_map(GFP_NOFS);
+ em->start = async_extent->start;
+ em->len = async_extent->ram_size;
+ em->orig_start = em->start;
+
+ em->block_start = ins.objectid;
+ em->block_len = ins.offset;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(inode, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1, 0);
+ }
+
+ ret = btrfs_add_ordered_extent(inode, async_extent->start,
+ ins.objectid,
+ async_extent->ram_size,
+ ins.offset,
+ BTRFS_ORDERED_COMPRESSED);
+ BUG_ON(ret);
+
+ /*
+ * clear dirty, set writeback and unlock the pages.
+ */
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ NULL, EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
+
+ ret = btrfs_submit_compressed_write(inode,
+ async_extent->start,
+ async_extent->ram_size,
+ ins.objectid,
+ ins.offset, async_extent->pages,
+ async_extent->nr_pages);
+
+ BUG_ON(ret);
+ alloc_hint = ins.objectid + ins.offset;
+ kfree(async_extent);
+ cond_resched();
+ }
+
+ return 0;
+}
+
+/*
+ * when extent_io.c finds a delayed allocation range in the file,
+ * the call backs end up in this code. The basic idea is to
+ * allocate extents on disk for the range, and create ordered data structs
+ * in ram to track those extents.
+ *
+ * locked_page is the page that writepage had locked already. We use
+ * it to make sure we don't do extra locks or unlocks.
+ *
+ * *page_started is set to one if we unlock locked_page and do everything
+ * required to start IO on it. It may be clean and already done with
+ * IO when we return.
+ */
+static noinline int cow_file_range(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written,
+ int unlock)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ u64 alloc_hint = 0;
+ u64 num_bytes;
+ unsigned long ram_size;
+ u64 disk_num_bytes;
+ u64 cur_alloc_size;
+ u64 blocksize = root->sectorsize;
+ u64 actual_end;
+ u64 isize = i_size_read(inode);
+ struct btrfs_key ins;
+ struct extent_map *em;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ int ret = 0;
+
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(!trans);
+ btrfs_set_trans_block_group(trans, inode);
+
+ actual_end = min_t(u64, isize, end + 1);
+
+ num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+ num_bytes = max(blocksize, num_bytes);
+ disk_num_bytes = num_bytes;
+ ret = 0;
+
+ if (start == 0) {
+ /* lets try to make an inline extent */
+ ret = cow_file_range_inline(trans, root, inode,
+ start, end, 0, NULL);
+ if (ret == 0) {
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ start, end, NULL,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_ACCOUNTING |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
+
+ *nr_written = *nr_written +
+ (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
+ *page_started = 1;
+ ret = 0;
+ goto out;
+ }
+ }
+
+ BUG_ON(disk_num_bytes >
+ btrfs_super_total_bytes(&root->fs_info->super_copy));
+
+
+ read_lock(&BTRFS_I(inode)->extent_tree.lock);
+ em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
+ start, num_bytes);
+ if (em) {
+ /*
+ * if block start isn't an actual block number then find the
+ * first block in this inode and use that as a hint. If that
+ * block is also bogus then just don't worry about it.
+ */
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ free_extent_map(em);
+ em = search_extent_mapping(em_tree, 0, 0);
+ if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+ alloc_hint = em->block_start;
+ if (em)
+ free_extent_map(em);
+ } else {
+ alloc_hint = em->block_start;
+ free_extent_map(em);
+ }
+ }
+ read_unlock(&BTRFS_I(inode)->extent_tree.lock);
+ btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
+
+ while (disk_num_bytes > 0) {
+ unsigned long op;
+
+ cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
+ ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
+ root->sectorsize, 0, alloc_hint,
+ (u64)-1, &ins, 1);
+ BUG_ON(ret);
+
+ em = alloc_extent_map(GFP_NOFS);
+ em->start = start;
+ em->orig_start = em->start;
+ ram_size = ins.offset;
+ em->len = ins.offset;
+
+ em->block_start = ins.objectid;
+ em->block_len = ins.offset;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(inode, start,
+ start + ram_size - 1, 0);
+ }
+
+ cur_alloc_size = ins.offset;
+ ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
+ ram_size, cur_alloc_size, 0);
+ BUG_ON(ret);
+
+ if (root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ ret = btrfs_reloc_clone_csums(inode, start,
+ cur_alloc_size);
+ BUG_ON(ret);
+ }
+
+ if (disk_num_bytes < cur_alloc_size)
+ break;
+
+ /* we're not doing compressed IO, don't unlock the first
+ * page (which the caller expects to stay locked), don't
+ * clear any dirty bits and don't set any writeback bits
+ *
+ * Do set the Private2 bit so we know this page was properly
+ * setup for writepage
+ */
+ op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
+ op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
+ EXTENT_SET_PRIVATE2;
+
+ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+ start, start + ram_size - 1,
+ locked_page, op);
+ disk_num_bytes -= cur_alloc_size;
+ num_bytes -= cur_alloc_size;
+ alloc_hint = ins.objectid + ins.offset;
+ start += cur_alloc_size;
+ }
+out:
+ ret = 0;
+ btrfs_end_transaction(trans, root);
+
+ return ret;
+}
+
+/*
+ * work queue call back to started compression on a file and pages
+ */
+static noinline void async_cow_start(struct btrfs_work *work)
+{
+ struct async_cow *async_cow;
+ int num_added = 0;
+ async_cow = container_of(work, struct async_cow, work);
+
+ compress_file_range(async_cow->inode, async_cow->locked_page,
+ async_cow->start, async_cow->end, async_cow,
+ &num_added);
+ if (num_added == 0)
+ async_cow->inode = NULL;
+}
+
+/*
+ * work queue call back to submit previously compressed pages
+ */
+static noinline void async_cow_submit(struct btrfs_work *work)
+{
+ struct async_cow *async_cow;
+ struct btrfs_root *root;
+ unsigned long nr_pages;
+
+ async_cow = container_of(work, struct async_cow, work);
+
+ root = async_cow->root;
+ nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
+ PAGE_CACHE_SHIFT;
+
+ atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
+
+ if (atomic_read(&root->fs_info->async_delalloc_pages) <
+ 5 * 1042 * 1024 &&
+ waitqueue_active(&root->fs_info->async_submit_wait))
+ wake_up(&root->fs_info->async_submit_wait);
+
+ if (async_cow->inode)
+ submit_compressed_extents(async_cow->inode, async_cow);
+}
+
+static noinline void async_cow_free(struct btrfs_work *work)
+{
+ struct async_cow *async_cow;
+ async_cow = container_of(work, struct async_cow, work);
+ kfree(async_cow);
+}
+
+static int cow_file_range_async(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written)
+{
+ struct async_cow *async_cow;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ unsigned long nr_pages;
+ u64 cur_end;
+ int limit = 10 * 1024 * 1042;
+
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
+ 1, 0, NULL, GFP_NOFS);
+ while (start < end) {
+ async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
+ async_cow->inode = inode;
+ async_cow->root = root;
+ async_cow->locked_page = locked_page;
+ async_cow->start = start;
+
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+ cur_end = end;
+ else
+ cur_end = min(end, start + 512 * 1024 - 1);
+
+ async_cow->end = cur_end;
+ INIT_LIST_HEAD(&async_cow->extents);
+
+ async_cow->work.func = async_cow_start;
+ async_cow->work.ordered_func = async_cow_submit;
+ async_cow->work.ordered_free = async_cow_free;
+ async_cow->work.flags = 0;
+
+ nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
+ PAGE_CACHE_SHIFT;
+ atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
+
+ btrfs_queue_worker(&root->fs_info->delalloc_workers,
+ &async_cow->work);
+
+ if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
+ wait_event(root->fs_info->async_submit_wait,
+ (atomic_read(&root->fs_info->async_delalloc_pages) <
+ limit));
+ }
+
+ while (atomic_read(&root->fs_info->async_submit_draining) &&
+ atomic_read(&root->fs_info->async_delalloc_pages)) {
+ wait_event(root->fs_info->async_submit_wait,
+ (atomic_read(&root->fs_info->async_delalloc_pages) ==
+ 0));
+ }
+
+ *nr_written += nr_pages;
+ start = cur_end + 1;
+ }
+ *page_started = 1;
+ return 0;
+}
+
+static noinline int csum_exist_in_range(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes)
+{
+ int ret;
+ struct btrfs_ordered_sum *sums;
+ LIST_HEAD(list);
+
+ ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
+ bytenr + num_bytes - 1, &list);
+ if (ret == 0 && list_empty(&list))
+ return 0;
+
+ while (!list_empty(&list)) {
+ sums = list_entry(list.next, struct btrfs_ordered_sum, list);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ return 1;
+}
+
+/*
+ * when nowcow writeback call back. This checks for snapshots or COW copies
+ * of the extents that exist in the file, and COWs the file as required.
+ *
+ * If no cow copies or snapshots exist, we write directly to the existing
+ * blocks on disk
+ */
+static noinline int run_delalloc_nocow(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started, int force,
+ unsigned long *nr_written)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ struct extent_buffer *leaf;
+ struct btrfs_path *path;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key found_key;
+ u64 cow_start;
+ u64 cur_offset;
+ u64 extent_end;
+ u64 extent_offset;
+ u64 disk_bytenr;
+ u64 num_bytes;
+ int extent_type;
+ int ret;
+ int type;
+ int nocow;
+ int check_prev = 1;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(!trans);
+
+ cow_start = (u64)-1;
+ cur_offset = start;
+ while (1) {
+ ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ cur_offset, 0);
+ BUG_ON(ret < 0);
+ if (ret > 0 && path->slots[0] > 0 && check_prev) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key,
+ path->slots[0] - 1);
+ if (found_key.objectid == inode->i_ino &&
+ found_key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+ check_prev = 0;
+next_slot:
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ BUG_ON(1);
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ nocow = 0;
+ disk_bytenr = 0;
+ num_bytes = 0;
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid > inode->i_ino ||
+ found_key.type > BTRFS_EXTENT_DATA_KEY ||
+ found_key.offset > end)
+ break;
+
+ if (found_key.offset > cur_offset) {
+ extent_end = found_key.offset;
+ extent_type = 0;
+ goto out_check;
+ }
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ extent_offset = btrfs_file_extent_offset(leaf, fi);
+ extent_end = found_key.offset +
+ btrfs_file_extent_num_bytes(leaf, fi);
+ if (extent_end <= start) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+ if (disk_bytenr == 0)
+ goto out_check;
+ if (btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ goto out_check;
+ if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
+ goto out_check;
+ if (btrfs_extent_readonly(root, disk_bytenr))
+ goto out_check;
+ if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+ found_key.offset -
+ extent_offset, disk_bytenr))
+ goto out_check;
+ disk_bytenr += extent_offset;
+ disk_bytenr += cur_offset - found_key.offset;
+ num_bytes = min(end + 1, extent_end) - cur_offset;
+ /*
+ * force cow if csum exists in the range.
+ * this ensure that csum for a given extent are
+ * either valid or do not exist.
+ */
+ if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+ goto out_check;
+ nocow = 1;
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ extent_end = found_key.offset +
+ btrfs_file_extent_inline_len(leaf, fi);
+ extent_end = ALIGN(extent_end, root->sectorsize);
+ } else {
+ BUG_ON(1);
+ }
+out_check:
+ if (extent_end <= start) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+ if (!nocow) {
+ if (cow_start == (u64)-1)
+ cow_start = cur_offset;
+ cur_offset = extent_end;
+ if (cur_offset > end)
+ break;
+ path->slots[0]++;
+ goto next_slot;
+ }
+
+ btrfs_release_path(root, path);
+ if (cow_start != (u64)-1) {
+ ret = cow_file_range(inode, locked_page, cow_start,
+ found_key.offset - 1, page_started,
+ nr_written, 1);
+ BUG_ON(ret);
+ cow_start = (u64)-1;
+ }
+
+ if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ struct extent_map *em;
+ struct extent_map_tree *em_tree;
+ em_tree = &BTRFS_I(inode)->extent_tree;
+ em = alloc_extent_map(GFP_NOFS);
+ em->start = cur_offset;
+ em->orig_start = em->start;
+ em->len = num_bytes;
+ em->block_len = num_bytes;
+ em->block_start = disk_bytenr;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(inode, em->start,
+ em->start + em->len - 1, 0);
+ }
+ type = BTRFS_ORDERED_PREALLOC;
+ } else {
+ type = BTRFS_ORDERED_NOCOW;
+ }
+
+ ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
+ num_bytes, num_bytes, type);
+ BUG_ON(ret);
+
+ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+ cur_offset, cur_offset + num_bytes - 1,
+ locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
+ EXTENT_SET_PRIVATE2);
+ cur_offset = extent_end;
+ if (cur_offset > end)
+ break;
+ }
+ btrfs_release_path(root, path);
+
+ if (cur_offset <= end && cow_start == (u64)-1)
+ cow_start = cur_offset;
+ if (cow_start != (u64)-1) {
+ ret = cow_file_range(inode, locked_page, cow_start, end,
+ page_started, nr_written, 1);
+ BUG_ON(ret);
+ }
+
+ ret = btrfs_end_transaction(trans, root);
+ BUG_ON(ret);
+ btrfs_free_path(path);
+ return 0;
+}
+
+/*
+ * extent_io.c call back to do delayed allocation processing
+ */
+static int run_delalloc_range(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written)
+{
+ int ret;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
+ ret = run_delalloc_nocow(inode, locked_page, start, end,
+ page_started, 1, nr_written);
+ else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
+ ret = run_delalloc_nocow(inode, locked_page, start, end,
+ page_started, 0, nr_written);
+ else if (!btrfs_test_opt(root, COMPRESS))
+ ret = cow_file_range(inode, locked_page, start, end,
+ page_started, nr_written, 1);
+ else
+ ret = cow_file_range_async(inode, locked_page, start, end,
+ page_started, nr_written);
+ return ret;
+}
+
+static int btrfs_split_extent_hook(struct inode *inode,
+ struct extent_state *orig, u64 split)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 size;
+
+ if (!(orig->state & EXTENT_DELALLOC))
+ return 0;
+
+ size = orig->end - orig->start + 1;
+ if (size > root->fs_info->max_extent) {
+ u64 num_extents;
+ u64 new_size;
+
+ new_size = orig->end - split + 1;
+ num_extents = div64_u64(size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent);
+
+ /*
+ * if we break a large extent up then leave oustanding_extents
+ * be, since we've already accounted for the large extent.
+ */
+ if (div64_u64(new_size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent) < num_extents)
+ return 0;
+ }
+
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+ return 0;
+}
+
+/*
+ * extent_io.c merge_extent_hook, used to track merged delayed allocation
+ * extents so we can keep track of new extents that are just merged onto old
+ * extents, such as when we are doing sequential writes, so we can properly
+ * account for the metadata space we'll need.
+ */
+static int btrfs_merge_extent_hook(struct inode *inode,
+ struct extent_state *new,
+ struct extent_state *other)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 new_size, old_size;
+ u64 num_extents;
+
+ /* not delalloc, ignore it */
+ if (!(other->state & EXTENT_DELALLOC))
+ return 0;
+
+ old_size = other->end - other->start + 1;
+ if (new->start < other->start)
+ new_size = other->end - new->start + 1;
+ else
+ new_size = new->end - other->start + 1;
+
+ /* we're not bigger than the max, unreserve the space and go */
+ if (new_size <= root->fs_info->max_extent) {
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ return 0;
+ }
+
+ /*
+ * If we grew by another max_extent, just return, we want to keep that
+ * reserved amount.
+ */
+ num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent);
+ if (div64_u64(new_size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent) > num_extents)
+ return 0;
+
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+ return 0;
+}
+
+/*
+ * extent_io.c set_bit_hook, used to track delayed allocation
+ * bytes in this file, and to maintain the list of inodes that
+ * have pending delalloc work to be done.
+ */
+static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
+ unsigned long old, unsigned long bits)
+{
+
+ /*
+ * set_bit and clear bit hooks normally require _irqsave/restore
+ * but in this case, we are only testeing for the DELALLOC
+ * bit, which is only set or cleared with irqs on
+ */
+ if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ btrfs_delalloc_reserve_space(root, inode, end - start + 1);
+ spin_lock(&root->fs_info->delalloc_lock);
+ BTRFS_I(inode)->delalloc_bytes += end - start + 1;
+ root->fs_info->delalloc_bytes += end - start + 1;
+ if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+ &root->fs_info->delalloc_inodes);
+ }
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
+ return 0;
+}
+
+/*
+ * extent_io.c clear_bit_hook, see set_bit_hook for why
+ */
+static int btrfs_clear_bit_hook(struct inode *inode,
+ struct extent_state *state, unsigned long bits)
+{
+ /*
+ * set_bit and clear bit hooks normally require _irqsave/restore
+ * but in this case, we are only testeing for the DELALLOC
+ * bit, which is only set or cleared with irqs on
+ */
+ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (bits & EXTENT_DO_ACCOUNTING) {
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ }
+
+ spin_lock(&root->fs_info->delalloc_lock);
+ if (state->end - state->start + 1 >
+ root->fs_info->delalloc_bytes) {
+ printk(KERN_INFO "btrfs warning: delalloc account "
+ "%llu %llu\n",
+ (unsigned long long)
+ state->end - state->start + 1,
+ (unsigned long long)
+ root->fs_info->delalloc_bytes);
+ btrfs_delalloc_free_space(root, inode, (u64)-1);
+ root->fs_info->delalloc_bytes = 0;
+ BTRFS_I(inode)->delalloc_bytes = 0;
+ } else {
+ btrfs_delalloc_free_space(root, inode,
+ state->end -
+ state->start + 1);
+ root->fs_info->delalloc_bytes -= state->end -
+ state->start + 1;
+ BTRFS_I(inode)->delalloc_bytes -= state->end -
+ state->start + 1;
+ }
+ if (BTRFS_I(inode)->delalloc_bytes == 0 &&
+ !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+ }
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
+ return 0;
+}
+
+/*
+ * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
+ * we don't create bios that span stripes or chunks
+ */
+int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+ size_t size, struct bio *bio,
+ unsigned long bio_flags)
+{
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+ struct btrfs_mapping_tree *map_tree;
+ u64 logical = (u64)bio->bi_sector << 9;
+ u64 length = 0;
+ u64 map_length;
+ int ret;
+
+ if (bio_flags & EXTENT_BIO_COMPRESSED)
+ return 0;
+
+ length = bio->bi_size;
+ map_tree = &root->fs_info->mapping_tree;
+ map_length = length;
+ ret = btrfs_map_block(map_tree, READ, logical,
+ &map_length, NULL, 0);
+
+ if (map_length < length + size)
+ return 1;
+ return 0;
+}
+
+/*
+ * in order to insert checksums into the metadata in large chunks,
+ * we wait until bio submission time. All the pages in the bio are
+ * checksummed and sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the cums attached on the ordered extent record
+ * are inserted into the btree
+ */
+static int __btrfs_submit_bio_start(struct inode *inode, int rw,
+ struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+
+ ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+ BUG_ON(ret);
+ return 0;
+}
+
+/*
+ * in order to insert checksums into the metadata in large chunks,
+ * we wait until bio submission time. All the pages in the bio are
+ * checksummed and sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the cums attached on the ordered extent record
+ * are inserted into the btree
+ */
+static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ return btrfs_map_bio(root, rw, bio, mirror_num, 1);
+}
+
+/*
+ * extent_io.c submission hook. This does the right thing for csum calculation
+ * on write, or reading the csums from the tree before a read
+ */
+static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+ int skip_sum;
+
+ skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ BUG_ON(ret);
+
+ if (!(rw & (1 << BIO_RW))) {
+ if (bio_flags & EXTENT_BIO_COMPRESSED) {
+ return btrfs_submit_compressed_read(inode, bio,
+ mirror_num, bio_flags);
+ } else if (!skip_sum)
+ btrfs_lookup_bio_sums(root, inode, bio, NULL);
+ goto mapit;
+ } else if (!skip_sum) {
+ /* csum items have already been cloned */
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ goto mapit;
+ /* we're doing a write, do the async checksumming */
+ return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ inode, rw, bio, mirror_num,
+ bio_flags, __btrfs_submit_bio_start,
+ __btrfs_submit_bio_done);
+ }
+
+mapit:
+ return btrfs_map_bio(root, rw, bio, mirror_num, 0);
+}
+
+/*
+ * given a list of ordered sums record them in the inode. This happens
+ * at IO completion time based on sums calculated at bio submission time.
+ */
+static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 file_offset,
+ struct list_head *list)
+{
+ struct btrfs_ordered_sum *sum;
+
+ btrfs_set_trans_block_group(trans, inode);
+
+ list_for_each_entry(sum, list, list) {
+ btrfs_csum_file_blocks(trans,
+ BTRFS_I(inode)->root->fs_info->csum_root, sum);
+ }
+ return 0;
+}
+
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
+{
+ if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
+ WARN_ON(1);
+ return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
+ GFP_NOFS);
+}
+
+/* see btrfs_writepage_start_hook for details on why this is required */
+struct btrfs_writepage_fixup {
+ struct page *page;
+ struct btrfs_work work;
+};
+
+static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
+{
+ struct btrfs_writepage_fixup *fixup;
+ struct btrfs_ordered_extent *ordered;
+ struct page *page;
+ struct inode *inode;
+ u64 page_start;
+ u64 page_end;
+
+ fixup = container_of(work, struct btrfs_writepage_fixup, work);
+ page = fixup->page;
+again:
+ lock_page(page);
+ if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
+ ClearPageChecked(page);
+ goto out_page;
+ }
+
+ inode = page->mapping->host;
+ page_start = page_offset(page);
+ page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
+
+ lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+
+ /* already ordered? We're done */
+ if (PagePrivate2(page))
+ goto out;
+
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ if (ordered) {
+ unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
+ page_end, GFP_NOFS);
+ unlock_page(page);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ goto again;
+ }
+
+ btrfs_set_extent_delalloc(inode, page_start, page_end);
+ ClearPageChecked(page);
+out:
+ unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+out_page:
+ unlock_page(page);
+ page_cache_release(page);
+}
+
+/*
+ * There are a few paths in the higher layers of the kernel that directly
+ * set the page dirty bit without asking the filesystem if it is a
+ * good idea. This causes problems because we want to make sure COW
+ * properly happens and the data=ordered rules are followed.
+ *
+ * In our case any range that doesn't have the ORDERED bit set
+ * hasn't been properly setup for IO. We kick off an async process
+ * to fix it up. The async helper will wait for ordered extents, set
+ * the delalloc bit and make it safe to write the page.
+ */
+static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
+{
+ struct inode *inode = page->mapping->host;
+ struct btrfs_writepage_fixup *fixup;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /* this page is properly in the ordered list */
+ if (TestClearPagePrivate2(page))
+ return 0;
+
+ if (PageChecked(page))
+ return -EAGAIN;
+
+ fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
+ if (!fixup)
+ return -EAGAIN;
+
+ SetPageChecked(page);
+ page_cache_get(page);
+ fixup->work.func = btrfs_writepage_fixup_worker;
+ fixup->page = page;
+ btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
+ return -EAGAIN;
+}
+
+static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 file_pos,
+ u64 disk_bytenr, u64 disk_num_bytes,
+ u64 num_bytes, u64 ram_bytes,
+ u8 compression, u8 encryption,
+ u16 other_encoding, int extent_type)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key ins;
+ u64 hint;
+ int ret;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ path->leave_spinning = 1;
+
+ /*
+ * we may be replacing one extent in the tree with another.
+ * The new extent is pinned in the extent map, and we don't want
+ * to drop it from the cache until it is completely in the btree.
+ *
+ * So, tell btrfs_drop_extents to leave this extent in the cache.
+ * the caller is expected to unpin it and allow it to be merged
+ * with the others.
+ */
+ ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
+ &hint, 0);
+ BUG_ON(ret);
+
+ ins.objectid = inode->i_ino;
+ ins.offset = file_pos;
+ ins.type = BTRFS_EXTENT_DATA_KEY;
+ ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
+ BUG_ON(ret);
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+ btrfs_set_file_extent_type(leaf, fi, extent_type);
+ btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
+ btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
+ btrfs_set_file_extent_compression(leaf, fi, compression);
+ btrfs_set_file_extent_encryption(leaf, fi, encryption);
+ btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+
+ btrfs_unlock_up_safe(path, 1);
+ btrfs_set_lock_blocking(leaf);
+
+ btrfs_mark_buffer_dirty(leaf);
+
+ inode_add_bytes(inode, num_bytes);
+
+ ins.objectid = disk_bytenr;
+ ins.offset = disk_num_bytes;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ret = btrfs_alloc_reserved_file_extent(trans, root,
+ root->root_key.objectid,
+ inode->i_ino, file_pos, &ins);
+ BUG_ON(ret);
+ btrfs_free_path(path);
+
+ return 0;
+}
+
+/*
+ * helper function for btrfs_finish_ordered_io, this
+ * just reads in some of the csum leaves to prime them into ram
+ * before we start the transaction. It limits the amount of btree
+ * reads required while inside the transaction.
+ */
+/* as ordered data IO finishes, this gets called so we can finish
+ * an ordered extent if the range of bytes in the file it covers are
+ * fully written.
+ */
+static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_ordered_extent *ordered_extent = NULL;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ int compressed = 0;
+ int ret;
+
+ ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
+ if (!ret)
+ return 0;
+
+ ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+ BUG_ON(!ordered_extent);
+
+ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
+ BUG_ON(!list_empty(&ordered_extent->list));
+ ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+ if (!ret) {
+ trans = btrfs_join_transaction(root, 1);
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+ btrfs_end_transaction(trans, root);
+ }
+ goto out;
+ }
+
+ lock_extent(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset + ordered_extent->len - 1,
+ GFP_NOFS);
+
+ trans = btrfs_join_transaction(root, 1);
+
+ if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
+ compressed = 1;
+ if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
+ BUG_ON(compressed);
+ ret = btrfs_mark_extent_written(trans, inode,
+ ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len);
+ BUG_ON(ret);
+ } else {
+ ret = insert_reserved_file_extent(trans, inode,
+ ordered_extent->file_offset,
+ ordered_extent->start,
+ ordered_extent->disk_len,
+ ordered_extent->len,
+ ordered_extent->len,
+ compressed, 0, 0,
+ BTRFS_FILE_EXTENT_REG);
+ unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+ ordered_extent->file_offset,
+ ordered_extent->len);
+ BUG_ON(ret);
+ }
+ unlock_extent(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset + ordered_extent->len - 1,
+ GFP_NOFS);
+ add_pending_csums(trans, inode, ordered_extent->file_offset,
+ &ordered_extent->list);
+
+ /* this also removes the ordered extent from the tree */
+ btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+ btrfs_end_transaction(trans, root);
+out:
+ /* once for us */
+ btrfs_put_ordered_extent(ordered_extent);
+ /* once for the tree */
+ btrfs_put_ordered_extent(ordered_extent);
+
+ return 0;
+}
+
+static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
+ struct extent_state *state, int uptodate)
+{
+ ClearPagePrivate2(page);
+ return btrfs_finish_ordered_io(page->mapping->host, start, end);
+}
+
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data. This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors. If another mirror has good data, the page is set up to date
+ * and things continue. If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+ struct page *page;
+ u64 start;
+ u64 len;
+ u64 logical;
+ unsigned long bio_flags;
+ int last_mirror;
+};
+
+static int btrfs_io_failed_hook(struct bio *failed_bio,
+ struct page *page, u64 start, u64 end,
+ struct extent_state *state)
+{
+ struct io_failure_record *failrec = NULL;
+ u64 private;
+ struct extent_map *em;
+ struct inode *inode = page->mapping->host;
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct bio *bio;
+ int num_copies;
+ int ret;
+ int rw;
+ u64 logical;
+
+ ret = get_state_private(failure_tree, start, &private);
+ if (ret) {
+ failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
+ if (!failrec)
+ return -ENOMEM;
+ failrec->start = start;
+ failrec->len = end - start + 1;
+ failrec->last_mirror = 0;
+ failrec->bio_flags = 0;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, failrec->len);
+ if (em->start > start || em->start + em->len < start) {
+ free_extent_map(em);
+ em = NULL;
+ }
+ read_unlock(&em_tree->lock);
+
+ if (!em || IS_ERR(em)) {
+ kfree(failrec);
+ return -EIO;
+ }
+ logical = start - em->start;
+ logical = em->block_start + logical;
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ logical = em->block_start;
+ failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+ }
+ failrec->logical = logical;
+ free_extent_map(em);
+ set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
+ EXTENT_DIRTY, GFP_NOFS);
+ set_state_private(failure_tree, start,
+ (u64)(unsigned long)failrec);
+ } else {
+ failrec = (struct io_failure_record *)(unsigned long)private;
+ }
+ num_copies = btrfs_num_copies(
+ &BTRFS_I(inode)->root->fs_info->mapping_tree,
+ failrec->logical, failrec->len);
+ failrec->last_mirror++;
+ if (!state) {
+ spin_lock(&BTRFS_I(inode)->io_tree.lock);
+ state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+ failrec->start,
+ EXTENT_LOCKED);
+ if (state && state->start != failrec->start)
+ state = NULL;
+ spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+ }
+ if (!state || failrec->last_mirror > num_copies) {
+ set_state_private(failure_tree, failrec->start, 0);
+ clear_extent_bits(failure_tree, failrec->start,
+ failrec->start + failrec->len - 1,
+ EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+ kfree(failrec);
+ return -EIO;
+ }
+ bio = bio_alloc(GFP_NOFS, 1);
+ bio->bi_private = state;
+ bio->bi_end_io = failed_bio->bi_end_io;
+ bio->bi_sector = failrec->logical >> 9;
+ bio->bi_bdev = failed_bio->bi_bdev;
+ bio->bi_size = 0;
+
+ bio_add_page(bio, page, failrec->len, start - page_offset(page));
+ if (failed_bio->bi_rw & (1 << BIO_RW))
+ rw = WRITE;
+ else
+ rw = READ;
+
+ BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
+ failrec->last_mirror,
+ failrec->bio_flags);
+ return 0;
+}
+
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
+static int btrfs_clean_io_failures(struct inode *inode, u64 start)
+{
+ u64 private;
+ u64 private_failure;
+ struct io_failure_record *failure;
+ int ret;
+
+ private = 0;
+ if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+ (u64)-1, 1, EXTENT_DIRTY)) {
+ ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
+ start, &private_failure);
+ if (ret == 0) {
+ failure = (struct io_failure_record *)(unsigned long)
+ private_failure;
+ set_state_private(&BTRFS_I(inode)->io_failure_tree,
+ failure->start, 0);
+ clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
+ failure->start,
+ failure->start + failure->len - 1,
+ EXTENT_DIRTY | EXTENT_LOCKED,
+ GFP_NOFS);
+ kfree(failure);
+ }
+ }
+ return 0;
+}
+
+/*
+ * when reads are done, we need to check csums to verify the data is correct
+ * if there's a match, we allow the bio to finish. If not, we go through
+ * the io_failure_record routines to find good copies
+ */
+static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+ struct extent_state *state)
+{
+ size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
+ struct inode *inode = page->mapping->host;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ char *kaddr;
+ u64 private = ~(u32)0;
+ int ret;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u32 csum = ~(u32)0;
+
+ if (PageChecked(page)) {
+ ClearPageChecked(page);
+ goto good;
+ }
+
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ return 0;
+
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
+ clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
+ GFP_NOFS);
+ return 0;
+ }
+
+ if (state && state->start == start) {
+ private = state->private;
+ ret = 0;
+ } else {
+ ret = get_state_private(io_tree, start, &private);
+ }
+ kaddr = kmap_atomic(page, KM_USER0);
+ if (ret)
+ goto zeroit;
+
+ csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1);
+ btrfs_csum_final(csum, (char *)&csum);
+ if (csum != private)
+ goto zeroit;
+
+ kunmap_atomic(kaddr, KM_USER0);
+good:
+ /* if the io failure tree for this inode is non-empty,
+ * check to see if we've recovered from a failed IO
+ */
+ btrfs_clean_io_failures(inode, start);
+ return 0;
+
+zeroit:
+ if (printk_ratelimit()) {
+ printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
+ "private %llu\n", page->mapping->host->i_ino,
+ (unsigned long long)start, csum,
+ (unsigned long long)private);
+ }
+ memset(kaddr + offset, 1, end - start + 1);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ if (private == 0)
+ return 0;
+ return -EIO;
+}
+
+struct delayed_iput {
+ struct list_head list;
+ struct inode *inode;
+};
+
+void btrfs_add_delayed_iput(struct inode *inode)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct delayed_iput *delayed;
+
+ if (atomic_add_unless(&inode->i_count, -1, 1))
+ return;
+
+ delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
+ delayed->inode = inode;
+
+ spin_lock(&fs_info->delayed_iput_lock);
+ list_add_tail(&delayed->list, &fs_info->delayed_iputs);
+ spin_unlock(&fs_info->delayed_iput_lock);
+}
+
+void btrfs_run_delayed_iputs(struct btrfs_root *root)
+{
+ LIST_HEAD(list);
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct delayed_iput *delayed;
+ int empty;
+
+ spin_lock(&fs_info->delayed_iput_lock);
+ empty = list_empty(&fs_info->delayed_iputs);
+ spin_unlock(&fs_info->delayed_iput_lock);
+ if (empty)
+ return;
+
+ down_read(&root->fs_info->cleanup_work_sem);
+ spin_lock(&fs_info->delayed_iput_lock);
+ list_splice_init(&fs_info->delayed_iputs, &list);
+ spin_unlock(&fs_info->delayed_iput_lock);
+
+ while (!list_empty(&list)) {
+ delayed = list_entry(list.next, struct delayed_iput, list);
+ list_del(&delayed->list);
+ iput(delayed->inode);
+ kfree(delayed);
+ }
+ up_read(&root->fs_info->cleanup_work_sem);
+}
+
+/*
+ * This creates an orphan entry for the given inode in case something goes
+ * wrong in the middle of an unlink/truncate.
+ */
+int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+
+ spin_lock(&root->list_lock);
+
+ /* already on the orphan list, we're good */
+ if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+ spin_unlock(&root->list_lock);
+ return 0;
+ }
+
+ list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+
+ spin_unlock(&root->list_lock);
+
+ /*
+ * insert an orphan item to track this unlinked/truncated file
+ */
+ ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+
+ return ret;
+}
+
+/*
+ * We have done the truncate/delete so we can go ahead and remove the orphan
+ * item for this particular inode.
+ */
+int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+
+ spin_lock(&root->list_lock);
+
+ if (list_empty(&BTRFS_I(inode)->i_orphan)) {
+ spin_unlock(&root->list_lock);
+ return 0;
+ }
+
+ list_del_init(&BTRFS_I(inode)->i_orphan);
+ if (!trans) {
+ spin_unlock(&root->list_lock);
+ return 0;
+ }
+
+ spin_unlock(&root->list_lock);
+
+ ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+
+ return ret;
+}
+
+/*
+ * this cleans up any orphans that may be left on the list from the last use
+ * of this root.
+ */
+void btrfs_orphan_cleanup(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ struct btrfs_key key, found_key;
+ struct btrfs_trans_handle *trans;
+ struct inode *inode;
+ int ret = 0, nr_unlink = 0, nr_truncate = 0;
+
+ if (!xchg(&root->clean_orphans, 0))
+ return;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ path->reada = -1;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.offset = (u64)-1;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ printk(KERN_ERR "Error searching slot for orphan: %d"
+ "\n", ret);
+ break;
+ }
+
+ /*
+ * if ret == 0 means we found what we were searching for, which
+ * is weird, but possible, so only screw with path if we didnt
+ * find the key and see if we have stuff that matches
+ */
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+
+ /* pull out the item */
+ leaf = path->nodes[0];
+ item = btrfs_item_nr(leaf, path->slots[0]);
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ /* make sure the item matches what we want */
+ if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
+ break;
+ if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
+ break;
+
+ /* release the path since we're done with it */
+ btrfs_release_path(root, path);
+
+ /*
+ * this is where we are basically btrfs_lookup, without the
+ * crossing root thing. we store the inode number in the
+ * offset of the orphan item.
+ */
+ found_key.objectid = found_key.offset;
+ found_key.type = BTRFS_INODE_ITEM_KEY;
+ found_key.offset = 0;
+ inode = btrfs_iget(root->fs_info->sb, &found_key, root);
+ if (IS_ERR(inode))
+ break;
+
+ /*
+ * add this inode to the orphan list so btrfs_orphan_del does
+ * the proper thing when we hit it
+ */
+ spin_lock(&root->list_lock);
+ list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+ spin_unlock(&root->list_lock);
+
+ /*
+ * if this is a bad inode, means we actually succeeded in
+ * removing the inode, but not the orphan record, which means
+ * we need to manually delete the orphan since iput will just
+ * do a destroy_inode
+ */
+ if (is_bad_inode(inode)) {
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_orphan_del(trans, inode);
+ btrfs_end_transaction(trans, root);
+ iput(inode);
+ continue;
+ }
+
+ /* if we have links, this was a truncate, lets do that */
+ if (inode->i_nlink) {
+ nr_truncate++;
+ btrfs_truncate(inode);
+ } else {
+ nr_unlink++;
+ }
+
+ /* this will do delete_inode and everything for us */
+ iput(inode);
+ }
+
+ if (nr_unlink)
+ printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
+ if (nr_truncate)
+ printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
+
+ btrfs_free_path(path);
+}
+
+/*
+ * very simple check to peek ahead in the leaf looking for xattrs. If we
+ * don't find any xattrs, we know there can't be any acls.
+ *
+ * slot is the slot the inode is in, objectid is the objectid of the inode
+ */
+static noinline int acls_after_inode_item(struct extent_buffer *leaf,
+ int slot, u64 objectid)
+{
+ u32 nritems = btrfs_header_nritems(leaf);
+ struct btrfs_key found_key;
+ int scanned = 0;
+
+ slot++;
+ while (slot < nritems) {
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ /* we found a different objectid, there must not be acls */
+ if (found_key.objectid != objectid)
+ return 0;
+
+ /* we found an xattr, assume we've got an acl */
+ if (found_key.type == BTRFS_XATTR_ITEM_KEY)
+ return 1;
+
+ /*
+ * we found a key greater than an xattr key, there can't
+ * be any acls later on
+ */
+ if (found_key.type > BTRFS_XATTR_ITEM_KEY)
+ return 0;
+
+ slot++;
+ scanned++;
+
+ /*
+ * it goes inode, inode backrefs, xattrs, extents,
+ * so if there are a ton of hard links to an inode there can
+ * be a lot of backrefs. Don't waste time searching too hard,
+ * this is just an optimization
+ */
+ if (scanned >= 8)
+ break;
+ }
+ /* we hit the end of the leaf before we found an xattr or
+ * something larger than an xattr. We have to assume the inode
+ * has acls
+ */
+ return 1;
+}
+
+/*
+ * read an inode from the btree into the in-memory inode
+ */
+static void btrfs_read_locked_inode(struct inode *inode)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_inode_item *inode_item;
+ struct btrfs_timespec *tspec;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_key location;
+ int maybe_acls;
+ u64 alloc_group_block;
+ u32 rdev;
+ int ret;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
+
+ ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
+ if (ret)
+ goto make_bad;
+
+ leaf = path->nodes[0];
+ inode_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_item);
+
+ inode->i_mode = btrfs_inode_mode(leaf, inode_item);
+ inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
+ inode->i_uid = btrfs_inode_uid(leaf, inode_item);
+ inode->i_gid = btrfs_inode_gid(leaf, inode_item);
+ btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
+
+ tspec = btrfs_inode_atime(inode_item);
+ inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
+ inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+
+ tspec = btrfs_inode_mtime(inode_item);
+ inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
+ inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+
+ tspec = btrfs_inode_ctime(inode_item);
+ inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
+ inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+
+ inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
+ BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+ BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
+ inode->i_generation = BTRFS_I(inode)->generation;
+ inode->i_rdev = 0;
+ rdev = btrfs_inode_rdev(leaf, inode_item);
+
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+ BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+
+ alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
+
+ /*
+ * try to precache a NULL acl entry for files that don't have
+ * any xattrs or acls
+ */
+ maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino);
+ if (!maybe_acls)
+ cache_no_acl(inode);
+
+ BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
+ alloc_group_block, 0);
+ btrfs_free_path(path);
+ inode_item = NULL;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+ break;
+ case S_IFDIR:
+ inode->i_fop = &btrfs_dir_file_operations;
+ if (root == root->fs_info->tree_root)
+ inode->i_op = &btrfs_dir_ro_inode_operations;
+ else
+ inode->i_op = &btrfs_dir_inode_operations;
+ break;
+ case S_IFLNK:
+ inode->i_op = &btrfs_symlink_inode_operations;
+ inode->i_mapping->a_ops = &btrfs_symlink_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ break;
+ default:
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, rdev);
+ break;
+ }
+
+ btrfs_update_iflags(inode);
+ return;
+
+make_bad:
+ btrfs_free_path(path);
+ make_bad_inode(inode);
+}
+
+/*
+ * given a leaf and an inode, copy the inode fields into the leaf
+ */
+static void fill_inode_item(struct btrfs_trans_handle *trans,
+ struct extent_buffer *leaf,
+ struct btrfs_inode_item *item,
+ struct inode *inode)
+{
+ btrfs_set_inode_uid(leaf, item, inode->i_uid);
+ btrfs_set_inode_gid(leaf, item, inode->i_gid);
+ btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
+ btrfs_set_inode_mode(leaf, item, inode->i_mode);
+ btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+
+ btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
+ inode->i_atime.tv_sec);
+ btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
+ inode->i_atime.tv_nsec);
+
+ btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
+ inode->i_mtime.tv_sec);
+ btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
+ inode->i_mtime.tv_nsec);
+
+ btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
+ inode->i_ctime.tv_sec);
+ btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
+ inode->i_ctime.tv_nsec);
+
+ btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
+ btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+ btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
+ btrfs_set_inode_transid(leaf, item, trans->transid);
+ btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
+ btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
+ btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
+}
+
+/*
+ * copy everything in the in-memory inode into the btree.
+ */
+noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode)
+{
+ struct btrfs_inode_item *inode_item;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int ret;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ path->leave_spinning = 1;
+ ret = btrfs_lookup_inode(trans, root, path,
+ &BTRFS_I(inode)->location, 1);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ goto failed;
+ }
+
+ btrfs_unlock_up_safe(path, 1);
+ leaf = path->nodes[0];
+ inode_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_item);
+
+ fill_inode_item(trans, leaf, inode_item, inode);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_set_inode_last_trans(trans, inode);
+ ret = 0;
+failed:
+ btrfs_free_path(path);
+ return ret;
+}
+
+
+/*
+ * unlink helper that gets used here in inode.c and in the tree logging
+ * recovery code. It remove a link in a directory with a given name, and
+ * also drops the back refs in the inode to the directory
+ */
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, struct inode *inode,
+ const char *name, int name_len)
+{
+ struct btrfs_path *path;
+ int ret = 0;
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ u64 index;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ path->leave_spinning = 1;
+ di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ name, name_len, -1);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto err;
+ }
+ if (!di) {
+ ret = -ENOENT;
+ goto err;
+ }
+ leaf = path->nodes[0];
+ btrfs_dir_item_key_to_cpu(leaf, di, &key);
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ if (ret)
+ goto err;
+ btrfs_release_path(root, path);
+
+ ret = btrfs_del_inode_ref(trans, root, name, name_len,
+ inode->i_ino,
+ dir->i_ino, &index);
+ if (ret) {
+ printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
+ "inode %lu parent %lu\n", name_len, name,
+ inode->i_ino, dir->i_ino);
+ goto err;
+ }
+
+ di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+ index, name, name_len, -1);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto err;
+ }
+ if (!di) {
+ ret = -ENOENT;
+ goto err;
+ }
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ btrfs_release_path(root, path);
+
+ ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
+ inode, dir->i_ino);
+ BUG_ON(ret != 0 && ret != -ENOENT);
+
+ ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
+ dir, index);
+ BUG_ON(ret);
+err:
+ btrfs_free_path(path);
+ if (ret)
+ goto out;
+
+ btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+ inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ btrfs_update_inode(trans, root, dir);
+ btrfs_drop_nlink(inode);
+ ret = btrfs_update_inode(trans, root, inode);
+out:
+ return ret;
+}
+
+static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct btrfs_root *root;
+ struct btrfs_trans_handle *trans;
+ struct inode *inode = dentry->d_inode;
+ int ret;
+ unsigned long nr = 0;
+
+ root = BTRFS_I(dir)->root;
+
+ /*
+ * 5 items for unlink inode
+ * 1 for orphan
+ */
+ ret = btrfs_reserve_metadata_space(root, 6);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ btrfs_unreserve_metadata_space(root, 6);
+ return PTR_ERR(trans);
+ }
+
+ btrfs_set_trans_block_group(trans, dir);
+
+ btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
+
+ ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
+ dentry->d_name.name, dentry->d_name.len);
+
+ if (inode->i_nlink == 0)
+ ret = btrfs_orphan_add(trans, inode);
+
+ nr = trans->blocks_used;
+
+ btrfs_end_transaction_throttle(trans, root);
+ btrfs_unreserve_metadata_space(root, 6);
+ btrfs_btree_balance_dirty(root, nr);
+ return ret;
+}
+
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, u64 objectid,
+ const char *name, int name_len)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ u64 index;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ name, name_len, -1);
+ BUG_ON(!di || IS_ERR(di));
+
+ leaf = path->nodes[0];
+ btrfs_dir_item_key_to_cpu(leaf, di, &key);
+ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ BUG_ON(ret);
+ btrfs_release_path(root, path);
+
+ ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
+ objectid, root->root_key.objectid,
+ dir->i_ino, &index, name, name_len);
+ if (ret < 0) {
+ BUG_ON(ret != -ENOENT);
+ di = btrfs_search_dir_index_item(root, path, dir->i_ino,
+ name, name_len);
+ BUG_ON(!di || IS_ERR(di));
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(root, path);
+ index = key.offset;
+ }
+
+ di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+ index, name, name_len, -1);
+ BUG_ON(!di || IS_ERR(di));
+
+ leaf = path->nodes[0];
+ btrfs_dir_item_key_to_cpu(leaf, di, &key);
+ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ BUG_ON(ret);
+ btrfs_release_path(root, path);
+
+ btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode(trans, root, dir);
+ BUG_ON(ret);
+ dir->i_sb->s_dirt = 1;
+
+ btrfs_free_path(path);
+ return 0;
+}
+
+static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+ int err = 0;
+ int ret;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_trans_handle *trans;
+ unsigned long nr = 0;
+
+ if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
+ inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ return -ENOTEMPTY;
+
+ ret = btrfs_reserve_metadata_space(root, 5);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ btrfs_unreserve_metadata_space(root, 5);
+ return PTR_ERR(trans);
+ }
+
+ btrfs_set_trans_block_group(trans, dir);
+
+ if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ err = btrfs_unlink_subvol(trans, root, dir,
+ BTRFS_I(inode)->location.objectid,
+ dentry->d_name.name,
+ dentry->d_name.len);
+ goto out;
+ }
+
+ err = btrfs_orphan_add(trans, inode);
+ if (err)
+ goto out;
+
+ /* now the directory is empty */
+ err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
+ dentry->d_name.name, dentry->d_name.len);
+ if (!err)
+ btrfs_i_size_write(inode, 0);
+out:
+ nr = trans->blocks_used;
+ ret = btrfs_end_transaction_throttle(trans, root);
+ btrfs_unreserve_metadata_space(root, 5);
+ btrfs_btree_balance_dirty(root, nr);
+
+ if (ret && !err)
+ err = ret;
+ return err;
+}
+
+#if 0
+/*
+ * when truncating bytes in a file, it is possible to avoid reading
+ * the leaves that contain only checksum items. This can be the
+ * majority of the IO required to delete a large file, but it must
+ * be done carefully.
+ *
+ * The keys in the level just above the leaves are checked to make sure
+ * the lowest key in a given leaf is a csum key, and starts at an offset
+ * after the new size.
+ *
+ * Then the key for the next leaf is checked to make sure it also has
+ * a checksum item for the same file. If it does, we know our target leaf
+ * contains only checksum items, and it can be safely freed without reading
+ * it.
+ *
+ * This is just an optimization targeted at large files. It may do
+ * nothing. It will return 0 unless things went badly.
+ */
+static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct inode *inode, u64 new_size)
+{
+ struct btrfs_key key;
+ int ret;
+ int nritems;
+ struct btrfs_key found_key;
+ struct btrfs_key other_key;
+ struct btrfs_leaf_ref *ref;
+ u64 leaf_gen;
+ u64 leaf_start;
+
+ path->lowest_level = 1;
+ key.objectid = inode->i_ino;
+ key.type = BTRFS_CSUM_ITEM_KEY;
+ key.offset = new_size;
+again:
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (path->nodes[1] == NULL) {
+ ret = 0;
+ goto out;
+ }
+ ret = 0;
+ btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
+ nritems = btrfs_header_nritems(path->nodes[1]);
+
+ if (!nritems)
+ goto out;
+
+ if (path->slots[1] >= nritems)
+ goto next_node;
+
+ /* did we find a key greater than anything we want to delete? */
+ if (found_key.objectid > inode->i_ino ||
+ (found_key.objectid == inode->i_ino && found_key.type > key.type))
+ goto out;
+
+ /* we check the next key in the node to make sure the leave contains
+ * only checksum items. This comparison doesn't work if our
+ * leaf is the last one in the node
+ */
+ if (path->slots[1] + 1 >= nritems) {
+next_node:
+ /* search forward from the last key in the node, this
+ * will bring us into the next node in the tree
+ */
+ btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
+
+ /* unlikely, but we inc below, so check to be safe */
+ if (found_key.offset == (u64)-1)
+ goto out;
+
+ /* search_forward needs a path with locks held, do the
+ * search again for the original key. It is possible
+ * this will race with a balance and return a path that
+ * we could modify, but this drop is just an optimization
+ * and is allowed to miss some leaves.
+ */
+ btrfs_release_path(root, path);
+ found_key.offset++;
+
+ /* setup a max key for search_forward */
+ other_key.offset = (u64)-1;
+ other_key.type = key.type;
+ other_key.objectid = key.objectid;
+
+ path->keep_locks = 1;
+ ret = btrfs_search_forward(root, &found_key, &other_key,
+ path, 0, 0);
+ path->keep_locks = 0;
+ if (ret || found_key.objectid != key.objectid ||
+ found_key.type != key.type) {
+ ret = 0;
+ goto out;
+ }
+
+ key.offset = found_key.offset;
+ btrfs_release_path(root, path);
+ cond_resched();
+ goto again;
+ }
+
+ /* we know there's one more slot after us in the tree,
+ * read that key so we can verify it is also a checksum item
+ */
+ btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
+
+ if (found_key.objectid < inode->i_ino)
+ goto next_key;
+
+ if (found_key.type != key.type || found_key.offset < new_size)
+ goto next_key;
+
+ /*
+ * if the key for the next leaf isn't a csum key from this objectid,
+ * we can't be sure there aren't good items inside this leaf.
+ * Bail out
+ */
+ if (other_key.objectid != inode->i_ino || other_key.type != key.type)
+ goto out;
+
+ leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
+ leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
+ /*
+ * it is safe to delete this leaf, it contains only
+ * csum items from this inode at an offset >= new_size
+ */
+ ret = btrfs_del_leaf(trans, root, path, leaf_start);
+ BUG_ON(ret);
+
+ if (root->ref_cows && leaf_gen < trans->transid) {
+ ref = btrfs_alloc_leaf_ref(root, 0);
+ if (ref) {
+ ref->root_gen = root->root_key.offset;
+ ref->bytenr = leaf_start;
+ ref->owner = 0;
+ ref->generation = leaf_gen;
+ ref->nritems = 0;
+
+ btrfs_sort_leaf_ref(ref);
+
+ ret = btrfs_add_leaf_ref(root, ref, 0);
+ WARN_ON(ret);
+ btrfs_free_leaf_ref(root, ref);
+ } else {
+ WARN_ON(1);
+ }
+ }
+next_key:
+ btrfs_release_path(root, path);
+
+ if (other_key.objectid == inode->i_ino &&
+ other_key.type == key.type && other_key.offset > key.offset) {
+ key.offset = other_key.offset;
+ cond_resched();
+ goto again;
+ }
+ ret = 0;
+out:
+ /* fixup any changes we've made to the path */
+ path->lowest_level = 0;
+ path->keep_locks = 0;
+ btrfs_release_path(root, path);
+ return ret;
+}
+
+#endif
+
+/*
+ * this can truncate away extent items, csum items and directory items.
+ * It starts at a high offset and removes keys until it can't find
+ * any higher than new_size
+ *
+ * csum items that cross the new i_size are truncated to the new size
+ * as well.
+ *
+ * min_type is the minimum key type to truncate down to. If set to 0, this
+ * will kill all the items on this inode, including the INODE_ITEM_KEY.
+ */
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ u64 new_size, u32 min_type)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u64 extent_start = 0;
+ u64 extent_num_bytes = 0;
+ u64 extent_offset = 0;
+ u64 item_end = 0;
+ u64 mask = root->sectorsize - 1;
+ u32 found_type = (u8)-1;
+ int found_extent;
+ int del_item;
+ int pending_del_nr = 0;
+ int pending_del_slot = 0;
+ int extent_type = -1;
+ int encoding;
+ int ret;
+ int err = 0;
+
+ BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
+
+ if (root->ref_cows)
+ btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ path->reada = -1;
+
+ key.objectid = inode->i_ino;
+ key.offset = (u64)-1;
+ key.type = (u8)-1;
+
+search_again:
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+
+ if (ret > 0) {
+ /* there are no items in the tree for us to truncate, we're
+ * done
+ */
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ }
+
+ while (1) {
+ fi = NULL;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ found_type = btrfs_key_type(&found_key);
+ encoding = 0;
+
+ if (found_key.objectid != inode->i_ino)
+ break;
+
+ if (found_type < min_type)
+ break;
+
+ item_end = found_key.offset;
+ if (found_type == BTRFS_EXTENT_DATA_KEY) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+ encoding = btrfs_file_extent_compression(leaf, fi);
+ encoding |= btrfs_file_extent_encryption(leaf, fi);
+ encoding |= btrfs_file_extent_other_encoding(leaf, fi);
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ item_end +=
+ btrfs_file_extent_num_bytes(leaf, fi);
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ item_end += btrfs_file_extent_inline_len(leaf,
+ fi);
+ }
+ item_end--;
+ }
+ if (found_type > min_type) {
+ del_item = 1;
+ } else {
+ if (item_end < new_size)
+ break;
+ if (found_key.offset >= new_size)
+ del_item = 1;
+ else
+ del_item = 0;
+ }
+ found_extent = 0;
+ /* FIXME, shrink the extent if the ref count is only 1 */
+ if (found_type != BTRFS_EXTENT_DATA_KEY)
+ goto delete;
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ u64 num_dec;
+ extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (!del_item && !encoding) {
+ u64 orig_num_bytes =
+ btrfs_file_extent_num_bytes(leaf, fi);
+ extent_num_bytes = new_size -
+ found_key.offset + root->sectorsize - 1;
+ extent_num_bytes = extent_num_bytes &
+ ~((u64)root->sectorsize - 1);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_num_bytes);
+ num_dec = (orig_num_bytes -
+ extent_num_bytes);
+ if (root->ref_cows && extent_start != 0)
+ inode_sub_bytes(inode, num_dec);
+ btrfs_mark_buffer_dirty(leaf);
+ } else {
+ extent_num_bytes =
+ btrfs_file_extent_disk_num_bytes(leaf,
+ fi);
+ extent_offset = found_key.offset -
+ btrfs_file_extent_offset(leaf, fi);
+
+ /* FIXME blocksize != 4096 */
+ num_dec = btrfs_file_extent_num_bytes(leaf, fi);
+ if (extent_start != 0) {
+ found_extent = 1;
+ if (root->ref_cows)
+ inode_sub_bytes(inode, num_dec);
+ }
+ }
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ /*
+ * we can't truncate inline items that have had
+ * special encodings
+ */
+ if (!del_item &&
+ btrfs_file_extent_compression(leaf, fi) == 0 &&
+ btrfs_file_extent_encryption(leaf, fi) == 0 &&
+ btrfs_file_extent_other_encoding(leaf, fi) == 0) {
+ u32 size = new_size - found_key.offset;
+
+ if (root->ref_cows) {
+ inode_sub_bytes(inode, item_end + 1 -
+ new_size);
+ }
+ size =
+ btrfs_file_extent_calc_inline_size(size);
+ ret = btrfs_truncate_item(trans, root, path,
+ size, 1);
+ BUG_ON(ret);
+ } else if (root->ref_cows) {
+ inode_sub_bytes(inode, item_end + 1 -
+ found_key.offset);
+ }
+ }
+delete:
+ if (del_item) {
+ if (!pending_del_nr) {
+ /* no pending yet, add ourselves */
+ pending_del_slot = path->slots[0];
+ pending_del_nr = 1;
+ } else if (pending_del_nr &&
+ path->slots[0] + 1 == pending_del_slot) {
+ /* hop on the pending chunk */
+ pending_del_nr++;
+ pending_del_slot = path->slots[0];
+ } else {
+ BUG();
+ }
+ } else {
+ break;
+ }
+ if (found_extent && root->ref_cows) {
+ btrfs_set_path_blocking(path);
+ ret = btrfs_free_extent(trans, root, extent_start,
+ extent_num_bytes, 0,
+ btrfs_header_owner(leaf),
+ inode->i_ino, extent_offset);
+ BUG_ON(ret);
+ }
+
+ if (found_type == BTRFS_INODE_ITEM_KEY)
+ break;
+
+ if (path->slots[0] == 0 ||
+ path->slots[0] != pending_del_slot) {
+ if (root->ref_cows) {
+ err = -EAGAIN;
+ goto out;
+ }
+ if (pending_del_nr) {
+ ret = btrfs_del_items(trans, root, path,
+ pending_del_slot,
+ pending_del_nr);
+ BUG_ON(ret);
+ pending_del_nr = 0;
+ }
+ btrfs_release_path(root, path);
+ goto search_again;
+ } else {
+ path->slots[0]--;
+ }
+ }
+out:
+ if (pending_del_nr) {
+ ret = btrfs_del_items(trans, root, path, pending_del_slot,
+ pending_del_nr);
+ }
+ btrfs_free_path(path);
+ return err;
+}
+
+/*
+ * taken from block_truncate_page, but does cow as it zeros out
+ * any bytes left in the last page in the file.
+ */
+static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
+{
+ struct inode *inode = mapping->host;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ char *kaddr;
+ u32 blocksize = root->sectorsize;
+ pgoff_t index = from >> PAGE_CACHE_SHIFT;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ struct page *page;
+ int ret = 0;
+ u64 page_start;
+ u64 page_end;
+
+ if ((offset & (blocksize - 1)) == 0)
+ goto out;
+ ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+ if (ret)
+ goto out;
+
+ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ if (ret)
+ goto out;
+
+ ret = -ENOMEM;
+again:
+ page = grab_cache_page(mapping, index);
+ if (!page) {
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ goto out;
+ }
+
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+ if (!PageUptodate(page)) {
+ ret = btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (page->mapping != mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto again;
+ }
+ if (!PageUptodate(page)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+ }
+ wait_on_page_writeback(page);
+
+ lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ set_page_extent_mapped(page);
+
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ if (ordered) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_page(page);
+ page_cache_release(page);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+
+ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+ GFP_NOFS);
+
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+ if (ret) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ goto out_unlock;
+ }
+
+ ret = 0;
+ if (offset != PAGE_CACHE_SIZE) {
+ kaddr = kmap(page);
+ memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+ flush_dcache_page(page);
+ kunmap(page);
+ }
+ ClearPageChecked(page);
+ set_page_dirty(page);
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+out_unlock:
+ if (ret)
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ unlock_page(page);
+ page_cache_release(page);
+out:
+ return ret;
+}
+
+int btrfs_cont_expand(struct inode *inode, loff_t size)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map *em;
+ u64 mask = root->sectorsize - 1;
+ u64 hole_start = (inode->i_size + mask) & ~mask;
+ u64 block_end = (size + mask) & ~mask;
+ u64 last_byte;
+ u64 cur_offset;
+ u64 hole_size;
+ int err = 0;
+
+ if (size <= hole_start)
+ return 0;
+
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+ btrfs_wait_ordered_range(inode, hole_start,
+ block_end - hole_start);
+ lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+ ordered = btrfs_lookup_ordered_extent(inode, hole_start);
+ if (!ordered)
+ break;
+ unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ cur_offset = hole_start;
+ while (1) {
+ em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+ block_end - cur_offset, 0);
+ BUG_ON(IS_ERR(em) || !em);
+ last_byte = min(extent_map_end(em), block_end);
+ last_byte = (last_byte + mask) & ~mask;
+ if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ u64 hint_byte = 0;
+ hole_size = last_byte - cur_offset;
+
+ err = btrfs_reserve_metadata_space(root, 2);
+ if (err)
+ break;
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+
+ err = btrfs_drop_extents(trans, inode, cur_offset,
+ cur_offset + hole_size,
+ &hint_byte, 1);
+ BUG_ON(err);
+
+ err = btrfs_insert_file_extent(trans, root,
+ inode->i_ino, cur_offset, 0,
+ 0, hole_size, 0, hole_size,
+ 0, 0, 0);
+ BUG_ON(err);
+
+ btrfs_drop_extent_cache(inode, hole_start,
+ last_byte - 1, 0);
+
+ btrfs_end_transaction(trans, root);
+ btrfs_unreserve_metadata_space(root, 2);
+ }
+ free_extent_map(em);
+ cur_offset = last_byte;
+ if (cur_offset >= block_end)
+ break;
+ }
+
+ unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+ return err;
+}
+
+static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ unsigned long nr;
+ int ret;
+
+ if (attr->ia_size == inode->i_size)
+ return 0;
+
+ if (attr->ia_size > inode->i_size) {
+ unsigned long limit;
+ limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ if (attr->ia_size > inode->i_sb->s_maxbytes)
+ return -EFBIG;
+ if (limit != RLIM_INFINITY && attr->ia_size > limit) {
+ send_sig(SIGXFSZ, current, 0);
+ return -EFBIG;
+ }
+ }
+
+ ret = btrfs_reserve_metadata_space(root, 1);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+
+ ret = btrfs_orphan_add(trans, inode);
+ BUG_ON(ret);
+
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+ btrfs_unreserve_metadata_space(root, 1);
+ btrfs_btree_balance_dirty(root, nr);
+
+ if (attr->ia_size > inode->i_size) {
+ ret = btrfs_cont_expand(inode, attr->ia_size);
+ if (ret) {
+ btrfs_truncate(inode);
+ return ret;
+ }
+
+ i_size_write(inode, attr->ia_size);
+ btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+ if (inode->i_nlink > 0) {
+ ret = btrfs_orphan_del(trans, inode);
+ BUG_ON(ret);
+ }
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root, nr);
+ return 0;
+ }
+
+ /*
+ * We're truncating a file that used to have good data down to
+ * zero. Make sure it gets into the ordered flush list so that
+ * any new writes get down to disk quickly.
+ */
+ if (attr->ia_size == 0)
+ BTRFS_I(inode)->ordered_data_close = 1;
+
+ /* we don't support swapfiles, so vmtruncate shouldn't fail */
+ ret = vmtruncate(inode, attr->ia_size);
+ BUG_ON(ret);
+
+ return 0;
+}
+
+static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = dentry->d_inode;
+ int err;
+
+ err = inode_change_ok(inode, attr);
+ if (err)
+ return err;
+
+ if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+ err = btrfs_setattr_size(inode, attr);
+ if (err)
+ return err;
+ }
+ attr->ia_valid &= ~ATTR_SIZE;
+
+ if (attr->ia_valid)
+ err = inode_setattr(inode, attr);
+
+ if (!err && ((attr->ia_valid & ATTR_MODE)))
+ err = btrfs_acl_chmod(inode);
+ return err;
+}
+
+void btrfs_delete_inode(struct inode *inode)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ unsigned long nr;
+ int ret;
+
+ truncate_inode_pages(&inode->i_data, 0);
+ if (is_bad_inode(inode)) {
+ btrfs_orphan_del(NULL, inode);
+ goto no_delete;
+ }
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+
+ if (root->fs_info->log_root_recovering) {
+ BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
+ goto no_delete;
+ }
+
+ if (inode->i_nlink > 0) {
+ BUG_ON(btrfs_root_refs(&root->root_item) != 0);
+ goto no_delete;
+ }
+
+ btrfs_i_size_write(inode, 0);
+
+ while (1) {
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+ ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+
+ if (ret != -EAGAIN)
+ break;
+
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+ trans = NULL;
+ btrfs_btree_balance_dirty(root, nr);
+ }
+
+ if (ret == 0) {
+ ret = btrfs_orphan_del(trans, inode);
+ BUG_ON(ret);
+ }
+
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root, nr);
+no_delete:
+ clear_inode(inode);
+ return;
+}
+
+/*
+ * this returns the key found in the dir entry in the location pointer.
+ * If no dir entries were found, location->objectid is 0.
+ */
+static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
+ struct btrfs_key *location)
+{
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ struct btrfs_dir_item *di;
+ struct btrfs_path *path;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
+ namelen, 0);
+ if (IS_ERR(di))
+ ret = PTR_ERR(di);
+
+ if (!di || IS_ERR(di))
+ goto out_err;
+
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
+out:
+ btrfs_free_path(path);
+ return ret;
+out_err:
+ location->objectid = 0;
+ goto out;
+}
+
+/*
+ * when we hit a tree root in a directory, the btrfs part of the inode
+ * needs to be changed to reflect the root directory of the tree root. This
+ * is kind of like crossing a mount point.
+ */
+static int fixup_tree_root_location(struct btrfs_root *root,
+ struct inode *dir,
+ struct dentry *dentry,
+ struct btrfs_key *location,
+ struct btrfs_root **sub_root)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *new_root;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
+ int ret;
+ int err = 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = -ENOENT;
+ ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
+ BTRFS_I(dir)->root->root_key.objectid,
+ location->objectid);
+ if (ret) {
+ if (ret < 0)
+ err = ret;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+ if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino ||
+ btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
+ goto out;
+
+ ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
+ (unsigned long)(ref + 1),
+ dentry->d_name.len);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(root->fs_info->tree_root, path);
+
+ new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
+ if (IS_ERR(new_root)) {
+ err = PTR_ERR(new_root);
+ goto out;
+ }
+
+ if (btrfs_root_refs(&new_root->root_item) == 0) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ *sub_root = new_root;
+ location->objectid = btrfs_root_dirid(&new_root->root_item);
+ location->type = BTRFS_INODE_ITEM_KEY;
+ location->offset = 0;
+ err = 0;
+out:
+ btrfs_free_path(path);
+ return err;
+}
+
+static void inode_tree_add(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *entry;
+ struct rb_node **p;
+ struct rb_node *parent;
+again:
+ p = &root->inode_tree.rb_node;
+ parent = NULL;
+
+ if (hlist_unhashed(&inode->i_hash))
+ return;
+
+ spin_lock(&root->inode_lock);
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct btrfs_inode, rb_node);
+
+ if (inode->i_ino < entry->vfs_inode.i_ino)
+ p = &parent->rb_left;
+ else if (inode->i_ino > entry->vfs_inode.i_ino)
+ p = &parent->rb_right;
+ else {
+ WARN_ON(!(entry->vfs_inode.i_state &
+ (I_WILL_FREE | I_FREEING | I_CLEAR)));
+ rb_erase(parent, &root->inode_tree);
+ RB_CLEAR_NODE(parent);
+ spin_unlock(&root->inode_lock);
+ goto again;
+ }
+ }
+ rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
+ rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree);
+ spin_unlock(&root->inode_lock);
+}
+
+static void inode_tree_del(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int empty = 0;
+
+ spin_lock(&root->inode_lock);
+ if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
+ rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
+ RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+ empty = RB_EMPTY_ROOT(&root->inode_tree);
+ }
+ spin_unlock(&root->inode_lock);
+
+ if (empty && btrfs_root_refs(&root->root_item) == 0) {
+ synchronize_srcu(&root->fs_info->subvol_srcu);
+ spin_lock(&root->inode_lock);
+ empty = RB_EMPTY_ROOT(&root->inode_tree);
+ spin_unlock(&root->inode_lock);
+ if (empty)
+ btrfs_add_dead_root(root);
+ }
+}
+
+int btrfs_invalidate_inodes(struct btrfs_root *root)
+{
+ struct rb_node *node;
+ struct rb_node *prev;
+ struct btrfs_inode *entry;
+ struct inode *inode;
+ u64 objectid = 0;
+
+ WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+
+ spin_lock(&root->inode_lock);
+again:
+ node = root->inode_tree.rb_node;
+ prev = NULL;
+ while (node) {
+ prev = node;
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+ if (objectid < entry->vfs_inode.i_ino)
+ node = node->rb_left;
+ else if (objectid > entry->vfs_inode.i_ino)
+ node = node->rb_right;
+ else
+ break;
+ }
+ if (!node) {
+ while (prev) {
+ entry = rb_entry(prev, struct btrfs_inode, rb_node);
+ if (objectid <= entry->vfs_inode.i_ino) {
+ node = prev;
+ break;
+ }
+ prev = rb_next(prev);
+ }
+ }
+ while (node) {
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+ objectid = entry->vfs_inode.i_ino + 1;
+ inode = igrab(&entry->vfs_inode);
+ if (inode) {
+ spin_unlock(&root->inode_lock);
+ if (atomic_read(&inode->i_count) > 1)
+ d_prune_aliases(inode);
+ /*
+ * btrfs_drop_inode will remove it from
+ * the inode cache when its usage count
+ * hits zero.
+ */
+ iput(inode);
+ cond_resched();
+ spin_lock(&root->inode_lock);
+ goto again;
+ }
+
+ if (cond_resched_lock(&root->inode_lock))
+ goto again;
+
+ node = rb_next(node);
+ }
+ spin_unlock(&root->inode_lock);
+ return 0;
+}
+
+static noinline void init_btrfs_i(struct inode *inode)
+{
+ struct btrfs_inode *bi = BTRFS_I(inode);
+
+ bi->generation = 0;
+ bi->sequence = 0;
+ bi->last_trans = 0;
+ bi->last_sub_trans = 0;
+ bi->logged_trans = 0;
+ bi->delalloc_bytes = 0;
+ bi->reserved_bytes = 0;
+ bi->disk_i_size = 0;
+ bi->flags = 0;
+ bi->index_cnt = (u64)-1;
+ bi->last_unlink_trans = 0;
+ bi->ordered_data_close = 0;
+ extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
+ extent_io_tree_init(&BTRFS_I(inode)->io_tree,
+ inode->i_mapping, GFP_NOFS);
+ extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
+ inode->i_mapping, GFP_NOFS);
+ INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
+ INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
+ RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+ btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
+ mutex_init(&BTRFS_I(inode)->log_mutex);
+}
+
+static int btrfs_init_locked_inode(struct inode *inode, void *p)
+{
+ struct btrfs_iget_args *args = p;
+ inode->i_ino = args->ino;
+ init_btrfs_i(inode);
+ BTRFS_I(inode)->root = args->root;
+ btrfs_set_inode_space_info(args->root, inode);
+ return 0;
+}
+
+static int btrfs_find_actor(struct inode *inode, void *opaque)
+{
+ struct btrfs_iget_args *args = opaque;
+ return args->ino == inode->i_ino &&
+ args->root == BTRFS_I(inode)->root;
+}
+
+static struct inode *btrfs_iget_locked(struct super_block *s,
+ u64 objectid,
+ struct btrfs_root *root)
+{
+ struct inode *inode;
+ struct btrfs_iget_args args;
+ args.ino = objectid;
+ args.root = root;
+
+ inode = iget5_locked(s, objectid, btrfs_find_actor,
+ btrfs_init_locked_inode,
+ (void *)&args);
+ return inode;
+}
+
+/* Get an inode object given its location and corresponding root.
+ * Returns in *is_new if the inode was read from disk
+ */
+struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+ struct btrfs_root *root)
+{
+ struct inode *inode;
+
+ inode = btrfs_iget_locked(s, location->objectid, root);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (inode->i_state & I_NEW) {
+ BTRFS_I(inode)->root = root;
+ memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
+ btrfs_read_locked_inode(inode);
+
+ inode_tree_add(inode);
+ unlock_new_inode(inode);
+ }
+
+ return inode;
+}
+
+static struct inode *new_simple_dir(struct super_block *s,
+ struct btrfs_key *key,
+ struct btrfs_root *root)
+{
+ struct inode *inode = new_inode(s);
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ init_btrfs_i(inode);
+
+ BTRFS_I(inode)->root = root;
+ memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
+ BTRFS_I(inode)->dummy_inode = 1;
+
+ inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
+ inode->i_op = &simple_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+ inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+ return inode;
+}
+
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *sub_root = root;
+ struct btrfs_key location;
+ int index;
+ int ret;
+
+ dentry->d_op = &btrfs_dentry_operations;
+
+ if (dentry->d_name.len > BTRFS_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ ret = btrfs_inode_by_name(dir, dentry, &location);
+
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ if (location.objectid == 0)
+ return NULL;
+
+ if (location.type == BTRFS_INODE_ITEM_KEY) {
+ inode = btrfs_iget(dir->i_sb, &location, root);
+ return inode;
+ }
+
+ BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
+
+ index = srcu_read_lock(&root->fs_info->subvol_srcu);
+ ret = fixup_tree_root_location(root, dir, dentry,
+ &location, &sub_root);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ inode = ERR_PTR(ret);
+ else
+ inode = new_simple_dir(dir->i_sb, &location, sub_root);
+ } else {
+ inode = btrfs_iget(dir->i_sb, &location, sub_root);
+ }
+ srcu_read_unlock(&root->fs_info->subvol_srcu, index);
+
+ if (root != sub_root) {
+ down_read(&root->fs_info->cleanup_work_sem);
+ if (!(inode->i_sb->s_flags & MS_RDONLY))
+ btrfs_orphan_cleanup(sub_root);
+ up_read(&root->fs_info->cleanup_work_sem);
+ }
+
+ return inode;
+}
+
+static int btrfs_dentry_delete(struct dentry *dentry)
+{
+ struct btrfs_root *root;
+
+ if (!dentry->d_inode && !IS_ROOT(dentry))
+ dentry = dentry->d_parent;
+
+ if (dentry->d_inode) {
+ root = BTRFS_I(dentry->d_inode)->root;
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return 1;
+ }
+ return 0;
+}
+
+static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct inode *inode;
+
+ inode = btrfs_lookup_dentry(dir, dentry);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ return d_splice_alias(inode, dentry);
+}
+
+static unsigned char btrfs_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static int btrfs_real_readdir(struct file *filp, void *dirent,
+ filldir_t filldir)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_item *item;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ int ret;
+ u32 nritems;
+ struct extent_buffer *leaf;
+ int slot;
+ int advance;
+ unsigned char d_type;
+ int over = 0;
+ u32 di_cur;
+ u32 di_total;
+ u32 di_len;
+ int key_type = BTRFS_DIR_INDEX_KEY;
+ char tmp_name[32];
+ char *name_ptr;
+ int name_len;
+
+ /* FIXME, use a real flag for deciding about the key type */
+ if (root->fs_info->tree_root == root)
+ key_type = BTRFS_DIR_ITEM_KEY;
+
+ /* special case for "." */
+ if (filp->f_pos == 0) {
+ over = filldir(dirent, ".", 1,
+ 1, inode->i_ino,
+ DT_DIR);
+ if (over)
+ return 0;
+ filp->f_pos = 1;
+ }
+ /* special case for .., just use the back ref */
+ if (filp->f_pos == 1) {
+ u64 pino = parent_ino(filp->f_path.dentry);
+ over = filldir(dirent, "..", 2,
+ 2, pino, DT_DIR);
+ if (over)
+ return 0;
+ filp->f_pos = 2;
+ }
+ path = btrfs_alloc_path();
+ path->reada = 2;
+
+ btrfs_set_key_type(&key, key_type);
+ key.offset = filp->f_pos;
+ key.objectid = inode->i_ino;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto err;
+ advance = 0;
+
+ while (1) {
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ slot = path->slots[0];
+ if (advance || slot >= nritems) {
+ if (slot >= nritems - 1) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ break;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ slot = path->slots[0];
+ } else {
+ slot++;
+ path->slots[0]++;
+ }
+ }
+
+ advance = 1;
+ item = btrfs_item_nr(leaf, slot);
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ if (found_key.objectid != key.objectid)
+ break;
+ if (btrfs_key_type(&found_key) != key_type)
+ break;
+ if (found_key.offset < filp->f_pos)
+ continue;
+
+ filp->f_pos = found_key.offset;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ di_cur = 0;
+ di_total = btrfs_item_size(leaf, item);
+
+ while (di_cur < di_total) {
+ struct btrfs_key location;
+
+ name_len = btrfs_dir_name_len(leaf, di);
+ if (name_len <= sizeof(tmp_name)) {
+ name_ptr = tmp_name;
+ } else {
+ name_ptr = kmalloc(name_len, GFP_NOFS);
+ if (!name_ptr) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+ read_extent_buffer(leaf, name_ptr,
+ (unsigned long)(di + 1), name_len);
+
+ d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
+ btrfs_dir_item_key_to_cpu(leaf, di, &location);
+
+ /* is this a reference to our own snapshot? If so
+ * skip it
+ */
+ if (location.type == BTRFS_ROOT_ITEM_KEY &&
+ location.objectid == root->root_key.objectid) {
+ over = 0;
+ goto skip;
+ }
+ over = filldir(dirent, name_ptr, name_len,
+ found_key.offset, location.objectid,
+ d_type);
+
+skip:
+ if (name_ptr != tmp_name)
+ kfree(name_ptr);
+
+ if (over)
+ goto nopos;
+ di_len = btrfs_dir_name_len(leaf, di) +
+ btrfs_dir_data_len(leaf, di) + sizeof(*di);
+ di_cur += di_len;
+ di = (struct btrfs_dir_item *)((char *)di + di_len);
+ }
+ }
+
+ /* Reached end of directory/root. Bump pos past the last item. */
+ if (key_type == BTRFS_DIR_INDEX_KEY)
+ /*
+ * 32-bit glibc will use getdents64, but then strtol -
+ * so the last number we can serve is this.
+ */
+ filp->f_pos = 0x7fffffff;
+ else
+ filp->f_pos++;
+nopos:
+ ret = 0;
+err:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_write_inode(struct inode *inode, int wait)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ if (root->fs_info->btree_inode == inode)
+ return 0;
+
+ if (wait) {
+ trans = btrfs_join_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+ ret = btrfs_commit_transaction(trans, root);
+ }
+ return ret;
+}
+
+/*
+ * This is somewhat expensive, updating the tree every time the
+ * inode changes. But, it is most likely to find the inode in cache.
+ * FIXME, needs more benchmarking...there are no reasons other than performance
+ * to keep or drop this code.
+ */
+void btrfs_dirty_inode(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_join_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+ btrfs_update_inode(trans, root, inode);
+ btrfs_end_transaction(trans, root);
+}
+
+/*
+ * find the highest existing sequence number in a directory
+ * and then set the in-memory index_cnt variable to reflect
+ * free sequence numbers
+ */
+static int btrfs_set_inode_index_count(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_key key, found_key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int ret;
+
+ key.objectid = inode->i_ino;
+ btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+ key.offset = (u64)-1;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ /* FIXME: we should be able to handle this */
+ if (ret == 0)
+ goto out;
+ ret = 0;
+
+ /*
+ * MAGIC NUMBER EXPLANATION:
+ * since we search a directory based on f_pos we have to start at 2
+ * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
+ * else has to start at 2
+ */
+ if (path->slots[0] == 0) {
+ BTRFS_I(inode)->index_cnt = 2;
+ goto out;
+ }
+
+ path->slots[0]--;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid != inode->i_ino ||
+ btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
+ BTRFS_I(inode)->index_cnt = 2;
+ goto out;
+ }
+
+ BTRFS_I(inode)->index_cnt = found_key.offset + 1;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper to find a free sequence number in a given directory. This current
+ * code is very simple, later versions will do smarter things in the btree
+ */
+int btrfs_set_inode_index(struct inode *dir, u64 *index)
+{
+ int ret = 0;
+
+ if (BTRFS_I(dir)->index_cnt == (u64)-1) {
+ ret = btrfs_set_inode_index_count(dir);
+ if (ret)
+ return ret;
+ }
+
+ *index = BTRFS_I(dir)->index_cnt;
+ BTRFS_I(dir)->index_cnt++;
+
+ return ret;
+}
+
+static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir,
+ const char *name, int name_len,
+ u64 ref_objectid, u64 objectid,
+ u64 alloc_hint, int mode, u64 *index)
+{
+ struct inode *inode;
+ struct btrfs_inode_item *inode_item;
+ struct btrfs_key *location;
+ struct btrfs_path *path;
+ struct btrfs_inode_ref *ref;
+ struct btrfs_key key[2];
+ u32 sizes[2];
+ unsigned long ptr;
+ int ret;
+ int owner;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ inode = new_inode(root->fs_info->sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (dir) {
+ ret = btrfs_set_inode_index(dir, index);
+ if (ret) {
+ iput(inode);
+ return ERR_PTR(ret);
+ }
+ }
+ /*
+ * index_cnt is ignored for everything but a dir,
+ * btrfs_get_inode_index_count has an explanation for the magic
+ * number
+ */
+ init_btrfs_i(inode);
+ BTRFS_I(inode)->index_cnt = 2;
+ BTRFS_I(inode)->root = root;
+ BTRFS_I(inode)->generation = trans->transid;
+ btrfs_set_inode_space_info(root, inode);
+
+ if (mode & S_IFDIR)
+ owner = 0;
+ else
+ owner = 1;
+ BTRFS_I(inode)->block_group =
+ btrfs_find_block_group(root, 0, alloc_hint, owner);
+
+ key[0].objectid = objectid;
+ btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
+ key[0].offset = 0;
+
+ key[1].objectid = objectid;
+ btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
+ key[1].offset = ref_objectid;
+
+ sizes[0] = sizeof(struct btrfs_inode_item);
+ sizes[1] = name_len + sizeof(*ref);
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
+ if (ret != 0)
+ goto fail;
+
+ inode->i_uid = current_fsuid();
+
+ if (dir && (dir->i_mode & S_ISGID)) {
+ inode->i_gid = dir->i_gid;
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+ } else
+ inode->i_gid = current_fsgid();
+
+ inode->i_mode = mode;
+ inode->i_ino = objectid;
+ inode_set_bytes(inode, 0);
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ fill_inode_item(trans, path->nodes[0], inode_item, inode);
+
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
+ struct btrfs_inode_ref);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
+ ptr = (unsigned long)(ref + 1);
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_free_path(path);
+
+ location = &BTRFS_I(inode)->location;
+ location->objectid = objectid;
+ location->offset = 0;
+ btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+
+ btrfs_inherit_iflags(inode, dir);
+
+ if ((mode & S_IFREG)) {
+ if (btrfs_test_opt(root, NODATASUM))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+ if (btrfs_test_opt(root, NODATACOW))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+ }
+
+ insert_inode_hash(inode);
+ inode_tree_add(inode);
+ return inode;
+fail:
+ if (dir)
+ BTRFS_I(dir)->index_cnt--;
+ btrfs_free_path(path);
+ iput(inode);
+ return ERR_PTR(ret);
+}
+
+static inline u8 btrfs_inode_type(struct inode *inode)
+{
+ return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
+}
+
+/*
+ * utility function to add 'inode' into 'parent_inode' with
+ * a give name and a given sequence number.
+ * if 'add_backref' is true, also insert a backref from the
+ * inode to the parent directory.
+ */
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+ struct inode *parent_inode, struct inode *inode,
+ const char *name, int name_len, int add_backref, u64 index)
+{
+ int ret = 0;
+ struct btrfs_key key;
+ struct btrfs_root *root = BTRFS_I(parent_inode)->root;
+
+ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
+ } else {
+ key.objectid = inode->i_ino;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+ }
+
+ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+ key.objectid, root->root_key.objectid,
+ parent_inode->i_ino,
+ index, name, name_len);
+ } else if (add_backref) {
+ ret = btrfs_insert_inode_ref(trans, root,
+ name, name_len, inode->i_ino,
+ parent_inode->i_ino, index);
+ }
+
+ if (ret == 0) {
+ ret = btrfs_insert_dir_item(trans, root, name, name_len,
+ parent_inode->i_ino, &key,
+ btrfs_inode_type(inode), index);
+ BUG_ON(ret);
+
+ btrfs_i_size_write(parent_inode, parent_inode->i_size +
+ name_len * 2);
+ parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode(trans, root, parent_inode);
+ }
+ return ret;
+}
+
+static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
+ struct dentry *dentry, struct inode *inode,
+ int backref, u64 index)
+{
+ int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
+ inode, dentry->d_name.name,
+ dentry->d_name.len, backref, index);
+ if (!err) {
+ d_instantiate(dentry, inode);
+ return 0;
+ }
+ if (err > 0)
+ err = -EEXIST;
+ return err;
+}
+
+static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
+ int mode, dev_t rdev)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = NULL;
+ int err;
+ int drop_inode = 0;
+ u64 objectid;
+ unsigned long nr = 0;
+ u64 index = 0;
+
+ if (!new_valid_dev(rdev))
+ return -EINVAL;
+
+ /*
+ * 2 for inode item and ref
+ * 2 for dir items
+ * 1 for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
+ if (err)
+ return err;
+
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans)
+ goto fail;
+ btrfs_set_trans_block_group(trans, dir);
+
+ err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+ if (err) {
+ err = -ENOSPC;
+ goto out_unlock;
+ }
+
+ inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+ dentry->d_name.len,
+ dentry->d_parent->d_inode->i_ino, objectid,
+ BTRFS_I(dir)->block_group, mode, &index);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_unlock;
+
+ err = btrfs_init_inode_security(trans, inode, dir);
+ if (err) {
+ drop_inode = 1;
+ goto out_unlock;
+ }
+
+ btrfs_set_trans_block_group(trans, inode);
+ err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+ if (err)
+ drop_inode = 1;
+ else {
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, rdev);
+ btrfs_update_inode(trans, root, inode);
+ }
+ btrfs_update_inode_block_group(trans, inode);
+ btrfs_update_inode_block_group(trans, dir);
+out_unlock:
+ nr = trans->blocks_used;
+ btrfs_end_transaction_throttle(trans, root);
+fail:
+ btrfs_unreserve_metadata_space(root, 5);
+ if (drop_inode) {
+ inode_dec_link_count(inode);
+ iput(inode);
+ }
+ btrfs_btree_balance_dirty(root, nr);
+ return err;
+}
+
+static int btrfs_create(struct inode *dir, struct dentry *dentry,
+ int mode, struct nameidata *nd)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = NULL;
+ int err;
+ int drop_inode = 0;
+ unsigned long nr = 0;
+ u64 objectid;
+ u64 index = 0;
+
+ /*
+ * 2 for inode item and ref
+ * 2 for dir items
+ * 1 for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
+ if (err)
+ return err;
+
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans)
+ goto fail;
+ btrfs_set_trans_block_group(trans, dir);
+
+ err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+ if (err) {
+ err = -ENOSPC;
+ goto out_unlock;
+ }
+
+ inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+ dentry->d_name.len,
+ dentry->d_parent->d_inode->i_ino,
+ objectid, BTRFS_I(dir)->block_group, mode,
+ &index);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_unlock;
+
+ err = btrfs_init_inode_security(trans, inode, dir);
+ if (err) {
+ drop_inode = 1;
+ goto out_unlock;
+ }
+
+ btrfs_set_trans_block_group(trans, inode);
+ err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+ if (err)
+ drop_inode = 1;
+ else {
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ }
+ btrfs_update_inode_block_group(trans, inode);
+ btrfs_update_inode_block_group(trans, dir);
+out_unlock:
+ nr = trans->blocks_used;
+ btrfs_end_transaction_throttle(trans, root);
+fail:
+ btrfs_unreserve_metadata_space(root, 5);
+ if (drop_inode) {
+ inode_dec_link_count(inode);
+ iput(inode);
+ }
+ btrfs_btree_balance_dirty(root, nr);
+ return err;
+}
+
+static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = old_dentry->d_inode;
+ u64 index;
+ unsigned long nr = 0;
+ int err;
+ int drop_inode = 0;
+
+ if (inode->i_nlink == 0)
+ return -ENOENT;
+
+ /* do not allow sys_link's with other subvols of the same device */
+ if (root->objectid != BTRFS_I(inode)->root->objectid)
+ return -EPERM;
+
+ /*
+ * 1 item for inode ref
+ * 2 items for dir items
+ */
+ err = btrfs_reserve_metadata_space(root, 3);
+ if (err)
+ return err;
+
+ btrfs_inc_nlink(inode);
+
+ err = btrfs_set_inode_index(dir, &index);
+ if (err)
+ goto fail;
+
+ trans = btrfs_start_transaction(root, 1);
+
+ btrfs_set_trans_block_group(trans, dir);
+ atomic_inc(&inode->i_count);
+
+ err = btrfs_add_nondir(trans, dentry, inode, 1, index);
+
+ if (err) {
+ drop_inode = 1;
+ } else {
+ btrfs_update_inode_block_group(trans, dir);
+ err = btrfs_update_inode(trans, root, inode);
+ BUG_ON(err);
+ btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
+ }
+
+ nr = trans->blocks_used;
+ btrfs_end_transaction_throttle(trans, root);
+fail:
+ btrfs_unreserve_metadata_space(root, 3);
+ if (drop_inode) {
+ inode_dec_link_count(inode);
+ iput(inode);
+ }
+ btrfs_btree_balance_dirty(root, nr);
+ return err;
+}
+
+static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ struct inode *inode = NULL;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ int err = 0;
+ int drop_on_err = 0;
+ u64 objectid = 0;
+ u64 index = 0;
+ unsigned long nr = 1;
+
+ /*
+ * 2 items for inode and ref
+ * 2 items for dir items
+ * 1 for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
+ if (err)
+ return err;
+
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ btrfs_set_trans_block_group(trans, dir);
+
+ err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+ if (err) {
+ err = -ENOSPC;
+ goto out_unlock;
+ }
+
+ inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+ dentry->d_name.len,
+ dentry->d_parent->d_inode->i_ino, objectid,
+ BTRFS_I(dir)->block_group, S_IFDIR | mode,
+ &index);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_fail;
+ }
+
+ drop_on_err = 1;
+
+ err = btrfs_init_inode_security(trans, inode, dir);
+ if (err)
+ goto out_fail;
+
+ inode->i_op = &btrfs_dir_inode_operations;
+ inode->i_fop = &btrfs_dir_file_operations;
+ btrfs_set_trans_block_group(trans, inode);
+
+ btrfs_i_size_write(inode, 0);
+ err = btrfs_update_inode(trans, root, inode);
+ if (err)
+ goto out_fail;
+
+ err = btrfs_add_link(trans, dentry->d_parent->d_inode,
+ inode, dentry->d_name.name,
+ dentry->d_name.len, 0, index);
+ if (err)
+ goto out_fail;
+
+ d_instantiate(dentry, inode);
+ drop_on_err = 0;
+ btrfs_update_inode_block_group(trans, inode);
+ btrfs_update_inode_block_group(trans, dir);
+
+out_fail:
+ nr = trans->blocks_used;
+ btrfs_end_transaction_throttle(trans, root);
+
+out_unlock:
+ btrfs_unreserve_metadata_space(root, 5);
+ if (drop_on_err)
+ iput(inode);
+ btrfs_btree_balance_dirty(root, nr);
+ return err;
+}
+
+/* helper for btfs_get_extent. Given an existing extent in the tree,
+ * and an extent that you want to insert, deal with overlap and insert
+ * the new extent into the tree.
+ */
+static int merge_extent_mapping(struct extent_map_tree *em_tree,
+ struct extent_map *existing,
+ struct extent_map *em,
+ u64 map_start, u64 map_len)
+{
+ u64 start_diff;
+
+ BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
+ start_diff = map_start - em->start;
+ em->start = map_start;
+ em->len = map_len;
+ if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+ !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ em->block_start += start_diff;
+ em->block_len -= start_diff;
+ }
+ return add_extent_mapping(em_tree, em);
+}
+
+static noinline int uncompress_inline(struct btrfs_path *path,
+ struct inode *inode, struct page *page,
+ size_t pg_offset, u64 extent_offset,
+ struct btrfs_file_extent_item *item)
+{
+ int ret;
+ struct extent_buffer *leaf = path->nodes[0];
+ char *tmp;
+ size_t max_size;
+ unsigned long inline_size;
+ unsigned long ptr;
+
+ WARN_ON(pg_offset != 0);
+ max_size = btrfs_file_extent_ram_bytes(leaf, item);
+ inline_size = btrfs_file_extent_inline_item_len(leaf,
+ btrfs_item_nr(leaf, path->slots[0]));
+ tmp = kmalloc(inline_size, GFP_NOFS);
+ ptr = btrfs_file_extent_inline_start(item);
+
+ read_extent_buffer(leaf, tmp, ptr, inline_size);
+
+ max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
+ ret = btrfs_zlib_decompress(tmp, page, extent_offset,
+ inline_size, max_size);
+ if (ret) {
+ char *kaddr = kmap_atomic(page, KM_USER0);
+ unsigned long copy_size = min_t(u64,
+ PAGE_CACHE_SIZE - pg_offset,
+ max_size - extent_offset);
+ memset(kaddr + pg_offset, 0, copy_size);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+ kfree(tmp);
+ return 0;
+}
+
+/*
+ * a bit scary, this does extent mapping from logical file offset to the disk.
+ * the ugly parts come from merging extents from the disk with the in-ram
+ * representation. This gets more complex because of the data=ordered code,
+ * where the in-ram extents might be locked pending data=ordered completion.
+ *
+ * This also copies inline extents directly into the page.
+ */
+
+struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
+ size_t pg_offset, u64 start, u64 len,
+ int create)
+{
+ int ret;
+ int err = 0;
+ u64 bytenr;
+ u64 extent_start = 0;
+ u64 extent_end = 0;
+ u64 objectid = inode->i_ino;
+ u32 found_type;
+ struct btrfs_path *path = NULL;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_file_extent_item *item;
+ struct extent_buffer *leaf;
+ struct btrfs_key found_key;
+ struct extent_map *em = NULL;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_trans_handle *trans = NULL;
+ int compressed;
+
+again:
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (em)
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ read_unlock(&em_tree->lock);
+
+ if (em) {
+ if (em->start > start || em->start + em->len <= start)
+ free_extent_map(em);
+ else if (em->block_start == EXTENT_MAP_INLINE && page)
+ free_extent_map(em);
+ else
+ goto out;
+ }
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em) {
+ err = -ENOMEM;
+ goto out;
+ }
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->start = EXTENT_MAP_HOLE;
+ em->orig_start = EXTENT_MAP_HOLE;
+ em->len = (u64)-1;
+ em->block_len = (u64)-1;
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ }
+
+ ret = btrfs_lookup_file_extent(trans, root, path,
+ objectid, start, trans != NULL);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+
+ if (ret != 0) {
+ if (path->slots[0] == 0)
+ goto not_found;
+ path->slots[0]--;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ /* are we inside the extent that was found? */
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ found_type = btrfs_key_type(&found_key);
+ if (found_key.objectid != objectid ||
+ found_type != BTRFS_EXTENT_DATA_KEY) {
+ goto not_found;
+ }
+
+ found_type = btrfs_file_extent_type(leaf, item);
+ extent_start = found_key.offset;
+ compressed = btrfs_file_extent_compression(leaf, item);
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ extent_end = extent_start +
+ btrfs_file_extent_num_bytes(leaf, item);
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ size_t size;
+ size = btrfs_file_extent_inline_len(leaf, item);
+ extent_end = (extent_start + size + root->sectorsize - 1) &
+ ~((u64)root->sectorsize - 1);
+ }
+
+ if (start >= extent_end) {
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret > 0)
+ goto not_found;
+ leaf = path->nodes[0];
+ }
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != objectid ||
+ found_key.type != BTRFS_EXTENT_DATA_KEY)
+ goto not_found;
+ if (start + len <= found_key.offset)
+ goto not_found;
+ em->start = start;
+ em->len = found_key.offset - start;
+ goto not_found_em;
+ }
+
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ em->start = extent_start;
+ em->len = extent_end - extent_start;
+ em->orig_start = extent_start -
+ btrfs_file_extent_offset(leaf, item);
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
+ if (bytenr == 0) {
+ em->block_start = EXTENT_MAP_HOLE;
+ goto insert;
+ }
+ if (compressed) {
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->block_start = bytenr;
+ em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
+ item);
+ } else {
+ bytenr += btrfs_file_extent_offset(leaf, item);
+ em->block_start = bytenr;
+ em->block_len = em->len;
+ if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ }
+ goto insert;
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ unsigned long ptr;
+ char *map;
+ size_t size;
+ size_t extent_offset;
+ size_t copy_size;
+
+ em->block_start = EXTENT_MAP_INLINE;
+ if (!page || create) {
+ em->start = extent_start;
+ em->len = extent_end - extent_start;
+ goto out;
+ }
+
+ size = btrfs_file_extent_inline_len(leaf, item);
+ extent_offset = page_offset(page) + pg_offset - extent_start;
+ copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
+ size - extent_offset);
+ em->start = extent_start + extent_offset;
+ em->len = (copy_size + root->sectorsize - 1) &
+ ~((u64)root->sectorsize - 1);
+ em->orig_start = EXTENT_MAP_INLINE;
+ if (compressed)
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ ptr = btrfs_file_extent_inline_start(item) + extent_offset;
+ if (create == 0 && !PageUptodate(page)) {
+ if (btrfs_file_extent_compression(leaf, item) ==
+ BTRFS_COMPRESS_ZLIB) {
+ ret = uncompress_inline(path, inode, page,
+ pg_offset,
+ extent_offset, item);
+ BUG_ON(ret);
+ } else {
+ map = kmap(page);
+ read_extent_buffer(leaf, map + pg_offset, ptr,
+ copy_size);
+ if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
+ memset(map + pg_offset + copy_size, 0,
+ PAGE_CACHE_SIZE - pg_offset -
+ copy_size);
+ }
+ kunmap(page);
+ }
+ flush_dcache_page(page);
+ } else if (create && PageUptodate(page)) {
+ if (!trans) {
+ kunmap(page);
+ free_extent_map(em);
+ em = NULL;
+ btrfs_release_path(root, path);
+ trans = btrfs_join_transaction(root, 1);
+ goto again;
+ }
+ map = kmap(page);
+ write_extent_buffer(leaf, map + pg_offset, ptr,
+ copy_size);
+ kunmap(page);
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ set_extent_uptodate(io_tree, em->start,
+ extent_map_end(em) - 1, GFP_NOFS);
+ goto insert;
+ } else {
+ printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
+ WARN_ON(1);
+ }
+not_found:
+ em->start = start;
+ em->len = len;
+not_found_em:
+ em->block_start = EXTENT_MAP_HOLE;
+ set_bit(EXTENT_FLAG_VACANCY, &em->flags);
+insert:
+ btrfs_release_path(root, path);
+ if (em->start > start || extent_map_end(em) <= start) {
+ printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
+ "[%llu %llu]\n", (unsigned long long)em->start,
+ (unsigned long long)em->len,
+ (unsigned long long)start,
+ (unsigned long long)len);
+ err = -EIO;
+ goto out;
+ }
+
+ err = 0;
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ /* it is possible that someone inserted the extent into the tree
+ * while we had the lock dropped. It is also possible that
+ * an overlapping map exists in the tree
+ */
+ if (ret == -EEXIST) {
+ struct extent_map *existing;
+
+ ret = 0;
+
+ existing = lookup_extent_mapping(em_tree, start, len);
+ if (existing && (existing->start > start ||
+ existing->start + existing->len <= start)) {
+ free_extent_map(existing);
+ existing = NULL;
+ }
+ if (!existing) {
+ existing = lookup_extent_mapping(em_tree, em->start,
+ em->len);
+ if (existing) {
+ err = merge_extent_mapping(em_tree, existing,
+ em, start,
+ root->sectorsize);
+ free_extent_map(existing);
+ if (err) {
+ free_extent_map(em);
+ em = NULL;
+ }
+ } else {
+ err = -EIO;
+ free_extent_map(em);
+ em = NULL;
+ }
+ } else {
+ free_extent_map(em);
+ em = existing;
+ err = 0;
+ }
+ }
+ write_unlock(&em_tree->lock);
+out:
+ if (path)
+ btrfs_free_path(path);
+ if (trans) {
+ ret = btrfs_end_transaction(trans, root);
+ if (!err)
+ err = ret;
+ }
+ if (err) {
+ free_extent_map(em);
+ return ERR_PTR(err);
+ }
+ return em;
+}
+
+static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ return -EINVAL;
+}
+
+static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len)
+{
+ return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
+}
+
+int btrfs_readpage(struct file *file, struct page *page)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ return extent_read_full_page(tree, page, btrfs_get_extent);
+}
+
+static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct extent_io_tree *tree;
+
+
+ if (current->flags & PF_MEMALLOC) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+}
+
+int btrfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct extent_io_tree *tree;
+
+ tree = &BTRFS_I(mapping->host)->io_tree;
+ return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
+}
+
+static int
+btrfs_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(mapping->host)->io_tree;
+ return extent_readpages(tree, mapping, pages, nr_pages,
+ btrfs_get_extent);
+}
+static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+{
+ struct extent_io_tree *tree;
+ struct extent_map_tree *map;
+ int ret;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ map = &BTRFS_I(page->mapping->host)->extent_tree;
+ ret = try_release_extent_mapping(map, tree, page, gfp_flags);
+ if (ret == 1) {
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_cache_release(page);
+ }
+ return ret;
+}
+
+static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+{
+ if (PageWriteback(page) || PageDirty(page))
+ return 0;
+ return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
+}
+
+static void btrfs_invalidatepage(struct page *page, unsigned long offset)
+{
+ struct extent_io_tree *tree;
+ struct btrfs_ordered_extent *ordered;
+ u64 page_start = page_offset(page);
+ u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+
+ /*
+ * we have the page locked, so new writeback can't start,
+ * and the dirty bit won't be cleared while we are here.
+ *
+ * Wait for IO on this page so that we can safely clear
+ * the PagePrivate2 bit and do ordered accounting
+ */
+ wait_on_page_writeback(page);
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ if (offset) {
+ btrfs_releasepage(page, GFP_NOFS);
+ return;
+ }
+ lock_extent(tree, page_start, page_end, GFP_NOFS);
+ ordered = btrfs_lookup_ordered_extent(page->mapping->host,
+ page_offset(page));
+ if (ordered) {
+ /*
+ * IO on this page will never be started, so we need
+ * to account for any ordered extents now
+ */
+ clear_extent_bit(tree, page_start, page_end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
+ NULL, GFP_NOFS);
+ /*
+ * whoever cleared the private bit is responsible
+ * for the finish_ordered_io
+ */
+ if (TestClearPagePrivate2(page)) {
+ btrfs_finish_ordered_io(page->mapping->host,
+ page_start, page_end);
+ }
+ btrfs_put_ordered_extent(ordered);
+ lock_extent(tree, page_start, page_end, GFP_NOFS);
+ }
+ clear_extent_bit(tree, page_start, page_end,
+ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
+ __btrfs_releasepage(page, GFP_NOFS);
+
+ ClearPageChecked(page);
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_cache_release(page);
+ }
+}
+
+/*
+ * btrfs_page_mkwrite() is not allowed to change the file size as it gets
+ * called from a page fault handler when a page is first dirtied. Hence we must
+ * be careful to check for EOF conditions here. We set the page up correctly
+ * for a written page which means we get ENOSPC checking when writing into
+ * holes and correct delalloc and unwritten extent mapping on filesystems that
+ * support these features.
+ *
+ * We are not allowed to take the i_mutex here so we have to play games to
+ * protect against truncate races as the page could now be beyond EOF. Because
+ * vmtruncate() writes the inode size before removing pages, once we have the
+ * page lock we can determine safely if the page is beyond EOF. If it is not
+ * beyond EOF, then the page is guaranteed safe against truncation until we
+ * unlock the page.
+ */
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = fdentry(vma->vm_file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ char *kaddr;
+ unsigned long zero_start;
+ loff_t size;
+ int ret;
+ u64 page_start;
+ u64 page_end;
+
+ ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+ if (ret) {
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else /* -ENOSPC, -EIO, etc */
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
+ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ if (ret) {
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
+ ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
+again:
+ lock_page(page);
+ size = i_size_read(inode);
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+ if ((page->mapping != inode->i_mapping) ||
+ (page_start >= size)) {
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ /* page got truncated out from underneath us */
+ goto out_unlock;
+ }
+ wait_on_page_writeback(page);
+
+ lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ set_page_extent_mapped(page);
+
+ /*
+ * we can't set the delalloc bits if there are pending ordered
+ * extents. Drop our locks and wait for them to finish
+ */
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ if (ordered) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_page(page);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+
+ /*
+ * XXX - page_mkwrite gets called every time the page is dirtied, even
+ * if it was already dirty, so for space accounting reasons we need to
+ * clear any delalloc bits for the range we are fixing to save. There
+ * is probably a better way to do this, but for now keep consistent with
+ * prepare_pages in the normal write path.
+ */
+ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+ GFP_NOFS);
+
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+ if (ret) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ ret = VM_FAULT_SIGBUS;
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ goto out_unlock;
+ }
+ ret = 0;
+
+ /* page is wholly or partially inside EOF */
+ if (page_start + PAGE_CACHE_SIZE > size)
+ zero_start = size & ~PAGE_CACHE_MASK;
+ else
+ zero_start = PAGE_CACHE_SIZE;
+
+ if (zero_start != PAGE_CACHE_SIZE) {
+ kaddr = kmap(page);
+ memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
+ flush_dcache_page(page);
+ kunmap(page);
+ }
+ ClearPageChecked(page);
+ set_page_dirty(page);
+ SetPageUptodate(page);
+
+ BTRFS_I(inode)->last_trans = root->fs_info->generation;
+ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+out_unlock:
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ if (!ret)
+ return VM_FAULT_LOCKED;
+ unlock_page(page);
+out:
+ return ret;
+}
+
+static void btrfs_truncate(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+ struct btrfs_trans_handle *trans;
+ unsigned long nr;
+ u64 mask = root->sectorsize - 1;
+
+ if (!S_ISREG(inode->i_mode)) {
+ WARN_ON(1);
+ return;
+ }
+
+ ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+ if (ret)
+ return;
+
+ btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
+ btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+
+ /*
+ * setattr is responsible for setting the ordered_data_close flag,
+ * but that is only tested during the last file release. That
+ * could happen well after the next commit, leaving a great big
+ * window where new writes may get lost if someone chooses to write
+ * to this file after truncating to zero
+ *
+ * The inode doesn't have any dirty data here, and so if we commit
+ * this is a noop. If someone immediately starts writing to the inode
+ * it is very likely we'll catch some of their writes in this
+ * transaction, and the commit will find this file on the ordered
+ * data list with good things to send down.
+ *
+ * This is a best effort solution, there is still a window where
+ * using truncate to replace the contents of the file will
+ * end up with a zero length file after a crash.
+ */
+ if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+ btrfs_add_ordered_operation(trans, root, inode);
+
+ while (1) {
+ ret = btrfs_truncate_inode_items(trans, root, inode,
+ inode->i_size,
+ BTRFS_EXTENT_DATA_KEY);
+ if (ret != -EAGAIN)
+ break;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root, nr);
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+ }
+
+ if (ret == 0 && inode->i_nlink > 0) {
+ ret = btrfs_orphan_del(trans, inode);
+ BUG_ON(ret);
+ }
+
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+
+ nr = trans->blocks_used;
+ ret = btrfs_end_transaction_throttle(trans, root);
+ BUG_ON(ret);
+ btrfs_btree_balance_dirty(root, nr);
+}
+
+/*
+ * create a new subvolume directory/inode (helper for the ioctl).
+ */
+int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *new_root,
+ u64 new_dirid, u64 alloc_hint)
+{
+ struct inode *inode;
+ int err;
+ u64 index = 0;
+
+ inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
+ new_dirid, alloc_hint, S_IFDIR | 0700, &index);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ inode->i_op = &btrfs_dir_inode_operations;
+ inode->i_fop = &btrfs_dir_file_operations;
+
+ inode->i_nlink = 1;
+ btrfs_i_size_write(inode, 0);
+
+ err = btrfs_update_inode(trans, new_root, inode);
+ BUG_ON(err);
+
+ iput(inode);
+ return 0;
+}
+
+/* helper function for file defrag and space balancing. This
+ * forces readahead on a given range of bytes in an inode
+ */
+unsigned long btrfs_force_ra(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *file,
+ pgoff_t offset, pgoff_t last_index)
+{
+ pgoff_t req_size = last_index - offset + 1;
+
+ page_cache_sync_readahead(mapping, ra, file, offset, req_size);
+ return offset + req_size;
+}
+
+struct inode *btrfs_alloc_inode(struct super_block *sb)
+{
+ struct btrfs_inode *ei;
+
+ ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
+ if (!ei)
+ return NULL;
+ ei->last_trans = 0;
+ ei->last_sub_trans = 0;
+ ei->logged_trans = 0;
+ ei->outstanding_extents = 0;
+ ei->reserved_extents = 0;
+ ei->root = NULL;
+ spin_lock_init(&ei->accounting_lock);
+ btrfs_ordered_inode_tree_init(&ei->ordered_tree);
+ INIT_LIST_HEAD(&ei->i_orphan);
+ INIT_LIST_HEAD(&ei->ordered_operations);
+ return &ei->vfs_inode;
+}
+
+void btrfs_destroy_inode(struct inode *inode)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ WARN_ON(!list_empty(&inode->i_dentry));
+ WARN_ON(inode->i_data.nrpages);
+
+ /*
+ * This can happen where we create an inode, but somebody else also
+ * created the same inode and we need to destroy the one we already
+ * created.
+ */
+ if (!root)
+ goto free;
+
+ /*
+ * Make sure we're properly removed from the ordered operation
+ * lists.
+ */
+ smp_mb();
+ if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ list_del_init(&BTRFS_I(inode)->ordered_operations);
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ }
+
+ spin_lock(&root->list_lock);
+ if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+ printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
+ inode->i_ino);
+ list_del_init(&BTRFS_I(inode)->i_orphan);
+ }
+ spin_unlock(&root->list_lock);
+
+ while (1) {
+ ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
+ if (!ordered)
+ break;
+ else {
+ printk(KERN_ERR "btrfs found ordered "
+ "extent %llu %llu on inode cleanup\n",
+ (unsigned long long)ordered->file_offset,
+ (unsigned long long)ordered->len);
+ btrfs_remove_ordered_extent(inode, ordered);
+ btrfs_put_ordered_extent(ordered);
+ btrfs_put_ordered_extent(ordered);
+ }
+ }
+ inode_tree_del(inode);
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+free:
+ kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+
+void btrfs_drop_inode(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
+ generic_delete_inode(inode);
+ else
+ generic_drop_inode(inode);
+}
+
+static void init_once(void *foo)
+{
+ struct btrfs_inode *ei = (struct btrfs_inode *) foo;
+
+ inode_init_once(&ei->vfs_inode);
+}
+
+void btrfs_destroy_cachep(void)
+{
+ if (btrfs_inode_cachep)
+ kmem_cache_destroy(btrfs_inode_cachep);
+ if (btrfs_trans_handle_cachep)
+ kmem_cache_destroy(btrfs_trans_handle_cachep);
+ if (btrfs_transaction_cachep)
+ kmem_cache_destroy(btrfs_transaction_cachep);
+ if (btrfs_path_cachep)
+ kmem_cache_destroy(btrfs_path_cachep);
+}
+
+int btrfs_init_cachep(void)
+{
+ btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
+ sizeof(struct btrfs_inode), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
+ if (!btrfs_inode_cachep)
+ goto fail;
+
+ btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
+ sizeof(struct btrfs_trans_handle), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_trans_handle_cachep)
+ goto fail;
+
+ btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
+ sizeof(struct btrfs_transaction), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_transaction_cachep)
+ goto fail;
+
+ btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
+ sizeof(struct btrfs_path), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_path_cachep)
+ goto fail;
+
+ return 0;
+fail:
+ btrfs_destroy_cachep();
+ return -ENOMEM;
+}
+
+static int btrfs_getattr(struct vfsmount *mnt,
+ struct dentry *dentry, struct kstat *stat)
+{
+ struct inode *inode = dentry->d_inode;
+ generic_fillattr(inode, stat);
+ stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
+ stat->blksize = PAGE_CACHE_SIZE;
+ stat->blocks = (inode_get_bytes(inode) +
+ BTRFS_I(inode)->delalloc_bytes) >> 9;
+ return 0;
+}
+
+static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(old_dir)->root;
+ struct btrfs_root *dest = BTRFS_I(new_dir)->root;
+ struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = old_dentry->d_inode;
+ struct timespec ctime = CURRENT_TIME;
+ u64 index = 0;
+ u64 root_objectid;
+ int ret;
+
+ if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+ return -EPERM;
+
+ /* we only allow rename subvolume link between subvolumes */
+ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+ return -EXDEV;
+
+ if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
+ (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
+ return -ENOTEMPTY;
+
+ if (S_ISDIR(old_inode->i_mode) && new_inode &&
+ new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+ return -ENOTEMPTY;
+
+ /*
+ * We want to reserve the absolute worst case amount of items. So if
+ * both inodes are subvols and we need to unlink them then that would
+ * require 4 item modifications, but if they are both normal inodes it
+ * would require 5 item modifications, so we'll assume their normal
+ * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+ * should cover the worst case number of items we'll modify.
+ */
+ ret = btrfs_reserve_metadata_space(root, 11);
+ if (ret)
+ return ret;
+
+ /*
+ * we're using rename to replace one file with another.
+ * and the replacement file is large. Start IO on it now so
+ * we don't add too much work to the end of the transaction
+ */
+ if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
+ old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+ filemap_flush(old_inode->i_mapping);
+
+ /* close the racy window with snapshot create/destroy ioctl */
+ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ down_read(&root->fs_info->subvol_sem);
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, new_dir);
+
+ if (dest != root)
+ btrfs_record_root_in_trans(trans, dest);
+
+ ret = btrfs_set_inode_index(new_dir, &index);
+ if (ret)
+ goto out_fail;
+
+ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ /* force full log commit if subvolume involved. */
+ root->fs_info->last_trans_log_full_commit = trans->transid;
+ } else {
+ ret = btrfs_insert_inode_ref(trans, dest,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len,
+ old_inode->i_ino,
+ new_dir->i_ino, index);
+ if (ret)
+ goto out_fail;
+ /*
+ * this is an ugly little race, but the rename is required
+ * to make sure that if we crash, the inode is either at the
+ * old name or the new one. pinning the log transaction lets
+ * us make sure we don't allow a log commit to come in after
+ * we unlink the name but before we add the new name back in.
+ */
+ btrfs_pin_log_trans(root);
+ }
+ /*
+ * make sure the inode gets flushed if it is replacing
+ * something.
+ */
+ if (new_inode && new_inode->i_size &&
+ old_inode && S_ISREG(old_inode->i_mode)) {
+ btrfs_add_ordered_operation(trans, root, old_inode);
+ }
+
+ old_dir->i_ctime = old_dir->i_mtime = ctime;
+ new_dir->i_ctime = new_dir->i_mtime = ctime;
+ old_inode->i_ctime = ctime;
+
+ if (old_dentry->d_parent != new_dentry->d_parent)
+ btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+
+ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+ ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+ } else {
+ btrfs_inc_nlink(old_dentry->d_inode);
+ ret = btrfs_unlink_inode(trans, root, old_dir,
+ old_dentry->d_inode,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+ }
+ BUG_ON(ret);
+
+ if (new_inode) {
+ new_inode->i_ctime = CURRENT_TIME;
+ if (unlikely(new_inode->i_ino ==
+ BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ root_objectid = BTRFS_I(new_inode)->location.objectid;
+ ret = btrfs_unlink_subvol(trans, dest, new_dir,
+ root_objectid,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+ BUG_ON(new_inode->i_nlink == 0);
+ } else {
+ ret = btrfs_unlink_inode(trans, dest, new_dir,
+ new_dentry->d_inode,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+ }
+ BUG_ON(ret);
+ if (new_inode->i_nlink == 0) {
+ ret = btrfs_orphan_add(trans, new_dentry->d_inode);
+ BUG_ON(ret);
+ }
+ }
+
+ ret = btrfs_add_link(trans, new_dir, old_inode,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len, 0, index);
+ BUG_ON(ret);
+
+ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_log_new_name(trans, old_inode, old_dir,
+ new_dentry->d_parent);
+ btrfs_end_log_trans(root);
+ }
+out_fail:
+ btrfs_end_transaction_throttle(trans, root);
+
+ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ up_read(&root->fs_info->subvol_sem);
+
+ btrfs_unreserve_metadata_space(root, 11);
+ return ret;
+}
+
+/*
+ * some fairly slow code that needs optimization. This walks the list
+ * of all the inodes with pending delalloc and forces them to disk.
+ */
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+{
+ struct list_head *head = &root->fs_info->delalloc_inodes;
+ struct btrfs_inode *binode;
+ struct inode *inode;
+
+ if (root->fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ spin_lock(&root->fs_info->delalloc_lock);
+ while (!list_empty(head)) {
+ binode = list_entry(head->next, struct btrfs_inode,
+ delalloc_inodes);
+ inode = igrab(&binode->vfs_inode);
+ if (!inode)
+ list_del_init(&binode->delalloc_inodes);
+ spin_unlock(&root->fs_info->delalloc_lock);
+ if (inode) {
+ filemap_flush(inode->i_mapping);
+ if (delay_iput)
+ btrfs_add_delayed_iput(inode);
+ else
+ iput(inode);
+ }
+ cond_resched();
+ spin_lock(&root->fs_info->delalloc_lock);
+ }
+ spin_unlock(&root->fs_info->delalloc_lock);
+
+ /* the filemap_flush will queue IO into the worker threads, but
+ * we have to make sure the IO is actually started and that
+ * ordered extents get created before we return
+ */
+ atomic_inc(&root->fs_info->async_submit_draining);
+ while (atomic_read(&root->fs_info->nr_async_submits) ||
+ atomic_read(&root->fs_info->async_delalloc_pages)) {
+ wait_event(root->fs_info->async_submit_wait,
+ (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
+ atomic_read(&root->fs_info->async_delalloc_pages) == 0));
+ }
+ atomic_dec(&root->fs_info->async_submit_draining);
+ return 0;
+}
+
+static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
+ const char *symname)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct inode *inode = NULL;
+ int err;
+ int drop_inode = 0;
+ u64 objectid;
+ u64 index = 0 ;
+ int name_len;
+ int datasize;
+ unsigned long ptr;
+ struct btrfs_file_extent_item *ei;
+ struct extent_buffer *leaf;
+ unsigned long nr = 0;
+
+ name_len = strlen(symname) + 1;
+ if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
+ return -ENAMETOOLONG;
+
+ /*
+ * 2 items for inode item and ref
+ * 2 items for dir items
+ * 1 item for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
+ if (err)
+ return err;
+
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans)
+ goto out_fail;
+ btrfs_set_trans_block_group(trans, dir);
+
+ err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+ if (err) {
+ err = -ENOSPC;
+ goto out_unlock;
+ }
+
+ inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+ dentry->d_name.len,
+ dentry->d_parent->d_inode->i_ino, objectid,
+ BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
+ &index);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_unlock;
+
+ err = btrfs_init_inode_security(trans, inode, dir);
+ if (err) {
+ drop_inode = 1;
+ goto out_unlock;
+ }
+
+ btrfs_set_trans_block_group(trans, inode);
+ err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+ if (err)
+ drop_inode = 1;
+ else {
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ }
+ btrfs_update_inode_block_group(trans, inode);
+ btrfs_update_inode_block_group(trans, dir);
+ if (drop_inode)
+ goto out_unlock;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ key.objectid = inode->i_ino;
+ key.offset = 0;
+ btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+ datasize = btrfs_file_extent_calc_inline_size(name_len);
+ err = btrfs_insert_empty_item(trans, root, path, &key,
+ datasize);
+ if (err) {
+ drop_inode = 1;
+ goto out_unlock;
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+ btrfs_set_file_extent_type(leaf, ei,
+ BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_encryption(leaf, ei, 0);
+ btrfs_set_file_extent_compression(leaf, ei, 0);
+ btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+ btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
+
+ ptr = btrfs_file_extent_inline_start(ei);
+ write_extent_buffer(leaf, symname, ptr, name_len);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_free_path(path);
+
+ inode->i_op = &btrfs_symlink_inode_operations;
+ inode->i_mapping->a_ops = &btrfs_symlink_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ inode_set_bytes(inode, name_len);
+ btrfs_i_size_write(inode, name_len - 1);
+ err = btrfs_update_inode(trans, root, inode);
+ if (err)
+ drop_inode = 1;
+
+out_unlock:
+ nr = trans->blocks_used;
+ btrfs_end_transaction_throttle(trans, root);
+out_fail:
+ btrfs_unreserve_metadata_space(root, 5);
+ if (drop_inode) {
+ inode_dec_link_count(inode);
+ iput(inode);
+ }
+ btrfs_btree_balance_dirty(root, nr);
+ return err;
+}
+
+static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
+ u64 alloc_hint, int mode, loff_t actual_len)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_key ins;
+ u64 alloc_size;
+ u64 cur_offset = start;
+ u64 num_bytes = end - start;
+ int ret = 0;
+ u64 i_size;
+
+ while (num_bytes > 0) {
+ alloc_size = min(num_bytes, root->fs_info->max_extent);
+
+ trans = btrfs_start_transaction(root, 1);
+
+ ret = btrfs_reserve_extent(trans, root, alloc_size,
+ root->sectorsize, 0, alloc_hint,
+ (u64)-1, &ins, 1);
+ if (ret) {
+ WARN_ON(1);
+ goto stop_trans;
+ }
+
+ ret = btrfs_reserve_metadata_space(root, 3);
+ if (ret) {
+ btrfs_free_reserved_extent(root, ins.objectid,
+ ins.offset);
+ goto stop_trans;
+ }
+
+ ret = insert_reserved_file_extent(trans, inode,
+ cur_offset, ins.objectid,
+ ins.offset, ins.offset,
+ ins.offset, 0, 0, 0,
+ BTRFS_FILE_EXTENT_PREALLOC);
+ BUG_ON(ret);
+ btrfs_drop_extent_cache(inode, cur_offset,
+ cur_offset + ins.offset -1, 0);
+
+ num_bytes -= ins.offset;
+ cur_offset += ins.offset;
+ alloc_hint = ins.objectid + ins.offset;
+
+ inode->i_ctime = CURRENT_TIME;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ (actual_len > inode->i_size) &&
+ (cur_offset > inode->i_size)) {
+
+ if (cur_offset > actual_len)
+ i_size = actual_len;
+ else
+ i_size = cur_offset;
+ i_size_write(inode, i_size);
+ btrfs_ordered_update_i_size(inode, i_size, NULL);
+ }
+
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+
+ btrfs_end_transaction(trans, root);
+ btrfs_unreserve_metadata_space(root, 3);
+ }
+ return ret;
+
+stop_trans:
+ btrfs_end_transaction(trans, root);
+ return ret;
+
+}
+
+static long btrfs_fallocate(struct inode *inode, int mode,
+ loff_t offset, loff_t len)
+{
+ u64 cur_offset;
+ u64 last_byte;
+ u64 alloc_start;
+ u64 alloc_end;
+ u64 alloc_hint = 0;
+ u64 locked_end;
+ u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+ struct extent_map *em;
+ int ret;
+
+ alloc_start = offset & ~mask;
+ alloc_end = (offset + len + mask) & ~mask;
+
+ /*
+ * wait for ordered IO before we have any locks. We'll loop again
+ * below with the locks held.
+ */
+ btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
+ mutex_lock(&inode->i_mutex);
+ if (alloc_start > inode->i_size) {
+ ret = btrfs_cont_expand(inode, alloc_start);
+ if (ret)
+ goto out;
+ }
+
+ ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
+ alloc_end - alloc_start);
+ if (ret)
+ goto out;
+
+ locked_end = alloc_end - 1;
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+
+ /* the extent lock is ordered inside the running
+ * transaction
+ */
+ lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ GFP_NOFS);
+ ordered = btrfs_lookup_first_ordered_extent(inode,
+ alloc_end - 1);
+ if (ordered &&
+ ordered->file_offset + ordered->len > alloc_start &&
+ ordered->file_offset < alloc_end) {
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ alloc_start, locked_end, GFP_NOFS);
+ /*
+ * we can't wait on the range with the transaction
+ * running or with the extent lock held
+ */
+ btrfs_wait_ordered_range(inode, alloc_start,
+ alloc_end - alloc_start);
+ } else {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ }
+
+ cur_offset = alloc_start;
+ while (1) {
+ em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+ alloc_end - cur_offset, 0);
+ BUG_ON(IS_ERR(em) || !em);
+ last_byte = min(extent_map_end(em), alloc_end);
+ last_byte = (last_byte + mask) & ~mask;
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ (cur_offset >= inode->i_size &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ ret = prealloc_file_range(inode,
+ cur_offset, last_byte,
+ alloc_hint, mode, offset+len);
+ if (ret < 0) {
+ free_extent_map(em);
+ break;
+ }
+ }
+ if (em->block_start <= EXTENT_MAP_LAST_BYTE)
+ alloc_hint = em->block_start;
+ free_extent_map(em);
+
+ cur_offset = last_byte;
+ if (cur_offset >= alloc_end) {
+ ret = 0;
+ break;
+ }
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ GFP_NOFS);
+
+ btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
+ alloc_end - alloc_start);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
+
+static int btrfs_set_page_dirty(struct page *page)
+{
+ return __set_page_dirty_nobuffers(page);
+}
+
+static int btrfs_permission(struct inode *inode, int mask)
+{
+ if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
+ return -EACCES;
+ return generic_permission(inode, mask, btrfs_check_acl);
+}
+
+static const struct inode_operations btrfs_dir_inode_operations = {
+ .getattr = btrfs_getattr,
+ .lookup = btrfs_lookup,
+ .create = btrfs_create,
+ .unlink = btrfs_unlink,
+ .link = btrfs_link,
+ .mkdir = btrfs_mkdir,
+ .rmdir = btrfs_rmdir,
+ .rename = btrfs_rename,
+ .symlink = btrfs_symlink,
+ .setattr = btrfs_setattr,
+ .mknod = btrfs_mknod,
+ .setxattr = btrfs_setxattr,
+ .getxattr = btrfs_getxattr,
+ .listxattr = btrfs_listxattr,
+ .removexattr = btrfs_removexattr,
+ .permission = btrfs_permission,
+};
+static const struct inode_operations btrfs_dir_ro_inode_operations = {
+ .lookup = btrfs_lookup,
+ .permission = btrfs_permission,
+};
+
+static const struct file_operations btrfs_dir_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .readdir = btrfs_real_readdir,
+ .unlocked_ioctl = btrfs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = btrfs_ioctl,
+#endif
+ .release = btrfs_release_file,
+ .fsync = btrfs_sync_file,
+};
+
+static struct extent_io_ops btrfs_extent_io_ops = {
+ .fill_delalloc = run_delalloc_range,
+ .submit_bio_hook = btrfs_submit_bio_hook,
+ .merge_bio_hook = btrfs_merge_bio_hook,
+ .readpage_end_io_hook = btrfs_readpage_end_io_hook,
+ .writepage_end_io_hook = btrfs_writepage_end_io_hook,
+ .writepage_start_hook = btrfs_writepage_start_hook,
+ .readpage_io_failed_hook = btrfs_io_failed_hook,
+ .set_bit_hook = btrfs_set_bit_hook,
+ .clear_bit_hook = btrfs_clear_bit_hook,
+ .merge_extent_hook = btrfs_merge_extent_hook,
+ .split_extent_hook = btrfs_split_extent_hook,
+};
+
+/*
+ * btrfs doesn't support the bmap operation because swapfiles
+ * use bmap to make a mapping of extents in the file. They assume
+ * these extents won't change over the life of the file and they
+ * use the bmap result to do IO directly to the drive.
+ *
+ * the btrfs bmap call would return logical addresses that aren't
+ * suitable for IO and they also will change frequently as COW
+ * operations happen. So, swapfile + btrfs == corruption.
+ *
+ * For now we're avoiding this by dropping bmap.
+ */
+static const struct address_space_operations btrfs_aops = {
+ .readpage = btrfs_readpage,
+ .writepage = btrfs_writepage,
+ .writepages = btrfs_writepages,
+ .readpages = btrfs_readpages,
+ .sync_page = block_sync_page,
+ .direct_IO = btrfs_direct_IO,
+ .invalidatepage = btrfs_invalidatepage,
+ .releasepage = btrfs_releasepage,
+ .set_page_dirty = btrfs_set_page_dirty,
+ .error_remove_page = generic_error_remove_page,
+};
+
+static const struct address_space_operations btrfs_symlink_aops = {
+ .readpage = btrfs_readpage,
+ .writepage = btrfs_writepage,
+ .invalidatepage = btrfs_invalidatepage,
+ .releasepage = btrfs_releasepage,
+};
+
+static const struct inode_operations btrfs_file_inode_operations = {
+ .truncate = btrfs_truncate,
+ .getattr = btrfs_getattr,
+ .setattr = btrfs_setattr,
+ .setxattr = btrfs_setxattr,
+ .getxattr = btrfs_getxattr,
+ .listxattr = btrfs_listxattr,
+ .removexattr = btrfs_removexattr,
+ .permission = btrfs_permission,
+ .fallocate = btrfs_fallocate,
+ .fiemap = btrfs_fiemap,
+};
+static const struct inode_operations btrfs_special_inode_operations = {
+ .getattr = btrfs_getattr,
+ .setattr = btrfs_setattr,
+ .permission = btrfs_permission,
+ .setxattr = btrfs_setxattr,
+ .getxattr = btrfs_getxattr,
+ .listxattr = btrfs_listxattr,
+ .removexattr = btrfs_removexattr,
+};
+static const struct inode_operations btrfs_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = page_follow_link_light,
+ .put_link = page_put_link,
+ .permission = btrfs_permission,
+ .setxattr = btrfs_setxattr,
+ .getxattr = btrfs_getxattr,
+ .listxattr = btrfs_listxattr,
+ .removexattr = btrfs_removexattr,
+};
+
+const struct dentry_operations btrfs_dentry_operations = {
+ .d_delete = btrfs_dentry_delete,
+};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
new file mode 100644
index 00000000000..2c6ee6aacb2
--- /dev/null
+++ b/fs/btrfs/ioctl.c
@@ -0,0 +1,1352 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/mount.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/bit_spinlock.h>
+#include <linux/security.h>
+#include <linux/xattr.h>
+#include <linux/vmalloc.h>
+#include "compat.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "ioctl.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "locking.h"
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
+{
+ if (S_ISDIR(mode))
+ return flags;
+ else if (S_ISREG(mode))
+ return flags & ~FS_DIRSYNC_FL;
+ else
+ return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
+}
+
+/*
+ * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
+ */
+static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
+{
+ unsigned int iflags = 0;
+
+ if (flags & BTRFS_INODE_SYNC)
+ iflags |= FS_SYNC_FL;
+ if (flags & BTRFS_INODE_IMMUTABLE)
+ iflags |= FS_IMMUTABLE_FL;
+ if (flags & BTRFS_INODE_APPEND)
+ iflags |= FS_APPEND_FL;
+ if (flags & BTRFS_INODE_NODUMP)
+ iflags |= FS_NODUMP_FL;
+ if (flags & BTRFS_INODE_NOATIME)
+ iflags |= FS_NOATIME_FL;
+ if (flags & BTRFS_INODE_DIRSYNC)
+ iflags |= FS_DIRSYNC_FL;
+
+ return iflags;
+}
+
+/*
+ * Update inode->i_flags based on the btrfs internal flags.
+ */
+void btrfs_update_iflags(struct inode *inode)
+{
+ struct btrfs_inode *ip = BTRFS_I(inode);
+
+ inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+
+ if (ip->flags & BTRFS_INODE_SYNC)
+ inode->i_flags |= S_SYNC;
+ if (ip->flags & BTRFS_INODE_IMMUTABLE)
+ inode->i_flags |= S_IMMUTABLE;
+ if (ip->flags & BTRFS_INODE_APPEND)
+ inode->i_flags |= S_APPEND;
+ if (ip->flags & BTRFS_INODE_NOATIME)
+ inode->i_flags |= S_NOATIME;
+ if (ip->flags & BTRFS_INODE_DIRSYNC)
+ inode->i_flags |= S_DIRSYNC;
+}
+
+/*
+ * Inherit flags from the parent inode.
+ *
+ * Unlike extN we don't have any flags we don't want to inherit currently.
+ */
+void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
+{
+ unsigned int flags;
+
+ if (!dir)
+ return;
+
+ flags = BTRFS_I(dir)->flags;
+
+ if (S_ISREG(inode->i_mode))
+ flags &= ~BTRFS_INODE_DIRSYNC;
+ else if (!S_ISDIR(inode->i_mode))
+ flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME);
+
+ BTRFS_I(inode)->flags = flags;
+ btrfs_update_iflags(inode);
+}
+
+static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
+{
+ struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode);
+ unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
+
+ if (copy_to_user(arg, &flags, sizeof(flags)))
+ return -EFAULT;
+ return 0;
+}
+
+static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct btrfs_inode *ip = BTRFS_I(inode);
+ struct btrfs_root *root = ip->root;
+ struct btrfs_trans_handle *trans;
+ unsigned int flags, oldflags;
+ int ret;
+
+ if (copy_from_user(&flags, arg, sizeof(flags)))
+ return -EFAULT;
+
+ if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
+ FS_NOATIME_FL | FS_NODUMP_FL | \
+ FS_SYNC_FL | FS_DIRSYNC_FL))
+ return -EOPNOTSUPP;
+
+ if (!is_owner_or_cap(inode))
+ return -EACCES;
+
+ mutex_lock(&inode->i_mutex);
+
+ flags = btrfs_mask_flags(inode->i_mode, flags);
+ oldflags = btrfs_flags_to_ioctl(ip->flags);
+ if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+ if (!capable(CAP_LINUX_IMMUTABLE)) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+ }
+
+ ret = mnt_want_write(file->f_path.mnt);
+ if (ret)
+ goto out_unlock;
+
+ if (flags & FS_SYNC_FL)
+ ip->flags |= BTRFS_INODE_SYNC;
+ else
+ ip->flags &= ~BTRFS_INODE_SYNC;
+ if (flags & FS_IMMUTABLE_FL)
+ ip->flags |= BTRFS_INODE_IMMUTABLE;
+ else
+ ip->flags &= ~BTRFS_INODE_IMMUTABLE;
+ if (flags & FS_APPEND_FL)
+ ip->flags |= BTRFS_INODE_APPEND;
+ else
+ ip->flags &= ~BTRFS_INODE_APPEND;
+ if (flags & FS_NODUMP_FL)
+ ip->flags |= BTRFS_INODE_NODUMP;
+ else
+ ip->flags &= ~BTRFS_INODE_NODUMP;
+ if (flags & FS_NOATIME_FL)
+ ip->flags |= BTRFS_INODE_NOATIME;
+ else
+ ip->flags &= ~BTRFS_INODE_NOATIME;
+ if (flags & FS_DIRSYNC_FL)
+ ip->flags |= BTRFS_INODE_DIRSYNC;
+ else
+ ip->flags &= ~BTRFS_INODE_DIRSYNC;
+
+
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(!trans);
+
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+
+ btrfs_update_iflags(inode);
+ inode->i_ctime = CURRENT_TIME;
+ btrfs_end_transaction(trans, root);
+
+ mnt_drop_write(file->f_path.mnt);
+ out_unlock:
+ mutex_unlock(&inode->i_mutex);
+ return 0;
+}
+
+static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+
+ return put_user(inode->i_generation, arg);
+}
+
+static noinline int create_subvol(struct btrfs_root *root,
+ struct dentry *dentry,
+ char *name, int namelen)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_key key;
+ struct btrfs_root_item root_item;
+ struct btrfs_inode_item *inode_item;
+ struct extent_buffer *leaf;
+ struct btrfs_root *new_root;
+ struct inode *dir = dentry->d_parent->d_inode;
+ int ret;
+ int err;
+ u64 objectid;
+ u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
+ u64 index = 0;
+
+ /*
+ * 1 - inode item
+ * 2 - refs
+ * 1 - root item
+ * 2 - dir items
+ */
+ ret = btrfs_reserve_metadata_space(root, 6);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 1);
+ BUG_ON(!trans);
+
+ ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
+ 0, &objectid);
+ if (ret)
+ goto fail;
+
+ leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+ 0, objectid, NULL, 0, 0, 0);
+ if (IS_ERR(leaf)) {
+ ret = PTR_ERR(leaf);
+ goto fail;
+ }
+
+ memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+ btrfs_set_header_bytenr(leaf, leaf->start);
+ btrfs_set_header_generation(leaf, trans->transid);
+ btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(leaf, objectid);
+
+ write_extent_buffer(leaf, root->fs_info->fsid,
+ (unsigned long)btrfs_header_fsid(leaf),
+ BTRFS_FSID_SIZE);
+ write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
+ (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
+ BTRFS_UUID_SIZE);
+ btrfs_mark_buffer_dirty(leaf);
+
+ inode_item = &root_item.inode;
+ memset(inode_item, 0, sizeof(*inode_item));
+ inode_item->generation = cpu_to_le64(1);
+ inode_item->size = cpu_to_le64(3);
+ inode_item->nlink = cpu_to_le32(1);
+ inode_item->nbytes = cpu_to_le64(root->leafsize);
+ inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+
+ btrfs_set_root_bytenr(&root_item, leaf->start);
+ btrfs_set_root_generation(&root_item, trans->transid);
+ btrfs_set_root_level(&root_item, 0);
+ btrfs_set_root_refs(&root_item, 1);
+ btrfs_set_root_used(&root_item, leaf->len);
+ btrfs_set_root_last_snapshot(&root_item, 0);
+
+ memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
+ root_item.drop_level = 0;
+
+ btrfs_tree_unlock(leaf);
+ free_extent_buffer(leaf);
+ leaf = NULL;
+
+ btrfs_set_root_dirid(&root_item, new_dirid);
+
+ key.objectid = objectid;
+ key.offset = 0;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+ &root_item);
+ if (ret)
+ goto fail;
+
+ key.offset = (u64)-1;
+ new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
+ BUG_ON(IS_ERR(new_root));
+
+ btrfs_record_root_in_trans(trans, new_root);
+
+ ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
+ BTRFS_I(dir)->block_group);
+ /*
+ * insert the directory item
+ */
+ ret = btrfs_set_inode_index(dir, &index);
+ BUG_ON(ret);
+
+ ret = btrfs_insert_dir_item(trans, root,
+ name, namelen, dir->i_ino, &key,
+ BTRFS_FT_DIR, index);
+ if (ret)
+ goto fail;
+
+ btrfs_i_size_write(dir, dir->i_size + namelen * 2);
+ ret = btrfs_update_inode(trans, root, dir);
+ BUG_ON(ret);
+
+ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+ objectid, root->root_key.objectid,
+ dir->i_ino, index, name, namelen);
+
+ BUG_ON(ret);
+
+ d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
+fail:
+ err = btrfs_commit_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+
+ btrfs_unreserve_metadata_space(root, 6);
+ return ret;
+}
+
+static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
+ char *name, int namelen)
+{
+ struct inode *inode;
+ struct btrfs_pending_snapshot *pending_snapshot;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ if (!root->ref_cows)
+ return -EINVAL;
+
+ /*
+ * 1 - inode item
+ * 2 - refs
+ * 1 - root item
+ * 2 - dir items
+ */
+ ret = btrfs_reserve_metadata_space(root, 6);
+ if (ret)
+ goto fail;
+
+ pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
+ if (!pending_snapshot) {
+ ret = -ENOMEM;
+ btrfs_unreserve_metadata_space(root, 6);
+ goto fail;
+ }
+ pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
+ if (!pending_snapshot->name) {
+ ret = -ENOMEM;
+ kfree(pending_snapshot);
+ btrfs_unreserve_metadata_space(root, 6);
+ goto fail;
+ }
+ memcpy(pending_snapshot->name, name, namelen);
+ pending_snapshot->name[namelen] = '\0';
+ pending_snapshot->dentry = dentry;
+ trans = btrfs_start_transaction(root, 1);
+ BUG_ON(!trans);
+ pending_snapshot->root = root;
+ list_add(&pending_snapshot->list,
+ &trans->transaction->pending_snapshots);
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ btrfs_unreserve_metadata_space(root, 6);
+
+ inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto fail;
+ }
+ BUG_ON(!inode);
+ d_instantiate(dentry, inode);
+ ret = 0;
+fail:
+ return ret;
+}
+
+/* copy of may_create in fs/namei.c() */
+static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
+{
+ if (child->d_inode)
+ return -EEXIST;
+ if (IS_DEADDIR(dir))
+ return -ENOENT;
+ return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+}
+
+/*
+ * Create a new subvolume below @parent. This is largely modeled after
+ * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
+ * inside this filesystem so it's quite a bit simpler.
+ */
+static noinline int btrfs_mksubvol(struct path *parent,
+ char *name, int namelen,
+ struct btrfs_root *snap_src)
+{
+ struct inode *dir = parent->dentry->d_inode;
+ struct dentry *dentry;
+ int error;
+
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+
+ dentry = lookup_one_len(name, parent->dentry, namelen);
+ error = PTR_ERR(dentry);
+ if (IS_ERR(dentry))
+ goto out_unlock;
+
+ error = -EEXIST;
+ if (dentry->d_inode)
+ goto out_dput;
+
+ error = mnt_want_write(parent->mnt);
+ if (error)
+ goto out_dput;
+
+ error = btrfs_may_create(dir, dentry);
+ if (error)
+ goto out_drop_write;
+
+ down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
+
+ if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
+ goto out_up_read;
+
+ if (snap_src) {
+ error = create_snapshot(snap_src, dentry,
+ name, namelen);
+ } else {
+ error = create_subvol(BTRFS_I(dir)->root, dentry,
+ name, namelen);
+ }
+ if (!error)
+ fsnotify_mkdir(dir, dentry);
+out_up_read:
+ up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
+out_drop_write:
+ mnt_drop_write(parent->mnt);
+out_dput:
+ dput(dentry);
+out_unlock:
+ mutex_unlock(&dir->i_mutex);
+ return error;
+}
+
+static int btrfs_defrag_file(struct file *file)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ struct page *page;
+ unsigned long last_index;
+ unsigned long ra_pages = root->fs_info->bdi.ra_pages;
+ unsigned long total_read = 0;
+ u64 page_start;
+ u64 page_end;
+ unsigned long i;
+ int ret;
+
+ ret = btrfs_check_data_free_space(root, inode, inode->i_size);
+ if (ret)
+ return -ENOSPC;
+
+ mutex_lock(&inode->i_mutex);
+ last_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ for (i = 0; i <= last_index; i++) {
+ if (total_read % ra_pages == 0) {
+ btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
+ min(last_index, i + ra_pages - 1));
+ }
+ total_read++;
+again:
+ page = grab_cache_page(inode->i_mapping, i);
+ if (!page)
+ goto out_unlock;
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto out_unlock;
+ }
+ }
+
+ wait_on_page_writeback(page);
+
+ page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+ lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ if (ordered) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_page(page);
+ page_cache_release(page);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+ set_page_extent_mapped(page);
+
+ /*
+ * this makes sure page_mkwrite is called on the
+ * page if it is dirtied again later
+ */
+ clear_page_dirty_for_io(page);
+
+ btrfs_set_extent_delalloc(inode, page_start, page_end);
+ set_page_dirty(page);
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_page(page);
+ page_cache_release(page);
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
+ }
+
+out_unlock:
+ mutex_unlock(&inode->i_mutex);
+ return 0;
+}
+
+static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
+ void __user *arg)
+{
+ u64 new_size;
+ u64 old_size;
+ u64 devid = 1;
+ struct btrfs_ioctl_vol_args *vol_args;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_device *device = NULL;
+ char *sizestr;
+ char *devstr = NULL;
+ int ret = 0;
+ int namelen;
+ int mod = 0;
+
+ if (root->fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ namelen = strlen(vol_args->name);
+
+ mutex_lock(&root->fs_info->volume_mutex);
+ sizestr = vol_args->name;
+ devstr = strchr(sizestr, ':');
+ if (devstr) {
+ char *end;
+ sizestr = devstr + 1;
+ *devstr = '\0';
+ devstr = vol_args->name;
+ devid = simple_strtoull(devstr, &end, 10);
+ printk(KERN_INFO "resizing devid %llu\n",
+ (unsigned long long)devid);
+ }
+ device = btrfs_find_device(root, devid, NULL, NULL);
+ if (!device) {
+ printk(KERN_INFO "resizer unable to find device %llu\n",
+ (unsigned long long)devid);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ if (!strcmp(sizestr, "max"))
+ new_size = device->bdev->bd_inode->i_size;
+ else {
+ if (sizestr[0] == '-') {
+ mod = -1;
+ sizestr++;
+ } else if (sizestr[0] == '+') {
+ mod = 1;
+ sizestr++;
+ }
+ new_size = btrfs_parse_size(sizestr);
+ if (new_size == 0) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ }
+
+ old_size = device->total_bytes;
+
+ if (mod < 0) {
+ if (new_size > old_size) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ new_size = old_size - new_size;
+ } else if (mod > 0) {
+ new_size = old_size + new_size;
+ }
+
+ if (new_size < 256 * 1024 * 1024) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ if (new_size > device->bdev->bd_inode->i_size) {
+ ret = -EFBIG;
+ goto out_unlock;
+ }
+
+ do_div(new_size, root->sectorsize);
+ new_size *= root->sectorsize;
+
+ printk(KERN_INFO "new size for %s is %llu\n",
+ device->name, (unsigned long long)new_size);
+
+ if (new_size > old_size) {
+ trans = btrfs_start_transaction(root, 1);
+ ret = btrfs_grow_device(trans, device, new_size);
+ btrfs_commit_transaction(trans, root);
+ } else {
+ ret = btrfs_shrink_device(device, new_size);
+ }
+
+out_unlock:
+ mutex_unlock(&root->fs_info->volume_mutex);
+ kfree(vol_args);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_snap_create(struct file *file,
+ void __user *arg, int subvol)
+{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_ioctl_vol_args *vol_args;
+ struct file *src_file;
+ int namelen;
+ int ret = 0;
+
+ if (root->fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ namelen = strlen(vol_args->name);
+ if (strchr(vol_args->name, '/')) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (subvol) {
+ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
+ NULL);
+ } else {
+ struct inode *src_inode;
+ src_file = fget(vol_args->fd);
+ if (!src_file) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ src_inode = src_file->f_path.dentry->d_inode;
+ if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
+ printk(KERN_INFO "btrfs: Snapshot src from "
+ "another FS\n");
+ ret = -EINVAL;
+ fput(src_file);
+ goto out;
+ }
+ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
+ BTRFS_I(src_inode)->root);
+ fput(src_file);
+ }
+out:
+ kfree(vol_args);
+ return ret;
+}
+
+/*
+ * helper to check if the subvolume references other subvolumes
+ */
+static noinline int may_destroy_subvol(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = root->root_key.objectid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
+ &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+ ret = 0;
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid == root->root_key.objectid &&
+ key.type == BTRFS_ROOT_REF_KEY)
+ ret = -ENOTEMPTY;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_snap_destroy(struct file *file,
+ void __user *arg)
+{
+ struct dentry *parent = fdentry(file);
+ struct dentry *dentry;
+ struct inode *dir = parent->d_inode;
+ struct inode *inode;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *dest = NULL;
+ struct btrfs_ioctl_vol_args *vol_args;
+ struct btrfs_trans_handle *trans;
+ int namelen;
+ int ret;
+ int err = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ namelen = strlen(vol_args->name);
+ if (strchr(vol_args->name, '/') ||
+ strncmp(vol_args->name, "..", namelen) == 0) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = mnt_want_write(file->f_path.mnt);
+ if (err)
+ goto out;
+
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ dentry = lookup_one_len(vol_args->name, parent, namelen);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out_unlock_dir;
+ }
+
+ if (!dentry->d_inode) {
+ err = -ENOENT;
+ goto out_dput;
+ }
+
+ inode = dentry->d_inode;
+ if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ err = -EINVAL;
+ goto out_dput;
+ }
+
+ dest = BTRFS_I(inode)->root;
+
+ mutex_lock(&inode->i_mutex);
+ err = d_invalidate(dentry);
+ if (err)
+ goto out_unlock;
+
+ down_write(&root->fs_info->subvol_sem);
+
+ err = may_destroy_subvol(dest);
+ if (err)
+ goto out_up_write;
+
+ trans = btrfs_start_transaction(root, 1);
+ ret = btrfs_unlink_subvol(trans, root, dir,
+ dest->root_key.objectid,
+ dentry->d_name.name,
+ dentry->d_name.len);
+ BUG_ON(ret);
+
+ btrfs_record_root_in_trans(trans, dest);
+
+ memset(&dest->root_item.drop_progress, 0,
+ sizeof(dest->root_item.drop_progress));
+ dest->root_item.drop_level = 0;
+ btrfs_set_root_refs(&dest->root_item, 0);
+
+ ret = btrfs_insert_orphan_item(trans,
+ root->fs_info->tree_root,
+ dest->root_key.objectid);
+ BUG_ON(ret);
+
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ inode->i_flags |= S_DEAD;
+out_up_write:
+ up_write(&root->fs_info->subvol_sem);
+out_unlock:
+ mutex_unlock(&inode->i_mutex);
+ if (!err) {
+ shrink_dcache_sb(root->fs_info->sb);
+ btrfs_invalidate_inodes(dest);
+ d_delete(dentry);
+ }
+out_dput:
+ dput(dentry);
+out_unlock_dir:
+ mutex_unlock(&dir->i_mutex);
+ mnt_drop_write(file->f_path.mnt);
+out:
+ kfree(vol_args);
+ return err;
+}
+
+static int btrfs_ioctl_defrag(struct file *file)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ ret = mnt_want_write(file->f_path.mnt);
+ if (ret)
+ return ret;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFDIR:
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out;
+ }
+ btrfs_defrag_root(root, 0);
+ btrfs_defrag_root(root->fs_info->extent_root, 0);
+ break;
+ case S_IFREG:
+ if (!(file->f_mode & FMODE_WRITE)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ btrfs_defrag_file(file);
+ break;
+ }
+out:
+ mnt_drop_write(file->f_path.mnt);
+ return ret;
+}
+
+static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_ioctl_vol_args *vol_args;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_init_new_device(root, vol_args->name);
+
+ kfree(vol_args);
+ return ret;
+}
+
+static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_ioctl_vol_args *vol_args;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (root->fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_rm_device(root, vol_args->name);
+
+ kfree(vol_args);
+ return ret;
+}
+
+static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+ u64 off, u64 olen, u64 destoff)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct file *src_file;
+ struct inode *src;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ char *buf;
+ struct btrfs_key key;
+ u32 nritems;
+ int slot;
+ int ret;
+ u64 len = olen;
+ u64 bs = root->fs_info->sb->s_blocksize;
+ u64 hint_byte;
+
+ /*
+ * TODO:
+ * - split compressed inline extents. annoying: we need to
+ * decompress into destination's address_space (the file offset
+ * may change, so source mapping won't do), then recompress (or
+ * otherwise reinsert) a subrange.
+ * - allow ranges within the same file to be cloned (provided
+ * they don't overlap)?
+ */
+
+ /* the destination must be opened for writing */
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EINVAL;
+
+ ret = mnt_want_write(file->f_path.mnt);
+ if (ret)
+ return ret;
+
+ src_file = fget(srcfd);
+ if (!src_file) {
+ ret = -EBADF;
+ goto out_drop_write;
+ }
+
+ src = src_file->f_dentry->d_inode;
+
+ ret = -EINVAL;
+ if (src == inode)
+ goto out_fput;
+
+ /* the src must be open for reading */
+ if (!(src_file->f_mode & FMODE_READ))
+ goto out_fput;
+
+ ret = -EISDIR;
+ if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
+ goto out_fput;
+
+ ret = -EXDEV;
+ if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
+ goto out_fput;
+
+ ret = -ENOMEM;
+ buf = vmalloc(btrfs_level_size(root, 0));
+ if (!buf)
+ goto out_fput;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ vfree(buf);
+ goto out_fput;
+ }
+ path->reada = 2;
+
+ if (inode < src) {
+ mutex_lock(&inode->i_mutex);
+ mutex_lock(&src->i_mutex);
+ } else {
+ mutex_lock(&src->i_mutex);
+ mutex_lock(&inode->i_mutex);
+ }
+
+ /* determine range to clone */
+ ret = -EINVAL;
+ if (off >= src->i_size || off + len > src->i_size)
+ goto out_unlock;
+ if (len == 0)
+ olen = len = src->i_size - off;
+ /* if we extend to eof, continue to block boundary */
+ if (off + len == src->i_size)
+ len = ((src->i_size + bs-1) & ~(bs-1))
+ - off;
+
+ /* verify the end result is block aligned */
+ if ((off & (bs-1)) ||
+ ((off + len) & (bs-1)))
+ goto out_unlock;
+
+ /* do any pending delalloc/csum calc on src, one way or
+ another, and lock file content */
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+ lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
+ ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
+ if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
+ break;
+ unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ btrfs_wait_ordered_range(src, off, off+len);
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+ BUG_ON(!trans);
+
+ /* punch hole in destination first */
+ btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
+
+ /* clone data */
+ key.objectid = src->i_ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+
+ while (1) {
+ /*
+ * note the key will change type as we walk through the
+ * tree.
+ */
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ break;
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ }
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
+ key.objectid != src->i_ino)
+ break;
+
+ if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+ struct btrfs_file_extent_item *extent;
+ int type;
+ u32 size;
+ struct btrfs_key new_key;
+ u64 disko = 0, diskl = 0;
+ u64 datao = 0, datal = 0;
+ u8 comp;
+
+ size = btrfs_item_size_nr(leaf, slot);
+ read_extent_buffer(leaf, buf,
+ btrfs_item_ptr_offset(leaf, slot),
+ size);
+
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+ comp = btrfs_file_extent_compression(leaf, extent);
+ type = btrfs_file_extent_type(leaf, extent);
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ disko = btrfs_file_extent_disk_bytenr(leaf,
+ extent);
+ diskl = btrfs_file_extent_disk_num_bytes(leaf,
+ extent);
+ datao = btrfs_file_extent_offset(leaf, extent);
+ datal = btrfs_file_extent_num_bytes(leaf,
+ extent);
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ /* take upper bound, may be compressed */
+ datal = btrfs_file_extent_ram_bytes(leaf,
+ extent);
+ }
+ btrfs_release_path(root, path);
+
+ if (key.offset + datal < off ||
+ key.offset >= off+len)
+ goto next;
+
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.objectid = inode->i_ino;
+ new_key.offset = key.offset + destoff - off;
+
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ ret = btrfs_insert_empty_item(trans, root, path,
+ &new_key, size);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ write_extent_buffer(leaf, buf,
+ btrfs_item_ptr_offset(leaf, slot),
+ size);
+
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+
+ if (off > key.offset) {
+ datao += off - key.offset;
+ datal -= off - key.offset;
+ }
+
+ if (key.offset + datal > off + len)
+ datal = off + len - key.offset;
+
+ /* disko == 0 means it's a hole */
+ if (!disko)
+ datao = 0;
+
+ btrfs_set_file_extent_offset(leaf, extent,
+ datao);
+ btrfs_set_file_extent_num_bytes(leaf, extent,
+ datal);
+ if (disko) {
+ inode_add_bytes(inode, datal);
+ ret = btrfs_inc_extent_ref(trans, root,
+ disko, diskl, 0,
+ root->root_key.objectid,
+ inode->i_ino,
+ new_key.offset - datao);
+ BUG_ON(ret);
+ }
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ u64 skip = 0;
+ u64 trim = 0;
+ if (off > key.offset) {
+ skip = off - key.offset;
+ new_key.offset += skip;
+ }
+
+ if (key.offset + datal > off+len)
+ trim = key.offset + datal - (off+len);
+
+ if (comp && (skip || trim)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ size -= skip + trim;
+ datal -= skip + trim;
+ ret = btrfs_insert_empty_item(trans, root, path,
+ &new_key, size);
+ if (ret)
+ goto out;
+
+ if (skip) {
+ u32 start =
+ btrfs_file_extent_calc_inline_size(0);
+ memmove(buf+start, buf+start+skip,
+ datal);
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ write_extent_buffer(leaf, buf,
+ btrfs_item_ptr_offset(leaf, slot),
+ size);
+ inode_add_bytes(inode, datal);
+ }
+
+ btrfs_mark_buffer_dirty(leaf);
+ }
+
+next:
+ btrfs_release_path(root, path);
+ key.offset++;
+ }
+ ret = 0;
+out:
+ btrfs_release_path(root, path);
+ if (ret == 0) {
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ if (destoff + olen > inode->i_size)
+ btrfs_i_size_write(inode, destoff + olen);
+ BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
+ ret = btrfs_update_inode(trans, root, inode);
+ }
+ btrfs_end_transaction(trans, root);
+ unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
+ if (ret)
+ vmtruncate(inode, 0);
+out_unlock:
+ mutex_unlock(&src->i_mutex);
+ mutex_unlock(&inode->i_mutex);
+ vfree(buf);
+ btrfs_free_path(path);
+out_fput:
+ fput(src_file);
+out_drop_write:
+ mnt_drop_write(file->f_path.mnt);
+ return ret;
+}
+
+static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
+{
+ struct btrfs_ioctl_clone_range_args args;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+ return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
+ args.src_length, args.dest_offset);
+}
+
+/*
+ * there are many ways the trans_start and trans_end ioctls can lead
+ * to deadlocks. They should only be used by applications that
+ * basically own the machine, and have a very in depth understanding
+ * of all the possible deadlocks and enospc problems.
+ */
+static long btrfs_ioctl_trans_start(struct file *file)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ ret = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+
+ ret = -EINPROGRESS;
+ if (file->private_data)
+ goto out;
+
+ ret = mnt_want_write(file->f_path.mnt);
+ if (ret)
+ goto out;
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ root->fs_info->open_ioctl_trans++;
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ ret = -ENOMEM;
+ trans = btrfs_start_ioctl_transaction(root, 0);
+ if (!trans)
+ goto out_drop;
+
+ file->private_data = trans;
+ return 0;
+
+out_drop:
+ mutex_lock(&root->fs_info->trans_mutex);
+ root->fs_info->open_ioctl_trans--;
+ mutex_unlock(&root->fs_info->trans_mutex);
+ mnt_drop_write(file->f_path.mnt);
+out:
+ return ret;
+}
+
+/*
+ * there are many ways the trans_start and trans_end ioctls can lead
+ * to deadlocks. They should only be used by applications that
+ * basically own the machine, and have a very in depth understanding
+ * of all the possible deadlocks and enospc problems.
+ */
+long btrfs_ioctl_trans_end(struct file *file)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+
+ trans = file->private_data;
+ if (!trans)
+ return -EINVAL;
+ file->private_data = NULL;
+
+ btrfs_end_transaction(trans, root);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ root->fs_info->open_ioctl_trans--;
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ mnt_drop_write(file->f_path.mnt);
+ return 0;
+}
+
+long btrfs_ioctl(struct file *file, unsigned int
+ cmd, unsigned long arg)
+{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ void __user *argp = (void __user *)arg;
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ return btrfs_ioctl_getflags(file, argp);
+ case FS_IOC_SETFLAGS:
+ return btrfs_ioctl_setflags(file, argp);
+ case FS_IOC_GETVERSION:
+ return btrfs_ioctl_getversion(file, argp);
+ case BTRFS_IOC_SNAP_CREATE:
+ return btrfs_ioctl_snap_create(file, argp, 0);
+ case BTRFS_IOC_SUBVOL_CREATE:
+ return btrfs_ioctl_snap_create(file, argp, 1);
+ case BTRFS_IOC_SNAP_DESTROY:
+ return btrfs_ioctl_snap_destroy(file, argp);
+ case BTRFS_IOC_DEFRAG:
+ return btrfs_ioctl_defrag(file);
+ case BTRFS_IOC_RESIZE:
+ return btrfs_ioctl_resize(root, argp);
+ case BTRFS_IOC_ADD_DEV:
+ return btrfs_ioctl_add_dev(root, argp);
+ case BTRFS_IOC_RM_DEV:
+ return btrfs_ioctl_rm_dev(root, argp);
+ case BTRFS_IOC_BALANCE:
+ return btrfs_balance(root->fs_info->dev_root);
+ case BTRFS_IOC_CLONE:
+ return btrfs_ioctl_clone(file, arg, 0, 0, 0);
+ case BTRFS_IOC_CLONE_RANGE:
+ return btrfs_ioctl_clone_range(file, argp);
+ case BTRFS_IOC_TRANS_START:
+ return btrfs_ioctl_trans_start(file);
+ case BTRFS_IOC_TRANS_END:
+ return btrfs_ioctl_trans_end(file);
+ case BTRFS_IOC_SYNC:
+ btrfs_sync_fs(file->f_dentry->d_sb, 1);
+ return 0;
+ }
+
+ return -ENOTTY;
+}
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
new file mode 100644
index 00000000000..bc49914475e
--- /dev/null
+++ b/fs/btrfs/ioctl.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __IOCTL_
+#define __IOCTL_
+#include <linux/ioctl.h>
+
+#define BTRFS_IOCTL_MAGIC 0x94
+#define BTRFS_VOL_NAME_MAX 255
+#define BTRFS_PATH_NAME_MAX 4087
+
+/* this should be 4k */
+struct btrfs_ioctl_vol_args {
+ __s64 fd;
+ char name[BTRFS_PATH_NAME_MAX + 1];
+};
+
+struct btrfs_ioctl_clone_range_args {
+ __s64 src_fd;
+ __u64 src_offset, src_length;
+ __u64 dest_offset;
+};
+
+#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
+ struct btrfs_ioctl_vol_args)
+/* trans start and trans end are dangerous, and only for
+ * use by applications that know how to avoid the
+ * resulting deadlocks
+ */
+#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
+#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
+#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
+
+#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
+#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
+ struct btrfs_ioctl_vol_args)
+
+#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
+ struct btrfs_ioctl_clone_range_args)
+
+#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
+ struct btrfs_ioctl_vol_args)
+#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
new file mode 100644
index 00000000000..1c36e5cd8f5
--- /dev/null
+++ b/fs/btrfs/locking.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/page-flags.h>
+#include <asm/bug.h>
+#include "ctree.h"
+#include "extent_io.h"
+#include "locking.h"
+
+static inline void spin_nested(struct extent_buffer *eb)
+{
+ spin_lock(&eb->lock);
+}
+
+/*
+ * Setting a lock to blocking will drop the spinlock and set the
+ * flag that forces other procs who want the lock to wait. After
+ * this you can safely schedule with the lock held.
+ */
+void btrfs_set_lock_blocking(struct extent_buffer *eb)
+{
+ if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+ set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+ spin_unlock(&eb->lock);
+ }
+ /* exit with the spin lock released and the bit set */
+}
+
+/*
+ * clearing the blocking flag will take the spinlock again.
+ * After this you can't safely schedule
+ */
+void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+{
+ if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+ spin_nested(eb);
+ clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+ smp_mb__after_clear_bit();
+ }
+ /* exit with the spin lock held */
+}
+
+/*
+ * unfortunately, many of the places that currently set a lock to blocking
+ * don't end up blocking for very long, and often they don't block
+ * at all. For a dbench 50 run, if we don't spin on the blocking bit
+ * at all, the context switch rate can jump up to 400,000/sec or more.
+ *
+ * So, we're still stuck with this crummy spin on the blocking bit,
+ * at least until the most common causes of the short blocks
+ * can be dealt with.
+ */
+static int btrfs_spin_on_block(struct extent_buffer *eb)
+{
+ int i;
+
+ for (i = 0; i < 512; i++) {
+ if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+ return 1;
+ if (need_resched())
+ break;
+ cpu_relax();
+ }
+ return 0;
+}
+
+/*
+ * This is somewhat different from trylock. It will take the
+ * spinlock but if it finds the lock is set to blocking, it will
+ * return without the lock held.
+ *
+ * returns 1 if it was able to take the lock and zero otherwise
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_try_spin_lock(struct extent_buffer *eb)
+{
+ int i;
+
+ if (btrfs_spin_on_block(eb)) {
+ spin_nested(eb);
+ if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+ return 1;
+ spin_unlock(&eb->lock);
+ }
+ /* spin for a bit on the BLOCKING flag */
+ for (i = 0; i < 2; i++) {
+ cpu_relax();
+ if (!btrfs_spin_on_block(eb))
+ break;
+
+ spin_nested(eb);
+ if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+ return 1;
+ spin_unlock(&eb->lock);
+ }
+ return 0;
+}
+
+/*
+ * the autoremove wake function will return 0 if it tried to wake up
+ * a process that was already awake, which means that process won't
+ * count as an exclusive wakeup. The waitq code will continue waking
+ * procs until it finds one that was actually sleeping.
+ *
+ * For btrfs, this isn't quite what we want. We want a single proc
+ * to be notified that the lock is ready for taking. If that proc
+ * already happen to be awake, great, it will loop around and try for
+ * the lock.
+ *
+ * So, btrfs_wake_function always returns 1, even when the proc that we
+ * tried to wake up was already awake.
+ */
+static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
+ int sync, void *key)
+{
+ autoremove_wake_function(wait, mode, sync, key);
+ return 1;
+}
+
+/*
+ * returns with the extent buffer spinlocked.
+ *
+ * This will spin and/or wait as required to take the lock, and then
+ * return with the spinlock held.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_tree_lock(struct extent_buffer *eb)
+{
+ DEFINE_WAIT(wait);
+ wait.func = btrfs_wake_function;
+
+ if (!btrfs_spin_on_block(eb))
+ goto sleep;
+
+ while(1) {
+ spin_nested(eb);
+
+ /* nobody is blocking, exit with the spinlock held */
+ if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+ return 0;
+
+ /*
+ * we have the spinlock, but the real owner is blocking.
+ * wait for them
+ */
+ spin_unlock(&eb->lock);
+
+ /*
+ * spin for a bit, and if the blocking flag goes away,
+ * loop around
+ */
+ cpu_relax();
+ if (btrfs_spin_on_block(eb))
+ continue;
+sleep:
+ prepare_to_wait_exclusive(&eb->lock_wq, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+ schedule();
+
+ finish_wait(&eb->lock_wq, &wait);
+ }
+ return 0;
+}
+
+/*
+ * Very quick trylock, this does not spin or schedule. It returns
+ * 1 with the spinlock held if it was able to take the lock, or it
+ * returns zero if it was unable to take the lock.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_try_tree_lock(struct extent_buffer *eb)
+{
+ if (spin_trylock(&eb->lock)) {
+ if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+ /*
+ * we've got the spinlock, but the real owner is
+ * blocking. Drop the spinlock and return failure
+ */
+ spin_unlock(&eb->lock);
+ return 0;
+ }
+ return 1;
+ }
+ /* someone else has the spinlock giveup */
+ return 0;
+}
+
+int btrfs_tree_unlock(struct extent_buffer *eb)
+{
+ /*
+ * if we were a blocking owner, we don't have the spinlock held
+ * just clear the bit and look for waiters
+ */
+ if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+ smp_mb__after_clear_bit();
+ else
+ spin_unlock(&eb->lock);
+
+ if (waitqueue_active(&eb->lock_wq))
+ wake_up(&eb->lock_wq);
+ return 0;
+}
+
+void btrfs_assert_tree_locked(struct extent_buffer *eb)
+{
+ if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+ assert_spin_locked(&eb->lock);
+}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
new file mode 100644
index 00000000000..6c4ce457168
--- /dev/null
+++ b/fs/btrfs/locking.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_LOCKING_
+#define __BTRFS_LOCKING_
+
+int btrfs_tree_lock(struct extent_buffer *eb);
+int btrfs_tree_unlock(struct extent_buffer *eb);
+
+int btrfs_try_tree_lock(struct extent_buffer *eb);
+int btrfs_try_spin_lock(struct extent_buffer *eb);
+
+void btrfs_set_lock_blocking(struct extent_buffer *eb);
+void btrfs_clear_lock_blocking(struct extent_buffer *eb);
+void btrfs_assert_tree_locked(struct extent_buffer *eb);
+#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
new file mode 100644
index 00000000000..5c2a9e78a94
--- /dev/null
+++ b/fs/btrfs/ordered-data.c
@@ -0,0 +1,830 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "extent_io.h"
+
+static u64 entry_end(struct btrfs_ordered_extent *entry)
+{
+ if (entry->file_offset + entry->len < entry->file_offset)
+ return (u64)-1;
+ return entry->file_offset + entry->len;
+}
+
+/* returns NULL if the insertion worked, or it returns the node it did find
+ * in the tree
+ */
+static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct btrfs_ordered_extent *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
+
+ if (file_offset < entry->file_offset)
+ p = &(*p)->rb_left;
+ else if (file_offset >= entry_end(entry))
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+/*
+ * look for a given offset in the tree, and if it can't be found return the
+ * first lesser offset
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
+ struct rb_node **prev_ret)
+{
+ struct rb_node *n = root->rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *test;
+ struct btrfs_ordered_extent *entry;
+ struct btrfs_ordered_extent *prev_entry = NULL;
+
+ while (n) {
+ entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+ prev = n;
+ prev_entry = entry;
+
+ if (file_offset < entry->file_offset)
+ n = n->rb_left;
+ else if (file_offset >= entry_end(entry))
+ n = n->rb_right;
+ else
+ return n;
+ }
+ if (!prev_ret)
+ return NULL;
+
+ while (prev && file_offset >= entry_end(prev_entry)) {
+ test = rb_next(prev);
+ if (!test)
+ break;
+ prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+ rb_node);
+ if (file_offset < entry_end(prev_entry))
+ break;
+
+ prev = test;
+ }
+ if (prev)
+ prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
+ rb_node);
+ while (prev && file_offset < entry_end(prev_entry)) {
+ test = rb_prev(prev);
+ if (!test)
+ break;
+ prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+ rb_node);
+ prev = test;
+ }
+ *prev_ret = prev;
+ return NULL;
+}
+
+/*
+ * helper to check if a given offset is inside a given entry
+ */
+static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
+{
+ if (file_offset < entry->file_offset ||
+ entry->file_offset + entry->len <= file_offset)
+ return 0;
+ return 1;
+}
+
+/*
+ * look find the first ordered struct that has this offset, otherwise
+ * the first one less than this offset
+ */
+static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
+ u64 file_offset)
+{
+ struct rb_root *root = &tree->tree;
+ struct rb_node *prev;
+ struct rb_node *ret;
+ struct btrfs_ordered_extent *entry;
+
+ if (tree->last) {
+ entry = rb_entry(tree->last, struct btrfs_ordered_extent,
+ rb_node);
+ if (offset_in_entry(entry, file_offset))
+ return tree->last;
+ }
+ ret = __tree_search(root, file_offset, &prev);
+ if (!ret)
+ ret = prev;
+ if (ret)
+ tree->last = ret;
+ return ret;
+}
+
+/* allocate and add a new ordered_extent into the per-inode tree.
+ * file_offset is the logical offset in the file
+ *
+ * start is the disk block number of an extent already reserved in the
+ * extent allocation tree
+ *
+ * len is the length of the extent
+ *
+ * The tree is given a single reference on the ordered extent that was
+ * inserted.
+ */
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ entry = kzalloc(sizeof(*entry), GFP_NOFS);
+ if (!entry)
+ return -ENOMEM;
+
+ mutex_lock(&tree->mutex);
+ entry->file_offset = file_offset;
+ entry->start = start;
+ entry->len = len;
+ entry->disk_len = disk_len;
+ entry->bytes_left = len;
+ entry->inode = inode;
+ if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
+ set_bit(type, &entry->flags);
+
+ /* one ref for the tree */
+ atomic_set(&entry->refs, 1);
+ init_waitqueue_head(&entry->wait);
+ INIT_LIST_HEAD(&entry->list);
+ INIT_LIST_HEAD(&entry->root_extent_list);
+
+ node = tree_insert(&tree->tree, file_offset,
+ &entry->rb_node);
+ BUG_ON(node);
+
+ spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+ list_add_tail(&entry->root_extent_list,
+ &BTRFS_I(inode)->root->fs_info->ordered_extents);
+ spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+
+ mutex_unlock(&tree->mutex);
+ BUG_ON(node);
+ return 0;
+}
+
+/*
+ * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
+ * when an ordered extent is finished. If the list covers more than one
+ * ordered extent, it is split across multiples.
+ */
+int btrfs_add_ordered_sum(struct inode *inode,
+ struct btrfs_ordered_extent *entry,
+ struct btrfs_ordered_sum *sum)
+{
+ struct btrfs_ordered_inode_tree *tree;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ mutex_lock(&tree->mutex);
+ list_add_tail(&sum->list, &entry->list);
+ mutex_unlock(&tree->mutex);
+ return 0;
+}
+
+/*
+ * this is used to account for finished IO across a given range
+ * of the file. The IO should not span ordered extents. If
+ * a given ordered_extent is completely done, 1 is returned, otherwise
+ * 0.
+ *
+ * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
+ * to make sure this function only returns 1 once for a given ordered extent.
+ */
+int btrfs_dec_test_ordered_pending(struct inode *inode,
+ u64 file_offset, u64 io_size)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry;
+ int ret;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ mutex_lock(&tree->mutex);
+ node = tree_search(tree, file_offset);
+ if (!node) {
+ ret = 1;
+ goto out;
+ }
+
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (!offset_in_entry(entry, file_offset)) {
+ ret = 1;
+ goto out;
+ }
+
+ if (io_size > entry->bytes_left) {
+ printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
+ (unsigned long long)entry->bytes_left,
+ (unsigned long long)io_size);
+ }
+ entry->bytes_left -= io_size;
+ if (entry->bytes_left == 0)
+ ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ else
+ ret = 1;
+out:
+ mutex_unlock(&tree->mutex);
+ return ret == 0;
+}
+
+/*
+ * used to drop a reference on an ordered extent. This will free
+ * the extent if the last reference is dropped
+ */
+int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
+{
+ struct list_head *cur;
+ struct btrfs_ordered_sum *sum;
+
+ if (atomic_dec_and_test(&entry->refs)) {
+ while (!list_empty(&entry->list)) {
+ cur = entry->list.next;
+ sum = list_entry(cur, struct btrfs_ordered_sum, list);
+ list_del(&sum->list);
+ kfree(sum);
+ }
+ kfree(entry);
+ }
+ return 0;
+}
+
+/*
+ * remove an ordered extent from the tree. No references are dropped
+ * and you must wake_up entry->wait. You must hold the tree mutex
+ * while you call this function.
+ */
+static int __btrfs_remove_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ node = &entry->rb_node;
+ rb_erase(node, &tree->tree);
+ tree->last = NULL;
+ set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
+ inode, 1);
+
+ spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+ list_del_init(&entry->root_extent_list);
+
+ /*
+ * we have no more ordered extents for this inode and
+ * no dirty pages. We can safely remove it from the
+ * list of ordered extents
+ */
+ if (RB_EMPTY_ROOT(&tree->tree) &&
+ !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+ list_del_init(&BTRFS_I(inode)->ordered_operations);
+ }
+ spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+
+ return 0;
+}
+
+/*
+ * remove an ordered extent from the tree. No references are dropped
+ * but any waiters are woken.
+ */
+int btrfs_remove_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ int ret;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ mutex_lock(&tree->mutex);
+ ret = __btrfs_remove_ordered_extent(inode, entry);
+ mutex_unlock(&tree->mutex);
+ wake_up(&entry->wait);
+
+ return ret;
+}
+
+/*
+ * wait for all the ordered extents in a root. This is done when balancing
+ * space between drives.
+ */
+int btrfs_wait_ordered_extents(struct btrfs_root *root,
+ int nocow_only, int delay_iput)
+{
+ struct list_head splice;
+ struct list_head *cur;
+ struct btrfs_ordered_extent *ordered;
+ struct inode *inode;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ list_splice_init(&root->fs_info->ordered_extents, &splice);
+ while (!list_empty(&splice)) {
+ cur = splice.next;
+ ordered = list_entry(cur, struct btrfs_ordered_extent,
+ root_extent_list);
+ if (nocow_only &&
+ !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
+ list_move(&ordered->root_extent_list,
+ &root->fs_info->ordered_extents);
+ cond_resched_lock(&root->fs_info->ordered_extent_lock);
+ continue;
+ }
+
+ list_del_init(&ordered->root_extent_list);
+ atomic_inc(&ordered->refs);
+
+ /*
+ * the inode may be getting freed (in sys_unlink path).
+ */
+ inode = igrab(ordered->inode);
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ if (inode) {
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ if (delay_iput)
+ btrfs_add_delayed_iput(inode);
+ else
+ iput(inode);
+ } else {
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ }
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ return 0;
+}
+
+/*
+ * this is used during transaction commit to write all the inodes
+ * added to the ordered operation list. These files must be fully on
+ * disk before the transaction commits.
+ *
+ * we have two modes here, one is to just start the IO via filemap_flush
+ * and the other is to wait for all the io. When we wait, we have an
+ * extra check to make sure the ordered operation list really is empty
+ * before we return
+ */
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+{
+ struct btrfs_inode *btrfs_inode;
+ struct inode *inode;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ mutex_lock(&root->fs_info->ordered_operations_mutex);
+ spin_lock(&root->fs_info->ordered_extent_lock);
+again:
+ list_splice_init(&root->fs_info->ordered_operations, &splice);
+
+ while (!list_empty(&splice)) {
+ btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+ ordered_operations);
+
+ inode = &btrfs_inode->vfs_inode;
+
+ list_del_init(&btrfs_inode->ordered_operations);
+
+ /*
+ * the inode may be getting freed (in sys_unlink path).
+ */
+ inode = igrab(inode);
+
+ if (!wait && inode) {
+ list_add_tail(&BTRFS_I(inode)->ordered_operations,
+ &root->fs_info->ordered_operations);
+ }
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ if (inode) {
+ if (wait)
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ else
+ filemap_flush(inode->i_mapping);
+ btrfs_add_delayed_iput(inode);
+ }
+
+ cond_resched();
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ }
+ if (wait && !list_empty(&root->fs_info->ordered_operations))
+ goto again;
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+ return 0;
+}
+
+/*
+ * Used to start IO or wait for a given ordered extent to finish.
+ *
+ * If wait is one, this effectively waits on page writeback for all the pages
+ * in the extent, and it waits on the io completion code to insert
+ * metadata into the btree corresponding to the extent
+ */
+void btrfs_start_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry,
+ int wait)
+{
+ u64 start = entry->file_offset;
+ u64 end = start + entry->len - 1;
+
+ /*
+ * pages in the range can be dirty, clean or writeback. We
+ * start IO on any dirty ones so the wait doesn't stall waiting
+ * for pdflush to find them
+ */
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (wait) {
+ wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
+ &entry->flags));
+ }
+}
+
+/*
+ * Used to wait on ordered extents across a large range of bytes.
+ */
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+{
+ u64 end;
+ u64 orig_end;
+ u64 wait_end;
+ struct btrfs_ordered_extent *ordered;
+ int found;
+
+ if (start + len < start) {
+ orig_end = INT_LIMIT(loff_t);
+ } else {
+ orig_end = start + len - 1;
+ if (orig_end > INT_LIMIT(loff_t))
+ orig_end = INT_LIMIT(loff_t);
+ }
+ wait_end = orig_end;
+again:
+ /* start IO across the range first to instantiate any delalloc
+ * extents
+ */
+ filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
+
+ /* The compression code will leave pages locked but return from
+ * writepage without setting the page writeback. Starting again
+ * with WB_SYNC_ALL will end up waiting for the IO to actually start.
+ */
+ filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
+
+ filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+
+ end = orig_end;
+ found = 0;
+ while (1) {
+ ordered = btrfs_lookup_first_ordered_extent(inode, end);
+ if (!ordered)
+ break;
+ if (ordered->file_offset > orig_end) {
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ if (ordered->file_offset + ordered->len < start) {
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ found++;
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ end = ordered->file_offset;
+ btrfs_put_ordered_extent(ordered);
+ if (end == 0 || end == start)
+ break;
+ end--;
+ }
+ if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
+ EXTENT_DELALLOC, 0, NULL)) {
+ schedule_timeout(1);
+ goto again;
+ }
+ return 0;
+}
+
+/*
+ * find an ordered extent corresponding to file_offset. return NULL if
+ * nothing is found, otherwise take a reference on the extent and return it
+ */
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+ u64 file_offset)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ mutex_lock(&tree->mutex);
+ node = tree_search(tree, file_offset);
+ if (!node)
+ goto out;
+
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (!offset_in_entry(entry, file_offset))
+ entry = NULL;
+ if (entry)
+ atomic_inc(&entry->refs);
+out:
+ mutex_unlock(&tree->mutex);
+ return entry;
+}
+
+/*
+ * lookup and return any extent before 'file_offset'. NULL is returned
+ * if none is found
+ */
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ mutex_lock(&tree->mutex);
+ node = tree_search(tree, file_offset);
+ if (!node)
+ goto out;
+
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ atomic_inc(&entry->refs);
+out:
+ mutex_unlock(&tree->mutex);
+ return entry;
+}
+
+/*
+ * After an extent is done, call this to conditionally update the on disk
+ * i_size. i_size is updated to cover any fully written part of the file.
+ */
+int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
+ struct btrfs_ordered_extent *ordered)
+{
+ struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ u64 disk_i_size;
+ u64 new_i_size;
+ u64 i_size_test;
+ u64 i_size = i_size_read(inode);
+ struct rb_node *node;
+ struct rb_node *prev = NULL;
+ struct btrfs_ordered_extent *test;
+ int ret = 1;
+
+ if (ordered)
+ offset = entry_end(ordered);
+ else
+ offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
+
+ mutex_lock(&tree->mutex);
+ disk_i_size = BTRFS_I(inode)->disk_i_size;
+
+ /* truncate file */
+ if (disk_i_size > i_size) {
+ BTRFS_I(inode)->disk_i_size = i_size;
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * if the disk i_size is already at the inode->i_size, or
+ * this ordered extent is inside the disk i_size, we're done
+ */
+ if (disk_i_size == i_size || offset <= disk_i_size) {
+ goto out;
+ }
+
+ /*
+ * we can't update the disk_isize if there are delalloc bytes
+ * between disk_i_size and this ordered extent
+ */
+ if (test_range_bit(io_tree, disk_i_size, offset - 1,
+ EXTENT_DELALLOC, 0, NULL)) {
+ goto out;
+ }
+ /*
+ * walk backward from this ordered extent to disk_i_size.
+ * if we find an ordered extent then we can't update disk i_size
+ * yet
+ */
+ if (ordered) {
+ node = rb_prev(&ordered->rb_node);
+ } else {
+ prev = tree_search(tree, offset);
+ /*
+ * we insert file extents without involving ordered struct,
+ * so there should be no ordered struct cover this offset
+ */
+ if (prev) {
+ test = rb_entry(prev, struct btrfs_ordered_extent,
+ rb_node);
+ BUG_ON(offset_in_entry(test, offset));
+ }
+ node = prev;
+ }
+ while (node) {
+ test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (test->file_offset + test->len <= disk_i_size)
+ break;
+ if (test->file_offset >= i_size)
+ break;
+ if (test->file_offset >= disk_i_size)
+ goto out;
+ node = rb_prev(node);
+ }
+ new_i_size = min_t(u64, offset, i_size);
+
+ /*
+ * at this point, we know we can safely update i_size to at least
+ * the offset from this ordered extent. But, we need to
+ * walk forward and see if ios from higher up in the file have
+ * finished.
+ */
+ if (ordered) {
+ node = rb_next(&ordered->rb_node);
+ } else {
+ if (prev)
+ node = rb_next(prev);
+ else
+ node = rb_first(&tree->tree);
+ }
+ i_size_test = 0;
+ if (node) {
+ /*
+ * do we have an area where IO might have finished
+ * between our ordered extent and the next one.
+ */
+ test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (test->file_offset > offset)
+ i_size_test = test->file_offset;
+ } else {
+ i_size_test = i_size;
+ }
+
+ /*
+ * i_size_test is the end of a region after this ordered
+ * extent where there are no ordered extents. As long as there
+ * are no delalloc bytes in this area, it is safe to update
+ * disk_i_size to the end of the region.
+ */
+ if (i_size_test > offset &&
+ !test_range_bit(io_tree, offset, i_size_test - 1,
+ EXTENT_DELALLOC, 0, NULL)) {
+ new_i_size = min_t(u64, i_size_test, i_size);
+ }
+ BTRFS_I(inode)->disk_i_size = new_i_size;
+ ret = 0;
+out:
+ /*
+ * we need to remove the ordered extent with the tree lock held
+ * so that other people calling this function don't find our fully
+ * processed ordered entry and skip updating the i_size
+ */
+ if (ordered)
+ __btrfs_remove_ordered_extent(inode, ordered);
+ mutex_unlock(&tree->mutex);
+ if (ordered)
+ wake_up(&ordered->wait);
+ return ret;
+}
+
+/*
+ * search the ordered extents for one corresponding to 'offset' and
+ * try to find a checksum. This is used because we allow pages to
+ * be reclaimed before their checksum is actually put into the btree
+ */
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
+ u32 *sum)
+{
+ struct btrfs_ordered_sum *ordered_sum;
+ struct btrfs_sector_sum *sector_sums;
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+ unsigned long num_sectors;
+ unsigned long i;
+ u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
+ int ret = 1;
+
+ ordered = btrfs_lookup_ordered_extent(inode, offset);
+ if (!ordered)
+ return 1;
+
+ mutex_lock(&tree->mutex);
+ list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
+ if (disk_bytenr >= ordered_sum->bytenr) {
+ num_sectors = ordered_sum->len / sectorsize;
+ sector_sums = ordered_sum->sums;
+ for (i = 0; i < num_sectors; i++) {
+ if (sector_sums[i].bytenr == disk_bytenr) {
+ *sum = sector_sums[i].sum;
+ ret = 0;
+ goto out;
+ }
+ }
+ }
+ }
+out:
+ mutex_unlock(&tree->mutex);
+ btrfs_put_ordered_extent(ordered);
+ return ret;
+}
+
+
+/*
+ * add a given inode to the list of inodes that must be fully on
+ * disk before a transaction commit finishes.
+ *
+ * This basically gives us the ext3 style data=ordered mode, and it is mostly
+ * used to make sure renamed files are fully on disk.
+ *
+ * It is a noop if the inode is already fully on disk.
+ *
+ * If trans is not null, we'll do a friendly check for a transaction that
+ * is already flushing things and force the IO down ourselves.
+ */
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode)
+{
+ u64 last_mod;
+
+ last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
+
+ /*
+ * if this file hasn't been changed since the last transaction
+ * commit, we can safely return without doing anything
+ */
+ if (last_mod < root->fs_info->last_trans_committed)
+ return 0;
+
+ /*
+ * the transaction is already committing. Just start the IO and
+ * don't bother with all of this list nonsense
+ */
+ if (trans && root->fs_info->running_transaction->blocked) {
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ return 0;
+ }
+
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
+ list_add_tail(&BTRFS_I(inode)->ordered_operations,
+ &root->fs_info->ordered_operations);
+ }
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ return 0;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 00000000000..1fe1282ef47
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_ORDERED_DATA__
+#define __BTRFS_ORDERED_DATA__
+
+/* one of these per inode */
+struct btrfs_ordered_inode_tree {
+ struct mutex mutex;
+ struct rb_root tree;
+ struct rb_node *last;
+};
+
+/*
+ * these are used to collect checksums done just before bios submission.
+ * They are attached via a list into the ordered extent, and
+ * checksum items are inserted into the tree after all the blocks in
+ * the ordered extent are on disk
+ */
+struct btrfs_sector_sum {
+ /* bytenr on disk */
+ u64 bytenr;
+ u32 sum;
+};
+
+struct btrfs_ordered_sum {
+ /* bytenr is the start of this extent on disk */
+ u64 bytenr;
+
+ /*
+ * this is the length in bytes covered by the sums array below.
+ */
+ unsigned long len;
+ struct list_head list;
+ /* last field is a variable length array of btrfs_sector_sums */
+ struct btrfs_sector_sum sums[];
+};
+
+/*
+ * bits for the flags field:
+ *
+ * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
+ * It is used to make sure metadata is inserted into the tree only once
+ * per extent.
+ *
+ * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
+ * rbtree, just before waking any waiters. It is used to indicate the
+ * IO is done and any metadata is inserted into the tree.
+ */
+#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
+
+#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
+
+#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
+
+#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
+
+#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
+
+struct btrfs_ordered_extent {
+ /* logical offset in the file */
+ u64 file_offset;
+
+ /* disk byte number */
+ u64 start;
+
+ /* ram length of the extent in bytes */
+ u64 len;
+
+ /* extent length on disk */
+ u64 disk_len;
+
+ /* number of bytes that still need writing */
+ u64 bytes_left;
+
+ /* flags (described above) */
+ unsigned long flags;
+
+ /* reference count */
+ atomic_t refs;
+
+ /* the inode we belong to */
+ struct inode *inode;
+
+ /* list of checksums for insertion when the extent io is done */
+ struct list_head list;
+
+ /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
+ wait_queue_head_t wait;
+
+ /* our friendly rbtree entry */
+ struct rb_node rb_node;
+
+ /* a per root list of all the pending ordered extents */
+ struct list_head root_extent_list;
+};
+
+
+/*
+ * calculates the total size you need to allocate for an ordered sum
+ * structure spanning 'bytes' in the file
+ */
+static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
+ unsigned long bytes)
+{
+ unsigned long num_sectors = (bytes + root->sectorsize - 1) /
+ root->sectorsize;
+ num_sectors++;
+ return sizeof(struct btrfs_ordered_sum) +
+ num_sectors * sizeof(struct btrfs_sector_sum);
+}
+
+static inline void
+btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
+{
+ mutex_init(&t->mutex);
+ t->tree.rb_node = NULL;
+ t->last = NULL;
+}
+
+int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
+int btrfs_remove_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry);
+int btrfs_dec_test_ordered_pending(struct inode *inode,
+ u64 file_offset, u64 io_size);
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int tyep);
+int btrfs_add_ordered_sum(struct inode *inode,
+ struct btrfs_ordered_extent *entry,
+ struct btrfs_ordered_sum *sum);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+ u64 file_offset);
+void btrfs_start_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry, int wait);
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
+ struct btrfs_ordered_extent *ordered);
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode);
+int btrfs_wait_ordered_extents(struct btrfs_root *root,
+ int nocow_only, int delay_iput);
+#endif
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
new file mode 100644
index 00000000000..79cba5fbc28
--- /dev/null
+++ b/fs/btrfs/orphan.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2008 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.offset = offset;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.offset = offset;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ ret = btrfs_del_item(trans, root, path);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = offset;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
new file mode 100644
index 00000000000..0d126be22b6
--- /dev/null
+++ b/fs/btrfs/print-tree.c
@@ -0,0 +1,337 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
+{
+ int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
+ int i;
+ printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu "
+ "num_stripes %d\n",
+ (unsigned long long)btrfs_chunk_length(eb, chunk),
+ (unsigned long long)btrfs_chunk_owner(eb, chunk),
+ (unsigned long long)btrfs_chunk_type(eb, chunk),
+ num_stripes);
+ for (i = 0 ; i < num_stripes ; i++) {
+ printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i,
+ (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
+ (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
+ }
+}
+static void print_dev_item(struct extent_buffer *eb,
+ struct btrfs_dev_item *dev_item)
+{
+ printk(KERN_INFO "\t\tdev item devid %llu "
+ "total_bytes %llu bytes used %llu\n",
+ (unsigned long long)btrfs_device_id(eb, dev_item),
+ (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
+ (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
+}
+static void print_extent_data_ref(struct extent_buffer *eb,
+ struct btrfs_extent_data_ref *ref)
+{
+ printk(KERN_INFO "\t\textent data backref root %llu "
+ "objectid %llu offset %llu count %u\n",
+ (unsigned long long)btrfs_extent_data_ref_root(eb, ref),
+ (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref),
+ (unsigned long long)btrfs_extent_data_ref_offset(eb, ref),
+ btrfs_extent_data_ref_count(eb, ref));
+}
+
+static void print_extent_item(struct extent_buffer *eb, int slot)
+{
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_extent_data_ref *dref;
+ struct btrfs_shared_data_ref *sref;
+ struct btrfs_disk_key key;
+ unsigned long end;
+ unsigned long ptr;
+ int type;
+ u32 item_size = btrfs_item_size_nr(eb, slot);
+ u64 flags;
+ u64 offset;
+
+ if (item_size < sizeof(*ei)) {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ struct btrfs_extent_item_v0 *ei0;
+ BUG_ON(item_size != sizeof(*ei0));
+ ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0);
+ printk(KERN_INFO "\t\textent refs %u\n",
+ btrfs_extent_refs_v0(eb, ei0));
+ return;
+#else
+ BUG();
+#endif
+ }
+
+ ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
+ flags = btrfs_extent_flags(eb, ei);
+
+ printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n",
+ (unsigned long long)btrfs_extent_refs(eb, ei),
+ (unsigned long long)btrfs_extent_generation(eb, ei),
+ (unsigned long long)flags);
+
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ struct btrfs_tree_block_info *info;
+ info = (struct btrfs_tree_block_info *)(ei + 1);
+ btrfs_tree_block_key(eb, info, &key);
+ printk(KERN_INFO "\t\ttree block key (%llu %x %llu) "
+ "level %d\n",
+ (unsigned long long)btrfs_disk_key_objectid(&key),
+ key.type,
+ (unsigned long long)btrfs_disk_key_offset(&key),
+ btrfs_tree_block_level(eb, info));
+ iref = (struct btrfs_extent_inline_ref *)(info + 1);
+ } else {
+ iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+ }
+
+ ptr = (unsigned long)iref;
+ end = (unsigned long)ei + item_size;
+ while (ptr < end) {
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_extent_inline_ref_type(eb, iref);
+ offset = btrfs_extent_inline_ref_offset(eb, iref);
+ switch (type) {
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ printk(KERN_INFO "\t\ttree block backref "
+ "root %llu\n", (unsigned long long)offset);
+ break;
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ printk(KERN_INFO "\t\tshared block backref "
+ "parent %llu\n", (unsigned long long)offset);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ print_extent_data_ref(eb, dref);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY:
+ sref = (struct btrfs_shared_data_ref *)(iref + 1);
+ printk(KERN_INFO "\t\tshared data backref "
+ "parent %llu count %u\n",
+ (unsigned long long)offset,
+ btrfs_shared_data_ref_count(eb, sref));
+ break;
+ default:
+ BUG();
+ }
+ ptr += btrfs_extent_inline_ref_size(type);
+ }
+ WARN_ON(ptr > end);
+}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
+{
+ struct btrfs_extent_ref_v0 *ref0;
+
+ ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0);
+ printk("\t\textent back ref root %llu gen %llu "
+ "owner %llu num_refs %lu\n",
+ (unsigned long long)btrfs_ref_root_v0(eb, ref0),
+ (unsigned long long)btrfs_ref_generation_v0(eb, ref0),
+ (unsigned long long)btrfs_ref_objectid_v0(eb, ref0),
+ (unsigned long)btrfs_ref_count_v0(eb, ref0));
+}
+#endif
+
+void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
+{
+ int i;
+ u32 type;
+ u32 nr = btrfs_header_nritems(l);
+ struct btrfs_item *item;
+ struct btrfs_root_item *ri;
+ struct btrfs_dir_item *di;
+ struct btrfs_inode_item *ii;
+ struct btrfs_block_group_item *bi;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_extent_data_ref *dref;
+ struct btrfs_shared_data_ref *sref;
+ struct btrfs_dev_extent *dev_extent;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+
+ printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
+ (unsigned long long)btrfs_header_bytenr(l), nr,
+ btrfs_leaf_free_space(root, l));
+ for (i = 0 ; i < nr ; i++) {
+ item = btrfs_item_nr(l, i);
+ btrfs_item_key_to_cpu(l, &key, i);
+ type = btrfs_key_type(&key);
+ printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d "
+ "itemsize %d\n",
+ i,
+ (unsigned long long)key.objectid, type,
+ (unsigned long long)key.offset,
+ btrfs_item_offset(l, item), btrfs_item_size(l, item));
+ switch (type) {
+ case BTRFS_INODE_ITEM_KEY:
+ ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
+ printk(KERN_INFO "\t\tinode generation %llu size %llu "
+ "mode %o\n",
+ (unsigned long long)
+ btrfs_inode_generation(l, ii),
+ (unsigned long long)btrfs_inode_size(l, ii),
+ btrfs_inode_mode(l, ii));
+ break;
+ case BTRFS_DIR_ITEM_KEY:
+ di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(l, di, &found_key);
+ printk(KERN_INFO "\t\tdir oid %llu type %u\n",
+ (unsigned long long)found_key.objectid,
+ btrfs_dir_type(l, di));
+ break;
+ case BTRFS_ROOT_ITEM_KEY:
+ ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
+ printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n",
+ (unsigned long long)
+ btrfs_disk_root_bytenr(l, ri),
+ btrfs_disk_root_refs(l, ri));
+ break;
+ case BTRFS_EXTENT_ITEM_KEY:
+ print_extent_item(l, i);
+ break;
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ printk(KERN_INFO "\t\ttree block backref\n");
+ break;
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ printk(KERN_INFO "\t\tshared block backref\n");
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ dref = btrfs_item_ptr(l, i,
+ struct btrfs_extent_data_ref);
+ print_extent_data_ref(l, dref);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY:
+ sref = btrfs_item_ptr(l, i,
+ struct btrfs_shared_data_ref);
+ printk(KERN_INFO "\t\tshared data backref count %u\n",
+ btrfs_shared_data_ref_count(l, sref));
+ break;
+ case BTRFS_EXTENT_DATA_KEY:
+ fi = btrfs_item_ptr(l, i,
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(l, fi) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ printk(KERN_INFO "\t\tinline extent data "
+ "size %u\n",
+ btrfs_file_extent_inline_len(l, fi));
+ break;
+ }
+ printk(KERN_INFO "\t\textent data disk bytenr %llu "
+ "nr %llu\n",
+ (unsigned long long)
+ btrfs_file_extent_disk_bytenr(l, fi),
+ (unsigned long long)
+ btrfs_file_extent_disk_num_bytes(l, fi));
+ printk(KERN_INFO "\t\textent data offset %llu "
+ "nr %llu ram %llu\n",
+ (unsigned long long)
+ btrfs_file_extent_offset(l, fi),
+ (unsigned long long)
+ btrfs_file_extent_num_bytes(l, fi),
+ (unsigned long long)
+ btrfs_file_extent_ram_bytes(l, fi));
+ break;
+ case BTRFS_EXTENT_REF_V0_KEY:
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ print_extent_ref_v0(l, i);
+#else
+ BUG();
+#endif
+ case BTRFS_BLOCK_GROUP_ITEM_KEY:
+ bi = btrfs_item_ptr(l, i,
+ struct btrfs_block_group_item);
+ printk(KERN_INFO "\t\tblock group used %llu\n",
+ (unsigned long long)
+ btrfs_disk_block_group_used(l, bi));
+ break;
+ case BTRFS_CHUNK_ITEM_KEY:
+ print_chunk(l, btrfs_item_ptr(l, i,
+ struct btrfs_chunk));
+ break;
+ case BTRFS_DEV_ITEM_KEY:
+ print_dev_item(l, btrfs_item_ptr(l, i,
+ struct btrfs_dev_item));
+ break;
+ case BTRFS_DEV_EXTENT_KEY:
+ dev_extent = btrfs_item_ptr(l, i,
+ struct btrfs_dev_extent);
+ printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n"
+ "\t\tchunk objectid %llu chunk offset %llu "
+ "length %llu\n",
+ (unsigned long long)
+ btrfs_dev_extent_chunk_tree(l, dev_extent),
+ (unsigned long long)
+ btrfs_dev_extent_chunk_objectid(l, dev_extent),
+ (unsigned long long)
+ btrfs_dev_extent_chunk_offset(l, dev_extent),
+ (unsigned long long)
+ btrfs_dev_extent_length(l, dev_extent));
+ };
+ }
+}
+
+void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
+{
+ int i; u32 nr;
+ struct btrfs_key key;
+ int level;
+
+ if (!c)
+ return;
+ nr = btrfs_header_nritems(c);
+ level = btrfs_header_level(c);
+ if (level == 0) {
+ btrfs_print_leaf(root, c);
+ return;
+ }
+ printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
+ (unsigned long long)btrfs_header_bytenr(c),
+ level, nr,
+ (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
+ for (i = 0; i < nr; i++) {
+ btrfs_node_key_to_cpu(c, &key, i);
+ printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
+ i,
+ (unsigned long long)key.objectid,
+ key.type,
+ (unsigned long long)key.offset,
+ (unsigned long long)btrfs_node_blockptr(c, i));
+ }
+ for (i = 0; i < nr; i++) {
+ struct extent_buffer *next = read_tree_block(root,
+ btrfs_node_blockptr(c, i),
+ btrfs_level_size(root, level - 1),
+ btrfs_node_ptr_generation(c, i));
+ if (btrfs_is_leaf(next) &&
+ level != 1)
+ BUG();
+ if (btrfs_header_level(next) !=
+ level - 1)
+ BUG();
+ btrfs_print_tree(root, next);
+ free_extent_buffer(next);
+ }
+}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
new file mode 100644
index 00000000000..da75efe534d
--- /dev/null
+++ b/fs/btrfs/print-tree.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __PRINT_TREE_
+#define __PRINT_TREE_
+void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
+void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t);
+#endif
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
new file mode 100644
index 00000000000..d0cc62bccb9
--- /dev/null
+++ b/fs/btrfs/ref-cache.c
@@ -0,0 +1,231 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/sort.h>
+#include "ctree.h"
+#include "ref-cache.h"
+#include "transaction.h"
+
+/*
+ * leaf refs are used to cache the information about which extents
+ * a given leaf has references on. This allows us to process that leaf
+ * in btrfs_drop_snapshot without needing to read it back from disk.
+ */
+
+/*
+ * kmalloc a leaf reference struct and update the counters for the
+ * total ref cache size
+ */
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
+ int nr_extents)
+{
+ struct btrfs_leaf_ref *ref;
+ size_t size = btrfs_leaf_ref_size(nr_extents);
+
+ ref = kmalloc(size, GFP_NOFS);
+ if (ref) {
+ spin_lock(&root->fs_info->ref_cache_lock);
+ root->fs_info->total_ref_cache_size += size;
+ spin_unlock(&root->fs_info->ref_cache_lock);
+
+ memset(ref, 0, sizeof(*ref));
+ atomic_set(&ref->usage, 1);
+ INIT_LIST_HEAD(&ref->list);
+ }
+ return ref;
+}
+
+/*
+ * free a leaf reference struct and update the counters for the
+ * total ref cache size
+ */
+void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
+{
+ if (!ref)
+ return;
+ WARN_ON(atomic_read(&ref->usage) == 0);
+ if (atomic_dec_and_test(&ref->usage)) {
+ size_t size = btrfs_leaf_ref_size(ref->nritems);
+
+ BUG_ON(ref->in_tree);
+ kfree(ref);
+
+ spin_lock(&root->fs_info->ref_cache_lock);
+ root->fs_info->total_ref_cache_size -= size;
+ spin_unlock(&root->fs_info->ref_cache_lock);
+ }
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct btrfs_leaf_ref *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
+
+ if (bytenr < entry->bytenr)
+ p = &(*p)->rb_left;
+ else if (bytenr > entry->bytenr)
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
+{
+ struct rb_node *n = root->rb_node;
+ struct btrfs_leaf_ref *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
+ WARN_ON(!entry->in_tree);
+
+ if (bytenr < entry->bytenr)
+ n = n->rb_left;
+ else if (bytenr > entry->bytenr)
+ n = n->rb_right;
+ else
+ return n;
+ }
+ return NULL;
+}
+
+int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
+ int shared)
+{
+ struct btrfs_leaf_ref *ref = NULL;
+ struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+
+ if (shared)
+ tree = &root->fs_info->shared_ref_tree;
+ if (!tree)
+ return 0;
+
+ spin_lock(&tree->lock);
+ while (!list_empty(&tree->list)) {
+ ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
+ BUG_ON(ref->tree != tree);
+ if (ref->root_gen > max_root_gen)
+ break;
+ if (!xchg(&ref->in_tree, 0)) {
+ cond_resched_lock(&tree->lock);
+ continue;
+ }
+
+ rb_erase(&ref->rb_node, &tree->root);
+ list_del_init(&ref->list);
+
+ spin_unlock(&tree->lock);
+ btrfs_free_leaf_ref(root, ref);
+ cond_resched();
+ spin_lock(&tree->lock);
+ }
+ spin_unlock(&tree->lock);
+ return 0;
+}
+
+/*
+ * find the leaf ref for a given extent. This returns the ref struct with
+ * a usage reference incremented
+ */
+struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
+ u64 bytenr)
+{
+ struct rb_node *rb;
+ struct btrfs_leaf_ref *ref = NULL;
+ struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+again:
+ if (tree) {
+ spin_lock(&tree->lock);
+ rb = tree_search(&tree->root, bytenr);
+ if (rb)
+ ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
+ if (ref)
+ atomic_inc(&ref->usage);
+ spin_unlock(&tree->lock);
+ if (ref)
+ return ref;
+ }
+ if (tree != &root->fs_info->shared_ref_tree) {
+ tree = &root->fs_info->shared_ref_tree;
+ goto again;
+ }
+ return NULL;
+}
+
+/*
+ * add a fully filled in leaf ref struct
+ * remove all the refs older than a given root generation
+ */
+int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
+ int shared)
+{
+ int ret = 0;
+ struct rb_node *rb;
+ struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+
+ if (shared)
+ tree = &root->fs_info->shared_ref_tree;
+
+ spin_lock(&tree->lock);
+ rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
+ if (rb) {
+ ret = -EEXIST;
+ } else {
+ atomic_inc(&ref->usage);
+ ref->tree = tree;
+ ref->in_tree = 1;
+ list_add_tail(&ref->list, &tree->list);
+ }
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/*
+ * remove a single leaf ref from the tree. This drops the ref held by the tree
+ * only
+ */
+int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
+{
+ struct btrfs_leaf_ref_tree *tree;
+
+ if (!xchg(&ref->in_tree, 0))
+ return 0;
+
+ tree = ref->tree;
+ spin_lock(&tree->lock);
+
+ rb_erase(&ref->rb_node, &tree->root);
+ list_del_init(&ref->list);
+
+ spin_unlock(&tree->lock);
+
+ btrfs_free_leaf_ref(root, ref);
+ return 0;
+}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
new file mode 100644
index 00000000000..bc283ad2db7
--- /dev/null
+++ b/fs/btrfs/ref-cache.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __REFCACHE__
+#define __REFCACHE__
+
+struct btrfs_extent_info {
+ /* bytenr and num_bytes find the extent in the extent allocation tree */
+ u64 bytenr;
+ u64 num_bytes;
+
+ /* objectid and offset find the back reference for the file */
+ u64 objectid;
+ u64 offset;
+};
+
+struct btrfs_leaf_ref {
+ struct rb_node rb_node;
+ struct btrfs_leaf_ref_tree *tree;
+ int in_tree;
+ atomic_t usage;
+
+ u64 root_gen;
+ u64 bytenr;
+ u64 owner;
+ u64 generation;
+ int nritems;
+
+ struct list_head list;
+ struct btrfs_extent_info extents[];
+};
+
+static inline size_t btrfs_leaf_ref_size(int nr_extents)
+{
+ return sizeof(struct btrfs_leaf_ref) +
+ sizeof(struct btrfs_extent_info) * nr_extents;
+}
+
+static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
+{
+ tree->root.rb_node = NULL;
+ INIT_LIST_HEAD(&tree->list);
+ spin_lock_init(&tree->lock);
+}
+
+static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
+{
+ return RB_EMPTY_ROOT(&tree->root);
+}
+
+void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
+ int nr_extents);
+void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
+struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
+ u64 bytenr);
+int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
+ int shared);
+int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
+ int shared);
+int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
+#endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
new file mode 100644
index 00000000000..ab7ab531874
--- /dev/null
+++ b/fs/btrfs/relocation.c
@@ -0,0 +1,3815 @@
+/*
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "volumes.h"
+#include "locking.h"
+#include "btrfs_inode.h"
+#include "async-thread.h"
+
+/*
+ * backref_node, mapping_node and tree_block start with this
+ */
+struct tree_entry {
+ struct rb_node rb_node;
+ u64 bytenr;
+};
+
+/*
+ * present a tree block in the backref cache
+ */
+struct backref_node {
+ struct rb_node rb_node;
+ u64 bytenr;
+ /* objectid tree block owner */
+ u64 owner;
+ /* list of upper level blocks reference this block */
+ struct list_head upper;
+ /* list of child blocks in the cache */
+ struct list_head lower;
+ /* NULL if this node is not tree root */
+ struct btrfs_root *root;
+ /* extent buffer got by COW the block */
+ struct extent_buffer *eb;
+ /* level of tree block */
+ unsigned int level:8;
+ /* 1 if the block is root of old snapshot */
+ unsigned int old_root:1;
+ /* 1 if no child blocks in the cache */
+ unsigned int lowest:1;
+ /* is the extent buffer locked */
+ unsigned int locked:1;
+ /* has the block been processed */
+ unsigned int processed:1;
+ /* have backrefs of this block been checked */
+ unsigned int checked:1;
+};
+
+/*
+ * present a block pointer in the backref cache
+ */
+struct backref_edge {
+ struct list_head list[2];
+ struct backref_node *node[2];
+ u64 blockptr;
+};
+
+#define LOWER 0
+#define UPPER 1
+
+struct backref_cache {
+ /* red black tree of all backref nodes in the cache */
+ struct rb_root rb_root;
+ /* list of backref nodes with no child block in the cache */
+ struct list_head pending[BTRFS_MAX_LEVEL];
+ spinlock_t lock;
+};
+
+/*
+ * map address of tree root to tree
+ */
+struct mapping_node {
+ struct rb_node rb_node;
+ u64 bytenr;
+ void *data;
+};
+
+struct mapping_tree {
+ struct rb_root rb_root;
+ spinlock_t lock;
+};
+
+/*
+ * present a tree block to process
+ */
+struct tree_block {
+ struct rb_node rb_node;
+ u64 bytenr;
+ struct btrfs_key key;
+ unsigned int level:8;
+ unsigned int key_ready:1;
+};
+
+/* inode vector */
+#define INODEVEC_SIZE 16
+
+struct inodevec {
+ struct list_head list;
+ struct inode *inode[INODEVEC_SIZE];
+ int nr;
+};
+
+#define MAX_EXTENTS 128
+
+struct file_extent_cluster {
+ u64 start;
+ u64 end;
+ u64 boundary[MAX_EXTENTS];
+ unsigned int nr;
+};
+
+struct reloc_control {
+ /* block group to relocate */
+ struct btrfs_block_group_cache *block_group;
+ /* extent tree */
+ struct btrfs_root *extent_root;
+ /* inode for moving data */
+ struct inode *data_inode;
+ struct btrfs_workers workers;
+ /* tree blocks have been processed */
+ struct extent_io_tree processed_blocks;
+ /* map start of tree root to corresponding reloc tree */
+ struct mapping_tree reloc_root_tree;
+ /* list of reloc trees */
+ struct list_head reloc_roots;
+ u64 search_start;
+ u64 extents_found;
+ u64 extents_skipped;
+ int stage;
+ int create_reloc_root;
+ unsigned int found_file_extent:1;
+ unsigned int found_old_snapshot:1;
+};
+
+/* stages of data relocation */
+#define MOVE_DATA_EXTENTS 0
+#define UPDATE_DATA_PTRS 1
+
+/*
+ * merge reloc tree to corresponding fs tree in worker threads
+ */
+struct async_merge {
+ struct btrfs_work work;
+ struct reloc_control *rc;
+ struct btrfs_root *root;
+ struct completion *done;
+ atomic_t *num_pending;
+};
+
+static void mapping_tree_init(struct mapping_tree *tree)
+{
+ tree->rb_root.rb_node = NULL;
+ spin_lock_init(&tree->lock);
+}
+
+static void backref_cache_init(struct backref_cache *cache)
+{
+ int i;
+ cache->rb_root.rb_node = NULL;
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ INIT_LIST_HEAD(&cache->pending[i]);
+ spin_lock_init(&cache->lock);
+}
+
+static void backref_node_init(struct backref_node *node)
+{
+ memset(node, 0, sizeof(*node));
+ INIT_LIST_HEAD(&node->upper);
+ INIT_LIST_HEAD(&node->lower);
+ RB_CLEAR_NODE(&node->rb_node);
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct tree_entry *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct tree_entry, rb_node);
+
+ if (bytenr < entry->bytenr)
+ p = &(*p)->rb_left;
+ else if (bytenr > entry->bytenr)
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
+{
+ struct rb_node *n = root->rb_node;
+ struct tree_entry *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct tree_entry, rb_node);
+
+ if (bytenr < entry->bytenr)
+ n = n->rb_left;
+ else if (bytenr > entry->bytenr)
+ n = n->rb_right;
+ else
+ return n;
+ }
+ return NULL;
+}
+
+/*
+ * walk up backref nodes until reach node presents tree root
+ */
+static struct backref_node *walk_up_backref(struct backref_node *node,
+ struct backref_edge *edges[],
+ int *index)
+{
+ struct backref_edge *edge;
+ int idx = *index;
+
+ while (!list_empty(&node->upper)) {
+ edge = list_entry(node->upper.next,
+ struct backref_edge, list[LOWER]);
+ edges[idx++] = edge;
+ node = edge->node[UPPER];
+ }
+ *index = idx;
+ return node;
+}
+
+/*
+ * walk down backref nodes to find start of next reference path
+ */
+static struct backref_node *walk_down_backref(struct backref_edge *edges[],
+ int *index)
+{
+ struct backref_edge *edge;
+ struct backref_node *lower;
+ int idx = *index;
+
+ while (idx > 0) {
+ edge = edges[idx - 1];
+ lower = edge->node[LOWER];
+ if (list_is_last(&edge->list[LOWER], &lower->upper)) {
+ idx--;
+ continue;
+ }
+ edge = list_entry(edge->list[LOWER].next,
+ struct backref_edge, list[LOWER]);
+ edges[idx - 1] = edge;
+ *index = idx;
+ return edge->node[UPPER];
+ }
+ *index = 0;
+ return NULL;
+}
+
+static void drop_node_buffer(struct backref_node *node)
+{
+ if (node->eb) {
+ if (node->locked) {
+ btrfs_tree_unlock(node->eb);
+ node->locked = 0;
+ }
+ free_extent_buffer(node->eb);
+ node->eb = NULL;
+ }
+}
+
+static void drop_backref_node(struct backref_cache *tree,
+ struct backref_node *node)
+{
+ BUG_ON(!node->lowest);
+ BUG_ON(!list_empty(&node->upper));
+
+ drop_node_buffer(node);
+ list_del(&node->lower);
+
+ rb_erase(&node->rb_node, &tree->rb_root);
+ kfree(node);
+}
+
+/*
+ * remove a backref node from the backref cache
+ */
+static void remove_backref_node(struct backref_cache *cache,
+ struct backref_node *node)
+{
+ struct backref_node *upper;
+ struct backref_edge *edge;
+
+ if (!node)
+ return;
+
+ BUG_ON(!node->lowest);
+ while (!list_empty(&node->upper)) {
+ edge = list_entry(node->upper.next, struct backref_edge,
+ list[LOWER]);
+ upper = edge->node[UPPER];
+ list_del(&edge->list[LOWER]);
+ list_del(&edge->list[UPPER]);
+ kfree(edge);
+ /*
+ * add the node to pending list if no other
+ * child block cached.
+ */
+ if (list_empty(&upper->lower)) {
+ list_add_tail(&upper->lower,
+ &cache->pending[upper->level]);
+ upper->lowest = 1;
+ }
+ }
+ drop_backref_node(cache, node);
+}
+
+/*
+ * find reloc tree by address of tree root
+ */
+static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
+ u64 bytenr)
+{
+ struct rb_node *rb_node;
+ struct mapping_node *node;
+ struct btrfs_root *root = NULL;
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct mapping_node, rb_node);
+ root = (struct btrfs_root *)node->data;
+ }
+ spin_unlock(&rc->reloc_root_tree.lock);
+ return root;
+}
+
+static int is_cowonly_root(u64 root_objectid)
+{
+ if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
+ root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+ root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
+ root_objectid == BTRFS_DEV_TREE_OBJECTID ||
+ root_objectid == BTRFS_TREE_LOG_OBJECTID ||
+ root_objectid == BTRFS_CSUM_TREE_OBJECTID)
+ return 1;
+ return 0;
+}
+
+static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_objectid)
+{
+ struct btrfs_key key;
+
+ key.objectid = root_objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ if (is_cowonly_root(root_objectid))
+ key.offset = 0;
+ else
+ key.offset = (u64)-1;
+
+ return btrfs_read_fs_root_no_name(fs_info, &key);
+}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static noinline_for_stack
+struct btrfs_root *find_tree_root(struct reloc_control *rc,
+ struct extent_buffer *leaf,
+ struct btrfs_extent_ref_v0 *ref0)
+{
+ struct btrfs_root *root;
+ u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
+ u64 generation = btrfs_ref_generation_v0(leaf, ref0);
+
+ BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
+
+ root = read_fs_root(rc->extent_root->fs_info, root_objectid);
+ BUG_ON(IS_ERR(root));
+
+ if (root->ref_cows &&
+ generation != btrfs_root_generation(&root->root_item))
+ return NULL;
+
+ return root;
+}
+#endif
+
+static noinline_for_stack
+int find_inline_backref(struct extent_buffer *leaf, int slot,
+ unsigned long *ptr, unsigned long *end)
+{
+ struct btrfs_extent_item *ei;
+ struct btrfs_tree_block_info *bi;
+ u32 item_size;
+
+ item_size = btrfs_item_size_nr(leaf, slot);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (item_size < sizeof(*ei)) {
+ WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
+ return 1;
+ }
+#endif
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+ WARN_ON(!(btrfs_extent_flags(leaf, ei) &
+ BTRFS_EXTENT_FLAG_TREE_BLOCK));
+
+ if (item_size <= sizeof(*ei) + sizeof(*bi)) {
+ WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
+ return 1;
+ }
+
+ bi = (struct btrfs_tree_block_info *)(ei + 1);
+ *ptr = (unsigned long)(bi + 1);
+ *end = (unsigned long)ei + item_size;
+ return 0;
+}
+
+/*
+ * build backref tree for a given tree block. root of the backref tree
+ * corresponds the tree block, leaves of the backref tree correspond
+ * roots of b-trees that reference the tree block.
+ *
+ * the basic idea of this function is check backrefs of a given block
+ * to find upper level blocks that refernece the block, and then check
+ * bakcrefs of these upper level blocks recursively. the recursion stop
+ * when tree root is reached or backrefs for the block is cached.
+ *
+ * NOTE: if we find backrefs for a block are cached, we know backrefs
+ * for all upper level blocks that directly/indirectly reference the
+ * block are also cached.
+ */
+static struct backref_node *build_backref_tree(struct reloc_control *rc,
+ struct backref_cache *cache,
+ struct btrfs_key *node_key,
+ int level, u64 bytenr)
+{
+ struct btrfs_path *path1;
+ struct btrfs_path *path2;
+ struct extent_buffer *eb;
+ struct btrfs_root *root;
+ struct backref_node *cur;
+ struct backref_node *upper;
+ struct backref_node *lower;
+ struct backref_node *node = NULL;
+ struct backref_node *exist = NULL;
+ struct backref_edge *edge;
+ struct rb_node *rb_node;
+ struct btrfs_key key;
+ unsigned long end;
+ unsigned long ptr;
+ LIST_HEAD(list);
+ int ret;
+ int err = 0;
+
+ path1 = btrfs_alloc_path();
+ path2 = btrfs_alloc_path();
+ if (!path1 || !path2) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ node = kmalloc(sizeof(*node), GFP_NOFS);
+ if (!node) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ backref_node_init(node);
+ node->bytenr = bytenr;
+ node->owner = 0;
+ node->level = level;
+ node->lowest = 1;
+ cur = node;
+again:
+ end = 0;
+ ptr = 0;
+ key.objectid = cur->bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ path1->search_commit_root = 1;
+ path1->skip_locking = 1;
+ ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
+ 0, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ BUG_ON(!ret || !path1->slots[0]);
+
+ path1->slots[0]--;
+
+ WARN_ON(cur->checked);
+ if (!list_empty(&cur->upper)) {
+ /*
+ * the backref was added previously when processsing
+ * backref of type BTRFS_TREE_BLOCK_REF_KEY
+ */
+ BUG_ON(!list_is_singular(&cur->upper));
+ edge = list_entry(cur->upper.next, struct backref_edge,
+ list[LOWER]);
+ BUG_ON(!list_empty(&edge->list[UPPER]));
+ exist = edge->node[UPPER];
+ /*
+ * add the upper level block to pending list if we need
+ * check its backrefs
+ */
+ if (!exist->checked)
+ list_add_tail(&edge->list[UPPER], &list);
+ } else {
+ exist = NULL;
+ }
+
+ while (1) {
+ cond_resched();
+ eb = path1->nodes[0];
+
+ if (ptr >= end) {
+ if (path1->slots[0] >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(rc->extent_root, path1);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret > 0)
+ break;
+ eb = path1->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
+ if (key.objectid != cur->bytenr) {
+ WARN_ON(exist);
+ break;
+ }
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+ ret = find_inline_backref(eb, path1->slots[0],
+ &ptr, &end);
+ if (ret)
+ goto next;
+ }
+ }
+
+ if (ptr < end) {
+ /* update key for inline back ref */
+ struct btrfs_extent_inline_ref *iref;
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ key.type = btrfs_extent_inline_ref_type(eb, iref);
+ key.offset = btrfs_extent_inline_ref_offset(eb, iref);
+ WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
+ key.type != BTRFS_SHARED_BLOCK_REF_KEY);
+ }
+
+ if (exist &&
+ ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
+ exist->owner == key.offset) ||
+ (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
+ exist->bytenr == key.offset))) {
+ exist = NULL;
+ goto next;
+ }
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
+ key.type == BTRFS_EXTENT_REF_V0_KEY) {
+ if (key.objectid == key.offset &&
+ key.type == BTRFS_EXTENT_REF_V0_KEY) {
+ struct btrfs_extent_ref_v0 *ref0;
+ ref0 = btrfs_item_ptr(eb, path1->slots[0],
+ struct btrfs_extent_ref_v0);
+ root = find_tree_root(rc, eb, ref0);
+ if (root)
+ cur->root = root;
+ else
+ cur->old_root = 1;
+ break;
+ }
+#else
+ BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
+ if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
+#endif
+ if (key.objectid == key.offset) {
+ /*
+ * only root blocks of reloc trees use
+ * backref of this type.
+ */
+ root = find_reloc_root(rc, cur->bytenr);
+ BUG_ON(!root);
+ cur->root = root;
+ break;
+ }
+
+ edge = kzalloc(sizeof(*edge), GFP_NOFS);
+ if (!edge) {
+ err = -ENOMEM;
+ goto out;
+ }
+ rb_node = tree_search(&cache->rb_root, key.offset);
+ if (!rb_node) {
+ upper = kmalloc(sizeof(*upper), GFP_NOFS);
+ if (!upper) {
+ kfree(edge);
+ err = -ENOMEM;
+ goto out;
+ }
+ backref_node_init(upper);
+ upper->bytenr = key.offset;
+ upper->owner = 0;
+ upper->level = cur->level + 1;
+ /*
+ * backrefs for the upper level block isn't
+ * cached, add the block to pending list
+ */
+ list_add_tail(&edge->list[UPPER], &list);
+ } else {
+ upper = rb_entry(rb_node, struct backref_node,
+ rb_node);
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ }
+ list_add(&edge->list[LOWER], &cur->upper);
+ edge->node[UPPER] = upper;
+ edge->node[LOWER] = cur;
+
+ goto next;
+ } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
+ goto next;
+ }
+
+ /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
+ root = read_fs_root(rc->extent_root->fs_info, key.offset);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto out;
+ }
+
+ if (btrfs_root_level(&root->root_item) == cur->level) {
+ /* tree root */
+ BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+ cur->bytenr);
+ cur->root = root;
+ break;
+ }
+
+ level = cur->level + 1;
+
+ /*
+ * searching the tree to find upper level blocks
+ * reference the block.
+ */
+ path2->search_commit_root = 1;
+ path2->skip_locking = 1;
+ path2->lowest_level = level;
+ ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
+ path2->lowest_level = 0;
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret > 0 && path2->slots[level] > 0)
+ path2->slots[level]--;
+
+ eb = path2->nodes[level];
+ WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
+ cur->bytenr);
+
+ lower = cur;
+ for (; level < BTRFS_MAX_LEVEL; level++) {
+ if (!path2->nodes[level]) {
+ BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+ lower->bytenr);
+ lower->root = root;
+ break;
+ }
+
+ edge = kzalloc(sizeof(*edge), GFP_NOFS);
+ if (!edge) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ eb = path2->nodes[level];
+ rb_node = tree_search(&cache->rb_root, eb->start);
+ if (!rb_node) {
+ upper = kmalloc(sizeof(*upper), GFP_NOFS);
+ if (!upper) {
+ kfree(edge);
+ err = -ENOMEM;
+ goto out;
+ }
+ backref_node_init(upper);
+ upper->bytenr = eb->start;
+ upper->owner = btrfs_header_owner(eb);
+ upper->level = lower->level + 1;
+
+ /*
+ * if we know the block isn't shared
+ * we can void checking its backrefs.
+ */
+ if (btrfs_block_can_be_shared(root, eb))
+ upper->checked = 0;
+ else
+ upper->checked = 1;
+
+ /*
+ * add the block to pending list if we
+ * need check its backrefs. only block
+ * at 'cur->level + 1' is added to the
+ * tail of pending list. this guarantees
+ * we check backrefs from lower level
+ * blocks to upper level blocks.
+ */
+ if (!upper->checked &&
+ level == cur->level + 1) {
+ list_add_tail(&edge->list[UPPER],
+ &list);
+ } else
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ } else {
+ upper = rb_entry(rb_node, struct backref_node,
+ rb_node);
+ BUG_ON(!upper->checked);
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ }
+ list_add_tail(&edge->list[LOWER], &lower->upper);
+ edge->node[UPPER] = upper;
+ edge->node[LOWER] = lower;
+
+ if (rb_node)
+ break;
+ lower = upper;
+ upper = NULL;
+ }
+ btrfs_release_path(root, path2);
+next:
+ if (ptr < end) {
+ ptr += btrfs_extent_inline_ref_size(key.type);
+ if (ptr >= end) {
+ WARN_ON(ptr > end);
+ ptr = 0;
+ end = 0;
+ }
+ }
+ if (ptr >= end)
+ path1->slots[0]++;
+ }
+ btrfs_release_path(rc->extent_root, path1);
+
+ cur->checked = 1;
+ WARN_ON(exist);
+
+ /* the pending list isn't empty, take the first block to process */
+ if (!list_empty(&list)) {
+ edge = list_entry(list.next, struct backref_edge, list[UPPER]);
+ list_del_init(&edge->list[UPPER]);
+ cur = edge->node[UPPER];
+ goto again;
+ }
+
+ /*
+ * everything goes well, connect backref nodes and insert backref nodes
+ * into the cache.
+ */
+ BUG_ON(!node->checked);
+ rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
+ BUG_ON(rb_node);
+
+ list_for_each_entry(edge, &node->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &list);
+
+ while (!list_empty(&list)) {
+ edge = list_entry(list.next, struct backref_edge, list[UPPER]);
+ list_del_init(&edge->list[UPPER]);
+ upper = edge->node[UPPER];
+
+ if (!RB_EMPTY_NODE(&upper->rb_node)) {
+ if (upper->lowest) {
+ list_del_init(&upper->lower);
+ upper->lowest = 0;
+ }
+
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+ continue;
+ }
+
+ BUG_ON(!upper->checked);
+ rb_node = tree_insert(&cache->rb_root, upper->bytenr,
+ &upper->rb_node);
+ BUG_ON(rb_node);
+
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+
+ list_for_each_entry(edge, &upper->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &list);
+ }
+out:
+ btrfs_free_path(path1);
+ btrfs_free_path(path2);
+ if (err) {
+ INIT_LIST_HEAD(&list);
+ upper = node;
+ while (upper) {
+ if (RB_EMPTY_NODE(&upper->rb_node)) {
+ list_splice_tail(&upper->upper, &list);
+ kfree(upper);
+ }
+
+ if (list_empty(&list))
+ break;
+
+ edge = list_entry(list.next, struct backref_edge,
+ list[LOWER]);
+ upper = edge->node[UPPER];
+ kfree(edge);
+ }
+ return ERR_PTR(err);
+ }
+ return node;
+}
+
+/*
+ * helper to add 'address of tree root -> reloc tree' mapping
+ */
+static int __add_reloc_root(struct btrfs_root *root)
+{
+ struct rb_node *rb_node;
+ struct mapping_node *node;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+ node = kmalloc(sizeof(*node), GFP_NOFS);
+ BUG_ON(!node);
+
+ node->bytenr = root->node->start;
+ node->data = root;
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
+ node->bytenr, &node->rb_node);
+ spin_unlock(&rc->reloc_root_tree.lock);
+ BUG_ON(rb_node);
+
+ list_add_tail(&root->root_list, &rc->reloc_roots);
+ return 0;
+}
+
+/*
+ * helper to update/delete the 'address of tree root -> reloc tree'
+ * mapping
+ */
+static int __update_reloc_root(struct btrfs_root *root, int del)
+{
+ struct rb_node *rb_node;
+ struct mapping_node *node = NULL;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ rb_node = tree_search(&rc->reloc_root_tree.rb_root,
+ root->commit_root->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct mapping_node, rb_node);
+ rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+ }
+ spin_unlock(&rc->reloc_root_tree.lock);
+
+ BUG_ON((struct btrfs_root *)node->data != root);
+
+ if (!del) {
+ spin_lock(&rc->reloc_root_tree.lock);
+ node->bytenr = root->node->start;
+ rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
+ node->bytenr, &node->rb_node);
+ spin_unlock(&rc->reloc_root_tree.lock);
+ BUG_ON(rb_node);
+ } else {
+ list_del_init(&root->root_list);
+ kfree(node);
+ }
+ return 0;
+}
+
+/*
+ * create reloc tree for a given fs tree. reloc tree is just a
+ * snapshot of the fs tree with special root objectid.
+ */
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+ struct extent_buffer *eb;
+ struct btrfs_root_item *root_item;
+ struct btrfs_key root_key;
+ int ret;
+
+ if (root->reloc_root) {
+ reloc_root = root->reloc_root;
+ reloc_root->last_trans = trans->transid;
+ return 0;
+ }
+
+ if (!root->fs_info->reloc_ctl ||
+ !root->fs_info->reloc_ctl->create_reloc_root ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ return 0;
+
+ root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
+ BUG_ON(!root_item);
+
+ root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root_key.offset = root->root_key.objectid;
+
+ ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
+ BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(ret);
+
+ btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
+ memcpy(root_item, &root->root_item, sizeof(*root_item));
+ btrfs_set_root_refs(root_item, 1);
+ btrfs_set_root_bytenr(root_item, eb->start);
+ btrfs_set_root_level(root_item, btrfs_header_level(eb));
+ btrfs_set_root_generation(root_item, trans->transid);
+ memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
+ root_item->drop_level = 0;
+
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+
+ ret = btrfs_insert_root(trans, root->fs_info->tree_root,
+ &root_key, root_item);
+ BUG_ON(ret);
+ kfree(root_item);
+
+ reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
+ &root_key);
+ BUG_ON(IS_ERR(reloc_root));
+ reloc_root->last_trans = trans->transid;
+
+ __add_reloc_root(reloc_root);
+ root->reloc_root = reloc_root;
+ return 0;
+}
+
+/*
+ * update root item of reloc tree
+ */
+int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+ struct btrfs_root_item *root_item;
+ int del = 0;
+ int ret;
+
+ if (!root->reloc_root)
+ return 0;
+
+ reloc_root = root->reloc_root;
+ root_item = &reloc_root->root_item;
+
+ if (btrfs_root_refs(root_item) == 0) {
+ root->reloc_root = NULL;
+ del = 1;
+ }
+
+ __update_reloc_root(reloc_root, del);
+
+ if (reloc_root->commit_root != reloc_root->node) {
+ btrfs_set_root_node(root_item, reloc_root->node);
+ free_extent_buffer(reloc_root->commit_root);
+ reloc_root->commit_root = btrfs_root_node(reloc_root);
+ }
+
+ ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ &reloc_root->root_key, root_item);
+ BUG_ON(ret);
+ return 0;
+}
+
+/*
+ * helper to find first cached inode with inode number >= objectid
+ * in a subvolume
+ */
+static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
+{
+ struct rb_node *node;
+ struct rb_node *prev;
+ struct btrfs_inode *entry;
+ struct inode *inode;
+
+ spin_lock(&root->inode_lock);
+again:
+ node = root->inode_tree.rb_node;
+ prev = NULL;
+ while (node) {
+ prev = node;
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+ if (objectid < entry->vfs_inode.i_ino)
+ node = node->rb_left;
+ else if (objectid > entry->vfs_inode.i_ino)
+ node = node->rb_right;
+ else
+ break;
+ }
+ if (!node) {
+ while (prev) {
+ entry = rb_entry(prev, struct btrfs_inode, rb_node);
+ if (objectid <= entry->vfs_inode.i_ino) {
+ node = prev;
+ break;
+ }
+ prev = rb_next(prev);
+ }
+ }
+ while (node) {
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+ inode = igrab(&entry->vfs_inode);
+ if (inode) {
+ spin_unlock(&root->inode_lock);
+ return inode;
+ }
+
+ objectid = entry->vfs_inode.i_ino + 1;
+ if (cond_resched_lock(&root->inode_lock))
+ goto again;
+
+ node = rb_next(node);
+ }
+ spin_unlock(&root->inode_lock);
+ return NULL;
+}
+
+static int in_block_group(u64 bytenr,
+ struct btrfs_block_group_cache *block_group)
+{
+ if (bytenr >= block_group->key.objectid &&
+ bytenr < block_group->key.objectid + block_group->key.offset)
+ return 1;
+ return 0;
+}
+
+/*
+ * get new location of data
+ */
+static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
+ u64 bytenr, u64 num_bytes)
+{
+ struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+ struct btrfs_path *path;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ bytenr -= BTRFS_I(reloc_inode)->index_cnt;
+ ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
+ bytenr, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
+ btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi));
+
+ if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
+ ret = 1;
+ goto out;
+ }
+
+ if (new_bytenr)
+ *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * update file extent items in the tree leaf to point to
+ * the new locations.
+ */
+static int replace_file_extents(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct list_head *inode_list)
+{
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ struct inode *inode = NULL;
+ struct inodevec *ivec = NULL;
+ u64 parent;
+ u64 bytenr;
+ u64 new_bytenr;
+ u64 num_bytes;
+ u64 end;
+ u32 nritems;
+ u32 i;
+ int ret;
+ int first = 1;
+ int dirty = 0;
+
+ if (rc->stage != UPDATE_DATA_PTRS)
+ return 0;
+
+ /* reloc trees always use full backref */
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ parent = leaf->start;
+ else
+ parent = 0;
+
+ nritems = btrfs_header_nritems(leaf);
+ for (i = 0; i < nritems; i++) {
+ cond_resched();
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ if (bytenr == 0)
+ continue;
+ if (!in_block_group(bytenr, rc->block_group))
+ continue;
+
+ /*
+ * if we are modifying block in fs tree, wait for readpage
+ * to complete and drop the extent cache
+ */
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ if (!ivec || ivec->nr == INODEVEC_SIZE) {
+ ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
+ BUG_ON(!ivec);
+ ivec->nr = 0;
+ list_add_tail(&ivec->list, inode_list);
+ }
+ if (first) {
+ inode = find_next_inode(root, key.objectid);
+ if (inode)
+ ivec->inode[ivec->nr++] = inode;
+ first = 0;
+ } else if (inode && inode->i_ino < key.objectid) {
+ inode = find_next_inode(root, key.objectid);
+ if (inode)
+ ivec->inode[ivec->nr++] = inode;
+ }
+ if (inode && inode->i_ino == key.objectid) {
+ end = key.offset +
+ btrfs_file_extent_num_bytes(leaf, fi);
+ WARN_ON(!IS_ALIGNED(key.offset,
+ root->sectorsize));
+ WARN_ON(!IS_ALIGNED(end, root->sectorsize));
+ end--;
+ ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+ key.offset, end,
+ GFP_NOFS);
+ if (!ret)
+ continue;
+
+ btrfs_drop_extent_cache(inode, key.offset, end,
+ 1);
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ key.offset, end, GFP_NOFS);
+ }
+ }
+
+ ret = get_new_location(rc->data_inode, &new_bytenr,
+ bytenr, num_bytes);
+ if (ret > 0)
+ continue;
+ BUG_ON(ret < 0);
+
+ btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
+ dirty = 1;
+
+ key.offset -= btrfs_file_extent_offset(leaf, fi);
+ ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
+ num_bytes, parent,
+ btrfs_header_owner(leaf),
+ key.objectid, key.offset);
+ BUG_ON(ret);
+
+ ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+ parent, btrfs_header_owner(leaf),
+ key.objectid, key.offset);
+ BUG_ON(ret);
+ }
+ if (dirty)
+ btrfs_mark_buffer_dirty(leaf);
+ return 0;
+}
+
+static noinline_for_stack
+int memcmp_node_keys(struct extent_buffer *eb, int slot,
+ struct btrfs_path *path, int level)
+{
+ struct btrfs_disk_key key1;
+ struct btrfs_disk_key key2;
+ btrfs_node_key(eb, &key1, slot);
+ btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
+ return memcmp(&key1, &key2, sizeof(key1));
+}
+
+/*
+ * try to replace tree blocks in fs tree with the new blocks
+ * in reloc tree. tree blocks haven't been modified since the
+ * reloc tree was create can be replaced.
+ *
+ * if a block was replaced, level of the block + 1 is returned.
+ * if no block got replaced, 0 is returned. if there are other
+ * errors, a negative error number is returned.
+ */
+static int replace_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *dest, struct btrfs_root *src,
+ struct btrfs_path *path, struct btrfs_key *next_key,
+ struct extent_buffer **leaf,
+ int lowest_level, int max_level)
+{
+ struct extent_buffer *eb;
+ struct extent_buffer *parent;
+ struct btrfs_key key;
+ u64 old_bytenr;
+ u64 new_bytenr;
+ u64 old_ptr_gen;
+ u64 new_ptr_gen;
+ u64 last_snapshot;
+ u32 blocksize;
+ int level;
+ int ret;
+ int slot;
+
+ BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(lowest_level > 1 && leaf);
+
+ last_snapshot = btrfs_root_last_snapshot(&src->root_item);
+
+ slot = path->slots[lowest_level];
+ btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
+
+ eb = btrfs_lock_root_node(dest);
+ btrfs_set_lock_blocking(eb);
+ level = btrfs_header_level(eb);
+
+ if (level < lowest_level) {
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ return 0;
+ }
+
+ ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+ BUG_ON(ret);
+ btrfs_set_lock_blocking(eb);
+
+ if (next_key) {
+ next_key->objectid = (u64)-1;
+ next_key->type = (u8)-1;
+ next_key->offset = (u64)-1;
+ }
+
+ parent = eb;
+ while (1) {
+ level = btrfs_header_level(parent);
+ BUG_ON(level < lowest_level);
+
+ ret = btrfs_bin_search(parent, &key, level, &slot);
+ if (ret && slot > 0)
+ slot--;
+
+ if (next_key && slot + 1 < btrfs_header_nritems(parent))
+ btrfs_node_key_to_cpu(parent, next_key, slot + 1);
+
+ old_bytenr = btrfs_node_blockptr(parent, slot);
+ blocksize = btrfs_level_size(dest, level - 1);
+ old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
+
+ if (level <= max_level) {
+ eb = path->nodes[level];
+ new_bytenr = btrfs_node_blockptr(eb,
+ path->slots[level]);
+ new_ptr_gen = btrfs_node_ptr_generation(eb,
+ path->slots[level]);
+ } else {
+ new_bytenr = 0;
+ new_ptr_gen = 0;
+ }
+
+ if (new_bytenr > 0 && new_bytenr == old_bytenr) {
+ WARN_ON(1);
+ ret = level;
+ break;
+ }
+
+ if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
+ memcmp_node_keys(parent, slot, path, level)) {
+ if (level <= lowest_level && !leaf) {
+ ret = 0;
+ break;
+ }
+
+ eb = read_tree_block(dest, old_bytenr, blocksize,
+ old_ptr_gen);
+ btrfs_tree_lock(eb);
+ ret = btrfs_cow_block(trans, dest, eb, parent,
+ slot, &eb);
+ BUG_ON(ret);
+ btrfs_set_lock_blocking(eb);
+
+ if (level <= lowest_level) {
+ *leaf = eb;
+ ret = 0;
+ break;
+ }
+
+ btrfs_tree_unlock(parent);
+ free_extent_buffer(parent);
+
+ parent = eb;
+ continue;
+ }
+
+ btrfs_node_key_to_cpu(path->nodes[level], &key,
+ path->slots[level]);
+ btrfs_release_path(src, path);
+
+ path->lowest_level = level;
+ ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
+ path->lowest_level = 0;
+ BUG_ON(ret);
+
+ /*
+ * swap blocks in fs tree and reloc tree.
+ */
+ btrfs_set_node_blockptr(parent, slot, new_bytenr);
+ btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
+ btrfs_mark_buffer_dirty(parent);
+
+ btrfs_set_node_blockptr(path->nodes[level],
+ path->slots[level], old_bytenr);
+ btrfs_set_node_ptr_generation(path->nodes[level],
+ path->slots[level], old_ptr_gen);
+ btrfs_mark_buffer_dirty(path->nodes[level]);
+
+ ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
+ path->nodes[level]->start,
+ src->root_key.objectid, level - 1, 0);
+ BUG_ON(ret);
+ ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
+ 0, dest->root_key.objectid, level - 1,
+ 0);
+ BUG_ON(ret);
+
+ ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
+ path->nodes[level]->start,
+ src->root_key.objectid, level - 1, 0);
+ BUG_ON(ret);
+
+ ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
+ 0, dest->root_key.objectid, level - 1,
+ 0);
+ BUG_ON(ret);
+
+ btrfs_unlock_up_safe(path, 0);
+
+ ret = level;
+ break;
+ }
+ btrfs_tree_unlock(parent);
+ free_extent_buffer(parent);
+ return ret;
+}
+
+/*
+ * helper to find next relocated block in reloc tree
+ */
+static noinline_for_stack
+int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
+ int *level)
+{
+ struct extent_buffer *eb;
+ int i;
+ u64 last_snapshot;
+ u32 nritems;
+
+ last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+
+ for (i = 0; i < *level; i++) {
+ free_extent_buffer(path->nodes[i]);
+ path->nodes[i] = NULL;
+ }
+
+ for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
+ eb = path->nodes[i];
+ nritems = btrfs_header_nritems(eb);
+ while (path->slots[i] + 1 < nritems) {
+ path->slots[i]++;
+ if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
+ last_snapshot)
+ continue;
+
+ *level = i;
+ return 0;
+ }
+ free_extent_buffer(path->nodes[i]);
+ path->nodes[i] = NULL;
+ }
+ return 1;
+}
+
+/*
+ * walk down reloc tree to find relocated block of lowest level
+ */
+static noinline_for_stack
+int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
+ int *level)
+{
+ struct extent_buffer *eb = NULL;
+ int i;
+ u64 bytenr;
+ u64 ptr_gen = 0;
+ u64 last_snapshot;
+ u32 blocksize;
+ u32 nritems;
+
+ last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+
+ for (i = *level; i > 0; i--) {
+ eb = path->nodes[i];
+ nritems = btrfs_header_nritems(eb);
+ while (path->slots[i] < nritems) {
+ ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
+ if (ptr_gen > last_snapshot)
+ break;
+ path->slots[i]++;
+ }
+ if (path->slots[i] >= nritems) {
+ if (i == *level)
+ break;
+ *level = i + 1;
+ return 0;
+ }
+ if (i == 1) {
+ *level = i;
+ return 0;
+ }
+
+ bytenr = btrfs_node_blockptr(eb, path->slots[i]);
+ blocksize = btrfs_level_size(root, i - 1);
+ eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
+ BUG_ON(btrfs_header_level(eb) != i - 1);
+ path->nodes[i - 1] = eb;
+ path->slots[i - 1] = 0;
+ }
+ return 1;
+}
+
+/*
+ * invalidate extent cache for file extents whose key in range of
+ * [min_key, max_key)
+ */
+static int invalidate_extent_cache(struct btrfs_root *root,
+ struct btrfs_key *min_key,
+ struct btrfs_key *max_key)
+{
+ struct inode *inode = NULL;
+ u64 objectid;
+ u64 start, end;
+
+ objectid = min_key->objectid;
+ while (1) {
+ cond_resched();
+ iput(inode);
+
+ if (objectid > max_key->objectid)
+ break;
+
+ inode = find_next_inode(root, objectid);
+ if (!inode)
+ break;
+
+ if (inode->i_ino > max_key->objectid) {
+ iput(inode);
+ break;
+ }
+
+ objectid = inode->i_ino + 1;
+ if (!S_ISREG(inode->i_mode))
+ continue;
+
+ if (unlikely(min_key->objectid == inode->i_ino)) {
+ if (min_key->type > BTRFS_EXTENT_DATA_KEY)
+ continue;
+ if (min_key->type < BTRFS_EXTENT_DATA_KEY)
+ start = 0;
+ else {
+ start = min_key->offset;
+ WARN_ON(!IS_ALIGNED(start, root->sectorsize));
+ }
+ } else {
+ start = 0;
+ }
+
+ if (unlikely(max_key->objectid == inode->i_ino)) {
+ if (max_key->type < BTRFS_EXTENT_DATA_KEY)
+ continue;
+ if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
+ end = (u64)-1;
+ } else {
+ if (max_key->offset == 0)
+ continue;
+ end = max_key->offset;
+ WARN_ON(!IS_ALIGNED(end, root->sectorsize));
+ end--;
+ }
+ } else {
+ end = (u64)-1;
+ }
+
+ /* the lock_extent waits for readpage to complete */
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ btrfs_drop_extent_cache(inode, start, end, 1);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ }
+ return 0;
+}
+
+static void put_inodes(struct list_head *list)
+{
+ struct inodevec *ivec;
+ while (!list_empty(list)) {
+ ivec = list_entry(list->next, struct inodevec, list);
+ list_del(&ivec->list);
+ while (ivec->nr > 0) {
+ ivec->nr--;
+ iput(ivec->inode[ivec->nr]);
+ }
+ kfree(ivec);
+ }
+}
+
+static int find_next_key(struct btrfs_path *path, int level,
+ struct btrfs_key *key)
+
+{
+ while (level < BTRFS_MAX_LEVEL) {
+ if (!path->nodes[level])
+ break;
+ if (path->slots[level] + 1 <
+ btrfs_header_nritems(path->nodes[level])) {
+ btrfs_node_key_to_cpu(path->nodes[level], key,
+ path->slots[level] + 1);
+ return 0;
+ }
+ level++;
+ }
+ return 1;
+}
+
+/*
+ * merge the relocated tree blocks in reloc tree with corresponding
+ * fs tree.
+ */
+static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
+ struct btrfs_root *root)
+{
+ LIST_HEAD(inode_list);
+ struct btrfs_key key;
+ struct btrfs_key next_key;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *reloc_root;
+ struct btrfs_root_item *root_item;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf = NULL;
+ unsigned long nr;
+ int level;
+ int max_level;
+ int replaced = 0;
+ int ret;
+ int err = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ reloc_root = root->reloc_root;
+ root_item = &reloc_root->root_item;
+
+ if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+ level = btrfs_root_level(root_item);
+ extent_buffer_get(reloc_root->node);
+ path->nodes[level] = reloc_root->node;
+ path->slots[level] = 0;
+ } else {
+ btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+
+ level = root_item->drop_level;
+ BUG_ON(level == 0);
+ path->lowest_level = level;
+ ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
+ path->lowest_level = 0;
+ if (ret < 0) {
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ btrfs_node_key_to_cpu(path->nodes[level], &next_key,
+ path->slots[level]);
+ WARN_ON(memcmp(&key, &next_key, sizeof(key)));
+
+ btrfs_unlock_up_safe(path, 0);
+ }
+
+ if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
+ trans = btrfs_start_transaction(root, 1);
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, 0);
+ btrfs_release_path(reloc_root, path);
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_unlock_up_safe(path, 1);
+ ret = replace_file_extents(trans, rc, root, leaf,
+ &inode_list);
+ if (ret < 0)
+ err = ret;
+ goto out;
+ }
+
+ memset(&next_key, 0, sizeof(next_key));
+
+ while (1) {
+ leaf = NULL;
+ replaced = 0;
+ trans = btrfs_start_transaction(root, 1);
+ max_level = level;
+
+ ret = walk_down_reloc_tree(reloc_root, path, &level);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret > 0)
+ break;
+
+ if (!find_next_key(path, level, &key) &&
+ btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
+ ret = 0;
+ } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
+ ret = replace_path(trans, root, reloc_root,
+ path, &next_key, &leaf,
+ level, max_level);
+ } else {
+ ret = replace_path(trans, root, reloc_root,
+ path, &next_key, NULL,
+ level, max_level);
+ }
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+
+ if (ret > 0) {
+ level = ret;
+ btrfs_node_key_to_cpu(path->nodes[level], &key,
+ path->slots[level]);
+ replaced = 1;
+ } else if (leaf) {
+ /*
+ * no block got replaced, try replacing file extents
+ */
+ btrfs_item_key_to_cpu(leaf, &key, 0);
+ ret = replace_file_extents(trans, rc, root, leaf,
+ &inode_list);
+ btrfs_tree_unlock(leaf);
+ free_extent_buffer(leaf);
+ BUG_ON(ret < 0);
+ }
+
+ ret = walk_up_reloc_tree(reloc_root, path, &level);
+ if (ret > 0)
+ break;
+
+ BUG_ON(level == 0);
+ /*
+ * save the merging progress in the drop_progress.
+ * this is OK since root refs == 1 in this case.
+ */
+ btrfs_node_key(path->nodes[level], &root_item->drop_progress,
+ path->slots[level]);
+ root_item->drop_level = level;
+
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+
+ btrfs_btree_balance_dirty(root, nr);
+
+ /*
+ * put inodes outside transaction, otherwise we may deadlock.
+ */
+ put_inodes(&inode_list);
+
+ if (replaced && rc->stage == UPDATE_DATA_PTRS)
+ invalidate_extent_cache(root, &key, &next_key);
+ }
+
+ /*
+ * handle the case only one block in the fs tree need to be
+ * relocated and the block is tree root.
+ */
+ leaf = btrfs_lock_root_node(root);
+ ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
+ btrfs_tree_unlock(leaf);
+ free_extent_buffer(leaf);
+ if (ret < 0)
+ err = ret;
+out:
+ btrfs_free_path(path);
+
+ if (err == 0) {
+ memset(&root_item->drop_progress, 0,
+ sizeof(root_item->drop_progress));
+ root_item->drop_level = 0;
+ btrfs_set_root_refs(root_item, 0);
+ }
+
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+
+ btrfs_btree_balance_dirty(root, nr);
+
+ put_inodes(&inode_list);
+
+ if (replaced && rc->stage == UPDATE_DATA_PTRS)
+ invalidate_extent_cache(root, &key, &next_key);
+
+ return err;
+}
+
+/*
+ * callback for the work threads.
+ * this function merges reloc tree with corresponding fs tree,
+ * and then drops the reloc tree.
+ */
+static void merge_func(struct btrfs_work *work)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root;
+ struct btrfs_root *reloc_root;
+ struct async_merge *async;
+
+ async = container_of(work, struct async_merge, work);
+ reloc_root = async->root;
+
+ if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+ root = read_fs_root(reloc_root->fs_info,
+ reloc_root->root_key.offset);
+ BUG_ON(IS_ERR(root));
+ BUG_ON(root->reloc_root != reloc_root);
+
+ merge_reloc_root(async->rc, root);
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_update_reloc_root(trans, root);
+ btrfs_end_transaction(trans, root);
+ }
+
+ btrfs_drop_snapshot(reloc_root, 0);
+
+ if (atomic_dec_and_test(async->num_pending))
+ complete(async->done);
+
+ kfree(async);
+}
+
+static int merge_reloc_roots(struct reloc_control *rc)
+{
+ struct async_merge *async;
+ struct btrfs_root *root;
+ struct completion done;
+ atomic_t num_pending;
+
+ init_completion(&done);
+ atomic_set(&num_pending, 1);
+
+ while (!list_empty(&rc->reloc_roots)) {
+ root = list_entry(rc->reloc_roots.next,
+ struct btrfs_root, root_list);
+ list_del_init(&root->root_list);
+
+ async = kmalloc(sizeof(*async), GFP_NOFS);
+ BUG_ON(!async);
+ async->work.func = merge_func;
+ async->work.flags = 0;
+ async->rc = rc;
+ async->root = root;
+ async->done = &done;
+ async->num_pending = &num_pending;
+ atomic_inc(&num_pending);
+ btrfs_queue_worker(&rc->workers, &async->work);
+ }
+
+ if (!atomic_dec_and_test(&num_pending))
+ wait_for_completion(&done);
+
+ BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
+ return 0;
+}
+
+static void free_block_list(struct rb_root *blocks)
+{
+ struct tree_block *block;
+ struct rb_node *rb_node;
+ while ((rb_node = rb_first(blocks))) {
+ block = rb_entry(rb_node, struct tree_block, rb_node);
+ rb_erase(rb_node, blocks);
+ kfree(block);
+ }
+}
+
+static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *reloc_root)
+{
+ struct btrfs_root *root;
+
+ if (reloc_root->last_trans == trans->transid)
+ return 0;
+
+ root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
+ BUG_ON(IS_ERR(root));
+ BUG_ON(root->reloc_root != reloc_root);
+
+ return btrfs_record_root_in_trans(trans, root);
+}
+
+/*
+ * select one tree from trees that references the block.
+ * for blocks in refernce counted trees, we preper reloc tree.
+ * if no reloc tree found and reloc_only is true, NULL is returned.
+ */
+static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
+ struct backref_node *node,
+ struct backref_edge *edges[],
+ int *nr, int reloc_only)
+{
+ struct backref_node *next;
+ struct btrfs_root *root;
+ int index;
+ int loop = 0;
+again:
+ index = 0;
+ next = node;
+ while (1) {
+ cond_resched();
+ next = walk_up_backref(next, edges, &index);
+ root = next->root;
+ if (!root) {
+ BUG_ON(!node->old_root);
+ goto skip;
+ }
+
+ /* no other choice for non-refernce counted tree */
+ if (!root->ref_cows) {
+ BUG_ON(reloc_only);
+ break;
+ }
+
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ record_reloc_root_in_trans(trans, root);
+ break;
+ }
+
+ if (loop) {
+ btrfs_record_root_in_trans(trans, root);
+ break;
+ }
+
+ if (reloc_only || next != node) {
+ if (!root->reloc_root)
+ btrfs_record_root_in_trans(trans, root);
+ root = root->reloc_root;
+ /*
+ * if the reloc tree was created in current
+ * transation, there is no node in backref tree
+ * corresponds to the root of the reloc tree.
+ */
+ if (btrfs_root_last_snapshot(&root->root_item) ==
+ trans->transid - 1)
+ break;
+ }
+skip:
+ root = NULL;
+ next = walk_down_backref(edges, &index);
+ if (!next || next->level <= node->level)
+ break;
+ }
+
+ if (!root && !loop && !reloc_only) {
+ loop = 1;
+ goto again;
+ }
+
+ if (root)
+ *nr = index;
+ else
+ *nr = 0;
+
+ return root;
+}
+
+static noinline_for_stack
+struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
+ struct backref_node *node)
+{
+ struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ int nr;
+ return __select_one_root(trans, node, edges, &nr, 0);
+}
+
+static noinline_for_stack
+struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
+ struct backref_node *node,
+ struct backref_edge *edges[], int *nr)
+{
+ return __select_one_root(trans, node, edges, nr, 1);
+}
+
+static void grab_path_buffers(struct btrfs_path *path,
+ struct backref_node *node,
+ struct backref_edge *edges[], int nr)
+{
+ int i = 0;
+ while (1) {
+ drop_node_buffer(node);
+ node->eb = path->nodes[node->level];
+ BUG_ON(!node->eb);
+ if (path->locks[node->level])
+ node->locked = 1;
+ path->nodes[node->level] = NULL;
+ path->locks[node->level] = 0;
+
+ if (i >= nr)
+ break;
+
+ edges[i]->blockptr = node->eb->start;
+ node = edges[i]->node[UPPER];
+ i++;
+ }
+}
+
+/*
+ * relocate a block tree, and then update pointers in upper level
+ * blocks that reference the block to point to the new location.
+ *
+ * if called by link_to_upper, the block has already been relocated.
+ * in that case this function just updates pointers.
+ */
+static int do_relocation(struct btrfs_trans_handle *trans,
+ struct backref_node *node,
+ struct btrfs_key *key,
+ struct btrfs_path *path, int lowest)
+{
+ struct backref_node *upper;
+ struct backref_edge *edge;
+ struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ struct btrfs_root *root;
+ struct extent_buffer *eb;
+ u32 blocksize;
+ u64 bytenr;
+ u64 generation;
+ int nr;
+ int slot;
+ int ret;
+ int err = 0;
+
+ BUG_ON(lowest && node->eb);
+
+ path->lowest_level = node->level + 1;
+ list_for_each_entry(edge, &node->upper, list[LOWER]) {
+ cond_resched();
+ if (node->eb && node->eb->start == edge->blockptr)
+ continue;
+
+ upper = edge->node[UPPER];
+ root = select_reloc_root(trans, upper, edges, &nr);
+ if (!root)
+ continue;
+
+ if (upper->eb && !upper->locked)
+ drop_node_buffer(upper);
+
+ if (!upper->eb) {
+ ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+ BUG_ON(ret > 0);
+
+ slot = path->slots[upper->level];
+
+ btrfs_unlock_up_safe(path, upper->level + 1);
+ grab_path_buffers(path, upper, edges, nr);
+
+ btrfs_release_path(NULL, path);
+ } else {
+ ret = btrfs_bin_search(upper->eb, key, upper->level,
+ &slot);
+ BUG_ON(ret);
+ }
+
+ bytenr = btrfs_node_blockptr(upper->eb, slot);
+ if (!lowest) {
+ if (node->eb->start == bytenr) {
+ btrfs_tree_unlock(upper->eb);
+ upper->locked = 0;
+ continue;
+ }
+ } else {
+ BUG_ON(node->bytenr != bytenr);
+ }
+
+ blocksize = btrfs_level_size(root, node->level);
+ generation = btrfs_node_ptr_generation(upper->eb, slot);
+ eb = read_tree_block(root, bytenr, blocksize, generation);
+ btrfs_tree_lock(eb);
+ btrfs_set_lock_blocking(eb);
+
+ if (!node->eb) {
+ ret = btrfs_cow_block(trans, root, eb, upper->eb,
+ slot, &eb);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+ btrfs_set_lock_blocking(eb);
+ node->eb = eb;
+ node->locked = 1;
+ } else {
+ btrfs_set_node_blockptr(upper->eb, slot,
+ node->eb->start);
+ btrfs_set_node_ptr_generation(upper->eb, slot,
+ trans->transid);
+ btrfs_mark_buffer_dirty(upper->eb);
+
+ ret = btrfs_inc_extent_ref(trans, root,
+ node->eb->start, blocksize,
+ upper->eb->start,
+ btrfs_header_owner(upper->eb),
+ node->level, 0);
+ BUG_ON(ret);
+
+ ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
+ BUG_ON(ret);
+ }
+ if (!lowest) {
+ btrfs_tree_unlock(upper->eb);
+ upper->locked = 0;
+ }
+ }
+ path->lowest_level = 0;
+ return err;
+}
+
+static int link_to_upper(struct btrfs_trans_handle *trans,
+ struct backref_node *node,
+ struct btrfs_path *path)
+{
+ struct btrfs_key key;
+ if (!node->eb || list_empty(&node->upper))
+ return 0;
+
+ btrfs_node_key_to_cpu(node->eb, &key, 0);
+ return do_relocation(trans, node, &key, path, 0);
+}
+
+static int finish_pending_nodes(struct btrfs_trans_handle *trans,
+ struct backref_cache *cache,
+ struct btrfs_path *path)
+{
+ struct backref_node *node;
+ int level;
+ int ret;
+ int err = 0;
+
+ for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+ while (!list_empty(&cache->pending[level])) {
+ node = list_entry(cache->pending[level].next,
+ struct backref_node, lower);
+ BUG_ON(node->level != level);
+
+ ret = link_to_upper(trans, node, path);
+ if (ret < 0)
+ err = ret;
+ /*
+ * this remove the node from the pending list and
+ * may add some other nodes to the level + 1
+ * pending list
+ */
+ remove_backref_node(cache, node);
+ }
+ }
+ BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
+ return err;
+}
+
+static void mark_block_processed(struct reloc_control *rc,
+ struct backref_node *node)
+{
+ u32 blocksize;
+ if (node->level == 0 ||
+ in_block_group(node->bytenr, rc->block_group)) {
+ blocksize = btrfs_level_size(rc->extent_root, node->level);
+ set_extent_bits(&rc->processed_blocks, node->bytenr,
+ node->bytenr + blocksize - 1, EXTENT_DIRTY,
+ GFP_NOFS);
+ }
+ node->processed = 1;
+}
+
+/*
+ * mark a block and all blocks directly/indirectly reference the block
+ * as processed.
+ */
+static void update_processed_blocks(struct reloc_control *rc,
+ struct backref_node *node)
+{
+ struct backref_node *next = node;
+ struct backref_edge *edge;
+ struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ int index = 0;
+
+ while (next) {
+ cond_resched();
+ while (1) {
+ if (next->processed)
+ break;
+
+ mark_block_processed(rc, next);
+
+ if (list_empty(&next->upper))
+ break;
+
+ edge = list_entry(next->upper.next,
+ struct backref_edge, list[LOWER]);
+ edges[index++] = edge;
+ next = edge->node[UPPER];
+ }
+ next = walk_down_backref(edges, &index);
+ }
+}
+
+static int tree_block_processed(u64 bytenr, u32 blocksize,
+ struct reloc_control *rc)
+{
+ if (test_range_bit(&rc->processed_blocks, bytenr,
+ bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
+ return 1;
+ return 0;
+}
+
+/*
+ * check if there are any file extent pointers in the leaf point to
+ * data require processing
+ */
+static int check_file_extents(struct reloc_control *rc,
+ u64 bytenr, u32 blocksize, u64 ptr_gen)
+{
+ struct btrfs_key found_key;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ int i;
+ int ret = 0;
+
+ leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
+
+ nritems = btrfs_header_nritems(leaf);
+ for (i = 0; i < nritems; i++) {
+ cond_resched();
+ btrfs_item_key_to_cpu(leaf, &found_key, i);
+ if (found_key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (bytenr == 0)
+ continue;
+ if (in_block_group(bytenr, rc->block_group)) {
+ ret = 1;
+ break;
+ }
+ }
+ free_extent_buffer(leaf);
+ return ret;
+}
+
+/*
+ * scan child blocks of a given block to find blocks require processing
+ */
+static int add_child_blocks(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct backref_node *node,
+ struct rb_root *blocks)
+{
+ struct tree_block *block;
+ struct rb_node *rb_node;
+ u64 bytenr;
+ u64 ptr_gen;
+ u32 blocksize;
+ u32 nritems;
+ int i;
+ int err = 0;
+
+ nritems = btrfs_header_nritems(node->eb);
+ blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
+ for (i = 0; i < nritems; i++) {
+ cond_resched();
+ bytenr = btrfs_node_blockptr(node->eb, i);
+ ptr_gen = btrfs_node_ptr_generation(node->eb, i);
+ if (ptr_gen == trans->transid)
+ continue;
+ if (!in_block_group(bytenr, rc->block_group) &&
+ (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
+ continue;
+ if (tree_block_processed(bytenr, blocksize, rc))
+ continue;
+
+ readahead_tree_block(rc->extent_root,
+ bytenr, blocksize, ptr_gen);
+ }
+
+ for (i = 0; i < nritems; i++) {
+ cond_resched();
+ bytenr = btrfs_node_blockptr(node->eb, i);
+ ptr_gen = btrfs_node_ptr_generation(node->eb, i);
+ if (ptr_gen == trans->transid)
+ continue;
+ if (!in_block_group(bytenr, rc->block_group) &&
+ (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
+ continue;
+ if (tree_block_processed(bytenr, blocksize, rc))
+ continue;
+ if (!in_block_group(bytenr, rc->block_group) &&
+ !check_file_extents(rc, bytenr, blocksize, ptr_gen))
+ continue;
+
+ block = kmalloc(sizeof(*block), GFP_NOFS);
+ if (!block) {
+ err = -ENOMEM;
+ break;
+ }
+ block->bytenr = bytenr;
+ btrfs_node_key_to_cpu(node->eb, &block->key, i);
+ block->level = node->level - 1;
+ block->key_ready = 1;
+ rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
+ BUG_ON(rb_node);
+ }
+ if (err)
+ free_block_list(blocks);
+ return err;
+}
+
+/*
+ * find adjacent blocks require processing
+ */
+static noinline_for_stack
+int add_adjacent_blocks(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct backref_cache *cache,
+ struct rb_root *blocks, int level,
+ struct backref_node **upper)
+{
+ struct backref_node *node;
+ int ret = 0;
+
+ WARN_ON(!list_empty(&cache->pending[level]));
+
+ if (list_empty(&cache->pending[level + 1]))
+ return 1;
+
+ node = list_entry(cache->pending[level + 1].next,
+ struct backref_node, lower);
+ if (node->eb)
+ ret = add_child_blocks(trans, rc, node, blocks);
+
+ *upper = node;
+ return ret;
+}
+
+static int get_tree_block_key(struct reloc_control *rc,
+ struct tree_block *block)
+{
+ struct extent_buffer *eb;
+
+ BUG_ON(block->key_ready);
+ eb = read_tree_block(rc->extent_root, block->bytenr,
+ block->key.objectid, block->key.offset);
+ WARN_ON(btrfs_header_level(eb) != block->level);
+ if (block->level == 0)
+ btrfs_item_key_to_cpu(eb, &block->key, 0);
+ else
+ btrfs_node_key_to_cpu(eb, &block->key, 0);
+ free_extent_buffer(eb);
+ block->key_ready = 1;
+ return 0;
+}
+
+static int reada_tree_block(struct reloc_control *rc,
+ struct tree_block *block)
+{
+ BUG_ON(block->key_ready);
+ readahead_tree_block(rc->extent_root, block->bytenr,
+ block->key.objectid, block->key.offset);
+ return 0;
+}
+
+/*
+ * helper function to relocate a tree block
+ */
+static int relocate_tree_block(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct backref_node *node,
+ struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root;
+ int ret;
+
+ root = select_one_root(trans, node);
+ if (unlikely(!root)) {
+ rc->found_old_snapshot = 1;
+ update_processed_blocks(rc, node);
+ return 0;
+ }
+
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ ret = do_relocation(trans, node, key, path, 1);
+ if (ret < 0)
+ goto out;
+ if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
+ ret = replace_file_extents(trans, rc, root,
+ node->eb, NULL);
+ if (ret < 0)
+ goto out;
+ }
+ drop_node_buffer(node);
+ } else if (!root->ref_cows) {
+ path->lowest_level = node->level;
+ ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+ btrfs_release_path(root, path);
+ if (ret < 0)
+ goto out;
+ } else if (root != node->root) {
+ WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
+ }
+
+ update_processed_blocks(rc, node);
+ ret = 0;
+out:
+ drop_node_buffer(node);
+ return ret;
+}
+
+/*
+ * relocate a list of blocks
+ */
+static noinline_for_stack
+int relocate_tree_blocks(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc, struct rb_root *blocks)
+{
+ struct backref_cache *cache;
+ struct backref_node *node;
+ struct btrfs_path *path;
+ struct tree_block *block;
+ struct rb_node *rb_node;
+ int level = -1;
+ int ret;
+ int err = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ cache = kmalloc(sizeof(*cache), GFP_NOFS);
+ if (!cache) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ backref_cache_init(cache);
+
+ rb_node = rb_first(blocks);
+ while (rb_node) {
+ block = rb_entry(rb_node, struct tree_block, rb_node);
+ if (level == -1)
+ level = block->level;
+ else
+ BUG_ON(level != block->level);
+ if (!block->key_ready)
+ reada_tree_block(rc, block);
+ rb_node = rb_next(rb_node);
+ }
+
+ rb_node = rb_first(blocks);
+ while (rb_node) {
+ block = rb_entry(rb_node, struct tree_block, rb_node);
+ if (!block->key_ready)
+ get_tree_block_key(rc, block);
+ rb_node = rb_next(rb_node);
+ }
+
+ rb_node = rb_first(blocks);
+ while (rb_node) {
+ block = rb_entry(rb_node, struct tree_block, rb_node);
+
+ node = build_backref_tree(rc, cache, &block->key,
+ block->level, block->bytenr);
+ if (IS_ERR(node)) {
+ err = PTR_ERR(node);
+ goto out;
+ }
+
+ ret = relocate_tree_block(trans, rc, node, &block->key,
+ path);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ remove_backref_node(cache, node);
+ rb_node = rb_next(rb_node);
+ }
+
+ if (level > 0)
+ goto out;
+
+ free_block_list(blocks);
+
+ /*
+ * now backrefs of some upper level tree blocks have been cached,
+ * try relocating blocks referenced by these upper level blocks.
+ */
+ while (1) {
+ struct backref_node *upper = NULL;
+ if (trans->transaction->in_commit ||
+ trans->transaction->delayed_refs.flushing)
+ break;
+
+ ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
+ &upper);
+ if (ret < 0)
+ err = ret;
+ if (ret != 0)
+ break;
+
+ rb_node = rb_first(blocks);
+ while (rb_node) {
+ block = rb_entry(rb_node, struct tree_block, rb_node);
+ if (trans->transaction->in_commit ||
+ trans->transaction->delayed_refs.flushing)
+ goto out;
+ BUG_ON(!block->key_ready);
+ node = build_backref_tree(rc, cache, &block->key,
+ level, block->bytenr);
+ if (IS_ERR(node)) {
+ err = PTR_ERR(node);
+ goto out;
+ }
+
+ ret = relocate_tree_block(trans, rc, node,
+ &block->key, path);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ remove_backref_node(cache, node);
+ rb_node = rb_next(rb_node);
+ }
+ free_block_list(blocks);
+
+ if (upper) {
+ ret = link_to_upper(trans, upper, path);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+ remove_backref_node(cache, upper);
+ }
+ }
+out:
+ free_block_list(blocks);
+
+ ret = finish_pending_nodes(trans, cache, path);
+ if (ret < 0)
+ err = ret;
+
+ kfree(cache);
+ btrfs_free_path(path);
+ return err;
+}
+
+static noinline_for_stack
+int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
+ u64 block_start)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ int ret = 0;
+
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em)
+ return -ENOMEM;
+
+ em->start = start;
+ em->len = end + 1 - start;
+ em->block_len = em->len;
+ em->block_start = block_start;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(inode, start, end, 0);
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ return ret;
+}
+
+static int relocate_file_extent_cluster(struct inode *inode,
+ struct file_extent_cluster *cluster)
+{
+ u64 page_start;
+ u64 page_end;
+ u64 offset = BTRFS_I(inode)->index_cnt;
+ unsigned long index;
+ unsigned long last_index;
+ unsigned int dirty_page = 0;
+ struct page *page;
+ struct file_ra_state *ra;
+ int nr = 0;
+ int ret = 0;
+
+ if (!cluster->nr)
+ return 0;
+
+ ra = kzalloc(sizeof(*ra), GFP_NOFS);
+ if (!ra)
+ return -ENOMEM;
+
+ index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
+ last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+
+ mutex_lock(&inode->i_mutex);
+
+ i_size_write(inode, cluster->end + 1 - offset);
+ ret = setup_extent_mapping(inode, cluster->start - offset,
+ cluster->end - offset, cluster->start);
+ if (ret)
+ goto out_unlock;
+
+ file_ra_state_init(ra, inode->i_mapping);
+
+ WARN_ON(cluster->start != cluster->boundary[0]);
+ while (index <= last_index) {
+ page = find_lock_page(inode->i_mapping, index);
+ if (!page) {
+ page_cache_sync_readahead(inode->i_mapping,
+ ra, NULL, index,
+ last_index + 1 - index);
+ page = grab_cache_page(inode->i_mapping, index);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+ }
+
+ if (PageReadahead(page)) {
+ page_cache_async_readahead(inode->i_mapping,
+ ra, NULL, page, index,
+ last_index + 1 - index);
+ }
+
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ ret = -EIO;
+ goto out_unlock;
+ }
+ }
+
+ page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+ lock_extent(&BTRFS_I(inode)->io_tree,
+ page_start, page_end, GFP_NOFS);
+
+ set_page_extent_mapped(page);
+
+ if (nr < cluster->nr &&
+ page_start + offset == cluster->boundary[nr]) {
+ set_extent_bits(&BTRFS_I(inode)->io_tree,
+ page_start, page_end,
+ EXTENT_BOUNDARY, GFP_NOFS);
+ nr++;
+ }
+ btrfs_set_extent_delalloc(inode, page_start, page_end);
+
+ set_page_dirty(page);
+ dirty_page++;
+
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ page_start, page_end, GFP_NOFS);
+ unlock_page(page);
+ page_cache_release(page);
+
+ index++;
+ if (nr < cluster->nr &&
+ page_end + 1 + offset == cluster->boundary[nr]) {
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+ dirty_page);
+ dirty_page = 0;
+ }
+ }
+ if (dirty_page) {
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+ dirty_page);
+ }
+ WARN_ON(nr != cluster->nr);
+out_unlock:
+ mutex_unlock(&inode->i_mutex);
+ kfree(ra);
+ return ret;
+}
+
+static noinline_for_stack
+int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
+ struct file_extent_cluster *cluster)
+{
+ int ret;
+
+ if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
+ ret = relocate_file_extent_cluster(inode, cluster);
+ if (ret)
+ return ret;
+ cluster->nr = 0;
+ }
+
+ if (!cluster->nr)
+ cluster->start = extent_key->objectid;
+ else
+ BUG_ON(cluster->nr >= MAX_EXTENTS);
+ cluster->end = extent_key->objectid + extent_key->offset - 1;
+ cluster->boundary[cluster->nr] = extent_key->objectid;
+ cluster->nr++;
+
+ if (cluster->nr >= MAX_EXTENTS) {
+ ret = relocate_file_extent_cluster(inode, cluster);
+ if (ret)
+ return ret;
+ cluster->nr = 0;
+ }
+ return 0;
+}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static int get_ref_objectid_v0(struct reloc_control *rc,
+ struct btrfs_path *path,
+ struct btrfs_key *extent_key,
+ u64 *ref_objectid, int *path_change)
+{
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_ref_v0 *ref0;
+ int ret;
+ int slot;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ while (1) {
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(rc->extent_root, path);
+ if (ret < 0)
+ return ret;
+ BUG_ON(ret > 0);
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (path_change)
+ *path_change = 1;
+ }
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != extent_key->objectid)
+ return -ENOENT;
+
+ if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
+ slot++;
+ continue;
+ }
+ ref0 = btrfs_item_ptr(leaf, slot,
+ struct btrfs_extent_ref_v0);
+ *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
+ break;
+ }
+ return 0;
+}
+#endif
+
+/*
+ * helper to add a tree block to the list.
+ * the major work is getting the generation and level of the block
+ */
+static int add_tree_block(struct reloc_control *rc,
+ struct btrfs_key *extent_key,
+ struct btrfs_path *path,
+ struct rb_root *blocks)
+{
+ struct extent_buffer *eb;
+ struct btrfs_extent_item *ei;
+ struct btrfs_tree_block_info *bi;
+ struct tree_block *block;
+ struct rb_node *rb_node;
+ u32 item_size;
+ int level = -1;
+ int generation;
+
+ eb = path->nodes[0];
+ item_size = btrfs_item_size_nr(eb, path->slots[0]);
+
+ if (item_size >= sizeof(*ei) + sizeof(*bi)) {
+ ei = btrfs_item_ptr(eb, path->slots[0],
+ struct btrfs_extent_item);
+ bi = (struct btrfs_tree_block_info *)(ei + 1);
+ generation = btrfs_extent_generation(eb, ei);
+ level = btrfs_tree_block_level(eb, bi);
+ } else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ u64 ref_owner;
+ int ret;
+
+ BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
+ ret = get_ref_objectid_v0(rc, path, extent_key,
+ &ref_owner, NULL);
+ BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
+ level = (int)ref_owner;
+ /* FIXME: get real generation */
+ generation = 0;
+#else
+ BUG();
+#endif
+ }
+
+ btrfs_release_path(rc->extent_root, path);
+
+ BUG_ON(level == -1);
+
+ block = kmalloc(sizeof(*block), GFP_NOFS);
+ if (!block)
+ return -ENOMEM;
+
+ block->bytenr = extent_key->objectid;
+ block->key.objectid = extent_key->offset;
+ block->key.offset = generation;
+ block->level = level;
+ block->key_ready = 0;
+
+ rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
+ BUG_ON(rb_node);
+
+ return 0;
+}
+
+/*
+ * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
+ */
+static int __add_tree_block(struct reloc_control *rc,
+ u64 bytenr, u32 blocksize,
+ struct rb_root *blocks)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+
+ if (tree_block_processed(bytenr, blocksize, rc))
+ return 0;
+
+ if (tree_search(blocks, bytenr))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = blocksize;
+
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret);
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ ret = add_tree_block(rc, &key, path, blocks);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper to check if the block use full backrefs for pointers in it
+ */
+static int block_use_full_backref(struct reloc_control *rc,
+ struct extent_buffer *eb)
+{
+ struct btrfs_path *path;
+ struct btrfs_extent_item *ei;
+ struct btrfs_key key;
+ u64 flags;
+ int ret;
+
+ if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
+ btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
+ return 1;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ key.objectid = eb->start;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = eb->len;
+
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ ret = btrfs_search_slot(NULL, rc->extent_root,
+ &key, path, 0, 0);
+ BUG_ON(ret);
+
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+ flags = btrfs_extent_flags(path->nodes[0], ei);
+ BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
+ if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ ret = 1;
+ else
+ ret = 0;
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
+ * this function scans fs tree to find blocks reference the data extent
+ */
+static int find_data_references(struct reloc_control *rc,
+ struct btrfs_key *extent_key,
+ struct extent_buffer *leaf,
+ struct btrfs_extent_data_ref *ref,
+ struct rb_root *blocks)
+{
+ struct btrfs_path *path;
+ struct tree_block *block;
+ struct btrfs_root *root;
+ struct btrfs_file_extent_item *fi;
+ struct rb_node *rb_node;
+ struct btrfs_key key;
+ u64 ref_root;
+ u64 ref_objectid;
+ u64 ref_offset;
+ u32 ref_count;
+ u32 nritems;
+ int err = 0;
+ int added = 0;
+ int counted;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ref_root = btrfs_extent_data_ref_root(leaf, ref);
+ ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
+ ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
+ ref_count = btrfs_extent_data_ref_count(leaf, ref);
+
+ root = read_fs_root(rc->extent_root->fs_info, ref_root);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto out;
+ }
+
+ key.objectid = ref_objectid;
+ key.offset = ref_offset;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ /*
+ * the references in tree blocks that use full backrefs
+ * are not counted in
+ */
+ if (block_use_full_backref(rc, leaf))
+ counted = 0;
+ else
+ counted = 1;
+ rb_node = tree_search(blocks, leaf->start);
+ if (rb_node) {
+ if (counted)
+ added = 1;
+ else
+ path->slots[0] = nritems;
+ }
+
+ while (ref_count > 0) {
+ while (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret > 0) {
+ WARN_ON(1);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ added = 0;
+
+ if (block_use_full_backref(rc, leaf))
+ counted = 0;
+ else
+ counted = 1;
+ rb_node = tree_search(blocks, leaf->start);
+ if (rb_node) {
+ if (counted)
+ added = 1;
+ else
+ path->slots[0] = nritems;
+ }
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ref_objectid ||
+ key.type != BTRFS_EXTENT_DATA_KEY) {
+ WARN_ON(1);
+ break;
+ }
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ goto next;
+
+ if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
+ extent_key->objectid)
+ goto next;
+
+ key.offset -= btrfs_file_extent_offset(leaf, fi);
+ if (key.offset != ref_offset)
+ goto next;
+
+ if (counted)
+ ref_count--;
+ if (added)
+ goto next;
+
+ if (!tree_block_processed(leaf->start, leaf->len, rc)) {
+ block = kmalloc(sizeof(*block), GFP_NOFS);
+ if (!block) {
+ err = -ENOMEM;
+ break;
+ }
+ block->bytenr = leaf->start;
+ btrfs_item_key_to_cpu(leaf, &block->key, 0);
+ block->level = 0;
+ block->key_ready = 1;
+ rb_node = tree_insert(blocks, block->bytenr,
+ &block->rb_node);
+ BUG_ON(rb_node);
+ }
+ if (counted)
+ added = 1;
+ else
+ path->slots[0] = nritems;
+next:
+ path->slots[0]++;
+
+ }
+out:
+ btrfs_free_path(path);
+ return err;
+}
+
+/*
+ * hepler to find all tree blocks that reference a given data extent
+ */
+static noinline_for_stack
+int add_data_references(struct reloc_control *rc,
+ struct btrfs_key *extent_key,
+ struct btrfs_path *path,
+ struct rb_root *blocks)
+{
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ struct btrfs_extent_data_ref *dref;
+ struct btrfs_extent_inline_ref *iref;
+ unsigned long ptr;
+ unsigned long end;
+ u32 blocksize;
+ int ret;
+ int err = 0;
+
+ ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
+ extent_key->offset);
+ BUG_ON(ret < 0);
+ if (ret > 0) {
+ /* the relocated data is fragmented */
+ rc->extents_skipped++;
+ btrfs_release_path(rc->extent_root, path);
+ return 0;
+ }
+
+ blocksize = btrfs_level_size(rc->extent_root, 0);
+
+ eb = path->nodes[0];
+ ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
+ end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
+ ptr = end;
+ else
+#endif
+ ptr += sizeof(struct btrfs_extent_item);
+
+ while (ptr < end) {
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ key.type = btrfs_extent_inline_ref_type(eb, iref);
+ if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+ key.offset = btrfs_extent_inline_ref_offset(eb, iref);
+ ret = __add_tree_block(rc, key.offset, blocksize,
+ blocks);
+ } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ ret = find_data_references(rc, extent_key,
+ eb, dref, blocks);
+ } else {
+ BUG();
+ }
+ ptr += btrfs_extent_inline_ref_size(key.type);
+ }
+ WARN_ON(ptr > end);
+
+ while (1) {
+ cond_resched();
+ eb = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(rc->extent_root, path);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+ if (ret > 0)
+ break;
+ eb = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
+ if (key.objectid != extent_key->objectid)
+ break;
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
+ key.type == BTRFS_EXTENT_REF_V0_KEY) {
+#else
+ BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
+ if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+#endif
+ ret = __add_tree_block(rc, key.offset, blocksize,
+ blocks);
+ } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+ dref = btrfs_item_ptr(eb, path->slots[0],
+ struct btrfs_extent_data_ref);
+ ret = find_data_references(rc, extent_key,
+ eb, dref, blocks);
+ } else {
+ ret = 0;
+ }
+ if (ret) {
+ err = ret;
+ break;
+ }
+ path->slots[0]++;
+ }
+ btrfs_release_path(rc->extent_root, path);
+ if (err)
+ free_block_list(blocks);
+ return err;
+}
+
+/*
+ * hepler to find next unprocessed extent
+ */
+static noinline_for_stack
+int find_next_extent(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc, struct btrfs_path *path)
+{
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ u64 start, end, last;
+ int ret;
+
+ last = rc->block_group->key.objectid + rc->block_group->key.offset;
+ while (1) {
+ cond_resched();
+ if (rc->search_start >= last) {
+ ret = 1;
+ break;
+ }
+
+ key.objectid = rc->search_start;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = 0;
+
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
+ 0, 0);
+ if (ret < 0)
+ break;
+next:
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(rc->extent_root, path);
+ if (ret != 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid >= last) {
+ ret = 1;
+ break;
+ }
+
+ if (key.type != BTRFS_EXTENT_ITEM_KEY ||
+ key.objectid + key.offset <= rc->search_start) {
+ path->slots[0]++;
+ goto next;
+ }
+
+ ret = find_first_extent_bit(&rc->processed_blocks,
+ key.objectid, &start, &end,
+ EXTENT_DIRTY);
+
+ if (ret == 0 && start <= key.objectid) {
+ btrfs_release_path(rc->extent_root, path);
+ rc->search_start = end + 1;
+ } else {
+ rc->search_start = key.objectid + key.offset;
+ return 0;
+ }
+ }
+ btrfs_release_path(rc->extent_root, path);
+ return ret;
+}
+
+static void set_reloc_control(struct reloc_control *rc)
+{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+ mutex_lock(&fs_info->trans_mutex);
+ fs_info->reloc_ctl = rc;
+ mutex_unlock(&fs_info->trans_mutex);
+}
+
+static void unset_reloc_control(struct reloc_control *rc)
+{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+ mutex_lock(&fs_info->trans_mutex);
+ fs_info->reloc_ctl = NULL;
+ mutex_unlock(&fs_info->trans_mutex);
+}
+
+static int check_extent_flags(u64 flags)
+{
+ if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
+ (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
+ return 1;
+ if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
+ !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
+ return 1;
+ if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
+ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ return 1;
+ return 0;
+}
+
+
+static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
+{
+ struct rb_root blocks = RB_ROOT;
+ struct btrfs_key key;
+ struct file_extent_cluster *cluster;
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_path *path;
+ struct btrfs_extent_item *ei;
+ unsigned long nr;
+ u64 flags;
+ u32 item_size;
+ int ret;
+ int err = 0;
+
+ cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
+ if (!cluster)
+ return -ENOMEM;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ kfree(cluster);
+ return -ENOMEM;
+ }
+
+ rc->extents_found = 0;
+ rc->extents_skipped = 0;
+
+ rc->search_start = rc->block_group->key.objectid;
+ clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
+ GFP_NOFS);
+
+ rc->create_reloc_root = 1;
+ set_reloc_control(rc);
+
+ trans = btrfs_start_transaction(rc->extent_root, 1);
+ btrfs_commit_transaction(trans, rc->extent_root);
+
+ while (1) {
+ trans = btrfs_start_transaction(rc->extent_root, 1);
+
+ ret = find_next_extent(trans, rc, path);
+ if (ret < 0)
+ err = ret;
+ if (ret != 0)
+ break;
+
+ rc->extents_found++;
+
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ item_size = btrfs_item_size_nr(path->nodes[0],
+ path->slots[0]);
+ if (item_size >= sizeof(*ei)) {
+ flags = btrfs_extent_flags(path->nodes[0], ei);
+ ret = check_extent_flags(flags);
+ BUG_ON(ret);
+
+ } else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ u64 ref_owner;
+ int path_change = 0;
+
+ BUG_ON(item_size !=
+ sizeof(struct btrfs_extent_item_v0));
+ ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
+ &path_change);
+ if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
+ flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+ else
+ flags = BTRFS_EXTENT_FLAG_DATA;
+
+ if (path_change) {
+ btrfs_release_path(rc->extent_root, path);
+
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ ret = btrfs_search_slot(NULL, rc->extent_root,
+ &key, path, 0, 0);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+ BUG_ON(ret > 0);
+ }
+#else
+ BUG();
+#endif
+ }
+
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ ret = add_tree_block(rc, &key, path, &blocks);
+ } else if (rc->stage == UPDATE_DATA_PTRS &&
+ (flags & BTRFS_EXTENT_FLAG_DATA)) {
+ ret = add_data_references(rc, &key, path, &blocks);
+ } else {
+ btrfs_release_path(rc->extent_root, path);
+ ret = 0;
+ }
+ if (ret < 0) {
+ err = 0;
+ break;
+ }
+
+ if (!RB_EMPTY_ROOT(&blocks)) {
+ ret = relocate_tree_blocks(trans, rc, &blocks);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+ }
+
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, rc->extent_root);
+ trans = NULL;
+ btrfs_btree_balance_dirty(rc->extent_root, nr);
+
+ if (rc->stage == MOVE_DATA_EXTENTS &&
+ (flags & BTRFS_EXTENT_FLAG_DATA)) {
+ rc->found_file_extent = 1;
+ ret = relocate_data_extent(rc->data_inode,
+ &key, cluster);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+ }
+ }
+ btrfs_free_path(path);
+
+ if (trans) {
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, rc->extent_root);
+ btrfs_btree_balance_dirty(rc->extent_root, nr);
+ }
+
+ if (!err) {
+ ret = relocate_file_extent_cluster(rc->data_inode, cluster);
+ if (ret < 0)
+ err = ret;
+ }
+
+ kfree(cluster);
+
+ rc->create_reloc_root = 0;
+ smp_mb();
+
+ if (rc->extents_found > 0) {
+ trans = btrfs_start_transaction(rc->extent_root, 1);
+ btrfs_commit_transaction(trans, rc->extent_root);
+ }
+
+ merge_reloc_roots(rc);
+
+ unset_reloc_control(rc);
+
+ /* get rid of pinned extents */
+ trans = btrfs_start_transaction(rc->extent_root, 1);
+ btrfs_commit_transaction(trans, rc->extent_root);
+
+ return err;
+}
+
+static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid)
+{
+ struct btrfs_path *path;
+ struct btrfs_inode_item *item;
+ struct extent_buffer *leaf;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_insert_empty_inode(trans, root, path, objectid);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
+ memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+ btrfs_set_inode_generation(leaf, item, 1);
+ btrfs_set_inode_size(leaf, item, 0);
+ btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
+ btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper to create inode for data relocation.
+ * the inode is in data relocation tree and its link count is 0
+ */
+static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *group)
+{
+ struct inode *inode = NULL;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ unsigned long nr;
+ u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
+ int err = 0;
+
+ root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
+ if (IS_ERR(root))
+ return ERR_CAST(root);
+
+ trans = btrfs_start_transaction(root, 1);
+ BUG_ON(!trans);
+
+ err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
+ if (err)
+ goto out;
+
+ err = __insert_orphan_inode(trans, root, objectid);
+ BUG_ON(err);
+
+ key.objectid = objectid;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ inode = btrfs_iget(root->fs_info->sb, &key, root);
+ BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
+ BTRFS_I(inode)->index_cnt = group->key.objectid;
+
+ err = btrfs_orphan_add(trans, inode);
+out:
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+
+ btrfs_btree_balance_dirty(root, nr);
+ if (err) {
+ if (inode)
+ iput(inode);
+ inode = ERR_PTR(err);
+ }
+ return inode;
+}
+
+/*
+ * function to relocate all extents in a block group.
+ */
+int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
+{
+ struct btrfs_fs_info *fs_info = extent_root->fs_info;
+ struct reloc_control *rc;
+ int ret;
+ int err = 0;
+
+ rc = kzalloc(sizeof(*rc), GFP_NOFS);
+ if (!rc)
+ return -ENOMEM;
+
+ mapping_tree_init(&rc->reloc_root_tree);
+ extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
+ INIT_LIST_HEAD(&rc->reloc_roots);
+
+ rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
+ BUG_ON(!rc->block_group);
+
+ btrfs_init_workers(&rc->workers, "relocate",
+ fs_info->thread_pool_size, NULL);
+
+ rc->extent_root = extent_root;
+ btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
+
+ rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
+ if (IS_ERR(rc->data_inode)) {
+ err = PTR_ERR(rc->data_inode);
+ rc->data_inode = NULL;
+ goto out;
+ }
+
+ printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
+ (unsigned long long)rc->block_group->key.objectid,
+ (unsigned long long)rc->block_group->flags);
+
+ btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
+ btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
+
+ while (1) {
+ rc->extents_found = 0;
+ rc->extents_skipped = 0;
+
+ mutex_lock(&fs_info->cleaner_mutex);
+
+ btrfs_clean_old_snapshots(fs_info->tree_root);
+ ret = relocate_block_group(rc);
+
+ mutex_unlock(&fs_info->cleaner_mutex);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+
+ if (rc->extents_found == 0)
+ break;
+
+ printk(KERN_INFO "btrfs: found %llu extents\n",
+ (unsigned long long)rc->extents_found);
+
+ if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
+ btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
+ invalidate_mapping_pages(rc->data_inode->i_mapping,
+ 0, -1);
+ rc->stage = UPDATE_DATA_PTRS;
+ } else if (rc->stage == UPDATE_DATA_PTRS &&
+ rc->extents_skipped >= rc->extents_found) {
+ iput(rc->data_inode);
+ rc->data_inode = create_reloc_inode(fs_info,
+ rc->block_group);
+ if (IS_ERR(rc->data_inode)) {
+ err = PTR_ERR(rc->data_inode);
+ rc->data_inode = NULL;
+ break;
+ }
+ rc->stage = MOVE_DATA_EXTENTS;
+ rc->found_file_extent = 0;
+ }
+ }
+
+ filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
+ rc->block_group->key.objectid,
+ rc->block_group->key.objectid +
+ rc->block_group->key.offset - 1);
+
+ WARN_ON(rc->block_group->pinned > 0);
+ WARN_ON(rc->block_group->reserved > 0);
+ WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
+out:
+ iput(rc->data_inode);
+ btrfs_stop_workers(&rc->workers);
+ btrfs_put_block_group(rc->block_group);
+ kfree(rc);
+ return err;
+}
+
+static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
+
+ memset(&root->root_item.drop_progress, 0,
+ sizeof(root->root_item.drop_progress));
+ root->root_item.drop_level = 0;
+ btrfs_set_root_refs(&root->root_item, 0);
+ ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ &root->root_key, &root->root_item);
+ BUG_ON(ret);
+
+ ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
+ BUG_ON(ret);
+ return 0;
+}
+
+/*
+ * recover relocation interrupted by system crash.
+ *
+ * this function resumes merging reloc trees with corresponding fs trees.
+ * this is important for keeping the sharing of tree blocks
+ */
+int btrfs_recover_relocation(struct btrfs_root *root)
+{
+ LIST_HEAD(reloc_roots);
+ struct btrfs_key key;
+ struct btrfs_root *fs_root;
+ struct btrfs_root *reloc_root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct reloc_control *rc = NULL;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ int err = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
+ path, 0, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(root->fs_info->tree_root, path);
+
+ if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
+ key.type != BTRFS_ROOT_ITEM_KEY)
+ break;
+
+ reloc_root = btrfs_read_fs_root_no_radix(root, &key);
+ if (IS_ERR(reloc_root)) {
+ err = PTR_ERR(reloc_root);
+ goto out;
+ }
+
+ list_add(&reloc_root->root_list, &reloc_roots);
+
+ if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+ fs_root = read_fs_root(root->fs_info,
+ reloc_root->root_key.offset);
+ if (IS_ERR(fs_root)) {
+ ret = PTR_ERR(fs_root);
+ if (ret != -ENOENT) {
+ err = ret;
+ goto out;
+ }
+ mark_garbage_root(reloc_root);
+ }
+ }
+
+ if (key.offset == 0)
+ break;
+
+ key.offset--;
+ }
+ btrfs_release_path(root->fs_info->tree_root, path);
+
+ if (list_empty(&reloc_roots))
+ goto out;
+
+ rc = kzalloc(sizeof(*rc), GFP_NOFS);
+ if (!rc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ mapping_tree_init(&rc->reloc_root_tree);
+ INIT_LIST_HEAD(&rc->reloc_roots);
+ btrfs_init_workers(&rc->workers, "relocate",
+ root->fs_info->thread_pool_size, NULL);
+ rc->extent_root = root->fs_info->extent_root;
+
+ set_reloc_control(rc);
+
+ while (!list_empty(&reloc_roots)) {
+ reloc_root = list_entry(reloc_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&reloc_root->root_list);
+
+ if (btrfs_root_refs(&reloc_root->root_item) == 0) {
+ list_add_tail(&reloc_root->root_list,
+ &rc->reloc_roots);
+ continue;
+ }
+
+ fs_root = read_fs_root(root->fs_info,
+ reloc_root->root_key.offset);
+ BUG_ON(IS_ERR(fs_root));
+
+ __add_reloc_root(reloc_root);
+ fs_root->reloc_root = reloc_root;
+ }
+
+ trans = btrfs_start_transaction(rc->extent_root, 1);
+ btrfs_commit_transaction(trans, rc->extent_root);
+
+ merge_reloc_roots(rc);
+
+ unset_reloc_control(rc);
+
+ trans = btrfs_start_transaction(rc->extent_root, 1);
+ btrfs_commit_transaction(trans, rc->extent_root);
+out:
+ if (rc) {
+ btrfs_stop_workers(&rc->workers);
+ kfree(rc);
+ }
+ while (!list_empty(&reloc_roots)) {
+ reloc_root = list_entry(reloc_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&reloc_root->root_list);
+ free_extent_buffer(reloc_root->node);
+ free_extent_buffer(reloc_root->commit_root);
+ kfree(reloc_root);
+ }
+ btrfs_free_path(path);
+
+ if (err == 0) {
+ /* cleanup orphan inode in data relocation tree */
+ fs_root = read_fs_root(root->fs_info,
+ BTRFS_DATA_RELOC_TREE_OBJECTID);
+ if (IS_ERR(fs_root))
+ err = PTR_ERR(fs_root);
+ else
+ btrfs_orphan_cleanup(fs_root);
+ }
+ return err;
+}
+
+/*
+ * helper to add ordered checksum for data relocation.
+ *
+ * cloning checksum properly handles the nodatasum extents.
+ * it also saves CPU time to re-calculate the checksum.
+ */
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
+{
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_sector_sum *sector_sum;
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ size_t offset;
+ int ret;
+ u64 disk_bytenr;
+ LIST_HEAD(list);
+
+ ordered = btrfs_lookup_ordered_extent(inode, file_pos);
+ BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
+
+ disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
+ ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
+ disk_bytenr + len - 1, &list);
+
+ while (!list_empty(&list)) {
+ sums = list_entry(list.next, struct btrfs_ordered_sum, list);
+ list_del_init(&sums->list);
+
+ sector_sum = sums->sums;
+ sums->bytenr = ordered->start;
+
+ offset = 0;
+ while (offset < sums->len) {
+ sector_sum->bytenr += ordered->start - disk_bytenr;
+ sector_sum++;
+ offset += root->sectorsize;
+ }
+
+ btrfs_add_ordered_sum(inode, ordered, sums);
+ }
+ btrfs_put_ordered_extent(ordered);
+ return 0;
+}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
new file mode 100644
index 00000000000..67fa2d29d66
--- /dev/null
+++ b/fs/btrfs/root-tree.c
@@ -0,0 +1,461 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+/*
+ * search forward for a root, starting with objectid 'search_start'
+ * if a root key is found, the objectid we find is filled into 'found_objectid'
+ * and 0 is returned. < 0 is returned on error, 1 if there is nothing
+ * left in the tree.
+ */
+int btrfs_search_root(struct btrfs_root *root, u64 search_start,
+ u64 *found_objectid)
+{
+ struct btrfs_path *path;
+ struct btrfs_key search_key;
+ int ret;
+
+ root = root->fs_info->tree_root;
+ search_key.objectid = search_start;
+ search_key.type = (u8)-1;
+ search_key.offset = (u64)-1;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+again:
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret == 0) {
+ ret = 1;
+ goto out;
+ }
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ goto out;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
+ if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
+ search_key.offset++;
+ btrfs_release_path(root, path);
+ goto again;
+ }
+ ret = 0;
+ *found_objectid = search_key.objectid;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * lookup the root with the highest offset for a given objectid. The key we do
+ * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
+ * on error.
+ */
+int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
+ struct btrfs_root_item *item, struct btrfs_key *key)
+{
+ struct btrfs_path *path;
+ struct btrfs_key search_key;
+ struct btrfs_key found_key;
+ struct extent_buffer *l;
+ int ret;
+ int slot;
+
+ search_key.objectid = objectid;
+ search_key.type = BTRFS_ROOT_ITEM_KEY;
+ search_key.offset = (u64)-1;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ BUG_ON(ret == 0);
+ if (path->slots[0] == 0) {
+ ret = 1;
+ goto out;
+ }
+ l = path->nodes[0];
+ slot = path->slots[0] - 1;
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+ if (found_key.objectid != objectid ||
+ found_key.type != BTRFS_ROOT_ITEM_KEY) {
+ ret = 1;
+ goto out;
+ }
+ if (item)
+ read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
+ sizeof(*item));
+ if (key)
+ memcpy(key, &found_key, sizeof(found_key));
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_set_root_node(struct btrfs_root_item *item,
+ struct extent_buffer *node)
+{
+ btrfs_set_root_bytenr(item, node->start);
+ btrfs_set_root_level(item, btrfs_header_level(node));
+ btrfs_set_root_generation(item, btrfs_header_generation(node));
+ return 0;
+}
+
+/*
+ * copy the data in 'item' into the btree
+ */
+int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, struct btrfs_root_item
+ *item)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *l;
+ int ret;
+ int slot;
+ unsigned long ptr;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret != 0) {
+ btrfs_print_leaf(root, path->nodes[0]);
+ printk(KERN_CRIT "unable to update root key %llu %u %llu\n",
+ (unsigned long long)key->objectid, key->type,
+ (unsigned long long)key->offset);
+ BUG_ON(1);
+ }
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ ptr = btrfs_item_ptr_offset(l, slot);
+ write_extent_buffer(l, item, ptr, sizeof(*item));
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, struct btrfs_root_item
+ *item)
+{
+ int ret;
+ ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
+ return ret;
+}
+
+/*
+ * at mount time we want to find all the old transaction snapshots that were in
+ * the process of being deleted if we crashed. This is any root item with an
+ * offset lower than the latest root. They need to be queued for deletion to
+ * finish what was happening when we crashed.
+ */
+int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
+{
+ struct btrfs_root *dead_root;
+ struct btrfs_item *item;
+ struct btrfs_root_item *ri;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ int ret;
+ u32 nritems;
+ struct extent_buffer *leaf;
+ int slot;
+
+ key.objectid = objectid;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.offset = 0;
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+again:
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto err;
+ while (1) {
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ slot = path->slots[0];
+ if (slot >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ break;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ slot = path->slots[0];
+ }
+ item = btrfs_item_nr(leaf, slot);
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
+ goto next;
+
+ if (key.objectid < objectid)
+ goto next;
+
+ if (key.objectid > objectid)
+ break;
+
+ ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
+ if (btrfs_disk_root_refs(leaf, ri) != 0)
+ goto next;
+
+ memcpy(&found_key, &key, sizeof(key));
+ key.offset++;
+ btrfs_release_path(root, path);
+ dead_root =
+ btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
+ &found_key);
+ if (IS_ERR(dead_root)) {
+ ret = PTR_ERR(dead_root);
+ goto err;
+ }
+
+ ret = btrfs_add_dead_root(dead_root);
+ if (ret)
+ goto err;
+ goto again;
+next:
+ slot++;
+ path->slots[0]++;
+ }
+ ret = 0;
+err:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int err = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = 0;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(tree_root, path);
+ if (ret < 0)
+ err = ret;
+ if (ret != 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(tree_root, path);
+
+ if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
+ key.type != BTRFS_ORPHAN_ITEM_KEY)
+ break;
+
+ ret = btrfs_find_dead_roots(tree_root, key.offset);
+ if (ret) {
+ err = ret;
+ break;
+ }
+
+ key.offset++;
+ }
+
+ btrfs_free_path(path);
+ return err;
+}
+
+/* drop the root item for 'key' from 'root' */
+int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_key *key)
+{
+ struct btrfs_path *path;
+ int ret;
+ u32 refs;
+ struct btrfs_root_item *ri;
+ struct extent_buffer *leaf;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_search_slot(trans, root, key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ BUG_ON(ret != 0);
+ leaf = path->nodes[0];
+ ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
+
+ refs = btrfs_disk_root_refs(leaf, ri);
+ BUG_ON(refs != 0);
+ ret = btrfs_del_item(trans, root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *tree_root,
+ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
+ const char *name, int name_len)
+
+{
+ struct btrfs_path *path;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ unsigned long ptr;
+ int err = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = root_id;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = ref_id;
+again:
+ ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+ BUG_ON(ret < 0);
+ if (ret == 0) {
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_root_ref);
+
+ WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
+ WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
+ ptr = (unsigned long)(ref + 1);
+ WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
+ *sequence = btrfs_root_ref_sequence(leaf, ref);
+
+ ret = btrfs_del_item(trans, tree_root, path);
+ BUG_ON(ret);
+ } else
+ err = -ENOENT;
+
+ if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+ btrfs_release_path(tree_root, path);
+ key.objectid = ref_id;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = root_id;
+ goto again;
+ }
+
+ btrfs_free_path(path);
+ return err;
+}
+
+int btrfs_find_root_ref(struct btrfs_root *tree_root,
+ struct btrfs_path *path,
+ u64 root_id, u64 ref_id)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = root_id;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = ref_id;
+
+ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ return ret;
+}
+
+/*
+ * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
+ * or BTRFS_ROOT_BACKREF_KEY.
+ *
+ * The dirid, sequence, name and name_len refer to the directory entry
+ * that is referencing the root.
+ *
+ * For a forward ref, the root_id is the id of the tree referencing
+ * the root and ref_id is the id of the subvol or snapshot.
+ *
+ * For a back ref the root_id is the id of the subvol or snapshot and
+ * ref_id is the id of the tree referencing it.
+ */
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *tree_root,
+ u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
+ const char *name, int name_len)
+{
+ struct btrfs_key key;
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
+ unsigned long ptr;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = root_id;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = ref_id;
+again:
+ ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
+ sizeof(*ref) + name_len);
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+ btrfs_set_root_ref_dirid(leaf, ref, dirid);
+ btrfs_set_root_ref_sequence(leaf, ref, sequence);
+ btrfs_set_root_ref_name_len(leaf, ref, name_len);
+ ptr = (unsigned long)(ref + 1);
+ write_extent_buffer(leaf, name, ptr, name_len);
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+ btrfs_release_path(tree_root, path);
+ key.objectid = ref_id;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = root_id;
+ goto again;
+ }
+
+ btrfs_free_path(path);
+ return 0;
+}
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
new file mode 100644
index 00000000000..c0f7ecaf1e7
--- /dev/null
+++ b/fs/btrfs/struct-funcs.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/highmem.h>
+
+/* this is some deeply nasty code. ctree.h has a different
+ * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef
+ *
+ * The end result is that anyone who #includes ctree.h gets a
+ * declaration for the btrfs_set_foo functions and btrfs_foo functions
+ *
+ * This file declares the macros and then #includes ctree.h, which results
+ * in cpp creating the function here based on the template below.
+ *
+ * These setget functions do all the extent_buffer related mapping
+ * required to efficiently read and write specific fields in the extent
+ * buffers. Every pointer to metadata items in btrfs is really just
+ * an unsigned long offset into the extent buffer which has been
+ * cast to a specific type. This gives us all the gcc type checking.
+ *
+ * The extent buffer api is used to do all the kmapping and page
+ * spanning work required to get extent buffers in highmem and have
+ * a metadata blocksize different from the page size.
+ *
+ * The macro starts with a simple function prototype declaration so that
+ * sparse won't complain about it being static.
+ */
+
+#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
+u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
+void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \
+u##bits btrfs_##name(struct extent_buffer *eb, \
+ type *s) \
+{ \
+ unsigned long part_offset = (unsigned long)s; \
+ unsigned long offset = part_offset + offsetof(type, member); \
+ type *p; \
+ /* ugly, but we want the fast path here */ \
+ if (eb->map_token && offset >= eb->map_start && \
+ offset + sizeof(((type *)0)->member) <= eb->map_start + \
+ eb->map_len) { \
+ p = (type *)(eb->kaddr + part_offset - eb->map_start); \
+ return le##bits##_to_cpu(p->member); \
+ } \
+ { \
+ int err; \
+ char *map_token; \
+ char *kaddr; \
+ int unmap_on_exit = (eb->map_token == NULL); \
+ unsigned long map_start; \
+ unsigned long map_len; \
+ u##bits res; \
+ err = map_extent_buffer(eb, offset, \
+ sizeof(((type *)0)->member), \
+ &map_token, &kaddr, \
+ &map_start, &map_len, KM_USER1); \
+ if (err) { \
+ __le##bits leres; \
+ read_eb_member(eb, s, type, member, &leres); \
+ return le##bits##_to_cpu(leres); \
+ } \
+ p = (type *)(kaddr + part_offset - map_start); \
+ res = le##bits##_to_cpu(p->member); \
+ if (unmap_on_exit) \
+ unmap_extent_buffer(eb, map_token, KM_USER1); \
+ return res; \
+ } \
+} \
+void btrfs_set_##name(struct extent_buffer *eb, \
+ type *s, u##bits val) \
+{ \
+ unsigned long part_offset = (unsigned long)s; \
+ unsigned long offset = part_offset + offsetof(type, member); \
+ type *p; \
+ /* ugly, but we want the fast path here */ \
+ if (eb->map_token && offset >= eb->map_start && \
+ offset + sizeof(((type *)0)->member) <= eb->map_start + \
+ eb->map_len) { \
+ p = (type *)(eb->kaddr + part_offset - eb->map_start); \
+ p->member = cpu_to_le##bits(val); \
+ return; \
+ } \
+ { \
+ int err; \
+ char *map_token; \
+ char *kaddr; \
+ int unmap_on_exit = (eb->map_token == NULL); \
+ unsigned long map_start; \
+ unsigned long map_len; \
+ err = map_extent_buffer(eb, offset, \
+ sizeof(((type *)0)->member), \
+ &map_token, &kaddr, \
+ &map_start, &map_len, KM_USER1); \
+ if (err) { \
+ __le##bits val2; \
+ val2 = cpu_to_le##bits(val); \
+ write_eb_member(eb, s, type, member, &val2); \
+ return; \
+ } \
+ p = (type *)(kaddr + part_offset - map_start); \
+ p->member = cpu_to_le##bits(val); \
+ if (unmap_on_exit) \
+ unmap_extent_buffer(eb, map_token, KM_USER1); \
+ } \
+}
+
+#include "ctree.h"
+
+void btrfs_node_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ unsigned long ptr = btrfs_node_key_ptr_offset(nr);
+ if (eb->map_token && ptr >= eb->map_start &&
+ ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
+ memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
+ sizeof(*disk_key));
+ return;
+ } else if (eb->map_token) {
+ unmap_extent_buffer(eb, eb->map_token, KM_USER1);
+ eb->map_token = NULL;
+ }
+ read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
+ struct btrfs_key_ptr, key, disk_key);
+}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
new file mode 100644
index 00000000000..8a1ea6e6457
--- /dev/null
+++ b/fs/btrfs/super.c
@@ -0,0 +1,799 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/mount.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/parser.h>
+#include <linux/ctype.h>
+#include <linux/namei.h>
+#include <linux/miscdevice.h>
+#include <linux/magic.h>
+#include "compat.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "ioctl.h"
+#include "print-tree.h"
+#include "xattr.h"
+#include "volumes.h"
+#include "version.h"
+#include "export.h"
+#include "compression.h"
+
+static const struct super_operations btrfs_super_ops;
+
+static void btrfs_put_super(struct super_block *sb)
+{
+ struct btrfs_root *root = btrfs_sb(sb);
+ int ret;
+
+ ret = close_ctree(root);
+ sb->s_fs_info = NULL;
+}
+
+enum {
+ Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
+ Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
+ Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
+ Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio,
+ Opt_flushoncommit,
+ Opt_discard, Opt_err,
+};
+
+static match_table_t tokens = {
+ {Opt_degraded, "degraded"},
+ {Opt_subvol, "subvol=%s"},
+ {Opt_device, "device=%s"},
+ {Opt_nodatasum, "nodatasum"},
+ {Opt_nodatacow, "nodatacow"},
+ {Opt_nobarrier, "nobarrier"},
+ {Opt_max_extent, "max_extent=%s"},
+ {Opt_max_inline, "max_inline=%s"},
+ {Opt_alloc_start, "alloc_start=%s"},
+ {Opt_thread_pool, "thread_pool=%d"},
+ {Opt_compress, "compress"},
+ {Opt_compress_force, "compress-force"},
+ {Opt_ssd, "ssd"},
+ {Opt_ssd_spread, "ssd_spread"},
+ {Opt_nossd, "nossd"},
+ {Opt_noacl, "noacl"},
+ {Opt_notreelog, "notreelog"},
+ {Opt_flushoncommit, "flushoncommit"},
+ {Opt_ratio, "metadata_ratio=%d"},
+ {Opt_discard, "discard"},
+ {Opt_err, NULL},
+};
+
+u64 btrfs_parse_size(char *str)
+{
+ u64 res;
+ int mult = 1;
+ char *end;
+ char last;
+
+ res = simple_strtoul(str, &end, 10);
+
+ last = end[0];
+ if (isalpha(last)) {
+ last = tolower(last);
+ switch (last) {
+ case 'g':
+ mult *= 1024;
+ case 'm':
+ mult *= 1024;
+ case 'k':
+ mult *= 1024;
+ }
+ res = res * mult;
+ }
+ return res;
+}
+
+/*
+ * Regular mount options parser. Everything that is needed only when
+ * reading in a new superblock is parsed here.
+ */
+int btrfs_parse_options(struct btrfs_root *root, char *options)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ substring_t args[MAX_OPT_ARGS];
+ char *p, *num;
+ int intarg;
+ int ret = 0;
+
+ if (!options)
+ return 0;
+
+ /*
+ * strsep changes the string, duplicate it because parse_options
+ * gets called twice
+ */
+ options = kstrdup(options, GFP_NOFS);
+ if (!options)
+ return -ENOMEM;
+
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_degraded:
+ printk(KERN_INFO "btrfs: allowing degraded mounts\n");
+ btrfs_set_opt(info->mount_opt, DEGRADED);
+ break;
+ case Opt_subvol:
+ case Opt_device:
+ /*
+ * These are parsed by btrfs_parse_early_options
+ * and can be happily ignored here.
+ */
+ break;
+ case Opt_nodatasum:
+ printk(KERN_INFO "btrfs: setting nodatasum\n");
+ btrfs_set_opt(info->mount_opt, NODATASUM);
+ break;
+ case Opt_nodatacow:
+ printk(KERN_INFO "btrfs: setting nodatacow\n");
+ btrfs_set_opt(info->mount_opt, NODATACOW);
+ btrfs_set_opt(info->mount_opt, NODATASUM);
+ break;
+ case Opt_compress:
+ printk(KERN_INFO "btrfs: use compression\n");
+ btrfs_set_opt(info->mount_opt, COMPRESS);
+ break;
+ case Opt_compress_force:
+ printk(KERN_INFO "btrfs: forcing compression\n");
+ btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
+ btrfs_set_opt(info->mount_opt, COMPRESS);
+ break;
+ case Opt_ssd:
+ printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
+ btrfs_set_opt(info->mount_opt, SSD);
+ break;
+ case Opt_ssd_spread:
+ printk(KERN_INFO "btrfs: use spread ssd "
+ "allocation scheme\n");
+ btrfs_set_opt(info->mount_opt, SSD);
+ btrfs_set_opt(info->mount_opt, SSD_SPREAD);
+ break;
+ case Opt_nossd:
+ printk(KERN_INFO "btrfs: not using ssd allocation "
+ "scheme\n");
+ btrfs_set_opt(info->mount_opt, NOSSD);
+ btrfs_clear_opt(info->mount_opt, SSD);
+ btrfs_clear_opt(info->mount_opt, SSD_SPREAD);
+ break;
+ case Opt_nobarrier:
+ printk(KERN_INFO "btrfs: turning off barriers\n");
+ btrfs_set_opt(info->mount_opt, NOBARRIER);
+ break;
+ case Opt_thread_pool:
+ intarg = 0;
+ match_int(&args[0], &intarg);
+ if (intarg) {
+ info->thread_pool_size = intarg;
+ printk(KERN_INFO "btrfs: thread pool %d\n",
+ info->thread_pool_size);
+ }
+ break;
+ case Opt_max_extent:
+ num = match_strdup(&args[0]);
+ if (num) {
+ info->max_extent = btrfs_parse_size(num);
+ kfree(num);
+
+ info->max_extent = max_t(u64,
+ info->max_extent, root->sectorsize);
+ printk(KERN_INFO "btrfs: max_extent at %llu\n",
+ (unsigned long long)info->max_extent);
+ }
+ break;
+ case Opt_max_inline:
+ num = match_strdup(&args[0]);
+ if (num) {
+ info->max_inline = btrfs_parse_size(num);
+ kfree(num);
+
+ if (info->max_inline) {
+ info->max_inline = max_t(u64,
+ info->max_inline,
+ root->sectorsize);
+ }
+ printk(KERN_INFO "btrfs: max_inline at %llu\n",
+ (unsigned long long)info->max_inline);
+ }
+ break;
+ case Opt_alloc_start:
+ num = match_strdup(&args[0]);
+ if (num) {
+ info->alloc_start = btrfs_parse_size(num);
+ kfree(num);
+ printk(KERN_INFO
+ "btrfs: allocations start at %llu\n",
+ (unsigned long long)info->alloc_start);
+ }
+ break;
+ case Opt_noacl:
+ root->fs_info->sb->s_flags &= ~MS_POSIXACL;
+ break;
+ case Opt_notreelog:
+ printk(KERN_INFO "btrfs: disabling tree log\n");
+ btrfs_set_opt(info->mount_opt, NOTREELOG);
+ break;
+ case Opt_flushoncommit:
+ printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
+ btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
+ break;
+ case Opt_ratio:
+ intarg = 0;
+ match_int(&args[0], &intarg);
+ if (intarg) {
+ info->metadata_ratio = intarg;
+ printk(KERN_INFO "btrfs: metadata ratio %d\n",
+ info->metadata_ratio);
+ }
+ break;
+ case Opt_discard:
+ btrfs_set_opt(info->mount_opt, DISCARD);
+ break;
+ case Opt_err:
+ printk(KERN_INFO "btrfs: unrecognized mount option "
+ "'%s'\n", p);
+ ret = -EINVAL;
+ goto out;
+ default:
+ break;
+ }
+ }
+out:
+ kfree(options);
+ return ret;
+}
+
+/*
+ * Parse mount options that are required early in the mount process.
+ *
+ * All other options will be parsed on much later in the mount process and
+ * only when we need to allocate a new super block.
+ */
+static int btrfs_parse_early_options(const char *options, fmode_t flags,
+ void *holder, char **subvol_name,
+ struct btrfs_fs_devices **fs_devices)
+{
+ substring_t args[MAX_OPT_ARGS];
+ char *opts, *p;
+ int error = 0;
+
+ if (!options)
+ goto out;
+
+ /*
+ * strsep changes the string, duplicate it because parse_options
+ * gets called twice
+ */
+ opts = kstrdup(options, GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+
+ while ((p = strsep(&opts, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_subvol:
+ *subvol_name = match_strdup(&args[0]);
+ break;
+ case Opt_device:
+ error = btrfs_scan_one_device(match_strdup(&args[0]),
+ flags, holder, fs_devices);
+ if (error)
+ goto out_free_opts;
+ break;
+ default:
+ break;
+ }
+ }
+
+ out_free_opts:
+ kfree(opts);
+ out:
+ /*
+ * If no subvolume name is specified we use the default one. Allocate
+ * a copy of the string "." here so that code later in the
+ * mount path doesn't care if it's the default volume or another one.
+ */
+ if (!*subvol_name) {
+ *subvol_name = kstrdup(".", GFP_KERNEL);
+ if (!*subvol_name)
+ return -ENOMEM;
+ }
+ return error;
+}
+
+static int btrfs_fill_super(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ void *data, int silent)
+{
+ struct inode *inode;
+ struct dentry *root_dentry;
+ struct btrfs_super_block *disk_super;
+ struct btrfs_root *tree_root;
+ struct btrfs_key key;
+ int err;
+
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_magic = BTRFS_SUPER_MAGIC;
+ sb->s_op = &btrfs_super_ops;
+ sb->s_export_op = &btrfs_export_ops;
+ sb->s_xattr = btrfs_xattr_handlers;
+ sb->s_time_gran = 1;
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ sb->s_flags |= MS_POSIXACL;
+#endif
+
+ tree_root = open_ctree(sb, fs_devices, (char *)data);
+
+ if (IS_ERR(tree_root)) {
+ printk("btrfs: open_ctree failed\n");
+ return PTR_ERR(tree_root);
+ }
+ sb->s_fs_info = tree_root;
+ disk_super = &tree_root->fs_info->super_copy;
+
+ key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto fail_close;
+ }
+
+ root_dentry = d_alloc_root(inode);
+ if (!root_dentry) {
+ iput(inode);
+ err = -ENOMEM;
+ goto fail_close;
+ }
+#if 0
+ /* this does the super kobj at the same time */
+ err = btrfs_sysfs_add_super(tree_root->fs_info);
+ if (err)
+ goto fail_close;
+#endif
+
+ sb->s_root = root_dentry;
+
+ save_mount_options(sb, data);
+ return 0;
+
+fail_close:
+ close_ctree(tree_root);
+ return err;
+}
+
+int btrfs_sync_fs(struct super_block *sb, int wait)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = btrfs_sb(sb);
+ int ret;
+
+ if (!wait) {
+ filemap_flush(root->fs_info->btree_inode->i_mapping);
+ return 0;
+ }
+
+ btrfs_start_delalloc_inodes(root, 0);
+ btrfs_wait_ordered_extents(root, 0, 0);
+
+ trans = btrfs_start_transaction(root, 1);
+ ret = btrfs_commit_transaction(trans, root);
+ return ret;
+}
+
+static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+ struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
+ struct btrfs_fs_info *info = root->fs_info;
+
+ if (btrfs_test_opt(root, DEGRADED))
+ seq_puts(seq, ",degraded");
+ if (btrfs_test_opt(root, NODATASUM))
+ seq_puts(seq, ",nodatasum");
+ if (btrfs_test_opt(root, NODATACOW))
+ seq_puts(seq, ",nodatacow");
+ if (btrfs_test_opt(root, NOBARRIER))
+ seq_puts(seq, ",nobarrier");
+ if (info->max_extent != (u64)-1)
+ seq_printf(seq, ",max_extent=%llu",
+ (unsigned long long)info->max_extent);
+ if (info->max_inline != 8192 * 1024)
+ seq_printf(seq, ",max_inline=%llu",
+ (unsigned long long)info->max_inline);
+ if (info->alloc_start != 0)
+ seq_printf(seq, ",alloc_start=%llu",
+ (unsigned long long)info->alloc_start);
+ if (info->thread_pool_size != min_t(unsigned long,
+ num_online_cpus() + 2, 8))
+ seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
+ if (btrfs_test_opt(root, COMPRESS))
+ seq_puts(seq, ",compress");
+ if (btrfs_test_opt(root, NOSSD))
+ seq_puts(seq, ",nossd");
+ if (btrfs_test_opt(root, SSD_SPREAD))
+ seq_puts(seq, ",ssd_spread");
+ else if (btrfs_test_opt(root, SSD))
+ seq_puts(seq, ",ssd");
+ if (btrfs_test_opt(root, NOTREELOG))
+ seq_puts(seq, ",notreelog");
+ if (btrfs_test_opt(root, FLUSHONCOMMIT))
+ seq_puts(seq, ",flushoncommit");
+ if (btrfs_test_opt(root, DISCARD))
+ seq_puts(seq, ",discard");
+ if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
+ seq_puts(seq, ",noacl");
+ return 0;
+}
+
+static int btrfs_test_super(struct super_block *s, void *data)
+{
+ struct btrfs_fs_devices *test_fs_devices = data;
+ struct btrfs_root *root = btrfs_sb(s);
+
+ return root->fs_info->fs_devices == test_fs_devices;
+}
+
+/*
+ * Find a superblock for the given device / mount point.
+ *
+ * Note: This is based on get_sb_bdev from fs/super.c with a few additions
+ * for multiple device setup. Make sure to keep it in sync.
+ */
+static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data, struct vfsmount *mnt)
+{
+ char *subvol_name = NULL;
+ struct block_device *bdev = NULL;
+ struct super_block *s;
+ struct dentry *root;
+ struct btrfs_fs_devices *fs_devices = NULL;
+ fmode_t mode = FMODE_READ;
+ int error = 0;
+
+ if (!(flags & MS_RDONLY))
+ mode |= FMODE_WRITE;
+
+ error = btrfs_parse_early_options(data, mode, fs_type,
+ &subvol_name, &fs_devices);
+ if (error)
+ return error;
+
+ error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
+ if (error)
+ goto error_free_subvol_name;
+
+ error = btrfs_open_devices(fs_devices, mode, fs_type);
+ if (error)
+ goto error_free_subvol_name;
+
+ if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
+ error = -EACCES;
+ goto error_close_devices;
+ }
+
+ bdev = fs_devices->latest_bdev;
+ s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
+ if (IS_ERR(s))
+ goto error_s;
+
+ if (s->s_root) {
+ if ((flags ^ s->s_flags) & MS_RDONLY) {
+ deactivate_locked_super(s);
+ error = -EBUSY;
+ goto error_close_devices;
+ }
+
+ btrfs_close_devices(fs_devices);
+ } else {
+ char b[BDEVNAME_SIZE];
+
+ s->s_flags = flags;
+ strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ error = btrfs_fill_super(s, fs_devices, data,
+ flags & MS_SILENT ? 1 : 0);
+ if (error) {
+ deactivate_locked_super(s);
+ goto error_free_subvol_name;
+ }
+
+ btrfs_sb(s)->fs_info->bdev_holder = fs_type;
+ s->s_flags |= MS_ACTIVE;
+ }
+
+ if (!strcmp(subvol_name, "."))
+ root = dget(s->s_root);
+ else {
+ mutex_lock(&s->s_root->d_inode->i_mutex);
+ root = lookup_one_len(subvol_name, s->s_root,
+ strlen(subvol_name));
+ mutex_unlock(&s->s_root->d_inode->i_mutex);
+
+ if (IS_ERR(root)) {
+ deactivate_locked_super(s);
+ error = PTR_ERR(root);
+ goto error_free_subvol_name;
+ }
+ if (!root->d_inode) {
+ dput(root);
+ deactivate_locked_super(s);
+ error = -ENXIO;
+ goto error_free_subvol_name;
+ }
+ }
+
+ mnt->mnt_sb = s;
+ mnt->mnt_root = root;
+
+ kfree(subvol_name);
+ return 0;
+
+error_s:
+ error = PTR_ERR(s);
+error_close_devices:
+ btrfs_close_devices(fs_devices);
+error_free_subvol_name:
+ kfree(subvol_name);
+ return error;
+}
+
+static int btrfs_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct btrfs_root *root = btrfs_sb(sb);
+ int ret;
+
+ ret = btrfs_parse_options(root, data);
+ if (ret)
+ return -EINVAL;
+
+ if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+ return 0;
+
+ if (*flags & MS_RDONLY) {
+ sb->s_flags |= MS_RDONLY;
+
+ ret = btrfs_commit_super(root);
+ WARN_ON(ret);
+ } else {
+ if (root->fs_info->fs_devices->rw_devices == 0)
+ return -EACCES;
+
+ if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
+ return -EINVAL;
+
+ /* recover relocation */
+ ret = btrfs_recover_relocation(root);
+ WARN_ON(ret);
+
+ ret = btrfs_cleanup_fs_roots(root->fs_info);
+ WARN_ON(ret);
+
+ sb->s_flags &= ~MS_RDONLY;
+ }
+
+ return 0;
+}
+
+static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct btrfs_root *root = btrfs_sb(dentry->d_sb);
+ struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+ int bits = dentry->d_sb->s_blocksize_bits;
+ __be32 *fsid = (__be32 *)root->fs_info->fsid;
+
+ buf->f_namelen = BTRFS_NAME_LEN;
+ buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
+ buf->f_bfree = buf->f_blocks -
+ (btrfs_super_bytes_used(disk_super) >> bits);
+ buf->f_bavail = buf->f_bfree;
+ buf->f_bsize = dentry->d_sb->s_blocksize;
+ buf->f_type = BTRFS_SUPER_MAGIC;
+
+ /* We treat it as constant endianness (it doesn't matter _which_)
+ because we want the fsid to come out the same whether mounted
+ on a big-endian or little-endian host */
+ buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
+ buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
+ /* Mask in the root object ID too, to disambiguate subvols */
+ buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32;
+ buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid;
+
+ return 0;
+}
+
+static struct file_system_type btrfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "btrfs",
+ .get_sb = btrfs_get_sb,
+ .kill_sb = kill_anon_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+
+/*
+ * used by btrfsctl to scan devices when no FS is mounted
+ */
+static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct btrfs_ioctl_vol_args *vol;
+ struct btrfs_fs_devices *fs_devices;
+ int ret = -ENOTTY;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ vol = memdup_user((void __user *)arg, sizeof(*vol));
+ if (IS_ERR(vol))
+ return PTR_ERR(vol);
+
+ switch (cmd) {
+ case BTRFS_IOC_SCAN_DEV:
+ ret = btrfs_scan_one_device(vol->name, FMODE_READ,
+ &btrfs_fs_type, &fs_devices);
+ break;
+ }
+
+ kfree(vol);
+ return ret;
+}
+
+static int btrfs_freeze(struct super_block *sb)
+{
+ struct btrfs_root *root = btrfs_sb(sb);
+ mutex_lock(&root->fs_info->transaction_kthread_mutex);
+ mutex_lock(&root->fs_info->cleaner_mutex);
+ return 0;
+}
+
+static int btrfs_unfreeze(struct super_block *sb)
+{
+ struct btrfs_root *root = btrfs_sb(sb);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+ mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+ return 0;
+}
+
+static const struct super_operations btrfs_super_ops = {
+ .drop_inode = btrfs_drop_inode,
+ .delete_inode = btrfs_delete_inode,
+ .put_super = btrfs_put_super,
+ .sync_fs = btrfs_sync_fs,
+ .show_options = btrfs_show_options,
+ .write_inode = btrfs_write_inode,
+ .dirty_inode = btrfs_dirty_inode,
+ .alloc_inode = btrfs_alloc_inode,
+ .destroy_inode = btrfs_destroy_inode,
+ .statfs = btrfs_statfs,
+ .remount_fs = btrfs_remount,
+ .freeze_fs = btrfs_freeze,
+ .unfreeze_fs = btrfs_unfreeze,
+};
+
+static const struct file_operations btrfs_ctl_fops = {
+ .unlocked_ioctl = btrfs_control_ioctl,
+ .compat_ioctl = btrfs_control_ioctl,
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice btrfs_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "btrfs-control",
+ .fops = &btrfs_ctl_fops
+};
+
+static int btrfs_interface_init(void)
+{
+ return misc_register(&btrfs_misc);
+}
+
+static void btrfs_interface_exit(void)
+{
+ if (misc_deregister(&btrfs_misc) < 0)
+ printk(KERN_INFO "misc_deregister failed for control device");
+}
+
+static int __init init_btrfs_fs(void)
+{
+ int err;
+
+ err = btrfs_init_sysfs();
+ if (err)
+ return err;
+
+ err = btrfs_init_cachep();
+ if (err)
+ goto free_sysfs;
+
+ err = extent_io_init();
+ if (err)
+ goto free_cachep;
+
+ err = extent_map_init();
+ if (err)
+ goto free_extent_io;
+
+ err = btrfs_interface_init();
+ if (err)
+ goto free_extent_map;
+
+ err = register_filesystem(&btrfs_fs_type);
+ if (err)
+ goto unregister_ioctl;
+
+ printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
+ return 0;
+
+unregister_ioctl:
+ btrfs_interface_exit();
+free_extent_map:
+ extent_map_exit();
+free_extent_io:
+ extent_io_exit();
+free_cachep:
+ btrfs_destroy_cachep();
+free_sysfs:
+ btrfs_exit_sysfs();
+ return err;
+}
+
+static void __exit exit_btrfs_fs(void)
+{
+ btrfs_destroy_cachep();
+ extent_map_exit();
+ extent_io_exit();
+ btrfs_interface_exit();
+ unregister_filesystem(&btrfs_fs_type);
+ btrfs_exit_sysfs();
+ btrfs_cleanup_fs_uuids();
+ btrfs_zlib_exit();
+}
+
+module_init(init_btrfs_fs)
+module_exit(exit_btrfs_fs)
+
+MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
new file mode 100644
index 00000000000..a240b6fa81d
--- /dev/null
+++ b/fs/btrfs/sysfs.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+
+static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)btrfs_root_used(&root->root_item));
+}
+
+static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)btrfs_root_limit(&root->root_item));
+}
+
+static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
+{
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)btrfs_super_bytes_used(&fs->super_copy));
+}
+
+static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)btrfs_super_total_bytes(&fs->super_copy));
+}
+
+static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)btrfs_super_sectorsize(&fs->super_copy));
+}
+
+/* this is for root attrs (subvols/snapshots) */
+struct btrfs_root_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct btrfs_root *, char *);
+ ssize_t (*store)(struct btrfs_root *, const char *, size_t);
+};
+
+#define ROOT_ATTR(name, mode, show, store) \
+static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \
+ show, store)
+
+ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL);
+ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL);
+
+static struct attribute *btrfs_root_attrs[] = {
+ &btrfs_root_attr_blocks_used.attr,
+ &btrfs_root_attr_block_limit.attr,
+ NULL,
+};
+
+/* this is for super attrs (actual full fs) */
+struct btrfs_super_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct btrfs_fs_info *, char *);
+ ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t);
+};
+
+#define SUPER_ATTR(name, mode, show, store) \
+static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \
+ show, store)
+
+SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL);
+SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL);
+SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL);
+
+static struct attribute *btrfs_super_attrs[] = {
+ &btrfs_super_attr_blocks_used.attr,
+ &btrfs_super_attr_total_blocks.attr,
+ &btrfs_super_attr_blocksize.attr,
+ NULL,
+};
+
+static ssize_t btrfs_super_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
+ super_kobj);
+ struct btrfs_super_attr *a = container_of(attr,
+ struct btrfs_super_attr,
+ attr);
+
+ return a->show ? a->show(fs, buf) : 0;
+}
+
+static ssize_t btrfs_super_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
+ super_kobj);
+ struct btrfs_super_attr *a = container_of(attr,
+ struct btrfs_super_attr,
+ attr);
+
+ return a->store ? a->store(fs, buf, len) : 0;
+}
+
+static ssize_t btrfs_root_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct btrfs_root *root = container_of(kobj, struct btrfs_root,
+ root_kobj);
+ struct btrfs_root_attr *a = container_of(attr,
+ struct btrfs_root_attr,
+ attr);
+
+ return a->show ? a->show(root, buf) : 0;
+}
+
+static ssize_t btrfs_root_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct btrfs_root *root = container_of(kobj, struct btrfs_root,
+ root_kobj);
+ struct btrfs_root_attr *a = container_of(attr,
+ struct btrfs_root_attr,
+ attr);
+ return a->store ? a->store(root, buf, len) : 0;
+}
+
+static void btrfs_super_release(struct kobject *kobj)
+{
+ struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
+ super_kobj);
+ complete(&fs->kobj_unregister);
+}
+
+static void btrfs_root_release(struct kobject *kobj)
+{
+ struct btrfs_root *root = container_of(kobj, struct btrfs_root,
+ root_kobj);
+ complete(&root->kobj_unregister);
+}
+
+static struct sysfs_ops btrfs_super_attr_ops = {
+ .show = btrfs_super_attr_show,
+ .store = btrfs_super_attr_store,
+};
+
+static struct sysfs_ops btrfs_root_attr_ops = {
+ .show = btrfs_root_attr_show,
+ .store = btrfs_root_attr_store,
+};
+
+static struct kobj_type btrfs_root_ktype = {
+ .default_attrs = btrfs_root_attrs,
+ .sysfs_ops = &btrfs_root_attr_ops,
+ .release = btrfs_root_release,
+};
+
+static struct kobj_type btrfs_super_ktype = {
+ .default_attrs = btrfs_super_attrs,
+ .sysfs_ops = &btrfs_super_attr_ops,
+ .release = btrfs_super_release,
+};
+
+/* /sys/fs/btrfs/ entry */
+static struct kset *btrfs_kset;
+
+int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
+{
+ int error;
+ char *name;
+ char c;
+ int len = strlen(fs->sb->s_id) + 1;
+ int i;
+
+ name = kmalloc(len, GFP_NOFS);
+ if (!name) {
+ error = -ENOMEM;
+ goto fail;
+ }
+
+ for (i = 0; i < len; i++) {
+ c = fs->sb->s_id[i];
+ if (c == '/' || c == '\\')
+ c = '!';
+ name[i] = c;
+ }
+ name[len] = '\0';
+
+ fs->super_kobj.kset = btrfs_kset;
+ error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
+ NULL, "%s", name);
+ kfree(name);
+ if (error)
+ goto fail;
+
+ return 0;
+
+fail:
+ printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
+ return error;
+}
+
+int btrfs_sysfs_add_root(struct btrfs_root *root)
+{
+ int error;
+
+ error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
+ &root->fs_info->super_kobj,
+ "%s", root->name);
+ if (error)
+ goto fail;
+
+ return 0;
+
+fail:
+ printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
+ return error;
+}
+
+void btrfs_sysfs_del_root(struct btrfs_root *root)
+{
+ kobject_put(&root->root_kobj);
+ wait_for_completion(&root->kobj_unregister);
+}
+
+void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
+{
+ kobject_put(&fs->super_kobj);
+ wait_for_completion(&fs->kobj_unregister);
+}
+
+int btrfs_init_sysfs(void)
+{
+ btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
+ if (!btrfs_kset)
+ return -ENOMEM;
+ return 0;
+}
+
+void btrfs_exit_sysfs(void)
+{
+ kset_unregister(btrfs_kset);
+}
+
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
new file mode 100644
index 00000000000..b2acc79f1b3
--- /dev/null
+++ b/fs/btrfs/transaction.c
@@ -0,0 +1,1153 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/writeback.h>
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "locking.h"
+#include "tree-log.h"
+
+#define BTRFS_ROOT_TRANS_TAG 0
+
+static noinline void put_transaction(struct btrfs_transaction *transaction)
+{
+ WARN_ON(transaction->use_count == 0);
+ transaction->use_count--;
+ if (transaction->use_count == 0) {
+ list_del_init(&transaction->list);
+ memset(transaction, 0, sizeof(*transaction));
+ kmem_cache_free(btrfs_transaction_cachep, transaction);
+ }
+}
+
+static noinline void switch_commit_root(struct btrfs_root *root)
+{
+ free_extent_buffer(root->commit_root);
+ root->commit_root = btrfs_root_node(root);
+}
+
+/*
+ * either allocate a new transaction or hop into the existing one
+ */
+static noinline int join_transaction(struct btrfs_root *root)
+{
+ struct btrfs_transaction *cur_trans;
+ cur_trans = root->fs_info->running_transaction;
+ if (!cur_trans) {
+ cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
+ GFP_NOFS);
+ BUG_ON(!cur_trans);
+ root->fs_info->generation++;
+ cur_trans->num_writers = 1;
+ cur_trans->num_joined = 0;
+ cur_trans->transid = root->fs_info->generation;
+ init_waitqueue_head(&cur_trans->writer_wait);
+ init_waitqueue_head(&cur_trans->commit_wait);
+ cur_trans->in_commit = 0;
+ cur_trans->blocked = 0;
+ cur_trans->use_count = 1;
+ cur_trans->commit_done = 0;
+ cur_trans->start_time = get_seconds();
+
+ cur_trans->delayed_refs.root.rb_node = NULL;
+ cur_trans->delayed_refs.num_entries = 0;
+ cur_trans->delayed_refs.num_heads_ready = 0;
+ cur_trans->delayed_refs.num_heads = 0;
+ cur_trans->delayed_refs.flushing = 0;
+ cur_trans->delayed_refs.run_delayed_start = 0;
+ spin_lock_init(&cur_trans->delayed_refs.lock);
+
+ INIT_LIST_HEAD(&cur_trans->pending_snapshots);
+ list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+ extent_io_tree_init(&cur_trans->dirty_pages,
+ root->fs_info->btree_inode->i_mapping,
+ GFP_NOFS);
+ spin_lock(&root->fs_info->new_trans_lock);
+ root->fs_info->running_transaction = cur_trans;
+ spin_unlock(&root->fs_info->new_trans_lock);
+ } else {
+ cur_trans->num_writers++;
+ cur_trans->num_joined++;
+ }
+
+ return 0;
+}
+
+/*
+ * this does all the record keeping required to make sure that a reference
+ * counted root is properly recorded in a given transaction. This is required
+ * to make sure the old root from before we joined the transaction is deleted
+ * when the transaction commits
+ */
+static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ if (root->ref_cows && root->last_trans < trans->transid) {
+ WARN_ON(root == root->fs_info->extent_root);
+ WARN_ON(root->commit_root != root->node);
+
+ radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ root->last_trans = trans->transid;
+ btrfs_init_reloc_root(trans, root);
+ }
+ return 0;
+}
+
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ if (!root->ref_cows)
+ return 0;
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ if (root->last_trans == trans->transid) {
+ mutex_unlock(&root->fs_info->trans_mutex);
+ return 0;
+ }
+
+ record_root_in_trans(trans, root);
+ mutex_unlock(&root->fs_info->trans_mutex);
+ return 0;
+}
+
+/* wait for commit against the current transaction to become unblocked
+ * when this is done, it is safe to start a new transaction, but the current
+ * transaction might not be fully on disk.
+ */
+static void wait_current_trans(struct btrfs_root *root)
+{
+ struct btrfs_transaction *cur_trans;
+
+ cur_trans = root->fs_info->running_transaction;
+ if (cur_trans && cur_trans->blocked) {
+ DEFINE_WAIT(wait);
+ cur_trans->use_count++;
+ while (1) {
+ prepare_to_wait(&root->fs_info->transaction_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (cur_trans->blocked) {
+ mutex_unlock(&root->fs_info->trans_mutex);
+ schedule();
+ mutex_lock(&root->fs_info->trans_mutex);
+ finish_wait(&root->fs_info->transaction_wait,
+ &wait);
+ } else {
+ finish_wait(&root->fs_info->transaction_wait,
+ &wait);
+ break;
+ }
+ }
+ put_transaction(cur_trans);
+ }
+}
+
+enum btrfs_trans_type {
+ TRANS_START,
+ TRANS_JOIN,
+ TRANS_USERSPACE,
+};
+
+static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
+ int num_blocks, int type)
+{
+ struct btrfs_trans_handle *h =
+ kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+ int ret;
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ if (!root->fs_info->log_root_recovering &&
+ ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
+ type == TRANS_USERSPACE))
+ wait_current_trans(root);
+ ret = join_transaction(root);
+ BUG_ON(ret);
+
+ h->transid = root->fs_info->running_transaction->transid;
+ h->transaction = root->fs_info->running_transaction;
+ h->blocks_reserved = num_blocks;
+ h->blocks_used = 0;
+ h->block_group = 0;
+ h->alloc_exclude_nr = 0;
+ h->alloc_exclude_start = 0;
+ h->delayed_ref_updates = 0;
+
+ if (!current->journal_info && type != TRANS_USERSPACE)
+ current->journal_info = h;
+
+ root->fs_info->running_transaction->use_count++;
+ record_root_in_trans(h, root);
+ mutex_unlock(&root->fs_info->trans_mutex);
+ return h;
+}
+
+struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
+ int num_blocks)
+{
+ return start_transaction(root, num_blocks, TRANS_START);
+}
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
+ int num_blocks)
+{
+ return start_transaction(root, num_blocks, TRANS_JOIN);
+}
+
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
+ int num_blocks)
+{
+ return start_transaction(r, num_blocks, TRANS_USERSPACE);
+}
+
+/* wait for a transaction commit to be fully complete */
+static noinline int wait_for_commit(struct btrfs_root *root,
+ struct btrfs_transaction *commit)
+{
+ DEFINE_WAIT(wait);
+ mutex_lock(&root->fs_info->trans_mutex);
+ while (!commit->commit_done) {
+ prepare_to_wait(&commit->commit_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (commit->commit_done)
+ break;
+ mutex_unlock(&root->fs_info->trans_mutex);
+ schedule();
+ mutex_lock(&root->fs_info->trans_mutex);
+ }
+ mutex_unlock(&root->fs_info->trans_mutex);
+ finish_wait(&commit->commit_wait, &wait);
+ return 0;
+}
+
+#if 0
+/*
+ * rate limit against the drop_snapshot code. This helps to slow down new
+ * operations if the drop_snapshot code isn't able to keep up.
+ */
+static void throttle_on_drops(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ int harder_count = 0;
+
+harder:
+ if (atomic_read(&info->throttles)) {
+ DEFINE_WAIT(wait);
+ int thr;
+ thr = atomic_read(&info->throttle_gen);
+
+ do {
+ prepare_to_wait(&info->transaction_throttle,
+ &wait, TASK_UNINTERRUPTIBLE);
+ if (!atomic_read(&info->throttles)) {
+ finish_wait(&info->transaction_throttle, &wait);
+ break;
+ }
+ schedule();
+ finish_wait(&info->transaction_throttle, &wait);
+ } while (thr == atomic_read(&info->throttle_gen));
+ harder_count++;
+
+ if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
+ harder_count < 2)
+ goto harder;
+
+ if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
+ harder_count < 10)
+ goto harder;
+
+ if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
+ harder_count < 20)
+ goto harder;
+ }
+}
+#endif
+
+void btrfs_throttle(struct btrfs_root *root)
+{
+ mutex_lock(&root->fs_info->trans_mutex);
+ if (!root->fs_info->open_ioctl_trans)
+ wait_current_trans(root);
+ mutex_unlock(&root->fs_info->trans_mutex);
+}
+
+static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int throttle)
+{
+ struct btrfs_transaction *cur_trans;
+ struct btrfs_fs_info *info = root->fs_info;
+ int count = 0;
+
+ while (count < 4) {
+ unsigned long cur = trans->delayed_ref_updates;
+ trans->delayed_ref_updates = 0;
+ if (cur &&
+ trans->transaction->delayed_refs.num_heads_ready > 64) {
+ trans->delayed_ref_updates = 0;
+
+ /*
+ * do a full flush if the transaction is trying
+ * to close
+ */
+ if (trans->transaction->delayed_refs.flushing)
+ cur = 0;
+ btrfs_run_delayed_refs(trans, root, cur);
+ } else {
+ break;
+ }
+ count++;
+ }
+
+ mutex_lock(&info->trans_mutex);
+ cur_trans = info->running_transaction;
+ WARN_ON(cur_trans != trans->transaction);
+ WARN_ON(cur_trans->num_writers < 1);
+ cur_trans->num_writers--;
+
+ if (waitqueue_active(&cur_trans->writer_wait))
+ wake_up(&cur_trans->writer_wait);
+ put_transaction(cur_trans);
+ mutex_unlock(&info->trans_mutex);
+
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+ memset(trans, 0, sizeof(*trans));
+ kmem_cache_free(btrfs_trans_handle_cachep, trans);
+
+ if (throttle)
+ btrfs_run_delayed_iputs(root);
+
+ return 0;
+}
+
+int btrfs_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ return __btrfs_end_transaction(trans, root, 0);
+}
+
+int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ return __btrfs_end_transaction(trans, root, 1);
+}
+
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees. This is used to make sure all of
+ * those extents are sent to disk but does not wait on them
+ */
+int btrfs_write_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages, int mark)
+{
+ int ret;
+ int err = 0;
+ int werr = 0;
+ struct page *page;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ u64 start = 0;
+ u64 end;
+ unsigned long index;
+
+ while (1) {
+ ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark);
+ if (ret)
+ break;
+ while (start <= end) {
+ cond_resched();
+
+ index = start >> PAGE_CACHE_SHIFT;
+ start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+ page = find_get_page(btree_inode->i_mapping, index);
+ if (!page)
+ continue;
+
+ btree_lock_page_hook(page);
+ if (!page->mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ continue;
+ }
+
+ if (PageWriteback(page)) {
+ if (PageDirty(page))
+ wait_on_page_writeback(page);
+ else {
+ unlock_page(page);
+ page_cache_release(page);
+ continue;
+ }
+ }
+ err = write_one_page(page, 0);
+ if (err)
+ werr = err;
+ page_cache_release(page);
+ }
+ }
+ if (err)
+ werr = err;
+ return werr;
+}
+
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees. This is used to make sure all of
+ * those extents are on disk for transaction or log commit. We wait
+ * on all the pages and clear them from the dirty pages state tree
+ */
+int btrfs_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages, int mark)
+{
+ int ret;
+ int err = 0;
+ int werr = 0;
+ struct page *page;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ u64 start = 0;
+ u64 end;
+ unsigned long index;
+
+ while (1) {
+ ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark);
+ if (ret)
+ break;
+
+ clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+ while (start <= end) {
+ index = start >> PAGE_CACHE_SHIFT;
+ start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+ page = find_get_page(btree_inode->i_mapping, index);
+ if (!page)
+ continue;
+ if (PageDirty(page)) {
+ btree_lock_page_hook(page);
+ wait_on_page_writeback(page);
+ err = write_one_page(page, 0);
+ if (err)
+ werr = err;
+ }
+ wait_on_page_writeback(page);
+ page_cache_release(page);
+ cond_resched();
+ }
+ }
+ if (err)
+ werr = err;
+ return werr;
+}
+
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees. This is used to make sure all of
+ * those extents are on disk for transaction or log commit
+ */
+int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages, int mark)
+{
+ int ret;
+ int ret2;
+
+ ret = btrfs_write_marked_extents(root, dirty_pages, mark);
+ ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
+ return ret || ret2;
+}
+
+int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ if (!trans || !trans->transaction) {
+ struct inode *btree_inode;
+ btree_inode = root->fs_info->btree_inode;
+ return filemap_write_and_wait(btree_inode->i_mapping);
+ }
+ return btrfs_write_and_wait_marked_extents(root,
+ &trans->transaction->dirty_pages,
+ EXTENT_DIRTY);
+}
+
+/*
+ * this is used to update the root pointer in the tree of tree roots.
+ *
+ * But, in the case of the extent allocation tree, updating the root
+ * pointer may allocate blocks which may change the root of the extent
+ * allocation tree.
+ *
+ * So, this loops and repeats and makes sure the cowonly root didn't
+ * change while the root pointer was being updated in the metadata.
+ */
+static int update_cowonly_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ int ret;
+ u64 old_root_bytenr;
+ u64 old_root_used;
+ struct btrfs_root *tree_root = root->fs_info->tree_root;
+
+ old_root_used = btrfs_root_used(&root->root_item);
+ btrfs_write_dirty_block_groups(trans, root);
+
+ while (1) {
+ old_root_bytenr = btrfs_root_bytenr(&root->root_item);
+ if (old_root_bytenr == root->node->start &&
+ old_root_used == btrfs_root_used(&root->root_item))
+ break;
+
+ btrfs_set_root_node(&root->root_item, root->node);
+ ret = btrfs_update_root(trans, tree_root,
+ &root->root_key,
+ &root->root_item);
+ BUG_ON(ret);
+
+ old_root_used = btrfs_root_used(&root->root_item);
+ ret = btrfs_write_dirty_block_groups(trans, root);
+ BUG_ON(ret);
+ }
+
+ if (root != root->fs_info->extent_root)
+ switch_commit_root(root);
+
+ return 0;
+}
+
+/*
+ * update all the cowonly tree roots on disk
+ */
+static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct list_head *next;
+ struct extent_buffer *eb;
+ int ret;
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
+
+ eb = btrfs_lock_root_node(fs_info->tree_root);
+ btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
+
+ while (!list_empty(&fs_info->dirty_cowonly_roots)) {
+ next = fs_info->dirty_cowonly_roots.next;
+ list_del_init(next);
+ root = list_entry(next, struct btrfs_root, dirty_list);
+
+ update_cowonly_root(trans, root);
+ }
+
+ down_write(&fs_info->extent_commit_sem);
+ switch_commit_root(fs_info->extent_root);
+ up_write(&fs_info->extent_commit_sem);
+
+ return 0;
+}
+
+/*
+ * dead roots are old snapshots that need to be deleted. This allocates
+ * a dirty root struct and adds it into the list of dead roots that need to
+ * be deleted
+ */
+int btrfs_add_dead_root(struct btrfs_root *root)
+{
+ mutex_lock(&root->fs_info->trans_mutex);
+ list_add(&root->root_list, &root->fs_info->dead_roots);
+ mutex_unlock(&root->fs_info->trans_mutex);
+ return 0;
+}
+
+/*
+ * update all the cowonly tree roots on disk
+ */
+static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *gang[8];
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int i;
+ int ret;
+ int err = 0;
+
+ while (1) {
+ ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang),
+ BTRFS_ROOT_TRANS_TAG);
+ if (ret == 0)
+ break;
+ for (i = 0; i < ret; i++) {
+ root = gang[i];
+ radix_tree_tag_clear(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+
+ btrfs_free_log(trans, root);
+ btrfs_update_reloc_root(trans, root);
+
+ if (root->commit_root != root->node) {
+ switch_commit_root(root);
+ btrfs_set_root_node(&root->root_item,
+ root->node);
+ }
+
+ err = btrfs_update_root(trans, fs_info->tree_root,
+ &root->root_key,
+ &root->root_item);
+ if (err)
+ break;
+ }
+ }
+ return err;
+}
+
+/*
+ * defrag a given btree. If cacheonly == 1, this won't read from the disk,
+ * otherwise every leaf in the btree is read and defragged.
+ */
+int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ int ret;
+ struct btrfs_trans_handle *trans;
+ unsigned long nr;
+
+ smp_mb();
+ if (root->defrag_running)
+ return 0;
+ trans = btrfs_start_transaction(root, 1);
+ while (1) {
+ root->defrag_running = 1;
+ ret = btrfs_defrag_leaves(trans, root, cacheonly);
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(info->tree_root, nr);
+ cond_resched();
+
+ trans = btrfs_start_transaction(root, 1);
+ if (root->fs_info->closing || ret != -EAGAIN)
+ break;
+ }
+ root->defrag_running = 0;
+ smp_mb();
+ btrfs_end_transaction(trans, root);
+ return 0;
+}
+
+#if 0
+/*
+ * when dropping snapshots, we generate a ton of delayed refs, and it makes
+ * sense not to join the transaction while it is trying to flush the current
+ * queue of delayed refs out.
+ *
+ * This is used by the drop snapshot code only
+ */
+static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
+{
+ DEFINE_WAIT(wait);
+
+ mutex_lock(&info->trans_mutex);
+ while (info->running_transaction &&
+ info->running_transaction->delayed_refs.flushing) {
+ prepare_to_wait(&info->transaction_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&info->trans_mutex);
+
+ schedule();
+
+ mutex_lock(&info->trans_mutex);
+ finish_wait(&info->transaction_wait, &wait);
+ }
+ mutex_unlock(&info->trans_mutex);
+ return 0;
+}
+
+/*
+ * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
+ * all of them
+ */
+int btrfs_drop_dead_root(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *tree_root = root->fs_info->tree_root;
+ unsigned long nr;
+ int ret;
+
+ while (1) {
+ /*
+ * we don't want to jump in and create a bunch of
+ * delayed refs if the transaction is starting to close
+ */
+ wait_transaction_pre_flush(tree_root->fs_info);
+ trans = btrfs_start_transaction(tree_root, 1);
+
+ /*
+ * we've joined a transaction, make sure it isn't
+ * closing right now
+ */
+ if (trans->transaction->delayed_refs.flushing) {
+ btrfs_end_transaction(trans, tree_root);
+ continue;
+ }
+
+ ret = btrfs_drop_snapshot(trans, root);
+ if (ret != -EAGAIN)
+ break;
+
+ ret = btrfs_update_root(trans, tree_root,
+ &root->root_key,
+ &root->root_item);
+ if (ret)
+ break;
+
+ nr = trans->blocks_used;
+ ret = btrfs_end_transaction(trans, tree_root);
+ BUG_ON(ret);
+
+ btrfs_btree_balance_dirty(tree_root, nr);
+ cond_resched();
+ }
+ BUG_ON(ret);
+
+ ret = btrfs_del_root(trans, tree_root, &root->root_key);
+ BUG_ON(ret);
+
+ nr = trans->blocks_used;
+ ret = btrfs_end_transaction(trans, tree_root);
+ BUG_ON(ret);
+
+ free_extent_buffer(root->node);
+ free_extent_buffer(root->commit_root);
+ kfree(root);
+
+ btrfs_btree_balance_dirty(tree_root, nr);
+ return ret;
+}
+#endif
+
+/*
+ * new snapshots need to be created at a very specific time in the
+ * transaction commit. This does the actual creation
+ */
+static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_pending_snapshot *pending)
+{
+ struct btrfs_key key;
+ struct btrfs_root_item *new_root_item;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *root = pending->root;
+ struct extent_buffer *tmp;
+ struct extent_buffer *old;
+ int ret;
+ u64 objectid;
+
+ new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
+ if (!new_root_item) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
+ if (ret)
+ goto fail;
+
+ record_root_in_trans(trans, root);
+ btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
+ memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
+
+ key.objectid = objectid;
+ /* record when the snapshot was created in key.offset */
+ key.offset = trans->transid;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+
+ old = btrfs_lock_root_node(root);
+ btrfs_cow_block(trans, root, old, NULL, 0, &old);
+ btrfs_set_lock_blocking(old);
+
+ btrfs_copy_root(trans, root, old, &tmp, objectid);
+ btrfs_tree_unlock(old);
+ free_extent_buffer(old);
+
+ btrfs_set_root_node(new_root_item, tmp);
+ ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+ new_root_item);
+ btrfs_tree_unlock(tmp);
+ free_extent_buffer(tmp);
+ if (ret)
+ goto fail;
+
+ key.offset = (u64)-1;
+ memcpy(&pending->root_key, &key, sizeof(key));
+fail:
+ kfree(new_root_item);
+ return ret;
+}
+
+static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
+ struct btrfs_pending_snapshot *pending)
+{
+ int ret;
+ int namelen;
+ u64 index = 0;
+ struct btrfs_trans_handle *trans;
+ struct inode *parent_inode;
+ struct btrfs_root *parent_root;
+
+ parent_inode = pending->dentry->d_parent->d_inode;
+ parent_root = BTRFS_I(parent_inode)->root;
+ trans = btrfs_join_transaction(parent_root, 1);
+
+ /*
+ * insert the directory item
+ */
+ namelen = strlen(pending->name);
+ ret = btrfs_set_inode_index(parent_inode, &index);
+ ret = btrfs_insert_dir_item(trans, parent_root,
+ pending->name, namelen,
+ parent_inode->i_ino,
+ &pending->root_key, BTRFS_FT_DIR, index);
+
+ if (ret)
+ goto fail;
+
+ btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
+ ret = btrfs_update_inode(trans, parent_root, parent_inode);
+ BUG_ON(ret);
+
+ ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
+ pending->root_key.objectid,
+ parent_root->root_key.objectid,
+ parent_inode->i_ino, index, pending->name,
+ namelen);
+
+ BUG_ON(ret);
+
+fail:
+ btrfs_end_transaction(trans, fs_info->fs_root);
+ return ret;
+}
+
+/*
+ * create all the snapshots we've scheduled for creation
+ */
+static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_pending_snapshot *pending;
+ struct list_head *head = &trans->transaction->pending_snapshots;
+ int ret;
+
+ list_for_each_entry(pending, head, list) {
+ ret = create_pending_snapshot(trans, fs_info, pending);
+ BUG_ON(ret);
+ }
+ return 0;
+}
+
+static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_pending_snapshot *pending;
+ struct list_head *head = &trans->transaction->pending_snapshots;
+ int ret;
+
+ while (!list_empty(head)) {
+ pending = list_entry(head->next,
+ struct btrfs_pending_snapshot, list);
+ ret = finish_pending_snapshot(fs_info, pending);
+ BUG_ON(ret);
+ list_del(&pending->list);
+ kfree(pending->name);
+ kfree(pending);
+ }
+ return 0;
+}
+
+static void update_super_roots(struct btrfs_root *root)
+{
+ struct btrfs_root_item *root_item;
+ struct btrfs_super_block *super;
+
+ super = &root->fs_info->super_copy;
+
+ root_item = &root->fs_info->chunk_root->root_item;
+ super->chunk_root = root_item->bytenr;
+ super->chunk_root_generation = root_item->generation;
+ super->chunk_root_level = root_item->level;
+
+ root_item = &root->fs_info->tree_root->root_item;
+ super->root = root_item->bytenr;
+ super->generation = root_item->generation;
+ super->root_level = root_item->level;
+}
+
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
+{
+ int ret = 0;
+ spin_lock(&info->new_trans_lock);
+ if (info->running_transaction)
+ ret = info->running_transaction->in_commit;
+ spin_unlock(&info->new_trans_lock);
+ return ret;
+}
+
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ unsigned long joined = 0;
+ unsigned long timeout = 1;
+ struct btrfs_transaction *cur_trans;
+ struct btrfs_transaction *prev_trans = NULL;
+ DEFINE_WAIT(wait);
+ int ret;
+ int should_grow = 0;
+ unsigned long now = get_seconds();
+ int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
+
+ btrfs_run_ordered_operations(root, 0);
+
+ /* make a pass through all the delayed refs we have so far
+ * any runnings procs may add more while we are here
+ */
+ ret = btrfs_run_delayed_refs(trans, root, 0);
+ BUG_ON(ret);
+
+ cur_trans = trans->transaction;
+ /*
+ * set the flushing flag so procs in this transaction have to
+ * start sending their work down.
+ */
+ cur_trans->delayed_refs.flushing = 1;
+
+ ret = btrfs_run_delayed_refs(trans, root, 0);
+ BUG_ON(ret);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ if (cur_trans->in_commit) {
+ cur_trans->use_count++;
+ mutex_unlock(&root->fs_info->trans_mutex);
+ btrfs_end_transaction(trans, root);
+
+ ret = wait_for_commit(root, cur_trans);
+ BUG_ON(ret);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ put_transaction(cur_trans);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ return 0;
+ }
+
+ trans->transaction->in_commit = 1;
+ trans->transaction->blocked = 1;
+ if (cur_trans->list.prev != &root->fs_info->trans_list) {
+ prev_trans = list_entry(cur_trans->list.prev,
+ struct btrfs_transaction, list);
+ if (!prev_trans->commit_done) {
+ prev_trans->use_count++;
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ wait_for_commit(root, prev_trans);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ put_transaction(prev_trans);
+ }
+ }
+
+ if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
+ should_grow = 1;
+
+ do {
+ int snap_pending = 0;
+ joined = cur_trans->num_joined;
+ if (!list_empty(&trans->transaction->pending_snapshots))
+ snap_pending = 1;
+
+ WARN_ON(cur_trans != trans->transaction);
+ prepare_to_wait(&cur_trans->writer_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ if (cur_trans->num_writers > 1)
+ timeout = MAX_SCHEDULE_TIMEOUT;
+ else if (should_grow)
+ timeout = 1;
+
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ if (flush_on_commit) {
+ btrfs_start_delalloc_inodes(root, 1);
+ ret = btrfs_wait_ordered_extents(root, 0, 1);
+ BUG_ON(ret);
+ } else if (snap_pending) {
+ ret = btrfs_wait_ordered_extents(root, 0, 1);
+ BUG_ON(ret);
+ }
+
+ /*
+ * rename don't use btrfs_join_transaction, so, once we
+ * set the transaction to blocked above, we aren't going
+ * to get any new ordered operations. We can safely run
+ * it here and no for sure that nothing new will be added
+ * to the list
+ */
+ btrfs_run_ordered_operations(root, 1);
+
+ smp_mb();
+ if (cur_trans->num_writers > 1 || should_grow)
+ schedule_timeout(timeout);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ finish_wait(&cur_trans->writer_wait, &wait);
+ } while (cur_trans->num_writers > 1 ||
+ (should_grow && cur_trans->num_joined != joined));
+
+ ret = create_pending_snapshots(trans, root->fs_info);
+ BUG_ON(ret);
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
+
+ WARN_ON(cur_trans != trans->transaction);
+
+ /* btrfs_commit_tree_roots is responsible for getting the
+ * various roots consistent with each other. Every pointer
+ * in the tree of tree roots has to point to the most up to date
+ * root for every subvolume and other tree. So, we have to keep
+ * the tree logging code from jumping in and changing any
+ * of the trees.
+ *
+ * At this point in the commit, there can't be any tree-log
+ * writers, but a little lower down we drop the trans mutex
+ * and let new people in. By holding the tree_log_mutex
+ * from now until after the super is written, we avoid races
+ * with the tree-log code.
+ */
+ mutex_lock(&root->fs_info->tree_log_mutex);
+
+ ret = commit_fs_roots(trans, root);
+ BUG_ON(ret);
+
+ /* commit_fs_roots gets rid of all the tree log roots, it is now
+ * safe to free the root of tree log roots
+ */
+ btrfs_free_log_root_tree(trans, root->fs_info);
+
+ ret = commit_cowonly_roots(trans, root);
+ BUG_ON(ret);
+
+ btrfs_prepare_extent_commit(trans, root);
+
+ cur_trans = root->fs_info->running_transaction;
+ spin_lock(&root->fs_info->new_trans_lock);
+ root->fs_info->running_transaction = NULL;
+ spin_unlock(&root->fs_info->new_trans_lock);
+
+ btrfs_set_root_node(&root->fs_info->tree_root->root_item,
+ root->fs_info->tree_root->node);
+ switch_commit_root(root->fs_info->tree_root);
+
+ btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
+ root->fs_info->chunk_root->node);
+ switch_commit_root(root->fs_info->chunk_root);
+
+ update_super_roots(root);
+
+ if (!root->fs_info->log_root_recovering) {
+ btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
+ btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
+ }
+
+ memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
+ sizeof(root->fs_info->super_copy));
+
+ trans->transaction->blocked = 0;
+
+ wake_up(&root->fs_info->transaction_wait);
+
+ mutex_unlock(&root->fs_info->trans_mutex);
+ ret = btrfs_write_and_wait_transaction(trans, root);
+ BUG_ON(ret);
+ write_ctree_super(trans, root, 0);
+
+ /*
+ * the super is written, we can safely allow the tree-loggers
+ * to go about their business
+ */
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+
+ btrfs_finish_extent_commit(trans, root);
+
+ /* do the directory inserts of any pending snapshot creations */
+ finish_pending_snapshots(trans, root->fs_info);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+
+ cur_trans->commit_done = 1;
+
+ root->fs_info->last_trans_committed = cur_trans->transid;
+
+ wake_up(&cur_trans->commit_wait);
+
+ put_transaction(cur_trans);
+ put_transaction(cur_trans);
+
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+
+ kmem_cache_free(btrfs_trans_handle_cachep, trans);
+
+ if (current != root->fs_info->transaction_kthread)
+ btrfs_run_delayed_iputs(root);
+
+ return ret;
+}
+
+/*
+ * interface function to delete all the snapshots we have scheduled for deletion
+ */
+int btrfs_clean_old_snapshots(struct btrfs_root *root)
+{
+ LIST_HEAD(list);
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ mutex_lock(&fs_info->trans_mutex);
+ list_splice_init(&fs_info->dead_roots, &list);
+ mutex_unlock(&fs_info->trans_mutex);
+
+ while (!list_empty(&list)) {
+ root = list_entry(list.next, struct btrfs_root, root_list);
+ list_del(&root->root_list);
+
+ if (btrfs_header_backref_rev(root->node) <
+ BTRFS_MIXED_BACKREF_REV)
+ btrfs_drop_snapshot(root, 0);
+ else
+ btrfs_drop_snapshot(root, 1);
+ }
+ return 0;
+}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
new file mode 100644
index 00000000000..93c7ccb3311
--- /dev/null
+++ b/fs/btrfs/transaction.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_TRANSACTION__
+#define __BTRFS_TRANSACTION__
+#include "btrfs_inode.h"
+#include "delayed-ref.h"
+
+struct btrfs_transaction {
+ u64 transid;
+ /*
+ * total writers in this transaction, it must be zero before the
+ * transaction can end
+ */
+ unsigned long num_writers;
+
+ unsigned long num_joined;
+ int in_commit;
+ int use_count;
+ int commit_done;
+ int blocked;
+ struct list_head list;
+ struct extent_io_tree dirty_pages;
+ unsigned long start_time;
+ wait_queue_head_t writer_wait;
+ wait_queue_head_t commit_wait;
+ struct list_head pending_snapshots;
+ struct btrfs_delayed_ref_root delayed_refs;
+};
+
+struct btrfs_trans_handle {
+ u64 transid;
+ unsigned long blocks_reserved;
+ unsigned long blocks_used;
+ struct btrfs_transaction *transaction;
+ u64 block_group;
+ u64 alloc_exclude_start;
+ u64 alloc_exclude_nr;
+ unsigned long delayed_ref_updates;
+};
+
+struct btrfs_pending_snapshot {
+ struct dentry *dentry;
+ struct btrfs_root *root;
+ char *name;
+ struct btrfs_key root_key;
+ struct list_head list;
+};
+
+static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ trans->block_group = BTRFS_I(inode)->block_group;
+}
+
+static inline void btrfs_update_inode_block_group(
+ struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ BTRFS_I(inode)->block_group = trans->block_group;
+}
+
+static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ BTRFS_I(inode)->last_trans = trans->transaction->transid;
+ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+}
+
+int btrfs_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
+ int num_blocks);
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
+ int num_blocks);
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
+ int num_blocks);
+int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+
+int btrfs_add_dead_root(struct btrfs_root *root);
+int btrfs_drop_dead_root(struct btrfs_root *root);
+int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
+int btrfs_clean_old_snapshots(struct btrfs_root *root);
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+void btrfs_throttle(struct btrfs_root *root);
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages, int mark);
+int btrfs_write_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages, int mark);
+int btrfs_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages, int mark);
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
+#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644
index 00000000000..b10eacdb162
--- /dev/null
+++ b/fs/btrfs/tree-defrag.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "transaction.h"
+#include "locking.h"
+
+/* defrag all the leaves in a given btree. If cache_only == 1, don't read
+ * things from disk, otherwise read all the leaves and try to get key order to
+ * better reflect disk order
+ */
+
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int cache_only)
+{
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+ int ret = 0;
+ int wret;
+ int level;
+ int orig_level;
+ int is_extent = 0;
+ int next_key_ret = 0;
+ u64 last_ret = 0;
+ u64 min_trans = 0;
+
+ if (cache_only)
+ goto out;
+
+ if (root->fs_info->extent_root == root) {
+ /*
+ * there's recursion here right now in the tree locking,
+ * we can't defrag the extent root without deadlock
+ */
+ goto out;
+ }
+
+ if (root->ref_cows == 0 && !is_extent)
+ goto out;
+
+ if (btrfs_test_opt(root, SSD))
+ goto out;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ level = btrfs_header_level(root->node);
+ orig_level = level;
+
+ if (level == 0)
+ goto out;
+
+ if (root->defrag_progress.objectid == 0) {
+ struct extent_buffer *root_node;
+ u32 nritems;
+
+ root_node = btrfs_lock_root_node(root);
+ btrfs_set_lock_blocking(root_node);
+ nritems = btrfs_header_nritems(root_node);
+ root->defrag_max.objectid = 0;
+ /* from above we know this is not a leaf */
+ btrfs_node_key_to_cpu(root_node, &root->defrag_max,
+ nritems - 1);
+ btrfs_tree_unlock(root_node);
+ free_extent_buffer(root_node);
+ memset(&key, 0, sizeof(key));
+ } else {
+ memcpy(&key, &root->defrag_progress, sizeof(key));
+ }
+
+ path->keep_locks = 1;
+ if (cache_only)
+ min_trans = root->defrag_trans_start;
+
+ ret = btrfs_search_forward(root, &key, NULL, path,
+ cache_only, min_trans);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+ btrfs_release_path(root, path);
+ wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+
+ if (wret < 0) {
+ ret = wret;
+ goto out;
+ }
+ if (!path->nodes[1]) {
+ ret = 0;
+ goto out;
+ }
+ path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+ next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
+ min_trans);
+ ret = btrfs_realloc_node(trans, root,
+ path->nodes[1], 0,
+ cache_only, &last_ret,
+ &root->defrag_progress);
+ WARN_ON(ret && ret != -EAGAIN);
+ if (next_key_ret == 0) {
+ memcpy(&root->defrag_progress, &key, sizeof(key));
+ ret = -EAGAIN;
+ }
+
+ btrfs_release_path(root, path);
+out:
+ if (path)
+ btrfs_free_path(path);
+ if (ret == -EAGAIN) {
+ if (root->defrag_max.objectid > root->defrag_progress.objectid)
+ goto done;
+ if (root->defrag_max.type > root->defrag_progress.type)
+ goto done;
+ if (root->defrag_max.offset > root->defrag_progress.offset)
+ goto done;
+ ret = 0;
+ }
+done:
+ if (ret != -EAGAIN) {
+ memset(&root->defrag_progress, 0,
+ sizeof(root->defrag_progress));
+ root->defrag_trans_start = trans->transid;
+ }
+ return ret;
+}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
new file mode 100644
index 00000000000..4a9434b622e
--- /dev/null
+++ b/fs/btrfs/tree-log.c
@@ -0,0 +1,3198 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "print-tree.h"
+#include "compat.h"
+#include "tree-log.h"
+
+/* magic values for the inode_only field in btrfs_log_inode:
+ *
+ * LOG_INODE_ALL means to log everything
+ * LOG_INODE_EXISTS means to log just enough to recreate the inode
+ * during log replay
+ */
+#define LOG_INODE_ALL 0
+#define LOG_INODE_EXISTS 1
+
+/*
+ * directory trouble cases
+ *
+ * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
+ * log, we must force a full commit before doing an fsync of the directory
+ * where the unlink was done.
+ * ---> record transid of last unlink/rename per directory
+ *
+ * mkdir foo/some_dir
+ * normal commit
+ * rename foo/some_dir foo2/some_dir
+ * mkdir foo/some_dir
+ * fsync foo/some_dir/some_file
+ *
+ * The fsync above will unlink the original some_dir without recording
+ * it in its new location (foo2). After a crash, some_dir will be gone
+ * unless the fsync of some_file forces a full commit
+ *
+ * 2) we must log any new names for any file or dir that is in the fsync
+ * log. ---> check inode while renaming/linking.
+ *
+ * 2a) we must log any new names for any file or dir during rename
+ * when the directory they are being removed from was logged.
+ * ---> check inode and old parent dir during rename
+ *
+ * 2a is actually the more important variant. With the extra logging
+ * a crash might unlink the old name without recreating the new one
+ *
+ * 3) after a crash, we must go through any directories with a link count
+ * of zero and redo the rm -rf
+ *
+ * mkdir f1/foo
+ * normal commit
+ * rm -rf f1/foo
+ * fsync(f1)
+ *
+ * The directory f1 was fully removed from the FS, but fsync was never
+ * called on f1, only its parent dir. After a crash the rm -rf must
+ * be replayed. This must be able to recurse down the entire
+ * directory tree. The inode link count fixup code takes care of the
+ * ugly details.
+ */
+
+/*
+ * stages for the tree walking. The first
+ * stage (0) is to only pin down the blocks we find
+ * the second stage (1) is to make sure that all the inodes
+ * we find in the log are created in the subvolume.
+ *
+ * The last stage is to deal with directories and links and extents
+ * and all the other fun semantics
+ */
+#define LOG_WALK_PIN_ONLY 0
+#define LOG_WALK_REPLAY_INODES 1
+#define LOG_WALK_REPLAY_ALL 2
+
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ int inode_only);
+static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid);
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dirid, int del_all);
+
+/*
+ * tree logging is a special write ahead log used to make sure that
+ * fsyncs and O_SYNCs can happen without doing full tree commits.
+ *
+ * Full tree commits are expensive because they require commonly
+ * modified blocks to be recowed, creating many dirty pages in the
+ * extent tree an 4x-6x higher write load than ext3.
+ *
+ * Instead of doing a tree commit on every fsync, we use the
+ * key ranges and transaction ids to find items for a given file or directory
+ * that have changed in this transaction. Those items are copied into
+ * a special tree (one per subvolume root), that tree is written to disk
+ * and then the fsync is considered complete.
+ *
+ * After a crash, items are copied out of the log-tree back into the
+ * subvolume tree. Any file data extents found are recorded in the extent
+ * allocation tree, and the log-tree freed.
+ *
+ * The log tree is read three times, once to pin down all the extents it is
+ * using in ram and once, once to create all the inodes logged in the tree
+ * and once to do all the other items.
+ */
+
+/*
+ * start a sub transaction and setup the log tree
+ * this increments the log tree writer count to make the people
+ * syncing the tree wait for us to finish
+ */
+static int start_log_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ int ret;
+
+ mutex_lock(&root->log_mutex);
+ if (root->log_root) {
+ if (!root->log_start_pid) {
+ root->log_start_pid = current->pid;
+ root->log_multiple_pids = false;
+ } else if (root->log_start_pid != current->pid) {
+ root->log_multiple_pids = true;
+ }
+
+ root->log_batch++;
+ atomic_inc(&root->log_writers);
+ mutex_unlock(&root->log_mutex);
+ return 0;
+ }
+ root->log_multiple_pids = false;
+ root->log_start_pid = current->pid;
+ mutex_lock(&root->fs_info->tree_log_mutex);
+ if (!root->fs_info->log_root_tree) {
+ ret = btrfs_init_log_root_tree(trans, root->fs_info);
+ BUG_ON(ret);
+ }
+ if (!root->log_root) {
+ ret = btrfs_add_log_tree(trans, root);
+ BUG_ON(ret);
+ }
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ root->log_batch++;
+ atomic_inc(&root->log_writers);
+ mutex_unlock(&root->log_mutex);
+ return 0;
+}
+
+/*
+ * returns 0 if there was a log transaction running and we were able
+ * to join, or returns -ENOENT if there were not transactions
+ * in progress
+ */
+static int join_running_log_trans(struct btrfs_root *root)
+{
+ int ret = -ENOENT;
+
+ smp_mb();
+ if (!root->log_root)
+ return -ENOENT;
+
+ mutex_lock(&root->log_mutex);
+ if (root->log_root) {
+ ret = 0;
+ atomic_inc(&root->log_writers);
+ }
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
+/*
+ * This either makes the current running log transaction wait
+ * until you call btrfs_end_log_trans() or it makes any future
+ * log transactions wait until you call btrfs_end_log_trans()
+ */
+int btrfs_pin_log_trans(struct btrfs_root *root)
+{
+ int ret = -ENOENT;
+
+ mutex_lock(&root->log_mutex);
+ atomic_inc(&root->log_writers);
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
+/*
+ * indicate we're done making changes to the log tree
+ * and wake up anyone waiting to do a sync
+ */
+int btrfs_end_log_trans(struct btrfs_root *root)
+{
+ if (atomic_dec_and_test(&root->log_writers)) {
+ smp_mb();
+ if (waitqueue_active(&root->log_writer_wait))
+ wake_up(&root->log_writer_wait);
+ }
+ return 0;
+}
+
+
+/*
+ * the walk control struct is used to pass state down the chain when
+ * processing the log tree. The stage field tells us which part
+ * of the log tree processing we are currently doing. The others
+ * are state fields used for that specific part
+ */
+struct walk_control {
+ /* should we free the extent on disk when done? This is used
+ * at transaction commit time while freeing a log tree
+ */
+ int free;
+
+ /* should we write out the extent buffer? This is used
+ * while flushing the log tree to disk during a sync
+ */
+ int write;
+
+ /* should we wait for the extent buffer io to finish? Also used
+ * while flushing the log tree to disk for a sync
+ */
+ int wait;
+
+ /* pin only walk, we record which extents on disk belong to the
+ * log trees
+ */
+ int pin;
+
+ /* what stage of the replay code we're currently in */
+ int stage;
+
+ /* the root we are currently replaying */
+ struct btrfs_root *replay_dest;
+
+ /* the trans handle for the current replay */
+ struct btrfs_trans_handle *trans;
+
+ /* the function that gets used to process blocks we find in the
+ * tree. Note the extent_buffer might not be up to date when it is
+ * passed in, and it must be checked or read if you need the data
+ * inside it
+ */
+ int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen);
+};
+
+/*
+ * process_func used to pin down extents, write them or wait on them
+ */
+static int process_one_buffer(struct btrfs_root *log,
+ struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen)
+{
+ if (wc->pin)
+ btrfs_pin_extent(log->fs_info->extent_root,
+ eb->start, eb->len, 0);
+
+ if (btrfs_buffer_uptodate(eb, gen)) {
+ if (wc->write)
+ btrfs_write_tree_block(eb);
+ if (wc->wait)
+ btrfs_wait_tree_block_writeback(eb);
+ }
+ return 0;
+}
+
+/*
+ * Item overwrite used by replay and tree logging. eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree the existing item is
+ * overwritten. If the existing item isn't big enough, it is extended.
+ * If it is too large, it is truncated.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ */
+static noinline int overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int ret;
+ u32 item_size;
+ u64 saved_i_size = 0;
+ int save_old_i_size = 0;
+ unsigned long src_ptr;
+ unsigned long dst_ptr;
+ int overwrite_root = 0;
+
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ overwrite_root = 1;
+
+ item_size = btrfs_item_size_nr(eb, slot);
+ src_ptr = btrfs_item_ptr_offset(eb, slot);
+
+ /* look for the key in the destination tree */
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret == 0) {
+ char *src_copy;
+ char *dst_copy;
+ u32 dst_size = btrfs_item_size_nr(path->nodes[0],
+ path->slots[0]);
+ if (dst_size != item_size)
+ goto insert;
+
+ if (item_size == 0) {
+ btrfs_release_path(root, path);
+ return 0;
+ }
+ dst_copy = kmalloc(item_size, GFP_NOFS);
+ src_copy = kmalloc(item_size, GFP_NOFS);
+
+ read_extent_buffer(eb, src_copy, src_ptr, item_size);
+
+ dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+ read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
+ item_size);
+ ret = memcmp(dst_copy, src_copy, item_size);
+
+ kfree(dst_copy);
+ kfree(src_copy);
+ /*
+ * they have the same contents, just return, this saves
+ * us from cowing blocks in the destination tree and doing
+ * extra writes that may not have been done by a previous
+ * sync
+ */
+ if (ret == 0) {
+ btrfs_release_path(root, path);
+ return 0;
+ }
+
+ }
+insert:
+ btrfs_release_path(root, path);
+ /* try to insert the key into the destination tree */
+ ret = btrfs_insert_empty_item(trans, root, path,
+ key, item_size);
+
+ /* make sure any existing item is the correct size */
+ if (ret == -EEXIST) {
+ u32 found_size;
+ found_size = btrfs_item_size_nr(path->nodes[0],
+ path->slots[0]);
+ if (found_size > item_size) {
+ btrfs_truncate_item(trans, root, path, item_size, 1);
+ } else if (found_size < item_size) {
+ ret = btrfs_extend_item(trans, root, path,
+ item_size - found_size);
+ BUG_ON(ret);
+ }
+ } else if (ret) {
+ BUG();
+ }
+ dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+
+ /* don't overwrite an existing inode if the generation number
+ * was logged as zero. This is done when the tree logging code
+ * is just logging an inode to make sure it exists after recovery.
+ *
+ * Also, don't overwrite i_size on directories during replay.
+ * log replay inserts and removes directory items based on the
+ * state of the tree found in the subvolume, and i_size is modified
+ * as it goes
+ */
+ if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
+ struct btrfs_inode_item *src_item;
+ struct btrfs_inode_item *dst_item;
+
+ src_item = (struct btrfs_inode_item *)src_ptr;
+ dst_item = (struct btrfs_inode_item *)dst_ptr;
+
+ if (btrfs_inode_generation(eb, src_item) == 0)
+ goto no_copy;
+
+ if (overwrite_root &&
+ S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
+ S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
+ save_old_i_size = 1;
+ saved_i_size = btrfs_inode_size(path->nodes[0],
+ dst_item);
+ }
+ }
+
+ copy_extent_buffer(path->nodes[0], eb, dst_ptr,
+ src_ptr, item_size);
+
+ if (save_old_i_size) {
+ struct btrfs_inode_item *dst_item;
+ dst_item = (struct btrfs_inode_item *)dst_ptr;
+ btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
+ }
+
+ /* make sure the generation is filled in */
+ if (key->type == BTRFS_INODE_ITEM_KEY) {
+ struct btrfs_inode_item *dst_item;
+ dst_item = (struct btrfs_inode_item *)dst_ptr;
+ if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
+ btrfs_set_inode_generation(path->nodes[0], dst_item,
+ trans->transid);
+ }
+ }
+no_copy:
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_release_path(root, path);
+ return 0;
+}
+
+/*
+ * simple helper to read an inode off the disk from a given root
+ * This can only be called for subvolume roots and not for the log
+ */
+static noinline struct inode *read_one_inode(struct btrfs_root *root,
+ u64 objectid)
+{
+ struct btrfs_key key;
+ struct inode *inode;
+
+ key.objectid = objectid;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ inode = btrfs_iget(root->fs_info->sb, &key, root);
+ if (IS_ERR(inode)) {
+ inode = NULL;
+ } else if (is_bad_inode(inode)) {
+ iput(inode);
+ inode = NULL;
+ }
+ return inode;
+}
+
+/* replays a single extent in 'eb' at 'slot' with 'key' into the
+ * subvolume 'root'. path is released on entry and should be released
+ * on exit.
+ *
+ * extents in the log tree have not been allocated out of the extent
+ * tree yet. So, this completes the allocation, taking a reference
+ * as required if the extent already exists or creating a new extent
+ * if it isn't in the extent allocation tree yet.
+ *
+ * The extent is inserted into the file, dropping any existing extents
+ * from the file that overlap the new one.
+ */
+static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int found_type;
+ u64 mask = root->sectorsize - 1;
+ u64 extent_end;
+ u64 alloc_hint;
+ u64 start = key->offset;
+ u64 saved_nbytes;
+ struct btrfs_file_extent_item *item;
+ struct inode *inode = NULL;
+ unsigned long size;
+ int ret = 0;
+
+ item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(eb, item);
+
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC)
+ extent_end = start + btrfs_file_extent_num_bytes(eb, item);
+ else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ size = btrfs_file_extent_inline_len(eb, item);
+ extent_end = (start + size + mask) & ~mask;
+ } else {
+ ret = 0;
+ goto out;
+ }
+
+ inode = read_one_inode(root, key->objectid);
+ if (!inode) {
+ ret = -EIO;
+ goto out;
+ }
+
+ /*
+ * first check to see if we already have this extent in the
+ * file. This must be done before the btrfs_drop_extents run
+ * so we don't try to drop this extent.
+ */
+ ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ start, 0);
+
+ if (ret == 0 &&
+ (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
+ struct btrfs_file_extent_item cmp1;
+ struct btrfs_file_extent_item cmp2;
+ struct btrfs_file_extent_item *existing;
+ struct extent_buffer *leaf;
+
+ leaf = path->nodes[0];
+ existing = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ read_extent_buffer(eb, &cmp1, (unsigned long)item,
+ sizeof(cmp1));
+ read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
+ sizeof(cmp2));
+
+ /*
+ * we already have a pointer to this exact extent,
+ * we don't have to do anything
+ */
+ if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
+ btrfs_release_path(root, path);
+ goto out;
+ }
+ }
+ btrfs_release_path(root, path);
+
+ saved_nbytes = inode_get_bytes(inode);
+ /* drop any overlapping extents */
+ ret = btrfs_drop_extents(trans, inode, start, extent_end,
+ &alloc_hint, 1);
+ BUG_ON(ret);
+
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 offset;
+ unsigned long dest_offset;
+ struct btrfs_key ins;
+
+ ret = btrfs_insert_empty_item(trans, root, path, key,
+ sizeof(*item));
+ BUG_ON(ret);
+ dest_offset = btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+ copy_extent_buffer(path->nodes[0], eb, dest_offset,
+ (unsigned long)item, sizeof(*item));
+
+ ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+ ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ offset = key->offset - btrfs_file_extent_offset(eb, item);
+
+ if (ins.objectid > 0) {
+ u64 csum_start;
+ u64 csum_end;
+ LIST_HEAD(ordered_sums);
+ /*
+ * is this extent already allocated in the extent
+ * allocation tree? If so, just add a reference
+ */
+ ret = btrfs_lookup_extent(root, ins.objectid,
+ ins.offset);
+ if (ret == 0) {
+ ret = btrfs_inc_extent_ref(trans, root,
+ ins.objectid, ins.offset,
+ 0, root->root_key.objectid,
+ key->objectid, offset);
+ } else {
+ /*
+ * insert the extent pointer in the extent
+ * allocation tree
+ */
+ ret = btrfs_alloc_logged_file_extent(trans,
+ root, root->root_key.objectid,
+ key->objectid, offset, &ins);
+ BUG_ON(ret);
+ }
+ btrfs_release_path(root, path);
+
+ if (btrfs_file_extent_compression(eb, item)) {
+ csum_start = ins.objectid;
+ csum_end = csum_start + ins.offset;
+ } else {
+ csum_start = ins.objectid +
+ btrfs_file_extent_offset(eb, item);
+ csum_end = csum_start +
+ btrfs_file_extent_num_bytes(eb, item);
+ }
+
+ ret = btrfs_lookup_csums_range(root->log_root,
+ csum_start, csum_end - 1,
+ &ordered_sums);
+ BUG_ON(ret);
+ while (!list_empty(&ordered_sums)) {
+ struct btrfs_ordered_sum *sums;
+ sums = list_entry(ordered_sums.next,
+ struct btrfs_ordered_sum,
+ list);
+ ret = btrfs_csum_file_blocks(trans,
+ root->fs_info->csum_root,
+ sums);
+ BUG_ON(ret);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ } else {
+ btrfs_release_path(root, path);
+ }
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ /* inline extents are easy, we just overwrite them */
+ ret = overwrite_item(trans, root, path, eb, slot, key);
+ BUG_ON(ret);
+ }
+
+ inode_set_bytes(inode, saved_nbytes);
+ btrfs_update_inode(trans, root, inode);
+out:
+ if (inode)
+ iput(inode);
+ return ret;
+}
+
+/*
+ * when cleaning up conflicts between the directory names in the
+ * subvolume, directory names in the log and directory names in the
+ * inode back references, we may have to unlink inodes from directories.
+ *
+ * This is a helper function to do the unlink of a specific directory
+ * item
+ */
+static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct inode *dir,
+ struct btrfs_dir_item *di)
+{
+ struct inode *inode;
+ char *name;
+ int name_len;
+ struct extent_buffer *leaf;
+ struct btrfs_key location;
+ int ret;
+
+ leaf = path->nodes[0];
+
+ btrfs_dir_item_key_to_cpu(leaf, di, &location);
+ name_len = btrfs_dir_name_len(leaf, di);
+ name = kmalloc(name_len, GFP_NOFS);
+ read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
+ btrfs_release_path(root, path);
+
+ inode = read_one_inode(root, location.objectid);
+ BUG_ON(!inode);
+
+ ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ BUG_ON(ret);
+
+ ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+ BUG_ON(ret);
+ kfree(name);
+
+ iput(inode);
+ return ret;
+}
+
+/*
+ * helper function to see if a given name and sequence number found
+ * in an inode back reference are already in a directory and correctly
+ * point to this inode
+ */
+static noinline int inode_in_dir(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 dirid, u64 objectid, u64 index,
+ const char *name, int name_len)
+{
+ struct btrfs_dir_item *di;
+ struct btrfs_key location;
+ int match = 0;
+
+ di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
+ index, name, name_len, 0);
+ if (di && !IS_ERR(di)) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ if (location.objectid != objectid)
+ goto out;
+ } else
+ goto out;
+ btrfs_release_path(root, path);
+
+ di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
+ if (di && !IS_ERR(di)) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ if (location.objectid != objectid)
+ goto out;
+ } else
+ goto out;
+ match = 1;
+out:
+ btrfs_release_path(root, path);
+ return match;
+}
+
+/*
+ * helper function to check a log tree for a named back reference in
+ * an inode. This is used to decide if a back reference that is
+ * found in the subvolume conflicts with what we find in the log.
+ *
+ * inode backreferences may have multiple refs in a single item,
+ * during replay we process one reference at a time, and we don't
+ * want to delete valid links to a file from the subvolume if that
+ * link is also in the log.
+ */
+static noinline int backref_in_log(struct btrfs_root *log,
+ struct btrfs_key *key,
+ char *name, int namelen)
+{
+ struct btrfs_path *path;
+ struct btrfs_inode_ref *ref;
+ unsigned long ptr;
+ unsigned long ptr_end;
+ unsigned long name_ptr;
+ int found_name_len;
+ int item_size;
+ int ret;
+ int match = 0;
+
+ path = btrfs_alloc_path();
+ ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
+ if (ret != 0)
+ goto out;
+
+ item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+ ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+ ptr_end = ptr + item_size;
+ while (ptr < ptr_end) {
+ ref = (struct btrfs_inode_ref *)ptr;
+ found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
+ if (found_name_len == namelen) {
+ name_ptr = (unsigned long)(ref + 1);
+ ret = memcmp_extent_buffer(path->nodes[0], name,
+ name_ptr, namelen);
+ if (ret == 0) {
+ match = 1;
+ goto out;
+ }
+ }
+ ptr = (unsigned long)(ref + 1) + found_name_len;
+ }
+out:
+ btrfs_free_path(path);
+ return match;
+}
+
+
+/*
+ * replay one inode back reference item found in the log tree.
+ * eb, slot and key refer to the buffer and key found in the log tree.
+ * root is the destination we are replaying into, and path is for temp
+ * use by this function. (it should be released on return).
+ */
+static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ struct inode *dir;
+ int ret;
+ struct btrfs_key location;
+ struct btrfs_inode_ref *ref;
+ struct btrfs_dir_item *di;
+ struct inode *inode;
+ char *name;
+ int namelen;
+ unsigned long ref_ptr;
+ unsigned long ref_end;
+
+ location.objectid = key->objectid;
+ location.type = BTRFS_INODE_ITEM_KEY;
+ location.offset = 0;
+
+ /*
+ * it is possible that we didn't log all the parent directories
+ * for a given inode. If we don't find the dir, just don't
+ * copy the back ref in. The link count fixup code will take
+ * care of the rest
+ */
+ dir = read_one_inode(root, key->offset);
+ if (!dir)
+ return -ENOENT;
+
+ inode = read_one_inode(root, key->objectid);
+ BUG_ON(!inode);
+
+ ref_ptr = btrfs_item_ptr_offset(eb, slot);
+ ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+
+again:
+ ref = (struct btrfs_inode_ref *)ref_ptr;
+
+ namelen = btrfs_inode_ref_name_len(eb, ref);
+ name = kmalloc(namelen, GFP_NOFS);
+ BUG_ON(!name);
+
+ read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
+
+ /* if we already have a perfect match, we're done */
+ if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
+ btrfs_inode_ref_index(eb, ref),
+ name, namelen)) {
+ goto out;
+ }
+
+ /*
+ * look for a conflicting back reference in the metadata.
+ * if we find one we have to unlink that name of the file
+ * before we add our new link. Later on, we overwrite any
+ * existing back reference, and we don't want to create
+ * dangling pointers in the directory.
+ */
+conflict_again:
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret == 0) {
+ char *victim_name;
+ int victim_name_len;
+ struct btrfs_inode_ref *victim_ref;
+ unsigned long ptr;
+ unsigned long ptr_end;
+ struct extent_buffer *leaf = path->nodes[0];
+
+ /* are we trying to overwrite a back ref for the root directory
+ * if so, just jump out, we're done
+ */
+ if (key->objectid == key->offset)
+ goto out_nowrite;
+
+ /* check all the names in this back reference to see
+ * if they are in the log. if so, we allow them to stay
+ * otherwise they must be unlinked as a conflict
+ */
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
+ while (ptr < ptr_end) {
+ victim_ref = (struct btrfs_inode_ref *)ptr;
+ victim_name_len = btrfs_inode_ref_name_len(leaf,
+ victim_ref);
+ victim_name = kmalloc(victim_name_len, GFP_NOFS);
+ BUG_ON(!victim_name);
+
+ read_extent_buffer(leaf, victim_name,
+ (unsigned long)(victim_ref + 1),
+ victim_name_len);
+
+ if (!backref_in_log(log, key, victim_name,
+ victim_name_len)) {
+ btrfs_inc_nlink(inode);
+ btrfs_release_path(root, path);
+
+ ret = btrfs_unlink_inode(trans, root, dir,
+ inode, victim_name,
+ victim_name_len);
+ kfree(victim_name);
+ btrfs_release_path(root, path);
+ goto conflict_again;
+ }
+ kfree(victim_name);
+ ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
+ }
+ BUG_ON(ret);
+ }
+ btrfs_release_path(root, path);
+
+ /* look for a conflicting sequence number */
+ di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+ btrfs_inode_ref_index(eb, ref),
+ name, namelen, 0);
+ if (di && !IS_ERR(di)) {
+ ret = drop_one_dir_item(trans, root, path, dir, di);
+ BUG_ON(ret);
+ }
+ btrfs_release_path(root, path);
+
+
+ /* look for a conflicting name */
+ di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ name, namelen, 0);
+ if (di && !IS_ERR(di)) {
+ ret = drop_one_dir_item(trans, root, path, dir, di);
+ BUG_ON(ret);
+ }
+ btrfs_release_path(root, path);
+
+ /* insert our name */
+ ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
+ btrfs_inode_ref_index(eb, ref));
+ BUG_ON(ret);
+
+ btrfs_update_inode(trans, root, inode);
+
+out:
+ ref_ptr = (unsigned long)(ref + 1) + namelen;
+ kfree(name);
+ if (ref_ptr < ref_end)
+ goto again;
+
+ /* finally write the back reference in the inode */
+ ret = overwrite_item(trans, root, path, eb, slot, key);
+ BUG_ON(ret);
+
+out_nowrite:
+ btrfs_release_path(root, path);
+ iput(dir);
+ iput(inode);
+ return 0;
+}
+
+static int insert_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset)
+{
+ int ret;
+ ret = btrfs_find_orphan_item(root, offset);
+ if (ret > 0)
+ ret = btrfs_insert_orphan_item(trans, root, offset);
+ return ret;
+}
+
+
+/*
+ * There are a few corners where the link count of the file can't
+ * be properly maintained during replay. So, instead of adding
+ * lots of complexity to the log code, we just scan the backrefs
+ * for any file that has been through replay.
+ *
+ * The scan will update the link count on the inode to reflect the
+ * number of back refs found. If it goes down to zero, the iput
+ * will free the inode.
+ */
+static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct btrfs_key key;
+ u64 nlink = 0;
+ unsigned long ptr;
+ unsigned long ptr_end;
+ int name_len;
+
+ key.objectid = inode->i_ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+
+ path = btrfs_alloc_path();
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid != inode->i_ino ||
+ key.type != BTRFS_INODE_REF_KEY)
+ break;
+ ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+ ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
+ path->slots[0]);
+ while (ptr < ptr_end) {
+ struct btrfs_inode_ref *ref;
+
+ ref = (struct btrfs_inode_ref *)ptr;
+ name_len = btrfs_inode_ref_name_len(path->nodes[0],
+ ref);
+ ptr = (unsigned long)(ref + 1) + name_len;
+ nlink++;
+ }
+
+ if (key.offset == 0)
+ break;
+ key.offset--;
+ btrfs_release_path(root, path);
+ }
+ btrfs_release_path(root, path);
+ if (nlink != inode->i_nlink) {
+ inode->i_nlink = nlink;
+ btrfs_update_inode(trans, root, inode);
+ }
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+
+ if (inode->i_nlink == 0) {
+ if (S_ISDIR(inode->i_mode)) {
+ ret = replay_dir_deletes(trans, root, NULL, path,
+ inode->i_ino, 1);
+ BUG_ON(ret);
+ }
+ ret = insert_orphan_item(trans, root, inode->i_ino);
+ BUG_ON(ret);
+ }
+ btrfs_free_path(path);
+
+ return 0;
+}
+
+static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path)
+{
+ int ret;
+ struct btrfs_key key;
+ struct inode *inode;
+
+ key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = (u64)-1;
+ while (1) {
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ break;
+
+ if (ret == 1) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
+ key.type != BTRFS_ORPHAN_ITEM_KEY)
+ break;
+
+ ret = btrfs_del_item(trans, root, path);
+ BUG_ON(ret);
+
+ btrfs_release_path(root, path);
+ inode = read_one_inode(root, key.offset);
+ BUG_ON(!inode);
+
+ ret = fixup_inode_link_count(trans, root, inode);
+ BUG_ON(ret);
+
+ iput(inode);
+
+ /*
+ * fixup on a directory may create new entries,
+ * make sure we always look for the highset possible
+ * offset
+ */
+ key.offset = (u64)-1;
+ }
+ btrfs_release_path(root, path);
+ return 0;
+}
+
+
+/*
+ * record a given inode in the fixup dir so we can check its link
+ * count when replay is done. The link count is incremented here
+ * so the inode won't go away until we check it
+ */
+static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 objectid)
+{
+ struct btrfs_key key;
+ int ret = 0;
+ struct inode *inode;
+
+ inode = read_one_inode(root, objectid);
+ BUG_ON(!inode);
+
+ key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+ btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.offset = objectid;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+
+ btrfs_release_path(root, path);
+ if (ret == 0) {
+ btrfs_inc_nlink(inode);
+ btrfs_update_inode(trans, root, inode);
+ } else if (ret == -EEXIST) {
+ ret = 0;
+ } else {
+ BUG();
+ }
+ iput(inode);
+
+ return ret;
+}
+
+/*
+ * when replaying the log for a directory, we only insert names
+ * for inodes that actually exist. This means an fsync on a directory
+ * does not implicitly fsync all the new files in it
+ */
+static noinline int insert_one_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 dirid, u64 index,
+ char *name, int name_len, u8 type,
+ struct btrfs_key *location)
+{
+ struct inode *inode;
+ struct inode *dir;
+ int ret;
+
+ inode = read_one_inode(root, location->objectid);
+ if (!inode)
+ return -ENOENT;
+
+ dir = read_one_inode(root, dirid);
+ if (!dir) {
+ iput(inode);
+ return -EIO;
+ }
+ ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
+
+ /* FIXME, put inode into FIXUP list */
+
+ iput(inode);
+ iput(dir);
+ return ret;
+}
+
+/*
+ * take a single entry in a log directory item and replay it into
+ * the subvolume.
+ *
+ * if a conflicting item exists in the subdirectory already,
+ * the inode it points to is unlinked and put into the link count
+ * fix up tree.
+ *
+ * If a name from the log points to a file or directory that does
+ * not exist in the FS, it is skipped. fsyncs on directories
+ * do not force down inodes inside that directory, just changes to the
+ * names or unlinks in a directory.
+ */
+static noinline int replay_one_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb,
+ struct btrfs_dir_item *di,
+ struct btrfs_key *key)
+{
+ char *name;
+ int name_len;
+ struct btrfs_dir_item *dst_di;
+ struct btrfs_key found_key;
+ struct btrfs_key log_key;
+ struct inode *dir;
+ u8 log_type;
+ int exists;
+ int ret;
+
+ dir = read_one_inode(root, key->objectid);
+ BUG_ON(!dir);
+
+ name_len = btrfs_dir_name_len(eb, di);
+ name = kmalloc(name_len, GFP_NOFS);
+ log_type = btrfs_dir_type(eb, di);
+ read_extent_buffer(eb, name, (unsigned long)(di + 1),
+ name_len);
+
+ btrfs_dir_item_key_to_cpu(eb, di, &log_key);
+ exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
+ if (exists == 0)
+ exists = 1;
+ else
+ exists = 0;
+ btrfs_release_path(root, path);
+
+ if (key->type == BTRFS_DIR_ITEM_KEY) {
+ dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
+ name, name_len, 1);
+ } else if (key->type == BTRFS_DIR_INDEX_KEY) {
+ dst_di = btrfs_lookup_dir_index_item(trans, root, path,
+ key->objectid,
+ key->offset, name,
+ name_len, 1);
+ } else {
+ BUG();
+ }
+ if (!dst_di || IS_ERR(dst_di)) {
+ /* we need a sequence number to insert, so we only
+ * do inserts for the BTRFS_DIR_INDEX_KEY types
+ */
+ if (key->type != BTRFS_DIR_INDEX_KEY)
+ goto out;
+ goto insert;
+ }
+
+ btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+ /* the existing item matches the logged item */
+ if (found_key.objectid == log_key.objectid &&
+ found_key.type == log_key.type &&
+ found_key.offset == log_key.offset &&
+ btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
+ goto out;
+ }
+
+ /*
+ * don't drop the conflicting directory entry if the inode
+ * for the new entry doesn't exist
+ */
+ if (!exists)
+ goto out;
+
+ ret = drop_one_dir_item(trans, root, path, dir, dst_di);
+ BUG_ON(ret);
+
+ if (key->type == BTRFS_DIR_INDEX_KEY)
+ goto insert;
+out:
+ btrfs_release_path(root, path);
+ kfree(name);
+ iput(dir);
+ return 0;
+
+insert:
+ btrfs_release_path(root, path);
+ ret = insert_one_name(trans, root, path, key->objectid, key->offset,
+ name, name_len, log_type, &log_key);
+
+ BUG_ON(ret && ret != -ENOENT);
+ goto out;
+}
+
+/*
+ * find all the names in a directory item and reconcile them into
+ * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
+ * one name in a directory item, but the same code gets used for
+ * both directory index types
+ */
+static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int ret;
+ u32 item_size = btrfs_item_size_nr(eb, slot);
+ struct btrfs_dir_item *di;
+ int name_len;
+ unsigned long ptr;
+ unsigned long ptr_end;
+
+ ptr = btrfs_item_ptr_offset(eb, slot);
+ ptr_end = ptr + item_size;
+ while (ptr < ptr_end) {
+ di = (struct btrfs_dir_item *)ptr;
+ name_len = btrfs_dir_name_len(eb, di);
+ ret = replay_one_name(trans, root, path, eb, di, key);
+ BUG_ON(ret);
+ ptr = (unsigned long)(di + 1);
+ ptr += name_len;
+ }
+ return 0;
+}
+
+/*
+ * directory replay has two parts. There are the standard directory
+ * items in the log copied from the subvolume, and range items
+ * created in the log while the subvolume was logged.
+ *
+ * The range items tell us which parts of the key space the log
+ * is authoritative for. During replay, if a key in the subvolume
+ * directory is in a logged range item, but not actually in the log
+ * that means it was deleted from the directory before the fsync
+ * and should be removed.
+ */
+static noinline int find_dir_range(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 dirid, int key_type,
+ u64 *start_ret, u64 *end_ret)
+{
+ struct btrfs_key key;
+ u64 found_end;
+ struct btrfs_dir_log_item *item;
+ int ret;
+ int nritems;
+
+ if (*start_ret == (u64)-1)
+ return 1;
+
+ key.objectid = dirid;
+ key.type = key_type;
+ key.offset = *start_ret;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ }
+ if (ret != 0)
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type != key_type || key.objectid != dirid) {
+ ret = 1;
+ goto next;
+ }
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_dir_log_item);
+ found_end = btrfs_dir_log_end(path->nodes[0], item);
+
+ if (*start_ret >= key.offset && *start_ret <= found_end) {
+ ret = 0;
+ *start_ret = key.offset;
+ *end_ret = found_end;
+ goto out;
+ }
+ ret = 1;
+next:
+ /* check the next slot in the tree to see if it is a valid item */
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ goto out;
+ } else {
+ path->slots[0]++;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type != key_type || key.objectid != dirid) {
+ ret = 1;
+ goto out;
+ }
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_dir_log_item);
+ found_end = btrfs_dir_log_end(path->nodes[0], item);
+ *start_ret = key.offset;
+ *end_ret = found_end;
+ ret = 0;
+out:
+ btrfs_release_path(root, path);
+ return ret;
+}
+
+/*
+ * this looks for a given directory item in the log. If the directory
+ * item is not in the log, the item is removed and the inode it points
+ * to is unlinked
+ */
+static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ struct btrfs_path *log_path,
+ struct inode *dir,
+ struct btrfs_key *dir_key)
+{
+ int ret;
+ struct extent_buffer *eb;
+ int slot;
+ u32 item_size;
+ struct btrfs_dir_item *di;
+ struct btrfs_dir_item *log_di;
+ int name_len;
+ unsigned long ptr;
+ unsigned long ptr_end;
+ char *name;
+ struct inode *inode;
+ struct btrfs_key location;
+
+again:
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ item_size = btrfs_item_size_nr(eb, slot);
+ ptr = btrfs_item_ptr_offset(eb, slot);
+ ptr_end = ptr + item_size;
+ while (ptr < ptr_end) {
+ di = (struct btrfs_dir_item *)ptr;
+ name_len = btrfs_dir_name_len(eb, di);
+ name = kmalloc(name_len, GFP_NOFS);
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ read_extent_buffer(eb, name, (unsigned long)(di + 1),
+ name_len);
+ log_di = NULL;
+ if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
+ log_di = btrfs_lookup_dir_item(trans, log, log_path,
+ dir_key->objectid,
+ name, name_len, 0);
+ } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
+ log_di = btrfs_lookup_dir_index_item(trans, log,
+ log_path,
+ dir_key->objectid,
+ dir_key->offset,
+ name, name_len, 0);
+ }
+ if (!log_di || IS_ERR(log_di)) {
+ btrfs_dir_item_key_to_cpu(eb, di, &location);
+ btrfs_release_path(root, path);
+ btrfs_release_path(log, log_path);
+ inode = read_one_inode(root, location.objectid);
+ BUG_ON(!inode);
+
+ ret = link_to_fixup_dir(trans, root,
+ path, location.objectid);
+ BUG_ON(ret);
+ btrfs_inc_nlink(inode);
+ ret = btrfs_unlink_inode(trans, root, dir, inode,
+ name, name_len);
+ BUG_ON(ret);
+ kfree(name);
+ iput(inode);
+
+ /* there might still be more names under this key
+ * check and repeat if required
+ */
+ ret = btrfs_search_slot(NULL, root, dir_key, path,
+ 0, 0);
+ if (ret == 0)
+ goto again;
+ ret = 0;
+ goto out;
+ }
+ btrfs_release_path(log, log_path);
+ kfree(name);
+
+ ptr = (unsigned long)(di + 1);
+ ptr += name_len;
+ }
+ ret = 0;
+out:
+ btrfs_release_path(root, path);
+ btrfs_release_path(log, log_path);
+ return ret;
+}
+
+/*
+ * deletion replay happens before we copy any new directory items
+ * out of the log or out of backreferences from inodes. It
+ * scans the log to find ranges of keys that log is authoritative for,
+ * and then scans the directory to find items in those ranges that are
+ * not present in the log.
+ *
+ * Anything we don't find in the log is unlinked and removed from the
+ * directory.
+ */
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dirid, int del_all)
+{
+ u64 range_start;
+ u64 range_end;
+ int key_type = BTRFS_DIR_LOG_ITEM_KEY;
+ int ret = 0;
+ struct btrfs_key dir_key;
+ struct btrfs_key found_key;
+ struct btrfs_path *log_path;
+ struct inode *dir;
+
+ dir_key.objectid = dirid;
+ dir_key.type = BTRFS_DIR_ITEM_KEY;
+ log_path = btrfs_alloc_path();
+ if (!log_path)
+ return -ENOMEM;
+
+ dir = read_one_inode(root, dirid);
+ /* it isn't an error if the inode isn't there, that can happen
+ * because we replay the deletes before we copy in the inode item
+ * from the log
+ */
+ if (!dir) {
+ btrfs_free_path(log_path);
+ return 0;
+ }
+again:
+ range_start = 0;
+ range_end = 0;
+ while (1) {
+ if (del_all)
+ range_end = (u64)-1;
+ else {
+ ret = find_dir_range(log, path, dirid, key_type,
+ &range_start, &range_end);
+ if (ret != 0)
+ break;
+ }
+
+ dir_key.offset = range_start;
+ while (1) {
+ int nritems;
+ ret = btrfs_search_slot(NULL, root, &dir_key, path,
+ 0, 0);
+ if (ret < 0)
+ goto out;
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ break;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (found_key.objectid != dirid ||
+ found_key.type != dir_key.type)
+ goto next_type;
+
+ if (found_key.offset > range_end)
+ break;
+
+ ret = check_item_in_log(trans, root, log, path,
+ log_path, dir,
+ &found_key);
+ BUG_ON(ret);
+ if (found_key.offset == (u64)-1)
+ break;
+ dir_key.offset = found_key.offset + 1;
+ }
+ btrfs_release_path(root, path);
+ if (range_end == (u64)-1)
+ break;
+ range_start = range_end + 1;
+ }
+
+next_type:
+ ret = 0;
+ if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
+ key_type = BTRFS_DIR_LOG_INDEX_KEY;
+ dir_key.type = BTRFS_DIR_INDEX_KEY;
+ btrfs_release_path(root, path);
+ goto again;
+ }
+out:
+ btrfs_release_path(root, path);
+ btrfs_free_path(log_path);
+ iput(dir);
+ return ret;
+}
+
+/*
+ * the process_func used to replay items from the log tree. This
+ * gets called in two different stages. The first stage just looks
+ * for inodes and makes sure they are all copied into the subvolume.
+ *
+ * The second stage copies all the other item types from the log into
+ * the subvolume. The two stage approach is slower, but gets rid of
+ * lots of complexity around inodes referencing other inodes that exist
+ * only in the log (references come from either directory items or inode
+ * back refs).
+ */
+static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen)
+{
+ int nritems;
+ struct btrfs_path *path;
+ struct btrfs_root *root = wc->replay_dest;
+ struct btrfs_key key;
+ u32 item_size;
+ int level;
+ int i;
+ int ret;
+
+ btrfs_read_buffer(eb, gen);
+
+ level = btrfs_header_level(eb);
+
+ if (level != 0)
+ return 0;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ nritems = btrfs_header_nritems(eb);
+ for (i = 0; i < nritems; i++) {
+ btrfs_item_key_to_cpu(eb, &key, i);
+ item_size = btrfs_item_size_nr(eb, i);
+
+ /* inode keys are done during the first stage */
+ if (key.type == BTRFS_INODE_ITEM_KEY &&
+ wc->stage == LOG_WALK_REPLAY_INODES) {
+ struct btrfs_inode_item *inode_item;
+ u32 mode;
+
+ inode_item = btrfs_item_ptr(eb, i,
+ struct btrfs_inode_item);
+ mode = btrfs_inode_mode(eb, inode_item);
+ if (S_ISDIR(mode)) {
+ ret = replay_dir_deletes(wc->trans,
+ root, log, path, key.objectid, 0);
+ BUG_ON(ret);
+ }
+ ret = overwrite_item(wc->trans, root, path,
+ eb, i, &key);
+ BUG_ON(ret);
+
+ /* for regular files, make sure corresponding
+ * orhpan item exist. extents past the new EOF
+ * will be truncated later by orphan cleanup.
+ */
+ if (S_ISREG(mode)) {
+ ret = insert_orphan_item(wc->trans, root,
+ key.objectid);
+ BUG_ON(ret);
+ }
+
+ ret = link_to_fixup_dir(wc->trans, root,
+ path, key.objectid);
+ BUG_ON(ret);
+ }
+ if (wc->stage < LOG_WALK_REPLAY_ALL)
+ continue;
+
+ /* these keys are simply copied */
+ if (key.type == BTRFS_XATTR_ITEM_KEY) {
+ ret = overwrite_item(wc->trans, root, path,
+ eb, i, &key);
+ BUG_ON(ret);
+ } else if (key.type == BTRFS_INODE_REF_KEY) {
+ ret = add_inode_ref(wc->trans, root, log, path,
+ eb, i, &key);
+ BUG_ON(ret && ret != -ENOENT);
+ } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
+ ret = replay_one_extent(wc->trans, root, path,
+ eb, i, &key);
+ BUG_ON(ret);
+ } else if (key.type == BTRFS_DIR_ITEM_KEY ||
+ key.type == BTRFS_DIR_INDEX_KEY) {
+ ret = replay_one_dir_item(wc->trans, root, path,
+ eb, i, &key);
+ BUG_ON(ret);
+ }
+ }
+ btrfs_free_path(path);
+ return 0;
+}
+
+static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int *level,
+ struct walk_control *wc)
+{
+ u64 root_owner;
+ u64 root_gen;
+ u64 bytenr;
+ u64 ptr_gen;
+ struct extent_buffer *next;
+ struct extent_buffer *cur;
+ struct extent_buffer *parent;
+ u32 blocksize;
+ int ret = 0;
+
+ WARN_ON(*level < 0);
+ WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+ while (*level > 0) {
+ WARN_ON(*level < 0);
+ WARN_ON(*level >= BTRFS_MAX_LEVEL);
+ cur = path->nodes[*level];
+
+ if (btrfs_header_level(cur) != *level)
+ WARN_ON(1);
+
+ if (path->slots[*level] >=
+ btrfs_header_nritems(cur))
+ break;
+
+ bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+ ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+ blocksize = btrfs_level_size(root, *level - 1);
+
+ parent = path->nodes[*level];
+ root_owner = btrfs_header_owner(parent);
+ root_gen = btrfs_header_generation(parent);
+
+ next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+
+ wc->process_func(root, next, wc, ptr_gen);
+
+ if (*level == 1) {
+ path->slots[*level]++;
+ if (wc->free) {
+ btrfs_read_buffer(next, ptr_gen);
+
+ btrfs_tree_lock(next);
+ clean_tree_block(trans, root, next);
+ btrfs_set_lock_blocking(next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+
+ WARN_ON(root_owner !=
+ BTRFS_TREE_LOG_OBJECTID);
+ ret = btrfs_free_reserved_extent(root,
+ bytenr, blocksize);
+ BUG_ON(ret);
+ }
+ free_extent_buffer(next);
+ continue;
+ }
+ btrfs_read_buffer(next, ptr_gen);
+
+ WARN_ON(*level <= 0);
+ if (path->nodes[*level-1])
+ free_extent_buffer(path->nodes[*level-1]);
+ path->nodes[*level-1] = next;
+ *level = btrfs_header_level(next);
+ path->slots[*level] = 0;
+ cond_resched();
+ }
+ WARN_ON(*level < 0);
+ WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+ if (path->nodes[*level] == root->node)
+ parent = path->nodes[*level];
+ else
+ parent = path->nodes[*level + 1];
+
+ bytenr = path->nodes[*level]->start;
+
+ blocksize = btrfs_level_size(root, *level);
+ root_owner = btrfs_header_owner(parent);
+ root_gen = btrfs_header_generation(parent);
+
+ wc->process_func(root, path->nodes[*level], wc,
+ btrfs_header_generation(path->nodes[*level]));
+
+ if (wc->free) {
+ next = path->nodes[*level];
+ btrfs_tree_lock(next);
+ clean_tree_block(trans, root, next);
+ btrfs_set_lock_blocking(next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+
+ WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
+ ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
+ BUG_ON(ret);
+ }
+ free_extent_buffer(path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ *level += 1;
+
+ cond_resched();
+ return 0;
+}
+
+static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int *level,
+ struct walk_control *wc)
+{
+ u64 root_owner;
+ u64 root_gen;
+ int i;
+ int slot;
+ int ret;
+
+ for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+ slot = path->slots[i];
+ if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
+ struct extent_buffer *node;
+ node = path->nodes[i];
+ path->slots[i]++;
+ *level = i;
+ WARN_ON(*level == 0);
+ return 0;
+ } else {
+ struct extent_buffer *parent;
+ if (path->nodes[*level] == root->node)
+ parent = path->nodes[*level];
+ else
+ parent = path->nodes[*level + 1];
+
+ root_owner = btrfs_header_owner(parent);
+ root_gen = btrfs_header_generation(parent);
+ wc->process_func(root, path->nodes[*level], wc,
+ btrfs_header_generation(path->nodes[*level]));
+ if (wc->free) {
+ struct extent_buffer *next;
+
+ next = path->nodes[*level];
+
+ btrfs_tree_lock(next);
+ clean_tree_block(trans, root, next);
+ btrfs_set_lock_blocking(next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+
+ WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
+ ret = btrfs_free_reserved_extent(root,
+ path->nodes[*level]->start,
+ path->nodes[*level]->len);
+ BUG_ON(ret);
+ }
+ free_extent_buffer(path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ *level = i + 1;
+ }
+ }
+ return 1;
+}
+
+/*
+ * drop the reference count on the tree rooted at 'snap'. This traverses
+ * the tree freeing any blocks that have a ref count of zero after being
+ * decremented.
+ */
+static int walk_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log, struct walk_control *wc)
+{
+ int ret = 0;
+ int wret;
+ int level;
+ struct btrfs_path *path;
+ int i;
+ int orig_level;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ level = btrfs_header_level(log->node);
+ orig_level = level;
+ path->nodes[level] = log->node;
+ extent_buffer_get(log->node);
+ path->slots[level] = 0;
+
+ while (1) {
+ wret = walk_down_log_tree(trans, log, path, &level, wc);
+ if (wret > 0)
+ break;
+ if (wret < 0)
+ ret = wret;
+
+ wret = walk_up_log_tree(trans, log, path, &level, wc);
+ if (wret > 0)
+ break;
+ if (wret < 0)
+ ret = wret;
+ }
+
+ /* was the root node processed? if not, catch it here */
+ if (path->nodes[orig_level]) {
+ wc->process_func(log, path->nodes[orig_level], wc,
+ btrfs_header_generation(path->nodes[orig_level]));
+ if (wc->free) {
+ struct extent_buffer *next;
+
+ next = path->nodes[orig_level];
+
+ btrfs_tree_lock(next);
+ clean_tree_block(trans, log, next);
+ btrfs_set_lock_blocking(next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+
+ WARN_ON(log->root_key.objectid !=
+ BTRFS_TREE_LOG_OBJECTID);
+ ret = btrfs_free_reserved_extent(log, next->start,
+ next->len);
+ BUG_ON(ret);
+ }
+ }
+
+ for (i = 0; i <= orig_level; i++) {
+ if (path->nodes[i]) {
+ free_extent_buffer(path->nodes[i]);
+ path->nodes[i] = NULL;
+ }
+ }
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper function to update the item for a given subvolumes log root
+ * in the tree of log roots
+ */
+static int update_log_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log)
+{
+ int ret;
+
+ if (log->log_transid == 1) {
+ /* insert root item on the first sync */
+ ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
+ &log->root_key, &log->root_item);
+ } else {
+ ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
+ &log->root_key, &log->root_item);
+ }
+ return ret;
+}
+
+static int wait_log_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, unsigned long transid)
+{
+ DEFINE_WAIT(wait);
+ int index = transid % 2;
+
+ /*
+ * we only allow two pending log transactions at a time,
+ * so we know that if ours is more than 2 older than the
+ * current transaction, we're done
+ */
+ do {
+ prepare_to_wait(&root->log_commit_wait[index],
+ &wait, TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&root->log_mutex);
+
+ if (root->fs_info->last_trans_log_full_commit !=
+ trans->transid && root->log_transid < transid + 2 &&
+ atomic_read(&root->log_commit[index]))
+ schedule();
+
+ finish_wait(&root->log_commit_wait[index], &wait);
+ mutex_lock(&root->log_mutex);
+ } while (root->log_transid < transid + 2 &&
+ atomic_read(&root->log_commit[index]));
+ return 0;
+}
+
+static int wait_for_writer(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ DEFINE_WAIT(wait);
+ while (atomic_read(&root->log_writers)) {
+ prepare_to_wait(&root->log_writer_wait,
+ &wait, TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&root->log_mutex);
+ if (root->fs_info->last_trans_log_full_commit !=
+ trans->transid && atomic_read(&root->log_writers))
+ schedule();
+ mutex_lock(&root->log_mutex);
+ finish_wait(&root->log_writer_wait, &wait);
+ }
+ return 0;
+}
+
+/*
+ * btrfs_sync_log does sends a given tree log down to the disk and
+ * updates the super blocks to record it. When this call is done,
+ * you know that any inodes previously logged are safely on disk only
+ * if it returns 0.
+ *
+ * Any other return value means you need to call btrfs_commit_transaction.
+ * Some of the edge cases for fsyncing directories that have had unlinks
+ * or renames done in the past mean that sometimes the only safe
+ * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
+ * that has happened.
+ */
+int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ int index1;
+ int index2;
+ int mark;
+ int ret;
+ struct btrfs_root *log = root->log_root;
+ struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
+ unsigned long log_transid = 0;
+
+ mutex_lock(&root->log_mutex);
+ index1 = root->log_transid % 2;
+ if (atomic_read(&root->log_commit[index1])) {
+ wait_log_commit(trans, root, root->log_transid);
+ mutex_unlock(&root->log_mutex);
+ return 0;
+ }
+ atomic_set(&root->log_commit[index1], 1);
+
+ /* wait for previous tree log sync to complete */
+ if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
+ wait_log_commit(trans, root, root->log_transid - 1);
+
+ while (1) {
+ unsigned long batch = root->log_batch;
+ if (root->log_multiple_pids) {
+ mutex_unlock(&root->log_mutex);
+ schedule_timeout_uninterruptible(1);
+ mutex_lock(&root->log_mutex);
+ }
+ wait_for_writer(trans, root);
+ if (batch == root->log_batch)
+ break;
+ }
+
+ /* bail out if we need to do a full commit */
+ if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ ret = -EAGAIN;
+ mutex_unlock(&root->log_mutex);
+ goto out;
+ }
+
+ log_transid = root->log_transid;
+ if (log_transid % 2 == 0)
+ mark = EXTENT_DIRTY;
+ else
+ mark = EXTENT_NEW;
+
+ /* we start IO on all the marked extents here, but we don't actually
+ * wait for them until later.
+ */
+ ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
+ BUG_ON(ret);
+
+ btrfs_set_root_node(&log->root_item, log->node);
+
+ root->log_batch = 0;
+ root->log_transid++;
+ log->log_transid = root->log_transid;
+ root->log_start_pid = 0;
+ smp_mb();
+ /*
+ * IO has been started, blocks of the log tree have WRITTEN flag set
+ * in their headers. new modifications of the log will be written to
+ * new positions. so it's safe to allow log writers to go in.
+ */
+ mutex_unlock(&root->log_mutex);
+
+ mutex_lock(&log_root_tree->log_mutex);
+ log_root_tree->log_batch++;
+ atomic_inc(&log_root_tree->log_writers);
+ mutex_unlock(&log_root_tree->log_mutex);
+
+ ret = update_log_root(trans, log);
+ BUG_ON(ret);
+
+ mutex_lock(&log_root_tree->log_mutex);
+ if (atomic_dec_and_test(&log_root_tree->log_writers)) {
+ smp_mb();
+ if (waitqueue_active(&log_root_tree->log_writer_wait))
+ wake_up(&log_root_tree->log_writer_wait);
+ }
+
+ index2 = log_root_tree->log_transid % 2;
+ if (atomic_read(&log_root_tree->log_commit[index2])) {
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ wait_log_commit(trans, log_root_tree,
+ log_root_tree->log_transid);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out;
+ }
+ atomic_set(&log_root_tree->log_commit[index2], 1);
+
+ if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
+ wait_log_commit(trans, log_root_tree,
+ log_root_tree->log_transid - 1);
+ }
+
+ wait_for_writer(trans, log_root_tree);
+
+ /*
+ * now that we've moved on to the tree of log tree roots,
+ * check the full commit flag again
+ */
+ if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = -EAGAIN;
+ goto out_wake_log_root;
+ }
+
+ ret = btrfs_write_and_wait_marked_extents(log_root_tree,
+ &log_root_tree->dirty_log_pages,
+ EXTENT_DIRTY | EXTENT_NEW);
+ BUG_ON(ret);
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+
+ btrfs_set_super_log_root(&root->fs_info->super_for_commit,
+ log_root_tree->node->start);
+ btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
+ btrfs_header_level(log_root_tree->node));
+
+ log_root_tree->log_batch = 0;
+ log_root_tree->log_transid++;
+ smp_mb();
+
+ mutex_unlock(&log_root_tree->log_mutex);
+
+ /*
+ * nobody else is going to jump in and write the the ctree
+ * super here because the log_commit atomic below is protecting
+ * us. We must be called with a transaction handle pinning
+ * the running transaction open, so a full commit can't hop
+ * in and cause problems either.
+ */
+ write_ctree_super(trans, root->fs_info->tree_root, 1);
+ ret = 0;
+
+ mutex_lock(&root->log_mutex);
+ if (root->last_log_commit < log_transid)
+ root->last_log_commit = log_transid;
+ mutex_unlock(&root->log_mutex);
+
+out_wake_log_root:
+ atomic_set(&log_root_tree->log_commit[index2], 0);
+ smp_mb();
+ if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
+ wake_up(&log_root_tree->log_commit_wait[index2]);
+out:
+ atomic_set(&root->log_commit[index1], 0);
+ smp_mb();
+ if (waitqueue_active(&root->log_commit_wait[index1]))
+ wake_up(&root->log_commit_wait[index1]);
+ return 0;
+}
+
+/*
+ * free all the extents used by the tree log. This should be called
+ * at commit time of the full transaction
+ */
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+{
+ int ret;
+ struct btrfs_root *log;
+ struct key;
+ u64 start;
+ u64 end;
+ struct walk_control wc = {
+ .free = 1,
+ .process_func = process_one_buffer
+ };
+
+ if (!root->log_root || root->fs_info->log_root_recovering)
+ return 0;
+
+ log = root->log_root;
+ ret = walk_log_tree(trans, log, &wc);
+ BUG_ON(ret);
+
+ while (1) {
+ ret = find_first_extent_bit(&log->dirty_log_pages,
+ 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
+ if (ret)
+ break;
+
+ clear_extent_bits(&log->dirty_log_pages, start, end,
+ EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
+ }
+
+ if (log->log_transid > 0) {
+ ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
+ &log->root_key);
+ BUG_ON(ret);
+ }
+ root->log_root = NULL;
+ free_extent_buffer(log->node);
+ kfree(log);
+ return 0;
+}
+
+/*
+ * If both a file and directory are logged, and unlinks or renames are
+ * mixed in, we have a few interesting corners:
+ *
+ * create file X in dir Y
+ * link file X to X.link in dir Y
+ * fsync file X
+ * unlink file X but leave X.link
+ * fsync dir Y
+ *
+ * After a crash we would expect only X.link to exist. But file X
+ * didn't get fsync'd again so the log has back refs for X and X.link.
+ *
+ * We solve this by removing directory entries and inode backrefs from the
+ * log when a file that was logged in the current transaction is
+ * unlinked. Any later fsync will include the updated log entries, and
+ * we'll be able to reconstruct the proper directory items from backrefs.
+ *
+ * This optimizations allows us to avoid relogging the entire inode
+ * or the entire directory.
+ */
+int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct inode *dir, u64 index)
+{
+ struct btrfs_root *log;
+ struct btrfs_dir_item *di;
+ struct btrfs_path *path;
+ int ret;
+ int bytes_del = 0;
+
+ if (BTRFS_I(dir)->logged_trans < trans->transid)
+ return 0;
+
+ ret = join_running_log_trans(root);
+ if (ret)
+ return 0;
+
+ mutex_lock(&BTRFS_I(dir)->log_mutex);
+
+ log = root->log_root;
+ path = btrfs_alloc_path();
+ di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
+ name, name_len, -1);
+ if (di && !IS_ERR(di)) {
+ ret = btrfs_delete_one_dir_name(trans, log, path, di);
+ bytes_del += name_len;
+ BUG_ON(ret);
+ }
+ btrfs_release_path(log, path);
+ di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
+ index, name, name_len, -1);
+ if (di && !IS_ERR(di)) {
+ ret = btrfs_delete_one_dir_name(trans, log, path, di);
+ bytes_del += name_len;
+ BUG_ON(ret);
+ }
+
+ /* update the directory size in the log to reflect the names
+ * we have removed
+ */
+ if (bytes_del) {
+ struct btrfs_key key;
+
+ key.objectid = dir->i_ino;
+ key.offset = 0;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ btrfs_release_path(log, path);
+
+ ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
+ if (ret == 0) {
+ struct btrfs_inode_item *item;
+ u64 i_size;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ i_size = btrfs_inode_size(path->nodes[0], item);
+ if (i_size > bytes_del)
+ i_size -= bytes_del;
+ else
+ i_size = 0;
+ btrfs_set_inode_size(path->nodes[0], item, i_size);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ } else
+ ret = 0;
+ btrfs_release_path(log, path);
+ }
+
+ btrfs_free_path(path);
+ mutex_unlock(&BTRFS_I(dir)->log_mutex);
+ btrfs_end_log_trans(root);
+
+ return 0;
+}
+
+/* see comments for btrfs_del_dir_entries_in_log */
+int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct inode *inode, u64 dirid)
+{
+ struct btrfs_root *log;
+ u64 index;
+ int ret;
+
+ if (BTRFS_I(inode)->logged_trans < trans->transid)
+ return 0;
+
+ ret = join_running_log_trans(root);
+ if (ret)
+ return 0;
+ log = root->log_root;
+ mutex_lock(&BTRFS_I(inode)->log_mutex);
+
+ ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
+ dirid, &index);
+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
+ btrfs_end_log_trans(root);
+
+ return ret;
+}
+
+/*
+ * creates a range item in the log for 'dirid'. first_offset and
+ * last_offset tell us which parts of the key space the log should
+ * be considered authoritative for.
+ */
+static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ int key_type, u64 dirid,
+ u64 first_offset, u64 last_offset)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_dir_log_item *item;
+
+ key.objectid = dirid;
+ key.offset = first_offset;
+ if (key_type == BTRFS_DIR_ITEM_KEY)
+ key.type = BTRFS_DIR_LOG_ITEM_KEY;
+ else
+ key.type = BTRFS_DIR_LOG_INDEX_KEY;
+ ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
+ BUG_ON(ret);
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_dir_log_item);
+ btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_release_path(log, path);
+ return 0;
+}
+
+/*
+ * log all the items included in the current transaction for a given
+ * directory. This also creates the range items in the log tree required
+ * to replay anything deleted before the fsync
+ */
+static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path, int key_type,
+ u64 min_offset, u64 *last_offset_ret)
+{
+ struct btrfs_key min_key;
+ struct btrfs_key max_key;
+ struct btrfs_root *log = root->log_root;
+ struct extent_buffer *src;
+ int ret;
+ int i;
+ int nritems;
+ u64 first_offset = min_offset;
+ u64 last_offset = (u64)-1;
+
+ log = root->log_root;
+ max_key.objectid = inode->i_ino;
+ max_key.offset = (u64)-1;
+ max_key.type = key_type;
+
+ min_key.objectid = inode->i_ino;
+ min_key.type = key_type;
+ min_key.offset = min_offset;
+
+ path->keep_locks = 1;
+
+ ret = btrfs_search_forward(root, &min_key, &max_key,
+ path, 0, trans->transid);
+
+ /*
+ * we didn't find anything from this transaction, see if there
+ * is anything at all
+ */
+ if (ret != 0 || min_key.objectid != inode->i_ino ||
+ min_key.type != key_type) {
+ min_key.objectid = inode->i_ino;
+ min_key.type = key_type;
+ min_key.offset = (u64)-1;
+ btrfs_release_path(root, path);
+ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+ if (ret < 0) {
+ btrfs_release_path(root, path);
+ return ret;
+ }
+ ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+
+ /* if ret == 0 there are items for this type,
+ * create a range to tell us the last key of this type.
+ * otherwise, there are no items in this directory after
+ * *min_offset, and we create a range to indicate that.
+ */
+ if (ret == 0) {
+ struct btrfs_key tmp;
+ btrfs_item_key_to_cpu(path->nodes[0], &tmp,
+ path->slots[0]);
+ if (key_type == tmp.type)
+ first_offset = max(min_offset, tmp.offset) + 1;
+ }
+ goto done;
+ }
+
+ /* go backward to find any previous key */
+ ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+ if (ret == 0) {
+ struct btrfs_key tmp;
+ btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+ if (key_type == tmp.type) {
+ first_offset = tmp.offset;
+ ret = overwrite_item(trans, log, dst_path,
+ path->nodes[0], path->slots[0],
+ &tmp);
+ }
+ }
+ btrfs_release_path(root, path);
+
+ /* find the first key from this transaction again */
+ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+ if (ret != 0) {
+ WARN_ON(1);
+ goto done;
+ }
+
+ /*
+ * we have a block from this transaction, log every item in it
+ * from our directory
+ */
+ while (1) {
+ struct btrfs_key tmp;
+ src = path->nodes[0];
+ nritems = btrfs_header_nritems(src);
+ for (i = path->slots[0]; i < nritems; i++) {
+ btrfs_item_key_to_cpu(src, &min_key, i);
+
+ if (min_key.objectid != inode->i_ino ||
+ min_key.type != key_type)
+ goto done;
+ ret = overwrite_item(trans, log, dst_path, src, i,
+ &min_key);
+ BUG_ON(ret);
+ }
+ path->slots[0] = nritems;
+
+ /*
+ * look ahead to the next item and see if it is also
+ * from this directory and from this transaction
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 1) {
+ last_offset = (u64)-1;
+ goto done;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+ if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
+ last_offset = (u64)-1;
+ goto done;
+ }
+ if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
+ ret = overwrite_item(trans, log, dst_path,
+ path->nodes[0], path->slots[0],
+ &tmp);
+
+ BUG_ON(ret);
+ last_offset = tmp.offset;
+ goto done;
+ }
+ }
+done:
+ *last_offset_ret = last_offset;
+ btrfs_release_path(root, path);
+ btrfs_release_path(log, dst_path);
+
+ /* insert the log range keys to indicate where the log is valid */
+ ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
+ first_offset, last_offset);
+ BUG_ON(ret);
+ return 0;
+}
+
+/*
+ * logging directories is very similar to logging inodes, We find all the items
+ * from the current transaction and write them to the log.
+ *
+ * The recovery code scans the directory in the subvolume, and if it finds a
+ * key in the range logged that is not present in the log tree, then it means
+ * that dir entry was unlinked during the transaction.
+ *
+ * In order for that scan to work, we must include one key smaller than
+ * the smallest logged by this transaction and one key larger than the largest
+ * key logged by this transaction.
+ */
+static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path)
+{
+ u64 min_key;
+ u64 max_key;
+ int ret;
+ int key_type = BTRFS_DIR_ITEM_KEY;
+
+again:
+ min_key = 0;
+ max_key = 0;
+ while (1) {
+ ret = log_dir_items(trans, root, inode, path,
+ dst_path, key_type, min_key,
+ &max_key);
+ BUG_ON(ret);
+ if (max_key == (u64)-1)
+ break;
+ min_key = max_key + 1;
+ }
+
+ if (key_type == BTRFS_DIR_ITEM_KEY) {
+ key_type = BTRFS_DIR_INDEX_KEY;
+ goto again;
+ }
+ return 0;
+}
+
+/*
+ * a helper function to drop items from the log before we relog an
+ * inode. max_key_type indicates the highest item type to remove.
+ * This cannot be run for file data extents because it does not
+ * free the extents they point to.
+ */
+static int drop_objectid_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 objectid, int max_key_type)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+
+ key.objectid = objectid;
+ key.type = max_key_type;
+ key.offset = (u64)-1;
+
+ while (1) {
+ ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
+
+ if (ret != 1)
+ break;
+
+ if (path->slots[0] == 0)
+ break;
+
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+
+ if (found_key.objectid != objectid)
+ break;
+
+ ret = btrfs_del_item(trans, log, path);
+ BUG_ON(ret);
+ btrfs_release_path(log, path);
+ }
+ btrfs_release_path(log, path);
+ return 0;
+}
+
+static noinline int copy_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *dst_path,
+ struct extent_buffer *src,
+ int start_slot, int nr, int inode_only)
+{
+ unsigned long src_offset;
+ unsigned long dst_offset;
+ struct btrfs_file_extent_item *extent;
+ struct btrfs_inode_item *inode_item;
+ int ret;
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+ char *ins_data;
+ int i;
+ struct list_head ordered_sums;
+
+ INIT_LIST_HEAD(&ordered_sums);
+
+ ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
+ nr * sizeof(u32), GFP_NOFS);
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+
+ for (i = 0; i < nr; i++) {
+ ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
+ btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
+ }
+ ret = btrfs_insert_empty_items(trans, log, dst_path,
+ ins_keys, ins_sizes, nr);
+ BUG_ON(ret);
+
+ for (i = 0; i < nr; i++, dst_path->slots[0]++) {
+ dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
+ dst_path->slots[0]);
+
+ src_offset = btrfs_item_ptr_offset(src, start_slot + i);
+
+ copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+ src_offset, ins_sizes[i]);
+
+ if (inode_only == LOG_INODE_EXISTS &&
+ ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
+ inode_item = btrfs_item_ptr(dst_path->nodes[0],
+ dst_path->slots[0],
+ struct btrfs_inode_item);
+ btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
+
+ /* set the generation to zero so the recover code
+ * can tell the difference between an logging
+ * just to say 'this inode exists' and a logging
+ * to say 'update this inode with these values'
+ */
+ btrfs_set_inode_generation(dst_path->nodes[0],
+ inode_item, 0);
+ }
+ /* take a reference on file data extents so that truncates
+ * or deletes of this inode don't have to relog the inode
+ * again
+ */
+ if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
+ int found_type;
+ extent = btrfs_item_ptr(src, start_slot + i,
+ struct btrfs_file_extent_item);
+
+ found_type = btrfs_file_extent_type(src, extent);
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 ds, dl, cs, cl;
+ ds = btrfs_file_extent_disk_bytenr(src,
+ extent);
+ /* ds == 0 is a hole */
+ if (ds == 0)
+ continue;
+
+ dl = btrfs_file_extent_disk_num_bytes(src,
+ extent);
+ cs = btrfs_file_extent_offset(src, extent);
+ cl = btrfs_file_extent_num_bytes(src,
+ extent);
+ if (btrfs_file_extent_compression(src,
+ extent)) {
+ cs = 0;
+ cl = dl;
+ }
+
+ ret = btrfs_lookup_csums_range(
+ log->fs_info->csum_root,
+ ds + cs, ds + cs + cl - 1,
+ &ordered_sums);
+ BUG_ON(ret);
+ }
+ }
+ }
+
+ btrfs_mark_buffer_dirty(dst_path->nodes[0]);
+ btrfs_release_path(log, dst_path);
+ kfree(ins_data);
+
+ /*
+ * we have to do this after the loop above to avoid changing the
+ * log tree while trying to change the log tree.
+ */
+ while (!list_empty(&ordered_sums)) {
+ struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
+ struct btrfs_ordered_sum,
+ list);
+ ret = btrfs_csum_file_blocks(trans, log, sums);
+ BUG_ON(ret);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ return 0;
+}
+
+/* log a single inode in the tree log.
+ * At least one parent directory for this inode must exist in the tree
+ * or be logged already.
+ *
+ * Any items from this inode changed by the current transaction are copied
+ * to the log tree. An extra reference is taken on any extents in this
+ * file, allowing us to avoid a whole pile of corner cases around logging
+ * blocks that have been removed from the tree.
+ *
+ * See LOG_INODE_ALL and related defines for a description of what inode_only
+ * does.
+ *
+ * This handles both files and directories.
+ */
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ int inode_only)
+{
+ struct btrfs_path *path;
+ struct btrfs_path *dst_path;
+ struct btrfs_key min_key;
+ struct btrfs_key max_key;
+ struct btrfs_root *log = root->log_root;
+ struct extent_buffer *src = NULL;
+ u32 size;
+ int ret;
+ int nritems;
+ int ins_start_slot = 0;
+ int ins_nr;
+
+ log = root->log_root;
+
+ path = btrfs_alloc_path();
+ dst_path = btrfs_alloc_path();
+
+ min_key.objectid = inode->i_ino;
+ min_key.type = BTRFS_INODE_ITEM_KEY;
+ min_key.offset = 0;
+
+ max_key.objectid = inode->i_ino;
+
+ /* today the code can only do partial logging of directories */
+ if (!S_ISDIR(inode->i_mode))
+ inode_only = LOG_INODE_ALL;
+
+ if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
+ else
+ max_key.type = (u8)-1;
+ max_key.offset = (u64)-1;
+
+ mutex_lock(&BTRFS_I(inode)->log_mutex);
+
+ /*
+ * a brute force approach to making sure we get the most uptodate
+ * copies of everything.
+ */
+ if (S_ISDIR(inode->i_mode)) {
+ int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
+
+ if (inode_only == LOG_INODE_EXISTS)
+ max_key_type = BTRFS_XATTR_ITEM_KEY;
+ ret = drop_objectid_items(trans, log, path,
+ inode->i_ino, max_key_type);
+ } else {
+ ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
+ }
+ BUG_ON(ret);
+ path->keep_locks = 1;
+
+ while (1) {
+ ins_nr = 0;
+ ret = btrfs_search_forward(root, &min_key, &max_key,
+ path, 0, trans->transid);
+ if (ret != 0)
+ break;
+again:
+ /* note, ins_nr might be > 0 here, cleanup outside the loop */
+ if (min_key.objectid != inode->i_ino)
+ break;
+ if (min_key.type > max_key.type)
+ break;
+
+ src = path->nodes[0];
+ size = btrfs_item_size_nr(src, path->slots[0]);
+ if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
+ ins_nr++;
+ goto next_slot;
+ } else if (!ins_nr) {
+ ins_start_slot = path->slots[0];
+ ins_nr = 1;
+ goto next_slot;
+ }
+
+ ret = copy_items(trans, log, dst_path, src, ins_start_slot,
+ ins_nr, inode_only);
+ BUG_ON(ret);
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
+next_slot:
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ path->slots[0]++;
+ if (path->slots[0] < nritems) {
+ btrfs_item_key_to_cpu(path->nodes[0], &min_key,
+ path->slots[0]);
+ goto again;
+ }
+ if (ins_nr) {
+ ret = copy_items(trans, log, dst_path, src,
+ ins_start_slot,
+ ins_nr, inode_only);
+ BUG_ON(ret);
+ ins_nr = 0;
+ }
+ btrfs_release_path(root, path);
+
+ if (min_key.offset < (u64)-1)
+ min_key.offset++;
+ else if (min_key.type < (u8)-1)
+ min_key.type++;
+ else if (min_key.objectid < (u64)-1)
+ min_key.objectid++;
+ else
+ break;
+ }
+ if (ins_nr) {
+ ret = copy_items(trans, log, dst_path, src,
+ ins_start_slot,
+ ins_nr, inode_only);
+ BUG_ON(ret);
+ ins_nr = 0;
+ }
+ WARN_ON(ins_nr);
+ if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
+ btrfs_release_path(root, path);
+ btrfs_release_path(log, dst_path);
+ ret = log_directory_changes(trans, root, inode, path, dst_path);
+ BUG_ON(ret);
+ }
+ BTRFS_I(inode)->logged_trans = trans->transid;
+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
+
+ btrfs_free_path(path);
+ btrfs_free_path(dst_path);
+ return 0;
+}
+
+/*
+ * follow the dentry parent pointers up the chain and see if any
+ * of the directories in it require a full commit before they can
+ * be logged. Returns zero if nothing special needs to be done or 1 if
+ * a full commit is required.
+ */
+static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct dentry *parent,
+ struct super_block *sb,
+ u64 last_committed)
+{
+ int ret = 0;
+ struct btrfs_root *root;
+
+ /*
+ * for regular files, if its inode is already on disk, we don't
+ * have to worry about the parents at all. This is because
+ * we can use the last_unlink_trans field to record renames
+ * and other fun in this file.
+ */
+ if (S_ISREG(inode->i_mode) &&
+ BTRFS_I(inode)->generation <= last_committed &&
+ BTRFS_I(inode)->last_unlink_trans <= last_committed)
+ goto out;
+
+ if (!S_ISDIR(inode->i_mode)) {
+ if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ goto out;
+ inode = parent->d_inode;
+ }
+
+ while (1) {
+ BTRFS_I(inode)->logged_trans = trans->transid;
+ smp_mb();
+
+ if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+ root = BTRFS_I(inode)->root;
+
+ /*
+ * make sure any commits to the log are forced
+ * to be full commits
+ */
+ root->fs_info->last_trans_log_full_commit =
+ trans->transid;
+ ret = 1;
+ break;
+ }
+
+ if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ break;
+
+ if (IS_ROOT(parent))
+ break;
+
+ parent = parent->d_parent;
+ inode = parent->d_inode;
+
+ }
+out:
+ return ret;
+}
+
+static int inode_in_log(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+
+ mutex_lock(&root->log_mutex);
+ if (BTRFS_I(inode)->logged_trans == trans->transid &&
+ BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
+ ret = 1;
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
+
+/*
+ * helper function around btrfs_log_inode to make sure newly created
+ * parent directories also end up in the log. A minimal inode and backref
+ * only logging is done of any parent directories that are older than
+ * the last committed transaction
+ */
+int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct dentry *parent, int exists_only)
+{
+ int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
+ struct super_block *sb;
+ int ret = 0;
+ u64 last_committed = root->fs_info->last_trans_committed;
+
+ sb = inode->i_sb;
+
+ if (btrfs_test_opt(root, NOTREELOG)) {
+ ret = 1;
+ goto end_no_trans;
+ }
+
+ if (root->fs_info->last_trans_log_full_commit >
+ root->fs_info->last_trans_committed) {
+ ret = 1;
+ goto end_no_trans;
+ }
+
+ if (root != BTRFS_I(inode)->root ||
+ btrfs_root_refs(&root->root_item) == 0) {
+ ret = 1;
+ goto end_no_trans;
+ }
+
+ ret = check_parent_dirs_for_sync(trans, inode, parent,
+ sb, last_committed);
+ if (ret)
+ goto end_no_trans;
+
+ if (inode_in_log(trans, inode)) {
+ ret = BTRFS_NO_LOG_SYNC;
+ goto end_no_trans;
+ }
+
+ start_log_trans(trans, root);
+
+ ret = btrfs_log_inode(trans, root, inode, inode_only);
+ BUG_ON(ret);
+
+ /*
+ * for regular files, if its inode is already on disk, we don't
+ * have to worry about the parents at all. This is because
+ * we can use the last_unlink_trans field to record renames
+ * and other fun in this file.
+ */
+ if (S_ISREG(inode->i_mode) &&
+ BTRFS_I(inode)->generation <= last_committed &&
+ BTRFS_I(inode)->last_unlink_trans <= last_committed)
+ goto no_parent;
+
+ inode_only = LOG_INODE_EXISTS;
+ while (1) {
+ if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ break;
+
+ inode = parent->d_inode;
+ if (root != BTRFS_I(inode)->root)
+ break;
+
+ if (BTRFS_I(inode)->generation >
+ root->fs_info->last_trans_committed) {
+ ret = btrfs_log_inode(trans, root, inode, inode_only);
+ BUG_ON(ret);
+ }
+ if (IS_ROOT(parent))
+ break;
+
+ parent = parent->d_parent;
+ }
+no_parent:
+ ret = 0;
+ btrfs_end_log_trans(root);
+end_no_trans:
+ return ret;
+}
+
+/*
+ * it is not safe to log dentry if the chunk root has added new
+ * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
+ * If this returns 1, you must commit the transaction to safely get your
+ * data on disk.
+ */
+int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct dentry *dentry)
+{
+ return btrfs_log_inode_parent(trans, root, dentry->d_inode,
+ dentry->d_parent, 0);
+}
+
+/*
+ * should be called during mount to recover any replay any log trees
+ * from the FS
+ */
+int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_key tmp_key;
+ struct btrfs_root *log;
+ struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
+ struct walk_control wc = {
+ .process_func = process_one_buffer,
+ .stage = 0,
+ };
+
+ fs_info->log_root_recovering = 1;
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ trans = btrfs_start_transaction(fs_info->tree_root, 1);
+
+ wc.trans = trans;
+ wc.pin = 1;
+
+ walk_log_tree(trans, log_root_tree, &wc);
+
+again:
+ key.objectid = BTRFS_TREE_LOG_OBJECTID;
+ key.offset = (u64)-1;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ btrfs_release_path(log_root_tree, path);
+ if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ break;
+
+ log = btrfs_read_fs_root_no_radix(log_root_tree,
+ &found_key);
+ BUG_ON(!log);
+
+
+ tmp_key.objectid = found_key.offset;
+ tmp_key.type = BTRFS_ROOT_ITEM_KEY;
+ tmp_key.offset = (u64)-1;
+
+ wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
+ BUG_ON(!wc.replay_dest);
+
+ wc.replay_dest->log_root = log;
+ btrfs_record_root_in_trans(trans, wc.replay_dest);
+ ret = walk_log_tree(trans, log, &wc);
+ BUG_ON(ret);
+
+ if (wc.stage == LOG_WALK_REPLAY_ALL) {
+ ret = fixup_inode_link_counts(trans, wc.replay_dest,
+ path);
+ BUG_ON(ret);
+ }
+
+ key.offset = found_key.offset - 1;
+ wc.replay_dest->log_root = NULL;
+ free_extent_buffer(log->node);
+ free_extent_buffer(log->commit_root);
+ kfree(log);
+
+ if (found_key.offset == 0)
+ break;
+ }
+ btrfs_release_path(log_root_tree, path);
+
+ /* step one is to pin it all, step two is to replay just inodes */
+ if (wc.pin) {
+ wc.pin = 0;
+ wc.process_func = replay_one_buffer;
+ wc.stage = LOG_WALK_REPLAY_INODES;
+ goto again;
+ }
+ /* step three is to replay everything */
+ if (wc.stage < LOG_WALK_REPLAY_ALL) {
+ wc.stage++;
+ goto again;
+ }
+
+ btrfs_free_path(path);
+
+ free_extent_buffer(log_root_tree->node);
+ log_root_tree->log_root = NULL;
+ fs_info->log_root_recovering = 0;
+
+ /* step 4: commit the transaction, which also unpins the blocks */
+ btrfs_commit_transaction(trans, fs_info->tree_root);
+
+ kfree(log_root_tree);
+ return 0;
+}
+
+/*
+ * there are some corner cases where we want to force a full
+ * commit instead of allowing a directory to be logged.
+ *
+ * They revolve around files there were unlinked from the directory, and
+ * this function updates the parent directory so that a full commit is
+ * properly done if it is fsync'd later after the unlinks are done.
+ */
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+ struct inode *dir, struct inode *inode,
+ int for_rename)
+{
+ /*
+ * when we're logging a file, if it hasn't been renamed
+ * or unlinked, and its inode is fully committed on disk,
+ * we don't have to worry about walking up the directory chain
+ * to log its parents.
+ *
+ * So, we use the last_unlink_trans field to put this transid
+ * into the file. When the file is logged we check it and
+ * don't log the parents if the file is fully on disk.
+ */
+ if (S_ISREG(inode->i_mode))
+ BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
+ /*
+ * if this directory was already logged any new
+ * names for this file/dir will get recorded
+ */
+ smp_mb();
+ if (BTRFS_I(dir)->logged_trans == trans->transid)
+ return;
+
+ /*
+ * if the inode we're about to unlink was logged,
+ * the log will be properly updated for any new names
+ */
+ if (BTRFS_I(inode)->logged_trans == trans->transid)
+ return;
+
+ /*
+ * when renaming files across directories, if the directory
+ * there we're unlinking from gets fsync'd later on, there's
+ * no way to find the destination directory later and fsync it
+ * properly. So, we have to be conservative and force commits
+ * so the new name gets discovered.
+ */
+ if (for_rename)
+ goto record;
+
+ /* we can safely do the unlink without any special recording */
+ return;
+
+record:
+ BTRFS_I(dir)->last_unlink_trans = trans->transid;
+}
+
+/*
+ * Call this after adding a new name for a file and it will properly
+ * update the log to reflect the new name.
+ *
+ * It will return zero if all goes well, and it will return 1 if a
+ * full transaction commit is required.
+ */
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *old_dir,
+ struct dentry *parent)
+{
+ struct btrfs_root * root = BTRFS_I(inode)->root;
+
+ /*
+ * this will force the logging code to walk the dentry chain
+ * up for the file
+ */
+ if (S_ISREG(inode->i_mode))
+ BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
+ /*
+ * if this inode hasn't been logged and directory we're renaming it
+ * from hasn't been logged, we don't need to log it
+ */
+ if (BTRFS_I(inode)->logged_trans <=
+ root->fs_info->last_trans_committed &&
+ (!old_dir || BTRFS_I(old_dir)->logged_trans <=
+ root->fs_info->last_trans_committed))
+ return 0;
+
+ return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+}
+
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
new file mode 100644
index 00000000000..0776eacb508
--- /dev/null
+++ b/fs/btrfs/tree-log.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __TREE_LOG_
+#define __TREE_LOG_
+
+/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
+#define BTRFS_NO_LOG_SYNC 256
+
+int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_recover_log_trees(struct btrfs_root *tree_root);
+int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct dentry *dentry);
+int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct inode *dir, u64 index);
+int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct inode *inode, u64 dirid);
+int btrfs_join_running_log_trans(struct btrfs_root *root);
+int btrfs_end_log_trans(struct btrfs_root *root);
+int btrfs_pin_log_trans(struct btrfs_root *root);
+int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct dentry *parent, int exists_only);
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+ struct inode *dir, struct inode *inode,
+ int for_rename);
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *old_dir,
+ struct dentry *parent);
+#endif
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
new file mode 100644
index 00000000000..9bf3946d5ef
--- /dev/null
+++ b/fs/btrfs/version.h
@@ -0,0 +1,4 @@
+#ifndef __BTRFS_VERSION_H
+#define __BTRFS_VERSION_H
+#define BTRFS_BUILD_VERSION "Btrfs"
+#endif
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
new file mode 100644
index 00000000000..1ca1952fd91
--- /dev/null
+++ b/fs/btrfs/version.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# determine-version -- report a useful version for releases
+#
+# Copyright 2008, Aron Griffis <agriffis@n01se.net>
+# Copyright 2008, Oracle
+# Released under the GNU GPLv2
+
+v="v0.16"
+
+which git &> /dev/null
+if [ $? == 0 ]; then
+ git branch >& /dev/null
+ if [ $? == 0 ]; then
+ if head=`git rev-parse --verify HEAD 2>/dev/null`; then
+ if tag=`git describe --tags 2>/dev/null`; then
+ v="$tag"
+ fi
+
+ # Are there uncommitted changes?
+ git update-index --refresh --unmerged > /dev/null
+ if git diff-index --name-only HEAD | \
+ grep -v "^scripts/package" \
+ | read dummy; then
+ v="$v"-dirty
+ fi
+ fi
+ fi
+fi
+
+echo "#ifndef __BUILD_VERSION" > .build-version.h
+echo "#define __BUILD_VERSION" >> .build-version.h
+echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h
+echo "#endif" >> .build-version.h
+
+diff -q version.h .build-version.h >& /dev/null
+
+if [ $? == 0 ]; then
+ rm .build-version.h
+ exit 0
+fi
+
+mv .build-version.h version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 00000000000..41ecbb2347f
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,3426 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <asm/div64.h>
+#include "compat.h"
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "async-thread.h"
+
+struct map_lookup {
+ u64 type;
+ int io_align;
+ int io_width;
+ int stripe_len;
+ int sector_size;
+ int num_stripes;
+ int sub_stripes;
+ struct btrfs_bio_stripe stripes[];
+};
+
+static int init_first_rw_device(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_device *device);
+static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
+
+#define map_lookup_size(n) (sizeof(struct map_lookup) + \
+ (sizeof(struct btrfs_bio_stripe) * (n)))
+
+static DEFINE_MUTEX(uuid_mutex);
+static LIST_HEAD(fs_uuids);
+
+void btrfs_lock_volumes(void)
+{
+ mutex_lock(&uuid_mutex);
+}
+
+void btrfs_unlock_volumes(void)
+{
+ mutex_unlock(&uuid_mutex);
+}
+
+static void lock_chunks(struct btrfs_root *root)
+{
+ mutex_lock(&root->fs_info->chunk_mutex);
+}
+
+static void unlock_chunks(struct btrfs_root *root)
+{
+ mutex_unlock(&root->fs_info->chunk_mutex);
+}
+
+static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *device;
+ WARN_ON(fs_devices->opened);
+ while (!list_empty(&fs_devices->devices)) {
+ device = list_entry(fs_devices->devices.next,
+ struct btrfs_device, dev_list);
+ list_del(&device->dev_list);
+ kfree(device->name);
+ kfree(device);
+ }
+ kfree(fs_devices);
+}
+
+int btrfs_cleanup_fs_uuids(void)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ while (!list_empty(&fs_uuids)) {
+ fs_devices = list_entry(fs_uuids.next,
+ struct btrfs_fs_devices, list);
+ list_del(&fs_devices->list);
+ free_fs_devices(fs_devices);
+ }
+ return 0;
+}
+
+static noinline struct btrfs_device *__find_device(struct list_head *head,
+ u64 devid, u8 *uuid)
+{
+ struct btrfs_device *dev;
+
+ list_for_each_entry(dev, head, dev_list) {
+ if (dev->devid == devid &&
+ (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
+ return dev;
+ }
+ }
+ return NULL;
+}
+
+static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ list_for_each_entry(fs_devices, &fs_uuids, list) {
+ if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
+ return fs_devices;
+ }
+ return NULL;
+}
+
+static void requeue_list(struct btrfs_pending_bios *pending_bios,
+ struct bio *head, struct bio *tail)
+{
+
+ struct bio *old_head;
+
+ old_head = pending_bios->head;
+ pending_bios->head = head;
+ if (pending_bios->tail)
+ tail->bi_next = old_head;
+ else
+ pending_bios->tail = tail;
+}
+
+/*
+ * we try to collect pending bios for a device so we don't get a large
+ * number of procs sending bios down to the same device. This greatly
+ * improves the schedulers ability to collect and merge the bios.
+ *
+ * But, it also turns into a long list of bios to process and that is sure
+ * to eventually make the worker thread block. The solution here is to
+ * make some progress and then put this work struct back at the end of
+ * the list if the block device is congested. This way, multiple devices
+ * can make progress from a single worker thread.
+ */
+static noinline int run_scheduled_bios(struct btrfs_device *device)
+{
+ struct bio *pending;
+ struct backing_dev_info *bdi;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_pending_bios *pending_bios;
+ struct bio *tail;
+ struct bio *cur;
+ int again = 0;
+ unsigned long num_run;
+ unsigned long num_sync_run;
+ unsigned long batch_run = 0;
+ unsigned long limit;
+ unsigned long last_waited = 0;
+ int force_reg = 0;
+
+ bdi = blk_get_backing_dev_info(device->bdev);
+ fs_info = device->dev_root->fs_info;
+ limit = btrfs_async_submit_limit(fs_info);
+ limit = limit * 2 / 3;
+
+ /* we want to make sure that every time we switch from the sync
+ * list to the normal list, we unplug
+ */
+ num_sync_run = 0;
+
+loop:
+ spin_lock(&device->io_lock);
+
+loop_lock:
+ num_run = 0;
+
+ /* take all the bios off the list at once and process them
+ * later on (without the lock held). But, remember the
+ * tail and other pointers so the bios can be properly reinserted
+ * into the list if we hit congestion
+ */
+ if (!force_reg && device->pending_sync_bios.head) {
+ pending_bios = &device->pending_sync_bios;
+ force_reg = 1;
+ } else {
+ pending_bios = &device->pending_bios;
+ force_reg = 0;
+ }
+
+ pending = pending_bios->head;
+ tail = pending_bios->tail;
+ WARN_ON(pending && !tail);
+
+ /*
+ * if pending was null this time around, no bios need processing
+ * at all and we can stop. Otherwise it'll loop back up again
+ * and do an additional check so no bios are missed.
+ *
+ * device->running_pending is used to synchronize with the
+ * schedule_bio code.
+ */
+ if (device->pending_sync_bios.head == NULL &&
+ device->pending_bios.head == NULL) {
+ again = 0;
+ device->running_pending = 0;
+ } else {
+ again = 1;
+ device->running_pending = 1;
+ }
+
+ pending_bios->head = NULL;
+ pending_bios->tail = NULL;
+
+ spin_unlock(&device->io_lock);
+
+ /*
+ * if we're doing the regular priority list, make sure we unplug
+ * for any high prio bios we've sent down
+ */
+ if (pending_bios == &device->pending_bios && num_sync_run > 0) {
+ num_sync_run = 0;
+ blk_run_backing_dev(bdi, NULL);
+ }
+
+ while (pending) {
+
+ rmb();
+ /* we want to work on both lists, but do more bios on the
+ * sync list than the regular list
+ */
+ if ((num_run > 32 &&
+ pending_bios != &device->pending_sync_bios &&
+ device->pending_sync_bios.head) ||
+ (num_run > 64 && pending_bios == &device->pending_sync_bios &&
+ device->pending_bios.head)) {
+ spin_lock(&device->io_lock);
+ requeue_list(pending_bios, pending, tail);
+ goto loop_lock;
+ }
+
+ cur = pending;
+ pending = pending->bi_next;
+ cur->bi_next = NULL;
+ atomic_dec(&fs_info->nr_async_bios);
+
+ if (atomic_read(&fs_info->nr_async_bios) < limit &&
+ waitqueue_active(&fs_info->async_submit_wait))
+ wake_up(&fs_info->async_submit_wait);
+
+ BUG_ON(atomic_read(&cur->bi_cnt) == 0);
+ submit_bio(cur->bi_rw, cur);
+ num_run++;
+ batch_run++;
+
+ if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
+ num_sync_run++;
+
+ if (need_resched()) {
+ if (num_sync_run) {
+ blk_run_backing_dev(bdi, NULL);
+ num_sync_run = 0;
+ }
+ cond_resched();
+ }
+
+ /*
+ * we made progress, there is more work to do and the bdi
+ * is now congested. Back off and let other work structs
+ * run instead
+ */
+ if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
+ fs_info->fs_devices->open_devices > 1) {
+ struct io_context *ioc;
+
+ ioc = current->io_context;
+
+ /*
+ * the main goal here is that we don't want to
+ * block if we're going to be able to submit
+ * more requests without blocking.
+ *
+ * This code does two great things, it pokes into
+ * the elevator code from a filesystem _and_
+ * it makes assumptions about how batching works.
+ */
+ if (ioc && ioc->nr_batch_requests > 0 &&
+ time_before(jiffies, ioc->last_waited + HZ/50UL) &&
+ (last_waited == 0 ||
+ ioc->last_waited == last_waited)) {
+ /*
+ * we want to go through our batch of
+ * requests and stop. So, we copy out
+ * the ioc->last_waited time and test
+ * against it before looping
+ */
+ last_waited = ioc->last_waited;
+ if (need_resched()) {
+ if (num_sync_run) {
+ blk_run_backing_dev(bdi, NULL);
+ num_sync_run = 0;
+ }
+ cond_resched();
+ }
+ continue;
+ }
+ spin_lock(&device->io_lock);
+ requeue_list(pending_bios, pending, tail);
+ device->running_pending = 1;
+
+ spin_unlock(&device->io_lock);
+ btrfs_requeue_work(&device->work);
+ goto done;
+ }
+ }
+
+ if (num_sync_run) {
+ num_sync_run = 0;
+ blk_run_backing_dev(bdi, NULL);
+ }
+
+ cond_resched();
+ if (again)
+ goto loop;
+
+ spin_lock(&device->io_lock);
+ if (device->pending_bios.head || device->pending_sync_bios.head)
+ goto loop_lock;
+ spin_unlock(&device->io_lock);
+
+ /*
+ * IO has already been through a long path to get here. Checksumming,
+ * async helper threads, perhaps compression. We've done a pretty
+ * good job of collecting a batch of IO and should just unplug
+ * the device right away.
+ *
+ * This will help anyone who is waiting on the IO, they might have
+ * already unplugged, but managed to do so before the bio they
+ * cared about found its way down here.
+ */
+ blk_run_backing_dev(bdi, NULL);
+done:
+ return 0;
+}
+
+static void pending_bios_fn(struct btrfs_work *work)
+{
+ struct btrfs_device *device;
+
+ device = container_of(work, struct btrfs_device, work);
+ run_scheduled_bios(device);
+}
+
+static noinline int device_list_add(const char *path,
+ struct btrfs_super_block *disk_super,
+ u64 devid, struct btrfs_fs_devices **fs_devices_ret)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *fs_devices;
+ u64 found_transid = btrfs_super_generation(disk_super);
+
+ fs_devices = find_fsid(disk_super->fsid);
+ if (!fs_devices) {
+ fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+ if (!fs_devices)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&fs_devices->devices);
+ INIT_LIST_HEAD(&fs_devices->alloc_list);
+ list_add(&fs_devices->list, &fs_uuids);
+ memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
+ fs_devices->latest_devid = devid;
+ fs_devices->latest_trans = found_transid;
+ mutex_init(&fs_devices->device_list_mutex);
+ device = NULL;
+ } else {
+ device = __find_device(&fs_devices->devices, devid,
+ disk_super->dev_item.uuid);
+ }
+ if (!device) {
+ if (fs_devices->opened)
+ return -EBUSY;
+
+ device = kzalloc(sizeof(*device), GFP_NOFS);
+ if (!device) {
+ /* we can safely leave the fs_devices entry around */
+ return -ENOMEM;
+ }
+ device->devid = devid;
+ device->work.func = pending_bios_fn;
+ memcpy(device->uuid, disk_super->dev_item.uuid,
+ BTRFS_UUID_SIZE);
+ device->barriers = 1;
+ spin_lock_init(&device->io_lock);
+ device->name = kstrdup(path, GFP_NOFS);
+ if (!device->name) {
+ kfree(device);
+ return -ENOMEM;
+ }
+ INIT_LIST_HEAD(&device->dev_alloc_list);
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_add(&device->dev_list, &fs_devices->devices);
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ device->fs_devices = fs_devices;
+ fs_devices->num_devices++;
+ }
+
+ if (found_transid > fs_devices->latest_trans) {
+ fs_devices->latest_devid = devid;
+ fs_devices->latest_trans = found_transid;
+ }
+ *fs_devices_ret = fs_devices;
+ return 0;
+}
+
+static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
+{
+ struct btrfs_fs_devices *fs_devices;
+ struct btrfs_device *device;
+ struct btrfs_device *orig_dev;
+
+ fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+ if (!fs_devices)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&fs_devices->devices);
+ INIT_LIST_HEAD(&fs_devices->alloc_list);
+ INIT_LIST_HEAD(&fs_devices->list);
+ mutex_init(&fs_devices->device_list_mutex);
+ fs_devices->latest_devid = orig->latest_devid;
+ fs_devices->latest_trans = orig->latest_trans;
+ memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
+
+ mutex_lock(&orig->device_list_mutex);
+ list_for_each_entry(orig_dev, &orig->devices, dev_list) {
+ device = kzalloc(sizeof(*device), GFP_NOFS);
+ if (!device)
+ goto error;
+
+ device->name = kstrdup(orig_dev->name, GFP_NOFS);
+ if (!device->name) {
+ kfree(device);
+ goto error;
+ }
+
+ device->devid = orig_dev->devid;
+ device->work.func = pending_bios_fn;
+ memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
+ device->barriers = 1;
+ spin_lock_init(&device->io_lock);
+ INIT_LIST_HEAD(&device->dev_list);
+ INIT_LIST_HEAD(&device->dev_alloc_list);
+
+ list_add(&device->dev_list, &fs_devices->devices);
+ device->fs_devices = fs_devices;
+ fs_devices->num_devices++;
+ }
+ mutex_unlock(&orig->device_list_mutex);
+ return fs_devices;
+error:
+ mutex_unlock(&orig->device_list_mutex);
+ free_fs_devices(fs_devices);
+ return ERR_PTR(-ENOMEM);
+}
+
+int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *device, *next;
+
+ mutex_lock(&uuid_mutex);
+again:
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
+ if (device->in_fs_metadata)
+ continue;
+
+ if (device->bdev) {
+ close_bdev_exclusive(device->bdev, device->mode);
+ device->bdev = NULL;
+ fs_devices->open_devices--;
+ }
+ if (device->writeable) {
+ list_del_init(&device->dev_alloc_list);
+ device->writeable = 0;
+ fs_devices->rw_devices--;
+ }
+ list_del_init(&device->dev_list);
+ fs_devices->num_devices--;
+ kfree(device->name);
+ kfree(device);
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ if (fs_devices->seed) {
+ fs_devices = fs_devices->seed;
+ goto again;
+ }
+
+ mutex_unlock(&uuid_mutex);
+ return 0;
+}
+
+static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *device;
+
+ if (--fs_devices->opened > 0)
+ return 0;
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (device->bdev) {
+ close_bdev_exclusive(device->bdev, device->mode);
+ fs_devices->open_devices--;
+ }
+ if (device->writeable) {
+ list_del_init(&device->dev_alloc_list);
+ fs_devices->rw_devices--;
+ }
+
+ device->bdev = NULL;
+ device->writeable = 0;
+ device->in_fs_metadata = 0;
+ }
+ WARN_ON(fs_devices->open_devices);
+ WARN_ON(fs_devices->rw_devices);
+ fs_devices->opened = 0;
+ fs_devices->seeding = 0;
+
+ return 0;
+}
+
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_fs_devices *seed_devices = NULL;
+ int ret;
+
+ mutex_lock(&uuid_mutex);
+ ret = __btrfs_close_devices(fs_devices);
+ if (!fs_devices->opened) {
+ seed_devices = fs_devices->seed;
+ fs_devices->seed = NULL;
+ }
+ mutex_unlock(&uuid_mutex);
+
+ while (seed_devices) {
+ fs_devices = seed_devices;
+ seed_devices = fs_devices->seed;
+ __btrfs_close_devices(fs_devices);
+ free_fs_devices(fs_devices);
+ }
+ return ret;
+}
+
+static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+ fmode_t flags, void *holder)
+{
+ struct block_device *bdev;
+ struct list_head *head = &fs_devices->devices;
+ struct btrfs_device *device;
+ struct block_device *latest_bdev = NULL;
+ struct buffer_head *bh;
+ struct btrfs_super_block *disk_super;
+ u64 latest_devid = 0;
+ u64 latest_transid = 0;
+ u64 devid;
+ int seeding = 1;
+ int ret = 0;
+
+ list_for_each_entry(device, head, dev_list) {
+ if (device->bdev)
+ continue;
+ if (!device->name)
+ continue;
+
+ bdev = open_bdev_exclusive(device->name, flags, holder);
+ if (IS_ERR(bdev)) {
+ printk(KERN_INFO "open %s failed\n", device->name);
+ goto error;
+ }
+ set_blocksize(bdev, 4096);
+
+ bh = btrfs_read_dev_super(bdev);
+ if (!bh)
+ goto error_close;
+
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+ devid = le64_to_cpu(disk_super->dev_item.devid);
+ if (devid != device->devid)
+ goto error_brelse;
+
+ if (memcmp(device->uuid, disk_super->dev_item.uuid,
+ BTRFS_UUID_SIZE))
+ goto error_brelse;
+
+ device->generation = btrfs_super_generation(disk_super);
+ if (!latest_transid || device->generation > latest_transid) {
+ latest_devid = devid;
+ latest_transid = device->generation;
+ latest_bdev = bdev;
+ }
+
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+ device->writeable = 0;
+ } else {
+ device->writeable = !bdev_read_only(bdev);
+ seeding = 0;
+ }
+
+ device->bdev = bdev;
+ device->in_fs_metadata = 0;
+ device->mode = flags;
+
+ if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+ fs_devices->rotating = 1;
+
+ fs_devices->open_devices++;
+ if (device->writeable) {
+ fs_devices->rw_devices++;
+ list_add(&device->dev_alloc_list,
+ &fs_devices->alloc_list);
+ }
+ continue;
+
+error_brelse:
+ brelse(bh);
+error_close:
+ close_bdev_exclusive(bdev, FMODE_READ);
+error:
+ continue;
+ }
+ if (fs_devices->open_devices == 0) {
+ ret = -EIO;
+ goto out;
+ }
+ fs_devices->seeding = seeding;
+ fs_devices->opened = 1;
+ fs_devices->latest_bdev = latest_bdev;
+ fs_devices->latest_devid = latest_devid;
+ fs_devices->latest_trans = latest_transid;
+ fs_devices->total_rw_bytes = 0;
+out:
+ return ret;
+}
+
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+ fmode_t flags, void *holder)
+{
+ int ret;
+
+ mutex_lock(&uuid_mutex);
+ if (fs_devices->opened) {
+ fs_devices->opened++;
+ ret = 0;
+ } else {
+ ret = __btrfs_open_devices(fs_devices, flags, holder);
+ }
+ mutex_unlock(&uuid_mutex);
+ return ret;
+}
+
+int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
+ struct btrfs_fs_devices **fs_devices_ret)
+{
+ struct btrfs_super_block *disk_super;
+ struct block_device *bdev;
+ struct buffer_head *bh;
+ int ret;
+ u64 devid;
+ u64 transid;
+
+ mutex_lock(&uuid_mutex);
+
+ bdev = open_bdev_exclusive(path, flags, holder);
+
+ if (IS_ERR(bdev)) {
+ ret = PTR_ERR(bdev);
+ goto error;
+ }
+
+ ret = set_blocksize(bdev, 4096);
+ if (ret)
+ goto error_close;
+ bh = btrfs_read_dev_super(bdev);
+ if (!bh) {
+ ret = -EIO;
+ goto error_close;
+ }
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+ devid = le64_to_cpu(disk_super->dev_item.devid);
+ transid = btrfs_super_generation(disk_super);
+ if (disk_super->label[0])
+ printk(KERN_INFO "device label %s ", disk_super->label);
+ else {
+ /* FIXME, make a readl uuid parser */
+ printk(KERN_INFO "device fsid %llx-%llx ",
+ *(unsigned long long *)disk_super->fsid,
+ *(unsigned long long *)(disk_super->fsid + 8));
+ }
+ printk(KERN_CONT "devid %llu transid %llu %s\n",
+ (unsigned long long)devid, (unsigned long long)transid, path);
+ ret = device_list_add(path, disk_super, devid, fs_devices_ret);
+
+ brelse(bh);
+error_close:
+ close_bdev_exclusive(bdev, flags);
+error:
+ mutex_unlock(&uuid_mutex);
+ return ret;
+}
+
+/*
+ * this uses a pretty simple search, the expectation is that it is
+ * called very infrequently and that a given device has a small number
+ * of extents
+ */
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *max_avail)
+{
+ struct btrfs_key key;
+ struct btrfs_root *root = device->dev_root;
+ struct btrfs_dev_extent *dev_extent = NULL;
+ struct btrfs_path *path;
+ u64 hole_size = 0;
+ u64 last_byte = 0;
+ u64 search_start = 0;
+ u64 search_end = device->total_bytes;
+ int ret;
+ int slot = 0;
+ int start_found;
+ struct extent_buffer *l;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = 2;
+ start_found = 0;
+
+ /* FIXME use last free of some kind */
+
+ /* we don't want to overwrite the superblock on the drive,
+ * so we make sure to start at an offset of at least 1MB
+ */
+ search_start = max((u64)1024 * 1024, search_start);
+
+ if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
+ search_start = max(root->fs_info->alloc_start, search_start);
+
+ key.objectid = device->devid;
+ key.offset = search_start;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+ if (ret > 0) {
+ ret = btrfs_previous_item(root, path, key.objectid, key.type);
+ if (ret < 0)
+ goto error;
+ if (ret > 0)
+ start_found = 1;
+ }
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+ while (1) {
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto error;
+no_more_items:
+ if (!start_found) {
+ if (search_start >= search_end) {
+ ret = -ENOSPC;
+ goto error;
+ }
+ *start = search_start;
+ start_found = 1;
+ goto check_pending;
+ }
+ *start = last_byte > search_start ?
+ last_byte : search_start;
+ if (search_end <= *start) {
+ ret = -ENOSPC;
+ goto error;
+ }
+ goto check_pending;
+ }
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.objectid < device->devid)
+ goto next;
+
+ if (key.objectid > device->devid)
+ goto no_more_items;
+
+ if (key.offset >= search_start && key.offset > last_byte &&
+ start_found) {
+ if (last_byte < search_start)
+ last_byte = search_start;
+ hole_size = key.offset - last_byte;
+
+ if (hole_size > *max_avail)
+ *max_avail = hole_size;
+
+ if (key.offset > last_byte &&
+ hole_size >= num_bytes) {
+ *start = last_byte;
+ goto check_pending;
+ }
+ }
+ if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+ goto next;
+
+ start_found = 1;
+ dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+ last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
+next:
+ path->slots[0]++;
+ cond_resched();
+ }
+check_pending:
+ /* we have to make sure we didn't find an extent that has already
+ * been allocated by the map tree or the original allocation
+ */
+ BUG_ON(*start < search_start);
+
+ if (*start + num_bytes > search_end) {
+ ret = -ENOSPC;
+ goto error;
+ }
+ /* check for pending inserts here */
+ ret = 0;
+
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 start)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_root *root = device->dev_root;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf = NULL;
+ struct btrfs_dev_extent *extent = NULL;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = device->devid;
+ key.offset = start;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ ret = btrfs_previous_item(root, path, key.objectid,
+ BTRFS_DEV_EXTENT_KEY);
+ BUG_ON(ret);
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_dev_extent);
+ BUG_ON(found_key.offset > start || found_key.offset +
+ btrfs_dev_extent_length(leaf, extent) < start);
+ ret = 0;
+ } else if (ret == 0) {
+ leaf = path->nodes[0];
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_dev_extent);
+ }
+ BUG_ON(ret);
+
+ if (device->bytes_used > 0)
+ device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
+ ret = btrfs_del_item(trans, root, path);
+ BUG_ON(ret);
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset, u64 start, u64 num_bytes)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_root *root = device->dev_root;
+ struct btrfs_dev_extent *extent;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+
+ WARN_ON(!device->in_fs_metadata);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = device->devid;
+ key.offset = start;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(*extent));
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_dev_extent);
+ btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
+ btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
+ btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
+
+ write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
+ (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
+ BTRFS_UUID_SIZE);
+
+ btrfs_set_dev_extent_length(leaf, extent, num_bytes);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int find_next_chunk(struct btrfs_root *root,
+ u64 objectid, u64 *offset)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_chunk *chunk;
+ struct btrfs_key found_key;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ key.objectid = objectid;
+ key.offset = (u64)-1;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+
+ BUG_ON(ret == 0);
+
+ ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
+ if (ret) {
+ *offset = 0;
+ } else {
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (found_key.objectid != objectid)
+ *offset = 0;
+ else {
+ chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_chunk);
+ *offset = found_key.offset +
+ btrfs_chunk_length(path->nodes[0], chunk);
+ }
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+
+ root = root->fs_info->chunk_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+
+ BUG_ON(ret == 0);
+
+ ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
+ BTRFS_DEV_ITEM_KEY);
+ if (ret) {
+ *objectid = 1;
+ } else {
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ *objectid = found_key.offset + 1;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * the device information is stored in the chunk root
+ * the btrfs_device struct should be fully filled in
+ */
+int btrfs_add_device(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_device *device)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_dev_item *dev_item;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ unsigned long ptr;
+
+ root = root->fs_info->chunk_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = device->devid;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(*dev_item));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
+
+ btrfs_set_device_id(leaf, dev_item, device->devid);
+ btrfs_set_device_generation(leaf, dev_item, 0);
+ btrfs_set_device_type(leaf, dev_item, device->type);
+ btrfs_set_device_io_align(leaf, dev_item, device->io_align);
+ btrfs_set_device_io_width(leaf, dev_item, device->io_width);
+ btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
+ btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
+ btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+ btrfs_set_device_group(leaf, dev_item, 0);
+ btrfs_set_device_seek_speed(leaf, dev_item, 0);
+ btrfs_set_device_bandwidth(leaf, dev_item, 0);
+ btrfs_set_device_start_offset(leaf, dev_item, 0);
+
+ ptr = (unsigned long)btrfs_device_uuid(dev_item);
+ write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
+ ptr = (unsigned long)btrfs_device_fsid(dev_item);
+ write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_rm_dev_item(struct btrfs_root *root,
+ struct btrfs_device *device)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_trans_handle *trans;
+
+ root = root->fs_info->chunk_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_start_transaction(root, 1);
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = device->devid;
+ lock_chunks(root);
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+out:
+ btrfs_free_path(path);
+ unlock_chunks(root);
+ btrfs_commit_transaction(trans, root);
+ return ret;
+}
+
+int btrfs_rm_device(struct btrfs_root *root, char *device_path)
+{
+ struct btrfs_device *device;
+ struct btrfs_device *next_device;
+ struct block_device *bdev;
+ struct buffer_head *bh = NULL;
+ struct btrfs_super_block *disk_super;
+ u64 all_avail;
+ u64 devid;
+ u64 num_devices;
+ u8 *dev_uuid;
+ int ret = 0;
+
+ mutex_lock(&uuid_mutex);
+ mutex_lock(&root->fs_info->volume_mutex);
+
+ all_avail = root->fs_info->avail_data_alloc_bits |
+ root->fs_info->avail_system_alloc_bits |
+ root->fs_info->avail_metadata_alloc_bits;
+
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
+ root->fs_info->fs_devices->num_devices <= 4) {
+ printk(KERN_ERR "btrfs: unable to go below four devices "
+ "on raid10\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
+ root->fs_info->fs_devices->num_devices <= 2) {
+ printk(KERN_ERR "btrfs: unable to go below two "
+ "devices on raid1\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (strcmp(device_path, "missing") == 0) {
+ struct list_head *devices;
+ struct btrfs_device *tmp;
+
+ device = NULL;
+ devices = &root->fs_info->fs_devices->devices;
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ list_for_each_entry(tmp, devices, dev_list) {
+ if (tmp->in_fs_metadata && !tmp->bdev) {
+ device = tmp;
+ break;
+ }
+ }
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ bdev = NULL;
+ bh = NULL;
+ disk_super = NULL;
+ if (!device) {
+ printk(KERN_ERR "btrfs: no missing devices found to "
+ "remove\n");
+ goto out;
+ }
+ } else {
+ bdev = open_bdev_exclusive(device_path, FMODE_READ,
+ root->fs_info->bdev_holder);
+ if (IS_ERR(bdev)) {
+ ret = PTR_ERR(bdev);
+ goto out;
+ }
+
+ set_blocksize(bdev, 4096);
+ bh = btrfs_read_dev_super(bdev);
+ if (!bh) {
+ ret = -EIO;
+ goto error_close;
+ }
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+ devid = le64_to_cpu(disk_super->dev_item.devid);
+ dev_uuid = disk_super->dev_item.uuid;
+ device = btrfs_find_device(root, devid, dev_uuid,
+ disk_super->fsid);
+ if (!device) {
+ ret = -ENOENT;
+ goto error_brelse;
+ }
+ }
+
+ if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
+ printk(KERN_ERR "btrfs: unable to remove the only writeable "
+ "device\n");
+ ret = -EINVAL;
+ goto error_brelse;
+ }
+
+ if (device->writeable) {
+ list_del_init(&device->dev_alloc_list);
+ root->fs_info->fs_devices->rw_devices--;
+ }
+
+ ret = btrfs_shrink_device(device, 0);
+ if (ret)
+ goto error_brelse;
+
+ ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
+ if (ret)
+ goto error_brelse;
+
+ device->in_fs_metadata = 0;
+
+ /*
+ * the device list mutex makes sure that we don't change
+ * the device list while someone else is writing out all
+ * the device supers.
+ */
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ list_del_init(&device->dev_list);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+ device->fs_devices->num_devices--;
+
+ next_device = list_entry(root->fs_info->fs_devices->devices.next,
+ struct btrfs_device, dev_list);
+ if (device->bdev == root->fs_info->sb->s_bdev)
+ root->fs_info->sb->s_bdev = next_device->bdev;
+ if (device->bdev == root->fs_info->fs_devices->latest_bdev)
+ root->fs_info->fs_devices->latest_bdev = next_device->bdev;
+
+ if (device->bdev) {
+ close_bdev_exclusive(device->bdev, device->mode);
+ device->bdev = NULL;
+ device->fs_devices->open_devices--;
+ }
+
+ num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
+ btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
+
+ if (device->fs_devices->open_devices == 0) {
+ struct btrfs_fs_devices *fs_devices;
+ fs_devices = root->fs_info->fs_devices;
+ while (fs_devices) {
+ if (fs_devices->seed == device->fs_devices)
+ break;
+ fs_devices = fs_devices->seed;
+ }
+ fs_devices->seed = device->fs_devices->seed;
+ device->fs_devices->seed = NULL;
+ __btrfs_close_devices(device->fs_devices);
+ free_fs_devices(device->fs_devices);
+ }
+
+ /*
+ * at this point, the device is zero sized. We want to
+ * remove it from the devices list and zero out the old super
+ */
+ if (device->writeable) {
+ /* make sure this device isn't detected as part of
+ * the FS anymore
+ */
+ memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+ set_buffer_dirty(bh);
+ sync_dirty_buffer(bh);
+ }
+
+ kfree(device->name);
+ kfree(device);
+ ret = 0;
+
+error_brelse:
+ brelse(bh);
+error_close:
+ if (bdev)
+ close_bdev_exclusive(bdev, FMODE_READ);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
+ mutex_unlock(&uuid_mutex);
+ return ret;
+}
+
+/*
+ * does all the dirty work required for changing file system's UUID.
+ */
+static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_fs_devices *old_devices;
+ struct btrfs_fs_devices *seed_devices;
+ struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+ struct btrfs_device *device;
+ u64 super_flags;
+
+ BUG_ON(!mutex_is_locked(&uuid_mutex));
+ if (!fs_devices->seeding)
+ return -EINVAL;
+
+ seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+ if (!seed_devices)
+ return -ENOMEM;
+
+ old_devices = clone_fs_devices(fs_devices);
+ if (IS_ERR(old_devices)) {
+ kfree(seed_devices);
+ return PTR_ERR(old_devices);
+ }
+
+ list_add(&old_devices->list, &fs_uuids);
+
+ memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
+ seed_devices->opened = 1;
+ INIT_LIST_HEAD(&seed_devices->devices);
+ INIT_LIST_HEAD(&seed_devices->alloc_list);
+ mutex_init(&seed_devices->device_list_mutex);
+ list_splice_init(&fs_devices->devices, &seed_devices->devices);
+ list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
+ list_for_each_entry(device, &seed_devices->devices, dev_list) {
+ device->fs_devices = seed_devices;
+ }
+
+ fs_devices->seeding = 0;
+ fs_devices->num_devices = 0;
+ fs_devices->open_devices = 0;
+ fs_devices->seed = seed_devices;
+
+ generate_random_uuid(fs_devices->fsid);
+ memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+ memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+ super_flags = btrfs_super_flags(disk_super) &
+ ~BTRFS_SUPER_FLAG_SEEDING;
+ btrfs_set_super_flags(disk_super, super_flags);
+
+ return 0;
+}
+
+/*
+ * strore the expected generation for seed devices in device items.
+ */
+static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_dev_item *dev_item;
+ struct btrfs_device *device;
+ struct btrfs_key key;
+ u8 fs_uuid[BTRFS_UUID_SIZE];
+ u8 dev_uuid[BTRFS_UUID_SIZE];
+ u64 devid;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ root = root->fs_info->chunk_root;
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.offset = 0;
+ key.type = BTRFS_DEV_ITEM_KEY;
+
+ while (1) {
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0)
+ goto error;
+
+ leaf = path->nodes[0];
+next_slot:
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret > 0)
+ break;
+ if (ret < 0)
+ goto error;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(root, path);
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
+ key.type != BTRFS_DEV_ITEM_KEY)
+ break;
+
+ dev_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_dev_item);
+ devid = btrfs_device_id(leaf, dev_item);
+ read_extent_buffer(leaf, dev_uuid,
+ (unsigned long)btrfs_device_uuid(dev_item),
+ BTRFS_UUID_SIZE);
+ read_extent_buffer(leaf, fs_uuid,
+ (unsigned long)btrfs_device_fsid(dev_item),
+ BTRFS_UUID_SIZE);
+ device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+ BUG_ON(!device);
+
+ if (device->fs_devices->seeding) {
+ btrfs_set_device_generation(leaf, dev_item,
+ device->generation);
+ btrfs_mark_buffer_dirty(leaf);
+ }
+
+ path->slots[0]++;
+ goto next_slot;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_device *device;
+ struct block_device *bdev;
+ struct list_head *devices;
+ struct super_block *sb = root->fs_info->sb;
+ u64 total_bytes;
+ int seeding_dev = 0;
+ int ret = 0;
+
+ if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
+ return -EINVAL;
+
+ bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+
+ if (root->fs_info->fs_devices->seeding) {
+ seeding_dev = 1;
+ down_write(&sb->s_umount);
+ mutex_lock(&uuid_mutex);
+ }
+
+ filemap_write_and_wait(bdev->bd_inode->i_mapping);
+ mutex_lock(&root->fs_info->volume_mutex);
+
+ devices = &root->fs_info->fs_devices->devices;
+ /*
+ * we have the volume lock, so we don't need the extra
+ * device list mutex while reading the list here.
+ */
+ list_for_each_entry(device, devices, dev_list) {
+ if (device->bdev == bdev) {
+ ret = -EEXIST;
+ goto error;
+ }
+ }
+
+ device = kzalloc(sizeof(*device), GFP_NOFS);
+ if (!device) {
+ /* we can safely leave the fs_devices entry around */
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ device->name = kstrdup(device_path, GFP_NOFS);
+ if (!device->name) {
+ kfree(device);
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ ret = find_next_devid(root, &device->devid);
+ if (ret) {
+ kfree(device);
+ goto error;
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+ lock_chunks(root);
+
+ device->barriers = 1;
+ device->writeable = 1;
+ device->work.func = pending_bios_fn;
+ generate_random_uuid(device->uuid);
+ spin_lock_init(&device->io_lock);
+ device->generation = trans->transid;
+ device->io_width = root->sectorsize;
+ device->io_align = root->sectorsize;
+ device->sector_size = root->sectorsize;
+ device->total_bytes = i_size_read(bdev->bd_inode);
+ device->disk_total_bytes = device->total_bytes;
+ device->dev_root = root->fs_info->dev_root;
+ device->bdev = bdev;
+ device->in_fs_metadata = 1;
+ device->mode = 0;
+ set_blocksize(device->bdev, 4096);
+
+ if (seeding_dev) {
+ sb->s_flags &= ~MS_RDONLY;
+ ret = btrfs_prepare_sprout(trans, root);
+ BUG_ON(ret);
+ }
+
+ device->fs_devices = root->fs_info->fs_devices;
+
+ /*
+ * we don't want write_supers to jump in here with our device
+ * half setup
+ */
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
+ list_add(&device->dev_alloc_list,
+ &root->fs_info->fs_devices->alloc_list);
+ root->fs_info->fs_devices->num_devices++;
+ root->fs_info->fs_devices->open_devices++;
+ root->fs_info->fs_devices->rw_devices++;
+ root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
+
+ if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+ root->fs_info->fs_devices->rotating = 1;
+
+ total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
+ btrfs_set_super_total_bytes(&root->fs_info->super_copy,
+ total_bytes + device->total_bytes);
+
+ total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
+ btrfs_set_super_num_devices(&root->fs_info->super_copy,
+ total_bytes + 1);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+ if (seeding_dev) {
+ ret = init_first_rw_device(trans, root, device);
+ BUG_ON(ret);
+ ret = btrfs_finish_sprout(trans, root);
+ BUG_ON(ret);
+ } else {
+ ret = btrfs_add_device(trans, root, device);
+ }
+
+ /*
+ * we've got more storage, clear any full flags on the space
+ * infos
+ */
+ btrfs_clear_space_info_full(root->fs_info);
+
+ unlock_chunks(root);
+ btrfs_commit_transaction(trans, root);
+
+ if (seeding_dev) {
+ mutex_unlock(&uuid_mutex);
+ up_write(&sb->s_umount);
+
+ ret = btrfs_relocate_sys_chunks(root);
+ BUG_ON(ret);
+ }
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
+ return ret;
+error:
+ close_bdev_exclusive(bdev, 0);
+ if (seeding_dev) {
+ mutex_unlock(&uuid_mutex);
+ up_write(&sb->s_umount);
+ }
+ goto out;
+}
+
+static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_root *root;
+ struct btrfs_dev_item *dev_item;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+
+ root = device->dev_root->fs_info->chunk_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = device->devid;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
+
+ btrfs_set_device_id(leaf, dev_item, device->devid);
+ btrfs_set_device_type(leaf, dev_item, device->type);
+ btrfs_set_device_io_align(leaf, dev_item, device->io_align);
+ btrfs_set_device_io_width(leaf, dev_item, device->io_width);
+ btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
+ btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
+ btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+ btrfs_mark_buffer_dirty(leaf);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 new_size)
+{
+ struct btrfs_super_block *super_copy =
+ &device->dev_root->fs_info->super_copy;
+ u64 old_total = btrfs_super_total_bytes(super_copy);
+ u64 diff = new_size - device->total_bytes;
+
+ if (!device->writeable)
+ return -EACCES;
+ if (new_size <= device->total_bytes)
+ return -EINVAL;
+
+ btrfs_set_super_total_bytes(super_copy, old_total + diff);
+ device->fs_devices->total_rw_bytes += diff;
+
+ device->total_bytes = new_size;
+ device->disk_total_bytes = new_size;
+ btrfs_clear_space_info_full(device->dev_root->fs_info);
+
+ return btrfs_update_device(trans, device);
+}
+
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 new_size)
+{
+ int ret;
+ lock_chunks(device->dev_root);
+ ret = __btrfs_grow_device(trans, device, new_size);
+ unlock_chunks(device->dev_root);
+ return ret;
+}
+
+static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+
+ root = root->fs_info->chunk_root;
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = chunk_objectid;
+ key.offset = chunk_offset;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ BUG_ON(ret);
+
+ ret = btrfs_del_item(trans, root, path);
+ BUG_ON(ret);
+
+ btrfs_free_path(path);
+ return 0;
+}
+
+static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
+ chunk_offset)
+{
+ struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_disk_key *disk_key;
+ struct btrfs_chunk *chunk;
+ u8 *ptr;
+ int ret = 0;
+ u32 num_stripes;
+ u32 array_size;
+ u32 len = 0;
+ u32 cur;
+ struct btrfs_key key;
+
+ array_size = btrfs_super_sys_array_size(super_copy);
+
+ ptr = super_copy->sys_chunk_array;
+ cur = 0;
+
+ while (cur < array_size) {
+ disk_key = (struct btrfs_disk_key *)ptr;
+ btrfs_disk_key_to_cpu(&key, disk_key);
+
+ len = sizeof(*disk_key);
+
+ if (key.type == BTRFS_CHUNK_ITEM_KEY) {
+ chunk = (struct btrfs_chunk *)(ptr + len);
+ num_stripes = btrfs_stack_chunk_num_stripes(chunk);
+ len += btrfs_chunk_item_size(num_stripes);
+ } else {
+ ret = -EIO;
+ break;
+ }
+ if (key.objectid == chunk_objectid &&
+ key.offset == chunk_offset) {
+ memmove(ptr, ptr + len, array_size - (cur + len));
+ array_size -= len;
+ btrfs_set_super_sys_array_size(super_copy, array_size);
+ } else {
+ ptr += len;
+ cur += len;
+ }
+ }
+ return ret;
+}
+
+static int btrfs_relocate_chunk(struct btrfs_root *root,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset)
+{
+ struct extent_map_tree *em_tree;
+ struct btrfs_root *extent_root;
+ struct btrfs_trans_handle *trans;
+ struct extent_map *em;
+ struct map_lookup *map;
+ int ret;
+ int i;
+
+ root = root->fs_info->chunk_root;
+ extent_root = root->fs_info->extent_root;
+ em_tree = &root->fs_info->mapping_tree.map_tree;
+
+ ret = btrfs_can_relocate(extent_root, chunk_offset);
+ if (ret)
+ return -ENOSPC;
+
+ /* step one, relocate all the extents inside this chunk */
+ ret = btrfs_relocate_block_group(extent_root, chunk_offset);
+ BUG_ON(ret);
+
+ trans = btrfs_start_transaction(root, 1);
+ BUG_ON(!trans);
+
+ lock_chunks(root);
+
+ /*
+ * step two, delete the device extents and the
+ * chunk tree entries
+ */
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+ read_unlock(&em_tree->lock);
+
+ BUG_ON(em->start > chunk_offset ||
+ em->start + em->len < chunk_offset);
+ map = (struct map_lookup *)em->bdev;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
+ map->stripes[i].physical);
+ BUG_ON(ret);
+
+ if (map->stripes[i].dev) {
+ ret = btrfs_update_device(trans, map->stripes[i].dev);
+ BUG_ON(ret);
+ }
+ }
+ ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
+ chunk_offset);
+
+ BUG_ON(ret);
+
+ if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
+ BUG_ON(ret);
+ }
+
+ ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
+ BUG_ON(ret);
+
+ write_lock(&em_tree->lock);
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+
+ kfree(map);
+ em->bdev = NULL;
+
+ /* once for the tree */
+ free_extent_map(em);
+ /* once for us */
+ free_extent_map(em);
+
+ unlock_chunks(root);
+ btrfs_end_transaction(trans, root);
+ return 0;
+}
+
+static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
+{
+ struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_chunk *chunk;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u64 chunk_tree = chunk_root->root_key.objectid;
+ u64 chunk_type;
+ bool retried = false;
+ int failed = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+again:
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.offset = (u64)-1;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+ BUG_ON(ret == 0);
+
+ ret = btrfs_previous_item(chunk_root, path, key.objectid,
+ key.type);
+ if (ret < 0)
+ goto error;
+ if (ret > 0)
+ break;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ chunk = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_chunk);
+ chunk_type = btrfs_chunk_type(leaf, chunk);
+ btrfs_release_path(chunk_root, path);
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
+ found_key.objectid,
+ found_key.offset);
+ if (ret == -ENOSPC)
+ failed++;
+ else if (ret)
+ BUG();
+ }
+
+ if (found_key.offset == 0)
+ break;
+ key.offset = found_key.offset - 1;
+ }
+ ret = 0;
+ if (failed && !retried) {
+ failed = 0;
+ retried = true;
+ goto again;
+ } else if (failed && retried) {
+ WARN_ON(1);
+ ret = -ENOSPC;
+ }
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static u64 div_factor(u64 num, int factor)
+{
+ if (factor == 10)
+ return num;
+ num *= factor;
+ do_div(num, 10);
+ return num;
+}
+
+int btrfs_balance(struct btrfs_root *dev_root)
+{
+ int ret;
+ struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+ struct btrfs_device *device;
+ u64 old_size;
+ u64 size_to_free;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_chunk *chunk;
+ struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_key found_key;
+
+ if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ mutex_lock(&dev_root->fs_info->volume_mutex);
+ dev_root = dev_root->fs_info->dev_root;
+
+ /* step one make some room on all the devices */
+ list_for_each_entry(device, devices, dev_list) {
+ old_size = device->total_bytes;
+ size_to_free = div_factor(old_size, 1);
+ size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
+ if (!device->writeable ||
+ device->total_bytes - device->bytes_used > size_to_free)
+ continue;
+
+ ret = btrfs_shrink_device(device, old_size - size_to_free);
+ if (ret == -ENOSPC)
+ break;
+ BUG_ON(ret);
+
+ trans = btrfs_start_transaction(dev_root, 1);
+ BUG_ON(!trans);
+
+ ret = btrfs_grow_device(trans, device, old_size);
+ BUG_ON(ret);
+
+ btrfs_end_transaction(trans, dev_root);
+ }
+
+ /* step two, relocate all the chunks */
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.offset = (u64)-1;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+
+ /*
+ * this shouldn't happen, it means the last relocate
+ * failed
+ */
+ if (ret == 0)
+ break;
+
+ ret = btrfs_previous_item(chunk_root, path, 0,
+ BTRFS_CHUNK_ITEM_KEY);
+ if (ret)
+ break;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (found_key.objectid != key.objectid)
+ break;
+
+ chunk = btrfs_item_ptr(path->nodes[0],
+ path->slots[0],
+ struct btrfs_chunk);
+ /* chunk zero is special */
+ if (found_key.offset == 0)
+ break;
+
+ btrfs_release_path(chunk_root, path);
+ ret = btrfs_relocate_chunk(chunk_root,
+ chunk_root->root_key.objectid,
+ found_key.objectid,
+ found_key.offset);
+ BUG_ON(ret && ret != -ENOSPC);
+ key.offset = found_key.offset - 1;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ mutex_unlock(&dev_root->fs_info->volume_mutex);
+ return ret;
+}
+
+/*
+ * shrinking a device means finding all of the device extents past
+ * the new size, and then following the back refs to the chunks.
+ * The chunk relocation code actually frees the device extent
+ */
+int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = device->dev_root;
+ struct btrfs_dev_extent *dev_extent = NULL;
+ struct btrfs_path *path;
+ u64 length;
+ u64 chunk_tree;
+ u64 chunk_objectid;
+ u64 chunk_offset;
+ int ret;
+ int slot;
+ int failed = 0;
+ bool retried = false;
+ struct extent_buffer *l;
+ struct btrfs_key key;
+ struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ u64 old_total = btrfs_super_total_bytes(super_copy);
+ u64 old_size = device->total_bytes;
+ u64 diff = device->total_bytes - new_size;
+
+ if (new_size >= device->total_bytes)
+ return -EINVAL;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->reada = 2;
+
+ lock_chunks(root);
+
+ device->total_bytes = new_size;
+ if (device->writeable)
+ device->fs_devices->total_rw_bytes -= diff;
+ unlock_chunks(root);
+
+again:
+ key.objectid = device->devid;
+ key.offset = (u64)-1;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto done;
+
+ ret = btrfs_previous_item(root, path, 0, key.type);
+ if (ret < 0)
+ goto done;
+ if (ret) {
+ ret = 0;
+ btrfs_release_path(root, path);
+ break;
+ }
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+
+ if (key.objectid != device->devid) {
+ btrfs_release_path(root, path);
+ break;
+ }
+
+ dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+ length = btrfs_dev_extent_length(l, dev_extent);
+
+ if (key.offset + length <= new_size) {
+ btrfs_release_path(root, path);
+ break;
+ }
+
+ chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
+ chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
+ chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+ btrfs_release_path(root, path);
+
+ ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
+ chunk_offset);
+ if (ret && ret != -ENOSPC)
+ goto done;
+ if (ret == -ENOSPC)
+ failed++;
+ key.offset -= 1;
+ }
+
+ if (failed && !retried) {
+ failed = 0;
+ retried = true;
+ goto again;
+ } else if (failed && retried) {
+ ret = -ENOSPC;
+ lock_chunks(root);
+
+ device->total_bytes = old_size;
+ if (device->writeable)
+ device->fs_devices->total_rw_bytes += diff;
+ unlock_chunks(root);
+ goto done;
+ }
+
+ /* Shrinking succeeded, else we would be at "done". */
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ lock_chunks(root);
+
+ device->disk_total_bytes = new_size;
+ /* Now btrfs_update_device() will change the on-disk size. */
+ ret = btrfs_update_device(trans, device);
+ if (ret) {
+ unlock_chunks(root);
+ btrfs_end_transaction(trans, root);
+ goto done;
+ }
+ WARN_ON(diff > old_total);
+ btrfs_set_super_total_bytes(super_copy, old_total - diff);
+ unlock_chunks(root);
+ btrfs_end_transaction(trans, root);
+done:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *key,
+ struct btrfs_chunk *chunk, int item_size)
+{
+ struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_disk_key disk_key;
+ u32 array_size;
+ u8 *ptr;
+
+ array_size = btrfs_super_sys_array_size(super_copy);
+ if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+ return -EFBIG;
+
+ ptr = super_copy->sys_chunk_array + array_size;
+ btrfs_cpu_key_to_disk(&disk_key, key);
+ memcpy(ptr, &disk_key, sizeof(disk_key));
+ ptr += sizeof(disk_key);
+ memcpy(ptr, chunk, item_size);
+ item_size += sizeof(disk_key);
+ btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
+ return 0;
+}
+
+static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
+ int num_stripes, int sub_stripes)
+{
+ if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
+ return calc_size;
+ else if (type & BTRFS_BLOCK_GROUP_RAID10)
+ return calc_size * (num_stripes / sub_stripes);
+ else
+ return calc_size * num_stripes;
+}
+
+static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct map_lookup **map_ret,
+ u64 *num_bytes, u64 *stripe_size,
+ u64 start, u64 type)
+{
+ struct btrfs_fs_info *info = extent_root->fs_info;
+ struct btrfs_device *device = NULL;
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
+ struct list_head *cur;
+ struct map_lookup *map = NULL;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct list_head private_devs;
+ int min_stripe_size = 1 * 1024 * 1024;
+ u64 calc_size = 1024 * 1024 * 1024;
+ u64 max_chunk_size = calc_size;
+ u64 min_free;
+ u64 avail;
+ u64 max_avail = 0;
+ u64 dev_offset;
+ int num_stripes = 1;
+ int min_stripes = 1;
+ int sub_stripes = 0;
+ int looped = 0;
+ int ret;
+ int index;
+ int stripe_len = 64 * 1024;
+
+ if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+ (type & BTRFS_BLOCK_GROUP_DUP)) {
+ WARN_ON(1);
+ type &= ~BTRFS_BLOCK_GROUP_DUP;
+ }
+ if (list_empty(&fs_devices->alloc_list))
+ return -ENOSPC;
+
+ if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
+ num_stripes = fs_devices->rw_devices;
+ min_stripes = 2;
+ }
+ if (type & (BTRFS_BLOCK_GROUP_DUP)) {
+ num_stripes = 2;
+ min_stripes = 2;
+ }
+ if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
+ num_stripes = min_t(u64, 2, fs_devices->rw_devices);
+ if (num_stripes < 2)
+ return -ENOSPC;
+ min_stripes = 2;
+ }
+ if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
+ num_stripes = fs_devices->rw_devices;
+ if (num_stripes < 4)
+ return -ENOSPC;
+ num_stripes &= ~(u32)1;
+ sub_stripes = 2;
+ min_stripes = 4;
+ }
+
+ if (type & BTRFS_BLOCK_GROUP_DATA) {
+ max_chunk_size = 10 * calc_size;
+ min_stripe_size = 64 * 1024 * 1024;
+ } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+ max_chunk_size = 256 * 1024 * 1024;
+ min_stripe_size = 32 * 1024 * 1024;
+ } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ calc_size = 8 * 1024 * 1024;
+ max_chunk_size = calc_size * 2;
+ min_stripe_size = 1 * 1024 * 1024;
+ }
+
+ /* we don't want a chunk larger than 10% of writeable space */
+ max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
+ max_chunk_size);
+
+again:
+ max_avail = 0;
+ if (!map || map->num_stripes != num_stripes) {
+ kfree(map);
+ map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+ if (!map)
+ return -ENOMEM;
+ map->num_stripes = num_stripes;
+ }
+
+ if (calc_size * num_stripes > max_chunk_size) {
+ calc_size = max_chunk_size;
+ do_div(calc_size, num_stripes);
+ do_div(calc_size, stripe_len);
+ calc_size *= stripe_len;
+ }
+ /* we don't want tiny stripes */
+ calc_size = max_t(u64, min_stripe_size, calc_size);
+
+ do_div(calc_size, stripe_len);
+ calc_size *= stripe_len;
+
+ cur = fs_devices->alloc_list.next;
+ index = 0;
+
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ min_free = calc_size * 2;
+ else
+ min_free = calc_size;
+
+ /*
+ * we add 1MB because we never use the first 1MB of the device, unless
+ * we've looped, then we are likely allocating the maximum amount of
+ * space left already
+ */
+ if (!looped)
+ min_free += 1024 * 1024;
+
+ INIT_LIST_HEAD(&private_devs);
+ while (index < num_stripes) {
+ device = list_entry(cur, struct btrfs_device, dev_alloc_list);
+ BUG_ON(!device->writeable);
+ if (device->total_bytes > device->bytes_used)
+ avail = device->total_bytes - device->bytes_used;
+ else
+ avail = 0;
+ cur = cur->next;
+
+ if (device->in_fs_metadata && avail >= min_free) {
+ ret = find_free_dev_extent(trans, device,
+ min_free, &dev_offset,
+ &max_avail);
+ if (ret == 0) {
+ list_move_tail(&device->dev_alloc_list,
+ &private_devs);
+ map->stripes[index].dev = device;
+ map->stripes[index].physical = dev_offset;
+ index++;
+ if (type & BTRFS_BLOCK_GROUP_DUP) {
+ map->stripes[index].dev = device;
+ map->stripes[index].physical =
+ dev_offset + calc_size;
+ index++;
+ }
+ }
+ } else if (device->in_fs_metadata && avail > max_avail)
+ max_avail = avail;
+ if (cur == &fs_devices->alloc_list)
+ break;
+ }
+ list_splice(&private_devs, &fs_devices->alloc_list);
+ if (index < num_stripes) {
+ if (index >= min_stripes) {
+ num_stripes = index;
+ if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
+ num_stripes /= sub_stripes;
+ num_stripes *= sub_stripes;
+ }
+ looped = 1;
+ goto again;
+ }
+ if (!looped && max_avail > 0) {
+ looped = 1;
+ calc_size = max_avail;
+ goto again;
+ }
+ kfree(map);
+ return -ENOSPC;
+ }
+ map->sector_size = extent_root->sectorsize;
+ map->stripe_len = stripe_len;
+ map->io_align = stripe_len;
+ map->io_width = stripe_len;
+ map->type = type;
+ map->num_stripes = num_stripes;
+ map->sub_stripes = sub_stripes;
+
+ *map_ret = map;
+ *stripe_size = calc_size;
+ *num_bytes = chunk_bytes_by_type(type, calc_size,
+ num_stripes, sub_stripes);
+
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em) {
+ kfree(map);
+ return -ENOMEM;
+ }
+ em->bdev = (struct block_device *)map;
+ em->start = start;
+ em->len = *num_bytes;
+ em->block_start = 0;
+ em->block_len = em->len;
+
+ em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ BUG_ON(ret);
+ free_extent_map(em);
+
+ ret = btrfs_make_block_group(trans, extent_root, 0, type,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ start, *num_bytes);
+ BUG_ON(ret);
+
+ index = 0;
+ while (index < map->num_stripes) {
+ device = map->stripes[index].dev;
+ dev_offset = map->stripes[index].physical;
+
+ ret = btrfs_alloc_dev_extent(trans, device,
+ info->chunk_root->root_key.objectid,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ start, dev_offset, calc_size);
+ BUG_ON(ret);
+ index++;
+ }
+
+ return 0;
+}
+
+static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct map_lookup *map, u64 chunk_offset,
+ u64 chunk_size, u64 stripe_size)
+{
+ u64 dev_offset;
+ struct btrfs_key key;
+ struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+ struct btrfs_device *device;
+ struct btrfs_chunk *chunk;
+ struct btrfs_stripe *stripe;
+ size_t item_size = btrfs_chunk_item_size(map->num_stripes);
+ int index = 0;
+ int ret;
+
+ chunk = kzalloc(item_size, GFP_NOFS);
+ if (!chunk)
+ return -ENOMEM;
+
+ index = 0;
+ while (index < map->num_stripes) {
+ device = map->stripes[index].dev;
+ device->bytes_used += stripe_size;
+ ret = btrfs_update_device(trans, device);
+ BUG_ON(ret);
+ index++;
+ }
+
+ index = 0;
+ stripe = &chunk->stripe;
+ while (index < map->num_stripes) {
+ device = map->stripes[index].dev;
+ dev_offset = map->stripes[index].physical;
+
+ btrfs_set_stack_stripe_devid(stripe, device->devid);
+ btrfs_set_stack_stripe_offset(stripe, dev_offset);
+ memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
+ stripe++;
+ index++;
+ }
+
+ btrfs_set_stack_chunk_length(chunk, chunk_size);
+ btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
+ btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_type(chunk, map->type);
+ btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
+ btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
+ btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
+
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = chunk_offset;
+
+ ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
+ BUG_ON(ret);
+
+ if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
+ item_size);
+ BUG_ON(ret);
+ }
+ kfree(chunk);
+ return 0;
+}
+
+/*
+ * Chunk allocation falls into two parts. The first part does works
+ * that make the new allocated chunk useable, but not do any operation
+ * that modifies the chunk tree. The second part does the works that
+ * require modifying the chunk tree. This division is important for the
+ * bootstrap process of adding storage to a seed btrfs.
+ */
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, u64 type)
+{
+ u64 chunk_offset;
+ u64 chunk_size;
+ u64 stripe_size;
+ struct map_lookup *map;
+ struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+ int ret;
+
+ ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ &chunk_offset);
+ if (ret)
+ return ret;
+
+ ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
+ &stripe_size, chunk_offset, type);
+ if (ret)
+ return ret;
+
+ ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
+ chunk_size, stripe_size);
+ BUG_ON(ret);
+ return 0;
+}
+
+static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_device *device)
+{
+ u64 chunk_offset;
+ u64 sys_chunk_offset;
+ u64 chunk_size;
+ u64 sys_chunk_size;
+ u64 stripe_size;
+ u64 sys_stripe_size;
+ u64 alloc_profile;
+ struct map_lookup *map;
+ struct map_lookup *sys_map;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ int ret;
+
+ ret = find_next_chunk(fs_info->chunk_root,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
+ BUG_ON(ret);
+
+ alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
+ (fs_info->metadata_alloc_profile &
+ fs_info->avail_metadata_alloc_bits);
+ alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
+
+ ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
+ &stripe_size, chunk_offset, alloc_profile);
+ BUG_ON(ret);
+
+ sys_chunk_offset = chunk_offset + chunk_size;
+
+ alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
+ (fs_info->system_alloc_profile &
+ fs_info->avail_system_alloc_bits);
+ alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
+
+ ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
+ &sys_chunk_size, &sys_stripe_size,
+ sys_chunk_offset, alloc_profile);
+ BUG_ON(ret);
+
+ ret = btrfs_add_device(trans, fs_info->chunk_root, device);
+ BUG_ON(ret);
+
+ /*
+ * Modifying chunk tree needs allocating new blocks from both
+ * system block group and metadata block group. So we only can
+ * do operations require modifying the chunk tree after both
+ * block groups were created.
+ */
+ ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
+ chunk_size, stripe_size);
+ BUG_ON(ret);
+
+ ret = __finish_chunk_alloc(trans, extent_root, sys_map,
+ sys_chunk_offset, sys_chunk_size,
+ sys_stripe_size);
+ BUG_ON(ret);
+ return 0;
+}
+
+int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+ int readonly = 0;
+ int i;
+
+ read_lock(&map_tree->map_tree.lock);
+ em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
+ read_unlock(&map_tree->map_tree.lock);
+ if (!em)
+ return 1;
+
+ if (btrfs_test_opt(root, DEGRADED)) {
+ free_extent_map(em);
+ return 0;
+ }
+
+ map = (struct map_lookup *)em->bdev;
+ for (i = 0; i < map->num_stripes; i++) {
+ if (!map->stripes[i].dev->writeable) {
+ readonly = 1;
+ break;
+ }
+ }
+ free_extent_map(em);
+ return readonly;
+}
+
+void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
+{
+ extent_map_tree_init(&tree->map_tree, GFP_NOFS);
+}
+
+void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
+{
+ struct extent_map *em;
+
+ while (1) {
+ write_lock(&tree->map_tree.lock);
+ em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
+ if (em)
+ remove_extent_mapping(&tree->map_tree, em);
+ write_unlock(&tree->map_tree.lock);
+ if (!em)
+ break;
+ kfree(em->bdev);
+ /* once for us */
+ free_extent_map(em);
+ /* once for the tree */
+ free_extent_map(em);
+ }
+}
+
+int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ int ret;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, len);
+ read_unlock(&em_tree->lock);
+ BUG_ON(!em);
+
+ BUG_ON(em->start > logical || em->start + em->len < logical);
+ map = (struct map_lookup *)em->bdev;
+ if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
+ ret = map->num_stripes;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+ ret = map->sub_stripes;
+ else
+ ret = 1;
+ free_extent_map(em);
+ return ret;
+}
+
+static int find_live_mirror(struct map_lookup *map, int first, int num,
+ int optimal)
+{
+ int i;
+ if (map->stripes[optimal].dev->bdev)
+ return optimal;
+ for (i = first; i < first + num; i++) {
+ if (map->stripes[i].dev->bdev)
+ return i;
+ }
+ /* we couldn't find one that doesn't fail. Just return something
+ * and the io error handling code will clean up eventually
+ */
+ return optimal;
+}
+
+static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+ u64 logical, u64 *length,
+ struct btrfs_multi_bio **multi_ret,
+ int mirror_num, struct page *unplug_page)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ u64 offset;
+ u64 stripe_offset;
+ u64 stripe_nr;
+ int stripes_allocated = 8;
+ int stripes_required = 1;
+ int stripe_index;
+ int i;
+ int num_stripes;
+ int max_errors = 0;
+ struct btrfs_multi_bio *multi = NULL;
+
+ if (multi_ret && !(rw & (1 << BIO_RW)))
+ stripes_allocated = 1;
+again:
+ if (multi_ret) {
+ multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
+ GFP_NOFS);
+ if (!multi)
+ return -ENOMEM;
+
+ atomic_set(&multi->error, 0);
+ }
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, *length);
+ read_unlock(&em_tree->lock);
+
+ if (!em && unplug_page) {
+ kfree(multi);
+ return 0;
+ }
+
+ if (!em) {
+ printk(KERN_CRIT "unable to find logical %llu len %llu\n",
+ (unsigned long long)logical,
+ (unsigned long long)*length);
+ BUG();
+ }
+
+ BUG_ON(em->start > logical || em->start + em->len < logical);
+ map = (struct map_lookup *)em->bdev;
+ offset = logical - em->start;
+
+ if (mirror_num > map->num_stripes)
+ mirror_num = 0;
+
+ /* if our multi bio struct is too small, back off and try again */
+ if (rw & (1 << BIO_RW)) {
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_DUP)) {
+ stripes_required = map->num_stripes;
+ max_errors = 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ stripes_required = map->sub_stripes;
+ max_errors = 1;
+ }
+ }
+ if (multi_ret && (rw & (1 << BIO_RW)) &&
+ stripes_allocated < stripes_required) {
+ stripes_allocated = map->num_stripes;
+ free_extent_map(em);
+ kfree(multi);
+ goto again;
+ }
+ stripe_nr = offset;
+ /*
+ * stripe_nr counts the total number of stripes we have to stride
+ * to get to this block
+ */
+ do_div(stripe_nr, map->stripe_len);
+
+ stripe_offset = stripe_nr * map->stripe_len;
+ BUG_ON(offset < stripe_offset);
+
+ /* stripe_offset is the offset of this block in its stripe*/
+ stripe_offset = offset - stripe_offset;
+
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_DUP)) {
+ /* we limit the length of each bio to what fits in a stripe */
+ *length = min_t(u64, em->len - offset,
+ map->stripe_len - stripe_offset);
+ } else {
+ *length = em->len - offset;
+ }
+
+ if (!multi_ret && !unplug_page)
+ goto out;
+
+ num_stripes = 1;
+ stripe_index = 0;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+ if (unplug_page || (rw & (1 << BIO_RW)))
+ num_stripes = map->num_stripes;
+ else if (mirror_num)
+ stripe_index = mirror_num - 1;
+ else {
+ stripe_index = find_live_mirror(map, 0,
+ map->num_stripes,
+ current->pid % map->num_stripes);
+ }
+
+ } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
+ if (rw & (1 << BIO_RW))
+ num_stripes = map->num_stripes;
+ else if (mirror_num)
+ stripe_index = mirror_num - 1;
+
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ int factor = map->num_stripes / map->sub_stripes;
+
+ stripe_index = do_div(stripe_nr, factor);
+ stripe_index *= map->sub_stripes;
+
+ if (unplug_page || (rw & (1 << BIO_RW)))
+ num_stripes = map->sub_stripes;
+ else if (mirror_num)
+ stripe_index += mirror_num - 1;
+ else {
+ stripe_index = find_live_mirror(map, stripe_index,
+ map->sub_stripes, stripe_index +
+ current->pid % map->sub_stripes);
+ }
+ } else {
+ /*
+ * after this do_div call, stripe_nr is the number of stripes
+ * on this device we have to walk to find the data, and
+ * stripe_index is the number of our device in the stripe array
+ */
+ stripe_index = do_div(stripe_nr, map->num_stripes);
+ }
+ BUG_ON(stripe_index >= map->num_stripes);
+
+ for (i = 0; i < num_stripes; i++) {
+ if (unplug_page) {
+ struct btrfs_device *device;
+ struct backing_dev_info *bdi;
+
+ device = map->stripes[stripe_index].dev;
+ if (device->bdev) {
+ bdi = blk_get_backing_dev_info(device->bdev);
+ if (bdi->unplug_io_fn)
+ bdi->unplug_io_fn(bdi, unplug_page);
+ }
+ } else {
+ multi->stripes[i].physical =
+ map->stripes[stripe_index].physical +
+ stripe_offset + stripe_nr * map->stripe_len;
+ multi->stripes[i].dev = map->stripes[stripe_index].dev;
+ }
+ stripe_index++;
+ }
+ if (multi_ret) {
+ *multi_ret = multi;
+ multi->num_stripes = num_stripes;
+ multi->max_errors = max_errors;
+ }
+out:
+ free_extent_map(em);
+ return 0;
+}
+
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+ u64 logical, u64 *length,
+ struct btrfs_multi_bio **multi_ret, int mirror_num)
+{
+ return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
+ mirror_num, NULL);
+}
+
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+ u64 chunk_start, u64 physical, u64 devid,
+ u64 **logical, int *naddrs, int *stripe_len)
+{
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 *buf;
+ u64 bytenr;
+ u64 length;
+ u64 stripe_nr;
+ int i, j, nr = 0;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_start, 1);
+ read_unlock(&em_tree->lock);
+
+ BUG_ON(!em || em->start != chunk_start);
+ map = (struct map_lookup *)em->bdev;
+
+ length = em->len;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+ do_div(length, map->num_stripes / map->sub_stripes);
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+ do_div(length, map->num_stripes);
+
+ buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+ BUG_ON(!buf);
+
+ for (i = 0; i < map->num_stripes; i++) {
+ if (devid && map->stripes[i].dev->devid != devid)
+ continue;
+ if (map->stripes[i].physical > physical ||
+ map->stripes[i].physical + length <= physical)
+ continue;
+
+ stripe_nr = physical - map->stripes[i].physical;
+ do_div(stripe_nr, map->stripe_len);
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ stripe_nr = stripe_nr * map->num_stripes + i;
+ do_div(stripe_nr, map->sub_stripes);
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+ stripe_nr = stripe_nr * map->num_stripes + i;
+ }
+ bytenr = chunk_start + stripe_nr * map->stripe_len;
+ WARN_ON(nr >= map->num_stripes);
+ for (j = 0; j < nr; j++) {
+ if (buf[j] == bytenr)
+ break;
+ }
+ if (j == nr) {
+ WARN_ON(nr >= map->num_stripes);
+ buf[nr++] = bytenr;
+ }
+ }
+
+ *logical = buf;
+ *naddrs = nr;
+ *stripe_len = map->stripe_len;
+
+ free_extent_map(em);
+ return 0;
+}
+
+int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
+ u64 logical, struct page *page)
+{
+ u64 length = PAGE_CACHE_SIZE;
+ return __btrfs_map_block(map_tree, READ, logical, &length,
+ NULL, 0, page);
+}
+
+static void end_bio_multi_stripe(struct bio *bio, int err)
+{
+ struct btrfs_multi_bio *multi = bio->bi_private;
+ int is_orig_bio = 0;
+
+ if (err)
+ atomic_inc(&multi->error);
+
+ if (bio == multi->orig_bio)
+ is_orig_bio = 1;
+
+ if (atomic_dec_and_test(&multi->stripes_pending)) {
+ if (!is_orig_bio) {
+ bio_put(bio);
+ bio = multi->orig_bio;
+ }
+ bio->bi_private = multi->private;
+ bio->bi_end_io = multi->end_io;
+ /* only send an error to the higher layers if it is
+ * beyond the tolerance of the multi-bio
+ */
+ if (atomic_read(&multi->error) > multi->max_errors) {
+ err = -EIO;
+ } else if (err) {
+ /*
+ * this bio is actually up to date, we didn't
+ * go over the max number of errors
+ */
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+ err = 0;
+ }
+ kfree(multi);
+
+ bio_endio(bio, err);
+ } else if (!is_orig_bio) {
+ bio_put(bio);
+ }
+}
+
+struct async_sched {
+ struct bio *bio;
+ int rw;
+ struct btrfs_fs_info *info;
+ struct btrfs_work work;
+};
+
+/*
+ * see run_scheduled_bios for a description of why bios are collected for
+ * async submit.
+ *
+ * This will add one bio to the pending list for a device and make sure
+ * the work struct is scheduled.
+ */
+static noinline int schedule_bio(struct btrfs_root *root,
+ struct btrfs_device *device,
+ int rw, struct bio *bio)
+{
+ int should_queue = 1;
+ struct btrfs_pending_bios *pending_bios;
+
+ /* don't bother with additional async steps for reads, right now */
+ if (!(rw & (1 << BIO_RW))) {
+ bio_get(bio);
+ submit_bio(rw, bio);
+ bio_put(bio);
+ return 0;
+ }
+
+ /*
+ * nr_async_bios allows us to reliably return congestion to the
+ * higher layers. Otherwise, the async bio makes it appear we have
+ * made progress against dirty pages when we've really just put it
+ * on a queue for later
+ */
+ atomic_inc(&root->fs_info->nr_async_bios);
+ WARN_ON(bio->bi_next);
+ bio->bi_next = NULL;
+ bio->bi_rw |= rw;
+
+ spin_lock(&device->io_lock);
+ if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
+ pending_bios = &device->pending_sync_bios;
+ else
+ pending_bios = &device->pending_bios;
+
+ if (pending_bios->tail)
+ pending_bios->tail->bi_next = bio;
+
+ pending_bios->tail = bio;
+ if (!pending_bios->head)
+ pending_bios->head = bio;
+ if (device->running_pending)
+ should_queue = 0;
+
+ spin_unlock(&device->io_lock);
+
+ if (should_queue)
+ btrfs_queue_worker(&root->fs_info->submit_workers,
+ &device->work);
+ return 0;
+}
+
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+ int mirror_num, int async_submit)
+{
+ struct btrfs_mapping_tree *map_tree;
+ struct btrfs_device *dev;
+ struct bio *first_bio = bio;
+ u64 logical = (u64)bio->bi_sector << 9;
+ u64 length = 0;
+ u64 map_length;
+ struct btrfs_multi_bio *multi = NULL;
+ int ret;
+ int dev_nr = 0;
+ int total_devs = 1;
+
+ length = bio->bi_size;
+ map_tree = &root->fs_info->mapping_tree;
+ map_length = length;
+
+ ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
+ mirror_num);
+ BUG_ON(ret);
+
+ total_devs = multi->num_stripes;
+ if (map_length < length) {
+ printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
+ "len %llu\n", (unsigned long long)logical,
+ (unsigned long long)length,
+ (unsigned long long)map_length);
+ BUG();
+ }
+ multi->end_io = first_bio->bi_end_io;
+ multi->private = first_bio->bi_private;
+ multi->orig_bio = first_bio;
+ atomic_set(&multi->stripes_pending, multi->num_stripes);
+
+ while (dev_nr < total_devs) {
+ if (total_devs > 1) {
+ if (dev_nr < total_devs - 1) {
+ bio = bio_clone(first_bio, GFP_NOFS);
+ BUG_ON(!bio);
+ } else {
+ bio = first_bio;
+ }
+ bio->bi_private = multi;
+ bio->bi_end_io = end_bio_multi_stripe;
+ }
+ bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
+ dev = multi->stripes[dev_nr].dev;
+ BUG_ON(rw == WRITE && !dev->writeable);
+ if (dev && dev->bdev) {
+ bio->bi_bdev = dev->bdev;
+ if (async_submit)
+ schedule_bio(root, dev, rw, bio);
+ else
+ submit_bio(rw, bio);
+ } else {
+ bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
+ bio->bi_sector = logical >> 9;
+ bio_endio(bio, -EIO);
+ }
+ dev_nr++;
+ }
+ if (total_devs == 1)
+ kfree(multi);
+ return 0;
+}
+
+struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+ u8 *uuid, u8 *fsid)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *cur_devices;
+
+ cur_devices = root->fs_info->fs_devices;
+ while (cur_devices) {
+ if (!fsid ||
+ !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
+ device = __find_device(&cur_devices->devices,
+ devid, uuid);
+ if (device)
+ return device;
+ }
+ cur_devices = cur_devices->seed;
+ }
+ return NULL;
+}
+
+static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
+ u64 devid, u8 *dev_uuid)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+
+ device = kzalloc(sizeof(*device), GFP_NOFS);
+ if (!device)
+ return NULL;
+ list_add(&device->dev_list,
+ &fs_devices->devices);
+ device->barriers = 1;
+ device->dev_root = root->fs_info->dev_root;
+ device->devid = devid;
+ device->work.func = pending_bios_fn;
+ device->fs_devices = fs_devices;
+ fs_devices->num_devices++;
+ spin_lock_init(&device->io_lock);
+ INIT_LIST_HEAD(&device->dev_alloc_list);
+ memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
+ return device;
+}
+
+static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
+ struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk)
+{
+ struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+ struct map_lookup *map;
+ struct extent_map *em;
+ u64 logical;
+ u64 length;
+ u64 devid;
+ u8 uuid[BTRFS_UUID_SIZE];
+ int num_stripes;
+ int ret;
+ int i;
+
+ logical = key->offset;
+ length = btrfs_chunk_length(leaf, chunk);
+
+ read_lock(&map_tree->map_tree.lock);
+ em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
+ read_unlock(&map_tree->map_tree.lock);
+
+ /* already mapped? */
+ if (em && em->start <= logical && em->start + em->len > logical) {
+ free_extent_map(em);
+ return 0;
+ } else if (em) {
+ free_extent_map(em);
+ }
+
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em)
+ return -ENOMEM;
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+ if (!map) {
+ free_extent_map(em);
+ return -ENOMEM;
+ }
+
+ em->bdev = (struct block_device *)map;
+ em->start = logical;
+ em->len = length;
+ em->block_start = 0;
+ em->block_len = em->len;
+
+ map->num_stripes = num_stripes;
+ map->io_width = btrfs_chunk_io_width(leaf, chunk);
+ map->io_align = btrfs_chunk_io_align(leaf, chunk);
+ map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
+ map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+ map->type = btrfs_chunk_type(leaf, chunk);
+ map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+ for (i = 0; i < num_stripes; i++) {
+ map->stripes[i].physical =
+ btrfs_stripe_offset_nr(leaf, chunk, i);
+ devid = btrfs_stripe_devid_nr(leaf, chunk, i);
+ read_extent_buffer(leaf, uuid, (unsigned long)
+ btrfs_stripe_dev_uuid_nr(chunk, i),
+ BTRFS_UUID_SIZE);
+ map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
+ NULL);
+ if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
+ kfree(map);
+ free_extent_map(em);
+ return -EIO;
+ }
+ if (!map->stripes[i].dev) {
+ map->stripes[i].dev =
+ add_missing_dev(root, devid, uuid);
+ if (!map->stripes[i].dev) {
+ kfree(map);
+ free_extent_map(em);
+ return -EIO;
+ }
+ }
+ map->stripes[i].dev->in_fs_metadata = 1;
+ }
+
+ write_lock(&map_tree->map_tree.lock);
+ ret = add_extent_mapping(&map_tree->map_tree, em);
+ write_unlock(&map_tree->map_tree.lock);
+ BUG_ON(ret);
+ free_extent_map(em);
+
+ return 0;
+}
+
+static int fill_device_from_item(struct extent_buffer *leaf,
+ struct btrfs_dev_item *dev_item,
+ struct btrfs_device *device)
+{
+ unsigned long ptr;
+
+ device->devid = btrfs_device_id(leaf, dev_item);
+ device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+ device->total_bytes = device->disk_total_bytes;
+ device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
+ device->type = btrfs_device_type(leaf, dev_item);
+ device->io_align = btrfs_device_io_align(leaf, dev_item);
+ device->io_width = btrfs_device_io_width(leaf, dev_item);
+ device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+
+ ptr = (unsigned long)btrfs_device_uuid(dev_item);
+ read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
+
+ return 0;
+}
+
+static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
+{
+ struct btrfs_fs_devices *fs_devices;
+ int ret;
+
+ mutex_lock(&uuid_mutex);
+
+ fs_devices = root->fs_info->fs_devices->seed;
+ while (fs_devices) {
+ if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
+ ret = 0;
+ goto out;
+ }
+ fs_devices = fs_devices->seed;
+ }
+
+ fs_devices = find_fsid(fsid);
+ if (!fs_devices) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ fs_devices = clone_fs_devices(fs_devices);
+ if (IS_ERR(fs_devices)) {
+ ret = PTR_ERR(fs_devices);
+ goto out;
+ }
+
+ ret = __btrfs_open_devices(fs_devices, FMODE_READ,
+ root->fs_info->bdev_holder);
+ if (ret)
+ goto out;
+
+ if (!fs_devices->seeding) {
+ __btrfs_close_devices(fs_devices);
+ free_fs_devices(fs_devices);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ fs_devices->seed = root->fs_info->fs_devices->seed;
+ root->fs_info->fs_devices->seed = fs_devices;
+out:
+ mutex_unlock(&uuid_mutex);
+ return ret;
+}
+
+static int read_one_dev(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_dev_item *dev_item)
+{
+ struct btrfs_device *device;
+ u64 devid;
+ int ret;
+ u8 fs_uuid[BTRFS_UUID_SIZE];
+ u8 dev_uuid[BTRFS_UUID_SIZE];
+
+ devid = btrfs_device_id(leaf, dev_item);
+ read_extent_buffer(leaf, dev_uuid,
+ (unsigned long)btrfs_device_uuid(dev_item),
+ BTRFS_UUID_SIZE);
+ read_extent_buffer(leaf, fs_uuid,
+ (unsigned long)btrfs_device_fsid(dev_item),
+ BTRFS_UUID_SIZE);
+
+ if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
+ ret = open_seed_devices(root, fs_uuid);
+ if (ret && !btrfs_test_opt(root, DEGRADED))
+ return ret;
+ }
+
+ device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+ if (!device || !device->bdev) {
+ if (!btrfs_test_opt(root, DEGRADED))
+ return -EIO;
+
+ if (!device) {
+ printk(KERN_WARNING "warning devid %llu missing\n",
+ (unsigned long long)devid);
+ device = add_missing_dev(root, devid, dev_uuid);
+ if (!device)
+ return -ENOMEM;
+ }
+ }
+
+ if (device->fs_devices != root->fs_info->fs_devices) {
+ BUG_ON(device->writeable);
+ if (device->generation !=
+ btrfs_device_generation(leaf, dev_item))
+ return -EINVAL;
+ }
+
+ fill_device_from_item(leaf, dev_item, device);
+ device->dev_root = root->fs_info->dev_root;
+ device->in_fs_metadata = 1;
+ if (device->writeable)
+ device->fs_devices->total_rw_bytes += device->total_bytes;
+ ret = 0;
+ return ret;
+}
+
+int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
+{
+ struct btrfs_dev_item *dev_item;
+
+ dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
+ dev_item);
+ return read_one_dev(root, buf, dev_item);
+}
+
+int btrfs_read_sys_array(struct btrfs_root *root)
+{
+ struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct extent_buffer *sb;
+ struct btrfs_disk_key *disk_key;
+ struct btrfs_chunk *chunk;
+ u8 *ptr;
+ unsigned long sb_ptr;
+ int ret = 0;
+ u32 num_stripes;
+ u32 array_size;
+ u32 len = 0;
+ u32 cur;
+ struct btrfs_key key;
+
+ sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
+ BTRFS_SUPER_INFO_SIZE);
+ if (!sb)
+ return -ENOMEM;
+ btrfs_set_buffer_uptodate(sb);
+ btrfs_set_buffer_lockdep_class(sb, 0);
+
+ write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
+ array_size = btrfs_super_sys_array_size(super_copy);
+
+ ptr = super_copy->sys_chunk_array;
+ sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
+ cur = 0;
+
+ while (cur < array_size) {
+ disk_key = (struct btrfs_disk_key *)ptr;
+ btrfs_disk_key_to_cpu(&key, disk_key);
+
+ len = sizeof(*disk_key); ptr += len;
+ sb_ptr += len;
+ cur += len;
+
+ if (key.type == BTRFS_CHUNK_ITEM_KEY) {
+ chunk = (struct btrfs_chunk *)sb_ptr;
+ ret = read_one_chunk(root, &key, sb, chunk);
+ if (ret)
+ break;
+ num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+ len = btrfs_chunk_item_size(num_stripes);
+ } else {
+ ret = -EIO;
+ break;
+ }
+ ptr += len;
+ sb_ptr += len;
+ cur += len;
+ }
+ free_extent_buffer(sb);
+ return ret;
+}
+
+int btrfs_read_chunk_tree(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ int ret;
+ int slot;
+
+ root = root->fs_info->chunk_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* first we search for all of the device items, and then we
+ * read in all of the chunk items. This way we can create chunk
+ * mappings that reference all of the devices that are afound
+ */
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.offset = 0;
+ key.type = 0;
+again:
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto error;
+ break;
+ }
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
+ if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
+ break;
+ if (found_key.type == BTRFS_DEV_ITEM_KEY) {
+ struct btrfs_dev_item *dev_item;
+ dev_item = btrfs_item_ptr(leaf, slot,
+ struct btrfs_dev_item);
+ ret = read_one_dev(root, leaf, dev_item);
+ if (ret)
+ goto error;
+ }
+ } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
+ struct btrfs_chunk *chunk;
+ chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+ ret = read_one_chunk(root, &found_key, leaf, chunk);
+ if (ret)
+ goto error;
+ }
+ path->slots[0]++;
+ }
+ if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
+ key.objectid = 0;
+ btrfs_release_path(root, path);
+ goto again;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
new file mode 100644
index 00000000000..31b0fabdd2e
--- /dev/null
+++ b/fs/btrfs/volumes.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_VOLUMES_
+#define __BTRFS_VOLUMES_
+
+#include <linux/bio.h>
+#include "async-thread.h"
+
+struct buffer_head;
+struct btrfs_pending_bios {
+ struct bio *head;
+ struct bio *tail;
+};
+
+struct btrfs_device {
+ struct list_head dev_list;
+ struct list_head dev_alloc_list;
+ struct btrfs_fs_devices *fs_devices;
+ struct btrfs_root *dev_root;
+
+ /* regular prio bios */
+ struct btrfs_pending_bios pending_bios;
+ /* WRITE_SYNC bios */
+ struct btrfs_pending_bios pending_sync_bios;
+
+ int running_pending;
+ u64 generation;
+
+ int barriers;
+ int writeable;
+ int in_fs_metadata;
+
+ spinlock_t io_lock;
+
+ struct block_device *bdev;
+
+ /* the mode sent to open_bdev_exclusive */
+ fmode_t mode;
+
+ char *name;
+
+ /* the internal btrfs device id */
+ u64 devid;
+
+ /* size of the device */
+ u64 total_bytes;
+
+ /* size of the disk */
+ u64 disk_total_bytes;
+
+ /* bytes used */
+ u64 bytes_used;
+
+ /* optimal io alignment for this device */
+ u32 io_align;
+
+ /* optimal io width for this device */
+ u32 io_width;
+
+ /* minimal io size for this device */
+ u32 sector_size;
+
+ /* type and info about this device */
+ u64 type;
+
+ /* physical drive uuid (or lvm uuid) */
+ u8 uuid[BTRFS_UUID_SIZE];
+
+ struct btrfs_work work;
+};
+
+struct btrfs_fs_devices {
+ u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+
+ /* the device with this id has the most recent copy of the super */
+ u64 latest_devid;
+ u64 latest_trans;
+ u64 num_devices;
+ u64 open_devices;
+ u64 rw_devices;
+ u64 total_rw_bytes;
+ struct block_device *latest_bdev;
+
+ /* all of the devices in the FS, protected by a mutex
+ * so we can safely walk it to write out the supers without
+ * worrying about add/remove by the multi-device code
+ */
+ struct mutex device_list_mutex;
+ struct list_head devices;
+
+ /* devices not currently being allocated */
+ struct list_head alloc_list;
+ struct list_head list;
+
+ struct btrfs_fs_devices *seed;
+ int seeding;
+
+ int opened;
+
+ /* set when we find or add a device that doesn't have the
+ * nonrot flag set
+ */
+ int rotating;
+};
+
+struct btrfs_bio_stripe {
+ struct btrfs_device *dev;
+ u64 physical;
+};
+
+struct btrfs_multi_bio {
+ atomic_t stripes_pending;
+ bio_end_io_t *end_io;
+ struct bio *orig_bio;
+ void *private;
+ atomic_t error;
+ int max_errors;
+ int num_stripes;
+ struct btrfs_bio_stripe stripes[];
+};
+
+#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
+ (sizeof(struct btrfs_bio_stripe) * (n)))
+
+int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset, u64 start, u64 num_bytes);
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+ u64 logical, u64 *length,
+ struct btrfs_multi_bio **multi_ret, int mirror_num);
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+ u64 chunk_start, u64 physical, u64 devid,
+ u64 **logical, int *naddrs, int *stripe_len);
+int btrfs_read_sys_array(struct btrfs_root *root);
+int btrfs_read_chunk_tree(struct btrfs_root *root);
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, u64 type);
+void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
+void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+ int mirror_num, int async_submit);
+int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+ fmode_t flags, void *holder);
+int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
+ struct btrfs_fs_devices **fs_devices_ret);
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
+int btrfs_add_device(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_device *device);
+int btrfs_rm_device(struct btrfs_root *root, char *device_path);
+int btrfs_cleanup_fs_uuids(void);
+int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
+int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
+ u64 logical, struct page *page);
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 new_size);
+struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+ u8 *uuid, u8 *fsid);
+int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
+int btrfs_init_new_device(struct btrfs_root *root, char *path);
+int btrfs_balance(struct btrfs_root *dev_root);
+void btrfs_unlock_volumes(void);
+void btrfs_lock_volumes(void);
+int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *max_avail);
+#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
new file mode 100644
index 00000000000..193b58f7d3f
--- /dev/null
+++ b/fs/btrfs/xattr.c
@@ -0,0 +1,392 @@
+/*
+ * Copyright (C) 2007 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "xattr.h"
+#include "disk-io.h"
+
+
+ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ struct btrfs_dir_item *di;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int ret = 0;
+ unsigned long data_ptr;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* lookup the xattr by name */
+ di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
+ strlen(name), 0);
+ if (!di) {
+ ret = -ENODATA;
+ goto out;
+ } else if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ /* if size is 0, that means we want the size of the attr */
+ if (!size) {
+ ret = btrfs_dir_data_len(leaf, di);
+ goto out;
+ }
+
+ /* now get the data out of our dir_item */
+ if (btrfs_dir_data_len(leaf, di) > size) {
+ ret = -ERANGE;
+ goto out;
+ }
+
+ /*
+ * The way things are packed into the leaf is like this
+ * |struct btrfs_dir_item|name|data|
+ * where name is the xattr name, so security.foo, and data is the
+ * content of the xattr. data_ptr points to the location in memory
+ * where the data starts in the in memory leaf
+ */
+ data_ptr = (unsigned long)((char *)(di + 1) +
+ btrfs_dir_name_len(leaf, di));
+ read_extent_buffer(leaf, buffer, data_ptr,
+ btrfs_dir_data_len(leaf, di));
+ ret = btrfs_dir_data_len(leaf, di);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int do_setxattr(struct btrfs_trans_handle *trans,
+ struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct btrfs_dir_item *di;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_path *path;
+ size_t name_len = strlen(name);
+ int ret = 0;
+
+ if (name_len + size > BTRFS_MAX_XATTR_SIZE(root))
+ return -ENOSPC;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* first lets see if we already have this xattr */
+ di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
+ strlen(name), -1);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+
+ /* ok we already have this xattr, lets remove it */
+ if (di) {
+ /* if we want create only exit */
+ if (flags & XATTR_CREATE) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ BUG_ON(ret);
+ btrfs_release_path(root, path);
+
+ /* if we don't have a value then we are removing the xattr */
+ if (!value)
+ goto out;
+ } else {
+ btrfs_release_path(root, path);
+
+ if (flags & XATTR_REPLACE) {
+ /* we couldn't find the attr to replace */
+ ret = -ENODATA;
+ goto out;
+ }
+ }
+
+ /* ok we have to create a completely new xattr */
+ ret = btrfs_insert_xattr_item(trans, root, path, inode->i_ino,
+ name, name_len, value, size);
+ BUG_ON(ret);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int __btrfs_setxattr(struct btrfs_trans_handle *trans,
+ struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ if (trans)
+ return do_setxattr(trans, inode, name, value, size, flags);
+
+ ret = btrfs_reserve_metadata_space(root, 2);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ btrfs_set_trans_block_group(trans, inode);
+
+ ret = do_setxattr(trans, inode, name, value, size, flags);
+ if (ret)
+ goto out;
+
+ inode->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+out:
+ btrfs_end_transaction_throttle(trans, root);
+ btrfs_unreserve_metadata_space(root, 2);
+ return ret;
+}
+
+ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ struct btrfs_key key, found_key;
+ struct inode *inode = dentry->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_path *path;
+ struct btrfs_item *item;
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ int ret = 0, slot, advance;
+ size_t total_size = 0, size_left = size;
+ unsigned long name_ptr;
+ size_t name_len;
+ u32 nritems;
+
+ /*
+ * ok we want all objects associated with this id.
+ * NOTE: we set key.offset = 0; because we want to start with the
+ * first xattr that we find and walk forward
+ */
+ key.objectid = inode->i_ino;
+ btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = 2;
+
+ /* search for our xattrs */
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto err;
+ advance = 0;
+ while (1) {
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ slot = path->slots[0];
+
+ /* this is where we start walking through the path */
+ if (advance || slot >= nritems) {
+ /*
+ * if we've reached the last slot in this leaf we need
+ * to go to the next leaf and reset everything
+ */
+ if (slot >= nritems-1) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ break;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ slot = path->slots[0];
+ } else {
+ /*
+ * just walking through the slots on this leaf
+ */
+ slot++;
+ path->slots[0]++;
+ }
+ }
+ advance = 1;
+
+ item = btrfs_item_nr(leaf, slot);
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ /* check to make sure this item is what we want */
+ if (found_key.objectid != key.objectid)
+ break;
+ if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
+ break;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+
+ name_len = btrfs_dir_name_len(leaf, di);
+ total_size += name_len + 1;
+
+ /* we are just looking for how big our buffer needs to be */
+ if (!size)
+ continue;
+
+ if (!buffer || (name_len + 1) > size_left) {
+ ret = -ERANGE;
+ goto err;
+ }
+
+ name_ptr = (unsigned long)(di + 1);
+ read_extent_buffer(leaf, buffer, name_ptr, name_len);
+ buffer[name_len] = '\0';
+
+ size_left -= name_len + 1;
+ buffer += name_len + 1;
+ }
+ ret = total_size;
+
+err:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+/*
+ * List of handlers for synthetic system.* attributes. All real ondisk
+ * attributes are handled directly.
+ */
+struct xattr_handler *btrfs_xattr_handlers[] = {
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ &btrfs_xattr_acl_access_handler,
+ &btrfs_xattr_acl_default_handler,
+#endif
+ NULL,
+};
+
+/*
+ * Check if the attribute is in a supported namespace.
+ *
+ * This applied after the check for the synthetic attributes in the system
+ * namespace.
+ */
+static bool btrfs_is_valid_xattr(const char *name)
+{
+ return !strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN) ||
+ !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
+ !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
+ !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+}
+
+ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
+{
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_getxattr(dentry, name, buffer, size);
+
+ if (!btrfs_is_valid_xattr(name))
+ return -EOPNOTSUPP;
+ return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
+}
+
+int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
+{
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_setxattr(dentry, name, value, size, flags);
+
+ if (!btrfs_is_valid_xattr(name))
+ return -EOPNOTSUPP;
+
+ if (size == 0)
+ value = ""; /* empty EA, do not remove */
+
+ return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size,
+ flags);
+}
+
+int btrfs_removexattr(struct dentry *dentry, const char *name)
+{
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_removexattr(dentry, name);
+
+ if (!btrfs_is_valid_xattr(name))
+ return -EOPNOTSUPP;
+
+ return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0,
+ XATTR_REPLACE);
+}
+
+int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir)
+{
+ int err;
+ size_t len;
+ void *value;
+ char *suffix;
+ char *name;
+
+ err = security_inode_init_security(inode, dir, &suffix, &value, &len);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
+ GFP_NOFS);
+ if (!name) {
+ err = -ENOMEM;
+ } else {
+ strcpy(name, XATTR_SECURITY_PREFIX);
+ strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
+ err = __btrfs_setxattr(trans, inode, name, value, len, 0);
+ kfree(name);
+ }
+
+ kfree(suffix);
+ kfree(value);
+ return err;
+}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
new file mode 100644
index 00000000000..721efa0346e
--- /dev/null
+++ b/fs/btrfs/xattr.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2007 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __XATTR__
+#define __XATTR__
+
+#include <linux/xattr.h>
+
+extern struct xattr_handler btrfs_xattr_acl_access_handler;
+extern struct xattr_handler btrfs_xattr_acl_default_handler;
+extern struct xattr_handler *btrfs_xattr_handlers[];
+
+extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
+ void *buffer, size_t size);
+extern int __btrfs_setxattr(struct btrfs_trans_handle *trans,
+ struct inode *inode, const char *name,
+ const void *value, size_t size, int flags);
+extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
+ void *buffer, size_t size);
+extern int btrfs_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags);
+extern int btrfs_removexattr(struct dentry *dentry, const char *name);
+
+extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir);
+
+#endif /* __XATTR__ */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 00000000000..3e2b90eaa23
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,632 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on jffs2 zlib code:
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+#include <linux/zutil.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include "compression.h"
+
+/* Plan: call deflate() with avail_in == *sourcelen,
+ avail_out = *dstlen - 12 and flush == Z_FINISH.
+ If it doesn't manage to finish, call it again with
+ avail_in == 0 and avail_out set to the remaining 12
+ bytes for it to clean up.
+ Q: Is 12 bytes sufficient?
+*/
+#define STREAM_END_SPACE 12
+
+struct workspace {
+ z_stream inf_strm;
+ z_stream def_strm;
+ char *buf;
+ struct list_head list;
+};
+
+static LIST_HEAD(idle_workspace);
+static DEFINE_SPINLOCK(workspace_lock);
+static unsigned long num_workspace;
+static atomic_t alloc_workspace = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
+
+/*
+ * this finds an available zlib workspace or allocates a new one
+ * NULL or an ERR_PTR is returned if things go bad.
+ */
+static struct workspace *find_zlib_workspace(void)
+{
+ struct workspace *workspace;
+ int ret;
+ int cpus = num_online_cpus();
+
+again:
+ spin_lock(&workspace_lock);
+ if (!list_empty(&idle_workspace)) {
+ workspace = list_entry(idle_workspace.next, struct workspace,
+ list);
+ list_del(&workspace->list);
+ num_workspace--;
+ spin_unlock(&workspace_lock);
+ return workspace;
+
+ }
+ spin_unlock(&workspace_lock);
+ if (atomic_read(&alloc_workspace) > cpus) {
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&alloc_workspace) > cpus)
+ schedule();
+ finish_wait(&workspace_wait, &wait);
+ goto again;
+ }
+ atomic_inc(&alloc_workspace);
+ workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+ if (!workspace) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
+ if (!workspace->def_strm.workspace) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
+ if (!workspace->inf_strm.workspace) {
+ ret = -ENOMEM;
+ goto fail_inflate;
+ }
+ workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ if (!workspace->buf) {
+ ret = -ENOMEM;
+ goto fail_kmalloc;
+ }
+ return workspace;
+
+fail_kmalloc:
+ vfree(workspace->inf_strm.workspace);
+fail_inflate:
+ vfree(workspace->def_strm.workspace);
+fail:
+ kfree(workspace);
+ atomic_dec(&alloc_workspace);
+ wake_up(&workspace_wait);
+ return ERR_PTR(ret);
+}
+
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around
+ */
+static int free_workspace(struct workspace *workspace)
+{
+ spin_lock(&workspace_lock);
+ if (num_workspace < num_online_cpus()) {
+ list_add_tail(&workspace->list, &idle_workspace);
+ num_workspace++;
+ spin_unlock(&workspace_lock);
+ if (waitqueue_active(&workspace_wait))
+ wake_up(&workspace_wait);
+ return 0;
+ }
+ spin_unlock(&workspace_lock);
+ vfree(workspace->def_strm.workspace);
+ vfree(workspace->inf_strm.workspace);
+ kfree(workspace->buf);
+ kfree(workspace);
+
+ atomic_dec(&alloc_workspace);
+ if (waitqueue_active(&workspace_wait))
+ wake_up(&workspace_wait);
+ return 0;
+}
+
+/*
+ * cleanup function for module exit
+ */
+static void free_workspaces(void)
+{
+ struct workspace *workspace;
+ while (!list_empty(&idle_workspace)) {
+ workspace = list_entry(idle_workspace.next, struct workspace,
+ list);
+ list_del(&workspace->list);
+ vfree(workspace->def_strm.workspace);
+ vfree(workspace->inf_strm.workspace);
+ kfree(workspace->buf);
+ kfree(workspace);
+ atomic_dec(&alloc_workspace);
+ }
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated. There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read. It
+ * may be smaller then len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ */
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+ u64 start, unsigned long len,
+ struct page **pages,
+ unsigned long nr_dest_pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out,
+ unsigned long max_out)
+{
+ int ret;
+ struct workspace *workspace;
+ char *data_in;
+ char *cpage_out;
+ int nr_pages = 0;
+ struct page *in_page = NULL;
+ struct page *out_page = NULL;
+ int out_written = 0;
+ int in_read = 0;
+ unsigned long bytes_left;
+
+ *out_pages = 0;
+ *total_out = 0;
+ *total_in = 0;
+
+ workspace = find_zlib_workspace();
+ if (IS_ERR(workspace))
+ return -1;
+
+ if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
+ printk(KERN_WARNING "deflateInit failed\n");
+ ret = -1;
+ goto out;
+ }
+
+ workspace->def_strm.total_in = 0;
+ workspace->def_strm.total_out = 0;
+
+ in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ data_in = kmap(in_page);
+
+ out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ cpage_out = kmap(out_page);
+ pages[0] = out_page;
+ nr_pages = 1;
+
+ workspace->def_strm.next_in = data_in;
+ workspace->def_strm.next_out = cpage_out;
+ workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
+
+ out_written = 0;
+ in_read = 0;
+
+ while (workspace->def_strm.total_in < len) {
+ ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
+ if (ret != Z_OK) {
+ printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+ ret);
+ zlib_deflateEnd(&workspace->def_strm);
+ ret = -1;
+ goto out;
+ }
+
+ /* we're making it bigger, give up */
+ if (workspace->def_strm.total_in > 8192 &&
+ workspace->def_strm.total_in <
+ workspace->def_strm.total_out) {
+ ret = -1;
+ goto out;
+ }
+ /* we need another page for writing out. Test this
+ * before the total_in so we will pull in a new page for
+ * the stream end if required
+ */
+ if (workspace->def_strm.avail_out == 0) {
+ kunmap(out_page);
+ if (nr_pages == nr_dest_pages) {
+ out_page = NULL;
+ ret = -1;
+ goto out;
+ }
+ out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ cpage_out = kmap(out_page);
+ pages[nr_pages] = out_page;
+ nr_pages++;
+ workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->def_strm.next_out = cpage_out;
+ }
+ /* we're all done */
+ if (workspace->def_strm.total_in >= len)
+ break;
+
+ /* we've read in a full page, get a new one */
+ if (workspace->def_strm.avail_in == 0) {
+ if (workspace->def_strm.total_out > max_out)
+ break;
+
+ bytes_left = len - workspace->def_strm.total_in;
+ kunmap(in_page);
+ page_cache_release(in_page);
+
+ start += PAGE_CACHE_SIZE;
+ in_page = find_get_page(mapping,
+ start >> PAGE_CACHE_SHIFT);
+ data_in = kmap(in_page);
+ workspace->def_strm.avail_in = min(bytes_left,
+ PAGE_CACHE_SIZE);
+ workspace->def_strm.next_in = data_in;
+ }
+ }
+ workspace->def_strm.avail_in = 0;
+ ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
+ zlib_deflateEnd(&workspace->def_strm);
+
+ if (ret != Z_STREAM_END) {
+ ret = -1;
+ goto out;
+ }
+
+ if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = 0;
+ *total_out = workspace->def_strm.total_out;
+ *total_in = workspace->def_strm.total_in;
+out:
+ *out_pages = nr_pages;
+ if (out_page)
+ kunmap(out_page);
+
+ if (in_page) {
+ kunmap(in_page);
+ page_cache_release(in_page);
+ }
+ free_workspace(workspace);
+ return ret;
+}
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous. They all correspond to the range of bytes covered by
+ * the compressed extent.
+ */
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+ u64 disk_start,
+ struct bio_vec *bvec,
+ int vcnt,
+ size_t srclen)
+{
+ int ret = 0;
+ int wbits = MAX_WBITS;
+ struct workspace *workspace;
+ char *data_in;
+ size_t total_out = 0;
+ unsigned long page_bytes_left;
+ unsigned long page_in_index = 0;
+ unsigned long page_out_index = 0;
+ struct page *page_out;
+ unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
+ PAGE_CACHE_SIZE;
+ unsigned long buf_start;
+ unsigned long buf_offset;
+ unsigned long bytes;
+ unsigned long working_bytes;
+ unsigned long pg_offset;
+ unsigned long start_byte;
+ unsigned long current_buf_start;
+ char *kaddr;
+
+ workspace = find_zlib_workspace();
+ if (IS_ERR(workspace))
+ return -ENOMEM;
+
+ data_in = kmap(pages_in[page_in_index]);
+ workspace->inf_strm.next_in = data_in;
+ workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
+ workspace->inf_strm.total_in = 0;
+
+ workspace->inf_strm.total_out = 0;
+ workspace->inf_strm.next_out = workspace->buf;
+ workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ page_out = bvec[page_out_index].bv_page;
+ page_bytes_left = PAGE_CACHE_SIZE;
+ pg_offset = 0;
+
+ /* If it's deflate, and it's got no preset dictionary, then
+ we can tell zlib to skip the adler32 check. */
+ if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+ ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+ !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+ wbits = -((data_in[0] >> 4) + 8);
+ workspace->inf_strm.next_in += 2;
+ workspace->inf_strm.avail_in -= 2;
+ }
+
+ if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+ printk(KERN_WARNING "inflateInit failed\n");
+ ret = -1;
+ goto out;
+ }
+ while (workspace->inf_strm.total_in < srclen) {
+ ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+ if (ret != Z_OK && ret != Z_STREAM_END)
+ break;
+ /*
+ * buf start is the byte offset we're of the start of
+ * our workspace buffer
+ */
+ buf_start = total_out;
+
+ /* total_out is the last byte of the workspace buffer */
+ total_out = workspace->inf_strm.total_out;
+
+ working_bytes = total_out - buf_start;
+
+ /*
+ * start byte is the first byte of the page we're currently
+ * copying into relative to the start of the compressed data.
+ */
+ start_byte = page_offset(page_out) - disk_start;
+
+ if (working_bytes == 0) {
+ /* we didn't make progress in this inflate
+ * call, we're done
+ */
+ if (ret != Z_STREAM_END)
+ ret = -1;
+ break;
+ }
+
+ /* we haven't yet hit data corresponding to this page */
+ if (total_out <= start_byte)
+ goto next;
+
+ /*
+ * the start of the data we care about is offset into
+ * the middle of our working buffer
+ */
+ if (total_out > start_byte && buf_start < start_byte) {
+ buf_offset = start_byte - buf_start;
+ working_bytes -= buf_offset;
+ } else {
+ buf_offset = 0;
+ }
+ current_buf_start = buf_start;
+
+ /* copy bytes from the working buffer into the pages */
+ while (working_bytes > 0) {
+ bytes = min(PAGE_CACHE_SIZE - pg_offset,
+ PAGE_CACHE_SIZE - buf_offset);
+ bytes = min(bytes, working_bytes);
+ kaddr = kmap_atomic(page_out, KM_USER0);
+ memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
+ bytes);
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(page_out);
+
+ pg_offset += bytes;
+ page_bytes_left -= bytes;
+ buf_offset += bytes;
+ working_bytes -= bytes;
+ current_buf_start += bytes;
+
+ /* check if we need to pick another page */
+ if (page_bytes_left == 0) {
+ page_out_index++;
+ if (page_out_index >= vcnt) {
+ ret = 0;
+ goto done;
+ }
+
+ page_out = bvec[page_out_index].bv_page;
+ pg_offset = 0;
+ page_bytes_left = PAGE_CACHE_SIZE;
+ start_byte = page_offset(page_out) - disk_start;
+
+ /*
+ * make sure our new page is covered by this
+ * working buffer
+ */
+ if (total_out <= start_byte)
+ goto next;
+
+ /* the next page in the biovec might not
+ * be adjacent to the last page, but it
+ * might still be found inside this working
+ * buffer. bump our offset pointer
+ */
+ if (total_out > start_byte &&
+ current_buf_start < start_byte) {
+ buf_offset = start_byte - buf_start;
+ working_bytes = total_out - start_byte;
+ current_buf_start = buf_start +
+ buf_offset;
+ }
+ }
+ }
+next:
+ workspace->inf_strm.next_out = workspace->buf;
+ workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+
+ if (workspace->inf_strm.avail_in == 0) {
+ unsigned long tmp;
+ kunmap(pages_in[page_in_index]);
+ page_in_index++;
+ if (page_in_index >= total_pages_in) {
+ data_in = NULL;
+ break;
+ }
+ data_in = kmap(pages_in[page_in_index]);
+ workspace->inf_strm.next_in = data_in;
+ tmp = srclen - workspace->inf_strm.total_in;
+ workspace->inf_strm.avail_in = min(tmp,
+ PAGE_CACHE_SIZE);
+ }
+ }
+ if (ret != Z_STREAM_END)
+ ret = -1;
+ else
+ ret = 0;
+done:
+ zlib_inflateEnd(&workspace->inf_strm);
+ if (data_in)
+ kunmap(pages_in[page_in_index]);
+out:
+ free_workspace(workspace);
+ return ret;
+}
+
+/*
+ * a less complex decompression routine. Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the compressed data we're interested in
+ */
+int btrfs_zlib_decompress(unsigned char *data_in,
+ struct page *dest_page,
+ unsigned long start_byte,
+ size_t srclen, size_t destlen)
+{
+ int ret = 0;
+ int wbits = MAX_WBITS;
+ struct workspace *workspace;
+ unsigned long bytes_left = destlen;
+ unsigned long total_out = 0;
+ char *kaddr;
+
+ if (destlen > PAGE_CACHE_SIZE)
+ return -ENOMEM;
+
+ workspace = find_zlib_workspace();
+ if (IS_ERR(workspace))
+ return -ENOMEM;
+
+ workspace->inf_strm.next_in = data_in;
+ workspace->inf_strm.avail_in = srclen;
+ workspace->inf_strm.total_in = 0;
+
+ workspace->inf_strm.next_out = workspace->buf;
+ workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->inf_strm.total_out = 0;
+ /* If it's deflate, and it's got no preset dictionary, then
+ we can tell zlib to skip the adler32 check. */
+ if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+ ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+ !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+ wbits = -((data_in[0] >> 4) + 8);
+ workspace->inf_strm.next_in += 2;
+ workspace->inf_strm.avail_in -= 2;
+ }
+
+ if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+ printk(KERN_WARNING "inflateInit failed\n");
+ ret = -1;
+ goto out;
+ }
+
+ while (bytes_left > 0) {
+ unsigned long buf_start;
+ unsigned long buf_offset;
+ unsigned long bytes;
+ unsigned long pg_offset = 0;
+
+ ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+ if (ret != Z_OK && ret != Z_STREAM_END)
+ break;
+
+ buf_start = total_out;
+ total_out = workspace->inf_strm.total_out;
+
+ if (total_out == buf_start) {
+ ret = -1;
+ break;
+ }
+
+ if (total_out <= start_byte)
+ goto next;
+
+ if (total_out > start_byte && buf_start < start_byte)
+ buf_offset = start_byte - buf_start;
+ else
+ buf_offset = 0;
+
+ bytes = min(PAGE_CACHE_SIZE - pg_offset,
+ PAGE_CACHE_SIZE - buf_offset);
+ bytes = min(bytes, bytes_left);
+
+ kaddr = kmap_atomic(dest_page, KM_USER0);
+ memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ pg_offset += bytes;
+ bytes_left -= bytes;
+next:
+ workspace->inf_strm.next_out = workspace->buf;
+ workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ }
+
+ if (ret != Z_STREAM_END && bytes_left != 0)
+ ret = -1;
+ else
+ ret = 0;
+
+ zlib_inflateEnd(&workspace->inf_strm);
+out:
+ free_workspace(workspace);
+ return ret;
+}
+
+void btrfs_zlib_exit(void)
+{
+ free_workspaces();
+}