Diffstat (limited to 'drivers/xen')
-rw-r--r--   drivers/xen/Kconfig                   |  63
-rw-r--r--   drivers/xen/Makefile                  |  12
-rw-r--r--   drivers/xen/balloon.c                 | 580
-rw-r--r--   drivers/xen/cpu_hotplug.c             | 111
-rw-r--r--   drivers/xen/events.c                  | 946
-rw-r--r--   drivers/xen/evtchn.c                  | 508
-rw-r--r--   drivers/xen/features.c                |  33
-rw-r--r--   drivers/xen/grant-table.c             | 559
-rw-r--r--   drivers/xen/manage.c                  | 273
-rw-r--r--   drivers/xen/sys-hypervisor.c          | 446
-rw-r--r--   drivers/xen/xenbus/Makefile           |   7
-rw-r--r--   drivers/xen/xenbus/xenbus_client.c    | 568
-rw-r--r--   drivers/xen/xenbus/xenbus_comms.c     | 234
-rw-r--r--   drivers/xen/xenbus/xenbus_comms.h     |  46
-rw-r--r--   drivers/xen/xenbus/xenbus_probe.c     | 958
-rw-r--r--   drivers/xen/xenbus/xenbus_probe.h     |  76
-rw-r--r--   drivers/xen/xenbus/xenbus_xs.c        | 864
-rw-r--r--   drivers/xen/xencomm.c                 | 217
-rw-r--r--   drivers/xen/xenfs/Makefile            |   3
-rw-r--r--   drivers/xen/xenfs/super.c             |  83
-rw-r--r--   drivers/xen/xenfs/xenbus.c            | 593
-rw-r--r--   drivers/xen/xenfs/xenfs.h             |   6
22 files changed, 7186 insertions, 0 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
new file mode 100644
index 00000000000..cab100acf98
--- /dev/null
+++ b/drivers/xen/Kconfig
@@ -0,0 +1,63 @@
+config XEN_BALLOON
+ bool "Xen memory balloon driver"
+ depends on XEN
+ default y
+ help
+ The balloon driver allows the Xen domain to request more memory from
+ the system to expand the domain's memory allocation, or alternatively
+ return unneeded memory to the system.
+
+config XEN_SCRUB_PAGES
+ bool "Scrub pages before returning them to system"
+ depends on XEN_BALLOON
+ default y
+ help
+ Scrub pages before returning them to the system for reuse by
+ other domains. This makes sure that any confidential data
+ is not accidentally visible to other domains. It is more
+ secure, but slightly less efficient.
+ If in doubt, say yes.
+
+config XEN_DEV_EVTCHN
+ tristate "Xen /dev/xen/evtchn device"
+ depends on XEN
+ default y
+ help
+ The evtchn driver allows a userspace process to trigger event
+ channels and to receive notification of an event channel
+ firing.
+ If in doubt, say yes.
+
+config XENFS
+ tristate "Xen filesystem"
+ depends on XEN
+ default y
+ help
+ The xen filesystem provides a way for domains to share
+ information with each other and with the hypervisor.
+ For example, by reading and writing the "xenbus" file, guests
+ may pass arbitrary information to the initial domain.
+ If in doubt, say yes.
+
+config XEN_COMPAT_XENFS
+ bool "Create compatibility mount point /proc/xen"
+ depends on XENFS
+ default y
+ help
+ The old xenstore userspace tools expect to find "xenbus"
+ under /proc/xen, but "xenbus" is now found at the root of the
+ xenfs filesystem. Selecting this causes the kernel to create
+ the compatibility mount point /proc/xen if it is running on
+ a xen platform.
+ If in doubt, say yes.
+
+config XEN_SYS_HYPERVISOR
+ bool "Create xen entries under /sys/hypervisor"
+ depends on XEN && SYSFS
+ select SYS_HYPERVISOR
+ default y
+ help
+ Create entries under /sys/hypervisor describing the Xen
+ hypervisor environment. When running native or in another
+ virtual environment, /sys/hypervisor will still be present,
+ but will have no xen contents.
\ No newline at end of file
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
new file mode 100644
index 00000000000..7c284342f30
--- /dev/null
+++ b/drivers/xen/Makefile
@@ -0,0 +1,12 @@
+obj-y += grant-table.o features.o events.o manage.o
+obj-y += xenbus/
+
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_features.o := $(nostackp)
+
+obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
+obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
+obj-$(CONFIG_XEN_BALLOON) += balloon.o
+obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o
+obj-$(CONFIG_XENFS) += xenfs/
+obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
\ No newline at end of file
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
new file mode 100644
index 00000000000..f6738d8b02b
--- /dev/null
+++ b/drivers/xen/balloon.c
@@ -0,0 +1,580 @@
+/******************************************************************************
+ * balloon.c
+ *
+ * Xen balloon driver - enables returning/claiming memory to/from Xen.
+ *
+ * Copyright (c) 2003, B Dragovic
+ * Copyright (c) 2003-2004, M Williamson, K Fraser
+ * Copyright (c) 2005 Dan M. Smith, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/sysdev.h>
+
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/tlb.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/memory.h>
+#include <xen/xenbus.h>
+#include <xen/features.h>
+#include <xen/page.h>
+
+#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
+
+#define BALLOON_CLASS_NAME "xen_memory"
+
+struct balloon_stats {
+ /* We aim for 'current allocation' == 'target allocation'. */
+ unsigned long current_pages;
+ unsigned long target_pages;
+ /*
+ * Drivers may alter the memory reservation independently, but they
+ * must inform the balloon driver so we avoid hitting the hard limit.
+ */
+ unsigned long driver_pages;
+ /* Number of pages in high- and low-memory balloons. */
+ unsigned long balloon_low;
+ unsigned long balloon_high;
+};
+
+static DEFINE_MUTEX(balloon_mutex);
+
+static struct sys_device balloon_sysdev;
+
+static int register_balloon(struct sys_device *sysdev);
+
+/*
+ * Protects atomic reservation decrease/increase against concurrent increases.
+ * Also protects non-atomic updates of current_pages and driver_pages, and
+ * balloon lists.
+ */
+static DEFINE_SPINLOCK(balloon_lock);
+
+static struct balloon_stats balloon_stats;
+
+/* We increase/decrease in batches which fit in a page */
+static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
+
+#ifdef CONFIG_HIGHMEM
+#define inc_totalhigh_pages() (totalhigh_pages++)
+#define dec_totalhigh_pages() (totalhigh_pages--)
+#else
+#define inc_totalhigh_pages() do {} while(0)
+#define dec_totalhigh_pages() do {} while(0)
+#endif
+
+/* List of ballooned pages, threaded through the mem_map array. */
+static LIST_HEAD(ballooned_pages);
+
+/* Main work function, always executed in process context. */
+static void balloon_process(struct work_struct *work);
+static DECLARE_WORK(balloon_worker, balloon_process);
+static struct timer_list balloon_timer;
+
+/* When ballooning out (allocating memory to return to Xen) we don't really
+ want the kernel to try too hard since that can trigger the oom killer. */
+#define GFP_BALLOON \
+ (GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC)
+
+static void scrub_page(struct page *page)
+{
+#ifdef CONFIG_XEN_SCRUB_PAGES
+ clear_highpage(page);
+#endif
+}
+
+/* balloon_append: add the given page to the balloon. */
+static void balloon_append(struct page *page)
+{
+ /* Lowmem is re-populated first, so highmem pages go at list tail. */
+ if (PageHighMem(page)) {
+ list_add_tail(&page->lru, &ballooned_pages);
+ balloon_stats.balloon_high++;
+ dec_totalhigh_pages();
+ } else {
+ list_add(&page->lru, &ballooned_pages);
+ balloon_stats.balloon_low++;
+ }
+
+ totalram_pages--;
+}
+
+/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
+static struct page *balloon_retrieve(void)
+{
+ struct page *page;
+
+ if (list_empty(&ballooned_pages))
+ return NULL;
+
+ page = list_entry(ballooned_pages.next, struct page, lru);
+ list_del(&page->lru);
+
+ if (PageHighMem(page)) {
+ balloon_stats.balloon_high--;
+ inc_totalhigh_pages();
+ }
+ else
+ balloon_stats.balloon_low--;
+
+ totalram_pages++;
+
+ return page;
+}
+
+static struct page *balloon_first_page(void)
+{
+ if (list_empty(&ballooned_pages))
+ return NULL;
+ return list_entry(ballooned_pages.next, struct page, lru);
+}
+
+static struct page *balloon_next_page(struct page *page)
+{
+ struct list_head *next = page->lru.next;
+ if (next == &ballooned_pages)
+ return NULL;
+ return list_entry(next, struct page, lru);
+}
+
+static void balloon_alarm(unsigned long unused)
+{
+ schedule_work(&balloon_worker);
+}
+
+static unsigned long current_target(void)
+{
+ unsigned long target = balloon_stats.target_pages;
+
+ target = min(target,
+ balloon_stats.current_pages +
+ balloon_stats.balloon_low +
+ balloon_stats.balloon_high);
+
+ return target;
+}
+
+static int increase_reservation(unsigned long nr_pages)
+{
+ unsigned long pfn, i, flags;
+ struct page *page;
+ long rc;
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+
+ if (nr_pages > ARRAY_SIZE(frame_list))
+ nr_pages = ARRAY_SIZE(frame_list);
+
+ spin_lock_irqsave(&balloon_lock, flags);
+
+ page = balloon_first_page();
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(page == NULL);
+ frame_list[i] = page_to_pfn(page);
+ page = balloon_next_page(page);
+ }
+
+ set_xen_guest_handle(reservation.extent_start, frame_list);
+ reservation.nr_extents = nr_pages;
+ rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+ if (rc < 0)
+ goto out;
+
+ for (i = 0; i < rc; i++) {
+ page = balloon_retrieve();
+ BUG_ON(page == NULL);
+
+ pfn = page_to_pfn(page);
+ BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
+ phys_to_machine_mapping_valid(pfn));
+
+ set_phys_to_machine(pfn, frame_list[i]);
+
+ /* Link back into the page tables if not highmem. */
+ if (pfn < max_low_pfn) {
+ int ret;
+ ret = HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ mfn_pte(frame_list[i], PAGE_KERNEL),
+ 0);
+ BUG_ON(ret);
+ }
+
+ /* Relinquish the page back to the allocator. */
+ ClearPageReserved(page);
+ init_page_count(page);
+ __free_page(page);
+ }
+
+ balloon_stats.current_pages += rc;
+
+ out:
+ spin_unlock_irqrestore(&balloon_lock, flags);
+
+ return rc < 0 ? rc : rc != nr_pages;
+}
+
+static int decrease_reservation(unsigned long nr_pages)
+{
+ unsigned long pfn, i, flags;
+ struct page *page;
+ int need_sleep = 0;
+ int ret;
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+
+ if (nr_pages > ARRAY_SIZE(frame_list))
+ nr_pages = ARRAY_SIZE(frame_list);
+
+ for (i = 0; i < nr_pages; i++) {
+ if ((page = alloc_page(GFP_BALLOON)) == NULL) {
+ nr_pages = i;
+ need_sleep = 1;
+ break;
+ }
+
+ pfn = page_to_pfn(page);
+ frame_list[i] = pfn_to_mfn(pfn);
+
+ scrub_page(page);
+
+ if (!PageHighMem(page)) {
+ ret = HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ __pte_ma(0), 0);
+ BUG_ON(ret);
+ }
+
+ }
+
+ /* Ensure that ballooned highmem pages don't have kmaps. */
+ kmap_flush_unused();
+ flush_tlb_all();
+
+ spin_lock_irqsave(&balloon_lock, flags);
+
+ /* No more mappings: invalidate P2M and add to balloon. */
+ for (i = 0; i < nr_pages; i++) {
+ pfn = mfn_to_pfn(frame_list[i]);
+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ balloon_append(pfn_to_page(pfn));
+ }
+
+ set_xen_guest_handle(reservation.extent_start, frame_list);
+ reservation.nr_extents = nr_pages;
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+ BUG_ON(ret != nr_pages);
+
+ balloon_stats.current_pages -= nr_pages;
+
+ spin_unlock_irqrestore(&balloon_lock, flags);
+
+ return need_sleep;
+}
+
+/*
+ * We avoid multiple worker processes conflicting via the balloon mutex.
+ * We may of course race updates of the target counts (which are protected
+ * by the balloon lock), or with changes to the Xen hard limit, but we will
+ * recover from these in time.
+ */
+static void balloon_process(struct work_struct *work)
+{
+ int need_sleep = 0;
+ long credit;
+
+ mutex_lock(&balloon_mutex);
+
+ do {
+ credit = current_target() - balloon_stats.current_pages;
+ if (credit > 0)
+ need_sleep = (increase_reservation(credit) != 0);
+ if (credit < 0)
+ need_sleep = (decrease_reservation(-credit) != 0);
+
+#ifndef CONFIG_PREEMPT
+ if (need_resched())
+ schedule();
+#endif
+ } while ((credit != 0) && !need_sleep);
+
+ /* Schedule more work if there is some still to be done. */
+ if (current_target() != balloon_stats.current_pages)
+ mod_timer(&balloon_timer, jiffies + HZ);
+
+ mutex_unlock(&balloon_mutex);
+}
+
+/* Resets the Xen limit, sets new target, and kicks off processing. */
+static void balloon_set_new_target(unsigned long target)
+{
+ /* No need for lock. Not read-modify-write updates. */
+ balloon_stats.target_pages = target;
+ schedule_work(&balloon_worker);
+}
+
+static struct xenbus_watch target_watch =
+{
+ .node = "memory/target"
+};
+
+/* React to a change in the target key */
+static void watch_target(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ unsigned long long new_target;
+ int err;
+
+ err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
+ if (err != 1) {
+ /* This is ok (for domain0 at least) - so just return */
+ return;
+ }
+
+ /* The given memory/target value is in KiB, so it needs converting to
+ * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
+ */
+ balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
+}
+
+static int balloon_init_watcher(struct notifier_block *notifier,
+ unsigned long event,
+ void *data)
+{
+ int err;
+
+ err = register_xenbus_watch(&target_watch);
+ if (err)
+ printk(KERN_ERR "Failed to set balloon watcher\n");
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block xenstore_notifier;
+
+static int __init balloon_init(void)
+{
+ unsigned long pfn;
+ struct page *page;
+
+ if (!xen_pv_domain())
+ return -ENODEV;
+
+ pr_info("xen_balloon: Initialising balloon driver.\n");
+
+ balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
+ balloon_stats.target_pages = balloon_stats.current_pages;
+ balloon_stats.balloon_low = 0;
+ balloon_stats.balloon_high = 0;
+ balloon_stats.driver_pages = 0UL;
+
+ init_timer(&balloon_timer);
+ balloon_timer.data = 0;
+ balloon_timer.function = balloon_alarm;
+
+ register_balloon(&balloon_sysdev);
+
+ /* Initialise the balloon with excess memory space. */
+ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
+ page = pfn_to_page(pfn);
+ if (!PageReserved(page))
+ balloon_append(page);
+ }
+
+ target_watch.callback = watch_target;
+ xenstore_notifier.notifier_call = balloon_init_watcher;
+
+ register_xenstore_notifier(&xenstore_notifier);
+
+ return 0;
+}
+
+subsys_initcall(balloon_init);
+
+static void balloon_exit(void)
+{
+ /* XXX - release balloon here */
+ return;
+}
+
+module_exit(balloon_exit);
+
+#define BALLOON_SHOW(name, format, args...) \
+ static ssize_t show_##name(struct sys_device *dev, \
+ struct sysdev_attribute *attr, \
+ char *buf) \
+ { \
+ return sprintf(buf, format, ##args); \
+ } \
+ static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)
+
+BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages));
+BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low));
+BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high));
+BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages));
+
+static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages));
+}
+
+static ssize_t store_target_kb(struct sys_device *dev,
+ struct sysdev_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ char *endchar;
+ unsigned long long target_bytes;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ target_bytes = simple_strtoull(buf, &endchar, 0) * 1024;
+
+ balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+
+ return count;
+}
+
+static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR,
+ show_target_kb, store_target_kb);
+
+
+static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%llu\n",
+ (unsigned long long)balloon_stats.target_pages
+ << PAGE_SHIFT);
+}
+
+static ssize_t store_target(struct sys_device *dev,
+ struct sysdev_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ char *endchar;
+ unsigned long long target_bytes;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ target_bytes = memparse(buf, &endchar);
+
+ balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+
+ return count;
+}
+
+static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR,
+ show_target, store_target);
+
+
+static struct sysdev_attribute *balloon_attrs[] = {
+ &attr_target_kb,
+ &attr_target,
+};
+
+static struct attribute *balloon_info_attrs[] = {
+ &attr_current_kb.attr,
+ &attr_low_kb.attr,
+ &attr_high_kb.attr,
+ &attr_driver_kb.attr,
+ NULL
+};
+
+static struct attribute_group balloon_info_group = {
+ .name = "info",
+ .attrs = balloon_info_attrs,
+};
+
+static struct sysdev_class balloon_sysdev_class = {
+ .name = BALLOON_CLASS_NAME,
+};
+
+static int register_balloon(struct sys_device *sysdev)
+{
+ int i, error;
+
+ error = sysdev_class_register(&balloon_sysdev_class);
+ if (error)
+ return error;
+
+ sysdev->id = 0;
+ sysdev->cls = &balloon_sysdev_class;
+
+ error = sysdev_register(sysdev);
+ if (error) {
+ sysdev_class_unregister(&balloon_sysdev_class);
+ return error;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) {
+ error = sysdev_create_file(sysdev, balloon_attrs[i]);
+ if (error)
+ goto fail;
+ }
+
+ error = sysfs_create_group(&sysdev->kobj, &balloon_info_group);
+ if (error)
+ goto fail;
+
+ return 0;
+
+ fail:
+ while (--i >= 0)
+ sysdev_remove_file(sysdev, balloon_attrs[i]);
+ sysdev_unregister(sysdev);
+ sysdev_class_unregister(&balloon_sysdev_class);
+ return error;
+}
+
+MODULE_LICENSE("GPL");
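
The new target can arrive over xenstore via the "memory/target" watch above (a value in KiB, hence the PAGE_SHIFT - 10 conversion), or through the sysfs attributes registered by register_balloon(). A minimal userspace sketch in C, assuming the "xen_memory" sysdev class appears at the usual /sys/devices/system/xen_memory/xen_memory0/ path (that path is an assumption, not something this patch itself guarantees):

	#include <stdio.h>
	#include <stdlib.h>

	/* Assumed sysfs location of the "xen_memory" sysdev registered above. */
	#define TARGET_KB "/sys/devices/system/xen_memory/xen_memory0/target_kb"

	int main(int argc, char **argv)
	{
		FILE *f;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <target-in-KiB>\n", argv[0]);
			return 1;
		}

		f = fopen(TARGET_KB, "w");
		if (!f) {
			perror(TARGET_KB);
			return 1;
		}
		/* store_target_kb() parses this and schedules balloon_process(). */
		fprintf(f, "%lu\n", strtoul(argv[1], NULL, 0));
		fclose(f);
		return 0;
	}

For example, writing 524288 here asks for a 512 MiB domain: 524288 KiB >> (PAGE_SHIFT - 10) is 131072 pages with 4 KiB pages, which becomes the new balloon_stats.target_pages.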
diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
new file mode 100644
index 00000000000..14e2d995e95
--- /dev/null
+++ b/drivers/xen/cpu_hotplug.c
@@ -0,0 +1,111 @@
+#include <linux/notifier.h>
+
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/cpu.h>
+
+static void enable_hotplug_cpu(int cpu)
+{
+ if (!cpu_present(cpu))
+ arch_register_cpu(cpu);
+
+ set_cpu_present(cpu, true);
+}
+
+static void disable_hotplug_cpu(int cpu)
+{
+ if (cpu_present(cpu))
+ arch_unregister_cpu(cpu);
+
+ set_cpu_present(cpu, false);
+}
+
+static int vcpu_online(unsigned int cpu)
+{
+ int err;
+ char dir[32], state[32];
+
+ sprintf(dir, "cpu/%u", cpu);
+ err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state);
+ if (err != 1) {
+ printk(KERN_ERR "XENBUS: Unable to read cpu state\n");
+ return err;
+ }
+
+ if (strcmp(state, "online") == 0)
+ return 1;
+ else if (strcmp(state, "offline") == 0)
+ return 0;
+
+ printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", state, cpu);
+ return -EINVAL;
+}
+static void vcpu_hotplug(unsigned int cpu)
+{
+ if (!cpu_possible(cpu))
+ return;
+
+ switch (vcpu_online(cpu)) {
+ case 1:
+ enable_hotplug_cpu(cpu);
+ break;
+ case 0:
+ (void)cpu_down(cpu);
+ disable_hotplug_cpu(cpu);
+ break;
+ default:
+ break;
+ }
+}
+
+static void handle_vcpu_hotplug_event(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ unsigned int cpu;
+ char *cpustr;
+ const char *node = vec[XS_WATCH_PATH];
+
+ cpustr = strstr(node, "cpu/");
+ if (cpustr != NULL) {
+ sscanf(cpustr, "cpu/%u", &cpu);
+ vcpu_hotplug(cpu);
+ }
+}
+
+static int setup_cpu_watcher(struct notifier_block *notifier,
+ unsigned long event, void *data)
+{
+ int cpu;
+ static struct xenbus_watch cpu_watch = {
+ .node = "cpu",
+ .callback = handle_vcpu_hotplug_event};
+
+ (void)register_xenbus_watch(&cpu_watch);
+
+ for_each_possible_cpu(cpu) {
+ if (vcpu_online(cpu) == 0) {
+ (void)cpu_down(cpu);
+ set_cpu_present(cpu, false);
+ }
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int __init setup_vcpu_hotplug_event(void)
+{
+ static struct notifier_block xsn_cpu = {
+ .notifier_call = setup_cpu_watcher };
+
+ if (!xen_pv_domain())
+ return -ENODEV;
+
+ register_xenstore_notifier(&xsn_cpu);
+
+ return 0;
+}
+
+arch_initcall(setup_vcpu_hotplug_event);
+
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
new file mode 100644
index 00000000000..ce602dd09bc
--- /dev/null
+++ b/drivers/xen/events.c
@@ -0,0 +1,946 @@
+/*
+ * Xen event channels
+ *
+ * Xen models interrupts with abstract event channels. Because each
+ * domain gets 1024 event channels, but NR_IRQS is not that large, we
+ * must dynamically map irqs<->event channels. The event channels
+ * interface with the rest of the kernel by defining a xen interrupt
+ * chip. When an event is received, it is mapped to an irq and sent
+ * through the normal interrupt processing path.
+ *
+ * There are four kinds of events which can be mapped to an event
+ * channel:
+ *
+ * 1. Inter-domain notifications. This includes all the virtual
+ * device events, since they're driven by front-ends in another domain
+ * (typically dom0).
+ * 2. VIRQs, typically used for timers. These are per-cpu events.
+ * 3. IPIs.
+ * 4. Hardware interrupts. Not supported at present.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/linkage.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/bootmem.h>
+
+#include <asm/ptrace.h>
+#include <asm/irq.h>
+#include <asm/idle.h>
+#include <asm/sync_bitops.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/xen-ops.h>
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+
+/*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+ */
+static DEFINE_SPINLOCK(irq_mapping_update_lock);
+
+/* IRQ <-> VIRQ mapping. */
+static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
+
+/* IRQ <-> IPI mapping */
+static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
+
+/* Interrupt types. */
+enum xen_irq_type {
+ IRQT_UNBOUND = 0,
+ IRQT_PIRQ,
+ IRQT_VIRQ,
+ IRQT_IPI,
+ IRQT_EVTCHN
+};
+
+/*
+ * Packed IRQ information:
+ * type - enum xen_irq_type
+ * event channel - irq->event channel mapping
+ * cpu - cpu this event channel is bound to
+ * index - type-specific information:
+ * PIRQ - vector, with MSB being "needs EOI"
+ * VIRQ - virq number
+ * IPI - IPI vector
+ * EVTCHN -
+ */
+struct irq_info
+{
+ enum xen_irq_type type; /* type */
+ unsigned short evtchn; /* event channel */
+ unsigned short cpu; /* cpu bound */
+
+ union {
+ unsigned short virq;
+ enum ipi_vector ipi;
+ struct {
+ unsigned short gsi;
+ unsigned short vector;
+ } pirq;
+ } u;
+};
+
+static struct irq_info irq_info[NR_IRQS];
+
+static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
+ [0 ... NR_EVENT_CHANNELS-1] = -1
+};
+struct cpu_evtchn_s {
+ unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG];
+};
+static struct cpu_evtchn_s *cpu_evtchn_mask_p;
+static inline unsigned long *cpu_evtchn_mask(int cpu)
+{
+ return cpu_evtchn_mask_p[cpu].bits;
+}
+
+/* Xen will never allocate port zero for any purpose. */
+#define VALID_EVTCHN(chn) ((chn) != 0)
+
+static struct irq_chip xen_dynamic_chip;
+
+/* Constructor for packed IRQ information. */
+static struct irq_info mk_unbound_info(void)
+{
+ return (struct irq_info) { .type = IRQT_UNBOUND };
+}
+
+static struct irq_info mk_evtchn_info(unsigned short evtchn)
+{
+ return (struct irq_info) { .type = IRQT_EVTCHN, .evtchn = evtchn,
+ .cpu = 0 };
+}
+
+static struct irq_info mk_ipi_info(unsigned short evtchn, enum ipi_vector ipi)
+{
+ return (struct irq_info) { .type = IRQT_IPI, .evtchn = evtchn,
+ .cpu = 0, .u.ipi = ipi };
+}
+
+static struct irq_info mk_virq_info(unsigned short evtchn, unsigned short virq)
+{
+ return (struct irq_info) { .type = IRQT_VIRQ, .evtchn = evtchn,
+ .cpu = 0, .u.virq = virq };
+}
+
+static struct irq_info mk_pirq_info(unsigned short evtchn,
+ unsigned short gsi, unsigned short vector)
+{
+ return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn,
+ .cpu = 0, .u.pirq = { .gsi = gsi, .vector = vector } };
+}
+
+/*
+ * Accessors for packed IRQ information.
+ */
+static struct irq_info *info_for_irq(unsigned irq)
+{
+ return &irq_info[irq];
+}
+
+static unsigned int evtchn_from_irq(unsigned irq)
+{
+ return info_for_irq(irq)->evtchn;
+}
+
+unsigned irq_from_evtchn(unsigned int evtchn)
+{
+ return evtchn_to_irq[evtchn];
+}
+EXPORT_SYMBOL_GPL(irq_from_evtchn);
+
+static enum ipi_vector ipi_from_irq(unsigned irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ BUG_ON(info == NULL);
+ BUG_ON(info->type != IRQT_IPI);
+
+ return info->u.ipi;
+}
+
+static unsigned virq_from_irq(unsigned irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ BUG_ON(info == NULL);
+ BUG_ON(info->type != IRQT_VIRQ);
+
+ return info->u.virq;
+}
+
+static unsigned gsi_from_irq(unsigned irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ BUG_ON(info == NULL);
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ return info->u.pirq.gsi;
+}
+
+static unsigned vector_from_irq(unsigned irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ BUG_ON(info == NULL);
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ return info->u.pirq.vector;
+}
+
+static enum xen_irq_type type_from_irq(unsigned irq)
+{
+ return info_for_irq(irq)->type;
+}
+
+static unsigned cpu_from_irq(unsigned irq)
+{
+ return info_for_irq(irq)->cpu;
+}
+
+static unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+ int irq = evtchn_to_irq[evtchn];
+ unsigned ret = 0;
+
+ if (irq != -1)
+ ret = cpu_from_irq(irq);
+
+ return ret;
+}
+
+static inline unsigned long active_evtchns(unsigned int cpu,
+ struct shared_info *sh,
+ unsigned int idx)
+{
+ return (sh->evtchn_pending[idx] &
+ cpu_evtchn_mask(cpu)[idx] &
+ ~sh->evtchn_mask[idx]);
+}
+
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+ int irq = evtchn_to_irq[chn];
+
+ BUG_ON(irq == -1);
+#ifdef CONFIG_SMP
+ cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
+#endif
+
+ __clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq)));
+ __set_bit(chn, cpu_evtchn_mask(cpu));
+
+ irq_info[irq].cpu = cpu;
+}
+
+static void init_evtchn_cpu_bindings(void)
+{
+#ifdef CONFIG_SMP
+ struct irq_desc *desc;
+ int i;
+
+ /* By default all event channels notify CPU#0. */
+ for_each_irq_desc(i, desc) {
+ cpumask_copy(desc->affinity, cpumask_of(0));
+ }
+#endif
+
+ memset(cpu_evtchn_mask(0), ~0, sizeof(cpu_evtchn_mask(0)));
+}
+
+static inline void clear_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_clear_bit(port, &s->evtchn_pending[0]);
+}
+
+static inline void set_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_set_bit(port, &s->evtchn_pending[0]);
+}
+
+static inline int test_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ return sync_test_bit(port, &s->evtchn_pending[0]);
+}
+
+
+/**
+ * notify_remote_via_irq - send event to remote end of event channel via irq
+ * @irq: irq of event channel to send event to
+ *
+ * Unlike notify_remote_via_evtchn(), this is safe to use across
+ * save/restore. Notifications on a broken connection are silently
+ * dropped.
+ */
+void notify_remote_via_irq(int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ notify_remote_via_evtchn(evtchn);
+}
+EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+
+static void mask_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_set_bit(port, &s->evtchn_mask[0]);
+}
+
+static void unmask_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ unsigned int cpu = get_cpu();
+
+ BUG_ON(!irqs_disabled());
+
+ /* Slow path (hypercall) if this is a non-local port. */
+ if (unlikely(cpu != cpu_from_evtchn(port))) {
+ struct evtchn_unmask unmask = { .port = port };
+ (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
+ } else {
+ struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
+
+ sync_clear_bit(port, &s->evtchn_mask[0]);
+
+ /*
+ * The following is basically the equivalent of
+ * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
+ * the interrupt edge' if the channel is masked.
+ */
+ if (sync_test_bit(port, &s->evtchn_pending[0]) &&
+ !sync_test_and_set_bit(port / BITS_PER_LONG,
+ &vcpu_info->evtchn_pending_sel))
+ vcpu_info->evtchn_upcall_pending = 1;
+ }
+
+ put_cpu();
+}
+
+static int find_unbound_irq(void)
+{
+ int irq;
+ struct irq_desc *desc;
+
+ for (irq = 0; irq < nr_irqs; irq++)
+ if (irq_info[irq].type == IRQT_UNBOUND)
+ break;
+
+ if (irq == nr_irqs)
+ panic("No available IRQ to bind to: increase nr_irqs!\n");
+
+ desc = irq_to_desc_alloc_node(irq, 0);
+ if (WARN_ON(desc == NULL))
+ return -1;
+
+ dynamic_irq_init(irq);
+
+ return irq;
+}
+
+int bind_evtchn_to_irq(unsigned int evtchn)
+{
+ int irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = evtchn_to_irq[evtchn];
+
+ if (irq == -1) {
+ irq = find_unbound_irq();
+
+ set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+ handle_level_irq, "event");
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_evtchn_info(evtchn);
+ }
+
+ spin_unlock(&irq_mapping_update_lock);
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
+
+static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+{
+ struct evtchn_bind_ipi bind_ipi;
+ int evtchn, irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = per_cpu(ipi_to_irq, cpu)[ipi];
+
+ if (irq == -1) {
+ irq = find_unbound_irq();
+ if (irq < 0)
+ goto out;
+
+ set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+ handle_level_irq, "ipi");
+
+ bind_ipi.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+ &bind_ipi) != 0)
+ BUG();
+ evtchn = bind_ipi.port;
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_ipi_info(evtchn, ipi);
+ per_cpu(ipi_to_irq, cpu)[ipi] = irq;
+
+ bind_evtchn_to_cpu(evtchn, cpu);
+ }
+
+ out:
+ spin_unlock(&irq_mapping_update_lock);
+ return irq;
+}
+
+
+static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+{
+ struct evtchn_bind_virq bind_virq;
+ int evtchn, irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = per_cpu(virq_to_irq, cpu)[virq];
+
+ if (irq == -1) {
+ bind_virq.virq = virq;
+ bind_virq.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+ &bind_virq) != 0)
+ BUG();
+ evtchn = bind_virq.port;
+
+ irq = find_unbound_irq();
+
+ set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+ handle_level_irq, "virq");
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_virq_info(evtchn, virq);
+
+ per_cpu(virq_to_irq, cpu)[virq] = irq;
+
+ bind_evtchn_to_cpu(evtchn, cpu);
+ }
+
+ spin_unlock(&irq_mapping_update_lock);
+
+ return irq;
+}
+
+static void unbind_from_irq(unsigned int irq)
+{
+ struct evtchn_close close;
+ int evtchn = evtchn_from_irq(irq);
+
+ spin_lock(&irq_mapping_update_lock);
+
+ if (VALID_EVTCHN(evtchn)) {
+ close.port = evtchn;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
+ BUG();
+
+ switch (type_from_irq(irq)) {
+ case IRQT_VIRQ:
+ per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
+ [virq_from_irq(irq)] = -1;
+ break;
+ case IRQT_IPI:
+ per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))
+ [ipi_from_irq(irq)] = -1;
+ break;
+ default:
+ break;
+ }
+
+ /* Closed ports are implicitly re-bound to VCPU0. */
+ bind_evtchn_to_cpu(evtchn, 0);
+
+ evtchn_to_irq[evtchn] = -1;
+ }
+
+ if (irq_info[irq].type != IRQT_UNBOUND) {
+ irq_info[irq] = mk_unbound_info();
+
+ dynamic_irq_cleanup(irq);
+ }
+
+ spin_unlock(&irq_mapping_update_lock);
+}
+
+int bind_evtchn_to_irqhandler(unsigned int evtchn,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ unsigned int irq;
+ int retval;
+
+ irq = bind_evtchn_to_irq(evtchn);
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+
+int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags, const char *devname, void *dev_id)
+{
+ unsigned int irq;
+ int retval;
+
+ irq = bind_virq_to_irq(virq, cpu);
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
+
+int bind_ipi_to_irqhandler(enum ipi_vector ipi,
+ unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id)
+{
+ int irq, retval;
+
+ irq = bind_ipi_to_irq(ipi, cpu);
+ if (irq < 0)
+ return irq;
+
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+
+void unbind_from_irqhandler(unsigned int irq, void *dev_id)
+{
+ free_irq(irq, dev_id);
+ unbind_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
+
+void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
+{
+ int irq = per_cpu(ipi_to_irq, cpu)[vector];
+ BUG_ON(irq < 0);
+ notify_remote_via_irq(irq);
+}
+
+irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
+{
+ struct shared_info *sh = HYPERVISOR_shared_info;
+ int cpu = smp_processor_id();
+ int i;
+ unsigned long flags;
+ static DEFINE_SPINLOCK(debug_lock);
+
+ spin_lock_irqsave(&debug_lock, flags);
+
+ printk("vcpu %d\n ", cpu);
+
+ for_each_online_cpu(i) {
+ struct vcpu_info *v = per_cpu(xen_vcpu, i);
+ printk("%d: masked=%d pending=%d event_sel %08lx\n ", i,
+ (get_irq_regs() && i == cpu) ? xen_irqs_disabled(get_irq_regs()) : v->evtchn_upcall_mask,
+ v->evtchn_upcall_pending,
+ v->evtchn_pending_sel);
+ }
+ printk("pending:\n ");
+ for(i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
+ printk("%08lx%s", sh->evtchn_pending[i],
+ i % 8 == 0 ? "\n " : " ");
+ printk("\nmasks:\n ");
+ for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+ printk("%08lx%s", sh->evtchn_mask[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\nunmasked:\n ");
+ for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+ printk("%08lx%s", sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\npending list:\n");
+ for(i = 0; i < NR_EVENT_CHANNELS; i++) {
+ if (sync_test_bit(i, sh->evtchn_pending)) {
+ printk(" %d: event %d -> irq %d\n",
+ cpu_from_evtchn(i), i,
+ evtchn_to_irq[i]);
+ }
+ }
+
+ spin_unlock_irqrestore(&debug_lock, flags);
+
+ return IRQ_HANDLED;
+}
+
+static DEFINE_PER_CPU(unsigned, xed_nesting_count);
+
+/*
+ * Search the CPU's pending events bitmasks. For each one found, map
+ * the event number to an irq, and feed it into do_IRQ() for
+ * handling.
+ *
+ * Xen uses a two-level bitmap to speed searching. The first level is
+ * a bitset of words which contain pending event bits. The second
+ * level is a bitset of pending events themselves.
+ */
+void xen_evtchn_do_upcall(struct pt_regs *regs)
+{
+ int cpu = get_cpu();
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ struct shared_info *s = HYPERVISOR_shared_info;
+ struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
+ unsigned count;
+
+ exit_idle();
+ irq_enter();
+
+ do {
+ unsigned long pending_words;
+
+ vcpu_info->evtchn_upcall_pending = 0;
+
+ if (__get_cpu_var(xed_nesting_count)++)
+ goto out;
+
+#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
+ /* Clear master flag /before/ clearing selector flag. */
+ wmb();
+#endif
+ pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
+ while (pending_words != 0) {
+ unsigned long pending_bits;
+ int word_idx = __ffs(pending_words);
+ pending_words &= ~(1UL << word_idx);
+
+ while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
+ int bit_idx = __ffs(pending_bits);
+ int port = (word_idx * BITS_PER_LONG) + bit_idx;
+ int irq = evtchn_to_irq[port];
+
+ if (irq != -1)
+ handle_irq(irq, regs);
+ }
+ }
+
+ BUG_ON(!irqs_disabled());
+
+ count = __get_cpu_var(xed_nesting_count);
+ __get_cpu_var(xed_nesting_count) = 0;
+ } while(count != 1);
+
+out:
+ irq_exit();
+ set_irq_regs(old_regs);
+
+ put_cpu();
+}
+
+/* Rebind a new event channel to an existing irq. */
+void rebind_evtchn_irq(int evtchn, int irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ /* Make sure the irq is masked, since the new event channel
+ will also be masked. */
+ disable_irq(irq);
+
+ spin_lock(&irq_mapping_update_lock);
+
+ /* After resume the irq<->evtchn mappings are all cleared out */
+ BUG_ON(evtchn_to_irq[evtchn] != -1);
+ /* Expect irq to have been bound before,
+ so there should be a proper type */
+ BUG_ON(info->type == IRQT_UNBOUND);
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_evtchn_info(evtchn);
+
+ spin_unlock(&irq_mapping_update_lock);
+
+ /* new event channels are always bound to cpu 0 */
+ irq_set_affinity(irq, cpumask_of(0));
+
+ /* Unmask the event channel. */
+ enable_irq(irq);
+}
+
+/* Rebind an evtchn so that it gets delivered to a specific cpu */
+static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+{
+ struct evtchn_bind_vcpu bind_vcpu;
+ int evtchn = evtchn_from_irq(irq);
+
+ if (!VALID_EVTCHN(evtchn))
+ return -1;
+
+ /* Send future instances of this interrupt to other vcpu. */
+ bind_vcpu.port = evtchn;
+ bind_vcpu.vcpu = tcpu;
+
+ /*
+ * If this fails, it usually just indicates that we're dealing with a
+ * virq or IPI channel, which don't actually need to be rebound. Ignore
+ * it, but don't do the xenlinux-level rebind in that case.
+ */
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
+ bind_evtchn_to_cpu(evtchn, tcpu);
+
+ return 0;
+}
+
+static int set_affinity_irq(unsigned irq, const struct cpumask *dest)
+{
+ unsigned tcpu = cpumask_first(dest);
+
+ return rebind_irq_to_cpu(irq, tcpu);
+}
+
+int resend_irq_on_evtchn(unsigned int irq)
+{
+ int masked, evtchn = evtchn_from_irq(irq);
+ struct shared_info *s = HYPERVISOR_shared_info;
+
+ if (!VALID_EVTCHN(evtchn))
+ return 1;
+
+ masked = sync_test_and_set_bit(evtchn, s->evtchn_mask);
+ sync_set_bit(evtchn, s->evtchn_pending);
+ if (!masked)
+ unmask_evtchn(evtchn);
+
+ return 1;
+}
+
+static void enable_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ unmask_evtchn(evtchn);
+}
+
+static void disable_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ mask_evtchn(evtchn);
+}
+
+static void ack_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ move_native_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ clear_evtchn(evtchn);
+}
+
+static int retrigger_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+ struct shared_info *sh = HYPERVISOR_shared_info;
+ int ret = 0;
+
+ if (VALID_EVTCHN(evtchn)) {
+ int masked;
+
+ masked = sync_test_and_set_bit(evtchn, sh->evtchn_mask);
+ sync_set_bit(evtchn, sh->evtchn_pending);
+ if (!masked)
+ unmask_evtchn(evtchn);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static void restore_cpu_virqs(unsigned int cpu)
+{
+ struct evtchn_bind_virq bind_virq;
+ int virq, irq, evtchn;
+
+ for (virq = 0; virq < NR_VIRQS; virq++) {
+ if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1)
+ continue;
+
+ BUG_ON(virq_from_irq(irq) != virq);
+
+ /* Get a new binding from Xen. */
+ bind_virq.virq = virq;
+ bind_virq.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+ &bind_virq) != 0)
+ BUG();
+ evtchn = bind_virq.port;
+
+ /* Record the new mapping. */
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_virq_info(evtchn, virq);
+ bind_evtchn_to_cpu(evtchn, cpu);
+
+ /* Ready for use. */
+ unmask_evtchn(evtchn);
+ }
+}
+
+static void restore_cpu_ipis(unsigned int cpu)
+{
+ struct evtchn_bind_ipi bind_ipi;
+ int ipi, irq, evtchn;
+
+ for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) {
+ if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1)
+ continue;
+
+ BUG_ON(ipi_from_irq(irq) != ipi);
+
+ /* Get a new binding from Xen. */
+ bind_ipi.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+ &bind_ipi) != 0)
+ BUG();
+ evtchn = bind_ipi.port;
+
+ /* Record the new mapping. */
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_ipi_info(evtchn, ipi);
+ bind_evtchn_to_cpu(evtchn, cpu);
+
+ /* Ready for use. */
+ unmask_evtchn(evtchn);
+
+ }
+}
+
+/* Clear an irq's pending state, in preparation for polling on it */
+void xen_clear_irq_pending(int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ clear_evtchn(evtchn);
+}
+
+void xen_set_irq_pending(int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ set_evtchn(evtchn);
+}
+
+bool xen_test_irq_pending(int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+ bool ret = false;
+
+ if (VALID_EVTCHN(evtchn))
+ ret = test_evtchn(evtchn);
+
+ return ret;
+}
+
+/* Poll waiting for an irq to become pending. In the usual case, the
+ irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq(int irq)
+{
+ evtchn_port_t evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn)) {
+ struct sched_poll poll;
+
+ poll.nr_ports = 1;
+ poll.timeout = 0;
+ set_xen_guest_handle(poll.ports, &evtchn);
+
+ if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
+ BUG();
+ }
+}
+
+void xen_irq_resume(void)
+{
+ unsigned int cpu, irq, evtchn;
+
+ init_evtchn_cpu_bindings();
+
+ /* New event-channel space is not 'live' yet. */
+ for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
+ mask_evtchn(evtchn);
+
+ /* No IRQ <-> event-channel mappings. */
+ for (irq = 0; irq < nr_irqs; irq++)
+ irq_info[irq].evtchn = 0; /* zap event-channel binding */
+
+ for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
+ evtchn_to_irq[evtchn] = -1;
+
+ for_each_possible_cpu(cpu) {
+ restore_cpu_virqs(cpu);
+ restore_cpu_ipis(cpu);
+ }
+}
+
+static struct irq_chip xen_dynamic_chip __read_mostly = {
+ .name = "xen-dyn",
+
+ .disable = disable_dynirq,
+ .mask = disable_dynirq,
+ .unmask = enable_dynirq,
+
+ .ack = ack_dynirq,
+ .set_affinity = set_affinity_irq,
+ .retrigger = retrigger_dynirq,
+};
+
+void __init xen_init_IRQ(void)
+{
+ int i;
+
+ cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s),
+ GFP_KERNEL);
+ BUG_ON(cpu_evtchn_mask_p == NULL);
+
+ init_evtchn_cpu_bindings();
+
+ /* No event channels are 'live' right now. */
+ for (i = 0; i < NR_EVENT_CHANNELS; i++)
+ mask_evtchn(i);
+
+ irq_ctx_init(smp_processor_id());
+}
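
bind_evtchn_to_irqhandler(), bind_virq_to_irqhandler() and unbind_from_irqhandler() above are the interface the rest of the Xen drivers build on. A rough in-kernel sketch of the usual pattern; the names my_handler/my_connect are hypothetical, and in practice the port number would come from xenbus negotiation rather than a plain argument:

	#include <linux/interrupt.h>
	#include <xen/events.h>

	/* Hypothetical consumer of the binding API defined above. */
	static irqreturn_t my_handler(int irq, void *dev_id)
	{
		/* ack_dynirq() has already cleared the pending bit by now. */
		return IRQ_HANDLED;
	}

	static int my_connect(unsigned int evtchn, void *dev)
	{
		int irq;

		/* Maps the port to an irq and installs the handler in one step. */
		irq = bind_evtchn_to_irqhandler(evtchn, my_handler, 0,
						"my-frontend", dev);
		if (irq < 0)
			return irq;

		/* Tear down later with unbind_from_irqhandler(irq, dev). */
		return irq;
	}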
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
new file mode 100644
index 00000000000..f70a4f4698c
--- /dev/null
+++ b/drivers/xen/evtchn.c
@@ -0,0 +1,508 @@
+/******************************************************************************
+ * evtchn.c
+ *
+ * Driver for receiving and demuxing event-channel signals.
+ *
+ * Copyright (c) 2004-2005, K A Fraser
+ * Multi-process extensions Copyright (c) 2004, Steven Smith
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/major.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/poll.h>
+#include <linux/irq.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+
+#include <xen/xen.h>
+#include <xen/events.h>
+#include <xen/evtchn.h>
+#include <asm/xen/hypervisor.h>
+
+struct per_user_data {
+ struct mutex bind_mutex; /* serialize bind/unbind operations */
+
+ /* Notification ring, accessed via /dev/xen/evtchn. */
+#define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t))
+#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1))
+ evtchn_port_t *ring;
+ unsigned int ring_cons, ring_prod, ring_overflow;
+ struct mutex ring_cons_mutex; /* protect against concurrent readers */
+
+ /* Processes wait on this queue when ring is empty. */
+ wait_queue_head_t evtchn_wait;
+ struct fasync_struct *evtchn_async_queue;
+ const char *name;
+};
+
+/* Who's bound to each port? */
+static struct per_user_data *port_user[NR_EVENT_CHANNELS];
+static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */
+
+irqreturn_t evtchn_interrupt(int irq, void *data)
+{
+ unsigned int port = (unsigned long)data;
+ struct per_user_data *u;
+
+ spin_lock(&port_user_lock);
+
+ u = port_user[port];
+
+ disable_irq_nosync(irq);
+
+ if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
+ u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
+ wmb(); /* Ensure ring contents visible */
+ if (u->ring_cons == u->ring_prod++) {
+ wake_up_interruptible(&u->evtchn_wait);
+ kill_fasync(&u->evtchn_async_queue,
+ SIGIO, POLL_IN);
+ }
+ } else {
+ u->ring_overflow = 1;
+ }
+
+ spin_unlock(&port_user_lock);
+
+ return IRQ_HANDLED;
+}
+
+static ssize_t evtchn_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int rc;
+ unsigned int c, p, bytes1 = 0, bytes2 = 0;
+ struct per_user_data *u = file->private_data;
+
+ /* Whole number of ports. */
+ count &= ~(sizeof(evtchn_port_t)-1);
+
+ if (count == 0)
+ return 0;
+
+ if (count > PAGE_SIZE)
+ count = PAGE_SIZE;
+
+ for (;;) {
+ mutex_lock(&u->ring_cons_mutex);
+
+ rc = -EFBIG;
+ if (u->ring_overflow)
+ goto unlock_out;
+
+ c = u->ring_cons;
+ p = u->ring_prod;
+ if (c != p)
+ break;
+
+ mutex_unlock(&u->ring_cons_mutex);
+
+ if (file->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ rc = wait_event_interruptible(u->evtchn_wait,
+ u->ring_cons != u->ring_prod);
+ if (rc)
+ return rc;
+ }
+
+ /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */
+ if (((c ^ p) & EVTCHN_RING_SIZE) != 0) {
+ bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) *
+ sizeof(evtchn_port_t);
+ bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t);
+ } else {
+ bytes1 = (p - c) * sizeof(evtchn_port_t);
+ bytes2 = 0;
+ }
+
+ /* Truncate chunks according to caller's maximum byte count. */
+ if (bytes1 > count) {
+ bytes1 = count;
+ bytes2 = 0;
+ } else if ((bytes1 + bytes2) > count) {
+ bytes2 = count - bytes1;
+ }
+
+ rc = -EFAULT;
+ rmb(); /* Ensure that we see the port before we copy it. */
+ if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) ||
+ ((bytes2 != 0) &&
+ copy_to_user(&buf[bytes1], &u->ring[0], bytes2)))
+ goto unlock_out;
+
+ u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t);
+ rc = bytes1 + bytes2;
+
+ unlock_out:
+ mutex_unlock(&u->ring_cons_mutex);
+ return rc;
+}
+
+static ssize_t evtchn_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int rc, i;
+ evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
+ struct per_user_data *u = file->private_data;
+
+ if (kbuf == NULL)
+ return -ENOMEM;
+
+ /* Whole number of ports. */
+ count &= ~(sizeof(evtchn_port_t)-1);
+
+ rc = 0;
+ if (count == 0)
+ goto out;
+
+ if (count > PAGE_SIZE)
+ count = PAGE_SIZE;
+
+ rc = -EFAULT;
+ if (copy_from_user(kbuf, buf, count) != 0)
+ goto out;
+
+ spin_lock_irq(&port_user_lock);
+ for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
+ if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
+ enable_irq(irq_from_evtchn(kbuf[i]));
+ spin_unlock_irq(&port_user_lock);
+
+ rc = count;
+
+ out:
+ free_page((unsigned long)kbuf);
+ return rc;
+}
+
+static int evtchn_bind_to_user(struct per_user_data *u, int port)
+{
+ int rc = 0;
+
+ /*
+ * Ports are never reused, so every caller should pass in a
+ * unique port.
+ *
+ * (Locking not necessary because we haven't registered the
+ * interrupt handler yet, and our caller has already
+ * serialized bind operations.)
+ */
+ BUG_ON(port_user[port] != NULL);
+ port_user[port] = u;
+
+ rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
+ u->name, (void *)(unsigned long)port);
+ if (rc >= 0)
+ rc = 0;
+
+ return rc;
+}
+
+static void evtchn_unbind_from_user(struct per_user_data *u, int port)
+{
+ int irq = irq_from_evtchn(port);
+
+ unbind_from_irqhandler(irq, (void *)(unsigned long)port);
+
+ /* make sure we unbind the irq handler before clearing the port */
+ barrier();
+
+ port_user[port] = NULL;
+}
+
+static long evtchn_ioctl(struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ int rc;
+ struct per_user_data *u = file->private_data;
+ void __user *uarg = (void __user *) arg;
+
+ /* Prevent bind from racing with unbind */
+ mutex_lock(&u->bind_mutex);
+
+ switch (cmd) {
+ case IOCTL_EVTCHN_BIND_VIRQ: {
+ struct ioctl_evtchn_bind_virq bind;
+ struct evtchn_bind_virq bind_virq;
+
+ rc = -EFAULT;
+ if (copy_from_user(&bind, uarg, sizeof(bind)))
+ break;
+
+ bind_virq.virq = bind.virq;
+ bind_virq.vcpu = 0;
+ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+ &bind_virq);
+ if (rc != 0)
+ break;
+
+ rc = evtchn_bind_to_user(u, bind_virq.port);
+ if (rc == 0)
+ rc = bind_virq.port;
+ break;
+ }
+
+ case IOCTL_EVTCHN_BIND_INTERDOMAIN: {
+ struct ioctl_evtchn_bind_interdomain bind;
+ struct evtchn_bind_interdomain bind_interdomain;
+
+ rc = -EFAULT;
+ if (copy_from_user(&bind, uarg, sizeof(bind)))
+ break;
+
+ bind_interdomain.remote_dom = bind.remote_domain;
+ bind_interdomain.remote_port = bind.remote_port;
+ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+ &bind_interdomain);
+ if (rc != 0)
+ break;
+
+ rc = evtchn_bind_to_user(u, bind_interdomain.local_port);
+ if (rc == 0)
+ rc = bind_interdomain.local_port;
+ break;
+ }
+
+ case IOCTL_EVTCHN_BIND_UNBOUND_PORT: {
+ struct ioctl_evtchn_bind_unbound_port bind;
+ struct evtchn_alloc_unbound alloc_unbound;
+
+ rc = -EFAULT;
+ if (copy_from_user(&bind, uarg, sizeof(bind)))
+ break;
+
+ alloc_unbound.dom = DOMID_SELF;
+ alloc_unbound.remote_dom = bind.remote_domain;
+ rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+ &alloc_unbound);
+ if (rc != 0)
+ break;
+
+ rc = evtchn_bind_to_user(u, alloc_unbound.port);
+ if (rc == 0)
+ rc = alloc_unbound.port;
+ break;
+ }
+
+ case IOCTL_EVTCHN_UNBIND: {
+ struct ioctl_evtchn_unbind unbind;
+
+ rc = -EFAULT;
+ if (copy_from_user(&unbind, uarg, sizeof(unbind)))
+ break;
+
+ rc = -EINVAL;
+ if (unbind.port >= NR_EVENT_CHANNELS)
+ break;
+
+ spin_lock_irq(&port_user_lock);
+
+ rc = -ENOTCONN;
+ if (port_user[unbind.port] != u) {
+ spin_unlock_irq(&port_user_lock);
+ break;
+ }
+
+ evtchn_unbind_from_user(u, unbind.port);
+
+ spin_unlock_irq(&port_user_lock);
+
+ rc = 0;
+ break;
+ }
+
+ case IOCTL_EVTCHN_NOTIFY: {
+ struct ioctl_evtchn_notify notify;
+
+ rc = -EFAULT;
+ if (copy_from_user(&notify, uarg, sizeof(notify)))
+ break;
+
+ if (notify.port >= NR_EVENT_CHANNELS) {
+ rc = -EINVAL;
+ } else if (port_user[notify.port] != u) {
+ rc = -ENOTCONN;
+ } else {
+ notify_remote_via_evtchn(notify.port);
+ rc = 0;
+ }
+ break;
+ }
+
+ case IOCTL_EVTCHN_RESET: {
+ /* Initialise the ring to empty. Clear errors. */
+ mutex_lock(&u->ring_cons_mutex);
+ spin_lock_irq(&port_user_lock);
+ u->ring_cons = u->ring_prod = u->ring_overflow = 0;
+ spin_unlock_irq(&port_user_lock);
+ mutex_unlock(&u->ring_cons_mutex);
+ rc = 0;
+ break;
+ }
+
+ default:
+ rc = -ENOSYS;
+ break;
+ }
+ mutex_unlock(&u->bind_mutex);
+
+ return rc;
+}
+
+static unsigned int evtchn_poll(struct file *file, poll_table *wait)
+{
+ unsigned int mask = POLLOUT | POLLWRNORM;
+ struct per_user_data *u = file->private_data;
+
+ poll_wait(file, &u->evtchn_wait, wait);
+ if (u->ring_cons != u->ring_prod)
+ mask |= POLLIN | POLLRDNORM;
+ if (u->ring_overflow)
+ mask = POLLERR;
+ return mask;
+}
+
+static int evtchn_fasync(int fd, struct file *filp, int on)
+{
+ struct per_user_data *u = filp->private_data;
+ return fasync_helper(fd, filp, on, &u->evtchn_async_queue);
+}
+
+static int evtchn_open(struct inode *inode, struct file *filp)
+{
+ struct per_user_data *u;
+
+ u = kzalloc(sizeof(*u), GFP_KERNEL);
+ if (u == NULL)
+ return -ENOMEM;
+
+ u->name = kasprintf(GFP_KERNEL, "evtchn:%s", current->comm);
+ if (u->name == NULL) {
+ kfree(u);
+ return -ENOMEM;
+ }
+
+ init_waitqueue_head(&u->evtchn_wait);
+
+ u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
+ if (u->ring == NULL) {
+ kfree(u->name);
+ kfree(u);
+ return -ENOMEM;
+ }
+
+ mutex_init(&u->bind_mutex);
+ mutex_init(&u->ring_cons_mutex);
+
+ filp->private_data = u;
+
+ return 0;
+}
+
+static int evtchn_release(struct inode *inode, struct file *filp)
+{
+ int i;
+ struct per_user_data *u = filp->private_data;
+
+ spin_lock_irq(&port_user_lock);
+
+ free_page((unsigned long)u->ring);
+
+ for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+ if (port_user[i] != u)
+ continue;
+
+ evtchn_unbind_from_user(port_user[i], i);
+ }
+
+ spin_unlock_irq(&port_user_lock);
+
+ kfree(u->name);
+ kfree(u);
+
+ return 0;
+}
+
+static const struct file_operations evtchn_fops = {
+ .owner = THIS_MODULE,
+ .read = evtchn_read,
+ .write = evtchn_write,
+ .unlocked_ioctl = evtchn_ioctl,
+ .poll = evtchn_poll,
+ .fasync = evtchn_fasync,
+ .open = evtchn_open,
+ .release = evtchn_release,
+};
+
+static struct miscdevice evtchn_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "evtchn",
+ .fops = &evtchn_fops,
+};
+static int __init evtchn_init(void)
+{
+ int err;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ spin_lock_init(&port_user_lock);
+ memset(port_user, 0, sizeof(port_user));
+
+ /* Create '/dev/misc/evtchn'. */
+ err = misc_register(&evtchn_miscdev);
+ if (err != 0) {
+ printk(KERN_ALERT "Could not register /dev/misc/evtchn\n");
+ return err;
+ }
+
+ printk(KERN_INFO "Event-channel device installed.\n");
+
+ return 0;
+}
+
+static void __exit evtchn_cleanup(void)
+{
+ misc_deregister(&evtchn_miscdev);
+}
+
+module_init(evtchn_init);
+module_exit(evtchn_cleanup);
+
+MODULE_LICENSE("GPL");
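
From userspace the protocol is the one visible in the handlers above: bind ports with the ioctls, read() back an array of evtchn_port_t values that have fired (each delivery leaves the underlying irq disabled), and write() the ports back to re-enable them. A sketch in C, assuming the IOCTL_EVTCHN_* definitions are available from a userspace copy of the evtchn header (the include path below is an assumption):

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <xen/sys/evtchn.h>	/* assumed location of IOCTL_EVTCHN_* */

	int main(void)
	{
		struct ioctl_evtchn_bind_unbound_port bind = { .remote_domain = 0 };
		unsigned int port;
		int fd, rc;

		fd = open("/dev/xen/evtchn", O_RDWR);
		if (fd < 0)
			return 1;

		/* Allocate a fresh port that the remote domain can bind to. */
		rc = ioctl(fd, IOCTL_EVTCHN_BIND_UNBOUND_PORT, &bind);
		if (rc < 0)
			return 1;
		port = rc;	/* the ioctl returns the new port number */

		/* Block until the port fires, then unmask it for the next event. */
		if (read(fd, &port, sizeof(port)) == (ssize_t)sizeof(port))
			write(fd, &port, sizeof(port));

		close(fd);
		return 0;
	}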
diff --git a/drivers/xen/features.c b/drivers/xen/features.c
new file mode 100644
index 00000000000..99eda169c77
--- /dev/null
+++ b/drivers/xen/features.c
@@ -0,0 +1,33 @@
+/******************************************************************************
+ * features.c
+ *
+ * Xen feature flags.
+ *
+ * Copyright (c) 2006, Ian Campbell, XenSource Inc.
+ */
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+
+#include <asm/xen/hypercall.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/version.h>
+#include <xen/features.h>
+
+u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
+EXPORT_SYMBOL_GPL(xen_features);
+
+void xen_setup_features(void)
+{
+ struct xen_feature_info fi;
+ int i, j;
+
+ for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
+ fi.submap_idx = i;
+ if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
+ break;
+ for (j = 0; j < 32; j++)
+ xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
+ }
+}
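xen_setup_features() fills the xen_features[] bitmap once at start-of-day; the rest of the kernel then tests individual flags. A hedged in-kernel sketch, assuming the xen_feature() accessor and the XENFEAT_* flag names from <xen/features.h>:

#include <linux/kernel.h>
#include <xen/features.h>

/* Illustrative only: choose a code path based on an advertised feature. */
static void example_report_translation_mode(void)
{
	if (xen_feature(XENFEAT_auto_translated_physmap))
		pr_info("guest uses an auto-translated physmap\n");
	else
		pr_info("guest manages its own pseudo-physical mappings\n");
}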
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
new file mode 100644
index 00000000000..4c6c0bd636a
--- /dev/null
+++ b/drivers/xen/grant-table.c
@@ -0,0 +1,559 @@
+/******************************************************************************
+ * grant_table.c
+ *
+ * Granting foreign access to our memory reservation.
+ *
+ * Copyright (c) 2005-2006, Christopher Clark
+ * Copyright (c) 2004-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
+
+#include <xen/xen.h>
+#include <xen/interface/xen.h>
+#include <xen/page.h>
+#include <xen/grant_table.h>
+#include <asm/xen/hypercall.h>
+
+#include <asm/pgtable.h>
+#include <asm/sync_bitops.h>
+
+
+/* External tools reserve first few grant table entries. */
+#define NR_RESERVED_ENTRIES 8
+#define GNTTAB_LIST_END 0xffffffff
+#define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(struct grant_entry))
+
+static grant_ref_t **gnttab_list;
+static unsigned int nr_grant_frames;
+static unsigned int boot_max_nr_grant_frames;
+static int gnttab_free_count;
+static grant_ref_t gnttab_free_head;
+static DEFINE_SPINLOCK(gnttab_list_lock);
+
+static struct grant_entry *shared;
+
+static struct gnttab_free_callback *gnttab_free_callback_list;
+
+static int gnttab_expand(unsigned int req_entries);
+
+#define RPP (PAGE_SIZE / sizeof(grant_ref_t))
+
+static inline grant_ref_t *__gnttab_entry(grant_ref_t entry)
+{
+ return &gnttab_list[(entry) / RPP][(entry) % RPP];
+}
+/* This can be used as an l-value */
+#define gnttab_entry(entry) (*__gnttab_entry(entry))
+
+static int get_free_entries(unsigned count)
+{
+ unsigned long flags;
+ int ref, rc;
+ grant_ref_t head;
+
+ spin_lock_irqsave(&gnttab_list_lock, flags);
+
+ if ((gnttab_free_count < count) &&
+ ((rc = gnttab_expand(count - gnttab_free_count)) < 0)) {
+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
+ return rc;
+ }
+
+ ref = head = gnttab_free_head;
+ gnttab_free_count -= count;
+ while (count-- > 1)
+ head = gnttab_entry(head);
+ gnttab_free_head = gnttab_entry(head);
+ gnttab_entry(head) = GNTTAB_LIST_END;
+
+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
+
+ return ref;
+}
+
+static void do_free_callbacks(void)
+{
+ struct gnttab_free_callback *callback, *next;
+
+ callback = gnttab_free_callback_list;
+ gnttab_free_callback_list = NULL;
+
+ while (callback != NULL) {
+ next = callback->next;
+ if (gnttab_free_count >= callback->count) {
+ callback->next = NULL;
+ callback->fn(callback->arg);
+ } else {
+ callback->next = gnttab_free_callback_list;
+ gnttab_free_callback_list = callback;
+ }
+ callback = next;
+ }
+}
+
+static inline void check_free_callbacks(void)
+{
+ if (unlikely(gnttab_free_callback_list))
+ do_free_callbacks();
+}
+
+static void put_free_entry(grant_ref_t ref)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&gnttab_list_lock, flags);
+ gnttab_entry(ref) = gnttab_free_head;
+ gnttab_free_head = ref;
+ gnttab_free_count++;
+ check_free_callbacks();
+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
+}
+
+static void update_grant_entry(grant_ref_t ref, domid_t domid,
+ unsigned long frame, unsigned flags)
+{
+ /*
+ * Introducing a valid entry into the grant table:
+ * 1. Write ent->domid.
+ * 2. Write ent->frame:
+ * GTF_permit_access: Frame to which access is permitted.
+ * GTF_accept_transfer: Pseudo-phys frame slot being filled by new
+ * frame, or zero if none.
+ * 3. Write memory barrier (WMB).
+ * 4. Write ent->flags, inc. valid type.
+ */
+ shared[ref].frame = frame;
+ shared[ref].domid = domid;
+ wmb();
+ shared[ref].flags = flags;
+}
+
+/*
+ * Public grant-issuing interface functions
+ */
+void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
+ unsigned long frame, int readonly)
+{
+ update_grant_entry(ref, domid, frame,
+ GTF_permit_access | (readonly ? GTF_readonly : 0));
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
+
+int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
+ int readonly)
+{
+ int ref;
+
+ ref = get_free_entries(1);
+ if (unlikely(ref < 0))
+ return -ENOSPC;
+
+ gnttab_grant_foreign_access_ref(ref, domid, frame, readonly);
+
+ return ref;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
+
+int gnttab_query_foreign_access(grant_ref_t ref)
+{
+ u16 nflags;
+
+ nflags = shared[ref].flags;
+
+ return (nflags & (GTF_reading|GTF_writing));
+}
+EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
+
+int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
+{
+ u16 flags, nflags;
+
+ nflags = shared[ref].flags;
+ do {
+ flags = nflags;
+ if (flags & (GTF_reading|GTF_writing)) {
+ printk(KERN_ALERT "WARNING: g.e. still in use!\n");
+ return 0;
+ }
+ } while ((nflags = sync_cmpxchg(&shared[ref].flags, flags, 0)) != flags);
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
+
+void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
+ unsigned long page)
+{
+ if (gnttab_end_foreign_access_ref(ref, readonly)) {
+ put_free_entry(ref);
+ if (page != 0)
+ free_page(page);
+ } else {
+ /* XXX This needs to be fixed so that the ref and page are
+ placed on a list to be freed up later. */
+ printk(KERN_WARNING
+ "WARNING: leaking g.e. and page still in use!\n");
+ }
+}
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
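gnttab_grant_foreign_access() and gnttab_end_foreign_access() bracket the lifetime of a page shared with another domain. The sketch below is illustrative rather than part of the patch; the helper name is hypothetical and virt_to_mfn() is assumed from <asm/xen/page.h>.

#include <xen/grant_table.h>
#include <asm/xen/page.h>

/* Share one page read-only with domain 'otherend', then revoke the grant. */
static int example_share_page(domid_t otherend, unsigned long vaddr)
{
	int ref = gnttab_grant_foreign_access(otherend, virt_to_mfn(vaddr), 1);

	if (ref < 0)
		return ref;	/* -ENOSPC: no free grant entries available */

	/* ... publish 'ref' to the peer (e.g. via xenstore) and do I/O ... */

	/* Passing the page lets the core free it once the peer lets go. */
	gnttab_end_foreign_access(ref, 1, vaddr);
	return 0;
}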
+
+int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
+{
+ int ref;
+
+ ref = get_free_entries(1);
+ if (unlikely(ref < 0))
+ return -ENOSPC;
+ gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
+
+ return ref;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
+
+void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
+ unsigned long pfn)
+{
+ update_grant_entry(ref, domid, pfn, GTF_accept_transfer);
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
+
+unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
+{
+ unsigned long frame;
+ u16 flags;
+
+ /*
+ * If a transfer is not even yet started, try to reclaim the grant
+ * reference and return failure (== 0).
+ */
+ while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
+ if (sync_cmpxchg(&shared[ref].flags, flags, 0) == flags)
+ return 0;
+ cpu_relax();
+ }
+
+ /* If a transfer is in progress then wait until it is completed. */
+ while (!(flags & GTF_transfer_completed)) {
+ flags = shared[ref].flags;
+ cpu_relax();
+ }
+
+ rmb(); /* Read the frame number /after/ reading completion status. */
+ frame = shared[ref].frame;
+ BUG_ON(frame == 0);
+
+ return frame;
+}
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
+
+unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
+{
+ unsigned long frame = gnttab_end_foreign_transfer_ref(ref);
+ put_free_entry(ref);
+ return frame;
+}
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer);
+
+void gnttab_free_grant_reference(grant_ref_t ref)
+{
+ put_free_entry(ref);
+}
+EXPORT_SYMBOL_GPL(gnttab_free_grant_reference);
+
+void gnttab_free_grant_references(grant_ref_t head)
+{
+ grant_ref_t ref;
+ unsigned long flags;
+ int count = 1;
+ if (head == GNTTAB_LIST_END)
+ return;
+ spin_lock_irqsave(&gnttab_list_lock, flags);
+ ref = head;
+ while (gnttab_entry(ref) != GNTTAB_LIST_END) {
+ ref = gnttab_entry(ref);
+ count++;
+ }
+ gnttab_entry(ref) = gnttab_free_head;
+ gnttab_free_head = head;
+ gnttab_free_count += count;
+ check_free_callbacks();
+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
+}
+EXPORT_SYMBOL_GPL(gnttab_free_grant_references);
+
+int gnttab_alloc_grant_references(u16 count, grant_ref_t *head)
+{
+ int h = get_free_entries(count);
+
+ if (h < 0)
+ return -ENOSPC;
+
+ *head = h;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references);
+
+int gnttab_empty_grant_references(const grant_ref_t *private_head)
+{
+ return (*private_head == GNTTAB_LIST_END);
+}
+EXPORT_SYMBOL_GPL(gnttab_empty_grant_references);
+
+int gnttab_claim_grant_reference(grant_ref_t *private_head)
+{
+ grant_ref_t g = *private_head;
+ if (unlikely(g == GNTTAB_LIST_END))
+ return -ENOSPC;
+ *private_head = gnttab_entry(g);
+ return g;
+}
+EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference);
+
+void gnttab_release_grant_reference(grant_ref_t *private_head,
+ grant_ref_t release)
+{
+ gnttab_entry(release) = *private_head;
+ *private_head = release;
+}
+EXPORT_SYMBOL_GPL(gnttab_release_grant_reference);
+
+void gnttab_request_free_callback(struct gnttab_free_callback *callback,
+ void (*fn)(void *), void *arg, u16 count)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&gnttab_list_lock, flags);
+ if (callback->next)
+ goto out;
+ callback->fn = fn;
+ callback->arg = arg;
+ callback->count = count;
+ callback->next = gnttab_free_callback_list;
+ gnttab_free_callback_list = callback;
+ check_free_callbacks();
+out:
+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
+}
+EXPORT_SYMBOL_GPL(gnttab_request_free_callback);
+
+void gnttab_cancel_free_callback(struct gnttab_free_callback *callback)
+{
+ struct gnttab_free_callback **pcb;
+ unsigned long flags;
+
+ spin_lock_irqsave(&gnttab_list_lock, flags);
+ for (pcb = &gnttab_free_callback_list; *pcb; pcb = &(*pcb)->next) {
+ if (*pcb == callback) {
+ *pcb = callback->next;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
+}
+EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback);
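When get_free_entries() fails with -ENOSPC, a driver can register a gnttab_free_callback so it is notified (with gnttab_list_lock held) once enough references have been returned. A hedged sketch of that retry pattern; the names are hypothetical:

#include <linux/workqueue.h>
#include <xen/grant_table.h>

static struct gnttab_free_callback example_cb;

/* Runs from put_free_entry()/grow_gnttab_list(); defer real work elsewhere. */
static void example_grants_available(void *arg)
{
	struct work_struct *retry_work = arg;

	schedule_work(retry_work);	/* retry the allocation in process context */
}

static void example_wait_for_grants(struct work_struct *retry_work, u16 needed)
{
	gnttab_request_free_callback(&example_cb, example_grants_available,
				     retry_work, needed);
}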
+
+static int grow_gnttab_list(unsigned int more_frames)
+{
+ unsigned int new_nr_grant_frames, extra_entries, i;
+ unsigned int nr_glist_frames, new_nr_glist_frames;
+
+ new_nr_grant_frames = nr_grant_frames + more_frames;
+ extra_entries = more_frames * GREFS_PER_GRANT_FRAME;
+
+ nr_glist_frames = (nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP;
+ new_nr_glist_frames =
+ (new_nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP;
+ for (i = nr_glist_frames; i < new_nr_glist_frames; i++) {
+ gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC);
+ if (!gnttab_list[i])
+ goto grow_nomem;
+ }
+
+
+ for (i = GREFS_PER_GRANT_FRAME * nr_grant_frames;
+ i < GREFS_PER_GRANT_FRAME * new_nr_grant_frames - 1; i++)
+ gnttab_entry(i) = i + 1;
+
+ gnttab_entry(i) = gnttab_free_head;
+ gnttab_free_head = GREFS_PER_GRANT_FRAME * nr_grant_frames;
+ gnttab_free_count += extra_entries;
+
+ nr_grant_frames = new_nr_grant_frames;
+
+ check_free_callbacks();
+
+ return 0;
+
+grow_nomem:
+ for ( ; i >= nr_glist_frames; i--)
+ free_page((unsigned long) gnttab_list[i]);
+ return -ENOMEM;
+}
+
+static unsigned int __max_nr_grant_frames(void)
+{
+ struct gnttab_query_size query;
+ int rc;
+
+ query.dom = DOMID_SELF;
+
+ rc = HYPERVISOR_grant_table_op(GNTTABOP_query_size, &query, 1);
+ if ((rc < 0) || (query.status != GNTST_okay))
+ return 4; /* Legacy max supported number of frames */
+
+ return query.max_nr_frames;
+}
+
+static inline unsigned int max_nr_grant_frames(void)
+{
+ unsigned int xen_max = __max_nr_grant_frames();
+
+ if (xen_max > boot_max_nr_grant_frames)
+ return boot_max_nr_grant_frames;
+ return xen_max;
+}
+
+static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
+{
+ struct gnttab_setup_table setup;
+ unsigned long *frames;
+ unsigned int nr_gframes = end_idx + 1;
+ int rc;
+
+ frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
+ if (!frames)
+ return -ENOMEM;
+
+ setup.dom = DOMID_SELF;
+ setup.nr_frames = nr_gframes;
+ set_xen_guest_handle(setup.frame_list, frames);
+
+ rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
+ if (rc == -ENOSYS) {
+ kfree(frames);
+ return -ENOSYS;
+ }
+
+ BUG_ON(rc || setup.status);
+
+ rc = arch_gnttab_map_shared(frames, nr_gframes, max_nr_grant_frames(),
+ &shared);
+ BUG_ON(rc);
+
+ kfree(frames);
+
+ return 0;
+}
+
+int gnttab_resume(void)
+{
+ if (max_nr_grant_frames() < nr_grant_frames)
+ return -ENOSYS;
+ return gnttab_map(0, nr_grant_frames - 1);
+}
+
+int gnttab_suspend(void)
+{
+ arch_gnttab_unmap_shared(shared, nr_grant_frames);
+ return 0;
+}
+
+static int gnttab_expand(unsigned int req_entries)
+{
+ int rc;
+ unsigned int cur, extra;
+
+ cur = nr_grant_frames;
+ extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) /
+ GREFS_PER_GRANT_FRAME);
+ if (cur + extra > max_nr_grant_frames())
+ return -ENOSPC;
+
+ rc = gnttab_map(cur, cur + extra - 1);
+ if (rc == 0)
+ rc = grow_gnttab_list(extra);
+
+ return rc;
+}
+
+static int __devinit gnttab_init(void)
+{
+ int i;
+ unsigned int max_nr_glist_frames, nr_glist_frames;
+ unsigned int nr_init_grefs;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ nr_grant_frames = 1;
+ boot_max_nr_grant_frames = __max_nr_grant_frames();
+
+ /* Determine the maximum number of frames required for the
+ * grant reference free list on the current hypervisor.
+ */
+ max_nr_glist_frames = (boot_max_nr_grant_frames *
+ GREFS_PER_GRANT_FRAME / RPP);
+
+ gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *),
+ GFP_KERNEL);
+ if (gnttab_list == NULL)
+ return -ENOMEM;
+
+ nr_glist_frames = (nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP;
+ for (i = 0; i < nr_glist_frames; i++) {
+ gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL);
+ if (gnttab_list[i] == NULL)
+ goto ini_nomem;
+ }
+
+ if (gnttab_resume() < 0)
+ return -ENODEV;
+
+ nr_init_grefs = nr_grant_frames * GREFS_PER_GRANT_FRAME;
+
+ for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++)
+ gnttab_entry(i) = i + 1;
+
+ gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END;
+ gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
+ gnttab_free_head = NR_RESERVED_ENTRIES;
+
+ printk("Grant table initialized\n");
+ return 0;
+
+ ini_nomem:
+ for (i--; i >= 0; i--)
+ free_page((unsigned long)gnttab_list[i]);
+ kfree(gnttab_list);
+ return -ENOMEM;
+}
+
+core_initcall(gnttab_init);
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
new file mode 100644
index 00000000000..5d42d55e299
--- /dev/null
+++ b/drivers/xen/manage.c
@@ -0,0 +1,273 @@
+/*
+ * Handle external requests for shutdown, reboot and sysrq
+ */
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/reboot.h>
+#include <linux/sysrq.h>
+#include <linux/stop_machine.h>
+#include <linux/freezer.h>
+
+#include <xen/xenbus.h>
+#include <xen/grant_table.h>
+#include <xen/events.h>
+#include <xen/hvc-console.h>
+#include <xen/xen-ops.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+
+enum shutdown_state {
+ SHUTDOWN_INVALID = -1,
+ SHUTDOWN_POWEROFF = 0,
+ SHUTDOWN_SUSPEND = 2,
+ /* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
+ report a crash, not be instructed to crash!
+ HALT is the same as POWEROFF, as far as we're concerned. The tools use
+ the distinction when we return the reason code to them. */
+ SHUTDOWN_HALT = 4,
+};
+
+/* Ignore multiple shutdown requests. */
+static enum shutdown_state shutting_down = SHUTDOWN_INVALID;
+
+#ifdef CONFIG_PM_SLEEP
+static int xen_suspend(void *data)
+{
+ int *cancelled = data;
+ int err;
+
+ BUG_ON(!irqs_disabled());
+
+ err = sysdev_suspend(PMSG_SUSPEND);
+ if (err) {
+ printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n",
+ err);
+ return err;
+ }
+
+ xen_mm_pin_all();
+ gnttab_suspend();
+ xen_pre_suspend();
+
+ /*
+ * This hypercall returns 1 if suspend was cancelled
+ * or the domain was merely checkpointed, and 0 if it
+ * is resuming in a new domain.
+ */
+ *cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
+
+ xen_post_suspend(*cancelled);
+ gnttab_resume();
+ xen_mm_unpin_all();
+
+ if (!*cancelled) {
+ xen_irq_resume();
+ xen_console_resume();
+ xen_timer_resume();
+ }
+
+ sysdev_resume();
+
+ return 0;
+}
+
+static void do_suspend(void)
+{
+ int err;
+ int cancelled = 1;
+
+ shutting_down = SHUTDOWN_SUSPEND;
+
+ err = stop_machine_create();
+ if (err) {
+ printk(KERN_ERR "xen suspend: failed to setup stop_machine %d\n", err);
+ goto out;
+ }
+
+#ifdef CONFIG_PREEMPT
+ /* If the kernel is preemptible, we need to freeze all the processes
+ to prevent them from being in the middle of a pagetable update
+ during suspend. */
+ err = freeze_processes();
+ if (err) {
+ printk(KERN_ERR "xen suspend: freeze failed %d\n", err);
+ goto out_destroy_sm;
+ }
+#endif
+
+ err = dpm_suspend_start(PMSG_SUSPEND);
+ if (err) {
+ printk(KERN_ERR "xen suspend: dpm_suspend_start %d\n", err);
+ goto out_thaw;
+ }
+
+ printk(KERN_DEBUG "suspending xenstore...\n");
+ xs_suspend();
+
+ err = dpm_suspend_noirq(PMSG_SUSPEND);
+ if (err) {
+ printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err);
+ goto out_resume;
+ }
+
+ err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
+
+ dpm_resume_noirq(PMSG_RESUME);
+
+ if (err) {
+ printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
+ cancelled = 1;
+ }
+
+out_resume:
+ if (!cancelled) {
+ xen_arch_resume();
+ xs_resume();
+ } else
+ xs_suspend_cancel();
+
+ dpm_resume_end(PMSG_RESUME);
+
+ /* Make sure timer events get retriggered on all CPUs */
+ clock_was_set();
+
+out_thaw:
+#ifdef CONFIG_PREEMPT
+ thaw_processes();
+
+out_destroy_sm:
+#endif
+ stop_machine_destroy();
+
+out:
+ shutting_down = SHUTDOWN_INVALID;
+}
+#endif /* CONFIG_PM_SLEEP */
+
+static void shutdown_handler(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ char *str;
+ struct xenbus_transaction xbt;
+ int err;
+
+ if (shutting_down != SHUTDOWN_INVALID)
+ return;
+
+ again:
+ err = xenbus_transaction_start(&xbt);
+ if (err)
+ return;
+
+ str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
+ /* Ignore read errors and empty reads. */
+ if (XENBUS_IS_ERR_READ(str)) {
+ xenbus_transaction_end(xbt, 1);
+ return;
+ }
+
+ xenbus_write(xbt, "control", "shutdown", "");
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN) {
+ kfree(str);
+ goto again;
+ }
+
+ if (strcmp(str, "poweroff") == 0 ||
+ strcmp(str, "halt") == 0) {
+ shutting_down = SHUTDOWN_POWEROFF;
+ orderly_poweroff(false);
+ } else if (strcmp(str, "reboot") == 0) {
+ shutting_down = SHUTDOWN_POWEROFF; /* ? */
+ ctrl_alt_del();
+#ifdef CONFIG_PM_SLEEP
+ } else if (strcmp(str, "suspend") == 0) {
+ do_suspend();
+#endif
+ } else {
+ printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
+ shutting_down = SHUTDOWN_INVALID;
+ }
+
+ kfree(str);
+}
+
+static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
+ unsigned int len)
+{
+ char sysrq_key = '\0';
+ struct xenbus_transaction xbt;
+ int err;
+
+ again:
+ err = xenbus_transaction_start(&xbt);
+ if (err)
+ return;
+ if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
+ printk(KERN_ERR "Unable to read sysrq code in "
+ "control/sysrq\n");
+ xenbus_transaction_end(xbt, 1);
+ return;
+ }
+
+ if (sysrq_key != '\0')
+ xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN)
+ goto again;
+
+ if (sysrq_key != '\0')
+ handle_sysrq(sysrq_key, NULL);
+}
+
+static struct xenbus_watch shutdown_watch = {
+ .node = "control/shutdown",
+ .callback = shutdown_handler
+};
+
+static struct xenbus_watch sysrq_watch = {
+ .node = "control/sysrq",
+ .callback = sysrq_handler
+};
+
+static int setup_shutdown_watcher(void)
+{
+ int err;
+
+ err = register_xenbus_watch(&shutdown_watch);
+ if (err) {
+ printk(KERN_ERR "Failed to set shutdown watcher\n");
+ return err;
+ }
+
+ err = register_xenbus_watch(&sysrq_watch);
+ if (err) {
+ printk(KERN_ERR "Failed to set sysrq watcher\n");
+ return err;
+ }
+
+ return 0;
+}
+
+static int shutdown_event(struct notifier_block *notifier,
+ unsigned long event,
+ void *data)
+{
+ setup_shutdown_watcher();
+ return NOTIFY_DONE;
+}
+
+static int __init setup_shutdown_event(void)
+{
+ static struct notifier_block xenstore_notifier = {
+ .notifier_call = shutdown_event
+ };
+ register_xenstore_notifier(&xenstore_notifier);
+
+ return 0;
+}
+
+subsys_initcall(setup_shutdown_event);
diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
new file mode 100644
index 00000000000..ae5cb05a1a1
--- /dev/null
+++ b/drivers/xen/sys-hypervisor.c
@@ -0,0 +1,446 @@
+/*
+ * copyright (c) 2006 IBM Corporation
+ * Authored by: Mike D. Day <ncmike@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/version.h>
+
+#define HYPERVISOR_ATTR_RO(_name) \
+static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name)
+
+#define HYPERVISOR_ATTR_RW(_name) \
+static struct hyp_sysfs_attr _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+struct hyp_sysfs_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct hyp_sysfs_attr *, char *);
+ ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
+ void *hyp_attr_data;
+};
+
+static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ return sprintf(buffer, "xen\n");
+}
+
+HYPERVISOR_ATTR_RO(type);
+
+static int __init xen_sysfs_type_init(void)
+{
+ return sysfs_create_file(hypervisor_kobj, &type_attr.attr);
+}
+
+static void xen_sysfs_type_destroy(void)
+{
+ sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
+}
+
+/* xen version attributes */
+static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int version = HYPERVISOR_xen_version(XENVER_version, NULL);
+ if (version)
+ return sprintf(buffer, "%d\n", version >> 16);
+ return -ENODEV;
+}
+
+HYPERVISOR_ATTR_RO(major);
+
+static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int version = HYPERVISOR_xen_version(XENVER_version, NULL);
+ if (version)
+ return sprintf(buffer, "%d\n", version & 0xff);
+ return -ENODEV;
+}
+
+HYPERVISOR_ATTR_RO(minor);
+
+static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int ret = -ENOMEM;
+ char *extra;
+
+ extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL);
+ if (extra) {
+ ret = HYPERVISOR_xen_version(XENVER_extraversion, extra);
+ if (!ret)
+ ret = sprintf(buffer, "%s\n", extra);
+ kfree(extra);
+ }
+
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(extra);
+
+static struct attribute *version_attrs[] = {
+ &major_attr.attr,
+ &minor_attr.attr,
+ &extra_attr.attr,
+ NULL
+};
+
+static struct attribute_group version_group = {
+ .name = "version",
+ .attrs = version_attrs,
+};
+
+static int __init xen_sysfs_version_init(void)
+{
+ return sysfs_create_group(hypervisor_kobj, &version_group);
+}
+
+static void xen_sysfs_version_destroy(void)
+{
+ sysfs_remove_group(hypervisor_kobj, &version_group);
+}
+
+/* UUID */
+
+static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ char *vm, *val;
+ int ret;
+ extern int xenstored_ready;
+
+ if (!xenstored_ready)
+ return -EBUSY;
+
+ vm = xenbus_read(XBT_NIL, "vm", "", NULL);
+ if (IS_ERR(vm))
+ return PTR_ERR(vm);
+ val = xenbus_read(XBT_NIL, vm, "uuid", NULL);
+ kfree(vm);
+ if (IS_ERR(val))
+ return PTR_ERR(val);
+ ret = sprintf(buffer, "%s\n", val);
+ kfree(val);
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(uuid);
+
+static int __init xen_sysfs_uuid_init(void)
+{
+ return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);
+}
+
+static void xen_sysfs_uuid_destroy(void)
+{
+ sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
+}
+
+/* xen compilation attributes */
+
+static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int ret = -ENOMEM;
+ struct xen_compile_info *info;
+
+ info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+ if (info) {
+ ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+ if (!ret)
+ ret = sprintf(buffer, "%s\n", info->compiler);
+ kfree(info);
+ }
+
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(compiler);
+
+static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int ret = -ENOMEM;
+ struct xen_compile_info *info;
+
+ info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+ if (info) {
+ ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+ if (!ret)
+ ret = sprintf(buffer, "%s\n", info->compile_by);
+ kfree(info);
+ }
+
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(compiled_by);
+
+static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int ret = -ENOMEM;
+ struct xen_compile_info *info;
+
+ info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+ if (info) {
+ ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+ if (!ret)
+ ret = sprintf(buffer, "%s\n", info->compile_date);
+ kfree(info);
+ }
+
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(compile_date);
+
+static struct attribute *xen_compile_attrs[] = {
+ &compiler_attr.attr,
+ &compiled_by_attr.attr,
+ &compile_date_attr.attr,
+ NULL
+};
+
+static struct attribute_group xen_compilation_group = {
+ .name = "compilation",
+ .attrs = xen_compile_attrs,
+};
+
+static int __init xen_compilation_init(void)
+{
+ return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
+}
+
+static void xen_compilation_destroy(void)
+{
+ sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
+}
+
+/* xen properties info */
+
+static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int ret = -ENOMEM;
+ char *caps;
+
+ caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL);
+ if (caps) {
+ ret = HYPERVISOR_xen_version(XENVER_capabilities, caps);
+ if (!ret)
+ ret = sprintf(buffer, "%s\n", caps);
+ kfree(caps);
+ }
+
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(capabilities);
+
+static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int ret = -ENOMEM;
+ char *cset;
+
+ cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL);
+ if (cset) {
+ ret = HYPERVISOR_xen_version(XENVER_changeset, cset);
+ if (!ret)
+ ret = sprintf(buffer, "%s\n", cset);
+ kfree(cset);
+ }
+
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(changeset);
+
+static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int ret = -ENOMEM;
+ struct xen_platform_parameters *parms;
+
+ parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL);
+ if (parms) {
+ ret = HYPERVISOR_xen_version(XENVER_platform_parameters,
+ parms);
+ if (!ret)
+ ret = sprintf(buffer, "%lx\n", parms->virt_start);
+ kfree(parms);
+ }
+
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(virtual_start);
+
+static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ int ret;
+
+ ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL);
+ if (ret > 0)
+ ret = sprintf(buffer, "%x\n", ret);
+
+ return ret;
+}
+
+HYPERVISOR_ATTR_RO(pagesize);
+
+static ssize_t xen_feature_show(int index, char *buffer)
+{
+ ssize_t ret;
+ struct xen_feature_info info;
+
+ info.submap_idx = index;
+ ret = HYPERVISOR_xen_version(XENVER_get_features, &info);
+ if (!ret)
+ ret = sprintf(buffer, "%08x", info.submap);
+
+ return ret;
+}
+
+static ssize_t features_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+ ssize_t len;
+ int i;
+
+ len = 0;
+ for (i = XENFEAT_NR_SUBMAPS-1; i >= 0; i--) {
+ int ret = xen_feature_show(i, buffer + len);
+ if (ret < 0) {
+ if (len == 0)
+ len = ret;
+ break;
+ }
+ len += ret;
+ }
+ if (len > 0)
+ buffer[len++] = '\n';
+
+ return len;
+}
+
+HYPERVISOR_ATTR_RO(features);
+
+static struct attribute *xen_properties_attrs[] = {
+ &capabilities_attr.attr,
+ &changeset_attr.attr,
+ &virtual_start_attr.attr,
+ &pagesize_attr.attr,
+ &features_attr.attr,
+ NULL
+};
+
+static struct attribute_group xen_properties_group = {
+ .name = "properties",
+ .attrs = xen_properties_attrs,
+};
+
+static int __init xen_properties_init(void)
+{
+ return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
+}
+
+static void xen_properties_destroy(void)
+{
+ sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
+}
+
+static int __init hyper_sysfs_init(void)
+{
+ int ret;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ ret = xen_sysfs_type_init();
+ if (ret)
+ goto out;
+ ret = xen_sysfs_version_init();
+ if (ret)
+ goto version_out;
+ ret = xen_compilation_init();
+ if (ret)
+ goto comp_out;
+ ret = xen_sysfs_uuid_init();
+ if (ret)
+ goto uuid_out;
+ ret = xen_properties_init();
+ if (ret)
+ goto prop_out;
+
+ goto out;
+
+prop_out:
+ xen_sysfs_uuid_destroy();
+uuid_out:
+ xen_compilation_destroy();
+comp_out:
+ xen_sysfs_version_destroy();
+version_out:
+ xen_sysfs_type_destroy();
+out:
+ return ret;
+}
+
+static void __exit hyper_sysfs_exit(void)
+{
+ xen_properties_destroy();
+ xen_compilation_destroy();
+ xen_sysfs_uuid_destroy();
+ xen_sysfs_version_destroy();
+ xen_sysfs_type_destroy();
+}
+module_init(hyper_sysfs_init);
+module_exit(hyper_sysfs_exit);
+
+static ssize_t hyp_sysfs_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buffer)
+{
+ struct hyp_sysfs_attr *hyp_attr;
+ hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
+ if (hyp_attr->show)
+ return hyp_attr->show(hyp_attr, buffer);
+ return 0;
+}
+
+static ssize_t hyp_sysfs_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buffer,
+ size_t len)
+{
+ struct hyp_sysfs_attr *hyp_attr;
+ hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
+ if (hyp_attr->store)
+ return hyp_attr->store(hyp_attr, buffer, len);
+ return 0;
+}
+
+static struct sysfs_ops hyp_sysfs_ops = {
+ .show = hyp_sysfs_show,
+ .store = hyp_sysfs_store,
+};
+
+static struct kobj_type hyp_sysfs_kobj_type = {
+ .sysfs_ops = &hyp_sysfs_ops,
+};
+
+static int __init hypervisor_subsys_init(void)
+{
+ if (!xen_domain())
+ return -ENODEV;
+
+ hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
+ return 0;
+}
+device_initcall(hypervisor_subsys_init);
diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile
new file mode 100644
index 00000000000..5571f5b8422
--- /dev/null
+++ b/drivers/xen/xenbus/Makefile
@@ -0,0 +1,7 @@
+obj-y += xenbus.o
+
+xenbus-objs =
+xenbus-objs += xenbus_client.o
+xenbus-objs += xenbus_comms.o
+xenbus-objs += xenbus_xs.o
+xenbus-objs += xenbus_probe.o
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
new file mode 100644
index 00000000000..92a1ef80a28
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -0,0 +1,568 @@
+/******************************************************************************
+ * Client-facing interface for the Xenbus driver. In other words, the
+ * interface between the Xenbus and the device-specific code, be it the
+ * frontend or the backend of that driver.
+ *
+ * Copyright (C) 2005 XenSource Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+#include <xen/events.h>
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+
+const char *xenbus_strstate(enum xenbus_state state)
+{
+ static const char *const name[] = {
+ [ XenbusStateUnknown ] = "Unknown",
+ [ XenbusStateInitialising ] = "Initialising",
+ [ XenbusStateInitWait ] = "InitWait",
+ [ XenbusStateInitialised ] = "Initialised",
+ [ XenbusStateConnected ] = "Connected",
+ [ XenbusStateClosing ] = "Closing",
+ [ XenbusStateClosed ] = "Closed",
+ };
+ return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
+}
+EXPORT_SYMBOL_GPL(xenbus_strstate);
+
+/**
+ * xenbus_watch_path - register a watch
+ * @dev: xenbus device
+ * @path: path to watch
+ * @watch: watch to register
+ * @callback: callback to register
+ *
+ * Register a @watch on the given path, using the given xenbus_watch structure
+ * for storage, and the given @callback function as the callback. Return 0 on
+ * success, or -errno on error. On success, the given @path will be saved as
+ * @watch->node, and remains the caller's to free. On error, @watch->node will
+ * be NULL, the device will switch to %XenbusStateClosing, and the error will
+ * be saved in the store.
+ */
+int xenbus_watch_path(struct xenbus_device *dev, const char *path,
+ struct xenbus_watch *watch,
+ void (*callback)(struct xenbus_watch *,
+ const char **, unsigned int))
+{
+ int err;
+
+ watch->node = path;
+ watch->callback = callback;
+
+ err = register_xenbus_watch(watch);
+
+ if (err) {
+ watch->node = NULL;
+ watch->callback = NULL;
+ xenbus_dev_fatal(dev, err, "adding watch on %s", path);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_watch_path);
+
+
+/**
+ * xenbus_watch_pathfmt - register a watch on a sprintf-formatted path
+ * @dev: xenbus device
+ * @watch: watch to register
+ * @callback: callback to register
+ * @pathfmt: format of path to watch
+ *
+ * Register a watch on the path constructed from @pathfmt and its arguments,
+ * using the given xenbus_watch structure for storage, and the given @callback
+ * function as the callback. Return 0 on success, or -errno on error. On
+ * success, the watched path will be saved as @watch->node, and becomes the
+ * caller's to kfree(). On error, @watch->node will be NULL, so the caller has
+ * nothing to free; the device will switch to %XenbusStateClosing, and the
+ * error will be saved in the store.
+ */
+int xenbus_watch_pathfmt(struct xenbus_device *dev,
+ struct xenbus_watch *watch,
+ void (*callback)(struct xenbus_watch *,
+ const char **, unsigned int),
+ const char *pathfmt, ...)
+{
+ int err;
+ va_list ap;
+ char *path;
+
+ va_start(ap, pathfmt);
+ path = kvasprintf(GFP_NOIO | __GFP_HIGH, pathfmt, ap);
+ va_end(ap);
+
+ if (!path) {
+ xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
+ return -ENOMEM;
+ }
+ err = xenbus_watch_path(dev, path, watch, callback);
+
+ if (err)
+ kfree(path);
+ return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt);
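xenbus_watch_path()/xenbus_watch_pathfmt() are how one end of a split driver observes the other. Below is an illustrative (not normative) frontend fragment that watches the backend's "state" node; the function names are hypothetical, and dev->otherend is the peer's xenstore directory:

static void example_otherend_changed(struct xenbus_watch *watch,
				     const char **vec, unsigned int len)
{
	/* vec[XS_WATCH_PATH] names the node that fired; re-read state here. */
}

static int example_setup_watch(struct xenbus_device *dev,
			       struct xenbus_watch *watch)
{
	return xenbus_watch_pathfmt(dev, watch, example_otherend_changed,
				    "%s/%s", dev->otherend, "state");
}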
+
+
+/**
+ * xenbus_switch_state
+ * @dev: xenbus device
+ * @state: new state
+ *
+ * Advertise in the store a change of the given driver to the given new_state.
+ * Return 0 on success, or -errno on error. On error, the device will switch
+ * to XenbusStateClosing, and the error will be saved in the store.
+ */
+int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
+{
+ /* We check whether the state is currently set to the given value, and
+ if not, then the state is set. We don't want to unconditionally
+ write the given state, because we don't want to fire watches
+ unnecessarily. Furthermore, if the node has gone, we don't write
+ to it, as the device will be tearing down, and we don't want to
+ resurrect that directory.
+
+ Note that, because of this cached value of our state, this function
+ will not work inside a Xenstore transaction (something it attempted
+ to do in the past) because dev->state would not get reset if the
+ transaction was aborted.
+ */
+
+ int current_state;
+ int err;
+
+ if (state == dev->state)
+ return 0;
+
+ err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d",
+ &current_state);
+ if (err != 1)
+ return 0;
+
+ err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state);
+ if (err) {
+ if (state != XenbusStateClosing) /* Avoid looping */
+ xenbus_dev_fatal(dev, err, "writing new state");
+ return err;
+ }
+
+ dev->state = state;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_switch_state);
+
+int xenbus_frontend_closed(struct xenbus_device *dev)
+{
+ xenbus_switch_state(dev, XenbusStateClosed);
+ complete(&dev->down);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_frontend_closed);
+
+/**
+ * Return the path to the error node for the given device, or NULL on failure.
+ * If the value returned is non-NULL, then it is the caller's to kfree.
+ */
+static char *error_path(struct xenbus_device *dev)
+{
+ return kasprintf(GFP_KERNEL, "error/%s", dev->nodename);
+}
+
+
+static void xenbus_va_dev_error(struct xenbus_device *dev, int err,
+ const char *fmt, va_list ap)
+{
+ int ret;
+ unsigned int len;
+ char *printf_buffer = NULL;
+ char *path_buffer = NULL;
+
+#define PRINTF_BUFFER_SIZE 4096
+ printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
+ if (printf_buffer == NULL)
+ goto fail;
+
+ len = sprintf(printf_buffer, "%i ", -err);
+ ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
+
+ BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1);
+
+ dev_err(&dev->dev, "%s\n", printf_buffer);
+
+ path_buffer = error_path(dev);
+
+ if (path_buffer == NULL) {
+ dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
+ dev->nodename, printf_buffer);
+ goto fail;
+ }
+
+ if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) {
+ dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
+ dev->nodename, printf_buffer);
+ goto fail;
+ }
+
+fail:
+ kfree(printf_buffer);
+ kfree(path_buffer);
+}
+
+
+/**
+ * xenbus_dev_error
+ * @dev: xenbus device
+ * @err: error to report
+ * @fmt: error message format
+ *
+ * Report the given negative errno into the store, along with the given
+ * formatted message.
+ */
+void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ xenbus_va_dev_error(dev, err, fmt, ap);
+ va_end(ap);
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_error);
+
+/**
+ * xenbus_dev_fatal
+ * @dev: xenbus device
+ * @err: error to report
+ * @fmt: error message format
+ *
+ * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
+ * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
+ * closedown of this driver and its peer.
+ */
+
+void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ xenbus_va_dev_error(dev, err, fmt, ap);
+ va_end(ap);
+
+ xenbus_switch_state(dev, XenbusStateClosing);
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
+
+/**
+ * xenbus_grant_ring
+ * @dev: xenbus device
+ * @ring_mfn: mfn of ring to grant
+ *
+ * Grant access to the given @ring_mfn to the peer of the given device. Return
+ * 0 on success, or -errno on error. On error, the device will switch to
+ * XenbusStateClosing, and the error will be saved in the store.
+ */
+int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
+{
+ int err = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0);
+ if (err < 0)
+ xenbus_dev_fatal(dev, err, "granting access to ring page");
+ return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_grant_ring);
+
+
+/**
+ * Allocate an event channel for the given xenbus_device, assigning the newly
+ * created local port to *port. Return 0 on success, or -errno on error. On
+ * error, the device will switch to XenbusStateClosing, and the error will be
+ * saved in the store.
+ */
+int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port)
+{
+ struct evtchn_alloc_unbound alloc_unbound;
+ int err;
+
+ alloc_unbound.dom = DOMID_SELF;
+ alloc_unbound.remote_dom = dev->otherend_id;
+
+ err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+ &alloc_unbound);
+ if (err)
+ xenbus_dev_fatal(dev, err, "allocating event channel");
+ else
+ *port = alloc_unbound.port;
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
+
+
+/**
+ * Bind to an existing interdomain event channel in another domain. Returns 0
+ * on success and stores the local port in *port. On error, returns -errno,
+ * switches the device to XenbusStateClosing, and saves the error in XenStore.
+ */
+int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port)
+{
+ struct evtchn_bind_interdomain bind_interdomain;
+ int err;
+
+ bind_interdomain.remote_dom = dev->otherend_id;
+ bind_interdomain.remote_port = remote_port;
+
+ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+ &bind_interdomain);
+ if (err)
+ xenbus_dev_fatal(dev, err,
+ "binding to event channel %d from domain %d",
+ remote_port, dev->otherend_id);
+ else
+ *port = bind_interdomain.local_port;
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_bind_evtchn);
+
+
+/**
+ * Free an existing event channel. Returns 0 on success or -errno on error.
+ */
+int xenbus_free_evtchn(struct xenbus_device *dev, int port)
+{
+ struct evtchn_close close;
+ int err;
+
+ close.port = port;
+
+ err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
+ if (err)
+ xenbus_dev_error(dev, err, "freeing event channel %d", port);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
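A typical frontend pairs xenbus_grant_ring() with xenbus_alloc_evtchn() and then publishes both values for the backend, normally inside a transaction. The following sketch is illustrative only; the xenstore key names are conventional, and a production driver would also unwind the grant and event channel on failure and retry when the transaction ends with -EAGAIN:

static int example_publish_ring(struct xenbus_device *dev,
				unsigned long ring_mfn)
{
	struct xenbus_transaction xbt;
	int ring_ref, evtchn, err;

	ring_ref = xenbus_grant_ring(dev, ring_mfn);
	if (ring_ref < 0)
		return ring_ref;

	err = xenbus_alloc_evtchn(dev, &evtchn);
	if (err)
		return err;

	err = xenbus_transaction_start(&xbt);
	if (err)
		return err;

	err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%d", ring_ref);
	if (!err)
		err = xenbus_printf(xbt, dev->nodename, "event-channel",
				    "%d", evtchn);

	if (err) {
		xenbus_transaction_end(xbt, 1);	/* abort on write failure */
		return err;
	}
	return xenbus_transaction_end(xbt, 0);
}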
+
+
+/**
+ * xenbus_map_ring_valloc
+ * @dev: xenbus device
+ * @gnt_ref: grant reference
+ * @vaddr: pointer to address to be filled out by mapping
+ *
+ * Based on Rusty Russell's skeleton driver's map_page.
+ * Map a page of memory into this domain from another domain's grant table.
+ * xenbus_map_ring_valloc allocates a page of virtual address space, maps the
+ * page to that address, and sets *vaddr to that address.
+ * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
+ * or -ENOMEM on error. If an error is returned, the device will switch to
+ * XenbusStateClosing and the error message will be saved in XenStore.
+ */
+int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
+{
+ struct gnttab_map_grant_ref op = {
+ .flags = GNTMAP_host_map,
+ .ref = gnt_ref,
+ .dom = dev->otherend_id,
+ };
+ struct vm_struct *area;
+
+ *vaddr = NULL;
+
+ area = xen_alloc_vm_area(PAGE_SIZE);
+ if (!area)
+ return -ENOMEM;
+
+ op.host_addr = (unsigned long)area->addr;
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status != GNTST_okay) {
+ xen_free_vm_area(area);
+ xenbus_dev_fatal(dev, op.status,
+ "mapping in shared page %d from domain %d",
+ gnt_ref, dev->otherend_id);
+ return op.status;
+ }
+
+ /* Stuff the handle in an unused field */
+ area->phys_addr = (unsigned long)op.handle;
+
+ *vaddr = area->addr;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
+
+
+/**
+ * xenbus_map_ring
+ * @dev: xenbus device
+ * @gnt_ref: grant reference
+ * @handle: pointer to grant handle to be filled
+ * @vaddr: address to be mapped to
+ *
+ * Map a page of memory into this domain from another domain's grant table.
+ * xenbus_map_ring does not allocate the virtual address space (you must do
+ * this yourself!). It only maps in the page to the specified address.
+ * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
+ * or -ENOMEM on error. If an error is returned, the device will switch to
+ * XenbusStateClosing and the error message will be saved in XenStore.
+ */
+int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
+ grant_handle_t *handle, void *vaddr)
+{
+ struct gnttab_map_grant_ref op = {
+ .host_addr = (unsigned long)vaddr,
+ .flags = GNTMAP_host_map,
+ .ref = gnt_ref,
+ .dom = dev->otherend_id,
+ };
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status != GNTST_okay) {
+ xenbus_dev_fatal(dev, op.status,
+ "mapping in shared page %d from domain %d",
+ gnt_ref, dev->otherend_id);
+ } else
+ *handle = op.handle;
+
+ return op.status;
+}
+EXPORT_SYMBOL_GPL(xenbus_map_ring);
+
+
+/**
+ * xenbus_unmap_ring_vfree
+ * @dev: xenbus device
+ * @vaddr: addr to unmap
+ *
+ * Based on Rusty Russell's skeleton driver's unmap_page.
+ * Unmap a page of memory in this domain that was imported from another domain.
+ * Use xenbus_unmap_ring_vfree if you mapped in your memory with
+ * xenbus_map_ring_valloc (it will free the virtual address space).
+ * Returns 0 on success and returns GNTST_* on error
+ * (see xen/include/interface/grant_table.h).
+ */
+int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
+{
+ struct vm_struct *area;
+ struct gnttab_unmap_grant_ref op = {
+ .host_addr = (unsigned long)vaddr,
+ };
+
+ /* It'd be nice if linux/vmalloc.h provided a find_vm_area(void *addr)
+ * method so that we don't have to muck with vmalloc internals here.
+ * We could force the user to hang on to their struct vm_struct from
+ * xenbus_map_ring_valloc, but these 6 lines considerably simplify
+ * this API.
+ */
+ read_lock(&vmlist_lock);
+ for (area = vmlist; area != NULL; area = area->next) {
+ if (area->addr == vaddr)
+ break;
+ }
+ read_unlock(&vmlist_lock);
+
+ if (!area) {
+ xenbus_dev_error(dev, -ENOENT,
+ "can't find mapped virtual address %p", vaddr);
+ return GNTST_bad_virt_addr;
+ }
+
+ op.handle = (grant_handle_t)area->phys_addr;
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status == GNTST_okay)
+ xen_free_vm_area(area);
+ else
+ xenbus_dev_error(dev, op.status,
+ "unmapping page at handle %d error %d",
+ (int16_t)area->phys_addr, op.status);
+
+ return op.status;
+}
+EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
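On the backend side, xenbus_map_ring_valloc() and xenbus_unmap_ring_vfree() perform the matching import and teardown of the frontend's ring page. An illustrative sketch with a hypothetical helper name:

static int example_map_frontend_ring(struct xenbus_device *dev, int ring_ref)
{
	void *ring;
	int err;

	err = xenbus_map_ring_valloc(dev, ring_ref, &ring);
	if (err)
		return err;	/* the device was already switched to Closing */

	/* ... attach the shared ring structure living at 'ring' ... */

	return xenbus_unmap_ring_vfree(dev, ring);
}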
+
+
+/**
+ * xenbus_unmap_ring
+ * @dev: xenbus device
+ * @handle: grant handle
+ * @vaddr: addr to unmap
+ *
+ * Unmap a page of memory in this domain that was imported from another domain.
+ * Returns 0 on success and returns GNTST_* on error
+ * (see xen/include/interface/grant_table.h).
+ */
+int xenbus_unmap_ring(struct xenbus_device *dev,
+ grant_handle_t handle, void *vaddr)
+{
+ struct gnttab_unmap_grant_ref op = {
+ .host_addr = (unsigned long)vaddr,
+ .handle = handle,
+ };
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status != GNTST_okay)
+ xenbus_dev_error(dev, op.status,
+ "unmapping page at handle %d error %d",
+ handle, op.status);
+
+ return op.status;
+}
+EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
+
+
+/**
+ * xenbus_read_driver_state
+ * @path: path for driver
+ *
+ * Return the state of the driver rooted at the given store path, or
+ * XenbusStateUnknown if no state can be read.
+ */
+enum xenbus_state xenbus_read_driver_state(const char *path)
+{
+ enum xenbus_state result;
+ int err = xenbus_gather(XBT_NIL, path, "state", "%d", &result, NULL);
+ if (err)
+ result = XenbusStateUnknown;
+
+ return result;
+}
+EXPORT_SYMBOL_GPL(xenbus_read_driver_state);
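xenbus_read_driver_state() is usually given the peer's directory, for example to decide whether the other end has finished closing. A one-line illustrative helper (hypothetical name):

static int example_otherend_closed(struct xenbus_device *dev)
{
	return xenbus_read_driver_state(dev->otherend) == XenbusStateClosed;
}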
diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c
new file mode 100644
index 00000000000..090c61ee8fd
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_comms.c
@@ -0,0 +1,234 @@
+/******************************************************************************
+ * xenbus_comms.c
+ *
+ * Low level code to talk to Xen Store: ringbuffer and event channel.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/wait.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <xen/xenbus.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/events.h>
+#include <xen/page.h>
+#include "xenbus_comms.h"
+
+static int xenbus_irq;
+
+static DECLARE_WORK(probe_work, xenbus_probe);
+
+static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
+
+static irqreturn_t wake_waiting(int irq, void *unused)
+{
+ if (unlikely(xenstored_ready == 0)) {
+ xenstored_ready = 1;
+ schedule_work(&probe_work);
+ }
+
+ wake_up(&xb_waitq);
+ return IRQ_HANDLED;
+}
+
+static int check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
+{
+ return ((prod - cons) <= XENSTORE_RING_SIZE);
+}
+
+static void *get_output_chunk(XENSTORE_RING_IDX cons,
+ XENSTORE_RING_IDX prod,
+ char *buf, uint32_t *len)
+{
+ *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
+ if ((XENSTORE_RING_SIZE - (prod - cons)) < *len)
+ *len = XENSTORE_RING_SIZE - (prod - cons);
+ return buf + MASK_XENSTORE_IDX(prod);
+}
+
+static const void *get_input_chunk(XENSTORE_RING_IDX cons,
+ XENSTORE_RING_IDX prod,
+ const char *buf, uint32_t *len)
+{
+ *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);
+ if ((prod - cons) < *len)
+ *len = prod - cons;
+ return buf + MASK_XENSTORE_IDX(cons);
+}
+
+/**
+ * xb_write - low level write
+ * @data: buffer to send
+ * @len: length of buffer
+ *
+ * Returns 0 on success, error otherwise.
+ */
+int xb_write(const void *data, unsigned len)
+{
+ struct xenstore_domain_interface *intf = xen_store_interface;
+ XENSTORE_RING_IDX cons, prod;
+ int rc;
+
+ while (len != 0) {
+ void *dst;
+ unsigned int avail;
+
+ rc = wait_event_interruptible(
+ xb_waitq,
+ (intf->req_prod - intf->req_cons) !=
+ XENSTORE_RING_SIZE);
+ if (rc < 0)
+ return rc;
+
+ /* Read indexes, then verify. */
+ cons = intf->req_cons;
+ prod = intf->req_prod;
+ if (!check_indexes(cons, prod)) {
+ intf->req_cons = intf->req_prod = 0;
+ return -EIO;
+ }
+
+ dst = get_output_chunk(cons, prod, intf->req, &avail);
+ if (avail == 0)
+ continue;
+ if (avail > len)
+ avail = len;
+
+ /* Must write data /after/ reading the consumer index. */
+ mb();
+
+ memcpy(dst, data, avail);
+ data += avail;
+ len -= avail;
+
+ /* Other side must not see new producer until data is there. */
+ wmb();
+ intf->req_prod += avail;
+
+ /* Implies mb(): other side will see the updated producer. */
+ notify_remote_via_evtchn(xen_store_evtchn);
+ }
+
+ return 0;
+}
+
+int xb_data_to_read(void)
+{
+ struct xenstore_domain_interface *intf = xen_store_interface;
+ return (intf->rsp_cons != intf->rsp_prod);
+}
+
+int xb_wait_for_data_to_read(void)
+{
+ return wait_event_interruptible(xb_waitq, xb_data_to_read());
+}
+
+int xb_read(void *data, unsigned len)
+{
+ struct xenstore_domain_interface *intf = xen_store_interface;
+ XENSTORE_RING_IDX cons, prod;
+ int rc;
+
+ while (len != 0) {
+ unsigned int avail;
+ const char *src;
+
+ rc = xb_wait_for_data_to_read();
+ if (rc < 0)
+ return rc;
+
+ /* Read indexes, then verify. */
+ cons = intf->rsp_cons;
+ prod = intf->rsp_prod;
+ if (!check_indexes(cons, prod)) {
+ intf->rsp_cons = intf->rsp_prod = 0;
+ return -EIO;
+ }
+
+ src = get_input_chunk(cons, prod, intf->rsp, &avail);
+ if (avail == 0)
+ continue;
+ if (avail > len)
+ avail = len;
+
+ /* Must read data /after/ reading the producer index. */
+ rmb();
+
+ memcpy(data, src, avail);
+ data += avail;
+ len -= avail;
+
+ /* Other side must not see free space until we've copied out */
+ mb();
+ intf->rsp_cons += avail;
+
+ pr_debug("Finished read of %i bytes (%i to go)\n", avail, len);
+
+ /* Implies mb(): other side will see the updated consumer. */
+ notify_remote_via_evtchn(xen_store_evtchn);
+ }
+
+ return 0;
+}
+
+/**
+ * xb_init_comms - Set up interrupt handler off store event channel.
+ */
+int xb_init_comms(void)
+{
+ struct xenstore_domain_interface *intf = xen_store_interface;
+
+ if (intf->req_prod != intf->req_cons)
+ printk(KERN_ERR "XENBUS request ring is not quiescent "
+ "(%08x:%08x)!\n", intf->req_cons, intf->req_prod);
+
+ if (intf->rsp_prod != intf->rsp_cons) {
+ printk(KERN_WARNING "XENBUS response ring is not quiescent "
+ "(%08x:%08x): fixing up\n",
+ intf->rsp_cons, intf->rsp_prod);
+ intf->rsp_cons = intf->rsp_prod;
+ }
+
+ if (xenbus_irq) {
+ /* Already have an irq; assume we're resuming */
+ rebind_evtchn_irq(xen_store_evtchn, xenbus_irq);
+ } else {
+ int err;
+ err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting,
+ 0, "xenbus", &xb_waitq);
+ if (err <= 0) {
+ printk(KERN_ERR "XENBUS request irq failed %i\n", err);
+ return err;
+ }
+
+ xenbus_irq = err;
+ }
+
+ return 0;
+}
diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h
new file mode 100644
index 00000000000..c21db751373
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_comms.h
@@ -0,0 +1,46 @@
+/*
+ * Private include for xenbus communications.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _XENBUS_COMMS_H
+#define _XENBUS_COMMS_H
+
+int xs_init(void);
+int xb_init_comms(void);
+
+/* Low level routines. */
+int xb_write(const void *data, unsigned len);
+int xb_read(void *data, unsigned len);
+int xb_data_to_read(void);
+int xb_wait_for_data_to_read(void);
+int xs_input_avail(void);
+extern struct xenstore_domain_interface *xen_store_interface;
+extern int xen_store_evtchn;
+
+#endif /* _XENBUS_COMMS_H */
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
new file mode 100644
index 00000000000..2f7aaa99dc4
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -0,0 +1,958 @@
+/******************************************************************************
+ * Talks to Xen Store to figure out what devices we have.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
+ * Copyright (C) 2005, 2006 XenSource Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define DPRINTK(fmt, args...) \
+ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
+ __func__, __LINE__, ##args)
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/io.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/page.h>
+
+#include "xenbus_comms.h"
+#include "xenbus_probe.h"
+
+
+int xen_store_evtchn;
+EXPORT_SYMBOL(xen_store_evtchn);
+
+struct xenstore_domain_interface *xen_store_interface;
+static unsigned long xen_store_mfn;
+
+static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
+
+static void wait_for_devices(struct xenbus_driver *xendrv);
+
+static int xenbus_probe_frontend(const char *type, const char *name);
+
+static void xenbus_dev_shutdown(struct device *_dev);
+
+static int xenbus_dev_suspend(struct device *dev, pm_message_t state);
+static int xenbus_dev_resume(struct device *dev);
+
+/* If something in array of ids matches this device, return it. */
+static const struct xenbus_device_id *
+match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
+{
+ for (; *arr->devicetype != '\0'; arr++) {
+ if (!strcmp(arr->devicetype, dev->devicetype))
+ return arr;
+ }
+ return NULL;
+}
+
+int xenbus_match(struct device *_dev, struct device_driver *_drv)
+{
+ struct xenbus_driver *drv = to_xenbus_driver(_drv);
+
+ if (!drv->ids)
+ return 0;
+
+ return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
+}
+
+static int xenbus_uevent(struct device *_dev, struct kobj_uevent_env *env)
+{
+ struct xenbus_device *dev = to_xenbus_device(_dev);
+
+ if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype))
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* device/<type>/<id> => <type>-<id> */
+static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
+{
+ nodename = strchr(nodename, '/');
+ if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) {
+ printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
+ return -EINVAL;
+ }
+
+ strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE);
+ if (!strchr(bus_id, '/')) {
+ printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
+ return -EINVAL;
+ }
+ *strchr(bus_id, '/') = '-';
+ return 0;
+}
+
+
+static void free_otherend_details(struct xenbus_device *dev)
+{
+ kfree(dev->otherend);
+ dev->otherend = NULL;
+}
+
+
+static void free_otherend_watch(struct xenbus_device *dev)
+{
+ if (dev->otherend_watch.node) {
+ unregister_xenbus_watch(&dev->otherend_watch);
+ kfree(dev->otherend_watch.node);
+ dev->otherend_watch.node = NULL;
+ }
+}
+
+
+int read_otherend_details(struct xenbus_device *xendev,
+ char *id_node, char *path_node)
+{
+ int err = xenbus_gather(XBT_NIL, xendev->nodename,
+ id_node, "%i", &xendev->otherend_id,
+ path_node, NULL, &xendev->otherend,
+ NULL);
+ if (err) {
+ xenbus_dev_fatal(xendev, err,
+ "reading other end details from %s",
+ xendev->nodename);
+ return err;
+ }
+ if (strlen(xendev->otherend) == 0 ||
+ !xenbus_exists(XBT_NIL, xendev->otherend, "")) {
+ xenbus_dev_fatal(xendev, -ENOENT,
+ "unable to read other end from %s. "
+ "missing or inaccessible.",
+ xendev->nodename);
+ free_otherend_details(xendev);
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+
+static int read_backend_details(struct xenbus_device *xendev)
+{
+ return read_otherend_details(xendev, "backend-id", "backend");
+}
+
+static struct device_attribute xenbus_dev_attrs[] = {
+ __ATTR_NULL
+};
+
+/* Bus type for frontend drivers. */
+static struct xen_bus_type xenbus_frontend = {
+ .root = "device",
+ .levels = 2, /* device/type/<id> */
+ .get_bus_id = frontend_bus_id,
+ .probe = xenbus_probe_frontend,
+ .bus = {
+ .name = "xen",
+ .match = xenbus_match,
+ .uevent = xenbus_uevent,
+ .probe = xenbus_dev_probe,
+ .remove = xenbus_dev_remove,
+ .shutdown = xenbus_dev_shutdown,
+ .dev_attrs = xenbus_dev_attrs,
+
+ .suspend = xenbus_dev_suspend,
+ .resume = xenbus_dev_resume,
+ },
+};
+
+static void otherend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ struct xenbus_device *dev =
+ container_of(watch, struct xenbus_device, otherend_watch);
+ struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
+ enum xenbus_state state;
+
+ /* Protect us against watches firing on old details when the otherend
+ details change, say immediately after a resume. */
+ if (!dev->otherend ||
+ strncmp(dev->otherend, vec[XS_WATCH_PATH],
+ strlen(dev->otherend))) {
+ dev_dbg(&dev->dev, "Ignoring watch at %s\n",
+ vec[XS_WATCH_PATH]);
+ return;
+ }
+
+ state = xenbus_read_driver_state(dev->otherend);
+
+ dev_dbg(&dev->dev, "state is %d, (%s), %s, %s\n",
+ state, xenbus_strstate(state), dev->otherend_watch.node,
+ vec[XS_WATCH_PATH]);
+
+ /*
+ * Ignore xenbus transitions during shutdown. This prevents us doing
+ * work that can fail e.g., when the rootfs is gone.
+ */
+ if (system_state > SYSTEM_RUNNING) {
+ struct xen_bus_type *bus =
+ container_of(dev->dev.bus, struct xen_bus_type, bus);
+ /* If we're frontend, drive the state machine to Closed. */
+ /* This should cause the backend to release our resources. */
+ if ((bus == &xenbus_frontend) && (state == XenbusStateClosing))
+ xenbus_frontend_closed(dev);
+ return;
+ }
+
+ if (drv->otherend_changed)
+ drv->otherend_changed(dev, state);
+}
+
+
+static int talk_to_otherend(struct xenbus_device *dev)
+{
+ struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
+
+ free_otherend_watch(dev);
+ free_otherend_details(dev);
+
+ return drv->read_otherend_details(dev);
+}
+
+
+static int watch_otherend(struct xenbus_device *dev)
+{
+ return xenbus_watch_pathfmt(dev, &dev->otherend_watch, otherend_changed,
+ "%s/%s", dev->otherend, "state");
+}
+
+
+int xenbus_dev_probe(struct device *_dev)
+{
+ struct xenbus_device *dev = to_xenbus_device(_dev);
+ struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
+ const struct xenbus_device_id *id;
+ int err;
+
+ DPRINTK("%s", dev->nodename);
+
+ if (!drv->probe) {
+ err = -ENODEV;
+ goto fail;
+ }
+
+ id = match_device(drv->ids, dev);
+ if (!id) {
+ err = -ENODEV;
+ goto fail;
+ }
+
+ err = talk_to_otherend(dev);
+ if (err) {
+ dev_warn(&dev->dev, "talk_to_otherend on %s failed.\n",
+ dev->nodename);
+ return err;
+ }
+
+ err = drv->probe(dev, id);
+ if (err)
+ goto fail;
+
+ err = watch_otherend(dev);
+ if (err) {
+ dev_warn(&dev->dev, "watch_otherend on %s failed.\n",
+ dev->nodename);
+ return err;
+ }
+
+ return 0;
+fail:
+ xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
+ xenbus_switch_state(dev, XenbusStateClosed);
+ return -ENODEV;
+}
+
+int xenbus_dev_remove(struct device *_dev)
+{
+ struct xenbus_device *dev = to_xenbus_device(_dev);
+ struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
+
+ DPRINTK("%s", dev->nodename);
+
+ free_otherend_watch(dev);
+ free_otherend_details(dev);
+
+ if (drv->remove)
+ drv->remove(dev);
+
+ xenbus_switch_state(dev, XenbusStateClosed);
+ return 0;
+}
+
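+/*
+ * On shutdown, drive a still-connected device to Closing and give the
+ * other end up to five seconds to complete the close handshake before
+ * giving up.
+ */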
+static void xenbus_dev_shutdown(struct device *_dev)
+{
+ struct xenbus_device *dev = to_xenbus_device(_dev);
+ unsigned long timeout = 5*HZ;
+
+ DPRINTK("%s", dev->nodename);
+
+ get_device(&dev->dev);
+ if (dev->state != XenbusStateConnected) {
+ printk(KERN_INFO "%s: %s: %s != Connected, skipping\n", __func__,
+ dev->nodename, xenbus_strstate(dev->state));
+ goto out;
+ }
+ xenbus_switch_state(dev, XenbusStateClosing);
+ timeout = wait_for_completion_timeout(&dev->down, timeout);
+ if (!timeout)
+ printk(KERN_INFO "%s: %s timeout closing device\n",
+ __func__, dev->nodename);
+ out:
+ put_device(&dev->dev);
+}
+
+int xenbus_register_driver_common(struct xenbus_driver *drv,
+ struct xen_bus_type *bus,
+ struct module *owner,
+ const char *mod_name)
+{
+ drv->driver.name = drv->name;
+ drv->driver.bus = &bus->bus;
+ drv->driver.owner = owner;
+ drv->driver.mod_name = mod_name;
+
+ return driver_register(&drv->driver);
+}
+
+int __xenbus_register_frontend(struct xenbus_driver *drv,
+ struct module *owner, const char *mod_name)
+{
+ int ret;
+
+ drv->read_otherend_details = read_backend_details;
+
+ ret = xenbus_register_driver_common(drv, &xenbus_frontend,
+ owner, mod_name);
+ if (ret)
+ return ret;
+
+ /* If this driver is loaded as a module wait for devices to attach. */
+ wait_for_devices(drv);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
+
+void xenbus_unregister_driver(struct xenbus_driver *drv)
+{
+ driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL_GPL(xenbus_unregister_driver);
+
+struct xb_find_info
+{
+ struct xenbus_device *dev;
+ const char *nodename;
+};
+
+static int cmp_dev(struct device *dev, void *data)
+{
+ struct xenbus_device *xendev = to_xenbus_device(dev);
+ struct xb_find_info *info = data;
+
+ if (!strcmp(xendev->nodename, info->nodename)) {
+ info->dev = xendev;
+ get_device(dev);
+ return 1;
+ }
+ return 0;
+}
+
+struct xenbus_device *xenbus_device_find(const char *nodename,
+ struct bus_type *bus)
+{
+ struct xb_find_info info = { .dev = NULL, .nodename = nodename };
+
+ bus_for_each_dev(bus, NULL, &info, cmp_dev);
+ return info.dev;
+}
+
+static int cleanup_dev(struct device *dev, void *data)
+{
+ struct xenbus_device *xendev = to_xenbus_device(dev);
+ struct xb_find_info *info = data;
+ int len = strlen(info->nodename);
+
+ DPRINTK("%s", info->nodename);
+
+ /* Match the info->nodename path, or any subdirectory of that path. */
+ if (strncmp(xendev->nodename, info->nodename, len))
+ return 0;
+
+ /* If the node name is longer, ensure it really is a subdirectory. */
+ if ((strlen(xendev->nodename) > len) && (xendev->nodename[len] != '/'))
+ return 0;
+
+ info->dev = xendev;
+ get_device(dev);
+ return 1;
+}
+
+static void xenbus_cleanup_devices(const char *path, struct bus_type *bus)
+{
+ struct xb_find_info info = { .nodename = path };
+
+ do {
+ info.dev = NULL;
+ bus_for_each_dev(bus, NULL, &info, cleanup_dev);
+ if (info.dev) {
+ device_unregister(&info.dev->dev);
+ put_device(&info.dev->dev);
+ }
+ } while (info.dev);
+}
+
+static void xenbus_dev_release(struct device *dev)
+{
+ if (dev)
+ kfree(to_xenbus_device(dev));
+}
+
+static ssize_t xendev_show_nodename(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
+}
+static DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
+
+static ssize_t xendev_show_devtype(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
+}
+static DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
+
+static ssize_t xendev_show_modalias(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype);
+}
+static DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
+
+int xenbus_probe_node(struct xen_bus_type *bus,
+ const char *type,
+ const char *nodename)
+{
+ char devname[XEN_BUS_ID_SIZE];
+ int err;
+ struct xenbus_device *xendev;
+ size_t stringlen;
+ char *tmpstring;
+
+ enum xenbus_state state = xenbus_read_driver_state(nodename);
+
+ if (state != XenbusStateInitialising) {
+ /* Device is not new, so ignore it. This can happen if a
+ device is going away after switching to Closed. */
+ return 0;
+ }
+
+ stringlen = strlen(nodename) + 1 + strlen(type) + 1;
+ xendev = kzalloc(sizeof(*xendev) + stringlen, GFP_KERNEL);
+ if (!xendev)
+ return -ENOMEM;
+
+ xendev->state = XenbusStateInitialising;
+
+ /* Copy the strings into the extra space. */
+
+ tmpstring = (char *)(xendev + 1);
+ strcpy(tmpstring, nodename);
+ xendev->nodename = tmpstring;
+
+ tmpstring += strlen(tmpstring) + 1;
+ strcpy(tmpstring, type);
+ xendev->devicetype = tmpstring;
+ init_completion(&xendev->down);
+
+ xendev->dev.bus = &bus->bus;
+ xendev->dev.release = xenbus_dev_release;
+
+ err = bus->get_bus_id(devname, xendev->nodename);
+ if (err)
+ goto fail;
+
+ dev_set_name(&xendev->dev, devname);
+
+ /* Register with generic device framework. */
+ err = device_register(&xendev->dev);
+ if (err)
+ goto fail;
+
+ err = device_create_file(&xendev->dev, &dev_attr_nodename);
+ if (err)
+ goto fail_unregister;
+
+ err = device_create_file(&xendev->dev, &dev_attr_devtype);
+ if (err)
+ goto fail_remove_nodename;
+
+ err = device_create_file(&xendev->dev, &dev_attr_modalias);
+ if (err)
+ goto fail_remove_devtype;
+
+ return 0;
+fail_remove_devtype:
+ device_remove_file(&xendev->dev, &dev_attr_devtype);
+fail_remove_nodename:
+ device_remove_file(&xendev->dev, &dev_attr_nodename);
+fail_unregister:
+ device_unregister(&xendev->dev);
+fail:
+ kfree(xendev);
+ return err;
+}
+
+/* device/<typename>/<name> */
+static int xenbus_probe_frontend(const char *type, const char *name)
+{
+ char *nodename;
+ int err;
+
+ nodename = kasprintf(GFP_KERNEL, "%s/%s/%s",
+ xenbus_frontend.root, type, name);
+ if (!nodename)
+ return -ENOMEM;
+
+ DPRINTK("%s", nodename);
+
+ err = xenbus_probe_node(&xenbus_frontend, type, nodename);
+ kfree(nodename);
+ return err;
+}
+
+static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
+{
+ int err = 0;
+ char **dir;
+ unsigned int dir_n = 0;
+ int i;
+
+ dir = xenbus_directory(XBT_NIL, bus->root, type, &dir_n);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ for (i = 0; i < dir_n; i++) {
+ err = bus->probe(type, dir[i]);
+ if (err)
+ break;
+ }
+ kfree(dir);
+ return err;
+}
+
+int xenbus_probe_devices(struct xen_bus_type *bus)
+{
+ int err = 0;
+ char **dir;
+ unsigned int i, dir_n;
+
+ dir = xenbus_directory(XBT_NIL, bus->root, "", &dir_n);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ for (i = 0; i < dir_n; i++) {
+ err = xenbus_probe_device_type(bus, dir[i]);
+ if (err)
+ break;
+ }
+ kfree(dir);
+ return err;
+}
+
+static unsigned int char_count(const char *str, char c)
+{
+ unsigned int i, ret = 0;
+
+ for (i = 0; str[i]; i++)
+ if (str[i] == c)
+ ret++;
+ return ret;
+}
+
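+/*
+ * Return the offset of the (len+1)'th occurrence of c in str, or the
+ * string length if str contains exactly len occurrences; -ERANGE
+ * otherwise.  Used to cut a node path back to its device root.
+ */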
+static int strsep_len(const char *str, char c, unsigned int len)
+{
+ unsigned int i;
+
+ for (i = 0; str[i]; i++)
+ if (str[i] == c) {
+ if (len == 0)
+ return i;
+ len--;
+ }
+ return (len == 0) ? i : -ERANGE;
+}
+
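+/*
+ * A node under the bus root changed: register a device that has just
+ * appeared, or tear down devices whose nodes have vanished.
+ */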
+void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
+{
+ int exists, rootlen;
+ struct xenbus_device *dev;
+ char type[XEN_BUS_ID_SIZE];
+ const char *p, *root;
+
+ if (char_count(node, '/') < 2)
+ return;
+
+ exists = xenbus_exists(XBT_NIL, node, "");
+ if (!exists) {
+ xenbus_cleanup_devices(node, &bus->bus);
+ return;
+ }
+
+ /* backend/<type>/... or device/<type>/... */
+ p = strchr(node, '/') + 1;
+ snprintf(type, XEN_BUS_ID_SIZE, "%.*s", (int)strcspn(p, "/"), p);
+ type[XEN_BUS_ID_SIZE-1] = '\0';
+
+ rootlen = strsep_len(node, '/', bus->levels);
+ if (rootlen < 0)
+ return;
+ root = kasprintf(GFP_KERNEL, "%.*s", rootlen, node);
+ if (!root)
+ return;
+
+ dev = xenbus_device_find(root, &bus->bus);
+ if (!dev)
+ xenbus_probe_node(bus, type, root);
+ else
+ put_device(&dev->dev);
+
+ kfree(root);
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_changed);
+
+static void frontend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ DPRINTK("");
+
+ xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
+}
+
+/* We watch for devices appearing and vanishing. */
+static struct xenbus_watch fe_watch = {
+ .node = "device",
+ .callback = frontend_changed,
+};
+
+static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
+{
+ int err = 0;
+ struct xenbus_driver *drv;
+ struct xenbus_device *xdev;
+
+ DPRINTK("");
+
+ if (dev->driver == NULL)
+ return 0;
+ drv = to_xenbus_driver(dev->driver);
+ xdev = container_of(dev, struct xenbus_device, dev);
+ if (drv->suspend)
+ err = drv->suspend(xdev, state);
+ if (err)
+ printk(KERN_WARNING
+ "xenbus: suspend %s failed: %i\n", dev_name(dev), err);
+ return 0;
+}
+
+static int xenbus_dev_resume(struct device *dev)
+{
+ int err;
+ struct xenbus_driver *drv;
+ struct xenbus_device *xdev;
+
+ DPRINTK("");
+
+ if (dev->driver == NULL)
+ return 0;
+
+ drv = to_xenbus_driver(dev->driver);
+ xdev = container_of(dev, struct xenbus_device, dev);
+
+ err = talk_to_otherend(xdev);
+ if (err) {
+ printk(KERN_WARNING
+ "xenbus: resume (talk_to_otherend) %s failed: %i\n",
+ dev_name(dev), err);
+ return err;
+ }
+
+ xdev->state = XenbusStateInitialising;
+
+ if (drv->resume) {
+ err = drv->resume(xdev);
+ if (err) {
+ printk(KERN_WARNING
+ "xenbus: resume %s failed: %i\n",
+ dev_name(dev), err);
+ return err;
+ }
+ }
+
+ err = watch_otherend(xdev);
+ if (err) {
+ printk(KERN_WARNING
+ "xenbus_probe: resume (watch_otherend) %s failed: "
+ "%d.\n", dev_name(dev), err);
+ return err;
+ }
+
+ return 0;
+}
+
+/* A flag to determine if xenstored is 'ready' (i.e. has started) */
+int xenstored_ready = 0;
+
+
+int register_xenstore_notifier(struct notifier_block *nb)
+{
+ int ret = 0;
+
+ if (xenstored_ready > 0)
+ ret = nb->notifier_call(nb, 0, NULL);
+ else
+ blocking_notifier_chain_register(&xenstore_chain, nb);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_xenstore_notifier);
+
+void unregister_xenstore_notifier(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&xenstore_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
+
+void xenbus_probe(struct work_struct *unused)
+{
+ BUG_ON((xenstored_ready <= 0));
+
+ /* Enumerate devices in xenstore and watch for changes. */
+ xenbus_probe_devices(&xenbus_frontend);
+ register_xenbus_watch(&fe_watch);
+ xenbus_backend_probe_and_watch();
+
+ /* Notify others that xenstore is up */
+ blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
+}
+
+static int __init xenbus_probe_init(void)
+{
+ int err = 0;
+
+ DPRINTK("");
+
+ err = -ENODEV;
+ if (!xen_domain())
+ goto out_error;
+
+ /* Register ourselves with the kernel bus subsystem */
+ err = bus_register(&xenbus_frontend.bus);
+ if (err)
+ goto out_error;
+
+ err = xenbus_backend_bus_register();
+ if (err)
+ goto out_unreg_front;
+
+ /*
+ * Domain0 doesn't have a store_evtchn or store_mfn yet.
+ */
+ if (xen_initial_domain()) {
+ /* dom0 not yet supported */
+ } else {
+ xenstored_ready = 1;
+ xen_store_evtchn = xen_start_info->store_evtchn;
+ xen_store_mfn = xen_start_info->store_mfn;
+ }
+ xen_store_interface = mfn_to_virt(xen_store_mfn);
+
+ /* Initialize the interface to xenstore. */
+ err = xs_init();
+ if (err) {
+ printk(KERN_WARNING
+ "XENBUS: Error initializing xenstore comms: %i\n", err);
+ goto out_unreg_back;
+ }
+
+ if (!xen_initial_domain())
+ xenbus_probe(NULL);
+
+#ifdef CONFIG_XEN_COMPAT_XENFS
+ /*
+ * Create xenfs mountpoint in /proc for compatibility with
+ * utilities that expect to find "xenbus" under "/proc/xen".
+ */
+ proc_mkdir("xen", NULL);
+#endif
+
+ return 0;
+
+ out_unreg_back:
+ xenbus_backend_bus_unregister();
+
+ out_unreg_front:
+ bus_unregister(&xenbus_frontend.bus);
+
+ out_error:
+ return err;
+}
+
+postcore_initcall(xenbus_probe_init);
+
+MODULE_LICENSE("GPL");
+
+static int is_device_connecting(struct device *dev, void *data)
+{
+ struct xenbus_device *xendev = to_xenbus_device(dev);
+ struct device_driver *drv = data;
+ struct xenbus_driver *xendrv;
+
+ /*
+ * A device with no driver will never connect. We care only about
+ * devices which should currently be in the process of connecting.
+ */
+ if (!dev->driver)
+ return 0;
+
+ /* Is this search limited to a particular driver? */
+ if (drv && (dev->driver != drv))
+ return 0;
+
+ xendrv = to_xenbus_driver(dev->driver);
+ return (xendev->state < XenbusStateConnected ||
+ (xendev->state == XenbusStateConnected &&
+ xendrv->is_ready && !xendrv->is_ready(xendev)));
+}
+
+static int exists_connecting_device(struct device_driver *drv)
+{
+ return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+ is_device_connecting);
+}
+
+static int print_device_status(struct device *dev, void *data)
+{
+ struct xenbus_device *xendev = to_xenbus_device(dev);
+ struct device_driver *drv = data;
+
+ /* Is this operation limited to a particular driver? */
+ if (drv && (dev->driver != drv))
+ return 0;
+
+ if (!dev->driver) {
+ /* Information only: is this too noisy? */
+ printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
+ xendev->nodename);
+ } else if (xendev->state < XenbusStateConnected) {
+ enum xenbus_state rstate = XenbusStateUnknown;
+ if (xendev->otherend)
+ rstate = xenbus_read_driver_state(xendev->otherend);
+ printk(KERN_WARNING "XENBUS: Timeout connecting "
+ "to device: %s (local state %d, remote state %d)\n",
+ xendev->nodename, xendev->state, rstate);
+ }
+
+ return 0;
+}
+
+/* We only wait for device setup after most initcalls have run. */
+static int ready_to_wait_for_devices;
+
+/*
+ * Wait, with a 5-minute timeout, for all currently configured devices.  We
+ * need to do this to guarantee that the filesystems and/or network devices
+ * needed for boot are available before we allow the boot to proceed.
+ *
+ * This needs to be on a late_initcall, to happen after the frontend device
+ * drivers have been initialised, but before the root fs is mounted.
+ *
+ * A possible improvement here would be to have the tools add a per-device
+ * flag to the store entry, indicating whether it is needed at boot time.
+ * This would allow people who knew what they were doing to accelerate their
+ * boot slightly, but of course needs tools or manual intervention to set up
+ * those flags correctly.
+ */
+static void wait_for_devices(struct xenbus_driver *xendrv)
+{
+ unsigned long start = jiffies;
+ struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
+ unsigned int seconds_waited = 0;
+
+ if (!ready_to_wait_for_devices || !xen_domain())
+ return;
+
+ while (exists_connecting_device(drv)) {
+ if (time_after(jiffies, start + (seconds_waited+5)*HZ)) {
+ if (!seconds_waited)
+ printk(KERN_WARNING "XENBUS: Waiting for "
+ "devices to initialise: ");
+ seconds_waited += 5;
+ printk("%us...", 300 - seconds_waited);
+ if (seconds_waited == 300)
+ break;
+ }
+
+ schedule_timeout_interruptible(HZ/10);
+ }
+
+ if (seconds_waited)
+ printk("\n");
+
+ bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+ print_device_status);
+}
+
+#ifndef MODULE
+static int __init boot_wait_for_devices(void)
+{
+ ready_to_wait_for_devices = 1;
+ wait_for_devices(NULL);
+ return 0;
+}
+
+late_initcall(boot_wait_for_devices);
+#endif
diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h
new file mode 100644
index 00000000000..6c5e3185a6a
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe.h
@@ -0,0 +1,76 @@
+/******************************************************************************
+ * xenbus_probe.h
+ *
+ * Talks to Xen Store to figure out what devices we have.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 XenSource Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _XENBUS_PROBE_H
+#define _XENBUS_PROBE_H
+
+#define XEN_BUS_ID_SIZE 20
+
+#ifdef CONFIG_XEN_BACKEND
+extern void xenbus_backend_suspend(int (*fn)(struct device *, void *));
+extern void xenbus_backend_resume(int (*fn)(struct device *, void *));
+extern void xenbus_backend_probe_and_watch(void);
+extern int xenbus_backend_bus_register(void);
+extern void xenbus_backend_bus_unregister(void);
+#else
+static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {}
+static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {}
+static inline void xenbus_backend_probe_and_watch(void) {}
+static inline int xenbus_backend_bus_register(void) { return 0; }
+static inline void xenbus_backend_bus_unregister(void) {}
+#endif
+
+struct xen_bus_type
+{
+ char *root;
+ unsigned int levels;
+ int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename);
+ int (*probe)(const char *type, const char *dir);
+ struct bus_type bus;
+};
+
+extern int xenbus_match(struct device *_dev, struct device_driver *_drv);
+extern int xenbus_dev_probe(struct device *_dev);
+extern int xenbus_dev_remove(struct device *_dev);
+extern int xenbus_register_driver_common(struct xenbus_driver *drv,
+ struct xen_bus_type *bus,
+ struct module *owner,
+ const char *mod_name);
+extern int xenbus_probe_node(struct xen_bus_type *bus,
+ const char *type,
+ const char *nodename);
+extern int xenbus_probe_devices(struct xen_bus_type *bus);
+
+extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus);
+
+#endif
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
new file mode 100644
index 00000000000..7b547f53f65
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -0,0 +1,864 @@
+/******************************************************************************
+ * xenbus_xs.c
+ *
+ * This is the kernel equivalent of the "xs" library. We don't need everything
+ * and we use xenbus_comms for communication.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/unistd.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/fcntl.h>
+#include <linux/kthread.h>
+#include <linux/rwsem.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <xen/xenbus.h>
+#include "xenbus_comms.h"
+
+struct xs_stored_msg {
+ struct list_head list;
+
+ struct xsd_sockmsg hdr;
+
+ union {
+ /* Queued replies. */
+ struct {
+ char *body;
+ } reply;
+
+ /* Queued watch events. */
+ struct {
+ struct xenbus_watch *handle;
+ char **vec;
+ unsigned int vec_size;
+ } watch;
+ } u;
+};
+
+struct xs_handle {
+ /* A list of replies. Currently only one will ever be outstanding. */
+ struct list_head reply_list;
+ spinlock_t reply_lock;
+ wait_queue_head_t reply_waitq;
+
+ /*
+ * Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex.
+ * response_mutex is never taken simultaneously with the other three.
+ */
+
+ /* One request at a time. */
+ struct mutex request_mutex;
+
+ /* Protect xenbus reader thread against save/restore. */
+ struct mutex response_mutex;
+
+ /* Protect transactions against save/restore. */
+ struct rw_semaphore transaction_mutex;
+
+ /* Protect watch (de)register against save/restore. */
+ struct rw_semaphore watch_mutex;
+};
+
+static struct xs_handle xs_state;
+
+/* List of registered watches, and a lock to protect it. */
+static LIST_HEAD(watches);
+static DEFINE_SPINLOCK(watches_lock);
+
+/* List of pending watch callback events, and a lock to protect it. */
+static LIST_HEAD(watch_events);
+static DEFINE_SPINLOCK(watch_events_lock);
+
+/*
+ * Details of the xenwatch callback kernel thread. The thread waits on the
+ * watch_events_waitq for work to do (queued on watch_events list). When it
+ * wakes up it acquires the xenwatch_mutex before reading the list and
+ * carrying out work.
+ */
+static pid_t xenwatch_pid;
+static DEFINE_MUTEX(xenwatch_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
+
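+/* Translate an error string from xenstored into a positive errno value. */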
+static int get_error(const char *errorstring)
+{
+ unsigned int i;
+
+ for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) {
+ if (i == ARRAY_SIZE(xsd_errors) - 1) {
+ printk(KERN_WARNING
+ "XENBUS xen store gave: unknown error %s",
+ errorstring);
+ return EINVAL;
+ }
+ }
+ return xsd_errors[i].errnum;
+}
+
+static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
+{
+ struct xs_stored_msg *msg;
+ char *body;
+
+ spin_lock(&xs_state.reply_lock);
+
+ while (list_empty(&xs_state.reply_list)) {
+ spin_unlock(&xs_state.reply_lock);
+ /* XXX FIXME: Avoid synchronous wait for response here. */
+ wait_event(xs_state.reply_waitq,
+ !list_empty(&xs_state.reply_list));
+ spin_lock(&xs_state.reply_lock);
+ }
+
+ msg = list_entry(xs_state.reply_list.next,
+ struct xs_stored_msg, list);
+ list_del(&msg->list);
+
+ spin_unlock(&xs_state.reply_lock);
+
+ *type = msg->hdr.type;
+ if (len)
+ *len = msg->hdr.len;
+ body = msg->u.reply.body;
+
+ kfree(msg);
+
+ return body;
+}
+
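+/*
+ * Send a fully-formed request (header plus payload) to xenstored and
+ * return the kmalloc'ed reply body, or ERR_PTR() on failure.  Used by
+ * the xenfs userspace interface; transaction start/end bookkeeping
+ * stops a suspend racing with an open transaction.
+ */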
+void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
+{
+ void *ret;
+ struct xsd_sockmsg req_msg = *msg;
+ int err;
+
+ if (req_msg.type == XS_TRANSACTION_START)
+ down_read(&xs_state.transaction_mutex);
+
+ mutex_lock(&xs_state.request_mutex);
+
+ err = xb_write(msg, sizeof(*msg) + msg->len);
+ if (err) {
+ msg->type = XS_ERROR;
+ ret = ERR_PTR(err);
+ } else
+ ret = read_reply(&msg->type, &msg->len);
+
+ mutex_unlock(&xs_state.request_mutex);
+
+ if ((msg->type == XS_TRANSACTION_END) ||
+ ((req_msg.type == XS_TRANSACTION_START) &&
+ (msg->type == XS_ERROR)))
+ up_read(&xs_state.transaction_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL(xenbus_dev_request_and_reply);
+
+/* Send message to xs, get kmalloc'ed reply. ERR_PTR() on error. */
+static void *xs_talkv(struct xenbus_transaction t,
+ enum xsd_sockmsg_type type,
+ const struct kvec *iovec,
+ unsigned int num_vecs,
+ unsigned int *len)
+{
+ struct xsd_sockmsg msg;
+ void *ret = NULL;
+ unsigned int i;
+ int err;
+
+ msg.tx_id = t.id;
+ msg.req_id = 0;
+ msg.type = type;
+ msg.len = 0;
+ for (i = 0; i < num_vecs; i++)
+ msg.len += iovec[i].iov_len;
+
+ mutex_lock(&xs_state.request_mutex);
+
+ err = xb_write(&msg, sizeof(msg));
+ if (err) {
+ mutex_unlock(&xs_state.request_mutex);
+ return ERR_PTR(err);
+ }
+
+ for (i = 0; i < num_vecs; i++) {
+ err = xb_write(iovec[i].iov_base, iovec[i].iov_len);
+ if (err) {
+ mutex_unlock(&xs_state.request_mutex);
+ return ERR_PTR(err);
+ }
+ }
+
+ ret = read_reply(&msg.type, len);
+
+ mutex_unlock(&xs_state.request_mutex);
+
+ if (IS_ERR(ret))
+ return ret;
+
+ if (msg.type == XS_ERROR) {
+ err = get_error(ret);
+ kfree(ret);
+ return ERR_PTR(-err);
+ }
+
+ if (msg.type != type) {
+ if (printk_ratelimit())
+ printk(KERN_WARNING
+ "XENBUS unexpected type [%d], expected [%d]\n",
+ msg.type, type);
+ kfree(ret);
+ return ERR_PTR(-EINVAL);
+ }
+ return ret;
+}
+
+/* Simplified version of xs_talkv: single message. */
+static void *xs_single(struct xenbus_transaction t,
+ enum xsd_sockmsg_type type,
+ const char *string,
+ unsigned int *len)
+{
+ struct kvec iovec;
+
+ iovec.iov_base = (void *)string;
+ iovec.iov_len = strlen(string) + 1;
+ return xs_talkv(t, type, &iovec, 1, len);
+}
+
+/* Many commands only need an ack; we don't care what it says. */
+static int xs_error(char *reply)
+{
+ if (IS_ERR(reply))
+ return PTR_ERR(reply);
+ kfree(reply);
+ return 0;
+}
+
+static unsigned int count_strings(const char *strings, unsigned int len)
+{
+ unsigned int num;
+ const char *p;
+
+ for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1)
+ num++;
+
+ return num;
+}
+
+/* Return the path to dir with /name appended. Buffer must be kfree()'ed. */
+static char *join(const char *dir, const char *name)
+{
+ char *buffer;
+
+ if (strlen(name) == 0)
+ buffer = kasprintf(GFP_NOIO | __GFP_HIGH, "%s", dir);
+ else
+ buffer = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/%s", dir, name);
+ return (!buffer) ? ERR_PTR(-ENOMEM) : buffer;
+}
+
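+/*
+ * Turn a block of NUL-separated strings, as returned by xenstored, into
+ * a single allocation: an array of pointers followed by the string data
+ * itself, so that one kfree() releases everything.
+ */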
+static char **split(char *strings, unsigned int len, unsigned int *num)
+{
+ char *p, **ret;
+
+ /* Count the strings. */
+ *num = count_strings(strings, len);
+
+ /* Transfer to one big alloc for easy freeing. */
+ ret = kmalloc(*num * sizeof(char *) + len, GFP_NOIO | __GFP_HIGH);
+ if (!ret) {
+ kfree(strings);
+ return ERR_PTR(-ENOMEM);
+ }
+ memcpy(&ret[*num], strings, len);
+ kfree(strings);
+
+ strings = (char *)&ret[*num];
+ for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1)
+ ret[(*num)++] = p;
+
+ return ret;
+}
+
+char **xenbus_directory(struct xenbus_transaction t,
+ const char *dir, const char *node, unsigned int *num)
+{
+ char *strings, *path;
+ unsigned int len;
+
+ path = join(dir, node);
+ if (IS_ERR(path))
+ return (char **)path;
+
+ strings = xs_single(t, XS_DIRECTORY, path, &len);
+ kfree(path);
+ if (IS_ERR(strings))
+ return (char **)strings;
+
+ return split(strings, len, num);
+}
+EXPORT_SYMBOL_GPL(xenbus_directory);
+
+/* Check if a path exists. Return 1 if it does. */
+int xenbus_exists(struct xenbus_transaction t,
+ const char *dir, const char *node)
+{
+ char **d;
+ int dir_n;
+
+ d = xenbus_directory(t, dir, node, &dir_n);
+ if (IS_ERR(d))
+ return 0;
+ kfree(d);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(xenbus_exists);
+
+/* Get the value of a single file.
+ * Returns a kmalloc'ed value: call kfree() on it after use.
+ * len indicates length in bytes.
+ */
+void *xenbus_read(struct xenbus_transaction t,
+ const char *dir, const char *node, unsigned int *len)
+{
+ char *path;
+ void *ret;
+
+ path = join(dir, node);
+ if (IS_ERR(path))
+ return (void *)path;
+
+ ret = xs_single(t, XS_READ, path, len);
+ kfree(path);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xenbus_read);
+
+/* Write the value of a single file.
+ * Returns -err on failure.
+ */
+int xenbus_write(struct xenbus_transaction t,
+ const char *dir, const char *node, const char *string)
+{
+ const char *path;
+ struct kvec iovec[2];
+ int ret;
+
+ path = join(dir, node);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ iovec[0].iov_base = (void *)path;
+ iovec[0].iov_len = strlen(path) + 1;
+ iovec[1].iov_base = (void *)string;
+ iovec[1].iov_len = strlen(string);
+
+ ret = xs_error(xs_talkv(t, XS_WRITE, iovec, ARRAY_SIZE(iovec), NULL));
+ kfree(path);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xenbus_write);
+
+/* Create a new directory. */
+int xenbus_mkdir(struct xenbus_transaction t,
+ const char *dir, const char *node)
+{
+ char *path;
+ int ret;
+
+ path = join(dir, node);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ ret = xs_error(xs_single(t, XS_MKDIR, path, NULL));
+ kfree(path);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xenbus_mkdir);
+
+/* Destroy a file or directory (directories must be empty). */
+int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node)
+{
+ char *path;
+ int ret;
+
+ path = join(dir, node);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ ret = xs_error(xs_single(t, XS_RM, path, NULL));
+ kfree(path);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xenbus_rm);
+
+/* Start a transaction: changes by others will not be seen during this
+ * transaction, and changes will not be visible to others until end.
+ */
+int xenbus_transaction_start(struct xenbus_transaction *t)
+{
+ char *id_str;
+
+ down_read(&xs_state.transaction_mutex);
+
+ id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL);
+ if (IS_ERR(id_str)) {
+ up_read(&xs_state.transaction_mutex);
+ return PTR_ERR(id_str);
+ }
+
+ t->id = simple_strtoul(id_str, NULL, 0);
+ kfree(id_str);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_transaction_start);
+
+/* End a transaction.
+ * If abort is true, the transaction is discarded instead of committed.
+ */
+int xenbus_transaction_end(struct xenbus_transaction t, int abort)
+{
+ char abortstr[2];
+ int err;
+
+ if (abort)
+ strcpy(abortstr, "F");
+ else
+ strcpy(abortstr, "T");
+
+ err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL));
+
+ up_read(&xs_state.transaction_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_transaction_end);
+
+/* Single read and scanf: returns -errno or num scanned. */
+int xenbus_scanf(struct xenbus_transaction t,
+ const char *dir, const char *node, const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+ char *val;
+
+ val = xenbus_read(t, dir, node, NULL);
+ if (IS_ERR(val))
+ return PTR_ERR(val);
+
+ va_start(ap, fmt);
+ ret = vsscanf(val, fmt, ap);
+ va_end(ap);
+ kfree(val);
+ /* Distinctive errno. */
+ if (ret == 0)
+ return -ERANGE;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xenbus_scanf);
+
+/* Single printf and write: returns -errno or 0. */
+int xenbus_printf(struct xenbus_transaction t,
+ const char *dir, const char *node, const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+#define PRINTF_BUFFER_SIZE 4096
+ char *printf_buffer;
+
+ printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_NOIO | __GFP_HIGH);
+ if (printf_buffer == NULL)
+ return -ENOMEM;
+
+ va_start(ap, fmt);
+ ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap);
+ va_end(ap);
+
+ BUG_ON(ret > PRINTF_BUFFER_SIZE-1);
+ ret = xenbus_write(t, dir, node, printf_buffer);
+
+ kfree(printf_buffer);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xenbus_printf);
+
+/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */
+int xenbus_gather(struct xenbus_transaction t, const char *dir, ...)
+{
+ va_list ap;
+ const char *name;
+ int ret = 0;
+
+ va_start(ap, dir);
+ while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
+ const char *fmt = va_arg(ap, char *);
+ void *result = va_arg(ap, void *);
+ char *p;
+
+ p = xenbus_read(t, dir, name, NULL);
+ if (IS_ERR(p)) {
+ ret = PTR_ERR(p);
+ break;
+ }
+ if (fmt) {
+ if (sscanf(p, fmt, result) == 0)
+ ret = -EINVAL;
+ kfree(p);
+ } else
+ *(char **)result = p;
+ }
+ va_end(ap);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xenbus_gather);
+
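+/* Ask xenstored to watch a path; the token identifies this watch in events. */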
+static int xs_watch(const char *path, const char *token)
+{
+ struct kvec iov[2];
+
+ iov[0].iov_base = (void *)path;
+ iov[0].iov_len = strlen(path) + 1;
+ iov[1].iov_base = (void *)token;
+ iov[1].iov_len = strlen(token) + 1;
+
+ return xs_error(xs_talkv(XBT_NIL, XS_WATCH, iov,
+ ARRAY_SIZE(iov), NULL));
+}
+
+static int xs_unwatch(const char *path, const char *token)
+{
+ struct kvec iov[2];
+
+ iov[0].iov_base = (char *)path;
+ iov[0].iov_len = strlen(path) + 1;
+ iov[1].iov_base = (char *)token;
+ iov[1].iov_len = strlen(token) + 1;
+
+ return xs_error(xs_talkv(XBT_NIL, XS_UNWATCH, iov,
+ ARRAY_SIZE(iov), NULL));
+}
+
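+/*
+ * The token handed to xenstored is simply the watch pointer printed in
+ * hex; parse it back to find the watch it names.
+ */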
+static struct xenbus_watch *find_watch(const char *token)
+{
+ struct xenbus_watch *i, *cmp;
+
+ cmp = (void *)simple_strtoul(token, NULL, 16);
+
+ list_for_each_entry(i, &watches, list)
+ if (i == cmp)
+ return i;
+
+ return NULL;
+}
+
+/* Register callback to watch this node. */
+int register_xenbus_watch(struct xenbus_watch *watch)
+{
+ /* Pointer in ascii is the token. */
+ char token[sizeof(watch) * 2 + 1];
+ int err;
+
+ sprintf(token, "%lX", (long)watch);
+
+ down_read(&xs_state.watch_mutex);
+
+ spin_lock(&watches_lock);
+ BUG_ON(find_watch(token));
+ list_add(&watch->list, &watches);
+ spin_unlock(&watches_lock);
+
+ err = xs_watch(watch->node, token);
+
+ /* Ignore errors due to multiple registration. */
+ if ((err != 0) && (err != -EEXIST)) {
+ spin_lock(&watches_lock);
+ list_del(&watch->list);
+ spin_unlock(&watches_lock);
+ }
+
+ up_read(&xs_state.watch_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(register_xenbus_watch);
+
+void unregister_xenbus_watch(struct xenbus_watch *watch)
+{
+ struct xs_stored_msg *msg, *tmp;
+ char token[sizeof(watch) * 2 + 1];
+ int err;
+
+ sprintf(token, "%lX", (long)watch);
+
+ down_read(&xs_state.watch_mutex);
+
+ spin_lock(&watches_lock);
+ BUG_ON(!find_watch(token));
+ list_del(&watch->list);
+ spin_unlock(&watches_lock);
+
+ err = xs_unwatch(watch->node, token);
+ if (err)
+ printk(KERN_WARNING
+ "XENBUS Failed to release watch %s: %i\n",
+ watch->node, err);
+
+ up_read(&xs_state.watch_mutex);
+
+ /* Make sure there are no callbacks currently running (unless
+ it's us). */
+ if (current->pid != xenwatch_pid)
+ mutex_lock(&xenwatch_mutex);
+
+ /* Cancel pending watch events. */
+ spin_lock(&watch_events_lock);
+ list_for_each_entry_safe(msg, tmp, &watch_events, list) {
+ if (msg->u.watch.handle != watch)
+ continue;
+ list_del(&msg->list);
+ kfree(msg->u.watch.vec);
+ kfree(msg);
+ }
+ spin_unlock(&watch_events_lock);
+
+ if (current->pid != xenwatch_pid)
+ mutex_unlock(&xenwatch_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_xenbus_watch);
+
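+/*
+ * Quiesce xenstore traffic around save/restore: take the transaction,
+ * watch, request and response locks so nothing is in flight while the
+ * domain suspends.  xs_resume() reinitialises the ring and re-registers
+ * every watch with xenstored.
+ */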
+void xs_suspend(void)
+{
+ down_write(&xs_state.transaction_mutex);
+ down_write(&xs_state.watch_mutex);
+ mutex_lock(&xs_state.request_mutex);
+ mutex_lock(&xs_state.response_mutex);
+}
+
+void xs_resume(void)
+{
+ struct xenbus_watch *watch;
+ char token[sizeof(watch) * 2 + 1];
+
+ xb_init_comms();
+
+ mutex_unlock(&xs_state.response_mutex);
+ mutex_unlock(&xs_state.request_mutex);
+ up_write(&xs_state.transaction_mutex);
+
+ /* No need for watches_lock: the watch_mutex is sufficient. */
+ list_for_each_entry(watch, &watches, list) {
+ sprintf(token, "%lX", (long)watch);
+ xs_watch(watch->node, token);
+ }
+
+ up_write(&xs_state.watch_mutex);
+}
+
+void xs_suspend_cancel(void)
+{
+ mutex_unlock(&xs_state.response_mutex);
+ mutex_unlock(&xs_state.request_mutex);
+ up_write(&xs_state.watch_mutex);
+ up_write(&xs_state.transaction_mutex);
+}
+
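+/*
+ * Kernel thread that pulls queued watch events off watch_events and
+ * runs their callbacks one at a time under xenwatch_mutex.
+ */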
+static int xenwatch_thread(void *unused)
+{
+ struct list_head *ent;
+ struct xs_stored_msg *msg;
+
+ for (;;) {
+ wait_event_interruptible(watch_events_waitq,
+ !list_empty(&watch_events));
+
+ if (kthread_should_stop())
+ break;
+
+ mutex_lock(&xenwatch_mutex);
+
+ spin_lock(&watch_events_lock);
+ ent = watch_events.next;
+ if (ent != &watch_events)
+ list_del(ent);
+ spin_unlock(&watch_events_lock);
+
+ if (ent != &watch_events) {
+ msg = list_entry(ent, struct xs_stored_msg, list);
+ msg->u.watch.handle->callback(
+ msg->u.watch.handle,
+ (const char **)msg->u.watch.vec,
+ msg->u.watch.vec_size);
+ kfree(msg->u.watch.vec);
+ kfree(msg);
+ }
+
+ mutex_unlock(&xenwatch_mutex);
+ }
+
+ return 0;
+}
+
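+/*
+ * Read one message from the ring.  Watch events are matched to their
+ * watch by token and queued for the xenwatch thread; anything else is
+ * a reply and goes on the reply list.
+ */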
+static int process_msg(void)
+{
+ struct xs_stored_msg *msg;
+ char *body;
+ int err;
+
+ /*
+ * We must disallow save/restore while reading a xenstore message.
+ * A partial read across s/r leaves us out of sync with xenstored.
+ */
+ for (;;) {
+ err = xb_wait_for_data_to_read();
+ if (err)
+ return err;
+ mutex_lock(&xs_state.response_mutex);
+ if (xb_data_to_read())
+ break;
+ /* We raced with save/restore: pending data 'disappeared'. */
+ mutex_unlock(&xs_state.response_mutex);
+ }
+
+
+ msg = kmalloc(sizeof(*msg), GFP_NOIO | __GFP_HIGH);
+ if (msg == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xb_read(&msg->hdr, sizeof(msg->hdr));
+ if (err) {
+ kfree(msg);
+ goto out;
+ }
+
+ body = kmalloc(msg->hdr.len + 1, GFP_NOIO | __GFP_HIGH);
+ if (body == NULL) {
+ kfree(msg);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xb_read(body, msg->hdr.len);
+ if (err) {
+ kfree(body);
+ kfree(msg);
+ goto out;
+ }
+ body[msg->hdr.len] = '\0';
+
+ if (msg->hdr.type == XS_WATCH_EVENT) {
+ msg->u.watch.vec = split(body, msg->hdr.len,
+ &msg->u.watch.vec_size);
+ if (IS_ERR(msg->u.watch.vec)) {
+ err = PTR_ERR(msg->u.watch.vec);
+ kfree(msg);
+ goto out;
+ }
+
+ spin_lock(&watches_lock);
+ msg->u.watch.handle = find_watch(
+ msg->u.watch.vec[XS_WATCH_TOKEN]);
+ if (msg->u.watch.handle != NULL) {
+ spin_lock(&watch_events_lock);
+ list_add_tail(&msg->list, &watch_events);
+ wake_up(&watch_events_waitq);
+ spin_unlock(&watch_events_lock);
+ } else {
+ kfree(msg->u.watch.vec);
+ kfree(msg);
+ }
+ spin_unlock(&watches_lock);
+ } else {
+ msg->u.reply.body = body;
+ spin_lock(&xs_state.reply_lock);
+ list_add_tail(&msg->list, &xs_state.reply_list);
+ spin_unlock(&xs_state.reply_lock);
+ wake_up(&xs_state.reply_waitq);
+ }
+
+ out:
+ mutex_unlock(&xs_state.response_mutex);
+ return err;
+}
+
+static int xenbus_thread(void *unused)
+{
+ int err;
+
+ for (;;) {
+ err = process_msg();
+ if (err)
+ printk(KERN_WARNING "XENBUS error %d while reading "
+ "message\n", err);
+ if (kthread_should_stop())
+ break;
+ }
+
+ return 0;
+}
+
+int xs_init(void)
+{
+ int err;
+ struct task_struct *task;
+
+ INIT_LIST_HEAD(&xs_state.reply_list);
+ spin_lock_init(&xs_state.reply_lock);
+ init_waitqueue_head(&xs_state.reply_waitq);
+
+ mutex_init(&xs_state.request_mutex);
+ mutex_init(&xs_state.response_mutex);
+ init_rwsem(&xs_state.transaction_mutex);
+ init_rwsem(&xs_state.watch_mutex);
+
+ /* Initialize the shared memory rings to talk to xenstored */
+ err = xb_init_comms();
+ if (err)
+ return err;
+
+ task = kthread_run(xenwatch_thread, NULL, "xenwatch");
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ xenwatch_pid = task->pid;
+
+ task = kthread_run(xenbus_thread, NULL, "xenbus");
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ return 0;
+}
diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c
new file mode 100644
index 00000000000..a240b2c20b9
--- /dev/null
+++ b/drivers/xen/xencomm.c
@@ -0,0 +1,217 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (C) IBM Corp. 2006
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <asm/page.h>
+#include <xen/xencomm.h>
+#include <xen/interface/xen.h>
+#include <asm/xen/xencomm.h> /* for xencomm_is_phys_contiguous() */
+
+static int xencomm_init(struct xencomm_desc *desc,
+ void *buffer, unsigned long bytes)
+{
+ unsigned long recorded = 0;
+ int i = 0;
+
+ while ((recorded < bytes) && (i < desc->nr_addrs)) {
+ unsigned long vaddr = (unsigned long)buffer + recorded;
+ unsigned long paddr;
+ int offset;
+ int chunksz;
+
+ offset = vaddr % PAGE_SIZE; /* handle partial pages */
+ chunksz = min(PAGE_SIZE - offset, bytes - recorded);
+
+ paddr = xencomm_vtop(vaddr);
+ if (paddr == ~0UL) {
+ printk(KERN_DEBUG "%s: couldn't translate vaddr %lx\n",
+ __func__, vaddr);
+ return -EINVAL;
+ }
+
+ desc->address[i++] = paddr;
+ recorded += chunksz;
+ }
+
+ if (recorded < bytes) {
+ printk(KERN_DEBUG
+ "%s: could only translate %ld of %ld bytes\n",
+ __func__, recorded, bytes);
+ return -ENOSPC;
+ }
+
+ /* mark remaining addresses invalid (just for safety) */
+ while (i < desc->nr_addrs)
+ desc->address[i++] = XENCOMM_INVALID;
+
+ desc->magic = XENCOMM_MAGIC;
+
+ return 0;
+}
+
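+/*
+ * Allocate a descriptor with room for one physical address per page
+ * spanned by the buffer.  If the descriptor header is larger than a
+ * pointer it is allocated from whole pages so it cannot itself straddle
+ * a page boundary.
+ */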
+static struct xencomm_desc *xencomm_alloc(gfp_t gfp_mask,
+ void *buffer, unsigned long bytes)
+{
+ struct xencomm_desc *desc;
+ unsigned long buffer_ulong = (unsigned long)buffer;
+ unsigned long start = buffer_ulong & PAGE_MASK;
+ unsigned long end = (buffer_ulong + bytes) | ~PAGE_MASK;
+ unsigned long nr_addrs = (end - start + 1) >> PAGE_SHIFT;
+ unsigned long size = sizeof(*desc) +
+ sizeof(desc->address[0]) * nr_addrs;
+
+ /*
+ * The slab allocator only guarantees sizeof(void *) alignment, so when
+ * sizeof(*desc) > sizeof(void *) a kmalloc'ed struct xencomm_desc might
+ * cross a page boundary; allocate whole pages in that case.
+ */
+ if (sizeof(*desc) > sizeof(void *)) {
+ unsigned long order = get_order(size);
+ desc = (struct xencomm_desc *)__get_free_pages(gfp_mask,
+ order);
+ if (desc == NULL)
+ return NULL;
+
+ desc->nr_addrs =
+ ((PAGE_SIZE << order) - sizeof(struct xencomm_desc)) /
+ sizeof(*desc->address);
+ } else {
+ desc = kmalloc(size, gfp_mask);
+ if (desc == NULL)
+ return NULL;
+
+ desc->nr_addrs = nr_addrs;
+ }
+ return desc;
+}
+
+void xencomm_free(struct xencomm_handle *desc)
+{
+ if (desc && !((ulong)desc & XENCOMM_INLINE_FLAG)) {
+ struct xencomm_desc *desc__ = (struct xencomm_desc *)desc;
+ if (sizeof(*desc__) > sizeof(void *)) {
+ unsigned long size = sizeof(*desc__) +
+ sizeof(desc__->address[0]) * desc__->nr_addrs;
+ unsigned long order = get_order(size);
+ free_pages((unsigned long)__va(desc), order);
+ } else
+ kfree(__va(desc));
+ }
+}
+
+static int xencomm_create(void *buffer, unsigned long bytes,
+ struct xencomm_desc **ret, gfp_t gfp_mask)
+{
+ struct xencomm_desc *desc;
+ int rc;
+
+ pr_debug("%s: %p[%ld]\n", __func__, buffer, bytes);
+
+ if (bytes == 0) {
+ /* don't create a descriptor; Xen recognizes NULL. */
+ BUG_ON(buffer != NULL);
+ *ret = NULL;
+ return 0;
+ }
+
+ BUG_ON(buffer == NULL); /* 'bytes' is non-zero */
+
+ desc = xencomm_alloc(gfp_mask, buffer, bytes);
+ if (!desc) {
+ printk(KERN_DEBUG "%s failure\n", "xencomm_alloc");
+ return -ENOMEM;
+ }
+
+ rc = xencomm_init(desc, buffer, bytes);
+ if (rc) {
+ printk(KERN_DEBUG "%s failure: %d\n", "xencomm_init", rc);
+ xencomm_free((struct xencomm_handle *)__pa(desc));
+ return rc;
+ }
+
+ *ret = desc;
+ return 0;
+}
+
+static struct xencomm_handle *xencomm_create_inline(void *ptr)
+{
+ unsigned long paddr;
+
+ BUG_ON(!xencomm_is_phys_contiguous((unsigned long)ptr));
+
+ paddr = (unsigned long)xencomm_pa(ptr);
+ BUG_ON(paddr & XENCOMM_INLINE_FLAG);
+ return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG);
+}
+
+/* "mini" routine, for stack-based communications: */
+static int xencomm_create_mini(void *buffer,
+ unsigned long bytes, struct xencomm_mini *xc_desc,
+ struct xencomm_desc **ret)
+{
+ int rc = 0;
+ struct xencomm_desc *desc;
+ BUG_ON(((unsigned long)xc_desc) % sizeof(*xc_desc) != 0);
+
+ desc = (void *)xc_desc;
+
+ desc->nr_addrs = XENCOMM_MINI_ADDRS;
+
+ rc = xencomm_init(desc, buffer, bytes);
+ if (!rc)
+ *ret = desc;
+
+ return rc;
+}
+
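+/*
+ * Map a buffer for use in a hypercall: physically contiguous buffers
+ * are passed inline (address tagged with XENCOMM_INLINE_FLAG); anything
+ * else gets a descriptor listing the physical address of each page.
+ */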
+struct xencomm_handle *xencomm_map(void *ptr, unsigned long bytes)
+{
+ int rc;
+ struct xencomm_desc *desc;
+
+ if (xencomm_is_phys_contiguous((unsigned long)ptr))
+ return xencomm_create_inline(ptr);
+
+ rc = xencomm_create(ptr, bytes, &desc, GFP_KERNEL);
+
+ if (rc || desc == NULL)
+ return NULL;
+
+ return xencomm_pa(desc);
+}
+
+struct xencomm_handle *__xencomm_map_no_alloc(void *ptr, unsigned long bytes,
+ struct xencomm_mini *xc_desc)
+{
+ int rc;
+ struct xencomm_desc *desc = NULL;
+
+ if (xencomm_is_phys_contiguous((unsigned long)ptr))
+ return xencomm_create_inline(ptr);
+
+ rc = xencomm_create_mini(ptr, bytes, xc_desc,
+ &desc);
+
+ if (rc)
+ return NULL;
+
+ return xencomm_pa(desc);
+}
diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile
new file mode 100644
index 00000000000..25275c3bbdf
--- /dev/null
+++ b/drivers/xen/xenfs/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_XENFS) += xenfs.o
+
+xenfs-objs = super.o xenbus.o \ No newline at end of file
diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
new file mode 100644
index 00000000000..8924d93136f
--- /dev/null
+++ b/drivers/xen/xenfs/super.c
@@ -0,0 +1,83 @@
+/*
+ * xenfs.c - a filesystem for passing info between a domain and
+ * the hypervisor.
+ *
+ * 2008-10-07 Alex Zeffertt Replaced /proc/xen/xenbus with xenfs filesystem
+ * and /proc/xen compatibility mount point.
+ * Turned xenfs into a loadable module.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/magic.h>
+
+#include <xen/xen.h>
+
+#include "xenfs.h"
+
+#include <asm/xen/hypervisor.h>
+
+MODULE_DESCRIPTION("Xen filesystem");
+MODULE_LICENSE("GPL");
+
+static ssize_t capabilities_read(struct file *file, char __user *buf,
+ size_t size, loff_t *off)
+{
+ char *tmp = "";
+
+ if (xen_initial_domain())
+ tmp = "control_d\n";
+
+ return simple_read_from_buffer(buf, size, off, tmp, strlen(tmp));
+}
+
+static const struct file_operations capabilities_file_ops = {
+ .read = capabilities_read,
+};
+
+static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+ static struct tree_descr xenfs_files[] = {
+ [1] = {},
+ { "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
+ { "capabilities", &capabilities_file_ops, S_IRUGO },
+ {""},
+ };
+
+ return simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files);
+}
+
+static int xenfs_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data, struct vfsmount *mnt)
+{
+ return get_sb_single(fs_type, flags, data, xenfs_fill_super, mnt);
+}
+
+static struct file_system_type xenfs_type = {
+ .owner = THIS_MODULE,
+ .name = "xenfs",
+ .get_sb = xenfs_get_sb,
+ .kill_sb = kill_litter_super,
+};
+
+static int __init xenfs_init(void)
+{
+ if (xen_pv_domain())
+ return register_filesystem(&xenfs_type);
+
+ printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n");
+ return 0;
+}
+
+static void __exit xenfs_exit(void)
+{
+ if (xen_pv_domain())
+ unregister_filesystem(&xenfs_type);
+}
+
+module_init(xenfs_init);
+module_exit(xenfs_exit);
+
diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c
new file mode 100644
index 00000000000..6c4269b836b
--- /dev/null
+++ b/drivers/xen/xenfs/xenbus.c
@@ -0,0 +1,593 @@
+/*
+ * Driver giving user-space access to the kernel's xenbus connection
+ * to xenstore.
+ *
+ * Copyright (c) 2005, Christian Limpach
+ * Copyright (c) 2005, Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Changes:
+ * 2008-10-07 Alex Zeffertt Replaced /proc/xen/xenbus with xenfs filesystem
+ * and /proc/xen compatibility mount point.
+ * Turned xenfs into a loadable module.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/uio.h>
+#include <linux/notifier.h>
+#include <linux/wait.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/uaccess.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/string.h>
+
+#include "xenfs.h"
+#include "../xenbus/xenbus_comms.h"
+
+#include <xen/xenbus.h>
+#include <asm/xen/hypervisor.h>
+
+/*
+ * An element of a list of outstanding transactions, for which we're
+ * still awaiting a reply.
+ */
+struct xenbus_transaction_holder {
+ struct list_head list;
+ struct xenbus_transaction handle;
+};
+
+/*
+ * A buffer of data on the queue.
+ */
+struct read_buffer {
+ struct list_head list;
+ unsigned int cons;
+ unsigned int len;
+ char msg[];
+};
+
+struct xenbus_file_priv {
+ /*
+ * msgbuffer_mutex is held while partial requests are built up
+ * and complete requests are acted on. It therefore protects
+ * the "transactions" and "watches" lists, and the partial
+ * request length and buffer.
+ *
+ * reply_mutex protects the reply being built up to return to
+ * usermode. It nests inside msgbuffer_mutex but may be held
+ * alone during a watch callback.
+ */
+ struct mutex msgbuffer_mutex;
+
+ /* In-progress transactions */
+ struct list_head transactions;
+
+ /* Active watches. */
+ struct list_head watches;
+
+ /* Partial request. */
+ unsigned int len;
+ union {
+ struct xsd_sockmsg msg;
+ char buffer[PAGE_SIZE];
+ } u;
+
+ /* Response queue. */
+ struct mutex reply_mutex;
+ struct list_head read_buffers;
+ wait_queue_head_t read_waitq;
+
+};
+
+/* Read out any raw xenbus messages queued up. */
+static ssize_t xenbus_file_read(struct file *filp,
+ char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct xenbus_file_priv *u = filp->private_data;
+ struct read_buffer *rb;
+ unsigned i;
+ int ret;
+
+ mutex_lock(&u->reply_mutex);
+ while (list_empty(&u->read_buffers)) {
+ mutex_unlock(&u->reply_mutex);
+ ret = wait_event_interruptible(u->read_waitq,
+ !list_empty(&u->read_buffers));
+ if (ret)
+ return ret;
+ mutex_lock(&u->reply_mutex);
+ }
+
+ rb = list_entry(u->read_buffers.next, struct read_buffer, list);
+ i = 0;
+ while (i < len) {
+ unsigned sz = min((unsigned)len - i, rb->len - rb->cons);
+
+ ret = copy_to_user(ubuf + i, &rb->msg[rb->cons], sz);
+
+ i += sz - ret;
+ rb->cons += sz - ret;
+
+ if (ret != sz) {
+ if (i == 0)
+ i = -EFAULT;
+ goto out;
+ }
+
+ /* Clear out buffer if it has been consumed */
+ if (rb->cons == rb->len) {
+ list_del(&rb->list);
+ kfree(rb);
+ if (list_empty(&u->read_buffers))
+ break;
+ rb = list_entry(u->read_buffers.next,
+ struct read_buffer, list);
+ }
+ }
+
+out:
+ mutex_unlock(&u->reply_mutex);
+ return i;
+}
+
+/*
+ * Add a buffer to the queue. Caller must hold the appropriate lock
+ * if the queue is not local. (Commonly the caller will build up
+ * multiple queued buffers on a temporary local list, and then add it
+ * to the appropriate list under lock once all the buffers have been
+ * successfully allocated.)
+ */
+static int queue_reply(struct list_head *queue, const void *data, size_t len)
+{
+ struct read_buffer *rb;
+
+ if (len == 0)
+ return 0;
+
+ rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL);
+ if (rb == NULL)
+ return -ENOMEM;
+
+ rb->cons = 0;
+ rb->len = len;
+
+ memcpy(rb->msg, data, len);
+
+ list_add_tail(&rb->list, queue);
+ return 0;
+}
+
+/*
+ * Free all the read_buffers on a list.
+ * Caller must have sole reference to the list.
+ */
+static void queue_cleanup(struct list_head *list)
+{
+ struct read_buffer *rb;
+
+ while (!list_empty(list)) {
+ rb = list_entry(list->next, struct read_buffer, list);
+ list_del(list->next);
+ kfree(rb);
+ }
+}
+
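+/*
+ * Per-watch state: links a kernel xenbus_watch back to the file that
+ * registered it, along with the userspace-supplied token to echo back
+ * in watch events.
+ */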
+struct watch_adapter {
+ struct list_head list;
+ struct xenbus_watch watch;
+ struct xenbus_file_priv *dev_data;
+ char *token;
+};
+
+static void free_watch_adapter(struct watch_adapter *watch)
+{
+ kfree(watch->watch.node);
+ kfree(watch->token);
+ kfree(watch);
+}
+
+static struct watch_adapter *alloc_watch_adapter(const char *path,
+ const char *token)
+{
+ struct watch_adapter *watch;
+
+ watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+ if (watch == NULL)
+ goto out_fail;
+
+ watch->watch.node = kstrdup(path, GFP_KERNEL);
+ if (watch->watch.node == NULL)
+ goto out_free;
+
+ watch->token = kstrdup(token, GFP_KERNEL);
+ if (watch->token == NULL)
+ goto out_free;
+
+ return watch;
+
+out_free:
+ free_watch_adapter(watch);
+
+out_fail:
+ return NULL;
+}
+
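+/*
+ * Watch callback: build an XS_WATCH_EVENT message containing the
+ * path, token and any extra data, and queue it on the owning file's
+ * read buffer list.
+ */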
+static void watch_fired(struct xenbus_watch *watch,
+ const char **vec,
+ unsigned int len)
+{
+ struct watch_adapter *adap;
+ struct xsd_sockmsg hdr;
+ const char *path, *token;
+ int path_len, tok_len, body_len, data_len = 0;
+ int ret;
+ LIST_HEAD(staging_q);
+
+ adap = container_of(watch, struct watch_adapter, watch);
+
+ path = vec[XS_WATCH_PATH];
+ token = adap->token;
+
+ path_len = strlen(path) + 1;
+ tok_len = strlen(token) + 1;
+ if (len > 2)
+ data_len = vec[len] - vec[2] + 1;
+ body_len = path_len + tok_len + data_len;
+
+ hdr.type = XS_WATCH_EVENT;
+ hdr.len = body_len;
+
+ mutex_lock(&adap->dev_data->reply_mutex);
+
+ ret = queue_reply(&staging_q, &hdr, sizeof(hdr));
+ if (!ret)
+ ret = queue_reply(&staging_q, path, path_len);
+ if (!ret)
+ ret = queue_reply(&staging_q, token, tok_len);
+ if (!ret && len > 2)
+ ret = queue_reply(&staging_q, vec[2], data_len);
+
+ if (!ret) {
+ /* success: pass reply list onto watcher */
+ list_splice_tail(&staging_q, &adap->dev_data->read_buffers);
+ wake_up(&adap->dev_data->read_waitq);
+ } else
+ queue_cleanup(&staging_q);
+
+ mutex_unlock(&adap->dev_data->reply_mutex);
+}
+
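+/*
+ * Forward a complete request from userspace to xenstored and queue
+ * the reply for reading.  Transaction starts and ends are tracked so
+ * that anything left open can be aborted when the file is released.
+ */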
+static int xenbus_write_transaction(unsigned msg_type,
+ struct xenbus_file_priv *u)
+{
+ int rc;
+ void *reply;
+ struct xenbus_transaction_holder *trans = NULL;
+ LIST_HEAD(staging_q);
+
+ if (msg_type == XS_TRANSACTION_START) {
+ trans = kmalloc(sizeof(*trans), GFP_KERNEL);
+ if (!trans) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ }
+
+ reply = xenbus_dev_request_and_reply(&u->u.msg);
+ if (IS_ERR(reply)) {
+ kfree(trans);
+ rc = PTR_ERR(reply);
+ goto out;
+ }
+
+ if (msg_type == XS_TRANSACTION_START) {
+ trans->handle.id = simple_strtoul(reply, NULL, 0);
+
+ list_add(&trans->list, &u->transactions);
+ } else if (msg_type == XS_TRANSACTION_END) {
+ list_for_each_entry(trans, &u->transactions, list)
+ if (trans->handle.id == u->u.msg.tx_id)
+ break;
+ BUG_ON(&trans->list == &u->transactions);
+ list_del(&trans->list);
+
+ kfree(trans);
+ }
+
+ mutex_lock(&u->reply_mutex);
+ rc = queue_reply(&staging_q, &u->u.msg, sizeof(u->u.msg));
+ if (!rc)
+ rc = queue_reply(&staging_q, reply, u->u.msg.len);
+ if (!rc) {
+ list_splice_tail(&staging_q, &u->read_buffers);
+ wake_up(&u->read_waitq);
+ } else {
+ queue_cleanup(&staging_q);
+ }
+ mutex_unlock(&u->reply_mutex);
+
+ kfree(reply);
+
+out:
+ return rc;
+}
+
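+/*
+ * Handle an XS_WATCH or XS_UNWATCH request by (un)registering a
+ * kernel watch on the caller's behalf and queueing a synthetic "OK"
+ * reply.
+ */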
+static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u)
+{
+ struct watch_adapter *watch, *tmp_watch;
+ char *path, *token;
+ int err, rc;
+ LIST_HEAD(staging_q);
+
+ path = u->u.buffer + sizeof(u->u.msg);
+ token = memchr(path, 0, u->u.msg.len);
+ if (token == NULL) {
+ rc = -EILSEQ;
+ goto out;
+ }
+ token++;
+
+ if (msg_type == XS_WATCH) {
+ watch = alloc_watch_adapter(path, token);
+ if (watch == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ watch->watch.callback = watch_fired;
+ watch->dev_data = u;
+
+ err = register_xenbus_watch(&watch->watch);
+ if (err) {
+ free_watch_adapter(watch);
+ rc = err;
+ goto out;
+ }
+ list_add(&watch->list, &u->watches);
+ } else {
+ list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
+ if (!strcmp(watch->token, token) &&
+ !strcmp(watch->watch.node, path)) {
+ unregister_xenbus_watch(&watch->watch);
+ list_del(&watch->list);
+ free_watch_adapter(watch);
+ break;
+ }
+ }
+ }
+
+ /* Success. Synthesize a reply to say all is OK. */
+ {
+ struct {
+ struct xsd_sockmsg hdr;
+ char body[3];
+ } __packed reply = {
+ {
+ .type = msg_type,
+ .len = sizeof(reply.body)
+ },
+ "OK"
+ };
+
+ mutex_lock(&u->reply_mutex);
+ rc = queue_reply(&u->read_buffers, &reply, sizeof(reply));
+ mutex_unlock(&u->reply_mutex);
+ }
+
+out:
+ return rc;
+}
+
+static ssize_t xenbus_file_write(struct file *filp,
+ const char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct xenbus_file_priv *u = filp->private_data;
+ uint32_t msg_type;
+ int rc = len;
+ int ret;
+ LIST_HEAD(staging_q);
+
+ /*
+ * We're expecting usermode to be writing properly formed
+ * xenbus messages. If they write an incomplete message we
+ * buffer it up. Once it is complete, we act on it.
+ */
+
+ /*
+ * Make sure concurrent writers can't stomp all over each
+ * other's messages and make a mess of our partial message
+	 * buffer.  We don't make any attempt to stop multiple
+ * writers from making a mess of each other's incomplete
+ * messages; we're just trying to guarantee our own internal
+ * consistency and make sure that single writes are handled
+ * atomically.
+ */
+ mutex_lock(&u->msgbuffer_mutex);
+
+ /* Get this out of the way early to avoid confusion */
+ if (len == 0)
+ goto out;
+
+	/* Can't write a xenbus message larger than we can buffer */
+ if ((len + u->len) > sizeof(u->u.buffer)) {
+ /* On error, dump existing buffer */
+ u->len = 0;
+ rc = -EINVAL;
+ goto out;
+ }
+
+ ret = copy_from_user(u->u.buffer + u->len, ubuf, len);
+
+ if (ret == len) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ /* Deal with a partial copy. */
+ len -= ret;
+ rc = len;
+
+ u->len += len;
+
+ /* Return if we haven't got a full message yet */
+ if (u->len < sizeof(u->u.msg))
+ goto out; /* not even the header yet */
+
+ /* If we're expecting a message that's larger than we can
+ possibly send, dump what we have and return an error. */
+ if ((sizeof(u->u.msg) + u->u.msg.len) > sizeof(u->u.buffer)) {
+ rc = -E2BIG;
+ u->len = 0;
+ goto out;
+ }
+
+ if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
+ goto out; /* incomplete data portion */
+
+ /*
+ * OK, now we have a complete message. Do something with it.
+ */
+
+ msg_type = u->u.msg.type;
+
+ switch (msg_type) {
+ case XS_TRANSACTION_START:
+ case XS_TRANSACTION_END:
+ case XS_DIRECTORY:
+ case XS_READ:
+ case XS_GET_PERMS:
+ case XS_RELEASE:
+ case XS_GET_DOMAIN_PATH:
+ case XS_WRITE:
+ case XS_MKDIR:
+ case XS_RM:
+ case XS_SET_PERMS:
+ /* Send out a transaction */
+ ret = xenbus_write_transaction(msg_type, u);
+ break;
+
+ case XS_WATCH:
+ case XS_UNWATCH:
+		/* Add or remove a watch on the given path */
+ ret = xenbus_write_watch(msg_type, u);
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ if (ret != 0)
+ rc = ret;
+
+ /* Buffered message consumed */
+ u->len = 0;
+
+ out:
+ mutex_unlock(&u->msgbuffer_mutex);
+ return rc;
+}
+
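+/*
+ * Each open of the device gets its own connection state: private
+ * transaction and watch lists, partial-request buffer and reply queue.
+ */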
+static int xenbus_file_open(struct inode *inode, struct file *filp)
+{
+ struct xenbus_file_priv *u;
+
+ if (xen_store_evtchn == 0)
+ return -ENOENT;
+
+ nonseekable_open(inode, filp);
+
+ u = kzalloc(sizeof(*u), GFP_KERNEL);
+ if (u == NULL)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&u->transactions);
+ INIT_LIST_HEAD(&u->watches);
+ INIT_LIST_HEAD(&u->read_buffers);
+ init_waitqueue_head(&u->read_waitq);
+
+ mutex_init(&u->reply_mutex);
+ mutex_init(&u->msgbuffer_mutex);
+
+ filp->private_data = u;
+
+ return 0;
+}
+
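+/*
+ * On release, abort any transactions still open and unregister any
+ * watches still registered through this file.
+ */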
+static int xenbus_file_release(struct inode *inode, struct file *filp)
+{
+ struct xenbus_file_priv *u = filp->private_data;
+ struct xenbus_transaction_holder *trans, *tmp;
+ struct watch_adapter *watch, *tmp_watch;
+
+ /*
+ * No need for locking here because there are no other users,
+ * by definition.
+ */
+
+ list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
+ xenbus_transaction_end(trans->handle, 1);
+ list_del(&trans->list);
+ kfree(trans);
+ }
+
+ list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
+ unregister_xenbus_watch(&watch->watch);
+ list_del(&watch->list);
+ free_watch_adapter(watch);
+ }
+
+ kfree(u);
+
+ return 0;
+}
+
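+/* Report the file as readable once replies or watch events are queued. */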
+static unsigned int xenbus_file_poll(struct file *file, poll_table *wait)
+{
+ struct xenbus_file_priv *u = file->private_data;
+
+ poll_wait(file, &u->read_waitq, wait);
+ if (!list_empty(&u->read_buffers))
+ return POLLIN | POLLRDNORM;
+ return 0;
+}
+
+const struct file_operations xenbus_file_ops = {
+ .read = xenbus_file_read,
+ .write = xenbus_file_write,
+ .open = xenbus_file_open,
+ .release = xenbus_file_release,
+ .poll = xenbus_file_poll,
+};
diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h
new file mode 100644
index 00000000000..51f08b2d0bf
--- /dev/null
+++ b/drivers/xen/xenfs/xenfs.h
@@ -0,0 +1,6 @@
+#ifndef _XENFS_XENBUS_H
+#define _XENFS_XENBUS_H
+
+extern const struct file_operations xenbus_file_ops;
+
+#endif /* _XENFS_XENBUS_H */